Why Generalized Pareto Distribution (GPD) MLE estimation of Tail Index fails?
On the chart multiple simulation of the same $\text{StudentT}(\nu=4)$ with tail estimated with GPD estimator (blue lines).
x - various tresholds, y - estimated degrees of freedom. Each line - separate simulation (sample), with points - estimation at different treshold value (other colors - different estimators, for comparison).
The code for GPT estimator seems to be correct:
def student_sample(df, n, seed=None): rng = np.random.default_rng(seed) return rng.standard_t(df, size=n) def estimate_gpd_mle(x, threshold, init=(1/5.0, 1.0), min_exceed=5): # Generalized Pareto Distribution (GPD) MLE estimation y = x[x > threshold] - threshold if y.size < min_exceed: return None def nll(params): ξ, β = params if β <= 0: return np.inf base = 1 + ξ * y / β if np.any(base <= 0): return np.inf pdf = (1 / β) * base ** -(1/ξ + 1) if np.any(pdf <= 0) or np.any(~np.isfinite(pdf)): return np.inf return -np.sum(np.log(pdf)) res = minimize(nll, x0=init, bounds=((1/20, 1/2), (1e-3, 10))) if not res.success: return None ξ, β = res.x return (1/ξ, β) # return df = 1/xi P.S.
The other approaches Hill (black), Least Squares (green) - seems also work poorly, failing to find correct df=4 and producing biased estimation ~3.5, why? The full distribution MLE fit (red) seem so produce the best result.
What's supposed to be the best way to estimate the tail, without using the full distribution?
Full code to reproduce the chart
import numpy as np import matplotlib.pyplot as plt from scipy.stats import t from scipy.optimize import minimize_scalar, minimize def student_sample(df, n, seed=None): rng = np.random.default_rng(seed) return rng.standard_t(df, size=n) def estimate_hill(x): x = np.sort(x)[::-1] logx = np.log(x) sumk = np.cumsum(logx) k = np.arange(1, len(x)+1) denom = (sumk/k) - logx return np.array([1/d if d != 0 else np.nan for d in denom]) def fit_mle_student(x): def nll(df): return np.inf if df <= 2 else -np.sum(t.logpdf(x, df, loc=0, scale=1)) res = minimize_scalar(nll, bounds=(2.01, 100), method='bounded') if not res.success: raise RuntimeError("MLE fit failed") return res.x def estimate_gpd_mle(x, threshold, init=(1/5.0, 1.0), min_exceed=5): # Generalized Pareto Distribution (GPD) MLE estimation y = x[x > threshold] - threshold if y.size < min_exceed: return None def nll(params): ξ, β = params if β <= 0: return np.inf base = 1 + ξ * y / β if np.any(base <= 0): return np.inf pdf = (1 / β) * base ** -(1/ξ + 1) if np.any(pdf <= 0) or np.any(~np.isfinite(pdf)): return np.inf return -np.sum(np.log(pdf)) res = minimize(nll, x0=init, bounds=((1/20, 1/2), (1e-3, 10))) if not res.success: return None ξ, β = res.x return (1/ξ, β) # return df = 1/xi def estimate_gpd_mle_and_prepare_data_for_plot(x, K, count=20, init=(0.2, 1.0)): # Ignore, nothing interesting, just some data preparation for the plot x_sorted = np.sort(x) ranks = np.linspace(10, K, count).astype(int) df_estimates = np.full_like(ranks, np.nan, dtype=float) current_init = init for i, k in enumerate(ranks): threshold = x_sorted[-k] res = estimate_gpd_mle(x, threshold, init=current_init) if res is None: continue df, beta = res df_estimates[i] = df # warm start next with xi,beta current_init = (1/df, beta) return ranks, df_estimates def estimate_tail_ls(x): # Least Squares estimation of the tail index n = len(x) if n < 3: return None x_sorted = np.sort(x) logx = np.log(x_sorted) logp = np.log(np.arange(n, 0, -1)) - np.log(n + 1) A = np.vstack([logx, np.ones_like(logx)]).T try: slope, _ = np.linalg.lstsq(A, logp, rcond=None)[0] alpha = -slope return alpha if alpha > 0 else None except: return None def estimate_tail_ls_and_prepare_data_for_plot(x, K, count=20): # Ignore, nothing interesting, just some data preparation for the plot x_sorted = np.sort(x) ranks = np.linspace(10, K, count).astype(int) df_estimates = np.full_like(ranks, np.nan, dtype=float) for i, k in enumerate(ranks): tail = x_sorted[-k:] df = estimate_tail_ls(tail) if df is not None: df_estimates[i] = df return ranks, df_estimates # # Parameters N = 30 n_obs = 20_000 q = 0.97 plt.figure(figsize=(8, 6)) for _ in range(N): x = student_sample(4, n_obs) # Hill estimates k = int((1 - q) * len(x)) tail = np.sort(x)[-k:][::-1] hill = estimate_hill(tail) plt.plot(hill, linewidth=1, alpha=0.5, color='black') # Exact t MLE mle_df = fit_mle_student(x) plt.axhline(mle_df, linewidth=1, alpha=0.5, color='red') # GPD estimates across multiple order-statistic thresholds ranks, dfs = estimate_gpd_mle_and_prepare_data_for_plot(x, K=k, count=20) plt.plot(ranks, dfs, marker='o', linestyle='-', markersize=3, alpha=0.5, color='blue', linewidth=1, ) # LS estimates across multiple order-statistic thresholds ranks, dfs = estimate_tail_ls_and_prepare_data_for_plot(x, K=k, count=20) plt.plot(ranks, dfs, marker='o', linestyle='-', markersize=3, alpha=0.5, color='green', linewidth=1) plt.ylim(2, 6) plt.xlabel('Order-stats rank k') plt.ylabel('Tail index estimate') plt.title(f'Hill (black), MLE (red), GPD (blue), LS (green) over {N} samples. True ν = 4') plt.show() 