#!/usr/bin/env python
# -*- coding: utf-8 -*-
# pip
from datetime import timedelta
import numpy as np
import xarray as xr
from scipy.stats import gumbel_l, genextreme
from .util.time_operations import date2datenum as d2d
def FitGEV_KMA_Frechet(bmus, n_clusters, var):
'''
    Returns stationary GEV/Gumbel_L params for KMA bmus and variable series
bmus - KMA bmus (time series of KMA centroids)
n_clusters - number of KMA clusters
var - time series of variable to fit to GEV/Gumbel_L
returns np.array (n_clusters x parameters). parameters = (shape, loc, scale)
    for Gumbel distributions the shape value will be ~0 (0.0000000001)
'''
param_GEV = np.empty((n_clusters, 3))
for i in range(n_clusters):
c = i+1
pos = np.where((bmus==c))[0]
if len(pos) == 0:
param_GEV[i,:] = [np.nan, np.nan, np.nan]
else:
# get variable at cluster position
var_c = var[pos]
var_c = var_c[~np.isnan(var_c)]
            # fit to Gumbel (maxima) and get negative loglikelihood
            # (gumbel_l is fitted to the negated data, so the location sign is flipped back)
            loc_gl, scale_gl = gumbel_l.fit(-var_c)
            theta_gl = (0.0000000001, -1*loc_gl, scale_gl)
nLogL_gl = genextreme.nnlf(theta_gl, var_c)
            # fit to GEV and get negative loglikelihood (initial shape guess of -0.1)
            c_ini = -0.1
            shape_gev, loc_gev, scale_gev = genextreme.fit(var_c, c_ini)
theta_gev = (shape_gev, loc_gev, scale_gev)
nLogL_gev = genextreme.nnlf(theta_gev, var_c)
            # store negated shape (scipy's sign convention is opposite to the standard GEV)
theta_gev_fix = (-shape_gev, loc_gev, scale_gev)
# apply significance test if Frechet
if shape_gev < 0:
                # TODO: can't replicate the exact ML solution
                # likelihood ratio test: deviance 2*(nLogL_gl - nLogL_gev) vs chi2(1)
                # 95% critical value 3.84 (i.e. a difference of 1.92 in nLogL)
                if nLogL_gl - nLogL_gev >= 1.92:
param_GEV[i,:] = list(theta_gev_fix)
else:
param_GEV[i,:] = list(theta_gl)
else:
param_GEV[i,:] = list(theta_gev_fix)
return param_GEV
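

# A minimal usage sketch (not part of the library API): fit per-cluster GEV/Gumbel
# parameters to a synthetic series. All names below (demo_bmus, demo_var, ...) are
# hypothetical and only illustrate the expected input shapes.
def _example_fit_gev_kma():
    rng = np.random.RandomState(0)
    n_clusters = 4
    demo_bmus = rng.randint(1, n_clusters + 1, size=1000)  # cluster labels 1..4
    demo_var = genextreme.rvs(-0.1, loc=2.0, scale=0.5, size=1000, random_state=rng)
    # (n_clusters x 3) array: (shape, loc, scale) for each cluster
    demo_params = FitGEV_KMA_Frechet(demo_bmus, n_clusters, demo_var)
    return demo_params
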
def Smooth_GEV_Shape(cenEOFs, param):
'''
    Smooth the GEV shape parameter of each KMA cluster by averaging it
    with its nearest-neighbour centroids (in EOF space)
cenEOFs - (n_clusters, n_features) KMA centroids
param - GEV shape parameter for each KMA cluster
returns smoothed GEV shape parameter as a np.array (n_clusters)
'''
# number of clusters
n_cs = cenEOFs.shape[0]
    # calculate pairwise Euclidean distances between centroids (vectorized)
cenEOFs_b = cenEOFs.reshape(cenEOFs.shape[0], 1, cenEOFs.shape[1])
D = np.sqrt(np.einsum('ijk, ijk->ij', cenEOFs-cenEOFs_b, cenEOFs-cenEOFs_b))
np.fill_diagonal(D, np.nan)
# sort distances matrix to find neighbours
sort_ord = np.empty((n_cs, n_cs), dtype=int)
D_sorted = np.empty((n_cs, n_cs))
for i in range(n_cs):
order = np.argsort(D[i,:])
sort_ord[i,:] = order
D_sorted[i,:] = D[i, order]
    # smoothed value: 0.5 * own shape + 0.5 * inverse-distance weighted mean of the 4 nearest neighbours
denom = np.sum(1/D_sorted[:,:4], axis=1)
param_c = 0.5 * np.sum(np.column_stack(
[
param[:],
param[sort_ord[:,:4]] * (1/D_sorted[:,:4])/denom[:,None]
]
), axis=1)
return param_c
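

# A minimal usage sketch (hypothetical names): smooth per-cluster GEV shape values
# with synthetic KMA centroids. At least 5 clusters are needed so every cluster has
# 4 valid (non-self) neighbours.
def _example_smooth_shape():
    rng = np.random.RandomState(0)
    demo_cenEOFs = rng.randn(6, 10)            # 6 clusters, 10 EOF features
    demo_shape = rng.uniform(-0.2, 0.2, 6)     # raw GEV shape for each cluster
    demo_shape_smooth = Smooth_GEV_Shape(demo_cenEOFs, demo_shape)
    return demo_shape_smooth
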
def ACOV(f, theta, x):
'''
    Returns the asymptotic variance-covariance matrix using the inverse of the
    Fisher Information matrix.
    Works with generic functions, parameters and data:
    f - function to evaluate: GEV, Gumbel_L, ...
    theta - function parameters: for GEV (shape, location, scale)
    x - data used for function evaluation
    Second derivatives are evaluated by finite differences (variance and covariance):
    dxx = (f(x+dt_x) - 2f(x) + f(x-dt_x)) / (dt_x**2)
    dxy = (f(x,y) - f(x-dt_x,y) - f(x,y-dt_y) + f(x-dt_x, y-dt_y)) / (dt_x*dt_y)
'''
    # parameter increments for the finite differences
pm = 0.00001
params = np.asarray(theta)
dt_p = pm * params
# Fisher information matrix holder
ss = len(params)
FI = np.ones((ss,ss)) * np.nan
    # TODO: evaluating f fails in some cases??
if np.isinf(f(theta, x)):
#print ('ACOV error: nLogL = Inf. {0}'.format(theta))
return np.ones((ss,ss))*0.0001
# variance and covariance
for i in range(ss):
        # differential parameter FI evaluation (np.array forces a copy so theta is not modified)
        p1 = np.array(theta, dtype=float); p1[i] = p1[i] + dt_p[i]
        p2 = np.array(theta, dtype=float); p2[i] = p2[i] - dt_p[i]
# variance
FI[i,i] = (f(tuple(p1), x) - 2*f(theta,x) + f(tuple(p2), x))/(dt_p[i]**2)
for j in range(i+1,ss):
            # differential parameter FI evaluation
            p1 = np.array(theta, dtype=float); p1[i] = p1[i] - dt_p[i]
            p2 = np.array(theta, dtype=float); p2[j] = p2[j] - dt_p[j]
            p3 = np.array(theta, dtype=float); p3[i] = p3[i] - dt_p[i]; p3[j] = p3[j] - dt_p[j]
# covariance
cov = (f(theta,x) - f(tuple(p1),x) - f(tuple(p2),x) + f(tuple(p3),x)) \
/ (dt_p[i]*dt_p[j])
FI[i,j] = cov
FI[j,i] = cov
    # asymptotic variance-covariance matrix
acov = np.linalg.inv(FI)
return acov
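

# A minimal usage sketch (hypothetical names): asymptotic covariance of GEV parameters
# fitted to synthetic data, using scipy's negative log-likelihood as the function f.
def _example_acov_gev():
    rng = np.random.RandomState(0)
    demo_x = genextreme.rvs(-0.1, loc=1.0, scale=0.3, size=500, random_state=rng)
    demo_theta = genextreme.fit(demo_x)                    # (shape, loc, scale)
    demo_acov = ACOV(genextreme.nnlf, demo_theta, demo_x)  # 3x3 variance-covariance
    return demo_acov
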
def Peaks_Over_Threshold(xds, var_name, percentile=99, threshold=None,
window_days=3):
'''
Peaks Over Threshold methodology to find extreme values.
xds - xarray.Dataset (time dim)
var_name - variable to apply POT
percentile - if threshold not given, calculate it with this percentile
threshold - optional, threshold to apply POT
window_days - minimum number of days between consecutive independent peaks
    returns xarray.Dataset ('time',) vars: peaks, exceedances, duration and area
'''
# TODO: refactor with times
def to_hours(time_diff):
# single
if isinstance(time_diff, (timedelta, np.timedelta64)):
if isinstance(time_diff, np.timedelta64):
dt_h = time_diff / np.timedelta64(1, 'h')
elif isinstance(time_diff, timedelta):
dt_h = np.array(time_diff.total_seconds()/3600.0)
return dt_h
# else: array
if isinstance(time_diff[0], np.timedelta64):
dt_h = time_diff / np.timedelta64(1, 'h')
elif isinstance(time_diff[0], timedelta):
dt_h = np.array([x.total_seconds()/3600.0 for x in time_diff])
return dt_h
# get variable
vv = xds[var_name].values[:]
vt = xds['time'].values[:]
# data delta time (hours)
npdf = np.diff(vt)
dt_h = to_hours(npdf)
# threshold
    if threshold is None:
threshold = np.percentile(vv, percentile)
# consecutive peaks over threshold
ind_mask = np.where(vv >= threshold, 1, 0)
ind_dif = np.diff(ind_mask)
# peaks start and end
ind_ini = np.where(ind_dif == 1)[0] + 1
ind_end = np.where(ind_dif == -1)[0] + 1
    # corrections when the series starts or ends above the threshold
if ind_end[0]<ind_ini[0]: ind_ini = np.insert(ind_ini,0,0)
if ind_end[-1]<ind_ini[-1]: ind_end = np.append(ind_end, len(ind_dif))
# store variable max, time max, duration and area above threshold for each peak
time_max, vv_max, area, durac = [], [], [], []
for i, f in zip(ind_ini, ind_end):
vv_max.append(np.max(vv[i:f]))
        area.append(np.sum((vv[i:f] - threshold) * dt_h[i:f]))
ind_max = np.argmax(vv[i:f])
ind_time = list(range(i, f))
time_max.append(vt[ind_time[ind_max]])
durac.append(to_hours(vt[f] - vt[i]))
    # ensure independence between maxima
ind_window_mask = np.where(to_hours(np.diff(time_max)) < 24*window_days, 1, 0)
if np.any(ind_window_mask):
# solve dependent events
ind_dif = np.diff(ind_window_mask)
        # first event correction: mark a run start at 0 if the series of peaks
        # begins inside a dependent run
        ind_dif = np.insert(ind_dif, 0, 0)
        if ind_window_mask[0] == 1:
            ind_dif[0] = 1
ind_ini = np.where(ind_dif == 1)[0]
ind_fin = np.where(ind_dif == -1)[0] + 1
# Keep maximum of dependent events
time_max_indep = []
vv_max_indep = []
durac_indep = []
area_indep = []
ind_delete = []
for ind_i, ind_f in zip(ind_ini, ind_fin):
vv_temp = np.max(vv_max[ind_i:ind_f])
vv_max_indep.append(vv_temp)
vv_ind_max = np.argmax(vv_max[ind_i:ind_f])
ind_time = list(range(ind_i,ind_f))
time_temp = time_max[ind_time[vv_ind_max]]
time_max_indep.append(time_temp)
durac_indep.append(
to_hours(time_max[ind_f]-time_max[ind_i])
)
area_indep.append(np.sum(area[ind_i:ind_f]))
ind_delete.extend(ind_time)
# remove all dependent events
        vv_max = np.delete(vv_max, ind_delete)
        time_max = np.delete(time_max, ind_delete)
        durac = np.delete(durac, ind_delete)
        area = np.delete(area, ind_delete)
# add independent events
vv_max = np.concatenate((vv_max, vv_max_indep))
time_max = np.concatenate((time_max, time_max_indep))
durac = np.concatenate((durac, durac_indep)) # hours
area = np.concatenate((area, area_indep))
# output
peaks = xr.Dataset(
{
'{0}'.format(var_name): (('time',), vv_max),
'{0}_exceedances'.format(var_name): (('time',), vv_max-threshold),
'duration': (('time',), durac),
'area': (('time',), area),
},
coords = {'time': time_max},
)
peaks = peaks.sortby('time')
# add some attrs
peaks.attrs['threshold'] = threshold
peaks.attrs['percentile'] = percentile
return peaks
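

# A minimal usage sketch (hypothetical names): Peaks Over Threshold on a synthetic
# hourly significant wave height series stored in an xarray.Dataset.
def _example_pot():
    rng = np.random.RandomState(0)
    demo_time = np.arange('2000-01-01', '2001-01-01', dtype='datetime64[h]')
    demo_hs = 1.0 + np.abs(rng.randn(len(demo_time)))
    demo_xds = xr.Dataset(
        {'hs': (('time',), demo_hs)},
        coords={'time': demo_time},
    )
    # independent peaks over the 98th percentile, at least 3 days apart
    demo_peaks = Peaks_Over_Threshold(demo_xds, 'hs', percentile=98, window_days=3)
    return demo_peaks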