Find and analyse drought events#

To analyse drought events we need to set baselines for drought events. The features we focus on are the duration of each drought and the area affected. We first need to identify all the historical drought events of the region we focus on and then extract these indexes and subsequently the baselines for the region. These baselines can show us if an event was anomalous.

Import necessary libraries#

import sys
import os
import glob
import xarray as xr
from functools import partial
import datetime
import numpy as np
import plotly.graph_objects as go
import dask.array as da
import pandas as pd
import matplotlib.pyplot as plt
from scipy.ndimage import label, generate_binary_structure
import geopandas as gpd
import pandas as pd
from scipy.ndimage import label, generate_binary_structure
import hvplot.xarray  # to plot xarray with hvplot
import cartopy.crs as ccrs
import sys
from pyprojroot import here
root = here()
sys.path.append(str(root / "chapters/shared/"))
from utils.widgets_handler import read_json_to_dict
color_palette_json = 'color_palette_bright.json'
cmap = read_json_to_dict(color_palette_json)
cmap['No Data'] = '#cccccc'

Load data function#

def get_spi_dataset(acc_period: str = 1, years: list = [2020]):
    data_root_folder = '/data1/drought_dataset/spi/'
    spi_folder = os.path.join(data_root_folder, f'spi{acc_period}')
    spi_paths = []

    for year in years:
        spi_paths.extend(sorted(glob.glob(
            f'{data_root_folder}spi{acc_period}/SPI{acc_period}_gamma_global_era5_moda_ref1991to2020_{year}*.nc')))

    return xr.open_mfdataset(spi_paths, chunks={'time': "auto"}, concat_dim="time", combine='nested', parallel=False)


def get_spei_dataset(acc_period: str = 1, years: list = [2020]):
    data_root_folder = '/data1/drought_dataset/spei/'
    spi_folder = os.path.join(data_root_folder, f'spi{acc_period}')
    spi_paths = []

    for year in years:
        spi_paths.extend(sorted(glob.glob(
            f'{data_root_folder}spei{acc_period}/SPEI{acc_period}_genlogistic_global_era5_moda_ref1991to2020_{year}*.nc')))

    return xr.open_mfdataset(spi_paths, chunks={'time': "auto"}, concat_dim="time", combine='nested', parallel=False)


def mask_invalid_values(ds, variable, value=-9999):
    ds[variable] = ds[variable].where(ds[variable] != value, np.nan)
    return ds


def subset_region(dataset, variable, bbox):
    # data = dataset.sel(time=np.datetime64(time), method='nearest')[variable]

    # Define the geographical boundaries for Madagascar
    lat_bounds = [bbox[1], bbox[3]]  # from south to north
    lon_bounds = [bbox[0], bbox[2]]  # from west to east

    # Check for NaN values in latitude and longitude coordinates
    lat_nan = dataset['lat'].isnull().any()
    lon_nan = dataset['lon'].isnull().any()

    # Handle NaN values if they exist
    if lat_nan:
        dataset = dataset.dropna(dim='lat', how='all')
    if lon_nan:
        dataset = dataset.dropna(dim='lon', how='all')

    # Ensure no NaN values in the data itself
    dataset = dataset.fillna(np.nan)  # or use another appropriate method like interpolation

    # Ensure the lat/lon bounds are within the data's range
    lat_min, lat_max = dataset['lat'].min().item(), dataset['lat'].max().item()
    lon_min, lon_max = dataset['lon'].min().item(), dataset['lon'].max().item()

    if lat_bounds[0] < lat_min or lat_bounds[1] > lat_max or lon_bounds[0] < lon_min or lon_bounds[1] > lon_max:
        raise ValueError("The specified latitude/longitude bounds are outside the range of the dataset.")

    # Subset the data using where and dropna
    dataset = dataset.where(
        (dataset['lat'] >= lat_bounds[0]) & (dataset['lat'] <= lat_bounds[1]) &
        (dataset['lon'] >= lon_bounds[0]) & (dataset['lon'] <= lon_bounds[1]),
        drop=True
    )

    # return xr.Dataset(data)
    return dataset
    
def get_spei_significance_dataset(variable='SPEI1', year=2020):
    data_root_folder='/data1/drought_dataset/spei/'
    quality_paths = []
    for month in range(1, 13):
        month_str = f'{month:02d}'
        quality_paths.append(f'{data_root_folder}{variable.lower()}/parameter/{variable}_significance_global_era5_moda_{year}{month_str}_ref1991to2020.nc')
    return xr.open_mfdataset(quality_paths, concat_dim="time", combine='nested', parallel=False)

def get_spi_significance_dataset(variable='SPI1', year=2020):
    data_root_folder='/data1/drought_dataset/spi/'
    quality_paths = []
    for month in range(1, 13):
        month_str = f'{month:02d}'
        quality_paths.append(f'{data_root_folder}{variable.lower()}/parameter/{variable}_significance_global_era5_moda_{year}{month_str}_ref1991to2020.nc')
    return xr.open_mfdataset(quality_paths, concat_dim="time", combine='nested', parallel=False)

Load dataset#

# Load dataset
spei_data = get_spei_dataset(acc_period=12, years=list(range(1940, 2025)))
spei48_region = mask_invalid_values(spei_data, variable='SPEI12')

Filter dataset for specific bounding box#

# Get a subset of the dataset for a bbox
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
world = world.to_crs(epsg=4326)
# country_list = world['name'].unique().tolist()
# country_list.sort()
# country_shape = world[world['name'] == 'Kenya']
# country_shape = world[world['name'] == 'S. Sudan']
country_shape = world[world['name'] == 'S. Sudan']
spei_data = spei_data.rio.write_crs("EPSG:4326", inplace=True)

spei_data_country = spei48_region.rio.clip(country_shape.geometry, world.crs, drop=True)
spei = spei_data_country['SPEI12']
/tmp/ipykernel_2562388/3093083712.py:2: FutureWarning: The geopandas.dataset module is deprecated and will be removed in GeoPandas 1.0. You can get the original 'naturalearth_lowres' data from https://www.naturalearthdata.com/downloads/110m-cultural-vectors/.
  world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

Plot animated SPEI dataset#

spei.hvplot(
    clim=(-8,8),
    groupby="time",
    widget_type="scrubber", 
    widget_location="bottom", 
    projection=ccrs.PlateCarree(), 
    coastline='10m',
    cmap='BrBG',
    features=['borders']
)

Setup drought severity classification function and classes#

Analyse each month and find the area coverage for each condition. The month classification is the worst condition that covers at least a percentage of the area.#

The classes are as bellow:

  • ‘Extremely dry’: \(SPEI < -2\)

  • ‘Severely dry’: \(-2 < SPEI < -1.5\)

  • ‘Moderately dry’: \(-1.5 < SPEI < -1\)

  • ‘Mildly dry’: \(-1 < SPEI < 0\)

  • ‘Mildly wet’: \(0 < SPEI < 1\)

  • ‘Moderately wet’: \(1 < SPEI < 1.5\)

  • ‘Severely wet’: \(1.5 < SPEI < 2\)

  • ‘Extremely wet’: \(SPEI < 2\)

import xarray as xr
import numpy as np


def classify_drought_severity(spei, classes, conditions, threshold_percentage=50):
    """
    Classifies drought severity based on SPEI values and percentage of grid points in each class.

    Parameters:
    - spei: An xarray DataArray containing SPEI values (dimensions: time, lat, lon).
    - classes: A list of class names (e.g., ['Extreme Drought', 'Severe Drought', ...]).
    - conditions: A list of conditions corresponding to each class (Boolean conditions for the grid).
    - threshold_percentage: Minimum percentage of grid points required to classify a time step into a specific class.

    Returns:
    - result_df: A pandas DataFrame with counts and percentages of grid points in each class for each time step,
      including a 'Final Classification' column based on the percentage threshold.
    """
    
    # Calculate the total number of grid points (excluding NaN values if any)
    total_grid_points = spei.notnull().sum(dim=['lat', 'lon'])
    
    # Count the number of grid points in each condition for each time step
    counts = [condition.sum(dim=['lat', 'lon']) for condition in conditions]

    # Combine counts along a new dimension called 'class'
    counts_concat = xr.concat(counts, dim=pd.Index(classes, name="class"))

    # Convert to DataFrame
    counts_df = counts_concat.to_dataframe(name='count').reset_index()

    # Pivot the DataFrame to have classes as columns
    result_df = counts_df.pivot(index='time', columns='class', values='count').fillna(0)

    # Add total grid points to the result DataFrame
    result_df['Total Grid Points'] = total_grid_points.values

    # Calculate the percentage of grid points for each class
    for class_name in classes:
        result_df[f'{class_name} Percentage'] = (result_df[class_name] / result_df['Total Grid Points']) * 100

    # Determine the final classification for each time step based on the percentage threshold
    def classify_row(row):
        for class_name in classes:
            if row[f'{class_name} Percentage'] >= threshold_percentage:
                return class_name
        return 'No Data'  # If no class meets the threshold

    result_df['Final Classification'] = result_df.apply(classify_row, axis=1)

    return result_df

# Example usage
# Load the dataset (assuming it's already in xarray format)
# ds = xr.open_dataset('your_dataset.nc')  # Uncomment if loading from file
# spei = ds['SPEI']  # Replace 'SPEI' with your actual variable name

# Define the conditions and corresponding classes
conditions = [
    spei < -2, # 'Extremely dry'
    (spei >= -2) & (spei < -1.5), # 'Severely dry'
    (spei >= -1.5) & (spei < -1), # 'Moderately dry'
    (spei >= -1) & (spei < 0), # 'Mildly dry'
    (spei >= 0) & (spei <= 1), # 'Mildly wet'
    (spei >= 1) & (spei <= 1.5), # 'Moderately wet'
    (spei >= 1.5) & (spei <= 2), # 'Severely wet'
    spei > 2 # 'Extremely wet'
]
classes = ['Extremely dry', 
           'Severely dry', 
           'Moderately dry', 
           'Mildly dry', 
           'Mildly wet', 
           'Moderately wet', 
           'Severely wet', 
           'Extremely wet']

Classify months in SPEI#

# Get the result DataFrame
result_df = classify_drought_severity(spei, classes, conditions, threshold_percentage=20)
result_df = result_df.reset_index()
# Output the result
result_df
class time Extremely dry Extremely wet Mildly dry Mildly wet Moderately dry Moderately wet Severely dry Severely wet Total Grid Points Extremely dry Percentage Severely dry Percentage Moderately dry Percentage Mildly dry Percentage Mildly wet Percentage Moderately wet Percentage Severely wet Percentage Extremely wet Percentage Final Classification
0 1940-01-01 06:00:00 0 0 0 0 0 0 0 0 0 NaN NaN NaN NaN NaN NaN NaN NaN No Data
1 1940-02-01 06:00:00 0 0 0 0 0 0 0 0 0 NaN NaN NaN NaN NaN NaN NaN NaN No Data
2 1940-03-01 06:00:00 0 0 0 0 0 0 0 0 0 NaN NaN NaN NaN NaN NaN NaN NaN No Data
3 1940-04-01 06:00:00 0 0 0 0 0 0 0 0 0 NaN NaN NaN NaN NaN NaN NaN NaN No Data
4 1940-05-01 06:00:00 0 0 0 0 0 0 0 0 0 NaN NaN NaN NaN NaN NaN NaN NaN No Data
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1004 2023-09-01 06:00:00 756 0 5 0 13 0 44 0 818 92.420538 5.378973 1.589242 0.611247 0.000000 0.0 0.0 0.0 Extremely dry
1005 2023-10-01 06:00:00 723 0 14 0 27 0 54 0 818 88.386308 6.601467 3.300733 1.711491 0.000000 0.0 0.0 0.0 Extremely dry
1006 2023-11-01 06:00:00 644 0 33 2 41 0 98 0 818 78.728606 11.980440 5.012225 4.034230 0.244499 0.0 0.0 0.0 Extremely dry
1007 2023-12-01 06:00:00 647 0 34 1 38 0 98 0 818 79.095355 11.980440 4.645477 4.156479 0.122249 0.0 0.0 0.0 Extremely dry
1008 2024-01-01 06:00:00 652 0 40 1 39 0 86 0 818 79.706601 10.513447 4.767726 4.889976 0.122249 0.0 0.0 0.0 Extremely dry

1009 rows × 19 columns

Generate barplot for the dataset to visuallize drought events#

# import matplotlib.pyplot as plt
# import pandas as pd
# import numpy as np
# import matplotlib.dates as mdates

# # Map the classifications to colors
# result_df['Color'] = result_df['Final Classification'].map(cmap)

# # Create the plot
# plt.figure(figsize=(12, 4))  # Adjust the width and height of the plot

# # Plot bars
# plt.bar(result_df['time'], 1, color=result_df['Color'], width=60, align='edge')  # Adjust width for visibility

# # Customize x-axis and y-axis
# plt.gca().yaxis.set_visible(False)  # Hide y-axis

# # Set x-axis major locator and formatter to show only yearly ticks
# plt.gca().xaxis.set_major_locator(mdates.YearLocator())  # Place ticks at yearly intervals
# plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y'))  # Format x-axis labels to show only year

# # Set x-axis limits
# plt.xlim(pd.Timestamp(result_df.time.min()), pd.Timestamp(result_df.time.max()))

# # Rotate x-axis labels for better readability
# plt.xticks(rotation=90)

# # Label the x-axis
# plt.xlabel('Time')

# # Set the title of the plot
# plt.title('Drought Classification Over Time')

# # Add legend
# handles = [plt.Line2D([0], [0], color=color, lw=4) for color in cmap.values()]
# labels = list(cmap.keys())
# plt.legend(handles, labels, title='Drought Classification', bbox_to_anchor=(1.05, 1), loc='upper left')

# # Adjust layout for better fit
# plt.tight_layout()

# # Show the plot
# plt.show()
import plotly.graph_objs as go
import pandas as pd
import numpy as np

# Map the classifications to colors
result_df['Color'] = result_df['Final Classification'].map(cmap)

# Create the plot
fig = go.Figure()

legend_order = [
    'Extremely dry', 'Severely dry', 'Moderately dry', 'Mildly dry',
    'Mildly wet', 'Moderately wet', 'Severely wet', 'Extremely wet',
    'No Data'
]

# Add bars
for lbl in legend_order:
    fig.add_trace(go.Bar(
        x=result_df['time'].loc[result_df['Final Classification']==lbl],
        y=[1] * len(result_df),
        name=lbl,
        marker=dict(color=result_df['Color'].loc[result_df['Final Classification']==lbl], line=dict(width=0)),
        width=60 * 24 * 60 * 60 * 1000,  # Width in milliseconds
        orientation='v',  # Vertical bars
        # name='Drought Classification'
    ))

x_min = result_df['time'].min()
x_max = result_df['time'].max()

# Update x-axis and y-axis
fig.update_xaxes(
    title_text='Time (Months)',
    tickformat='%Y',  # Format x-axis labels to show only year
    tickangle=90,  # Rotate x-axis labels
    rangeslider_visible=False,  # Hide the range slider
    type='date',# Ensure x-axis is treated as dates
    range=[x_min, x_max]
)

fig.update_yaxes(
    visible=False  # Hide y-axis
)

# Add legend
fig.update_layout(
    title='Drought Classification Over Time',
    legend_title='Drought Classification',
    legend=dict(
        x=1.05,  # Positioning the legend to the right of the plot
        y=1,
        orientation='v',
        traceorder='normal'  # Ensure legend entries are in the order they appear in the plot
    ),
    margin=dict(l=50, r=200, t=50, b=50),
    paper_bgcolor='white',
    plot_bgcolor='white',
    font=dict(
        color='#2a3f5f',
        family='sans-serif'
    ),
)

# Show the plot
fig.show()

Setup function to detect continuous periods of a condition#

A drought event is defined as continuous periods of at least midly dry condition. Periods that are separated by a threshold number of months (default to 1) count as one event.

def detect_continuous_periods_with_dates(df, binary_col, date_col, min_sep=1):
    """
    Detects continuous periods of 1s in a binary vector within a DataFrame and returns a new DataFrame
    with the start date, end date, and duration of each period.
    
    Parameters:
    - df: Input DataFrame containing the binary vector and dates.
    - binary_col: Column name for the binary vector (0s and 1s).
    - date_col: Column name for the corresponding dates.
    - min_sep: Minimum number of continuous 0s required to separate periods of 1s.
    
    Returns:
    - periods_df: A DataFrame with 'Start Date', 'End Date', and 'Duration' columns.
    """
    
    # Ensure binary_col is binary (0s and 1s)
    assert df[binary_col].isin([0, 1]).all(), "The binary column must contain only 0s and 1s."
    
    # Detect transitions in the binary column
    transitions = df[binary_col].diff().fillna(0)
    
    # Find where the vector changes from 0 to 1 (start of 1s) and 1 to 0 (end of 1s)
    start_ones = transitions == 1
    end_ones = transitions == -1
    
    # Get the indices of these transitions
    start_indices = start_ones[start_ones].index
    end_indices = end_ones[end_ones].index
    
    # If the series starts with 1s, add a start at the beginning
    if df[binary_col].iloc[0] == 1:
        start_indices = pd.Index([df.index[0]]).append(start_indices)
    
    # If the series ends with 1s, add an end at the end
    if df[binary_col].iloc[-1] == 1:
        end_indices = end_indices.append(pd.Index([df.index[-1]]))
    
    # Ensure indices are aligned
    assert len(start_indices) == len(end_indices), "Mismatched start and end periods."
    
    # Filter out periods that are too close to each other based on min_sep
    valid_periods = []
    last_end = -min_sep - 1  # Initialize last_end to be far enough back
    
    for start, end in zip(start_indices, end_indices):
        if start - last_end >= min_sep:
            valid_periods.append((start, end))
        last_end = end
    
    # Create a new DataFrame for the detected periods
    periods = []
    for start, end in valid_periods:
        start_date = df.loc[start, date_col]
        end_date = df.loc[end, date_col]
        duration = (end_date.year - start_date.year) * 12 + end_date.month - start_date.month + 1  # Duration in months
        periods.append({'Start Date': start_date, 'End Date': end_date, 'Duration': duration})
    
    periods_df = pd.DataFrame(periods)
    return periods_df

Convert the timeline to a binary vector.#

Every dry condition is marked as drought and everything else as no drought. A minimum separation of 2 months with no drought is regarded as no change.

min_sep = 1  # Minimum separation of 1 zeros to consider periods distinct

result_df['class'] = np.where((result_df['Final Classification']=='Extremely dry')|
                              (result_df['Final Classification']=='Severely dry')|
                              (result_df['Final Classification']=='Moderately dry')|
                              (result_df['Final Classification']=='Mildly dry'), 1, 0)

Find the continuous periods and calculate their duration#

periods_df = detect_continuous_periods_with_dates(result_df, binary_col='class', date_col='time', min_sep=min_sep)
periods_df
Start Date End Date Duration
0 1941-07-01 06:00:00 1942-05-01 06:00:00 11
1 1942-11-01 06:00:00 1943-06-01 06:00:00 8
2 1944-11-01 06:00:00 1945-10-01 06:00:00 12
3 1946-02-01 06:00:00 1946-09-01 06:00:00 8
4 1965-10-01 06:00:00 1966-09-01 06:00:00 12
5 1966-10-01 06:00:00 1967-04-01 06:00:00 7
6 1988-06-01 06:00:00 1988-07-01 06:00:00 2
7 1991-03-01 06:00:00 1991-04-01 06:00:00 2
8 1994-03-01 06:00:00 1994-06-01 06:00:00 4
9 1997-09-01 06:00:00 1997-12-01 06:00:00 4
10 1998-04-01 06:00:00 1998-08-01 06:00:00 5
11 2000-10-01 06:00:00 2001-03-01 06:00:00 6
12 2001-05-01 06:00:00 2001-06-01 06:00:00 2
13 2001-07-01 06:00:00 2001-08-01 06:00:00 2
14 2002-07-01 06:00:00 2003-09-01 06:00:00 15
15 2003-10-01 06:00:00 2008-05-01 06:00:00 56
16 2008-06-01 06:00:00 2024-01-01 06:00:00 188

Plot all the event durations and find the 75 percentile to find drought events with an anomalous duration#

Events with a duration more than the 75 percentile can be characterized as very anomalous. For S.Sudan one such drought event was between July 2002 and September 2003.

# def plot_duration_bar_plot(data, percentile=75):
#     percentile_9_duration = np.percentile(data.Duration, 90)
#     percentile_1_duration = np.percentile(data.Duration, 10)
#     median_duration = data.Duration.median()
    
    
#     # Create the plot
#     plt.figure(figsize=(10, 6))
    
#     # Create bars for each event
#     bars = plt.bar(data.index, data['Duration'], color='skyblue', edgecolor='black')
    
#     # Add a dashed red line for the average duration
#     plt.axhline(y=percentile_9_duration, color='red', linestyle='--', linewidth=2, label=f'{90} percentile of durations: {percentile_9_duration:.2f} months')
#     plt.axhline(y=percentile_1_duration, color='green', linestyle='--', linewidth=2, label=f'{10} percentile of durations: {percentile_1_duration:.2f} months')
#     plt.axhline(y=median_duration, color='blue', linestyle='--', linewidth=2, label=f'Median duration: {median_duration:.2f} months')
    
#     # Labeling the x-axis ticks with the start and end dates
#     xticks_labels = [f"{start.strftime('%Y-%m')} - {end.strftime('%Y-%m')}" for start, end in zip(data['Start Date'], data['End Date'])]
#     plt.xticks(ticks=np.arange(len(data.index)), labels=xticks_labels)
    
#     # Label axes
#     plt.xlabel('Events')
#     plt.ylabel('Duration (Months)')
#     plt.title('Duration of drought events in history')
    
#     # Add legend
#     plt.legend()
    
#     # Rotate x-axis labels for better readability
#     plt.xticks(rotation=45, ha='right')
    
#     # Adjust layout for better fit
#     plt.tight_layout()
    
#     # Show the plot
#     plt.show()
# plot_duration_bar_plot(periods_df)
def plot_duration_bar_plot(data, percentile=75):
    percentile_9_duration = np.percentile(data.Duration, 90)
    percentile_1_duration = np.percentile(data.Duration, 10)
    median_duration = data.Duration.median()

    # Generate x-axis labels based on the dates
    x_labels = [f"{start.strftime('%Y-%m')} - {end.strftime('%Y-%m')}" for start, end in zip(data['Start Date'], data['End Date'])]

    # Create a numerical x-axis for the plot
    x_numeric = list(range(len(x_labels)))

    # Create bars for each event
    bar = go.Bar(
        x=x_numeric,
        y=data['Duration'],
        marker=dict(color='skyblue', line=dict(color='black', width=1)),
        name='Event period',
    )
    
    # Define the x-axis range for the lines
    line_x_values = [x_numeric[0] - 1, x_numeric[-1] + 1]  # Extend beyond the first and last data point
    
    # Create lines for percentiles and median
    percentile_9_line = go.Scatter(
        x=line_x_values,
        y=[percentile_9_duration, percentile_9_duration],
        mode='lines',
        line=dict(color='red', dash='dash'),
        name=f'90th percentile: {percentile_9_duration:.2f} months'
    )
    
    percentile_1_line = go.Scatter(
        x=line_x_values,
        y=[percentile_1_duration, percentile_1_duration],
        mode='lines',
        line=dict(color='green', dash='dash'),
        name=f'10th percentile: {percentile_1_duration:.2f} months'
    )
    
    median_line = go.Scatter(
        x=line_x_values,
        y=[median_duration, median_duration],
        mode='lines',
        line=dict(color='blue', dash='dash'),
        name=f'Median: {median_duration:.2f} months'
    )
    
    # Create the layout
    layout = go.Layout(
        title='Duration of drought events',
        xaxis=dict(
            title='Events',
            tickangle=-45,
            tickmode='array',
            tickvals=x_numeric,
            ticktext=x_labels,
            range=[x_numeric[0] - 1, x_numeric[-1] + 1],  # Extend x-axis range
        ),
        yaxis=dict(title='Duration (Months)'),
        barmode='group',
        legend=dict(x=1, y=0.5, orientation='v'),
        margin=dict(l=50, r=50, t=50, b=100),
        paper_bgcolor='white', 
        plot_bgcolor='white',
        font=dict(
            color='#2a3f5f',
            family='sans-serif'
            ),
    )
    
    # Create the figure and add the traces
    fig = go.Figure(data=[bar, percentile_9_line, percentile_1_line, median_line], layout=layout)
    
    # Show the plot
    fig.show()
plot_duration_bar_plot(periods_df)

Calculate area percentage for each class for each month and aggregate for each event#

def calculate_area_percentage(monthly_data, periods):
    columns_to_use = ['Extremely dry',
                     'Severely dry', 
                     'Moderately dry',
                     'Mildly dry', 
                     'Mildly wet', 
                     'Moderately wet',
                     'Severely wet',
                     'Extremely wet']
    
    new_columns = ['Extremely dry %',
                   'Severely dry %', 
                   'Moderately dry %',
                   'Mildly dry %',
                   'Mildly wet %', 
                   'Moderately wet %',
                   'Severely wet %',
                   'Extremely wet %']
    
    rows = []
    for i, row in periods.iterrows():
        start_date = row['Start Date']
        end_date = row['End Date']
        df = monthly_data.loc[(monthly_data.time >= start_date) & (monthly_data.time <= end_date)]
        total = df[columns_to_use].sum(axis=1)
        # Calculate the percentage for each specified column
        df_percentage = df[columns_to_use].div(total, axis=0) * 100
        cols = {i[0]:i[1] for i in zip(columns_to_use, new_columns)}
        df_percentage.rename(columns=cols,inplace=-True)
        # Add the percentage columns back to the original dataframe, if needed
        df.loc[:, new_columns] = df_percentage
        rows.append(df[new_columns].mean(axis=0))
    new_df = pd.concat(rows, axis=1).T.reset_index(drop=True)
    new_df['Start Date'] = periods['Start Date']
    new_df['End Date'] = periods['End Date']
    return new_df
percentages = calculate_area_percentage(result_df, periods_df)
percentages
class Extremely dry % Severely dry % Moderately dry % Mildly dry % Mildly wet % Moderately wet % Severely wet % Extremely wet % Start Date End Date
0 0.522338 4.156479 11.658146 53.723050 29.228717 0.700156 0.011114 0.000000 1941-07-01 06:00:00 1942-05-01 06:00:00
1 2.444988 5.684597 8.175428 27.735330 48.960880 4.859413 1.589242 0.550122 1942-11-01 06:00:00 1943-06-01 06:00:00
2 1.629992 4.024042 7.620212 45.069275 38.457620 2.485738 0.550122 0.162999 1944-11-01 06:00:00 1945-10-01 06:00:00
3 0.702934 1.191932 4.003667 28.896699 61.048289 3.438264 0.473716 0.244499 1946-02-01 06:00:00 1946-09-01 06:00:00
4 5.063162 1.996740 3.249796 28.698044 44.947025 10.431948 3.942543 1.670742 1965-10-01 06:00:00 1966-09-01 06:00:00
5 3.021306 2.095704 4.016766 30.143206 40.866224 12.155082 5.780650 1.921062 1966-10-01 06:00:00 1967-04-01 06:00:00
6 0.061125 0.000000 0.000000 12.958435 75.611247 8.985330 2.322738 0.061125 1988-06-01 06:00:00 1988-07-01 06:00:00
7 0.000000 0.000000 0.122249 13.630807 57.762836 23.349633 5.012225 0.122249 1991-03-01 06:00:00 1991-04-01 06:00:00
8 1.344743 2.139364 4.828851 29.217604 61.308068 1.161369 0.000000 0.000000 1994-03-01 06:00:00 1994-06-01 06:00:00
9 0.244499 0.275061 2.414425 20.690709 54.125917 21.943765 0.305623 0.000000 1997-09-01 06:00:00 1997-12-01 06:00:00
10 0.097800 0.000000 0.855746 30.268949 63.960880 4.523227 0.293399 0.000000 1998-04-01 06:00:00 1998-08-01 06:00:00
11 0.285249 0.753871 1.915240 26.385493 70.252649 0.407498 0.000000 0.000000 2000-10-01 06:00:00 2001-03-01 06:00:00
12 0.061125 0.000000 0.061125 18.459658 81.356968 0.061125 0.000000 0.000000 2001-05-01 06:00:00 2001-06-01 06:00:00
13 0.122249 0.000000 0.061125 20.048900 78.606357 1.161369 0.000000 0.000000 2001-07-01 06:00:00 2001-08-01 06:00:00
14 0.089650 4.947025 13.504482 50.472698 30.839446 0.146699 0.000000 0.000000 2002-07-01 06:00:00 2003-09-01 06:00:00
15 0.074223 0.093870 2.984195 52.023664 42.630108 1.547765 0.628711 0.017464 2003-10-01 06:00:00 2008-05-01 06:00:00
16 9.442465 17.419237 20.608126 37.739947 12.841388 1.170473 0.498751 0.279613 2008-06-01 06:00:00 2024-01-01 06:00:00
percentages['Dry'] = percentages.loc[:, ['Extremely dry %', 'Severely dry %', 'Moderately dry %', 'Mildly dry %']].sum(axis=1)
percentages
class Extremely dry % Severely dry % Moderately dry % Mildly dry % Mildly wet % Moderately wet % Severely wet % Extremely wet % Start Date End Date Dry
0 0.522338 4.156479 11.658146 53.723050 29.228717 0.700156 0.011114 0.000000 1941-07-01 06:00:00 1942-05-01 06:00:00 70.060013
1 2.444988 5.684597 8.175428 27.735330 48.960880 4.859413 1.589242 0.550122 1942-11-01 06:00:00 1943-06-01 06:00:00 44.040342
2 1.629992 4.024042 7.620212 45.069275 38.457620 2.485738 0.550122 0.162999 1944-11-01 06:00:00 1945-10-01 06:00:00 58.343521
3 0.702934 1.191932 4.003667 28.896699 61.048289 3.438264 0.473716 0.244499 1946-02-01 06:00:00 1946-09-01 06:00:00 34.795232
4 5.063162 1.996740 3.249796 28.698044 44.947025 10.431948 3.942543 1.670742 1965-10-01 06:00:00 1966-09-01 06:00:00 39.007742
5 3.021306 2.095704 4.016766 30.143206 40.866224 12.155082 5.780650 1.921062 1966-10-01 06:00:00 1967-04-01 06:00:00 39.276982
6 0.061125 0.000000 0.000000 12.958435 75.611247 8.985330 2.322738 0.061125 1988-06-01 06:00:00 1988-07-01 06:00:00 13.019560
7 0.000000 0.000000 0.122249 13.630807 57.762836 23.349633 5.012225 0.122249 1991-03-01 06:00:00 1991-04-01 06:00:00 13.753056
8 1.344743 2.139364 4.828851 29.217604 61.308068 1.161369 0.000000 0.000000 1994-03-01 06:00:00 1994-06-01 06:00:00 37.530562
9 0.244499 0.275061 2.414425 20.690709 54.125917 21.943765 0.305623 0.000000 1997-09-01 06:00:00 1997-12-01 06:00:00 23.624694
10 0.097800 0.000000 0.855746 30.268949 63.960880 4.523227 0.293399 0.000000 1998-04-01 06:00:00 1998-08-01 06:00:00 31.222494
11 0.285249 0.753871 1.915240 26.385493 70.252649 0.407498 0.000000 0.000000 2000-10-01 06:00:00 2001-03-01 06:00:00 29.339853
12 0.061125 0.000000 0.061125 18.459658 81.356968 0.061125 0.000000 0.000000 2001-05-01 06:00:00 2001-06-01 06:00:00 18.581907
13 0.122249 0.000000 0.061125 20.048900 78.606357 1.161369 0.000000 0.000000 2001-07-01 06:00:00 2001-08-01 06:00:00 20.232274
14 0.089650 4.947025 13.504482 50.472698 30.839446 0.146699 0.000000 0.000000 2002-07-01 06:00:00 2003-09-01 06:00:00 69.013855
15 0.074223 0.093870 2.984195 52.023664 42.630108 1.547765 0.628711 0.017464 2003-10-01 06:00:00 2008-05-01 06:00:00 55.175952
16 9.442465 17.419237 20.608126 37.739947 12.841388 1.170473 0.498751 0.279613 2008-06-01 06:00:00 2024-01-01 06:00:00 85.209775
def plot_area_bar_plot(data, columns_to_sum=['Moderately dry %',
                                             'Mildly dry %',
                                             'Mildly wet %',
                                             'Moderately wet %',
                                             'Severely wet %',
                                             'Extremely wet %']):
    columns = [i for i in data.columns if '%' in i and i not in columns_to_sum]

    fig = go.Figure()
    x_axis_labels =  [f"{start.strftime('%Y-%m')} - {end.strftime('%Y-%m')}" for start, end in zip(data['Start Date'], data['End Date'])]

    # Adding bars for each category
    if columns_to_sum:
        fig.add_trace(go.Bar(
            x=x_axis_labels,
            y=data[columns_to_sum].sum(axis=1),
            name='Normal',
            marker=dict(color=cmap['Severely wet'], line=dict(width=0))
        ))
    for category in columns[::-1]:
        fig.add_trace(go.Bar(
            x=x_axis_labels,
            y=data[category],
            name=category[:-2],
            marker=dict(color=cmap[category[:-2]])
        ))
    
    # Updating the layout for stacked bar
    fig.update_layout(
        barmode='stack',  # This ensures the bars are stacked
        title='Area of each type of drought',
        xaxis=dict(title='Events',
                   tickangle=-45,  # Rotate the x-axis labels by -45 degrees
                   tickmode='array',
                   tickvals=x_axis_labels,
                   ticktext=x_axis_labels,),
        yaxis=dict(title='Percentage'),
        legend=dict(orientation='v',x=1, y=0.5),
        margin=dict(l=50, r=50, t=50, b=100),
        paper_bgcolor='white',  # Transparent background for the entire paper
        plot_bgcolor='white',
        font=dict(
            color='#2a3f5f',
            family='sans-serif'
            ),
        bargap=0
    )
    
    # Show the plot
    fig.show()
plot_area_bar_plot(percentages, columns_to_sum=[])
plot_area_bar_plot(percentages)

Plot durations for each grid point on map for drought event 1#

event_1 = periods_df.iloc[4]
event_data = spei.loc[event_1['Start Date'].isoformat():event_1['End Date'].isoformat()]
# condition = event_data < -1.5
nan_mask = np.isnan(event_data)
condition = xr.where(nan_mask, np.nan, event_data < -1.5)
occurrences_xr = condition.sum(dim='time')
occurrences_xr = occurrences_xr.rio.write_crs("EPSG:4326", inplace=True)
occurrences_xr = occurrences_xr.rio.set_spatial_dims(x_dim='lon', y_dim='lat')
occurrences_xr = occurrences_xr.rio.clip(country_shape.geometry, world.crs, drop=True)
occurrences_xr.hvplot(
    widget_type="scrubber", 
    widget_location="bottom", 
    projection=ccrs.PlateCarree(), 
    coastline='10m',
    cmap='BrBG_r',
    features=['borders']
)
# Mask area outside of aoi shape
nan_mask = np.isnan(event_data)
result = xr.where(nan_mask, np.nan, occurrences_xr)
result = result.rio.write_crs("EPSG:4326", inplace=True)
result = result.rio.set_spatial_dims(x_dim='lon', y_dim='lat')
result = result.rio.clip(country_shape.geometry, world.crs, drop=True)
df = occurrences_xr.to_dataframe().reset_index()
df = df.dropna(subset=['lat', 'lon', 'SPEI12'])

# Extract the latitude, longitude, and occurrence data
lat = df['lat'].values
lon = df['lon'].values
occurrences = df['SPEI12'].values

# Create the base map with country borders
fig = go.Figure(go.Scattergeo(
    locationmode = 'country names',
    lon = lon,
    lat = lat,
    text = occurrences,
    marker = dict(
        size = 7,
        color = occurrences, # Use occurrences for color
        colorscale = 'BrBG_r',
        showscale = True,
        colorbar=dict(title="Occurrences"),
        line=dict(width=1, color='black')
    )
))

# Update the layout for better visibility
fig.update_layout(
    # width=1200,  # Increase plot width
    # height=800,  # Increase plot height
    geo=dict(
        scope='world',
        projection_type='natural earth',
        showcoastlines=True,
        showcountries=True,
        countrycolor="Black",
        coastlinecolor="Black",
        
        # Define the region of interest by setting lat/lon bounds
        lonaxis=dict(range=[lon.min()-0.5, lon.max()+0.5]),  # Longitude bounds
        lataxis=dict(range=[lat.min()-0.5, lat.max()+0.5]),  # Latitude bounds
        
        # Optionally, you can specify the center of the map
        center=dict(lon=lon.mean(), lat=lat.mean()),  # Center of the map
    ),
    title="Monthly Occurrences",
)

# Show the plot
fig.show()

Group durations of each grid point in groups:#

  • Up to 1 month

  • Up to 3 months

  • Up to 6 months

  • More than 6 months

from plotly.subplots import make_subplots

conditions = [
    (lambda x: x < -0.5, 'Mildly dry'),
    (lambda x: x < -1, 'Moderately dry'), 
    (lambda x: x < -1.5, 'Severely dry'), 
    (lambda x: x < -2, 'Extremely dry')
]



def categorize_occurrences(x):
    if np.isnan(x):
        return 'No data'
    elif x <= 1:
        return 'Up to 1 month'
    elif x <= 3:
        return 'Up to 3 months'
    elif x <= 6:
        return 'Up to 6 months'
    else:
        return 'More than 6 months'

def add_traces_to_subplot(fig, df):
    df = df.dropna(subset=['lat', 'lon', 'category'])
    lat = df['lat'].values
    lon = df['lon'].values
    category = df['category'].values

    color_map = {
        # 'No data': 'white',
        'Up to 1 month': 'blue',
        'Up to 3 months': 'green',
        'Up to 6 months': 'orange',
        'More than 6 months': 'red'
    }
    
    # Plot each category separately to apply different colors
    for cat in color_map:
        mask = df['category'] == cat
        fig.add_trace(go.Scattergeo(
            lon=lon[mask],
            lat=lat[mask],
            text=category[mask],
            mode='markers',
            marker=dict(
                size=10,
                color=color_map[cat],
                opacity=1,
                line=dict(width=1, color='black')
            ),
            name=cat,  # Label for the legend
            showlegend=True  # Control legend visibility
        ))

for condition, description in conditions:
    
    # Apply spei condition
    data = condition(event_data).sum(dim='time')
    
    # Mask area outside of aoi shape
    nan_mask = np.isnan(data)
    result = xr.where(nan_mask, np.nan, data)
    result = result.rio.write_crs("EPSG:4326", inplace=True)
    result = result.rio.set_spatial_dims(x_dim='lon', y_dim='lat')
    result = result.rio.clip(country_shape.geometry, world.crs, drop=True)
    
    fig = go.Figure()

    # Convert xarray to Dataframe and categorize grid point
    df = result.to_dataframe().reset_index()
    df['category'] = df['SPEI12'].apply(categorize_occurrences)

    # Plot data
    add_traces_to_subplot(fig, df)

    # Update the layout: Increase the size and set the geographic scope
    fig.update_layout(
        geo=dict(
                scope='world',
                projection_type='natural earth',
                showcoastlines=True,
                showcountries=True,
                countrycolor="Black",
                coastlinecolor="Black",
                
                # Define the region of interest by setting lat/lon bounds so the plot is zoomed in the AOI
                lonaxis=dict(range=[lon.min() - 0.5, lon.max() + 0.5]),  # Longitude bounds
                lataxis=dict(range=[lat.min() - 0.5, lat.max() + 0.5]),  # Latitude bounds
                
                # Specify the center of the map
                center=dict(lon=lon.mean(), lat=lat.mean()),  # Center of the map
            ),
        title=f"Monthly Occurrences of {description}",
        margin=dict(l=1, r=1, t=80, b=5),
        legend=dict(
            x=1.05,
            y=0.5,
            traceorder='normal',
            orientation='v'
        )
    )
        
    # Show the plot
    fig.show()