#!/usr/bin/env python
"""
Time Averaging
==============
This module contains functions for time averaging of data arrays.
The approximate interval for time averaging is prescribed in the CMOR tables,
using the key ``'approx_interval'``. This information is also provided
within the library.
Functions
---------
_get_time_method(frequency: str) -> str:
Determine the time method based on the frequency string from
rule.data_request_variable.frequency.
_frequency_from_approx_interval(interval: str) -> str:
Convert an interval expressed in days to a frequency string.
timeavg(da: xr.DataArray, rule: Dict) -> xr.DataArray:
Time averages data with respect to time-method (mean/climatology/instant.)
Module Variables
----------------
_IGNORED_CELL_METHODS : list
List of cell_methods to ignore when calculating time averages.
"""
import re
import pandas as pd
import xarray as xr
from ..core.logging import logger
[docs]
def _get_time_method(frequency: str) -> str:
"""
Determine the time method based on the frequency string from CMIP6 table for
a specific variable (rule.data_request_variable.frequency).
The type of time method influences how the data is processed for time averaging.
Parameters
----------
frequency : str
The frequency string from CMIP6 tables (example: "mon").
Returns
-------
str
The corresponding time method ('INSTANTANEOUS', 'CLIMATOLOGY', or 'MEAN').
"""
if frequency.endswith("Pt"):
return "INSTANTANEOUS"
if frequency.endswith("C") or frequency.endswith("CM"):
return "CLIMATOLOGY"
return "MEAN"
[docs]
def _frequency_from_approx_interval(interval: str):
"""
Convert an interval expressed in days to a frequency string.
This function takes an interval expressed in days and converts it to a frequency string
in a suitable time unit (decade, year, month, day, hour, minute, second, millisecond).
The conversion is based on an approximate number of days for each time unit.
Parameters
----------
interval : str
The interval expressed in days.
Returns
-------
str
The frequency string in a suitable time unit.
Raises
------
ValueError
If the interval cannot be converted to a float.
"""
try:
interval = float(interval)
except ValueError:
raise ValueError(f"Invalid interval: {interval}")
# NOTE: A reference date is needed to calculate
# datetime diffs. We use the Unix epoch as
# an arbitrary reference date stamp:
ref = pd.Timestamp("1970-01-01")
dt = pd.Timedelta(interval, unit="d")
dt = dt.round(freq="s")
# handle special case of 60 days
if dt.days == 60:
dt = pd.Timedelta(59, unit="d")
ts = ref + dt
year = ts.year - ref.year
# account leap years, add deficit days
extra_days, reminder = divmod(year, 4)
if extra_days or reminder:
extra_days = extra_days + reminder // 2
ts = ts + pd.Timedelta(extra_days, unit="d")
year = ts.year - ref.year
month = ts.month - ref.month
day = ts.day - ref.day
hour = ts.hour - ref.hour
minute = ts.minute - ref.minute
second = ts.second - ref.second
result = []
if year:
result.append(f"{year}YS")
if month:
result.append(f"{month}MS")
if day:
if day == 30 and month == 0:
result.append("1MS")
else:
result.append(f"{day}D")
if hour:
result.append(f"{hour}h")
if minute:
result.append(f"{minute}m")
if second:
result.append(f"{second}s")
return "".join(result)
[docs]
def timeavg(da: xr.DataArray, rule):
"""
Time averages data with respect to time-method (mean/climatology/instant.)
This function takes a data array and a rule, computes the timespan of the data
array, and then performs time averaging based on the time method specified in the
rule. The time methods can be ``"INSTANTANEOUS"``, ``"MEAN"``, or ``"CLIMATOLOGY"``.
For ``"MEAN"`` time method, the timestamps can be adjusted using the ``adjust_timestamp``
parameter in the rule dict.
This can be either:
- A float between 0 and 1 representing the position within each period (e.g., 0.5 for mid-point)
- A string preset: "first"/"start" (0.0), "last"/"end" (1.0), "mid"/"middle" (0.5)
- A pandas offset string (e.g., "2d" for 2 days offset)
This feature is useful for setting consistent mid-month dates by setting
``adjust_timestamp`` to "14d".
Parameters
----------
da : xr.DataArray
The data array to compute the timespan for.
rule : dict
The rule dict containing the time method and other parameters.
For "MEAN" time method, can include 'adjust_timestamp' to control timestamp positioning.
Returns
-------
xr.DataArray
The time averaged data array.
"""
drv = rule.data_request_variable
approx_interval = drv.table_header.approx_interval
frequency_str = _frequency_from_approx_interval(approx_interval)
logger.debug(f"{approx_interval=} {frequency_str=}")
# attach the frequency_str to rule, it is referenced when creating file name
rule.frequency_str = frequency_str
time_method = _get_time_method(drv.frequency)
rule.time_method = time_method
if time_method == "INSTANTANEOUS":
ds = da.resample(time=frequency_str).first()
elif time_method == "MEAN":
ds = da.resample(time=frequency_str).mean()
offset = rule.get("adjust_timestamp", None)
offset_presets = {
"first": 0,
"start": 0,
"last": 1,
"end": 1,
"mid": 0.5,
"middle": 0.5,
}
offset = offset_presets.get(offset, offset)
if offset is None:
return ds
try:
offset = float(offset)
except (ValueError, TypeError):
# dealing with a alphanumerical string. example: "14days"
try:
offset = pd.to_timedelta(offset)
except ValueError:
# don't know how to deal with it so raise the error
# supporing datetime kind of string is a bit complex.
# at the moment, skip supporting it.
raise ValueError(f"offset ({offset}) can not converted to timedelta")
else:
ds["time"] = ds.time.values + offset
return ds
else:
if offset == 0.0:
return ds
elif "MS" in frequency_str or "YS" in frequency_str:
timestamps = []
magnitude = re.search(r"(\d+(?:\.\d+)?)?", frequency_str).group(0) or 1
magnitude = float(magnitude)
if "MS" in frequency_str:
for timestamp, grp in da.resample(time=frequency_str):
ndays = grp.time.dt.days_in_month.values[0] * magnitude
# NOTE: removing a day is requied to avoid overflow of the interval into next month
new_offset = pd.to_timedelta(
f"{ndays}d"
) * offset - pd.to_timedelta("1d")
timestamp = timestamp + new_offset
timestamps.append(timestamp)
elif "YS" in frequency_str:
for timestamp, grp in da.resample(time=frequency_str):
ndays = grp.time.dt.days_in_year.values[0] * magnitude
new_offset = pd.to_timedelta(
f"{ndays}d"
) * offset - pd.to_timedelta("1d")
timestamp = timestamp + new_offset
timestamps.append(timestamp)
else:
print(
"It is not possible to reach this branch."
"If you are here, know that Pycmor has gone nuts."
f"{frequency_str=} {offset=}"
)
ds["time"] = timestamps
return ds
else:
new_offset = pd.to_timedelta(frequency_str) * offset
ds["time"] = ds.time.values + new_offset
return ds
elif time_method == "CLIMATOLOGY":
if drv.frequency == "monC":
ds = da.groupby("time.month").mean("time")
elif drv.frequency == "1hrCM":
ds = da.groupby("time.hour").mean("time")
else:
raise ValueError(
f"Unknown Climatology {drv.frequency} in Table {drv.table_header.table_id}"
)
else:
raise ValueError(f"Unknown time method: {time_method}")
return ds
[docs]
def custom_resample(df, freq="M", offset=0.5, func="mean"):
"""
Resample a DataFrame and place timestamps at a custom offset within each period.
Parameters
----------
df : DataFrame
DataFrame with a DatetimeIndex
freq : str
Frequency string (e.g., 'M' for month, 'Y' for year)
offset : float
Float between 0 and 1, representing the position within each period
func : str
Resampling function (e.g., 'mean', 'sum', 'max')
Returns
-------
DataFrame
Resampled DataFrame with adjusted timestamps
Examples
--------
First, set up our imports and random seed:
>>> import numpy as np
>>> import pandas as pd
>>> rng = np.random.default_rng(42)
>>> date_rng = pd.date_range(start="2023-01-01", end="2023-12-31", freq="D")
>>> df = pd.DataFrame({"value": rng.random(len(date_rng))}, index=date_rng)
Test mid-month resampling:
>>> df_month_mid = custom_resample(df, freq="ME", offset=0.5)
>>> print(df_month_mid.head())
value
2023-01-16 00:00:00 0.565127
2023-02-14 12:00:00 0.484111
2023-03-16 00:00:00 0.434221
2023-04-15 12:00:00 0.510354
2023-05-16 00:00:00 0.443399
Test mid-year resampling:
>>> df_year_mid = custom_resample(df, freq="YE", offset=0.5)
>>> print(df_year_mid)
value
2023-07-02 0.492457
Test mid-week resampling:
>>> df_week_mid = custom_resample(df, freq="W", offset=0.5)
>>> print(df_week_mid.head())
value
2023-01-01 0.773956
2023-01-05 0.658835
2023-01-12 0.540872
2023-01-19 0.488221
2023-01-26 0.500237
Test one-third through each month:
>>> df_month_third = custom_resample(df, freq="ME", offset=1/3)
>>> print(df_month_third.head())
value
2023-01-11 00:00:00 0.565127
2023-02-10 00:00:00 0.484111
2023-03-11 00:00:00 0.434221
2023-04-10 16:00:00 0.510354
2023-05-11 00:00:00 0.443399
Test quarter-end resampling:
>>> df_quarter_end = custom_resample(df, freq="QE", offset=1)
>>> print(df_quarter_end)
value
2023-03-31 0.494832
2023-06-30 0.496207
2023-09-30 0.461806
2023-12-31 0.517077
Test with irregular time series:
>>> irregular_dates = pd.date_range("2023-01-01", periods=100, freq="D").tolist()
>>> irregular_dates += pd.date_range("2023-05-01", periods=50, freq="2D").tolist()
>>> irregular_dates += pd.date_range("2023-07-01", periods=30, freq="3D").tolist()
>>> df_irregular = pd.DataFrame({"value": rng.random(len(irregular_dates))}, index=irregular_dates)
>>> df_irregular_month = custom_resample(df_irregular, freq="ME", offset=0.5)
>>> print(df_irregular_month.head())
value
2023-01-16 00:00:00 0.543549
2023-02-14 12:00:00 0.485275
2023-03-16 00:00:00 0.513365
2023-04-05 12:00:00 0.558554
2023-05-16 00:00:00 0.447175
"""
# Perform the resampling
resampled = getattr(df.resample(freq), func)()
# Adjust the timestamps
new_index = []
for name, group in df.groupby(pd.Grouper(freq=freq)):
if not group.empty:
period_start = group.index[0]
period_end = group.index[-1]
new_timestamp = period_start + (period_end - period_start) * offset
new_index.append(new_timestamp)
resampled.index = pd.DatetimeIndex(new_index)
return resampled
_IGNORED_CELL_METHODS = """
area: depth: time: mean
area: mean
area: mean (comment: over land and sea ice) time: point
area: mean time: maximum
area: mean time: maximum within days time: mean over days
area: mean time: mean within days time: mean over days
area: mean time: mean within hours time: maximum over hours
area: mean time: mean within years time: mean over years
area: mean time: minimum
area: mean time: minimum within days time: mean over days
area: mean time: point
area: mean time: sum
area: mean where crops time: maximum
area: mean where crops time: maximum within days time: mean over days
area: mean where crops time: minimum
area: mean where crops time: minimum within days time: mean over days
area: mean where grounded_ice_sheet
area: mean where ice_free_sea over sea time: mean
area: mean where ice_sheet
area: mean where land
area: mean where land over all_area_types time: mean
area: mean where land over all_area_types time: point
area: mean where land over all_area_types time: sum
area: mean where land time: mean
area: mean where land time: mean (with samples weighted by snow mass)
area: mean where land time: point
area: mean where sea
area: mean where sea depth: sum where sea (top 100m only) time: mean
area: mean where sea depth: sum where sea time: mean
area: mean where sea time: mean
area: mean where sea time: point
area: mean where sea_ice (comment: mask=siconc) time: point
area: mean where sector time: point
area: mean where snow over sea_ice area: time: mean where sea_ice
area: point
area: point time: point
area: sum
area: sum where ice_sheet time: mean
area: sum where sea time: mean
area: time: mean
area: time: mean (comment: over land and sea ice)
area: time: mean where cloud
area: time: mean where crops (comment: mask=cropFrac)
area: time: mean where floating_ice_shelf (comment: mask=sftflf)
area: time: mean where grounded_ice_sheet (comment: mask=sfgrlf)
area: time: mean where ice_sheet
area: time: mean where natural_grasses (comment: mask=grassFrac)
area: time: mean where pastures (comment: mask=pastureFrac)
area: time: mean where sea_ice (comment: mask=siconc)
area: time: mean where sea_ice (comment: mask=siconca)
area: time: mean where sea_ice (comment: mask=siitdconc)
area: time: mean where sea_ice_melt_pond (comment: mask=simpconc)
area: time: mean where sea_ice_ridges (comment: mask=sirdgconc)
area: time: mean where sector
area: time: mean where shrubs (comment: mask=shrubFrac)
area: time: mean where snow (comment: mask=snc)
area: time: mean where trees (comment: mask=treeFrac)
area: time: mean where unfrozen_soil
area: time: mean where vegetation (comment: mask=vegFrac)
longitude: mean time: mean
longitude: mean time: point
longitude: sum (comment: basin sum [along zig-zag grid path]) depth: sum time: mean
time: mean
time: mean grid_longitude: mean
time: point
""".strip().split(
"\n"
)
"""list: cell_methods to ignore when calculating time averages"""