Source code for pycmor.std_lib.dataset_helpers

from collections import deque

import cftime
import numpy as np
import pandas as pd
import xarray as xr
from xarray.core.utils import is_scalar


def is_datetime_type(arr: np.ndarray) -> bool:
    """
    Check whether an array holds datetime-like values.

    An array counts as datetime-like when its dtype is a NumPy
    ``datetime64`` dtype, or when its first element is an instance of one
    of the cftime date types.

    Parameters
    ----------
    arr : np.ndarray
        Array-like object exposing ``dtype``, ``size`` and ``item`` (the
        attributes used below).

    Returns
    -------
    bool
        True if the array is datetime-like, False otherwise. An empty array
        whose dtype is not ``datetime64`` has no element to inspect and is
        reported as False.
    """
    # Look at the dtype explicitly rather than handing the whole array to
    # np.issubdtype, and do it first so that datetime64 arrays never need
    # their first element inspected (this also covers empty arrays).
    if np.issubdtype(arr.dtype, np.datetime64):
        return True
    # Without a first element, arr.item(0) would raise IndexError.
    if arr.size == 0:
        return False
    return isinstance(arr.item(0), tuple(cftime._cftime.DATE_TYPES.values()))
def get_time_label(ds):
    """
    Find the name of the coordinate that can act as the time label.

    Only coordinates that are datetime-like (see ``is_datetime_type``) and
    that have at least one dimension are considered. A coordinate whose name
    is one of its own dimensions takes precedence over any other candidate.

    Parameters
    ----------
    ds : xarray.Dataset
        The dataset containing coordinates to check for a time label.

    Returns
    -------
    str or None
        The name of the coordinate that is a datetime type and can serve as
        a time label, or None if no such coordinate is found.

    Example
    -------
    >>> import xarray as xr
    >>> import pandas as pd
    >>> import numpy as np
    >>> ds = xr.Dataset({'time': ('time', pd.date_range('2000-01-01', periods=10))})
    >>> get_time_label(ds)
    'time'
    >>> ds = xr.DataArray(np.ones(10), coords={'T': ('T', pd.date_range('2000-01-01', periods=10))})
    >>> get_time_label(ds)
    'T'
    >>> # The following does not have a valid time coordinate, expected to return None
    >>> da = xr.Dataset({'time': ('time', [1,2,3,4,5])})
    >>> get_time_label(da) is None
    True
    """
    dimension_match = None  # most recently seen datetime dimension coordinate
    other_match = None  # first datetime coordinate that is not a dimension
    for name, coord in ds.coords.items():
        if not is_datetime_type(coord) or not coord.dims:
            continue
        if name in coord.dims:
            dimension_match = name
        elif other_match is None:
            other_match = name
    if dimension_match is not None:
        return dimension_match
    return other_match
def has_time_axis(ds) -> bool:
    """
    Tell whether a dataset carries a time axis.

    Parameters
    ----------
    ds : xarray.Dataset or xarray.DataArray
        The dataset to check.

    Returns
    -------
    bool
        True if ``get_time_label`` finds a time label for ``ds``, False
        otherwise.
    """
    time_label = get_time_label(ds)
    return True if time_label else False
def needs_resampling(ds, timespan):
    """
    Decide whether a dataset spans more than one ``timespan`` along time.

    Parameters
    ----------
    ds : xr.Dataset or xr.DataArray
        The dataset to check.
    timespan : str
        The time span for which the dataset is to be resampled.
        10YS, 1YS, 6MS, etc.

    Returns
    -------
    bool
        True if the last time value lies beyond the first time value plus
        ``timespan``. False if ``timespan`` is empty or None, if no time
        label is found, or if the time coordinate is scalar.

    Notes
    -----
    After the time-averaging step, this function aids in determining if
    splitting into multiple files is required based on the provided
    timespan.
    """
    # None and empty values both mean "no timespan requested".
    if not timespan:
        return False
    time_label = get_time_label(ds)
    if time_label is None:
        return False
    time_axis = ds[time_label]
    if is_scalar(time_axis):
        return False
    # The string representation is needed to deal with cftime objects.
    first, last = (pd.Timestamp(str(time_axis.data[i])) for i in (0, -1))
    step = pd.tseries.frequencies.to_offset(timespan)
    return (first + step) < last
def freq_is_coarser_than_data(
    freq: str,
    ds: xr.Dataset,
    ref_time: pd.Timestamp = pd.Timestamp("1970-01-01"),
) -> bool:
    """
    Checks if the frequency is coarser than the time frequency of the xarray Dataset.

    Both frequencies are turned into time deltas by adding their offset to
    ``ref_time``, and the two deltas are compared.

    Parameters
    ----------
    freq : str
        The frequency to compare (e.g. 'M', 'D', '6H').
    ds : xr.Dataset
        The dataset containing a time coordinate.
    ref_time : pd.Timestamp, optional
        Reference timestamp used to convert frequency to a time delta.
        Defaults to the beginning of the Unix Epoch.

    Returns
    -------
    bool
        True if `freq` is coarser (covers a longer duration) than the
        dataset's frequency.

    Raises
    ------
    ValueError
        If the dataset has no valid time coordinate, or if no frequency can
        be inferred from that coordinate.
    """
    time_label = get_time_label(ds)
    if time_label is None:
        raise ValueError("The dataset does not contain a valid time coordinate.")
    time_index = ds.indexes[time_label]
    # xr.infer_freq is documented to accept CFTimeIndex as well as pandas
    # datetime indexes, so cftime-based time axes are covered too.
    data_freq = xr.infer_freq(time_index)
    if data_freq is None:
        raise ValueError(
            "Could not infer frequency from the dataset's time coordinate."
        )
    to_offset = pd.tseries.frequencies.to_offset
    delta1 = (ref_time + to_offset(freq)) - ref_time
    delta2 = (ref_time + to_offset(data_freq)) - ref_time
    return delta1 > delta2