Skip to content

gcages.assertions#

Useful assertions

Classes:

Name Description
DataIsNotAllNumericError

Raised when not all data in a pd.DataFrame is numeric

IndexIsNotMultiIndexError

Raised when the index is not a pd.MultiIndex

MissingDataForTimesError

Raised when a pd.DataFrame is missing data for expected times

MissingIndexLevelsError

Raised when a pd.DataFrame is missing expected index levels

NotAllowedMetadataValuesError

Raised when a pd.DataFrame contains disallowed metadata values

Functions:

Name Description
assert_data_is_all_numeric

Assert that all data in a pd.DataFrame is numeric

assert_has_data_for_times

Assert that a pd.DataFrame has data for the given times

assert_has_index_levels

Assert that a pd.DataFrame has all the given levels in its index

assert_index_is_multiindex

Assert that the index is a pd.MultiIndex

assert_metadata_values_all_allowed

Assert that a pd.DataFrame only contains allowed metadata values

assert_only_working_on_variable_unit_region_variations

Assert that we're only working on variations in variable, unit and region

assert_only_working_on_variable_unit_variations

Assert that we're only working on variations in variable and unit

DataIsNotAllNumericError #

Bases: ValueError

Raised when not all data in a pd.DataFrame is numeric

Methods:

Name Description
__init__

Initialise the error

Source code in src/gcages/assertions.py
class DataIsNotAllNumericError(ValueError):
    """
    Raised when not all data in a [pd.DataFrame][pandas.DataFrame] is numeric
    """

    def __init__(self, df: pd.DataFrame, non_numeric_cols: Collection[Any]) -> None:
        """
        Initialise the error

        Parameters
        ----------
        df
            [pd.DataFrame][pandas.DataFrame] in which non-numeric data was found

            Accepted as part of the API, but not used to build the message.

        non_numeric_cols
            Labels of the columns that hold non-numeric data
        """
        # `df` is deliberately unused for now.
        # Extracting the offending values themselves is not straightforward
        # (e.g. should the string "0" count as non-numeric?),
        # so only the column labels are reported.
        super().__init__(
            f"The following columns contain non-numeric data: {non_numeric_cols}"
        )

__init__ #

__init__(
    df: DataFrame, non_numeric_cols: Collection[Any]
) -> None

Initialise the error

Parameters:

Name Type Description Default
df DataFrame

pd.DataFrame containing non-numeric data

required
non_numeric_cols Collection[Any]

The columns that contain non-numeric data

required
Source code in src/gcages/assertions.py
def __init__(self, df: pd.DataFrame, non_numeric_cols: Collection[Any]) -> None:
    """
    Initialise the error

    Parameters
    ----------
    df
        [pd.DataFrame][pandas.DataFrame] in which non-numeric data was found

        Accepted as part of the API, but not used to build the message.

    non_numeric_cols
        Labels of the columns that hold non-numeric data
    """
    # `df` is deliberately unused for now.
    # Extracting the offending values themselves is not straightforward
    # (e.g. should the string "0" count as non-numeric?),
    # so only the column labels are reported.
    super().__init__(
        f"The following columns contain non-numeric data: {non_numeric_cols}"
    )

IndexIsNotMultiIndexError #

Bases: TypeError

Raised when the index is not a pd.MultiIndex

Methods:

Name Description
__init__

Initialise the error

Source code in src/gcages/assertions.py
class IndexIsNotMultiIndexError(TypeError):
    """
    Raised when the index is not a [pd.MultiIndex][pandas.MultiIndex]
    """

    def __init__(self, df: pd.DataFrame) -> None:
        """
        Initialise the error

        Parameters
        ----------
        df
            [pd.DataFrame][pandas.DataFrame] whose index has the wrong type

            The type of `df.index` is included in the error message.
        """
        super().__init__(
            f"The index is not a `pd.MultiIndex`, instead we have {type(df.index)=}"
        )

__init__ #

__init__(df: DataFrame) -> None

Initialise the error

Parameters:

Name Type Description Default
df DataFrame required
Source code in src/gcages/assertions.py
def __init__(self, df: pd.DataFrame) -> None:
    """
    Initialise the error

    Parameters
    ----------
    df
        [pd.DataFrame][pandas.DataFrame] whose index has the wrong type

        The type of `df.index` is included in the error message.
    """
    super().__init__(
        f"The index is not a `pd.MultiIndex`, instead we have {type(df.index)=}"
    )

MissingDataForTimesError #

Bases: KeyError

Raised when a pd.DataFrame is missing data for expected times

Methods:

Name Description
__init__

Initialise the error

Source code in src/gcages/assertions.py
class MissingDataForTimesError(KeyError):
    """
    Raised when a [pd.DataFrame][pandas.DataFrame] is missing data for expected times
    """

    def __init__(
        self,
        df: pd.DataFrame,
        name: str,
        missing_times: Collection[Any],
        allow_nan: bool,
    ) -> None:
        """
        Initialise the error

        Parameters
        ----------
        df
            [pd.DataFrame][pandas.DataFrame] that is missing data for expected times

        name
            Name of `df` to display in the error message

        missing_times
            Times in `df` that are missing data

            If `allow_nan` is `False`, every entry must be a column of `df`
            because the columns are selected to show the rows containing NaN.

        allow_nan
            Were NaN values allowed in the values of `times` when checking the data?

            If `True`, the message reports the times as absent from `df`
            and lists the available columns.
            If `False`, the message reports the times as containing NaN
            and shows the affected rows.
        """
        if allow_nan:
            error_msg = (
                f"{name} is missing data for the following times: "
                f"{missing_times}. "
                f"Available times: {df.columns}"
            )

        else:
            # Select with a list: pandas treats a tuple as a single column key
            # and rejects sets, so passing `missing_times` through unchanged
            # only works when the caller happens to supply a list.
            tmp = df[list(missing_times)]
            nan_view = tmp[tmp.isnull().any(axis="columns")]
            error_msg = (
                f"{name} has NaNs for the following times: {missing_times}. "
                f"Rows with Nans:\n{nan_view}"
            )

        super().__init__(error_msg)

__init__ #

__init__(
    df: DataFrame,
    name: str,
    missing_times: Collection[Any],
    allow_nan: bool,
) -> None

Initialise the error

Parameters:

Name Type Description Default
df DataFrame

pd.DataFrame that is missing data for expected times

required
name str

Name of df to display in the error message

required
missing_times Collection[Any]

Times in df that are missing data

required
allow_nan bool

Were NaN values allowed in the values of times when checking the data?

required
Source code in src/gcages/assertions.py
def __init__(
    self,
    df: pd.DataFrame,
    name: str,
    missing_times: Collection[Any],
    allow_nan: bool,
) -> None:
    """
    Initialise the error

    Parameters
    ----------
    df
        [pd.DataFrame][pandas.DataFrame] that is missing data for expected times

    name
        Name of `df` to display in the error message

    missing_times
        Times in `df` that are missing data

        If `allow_nan` is `False`, every entry must be a column of `df`
        because the columns are selected to show the rows containing NaN.

    allow_nan
        Were NaN values allowed in the values of `times` when checking the data?

        If `True`, the message reports the times as absent from `df`
        and lists the available columns.
        If `False`, the message reports the times as containing NaN
        and shows the affected rows.
    """
    if allow_nan:
        error_msg = (
            f"{name} is missing data for the following times: "
            f"{missing_times}. "
            f"Available times: {df.columns}"
        )

    else:
        # Select with a list: pandas treats a tuple as a single column key
        # and rejects sets, so passing `missing_times` through unchanged
        # only works when the caller happens to supply a list.
        tmp = df[list(missing_times)]
        nan_view = tmp[tmp.isnull().any(axis="columns")]
        error_msg = (
            f"{name} has NaNs for the following times: {missing_times}. "
            f"Rows with Nans:\n{nan_view}"
        )

    super().__init__(error_msg)

MissingIndexLevelsError #

Bases: KeyError

Raised when a pd.DataFrame is missing expected index levels

Methods:

Name Description
__init__

Initialise the error

Source code in src/gcages/assertions.py
class MissingIndexLevelsError(KeyError):
    """
    Raised when a [pd.DataFrame][pandas.DataFrame] is missing expected index levels
    """

    def __init__(
        self,
        df: pd.DataFrame,
        missing_levels: Collection[Any],
    ) -> None:
        """
        Initialise the error

        Parameters
        ----------
        df
            [pd.DataFrame][pandas.DataFrame] whose index lacks the expected levels

            Its `index.names` are listed in the message as the available levels.

        missing_levels
            Levels that are missing from `df.index`
        """
        super().__init__(
            f"The DataFrame is missing the following index levels: {missing_levels}. "
            f"Available index levels: {df.index.names}"
        )

__init__ #

__init__(
    df: DataFrame, missing_levels: Collection[Any]
) -> None

Initialise the error

Parameters:

Name Type Description Default
df DataFrame

pd.DataFrame that is missing expected index levels

required
missing_levels Collection[Any]

Levels that are missing from df.index

required
Source code in src/gcages/assertions.py
def __init__(
    self,
    df: pd.DataFrame,
    missing_levels: Collection[Any],
) -> None:
    """
    Initialise the error

    Parameters
    ----------
    df
        [pd.DataFrame][pandas.DataFrame] whose index lacks the expected levels

        Its `index.names` are listed in the message as the available levels.

    missing_levels
        Levels that are missing from `df.index`
    """
    super().__init__(
        f"The DataFrame is missing the following index levels: {missing_levels}. "
        f"Available index levels: {df.index.names}"
    )

NotAllowedMetadataValuesError #

Bases: ValueError

Raised when a pd.DataFrame contains disallowed metadata values

Methods:

Name Description
__init__

Initialise the error

Source code in src/gcages/assertions.py
class NotAllowedMetadataValuesError(ValueError):
    """
    Raised when a [pd.DataFrame][pandas.DataFrame] contains disallowed metadata values
    """

    def __init__(
        self,
        df: pd.DataFrame,
        metadata_key: Any,
        disallowed_values: Collection[Any],
        allowed_values: Collection[Any],
    ) -> None:
        """
        Initialise the error

        Parameters
        ----------
        df
            [pd.DataFrame][pandas.DataFrame] that contains disallowed metadata values

            Accepted as part of the API, but not used to build the message.

        metadata_key
            The metadata key which is being considered (e.g. "variable", "unit")

        disallowed_values
            The values which are not allowed but appear in `df`

        allowed_values
            The values which are allowed for `metadata_key`
        """
        super().__init__(
            f"The DataFrame contains disallowed values for {metadata_key}: "
            f"{disallowed_values}. "
            f"Allowed values: {allowed_values}"
        )

__init__ #

__init__(
    df: DataFrame,
    metadata_key: Any,
    disallowed_values: Collection[Any],
    allowed_values: Collection[Any],
) -> None

Initialise the error

Parameters:

Name Type Description Default
df DataFrame

pd.DataFrame that contains disallowed metadata values

required
metadata_key Any

The metadata key which is being considered (e.g. "variable", "unit")

required
disallowed_values Collection[Any]

The values which are not allowed but appear in df

required
allowed_values Collection[Any]

The values which are allowed for metadata_key

required
Source code in src/gcages/assertions.py
def __init__(
    self,
    df: pd.DataFrame,
    metadata_key: Any,
    disallowed_values: Collection[Any],
    allowed_values: Collection[Any],
) -> None:
    """
    Initialise the error

    Parameters
    ----------
    df
        [pd.DataFrame][pandas.DataFrame] that contains disallowed metadata values

        Accepted as part of the API, but not used to build the message.

    metadata_key
        The metadata key which is being considered (e.g. "variable", "unit")

    disallowed_values
        The values which are not allowed but appear in `df`

    allowed_values
        The values which are allowed for `metadata_key`
    """
    super().__init__(
        f"The DataFrame contains disallowed values for {metadata_key}: "
        f"{disallowed_values}. "
        f"Allowed values: {allowed_values}"
    )

assert_data_is_all_numeric #

assert_data_is_all_numeric(df: DataFrame) -> None

Assert that all data in a pd.DataFrame is numeric

Parameters:

Name Type Description Default
df DataFrame

pd.DataFrame to check

required

Raises:

Type Description
DataIsNotAllNumericError

If there are columns in df that are not numeric

Source code in src/gcages/assertions.py
def assert_data_is_all_numeric(df: pd.DataFrame) -> None:
    """
    Assert that all data in a [pd.DataFrame][pandas.DataFrame] is numeric

    Parameters
    ----------
    df
        [pd.DataFrame][pandas.DataFrame] to check

    Raises
    ------
    DataIsNotAllNumericError
        If there are columns in `df` that are not numeric.

        The labels of all such columns are passed to the error as a tuple.
    """
    offending_cols = []
    for col in df:
        if is_numeric_dtype(df[col]):
            continue

        offending_cols.append(col)

    if offending_cols:
        raise DataIsNotAllNumericError(df=df, non_numeric_cols=tuple(offending_cols))

assert_has_data_for_times #

assert_has_data_for_times(
    df: DataFrame,
    name: str,
    times: Iterable[Any],
    allow_nan: bool,
) -> None

Assert that a pd.DataFrame has data for the given times

Parameters:

Name Type Description Default
df DataFrame

pd.DataFrame to check

required
name str

Name of df to display in the error message

required
times Iterable[Any]

Times (i.e. columns) that we expect to have data in df

required
allow_nan bool

Are NaN values allowed in the values of times (or should all data be non-NaN)?

required

Raises:

Type Description
MissingDataForTimesError

The data in df does not contain all times in times.

If not allow_nan, this will also be raised if any of the data in df contains NaN for a time in times.

Source code in src/gcages/assertions.py
def assert_has_data_for_times(
    df: pd.DataFrame, name: str, times: Iterable[Any], allow_nan: bool
) -> None:
    """
    Assert that a [pd.DataFrame][pandas.DataFrame] has data for the given times

    Parameters
    ----------
    df
        [pd.DataFrame][pandas.DataFrame] to check

    name
        Name of `df` to display in the error message

    times
        Times (i.e. columns) that we expect to have data in `df`

        Any iterable is accepted, including one-shot iterators such as generators.

    allow_nan
        Are NaN values allowed in the values of `times` (or should all data be non-NaN)?

    Raises
    ------
    MissingDataForTimesError
        The data in `df` does not contain all times in `times`.

        If `not allow_nan`, this will also be raised if any of the data in `df`
        contains NaN for a time in `times`.
    """
    # `times` is walked twice below (column presence, then NaN check).
    # Materialise it once so that a one-shot iterator is not already exhausted
    # by the time the NaN check runs, which would make that check pass silently.
    times_to_check = list(times)

    missing_times = [v for v in times_to_check if v not in df.columns]
    if missing_times:
        raise MissingDataForTimesError(
            df=df,
            name=name,
            missing_times=missing_times,
            # Failed before we even considered NaN
            allow_nan=True,
        )

    if not allow_nan:
        nan_times = [v for v in times_to_check if df[v].isnull().any()]
        if nan_times:
            raise MissingDataForTimesError(
                df=df, name=name, missing_times=nan_times, allow_nan=allow_nan
            )

assert_has_index_levels #

assert_has_index_levels(
    df: DataFrame, levels: Iterable[Any]
) -> None

Assert that a pd.DataFrame has all the given levels in its index

Parameters:

Name Type Description Default
df DataFrame

pd.DataFrame to check

required
levels Iterable[Any]

Levels that we expect to be in the index of df

required

Raises:

Type Description
MissingIndexLevelsError

The index of df does not contain all levels in levels

Source code in src/gcages/assertions.py
def assert_has_index_levels(df: pd.DataFrame, levels: Iterable[Any]) -> None:
    """
    Assert that a [pd.DataFrame][pandas.DataFrame] has all the given levels in its index

    Parameters
    ----------
    df
        [pd.DataFrame][pandas.DataFrame] to check

    levels
        Levels that we expect to be in the index of `df`

    Raises
    ------
    MissingIndexLevelsError
        The index of `df` does not contain all levels in `levels`.

        All absent levels are reported together, in the order given in `levels`.
    """
    absent_levels = []
    for level in levels:
        if level in df.index.names:
            continue

        absent_levels.append(level)

    if absent_levels:
        raise MissingIndexLevelsError(df=df, missing_levels=absent_levels)

assert_index_is_multiindex #

assert_index_is_multiindex(df: DataFrame) -> None

Assert that the index is a pd.MultiIndex

Parameters:

Name Type Description Default
df DataFrame

pd.DataFrame to check

required

Raises:

Type Description
IndexIsNotMultiIndexError

The index of df is not a pd.MultiIndex

Source code in src/gcages/assertions.py
def assert_index_is_multiindex(df: pd.DataFrame) -> None:
    """
    Assert that the index is a [pd.MultiIndex][pandas.MultiIndex]

    Parameters
    ----------
    df
        [pd.DataFrame][pandas.DataFrame] to check

    Raises
    ------
    IndexIsNotMultiIndexError
        The index of `df` is not a [pd.MultiIndex][pandas.MultiIndex]
    """
    if isinstance(df.index, pd.MultiIndex):
        return

    raise IndexIsNotMultiIndexError(df)

assert_metadata_values_all_allowed #

assert_metadata_values_all_allowed(
    df: DataFrame,
    metadata_key: Any,
    allowed_values: Collection[Any],
) -> None

Assert that a pd.DataFrame only contains allowed metadata values

Parameters:

Name Type Description Default
df DataFrame

pd.DataFrame to check

required
metadata_key Any

The metadata key to check (e.g. "variable", "unit")

required
allowed_values Collection[Any]

The values which are allowed for this metadata key

required

Raises:

Type Description
NotAllowedMetadataValuesError

There is metadata for metadata_key in df that is not in allowed_values.

Source code in src/gcages/assertions.py
def assert_metadata_values_all_allowed(
    df: pd.DataFrame, metadata_key: Any, allowed_values: Collection[Any]
) -> None:
    """
    Assert that a [pd.DataFrame][pandas.DataFrame] only contains allowed metadata values

    Parameters
    ----------
    df
        [pd.DataFrame][pandas.DataFrame] to check

        The metadata is read from the level `metadata_key` of `df.index`.

    metadata_key
        The metadata key to check (e.g. "variable", "unit")

    allowed_values
        The values which are allowed for this metadata key

    Raises
    ------
    NotAllowedMetadataValuesError
        There is metadata for `metadata_key` in `df` that is not in `allowed_values`.
    """
    # Only the distinct values need checking
    present_values = df.index.get_level_values(metadata_key).unique()

    disallowed_values = []
    for value in present_values:
        if value not in allowed_values:
            disallowed_values.append(value)

    if not disallowed_values:
        return

    raise NotAllowedMetadataValuesError(
        df=df,
        metadata_key=metadata_key,
        disallowed_values=disallowed_values,
        allowed_values=allowed_values,
    )

assert_only_working_on_variable_unit_region_variations #

assert_only_working_on_variable_unit_region_variations(
    indf: DataFrame,
) -> None

Assert that we're only working on variations in variable, unit and region

In other words, we don't have variations in scenarios, models etc.

Parameters:

Name Type Description Default
indf DataFrame

Data to verify

required

Raises:

Type Description
AssertionError

There are variations in columns other than variable, unit and region

Source code in src/gcages/assertions.py
def assert_only_working_on_variable_unit_region_variations(indf: pd.DataFrame) -> None:
    """
    Assert that we're only working on variations in variable, unit and region

    In other words, we don't have variations in scenarios, models etc.

    Parameters
    ----------
    indf
        Data to verify

        Its index must contain the levels "variable", "unit" and "region".

    Raises
    ------
    AssertionError
        There are variations in columns other than variable, unit and region
    """
    allowed_levels = ["variable", "unit", "region"]
    index = indf.index

    # `droplevel` refuses to drop every level of an index
    # (pandas raises `ValueError`: at least one level must be left).
    # If the index consists of exactly the allowed levels,
    # there is nothing else that could vary, so the assertion holds trivially.
    only_allowed_levels = index.nlevels == len(allowed_levels) and set(
        index.names
    ) == set(allowed_levels)
    if only_allowed_levels:
        return

    variations_in_other_cols = index.droplevel(allowed_levels).unique()
    if len(variations_in_other_cols) > 1:
        msg = f"variations_in_other_cols=\n{variations_in_other_cols}"
        raise AssertionError(msg)

assert_only_working_on_variable_unit_variations #

assert_only_working_on_variable_unit_variations(
    indf: DataFrame,
) -> None

Assert that we're only working on variations in variable and unit

In other words, we don't have variations in scenarios, models etc.

Parameters:

Name Type Description Default
indf DataFrame

Data to verify

required

Raises:

Type Description
AssertionError

There are variations in columns other than variable and unit

Source code in src/gcages/assertions.py
def assert_only_working_on_variable_unit_variations(indf: pd.DataFrame) -> None:
    """
    Assert that we're only working on variations in variable and unit

    In other words, we don't have variations in scenarios, models etc.

    Parameters
    ----------
    indf
        Data to verify

        Its index must contain the levels "variable" and "unit".

    Raises
    ------
    AssertionError
        There are variations in columns other than variable and unit
    """
    allowed_levels = ["variable", "unit"]
    index = indf.index

    # `droplevel` refuses to drop every level of an index
    # (pandas raises `ValueError`: at least one level must be left).
    # If the index consists of exactly the allowed levels,
    # there is nothing else that could vary, so the assertion holds trivially.
    only_allowed_levels = index.nlevels == len(allowed_levels) and set(
        index.names
    ) == set(allowed_levels)
    if only_allowed_levels:
        return

    variations_in_other_cols = index.droplevel(allowed_levels).unique()
    if len(variations_in_other_cols) > 1:
        msg = f"variations_in_other_cols=\n{variations_in_other_cols}"
        raise AssertionError(msg)