Skip to content

gcages.harmonisation.common#

Common tools across different approaches

Classes:

Name Description
NotHarmonisedError

Raised when a pd.DataFrame is not harmonised

Functions:

Name Description
align_history_to_data_at_time

Align history to a given set of data for a given column

assert_harmonised

Assert that the input is harmonised

NotHarmonisedError #

Bases: ValueError

Raised when a pd.DataFrame is not harmonised

Methods:

Name Description
__init__

Initialise the error

Source code in src/gcages/harmonisation/common.py
class NotHarmonisedError(ValueError):
    """
    Raised when a [pd.DataFrame][pandas.DataFrame] is not harmonised

    Attributes
    ----------
    comparison
        Results of comparing the data and history, as passed to `__init__`

    harmonisation_time
        Expected harmonisation time, as passed to `__init__`
    """

    def __init__(
        self,
        comparison: pd.DataFrame,
        harmonisation_time: TIME_POINT,
    ) -> None:
        """
        Initialise the error

        Parameters
        ----------
        comparison
            Results of comparing the data and history

        harmonisation_time
            Expected harmonisation time
        """
        error_msg = (
            f"The DataFrame is not harmonised in {harmonisation_time}. "
            f"comparison=\n{comparison}"
        )
        super().__init__(error_msg)

        # Keep the inputs: handlers can inspect them programmatically
        # and `__reduce__` needs them to rebuild the error.
        self.comparison = comparison
        self.harmonisation_time = harmonisation_time

    def __reduce__(self) -> tuple[type, tuple[object, object]]:
        """
        Describe how to rebuild the error when copying or pickling

        The default exception reduction calls the class with `self.args`,
        which only holds the formatted message,
        so it cannot satisfy `__init__`'s two required arguments.
        Rebuilding from the original inputs keeps the error usable
        after being pickled (e.g. when raised in a worker process) or copied.

        Returns
        -------
        :
            The class and the arguments with which to call it
        """
        return (type(self), (self.comparison, self.harmonisation_time))

__init__ #

__init__(
    comparison: DataFrame, harmonisation_time: TIME_POINT
) -> None

Initialise the error

Parameters:

Name Type Description Default
comparison DataFrame

Results of comparing the data and history

required
harmonisation_time TIME_POINT

Expected harmonisation time

required
Source code in src/gcages/harmonisation/common.py
def __init__(
    self,
    comparison: pd.DataFrame,
    harmonisation_time: TIME_POINT,
) -> None:
    """
    Build the error message and initialise the underlying exception

    Parameters
    ----------
    comparison
        Outcome of comparing the data with history;
        its string form is appended to the message

    harmonisation_time
        Time at which the data was expected to be harmonised
    """
    # First sentence says what went wrong, the rest shows the evidence.
    headline = f"The DataFrame is not harmonised in {harmonisation_time}. "
    details = f"comparison=\n{comparison}"
    super().__init__(headline + details)

align_history_to_data_at_time #

align_history_to_data_at_time(
    df: TimeseriesDataFrame,
    *,
    history: TimeseriesDataFrame,
    time: Any,
) -> tuple[Series[NUMERIC_DATA], Series[NUMERIC_DATA]]

Align history to a given set of data for a given column

Parameters:

Name Type Description Default
df TimeseriesDataFrame

Data to which to align history

required
history TimeseriesDataFrame

History data to align

required
time Any

Time (i.e. column) for which to align the data

required

Returns:

Type Description
tuple[Series[NUMERIC_DATA], Series[NUMERIC_DATA]]

The time column of df and the time column of history (in that order), with history aligned to df's index

Raises:

Type Description
AssertionError

df and history could not be aligned: after left-joining history onto df's index, the aligned history contains null values

Source code in src/gcages/harmonisation/common.py
def align_history_to_data_at_time(
    df: TimeseriesDataFrame, *, history: TimeseriesDataFrame, time: Any
) -> tuple[pd.Series[NUMERIC_DATA], pd.Series[NUMERIC_DATA]]:  # type: ignore # pandas-stubs not up to date
    """
    Align history to a given set of data for a given column

    Parameters
    ----------
    df
        Data to which to align history

    history
        History data to align

    time
        Time (i.e. column) for which to align the data

    Returns
    -------
    :
        The `time` column of `df` and the `time` column of `history`
        (in that order), with `history` left-joined onto the index of `df`

    Raises
    ------
    AssertionError
        `df` and `history` could not be aligned,
        i.e. the aligned history contains null values
    """
    df_year_aligned, history_year_aligned = df[time].align(history[time], join="left")

    # Implicitly assuming that people have already checked
    # that they have history values for all timeseries in `df`,
    # so any null is an obvious issue.
    if not history_year_aligned.isnull().any():
        return df_year_aligned, history_year_aligned

    # Each entry is one sentence *without* a trailing separator:
    # the separator is added exactly once when joining below.
    msg_l = ["history did not align properly with df"]

    if df.index.names == history.index.names:
        msg_l.append(
            "history and df have the same index levels "
            f"({list(history.index.names)}). "
            "You probably need to drop some of history's index levels "
            "so alignment can happen along the levels of interest "
            "(usually dropping everything except variable and unit (or similar))"
        )

    # Unit mismatches are a common cause of failed alignment,
    # so report units which `df` has but `history` does not.
    for unit_col_guess in ["unit", "units"]:
        if (
            unit_col_guess in df.index.names
            and unit_col_guess in history.index.names
        ):
            # These two names appear verbatim in the message
            # (via the `=` f-string specifier), so they must not be renamed.
            df_units_guess = df.index.get_level_values(unit_col_guess)
            history_units_guess = history.index.get_level_values(unit_col_guess)

            differing_units = (
                df_units_guess.difference(history_units_guess).unique().tolist()
            )
            msg_l.append(
                "The following units only appear in `df`, "
                f"which might be why the data isn't aligned: {differing_units}. "
                f"{df_units_guess=} {history_units_guess=}"
            )

    msg = ". ".join(msg_l)
    raise AssertionError(msg)

assert_harmonised #

assert_harmonised(
    df: TimeseriesDataFrame,
    *,
    history: TimeseriesDataFrame,
    harmonisation_time: TIME_POINT,
    rounding: int = 10,
    df_unit_level: str = "unit",
    history_unit_level: str | None = None,
    ur: UnitRegistry | None = None,
) -> None

Assert that the input is harmonised

Parameters:

Name Type Description Default
df TimeseriesDataFrame

Data to check

required
history TimeseriesDataFrame

History to which df should be harmonised

required
harmonisation_time TIME_POINT

Time at which df should be harmonised to history

required
rounding int

Rounding to apply to the data before comparing

10
df_unit_level str

Level in df's index which has unit information

Only used if unit conversion is required

'unit'
history_unit_level str | None

Level in history's index which has unit information

If not provided, we assume this is the same as df_unit_level

Only used if unit conversion is required

None
ur UnitRegistry | None

Unit registry to use for determining unit conversions

Passed to gcages.units_helpers.convert_unit_like

Only used if unit conversion is required

None

Raises:

Type Description
NotHarmonisedError

df is not harmonised to history

Source code in src/gcages/harmonisation/common.py
def assert_harmonised(  # noqa: PLR0913
    df: TimeseriesDataFrame,
    *,
    history: TimeseriesDataFrame,
    harmonisation_time: TIME_POINT,
    rounding: int = 10,
    df_unit_level: str = "unit",
    history_unit_level: str | None = None,
    ur: pint.UnitRegistry | None = None,
) -> None:
    """
    Check that `df` matches `history` at the harmonisation time

    Parameters
    ----------
    df
        Data to check

    history
        History which `df` is expected to match at `harmonisation_time`

    harmonisation_time
        Time (i.e. column) at which `df` and `history` are compared

    rounding
        Number of decimals to which both sides are rounded before comparing

    df_unit_level
        Name of the level in `df`'s index which holds unit information

        Passed to [gcages.units_helpers.convert_unit_like][]

    history_unit_level
        Name of the level in `history`'s index which holds unit information

        Passed to [gcages.units_helpers.convert_unit_like][]
        as `target_unit_level`

    ur
        Unit registry

        Passed to [gcages.units_helpers.convert_unit_like][]

    Raises
    ------
    NotHarmonisedError
        After rounding, `df` and `history` differ at `harmonisation_time`
    """
    # Put `df` in history's units first so the values are comparable.
    unit_matched = convert_unit_like(
        df,
        target=history,
        df_unit_level=df_unit_level,
        target_unit_level=history_unit_level,
        ur=ur,
    )

    df_at_time, history_at_time = align_history_to_data_at_time(
        unit_matched, history=history, time=harmonisation_time
    )

    # Round both sides identically so tiny floating-point noise
    # is not reported as a difference.
    rounded_df = df_at_time.round(rounding)
    rounded_history = history_at_time.round(rounding)
    differences = rounded_df.compare(  # type: ignore # pandas-stubs confused
        rounded_history, result_names=("df", "history")
    )

    if differences.empty:
        return

    raise NotHarmonisedError(
        comparison=differences, harmonisation_time=harmonisation_time
    )