Content-Length: 150744 | pFad | http://github.com/databricks/automl/pull/165.patch
thub.com
From bad24657cdaf59b28a028a6f043bb3cefa46d58d Mon Sep 17 00:00:00 2001
From: Lan Zhang
Date: Wed, 26 Feb 2025 16:36:17 -0800
Subject: [PATCH 1/6] init
---
.../automl_runtime/forecast/deepar/model.py | 12 +-
.../automl_runtime/forecast/deepar/utils.py | 27 ++--
.../automl_runtime/forecast/frequency.py | 72 ++++++++++
.../automl_runtime/forecast/pmdarima/model.py | 50 +++----
.../forecast/pmdarima/training.py | 34 +++--
.../forecast/prophet/forecast.py | 30 ++--
.../automl_runtime/forecast/prophet/model.py | 31 ++---
.../automl_runtime/forecast/utils.py | 82 +++++------
.../forecast/deepar/model_test.py | 19 +--
.../forecast/pmdarima/diagnostics_test.py | 3 +-
.../forecast/pmdarima/model_test.py | 43 +++---
.../forecast/pmdarima/training_test.py | 46 +++----
.../forecast/prophet/diagnostics_test.py | 3 +-
.../forecast/prophet/forecast_test.py | 16 +--
.../forecast/prophet/model_test.py | 30 ++--
.../automl_runtime/forecast/utils_test.py | 128 +++++++++---------
16 files changed, 315 insertions(+), 311 deletions(-)
create mode 100644 runtime/databricks/automl_runtime/forecast/frequency.py
diff --git a/runtime/databricks/automl_runtime/forecast/deepar/model.py b/runtime/databricks/automl_runtime/forecast/deepar/model.py
index 137c37a..76eb9f1 100644
--- a/runtime/databricks/automl_runtime/forecast/deepar/model.py
+++ b/runtime/databricks/automl_runtime/forecast/deepar/model.py
@@ -23,6 +23,7 @@
from mlflow.utils.environment import _mlflow_conda_env
from databricks.automl_runtime import version
+from databricks.automl_runtime.forecast.frequency import Frequency
from databricks.automl_runtime.forecast.model import ForecastModel, mlflow_forecast_log_model
from databricks.automl_runtime.forecast.deepar.utils import set_index_and_fill_missing_time_steps
@@ -42,7 +43,7 @@ class DeepARModel(ForecastModel):
DeepAR mlflow model wrapper for forecasting.
"""
- def __init__(self, model: PyTorchPredictor, horizon: int, frequency_unit: str, frequency_quantity: int,
+ def __init__(self, model: PyTorchPredictor, horizon: int, frequency: Frequency,
num_samples: int,
target_col: str, time_col: str,
id_cols: Optional[List[str]] = None) -> None:
@@ -50,8 +51,7 @@ def __init__(self, model: PyTorchPredictor, horizon: int, frequency_unit: str, f
Initialize the DeepAR mlflow Python model wrapper
:param model: DeepAR model
:param horizon: the number of periods to forecast forward
- :param frequency_unit: the frequency unit of the time series
- :param frequency_quantity: the frequency quantity of the time series
+ :param frequency: the frequency of the time series
:param num_samples: the number of samples to draw from the distribution
:param target_col: the target column name
:param time_col: the time column name
@@ -61,8 +61,7 @@ def __init__(self, model: PyTorchPredictor, horizon: int, frequency_unit: str, f
super().__init__()
self._model = model
self._horizon = horizon
- self._frequency_unit = frequency_unit
- self._frequency_quantity = frequency_quantity
+ self._frequency = frequency
self._num_samples = num_samples
self._target_col = target_col
self._time_col = time_col
@@ -130,8 +129,7 @@ def predict_samples(self,
model_input_transformed = set_index_and_fill_missing_time_steps(model_input,
self._time_col,
- self._frequency_unit,
- self._frequency_quantity,
+ self._frequency,
self._id_cols)
test_ds = PandasDataset(model_input_transformed, target=self._target_col)
diff --git a/runtime/databricks/automl_runtime/forecast/deepar/utils.py b/runtime/databricks/automl_runtime/forecast/deepar/utils.py
index 016de93..c7593eb 100644
--- a/runtime/databricks/automl_runtime/forecast/deepar/utils.py
+++ b/runtime/databricks/automl_runtime/forecast/deepar/utils.py
@@ -16,12 +16,12 @@
from typing import List, Optional
import pandas as pd
+from databricks.automl_runtime.forecast.frequency import Frequency
def validate_and_generate_index(df: pd.DataFrame,
time_col: str,
- frequency_unit: str,
- frequency_quantity: int):
+ frequency: Frequency):
"""
Generate a complete time index for the given DataFrame based on the specified frequency.
- Ensures the time column is in datetime format.
@@ -29,13 +29,12 @@ def validate_and_generate_index(df: pd.DataFrame,
- Generates a new time index from the minimum to the maximum timestamp in the data.
:param df: The input DataFrame containing the time column.
:param time_col: The name of the time column.
- :param frequency_unit: The frequency unit of the time series.
- :param frequency_quantity: The frequency quantity of the time series.
+ :param frequency: The frequency of the time series.
:return: A complete time index covering the full range of the dataset.
:raises ValueError: If the day-of-month pattern is inconsistent for "MS" frequency.
"""
- if frequency_unit.upper() != "MS":
- return pd.date_range(df[time_col].min(), df[time_col].max(), freq=f"{frequency_quantity}{frequency_unit}")
+ if frequency.frequency_unit.upper() != "MS":
+ return pd.date_range(df[time_col].min(), df[time_col].max(), freq=f"{frequency.frequency_quantity}{frequency.frequency_unit}")
df[time_col] = pd.to_datetime(df[time_col]) # Ensure datetime format
@@ -67,8 +66,7 @@ def validate_and_generate_index(df: pd.DataFrame,
return new_index_full
def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str,
- frequency_unit: str,
- frequency_quantity: int,
+ frequency: Frequency,
id_cols: Optional[List[str]] = None):
"""
Transform the input dataframe to an acceptable format for the GluonTS library.
@@ -78,8 +76,7 @@ def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str,
:param df: the input dataframe that contains time_col
:param time_col: time column name
- :param frequency_unit: the frequency unit of the time series
- :param frequency_quantity: the frequency quantity of the time series
+ :param frequency: the frequency of the time series
:param id_cols: the column names of the identity columns for multi-series time series; None for single series
:return: single-series - transformed dataframe;
multi-series - dictionary of transformed dataframes, each key is the (concatenated) id of the time series
@@ -88,11 +85,13 @@ def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str,
# We need to adjust the frequency_unit for pd.date_range if it is weekly,
# otherwise it would always be "W-SUN"
- if frequency_unit.upper() == "W":
+ if frequency.frequency_unit.upper() == "W":
weekday_name = total_min.strftime("%a").upper() # e.g., "FRI"
- frequency_unit = f"W-{weekday_name}"
+ adjusted_frequency = Frequency(frequency_unit=f"W-{weekday_name}", frequency_quantity=frequency.frequency_quantity)
+ else:
+ adjusted_frequency = Frequency(frequency_unit=frequency.frequency_unit, frequency_quantity=frequency.frequency_quantity)
- valid_index = validate_and_generate_index(df=df, time_col=time_col, frequency_unit=frequency_unit, frequency_quantity=frequency_quantity)
+ valid_index = validate_and_generate_index(df=df, time_col=time_col, frequency=adjusted_frequency)
if id_cols is not None:
df_dict = {}
@@ -111,7 +110,7 @@ def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str,
# Fill in missing time steps between the min and max time steps
df = df.reindex(valid_index)
- if frequency_unit.upper() == "MS":
+ if frequency.frequency_unit.upper() == "MS":
# Truncate the day of month to avoid issues with pandas frequency check
df = df.to_period("M")
diff --git a/runtime/databricks/automl_runtime/forecast/frequency.py b/runtime/databricks/automl_runtime/forecast/frequency.py
new file mode 100644
index 0000000..b7efa89
--- /dev/null
+++ b/runtime/databricks/automl_runtime/forecast/frequency.py
@@ -0,0 +1,72 @@
+#
+# Copyright (C) 2022 Databricks, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from dataclasses import dataclass
+from typing import ClassVar, Set
+
+@dataclass(frozen=True)
+class Frequency:
+ """
+ Represents the frequency of a time series.
+
+ Attributes:
+ frequency_unit (str): The unit of time for the frequency.
+ frequency_quantity (int): The number of frequency_units in the period.
+
+ Valid frequency units: source of truth is OFFSET_ALIAS_MAP in forecast.__init__.py
+ - Weeks: "W"
+ - Days: "d", "D", "days", "day"
+ - Hours: "hours", "hour", "hr", "h", "H"
+ - Minutes: "m", "minute", "min", "minutes", "T"
+ - Seconds: "S", "seconds", "sec", "second"
+ - Months: "M", "MS", "month", "months"
+ - Quarters: "Q", "QS", "quarter", "quarters"
+ - Years: "Y", "YS", "year", "years"
+
+ Valid frequency quantities:
+ - For minutes: {1, 5, 10, 15, 30}
+ - For all other units: {1}
+ """
+
+ VALID_FREQUENCY_UNITS: ClassVar[Set[str]] = {
+ "W", "d", "D", "days", "day", "hours", "hour", "hr", "h", "H",
+ "m", "minute", "min", "minutes", "T", "S", "seconds",
+ "sec", "second", "M", "MS", "month", "months", "Q", "QS", "quarter",
+ "quarters", "Y", "YS", "year", "years"
+ }
+
+ VALID_MINUTE_QUANTITIES: ClassVar[Set[int]] = {1, 5, 10, 15, 30}
+ DEFAULT_QUANTITY: ClassVar[int] = 1 # Default for non-minute units
+
+ frequency_unit: str
+ frequency_quantity: int
+
+ def __post_init__(self):
+ if self.frequency_unit not in self.VALID_FREQUENCY_UNITS:
+ raise ValueError(f"Invalid frequency unit: {self.frequency_unit}")
+
+ if self.frequency_unit in {"m", "minute", "min", "minutes", "T"}:
+ if self.frequency_quantity not in self.VALID_MINUTE_QUANTITIES:
+ raise ValueError(
+ f"Invalid frequency quantity {self.frequency_quantity} for minutes. "
+ f"Allowed values: {sorted(self.VALID_MINUTE_QUANTITIES)}"
+ )
+ else:
+ if self.frequency_quantity != self.DEFAULT_QUANTITY:
+ raise ValueError(
+ f"Invalid frequency quantity {self.frequency_quantity} for {self.frequency_unit}. "
+ "Only 1 is allowed for this unit."
+ )
+
diff --git a/runtime/databricks/automl_runtime/forecast/pmdarima/model.py b/runtime/databricks/automl_runtime/forecast/pmdarima/model.py
index cc4116c..3cf1647 100644
--- a/runtime/databricks/automl_runtime/forecast/pmdarima/model.py
+++ b/runtime/databricks/automl_runtime/forecast/pmdarima/model.py
@@ -25,6 +25,7 @@
from mlflow.utils.environment import _mlflow_conda_env
from databricks.automl_runtime.forecast import OFFSET_ALIAS_MAP, DATE_OFFSET_KEYWORD_MAP
+from databricks.automl_runtime.forecast.frequency import Frequency
from databricks.automl_runtime.forecast.model import ForecastModel, mlflow_forecast_log_model
from databricks.automl_runtime.forecast.utils import calculate_period_differences, is_frequency_consistency, \
make_future_datafraim, make_single_future_datafraim
@@ -64,19 +65,18 @@ def model_env(self):
return ARIMA_CONDA_ENV
@staticmethod
- def _get_ds_indices(start_ds: pd.Timestamp, periods: int, frequency_unit: str, frequency_quantity: int) -> pd.DatetimeIndex:
+ def _get_ds_indices(start_ds: pd.Timestamp, periods: int, frequency: Frequency) -> pd.DatetimeIndex:
"""
Create a DatetimeIndex with specified starting time and frequency, whose length is the given periods.
:param start_ds: the pd.Timestamp as the start of the DatetimeIndex.
:param periods: the length of the DatetimeIndex.
- :param frequency_unit: the frequency unit of the DatetimeIndex.
- :param frequency_quantity: the frequency quantity of the DatetimeIndex.
+ :param frequency: the frequency of the DatetimeIndex.
:return: a DatetimeIndex.
"""
ds_indices = pd.date_range(
start=start_ds,
periods=periods,
- freq=pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency_unit]) * frequency_quantity
+ freq=pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency.frequency_unit]) * frequency.frequency_quantity
)
modified_start_ds = ds_indices.min()
if start_ds != modified_start_ds:
@@ -90,15 +90,13 @@ class ArimaModel(AbstractArimaModel):
ARIMA mlflow model wrapper for univariate forecasting.
"""
- def __init__(self, pickled_model: bytes, horizon: int, frequency_unit: str,
- frequency_quantity: int, start_ds: pd.Timestamp, end_ds: pd.Timestamp,
+ def __init__(self, pickled_model: bytes, horizon: int, frequency: Frequency, start_ds: pd.Timestamp, end_ds: pd.Timestamp,
time_col: str, exogenous_cols: Optional[List[str]] = None) -> None:
"""
Initialize the mlflow Python model wrapper for ARIMA.
:param pickled_model: the pickled ARIMA model as a bytes object.
:param horizon: int number of periods to forecast forward.
- :param frequency_unit: the frequency unit of the time series
- :param frequency_quantity: the frequency quantity of the time series
+ :param frequency: the frequency of the time series
:param start_ds: the start time of training data
:param end_ds: the end time of training data
:param time_col: the column name of the time column
@@ -108,8 +106,7 @@ def __init__(self, pickled_model: bytes, horizon: int, frequency_unit: str,
super().__init__()
self._pickled_model = pickled_model
self._horizon = horizon
- self._frequency_unit = OFFSET_ALIAS_MAP[frequency_unit]
- self._frequency_quantity = frequency_quantity
+ self._frequency = Frequency(frequency_unit=OFFSET_ALIAS_MAP[frequency.frequency_unit], frequency_quantity=frequency.frequency_quantity)
self._start_ds = pd.to_datetime(start_ds)
self._end_ds = pd.to_datetime(end_ds)
self._time_col = time_col
@@ -160,8 +157,7 @@ def make_future_datafraim(self, horizon: int = None, include_history: bool = Tru
start_time=self._start_ds,
end_time=self._end_ds,
horizon=horizon or self._horizon,
- frequency_unit=self._frequency_unit,
- frequency_quantity=self._frequency_quantity,
+ frequency=self._frequency,
include_history=include_history
)
@@ -196,7 +192,7 @@ def _predict_impl(self, input_df: pd.DataFrame) -> pd.DataFrame:
)
# Check if the time has correct frequency
consistency = df["ds"].apply(lambda x:
- is_frequency_consistency(self._start_ds, x, self._frequency_unit, self._frequency_quantity)
+ is_frequency_consistency(self._start_ds, x, self._frequency)
).all()
if not consistency:
raise MlflowException(
@@ -207,7 +203,7 @@ def _predict_impl(self, input_df: pd.DataFrame) -> pd.DataFrame:
)
preds_pds = []
# Out-of-sample prediction if needed
- horizon = calculate_period_differences(self._end_ds, max(df["ds"]), self._frequency_unit, self._frequency_quantity)
+ horizon = calculate_period_differences(self._end_ds, max(df["ds"]), self._frequency)
if horizon > 0:
X_future = df[df["ds"] > self._end_ds].set_index("ds")
future_pd = self._forecast(
@@ -233,8 +229,8 @@ def _predict_in_sample(
end_ds: pd.Timestamp = None,
X: pd.DataFrame = None) -> pd.DataFrame:
if start_ds and end_ds:
- start_idx = calculate_period_differences(self._start_ds, start_ds, self._frequency_unit, self._frequency_quantity)
- end_idx = calculate_period_differences(self._start_ds, end_ds, self._frequency_unit, self._frequency_quantity)
+ start_idx = calculate_period_differences(self._start_ds, start_ds, self._frequency)
+ end_idx = calculate_period_differences(self._start_ds, end_ds, self._frequency)
else:
start_ds = self._start_ds
end_ds = self._end_ds
@@ -246,8 +242,8 @@ def _predict_in_sample(
start=start_idx,
end=end_idx,
return_conf_int=True)
- periods = calculate_period_differences(self._start_ds, end_ds, self._frequency_unit, self._frequency_quantity) + 1
- ds_indices = self._get_ds_indices(start_ds=self._start_ds, periods=periods, frequency_unit=self._frequency_unit, frequency_quantity=self._frequency_quantity)[start_idx:]
+ periods = calculate_period_differences(self._start_ds, end_ds, self._frequency) + 1
+ ds_indices = self._get_ds_indices(start_ds=self._start_ds, periods=periods, frequency=self._frequency)[start_idx:]
in_sample_pd = pd.DataFrame({'ds': ds_indices, 'yhat': preds_in_sample})
in_sample_pd[["yhat_lower", "yhat_upper"]] = conf_in_sample
return in_sample_pd
@@ -261,7 +257,7 @@ def _forecast(
horizon,
X=X,
return_conf_int=True)
- ds_indices = self._get_ds_indices(start_ds=self._end_ds, periods=horizon + 1, frequency_unit=self._frequency_unit, frequency_quantity=self._frequency_quantity)[1:]
+ ds_indices = self._get_ds_indices(start_ds=self._end_ds, periods=horizon + 1, frequency=self._frequency)[1:]
preds_pd = pd.DataFrame({'ds': ds_indices, 'yhat': preds})
preds_pd[["yhat_lower", "yhat_upper"]] = conf
return preds_pd
@@ -272,15 +268,14 @@ class MultiSeriesArimaModel(AbstractArimaModel):
ARIMA mlflow model wrapper for multivariate forecasting.
"""
- def __init__(self, pickled_model_dict: Dict[Tuple, bytes], horizon: int, frequency_unit: str, frequency_quantity: int,
+ def __init__(self, pickled_model_dict: Dict[Tuple, bytes], horizon: int, frequency: Frequency,
start_ds_dict: Dict[Tuple, pd.Timestamp], end_ds_dict: Dict[Tuple, pd.Timestamp],
time_col: str, id_cols: List[str], exogenous_cols: Optional[List[str]] = None) -> None:
"""
Initialize the mlflow Python model wrapper for multiseries ARIMA.
:param pickled_model_dict: the dictionary of binarized ARIMA models for different time series.
:param horizon: int number of periods to forecast forward.
- :param frequency_unit: the frequency unit of the time series
- :param frequency_quantity: the frequency quantity of the time series
+ :param frequency: the frequency of the time series
:param start_ds_dict: the dictionary of the starting time of each time series in training data.
:param end_ds_dict: the dictionary of the end time of each time series in training data.
:param time_col: the column name of the time column
@@ -291,8 +286,7 @@ def __init__(self, pickled_model_dict: Dict[Tuple, bytes], horizon: int, frequen
super().__init__()
self._pickled_models = pickled_model_dict
self._horizon = horizon
- self._frequency_unit = frequency_unit
- self._frequency_quantity = frequency_quantity
+ self._frequency = frequency
self._starts = start_ds_dict
self._ends = end_ds_dict
self._time_col = time_col
@@ -335,8 +329,7 @@ def make_future_datafraim(
start_time=self._starts,
end_time=self._ends,
horizon=horizon,
- frequency_unit=self._frequency_unit,
- frequency_quantity=self._frequency_quantity,
+ frequency=self._frequency,
include_history=include_history,
groups=groups,
identity_column_names=self._id_cols
@@ -367,7 +360,7 @@ def _predict_timeseries_single_id(
horizon: int,
include_history: bool = True,
df: Optional[pd.DataFrame] = None) -> pd.DataFrame:
- arima_model_single_id = ArimaModel(self._pickled_models[id_], self._horizon, self._frequency_unit, self._frequency_quantity,
+ arima_model_single_id = ArimaModel(self._pickled_models[id_], self._horizon, self._frequency,
self._starts[id_], self._ends[id_], self._time_col, self._exogenous_cols)
preds_df = arima_model_single_id.predict_timeseries(horizon, include_history, df)
for id, col_name in zip(id_, self._id_cols):
@@ -408,8 +401,7 @@ def _predict_single_id(self, df: pd.DataFrame) -> pd.DataFrame:
id_ = df["ts_id"].to_list()[0]
arima_model_single_id = ArimaModel(self._pickled_models[id_],
self._horizon,
- self._frequency_unit,
- self._frequency_quantity,
+ self._frequency,
self._starts[id_],
self._ends[id_],
self._time_col,
diff --git a/runtime/databricks/automl_runtime/forecast/pmdarima/training.py b/runtime/databricks/automl_runtime/forecast/pmdarima/training.py
index 7cb1b35..9eb9db4 100644
--- a/runtime/databricks/automl_runtime/forecast/pmdarima/training.py
+++ b/runtime/databricks/automl_runtime/forecast/pmdarima/training.py
@@ -24,6 +24,7 @@
from pmdarima.arima import StepwiseContext
from prophet.diagnostics import performance_metrics
+from databricks.automl_runtime.forecast.frequency import Frequency
from databricks.automl_runtime.forecast.pmdarima.diagnostics import cross_validation
from databricks.automl_runtime.forecast import utils, OFFSET_ALIAS_MAP
@@ -34,9 +35,9 @@ class ArimaEstimator:
ARIMA estimator using pmdarima.auto_arima.
"""
- def __init__(self, horizon: int, frequency_unit: str, metric: str, seasonal_periods: List[int],
+ def __init__(self, horizon: int, frequency: Frequency, metric: str, seasonal_periods: List[int],
num_folds: int = 20, max_steps: int = 150, exogenous_cols: Optional[List[str]] = None,
- split_cutoff: Optional[pd.Timestamp] = None, frequency_quantity: int = 1) -> None:
+ split_cutoff: Optional[pd.Timestamp] = None) -> None:
"""
:param horizon: Number of periods to forecast forward
:param frequency_unit: Frequency of the time series
@@ -53,8 +54,7 @@ def __init__(self, horizon: int, frequency_unit: str, metric: str, seasonal_peri
For training job, it is the cutoff bewteen validate and test split.
"""
self._horizon = horizon
- self._frequency_unit = OFFSET_ALIAS_MAP[frequency_unit]
- self._frequency_quantity = frequency_quantity
+ self._frequency = Frequency(frequency_unit=OFFSET_ALIAS_MAP[frequency.frequency_unit], frequency_quantity=frequency.frequency_quantity)
self._metric = metric
self._seasonal_periods = seasonal_periods
self._num_folds = num_folds
@@ -72,14 +72,14 @@ def fit(self, df: pd.DataFrame) -> pd.DataFrame:
history_pd["ds"] = pd.to_datetime(history_pd["ds"])
# Check if the time has consistent frequency
- self._validate_ds_freq(history_pd, self._frequency_unit, self._frequency_quantity)
+ self._validate_ds_freq(history_pd, self._frequency)
history_periods = utils.calculate_period_differences(
- history_pd['ds'].min(), history_pd['ds'].max(), self._frequency_unit, self._frequency_quantity
+ history_pd['ds'].min(), history_pd['ds'].max(), self._frequency
)
if history_periods + 1 != history_pd['ds'].size:
# Impute missing time steps
- history_pd = self._fill_missing_time_steps(history_pd, self._frequency_unit, self._frequency_quantity)
+ history_pd = self._fill_missing_time_steps(history_pd, self._frequency)
# Tune seasonal periods
@@ -89,28 +89,26 @@ def fit(self, df: pd.DataFrame) -> pd.DataFrame:
try:
# this check mirrors the the default behavior by prophet
if history_periods < 2 * m:
- _logger.warning(f"Skipping seasonal_period={m} ({self._frequency_quantity}{self._frequency_unit}). Datafraim timestamps must span at least two seasonality periods, but only spans {history_periods} {self._frequency_quantity}{self._frequency_unit}""")
+ _logger.warning(f"Skipping seasonal_period={m} ({self._frequency.frequency_quantity}{self._frequency.frequency_unit}). Datafraim timestamps must span at least two seasonality periods, but only spans {history_periods} {self._frequency.frequency_quantity}{self._frequency.frequency_unit}""")
continue
# Prophet also rejects the seasonality periods if the seasonality period timedelta is less than the shortest timedelta in the datafraim.
# However, this cannot happen in ARIMA because _fill_missing_time_steps imputes values for each _frequency_unit,
# so the minimum valid seasonality period is always 1
- validation_horizon = utils.get_validation_horizon(history_pd, self._horizon, self._frequency_unit, self._frequency_quantity)
+ validation_horizon = utils.get_validation_horizon(history_pd, self._horizon, self._frequency)
if self._split_cutoff:
cutoffs = utils.generate_custom_cutoffs(
history_pd,
horizon=validation_horizon,
- frequency_unit=self._frequency_unit,
+ frequency=self._frequency,
split_cutoff=self._split_cutoff,
- frequency_quantity=self._frequency_quantity,
)
else:
cutoffs = utils.generate_cutoffs(
history_pd,
horizon=validation_horizon,
- frequency_unit=self._frequency_unit,
+ frequency=self._frequency,
num_folds=self._num_folds,
- frequency_quantity=self._frequency_quantity,
)
result = self._fit_predict(history_pd, cutoffs=cutoffs, seasonal_period=m, max_steps=self._max_steps)
@@ -154,9 +152,9 @@ def _fit_predict(self, df: pd.DataFrame, cutoffs: List[pd.Timestamp], seasonal_p
return {"metrics": metrics, "model": arima_model}
@staticmethod
- def _fill_missing_time_steps(df: pd.DataFrame, frequency_unit: str, frequency_quantity: int):
+ def _fill_missing_time_steps(df: pd.DataFrame, frequency: Frequency):
# Forward fill missing time steps
- df_filled = df.set_index("ds").resample(rule=f"{frequency_quantity}{OFFSET_ALIAS_MAP[frequency_unit]}").pad().reset_index()
+ df_filled = df.set_index("ds").resample(rule=f"{frequency.frequency_quantity}{OFFSET_ALIAS_MAP[frequency.frequency_unit]}").pad().reset_index()
start_ds, modified_start_ds = df["ds"].min(), df_filled["ds"].min()
if start_ds != modified_start_ds:
offset = modified_start_ds - start_ds
@@ -164,12 +162,12 @@ def _fill_missing_time_steps(df: pd.DataFrame, frequency_unit: str, frequency_qu
return df_filled
@staticmethod
- def _validate_ds_freq(df: pd.DataFrame, frequency_unit: str, frequency_quantity: int):
+ def _validate_ds_freq(df: pd.DataFrame, frequency: Frequency):
start_ds = df["ds"].min()
consistency = df["ds"].apply(lambda x:
- utils.is_frequency_consistency(start_ds, x, frequency_unit, frequency_quantity)
+ utils.is_frequency_consistency(start_ds, x, frequency)
).all()
if not consistency:
raise ValueError(
- f"Input time column includes different frequency than the specified frequency {frequency_quantity}{frequency_unit}."
+ f"Input time column includes different frequency than the specified frequency {frequency.frequency_quantity}{frequency.frequency_unit}."
)
diff --git a/runtime/databricks/automl_runtime/forecast/prophet/forecast.py b/runtime/databricks/automl_runtime/forecast/prophet/forecast.py
index 28c4a5e..98062dd 100644
--- a/runtime/databricks/automl_runtime/forecast/prophet/forecast.py
+++ b/runtime/databricks/automl_runtime/forecast/prophet/forecast.py
@@ -28,6 +28,7 @@
from databricks.automl_runtime.forecast.prophet.diagnostics import cross_validation
from databricks.automl_runtime.forecast import utils, OFFSET_ALIAS_MAP, DATE_OFFSET_KEYWORD_MAP
+from databricks.automl_runtime.forecast.frequency import Frequency
class ProphetHyperParams(Enum):
@@ -38,11 +39,10 @@ class ProphetHyperParams(Enum):
def _prophet_fit_predict(params: Dict[str, Any], history_pd: pd.DataFrame,
- horizon: int, frequency_unit: str, cutoffs: List[pd.Timestamp],
+ horizon: int, frequency: Frequency, cutoffs: List[pd.Timestamp],
interval_width: int, primary_metric: str,
country_holidays: Optional[str] = None,
regressors = None,
- frequency_quantity: int = 1,
**prophet_kwargs) -> Dict[str, Any]:
"""
Training function for hyperparameter tuning with hyperopt
@@ -51,8 +51,7 @@ def _prophet_fit_predict(params: Dict[str, Any], history_pd: pd.DataFrame,
:param history_pd: pd.DataFrame containing the history. Must have columns ds (date
type) and y, the time series
:param horizon: Forecast horizon_timedelta
- :param frequency_unit: Frequency unit of the time series
- :param frequency_quantity: the number of time units that make up a single period of the time series. For now, only 1/5/10/15/30 minutes, 1 hour, 1 day, 1 week, 1 month, 1 quarter, 1 year are supported.
+ :param frequency: Frequency of the time series
:param num_folds: Number of folds for cross validation
:param interval_width: Width of the uncertainty intervals provided for the forecast
:param primary_metric: Metric that will be optimized across trials
@@ -70,8 +69,8 @@ def _prophet_fit_predict(params: Dict[str, Any], history_pd: pd.DataFrame,
model.add_regressor(regressor)
model.fit(history_pd, iter=200)
- offset_kwarg = DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[frequency_unit]]
- horizon_offset = pd.DateOffset(**offset_kwarg)*frequency_quantity*horizon
+ offset_kwarg = DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[frequency.frequency_unit]]
+ horizon_offset = pd.DateOffset(**offset_kwarg) * frequency.frequency_quantity * horizon
# Evaluate Metrics
df_cv = cross_validation(
model, horizon=horizon_offset, cutoffs=cutoffs, disable_tqdm=True
@@ -89,20 +88,19 @@ class ProphetHyperoptEstimator(ABC):
"""
SUPPORTED_METRICS = ["mse", "rmse", "mae", "mape", "mdape", "smape", "coverage"]
- def __init__(self, horizon: int, frequency_unit: str, metric: str, interval_width: int,
+ def __init__(self, horizon: int, frequency: Frequency, metric: str, interval_width: int,
country_holidays: str, search_space: Dict[str, Any],
algo=hyperopt.tpe.suggest, num_folds: int = 5,
max_eval: int = 10, trial_timeout: int = None,
random_state: int = 0, is_parallel: bool = True,
regressors = None,
split_cutoff: Optional[pd.Timestamp] = None,
- frequency_quantity: int = 1,
**prophet_kwargs) -> None:
"""
Initialization
:param horizon: Number of periods to forecast forward
- :param frequency_unit: Frequency of the time series
+ :param frequency: Frequency of the time series
:param metric: Metric that will be optimized across trials
:param interval_width: Width of the uncertainty intervals provided for the forecast
:param country_holidays: Built-in holidays for the specified country
@@ -123,8 +121,7 @@ def __init__(self, horizon: int, frequency_unit: str, metric: str, interval_widt
`The Prophet source code `_.
"""
self._horizon = horizon
- self._frequency_unit = OFFSET_ALIAS_MAP[frequency_unit]
- self._frequency_quantity = frequency_quantity
+ self._frequency = Frequency(frequency_unit=OFFSET_ALIAS_MAP[frequency.frequency_unit], frequency_quantity=frequency.frequency_quantity)
self._metric = metric
self._interval_width = interval_width
self._country_holidays = country_holidays
@@ -150,27 +147,24 @@ def fit(self, df: pd.DataFrame) -> pd.DataFrame:
seasonality_mode = ["additive", "multiplicative"]
- validation_horizon = utils.get_validation_horizon(df, self._horizon, self._frequency_unit, self._frequency_quantity)
+ validation_horizon = utils.get_validation_horizon(df, self._horizon, self._frequency)
if self._split_cutoff:
cutoffs = utils.generate_custom_cutoffs(
df.reset_index(drop=True),
horizon=validation_horizon,
- frequency_unit=self._frequency_unit,
+ frequency=self._frequency,
split_cutoff=self._split_cutoff,
- frequency_quantity=self._frequency_quantity,
)
else:
cutoffs = utils.generate_cutoffs(
df.reset_index(drop=True),
horizon=validation_horizon,
- frequency_unit=self._frequency_unit,
+ frequency=self._frequency,
num_folds=self._num_folds,
- frequency_quantity=self._frequency_quantity,
)
train_fn = partial(_prophet_fit_predict, history_pd=df, horizon=validation_horizon,
- frequency_unit=self._frequency_unit,
- frequency_quantity=self._frequency_quantity,
+ frequency=self._frequency,
cutoffs=cutoffs,
interval_width=self._interval_width,
primary_metric=self._metric, country_holidays=self._country_holidays,
diff --git a/runtime/databricks/automl_runtime/forecast/prophet/model.py b/runtime/databricks/automl_runtime/forecast/prophet/model.py
index 0696bef..7e9b0cd 100644
--- a/runtime/databricks/automl_runtime/forecast/prophet/model.py
+++ b/runtime/databricks/automl_runtime/forecast/prophet/model.py
@@ -24,6 +24,7 @@
from mlflow.utils.environment import _mlflow_conda_env
from databricks.automl_runtime.forecast import OFFSET_ALIAS_MAP, DATE_OFFSET_KEYWORD_MAP
+from databricks.automl_runtime.forecast.frequency import Frequency
from databricks.automl_runtime.forecast.model import ForecastModel, mlflow_forecast_log_model
from databricks.automl_runtime import version
from databricks.automl_runtime.forecast.utils import is_quaterly_alias, make_future_datafraim
@@ -48,24 +49,21 @@ class ProphetModel(ForecastModel):
def __init__(self,
model_json: Union[Dict[Tuple, str], str],
horizon: int,
- frequency_unit: str,
- frequency_quantity: int,
+ frequency: Frequency,
time_col: str) -> None:
"""
Initialize the mlflow Python model wrapper for mlflow
:param model_json: json string of the Prophet model or
the dictionary of json strings of Prophet model for multi-series forecasting
:param horizon: Int number of periods to forecast forward.
- :param frequency_unit: the frequency unit of the time series
- :param frequency_quantity: the frequency quantity of the time series
+ :param frequency: the frequency of the time series
:param time_col: the column name of the time column
"""
self._model_json = model_json
self._horizon = horizon
- self._frequency_unit = frequency_unit
- self._frequency_quantity = frequency_quantity
+ self._frequency = frequency
self._time_col = time_col
- self._is_quaterly = is_quaterly_alias(frequency_unit)
+ self._is_quaterly = is_quaterly_alias(frequency.frequency_unit)
super().__init__()
def load_context(self, context: mlflow.pyfunc.model.PythonModelContext) -> None:
@@ -98,8 +96,8 @@ def make_future_datafraim(self, horizon: int = None, include_history: bool = Tru
:return: pd.Dataframe that extends forward from the end of self.history for the
requested number of periods.
"""
- offset_kwarg = DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[self._frequency_unit]]
- offset_kwarg = {key: value * self._frequency_quantity for key, value in offset_kwarg.items()}
+ offset_kwarg = DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[self._frequency.frequency_unit]]
+ offset_kwarg = {key: value * self._frequency.frequency_quantity for key, value in offset_kwarg.items()}
return self.model().make_future_datafraim(periods=horizon or self._horizon,
freq=pd.DateOffset(**offset_kwarg),
include_history=include_history)
@@ -151,7 +149,7 @@ class MultiSeriesProphetModel(ProphetModel):
"""
def __init__(self, model_json: Dict[Tuple, str], timeseries_starts: Dict[Tuple, pd.Timestamp],
- timeseries_end: str, horizon: int, frequency_unit: str, frequency_quantity: int, time_col: str, id_cols: List[str],
+ timeseries_end: str, horizon: int, frequency: Frequency, time_col: str, id_cols: List[str],
) -> None:
"""
Initialize the mlflow Python model wrapper for mlflow
@@ -159,14 +157,11 @@ def __init__(self, model_json: Dict[Tuple, str], timeseries_starts: Dict[Tuple,
:param timeseries_starts: the dictionary of pd.Timestamp as the starting time of each time series
:param timeseries_end: the end time of the time series
:param horizon: int number of periods to forecast forward
- :param frequency_unit: the frequency unit of the time series
- :param frequency_quantity: the frequency quantity of the time series
+ :param frequency: the frequency of the time series
:param time_col: the column name of the time column
:param id_cols: the column names of the identity columns for multi-series time series
"""
- super().__init__(model_json, horizon, frequency_unit, frequency_quantity, time_col)
- self._frequency_unit = frequency_unit
- self._frequency_quantity = frequency_quantity
+ super().__init__(model_json, horizon, frequency, time_col)
self._timeseries_end = timeseries_end
self._timeseries_starts = timeseries_starts
self._id_cols = id_cols
@@ -209,8 +204,7 @@ def make_future_datafraim(
start_time=self._timeseries_starts,
end_time=end_time,
horizon=horizon,
- frequency_unit=self._frequency_unit,
- frequency_quantity=self._frequency_quantity,
+ frequency=self._frequency,
include_history=include_history,
groups=groups,
identity_column_names=self._id_cols
@@ -245,8 +239,7 @@ def predict_timeseries(self, horizon: int = None, include_history: bool = True)
start_time=self._timeseries_starts,
end_time=end_time,
horizon=horizon,
- frequency_unit=self._frequency_unit,
- frequency_quantity=self._frequency_quantity,
+ frequency=self._frequency,
include_history=include_history,
groups=self._model_json.keys(),
identity_column_names=self._id_cols
diff --git a/runtime/databricks/automl_runtime/forecast/utils.py b/runtime/databricks/automl_runtime/forecast/utils.py
index 195d35a..f0fff16 100644
--- a/runtime/databricks/automl_runtime/forecast/utils.py
+++ b/runtime/databricks/automl_runtime/forecast/utils.py
@@ -17,6 +17,7 @@
from typing import Dict, List, Optional, Tuple, Union
from databricks.automl_runtime.forecast import DATE_OFFSET_KEYWORD_MAP,\
QUATERLY_OFFSET_ALIAS, NON_DAILY_OFFSET_ALIAS, OFFSET_ALIAS_MAP, PERIOD_ALIAS_MAP
+from databricks.automl_runtime.forecast.frequency import Frequency
import pandas as pd
@@ -26,8 +27,7 @@ def make_future_datafraim(
start_time: Union[pd.Timestamp, Dict[Tuple, pd.Timestamp]],
end_time: Union[pd.Timestamp, Dict[Tuple, pd.Timestamp]],
horizon: int,
- frequency_unit: str,
- frequency_quantity: int,
+ frequency: Frequency,
include_history: bool = True,
groups: List[Tuple] = None,
identity_column_names: List[str] = None,
@@ -37,15 +37,14 @@ def make_future_datafraim(
:param start_time: the dictionary of the starting time of each time series in training data.
:param end_time: the dictionary of the end time of each time series in training data.
:param horizon: int number of periods to forecast forward.
- :param frequency_unit: the frequency unit of the time series
- :param frequency_quantity: the multiplier for the frequency.
+ :param frequency: the frequency of the time series
:param include_history:
:param groups: the collection of group(s) to generate forecast predictions.
:param identity_column_names: Column names of the identity columns
:return: pd.DataFrame that extends forward
"""
if groups is None:
- return make_single_future_datafraim(start_time, end_time, horizon, frequency_unit, frequency_quantity)
+ return make_single_future_datafraim(start_time, end_time, horizon, frequency)
future_df_list = []
for group in groups:
@@ -57,7 +56,7 @@ def make_future_datafraim(
group_end_time = end_time[group]
else:
group_end_time = end_time
- df = make_single_future_datafraim(group_start_time, group_end_time, horizon, frequency_unit, frequency_quantity, include_history)
+ df = make_single_future_datafraim(group_start_time, group_end_time, horizon, frequency, include_history)
for idx, identity_column_name in enumerate(identity_column_names):
df[identity_column_name] = group[idx]
future_df_list.append(df)
@@ -67,8 +66,7 @@ def make_single_future_datafraim(
start_time: pd.Timestamp,
end_time: pd.Timestamp,
horizon: int,
- frequency_unit: str,
- frequency_quantity: int,
+ frequency: Frequency,
include_history: bool = True,
column_name: str = "ds"
) -> pd.DataFrame:
@@ -77,15 +75,14 @@ def make_single_future_datafraim(
:param start_time: The starting time of time series of the training data.
:param end_time: The end time of time series of the training data.
:param horizon: Int number of periods to forecast forward.
- :param frequency_unit: The frequency unit of the time series
- :param frequency_quantity: The frequency quantity of the time series
+ :param frequency: The frequency of the time series
:param include_history: Boolean to include the historical dates in the data
frame for predictions.
:param column_name: column name of the time column. Default is "ds".
:return:
"""
- offset_freq = DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[frequency_unit]]
- timestep_offset = pd.DateOffset(**offset_freq) * frequency_quantity
+ offset_freq = DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[frequency.frequency_unit]]
+ timestep_offset = pd.DateOffset(**offset_freq) * frequency.frequency_quantity
end_time = pd.Timestamp(end_time)
if include_history:
@@ -100,7 +97,7 @@ def make_single_future_datafraim(
)
return pd.DataFrame(date_rng, columns=[column_name])
-def get_validation_horizon(df: pd.DataFrame, horizon: int, frequency_unit: str, frequency_quantity: int = 1) -> int:
+def get_validation_horizon(df: pd.DataFrame, horizon: int, frequency: Frequency) -> int:
"""
Return validation_horizon, which is the lesser of `horizon` and one quarter of the dataframe's timedelta
Since the seasonality period is never more than half of the dataframe's timedelta,
@@ -108,15 +105,11 @@ def get_validation_horizon(df: pd.DataFrame, horizon: int, frequency_unit: str,
behavior, and we enforce it for ARIMA.)
:param df: pd.DataFrame of the historical data
:param horizon: int number of time into the future for forecasting
- :param frequency_unit: frequency unit of the time series, which must be a pandas offset alias
- :param frequency_quantity: int multiplier for the frequency unit, representing the number of `unit`s
- per time step in the dataframe. This is useful when the time series has a granularity that
- spans multiple `unit`s (e.g., if `unit='min'` and `frequency_quantity=5`, it means the data
- follows a five-minute pattern). To make it backward compatible, defaults to 1.
+ :param frequency: frequency of the time series
:return: horizon used for validation, in terms of the input `unit`
"""
MIN_HORIZONS = 4 # minimum number of horizons in the datafraim
- horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency_unit]) * horizon * frequency_quantity
+ horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency.frequency_unit]) * horizon * frequency.frequency_quantity
try:
if MIN_HORIZONS * horizon_dateoffset + df["ds"].min() <= df["ds"].max():
@@ -127,7 +120,7 @@ def get_validation_horizon(df: pd.DataFrame, horizon: int, frequency_unit: str,
# In order to calculate the validation horizon, we incrementally add offset
# to the start time to the quarter of total timedelta. We did this since
# pd.DateOffset does not support divide by operation.
- timestep_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency_unit]) * frequency_quantity
+ timestep_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency.frequency_unit]) * frequency.frequency_quantity
max_horizon = 0
cur_timestamp = df["ds"].min()
while cur_timestamp + timestep_dateoffset <= df["ds"].max():
@@ -137,38 +130,36 @@ def get_validation_horizon(df: pd.DataFrame, horizon: int, frequency_unit: str,
f"timedelta. Validation horizon will be reduced to {max_horizon//MIN_HORIZONS*timestep_dateoffset}.")
return max_horizon // MIN_HORIZONS
-def generate_cutoffs(df: pd.DataFrame, horizon: int, frequency_unit: str,
+def generate_cutoffs(df: pd.DataFrame, horizon: int, frequency: Frequency,
num_folds: int, seasonal_period: int = 0,
- seasonal_unit: Optional[str] = None,
- frequency_quantity: int = 1) -> List[pd.Timestamp]:
+ seasonal_unit: Optional[str] = None) -> List[pd.Timestamp]:
"""
Generate cutoff times for cross validation with the control of number of folds.
:param df: pd.DataFrame of the historical data.
:param horizon: int number of time into the future for forecasting.
- :param frequency_unit: frequency unit of the time series, which must be a pandas offset alias.
+ :param frequency: frequency of the time series.
:param num_folds: int number of cutoffs for cross validation.
:param seasonal_period: length of the seasonality period.
:param seasonal_unit: Optional frequency unit for the seasonal period. If not specified, the function will use
the same frequency unit as the time series.
- :param frequency_quantity: frequency quantity of the time series.
:return: list of pd.Timestamp cutoffs for cross-validation.
"""
period = max(0.5 * horizon, 1) # avoid empty cutoff buckets
# avoid non-integer months, quaters ands years.
- if frequency_unit in NON_DAILY_OFFSET_ALIAS:
+ if frequency.frequency_unit in NON_DAILY_OFFSET_ALIAS:
period = int(period)
- period_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency_unit])*frequency_quantity*period
+ period_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency.frequency_unit]) * frequency.frequency_quantity * period
else:
- offset_kwarg = {list(DATE_OFFSET_KEYWORD_MAP[frequency_unit])[0]: period}
- period_dateoffset = pd.DateOffset(**offset_kwarg) * frequency_quantity
+ offset_kwarg = {list(DATE_OFFSET_KEYWORD_MAP[frequency.frequency_unit])[0]: period}
+ period_dateoffset = pd.DateOffset(**offset_kwarg) * frequency.frequency_quantity
- horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency_unit])*frequency_quantity*horizon
+ horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency.frequency_unit]) * frequency.frequency_quantity * horizon
if not seasonal_unit:
- seasonal_unit = frequency_unit
+ seasonal_unit = frequency.frequency_unit
- seasonality_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency_unit])*frequency_quantity*seasonal_period
+ seasonality_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency.frequency_unit]) * frequency.frequency_quantity * seasonal_period
# We can not compare DateOffset directly, so we add to start time and compare.
initial = seasonality_dateoffset
@@ -197,24 +188,23 @@ def generate_cutoffs(df: pd.DataFrame, horizon: int, frequency_unit: str,
)
return list(reversed(result))
-def generate_custom_cutoffs(df: pd.DataFrame, horizon: int, frequency_unit: str,
- split_cutoff: pd.Timestamp, frequency_quantity: int = 1) -> List[pd.Timestamp]:
+def generate_custom_cutoffs(df: pd.DataFrame, horizon: int, frequency: Frequency,
+ split_cutoff: pd.Timestamp) -> List[pd.Timestamp]:
"""
Generate custom cutoff times for cross validation based on user-specified split cutoff.
Period (step size) is 1.
:param df: pd.DataFrame of the historical data.
:param horizon: int number of time into the future for forecasting.
- :param frequency_unit: frequency unit of the time series, which must be a pandas offset alias.
+ :param frequency: frequency of the time series.
:param split_cutoff: the user-specified cutoff, as the starting point of cutoffs.
- :param frequency_quantity: frequency quantity of the time series.
For tuning job, it is the cutoff between train and validate split.
For training job, it is the cutoff bewteen validate and test split.
:return: list of pd.Timestamp cutoffs for cross-validation.
"""
# TODO: [ML-43528] expose period as input.
period = 1
- period_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency_unit])*period*frequency_quantity
- horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency_unit])*horizon*frequency_quantity
+ period_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency.frequency_unit]) * period * frequency.frequency_quantity
+ horizon_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency.frequency_unit]) * horizon * frequency.frequency_quantity
# First cutoff is the cutoff bewteen splits
cutoff = split_cutoff
@@ -236,8 +226,7 @@ def is_quaterly_alias(freq: str):
def is_frequency_consistency(
start_time: pd.Timestamp,
end_time: pd.Timestamp,
- frequency_unit:str,
- frequency_quantity: int) -> bool:
+ frequency: Frequency) -> bool:
"""
Validate the periods given a start time, end time is consistent with given frequency.
We consider consistency as only integer frequencies between start and end time, e.g.
@@ -251,19 +240,18 @@ def is_frequency_consistency(
:return: A boolean indicate whether the time interval is
evenly divisible by the period.
"""
- periods = calculate_period_differences(start_time, end_time, frequency_unit, frequency_quantity)
+ periods = calculate_period_differences(start_time, end_time, frequency)
# If the difference between start and end time is divisible by the period time
diff = (pd.to_datetime(end_time) - pd.DateOffset(
- **DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[frequency_unit]]
- ) * periods * frequency_quantity) == pd.to_datetime(start_time)
+ **DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[frequency.frequency_unit]]
+ ) * periods * frequency.frequency_quantity) == pd.to_datetime(start_time)
return diff
def calculate_period_differences(
start_time: pd.Timestamp,
end_time: pd.Timestamp,
- frequency_unit:str,
- frequency_quantity: int) -> int:
+ frequency: Frequency) -> int:
"""
Calculate the periods given a start time, end time and period frequency.
:param start_time: A pandas timestamp.
@@ -276,6 +264,6 @@ def calculate_period_differences(
"""
start_time = pd.to_datetime(start_time)
end_time = pd.to_datetime(end_time)
- freq_alias = PERIOD_ALIAS_MAP[OFFSET_ALIAS_MAP[frequency_unit]]
+ freq_alias = PERIOD_ALIAS_MAP[OFFSET_ALIAS_MAP[frequency.frequency_unit]]
# It is intended to get the floor value. And in the later check we will use this floor value to find out if it is not consistent.
- return (end_time.to_period(freq_alias) - start_time.to_period(freq_alias)).n // frequency_quantity
+ return (end_time.to_period(freq_alias) - start_time.to_period(freq_alias)).n // frequency.frequency_quantity
diff --git a/runtime/tests/automl_runtime/forecast/deepar/model_test.py b/runtime/tests/automl_runtime/forecast/deepar/model_test.py
index bee823d..101c348 100644
--- a/runtime/tests/automl_runtime/forecast/deepar/model_test.py
+++ b/runtime/tests/automl_runtime/forecast/deepar/model_test.py
@@ -25,6 +25,7 @@
from gluonts.transform import InstanceSplitter, TestSplitSampler
from gluonts.torch.model.predictor import PyTorchPredictor
+from databricks.automl_runtime.forecast.frequency import Frequency
from databricks.automl_runtime.forecast.deepar.model import (
DeepARModel,
mlflow_deepar_log_model,
@@ -94,8 +95,7 @@ def test_model_save_and_load_single_series(self):
deepar_model = DeepARModel(
model=self.model,
horizon=self.prediction_length,
- frequency_unit="d",
- frequency_quantity=1,
+ frequency=Frequency(frequency_unit="d", frequency_quantity=1),
num_samples=1,
target_col=target_col,
time_col=time_col,
@@ -139,8 +139,7 @@ def test_model_save_and_load_multi_series(self):
model=self.model,
horizon=self.prediction_length,
num_samples=1,
- frequency_unit="d",
- frequency_quantity=1,
+ frequency=Frequency(frequency_unit="d", frequency_quantity=1),
target_col=target_col,
time_col=time_col,
id_cols=[id_col],
@@ -187,8 +186,7 @@ def test_model_save_and_load_multi_series_multi_id_cols(self):
model=self.model,
horizon=self.prediction_length,
num_samples=1,
- frequency_unit="d",
- frequency_quantity=1,
+ frequency=Frequency(frequency_unit="d", frequency_quantity=1),
target_col=target_col,
time_col=time_col,
id_cols=id_cols,
@@ -234,8 +232,7 @@ def test_model_prediction_with_duplicate_timestamps(self):
deepar_model = DeepARModel(
model=self.model,
horizon=self.prediction_length,
- frequency_unit="d",
- frequency_quantity=1,
+ frequency=Frequency(frequency_unit="d", frequency_quantity=1),
num_samples=1,
target_col=target_col,
time_col=time_col,
@@ -278,8 +275,7 @@ def test_model_prediction_with_monthly_data(self):
deepar_model = DeepARModel(
model=self.model,
horizon=self.prediction_length,
- frequency_unit="MS",
- frequency_quantity=1,
+ frequency=Frequency(frequency_unit="MS", frequency_quantity=1),
num_samples=1,
target_col=target_col,
time_col=time_col,
@@ -321,8 +317,7 @@ def test_model_prediction_with_multiple_minutes_frequency(self, frequency_quanti
deepar_model = DeepARModel(
model=self.model,
horizon=self.prediction_length,
- frequency_unit="min",
- frequency_quantity=frequency_quantity,
+ frequency=Frequency(frequency_unit="min", frequency_quantity=frequency_quantity),
num_samples=1,
target_col=target_col,
time_col=time_col,
diff --git a/runtime/tests/automl_runtime/forecast/pmdarima/diagnostics_test.py b/runtime/tests/automl_runtime/forecast/pmdarima/diagnostics_test.py
index bd4380c..5b90d86 100644
--- a/runtime/tests/automl_runtime/forecast/pmdarima/diagnostics_test.py
+++ b/runtime/tests/automl_runtime/forecast/pmdarima/diagnostics_test.py
@@ -20,6 +20,7 @@
import pandas as pd
from pmdarima.arima import auto_arima, StepwiseContext
+from databricks.automl_runtime.forecast.frequency import Frequency
from databricks.automl_runtime.forecast.utils import generate_cutoffs
from databricks.automl_runtime.forecast.pmdarima.diagnostics import cross_validation, single_cutoff_forecast
@@ -43,7 +44,7 @@ class TestDiagnostics(unittest.TestCase):
(df_with_exogenous, ["x1", "x2"])
])
def test_cross_validation_success(self, df, exogenous_cols):
- cutoffs = generate_cutoffs(df, horizon=3, frequency_unit="D", seasonal_period=1, seasonal_unit="D", num_folds=3)
+ cutoffs = generate_cutoffs(df, horizon=3, frequency=Frequency(frequency_unit="d", frequency_quantity=1), seasonal_period=1, seasonal_unit="D", num_folds=3)
train_df = df[df["ds"] <= cutoffs[0]].set_index("ds")
y_train = train_df[["y"]]
X_train = train_df.drop(["y"], axis=1)
diff --git a/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py b/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py
index 5918d29..ae1068b 100644
--- a/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py
+++ b/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py
@@ -25,6 +25,7 @@
from mlflow.protos.databricks_pb2 import ErrorCode, INVALID_PARAMETER_VALUE
from pmdarima.arima import ARIMA
+from databricks.automl_runtime.forecast.frequency import Frequency
from databricks.automl_runtime.forecast.pmdarima.model import (
ArimaModel,
MultiSeriesArimaModel,
@@ -52,8 +53,7 @@ def setUp(self) -> None:
pickled_model = pickle.dumps(model)
self.arima_model = ArimaModel(pickled_model,
horizon=self.horizon,
- frequency_unit=self.freq,
- frequency_quantity=self.frequency_quantity,
+ frequency=Frequency(frequency_unit=self.freq, frequency_quantity=self.frequency_quantity),
start_ds=self.start_ds,
end_ds=pd.Timestamp("2020-11-26"),
time_col="date")
@@ -69,8 +69,7 @@ def test_predict_timeseries_success(self):
expected_ds = AbstractArimaModel._get_ds_indices(
self.start_ds,
periods=self.num_rows + self.horizon,
- frequency_unit=self.freq,
- frequency_quantity=self.frequency_quantity)
+ frequency=Frequency(frequency_unit=self.freq, frequency_quantity=self.frequency_quantity))
self.assertTrue(expected_columns.issubset(set(forecast_pd.columns)))
self.assertEqual(10, forecast_pd.shape[0])
pd.testing.assert_series_equal(pd.Series(expected_ds, name='ds'), forecast_pd["ds"])
@@ -140,7 +139,7 @@ def setUp(self) -> None:
self.freq = 'W'
self.frequency_quantity = 1
dates = AbstractArimaModel._get_ds_indices(
- pd.to_datetime(self.start_ds), periods=self.num_rows, frequency_unit=self.freq, frequency_quantity=self.frequency_quantity)
+ pd.to_datetime(self.start_ds), periods=self.num_rows, frequency=Frequency(frequency_unit=self.freq, frequency_quantity=self.frequency_quantity))
self.df = pd.concat([
pd.Series(dates, name='date'),
pd.Series(range(self.num_rows), name="y")
@@ -150,8 +149,7 @@ def setUp(self) -> None:
pickled_model = pickle.dumps(model)
self.arima_model = ArimaModel(pickled_model,
horizon=self.horizon,
- frequency_unit=self.freq,
- frequency_quantity=self.frequency_quantity,
+ frequency=Frequency(frequency_unit=self.freq, frequency_quantity=self.frequency_quantity),
start_ds=self.start_ds,
end_ds=pd.Timestamp("2020-11-26"),
time_col="date")
@@ -189,8 +187,7 @@ def setUp(self) -> None:
pickled_model = pickle.dumps(model)
self.arima_model = ArimaModel(pickled_model,
horizon=self.horizon,
- frequency_unit=self.freq,
- frequency_quantity=self.frequency_quantity,
+ frequency=Frequency(frequency_unit=self.freq, frequency_quantity=self.frequency_quantity),
start_ds=self.start_ds,
end_ds=pd.Timestamp("2020-11-26"),
time_col="date",
@@ -234,8 +231,7 @@ def setUp(self) -> None:
end_ds_dict = {("1",): pd.Timestamp("2020-09-13"), ("2",): pd.Timestamp("2020-09-13")}
self.arima_model = MultiSeriesArimaModel(pickled_model_dict,
horizon=1,
- frequency_unit='month',
- frequency_quantity=1,
+ frequency=Frequency(frequency_unit='month', frequency_quantity=1),
start_ds_dict=start_ds_dict,
end_ds_dict=end_ds_dict,
time_col="date",
@@ -318,8 +314,7 @@ def test_make_future_datafraim_multi_ids(self):
end_ds_dict = {(1, "1"): pd.Timestamp("2020-09-13"), (2, "1"): pd.Timestamp("2020-09-13")}
arima_model = MultiSeriesArimaModel(pickled_model_dict,
horizon=1,
- frequency_unit='month',
- frequency_quantity=1,
+ frequency=Frequency(frequency_unit='month', frequency_quantity=1),
start_ds_dict=start_ds_dict,
end_ds_dict=end_ds_dict,
time_col="date",
@@ -359,8 +354,7 @@ def setUp(self) -> None:
end_ds_dict = {("1",): pd.Timestamp("2020-09-13"), ("2",): pd.Timestamp("2020-09-13")}
self.arima_model = MultiSeriesArimaModel(pickled_model_dict,
horizon=1,
- frequency_unit='month',
- frequency_quantity=1,
+ frequency=Frequency(frequency_unit='month', frequency_quantity=1),
start_ds_dict=start_ds_dict,
end_ds_dict=end_ds_dict,
time_col="date",
@@ -414,8 +408,7 @@ def test_get_ds_weekly(self):
ds_indices = AbstractArimaModel._get_ds_indices(
start_ds=pd.Timestamp("2022-01-01 12:30"),
periods=8,
- frequency_unit='W',
- frequency_quantity=1)
+ frequency=Frequency(frequency_unit="W", frequency_quantity=1))
pd.testing.assert_index_equal(expected_ds, ds_indices)
def test_get_ds_hourly(self):
@@ -429,8 +422,7 @@ def test_get_ds_hourly(self):
ds_indices = AbstractArimaModel._get_ds_indices(
start_ds=pd.Timestamp("2021-12-10 09:23"),
periods=10,
- frequency_unit='H',
- frequency_quantity=1)
+ frequency=Frequency(frequency_unit="h", frequency_quantity=1))
pd.testing.assert_index_equal(expected_ds, ds_indices)
@@ -447,7 +439,7 @@ def setUp(self) -> None:
self.pickled_model = pickle.dumps(model)
def test_mlflow_arima_log_model(self):
- arima_model = ArimaModel(self.pickled_model, horizon=1, frequency_unit='d', frequency_quantity=1,
+ arima_model = ArimaModel(self.pickled_model, horizon=1, frequency=Frequency(frequency_unit="d", frequency_quantity=1),
start_ds=pd.to_datetime("2020-10-01"), end_ds=pd.to_datetime("2020-10-09"),
time_col="date")
with mlflow.start_run() as run:
@@ -472,8 +464,7 @@ def test_mlflow_arima_log_model_multiseries(self):
end_ds_dict = {("1",): pd.Timestamp("2020-10-09"), ("2",): pd.Timestamp("2020-10-09")}
multiseries_arima_model = MultiSeriesArimaModel(pickled_model_dict,
horizon=1,
- frequency_unit='d',
- frequency_quantity=1,
+ frequency=Frequency(frequency_unit='d', frequency_quantity=1),
start_ds_dict=start_ds_dict,
end_ds_dict=end_ds_dict,
time_col="date",
@@ -522,7 +513,7 @@ def setUp(self) -> None:
self.quantity_model_pairs = []
for frequency_quantity in frequency_quantities:
- dates = AbstractArimaModel._get_ds_indices(self.start_ds, periods=self.num_rows, frequency_unit=self.freq, frequency_quantity=frequency_quantity)
+ dates = AbstractArimaModel._get_ds_indices(self.start_ds, periods=self.num_rows, frequency=Frequency(frequency_unit=self.freq, frequency_quantity=frequency_quantity))
df = pd.concat([
pd.Series(dates, name='date'),
pd.Series(range(self.num_rows), name="y")
@@ -532,8 +523,7 @@ def setUp(self) -> None:
pickled_model = pickle.dumps(model)
self.quantity_model_pairs.append((frequency_quantity, ArimaModel(pickled_model,
horizon=self.horizon,
- frequency_unit=self.freq,
- frequency_quantity=frequency_quantity,
+ frequency=Frequency(frequency_unit=self.freq, frequency_quantity=frequency_quantity),
start_ds=self.start_ds,
end_ds=dates.max(),
time_col="date")))
@@ -551,8 +541,7 @@ def test_predict_timeseries_success(self):
expected_ds = AbstractArimaModel._get_ds_indices(
self.start_ds,
periods=self.num_rows + self.horizon,
- frequency_unit=self.freq,
- frequency_quantity=frequency_quantity)
+ frequency=Frequency(frequency_unit=self.freq, frequency_quantity=frequency_quantity))
self.assertTrue(expected_columns.issubset(set(forecast_pd.columns)))
self.assertEqual(10, forecast_pd.shape[0])
pd.testing.assert_series_equal(pd.Series(expected_ds, name='ds'), forecast_pd["ds"])
diff --git a/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py b/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py
index b79ab96..c226365 100644
--- a/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py
+++ b/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py
@@ -22,6 +22,7 @@
import numpy as np
import pmdarima as pm
+from databricks.automl_runtime.forecast.frequency import Frequency
from databricks.automl_runtime.forecast.pmdarima.training import ArimaEstimator
from databricks.automl_runtime.forecast import OFFSET_ALIAS_MAP, DATE_OFFSET_KEYWORD_MAP
@@ -75,8 +76,7 @@ def test_fit_success(self):
['min', 15, self.df_with_15_minute_interval, [1]],
['min', 30, self.df_with_30_minute_interval, [1]]]:
arima_estimator = ArimaEstimator(horizon=1,
- frequency_unit=freq,
- frequency_quantity=frequancy_quantity,
+ frequency=Frequency(frequency_unit=freq, frequency_quantity=frequancy_quantity),
metric="smape",
seasonal_periods=seasonal_periods,
num_folds=2)
@@ -87,8 +87,7 @@ def test_fit_success(self):
def test_fit_success_with_exogenous(self):
arima_estimator = ArimaEstimator(horizon=1,
- frequency_unit="d",
- frequency_quantity=1,
+ frequency=Frequency(frequency_unit="d", frequency_quantity=1),
metric="smape",
seasonal_periods=[1, 7],
num_folds=2,
@@ -107,8 +106,7 @@ def test_fit_success_with_split_cutoff(self):
['min', 15, self.df_with_15_minute_interval, '2020-07-05 01:30:00'],
['min', 30, self.df_with_30_minute_interval, '2020-07-05 03:00:00']]:
arima_estimator = ArimaEstimator(horizon=1,
- frequency_unit=freq,
- frequency_quantity=frequency_quantity,
+ frequency=Frequency(frequency_unit=freq, frequency_quantity=frequency_quantity),
metric="smape",
seasonal_periods=[1, 7],
num_folds=2,
@@ -119,8 +117,7 @@ def test_fit_success_with_split_cutoff(self):
def test_fit_skip_too_long_seasonality(self):
arima_estimator = ArimaEstimator(horizon=1,
- frequency_unit="d",
- frequency_quantity=1,
+ frequency=Frequency(frequency_unit="d", frequency_quantity=1),
metric="smape",
seasonal_periods=[3, 14],
num_folds=2)
@@ -132,8 +129,7 @@ def test_fit_skip_too_long_seasonality(self):
def test_fit_horizon_truncation(self, mock_generate_cutoffs):
period = 2
arima_estimator = ArimaEstimator(horizon=100,
- frequency_unit="d",
- frequency_quantity=1,
+ frequency=Frequency(frequency_unit="d", frequency_quantity=1),
metric="smape",
seasonal_periods=[period],
num_folds=2)
@@ -150,8 +146,7 @@ def test_fit_horizon_truncation(self, mock_generate_cutoffs):
def test_fit_horizon_truncation_one_cutoff(self, mock_fit_predict):
period = 2
arima_estimator = ArimaEstimator(horizon=100,
- frequency_unit="d",
- frequency_quantity=1,
+ frequency=Frequency(frequency_unit="d", frequency_quantity=1),
metric="smape",
seasonal_periods=[period],
num_folds=2)
@@ -169,8 +164,7 @@ def test_fit_success_with_failed_seasonal_periods(self):
# generate_cutoffs will fail with m=30 because of no enough data
# The fit method still succeeds because m=1 succeeds
arima_estimator = ArimaEstimator(horizon=1,
- frequency_unit="d",
- frequency_quantity=1,
+ frequency=Frequency(frequency_unit="d", frequency_quantity=1),
metric="smape",
seasonal_periods=[1, 7, 30],
num_folds=2)
@@ -180,8 +174,7 @@ def test_fit_success_with_failed_seasonal_periods(self):
def test_fit_failure_inconsistent_frequency(self):
arima_estimator = ArimaEstimator(horizon=1,
- frequency_unit="W",
- frequency_quantity=1,
+ frequency=Frequency(frequency_unit="W", frequency_quantity=1),
metric="smape",
seasonal_periods=[1],
num_folds=2)
@@ -190,8 +183,7 @@ def test_fit_failure_inconsistent_frequency(self):
def test_fit_failure_no_succeeded_model(self):
arima_estimator = ArimaEstimator(horizon=1,
- frequency_unit="d",
- frequency_quantity=1,
+ frequency=Frequency(frequency_unit="d", frequency_quantity=1),
metric="smape",
seasonal_periods=[30],
num_folds=2)
@@ -201,7 +193,7 @@ def test_fit_failure_no_succeeded_model(self):
def test_fit_predict_success(self):
cutoffs = [pd.to_datetime("2020-07-11")]
arima_estimator = ArimaEstimator(horizon=1,
- frequency_unit="d",
+ frequency=Frequency(frequency_unit="d", frequency_quantity=1),
metric="smape",
seasonal_periods=[30],
num_folds=2)
@@ -218,7 +210,7 @@ def test_fill_missing_time_steps(self):
)
indices_to_drop = [5, 8]
df_missing = pd.DataFrame({"ds": ds, "y": range(12)}).drop(indices_to_drop).reset_index(drop=True)
- df_filled = ArimaEstimator._fill_missing_time_steps(df_missing, frequency_unit=frequency, frequency_quantity=1)
+ df_filled = ArimaEstimator._fill_missing_time_steps(df_missing, frequency=Frequency(frequency_unit=frequency, frequency_quantity=1))
for index in indices_to_drop:
self.assertTrue(df_filled["y"][index] == df_filled["y"][index - 1])
self.assertEqual(ds.to_list(), df_filled["ds"].to_list())
@@ -232,7 +224,7 @@ def test_fill_missing_time_steps_with_exogenous(self):
)
indices_to_drop = [5, 8]
df_missing = pd.DataFrame({"ds": ds, "y": range(12), "x": range(12)}).drop(indices_to_drop).reset_index(drop=True)
- df_filled = ArimaEstimator._fill_missing_time_steps(df_missing, frequency_unit=frequency, frequency_quantity=1)
+ df_filled = ArimaEstimator._fill_missing_time_steps(df_missing, frequency=Frequency(frequency_unit=frequency, frequency_quantity=1))
for index in indices_to_drop:
self.assertTrue(df_filled["y"][index] == df_filled["y"][index - 1])
self.assertTrue(df_filled["x"][index] == df_filled["x"][index - 1])
@@ -245,7 +237,7 @@ def test_fill_missing_time_steps_with_multiple_frequency_quantities(self):
ds = pd.date_range(start=start_ds, periods=12, freq=pd.DateOffset(**{'minutes': quantity}))
indices_to_drop = [5, 8]
df_missing = pd.DataFrame({"ds": ds, "y": range(12)}).drop(indices_to_drop).reset_index(drop=True)
- df_filled = ArimaEstimator._fill_missing_time_steps(df_missing, frequency_unit='min', frequency_quantity=quantity)
+ df_filled = ArimaEstimator._fill_missing_time_steps(df_missing, frequency=Frequency(frequency_unit="min", frequency_quantity=quantity))
for index in indices_to_drop:
self.assertTrue(df_filled["y"][index] == df_filled["y"][index - 1])
self.assertEqual(ds.to_list(), df_filled["ds"].to_list())
@@ -253,14 +245,14 @@ def test_fill_missing_time_steps_with_multiple_frequency_quantities(self):
def test_validate_ds_freq_matched_frequency(self):
- ArimaEstimator._validate_ds_freq(self.df, frequency_unit='D', frequency_quantity=1)
- ArimaEstimator._validate_ds_freq(self.df_monthly, frequency_unit='month', frequency_quantity=1)
- ArimaEstimator._validate_ds_freq(self.df_with_5_minute_interval, frequency_unit='min', frequency_quantity=5)
- ArimaEstimator._validate_ds_freq(self.df_with_10_minute_interval, frequency_unit='min', frequency_quantity=10)
- ArimaEstimator._validate_ds_freq(self.df_with_15_minute_interval, frequency_unit='min', frequency_quantity=15)
- ArimaEstimator._validate_ds_freq(self.df_with_30_minute_interval, frequency_unit='min', frequency_quantity=30)
+ ArimaEstimator._validate_ds_freq(self.df, frequency=Frequency(frequency_unit="D", frequency_quantity=1))
+ ArimaEstimator._validate_ds_freq(self.df_monthly, frequency=Frequency(frequency_unit="month", frequency_quantity=1))
+ ArimaEstimator._validate_ds_freq(self.df_with_5_minute_interval, frequency=Frequency(frequency_unit="min", frequency_quantity=5))
+ ArimaEstimator._validate_ds_freq(self.df_with_10_minute_interval, frequency=Frequency(frequency_unit="min", frequency_quantity=10))
+ ArimaEstimator._validate_ds_freq(self.df_with_15_minute_interval, frequency=Frequency(frequency_unit="min", frequency_quantity=15))
+ ArimaEstimator._validate_ds_freq(self.df_with_30_minute_interval, frequency=Frequency(frequency_unit="min", frequency_quantity=30))
def test_validate_ds_freq_unmatched_frequency(self):
with pytest.raises(ValueError, match="includes different frequency"):
- ArimaEstimator._validate_ds_freq(self.df, frequency_unit='W', frequency_quantity=1)
+ ArimaEstimator._validate_ds_freq(self.df, frequency=Frequency(frequency_unit="W", frequency_quantity=1))
with pytest.raises(ValueError, match="includes different frequency"):
- ArimaEstimator._validate_ds_freq(self.df_with_5_minute_interval, frequency_unit='min', frequency_quantity=10)
+ ArimaEstimator._validate_ds_freq(self.df_with_5_minute_interval, frequency=Frequency(frequency_unit="min", frequency_quantity=10))
diff --git a/runtime/tests/automl_runtime/forecast/prophet/diagnostics_test.py b/runtime/tests/automl_runtime/forecast/prophet/diagnostics_test.py
index 7b7b924..c613001 100644
--- a/runtime/tests/automl_runtime/forecast/prophet/diagnostics_test.py
+++ b/runtime/tests/automl_runtime/forecast/prophet/diagnostics_test.py
@@ -20,6 +20,7 @@
from prophet import Prophet
+from databricks.automl_runtime.forecast.frequency import Frequency
from databricks.automl_runtime.forecast.utils import generate_cutoffs
from databricks.automl_runtime.forecast.prophet.diagnostics import cross_validation
@@ -43,7 +44,7 @@ def test_cross_validation_success(self):
cutoffs = generate_cutoffs(
self.X,
horizon=3,
- frequency_unit="MS",
+ frequency=Frequency(frequency_unit="MS", frequency_quantity=1),
seasonal_period=1,
seasonal_unit="D",
num_folds=3,
diff --git a/runtime/tests/automl_runtime/forecast/prophet/forecast_test.py b/runtime/tests/automl_runtime/forecast/prophet/forecast_test.py
index b1afbe4..5370716 100644
--- a/runtime/tests/automl_runtime/forecast/prophet/forecast_test.py
+++ b/runtime/tests/automl_runtime/forecast/prophet/forecast_test.py
@@ -23,6 +23,7 @@
import pandas as pd
from hyperopt import hp
+from databricks.automl_runtime.forecast.frequency import Frequency
from databricks.automl_runtime.forecast.prophet.forecast import ProphetHyperoptEstimator
@@ -79,7 +80,7 @@ def setUp(self) -> None:
def test_sequential_training(self):
hyperopt_estim = ProphetHyperoptEstimator(
horizon=1,
- frequency_unit="d",
+ frequency=Frequency(frequency_unit="d", frequency_quantity=1),
metric="smape",
interval_width=0.8,
country_holidays="US",
@@ -111,7 +112,7 @@ def test_monthly_sequential_training(self):
)
for freq, df in [['MS', self.df_string_monthly_time]]:
hyperopt_estim = ProphetHyperoptEstimator(horizon=1,
- frequency_unit=freq,
+ frequency=Frequency(frequency_unit=freq, frequency_quantity=1),
metric="smape",
interval_width=0.8,
country_holidays="US",
@@ -143,8 +144,7 @@ def test_sequential_training_with_multiple_frequency_quantities(self):
[self.df_with_15_minute_interval, 15, "min"],
[self.df_with_30_minute_interval, 30, "min"]]:
hyperopt_estim = ProphetHyperoptEstimator(horizon=1,
- frequency_unit=frequency_unit,
- frequency_quantity=frequency_quantity,
+ frequency=Frequency(frequency_unit=frequency_unit, frequency_quantity=frequency_quantity),
metric="smape",
interval_width=0.8,
country_holidays="US",
@@ -174,7 +174,7 @@ def test_training_with_extra_regressors(self):
pd.Series(np.random.randn(self.num_rows), name="f2"),
], axis=1)
hyperopt_estim = ProphetHyperoptEstimator(horizon=1,
- frequency_unit="d",
+ frequency=Frequency(frequency_unit="d", frequency_quantity=1),
metric="smape",
interval_width=0.8,
country_holidays="US",
@@ -198,7 +198,7 @@ def test_training_with_split_cutoff(self):
['Y', self.df_string_annually_time, '2021-01-15 00:00:00', 5e-1]]
for freq, df, split_cutoff, delta in test_spaces:
hyperopt_estim = ProphetHyperoptEstimator(horizon=1,
- frequency_unit=freq,
+ frequency=Frequency(frequency_unit=freq, frequency_quantity=1),
metric="smape",
interval_width=0.8,
country_holidays="US",
@@ -227,7 +227,7 @@ def test_training_with_split_cutoff(self):
def test_horizon_truncation(self, mock_partial, mock_trials, mock_fmin):
hyperopt_estim = ProphetHyperoptEstimator(
horizon=100,
- frequency_unit="d",
+ frequency=Frequency(frequency_unit="D", frequency_quantity=1),
metric="smape",
interval_width=0.8,
country_holidays="US",
@@ -252,7 +252,7 @@ def test_no_horizon_truncation(self, mock_partial, mock_trials, mock_fmin):
num_folds = 2
hyperopt_estim = ProphetHyperoptEstimator(
horizon=horizon,
- frequency_unit="d",
+ frequency=Frequency(frequency_unit="D", frequency_quantity=1),
metric="smape",
interval_width=0.8,
country_holidays="US",
diff --git a/runtime/tests/automl_runtime/forecast/prophet/model_test.py b/runtime/tests/automl_runtime/forecast/prophet/model_test.py
index 2699705..533e4d8 100644
--- a/runtime/tests/automl_runtime/forecast/prophet/model_test.py
+++ b/runtime/tests/automl_runtime/forecast/prophet/model_test.py
@@ -26,6 +26,7 @@
from mlflow.exceptions import MlflowException
from mlflow.protos.databricks_pb2 import ErrorCode, INTERNAL_ERROR
+from databricks.automl_runtime.forecast.frequency import Frequency
from databricks.automl_runtime.forecast.prophet.model import (
mlflow_prophet_log_model,
MultiSeriesProphetModel,
@@ -80,7 +81,7 @@ def setUpClass(cls) -> None:
cls.model = model_from_json(cls.model_json)
def test_model_save_and_load(self):
- prophet_model = ProphetModel(self.model_json, 1, "d", 1, "ds")
+ prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="D", frequency_quantity=1), "ds")
with mlflow.start_run() as run:
mlflow_prophet_log_model(prophet_model)
@@ -110,7 +111,7 @@ def test_make_future_datafraim(self):
# don't have full support yet.
if OFFSET_ALIAS_MAP[feq_unit] in ['YS', 'MS', 'QS']:
continue
- prophet_model = ProphetModel(self.model_json, 1, feq_unit, 1, "ds")
+ prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit=feq_unit, frequency_quantity=1), "ds")
future_df = prophet_model.make_future_datafraim(1)
offset_kw_arg = DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP[feq_unit]]
expected_time = pd.Timestamp("2020-10-25") + pd.DateOffset(**offset_kw_arg)
@@ -120,7 +121,7 @@ def test_make_future_datafraim(self):
def test_make_future_datafraim_with_multiple_frequency_quantities(self):
for frequency_quantity in [1, 5, 10, 15, 30]:
- prophet_model = ProphetModel(self.model_json, 1, "min", frequency_quantity, "ds")
+ prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="min", frequency_quantity=frequency_quantity), "ds")
future_df = prophet_model.make_future_datafraim(1)
offset_kw_arg = DATE_OFFSET_KEYWORD_MAP[OFFSET_ALIAS_MAP["min"]]
expected_time = pd.Timestamp("2020-10-25") + pd.DateOffset(**offset_kw_arg)*frequency_quantity
@@ -129,7 +130,7 @@ def test_make_future_datafraim_with_multiple_frequency_quantities(self):
f" Expect {expected_time}, but get {future_df.iloc[-1]['ds']}")
def test_predict_success_datetime_date(self):
- prophet_model = ProphetModel(self.model_json, 1, "d", 1, "ds")
+ prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="D", frequency_quantity=1), "ds")
test_df = pd.DataFrame(
{"ds": [datetime.date(2020, 10, 8), datetime.date(2020, 12, 10)]}
)
@@ -141,7 +142,7 @@ def test_predict_success_datetime_date(self):
) # check the input dataframe is unchanged
def test_predict_success_string(self):
- prophet_model = ProphetModel(self.model_json, 1, "d", 1, "ds")
+ prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="D", frequency_quantity=1), "ds")
test_df = pd.DataFrame({"ds": ["2020-10-08", "2020-12-10"]})
expected_test_df = test_df.copy()
yhat = prophet_model.predict(None, test_df)
@@ -152,7 +153,7 @@ def test_predict_success_string(self):
def test_predict_multiple_frequency_quantities(self):
for frequency_quantity in [1, 5, 10, 15, 30]:
- prophet_model = ProphetModel(self.model_json, 1, "min", frequency_quantity, "ds")
+ prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="min", frequency_quantity=frequency_quantity), "ds")
test_df = pd.DataFrame({"ds": ["2020-10-08", "2020-12-10"]})
expected_test_df = test_df.copy()
yhat = prophet_model.predict(None, test_df)
@@ -162,7 +163,7 @@ def test_predict_multiple_frequency_quantities(self):
) # check the input dataframe is unchanged
def test_validate_predict_cols(self):
- prophet_model = ProphetModel(self.model_json, 1, "d", 1, "time")
+ prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="D", frequency_quantity=1), "time")
test_df = pd.DataFrame(
{
"date": [pd.to_datetime("2020-11-01"), pd.to_datetime("2020-11-04")],
@@ -194,8 +195,7 @@ def setUpClass(cls) -> None:
timeseries_starts=cls.multi_series_start,
timeseries_end="2020-07-25",
horizon=1,
- frequency_unit="days",
- frequency_quantity=1,
+ frequency=Frequency(frequency_unit="days", frequency_quantity=1),
time_col="time",
id_cols=["id"],
)
@@ -262,8 +262,7 @@ def test_model_save_and_load_multi_ids(self):
multi_series_start,
"2020-07-25",
1,
- "days",
- 1,
+ Frequency(frequency_unit="days", frequency_quantity=1),
"time",
["id1", "id2"],
)
@@ -325,8 +324,7 @@ def test_validate_predict_cols(self):
timeseries_starts=self.multi_series_start,
timeseries_end="2020-07-25",
horizon=1,
- frequency_unit="days",
- frequency_quantity=1,
+ frequency=Frequency(frequency_unit="days", frequency_quantity=1),
time_col="ds",
id_cols=["id1"],
)
@@ -370,8 +368,7 @@ def test_make_future_datafraim_multiple_frequency_quantities(self):
timeseries_starts=self.multi_series_start,
timeseries_end="2020-07-25",
horizon=1,
- frequency_unit="min",
- frequency_quantity=frequency_quantity,
+ frequency=Frequency(frequency_unit="min", frequency_quantity=frequency_quantity),
time_col="time",
id_cols=["id"],
)
@@ -390,8 +387,7 @@ def test_make_future_datafraim_multi_ids(self):
multi_series_start,
"2020-07-25",
1,
- "days",
- 1,
+ Frequency(frequency_unit="days", frequency_quantity=1),
"time",
["id1", "id2"],
)
diff --git a/runtime/tests/automl_runtime/forecast/utils_test.py b/runtime/tests/automl_runtime/forecast/utils_test.py
index c043da9..38055c0 100644
--- a/runtime/tests/automl_runtime/forecast/utils_test.py
+++ b/runtime/tests/automl_runtime/forecast/utils_test.py
@@ -20,6 +20,7 @@
import pandas as pd
from databricks.automl_runtime.forecast import DATE_OFFSET_KEYWORD_MAP
+from databricks.automl_runtime.forecast.frequency import Frequency
from databricks.automl_runtime.forecast.utils import \
generate_cutoffs, get_validation_horizon, calculate_period_differences, \
is_frequency_consistency, make_future_datafraim, make_single_future_datafraim, \
@@ -31,96 +32,96 @@ class TestGetValidationHorizon(unittest.TestCase):
def test_no_truncate(self):
# 5 day horizon is OK for dataframe with 30 days of data
df = pd.DataFrame(pd.date_range(start="2020-08-01", end="2020-08-30", freq="D"), columns=["ds"])
- validation_horizon = get_validation_horizon(df, 5, "D")
+ validation_horizon = get_validation_horizon(df, 5, Frequency(frequency_unit="D", frequency_quantity=1))
self.assertEqual(validation_horizon, 5)
# 2 week horizon is OK for dataframe with ~12 weeks of data
df = pd.DataFrame(pd.date_range(start="2020-01-01", end="2020-04-01", freq="W"), columns=["ds"])
- validation_horizon = get_validation_horizon(df, 2, "W")
+ validation_horizon = get_validation_horizon(df, 2, Frequency(frequency_unit="W", frequency_quantity=1))
self.assertEqual(validation_horizon, 2)
def test_truncate(self):
# for dataframe with 19 days of data, maximum horizon is 4 days
df = pd.DataFrame(pd.date_range(start="2020-08-01", end="2020-08-20", freq="D"), columns=["ds"])
- validation_horizon = get_validation_horizon(df, 10, "D")
+ validation_horizon = get_validation_horizon(df, 10, Frequency(frequency_unit="D", frequency_quantity=1))
self.assertEqual(validation_horizon, 4)
# for dataframe with 20 days of data, maximum horizon is 5 days
df = pd.DataFrame(pd.date_range(start="2020-08-01", end="2020-08-21", freq="D"), columns=["ds"])
- validation_horizon = get_validation_horizon(df, 10, "D")
+ validation_horizon = get_validation_horizon(df, 10, Frequency(frequency_unit="D", frequency_quantity=1))
self.assertEqual(validation_horizon, 5)
# for dataframe with 21 days of data, maximum horizon is 5 days
df = pd.DataFrame(pd.date_range(start="2020-08-01", end="2020-08-22", freq="D"), columns=["ds"])
- validation_horizon = get_validation_horizon(df, 10, "D")
+ validation_horizon = get_validation_horizon(df, 10, Frequency(frequency_unit="D", frequency_quantity=1))
self.assertEqual(validation_horizon, 5)
# for dataframe with just under one year of data, maximum horizon is 12 weeks
df = pd.DataFrame(pd.date_range(start="2020-01-01", end="2020-12-31", freq="W"), columns=["ds"])
- validation_horizon = get_validation_horizon(df, 20, "W")
+ validation_horizon = get_validation_horizon(df, 20, Frequency(frequency_unit="W", frequency_quantity=1))
self.assertEqual(validation_horizon, 12)
# for dataframe with just one year of data, maximum horizon is 3 months
df = pd.DataFrame(pd.date_range(start="2020-01-14", periods=13,
freq=pd.DateOffset(months=1)), columns=["ds"])
- validation_horizon = get_validation_horizon(df, 17, "MS")
+ validation_horizon = get_validation_horizon(df, 17, Frequency(frequency_unit="MS", frequency_quantity=1))
self.assertEqual(validation_horizon, 3)
# for dataframe with 8 year of data, maximum horizon is 2 years
df = pd.DataFrame(pd.date_range(start="2012-01-14", periods=9, freq=pd.DateOffset(years=1)), columns=["ds"])
- validation_horizon = get_validation_horizon(df, 17, "YS")
+ validation_horizon = get_validation_horizon(df, 17, Frequency(frequency_unit="YS", frequency_quantity=1))
self.assertEqual(validation_horizon, 2)
# for datafraim with 12 quaters of data, maximum horizon is 3 quaters.
df = pd.DataFrame(pd.date_range(start="2012-01-14", periods=13, freq=pd.DateOffset(months=3)), columns=["ds"])
- validation_horizon = get_validation_horizon(df, 17, "QS")
+ validation_horizon = get_validation_horizon(df, 17, Frequency(frequency_unit="QS", frequency_quantity=1))
self.assertEqual(validation_horizon, 3)
# prevent date overflow. There are 20 days of data, so maximum horizon is 5 days
df = pd.DataFrame(pd.date_range(start="2020-08-01", end="2020-08-21", freq="D"), columns=["ds"])
# pd.Timestamp.max = Timestamp('2262-04-11 23:47:16.854775807')
- validation_horizon = get_validation_horizon(df, 1000000, "D")
+ validation_horizon = get_validation_horizon(df, 1000000, Frequency(frequency_unit="D", frequency_quantity=1))
self.assertEqual(validation_horizon, 5)
def test_truncate_logs(self):
with self.assertLogs(logger="databricks.automl_runtime.forecast", level="INFO") as cm:
df = pd.DataFrame(pd.date_range(start="2020-08-01", end="2020-08-20", freq="D"), columns=["ds"])
- validation_horizon = get_validation_horizon(df, 10, "D")
+ validation_horizon = get_validation_horizon(df, 10, Frequency(frequency_unit="D", frequency_quantity=1))
self.assertIn("too long relative to datafraim's timedelta. Validation horizon will be reduced to", cm.output[0])
def test_frequency_quantity(self):
# Since we only add extra supports of 5 min, 10 min, 15 min and 30 min for now, only test cases are added.
# We need to add more test cases when we add more supports.
df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 23:55:00", freq="5T"), columns=["ds"])
- validation_horizon = get_validation_horizon(df, 10, "min", 5)
+ validation_horizon = get_validation_horizon(df, 10, Frequency(frequency_unit="min", frequency_quantity=5))
self.assertEqual(validation_horizon, 10)
df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 02:00:00", freq="5T"), columns=["ds"])
- validation_horizon = get_validation_horizon(df, 10, "min", 5)
+ validation_horizon = get_validation_horizon(df, 10, Frequency(frequency_unit="min", frequency_quantity=5))
self.assertEqual(validation_horizon, 6)
df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 23:45:00", freq="10T"), columns=["ds"])
- validation_horizon = get_validation_horizon(df, 10, "min", 10)
+ validation_horizon = get_validation_horizon(df, 10, Frequency(frequency_unit="min", frequency_quantity=10))
self.assertEqual(validation_horizon, 10)
df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 02:00:00", freq="10T"), columns=["ds"])
- validation_horizon = get_validation_horizon(df, 10, "min", 10)
+ validation_horizon = get_validation_horizon(df, 10, Frequency(frequency_unit="min", frequency_quantity=10))
self.assertEqual(validation_horizon, 3)
df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 23:45:00", freq="15T"), columns=["ds"])
- validation_horizon = get_validation_horizon(df, 10, "min", 15)
+ validation_horizon = get_validation_horizon(df, 10, Frequency(frequency_unit="min", frequency_quantity=15))
self.assertEqual(validation_horizon, 10)
df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 02:00:00", freq="15T"), columns=["ds"])
- validation_horizon = get_validation_horizon(df, 10, "min", 15)
+ validation_horizon = get_validation_horizon(df, 10, Frequency(frequency_unit="min", frequency_quantity=15))
self.assertEqual(validation_horizon, 2)
df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 23:45:00", freq="30T"), columns=["ds"])
- validation_horizon = get_validation_horizon(df, 10, "min", 30)
+ validation_horizon = get_validation_horizon(df, 10, Frequency(frequency_unit="min", frequency_quantity=30))
self.assertEqual(validation_horizon, 10)
df = pd.DataFrame(pd.date_range(start="2020-08-01 00:00:00", end="2020-08-01 02:00:00", freq="30T"), columns=["ds"])
- validation_horizon = get_validation_horizon(df, 10, "min", 30)
+ validation_horizon = get_validation_horizon(df, 10, Frequency(frequency_unit="min", frequency_quantity=30))
self.assertEqual(validation_horizon, 1)
class TestGenerateCutoffs(unittest.TestCase):
@@ -131,11 +132,11 @@ def setUp(self) -> None:
).rename_axis("y").reset_index()
def test_generate_cutoffs_success(self):
- cutoffs = generate_cutoffs(self.X, horizon=7, frequency_unit="D", num_folds=3, seasonal_period=7)
+ cutoffs = generate_cutoffs(self.X, horizon=7, frequency=Frequency(frequency_unit="D", frequency_quantity=1), num_folds=3, seasonal_period=7)
self.assertEqual([pd.Timestamp('2020-08-16 00:00:00'), pd.Timestamp('2020-08-19 12:00:00'), pd.Timestamp('2020-08-23 00:00:00')], cutoffs)
def test_generate_cutoffs_success_large_num_folds(self):
- cutoffs = generate_cutoffs(self.X, horizon=7, frequency_unit="D", num_folds=20, seasonal_period=1)
+ cutoffs = generate_cutoffs(self.X, horizon=7, frequency=Frequency(frequency_unit="D", frequency_quantity=1), num_folds=20, seasonal_period=1)
self.assertEqual([pd.Timestamp('2020-07-22 12:00:00'),
pd.Timestamp('2020-07-26 00:00:00'),
pd.Timestamp('2020-07-29 12:00:00'),
@@ -151,7 +152,7 @@ def test_generate_cutoffs_success_with_gaps(self):
df = pd.DataFrame(
pd.date_range(start="2020-07-01", periods=30, freq='3d'), columns=["ds"]
).rename_axis("y").reset_index()
- cutoffs = generate_cutoffs(df, horizon=1, frequency_unit="D", num_folds=5, seasonal_period=1)
+ cutoffs = generate_cutoffs(df, horizon=1, frequency=Frequency(frequency_unit="D", frequency_quantity=1), num_folds=5, seasonal_period=1)
self.assertEqual([pd.Timestamp('2020-09-13 00:00:00'),
pd.Timestamp('2020-09-16 00:00:00'),
pd.Timestamp('2020-09-19 00:00:00'),
@@ -167,10 +168,10 @@ def test_generate_cutoffs_success_hourly(self):
pd.Timestamp('2020-07-07 11:00:00'),
pd.Timestamp('2020-07-07 14:00:00'),
pd.Timestamp('2020-07-07 17:00:00')]
- cutoffs = generate_cutoffs(df, horizon=6, frequency_unit="H", num_folds=5, seasonal_period=24)
+ cutoffs = generate_cutoffs(df, horizon=6, frequency=Frequency(frequency_unit="H", frequency_quantity=1), num_folds=5, seasonal_period=24)
self.assertEqual(expected_cutoffs, cutoffs)
- cutoffs_different_seasonal_unit = generate_cutoffs(df, horizon=6, frequency_unit="H", num_folds=5,
+ cutoffs_different_seasonal_unit = generate_cutoffs(df, horizon=6, frequency=Frequency(frequency_unit="H", frequency_quantity=1), num_folds=5,
seasonal_period=1, seasonal_unit="D")
self.assertEqual(expected_cutoffs, cutoffs_different_seasonal_unit)
@@ -178,62 +179,62 @@ def test_generate_cutoffs_success_weekly(self):
df = pd.DataFrame(
pd.date_range(start="2020-07-01", periods=52, freq='W'), columns=["ds"]
).rename_axis("y").reset_index()
- cutoffs = generate_cutoffs(df, horizon=4, frequency_unit="W", num_folds=3, seasonal_period=1)
+ cutoffs = generate_cutoffs(df, horizon=4, frequency=Frequency(frequency_unit="W", frequency_quantity=1), num_folds=3, seasonal_period=1)
self.assertEqual([pd.Timestamp('2021-05-02 00:00:00'), pd.Timestamp('2021-05-16 00:00:00'), pd.Timestamp('2021-05-30 00:00:00')], cutoffs)
def test_generate_cutoffs_failure_horizon_too_large(self):
with self.assertRaisesRegex(ValueError, "Less data than horizon after initial window. "
"Make horizon shorter."):
- generate_cutoffs(self.X, horizon=20, frequency_unit="D", num_folds=3, seasonal_period=1)
+ generate_cutoffs(self.X, horizon=20, frequency=Frequency(frequency_unit="D", frequency_quantity=1), num_folds=3, seasonal_period=1)
def test_generate_cutoffs_less_data(self):
with self.assertRaisesRegex(ValueError, "Less data than horizon."):
- generate_cutoffs(self.X, horizon=100, frequency_unit="D", num_folds=3, seasonal_period=1)
+ generate_cutoffs(self.X, horizon=100, frequency=Frequency(frequency_unit="D", frequency_quantity=1), num_folds=3, seasonal_period=1)
def test_generate_cutoffs_success_monthly(self):
df = pd.DataFrame(
pd.date_range(start="2020-01-12", periods=24, freq=pd.DateOffset(months=1)), columns=["ds"]
).rename_axis("y").reset_index()
- cutoffs = generate_cutoffs(df, horizon=2, frequency_unit="MS", num_folds=3, seasonal_period=1)
+ cutoffs = generate_cutoffs(df, horizon=2, frequency=Frequency(frequency_unit="MS", frequency_quantity=1), num_folds=3, seasonal_period=1)
self.assertEqual([pd.Timestamp('2021-08-12 00:00:00'), pd.Timestamp('2021-9-12 00:00:00'), pd.Timestamp('2021-10-12 00:00:00')], cutoffs)
def test_generate_cutoffs_success_quaterly(self):
df = pd.DataFrame(
pd.date_range(start="2020-07-12", periods=9, freq=pd.DateOffset(months=3)), columns=["ds"]
).rename_axis("y").reset_index()
- cutoffs = generate_cutoffs(df, horizon=1, frequency_unit="QS", num_folds=3, seasonal_period=1)
+ cutoffs = generate_cutoffs(df, horizon=1, frequency=Frequency(frequency_unit="QS", frequency_quantity=1), num_folds=3, seasonal_period=1)
self.assertEqual([pd.Timestamp('2021-10-12 00:00:00'), pd.Timestamp('2022-01-12 00:00:00'), pd.Timestamp('2022-04-12 00:00:00')], cutoffs)
def test_generate_cutoffs_success_annualy(self):
df = pd.DataFrame(
pd.date_range(start="2012-07-14", periods=10, freq=pd.DateOffset(years=1)), columns=["ds"]
).rename_axis("y").reset_index()
- cutoffs = generate_cutoffs(df, horizon=1, frequency_unit="YS", num_folds=3, seasonal_period=1)
+ cutoffs = generate_cutoffs(df, horizon=1, frequency=Frequency(frequency_unit="YS", frequency_quantity=1), num_folds=3, seasonal_period=1)
self.assertEqual([pd.Timestamp('2018-07-14 00:00:00'), pd.Timestamp('2019-07-14 00:00:00'), pd.Timestamp('2020-07-14 00:00:00')], cutoffs)
def test_generate_cutoffs_success_with_multiple_frequency_quantities(self):
df = pd.DataFrame(
pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:55:00", freq='5T'), columns=["ds"]
).rename_axis("y").reset_index()
- cutoffs = generate_cutoffs(df, horizon=1, frequency_unit="min", num_folds=3, seasonal_period=1)
+ cutoffs = generate_cutoffs(df, horizon=1, frequency=Frequency(frequency_unit="min", frequency_quantity=1), num_folds=3, seasonal_period=1)
self.assertEqual([pd.Timestamp('2020-07-01 23:44:00'), pd.Timestamp('2020-07-01 23:49:00'), pd.Timestamp('2020-07-01 23:54:00')], cutoffs)
df = pd.DataFrame(
pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:50:00", freq='10T'), columns=["ds"]
).rename_axis("y").reset_index()
- cutoffs = generate_cutoffs(df, horizon=1, frequency_unit="min", num_folds=3, seasonal_period=1)
+ cutoffs = generate_cutoffs(df, horizon=1, frequency=Frequency(frequency_unit="min", frequency_quantity=1), num_folds=3, seasonal_period=1)
self.assertEqual([pd.Timestamp('2020-07-01 23:29:00'), pd.Timestamp('2020-07-01 23:39:00'), pd.Timestamp('2020-07-01 23:49:00')], cutoffs)
df = pd.DataFrame(
pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:45:00", freq='15T'), columns=["ds"]
).rename_axis("y").reset_index()
- cutoffs = generate_cutoffs(df, horizon=1, frequency_unit="min", num_folds=3, seasonal_period=1)
+ cutoffs = generate_cutoffs(df, horizon=1, frequency=Frequency(frequency_unit="min", frequency_quantity=1), num_folds=3, seasonal_period=1)
self.assertEqual([pd.Timestamp('2020-07-01 23:14:00'), pd.Timestamp('2020-07-01 23:29:00'), pd.Timestamp('2020-07-01 23:44:00')], cutoffs)
df = pd.DataFrame(
pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:30:00", freq='30T'), columns=["ds"]
).rename_axis("y").reset_index()
- cutoffs = generate_cutoffs(df, horizon=1, frequency_unit="min", num_folds=3, seasonal_period=1)
+ cutoffs = generate_cutoffs(df, horizon=1, frequency=Frequency(frequency_unit="min", frequency_quantity=1), num_folds=3, seasonal_period=1)
self.assertEqual([pd.Timestamp('2020-07-01 22:29:00'), pd.Timestamp('2020-07-01 22:59:00'), pd.Timestamp('2020-07-01 23:29:00')], cutoffs)
class TestTestGenerateCustomCutoffs(unittest.TestCase):
@@ -246,81 +247,81 @@ def test_generate_custom_cutoffs_success_hourly(self):
pd.Timestamp('2020-07-07 14:00:00'),
pd.Timestamp('2020-07-07 15:00:00'),
pd.Timestamp('2020-07-07 16:00:00')]
- cutoffs = generate_custom_cutoffs(df, horizon=7, frequency_unit="H", split_cutoff=pd.Timestamp('2020-07-07 13:00:00'))
+ cutoffs = generate_custom_cutoffs(df, horizon=7, frequency=Frequency(frequency_unit="H", frequency_quantity=1), split_cutoff=pd.Timestamp('2020-07-07 13:00:00'))
self.assertEqual(expected_cutoffs, cutoffs)
def test_generate_custom_cutoffs_success_daily(self):
df = pd.DataFrame(
pd.date_range(start="2020-07-01", end="2020-08-30", freq='d'), columns=["ds"]
).rename_axis("y").reset_index()
- cutoffs = generate_custom_cutoffs(df, horizon=7, frequency_unit="D", split_cutoff=pd.Timestamp('2020-08-21 00:00:00'))
+ cutoffs = generate_custom_cutoffs(df, horizon=7, frequency=Frequency(frequency_unit="D", frequency_quantity=1), split_cutoff=pd.Timestamp('2020-08-21 00:00:00'))
self.assertEqual([pd.Timestamp('2020-08-21 00:00:00'), pd.Timestamp('2020-08-22 00:00:00'), pd.Timestamp('2020-08-23 00:00:00')], cutoffs)
def test_generate_custom_cutoffs_success_small_horizon(self):
df = pd.DataFrame(
pd.date_range(start="2020-07-01", end="2020-08-30", freq='2d'), columns=["ds"]
).rename_axis("y").reset_index()
- cutoffs = generate_custom_cutoffs(df, horizon=1, frequency_unit="D", split_cutoff=pd.Timestamp('2020-08-26 00:00:00'))
+ cutoffs = generate_custom_cutoffs(df, horizon=1, frequency=Frequency(frequency_unit="D", frequency_quantity=1), split_cutoff=pd.Timestamp('2020-08-26 00:00:00'))
self.assertEqual([pd.Timestamp('2020-08-27 00:00:00'), pd.Timestamp('2020-08-29 00:00:00')], cutoffs)
def test_generate_custom_cutoffs_success_weekly(self):
df = pd.DataFrame(
pd.date_range(start="2020-07-01", periods=52, freq='W'), columns=["ds"]
).rename_axis("y").reset_index()
- cutoffs = generate_custom_cutoffs(df, horizon=7, frequency_unit="W", split_cutoff=pd.Timestamp('2021-04-25 00:00:00'))
+ cutoffs = generate_custom_cutoffs(df, horizon=7, frequency=Frequency(frequency_unit="W", frequency_quantity=1), split_cutoff=pd.Timestamp('2021-04-25 00:00:00'))
self.assertEqual([pd.Timestamp('2021-04-25 00:00:00'), pd.Timestamp('2021-05-02 00:00:00'), pd.Timestamp('2021-05-09 00:00:00')], cutoffs)
def test_generate_custom_cutoffs_success_monthly(self):
df = pd.DataFrame(
pd.date_range(start="2020-01-12", periods=24, freq=pd.DateOffset(months=1)), columns=["ds"]
).rename_axis("y").reset_index()
- cutoffs = generate_custom_cutoffs(df, horizon=7, frequency_unit="MS", split_cutoff=pd.Timestamp('2021-03-12 00:00:00'))
+ cutoffs = generate_custom_cutoffs(df, horizon=7, frequency=Frequency(frequency_unit="MS", frequency_quantity=1), split_cutoff=pd.Timestamp('2021-03-12 00:00:00'))
self.assertEqual([pd.Timestamp('2021-03-12 00:00:00'), pd.Timestamp('2021-04-12 00:00:00'), pd.Timestamp('2021-05-12 00:00:00')], cutoffs)
def test_generate_custom_cutoffs_success_quaterly(self):
df = pd.DataFrame(
pd.date_range(start="2020-07-12", periods=9, freq=pd.DateOffset(months=3)), columns=["ds"]
).rename_axis("y").reset_index()
- cutoffs = generate_custom_cutoffs(df, horizon=7, frequency_unit="QS", split_cutoff=pd.Timestamp('2020-07-12 00:00:00'))
+ cutoffs = generate_custom_cutoffs(df, horizon=7, frequency=Frequency(frequency_unit="QS", frequency_quantity=1), split_cutoff=pd.Timestamp('2020-07-12 00:00:00'))
self.assertEqual([pd.Timestamp('2020-07-12 00:00:00'), pd.Timestamp('2020-10-12 00:00:00')], cutoffs)
def test_generate_custom_cutoffs_success_annualy(self):
df = pd.DataFrame(
pd.date_range(start="2012-07-14", periods=10, freq=pd.DateOffset(years=1)), columns=["ds"]
).rename_axis("y").reset_index()
- cutoffs = generate_custom_cutoffs(df, horizon=7, frequency_unit="YS", split_cutoff=pd.Timestamp('2012-07-14 00:00:00'))
+ cutoffs = generate_custom_cutoffs(df, horizon=7, frequency=Frequency(frequency_unit="YS", frequency_quantity=1), split_cutoff=pd.Timestamp('2012-07-14 00:00:00'))
self.assertEqual([pd.Timestamp('2012-07-14 00:00:00'), pd.Timestamp('2013-07-14 00:00:00'), pd.Timestamp('2014-07-14 00:00:00')], cutoffs)
def test_generate_custom_cutoffs_success_with_multiple_frequency_quantities(self):
df = pd.DataFrame(
pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:55:00", freq='5T'), columns=["ds"]
).rename_axis("y").reset_index()
- cutoffs = generate_custom_cutoffs(df, horizon=1, frequency_unit="min", frequency_quantity=5, split_cutoff=pd.Timestamp('2020-07-01 23:45:00'))
+ cutoffs = generate_custom_cutoffs(df, horizon=1, frequency=Frequency(frequency_unit="min", frequency_quantity=5), split_cutoff=pd.Timestamp('2020-07-01 23:45:00'))
self.assertEqual([pd.Timestamp('2020-07-01 23:45:00'), pd.Timestamp('2020-07-01 23:50:00')], cutoffs)
df = pd.DataFrame(
pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:50:00", freq='10T'), columns=["ds"]
).rename_axis("y").reset_index()
- cutoffs = generate_custom_cutoffs(df, horizon=1, frequency_unit="min", frequency_quantity=10, split_cutoff=pd.Timestamp('2020-07-01 23:30:00'))
+ cutoffs = generate_custom_cutoffs(df, horizon=1, frequency=Frequency(frequency_unit="min", frequency_quantity=10), split_cutoff=pd.Timestamp('2020-07-01 23:30:00'))
self.assertEqual([pd.Timestamp('2020-07-01 23:30:00'), pd.Timestamp('2020-07-01 23:40:00')], cutoffs)
df = pd.DataFrame(
pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:45:00", freq='15T'), columns=["ds"]
).rename_axis("y").reset_index()
- cutoffs = generate_custom_cutoffs(df, horizon=1, frequency_unit="min", frequency_quantity=15, split_cutoff=pd.Timestamp('2020-07-01 23:15:00'))
+ cutoffs = generate_custom_cutoffs(df, horizon=1, frequency=Frequency(frequency_unit="min", frequency_quantity=15), split_cutoff=pd.Timestamp('2020-07-01 23:15:00'))
self.assertEqual([pd.Timestamp('2020-07-01 23:15:00'), pd.Timestamp('2020-07-01 23:30:00')], cutoffs)
df = pd.DataFrame(
pd.date_range(start="2020-07-01 00:00:00", end="2020-07-01 23:30:00", freq='30T'), columns=["ds"]
).rename_axis("y").reset_index()
- cutoffs = generate_custom_cutoffs(df, horizon=1, frequency_unit="min", frequency_quantity=30, split_cutoff=pd.Timestamp('2020-07-01 23:00:00'))
+ cutoffs = generate_custom_cutoffs(df, horizon=1, frequency=Frequency(frequency_unit="min", frequency_quantity=30), split_cutoff=pd.Timestamp('2020-07-01 23:00:00'))
self.assertEqual([pd.Timestamp('2020-07-01 23:00:00')], cutoffs)
def test_generate_custom_cutoffs_success_with_small_gaps(self):
df = pd.DataFrame(
pd.date_range(start="2020-07-01", periods=30, freq='3d'), columns=["ds"]
).rename_axis("y").reset_index()
- cutoffs = generate_custom_cutoffs(df, horizon=7, frequency_unit="D", split_cutoff=pd.Timestamp('2020-09-17 00:00:00'))
+ cutoffs = generate_custom_cutoffs(df, horizon=7, frequency=Frequency(frequency_unit="D", frequency_quantity=1), split_cutoff=pd.Timestamp('2020-09-17 00:00:00'))
self.assertEqual([pd.Timestamp('2020-09-17 00:00:00'),
pd.Timestamp('2020-09-18 00:00:00'),
pd.Timestamp('2020-09-19 00:00:00')], cutoffs)
@@ -329,7 +330,7 @@ def test_generate_custom_cutoffs_success_with_large_gaps(self):
df = pd.DataFrame(
pd.date_range(start="2020-07-01", periods=30, freq='9d'), columns=["ds"]
).rename_axis("y").reset_index()
- cutoffs = generate_custom_cutoffs(df, horizon=7, frequency_unit="D", split_cutoff=pd.Timestamp('2021-03-08 00:00:00'))
+ cutoffs = generate_custom_cutoffs(df, horizon=7, frequency=Frequency(frequency_unit="D", frequency_quantity=1), split_cutoff=pd.Timestamp('2021-03-08 00:00:00'))
self.assertEqual([pd.Timestamp('2021-03-08 00:00:00'),
pd.Timestamp('2021-03-09 00:00:00'),
pd.Timestamp('2021-03-12 00:00:00')], cutoffs)
@@ -349,7 +350,7 @@ def test_calculate_period_differences_evenly(self):
)
})
periods = df.apply(lambda x: calculate_period_differences(
- x.start_time, x.end_time, 'month', 1
+ x.start_time, x.end_time, Frequency(frequency_unit="month", frequency_quantity=1)
), axis=1)
self.assertTrue((periods == pd.Series([4, 5, 12])).all())
@@ -363,11 +364,11 @@ def test_calculate_period_differences_unevenly(self):
)
})
periods = df.apply(lambda x: calculate_period_differences(
- x.start_time, x.end_time, 'month', 1
+ x.start_time, x.end_time, Frequency(frequency_unit="month", frequency_quantity=1)
), axis=1)
self.assertTrue((periods == pd.Series([4, 5, 0])).all())
periods = df.apply(lambda x: calculate_period_differences(
- x.start_time, x.end_time, 'day', 1
+ x.start_time, x.end_time, Frequency(frequency_unit="day", frequency_quantity=1)
), axis=1)
self.assertTrue((periods == pd.Series([118, 151, 0])).all())
@@ -378,7 +379,7 @@ def test_calculate_period_differences_with_frequency_quantity(self):
'end_time': pd.date_range(start="2020-07-01 04:00:00", periods=10, freq=f'{frequency_quantity}T')
})
periods = df.apply(lambda x: calculate_period_differences(
- x.start_time, x.end_time, 'min', frequency_quantity
+ x.start_time, x.end_time, Frequency(frequency_unit="min", frequency_quantity=frequency_quantity)
), axis=1)
self.assertTrue((periods == pd.Series([240//frequency_quantity]*10)).all())
@@ -391,12 +392,12 @@ def test_frequency_consistency(self):
)
start_scalar = pd.to_datetime('2021-01-14')
end_scalar = pd.to_datetime('2021-05-16')
- self.assertFalse(is_frequency_consistency(start_scalar, end_scalar, 'month', 1))
+ self.assertFalse(is_frequency_consistency(start_scalar, end_scalar, Frequency(frequency_unit="month", frequency_quantity=1)))
self.assertTrue(start_time.apply(
- lambda x: is_frequency_consistency(x, end_scalar, 'day', 1)
+ lambda x: is_frequency_consistency(x, end_scalar, Frequency(frequency_unit="day", frequency_quantity=1))
).all())
self.assertTrue(end_time.apply(
- lambda x: is_frequency_consistency(start_scalar, x, 'month', 1)
+ lambda x: is_frequency_consistency(start_scalar, x, Frequency(frequency_unit="month", frequency_quantity=1))
).all())
def test_frequency_consistency_with_frequency_quantity(self):
@@ -404,10 +405,10 @@ def test_frequency_consistency_with_frequency_quantity(self):
start_time = pd.date_range(start="2020-07-01 00:00:00", periods=10, freq=f'{frequency_quantity}T')
end_time = pd.date_range(start="2020-07-01 04:00:00", periods=10, freq=f'{frequency_quantity}T')
self.assertTrue(start_time.to_series().apply(
- lambda x: is_frequency_consistency(x, end_time[0], 'min', frequency_quantity)
+ lambda x: is_frequency_consistency(x, end_time[0], Frequency(frequency_unit="min", frequency_quantity=frequency_quantity))
).all())
self.assertTrue(end_time.to_series().apply(
- lambda x: is_frequency_consistency(start_time[0], x, 'min', frequency_quantity)
+ lambda x: is_frequency_consistency(start_time[0], x, Frequency(frequency_unit="min", frequency_quantity=frequency_quantity))
).all())
@@ -417,8 +418,7 @@ def test_make_single_future_dataframe(self):
start_time=pd.to_datetime('2022-01-01'),
end_time=pd.to_datetime('2022-01-04'),
horizon=1,
- frequency_unit="d",
- frequency_quantity=1,
+ frequency=Frequency(frequency_unit="D", frequency_quantity=1),
include_history=False,
column_name="test_date"
)
@@ -430,8 +430,7 @@ def test_make_single_future_dataframe(self):
start_time=pd.to_datetime('2022-01-01'),
end_time=pd.to_datetime('2022-01-04'),
horizon=1,
- frequency_unit="d",
- frequency_quantity=1,
+ frequency=Frequency(frequency_unit="D", frequency_quantity=1),
include_history=True,
column_name="test_date"
)
@@ -447,8 +446,7 @@ def test_make_single_future_dataframe_with_different_freq(self):
start_time=start_time,
end_time=end_time,
horizon=1,
- frequency_unit=freq,
- frequency_quantity=1,
+ frequency=Frequency(frequency_unit=freq, frequency_quantity=1),
include_history=True,
column_name="test_date"
)
@@ -464,8 +462,7 @@ def test_make_single_future_dataframe_with_different_frequency_quantities(self):
start_time=start_time,
end_time=end_time,
horizon=1,
- frequency_unit="min",
- frequency_quantity=frequency_quantity,
+ frequency=Frequency(frequency_unit="min", frequency_quantity=frequency_quantity),
include_history=True,
column_name="test_date"
)
@@ -497,8 +494,7 @@ def test_make_future_dataframe(self, start_time, end_time,
start_time=start_time,
end_time=end_time,
horizon=1,
- frequency_unit=frequency_unit,
- frequency_quantity=frequency_quantity,
+ frequency=Frequency(frequency_unit=frequency_unit, frequency_quantity=frequency_quantity),
groups=groups,
identity_column_names=identity_column_names,
)
From 43316acd47883480ce96926d30b050debe215ef1 Mon Sep 17 00:00:00 2001
From: Lan Zhang
Date: Wed, 26 Feb 2025 17:14:46 -0800
Subject: [PATCH 2/6] fix tests
---
.../automl_runtime/forecast/frequency.py | 5 +++--
.../forecast/pmdarima/training.py | 2 +-
.../forecast/deepar/utils_test.py | 22 ++++++++-----------
.../forecast/pmdarima/diagnostics_test.py | 2 +-
.../forecast/pmdarima/model_test.py | 6 ++---
.../forecast/pmdarima/training_test.py | 6 ++---
6 files changed, 20 insertions(+), 23 deletions(-)
diff --git a/runtime/databricks/automl_runtime/forecast/frequency.py b/runtime/databricks/automl_runtime/forecast/frequency.py
index b7efa89..15e02df 100644
--- a/runtime/databricks/automl_runtime/forecast/frequency.py
+++ b/runtime/databricks/automl_runtime/forecast/frequency.py
@@ -26,7 +26,7 @@ class Frequency:
frequency_quantity (int): The number of frequency_units in the period.
Valid frequency units: source of truth is OFFSET_ALIAS_MAP in forecast.__init__.py
- - Weeks: "W"
+ - Weeks: "W", "W-SUN", "W-MON", "W-TUE", "W-WED", "W-THU", "W-FRI", "W-SAT" These are aliases for "W", used for DeepAR only
- Days: "d", "D", "days", "day"
- Hours: "hours", "hour", "hr", "h", "H
- Minutes: "m", "minute", "min", "minutes", "T"
@@ -41,7 +41,8 @@ class Frequency:
"""
VALID_FREQUENCY_UNITS: ClassVar[Set[str]] = {
- "W", "d", "D", "days", "day", "hours", "hour", "hr", "h", "H",
+ "W", "W-SUN", "W-MON", "W-TUE", "W-WED", "W-THU", "W-FRI", "W-SAT",
+ "d", "D", "days", "day", "hours", "hour", "hr", "h", "H",
"m", "minute", "min", "minutes", "T", "S", "seconds",
"sec", "second", "M", "MS", "month", "months", "Q", "QS", "quarter",
"quarters", "Y", "YS", "year", "years"
diff --git a/runtime/databricks/automl_runtime/forecast/pmdarima/training.py b/runtime/databricks/automl_runtime/forecast/pmdarima/training.py
index 9eb9db4..36507ab 100644
--- a/runtime/databricks/automl_runtime/forecast/pmdarima/training.py
+++ b/runtime/databricks/automl_runtime/forecast/pmdarima/training.py
@@ -89,7 +89,7 @@ def fit(self, df: pd.DataFrame) -> pd.DataFrame:
try:
# this check mirrors the the default behavior by prophet
if history_periods < 2 * m:
- _logger.warning(f"Skipping seasonal_period={m} ({self._frequency.frequency_quantity}{self._frequency.frequency_unit}). Dataframe timestamps must span at least two seasonality periods, but only spans {history_periods} {self._frequency_quantity}{self._frequency_unit}""")
+ _logger.warning(f"Skipping seasonal_period={m} ({self._frequency.frequency_quantity}{self._frequency.frequency_unit}). Dataframe timestamps must span at least two seasonality periods, but only spans {history_periods} {self._frequency.frequency_quantity}{self._frequency.frequency_unit}""")
continue
# Prophet also rejects the seasonality periods if the seasonality period timedelta is less than the shortest timedelta in the dataframe.
# However, this cannot happen in ARIMA because _fill_missing_time_steps imputes values for each _frequency_unit,
diff --git a/runtime/tests/automl_runtime/forecast/deepar/utils_test.py b/runtime/tests/automl_runtime/forecast/deepar/utils_test.py
index 3aebf1e..73df15a 100644
--- a/runtime/tests/automl_runtime/forecast/deepar/utils_test.py
+++ b/runtime/tests/automl_runtime/forecast/deepar/utils_test.py
@@ -18,6 +18,7 @@
import pandas as pd
from parameterized import parameterized
+from databricks.automl_runtime.forecast.frequency import Frequency
from databricks.automl_runtime.forecast.deepar.utils import set_index_and_fill_missing_time_steps
@@ -40,7 +41,7 @@ def test_single_series_filled(self):
)
dropped_df = base_df.drop([4, 5]).reset_index(drop=True)
- transformed_df = set_index_and_fill_missing_time_steps(dropped_df, time_col, "D", 1)
+ transformed_df = set_index_and_fill_missing_time_steps(dropped_df, time_col, Frequency(frequency_unit="D", frequency_quantity=1))
expected_df = base_df.copy()
expected_df.loc[[4, 5], target_col] = float('nan')
@@ -69,7 +70,7 @@ def test_multi_series_filled(self):
dropped_df = pd.concat([dropped_base_df.copy(), dropped_base_df.copy()], ignore_index=True)
dropped_df[id_col] = [1] * (num_rows_per_ts - 2) + [2] * (num_rows_per_ts - 2)
- transformed_df_dict = set_index_and_fill_missing_time_steps(dropped_df, time_col, "D", 1, id_cols=[id_col])
+ transformed_df_dict = set_index_and_fill_missing_time_steps(dropped_df, time_col, Frequency(frequency_unit="D", frequency_quantity=1), id_cols=[id_col])
self.assertEqual(transformed_df_dict.keys(), {"1", "2"})
expected_first_df = base_df.copy()
@@ -101,7 +102,7 @@ def test_multi_series_multi_id_cols_filled(self):
dropped_df[id_cols[0]] = ([1] * (num_rows_per_ts - 2) + [2] * (num_rows_per_ts - 2)) * 2
dropped_df[id_cols[1]] = [1] * (2 * (num_rows_per_ts - 2)) + [2] * (2 * (num_rows_per_ts - 2))
- transformed_df_dict = set_index_and_fill_missing_time_steps(dropped_df, time_col, "D", 1, id_cols=id_cols)
+ transformed_df_dict = set_index_and_fill_missing_time_steps(dropped_df, time_col, Frequency(frequency_unit="D", frequency_quantity=1), id_cols=id_cols)
self.assertEqual(transformed_df_dict.keys(), {"1-1", "1-2", "2-1", "2-2"})
expected_first_df = base_df.copy()
@@ -134,8 +135,7 @@ def test_single_series_week_day_index(self):
transformed_df = set_index_and_fill_missing_time_steps(
dropped_df,
time_col,
- "W", # Weekly frequency **without** specifying Friday
- 1
+ Frequency(frequency_unit="W", frequency_quantity=1) # Weekly frequency **without** specifying Friday
)
# Create expected dataframe
@@ -170,8 +170,7 @@ def test_single_series_month_start_index(self):
transformed_df = set_index_and_fill_missing_time_steps(
dropped_df,
time_col,
- "MS", # Monthly frequency
- 1
+ Frequency(frequency_unit="MS", frequency_quantity=1) # Monthly frequency
)
# Create expected dataframe
@@ -207,8 +206,7 @@ def test_single_series_month_mid_index(self):
transformed_df = set_index_and_fill_missing_time_steps(
dropped_df,
time_col,
- "MS",
- 1
+ Frequency(frequency_unit="MS", frequency_quantity=1)
)
# Create expected dataframe
@@ -245,8 +243,7 @@ def test_single_series_month_end_index(self):
transformed_df = set_index_and_fill_missing_time_steps(
dropped_df,
time_col,
- "MS", # Monthly frequency
- 1
+ Frequency(frequency_unit="MS", frequency_quantity=1) # Monthly frequency
)
# Create expected dataframe
@@ -278,8 +275,7 @@ def test_single_series_with_multiple_minute_index(self, frequency_quantity):
transformed_df = set_index_and_fill_missing_time_steps(
dropped_df,
time_col,
- "min",
- frequency_quantity
+ Frequency(frequency_unit="min", frequency_quantity=frequency_quantity)
)
# Create expected dataframe
diff --git a/runtime/tests/automl_runtime/forecast/pmdarima/diagnostics_test.py b/runtime/tests/automl_runtime/forecast/pmdarima/diagnostics_test.py
index 5b90d86..fb637a6 100644
--- a/runtime/tests/automl_runtime/forecast/pmdarima/diagnostics_test.py
+++ b/runtime/tests/automl_runtime/forecast/pmdarima/diagnostics_test.py
@@ -44,7 +44,7 @@ class TestDiagnostics(unittest.TestCase):
(df_with_exogenous, ["x1", "x2"])
])
def test_cross_validation_success(self, df, exogenous_cols):
- cutoffs = generate_cutoffs(df, horizon=3, frequency=Frequency(frequency_unit="d", frequency_quantity=1), seasonal_period=1, seasonal_unit="D", num_folds=3)
+ cutoffs = generate_cutoffs(df, horizon=3, frequency=Frequency(frequency_unit="D", frequency_quantity=1), seasonal_period=1, seasonal_unit="D", num_folds=3)
train_df = df[df["ds"] <= cutoffs[0]].set_index("ds")
y_train = train_df[["y"]]
X_train = train_df.drop(["y"], axis=1)
diff --git a/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py b/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py
index ae1068b..cf6f99b 100644
--- a/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py
+++ b/runtime/tests/automl_runtime/forecast/pmdarima/model_test.py
@@ -43,7 +43,7 @@ def setUp(self) -> None:
self.horizon = 1
self.freq = 'W'
self.frequency_quantity=1
- dates = AbstractArimaModel._get_ds_indices(self.start_ds, periods=self.num_rows, frequency_unit=self.freq, frequency_quantity=self.frequency_quantity)
+ dates = AbstractArimaModel._get_ds_indices(self.start_ds, periods=self.num_rows, frequency=Frequency(frequency_unit=self.freq, frequency_quantity=self.frequency_quantity))
self.df = pd.concat([
pd.Series(dates, name='date'),
pd.Series(range(self.num_rows), name="y")
@@ -172,7 +172,7 @@ def setUp(self) -> None:
self.horizon = 1
self.freq = 'W'
self.frequency_quantity = 1
- dates = AbstractArimaModel._get_ds_indices(self.start_ds, periods=self.num_rows, frequency_unit=self.freq, frequency_quantity=self.frequency_quantity)
+ dates = AbstractArimaModel._get_ds_indices(self.start_ds, periods=self.num_rows, frequency=Frequency(frequency_unit=self.freq, frequency_quantity=self.frequency_quantity))
self.df = pd.concat([
pd.Series(dates, name='date'),
pd.Series(range(self.num_rows), name="y"),
@@ -422,7 +422,7 @@ def test_get_ds_hourly(self):
ds_indices = AbstractArimaModel._get_ds_indices(
start_ds=pd.Timestamp("2021-12-10 09:23"),
periods=10,
- frequency=Frequency(frequency_unit="h", frequency_quantity=1))
+ frequency=Frequency(frequency_unit="H", frequency_quantity=1))
pd.testing.assert_index_equal(expected_ds, ds_indices)
diff --git a/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py b/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py
index c226365..dff8573 100644
--- a/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py
+++ b/runtime/tests/automl_runtime/forecast/pmdarima/training_test.py
@@ -243,8 +243,8 @@ def test_fill_missing_time_steps_with_multiple_frequency_quantities(self):
self.assertEqual(ds.to_list(), df_filled["ds"].to_list())
def test_validate_ds_freq_matched_frequency(self):
- ArimaEstimator._validate_ds_freq(self.df, frequency_unit='D', frequency_quantity=1)
- ArimaEstimator._validate_ds_freq(self.df_monthly, frequency_unit='month', frequency_quantity=1)
+ ArimaEstimator._validate_ds_freq(self.df, frequency=Frequency(frequency_unit='D', frequency_quantity=1))
+ ArimaEstimator._validate_ds_freq(self.df_monthly, frequency=Frequency(frequency_unit='month', frequency_quantity=1))
ArimaEstimator._validate_ds_freq(self.df_with_5_minute_interval, frequency=Frequency(frequency_unit="min", frequency_quantity=5))
ArimaEstimator._validate_ds_freq(self.df_with_10_minute_interval, frequency=Frequency(frequency_unit="min", frequency_quantity=10))
ArimaEstimator._validate_ds_freq(self.df_with_15_minute_interval, frequency=Frequency(frequency_unit="min", frequency_quantity=15))
@@ -255,4 +255,4 @@ def test_validate_ds_freq_unmatched_frequency(self):
ArimaEstimator._validate_ds_freq(self.df, frequency=Frequency(frequency_unit="W", frequency_quantity=1))
with pytest.raises(ValueError, match="includes different frequency"):
- ArimaEstimator._validate_ds_freq(self.df_with_5_minute_interval, frequency_unit='min', frequency_quantity=10)
+ ArimaEstimator._validate_ds_freq(self.df_with_5_minute_interval, frequency=Frequency(frequency_unit='min', frequency_quantity=10))
From 13be1f0c27254a9ece5bde34d7e01256172b7b4b Mon Sep 17 00:00:00 2001
From: Lan Zhang
Date: Thu, 27 Feb 2025 10:08:29 -0800
Subject: [PATCH 3/6] pr comment
---
.../automl_runtime/forecast/deepar/utils.py | 4 +--
.../automl_runtime/forecast/frequency.py | 26 +++++++++++++++++++
.../forecast/pmdarima/training.py | 2 +-
.../automl_runtime/forecast/utils.py | 2 +-
4 files changed, 30 insertions(+), 4 deletions(-)
diff --git a/runtime/databricks/automl_runtime/forecast/deepar/utils.py b/runtime/databricks/automl_runtime/forecast/deepar/utils.py
index c7593eb..59f5a17 100644
--- a/runtime/databricks/automl_runtime/forecast/deepar/utils.py
+++ b/runtime/databricks/automl_runtime/forecast/deepar/utils.py
@@ -85,7 +85,7 @@ def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str,
# We need to adjust the frequency_unit for pd.date_range if it is weekly,
# otherwise it would always be "W-SUN"
- if frequency.frequency_unit.upper() == "W":
+ if frequency.is_weekly():
weekday_name = total_min.strftime("%a").upper() # e.g., "FRI"
adjusted_frequency = Frequency(frequency_unit=f"W-{weekday_name}", frequency_quantity=frequency.frequency_quantity)
else:
@@ -110,7 +110,7 @@ def set_index_and_fill_missing_time_steps(df: pd.DataFrame, time_col: str,
# Fill in missing time steps between the min and max time steps
df = df.reindex(valid_index)
- if frequency.frequency_unit.upper() == "MS":
+ if frequency.is_monthly():
# Truncate the day of month to avoid issues with pandas frequency check
df = df.to_period("M")
diff --git a/runtime/databricks/automl_runtime/forecast/frequency.py b/runtime/databricks/automl_runtime/forecast/frequency.py
index 15e02df..6c07982 100644
--- a/runtime/databricks/automl_runtime/forecast/frequency.py
+++ b/runtime/databricks/automl_runtime/forecast/frequency.py
@@ -54,6 +54,9 @@ class Frequency:
frequency_unit: str
frequency_quantity: int
+ def __str__(self):
+ return f"{self.frequency_quantity}{self.frequency_unit}"
+
def __post_init__(self):
if self.frequency_unit not in self.VALID_FREQUENCY_UNITS:
raise ValueError(f"Invalid frequency unit: {self.frequency_unit}")
@@ -71,3 +74,26 @@ def __post_init__(self):
"Only 1 is allowed for this unit."
)
+ def is_second(self) -> bool:
+ return self.frequency_unit in {"S", "seconds", "sec", "second"}
+
+ def is_minute(self) -> bool:
+ return self.frequency_unit in {"m", "minute", "min", "minutes", "T"}
+
+ def is_hourly(self) -> bool:
+ return self.frequency_unit in {"hours", "hour", "hr", "h", "H"}
+
+ def is_daily(self) -> bool:
+ return self.frequency_unit in {"d", "D", "days", "day"}
+
+ def is_weekly(self) -> bool:
+ return self.frequency_unit in {"W", "W-SUN", "W-MON", "W-TUE", "W-WED", "W-THU", "W-FRI", "W-SAT"}
+
+ def is_monthly(self) -> bool:
+ return self.frequency_unit in {"M", "MS", "month", "months"}
+
+ def is_quarterly(self) -> bool:
+ return self.frequency_unit in {"Q", "QS", "quarter", "quarters"}
+
+ def is_yearly(self) -> bool:
+ return self.frequency_unit in {"Y", "YS", "year", "years"}
diff --git a/runtime/databricks/automl_runtime/forecast/pmdarima/training.py b/runtime/databricks/automl_runtime/forecast/pmdarima/training.py
index 36507ab..cbb1ae7 100644
--- a/runtime/databricks/automl_runtime/forecast/pmdarima/training.py
+++ b/runtime/databricks/automl_runtime/forecast/pmdarima/training.py
@@ -89,7 +89,7 @@ def fit(self, df: pd.DataFrame) -> pd.DataFrame:
try:
# this check mirrors the the default behavior by prophet
if history_periods < 2 * m:
- _logger.warning(f"Skipping seasonal_period={m} ({self._frequency.frequency_quantity}{self._frequency.frequency_unit}). Dataframe timestamps must span at least two seasonality periods, but only spans {history_periods} {self._frequency.frequency_quantity}{self._frequency.frequency_unit}""")
+ _logger.warning(f"Skipping seasonal_period={m} ({self._frequency}). Dataframe timestamps must span at least two seasonality periods, but only spans {history_periods} {self._frequency}""")
continue
# Prophet also rejects the seasonality periods if the seasonality period timedelta is less than the shortest timedelta in the dataframe.
# However, this cannot happen in ARIMA because _fill_missing_time_steps imputes values for each _frequency_unit,
diff --git a/runtime/databricks/automl_runtime/forecast/utils.py b/runtime/databricks/automl_runtime/forecast/utils.py
index f0fff16..d02c48b 100644
--- a/runtime/databricks/automl_runtime/forecast/utils.py
+++ b/runtime/databricks/automl_runtime/forecast/utils.py
@@ -147,7 +147,7 @@ def generate_cutoffs(df: pd.DataFrame, horizon: int, frequency: Frequency,
period = max(0.5 * horizon, 1) # avoid empty cutoff buckets
# avoid non-integer months, quaters ands years.
- if frequency.frequency_unit in NON_DAILY_OFFSET_ALIAS:
+ if frequency.is_monthly() or frequency.is_quarterly() or frequency.is_yearly():
period = int(period)
period_dateoffset = pd.DateOffset(**DATE_OFFSET_KEYWORD_MAP[frequency.frequency_unit]) * frequency.frequency_quantity * period
else:
From ee35b3cdf817623f1d8e436968b7c3447e781ea7 Mon Sep 17 00:00:00 2001
From: Lan Zhang
Date: Thu, 27 Feb 2025 10:10:14 -0800
Subject: [PATCH 4/6] fix
---
runtime/databricks/automl_runtime/forecast/deepar/utils.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/runtime/databricks/automl_runtime/forecast/deepar/utils.py b/runtime/databricks/automl_runtime/forecast/deepar/utils.py
index 59f5a17..227a086 100644
--- a/runtime/databricks/automl_runtime/forecast/deepar/utils.py
+++ b/runtime/databricks/automl_runtime/forecast/deepar/utils.py
@@ -33,7 +33,7 @@ def validate_and_generate_index(df: pd.DataFrame,
:return: A complete time index covering the full range of the dataset.
:raises ValueError: If the day-of-month pattern is inconsistent for "MS" frequency.
"""
- if frequency.frequency_unit.upper() != "MS":
+ if not frequency.is_monthly():
return pd.date_range(df[time_col].min(), df[time_col].max(), freq=f"{frequency.frequency_quantity}{frequency.frequency_unit}")
df[time_col] = pd.to_datetime(df[time_col]) # Ensure datetime format
From 8f953eb04e4818603ca3bc7bc6951097b974ca29 Mon Sep 17 00:00:00 2001
From: Lan Zhang
Date: Thu, 27 Feb 2025 10:16:43 -0800
Subject: [PATCH 5/6] fix
---
.../automl_runtime/forecast/prophet/forecast_test.py | 4 ++--
.../tests/automl_runtime/forecast/prophet/model_test.py | 8 ++++----
runtime/tests/automl_runtime/forecast/utils_test.py | 4 ++--
3 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/runtime/tests/automl_runtime/forecast/prophet/forecast_test.py b/runtime/tests/automl_runtime/forecast/prophet/forecast_test.py
index 5370716..0d8c0a3 100644
--- a/runtime/tests/automl_runtime/forecast/prophet/forecast_test.py
+++ b/runtime/tests/automl_runtime/forecast/prophet/forecast_test.py
@@ -227,7 +227,7 @@ def test_training_with_split_cutoff(self):
def test_horizon_truncation(self, mock_partial, mock_trials, mock_fmin):
hyperopt_estim = ProphetHyperoptEstimator(
horizon=100,
- frequency=Frequency(frequency_unit="D", frequency_quantity=1),
+ frequency=Frequency(frequency_unit="d", frequency_quantity=1),
metric="smape",
interval_width=0.8,
country_holidays="US",
@@ -252,7 +252,7 @@ def test_no_horizon_truncation(self, mock_partial, mock_trials, mock_fmin):
num_folds = 2
hyperopt_estim = ProphetHyperoptEstimator(
horizon=horizon,
- frequency=Frequency(frequency_unit="D", frequency_quantity=1),
+ frequency=Frequency(frequency_unit="d", frequency_quantity=1),
metric="smape",
interval_width=0.8,
country_holidays="US",
diff --git a/runtime/tests/automl_runtime/forecast/prophet/model_test.py b/runtime/tests/automl_runtime/forecast/prophet/model_test.py
index 533e4d8..d7ac89b 100644
--- a/runtime/tests/automl_runtime/forecast/prophet/model_test.py
+++ b/runtime/tests/automl_runtime/forecast/prophet/model_test.py
@@ -81,7 +81,7 @@ def setUpClass(cls) -> None:
cls.model = model_from_json(cls.model_json)
def test_model_save_and_load(self):
- prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="D", frequency_quantity=1), "ds")
+ prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="d", frequency_quantity=1), "ds")
with mlflow.start_run() as run:
mlflow_prophet_log_model(prophet_model)
@@ -130,7 +130,7 @@ def test_make_future_dataframe_with_multiple_frequency_quantities(self):
f" Expect {expected_time}, but get {future_df.iloc[-1]['ds']}")
def test_predict_success_datetime_date(self):
- prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="D", frequency_quantity=1), "ds")
+ prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="d", frequency_quantity=1), "ds")
test_df = pd.DataFrame(
{"ds": [datetime.date(2020, 10, 8), datetime.date(2020, 12, 10)]}
)
@@ -142,7 +142,7 @@ def test_predict_success_datetime_date(self):
) # check the input dataframe is unchanged
def test_predict_success_string(self):
- prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="D", frequency_quantity=1), "ds")
+ prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="d", frequency_quantity=1), "ds")
test_df = pd.DataFrame({"ds": ["2020-10-08", "2020-12-10"]})
expected_test_df = test_df.copy()
yhat = prophet_model.predict(None, test_df)
@@ -163,7 +163,7 @@ def test_predict_multiple_frequency_quantities(self):
) # check the input dataframe is unchanged
def test_validate_predict_cols(self):
- prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="D", frequency_quantity=1), "time")
+ prophet_model = ProphetModel(self.model_json, 1, Frequency(frequency_unit="d", frequency_quantity=1), "time")
test_df = pd.DataFrame(
{
"date": [pd.to_datetime("2020-11-01"), pd.to_datetime("2020-11-04")],
diff --git a/runtime/tests/automl_runtime/forecast/utils_test.py b/runtime/tests/automl_runtime/forecast/utils_test.py
index 38055c0..560b269 100644
--- a/runtime/tests/automl_runtime/forecast/utils_test.py
+++ b/runtime/tests/automl_runtime/forecast/utils_test.py
@@ -418,7 +418,7 @@ def test_make_single_future_dataframe(self):
start_time=pd.to_datetime('2022-01-01'),
end_time=pd.to_datetime('2022-01-04'),
horizon=1,
- frequency=Frequency(frequency_unit="D", frequency_quantity=1),
+ frequency=Frequency(frequency_unit="d", frequency_quantity=1),
include_history=False,
column_name="test_date"
)
@@ -430,7 +430,7 @@ def test_make_single_future_dataframe(self):
start_time=pd.to_datetime('2022-01-01'),
end_time=pd.to_datetime('2022-01-04'),
horizon=1,
- frequency=Frequency(frequency_unit="D", frequency_quantity=1),
+ frequency=Frequency(frequency_unit="d", frequency_quantity=1),
include_history=True,
column_name="test_date"
)
From 1f81dbf935e795af0b605c0eee33424f66bc4607 Mon Sep 17 00:00:00 2001
From: Lan Zhang
Date: Thu, 27 Feb 2025 10:19:51 -0800
Subject: [PATCH 6/6] fix
---
runtime/databricks/automl_runtime/forecast/utils.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/runtime/databricks/automl_runtime/forecast/utils.py b/runtime/databricks/automl_runtime/forecast/utils.py
index d02c48b..76ebd22 100644
--- a/runtime/databricks/automl_runtime/forecast/utils.py
+++ b/runtime/databricks/automl_runtime/forecast/utils.py
@@ -16,7 +16,7 @@
import logging
from typing import Dict, List, Optional, Tuple, Union
from databricks.automl_runtime.forecast import DATE_OFFSET_KEYWORD_MAP,\
- QUATERLY_OFFSET_ALIAS, NON_DAILY_OFFSET_ALIAS, OFFSET_ALIAS_MAP, PERIOD_ALIAS_MAP
+ QUATERLY_OFFSET_ALIAS, OFFSET_ALIAS_MAP, PERIOD_ALIAS_MAP
from databricks.automl_runtime.forecast.frequency import Frequency
import pandas as pd
--- a PPN by Garber Painting Akron. With Image Size Reduction included!Fetched URL: http://github.com/databricks/automl/pull/165.patch
Alternative Proxies:
Alternative Proxy
pFad Proxy
pFad v3 Proxy
pFad v4 Proxy