diff --git a/tsururu/dataset.py b/tsururu/dataset.py index 5a6be3f..2e78c2c 100644 --- a/tsururu/dataset.py +++ b/tsururu/dataset.py @@ -13,7 +13,7 @@ class IndexSlicer: """ @staticmethod - def timedelta(x: Tuple[NDArray[Union[np.int, np.float]], pd.Timedelta]): + def timedelta(x: Tuple[NDArray[Union[np.integer, np.floating]], pd.Timedelta]): """ Returns the difference between neighboring observations in the array in terms of delta and the delta itself. @@ -67,7 +67,7 @@ def get_cols_idx(data: pd.DataFrame, columns: List): @staticmethod def get_slice( data: pd.DataFrame, - k: Tuple[NDArray[np.int], NDArray[np.int]], + k: Tuple[NDArray[np.integer], NDArray[np.integer]], ) -> pd.DataFrame: """Get 3d slice. @@ -118,14 +118,14 @@ def ids_from_date( return ids def _rolling_window( - self, a: NDArray[np.float], window: int, step: int, from_last: bool = True + self, a: NDArray[np.floating], window: int, step: int, from_last: bool = True ): sliding_window = np.lib.stride_tricks.sliding_window_view(a, window) - return sliding_window[(len(a) - window) % step if from_last else 0:][::step] + return sliding_window[(len(a) - window) % step if from_last else 0 :][::step] def _create_idx_data( self, - data: NDArray[np.float], + data: NDArray[np.floating], horizon: int, history: int, step: int, @@ -136,7 +136,7 @@ def _create_idx_data( def _create_idx_target( self, - data: NDArray[np.float], + data: NDArray[np.floating], horizon: int, history: int, step: int, @@ -149,23 +149,23 @@ def _create_idx_target( def _create_idx_test( self, - data: NDArray[np.float], + data: NDArray[np.floating], horizon: int, history: int, step: int, _, __, ): - return self._rolling_window(np.arange(len(data)), history, step)[-(horizon + 1):-horizon] + return self._rolling_window(np.arange(len(data)), history, step)[-(horizon + 1) : -horizon] def _get_ids( self, func, - data: NDArray[np.float], + data: NDArray[np.floating], horizon: int, history: int, step: int, - ids: NDArray[np.int], + ids: NDArray[np.integer], cond: 
int = 0, n_last_horizon: Optional[int] = None, ): @@ -185,11 +185,11 @@ def _get_ids( def create_idx_data( self, - data: NDArray[np.float], + data: NDArray[np.floating], horizon: int, history: int, step: int, - ids: Optional[NDArray[np.int]] = None, + ids: Optional[NDArray[np.integer]] = None, date_column: Optional[str] = None, ): """Find indices that, when applied to the original dataset, @@ -224,11 +224,11 @@ def create_idx_data( def create_idx_test( self, - data: NDArray[np.float], + data: NDArray[np.floating], horizon: int, history: int, step: int, - ids: Optional[NDArray[np.int]] = None, + ids: Optional[NDArray[np.integer]] = None, date_column: Optional[str] = None, ): """Find indices that, when applied to the original dataset, @@ -263,11 +263,11 @@ def create_idx_test( def create_idx_target( self, - data: NDArray[np.float], + data: NDArray[np.floating], horizon: int, history: int, step: int, - ids: Optional[NDArray[np.int]] = None, + ids: Optional[NDArray[np.integer]] = None, date_column: Optional[str] = None, n_last_horizon: Optional[int] = None, ): @@ -368,21 +368,21 @@ def make_padded_test( """ def _crop_segment( - segment: NDArray[Union[np.float, np.str]], + segment: NDArray[Union[np.floating, np.str_]], test_last: bool, - ) -> NDArray[Union[np.float, np.str]]: + ) -> NDArray[Union[np.floating, np.str_]]: if test_last: - return segment[-self.history:] - return segment[-self.history - horizon:-horizon] + return segment[-self.history :] + return segment[-self.history - horizon : -horizon] def _pad_segment( - segment: NDArray[Union[np.float, np.str]], + segment: NDArray[Union[np.floating, np.str_]], horizon: int, time_delta: pd.Timedelta, date_col_id: Optional[int], - id_col_id: Optional[Union[str, NDArray[np.str]]], - ) -> NDArray[Union[np.float, np.str]]: - result = np.full((horizon, segment.shape[1]), None) + id_col_id: Optional[Union[str, NDArray[np.str_]]], + ) -> NDArray[Union[np.floating, np.str_]]: + result = np.full((horizon, segment.shape[1]), 
np.nan) last_date = segment[-1, date_col_id] new_dates = pd.date_range(last_date + time_delta, periods=horizon, freq=time_delta) @@ -420,4 +420,9 @@ def _pad_segment( # Concatenate together result = np.vstack(np.concatenate((segments, padded_segments_results), axis=1)) - return pd.DataFrame(result, columns=columns) + result = pd.DataFrame(result, columns=columns) + result[self.date_column] = pd.to_datetime(result[self.date_column]) + result[self.id_column] = result[self.id_column].astype("int") + other = [col for col in columns if col not in [self.id_column, self.date_column]] + result[other] = result[other].astype("float") + return result diff --git a/tsururu/models.py b/tsururu/models.py index 04be088..a40deef 100644 --- a/tsururu/models.py +++ b/tsururu/models.py @@ -70,7 +70,7 @@ def initialize_validator(self): cv = TimeSeriesSplit(n_splits=self.validation_params["n_splits"]) return cv - def fit(self, X: pd.DataFrame, y: NDArray[np.float]) -> None: + def fit(self, X: pd.DataFrame, y: NDArray[np.floating]) -> None: """Initialization and training of the model according to the passed parameters. @@ -80,7 +80,7 @@ def fit(self, X: pd.DataFrame, y: NDArray[np.float]) -> None: """ raise NotImplementedError() - def predict(self, X: pd.DataFrame) -> NDArray[np.float]: + def predict(self, X: pd.DataFrame) -> NDArray[np.floating]: """Obtaining model predictions. 
Arguments: @@ -101,7 +101,7 @@ def __init__( ): super().__init__(get_num_iterations, validation_params, model_params) - def fit(self, X: pd.DataFrame, y: NDArray[np.float]) -> None: + def fit(self, X: pd.DataFrame, y: NDArray[np.floating]) -> None: # Initialize cv object cv = self.initialize_validator() @@ -151,7 +151,7 @@ def fit(self, X: pd.DataFrame, y: NDArray[np.float]) -> None: print(f"Mean {self.model_params['loss_function']}: {np.mean(self.scores).round(4)}") print(f"Std: {np.std(self.scores).round(4)}") - def predict(self, X: pd.DataFrame) -> NDArray[np.float]: + def predict(self, X: pd.DataFrame) -> NDArray[np.floating]: models_preds = [model.predict(X) for model in self.models] y_pred = np.mean(models_preds, axis=0) return y_pred diff --git a/tsururu/strategies.py b/tsururu/strategies.py index 7e07c9a..1636915 100644 --- a/tsururu/strategies.py +++ b/tsururu/strategies.py @@ -151,8 +151,8 @@ def _backtest_generator( @staticmethod def _make_multivariate_X_y( - X: pd.DataFrame, y: NDArray[np.float] - ) -> Tuple[pd.DataFrame, NDArray[np.float]]: + X: pd.DataFrame, y: NDArray[np.floating] + ) -> Tuple[pd.DataFrame, NDArray[np.floating]]: raise NotImplementedError() def _generate_X_y( @@ -162,7 +162,7 @@ def _generate_X_y( target_horizon: int, is_train: bool, history: str = None, - idx: Optional[NDArray[np.float]] = None, + idx: Optional[NDArray[np.floating]] = None, n_last_horizon: Optional[int] = None, X_only: bool = False, ): @@ -177,7 +177,7 @@ def fit(self, dataset: TSDataset): def back_test( self, dataset: TSDataset, cv: int - ) -> Union[List, NDArray[Union[np.float, np.str]]]: + ) -> Union[List, NDArray[Union[np.floating, np.str_]]]: ids_list = [] test_list = [] preds_list = [] @@ -221,7 +221,7 @@ def back_test( ) @timing_decorator - def predict(self, dataset: TSDataset) -> NDArray[np.float]: + def predict(self, dataset: TSDataset) -> NDArray[np.floating]: raise NotImplementedError() @@ -283,7 +283,7 @@ def _make_multivariate_X_y( self, X: 
pd.DataFrame, date_column: "str", - y: Optional[NDArray[np.float]] = None, + y: Optional[NDArray[np.floating]] = None, ): idx_slicer = IndexSlicer() @@ -337,7 +337,7 @@ def _generate_X_y( target_horizon: int, is_train: bool, history: str = None, - idx: Optional[NDArray[np.float]] = None, + idx: Optional[NDArray[np.floating]] = None, n_last_horizon: Optional[int] = None, X_only: bool = False, ): @@ -1148,7 +1148,7 @@ def _generate_X_y( target_horizon: int, is_train: bool, history: str = None, - idx: Optional[NDArray[np.float]] = None, + idx: Optional[NDArray[np.floating]] = None, n_last_horizon: Optional[int] = None, X_only: bool = False, ): diff --git a/tsururu/transformers.py b/tsururu/transformers.py index a35c715..6ef0e97 100644 --- a/tsururu/transformers.py +++ b/tsururu/transformers.py @@ -125,13 +125,13 @@ def transform( X_only: bool, ) -> Tuple[pd.DataFrame]: if self.transform_train: - raw_ts_X = raw_ts_X.groupby(self.id_column).apply(self._transform_segment) + raw_ts_X = raw_ts_X.groupby(self.id_column).apply(self._transform_segment).reset_index(level=self.id_column, drop=True) if self.transform_target and not X_only: - raw_ts_y = raw_ts_y.groupby(self.id_column).apply(self._transform_segment) + raw_ts_y = raw_ts_y.groupby(self.id_column).apply(self._transform_segment).reset_index(level=self.id_column, drop=True) return raw_ts_X, raw_ts_y, features_X, y def inverse_transform_y(self, y: pd.DataFrame) -> pd.DataFrame: - return y.groupby(self.id_column).apply(self._inverse_transform_segment) + return y.groupby(self.id_column).apply(self._inverse_transform_segment).reset_index(level=self.id_column, drop=True) class FeaturesToFeaturesTransformer(SeriesToFeaturesTransformer): @@ -283,7 +283,7 @@ def fit( column for column in self.columns if issubclass(raw_ts_X[column].dtype.type, np.integer) - or issubclass(raw_ts_X[column].dtype.type, np.float) + or issubclass(raw_ts_X[column].dtype.type, np.floating) ] stat_df = 
raw_ts_X.groupby(id_column)[self.columns].agg(["mean", "std"]) self.params = stat_df.to_dict(orient="index") @@ -297,7 +297,7 @@ class DifferenceNormalizer(SeriesToSeriesTransformer): type: "delta" to take the difference or "ratio" -- ratio between the current and the previous value. - self.params: dict with first values by each id + self.params: dict with last values by each id (for targets' inverse transform) """ def __init__(self, regime: str = "delta"): @@ -327,13 +327,9 @@ def _inverse_transform_segment(self, segment: pd.Series) -> pd.Series: current_columns_mask = [segment.columns.str.contains(current_column_name)][0] current_last_value = self.params[current_id][current_column_name] if self.type == "delta": - segment.loc[:, current_columns_mask] = ( - segment.loc[:, current_columns_mask] + current_last_value - ) + segment.loc[:, current_columns_mask] = np.cumsum(np.append(current_last_value, segment.loc[:, current_columns_mask].values))[1:] if self.type == "ratio": - segment.loc[:, current_columns_mask] = ( - segment.loc[:, current_columns_mask] * current_last_value - ) + segment.loc[:, current_columns_mask] = np.cumprod(np.append(current_last_value, segment.loc[:, current_columns_mask].values))[1:] return segment def fit( @@ -361,7 +357,7 @@ def fit( column for column in self.columns if issubclass(raw_ts_X[column].dtype.type, np.integer) - or issubclass(raw_ts_X[column].dtype.type, np.float) + or issubclass(raw_ts_X[column].dtype.type, np.floating) ] last_values_df = raw_ts_X.groupby(self.id_column)[self.columns].last() self.params = last_values_df.to_dict(orient="index") @@ -438,7 +434,7 @@ def transform( features_X[columns_to_transform] - last_values ) if self.transform_target and not X_only: - y.loc[:, column_name] = y[column_name] - last_values + y = y - last_values elif self.regime == "ratio": if self.transform_train: features_X.loc[:, columns_to_transform] = ( @@ -656,7 +652,7 @@ def __init__( self, lags: Union[int, List[int], np.ndarray], 
drop_raw_features: bool, - idx_data: NDArray[np.float], + idx_data: NDArray[np.floating], ): super().__init__() if isinstance(lags, list):