diff --git a/doc/samples/samples_sklearn.rst b/doc/samples/samples_sklearn.rst index 80df3fe4..ab6bc4e6 100644 --- a/doc/samples/samples_sklearn.rst +++ b/doc/samples/samples_sklearn.rst @@ -402,6 +402,95 @@ Samples print("Predicted classes (first 10):") print(y_predicted[:10]) print("---") +.. autofunction:: khiops_classifier_advanced +.. code-block:: python + + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsClassifier + from sklearn import metrics + from sklearn.model_selection import train_test_split + + # Load the root table of the dataset into a pandas dataframe + accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + accidents_df = pd.read_csv( + os.path.join(accidents_dataset_path, "Accidents.txt"), + sep="\t", + encoding="latin1", + ) + + # Split the root dataframe into train and test + accidents_train_df, accidents_test_df = train_test_split( + accidents_df, test_size=0.3, random_state=1 + ) + + # Obtain the main X feature table and the y target vector ("Class" column) + y_train = accidents_train_df["Gravity"] + y_test = accidents_test_df["Gravity"] + X_train_main = accidents_train_df.drop("Gravity", axis=1) + X_test_main = accidents_test_df.drop("Gravity", axis=1) + + # Load the secondary table of the dataset into a pandas dataframe + vehicles_df = pd.read_csv( + os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" + ) + + # Split the secondary dataframe with the keys of the splitted root dataframe + X_train_ids = X_train_main["AccidentId"].to_frame() + X_test_ids = X_test_main["AccidentId"].to_frame() + X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") + X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId") + + # Create the dataset multitable specification for the train/test split + # We specify each table with a name and a tuple (dataframe, key_columns) + X_train = { + "main_table": "Accidents", + "tables": { + "Accidents": 
(X_train_main, "AccidentId"), + "Vehicles": (X_train_secondary, ["AccidentId", "VehicleId"]), + }, + } + X_test = { + "main_table": "Accidents", + "tables": { + "Accidents": (X_test_main, "AccidentId"), + "Vehicles": (X_test_secondary, ["AccidentId", "VehicleId"]), + }, + } + # Train the classifier (by default it analyzes 100 multi-table features) + khc = KhiopsClassifier( + n_features=20, + n_pairs=5, + n_trees=5, + n_selected_features=10, + n_evaluated_features=15, + specific_pairs=[("Light", "Weather"), ("Light", "IntersectionType")], + all_possible_pairs=True, + construction_rules=["TableMode", "TableSelection"], + group_target_value=False, + ) + khc.fit(X_train, y_train) + + # Predict the class on the test dataset + y_test_pred = khc.predict(X_test) + print("Predicted classes (first 10):") + print(y_test_pred[:10]) + print("---") + + # Predict the class probability on the test dataset + y_test_probas = khc.predict_proba(X_test) + print(f"Class order: {khc.classes_}") + print("Predicted class probabilities (first 10):") + print(y_test_probas[:10]) + print("---") + + # Evaluate accuracy and auc metrics on the test dataset + test_accuracy = metrics.accuracy_score(y_test, y_test_pred) + test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1]) + print(f"Test accuracy = {test_accuracy}") + print(f"Test auc = {test_auc}") .. autofunction:: khiops_regressor .. code-block:: python @@ -663,6 +752,64 @@ Samples test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1]) print(f"Test accuracy = {test_accuracy}") print(f"Test auc = {test_auc}") +.. autofunction:: khiops_encoder_advanced +.. 
code-block:: python + + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsEncoder + + # Load the root table of the dataset into a pandas dataframe + accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + accidents_df = pd.read_csv( + os.path.join(accidents_dataset_path, "Accidents.txt"), + sep="\t", + encoding="latin1", + ) + + # Obtain the root X feature table and the y target vector ("Class" column) + X_main = accidents_df.drop("Gravity", axis=1) + y = accidents_df["Gravity"] + + # Load the secondary table of the dataset into a pandas dataframe + X_secondary = pd.read_csv( + os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" + ) + + # Create the dataset multitable specification for the train/test split + # We specify each table with a name and a tuple (dataframe, key_columns) + X_dataset = { + "main_table": "Accidents", + "tables": { + "Accidents": (X_main, "AccidentId"), + "Vehicles": (X_secondary, ["AccidentId", "VehicleId"]), + }, + } + + # Create the KhiopsEncoder with 10 additional multitable features and fit it + khe = KhiopsEncoder( + n_features=20, + n_pairs=5, + n_trees=5, + specific_pairs=[("Light", "Weather"), ("Light", "IntersectionType")], + all_possible_pairs=True, + construction_rules=["TableMode", "TableSelection"], + group_target_value=False, + informative_features_only=True, + keep_initial_variables=True, + transform_type_categorical="part_id", + transform_type_numerical="part_id", + transform_pairs="part_id", + ) + khe.fit(X_dataset, y) + + # Transform the train dataset + print("Encoded feature names:") + print(khe.feature_names_out_) + print("Encoded data:") + print(khe.transform(X_dataset)[:10]) .. autofunction:: khiops_coclustering .. 
code-block:: python diff --git a/khiops/samples/samples_sklearn.ipynb b/khiops/samples/samples_sklearn.ipynb index 2aa029fb..2993b743 100644 --- a/khiops/samples/samples_sklearn.ipynb +++ b/khiops/samples/samples_sklearn.ipynb @@ -455,6 +455,108 @@ "print(\"---\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `khiops_classifier_advanced()`\n\n", + "Trains a `.KhiopsClassifier` on a star multi-table dataset\n (advanced version with more hyperparameters)\n \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Imports\n", + "import os\n", + "import pandas as pd\n", + "from khiops import core as kh\n", + "from khiops.sklearn import KhiopsClassifier\n", + "from sklearn import metrics\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Load the root table of the dataset into a pandas dataframe\n", + "accidents_dataset_path = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", + "accidents_df = pd.read_csv(\n", + " os.path.join(accidents_dataset_path, \"Accidents.txt\"),\n", + " sep=\"\\t\",\n", + " encoding=\"latin1\",\n", + ")\n", + "\n", + "# Split the root dataframe into train and test\n", + "accidents_train_df, accidents_test_df = train_test_split(\n", + " accidents_df, test_size=0.3, random_state=1\n", + ")\n", + "\n", + "# Obtain the main X feature table and the y target vector (\"Class\" column)\n", + "y_train = accidents_train_df[\"Gravity\"]\n", + "y_test = accidents_test_df[\"Gravity\"]\n", + "X_train_main = accidents_train_df.drop(\"Gravity\", axis=1)\n", + "X_test_main = accidents_test_df.drop(\"Gravity\", axis=1)\n", + "\n", + "# Load the secondary table of the dataset into a pandas dataframe\n", + "vehicles_df = pd.read_csv(\n", + " os.path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n", + ")\n", + "\n", + "# Split the secondary dataframe with the keys of the splitted root dataframe\n", + "X_train_ids = 
X_train_main[\"AccidentId\"].to_frame()\n", + "X_test_ids = X_test_main[\"AccidentId\"].to_frame()\n", + "X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n", + "X_test_secondary = X_test_ids.merge(vehicles_df, on=\"AccidentId\")\n", + "\n", + "# Create the dataset multitable specification for the train/test split\n", + "# We specify each table with a name and a tuple (dataframe, key_columns)\n", + "X_train = {\n", + " \"main_table\": \"Accidents\",\n", + " \"tables\": {\n", + " \"Accidents\": (X_train_main, \"AccidentId\"),\n", + " \"Vehicles\": (X_train_secondary, [\"AccidentId\", \"VehicleId\"]),\n", + " },\n", + "}\n", + "X_test = {\n", + " \"main_table\": \"Accidents\",\n", + " \"tables\": {\n", + " \"Accidents\": (X_test_main, \"AccidentId\"),\n", + " \"Vehicles\": (X_test_secondary, [\"AccidentId\", \"VehicleId\"]),\n", + " },\n", + "}\n", + "# Train the classifier (by default it analyzes 100 multi-table features)\n", + "khc = KhiopsClassifier(\n", + " n_features=20,\n", + " n_pairs=5,\n", + " n_trees=5,\n", + " n_selected_features=10,\n", + " n_evaluated_features=15,\n", + " specific_pairs=[(\"Light\", \"Weather\"), (\"Light\", \"IntersectionType\")],\n", + " all_possible_pairs=True,\n", + " construction_rules=[\"TableMode\", \"TableSelection\"],\n", + " group_target_value=False,\n", + ")\n", + "khc.fit(X_train, y_train)\n", + "\n", + "# Predict the class on the test dataset\n", + "y_test_pred = khc.predict(X_test)\n", + "print(\"Predicted classes (first 10):\")\n", + "print(y_test_pred[:10])\n", + "print(\"---\")\n", + "\n", + "# Predict the class probability on the test dataset\n", + "y_test_probas = khc.predict_proba(X_test)\n", + "print(f\"Class order: {khc.classes_}\")\n", + "print(\"Predicted class probabilities (first 10):\")\n", + "print(y_test_probas[:10])\n", + "print(\"---\")\n", + "\n", + "# Evaluate accuracy and auc metrics on the test dataset\n", + "test_accuracy = metrics.accuracy_score(y_test, y_test_pred)\n", + 
"test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])\n", + "print(f\"Test accuracy = {test_accuracy}\")\n", + "print(f\"Test auc = {test_auc}\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -781,6 +883,77 @@ "print(f\"Test auc = {test_auc}\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `khiops_encoder_advanced()`\n\n", + "Trains a `.KhiopsEncoder` on a star multi-table dataset\n (advanced version with more hyperparameters)\n \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Imports\n", + "import os\n", + "import pandas as pd\n", + "from khiops import core as kh\n", + "from khiops.sklearn import KhiopsEncoder\n", + "\n", + "# Load the root table of the dataset into a pandas dataframe\n", + "accidents_dataset_path = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", + "accidents_df = pd.read_csv(\n", + " os.path.join(accidents_dataset_path, \"Accidents.txt\"),\n", + " sep=\"\\t\",\n", + " encoding=\"latin1\",\n", + ")\n", + "\n", + "# Obtain the root X feature table and the y target vector (\"Class\" column)\n", + "X_main = accidents_df.drop(\"Gravity\", axis=1)\n", + "y = accidents_df[\"Gravity\"]\n", + "\n", + "# Load the secondary table of the dataset into a pandas dataframe\n", + "X_secondary = pd.read_csv(\n", + " os.path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n", + ")\n", + "\n", + "# Create the dataset multitable specification for the train/test split\n", + "# We specify each table with a name and a tuple (dataframe, key_columns)\n", + "X_dataset = {\n", + " \"main_table\": \"Accidents\",\n", + " \"tables\": {\n", + " \"Accidents\": (X_main, \"AccidentId\"),\n", + " \"Vehicles\": (X_secondary, [\"AccidentId\", \"VehicleId\"]),\n", + " },\n", + "}\n", + "\n", + "# Create the KhiopsEncoder with 10 additional multitable features and fit it\n", + "khe = KhiopsEncoder(\n", + " n_features=20,\n", + " 
n_pairs=5,\n", + " n_trees=5,\n", + " specific_pairs=[(\"Light\", \"Weather\"), (\"Light\", \"IntersectionType\")],\n", + " all_possible_pairs=True,\n", + " construction_rules=[\"TableMode\", \"TableSelection\"],\n", + " group_target_value=False,\n", + " informative_features_only=True,\n", + " keep_initial_variables=True,\n", + " transform_type_categorical=\"part_id\",\n", + " transform_type_numerical=\"part_id\",\n", + " transform_pairs=\"part_id\",\n", + ")\n", + "khe.fit(X_dataset, y)\n", + "\n", + "# Transform the train dataset\n", + "print(\"Encoded feature names:\")\n", + "print(khe.feature_names_out_)\n", + "print(\"Encoded data:\")\n", + "print(khe.transform(X_dataset)[:10])" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/khiops/samples/samples_sklearn.py b/khiops/samples/samples_sklearn.py index 976db54c..8d78a29c 100644 --- a/khiops/samples/samples_sklearn.py +++ b/khiops/samples/samples_sklearn.py @@ -406,6 +406,98 @@ def khiops_classifier_pickle(): print("---") +def khiops_classifier_advanced(): + """Trains a `.KhiopsClassifier` on a star multi-table dataset + (advanced version with more hyperparameters) + """ + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsClassifier + from sklearn import metrics + from sklearn.model_selection import train_test_split + + # Load the root table of the dataset into a pandas dataframe + accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + accidents_df = pd.read_csv( + os.path.join(accidents_dataset_path, "Accidents.txt"), + sep="\t", + encoding="latin1", + ) + + # Split the root dataframe into train and test + accidents_train_df, accidents_test_df = train_test_split( + accidents_df, test_size=0.3, random_state=1 + ) + + # Obtain the main X feature table and the y target vector ("Class" column) + y_train = accidents_train_df["Gravity"] + y_test = accidents_test_df["Gravity"] + X_train_main = 
accidents_train_df.drop("Gravity", axis=1) + X_test_main = accidents_test_df.drop("Gravity", axis=1) + + # Load the secondary table of the dataset into a pandas dataframe + vehicles_df = pd.read_csv( + os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" + ) + + # Split the secondary dataframe with the keys of the splitted root dataframe + X_train_ids = X_train_main["AccidentId"].to_frame() + X_test_ids = X_test_main["AccidentId"].to_frame() + X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId") + X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId") + + # Create the dataset multitable specification for the train/test split + # We specify each table with a name and a tuple (dataframe, key_columns) + X_train = { + "main_table": "Accidents", + "tables": { + "Accidents": (X_train_main, "AccidentId"), + "Vehicles": (X_train_secondary, ["AccidentId", "VehicleId"]), + }, + } + X_test = { + "main_table": "Accidents", + "tables": { + "Accidents": (X_test_main, "AccidentId"), + "Vehicles": (X_test_secondary, ["AccidentId", "VehicleId"]), + }, + } + # Train the classifier (by default it analyzes 100 multi-table features) + khc = KhiopsClassifier( + n_features=20, + n_pairs=5, + n_trees=5, + n_selected_features=10, + n_evaluated_features=15, + specific_pairs=[("Light", "Weather"), ("Light", "IntersectionType")], + all_possible_pairs=True, + construction_rules=["TableMode", "TableSelection"], + group_target_value=False, + ) + khc.fit(X_train, y_train) + + # Predict the class on the test dataset + y_test_pred = khc.predict(X_test) + print("Predicted classes (first 10):") + print(y_test_pred[:10]) + print("---") + + # Predict the class probability on the test dataset + y_test_probas = khc.predict_proba(X_test) + print(f"Class order: {khc.classes_}") + print("Predicted class probabilities (first 10):") + print(y_test_probas[:10]) + print("---") + + # Evaluate accuracy and auc metrics on the test dataset + test_accuracy = 
metrics.accuracy_score(y_test, y_test_pred) + test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1]) + print(f"Test accuracy = {test_accuracy}") + print(f"Test auc = {test_auc}") + + def khiops_regressor(): """Trains a `.KhiopsRegressor` on a monotable dataframe""" # Imports @@ -688,6 +780,67 @@ def khiops_encoder_pipeline_with_hgbc(): print(f"Test auc = {test_auc}") +def khiops_encoder_advanced(): + """Trains a `.KhiopsEncoder` on a star multi-table dataset + (advanced version with more hyperparameters) + """ + # Imports + import os + import pandas as pd + from khiops import core as kh + from khiops.sklearn import KhiopsEncoder + + # Load the root table of the dataset into a pandas dataframe + accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + accidents_df = pd.read_csv( + os.path.join(accidents_dataset_path, "Accidents.txt"), + sep="\t", + encoding="latin1", + ) + + # Obtain the root X feature table and the y target vector ("Class" column) + X_main = accidents_df.drop("Gravity", axis=1) + y = accidents_df["Gravity"] + + # Load the secondary table of the dataset into a pandas dataframe + X_secondary = pd.read_csv( + os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" + ) + + # Create the dataset multitable specification for the train/test split + # We specify each table with a name and a tuple (dataframe, key_columns) + X_dataset = { + "main_table": "Accidents", + "tables": { + "Accidents": (X_main, "AccidentId"), + "Vehicles": (X_secondary, ["AccidentId", "VehicleId"]), + }, + } + + # Create the KhiopsEncoder with 10 additional multitable features and fit it + khe = KhiopsEncoder( + n_features=20, + n_pairs=5, + n_trees=5, + specific_pairs=[("Light", "Weather"), ("Light", "IntersectionType")], + all_possible_pairs=True, + construction_rules=["TableMode", "TableSelection"], + group_target_value=False, + informative_features_only=True, + keep_initial_variables=True, + transform_type_categorical="part_id", + 
transform_type_numerical="part_id", + transform_pairs="part_id", + ) + khe.fit(X_dataset, y) + + # Transform the train dataset + print("Encoded feature names:") + print(khe.feature_names_out_) + print("Encoded data:") + print(khe.transform(X_dataset)[:10]) + + # pylint: enable=line-too-long @@ -944,11 +1097,13 @@ def khiops_classifier_multitable_star_file(): khiops_classifier_multitable_snowflake, khiops_classifier_sparse, khiops_classifier_pickle, + khiops_classifier_advanced, khiops_regressor, khiops_encoder, khiops_encoder_multitable_star, khiops_encoder_multitable_snowflake, khiops_encoder_pipeline_with_hgbc, + khiops_encoder_advanced, khiops_coclustering, khiops_coclustering_simplify, khiops_classifier_multitable_list, diff --git a/khiops/sklearn/estimators.py b/khiops/sklearn/estimators.py index d863e532..798c6788 100644 --- a/khiops/sklearn/estimators.py +++ b/khiops/sklearn/estimators.py @@ -1308,6 +1308,9 @@ def __init__( n_features=100, n_pairs=0, n_trees=10, + specific_pairs=None, + all_possible_pairs=True, + construction_rules=None, verbose=False, output_dir=None, auto_sort=True, @@ -1324,6 +1327,9 @@ def __init__( self.n_features = n_features self.n_pairs = n_pairs self.n_trees = n_trees + self.specific_pairs = specific_pairs + self.all_possible_pairs = all_possible_pairs + self.construction_rules = construction_rules self._predicted_target_meta_data_tag = None # Deprecation message for 'key' constructor parameter @@ -1403,6 +1409,30 @@ def _fit_check_params(self, dataset, **kwargs): raise TypeError(type_error_message("n_pairs", self.n_pairs, int)) if self.n_pairs < 0: raise ValueError("'n_pairs' must be positive") + if self.specific_pairs is not None: + if not isinstance(self.specific_pairs, list): + raise TypeError( + type_error_message("specific_pairs", self.specific_pairs, list) + ) + else: + for pair in self.specific_pairs: + if not isinstance(pair, tuple): + raise TypeError(type_error_message("pair", pair, tuple)) + if not
isinstance(self.all_possible_pairs, bool): + raise TypeError( + type_error_message("all_possible_pairs", self.all_possible_pairs, bool) + ) + if self.construction_rules is not None: + if not isinstance(self.construction_rules, list): + raise TypeError( + type_error_message( + "construction_rules", self.construction_rules, list + ) + ) + else: + for rule in self.construction_rules: + if not isinstance(rule, str): + raise TypeError(type_error_message("rule", rule, str)) def _fit_train_model(self, dataset, computation_dir, **kwargs): # Train the model with Khiops @@ -1655,6 +1685,11 @@ def __init__( n_features=100, n_pairs=0, n_trees=10, + n_selected_features=0, + n_evaluated_features=0, + specific_pairs=None, + all_possible_pairs=True, + construction_rules=None, verbose=False, output_dir=None, auto_sort=True, @@ -1665,6 +1700,9 @@ def __init__( n_features=n_features, n_pairs=n_pairs, n_trees=n_trees, + specific_pairs=specific_pairs, + all_possible_pairs=all_possible_pairs, + construction_rules=construction_rules, verbose=verbose, output_dir=output_dir, auto_sort=auto_sort, @@ -1673,6 +1711,8 @@ def __init__( ) # Data to be specified by inherited classes self._predicted_target_meta_data_tag = None + self.n_evaluated_features = n_evaluated_features + self.n_selected_features = n_selected_features def predict(self, X): """Predicts the target variable for the test dataset X @@ -1707,6 +1747,18 @@ def predict(self, X): assert isinstance(y_pred, (str, pd.DataFrame)), "Expected str or DataFrame" return y_pred + def _fit_prepare_training_function_inputs(self, dataset, computation_dir): + # Call the parent method + args, kwargs = super()._fit_prepare_training_function_inputs( + dataset, computation_dir + ) + + # Rename parameters to be compatible with khiops.core + kwargs["max_evaluated_variables"] = kwargs.pop("n_evaluated_features") + kwargs["max_selected_variables"] = kwargs.pop("n_selected_features") + + return args, kwargs + def
_transform_prepare_deployment_model_for_predict(self): assert ( self._predicted_target_meta_data_tag is not None @@ -1744,6 +1796,16 @@ def get_feature_used_statistics(self, modeling_report): feature_used_importances_ = np.array([]) return feature_used_names_, feature_used_importances_ + def _fit_check_params(self, dataset, **kwargs): + # Call parent method + super()._fit_check_params(dataset, **kwargs) + + # Check estimator parameters + if self.n_evaluated_features < 0: + raise ValueError("'n_evaluated_features' must be positive") + if self.n_selected_features < 0: + raise ValueError("'n_selected_features' must be positive") + class KhiopsClassifier(KhiopsPredictor, ClassifierMixin): # Disable line too long as this docstring *needs* to have lines longer than 88c @@ -1765,7 +1827,7 @@ class KhiopsClassifier(KhiopsPredictor, ClassifierMixin): construct. See :doc:`/multi_table_primer` for more details. n_pairs : int, default 0 Maximum number of pair features to construct. These features represent a 2D grid - partition of the domain of a pair of variables in which is optimized in a way + partition of the domain of a pair of features in which is optimized in a way that the cells are the purest possible with respect to the target. Only pairs which jointly are more informative that its univariate components may be taken into account in the classifier. @@ -1774,6 +1836,25 @@ class KhiopsClassifier(KhiopsPredictor, ClassifierMixin): combine other features, either native or constructed. These features usually improve the classifier's performance at the cost of interpretability of the model. + n_selected_features : int, default 0 + Maximum number of features to be selected in the SNB predictor. If equal to + 0 it selects all the features kept in the training. + n_evaluated_features : int, default 0 + Maximum number of features to be evaluated in the SNB predictor training. If + equal to 0 it evaluates all informative features. 
+ specific_pairs : list of tuple, optional + User-specified pairs as a list of 2-tuples of feature names. If a given tuple + contains only one non-empty feature name, then it generates all the pairs + containing it (within the maximum limit n_pairs). + all_possible_pairs : bool, default True + If True tries to create all possible pairs within the limit n_pairs. + The pairs and features given in specific_pairs have priority. + construction_rules : list of str, optional + Allowed rules for the automatic feature construction. If not set, it uses all + possible rules. + group_target_value : bool, default ``False`` + Allows grouping of the target values in classification. It can substantially + increase the training time. verbose : bool, default ``False`` If ``True`` it prints debug information and it does not erase temporary files when fitting, predicting or transforming. @@ -1864,6 +1945,12 @@ def __init__( n_features=100, n_pairs=0, n_trees=10, + n_selected_features=0, + n_evaluated_features=0, + specific_pairs=None, + all_possible_pairs=True, + construction_rules=None, + group_target_value=False, verbose=False, output_dir=None, auto_sort=True, @@ -1874,12 +1961,18 @@ def __init__( n_features=n_features, n_pairs=n_pairs, n_trees=n_trees, + n_selected_features=n_selected_features, + n_evaluated_features=n_evaluated_features, + specific_pairs=specific_pairs, + all_possible_pairs=all_possible_pairs, + construction_rules=construction_rules, verbose=verbose, output_dir=output_dir, auto_sort=auto_sort, key=key, internal_sort=internal_sort, ) + self.group_target_value = group_target_value self._khiops_model_prefix = "SNB_" self._predicted_target_meta_data_tag = "Prediction" @@ -1910,6 +2003,16 @@ def _sorted_prob_variable_names(self): return sorted_prob_variable_names + def _fit_check_params(self, dataset, **kwargs): + # Call parent method + super()._fit_check_params(dataset, **kwargs) + + # Check 'group_target_value' parameter + if not isinstance(self.group_target_value,
bool): + raise TypeError( + type_error_message("group_target_value", self.group_target_value, bool) + ) + def fit(self, X, y, **kwargs): """Fits a Selective Naive Bayes classifier according to X, y @@ -2171,10 +2274,26 @@ class KhiopsRegressor(KhiopsPredictor, RegressorMixin): construct. See :doc:`/multi_table_primer` for more details. n_pairs : int, default 0 Maximum number of pair features to construct. These features represent a 2D grid - partition of the domain of a pair of variables in which is optimized in a way + partition of the domain of a pair of features in which is optimized in a way that the cells are the purest possible with respect to the target. Only pairs which jointly are more informative that its univariate components may be taken into account in the regressor. + n_selected_features : int, default 0 + Maximum number of features to be selected in the SNB predictor. If equal to + 0 it selects all the features kept in the training. + n_evaluated_features : int, default 0 + Maximum number of features to be evaluated in the SNB predictor training. If + equal to 0 it evaluates all informative features. + specific_pairs : list of tuple, optional + User-specified pairs as a list of 2-tuples of feature names. If a given tuple + contains only one non-empty feature name, then it generates all the pairs + containing it (within the maximum limit n_pairs). + all_possible_pairs : bool, default True + If True tries to create all possible pairs within the limit n_pairs. + The pairs and features given in specific_pairs have priority. + construction_rules : list of str, optional + Allowed rules for the automatic feature construction. If not set, it uses all + possible rules. verbose : bool, default ``False`` If ``True`` it prints debug information and it does not erase temporary files when fitting, predicting or transforming.
@@ -2255,6 +2374,11 @@ def __init__( n_features=100, n_pairs=0, n_trees=0, + n_selected_features=0, + n_evaluated_features=0, + specific_pairs=None, + all_possible_pairs=True, + construction_rules=None, verbose=False, output_dir=None, auto_sort=True, @@ -2265,6 +2389,11 @@ def __init__( n_features=n_features, n_pairs=n_pairs, n_trees=n_trees, + n_selected_features=n_selected_features, + n_evaluated_features=n_evaluated_features, + specific_pairs=specific_pairs, + all_possible_pairs=all_possible_pairs, + construction_rules=construction_rules, verbose=verbose, output_dir=output_dir, auto_sort=auto_sort, @@ -2411,16 +2540,31 @@ class KhiopsEncoder(KhiopsSupervisedEstimator, TransformerMixin): construct. See :doc:`/multi_table_primer` for more details. n_pairs : int, default 0 Maximum number of pair features to construct. These features represent a 2D grid - partition of the domain of a pair of variables in which is optimized in a way + partition of the domain of a pair of features in which is optimized in a way that the cells are the purest possible with respect to the target. n_trees : int, default 10 Maximum number of decision tree features to construct. The constructed trees combine other features, either native or constructed. These features usually improve a predictor's performance at the cost of interpretability of the model. + specific_pairs : list of tuple, optional + User-specified pairs as a list of 2-tuples of feature names. If a given tuple + contains only one non-empty feature name, then it generates all the pairs + containing it (within the maximum limit n_pairs). + all_possible_pairs : bool, default True + If True tries to create all possible pairs within the limit n_pairs. + The pairs and features given in specific_pairs have priority. + construction_rules : list of str, optional + Allowed rules for the automatic feature construction. If not set, it uses all + possible rules.
+ informative_features_only : bool, default ``True`` + If ``True`` keeps only informative features. + group_target_value : bool, default ``False`` + Allows grouping of the target values in classification. It can substantially + increase the training time. keep_initial_variables : bool, default ``False`` If ``True`` the original columns are kept in the transformed data. transform_type_categorical : str, default "part_id" - Type of transformation for categorical variables. Valid values: + Type of transformation for categorical features. Valid values: - "part_id" - "part_label" - "dummies" @@ -2440,6 +2584,12 @@ class KhiopsEncoder(KhiopsSupervisedEstimator, TransformerMixin): See the documentation for the ``numerical_recoding_method`` parameter of the `~.api.train_recoder` function for more details. + transform_pairs : str, default "part_id" + Type of transformation for bivariate features. Valid values: + - "part_id" + - "part_label" + - "dummies" + - "conditional_info" verbose : bool, default ``False`` If ``True`` it prints debug information and it does not erase temporary files when fitting, predicting or transforming.
@@ -2506,9 +2656,15 @@ def __init__( n_features=100, n_pairs=0, n_trees=0, + specific_pairs=None, + all_possible_pairs=True, + construction_rules=None, + informative_features_only=True, + group_target_value=False, + keep_initial_variables=False, transform_type_categorical="part_id", transform_type_numerical="part_id", - keep_initial_variables=False, + transform_pairs="part_id", verbose=False, output_dir=None, auto_sort=True, @@ -2519,6 +2675,9 @@ def __init__( n_features=n_features, n_pairs=n_pairs, n_trees=n_trees, + specific_pairs=specific_pairs, + all_possible_pairs=all_possible_pairs, + construction_rules=construction_rules, verbose=verbose, output_dir=output_dir, auto_sort=auto_sort, @@ -2526,8 +2685,11 @@ def __init__( internal_sort=internal_sort, ) self.categorical_target = categorical_target + self.group_target_value = group_target_value self.transform_type_categorical = transform_type_categorical self.transform_type_numerical = transform_type_numerical + self.transform_pairs = transform_pairs + self.informative_features_only = informative_features_only self.keep_initial_variables = keep_initial_variables self._khiops_model_prefix = "R_" @@ -2567,6 +2729,21 @@ def _numerical_transform_method(self): ) return _transform_types_numerical[self.transform_type_numerical] + def _pairs_transform_method(self): + _transform_types = { + "part_id": "part Id", + "part_label": "part label", + "dummies": "0-1 binarization", + "conditional_info": "conditional info", + None: "none", + } + if self.transform_pairs not in _transform_types: + raise ValueError( + "'transform_pairs' must be one of the following: " + + ", ".join(map(str, _transform_types)) + ) + return _transform_types[self.transform_pairs] + def _fit_check_params(self, dataset, **kwargs): # Call parent method super()._fit_check_params(dataset, **kwargs) @@ -2600,6 +2777,25 @@ def _fit_check_params(self, dataset, **kwargs): "transform_type_categorical and transform_type_numerical " "cannot be both None with n_trees == 0." 
) + # Check 'transform_pairs' parameter + if not isinstance(self.transform_pairs, str): + raise TypeError( + type_error_message("transform_pairs", self.transform_pairs, str) + ) + self._pairs_transform_method() # Raises ValueError if invalid + + # Check 'informative_features_only' parameter + if not isinstance(self.informative_features_only, bool): + raise TypeError( + type_error_message( + "informative_features_only", self.informative_features_only, bool + ) + ) + # Check 'group_target_value' parameter + if not isinstance(self.group_target_value, bool): + raise TypeError( + type_error_message("group_target_value", self.group_target_value, bool) + ) def _check_target_type(self, dataset): if self.categorical_target: @@ -2650,16 +2846,20 @@ def _fit_prepare_training_function_inputs(self, dataset, computation_dir): args, kwargs = super()._fit_prepare_training_function_inputs( dataset, computation_dir ) - # Rename encoder parameters, delete unused ones + # to be compatible with khiops.core kwargs["keep_initial_categorical_variables"] = kwargs["keep_initial_variables"] kwargs["keep_initial_numerical_variables"] = kwargs.pop( "keep_initial_variables" ) kwargs["categorical_recoding_method"] = self._categorical_transform_method() kwargs["numerical_recoding_method"] = self._numerical_transform_method() + kwargs["pairs_recoding_method"] = self._pairs_transform_method() + kwargs["informative_variables_only"] = kwargs.pop("informative_features_only") + del kwargs["transform_type_categorical"] del kwargs["transform_type_numerical"] + del kwargs["transform_pairs"] del kwargs["categorical_target"] return args, kwargs @@ -2686,7 +2886,7 @@ def transform(self, X): """Transforms X with a fitted Khiops supervised encoder .. note:: - Numerical variables are encoded to categorical ones. See the + Numerical features are encoded to categorical ones. See the ``transform_type_numerical`` parameter for details. 
Parameters diff --git a/tests/test_sklearn.py b/tests/test_sklearn.py index 463fae80..a250192f 100644 --- a/tests/test_sklearn.py +++ b/tests/test_sklearn.py @@ -1081,12 +1081,48 @@ def setUpClass(cls): }, "monotable": { ("dataframe", "dataframe_xy"): { - KhiopsPredictor: { + KhiopsClassifier: { "fit": { ("khiops.core", "train_predictor"): { "field_separator": "\t", "detect_format": False, "header_line": True, + "max_pairs": 1, + "max_trees": 5, + "max_selected_variables": 1, + "max_evaluated_variables": 3, + "specific_pairs": [("age", "race")], + "all_possible_pairs": False, + "construction_rules": ["TableMode", "TableSelection"], + "group_target_value": False, + "additional_data_tables": {}, + } + }, + "predict": { + ("khiops.core", "deploy_model"): { + "field_separator": "\t", + "detect_format": False, + "header_line": True, + "log_file_path": os.path.join( + cls.output_dir, "khiops.log" + ), + "additional_data_tables": {}, + } + }, + }, + KhiopsRegressor: { + "fit": { + ("khiops.core", "train_predictor"): { + "field_separator": "\t", + "detect_format": False, + "header_line": True, + "max_pairs": 1, + "max_trees": 0, + "max_selected_variables": 1, + "max_evaluated_variables": 3, + "specific_pairs": [("age", "race")], + "all_possible_pairs": False, + "construction_rules": ["TableMode", "TableSelection"], "additional_data_tables": {}, } }, @@ -1108,11 +1144,19 @@ def setUpClass(cls): "field_separator": "\t", "detect_format": False, "header_line": True, - "additional_data_tables": {}, + "max_pairs": 1, + "max_trees": 5, + "specific_pairs": [("age", "race")], + "all_possible_pairs": False, + "construction_rules": ["TableMode", "TableSelection"], + "informative_variables_only": True, + "group_target_value": False, "keep_initial_categorical_variables": False, "keep_initial_numerical_variables": False, "categorical_recoding_method": "part Id", "numerical_recoding_method": "part Id", + "pairs_recoding_method": "part Id", + "additional_data_tables": {}, } }, "predict": 
{ @@ -1129,12 +1173,48 @@ def setUpClass(cls): }, }, ("file_dataset",): { - KhiopsPredictor: { + KhiopsClassifier: { + "fit": { + ("khiops.core", "train_predictor"): { + "field_separator": "\t", + "detect_format": False, + "header_line": True, + "max_pairs": 1, + "max_trees": 5, + "max_selected_variables": 1, + "max_evaluated_variables": 3, + "specific_pairs": [("age", "race")], + "all_possible_pairs": False, + "construction_rules": ["TableMode", "TableSelection"], + "group_target_value": False, + "additional_data_tables": {}, + } + }, + "predict": { + ("khiops.core", "deploy_model"): { + "field_separator": "\t", + "detect_format": False, + "header_line": True, + "log_file_path": os.path.join( + cls.output_dir, "khiops.log" + ), + "additional_data_tables": {}, + } + }, + }, + KhiopsRegressor: { "fit": { ("khiops.core", "train_predictor"): { "field_separator": "\t", "detect_format": False, "header_line": True, + "max_pairs": 1, + "max_trees": 0, + "max_selected_variables": 1, + "max_evaluated_variables": 3, + "specific_pairs": [("age", "race")], + "all_possible_pairs": False, + "construction_rules": ["TableMode", "TableSelection"], "additional_data_tables": {}, } }, @@ -1156,11 +1236,19 @@ def setUpClass(cls): "field_separator": "\t", "detect_format": False, "header_line": True, - "additional_data_tables": {}, + "max_pairs": 1, + "max_trees": 5, + "specific_pairs": [("age", "race")], + "all_possible_pairs": False, + "construction_rules": ["TableMode", "TableSelection"], + "informative_variables_only": True, + "group_target_value": False, "keep_initial_categorical_variables": False, "keep_initial_numerical_variables": False, "categorical_recoding_method": "part Id", "numerical_recoding_method": "part Id", + "pairs_recoding_method": "part Id", + "additional_data_tables": {}, } }, "predict": { @@ -1179,12 +1267,21 @@ def setUpClass(cls): }, "multitable": { ("dataframe",): { - KhiopsPredictor: { + KhiopsClassifier: { "fit": { ("khiops.core", "train_predictor"): { 
"field_separator": "\t", "detect_format": False, "header_line": True, + "max_constructed_variables": 10, + "max_pairs": 1, + "max_trees": 5, + "max_selected_variables": 1, + "max_evaluated_variables": 3, + "specific_pairs": [], + "all_possible_pairs": False, + "construction_rules": ["TableMode", "TableSelection"], + "group_target_value": False, "additional_data_tables": { "SpliceJunction`SpliceJunctionDNA" }, @@ -1204,19 +1301,61 @@ def setUpClass(cls): } }, }, - KhiopsEncoder: { + KhiopsRegressor: { "fit": { - ("khiops.core", "train_recoder"): { + ("khiops.core", "train_predictor"): { "field_separator": "\t", "detect_format": False, "header_line": True, + "max_constructed_variables": 10, + "max_pairs": 1, + "max_trees": 0, + "max_selected_variables": 1, + "max_evaluated_variables": 3, + "specific_pairs": [], + "all_possible_pairs": False, + "construction_rules": ["TableMode", "TableSelection"], "additional_data_tables": { "SpliceJunction`SpliceJunctionDNA" }, + } + }, + "predict": { + ("khiops.core", "deploy_model"): { + "field_separator": "\t", + "detect_format": False, + "header_line": True, + "log_file_path": os.path.join( + cls.output_dir, "khiops.log" + ), + "additional_data_tables": { + "SNB_SpliceJunction`SpliceJunctionDNA" + }, + } + }, + }, + KhiopsEncoder: { + "fit": { + ("khiops.core", "train_recoder"): { + "field_separator": "\t", + "detect_format": False, + "header_line": True, + "max_constructed_variables": 10, + "max_pairs": 1, + "max_trees": 5, + "specific_pairs": [], + "all_possible_pairs": False, + "construction_rules": ["TableMode", "TableSelection"], + "informative_variables_only": True, + "group_target_value": False, "keep_initial_categorical_variables": False, "keep_initial_numerical_variables": False, "categorical_recoding_method": "part Id", "numerical_recoding_method": "part Id", + "pairs_recoding_method": "part Id", + "additional_data_tables": { + "SpliceJunction`SpliceJunctionDNA" + }, } }, "predict": { @@ -1235,12 +1374,21 @@ def 
setUpClass(cls): }, }, ("file_dataset",): { - KhiopsPredictor: { + KhiopsClassifier: { "fit": { ("khiops.core", "train_predictor"): { "field_separator": "\t", "detect_format": False, "header_line": True, + "max_constructed_variables": 10, + "max_pairs": 1, + "max_trees": 5, + "max_selected_variables": 1, + "max_evaluated_variables": 3, + "specific_pairs": [], + "all_possible_pairs": False, + "construction_rules": ["TableMode", "TableSelection"], + "group_target_value": False, "log_file_path": os.path.join( cls.output_dir, "khiops.log" ), @@ -1263,19 +1411,64 @@ def setUpClass(cls): } }, }, - KhiopsEncoder: { + KhiopsRegressor: { "fit": { - ("khiops.core", "train_recoder"): { + ("khiops.core", "train_predictor"): { "field_separator": "\t", "detect_format": False, "header_line": True, + "max_constructed_variables": 10, + "max_pairs": 1, + "max_trees": 0, + "max_selected_variables": 1, + "max_evaluated_variables": 3, + "specific_pairs": [], + "all_possible_pairs": False, + "construction_rules": ["TableMode", "TableSelection"], + "log_file_path": os.path.join( + cls.output_dir, "khiops.log" + ), "additional_data_tables": { "SpliceJunction`SpliceJunctionDNA" }, + } + }, + "predict": { + ("khiops.core", "deploy_model"): { + "field_separator": "\t", + "detect_format": False, + "header_line": True, + "log_file_path": os.path.join( + cls.output_dir, "khiops.log" + ), + "additional_data_tables": { + "SNB_SpliceJunction`SpliceJunctionDNA" + }, + } + }, + }, + KhiopsEncoder: { + "fit": { + ("khiops.core", "train_recoder"): { + "field_separator": "\t", + "detect_format": False, + "header_line": True, + "max_constructed_variables": 10, + "max_pairs": 1, + "max_trees": 5, + "specific_pairs": [], + "all_possible_pairs": False, + "construction_rules": ["TableMode", "TableSelection"], + "informative_variables_only": True, + "group_target_value": False, "keep_initial_categorical_variables": False, "keep_initial_numerical_variables": False, "categorical_recoding_method": "part Id", 
"numerical_recoding_method": "part Id", + "pairs_recoding_method": "part Id", + "additional_data_tables": { + "SpliceJunction`SpliceJunctionDNA" + }, } }, "predict": { @@ -1677,6 +1870,7 @@ def _test_template( schema_type, source_type, custom_kwargs=None, + extra_estimator_kwargs=None, ): """Test template @@ -1808,7 +2002,11 @@ def _test_template( ) # Train the estimator - estimator = estimator_type(output_dir=self.output_dir, auto_sort=False) + if extra_estimator_kwargs is None: + extra_estimator_kwargs = {} + estimator = estimator_type( + output_dir=self.output_dir, auto_sort=False, **extra_estimator_kwargs + ) fit_kwargs = ( custom_kwargs.get("fit", {}) if custom_kwargs is not None else {} ) @@ -1884,7 +2082,7 @@ def _test_template( # Check the function kwargs expected_kwargs_list = ( - adict.get(estimator_type_key) + adict.get(estimator_type) .get(estimator_method) .get((module_name, function_name)) for adict in KhiopsTestHelper.get_with_subkey( @@ -1910,6 +2108,16 @@ def test_parameter_transfer_classifier_fit_from_monotable_dataframe(self): estimator_method="fit", schema_type="monotable", source_type="dataframe", + extra_estimator_kwargs={ + "n_pairs": 1, + "n_trees": 5, + "n_selected_features": 1, + "n_evaluated_features": 3, + "specific_pairs": [("age", "race")], + "all_possible_pairs": False, + "construction_rules": ["TableMode", "TableSelection"], + "group_target_value": False, + }, ) def test_parameter_transfer_classifier_fit_from_monotable_dataframe_with_df_y( @@ -1921,6 +2129,16 @@ def test_parameter_transfer_classifier_fit_from_monotable_dataframe_with_df_y( estimator_method="fit", schema_type="monotable", source_type="dataframe_xy", + extra_estimator_kwargs={ + "n_pairs": 1, + "n_trees": 5, + "n_selected_features": 1, + "n_evaluated_features": 3, + "specific_pairs": [("age", "race")], + "all_possible_pairs": False, + "construction_rules": ["TableMode", "TableSelection"], + "group_target_value": False, + }, ) def 
test_parameter_transfer_classifier_fit_from_monotable_file_dataset(self): @@ -1930,6 +2148,16 @@ def test_parameter_transfer_classifier_fit_from_monotable_file_dataset(self): estimator_method="fit", schema_type="monotable", source_type="file_dataset", + extra_estimator_kwargs={ + "n_pairs": 1, + "n_trees": 5, + "n_selected_features": 1, + "n_evaluated_features": 3, + "specific_pairs": [("age", "race")], + "all_possible_pairs": False, + "construction_rules": ["TableMode", "TableSelection"], + "group_target_value": False, + }, ) def test_parameter_transfer_classifier_fit_from_multitable_dataframe(self): @@ -1939,6 +2167,17 @@ def test_parameter_transfer_classifier_fit_from_multitable_dataframe(self): estimator_method="fit", schema_type="multitable", source_type="dataframe", + extra_estimator_kwargs={ + "n_features": 10, + "n_pairs": 1, + "n_trees": 5, + "n_selected_features": 1, + "n_evaluated_features": 3, + "specific_pairs": [], + "all_possible_pairs": False, + "construction_rules": ["TableMode", "TableSelection"], + "group_target_value": False, + }, ) def test_parameter_transfer_classifier_fit_from_multitable_file_dataset(self): @@ -1948,6 +2187,17 @@ def test_parameter_transfer_classifier_fit_from_multitable_file_dataset(self): estimator_method="fit", schema_type="multitable", source_type="file_dataset", + extra_estimator_kwargs={ + "n_features": 10, + "n_pairs": 1, + "n_trees": 5, + "n_selected_features": 1, + "n_evaluated_features": 3, + "specific_pairs": [], + "all_possible_pairs": False, + "construction_rules": ["TableMode", "TableSelection"], + "group_target_value": False, + }, ) def test_parameter_transfer_classifier_predict_from_monotable_dataframe(self): @@ -1997,6 +2247,19 @@ def test_parameter_transfer_encoder_fit_from_monotable_dataframe(self): estimator_method="fit", schema_type="monotable", source_type="dataframe", + extra_estimator_kwargs={ + "n_pairs": 1, + "n_trees": 5, + "specific_pairs": [("age", "race")], + "all_possible_pairs": False, + 
"construction_rules": ["TableMode", "TableSelection"], + "informative_features_only": True, + "group_target_value": False, + "keep_initial_variables": False, + "transform_type_categorical": "part_id", + "transform_type_numerical": "part_id", + "transform_pairs": "part_id", + }, ) def test_parameter_transfer_encoder_fit_from_monotable_dataframe_with_df_y( @@ -2008,6 +2271,19 @@ def test_parameter_transfer_encoder_fit_from_monotable_dataframe_with_df_y( estimator_method="fit", schema_type="monotable", source_type="dataframe_xy", + extra_estimator_kwargs={ + "n_pairs": 1, + "n_trees": 5, + "specific_pairs": [("age", "race")], + "all_possible_pairs": False, + "construction_rules": ["TableMode", "TableSelection"], + "informative_features_only": True, + "group_target_value": False, + "keep_initial_variables": False, + "transform_type_categorical": "part_id", + "transform_type_numerical": "part_id", + "transform_pairs": "part_id", + }, ) def test_parameter_transfer_encoder_fit_from_monotable_file_dataset(self): @@ -2017,6 +2293,19 @@ def test_parameter_transfer_encoder_fit_from_monotable_file_dataset(self): estimator_method="fit", schema_type="monotable", source_type="file_dataset", + extra_estimator_kwargs={ + "n_pairs": 1, + "n_trees": 5, + "specific_pairs": [("age", "race")], + "all_possible_pairs": False, + "construction_rules": ["TableMode", "TableSelection"], + "informative_features_only": True, + "group_target_value": False, + "keep_initial_variables": False, + "transform_type_categorical": "part_id", + "transform_type_numerical": "part_id", + "transform_pairs": "part_id", + }, ) def test_parameter_transfer_encoder_fit_from_multitable_dataframe(self): @@ -2026,6 +2315,20 @@ def test_parameter_transfer_encoder_fit_from_multitable_dataframe(self): estimator_method="fit", schema_type="multitable", source_type="dataframe", + extra_estimator_kwargs={ + "n_features": 10, + "n_pairs": 1, + "n_trees": 5, + "specific_pairs": [], + "all_possible_pairs": False, + 
"construction_rules": ["TableMode", "TableSelection"], + "informative_features_only": True, + "group_target_value": False, + "keep_initial_variables": False, + "transform_type_categorical": "part_id", + "transform_type_numerical": "part_id", + "transform_pairs": "part_id", + }, ) def test_parameter_transfer_encoder_fit_from_multitable_file_dataset(self): @@ -2035,6 +2338,20 @@ def test_parameter_transfer_encoder_fit_from_multitable_file_dataset(self): estimator_method="fit", schema_type="multitable", source_type="file_dataset", + extra_estimator_kwargs={ + "n_features": 10, + "n_pairs": 1, + "n_trees": 5, + "specific_pairs": [], + "all_possible_pairs": False, + "construction_rules": ["TableMode", "TableSelection"], + "informative_features_only": True, + "group_target_value": False, + "keep_initial_variables": False, + "transform_type_categorical": "part_id", + "transform_type_numerical": "part_id", + "transform_pairs": "part_id", + }, ) def test_parameter_transfer_encoder_predict_from_monotable_dataframe(self): @@ -2080,6 +2397,14 @@ def test_parameter_transfer_regressor_fit_from_monotable_dataframe(self): estimator_method="fit", schema_type="monotable", source_type="dataframe", + extra_estimator_kwargs={ + "n_pairs": 1, + "n_selected_features": 1, + "n_evaluated_features": 3, + "specific_pairs": [("age", "race")], + "all_possible_pairs": False, + "construction_rules": ["TableMode", "TableSelection"], + }, ) def test_parameter_transfer_regressor_fit_from_monotable_dataframe_with_df_y( @@ -2091,6 +2416,14 @@ def test_parameter_transfer_regressor_fit_from_monotable_dataframe_with_df_y( estimator_method="fit", schema_type="monotable", source_type="dataframe_xy", + extra_estimator_kwargs={ + "n_pairs": 1, + "n_selected_features": 1, + "n_evaluated_features": 3, + "specific_pairs": [("age", "race")], + "all_possible_pairs": False, + "construction_rules": ["TableMode", "TableSelection"], + }, ) def test_parameter_transfer_regressor_fit_from_monotable_file_dataset(self): 
@@ -2100,6 +2433,14 @@ def test_parameter_transfer_regressor_fit_from_monotable_file_dataset(self): estimator_method="fit", schema_type="monotable", source_type="file_dataset", + extra_estimator_kwargs={ + "n_pairs": 1, + "n_selected_features": 1, + "n_evaluated_features": 3, + "specific_pairs": [("age", "race")], + "all_possible_pairs": False, + "construction_rules": ["TableMode", "TableSelection"], + }, ) def test_parameter_transfer_regressor_fit_from_multitable_dataframe(self): @@ -2109,6 +2450,16 @@ def test_parameter_transfer_regressor_fit_from_multitable_dataframe(self): estimator_method="fit", schema_type="multitable", source_type="dataframe", + extra_estimator_kwargs={ + "n_features": 10, + "n_pairs": 1, + "n_trees": 0, + "n_selected_features": 1, + "n_evaluated_features": 3, + "specific_pairs": [], + "all_possible_pairs": False, + "construction_rules": ["TableMode", "TableSelection"], + }, ) def test_parameter_transfer_regressor_fit_from_multitable_file_dataset(self): @@ -2118,6 +2469,16 @@ def test_parameter_transfer_regressor_fit_from_multitable_file_dataset(self): estimator_method="fit", schema_type="multitable", source_type="file_dataset", + extra_estimator_kwargs={ + "n_features": 10, + "n_pairs": 1, + "n_trees": 0, + "n_selected_features": 1, + "n_evaluated_features": 3, + "specific_pairs": [], + "all_possible_pairs": False, + "construction_rules": ["TableMode", "TableSelection"], + }, ) def test_parameter_transfer_regressor_predict_from_monotable_dataframe(self):