KhiopsML · nairbenrekia · Apr 16, 2024 · Sep 30, 2024 · Oct 4, 2024 · Oct 4, 2024
diff --git a/doc/samples/samples_sklearn.rst b/doc/samples/samples_sklearn.rst
@@ -402,6 +402,95 @@ Samples
     print("Predicted classes (first 10):")
     print(y_predicted[:10])
     print("---")
+.. autofunction:: khiops_classifier_advanced
+.. code-block:: python
+
+    # Imports
+    import os
+    import pandas as pd
+    from khiops import core as kh
+    from khiops.sklearn import KhiopsClassifier
+    from sklearn import metrics
+    from sklearn.model_selection import train_test_split
+
+    # Load the root table of the dataset into a pandas dataframe
+    accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
+    accidents_df = pd.read_csv(
+        os.path.join(accidents_dataset_path, "Accidents.txt"),
+        sep="\t",
+        encoding="latin1",
+    )
+
+    # Split the root dataframe into train and test
+    accidents_train_df, accidents_test_df = train_test_split(
+        accidents_df, test_size=0.3, random_state=1
+    )
+
+    # Obtain the main X feature table and the y target vector ("Gravity" column)
+    y_train = accidents_train_df["Gravity"]
+    y_test = accidents_test_df["Gravity"]
+    X_train_main = accidents_train_df.drop("Gravity", axis=1)
+    X_test_main = accidents_test_df.drop("Gravity", axis=1)
+
+    # Load the secondary table of the dataset into a pandas dataframe
+    vehicles_df = pd.read_csv(
+        os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t"
+    )
+
+    # Split the secondary dataframe with the keys of the split root dataframe
+    X_train_ids = X_train_main["AccidentId"].to_frame()
+    X_test_ids = X_test_main["AccidentId"].to_frame()
+    X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId")
+    X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId")
+
+    # Create the dataset multitable specification for the train/test split
+    # We specify each table with a name and a tuple (dataframe, key_columns)
+    X_train = {
+        "main_table": "Accidents",
+        "tables": {
+            "Accidents": (X_train_main, "AccidentId"),
+            "Vehicles": (X_train_secondary, ["AccidentId", "VehicleId"]),
+        },
+    }
+    X_test = {
+        "main_table": "Accidents",
+        "tables": {
+            "Accidents": (X_test_main, "AccidentId"),
+            "Vehicles": (X_test_secondary, ["AccidentId", "VehicleId"]),
+        },
+    }
+    # Train the classifier (n_features=20 overrides the default of 100 features)
+    khc = KhiopsClassifier(
+        n_features=20,
+        n_pairs=5,
+        n_trees=5,
+        n_selected_features=10,
+        n_evaluated_features=15,
+        specific_pairs=[("Light", "Weather"), ("Light", "IntersectionType")],
+        all_possible_pairs=True,
+        construction_rules=["TableMode", "TableSelection"],
+        group_target_value=False,
+    )
+    khc.fit(X_train, y_train)
+
+    # Predict the class on the test dataset
+    y_test_pred = khc.predict(X_test)
+    print("Predicted classes (first 10):")
+    print(y_test_pred[:10])
+    print("---")
+
+    # Predict the class probability on the test dataset
+    y_test_probas = khc.predict_proba(X_test)
+    print(f"Class order: {khc.classes_}")
+    print("Predicted class probabilities (first 10):")
+    print(y_test_probas[:10])
+    print("---")
+
+    # Evaluate accuracy and auc metrics on the test dataset
+    test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
+    test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])
+    print(f"Test accuracy = {test_accuracy}")
+    print(f"Test auc      = {test_auc}")
 .. autofunction:: khiops_regressor
 .. code-block:: python
 
@@ -663,6 +752,64 @@ Samples
     test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])
     print(f"Test accuracy = {test_accuracy}")
     print(f"Test auc      = {test_auc}")
+.. autofunction:: khiops_encoder_advanced
+.. code-block:: python
+
+    # Imports
+    import os
+    import pandas as pd
+    from khiops import core as kh
+    from khiops.sklearn import KhiopsEncoder
+
+    # Load the root table of the dataset into a pandas dataframe
+    accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
+    accidents_df = pd.read_csv(
+        os.path.join(accidents_dataset_path, "Accidents.txt"),
+        sep="\t",
+        encoding="latin1",
+    )
+
+    # Obtain the root X feature table and the y target vector ("Gravity" column)
+    X_main = accidents_df.drop("Gravity", axis=1)
+    y = accidents_df["Gravity"]
+
+    # Load the secondary table of the dataset into a pandas dataframe
+    X_secondary = pd.read_csv(
+        os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t"
+    )
+
+    # Create the dataset multitable specification
+    # We specify each table with a name and a tuple (dataframe, key_columns)
+    X_dataset = {
+        "main_table": "Accidents",
+        "tables": {
+            "Accidents": (X_main, "AccidentId"),
+            "Vehicles": (X_secondary, ["AccidentId", "VehicleId"]),
+        },
+    }
+
+    # Create the KhiopsEncoder with 20 multitable features and fit it
+    khe = KhiopsEncoder(
+        n_features=20,
+        n_pairs=5,
+        n_trees=5,
+        specific_pairs=[("Light", "Weather"), ("Light", "IntersectionType")],
+        all_possible_pairs=True,
+        construction_rules=["TableMode", "TableSelection"],
+        group_target_value=False,
+        informative_features_only=True,
+        keep_initial_variables=True,
+        transform_type_categorical="part_id",
+        transform_type_numerical="part_id",
+        transform_pairs="part_id",
+    )
+    khe.fit(X_dataset, y)
+
+    # Transform the train dataset
+    print("Encoded feature names:")
+    print(khe.feature_names_out_)
+    print("Encoded data:")
+    print(khe.transform(X_dataset)[:10])
 .. autofunction:: khiops_coclustering
 .. code-block:: python
 

diff --git a/khiops/samples/samples_sklearn.ipynb b/khiops/samples/samples_sklearn.ipynb
@@ -455,6 +455,108 @@
     "print(\"---\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### `khiops_classifier_advanced()`\n\n",
+    "Trains a `.KhiopsClassifier` on a star multi-table dataset\n    (advanced version with more hyperparameters)\n    \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Imports\n",
+    "import os\n",
+    "import pandas as pd\n",
+    "from khiops import core as kh\n",
+    "from khiops.sklearn import KhiopsClassifier\n",
+    "from sklearn import metrics\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "# Load the root table of the dataset into a pandas dataframe\n",
+    "accidents_dataset_path = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n",
+    "accidents_df = pd.read_csv(\n",
+    "    os.path.join(accidents_dataset_path, \"Accidents.txt\"),\n",
+    "    sep=\"\\t\",\n",
+    "    encoding=\"latin1\",\n",
+    ")\n",
+    "\n",
+    "# Split the root dataframe into train and test\n",
+    "accidents_train_df, accidents_test_df = train_test_split(\n",
+    "    accidents_df, test_size=0.3, random_state=1\n",
+    ")\n",
+    "\n",
+    "# Obtain the main X feature table and the y target vector (\"Gravity\" column)\n",
+    "y_train = accidents_train_df[\"Gravity\"]\n",
+    "y_test = accidents_test_df[\"Gravity\"]\n",
+    "X_train_main = accidents_train_df.drop(\"Gravity\", axis=1)\n",
+    "X_test_main = accidents_test_df.drop(\"Gravity\", axis=1)\n",
+    "\n",
+    "# Load the secondary table of the dataset into a pandas dataframe\n",
+    "vehicles_df = pd.read_csv(\n",
+    "    os.path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n",
+    ")\n",
+    "\n",
+    "# Split the secondary dataframe with the keys of the split root dataframe\n",
+    "X_train_ids = X_train_main[\"AccidentId\"].to_frame()\n",
+    "X_test_ids = X_test_main[\"AccidentId\"].to_frame()\n",
+    "X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n",
+    "X_test_secondary = X_test_ids.merge(vehicles_df, on=\"AccidentId\")\n",
+    "\n",
+    "# Create the dataset multitable specification for the train/test split\n",
+    "# We specify each table with a name and a tuple (dataframe, key_columns)\n",
+    "X_train = {\n",
+    "    \"main_table\": \"Accidents\",\n",
+    "    \"tables\": {\n",
+    "        \"Accidents\": (X_train_main, \"AccidentId\"),\n",
+    "        \"Vehicles\": (X_train_secondary, [\"AccidentId\", \"VehicleId\"]),\n",
+    "    },\n",
+    "}\n",
+    "X_test = {\n",
+    "    \"main_table\": \"Accidents\",\n",
+    "    \"tables\": {\n",
+    "        \"Accidents\": (X_test_main, \"AccidentId\"),\n",
+    "        \"Vehicles\": (X_test_secondary, [\"AccidentId\", \"VehicleId\"]),\n",
+    "    },\n",
+    "}\n",
+    "# Train the classifier (n_features=20 overrides the default of 100 features)\n",
+    "khc = KhiopsClassifier(\n",
+    "    n_features=20,\n",
+    "    n_pairs=5,\n",
+    "    n_trees=5,\n",
+    "    n_selected_features=10,\n",
+    "    n_evaluated_features=15,\n",
+    "    specific_pairs=[(\"Light\", \"Weather\"), (\"Light\", \"IntersectionType\")],\n",
+    "    all_possible_pairs=True,\n",
+    "    construction_rules=[\"TableMode\", \"TableSelection\"],\n",
+    "    group_target_value=False,\n",
+    ")\n",
+    "khc.fit(X_train, y_train)\n",
+    "\n",
+    "# Predict the class on the test dataset\n",
+    "y_test_pred = khc.predict(X_test)\n",
+    "print(\"Predicted classes (first 10):\")\n",
+    "print(y_test_pred[:10])\n",
+    "print(\"---\")\n",
+    "\n",
+    "# Predict the class probability on the test dataset\n",
+    "y_test_probas = khc.predict_proba(X_test)\n",
+    "print(f\"Class order: {khc.classes_}\")\n",
+    "print(\"Predicted class probabilities (first 10):\")\n",
+    "print(y_test_probas[:10])\n",
+    "print(\"---\")\n",
+    "\n",
+    "# Evaluate accuracy and auc metrics on the test dataset\n",
+    "test_accuracy = metrics.accuracy_score(y_test, y_test_pred)\n",
+    "test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])\n",
+    "print(f\"Test accuracy = {test_accuracy}\")\n",
+    "print(f\"Test auc      = {test_auc}\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -781,6 +883,77 @@
     "print(f\"Test auc      = {test_auc}\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### `khiops_encoder_advanced()`\n\n",
+    "Trains a `.KhiopsEncoder` on a star multi-table dataset\n    (advanced version with more hyperparameters)\n    \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Imports\n",
+    "import os\n",
+    "import pandas as pd\n",
+    "from khiops import core as kh\n",
+    "from khiops.sklearn import KhiopsEncoder\n",
+    "\n",
+    "# Load the root table of the dataset into a pandas dataframe\n",
+    "accidents_dataset_path = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n",
+    "accidents_df = pd.read_csv(\n",
+    "    os.path.join(accidents_dataset_path, \"Accidents.txt\"),\n",
+    "    sep=\"\\t\",\n",
+    "    encoding=\"latin1\",\n",
+    ")\n",
+    "\n",
+    "# Obtain the root X feature table and the y target vector (\"Gravity\" column)\n",
+    "X_main = accidents_df.drop(\"Gravity\", axis=1)\n",
+    "y = accidents_df[\"Gravity\"]\n",
+    "\n",
+    "# Load the secondary table of the dataset into a pandas dataframe\n",
+    "X_secondary = pd.read_csv(\n",
+    "    os.path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n",
+    ")\n",
+    "\n",
+    "# Create the dataset multitable specification\n",
+    "# We specify each table with a name and a tuple (dataframe, key_columns)\n",
+    "X_dataset = {\n",
+    "    \"main_table\": \"Accidents\",\n",
+    "    \"tables\": {\n",
+    "        \"Accidents\": (X_main, \"AccidentId\"),\n",
+    "        \"Vehicles\": (X_secondary, [\"AccidentId\", \"VehicleId\"]),\n",
+    "    },\n",
+    "}\n",
+    "\n",
+    "# Create the KhiopsEncoder with 20 multitable features and fit it\n",
+    "khe = KhiopsEncoder(\n",
+    "    n_features=20,\n",
+    "    n_pairs=5,\n",
+    "    n_trees=5,\n",
+    "    specific_pairs=[(\"Light\", \"Weather\"), (\"Light\", \"IntersectionType\")],\n",
+    "    all_possible_pairs=True,\n",
+    "    construction_rules=[\"TableMode\", \"TableSelection\"],\n",
+    "    group_target_value=False,\n",
+    "    informative_features_only=True,\n",
+    "    keep_initial_variables=True,\n",
+    "    transform_type_categorical=\"part_id\",\n",
+    "    transform_type_numerical=\"part_id\",\n",
+    "    transform_pairs=\"part_id\",\n",
+    ")\n",
+    "khe.fit(X_dataset, y)\n",
+    "\n",
+    "# Transform the train dataset\n",
+    "print(\"Encoded feature names:\")\n",
+    "print(khe.feature_names_out_)\n",
+    "print(\"Encoded data:\")\n",
+    "print(khe.transform(X_dataset)[:10])"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},