Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

71 add khiops parameters to predictor constructors #242

Open
wants to merge 4 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 147 additions & 0 deletions doc/samples/samples_sklearn.rst
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,95 @@ Samples
print("Predicted classes (first 10):")
print(y_predicted[:10])
print("---")
.. autofunction:: khiops_classifier_advanced
.. code-block:: python

# Imports
import os
import pandas as pd
from khiops import core as kh
from khiops.sklearn import KhiopsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Load the root table of the dataset into a pandas dataframe
accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
accidents_df = pd.read_csv(
os.path.join(accidents_dataset_path, "Accidents.txt"),
sep="\t",
encoding="latin1",
)

# Split the root dataframe into train and test
accidents_train_df, accidents_test_df = train_test_split(
accidents_df, test_size=0.3, random_state=1
)

# Obtain the main X feature table and the y target vector ("Gravity" column)
y_train = accidents_train_df["Gravity"]
y_test = accidents_test_df["Gravity"]
X_train_main = accidents_train_df.drop("Gravity", axis=1)
X_test_main = accidents_test_df.drop("Gravity", axis=1)

# Load the secondary table of the dataset into a pandas dataframe
vehicles_df = pd.read_csv(
os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t"
)

# Split the secondary dataframe with the keys of the split root dataframe
X_train_ids = X_train_main["AccidentId"].to_frame()
X_test_ids = X_test_main["AccidentId"].to_frame()
X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId")
X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId")

# Create the dataset multitable specification for the train/test split
# We specify each table with a name and a tuple (dataframe, key_columns)
X_train = {
"main_table": "Accidents",
"tables": {
"Accidents": (X_train_main, "AccidentId"),
"Vehicles": (X_train_secondary, ["AccidentId", "VehicleId"]),
},
}
X_test = {
"main_table": "Accidents",
"tables": {
"Accidents": (X_test_main, "AccidentId"),
"Vehicles": (X_test_secondary, ["AccidentId", "VehicleId"]),
},
}
# Train the classifier with advanced hyperparameters (default n_features is 100)
khc = KhiopsClassifier(
n_features=20,
n_pairs=5,
n_trees=5,
n_selected_features=10,
n_evaluated_features=15,
specific_pairs=[("Light", "Weather"), ("Light", "IntersectionType")],
all_possible_pairs=True,
construction_rules=["TableMode", "TableSelection"],
group_target_value=False,
)
khc.fit(X_train, y_train)

# Predict the class on the test dataset
y_test_pred = khc.predict(X_test)
print("Predicted classes (first 10):")
print(y_test_pred[:10])
print("---")

# Predict the class probability on the test dataset
y_test_probas = khc.predict_proba(X_test)
print(f"Class order: {khc.classes_}")
print("Predicted class probabilities (first 10):")
print(y_test_probas[:10])
print("---")

# Evaluate accuracy and auc metrics on the test dataset
test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
# NOTE(review): column 1 is used as the positive-class score, which assumes
# a binary target -- confirm against the printed khc.classes_ order
test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])
print(f"Test accuracy = {test_accuracy}")
print(f"Test auc = {test_auc}")
.. autofunction:: khiops_regressor
.. code-block:: python

Expand Down Expand Up @@ -663,6 +752,64 @@ Samples
test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])
print(f"Test accuracy = {test_accuracy}")
print(f"Test auc = {test_auc}")
.. autofunction:: khiops_encoder_advanced
.. code-block:: python

# Imports
import os
import pandas as pd
from khiops import core as kh
from khiops.sklearn import KhiopsEncoder

# Load the root table of the dataset into a pandas dataframe
accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
accidents_df = pd.read_csv(
os.path.join(accidents_dataset_path, "Accidents.txt"),
sep="\t",
encoding="latin1",
)

# Obtain the root X feature table and the y target vector ("Gravity" column)
X_main = accidents_df.drop("Gravity", axis=1)
y = accidents_df["Gravity"]

# Load the secondary table of the dataset into a pandas dataframe
X_secondary = pd.read_csv(
os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t"
)

# Create the multitable dataset specification
# We specify each table with a name and a tuple (dataframe, key_columns)
X_dataset = {
"main_table": "Accidents",
"tables": {
"Accidents": (X_main, "AccidentId"),
"Vehicles": (X_secondary, ["AccidentId", "VehicleId"]),
},
}

# Create the KhiopsEncoder with advanced hyperparameters (20 features) and fit it
khe = KhiopsEncoder(
n_features=20,
n_pairs=5,
n_trees=5,
specific_pairs=[("Light", "Weather"), ("Light", "IntersectionType")],
all_possible_pairs=True,
construction_rules=["TableMode", "TableSelection"],
group_target_value=False,
informative_features_only=True,
keep_initial_variables=True,
transform_type_categorical="part_id",
transform_type_numerical="part_id",
transform_pairs="part_id",
)
khe.fit(X_dataset, y)

# Transform the train dataset
print("Encoded feature names:")
print(khe.feature_names_out_)
print("Encoded data:")
print(khe.transform(X_dataset)[:10])
.. autofunction:: khiops_coclustering
.. code-block:: python

Expand Down
173 changes: 173 additions & 0 deletions khiops/samples/samples_sklearn.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,108 @@
"print(\"---\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### `khiops_classifier_advanced()`\n\n",
"Trains a `.KhiopsClassifier` on a star multi-table dataset\n (advanced version with more hyperparameters)\n \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Imports\n",
"import os\n",
"import pandas as pd\n",
"from khiops import core as kh\n",
"from khiops.sklearn import KhiopsClassifier\n",
"from sklearn import metrics\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Load the root table of the dataset into a pandas dataframe\n",
"accidents_dataset_path = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n",
"accidents_df = pd.read_csv(\n",
" os.path.join(accidents_dataset_path, \"Accidents.txt\"),\n",
" sep=\"\\t\",\n",
" encoding=\"latin1\",\n",
")\n",
"\n",
"# Split the root dataframe into train and test\n",
"accidents_train_df, accidents_test_df = train_test_split(\n",
" accidents_df, test_size=0.3, random_state=1\n",
")\n",
"\n",
"# Obtain the main X feature table and the y target vector (\"Gravity\" column)\n",
"y_train = accidents_train_df[\"Gravity\"]\n",
"y_test = accidents_test_df[\"Gravity\"]\n",
"X_train_main = accidents_train_df.drop(\"Gravity\", axis=1)\n",
"X_test_main = accidents_test_df.drop(\"Gravity\", axis=1)\n",
"\n",
"# Load the secondary table of the dataset into a pandas dataframe\n",
"vehicles_df = pd.read_csv(\n",
" os.path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n",
")\n",
"\n",
"# Split the secondary dataframe with the keys of the split root dataframe\n",
"X_train_ids = X_train_main[\"AccidentId\"].to_frame()\n",
"X_test_ids = X_test_main[\"AccidentId\"].to_frame()\n",
"X_train_secondary = X_train_ids.merge(vehicles_df, on=\"AccidentId\")\n",
"X_test_secondary = X_test_ids.merge(vehicles_df, on=\"AccidentId\")\n",
"\n",
"# Create the dataset multitable specification for the train/test split\n",
"# We specify each table with a name and a tuple (dataframe, key_columns)\n",
"X_train = {\n",
" \"main_table\": \"Accidents\",\n",
" \"tables\": {\n",
" \"Accidents\": (X_train_main, \"AccidentId\"),\n",
" \"Vehicles\": (X_train_secondary, [\"AccidentId\", \"VehicleId\"]),\n",
" },\n",
"}\n",
"X_test = {\n",
" \"main_table\": \"Accidents\",\n",
" \"tables\": {\n",
" \"Accidents\": (X_test_main, \"AccidentId\"),\n",
" \"Vehicles\": (X_test_secondary, [\"AccidentId\", \"VehicleId\"]),\n",
" },\n",
"}\n",
"# Train the classifier with advanced hyperparameters (default n_features is 100)\n",
"khc = KhiopsClassifier(\n",
" n_features=20,\n",
" n_pairs=5,\n",
" n_trees=5,\n",
" n_selected_features=10,\n",
" n_evaluated_features=15,\n",
" specific_pairs=[(\"Light\", \"Weather\"), (\"Light\", \"IntersectionType\")],\n",
" all_possible_pairs=True,\n",
" construction_rules=[\"TableMode\", \"TableSelection\"],\n",
" group_target_value=False,\n",
")\n",
"khc.fit(X_train, y_train)\n",
"\n",
"# Predict the class on the test dataset\n",
"y_test_pred = khc.predict(X_test)\n",
"print(\"Predicted classes (first 10):\")\n",
"print(y_test_pred[:10])\n",
"print(\"---\")\n",
"\n",
"# Predict the class probability on the test dataset\n",
"y_test_probas = khc.predict_proba(X_test)\n",
"print(f\"Class order: {khc.classes_}\")\n",
"print(\"Predicted class probabilities (first 10):\")\n",
"print(y_test_probas[:10])\n",
"print(\"---\")\n",
"\n",
"# Evaluate accuracy and auc metrics on the test dataset\n",
"test_accuracy = metrics.accuracy_score(y_test, y_test_pred)\n",
"test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])\n",
"print(f\"Test accuracy = {test_accuracy}\")\n",
"print(f\"Test auc = {test_auc}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -781,6 +883,77 @@
"print(f\"Test auc = {test_auc}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### `khiops_encoder_advanced()`\n\n",
"Trains a `.KhiopsEncoder` on a star multi-table dataset\n (advanced version with more hyperparameters)\n \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Imports\n",
"import os\n",
"import pandas as pd\n",
"from khiops import core as kh\n",
"from khiops.sklearn import KhiopsEncoder\n",
"\n",
"# Load the root table of the dataset into a pandas dataframe\n",
"accidents_dataset_path = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n",
"accidents_df = pd.read_csv(\n",
" os.path.join(accidents_dataset_path, \"Accidents.txt\"),\n",
" sep=\"\\t\",\n",
" encoding=\"latin1\",\n",
")\n",
"\n",
"# Obtain the root X feature table and the y target vector (\"Gravity\" column)\n",
"X_main = accidents_df.drop(\"Gravity\", axis=1)\n",
"y = accidents_df[\"Gravity\"]\n",
"\n",
"# Load the secondary table of the dataset into a pandas dataframe\n",
"X_secondary = pd.read_csv(\n",
" os.path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n",
")\n",
"\n",
"# Create the multitable dataset specification\n",
"# We specify each table with a name and a tuple (dataframe, key_columns)\n",
"X_dataset = {\n",
" \"main_table\": \"Accidents\",\n",
" \"tables\": {\n",
" \"Accidents\": (X_main, \"AccidentId\"),\n",
" \"Vehicles\": (X_secondary, [\"AccidentId\", \"VehicleId\"]),\n",
" },\n",
"}\n",
"\n",
"# Create the KhiopsEncoder with advanced hyperparameters (20 features) and fit it\n",
"khe = KhiopsEncoder(\n",
" n_features=20,\n",
" n_pairs=5,\n",
" n_trees=5,\n",
" specific_pairs=[(\"Light\", \"Weather\"), (\"Light\", \"IntersectionType\")],\n",
" all_possible_pairs=True,\n",
" construction_rules=[\"TableMode\", \"TableSelection\"],\n",
" group_target_value=False,\n",
" informative_features_only=True,\n",
" keep_initial_variables=True,\n",
" transform_type_categorical=\"part_id\",\n",
" transform_type_numerical=\"part_id\",\n",
" transform_pairs=\"part_id\",\n",
")\n",
"khe.fit(X_dataset, y)\n",
"\n",
"# Transform the train dataset\n",
"print(\"Encoded feature names:\")\n",
"print(khe.feature_names_out_)\n",
"print(\"Encoded data:\")\n",
"print(khe.transform(X_dataset)[:10])"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
Loading
Loading