From a51a2f0ac294dce7473e07f65d23ff49930c6596 Mon Sep 17 00:00:00 2001 From: PrivacyGo-PETPlatform Date: Thu, 26 Sep 2024 17:48:40 +0800 Subject: [PATCH] feat: version 0.1.1 --- CHANGELOG.MD | 11 ++ docs/user_guide/Horizontal_SecureBoost.md | 16 +-- petml/fl/boosting/decision_tree.py | 30 ++++++ petml/fl/boosting/loss.py | 21 +++- petml/fl/boosting/xgb_model.py | 124 +++++++++++++++------- petml/operators/boosting/xgb_model.py | 32 +++--- tests/operators/boosting/test_xgb.py | 16 +-- 7 files changed, 180 insertions(+), 70 deletions(-) create mode 100644 CHANGELOG.MD diff --git a/CHANGELOG.MD b/CHANGELOG.MD new file mode 100644 index 0000000..cf7a52b --- /dev/null +++ b/CHANGELOG.MD @@ -0,0 +1,11 @@ +# List of Changes + +## Version 0.1.1 +### Changed +- The model storage method has changed from pickle to json. + +## Version 0.1.0 +### Add +- Federated Leiden algorithm +- Two-party Secure Xgboost +- Two-party PSI diff --git a/docs/user_guide/Horizontal_SecureBoost.md b/docs/user_guide/Horizontal_SecureBoost.md index e3b0a3a..6089395 100644 --- a/docs/user_guide/Horizontal_SecureBoost.md +++ b/docs/user_guide/Horizontal_SecureBoost.md @@ -116,7 +116,7 @@ petml.operators.boosting.XGBoostClassifierFit | Name | File Type | Description | | --- |-----------| --- | -| model_path | pkl | The trained model | +| model_path | json | The trained model | #### Examples ``` @@ -148,7 +148,7 @@ config = { "train_data": "data0.csv", }, "outputs": { - "model_path": "model_name0.pkl" + "model_path": "model_name0.json" } }, "party_b": { @@ -156,7 +156,7 @@ config = { "train_data": "data1.csv", }, "outputs": { - "model_path": "model_name1.pkl" + "model_path": "model_name1.json" } } } @@ -176,7 +176,7 @@ petml.operators.boosting.XGBoostClassifierPredict | Name | File Type | Description | |--------------|-----------|-----------------------| | predict_data | csv | The inference dataset | -| model_path | pkl | The trained model | +| model_path | json | The trained model | #### Output @@ 
-204,7 +204,7 @@ config = { "party_a": { "inputs": { "predict_data": "data0.csv", - "model_path": "model_name0.pkl" + "model_path": "model_name0.json" }, "outputs": { "inference_res_path": "predict0.csv" @@ -213,7 +213,7 @@ config = { "party_b": { "inputs": { "predict_data": "data1.csv", - "model_path": "model_name1.pkl" + "model_path": "model_name1.json" }, "outputs": { "inference_res_path": "predict1.csv" @@ -258,7 +258,7 @@ petml.operators.boosting.XGBoostRegressorFit | Name | File Type | Description | |------------|-----------|-------------------| -| model_path | pkl | The trained model | +| model_path | json | The trained model | #### Examples Refer to the examples in classifier training config @@ -273,7 +273,7 @@ petml.operators.boosting.XGBoostRegressorPredict | Name | File Type | Description | |--------------|-----------|-----------------------| | predict_data | csv | The inference dataset | -| model_path | pkl | The trained model | +| model_path | json | The trained model | #### Output diff --git a/petml/fl/boosting/decision_tree.py b/petml/fl/boosting/decision_tree.py index c8a3b4d..81ad421 100644 --- a/petml/fl/boosting/decision_tree.py +++ b/petml/fl/boosting/decision_tree.py @@ -56,6 +56,21 @@ def __init__(self, self.left_child = left_child self.right_child = right_child + def to_dict(self): + """transform object to dict""" + return {k: v.to_dict() if isinstance(v, MPCTreeNode) else v for k, v in vars(self).items()} + + @classmethod + def from_dict(cls, data): + """transform from dict to object""" + obj = cls() + for k, v in data.items(): + if isinstance(v, dict): + setattr(obj, k, cls.from_dict(v)) + else: + setattr(obj, k, v) + return obj + class MPCTree: """ @@ -121,6 +136,21 @@ def __init__(self, self.min_split_loss = min_split_loss self.max_depth = max_depth + def to_dict(self): + """transform object to dict""" + return {k: v.to_dict() if isinstance(v, MPCTreeNode) else v for k, v in vars(self).items()} + + @classmethod + def from_dict(cls, data): 
+ """transform from dict to object""" + obj = cls() + for k, v in data.items(): + if isinstance(v, dict): + setattr(obj, k, cls.from_dict(v)) + else: + setattr(obj, k, v) + return obj + def _calc_threshold(self, gsum): """clip the value of gain""" res = snp.where(gsum > self.reg_alpha, gsum - self.reg_alpha, diff --git a/petml/fl/boosting/loss.py b/petml/fl/boosting/loss.py index a6ad1c5..8668f0b 100644 --- a/petml/fl/boosting/loss.py +++ b/petml/fl/boosting/loss.py @@ -25,6 +25,16 @@ class LogisticLoss: class for calculate logistic loss function """ + def to_dict(self): + return {'class': 'LogisticLoss'} + + @classmethod + def from_dict(cls, data): + if data['class'] == 'LogisticLoss': + return cls() + else: + raise ValueError('Invalid class: ' + data['class']) + def _sigmoid(self, y_pred: np.ndarray): """ Implemented sigmoid equation @@ -90,8 +100,15 @@ class SquareLoss: class for calculate square loss function """ - def __init__(self): - pass + def to_dict(self): + return {'class': 'SquareLoss'} + + @classmethod + def from_dict(cls, data): + if data['class'] == 'SquareLoss': + return cls() + else: + raise ValueError('Invalid class: ' + data['class']) def grad(self, y_pred: Union[SecureArray, np.ndarray], label: Union[SecureArray, np.ndarray]): """ diff --git a/petml/fl/boosting/xgb_model.py b/petml/fl/boosting/xgb_model.py index d4c4077..a5dbaab 100644 --- a/petml/fl/boosting/xgb_model.py +++ b/petml/fl/boosting/xgb_model.py @@ -13,7 +13,7 @@ # limitations under the License. 
import hashlib -import pickle +import json import time import numpy as np @@ -366,35 +366,43 @@ def transform_one_tree(self, train_x, train_y, train_y_cipher, eval_x_cipher, y_ return tree, y_hat, eval_y_hat + @staticmethod + def export_share(share_value) -> list: + return share_value.to_share().astype(np.int64).tolist() + + @staticmethod + def load_share(load_data): + return snp.fromshare(np.array(load_data).astype(np.int64), np.float64) + def save_tree_from_ss_to_numpy(self, trees): for tree in trees: - tree.columns = tree.columns.to_share().astype(np.int64) + tree.columns = self.export_share(tree.columns) self._save_tree_from_ss_to_numpy(tree.root) def _save_tree_from_ss_to_numpy(self, tree_node): """Convert secure object to numerical value""" if tree_node.is_leaf: - tree_node.leaf_weight = tree_node.leaf_weight.to_share().astype(np.int64) + tree_node.leaf_weight = self.export_share(tree_node.leaf_weight) return - tree_node.split_feat = tree_node.split_feat.to_share().astype(np.int64) - tree_node.split_val = tree_node.split_val.to_share().astype(np.int64) + tree_node.split_feat = self.export_share(tree_node.split_feat) + tree_node.split_val = self.export_share(tree_node.split_val) self._save_tree_from_ss_to_numpy(tree_node.left_child) self._save_tree_from_ss_to_numpy(tree_node.right_child) def load_tree_from_numpy_to_ss(self, trees): for tree in trees: - tree.columns = snp.fromshare(tree.columns, np.float64) + tree.columns = self.load_share(tree.columns) self._load_tree_from_numpy_to_ss(tree.root) def _load_tree_from_numpy_to_ss(self, tree_node): """Convert to secure object from numerical value""" if tree_node.is_leaf: - tree_node.leaf_weight = snp.fromshare(tree_node.leaf_weight, np.float64) + tree_node.leaf_weight = self.load_share(tree_node.leaf_weight) return - tree_node.split_feat = snp.fromshare(tree_node.split_feat, np.float64) - tree_node.split_val = snp.fromshare(tree_node.split_val, np.float64) + tree_node.split_feat = 
self.load_share(tree_node.split_feat) + tree_node.split_val = self.load_share(tree_node.split_val) self._load_tree_from_numpy_to_ss(tree_node.left_child) self._load_tree_from_numpy_to_ss(tree_node.right_child) @@ -502,6 +510,30 @@ def __init__( self.trees = [] self.loss_func = LogisticLoss() + def to_dict(self): + result = vars(self).copy() + del result['logger'] + result['trees'] = [tree.to_dict() for tree in self.trees] + result['loss_func'] = self.loss_func.to_dict() + return result + + @classmethod + def from_dict(cls, data): + obj = cls() + for k, v in data.items(): + if k == 'trees': + setattr(obj, k, [MPCTree.from_dict(tree_dict) for tree_dict in v]) + elif k == 'loss_func': + loss_map = { + 'LogisticLoss': LogisticLoss, + 'SquareLoss': SquareLoss, + } + loss_class = loss_map[v['class']] + setattr(obj, k, loss_class.from_dict(v)) + else: + setattr(obj, k, v) + return obj + def fit(self, data: pd.DataFrame) -> None: """ Fit the model @@ -619,10 +651,11 @@ def save_model(self, model_path: str) -> None: try: self._federation = None self._mpc_engine = None - with open(model_path, 'wb') as f: - pickle.dump(self, f) + json_str = json.dumps(self.to_dict()) + with open(model_path, 'w') as f: + f.write(json_str) self.logger.info("Save model success") - except pickle.PickleError as e: + except (TypeError, ValueError) as e: self.logger.error(f"Save model file. 
err={e}") def load_model(self, model_path: str) -> None: @@ -634,7 +667,20 @@ def load_model(self, model_path: str) -> None: model_path: string File path of the saved model - Returns - ------- - loadobj: model - Saved XGboost model """ try: - with open(model_path, 'rb') as f: - load_obj = pickle.load(f) + with open(model_path, 'r') as f: + load_obj = json.load(f) + load_attributes = self.from_dict(load_obj) + + for attr in vars(self): + if attr not in ['logger', '_federation', '_mpc_engine']: + setattr(self, attr, getattr(load_attributes, attr)) - self.learning_rate = load_obj.learning_rate - self.base_score = load_obj.base_score - self.trees = load_obj.trees self.load_tree_from_numpy_to_ss(self.trees) self.logger.info("Load model success") - except pickle.PickleError as e: + + except json.JSONDecodeError as e: self.logger.error(f"Load model fail. err={e}") @@ -747,6 +779,27 @@ def __init__( self.trees = [] self.loss_func = LogisticLoss() + def to_dict(self): + result = vars(self).copy() + del result['logger'] + result['trees'] = [tree.to_dict() for tree in self.trees] + result['loss_func'] = self.loss_func.to_dict() + return result + + @classmethod + def from_dict(cls, data): + obj = cls() + for k, v in data.items(): + if k == 'trees': + setattr(obj, k, [MPCTree.from_dict(tree_dict) for tree_dict in v]) + elif k == 'loss_func': + loss_map = {'LogisticLoss': LogisticLoss, 'SquareLoss': SquareLoss} + loss_class = loss_map[v['class']] + setattr(obj, k, loss_class.from_dict(v)) + else: + setattr(obj, k, v) + return obj + def fit(self, data: pd.DataFrame) -> None: """ Fit the model @@ -846,10 +899,11 @@ def save_model(self, model_path: str) -> None: try: self._federation = None self._mpc_engine = None - with open(model_path, 'wb') as f: - pickle.dump(self, f) + json_str = json.dumps(self.to_dict()) + with open(model_path, 'w') as f: + f.write(json_str) self.logger.info("Save model success") - except pickle.PickleError as e: + except (TypeError, ValueError) as e: 
self.logger.error(f"Save model file. err={e}") def load_model(self, model_path: str) -> None: @@ -860,20 +914,18 @@ def load_model(self, model_path: str) -> None: ---------- model_path: string File path of the saved model - - Returns - ------- - loadobj: model - Saved XGboost model """ try: - with open(model_path, 'rb') as f: - load_obj = pickle.load(f) + with open(model_path, 'r') as f: + load_obj = json.load(f) + load_attributes = self.from_dict(load_obj) + + for attr in vars(self): + if attr not in ['logger', '_federation', '_mpc_engine']: + setattr(self, attr, getattr(load_attributes, attr)) - self.learning_rate = load_obj.learning_rate - self.base_score = load_obj.base_score - self.trees = load_obj.trees self.load_tree_from_numpy_to_ss(self.trees) self.logger.info("Load model success") - except pickle.PickleError as e: + + except json.JSONDecodeError as e: self.logger.error(f"Load model fail. err={e}") diff --git a/petml/operators/boosting/xgb_model.py b/petml/operators/boosting/xgb_model.py index 9240c35..6b4a194 100644 --- a/petml/operators/boosting/xgb_model.py +++ b/petml/operators/boosting/xgb_model.py @@ -55,7 +55,7 @@ def _run(self, net, configs: dict) -> bool: "train_data": "/path/to/data.csv", }, "outputs": { - "model_path": "/path/to/model_name.pkl" + "model_path": "/path/to/model_name.json" } }, "party_b": { @@ -63,7 +63,7 @@ def _run(self, net, configs: dict) -> bool: "train_data": "/path/to/data.csv", }, "outputs": { - "model_path": "/path/to/model_name.pkl" + "model_path": "/path/to/model_name.json" } } } @@ -88,8 +88,8 @@ def _run(self, net, configs: dict) -> bool: train_data = CsvStorage.read(configs["inputs"]["train_data"]) model_path = configs["outputs"]["model_path"] ext = Path(model_path) - if ext.suffix != '.pkl': - raise ValueError('The `model_path` should end with the `.pkl` format.') + if ext.suffix != '.json': + raise ValueError('The `model_path` should end with the `.json` format.') # construct model model = 
XGBoostClassifier(min_split_loss, @@ -133,7 +133,7 @@ def _run(self, net, configs: dict) -> bool: "party_a": { "inputs": { "predict_data": "/path/to/data.csv", - "model_path": "/path/to/model_name.pkl" + "model_path": "/path/to/model_name.json" }, "outputs": { "inference_res_path": "pathto/predict_proba_value.csv" @@ -142,7 +142,7 @@ def _run(self, net, configs: dict) -> bool: "party_b": { "inputs": { "predict_data": "/path/to/data.csv", - "model_path": "/path/to/model_name.pkl" + "model_path": "/path/to/model_name.json" }, "outputs": { "inference_res_path": "/path/to/predict_proba_value.csv" @@ -157,8 +157,8 @@ def _run(self, net, configs: dict) -> bool: predict_data = CsvStorage.read(configs["inputs"]["predict_data"]) model_path = configs["inputs"]["model_path"] ext = Path(model_path) - if ext.suffix != '.pkl': - raise ValueError('The `model_path` should end with the `.pkl` format.') + if ext.suffix != '.json': + raise ValueError('The `model_path` should end with the `.json` format.') inference_res_path = configs["outputs"]["inference_res_path"] # inference model @@ -205,7 +205,7 @@ def _run(self, net, configs: dict) -> bool: "train_data": "/path/to/data.csv", }, "outputs": { - "model_path": "/path/to/model_name.pkl" + "model_path": "/path/to/model_name.json" } }, "party_b": { @@ -213,7 +213,7 @@ def _run(self, net, configs: dict) -> bool: "train_data": "/path/to/data.csv", }, "outputs": { - "model_path": "/path/to/model_name.pkl" + "model_path": "/path/to/model_name.json" } } } @@ -238,8 +238,8 @@ def _run(self, net, configs: dict) -> bool: train_data = CsvStorage.read(configs["inputs"]["train_data"]) model_path = configs["outputs"]["model_path"] ext = Path(model_path) - if ext.suffix != '.pkl': - raise ValueError('The `model_path` should end with the `.pkl` format.') + if ext.suffix != '.json': + raise ValueError('The `model_path` should end with the `.json` format.') # construct model model = XGBoostRegressor(min_split_loss, @@ -283,7 +283,7 @@ def _run(self, 
net, configs: dict) -> bool: "party_a": { "inputs": { "predict_data": "/path/to/data.csv", - "model_path": "/path/to/model_name.pkl" + "model_path": "/path/to/model_name.json" }, "outputs": { "inference_res_path": "/path/to/predict_value.csv" @@ -292,7 +292,7 @@ def _run(self, net, configs: dict) -> bool: "party_b": { "inputs": { "predict_data": "/path/to/data.csv", - "model_path": "/path/to/model_name.pkl" + "model_path": "/path/to/model_name.json" } "outputs": { "inference_res_path": "path/to/predict_value.csv" @@ -307,8 +307,8 @@ def _run(self, net, configs: dict) -> bool: predict_data = CsvStorage.read(configs["inputs"]["predict_data"]) model_path = configs["inputs"]["model_path"] ext = Path(model_path) - if ext.suffix != '.pkl': - raise ValueError('The `model_path` should end with the `.pkl` format.') + if ext.suffix != '.json': + raise ValueError('The `model_path` should end with the `.json` format.') inference_res_path = configs["outputs"]["inference_res_path"] # inference model diff --git a/tests/operators/boosting/test_xgb.py b/tests/operators/boosting/test_xgb.py index d519d40..a7f0351 100644 --- a/tests/operators/boosting/test_xgb.py +++ b/tests/operators/boosting/test_xgb.py @@ -60,7 +60,7 @@ def test_xgb_classifier(self): "train_data": "examples/data/iris_binary_mini_server.csv", }, "outputs": { - "model_path": "tmp/test_binary_xgb_server.pkl" + "model_path": "tmp/test_binary_xgb_server.json" } }, "party_b": { @@ -68,7 +68,7 @@ def test_xgb_classifier(self): "train_data": "examples/data/iris_binary_mini_client.csv", }, "outputs": { - "model_path": "tmp/test_binary_xgb_client.pkl" + "model_path": "tmp/test_binary_xgb_client.json" } } } @@ -88,7 +88,7 @@ def test_xgb_classifier(self): "party_a": { "inputs": { "predict_data": "examples/data/iris_binary_mini_server.csv", - "model_path": "tmp/test_binary_xgb_server.pkl" + "model_path": "tmp/test_binary_xgb_server.json" }, "outputs": { "inference_res_path": "tmp/test_binary_predict_server.csv" @@ -97,7 +97,7 
@@ def test_xgb_classifier(self): "party_b": { "inputs": { "predict_data": "examples/data/iris_binary_mini_client.csv", - "model_path": "tmp/test_binary_xgb_client.pkl" + "model_path": "tmp/test_binary_xgb_client.json" }, "outputs": { "inference_res_path": "tmp/test_binary_predict_client.csv" @@ -159,7 +159,7 @@ def test_xgb_regressor(self): "train_data": "examples/data/students_reg_mini_server.csv", }, "outputs": { - "model_path": "tmp/test_reg_xgb_server.pkl" + "model_path": "tmp/test_reg_xgb_server.json" } }, "party_b": { @@ -167,7 +167,7 @@ def test_xgb_regressor(self): "train_data": "examples/data/students_reg_mini_client.csv", }, "outputs": { - "model_path": "tmp/test_reg_xgb_client.pkl" + "model_path": "tmp/test_reg_xgb_client.json" } } } @@ -187,7 +187,7 @@ def test_xgb_regressor(self): "party_a": { "inputs": { "predict_data": "examples/data/students_reg_mini_server.csv", - "model_path": "tmp/test_reg_xgb_server.pkl" + "model_path": "tmp/test_reg_xgb_server.json" }, "outputs": { "inference_res_path": "tmp/test_reg_predict_server.csv" @@ -196,7 +196,7 @@ def test_xgb_regressor(self): "party_b": { "inputs": { "predict_data": "examples/data/students_reg_mini_client.csv", - "model_path": "tmp/test_reg_xgb_client.pkl" + "model_path": "tmp/test_reg_xgb_client.json" }, "outputs": { "inference_res_path": "tmp/test_reg_predict_client.csv"