From 14b7b0ab8ccb2c04f6595aeccb37f689e090c806 Mon Sep 17 00:00:00 2001 From: Kaihui-intel Date: Tue, 27 Feb 2024 14:25:23 +0800 Subject: [PATCH] Fix Neural Solution SQL/CMD injection (#1627) Signed-off-by: Kaihui-intel Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- neural_solution/backend/scheduler.py | 2 +- .../tf_example1/README.md | 4 +- .../tf_example1/task_request.json | 2 +- .../tf_example1/task_request_distributed.json | 2 +- neural_solution/examples/hf_models/README.md | 2 +- .../examples/hf_models/task_request.json | 2 +- .../examples/hf_models_grpc/README.md | 2 +- .../examples/hf_models_grpc/task_request.json | 2 +- .../frontend/fastapi/main_server.py | 5 + neural_solution/frontend/utility.py | 58 +++++++++++ .../test/frontend/fastapi/test_main_server.py | 22 ++++- .../test/frontend/fastapi/test_utils.py | 97 +++++++++++++++++++ 12 files changed, 187 insertions(+), 13 deletions(-) diff --git a/neural_solution/backend/scheduler.py b/neural_solution/backend/scheduler.py index a08bebaf534..14ffa3afb1b 100644 --- a/neural_solution/backend/scheduler.py +++ b/neural_solution/backend/scheduler.py @@ -154,7 +154,7 @@ def prepare_task(self, task: Task): if not task.optimized: # Generate quantization code with Neural Coder API neural_coder_cmd = ["python -m neural_coder --enable --approach"] - # for users to define approach: "static, ""static_ipex", "dynamic", "auto" + # for users to define approach: "static", "static_ipex", "dynamic", "auto" approach = task.approach neural_coder_cmd.append(approach) if is_remote_url(task.script_url): diff --git a/neural_solution/examples/custom_models_optimized/tf_example1/README.md b/neural_solution/examples/custom_models_optimized/tf_example1/README.md index 6d6dfce1bf7..19aa66c9f7d 100644 --- a/neural_solution/examples/custom_models_optimized/tf_example1/README.md +++ b/neural_solution/examples/custom_models_optimized/tf_example1/README.md @@ -91,7 +91,7 @@ optional arguments: "script_url": "tf_example1", "optimized": "True", "arguments": [ - "--dataset_location=dataset --model_path=model" + "--dataset_location=dataset", "--model_path=model" ], "approach": "static", "requirements": [ @@ -106,7 +106,7 @@ When using distributed quantization, the `workers` needs to be set to greater th "script_url": "tf_example1", "optimized": "True", "arguments": [ - "--dataset_location=dataset --model_path=model" + "--dataset_location=dataset", "--model_path=model" ], "approach": "static", "requirements": [ diff --git a/neural_solution/examples/custom_models_optimized/tf_example1/task_request.json b/neural_solution/examples/custom_models_optimized/tf_example1/task_request.json index 0ec0d52336f..7cce23ec4d4 100644 --- a/neural_solution/examples/custom_models_optimized/tf_example1/task_request.json +++ b/neural_solution/examples/custom_models_optimized/tf_example1/task_request.json @@ -2,7 +2,7 @@ "script_url": "custom_models_optimized/tf_example1", "optimized": "True", "arguments": [ - "--dataset_location=dataset --model_path=model" + "--dataset_location=dataset", "--model_path=model" ], "approach": "static", "requirements": ["tensorflow" diff --git a/neural_solution/examples/custom_models_optimized/tf_example1/task_request_distributed.json b/neural_solution/examples/custom_models_optimized/tf_example1/task_request_distributed.json index aa41089b0cd..d2d42585171 100644 --- a/neural_solution/examples/custom_models_optimized/tf_example1/task_request_distributed.json +++ b/neural_solution/examples/custom_models_optimized/tf_example1/task_request_distributed.json @@ -2,7 +2,7 @@ "script_url": "custom_models_optimized/tf_example1", "optimized": "True", "arguments": [ - "--dataset_location=dataset --model_path=model" + "--dataset_location=dataset", "--model_path=model" ], "approach": "static", "requirements": ["tensorflow" diff --git a/neural_solution/examples/hf_models/README.md b/neural_solution/examples/hf_models/README.md index afa2e83e698..4b72b5d88c7 100644 --- a/neural_solution/examples/hf_models/README.md +++ b/neural_solution/examples/hf_models/README.md @@ -72,7 +72,7 @@ optional arguments: "script_url": "https://github.com/huggingface/transformers/blob/v4.21-release/examples/pytorch/text-classification/run_glue.py", "optimized": "False", "arguments": [ - "--model_name_or_path bert-base-cased --task_name mrpc --do_eval --output_dir result" + "--model_name_or_path=bert-base-cased", "--task_name=mrpc", "--do_eval", "--output_dir=result" ], "approach": "static", "requirements": [], diff --git a/neural_solution/examples/hf_models/task_request.json b/neural_solution/examples/hf_models/task_request.json index 969e624dce3..0566be32f3d 100644 --- a/neural_solution/examples/hf_models/task_request.json +++ b/neural_solution/examples/hf_models/task_request.json @@ -2,7 +2,7 @@ "script_url": "https://github.com/huggingface/transformers/blob/v4.21-release/examples/pytorch/text-classification/run_glue.py", "optimized": "False", "arguments": [ - "--model_name_or_path bert-base-cased --task_name mrpc --do_eval --output_dir result" + "--model_name_or_path=bert-base-cased", "--task_name=mrpc", "--do_eval", "--output_dir=result" ], "approach": "static", "requirements": ["datasets", "transformers=4.21.0", "torch"], diff --git a/neural_solution/examples/hf_models_grpc/README.md b/neural_solution/examples/hf_models_grpc/README.md index d97cda6d312..4712c1b6aeb 100644 --- a/neural_solution/examples/hf_models_grpc/README.md +++ b/neural_solution/examples/hf_models_grpc/README.md @@ -68,7 +68,7 @@ optional arguments: "script_url": "https://github.com/huggingface/transformers/blob/v4.21-release/examples/pytorch/text-classification/run_glue.py", "optimized": "False", "arguments": [ - "--model_name_or_path bert-base-cased --task_name mrpc --do_eval --output_dir result" + "--model_name_or_path=bert-base-cased", "--task_name=mrpc", "--do_eval", "--output_dir=result" ], "approach": "static", "requirements": [], diff --git a/neural_solution/examples/hf_models_grpc/task_request.json b/neural_solution/examples/hf_models_grpc/task_request.json index 969e624dce3..0566be32f3d 100644 --- a/neural_solution/examples/hf_models_grpc/task_request.json +++ b/neural_solution/examples/hf_models_grpc/task_request.json @@ -2,7 +2,7 @@ "script_url": "https://github.com/huggingface/transformers/blob/v4.21-release/examples/pytorch/text-classification/run_glue.py", "optimized": "False", "arguments": [ - "--model_name_or_path bert-base-cased --task_name mrpc --do_eval --output_dir result" + "--model_name_or_path=bert-base-cased", "--task_name=mrpc", "--do_eval", "--output_dir=result" ], "approach": "static", "requirements": ["datasets", "transformers=4.21.0", "torch"], diff --git a/neural_solution/frontend/fastapi/main_server.py b/neural_solution/frontend/fastapi/main_server.py index e0fd05a3331..7e01b355e59 100644 --- a/neural_solution/frontend/fastapi/main_server.py +++ b/neural_solution/frontend/fastapi/main_server.py @@ -36,6 +36,7 @@ get_cluster_info, get_cluster_table, get_res_during_tuning, + is_valid_task, list_to_string, serialize, ) @@ -153,10 +154,14 @@ async def submit_task(task: Task): Returns: json: status , id of task and messages. """ + if not is_valid_task(task.dict()): + raise HTTPException(status_code=422, detail="Invalid task") + msg = "Task submitted successfully" status = "successfully" # search the current db_path = get_db_path(config.workspace) + if os.path.isfile(db_path): conn = sqlite3.connect(db_path) cursor = conn.cursor() diff --git a/neural_solution/frontend/utility.py b/neural_solution/frontend/utility.py index 62df985c810..a3303abc5e4 100644 --- a/neural_solution/frontend/utility.py +++ b/neural_solution/frontend/utility.py @@ -295,3 +295,61 @@ def list_to_string(lst: list): str: string """ return " ".join(str(i) for i in lst) + + +def is_invalid_str(to_test_str: str): + """Verify whether the to_test_str is valid. + + Args: + to_test_str (str): string to be tested. + + Returns: + bool: valid or invalid + """ + return any(char in to_test_str for char in [" ", '"', "'", "&", "|", ";", "`", ">"]) + + +def is_valid_task(task: dict) -> bool: + """Verify whether the task is valid. + + Args: + task (dict): task request + + Returns: + bool: valid or invalid + """ + required_fields = ["script_url", "optimized", "arguments", "approach", "requirements", "workers"] + + for field in required_fields: + if field not in task: + return False + + if not isinstance(task["script_url"], str) or is_invalid_str(task["script_url"]): + return False + + if (isinstance(task["optimized"], str) and task["optimized"] not in ["True", "False"]) or ( + not isinstance(task["optimized"], str) and not isinstance(task["optimized"], bool) + ): + return False + + if not isinstance(task["arguments"], list): + return False + else: + for argument in task["arguments"]: + if is_invalid_str(argument): + return False + + if not isinstance(task["approach"], str) or task["approach"] not in ["static", "static_ipex", "dynamic", "auto"]: + return False + + if not isinstance(task["requirements"], list): + return False + else: + for requirement in task["requirements"]: + if is_invalid_str(requirement): + return False + + if not isinstance(task["workers"], int) or task["workers"] < 1: + return False + + return True diff --git a/neural_solution/test/frontend/fastapi/test_main_server.py b/neural_solution/test/frontend/fastapi/test_main_server.py index cd8490bba9e..11ab1179847 100644 --- a/neural_solution/test/frontend/fastapi/test_main_server.py +++ b/neural_solution/test/frontend/fastapi/test_main_server.py @@ -118,12 +118,26 @@ def test_get_description(self): def test_submit_task(self, mock_submit_task): task = { "script_url": "http://example.com/script.py", - "optimized": True, + "optimized": "True", "arguments": ["arg1", "arg2"], - "approach": "approach1", + "approach": "static", + "requirements": ["req1", "req2"], + "workers": 3, + } + + # test invalid task + task_invalid = { + "script_url": "http://example.com/script.py", + "optimized": "True", + "arguments": "invalid str, should be list", + "approach": "static", "requirements": ["req1", "req2"], "workers": 3, } + response = client.post("/task/submit/", json=task_invalid) + print(response) + self.assertEqual(response.status_code, 422) + self.assertIn("arguments", response.text) # test no db case delete_db() @@ -174,7 +188,7 @@ def test_get_task_by_id(self, mock_submit_task): "script_url": "http://example.com/script.py", "optimized": True, "arguments": ["arg1", "arg2"], - "approach": "approach1", + "approach": "static", "requirements": ["req1", "req2"], "workers": 3, } @@ -200,7 +214,7 @@ def test_get_task_status_by_id(self, mock_submit_task): "script_url": "http://example.com/script.py", "optimized": True, "arguments": ["arg1", "arg2"], - "approach": "approach1", + "approach": "static", "requirements": ["req1", "req2"], "workers": 3, } diff --git a/neural_solution/test/frontend/fastapi/test_utils.py b/neural_solution/test/frontend/fastapi/test_utils.py index abb57fff92b..42a5f42568f 100644 --- a/neural_solution/test/frontend/fastapi/test_utils.py +++ b/neural_solution/test/frontend/fastapi/test_utils.py @@ -10,6 +10,7 @@ get_cluster_info, get_cluster_table, get_res_during_tuning, + is_valid_task, list_to_string, serialize, ) @@ -110,6 +111,102 @@ def test_list_to_string(self): expected_result = "Hello Neural Solution" self.assertEqual(list_to_string(lst), expected_result) + def test_is_valid_task(self): + task_sql_injection = { + "script_url": "https://github.com/huggingface/transformers/blob/v4.21-release/examples/pytorch/text-classification/run_glue.py", + "optimized": "False", + "arguments": [], + "approach": "5', '6', 7, 'pending'), ('1b9ff5c2fd2143d58522bd71d18845a3', '2', 3, '4', '5', '6', 7, 'pending') ON CONFLICT (id) DO UPDATE SET id = '1b9ff5c2fd2143d58522bd71d18845a3', q_model_path = '/home/victim/.ssh' --", + "requirements": [], + "workers": 1, + } + self.assertFalse(is_valid_task(task_sql_injection)) + + task_cmd_injection = { + "script_url": 'https://github.com/huggingface/transformers/blob/v4.21-release/examples/pytorch/text-classification/run_glue.py & eval "$(echo ZWNobyAiRG9tYWluIGV4cGFuc2lvbiIgPiB+L2F0dGFjay5weSI= | base64 --decode)"', + "optimized": "False", + "arguments": ["--model_name_or_path bert-base-cased --task_name mrpc --do_eval --output_dir result"], + "approach": "static", + "requirements": [], + "workers": 1, + } + self.assertFalse(is_valid_task(task_cmd_injection)) + + task_lack_field = { + "optimized": "True", + } + self.assertFalse(is_valid_task(task_lack_field)) + + task_script_url_not_str = { + "script_url": ["custom_models_optimized/tf_example1"], + "optimized": "True", + "arguments": ["--dataset_location=dataset --model_path=model"], + "approach": "static", + "requirements": ["tensorflow"], + "workers": 1, + } + self.assertFalse(is_valid_task(task_script_url_not_str)) + + task_optimized_not_bool_str = { + "script_url": ["custom_models_optimized/tf_example1"], + "optimized": "True or False", + "arguments": ["--dataset_location=dataset", "--model_path=model"], + "approach": "static", + "requirements": ["tensorflow"], + "workers": 1, + } + self.assertFalse(is_valid_task(task_optimized_not_bool_str)) + + task_arguments_not_list = { + "script_url": ["custom_models_optimized/tf_example1"], + "optimized": "True", + "arguments": 123, + "approach": "static", + "requirements": ["tensorflow"], + "workers": 1, + } + self.assertFalse(is_valid_task(task_arguments_not_list)) + + task_arguments_invalid = { + "script_url": ["custom_models_optimized/tf_example1"], + "optimized": "True", + "arguments": ["--dataset_location=dataset --model_path=model"], + "approach": "static", + "requirements": ["tensorflow"], + "workers": 1, + } + self.assertFalse(is_valid_task(task_arguments_not_list)) + + task_approach_is_invalid = { + "script_url": ["custom_models_optimized/tf_example1"], + "optimized": "True", + "arguments": [], + "approach": "static or dynamic", + "requirements": ["tensorflow"], + "workers": 1, + } + self.assertFalse(is_valid_task(task_approach_is_invalid)) + + task_requirements_not_list = { + "script_url": ["custom_models_optimized/tf_example1"], + "optimized": "True", + "arguments": [], + "approach": "static", + "requirements": "tensorflow", + "workers": 1, + } + self.assertFalse(is_valid_task(task_requirements_not_list)) + + task_normal = { + "script_url": "custom_models_optimized/tf_example1", + "optimized": "True", + "arguments": ["--dataset_location=dataset", "--model_path=model"], + "approach": "static", + "requirements": ["tensorflow"], + "workers": 1, + } + self.assertTrue(is_valid_task(task_normal)) + if __name__ == "__main__": unittest.main()