Skip to content

Commit

Permalink
small fixes
Browse files: browse the repository at this point in the history
  • Loading branch information
blublinsky committed Sep 8, 2024
1 parent 14a425a commit ee6c979
Show file tree
Hide file tree
Showing 4 changed files with 6 additions and 4 deletions.
1 change: 1 addition & 0 deletions scripts/k8s-setup/populate_minio.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ mc cp --recursive ${REPOROOT}/transforms/language/doc_chunk/ray/test-data/input/
mc cp --recursive ${REPOROOT}/transforms/universal/doc_id/ray/test-data/input/ kfp/test/doc_id/input
mc cp --recursive ${REPOROOT}/transforms/universal/ededup/ray/test-data/input/ kfp/test/ededup/input
mc cp --recursive ${REPOROOT}/transforms/universal/fdedup/ray/test-data/input/ kfp/test/fdedup/input
mc cp --recursive ${REPOROOT}/transforms/universal/fdedup_multi_step/ray/test-data/input/ kfp/test/fdedup_ms/input
mc cp --recursive ${REPOROOT}/transforms/universal/filter/ray/test-data/input/ kfp/test/filter/input
mc cp --recursive ${REPOROOT}/transforms/universal/noop/ray/test-data/input/ kfp/test/noop/input
mc cp --recursive ${REPOROOT}/transforms/universal/tokenization/ray/test-data/ds01/input/ kfp/test/tokenization/ds01/input
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
# limitations under the License.
################################################################################
import os
import ast

import kfp.compiler as compiler
import kfp.components as comp
Expand Down Expand Up @@ -77,7 +78,7 @@ def fdedup_bucket_processor(
ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image},
server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888",
# data access. checkpointing is not supported by dedup
data_s3_config: str = "{'input_folder': 'test/fdedup/input/', 'output_folder': 'test/fdedup/output/'}",
data_s3_config: str = "{'input_folder': 'test/fdedup_ms/output/snapshot/buckets', 'output_folder': 'test/fdedup_ms/output/'}",
data_s3_access_secret: str = "s3-secret",
data_max_files: int = -1,
data_num_samples: int = -1,
Expand Down Expand Up @@ -195,7 +196,7 @@ def fdedup_bucket_processor(
ray_name=ray_name,
run_id=run_id,
additional_params=additional_params,
exec_params=compute_exec_params.output,
exec_params=compute_exec_params.output | {"data_files_to_use": ast.literal_eval("['']")},
exec_script_name=EXEC_SCRIPT_NAME,
server_url=server_url,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def fdedup_filter(
ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image},
server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888",
# data access. checkpointing is not supported by dedup
data_s3_config: str = "{'input_folder': 'test/fdedup/input/', 'output_folder': 'test/fdedup/output/'}",
data_s3_config: str = "{'input_folder': 'test/fdedup_ms/input/', 'output_folder': 'test/fdedup_ms/output/'}",
data_s3_access_secret: str = "s3-secret",
data_max_files: int = -1,
data_num_samples: int = -1,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def fdedup_preprocessor(
ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image},
server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888",
# data access. checkpointing is not supported by dedup
data_s3_config: str = "{'input_folder': 'test/fdedup/input/', 'output_folder': 'test/fdedup/output/'}",
data_s3_config: str = "{'input_folder': 'test/fdedup_ms/input/', 'output_folder': 'test/fdedup_ms/output/'}",
data_s3_access_secret: str = "s3-secret",
data_max_files: int = -1,
data_num_samples: int = -1,
Expand Down

0 comments on commit ee6c979

Please sign in to comment.