Add RFS to CDK #575

Merged · 17 commits · Apr 16, 2024

Changes from 3 commits
4 changes: 2 additions & 2 deletions RFS/README.md
@@ -78,9 +78,9 @@ RFS has support for packaging its java application as a Docker image by using th
```
Also built into this Docker/Gradle support is the ability to spin up a testing RFS environment using Docker compose. This compose file can be seen [here](./docker/docker-compose.yml) and includes the RFS container, a source cluster container, and a target cluster container.

-This environment can be spun up with the Gradle command, and use the optional `-Pdataset` flag to preload a defined dataset from `docker/TestSource_ES_7_10/test-resources`
+This environment can be spun up with the Gradle command below; use the optional `-PshouldGenerateData` flag to preload a dataset from the `generateDatasetStage` in the multi-stage Docker build [here](docker/TestSource_ES_7_10/Dockerfile). This stage will take a few minutes to run on its first attempt if it is generating data, as it will be making requests with OSB. The result is cached for future runs as long as the image remains the same.
```shell
-./gradlew composeUp -Pdataset='small-benchmark-single-node.tar.gz'
+./gradlew composeUp -PshouldGenerateData=true
```

Collaborator:
`generate data` isn't quite right, since the user may never see generation happen (if they're using a cached layer).
From a Docker image perspective, one image has OSB datasets from an OSB run and the other doesn't. I would call this something like `dataset=osb_4testWorkloads`.

Collaborator Author:

I have changed this to be similar to `dataset=osb_4testWorkloads`.


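Once the environment is up, a quick way to sanity-check the preloaded data is to list indices on the source cluster container (a hypothetical smoke test; it assumes the container was built from the `elasticsearch_rfs_source` image named in `build.gradle` and is discoverable by that tag):

```shell
# Find the source-cluster container by its image name and list its indices.
docker exec "$(docker ps -q --filter ancestor=elasticsearch_rfs_source | head -n 1)" \
  curl -s "localhost:9200/_cat/indices?v"
```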
And deleted with the Gradle command
12 changes: 7 additions & 5 deletions RFS/build.gradle
Expand Up @@ -18,6 +18,7 @@ class DockerServiceProps {
String dockerImageName = ""
String inputDir = ""
Map<String, String> buildArgs = [:]
+List<String> taskDependencies = []
}

repositories {
@@ -49,7 +50,7 @@ clean.doFirst {
}

ext {
-dataset = findProperty('dataset') ?: 'no-data.tar.gz'
+dataset = findProperty('shouldGenerateData') ?: 'false'
}

task demoPrintOutSnapshot (type: JavaExec) {
@@ -73,18 +74,19 @@ task copyDockerRuntimeJars (type: Copy) {
DockerServiceProps[] dockerServices = [
new DockerServiceProps([projectName:"reindexFromSnapshot",
dockerImageName:"reindex_from_snapshot",
inputDir:"./docker"]),
inputDir:"./docker",
taskDependencies:["copyDockerRuntimeJars"]]),
new DockerServiceProps([projectName:"elasticsearchRFSSource",
dockerImageName:"elasticsearch_rfs_source",
inputDir:"./docker/TestSource_ES_7_10",
-buildArgs:['EXISTING_DATA': "${project.ext.dataset}"]]),
+buildArgs:['SHOULD_GENERATE_DATA': "${project.ext.dataset}"]]),
] as DockerServiceProps[]


for (dockerService in dockerServices) {
task "buildDockerImage_${dockerService.projectName}" (type: DockerBuildImage) {
-if (dockerService.projectName == "reindexFromSnapshot") {
-    dependsOn "copyDockerRuntimeJars"
+for (dep in dockerService.taskDependencies) {
+    dependsOn dep
}
inputDir = project.file(dockerService.inputDir)
buildArgs = dockerService.buildArgs
40 changes: 25 additions & 15 deletions RFS/docker/TestSource_ES_7_10/Dockerfile
@@ -1,10 +1,29 @@
-FROM docker.elastic.co/elasticsearch/elasticsearch-oss:7.10.2
+FROM docker.elastic.co/elasticsearch/elasticsearch-oss:7.10.2 AS base

+# Configure Elastic
+ENV ELASTIC_SEARCH_CONFIG_FILE=/usr/share/elasticsearch/config/elasticsearch.yml
+# Prevents ES from complaining about nodes count
+RUN echo "discovery.type: single-node" >> $ELASTIC_SEARCH_CONFIG_FILE
+ENV PATH=${PATH}:/usr/share/elasticsearch/jdk/bin/

RUN cd /etc/yum.repos.d/ && \
sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* && \
-sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-* && \
-yum install -y python3.9 vim git && \
-pip3 install opensearch-benchmark
+sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-*
+RUN yum install -y gcc python3.9 python39-devel vim git less
+RUN pip3 install opensearch-benchmark


+FROM base AS generateDatasetStage

+ARG SHOULD_GENERATE_DATA=false
+COPY generateDataset.sh /root
+RUN chmod ug+x /root/generateDataset.sh

+RUN /root/generateDataset.sh ${SHOULD_GENERATE_DATA} && \
+    cd /usr/share/elasticsearch/data && tar -cvzf esDataFiles.tgz nodes

Collaborator:
I'm presuming that you aren't planning on setting up multiple parallel dataset generations (one for each workload, maybe things that aren't OSB, etc.) and pulling lots of different datasets into the final container. If you wanted to support multiple datasets, I'd presume that you'd want separate images for each, so that containers would only need to pay for what they were using.

Presuming that, it will be more efficient to drop the tar command and just leave the directory intact. If you did think you'd be generating a lot of images with different sample data, then once you start looking at testing at scale, you'll want to pull snapshots into distributed clusters.

Collaborator Author:
I have removed the tarring step for now. Part of me would like to build more support here to make generating different datasets in parallel a bit easier, but I'm leaning toward proving out the usefulness of this current piece and then iterating if we find it's something we'd really like. Testing multi-node clusters or even multiple RFS instances will look pretty different from what we have currently, so I'm also cautious that this might shift direction.
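A minimal sketch of the tar-free approach described above (stage and path names follow this Dockerfile; the exact form that landed came in a later commit):

```dockerfile
# Hypothetical: copy the generated data directory straight from the dataset
# stage, skipping the tar/untar round trip in the final stage.
COPY --from=generateDatasetStage --chown=elasticsearch /usr/share/elasticsearch/data /usr/share/elasticsearch/data
```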


+FROM base

# Install the S3 Repo Plugin
RUN echo y | /usr/share/elasticsearch/bin/elasticsearch-plugin install repository-s3
@@ -14,21 +33,12 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2
unzip awscliv2.zip && \
./aws/install

-ARG EXISTING_DATA="no-data.tar.gz"

RUN mkdir /snapshots && chown elasticsearch /snapshots
-COPY ./test-resources/${EXISTING_DATA} /usr/share/elasticsearch
-RUN tar -xzf /usr/share/elasticsearch/${EXISTING_DATA} -C /usr/share/elasticsearch/data && \
-    chown -R elasticsearch /usr/share/elasticsearch/data && \
-    rm /usr/share/elasticsearch/${EXISTING_DATA}

+COPY --from=generateDatasetStage /usr/share/elasticsearch/data/esDataFiles.tgz /root/esDataFiles.tgz
# Install our custom entrypoint script
COPY ./container-start.sh /usr/share/elasticsearch/container-start.sh

-# Configure Elastic
-ENV ELASTIC_SEARCH_CONFIG_FILE=/usr/share/elasticsearch/config/elasticsearch.yml
-# Prevents ES from complaining about nodes coun
-RUN echo "discovery.type: single-node" >> $ELASTIC_SEARCH_CONFIG_FILE
-ENV PATH=${PATH}:/usr/share/elasticsearch/jdk/bin/
+RUN tar -xzf /root/esDataFiles.tgz -C /usr/share/elasticsearch/data

CMD /usr/share/elasticsearch/container-start.sh
27 changes: 27 additions & 0 deletions RFS/docker/TestSource_ES_7_10/generateDataset.sh
@@ -0,0 +1,27 @@
#!/bin/bash

generate_data_requests() {
endpoint="http://localhost:9200"
# If auth or SSL is used, the corresponding OSB options should be provided in this array
options=()
client_options=$(IFS=,; echo "${options[*]}")
set -o xtrace

echo "Running opensearch-benchmark workloads against ${endpoint}"
echo "Running opensearch-benchmark w/ 'geonames' workload..." &&
opensearch-benchmark execute-test --distribution-version=1.0.0 --target-host=$endpoint --workload=geonames --pipeline=benchmark-only --test-mode --kill-running-processes --workload-params "target_throughput:0.5,bulk_size:10,bulk_indexing_clients:1,search_clients:1" --client-options=$client_options &&
Collaborator:
I checked, but didn't find a way to JUST LOAD the data. The opensearch-benchmark runs load data and then run tests against it; that latter part takes up most of the time and is work we're not interested in for RFS. We might really, really want to load the data, do an RFS migration, then run the rest of the test so that we could test CDC on a historically migrated cluster.

It might be a good idea to open an issue or submit a PR with a new option for OSB.

Collaborator Author:
We should probably raise an issue, as I don't have a good understanding of how intertwined these two things are from looking at the actual workloads: https://github.com/opensearch-project/opensearch-benchmark-workloads/tree/main/geonames
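If OSB's task filtering can express "load only", a run could be restricted to the ingestion tasks, something like the sketch below (hypothetical; `--include-tasks` comes from OSB's Rally lineage, and task names such as `index-append` vary per workload):

```shell
# Hypothetical: run only the bulk-indexing task of the geonames workload,
# skipping the query benchmarks that dominate the runtime.
opensearch-benchmark execute-test --distribution-version=1.0.0 \
  --target-host=http://localhost:9200 --workload=geonames \
  --pipeline=benchmark-only --test-mode --kill-running-processes \
  --include-tasks="index-append"
```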

echo "Running opensearch-benchmark w/ 'http_logs' workload..." &&
opensearch-benchmark execute-test --distribution-version=1.0.0 --target-host=$endpoint --workload=http_logs --pipeline=benchmark-only --test-mode --kill-running-processes --workload-params "target_throughput:0.5,bulk_size:10,bulk_indexing_clients:1,search_clients:1" --client-options=$client_options &&
echo "Running opensearch-benchmark w/ 'nested' workload..." &&
opensearch-benchmark execute-test --distribution-version=1.0.0 --target-host=$endpoint --workload=nested --pipeline=benchmark-only --test-mode --kill-running-processes --workload-params "target_throughput:0.5,bulk_size:10,bulk_indexing_clients:1,search_clients:1" --client-options=$client_options &&
echo "Running opensearch-benchmark w/ 'nyc_taxis' workload..." &&
opensearch-benchmark execute-test --distribution-version=1.0.0 --target-host=$endpoint --workload=nyc_taxis --pipeline=benchmark-only --test-mode --kill-running-processes --workload-params "target_throughput:0.5,bulk_size:10,bulk_indexing_clients:1,search_clients:1" --client-options=$client_options
}

should_generate_data=$1

if [[ "$should_generate_data" == true ]]; then
/usr/local/bin/docker-entrypoint.sh eswrapper & echo $! > /tmp/esWrapperProcess.pid && sleep 10 && generate_data_requests
else
mkdir -p /usr/share/elasticsearch/data/nodes
fi
45 changes: 0 additions & 45 deletions RFS/docker/TestSource_ES_7_10/test-resources/inventory.md

This file was deleted.

Binary file not shown.
Binary file not shown.
6 changes: 3 additions & 3 deletions deployment/cdk/opensearch-service-migration/README.md
@@ -198,14 +198,14 @@ Please note that it will be base64 encoded.

## Kicking off Reindex from Snapshot (RFS)

-When the RFS service gets deployed, it does not start running immediately. This is by design to put the needed infrastructure in place, and then allow the user to control when the historical data migration should occur.
+When the RFS service gets deployed, it does not start running immediately. Instead, the user controls when they want to kick off a historical data migration.

The following command can be run from the Migration Console to initiate the RFS historical data migration
```shell
-aws ecs update-service --cluster migration-<STAGE>-ecs-cluster --service migration-<STAGE>-rfs --desired-count 1
+aws ecs update-service --cluster migration-<STAGE>-ecs-cluster --service migration-<STAGE>-reindex-from-snapshot --desired-count 1
```

-Currently, the RFS service will enter an idle state upon completion and can be cleaned up by using the same command with `--desired-count 0`
+Currently, the RFS application will enter an idle state with the ECS container still running upon completion. This can be cleaned up by using the same command with `--desired-count 0`
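For reference, the corresponding cleanup invocation is the same command with the count zeroed out:

```shell
aws ecs update-service --cluster migration-<STAGE>-ecs-cluster --service migration-<STAGE>-reindex-from-snapshot --desired-count 0
```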


## Monitoring Progress via Instrumentation
@@ -6,4 +6,7 @@ script_dir_abs_path=$(dirname "$script_abs_path")
cd "$script_dir_abs_path" || exit

cd ../../../TrafficCapture || exit
-./gradlew :dockerSolution:buildDockerImages -x test
+./gradlew :dockerSolution:buildDockerImages -x test
+
+cd ../RFS || exit
+./gradlew buildDockerImages -x test
@@ -93,10 +93,10 @@ export class MigrationConsoleStack extends MigrationServiceCore {

const ecsClusterArn = `arn:aws:ecs:${props.env?.region}:${props.env?.account}:service/migration-${props.stage}-ecs-cluster`
const allReplayerServiceArn = `${ecsClusterArn}/migration-${props.stage}-traffic-replayer*`
-const rfsServiceArn = `${ecsClusterArn}/migration-${props.stage}-rfs`
+const reindexFromSnapshotServiceArn = `${ecsClusterArn}/migration-${props.stage}-reindex-from-snapshot`
const updateReplayerServicePolicy = new PolicyStatement({
effect: Effect.ALLOW,
-resources: [allReplayerServiceArn, rfsServiceArn],
+resources: [allReplayerServiceArn, reindexFromSnapshotServiceArn],
actions: [
"ecs:UpdateService"
]
@@ -49,7 +49,7 @@ export class ReindexFromSnapshotStack extends MigrationServiceCore {
rfsCommand = props.extraArgs ? rfsCommand.concat(` ${props.extraArgs}`) : rfsCommand

this.createService({
-serviceName: 'rfs',
+serviceName: 'reindex-from-snapshot',
taskInstanceCount: 0,
dockerDirectoryPath: join(__dirname, "../../../../../", "RFS/docker"),
dockerImageCommand: ['/bin/sh', '-c', rfsCommand],
@@ -184,8 +184,8 @@ export class StackComposer {
const sourceClusterEndpoint = this.getContextForType('sourceClusterEndpoint', 'string', defaultValues, contextJSON)
const osContainerServiceEnabled = this.getContextForType('osContainerServiceEnabled', 'boolean', defaultValues, contextJSON)
const otelCollectorEnabled = this.getContextForType('otelCollectorEnabled', 'boolean', defaultValues, contextJSON)
-const rfsServiceEnabled = this.getContextForType('rfsServiceEnabled', 'boolean', defaultValues, contextJSON)
-const rfsExtraArgs = this.getContextForType('rfsExtraArgs', 'string', defaultValues, contextJSON)
+const reindexFromSnapshotServiceEnabled = this.getContextForType('reindexFromSnapshotServiceEnabled', 'boolean', defaultValues, contextJSON)
+const reindexFromSnapshotExtraArgs = this.getContextForType('reindexFromSnapshotExtraArgs', 'string', defaultValues, contextJSON)

const requiredFields: { [key: string]: any; } = {"stage":stage, "domainName":domainName}
for (let key in requiredFields) {
@@ -397,21 +397,21 @@
this.stacks.push(fetchMigrationStack)
}

-let rfsStack
-if (rfsServiceEnabled && networkStack && migrationStack) {
-rfsStack = new ReindexFromSnapshotStack(scope, "reindexFromSnapshotStack", {
+let reindexFromSnapshotStack
+if (reindexFromSnapshotServiceEnabled && networkStack && migrationStack) {
+reindexFromSnapshotStack = new ReindexFromSnapshotStack(scope, "reindexFromSnapshotStack", {
vpc: networkStack.vpc,
sourceEndpoint: sourceClusterEndpoint,
-extraArgs: rfsExtraArgs,
+extraArgs: reindexFromSnapshotExtraArgs,
stackName: `OSMigrations-${stage}-${region}-ReindexFromSnapshot`,
description: "This stack contains resources to assist migrating historical data, via RFS, to a target cluster",
description: "This stack contains resources to assist migrating historical data, via Reindex from Snapshot, to a target cluster",
stage: stage,
defaultDeployId: defaultDeployId,
fargateCpuArch: fargateCpuArch,
...props,
})
-this.addDependentStacks(rfsStack, [migrationStack, openSearchStack, osContainerStack])
-this.stacks.push(rfsStack)
+this.addDependentStacks(reindexFromSnapshotStack, [migrationStack, openSearchStack, osContainerStack])
+this.stacks.push(reindexFromSnapshotStack)
}

let captureProxyESStack
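With this rename, the CDK context keys change as well; a deployment would toggle the service with something like the following (hypothetical invocation; the extra-args value is a placeholder):

```shell
# Hypothetical: enable the Reindex-from-Snapshot service at deploy time.
cdk deploy "*" \
  --context reindexFromSnapshotServiceEnabled=true \
  --context reindexFromSnapshotExtraArgs="<additional RFS arguments>"
```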