Skip to content

Commit

Permalink
RFS Workers now use EBS to store/unpack the snapshot (opensearch-proj…
Browse files Browse the repository at this point in the history
…ect#1041)

* RFS Workers now use EBS to store/unpack the snapshot

Signed-off-by: Chris Helma <chelma+github@amazon.com>

* Updated CDK unit test per changes

Signed-off-by: Chris Helma <chelma+github@amazon.com>

* Updates per PR comments

Signed-off-by: Chris Helma <chelma+github@amazon.com>

* More updates per PR comments

Signed-off-by: Chris Helma <chelma+github@amazon.com>

* Fix unit test

Signed-off-by: Chris Helma <chelma+github@amazon.com>

* Corrected RFS Snapshot Volume tag

Signed-off-by: Chris Helma <chelma+github@amazon.com>

---------

Signed-off-by: Chris Helma <chelma+github@amazon.com>
  • Loading branch information
chelma authored Oct 2, 2024
1 parent 5015220 commit 647f601
Show file tree
Hide file tree
Showing 7 changed files with 76 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
},
"vpcId": "<VPC_ID>",
"reindexFromSnapshotServiceEnabled": true,
"reindexFromSnapshotMaxShardSizeGiB": 80,
"trafficReplayerServiceEnabled": true
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ export class NetworkStack extends Stack {
// General interface endpoints
const interfaceEndpoints = [
InterfaceVpcEndpointAwsService.CLOUDWATCH_LOGS, // Push Logs from tasks
InterfaceVpcEndpointAwsService.CLOUDWATCH_MONITORING, // Pull Metrics from Migration Console
InterfaceVpcEndpointAwsService.CLOUDWATCH_MONITORING, // Pull Metrics from Migration Console
InterfaceVpcEndpointAwsService.ECR_DOCKER, // Pull Images on Startup
InterfaceVpcEndpointAwsService.ECR, // List Images on Startup
InterfaceVpcEndpointAwsService.ECS_AGENT, // Task Container Metrics
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import {
Volume,
AwsLogDriverMode,
ContainerDependencyCondition,
ServiceManagedVolume
} from "aws-cdk-lib/aws-ecs";
import {DockerImageAsset} from "aws-cdk-lib/aws-ecr-assets";
import {Duration, RemovalPolicy, Stack} from "aws-cdk-lib";
Expand Down Expand Up @@ -186,6 +187,11 @@ export class MigrationServiceCore extends Stack {
vpcSubnets: props.vpc.selectSubnets({subnetType: SubnetType.PRIVATE_WITH_EGRESS}),
});

// Add any ServiceManagedVolumes to the service, if they exist
if (props.volumes) {
props.volumes.filter(vol => vol instanceof ServiceManagedVolume).forEach(vol => fargateService.addVolume(vol as ServiceManagedVolume));
}

if (props.targetGroups) {
props.targetGroups.filter(tg => tg !== undefined).forEach(tg => tg.addTarget(fargateService));
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
import {StackPropsExt} from "../stack-composer";
import {IVpc, SecurityGroup} from "aws-cdk-lib/aws-ec2";
import {CpuArchitecture} from "aws-cdk-lib/aws-ecs";
import {Size} from "aws-cdk-lib/core";
import {IVpc, SecurityGroup, EbsDeviceVolumeType} from "aws-cdk-lib/aws-ec2";
import {
CpuArchitecture,
ServiceManagedVolume,
FileSystemType,
EbsPropagatedTagSource
} from "aws-cdk-lib/aws-ecs";
import {Construct} from "constructs";
import {join} from "path";
import {MigrationServiceCore} from "./migration-service-core";
Expand All @@ -24,7 +30,9 @@ export interface ReindexFromSnapshotProps extends StackPropsExt {
readonly extraArgs?: string,
readonly otelCollectorEnabled: boolean,
readonly clusterAuthDetails: ClusterAuth
readonly sourceClusterVersion?: string
readonly sourceClusterVersion?: string,
readonly maxShardSizeGiB?: number

}

export class ReindexFromSnapshotStack extends MigrationServiceCore {
Expand Down Expand Up @@ -70,12 +78,16 @@ export class ReindexFromSnapshotStack extends MigrationServiceCore {
const s3Uri = `s3://migration-artifacts-${this.account}-${props.stage}-${this.region}/rfs-snapshot-repo`;
let command = "/rfs-app/runJavaWithClasspath.sh org.opensearch.migrations.RfsMigrateDocuments"
const extraArgsDict = parseArgsToDict(props.extraArgs)
command = appendArgIfNotInExtraArgs(command, extraArgsDict, "--s3-local-dir", "/tmp/s3_files")
const storagePath = "/storage"
const planningSize = props.maxShardSizeGiB ?? 80;
const maxShardSizeBytes = `${planningSize * 1024 * 1024 * 1024}`
command = appendArgIfNotInExtraArgs(command, extraArgsDict, "--s3-local-dir", `"${storagePath}/s3_files"`)
command = appendArgIfNotInExtraArgs(command, extraArgsDict, "--s3-repo-uri", `"${s3Uri}"`)
command = appendArgIfNotInExtraArgs(command, extraArgsDict, "--s3-region", this.region)
command = appendArgIfNotInExtraArgs(command, extraArgsDict, "--snapshot-name", "rfs-snapshot")
command = appendArgIfNotInExtraArgs(command, extraArgsDict, "--lucene-dir", "/lucene")
command = appendArgIfNotInExtraArgs(command, extraArgsDict, "--lucene-dir", `"${storagePath}/lucene"`)
command = appendArgIfNotInExtraArgs(command, extraArgsDict, "--target-host", osClusterEndpoint)
command = appendArgIfNotInExtraArgs(command, extraArgsDict, "--max-shard-size-bytes", maxShardSizeBytes)
if (props.clusterAuthDetails.sigv4) {
command = appendArgIfNotInExtraArgs(command, extraArgsDict, "--target-aws-service-signing-name", props.clusterAuthDetails.sigv4.serviceSigningName)
command = appendArgIfNotInExtraArgs(command, extraArgsDict, "--target-aws-region", props.clusterAuthDetails.sigv4.region)
Expand Down Expand Up @@ -113,19 +125,57 @@ export class ReindexFromSnapshotStack extends MigrationServiceCore {
servicePolicies.push(getSecretsPolicy);
}

const volumes = [sharedLogFileSystem.asVolume()];
const mountPoints = [sharedLogFileSystem.asMountPoint()];

// Calculate the volume size based on the max shard size
// Have space for the snapshot and an unpacked copy, with buffer
const volumeSize = Math.max(
Math.ceil(planningSize * 2 * 1.15),
1
)

if (volumeSize > 16000) {
// 16 TiB is the maximum volume size for GP3
throw new Error(`"Your max shard size of ${props.maxShardSizeGiB} GiB is too large to migrate."`)
}

// Volume we'll use to download and unpack the snapshot
const snapshotVolume = new ServiceManagedVolume(this, 'SnapshotVolume', {
name: 'snapshot-volume',
managedEBSVolume: {
size: Size.gibibytes(volumeSize),
volumeType: EbsDeviceVolumeType.GP3,
fileSystemType: FileSystemType.XFS,
tagSpecifications: [{
tags: {
Name: `rfs-snapshot-volume-${props.stage}`,
},
propagateTags: EbsPropagatedTagSource.SERVICE,
}],
encrypted: true,
},
});

volumes.push(snapshotVolume);
mountPoints.push({
containerPath: storagePath,
readOnly: false,
sourceVolume: snapshotVolume.name,
});

this.createService({
serviceName: 'reindex-from-snapshot',
taskInstanceCount: 0,
dockerDirectoryPath: join(__dirname, "../../../../../", "DocumentsFromSnapshotMigration/docker"),
dockerImageCommand: ['/bin/sh', '-c', "/rfs-app/entrypoint.sh"],
securityGroups: securityGroups,
volumes: [sharedLogFileSystem.asVolume()],
mountPoints: [sharedLogFileSystem.asMountPoint()],
volumes: volumes,
mountPoints: mountPoints,
taskRolePolicies: servicePolicies,
cpuArchitecture: props.fargateCpuArch,
taskCpuUnits: 2048,
taskMemoryLimitMiB: 4096,
ephemeralStorageGiB: 200,
environment: {
"RFS_COMMAND": command,
"RFS_TARGET_USER": targetUser,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ export class StackComposer {
const otelCollectorEnabled = this.getContextForType('otelCollectorEnabled', 'boolean', defaultValues, contextJSON)
const reindexFromSnapshotServiceEnabled = this.getContextForType('reindexFromSnapshotServiceEnabled', 'boolean', defaultValues, contextJSON)
const reindexFromSnapshotExtraArgs = this.getContextForType('reindexFromSnapshotExtraArgs', 'string', defaultValues, contextJSON)
const reindexFromSnapshotMaxShardSizeGiB = this.getContextForType('reindexFromSnapshotMaxShardSizeGiB', 'number', defaultValues, contextJSON)
const albAcmCertArn = this.getContextForType('albAcmCertArn', 'string', defaultValues, contextJSON);
const managedServiceSourceSnapshotEnabled = this.getContextForType('managedServiceSourceSnapshotEnabled', 'boolean', defaultValues, contextJSON)

Expand Down Expand Up @@ -483,7 +484,8 @@ export class StackComposer {
otelCollectorEnabled: otelCollectorEnabled,
defaultDeployId: defaultDeployId,
fargateCpuArch: fargateCpuArch,
env: props.env
env: props.env,
maxShardSizeGiB: reindexFromSnapshotMaxShardSizeGiB
})
this.addDependentStacks(reindexFromSnapshotStack, [migrationStack, openSearchStack, osContainerStack])
this.stacks.push(reindexFromSnapshotStack)
Expand Down
1 change: 1 addition & 0 deletions deployment/cdk/opensearch-service-migration/options.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ The optional component is:
| reindexFromSnapshotExtraArgs | string | "--target-aws-region us-east-1 --target-aws-service-signing-name es" | Extra arguments to provide to the Document Migration command with space separation. See [RFS Arguments](../../../DocumentsFromSnapshotMigration/README.md#Arguments). [^1] |
| sourceClusterEndpoint | string | `"https://source-cluster.elb.us-east-1.endpoint.com"` | The endpoint for the source cluster from which RFS will take a snapshot |
| managedServiceSourceSnapshotEnabled | boolean | true | Create the necessary roles and trust relationships to take a snapshot of a managed service source cluster. This is only compatible with SigV4 auth. |
| reindexFromSnapshotMaxShardSizeGiB | integer | 80 | OPTIONAL: The size, in whole GiB, of the largest shard you want to migrate across all indices; used to ensure we have enough disk space reserved to perform the migration. Default: 80 GiB |

### VPC Options

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,10 +115,11 @@ describe('ReindexFromSnapshotStack Tests', () => {
Value: {
"Fn::Join": [
"",
[ "/rfs-app/runJavaWithClasspath.sh org.opensearch.migrations.RfsMigrateDocuments --s3-local-dir /tmp/s3_files --s3-repo-uri \"s3://migration-artifacts-test-account-unit-test-us-east-1/rfs-snapshot-repo\" --s3-region us-east-1 --snapshot-name rfs-snapshot --lucene-dir /lucene --target-host ",
[ "/rfs-app/runJavaWithClasspath.sh org.opensearch.migrations.RfsMigrateDocuments --s3-local-dir \"/storage/s3_files\" --s3-repo-uri \"s3://migration-artifacts-test-account-unit-test-us-east-1/rfs-snapshot-repo\" --s3-region us-east-1 --snapshot-name rfs-snapshot --lucene-dir \"/storage/lucene\" --target-host ",
{
"Ref": "SsmParameterValuemigrationunittestdefaultosClusterEndpointC96584B6F00A464EAD1953AFF4B05118Parameter",
},
" --max-shard-size-bytes 85899345920"
],
],
}
Expand Down Expand Up @@ -182,11 +183,11 @@ describe('ReindexFromSnapshotStack Tests', () => {
Value: {
"Fn::Join": [
"",
[ "/rfs-app/runJavaWithClasspath.sh org.opensearch.migrations.RfsMigrateDocuments --s3-local-dir /tmp/s3_files --s3-repo-uri \"s3://migration-artifacts-test-account-unit-test-us-east-1/rfs-snapshot-repo\" --s3-region us-east-1 --snapshot-name rfs-snapshot --lucene-dir /lucene --target-host ",
[ "/rfs-app/runJavaWithClasspath.sh org.opensearch.migrations.RfsMigrateDocuments --s3-local-dir \"/storage/s3_files\" --s3-repo-uri \"s3://migration-artifacts-test-account-unit-test-us-east-1/rfs-snapshot-repo\" --s3-region us-east-1 --snapshot-name rfs-snapshot --lucene-dir \"/storage/lucene\" --target-host ",
{
"Ref": "SsmParameterValuemigrationunittestdefaultosClusterEndpointC96584B6F00A464EAD1953AFF4B05118Parameter",
},
" --target-aws-service-signing-name aoss --target-aws-region eu-west-1",
" --max-shard-size-bytes 85899345920 --target-aws-service-signing-name aoss --target-aws-region eu-west-1",
],
],
}
Expand Down Expand Up @@ -271,11 +272,11 @@ describe('ReindexFromSnapshotStack Tests', () => {
Value: {
"Fn::Join": [
"",
[ "/rfs-app/runJavaWithClasspath.sh org.opensearch.migrations.RfsMigrateDocuments --s3-local-dir /tmp/s3_files --s3-repo-uri \"s3://migration-artifacts-test-account-unit-test-us-east-1/rfs-snapshot-repo\" --s3-region us-east-1 --lucene-dir /lucene --target-host ",
[ "/rfs-app/runJavaWithClasspath.sh org.opensearch.migrations.RfsMigrateDocuments --s3-local-dir \"/storage/s3_files\" --s3-repo-uri \"s3://migration-artifacts-test-account-unit-test-us-east-1/rfs-snapshot-repo\" --s3-region us-east-1 --lucene-dir \"/storage/lucene\" --target-host ",
{
"Ref": "SsmParameterValuemigrationunittestdefaultosClusterEndpointC96584B6F00A464EAD1953AFF4B05118Parameter",
},
" --custom-arg value --flag --snapshot-name \"custom-snapshot\""
" --max-shard-size-bytes 85899345920 --custom-arg value --flag --snapshot-name \"custom-snapshot\""
]
]
}
Expand Down

0 comments on commit 647f601

Please sign in to comment.