Merge pull request awslabs#56 from aarts00/master
DataPipeline Scripts
Showing 5 changed files with 764 additions and 10 deletions.
@@ -1,10 +1 @@
Support related content for AWS Data Pipeline

---------------------------------------
Directories
---------------------------------------
| Name | Description |
| :----------------------------:|:-------------------------------------------------:|
| MySqlRdsToPostgreSqlRds | Copy Mysql RDS table to PostgreSql RDS Table |
| PostgresqlRdsToRedshift | Copy PostgreSql RDS table to Redshift Table |

Scripts and data pipeline definition examples for non-default templates.
DataPipeline/WriteToAnySchemaInRedshiftFromRds/RDStoRedshift-AnySchema.json: 270 additions & 0 deletions
@@ -0,0 +1,270 @@
{
  "objects": [
    {
      "myComment": "Cleanup intermediate staging S3 after other activities are completed",
      "input": {
        "ref": "S3StagingDataNode"
      },
      "dependsOn": {
        "ref": "S3ToRedshiftCopyActivity"
      },
      "stage": "false",
      "name": "S3StagingCleanupActivity",
      "id": "S3StagingCleanupActivity",
      "runsOn": {
        "ref": "myEc2Instance"
      },
      "type": "ShellCommandActivity",
      "command": "(sudo yum -y update aws-cli) && (aws s3 rm #{input.directoryPath} --recursive)"
    },
    {
      "myComment": "Ec2 resource to run pipeline activities. keyPair, subnetId and securityGroupIds are optional, if used please refer your resource ids",
      "subnetId": "subnet-7b3a9f46",
      "securityGroupIds": [
        "sg-f3973b95",
        "sg-068b2160"
      ],
      "instanceType": "m3.medium",
      "name": "myEc2Instance",
      "keyPair": "ec2-VA-keypair",
      "id": "myEc2Instance",
      "type": "Ec2Resource",
      "terminateAfter": "2 Hours"
    },
    {
      "myComment": "S3 path used for intermediate staging",
      "directoryPath": "#{myS3StagingLoc}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}",
      "name": "S3StagingDataNode",
      "id": "S3StagingDataNode",
      "type": "S3DataNode"
    },
    {
      "myComment": "Your Redshift DB details",
      "connectionString": "#{myRedshiftJdbcConnectStr}",
      "*password": "#{*myRedshiftPassword}",
      "name": "RedshiftCluster",
      "id": "RedshiftCluster",
      "type": "RedshiftDatabase",
      "username": "#{myRedshiftUsername}"
    },
    {
      "myComment": "Activity to copy from source RDS to intermediate staging S3 path",
      "output": {
        "ref": "S3StagingDataNode"
      },
      "input": {
        "ref": "SrcRDSTable"
      },
      "name": "RDSToS3CopyActivity",
      "id": "RDSToS3CopyActivity",
      "runsOn": {
        "ref": "myEc2Instance"
      },
      "type": "CopyActivity"
    },
    {
      "myComment": "Target Redshift table details, please provide your table primary keys",
      "database": {
        "ref": "RedshiftCluster"
      },
      "name": "DestRedshiftTable",
      "id": "DestRedshiftTable",
      "type": "RedshiftDataNode",
      "tableName": "#{myRedshiftTableName}",
      "schemaName": "#{myRedshiftSchemaName}",
      "primaryKeys": "#{myRedshiftPrimaryKeys}"
    },
    {
      "myComment": "Your source RDS DB details",
      "connectionString": "#{myRDSJdbcConnectStr}",
      "*password": "#{*myRDSPassword}",
      "name": "rds_mysql",
      "jdbcProperties": "allowMultiQueries=true",
      "id": "rds_mysql",
      "type": "JdbcDatabase",
      "jdbcDriverClass": "com.mysql.jdbc.Driver",
      "username": "#{myRDSUsername}"
    },
    {
      "myComment": "Default Pipeline object that defines IAM roles, pipeline log bucket path and Schedule",
      "failureAndRerunMode": "CASCADE",
      "resourceRole": "DataPipelineDefaultResourceRole",
      "role": "DataPipelineDefaultRole",
      "pipelineLogUri": "s3://testbucket/logs/",
      "scheduleType": "ONDEMAND",
      "name": "Default",
      "id": "Default"
    },
    {
      "myComment": "Activity that creates target table in Redshift Database. Please refer to shell script hosted in your S3 bucket",
      "output": {
        "ref": "DestRedshiftTable"
      },
      "input": {
        "ref": "SrcRDSTable"
      },
      "scriptUri": "s3://testbucket/scripts/dbconv_tablename_noquotes.sh",
      "dependsOn": {
        "ref": "RDSToS3CopyActivity"
      },
      "name": "RedshiftTableCreateActivity",
      "runsOn": {
        "ref": "myEc2Instance"
      },
      "scriptArgument": [
        "--rds_jdbc=#{myRDSJdbcConnectStr}",
        "--rds_tbl=#{myRDSTableName}",
        "--rds_pwd=#{*myRDSPassword}",
        "--rds_usr=#{myRDSUsername}",
        "--red_jdbc=#{myRedshiftJdbcConnectStr}",
        "--red_usr=#{myRedshiftUsername}",
        "--red_pwd=#{*myRedshiftPassword}",
        "--red_tbl=#{myRedshiftSchemaName}.#{myRedshiftTableName}",
        "--red_dist=#{myRedshiftDistributionKey}",
        "--red_sort=#{myRedshiftSortKeys}",
        "--red_map=#{myRedshiftTypeConvOverrideMap}",
        "--red_ins=#{myInsertMode}"
      ],
      "id": "RedshiftTableCreateActivity",
      "type": "ShellCommandActivity"
    },
    {
      "myComment": "Source RDS table details",
      "database": {
        "ref": "rds_mysql"
      },
      "name": "SrcRDSTable",
      "id": "SrcRDSTable",
      "type": "SqlDataNode",
      "table": "#{myRDSTableName}",
      "selectQuery": "select * from #{table}"
    },
    {
      "myComment": "Activity to copy from intermediate staging S3 path to target table",
      "output": {
        "ref": "DestRedshiftTable"
      },
      "input": {
        "ref": "S3StagingDataNode"
      },
      "dependsOn": {
        "ref": "RedshiftTableCreateActivity"
      },
      "name": "S3ToRedshiftCopyActivity",
      "id": "S3ToRedshiftCopyActivity",
      "runsOn": {
        "ref": "myEc2Instance"
      },
      "type": "RedshiftCopyActivity",
      "insertMode": "#{myInsertMode}",
      "maximumRetries": "2"
    }
  ],
  "parameters": [
    {
      "description": "RDS MySQL password",
      "id": "*myRDSPassword",
      "type": "String"
    },
    {
      "description": "RDS MySQL table name",
      "id": "myRDSTableName",
      "type": "String"
    },
    {
      "description": "Redshift username",
      "id": "myRedshiftUsername",
      "type": "String"
    },
    {
      "watermark": "jdbc:mysql://dbinstance.id.region.rds.amazonaws.com:3306/dbname",
      "description": "RDS MySQL connection string",
      "id": "myRDSJdbcConnectStr",
      "type": "String"
    },
    {
      "helpText": "The S3 folder to store RDS MySQL table data before loading to Redshift. The S3 folder must be in the same region as the Redshift cluster.",
      "description": "S3 staging folder",
      "id": "myS3StagingLoc",
      "type": "AWS::S3::ObjectKey"
    },
    {
      "watermark": "columnName",
      "helpText": "Distribution key column in the Redshift table. If the distribution key is not specified the primary key is set as a distribution key.",
      "description": "Redshift table distribution key",
      "optional": "true",
      "id": "myRedshiftDistributionKey",
      "type": "String"
    },
    {
      "helpText": "Override the default mapping of RDS MySQL data types to Redshift data types.",
      "watermark": "tinyint(1):smallint,char(35):varchar(70),bigint(20) unsigned:bigint",
      "description": "MySQL to Redshift type conversion overrides",
      "optional": "true",
      "id": "myRedshiftTypeConvOverrideMap",
      "type": "String"
    },
    {
      "helpText": "The name of an existing table or a new table that will be automatically created.",
      "description": "Redshift table name",
      "id": "myRedshiftTableName",
      "type": "String"
    },
    {
      "default": "OVERWRITE_EXISTING",
      "helpLink": "https://docs.aws.amazon.com/console/datapipeline/redshiftcopyactivity",
      "helpText": "Determines how to handle pre-existing data in the target table that overlaps with rows in the data to be loaded.",
      "description": "Redshift table insert mode",
      "id": "myInsertMode",
      "type": "String"
    },
    {
      "description": "Redshift password",
      "id": "*myRedshiftPassword",
      "type": "String"
    },
    {
      "default": "default",
      "watermark": "security group name",
      "helpText": "The names of one or more security groups that collectively provide the EC2 instance connectivity to both the RDS instance and Redshift cluster.",
      "description": "RDS and Redshift security group(s)",
      "isArray": "true",
      "id": "myRDSRedshiftSecurityGrps",
      "type": "String"
    },
    {
      "description": "RDS MySQL username",
      "id": "myRDSUsername",
      "type": "String"
    },
    {
      "watermark": "columnName1,columnName2",
      "helpText": "Sort key columns in the Redshift table.",
      "description": "Redshift table sort keys",
      "optional": "true",
      "id": "myRedshiftSortKeys",
      "type": "String"
    },
    {
      "watermark": "jdbc:postgresql://endpoint:port/database?tcpKeepAlive=true",
      "description": "Redshift JDBC connection string",
      "id": "myRedshiftJdbcConnectStr",
      "type": "String"
    }
  ],
  "values": {
    "myRedshiftUsername": "",
    "myRDSUsername": "",
    "myS3StagingLoc": "",
    "myRedshiftJdbcConnectStr": "",
    "*myRedshiftPassword": "",
    "myInsertMode": "",
    "myRDSJdbcConnectStr": "",
    "myRDSRedshiftSecurityGrps": "",
    "*myRDSPassword": "",
    "myRedshiftTableName": "",
    "myRedshiftSchemaName": "",
    "myRDSTableName": "",
    "myRedshiftPrimaryKeys": ""
  }
}
@@ -0,0 +1,15 @@
Data Pipeline has RedshiftCopyActivity, which is used by many of its default templates to write from S3 into a target Redshift table. The default 'RDS to Redshift Copy' template has a chain of activities that also uses a schema translation script to convert the MySQL RDS schema into a Redshift-compatible schema.

The default templates and the associated scripts for 'RDS to Redshift Copy' allow writing ONLY to the public schema in Redshift. The default scripts and template have been modified and shared here to copy an RDS table into a Redshift table in a schema other than public.

--
[1] mysql_to_redshift_tablename_noquotes.py -- Modified to ignore the COMMENT portion of an RDS column definition, since Redshift does not support COMMENT in CREATE TABLE statements. Host this file in your S3 bucket and give it public access. An illustration of the transformation is shown below.
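
The change itself lives in the Python script; as a rough, hypothetical illustration of the same transformation (the column definition here is made up, not taken from the script):

```sh
# Drop a trailing COMMENT '...' clause from a MySQL column definition so the
# generated DDL is acceptable to Redshift, which rejects COMMENT in CREATE TABLE.
echo "status tinyint(1) NOT NULL COMMENT 'soft-delete flag'" \
  | sed -E "s/ COMMENT '[^']*'//"
# prints: status tinyint(1) NOT NULL
```
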
[2] dbconv_tablename_noquotes.sh -- Edit line 125 so that it points to the S3 location of [1]. Host this file in your S3 bucket as well and give it public access. A sketch of uploading both files is shown below.
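
A minimal sketch of hosting both scripts with the AWS CLI; the bucket and prefix are placeholders, and the CLI is assumed to be configured with write access to the bucket:

```sh
# Upload the helper scripts and make them publicly readable, as required above.
aws s3 cp mysql_to_redshift_tablename_noquotes.py s3://your-bucket/scripts/ --acl public-read
aws s3 cp dbconv_tablename_noquotes.sh s3://your-bucket/scripts/ --acl public-read
```

If the bucket has S3 Block Public Access enabled, the --acl flag may be rejected; in that case grant read access to the objects in whatever way your account policy allows.
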
[3] RDStoRedshift-AnySchema.json -- A Data Pipeline definition file example. It illustrates how the scripts in [1] and [2] are used to copy a table in RDS to a table in any schema of Redshift. Update this JSON to refer to your RDS table, Redshift table, S3 and Ec2Resource details, and make the RedshiftTableCreateActivity's scriptUri point to the S3 path where you hosted [2]. A sketch of registering and activating the pipeline follows.
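
Because the definition uses scheduleType ONDEMAND, a run starts when the pipeline is activated. A minimal sketch of registering and activating it with the AWS CLI; the pipeline name, unique-id, and pipeline id below are placeholders, and the values block of the JSON is assumed to be filled in first:

```sh
# Create the pipeline, upload the definition, then activate it to start an on-demand run.
aws datapipeline create-pipeline --name RDStoRedshift-AnySchema --unique-id rds-to-redshift-anyschema
# Use the pipelineId returned by the previous command (shown here as df-XXXXXXXXXXXX).
aws datapipeline put-pipeline-definition --pipeline-id df-XXXXXXXXXXXX \
  --pipeline-definition file://RDStoRedshift-AnySchema.json
aws datapipeline activate-pipeline --pipeline-id df-XXXXXXXXXXXX
```
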