From f700442c4aa62d7a4454632267ddb1c6d37c7dc2 Mon Sep 17 00:00:00 2001 From: AJ Steers Date: Tue, 30 Mar 2021 11:17:34 -0700 Subject: [PATCH 1/6] add meltano as clone of singer-taps module --- catalog/aws/meltano/README.md | 367 +++++++++++++++++++ catalog/aws/meltano/USAGE.md | 39 ++ catalog/aws/meltano/cloudwatch.tf | 177 +++++++++ catalog/aws/meltano/lambda-notify.tf | 21 ++ catalog/aws/meltano/lambda/requirements.txt | 1 + catalog/aws/meltano/lambda/webhook_notify.py | 72 ++++ catalog/aws/meltano/main.tf | 109 ++++++ catalog/aws/meltano/outputs.tf | 24 ++ catalog/aws/meltano/s3-path-parsing.tf | 18 + catalog/aws/meltano/s3-upload.tf | 38 ++ catalog/aws/meltano/step-functions.tf | 118 ++++++ catalog/aws/meltano/variables.tf | 244 ++++++++++++ 12 files changed, 1228 insertions(+) create mode 100644 catalog/aws/meltano/README.md create mode 100644 catalog/aws/meltano/USAGE.md create mode 100644 catalog/aws/meltano/cloudwatch.tf create mode 100644 catalog/aws/meltano/lambda-notify.tf create mode 100644 catalog/aws/meltano/lambda/requirements.txt create mode 100644 catalog/aws/meltano/lambda/webhook_notify.py create mode 100644 catalog/aws/meltano/main.tf create mode 100644 catalog/aws/meltano/outputs.tf create mode 100644 catalog/aws/meltano/s3-path-parsing.tf create mode 100644 catalog/aws/meltano/s3-upload.tf create mode 100644 catalog/aws/meltano/step-functions.tf create mode 100644 catalog/aws/meltano/variables.tf diff --git a/catalog/aws/meltano/README.md b/catalog/aws/meltano/README.md new file mode 100644 index 00000000..b86b5edb --- /dev/null +++ b/catalog/aws/meltano/README.md @@ -0,0 +1,367 @@ +--- +parent: Infrastructure Catalog +title: AWS Singer-Taps +nav_exclude: false +--- +# AWS Singer-Taps + +[`source = "git::https://github.com/slalom-ggp/dataops-infra/tree/main/catalog/aws/singer-taps?ref=main"`](https://github.com/slalom-ggp/dataops-infra/tree/main/catalog/aws/singer-taps) + +## Overview + + +The Singer Taps platform is the open source stack 
which powers the [Stitcher](https://www.stitcher.com) EL platform. For more information, see [singer.io](https://singer.io) + +## Requirements + +No requirements. + +## Providers + +The following providers are used by this module: + +- aws + +## Required Inputs + +The following input variables are required: + +### name\_prefix + +Description: Standard `name_prefix` module input. (Prefix counts towards 64-character max length for certain resource types.) + +Type: `string` + +### environment + +Description: Standard `environment` module input. + +Type: + +```hcl +object({ + vpc_id = string + aws_region = string + public_subnets = list(string) + private_subnets = list(string) + }) +``` + +### resource\_tags + +Description: Standard `resource_tags` module input. + +Type: `map(string)` + +### taps + +Description: A list of tap configurations with the following setting keys: + +- `id` - The official id of the tap plugin to be used, without the 'tap-' prefix. +- `name` - The friendly name of the tap, without the 'tap-' prefix. +- `schedule` - A list of one or more daily sync times in `HHMM` format. E.g.: `0400` for 4am, `1600` for 4pm. +- `settings` - Map of tap settings to their values. +- `secrets` - Map of secrets names mapped to any of the following: + A. file path ("path/to/file") that contains a matching key name, + B. the file path and the json/yaml key ("path/to/file:key"), + C. the AWS Secrets Manager ID of an already stored secret + +Type: + +```hcl +list(object({ + id = string + name = string + schedule = list(string) + settings = map(string) + secrets = map(string) + })) +``` + +### local\_metadata\_path + +Description: The local folder which contains tap definition files: `{tap-name}.rules.txt` and `{tap-name}.plan.yml` + +Type: `string` + +### data\_lake\_metadata\_path + +Description: The remote folder for storing tap definition files. +Currently only S3 paths (s3://...) are supported. 
+ +Type: `string` + +## Optional Inputs + +The following input variables are optional (have default values): + +### target + +Description: The definition of which target to load data into. +Note: You must specify `target` or `data_lake_storage_path` but not both. +See the 'taps' input variable for more information on expected configuration values. + +Type: + +```hcl +object({ + id = string + settings = map(string) + secrets = map(string) + }) +``` + +Default: `null` + +### pipeline\_version\_number + +Description: Optional. (Default="1") Specify a pipeline version number when there are breaking changes which require +isolation. Note if you want to avoid overlap between versions, be sure to (1) cancel the +previous version and (2) specify a `start_date` on the new version which is not duplicative +of the previously covered time period. + +Type: `string` + +Default: `"1"` + +### data\_lake\_type + +Description: Specify `S3` if loading to an S3 data lake, otherwise leave blank. + +Type: `any` + +Default: `null` + +### data\_lake\_logging\_path + +Description: The remote folder for storing tap execution logs and log artifacts. +Currently only S3 paths (s3://...) are supported. + +Type: `string` + +Default: `null` + +### data\_lake\_storage\_path + +Description: The root path where files should be stored in the data lake. +Note: + - Currently only S3 paths (S3://...) are supported. + - You must specify `target` or `data_lake_storage_path` but not both. + - This path will be combined with the value provided in `data_file_naming_scheme`. + +Type: `string` + +Default: `null` + +### scheduled\_timezone + +Description: The timezone used in scheduling. +Currently the following codes are supported: PST, PDT, EST, UTC + +Type: `string` + +Default: `"PST"` + +### timeout\_hours + +Description: Optional. The number of hours before the sync task is canceled and retried. + +Type: `number` + +Default: `48` + +### num\_retries + +Description: Optional. 
The number of retries to attempt if the task fails. + +Type: `number` + +Default: `0` + +### container\_num\_cores + +Description: Optional. Specify the number of cores to use in the container. + +Type: `number` + +Default: `0.5` + +### container\_ram\_gb + +Description: Optional. Specify the amount of RAM to be available to the container. + +Type: `number` + +Default: `1` + +### use\_private\_subnet + +Description: If True, tasks will use a private subnet and will require a NAT gateway to pull the docker +image, and for any outbound traffic. If False, tasks will use a public subnet and will +not require a NAT gateway. + +Type: `bool` + +Default: `false` + +### data\_file\_naming\_scheme + +Description: The naming pattern to use when landing new files in the data lake. Allowed variables are: +`{tap}`, `{table}`, `{version}`, and `{file}`. This value will be combined with the root +data lake path provided in `data_lake_storage_path`." + +Type: `string` + +Default: `"{tap}/{table}/v{version}/{file}"` + +### state\_file\_naming\_scheme + +Description: The naming pattern to use when writing or updating state files. State files keep track of +data recency and are necessary for incremental loading. Allowed variables are: +`{tap}`, `{table}`, `{version}`, and `{file}`" + +Type: `string` + +Default: `"{tap}/{table}/state/{tap}-{table}-v{version}-state.json"` + +### container\_image\_override + +Description: Optional. Override the docker images with a custom-managed image. + +Type: `string` + +Default: `null` + +### container\_image\_suffix + +Description: Optional. Appends a suffix to the default container images. +(e.g. '--pre' for prerelease containers) + +Type: `string` + +Default: `""` + +### container\_command + +Description: Optional. Override the docker image's command. + +Type: `any` + +Default: `null` + +### container\_args + +Description: Optional. A list of additional args to send to the container. 
+ +Type: `list(string)` + +Default: `[]` + +### container\_entrypoint + +Description: Optional. Override the docker image's entrypoint. + +Type: `string` + +Default: `null` + +### alerts\_webhook\_url + +Description: Optionally, specify a webhook for MS Teams notifications. + +Type: `string` + +Default: `null` + +### alerts\_webhook\_message + +Description: Optionally, specify a message for webhook notifications. + +Type: `string` + +Default: `"Warning: A failure occured in the pipeline. Please check on it using the information below.\n"` + +### success\_webhook\_url + +Description: Optionally, specify a webhook for MS Teams notifications. + +Type: `string` + +Default: `null` + +### success\_webhook\_message + +Description: Optionally, specify a message for webhook notifications. + +Type: `string` + +Default: `"Success! The pipeline completed successfully.\n"` + +## Outputs + +The following outputs are exported: + +### summary + +Description: Summary of resources created by this module. +## Usage + +This module supports multiple taps per each target. To target multiple destinations, simply create +additional instances of the module. + +### Tap configuration overview + +The `taps` input variable expects a list of specifications for each tap. The specification for each +tap should include the following properties: + +- `id` - The name or alias of the tap as registered in the [Singer Index](#singer-index), without the `tap-` prefix. + - Note: in most cases, this is exactly what you'd expect: `mssql` for `tap-mssql`, etc. However, + for forks or experimental releases, this might contain a suffix such as `mssql-test` for a test + version of `tap-mssql` or `snowflake-singer` for the singer edition of `tap-snowflake`. + - A future release will add a separate and optional flag for `owner` or `variant`, in place of the + currently used alias/suffix convention. See the [Singer Index](#singer-index) section for more + info. +- `name` - What you want to call the data source. 
For instance, if you have multiple SQL Servers, you may want to use a more memorable name such as `finance-system` or `gl-db`. This name should still align with tap naming conventions, which is to say it should be in _lower-case-with-dashes_ format. +- `settings` - A simple map of the tap settings' names to their values. These are specific to each tap and they are required for each tap to work. + - Note: While singer does not distinguish between 'secrets' and 'settings', we should and do treat these two types of config separately. Be sure to put all sensitive config in the `secrets` collection, and not here in `settings`. +- `secrets` - Same as `settings` except for sensitive values. When passing secrets, you specify the setting name in the same way but you _must_ either pass the value as a pointer to the file containing the secret (a config.json file, for instance) or else pass an AWS Secrets Manager ARN. + - _If you pass a Secrets Manager ARN as the config value_, that secret pointer will be passed to the ECS container securely, and only the running container will have access to the secret. + - _If you pass a pointer to a config file_, the module will automatically create a new AWS Secrets Manager secret, upload the secret to AWS Secrets Manager, and then the above process will continue by passing the Secrets Manager pointer _only_ to the running ECS container. + +### Singer Index + +There are actually two Singer Indexes currently available. + +1. The first and primary index for this module today is the tapdance index stored [here](https://github.com/aaronsteers/tapdance/blob/master/docker/singer_index.yml). + +2. This index will eventually be replaced by a new dedicated [Singer DB](https://github.com/aaronsteers/singer-db), which is still a work-in-progress. + +Note: + +- Both of these sources support multiple versions (forks) of each tap, and both provide a "default" + or "recommended" version for those new users who just want to get started quickly. 
+- The new [Singer DB](https://github.com/aaronsteers/singer-db) will implement a new "owner" or "variant" + flag to replace the current "alias" technique used by the + [tapdance index](https://github.com/aaronsteers/tapdance/blob/master/docker/singer_index.yml). + + +--------------------- + +## Source Files + +_Source code for this module is available using the links below._ + +* [cloudwatch.tf](https://github.com/slalom-ggp/dataops-infra/tree/main//catalog/aws/singer-taps/cloudwatch.tf) +* [lambda-notify.tf](https://github.com/slalom-ggp/dataops-infra/tree/main//catalog/aws/singer-taps/lambda-notify.tf) +* [main.tf](https://github.com/slalom-ggp/dataops-infra/tree/main//catalog/aws/singer-taps/main.tf) +* [outputs.tf](https://github.com/slalom-ggp/dataops-infra/tree/main//catalog/aws/singer-taps/outputs.tf) +* [s3-path-parsing.tf](https://github.com/slalom-ggp/dataops-infra/tree/main//catalog/aws/singer-taps/s3-path-parsing.tf) +* [s3-upload.tf](https://github.com/slalom-ggp/dataops-infra/tree/main//catalog/aws/singer-taps/s3-upload.tf) +* [step-functions.tf](https://github.com/slalom-ggp/dataops-infra/tree/main//catalog/aws/singer-taps/step-functions.tf) +* [variables.tf](https://github.com/slalom-ggp/dataops-infra/tree/main//catalog/aws/singer-taps/variables.tf) + +--------------------- + +_**NOTE:** This documentation was auto-generated using +`terraform-docs` and `s-infra` from `slalom.dataops`. +Please do not attempt to manually update this file._ diff --git a/catalog/aws/meltano/USAGE.md b/catalog/aws/meltano/USAGE.md new file mode 100644 index 00000000..58e6a8c0 --- /dev/null +++ b/catalog/aws/meltano/USAGE.md @@ -0,0 +1,39 @@ +## Usage + +This module supports multiple taps per each target. To target multiple destinations, simply create +additional instances of the module. + +### Tap configuration overview + +The `taps` input variable expects a list of specifications for each tap. 
The specification for each +tap should include the following properties: + +- `id` - The name or alias of the tap as registered in the [Singer Index](#singer-index), without the `tap-` prefix. + - Note: in most cases, this is exactly what you'd expect: `mssql` for `tap-mssql`, etc. However, + for forks or experimental releases, this might contain a suffix such as `mssql-test` for a test + version of `tap-mssql` or `snowflake-singer` for the singer edition of `tap-snowflake`. + - A future release will add a separate and optional flag for `owner` or `variant`, in place of the + currently used alias/suffix convention. See the [Singer Index](#singer-index) section for more + info. +- `name` - What you want to call the data source. For instance, if you have multiple SQL Servers, you may want to use a more memorable name such as `finance-system` or `gl-db`. This name should still align with tap naming conventions, which is to say it should be in _lower-case-with-dashes_ format. +- `settings` - A simple map of the tap settings' names to their values. These are specific to each tap and they are required for each tap to work. + - Note: While singer does not distinguish between 'secrets' and 'settings', we should and do treat these two types of config separately. Be sure to put all sensitive config in the `secrets` collection, and not here in `settings`. +- `secrets` - Same as `settings` except for sensitive values. When passing secrets, you specify the setting name in the same way but you _must_ either pass the value as a pointer to the file containing the secret (a config.json file, for instance) or else pass an AWS Secrets Manager ARN. + - _If you pass a Secrets Manager ARN as the config value_, that secret pointer will be passed to the ECS container securely, and only the running container will have access to the secret. 
+ - _If you pass a pointer to a config file_, the module will automatically create a new AWS Secrets Manager secret, upload the secret to AWS Secrets Manager, and then the above process will continue by passing the Secrets Manager pointer _only_ to the running ECS container. + +### Singer Index + +There are actually two Singer Indexes currently available. + +1. The first and primary index for this module today is the tapdance index stored [here](https://github.com/aaronsteers/tapdance/blob/master/docker/singer_index.yml). + +2. This index will be eventually be replaced by a new dedicated [Singer DB](https://github.com/aaronsteers/singer-db), which is still a work-in-progress. + +Note: + +- Both of these sources support multiple versions (forks) of each tap, and both provide a "default" + or "recommended" version for those new users who just want to get started quickly. +- The new [Singer DB](https://github.com/aaronsteers/singer-db) will implement a new "owner" or "variant" + flag to replace the current "alias" technique used by the + [tapdance index](https://github.com/aaronsteers/tapdance/blob/master/docker/singer_index.yml). diff --git a/catalog/aws/meltano/cloudwatch.tf b/catalog/aws/meltano/cloudwatch.tf new file mode 100644 index 00000000..15fac0ba --- /dev/null +++ b/catalog/aws/meltano/cloudwatch.tf @@ -0,0 +1,177 @@ +locals { + cloudwatch_errors_query = < None: + """ + Responds to AWS lambda trigger. + + Parameters + ---------- + event : [type] + The event payload that was submitted to the Lambda function. + context : [type] + A LambdaContext object: + - https://docs.aws.amazon.com/lambda/latest/dg/python-context.html + """ + msg, url = None, None + if "MESSAGE_TEXT" in event: + msg = str(event.pop("MESSAGE_TEXT")) + if "WEBHOOK_URL" in event: + url = str(event.pop("WEBHOOK_URL")) + if url and msg: + post_to_webhook(msg, url, payload=event) + + +def post_to_webhook(msg: str, url: str, payload=None) -> None: + """Post to the webhook. 
+ + Parameters + ---------- + msg : [str] + The message text to post. + url : [str] + The webhook URL. + payload : [dict] + Optional. Additional key-value pairs to attach to the message. + """ + if payload: + msg += "\n\n\n - " + msg += "\n\n - ".join([f"**{k}**: {v}" for k, v in payload.items()]) + json_msg_body = {"text": msg} + encoded_msg = json.dumps(json_msg_body).encode("utf-8") + print({"message": msg, "url": url, "payload": payload}) + resp = http.request("POST", url, body=encoded_msg) + print( + {"message": msg, "url": url, "status_code": resp.status, "response": resp.data} + ) + + +if __name__ == "__main__": + try: + url = sys.argv[1] + except Exception: + raise ValueError("Missing required positional argument 'webhook_url'.") + post_to_webhook( + "This is a test.", url, {"something": "http://slalom.com", "else": "here"} + ) diff --git a/catalog/aws/meltano/main.tf b/catalog/aws/meltano/main.tf new file mode 100644 index 00000000..308a8677 --- /dev/null +++ b/catalog/aws/meltano/main.tf @@ -0,0 +1,109 @@ +/* +* The Singer Taps platform is the open source stack which powers the [Stitcher](https://www.stitcher.com) EL platform. For more information, see [singer.io](https://singer.io) +* +*/ + +# Timezone math: +locals { + tz_hour_offset = ( + contains(["PST"], var.scheduled_timezone) ? -8 : + contains(["PDT"], var.scheduled_timezone) ? -7 : + contains(["MST"], var.scheduled_timezone) ? -7 : + contains(["CST"], var.scheduled_timezone) ? -6 : + contains(["EST"], var.scheduled_timezone) ? -5 : + contains(["UTC", "GMT"], var.scheduled_timezone) ? 0 : + 1 / 0 + # ERROR: currently supported timezone code are: UTC, MST, GMT, CST, EST, PST and PDT + ) +} + +# Target config: +locals { + default_target_def = { + id = "s3-csv" + settings = { + s3_bucket = local.data_lake_storage_bucket + s3_key_prefix = local.data_lake_storage_key_prefix + } + secrets = {} + } + target = var.data_lake_type != "S3" || var.target != null ? 
var.target : local.default_target_def + target_env_prefix = "TARGET_${replace(upper(local.target.id), "-", "_")}_" +} + +# Tap config: +locals { + name_prefix = "${var.name_prefix}Tap-" + tap_env_prefix = [ + for tap in var.taps : + "TAP_${replace(upper(tap.name), "-", "_")}_" + ] + taps_specs = [ + for tap in var.taps : + { + id = tap.id + name = coalesce(lookup(tap, "name", null), tap.id) # default to `id` if `name` not provided. + schedule = coalesce(lookup(tap, "schedule", null), []) # default to no schedule ([]) + settings = tap.settings + secrets = tap.secrets + sync_command = "tapdance sync ${tap.name} ${local.target.id} ${join(" ", var.container_args)}" + image = coalesce( + var.container_image_override, + "dataopstk/tapdance:${tap.id}-to-${local.target.id}${var.container_image_suffix}" + ) + } + ] +} + +module "ecs_cluster" { + source = "../../../components/aws/ecs-cluster" + name_prefix = local.name_prefix + environment = var.environment + resource_tags = var.resource_tags +} + +module "ecs_tap_sync_task" { + count = length(local.taps_specs) + source = "../../../components/aws/ecs-task" + name_prefix = "${local.name_prefix}task${count.index}-" + environment = var.environment + resource_tags = var.resource_tags + ecs_cluster_name = module.ecs_cluster.ecs_cluster_name + container_image = local.taps_specs[count.index].image + container_command = local.taps_specs[count.index].sync_command + container_ram_gb = var.container_ram_gb + container_num_cores = var.container_num_cores + use_private_subnet = var.use_private_subnet + use_fargate = true + permitted_s3_buckets = local.needed_s3_buckets + environment_vars = merge( + { + TAP_CONFIG_DIR = "${var.data_lake_metadata_path}/tap-snapshot-${local.unique_suffix}", + TAP_STATE_FILE = "${coalesce(var.data_lake_storage_path, var.data_lake_metadata_path)}/${var.state_file_naming_scheme}", + PIPELINE_VERSION_NUMBER = var.pipeline_version_number + "${local.tap_env_prefix[count.index]}CONFIG_FILE" = "False" # Config will 
be passed via env vars + "${local.target_env_prefix}CONFIG_FILE" = "False" # Config will be passed via env vars + }, + var.data_lake_logging_path == null ? {} : { + TAP_LOG_DIR = "${var.data_lake_logging_path}/tap-${local.taps_specs[count.index].name}/" + }, + { + for k, v in local.taps_specs[count.index].settings : + "${local.tap_env_prefix[count.index]}${k}" => v + }, + { + for k, v in local.target.settings : + "${local.target_env_prefix}${k}" => v + } + ) + environment_secrets = merge( + { + for k, v in local.taps_specs[count.index].secrets : + "${local.tap_env_prefix[count.index]}${k}" => length(split(":", v)) > 1 ? v : "${v}:${k}" + }, + { + for k, v in local.target.secrets : + "${local.target_env_prefix}${k}" => length(split(":", v)) > 1 ? v : "${v}:${k}" + } + ) +} diff --git a/catalog/aws/meltano/outputs.tf b/catalog/aws/meltano/outputs.tf new file mode 100644 index 00000000..ffa18d13 --- /dev/null +++ b/catalog/aws/meltano/outputs.tf @@ -0,0 +1,24 @@ +output "summary" { + description = "Summary of resources created by this module." 
+ value = < 0 + ]) + source_files_hash = join(",", [ + for filepath in local.source_files : + filebase64sha256("${var.local_metadata_path}/${filepath}") + ]) + unique_hash = md5(local.source_files_hash) + unique_suffix = substr(local.unique_hash, 0, 4) +} + +resource "aws_s3_bucket_object" "s3_source_uploads" { + for_each = local.source_files + # Parse the S3 path into 'bucket' and 'key' values: + # https://gist.github.com/aaronsteers/19eb4d6cba926327f8b25089cb79259b + bucket = split("/", split("//", var.data_lake_metadata_path)[1])[0] + key = join("/", + [ + join("/", slice( + split("/", split("//", var.data_lake_metadata_path)[1]), + 1, + length(split("/", split("//", var.data_lake_metadata_path)[1])) + )), + "tap-snapshot-${local.unique_suffix}/${each.value}" + ] + ) + source = "${var.local_metadata_path}/${each.value}" + tags = var.resource_tags + metadata = {} + # etag = filebase64sha256("${var.local_metadata_path}/${each.value}") +} diff --git a/catalog/aws/meltano/step-functions.tf b/catalog/aws/meltano/step-functions.tf new file mode 100644 index 00000000..2cd2284e --- /dev/null +++ b/catalog/aws/meltano/step-functions.tf @@ -0,0 +1,118 @@ +locals { + state_machine_json = [ + for i, tap_spec in local.taps_specs : + < 0 + # error_message = "One or more tap configurations is missing a required key. Expected minimum config: 'id', 'settings', and 'secrets'." + # } + # validation { + # condition = length([ + # for tap in var.taps : + # tap["id"] + # if length(setsubtract(keys(tap), ["id", "settings", "secrets", "name", "schedule"])) > 0 # unknown keys + # ]) > 0 + # error_message = "One or more tap configurations has an unexpected key. Allowed keys: 'id', 'name', 'schedule', 'settings', and 'secrets'." 
+ # } +} +variable "target" { + description = < Date: Tue, 30 Mar 2021 11:25:28 -0700 Subject: [PATCH 2/6] begin update module variables --- catalog/aws/meltano/variables.tf | 73 +++++++++----------------------- 1 file changed, 19 insertions(+), 54 deletions(-) diff --git a/catalog/aws/meltano/variables.tf b/catalog/aws/meltano/variables.tf index ffd0f4ce..4b55a563 100644 --- a/catalog/aws/meltano/variables.tf +++ b/catalog/aws/meltano/variables.tf @@ -46,60 +46,22 @@ EOF settings = map(string) secrets = map(string) })) - # type = list(map(any)) - # validation { - # condition = length([ - # for tap in var.taps : - # tap["id"] - # if length(setintersect(keys(tap), ["id", "settings", "secrets"])) < 3 # missing required key - # ]) > 0 - # error_message = "One or more tap configurations is missing a required key. Expected minimum config: 'id', 'settings', and 'secrets'." - # } - # validation { - # condition = length([ - # for tap in var.taps : - # tap["id"] - # if length(setsubtract(keys(tap), ["id", "settings", "secrets", "name", "schedule"])) > 0 # unknown keys - # ]) > 0 - # error_message = "One or more tap configurations has an unexpected key. Allowed keys: 'id', 'name', 'schedule', 'settings', and 'secrets'." 
- # } -} -variable "target" { - description = < Date: Sat, 24 Apr 2021 13:51:14 -0700 Subject: [PATCH 3/6] add dockerfile with gitaware bootstrap --- .gitignore | 3 ++ .vscode/settings.json | 4 +++ catalog/aws/meltano/docker/.dockerignore | 2 ++ catalog/aws/meltano/docker/Dockerfile | 22 ++++++++++++++ catalog/aws/meltano/docker/README.md | 38 ++++++++++++++++++++++++ catalog/aws/meltano/docker/bootstrap.sh | 9 ++++++ 6 files changed, 78 insertions(+) create mode 100644 catalog/aws/meltano/docker/.dockerignore create mode 100644 catalog/aws/meltano/docker/Dockerfile create mode 100644 catalog/aws/meltano/docker/README.md create mode 100644 catalog/aws/meltano/docker/bootstrap.sh diff --git a/.gitignore b/.gitignore index 950ec075..6ea5e15a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# Env files +.env + # Config file (comment this line to modify the template): samples/infra-config.yml build diff --git a/.vscode/settings.json b/.vscode/settings.json index 00b826b8..67a9ede8 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -10,4 +10,8 @@ }, "editor.formatOnSave": true, "workbench.colorTheme": "Amethyst Dark", + "python.testing.promptToConfigure": false, + "python.testing.pytestEnabled": false, + "python.testing.unittestEnabled": false, + "python.testing.nosetestsEnabled": false, } \ No newline at end of file diff --git a/catalog/aws/meltano/docker/.dockerignore b/catalog/aws/meltano/docker/.dockerignore new file mode 100644 index 00000000..01f9d551 --- /dev/null +++ b/catalog/aws/meltano/docker/.dockerignore @@ -0,0 +1,2 @@ +* +!*.sh diff --git a/catalog/aws/meltano/docker/Dockerfile b/catalog/aws/meltano/docker/Dockerfile new file mode 100644 index 00000000..bfae9ca0 --- /dev/null +++ b/catalog/aws/meltano/docker/Dockerfile @@ -0,0 +1,22 @@ +FROM meltano/meltano + +# Setup bootstrap scripts: +WORKDIR / + +## Install the local boostrap script +COPY ./bootstrap.sh /bootstrap.sh +RUN chmod +x /bootstrap.sh + +## Install the latest gitenv 
scripts +ENV GITENV_REPO_ROOT=https://raw.githubusercontent.com/dataops-tk/gitenv-init/main +RUN wget ${GITENV_REPO_ROOT}/gitenv-init.sh +RUN wget ${GITENV_REPO_ROOT}/gitenv-bootstrap.sh +RUN chmod +x /gitenv-bootstrap.sh +RUN chmod +x /gitenv-init.sh +ENTRYPOINT [ "/gitenv-bootstrap.sh" ] + +WORKDIR /project + +# Set our bootstrap script to run within gitenv-bootstrap.sh +ENV BASE_BOOTSTRAP=/bootstrap.sh +CMD ["ui"] diff --git a/catalog/aws/meltano/docker/README.md b/catalog/aws/meltano/docker/README.md new file mode 100644 index 00000000..5a91d346 --- /dev/null +++ b/catalog/aws/meltano/docker/README.md @@ -0,0 +1,38 @@ +# Meltano Module - Dockerfile + +This Dockerfile will be used to generate the image which will be built and deployed +to the cloud for serverless execution. + +## `gitenv-init.sh` + +Meltano needs to be git aware in order to dynamically adapt to +projects' changes in real time. I've written the initial git bootstrapping +logic generically in a new [`gitenv-init`](https://github.com/dataops-tk/gitenv-init/blob/main/gitenv-bootstrap.sh) +tool which can be applied to any similar scenario where a runtime +environment needs to be seeded by a specific git asset. + +## Example usage + +### Test locally + +Initialize environment variables: + +```bash +export GIT_REPO=gitlab.com/meltano/singerhub +export GIT_REF=meltano-project +export GIT_USER=username +export GIT_EMAIL=username@example.com + +# Only one of these is required: +export GIT_ACCESS_TOKEN= +export GIT_SSH_PRIVATE_KEY="$(cat /path/to/keyfile)" # SSH not yet supported +``` + +Build and run the image: + +```bash +docker build -t mymelt . && docker run -it --rm -p 5000:5000 -e GIT_REPO -e GIT_REF -e GIT_USER -e GIT_EMAIL --name meltui mymelt + +# Or if you have a `.env` file: +docker build -t mymelt . 
&& docker run -it --rm -p 5000:5000 --env-file=./.env --name meltui mymelt +``` diff --git a/catalog/aws/meltano/docker/bootstrap.sh b/catalog/aws/meltano/docker/bootstrap.sh new file mode 100644 index 00000000..9950d7d9 --- /dev/null +++ b/catalog/aws/meltano/docker/bootstrap.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +set -e # abort on error + +echo "Running 'meltano install'..." +meltano install + +echo "Running 'meltano $@'..." +meltano $@ From c3539f4985e6075953cccdd45ffc2c04fb742df8 Mon Sep 17 00:00:00 2001 From: AJ Steers Date: Sat, 24 Apr 2021 14:04:22 -0700 Subject: [PATCH 4/6] reorganize dockerfile --- catalog/aws/meltano/docker/Dockerfile | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/catalog/aws/meltano/docker/Dockerfile b/catalog/aws/meltano/docker/Dockerfile index bfae9ca0..e3a73e89 100644 --- a/catalog/aws/meltano/docker/Dockerfile +++ b/catalog/aws/meltano/docker/Dockerfile @@ -1,13 +1,8 @@ FROM meltano/meltano -# Setup bootstrap scripts: WORKDIR / -## Install the local boostrap script -COPY ./bootstrap.sh /bootstrap.sh -RUN chmod +x /bootstrap.sh - -## Install the latest gitenv scripts +# Install the latest `gitenv-init` scripts ENV GITENV_REPO_ROOT=https://raw.githubusercontent.com/dataops-tk/gitenv-init/main RUN wget ${GITENV_REPO_ROOT}/gitenv-init.sh RUN wget ${GITENV_REPO_ROOT}/gitenv-bootstrap.sh @@ -15,6 +10,12 @@ RUN chmod +x /gitenv-bootstrap.sh RUN chmod +x /gitenv-init.sh ENTRYPOINT [ "/gitenv-bootstrap.sh" ] +# Meltano-specific steps here: + +# Install the Meltano boostrap script +COPY ./bootstrap.sh /bootstrap.sh +RUN chmod +x /bootstrap.sh + WORKDIR /project # Set our bootstrap script to run within gitenv-bootstrap.sh From 6c3fd7558f8d5f36140db88afe50677e00c29b53 Mon Sep 17 00:00:00 2001 From: AJ Steers Date: Sat, 24 Apr 2021 14:08:24 -0700 Subject: [PATCH 5/6] singer layer install --- catalog/aws/meltano/docker/Dockerfile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git 
a/catalog/aws/meltano/docker/Dockerfile b/catalog/aws/meltano/docker/Dockerfile index e3a73e89..8e1e1a1a 100644 --- a/catalog/aws/meltano/docker/Dockerfile +++ b/catalog/aws/meltano/docker/Dockerfile @@ -3,11 +3,11 @@ FROM meltano/meltano WORKDIR / # Install the latest `gitenv-init` scripts -ENV GITENV_REPO_ROOT=https://raw.githubusercontent.com/dataops-tk/gitenv-init/main -RUN wget ${GITENV_REPO_ROOT}/gitenv-init.sh -RUN wget ${GITENV_REPO_ROOT}/gitenv-bootstrap.sh -RUN chmod +x /gitenv-bootstrap.sh -RUN chmod +x /gitenv-init.sh +RUN GITENV_REPO_ROOT=https://raw.githubusercontent.com/dataops-tk/gitenv-init/main && \ + wget ${GITENV_REPO_ROOT}/gitenv-init.sh && \ + wget ${GITENV_REPO_ROOT}/gitenv-bootstrap.sh && \ + chmod +x /gitenv-bootstrap.sh && \ + chmod +x /gitenv-init.sh ENTRYPOINT [ "/gitenv-bootstrap.sh" ] # Meltano-specific steps here: From 77b9f6163b8cf99aca8a3d289237beecc45f3dd9 Mon Sep 17 00:00:00 2001 From: AJ Steers Date: Mon, 3 May 2021 22:56:51 -0700 Subject: [PATCH 6/6] bug fixes --- catalog/aws/dbt/main.tf | 4 +-- catalog/aws/meltano/cloudwatch.tf | 2 +- catalog/aws/meltano/main.tf | 42 +++++++---------------- catalog/aws/meltano/s3-upload.tf | 8 ++--- catalog/aws/meltano/step-functions.tf | 2 +- catalog/aws/meltano/variables.tf | 38 +++++++++----------- catalog/aws/singer-taps/main.tf | 2 +- catalog/aws/singer-taps/step-functions.tf | 2 +- 8 files changed, 39 insertions(+), 61 deletions(-) diff --git a/catalog/aws/dbt/main.tf b/catalog/aws/dbt/main.tf index d09db18e..d6ca5c3a 100644 --- a/catalog/aws/dbt/main.tf +++ b/catalog/aws/dbt/main.tf @@ -8,7 +8,7 @@ locals { name_prefix = "${var.name_prefix}DBT-" admin_cidr = var.admin_cidr admin_ports = ["8080", "10000"] - tz_hour_offset = ( + tz_utc_offset = ( contains(["PST"], var.scheduled_timezone) ? -8 : contains(["PDT"], var.scheduled_timezone) ? -7 : contains(["MST"], var.scheduled_timezone) ? 
-7 : @@ -49,7 +49,7 @@ module "ecs_task" { "cron(${ tonumber(substr(cron_expr, 2, 2)) } ${ - (24 + tonumber(substr(cron_expr, 0, 2)) - local.tz_hour_offset) % 24 + (24 + tonumber(substr(cron_expr, 0, 2)) - local.tz_utc_offset) % 24 } * * ? *)" ] ]) diff --git a/catalog/aws/meltano/cloudwatch.tf b/catalog/aws/meltano/cloudwatch.tf index 15fac0ba..2737aa90 100644 --- a/catalog/aws/meltano/cloudwatch.tf +++ b/catalog/aws/meltano/cloudwatch.tf @@ -16,7 +16,7 @@ filter @message not like /INFO\sUsed/ EOF dashboard_names = [ for i, tap_spec in local.taps_specs : - "${tap_spec.name}${i}-to-${local.target.id}-v${var.pipeline_version_number}-${var.name_prefix}-TapDashboard" + "${tap_spec.name}${i}-to-${var.default_target}-v${var.pipeline_version_number}-${var.name_prefix}-TapDashboard" ] dashboard_urls = [ for dashboard_name in local.dashboard_names : diff --git a/catalog/aws/meltano/main.tf b/catalog/aws/meltano/main.tf index 308a8677..a02b3a57 100644 --- a/catalog/aws/meltano/main.tf +++ b/catalog/aws/meltano/main.tf @@ -3,53 +3,37 @@ * */ -# Timezone math: -locals { - tz_hour_offset = ( - contains(["PST"], var.scheduled_timezone) ? -8 : - contains(["PDT"], var.scheduled_timezone) ? -7 : - contains(["MST"], var.scheduled_timezone) ? -7 : - contains(["CST"], var.scheduled_timezone) ? -6 : - contains(["EST"], var.scheduled_timezone) ? -5 : - contains(["UTC", "GMT"], var.scheduled_timezone) ? 0 : - 1 / 0 - # ERROR: currently supported timezone code are: UTC, MST, GMT, CST, EST, PST and PDT - ) -} +data "local_file" "meltano_yml" { filename = var.meltano_yml_path } # meltano.yml is read here and decoded with yamldecode() in the locals that follow # Target config: locals { - default_target_def = { - id = "s3-csv" - settings = { - s3_bucket = local.data_lake_storage_bucket - s3_key_prefix = local.data_lake_storage_key_prefix - } - secrets = {} - } - target = var.data_lake_type != "S3" || var.target != null ? 
var.target : local.default_target_def - target_env_prefix = "TARGET_${replace(upper(local.target.id), "-", "_")}_" + meltano_config = yamldecode(data.local_file.meltano_yml.content) + local_metadata_path = abspath("${var.meltano_yml_path}/..") # folder that contains meltano.yml + taps = local.meltano_config["extractors"] + target = local.meltano_config["loaders"][var.default_target] + target_env_prefix = "TARGET_${replace(upper(var.default_target), "-", "_")}_" } # Tap config: locals { name_prefix = "${var.name_prefix}Tap-" + tap_env_prefix = [ - for tap in var.taps : + for tap in local.taps : "TAP_${replace(upper(tap.name), "-", "_")}_" ] taps_specs = [ - for tap in var.taps : + for tap in local.taps : { id = tap.id name = coalesce(lookup(tap, "name", null), tap.id) # default to `id` if `name` not provided. schedule = coalesce(lookup(tap, "schedule", null), []) # default to no schedule ([]) settings = tap.settings secrets = tap.secrets - sync_command = "tapdance sync ${tap.name} ${local.target.id} ${join(" ", var.container_args)}" + sync_command = "tapdance sync ${tap.name} ${var.default_target} ${join(" ", var.container_args)}" image = coalesce( var.container_image_override, - "dataopstk/tapdance:${tap.id}-to-${local.target.id}${var.container_image_suffix}" + "dataopstk/tapdance:${tap.id}-to-${var.default_target}${var.container_image_suffix}" ) } ] @@ -71,8 +55,8 @@ module "ecs_tap_sync_task" { ecs_cluster_name = module.ecs_cluster.ecs_cluster_name container_image = local.taps_specs[count.index].image container_command = local.taps_specs[count.index].sync_command - container_ram_gb = var.container_ram_gb - container_num_cores = var.container_num_cores + container_ram_gb = var.elt_container_ram_gb + container_num_cores = var.elt_container_num_cores use_private_subnet = var.use_private_subnet use_fargate = true permitted_s3_buckets = local.needed_s3_buckets diff --git a/catalog/aws/meltano/s3-upload.tf b/catalog/aws/meltano/s3-upload.tf index 282ef420..f0d6545a 100644 ---
a/catalog/aws/meltano/s3-upload.tf +++ b/catalog/aws/meltano/s3-upload.tf @@ -1,6 +1,6 @@ locals { source_files = toset([ - for f in fileset(var.local_metadata_path, "*") : + for f in fileset(local.local_metadata_path, "*") : f if length([ for tap_name in local.taps_specs.*.name : @@ -10,7 +10,7 @@ locals { ]) source_files_hash = join(",", [ for filepath in local.source_files : - filebase64sha256("${var.local_metadata_path}/${filepath}") + filebase64sha256("${local.local_metadata_path}/${filepath}") ]) unique_hash = md5(local.source_files_hash) unique_suffix = substr(local.unique_hash, 0, 4) @@ -31,8 +31,8 @@ resource "aws_s3_bucket_object" "s3_source_uploads" { "tap-snapshot-${local.unique_suffix}/${each.value}" ] ) - source = "${var.local_metadata_path}/${each.value}" + source = "${local.local_metadata_path}/${each.value}" tags = var.resource_tags metadata = {} - # etag = filebase64sha256("${var.local_metadata_path}/${each.value}") + # etag = filebase64sha256("${local.local_metadata_path}/${each.value}") -- NOTE(review): left disabled; if re-enabled, the AWS provider documents filemd5() as the only meaningful etag value - confirm before use } diff --git a/catalog/aws/meltano/step-functions.tf b/catalog/aws/meltano/step-functions.tf index 2cd2284e..30be156b 100644 --- a/catalog/aws/meltano/step-functions.tf +++ b/catalog/aws/meltano/step-functions.tf @@ -111,7 +111,7 @@ module "step_function" { "cron(${ tonumber(substr(cron_expr, 2, 2)) } ${ - (24 + tonumber(substr(cron_expr, 0, 2)) - local.tz_hour_offset) % 24 + (24 + tonumber(substr(cron_expr, 0, 2)) - var.tz_utc_offset) % 24 } * * ? *)" ] state_machine_definition = local.state_machine_json[count.index] diff --git a/catalog/aws/meltano/variables.tf b/catalog/aws/meltano/variables.tf index 4b55a563..57cf8b76 100644 --- a/catalog/aws/meltano/variables.tf +++ b/catalog/aws/meltano/variables.tf @@ -24,30 +24,19 @@ variable "resource_tags" { ### Custom variables for this module ### ######################################## -# Tap and Target Config - -variable "taps" { - description = <