diff --git a/README.md b/README.md
index 5ee89ff..30c7921 100644
--- a/README.md
+++ b/README.md
@@ -49,13 +49,16 @@ Then perform the following commands on the root folder:
 
 | Name | Description | Type | Default | Required |
 |------|-------------|:----:|:-----:|:-----:|
+| machine\_type | The machine type to use for the job. | string | `""` | no |
 | max\_workers | The number of workers permitted to work on the job. More workers may improve processing speed at additional cost. | string | `"1"` | no |
 | name | The name of the dataflow job | string | n/a | yes |
+| network\_self\_link | The network self link to which VMs will be assigned. | string | `"default"` | no |
 | on\_delete | One of drain or cancel. Specifies behavior of deletion during terraform destroy. The default is cancel. | string | `"cancel"` | no |
 | parameters | Key/Value pairs to be passed to the Dataflow job (as used in the template). | map | `` | no |
 | project\_id | The project in which the resource belongs. If it is not provided, the provider project is used. | string | n/a | yes |
 | region | The bucket's region location | string | `"us-central1"` | no |
 | service\_account\_email | The Service Account email that will be used to identify the VMs in which the jobs are running | string | `""` | no |
+| subnetwork\_self\_link | The subnetwork self link to which VMs will be assigned. | string | `""` | no |
 | temp\_gcs\_location | A writeable location on GCS for the Dataflow job to dump its temporary data. | string | n/a | yes |
 | template\_gcs\_path | The GCS path to the Dataflow job template. | string | n/a | yes |
 | zone | The zone in which the created job should run. If it is not provided, the provider zone is used. | string | `"us-central1-a"` | no |
@@ -103,7 +106,8 @@ If you want to use the service_account_email input to specify a service account
 
 ### Enable APIs
 In order to launch a Dataflow Job, the Dataflow API must be enabled:
 
-- Dataflow API - dataflow.googleapis.com
+- Dataflow API - `dataflow.googleapis.com`
+- Compute Engine API - `compute.googleapis.com`
 
 ## Install
diff --git a/examples/simple_example/README.md b/examples/simple_example/README.md
index 8087dec..06dbe5c 100644
--- a/examples/simple_example/README.md
+++ b/examples/simple_example/README.md
@@ -1,6 +1,7 @@
 # Simple Example
 
 This example illustrates how to use the Dataflow module to start multiple jobs with a common bucket for temporary job data.
+A network and subnetwork are created as well to demonstrate how Dataflow instances can be created in a specific network and subnetwork.
 
 ## Best practices
 
@@ -9,6 +10,12 @@ This example illustrates how to use the Dataflow module to start multiple jobs w
 
 As featured in this example, using a single regional bucket for storing your jobs' temporary data is recommended to optimize cost. Also, to optimize your jobs performance, this bucket should always in the corresponding region of the zones in which your jobs are running.
+## Running the example
+Make sure you grant the additional permissions below to the service account executing the module:
+
+- roles/compute.networkAdmin
+
+
 ### Controller Service Account
 This example features the use of a controller service accoun which is specified with the `service_account_email` input variables.
 We recommend using a custome service account with fine-grained access control to mitigate security risks.
 See more about controller service accounts [here](https://cloud.google.com/dataflow/docs/concepts/security-and-permissions#controller_service_account)
@@ -20,6 +27,7 @@ We recommend using a custome service account with fine-grained access control to
 
 | Name | Description | Type | Default | Required |
 |------|-------------|:----:|:-----:|:-----:|
+| force\_destroy | When deleting a bucket, this boolean option will delete all contained objects. If you try to delete a bucket that contains objects, Terraform will fail that run. | string | `"false"` | no |
 | project\_id | The project ID to deploy to | string | n/a | yes |
 | region | The region in which the bucket and the dataflow job will be deployed | string | n/a | yes |
 | service\_account\_email | The Service Account email used to create the job. | string | n/a | yes |
diff --git a/examples/simple_example/main.tf b/examples/simple_example/main.tf
index e7a91de..539cb6e 100644
--- a/examples/simple_example/main.tf
+++ b/examples/simple_example/main.tf
@@ -15,7 +15,7 @@
  */
 
 provider "google" {
-  version = "~> 2.0"
+  version = "~> 2.8.0"
   region  = "${var.region}"
 }
 
@@ -27,23 +27,49 @@ locals {
   gcs_bucket_name = "tmp-dir-bucket-${random_id.random_suffix.hex}"
 }
 
+module "vpc" {
+  source       = "terraform-google-modules/network/google"
+  version      = "~> 0.8.0"
+  project_id   = "${var.project_id}"
+  network_name = "dataflow-network"
+
+  subnets = [
+    {
+      subnet_name   = "dataflow-subnetwork"
+      subnet_ip     = "10.1.3.0/24"
+      subnet_region = "us-central1"
+    },
+  ]
+
+  secondary_ranges = {
+    dataflow-subnetwork = [{
+      range_name    = "my-secondary-range"
+      ip_cidr_range = "192.168.64.0/24"
+    }]
+  }
+}
+
 module "dataflow-bucket" {
   source     = "../../modules/dataflow_bucket"
-  name   = "${local.gcs_bucket_name}"
-  region = "${var.region}"
+  name          = "${local.gcs_bucket_name}"
+  region        = "${var.region}"
   project_id = "${var.project_id}"
+  force_destroy = "${var.force_destroy}"
 }
 
 module "dataflow-job" {
   source                = "../../"
   project_id            = "${var.project_id}"
-  name = "wordcount-terraform-example"
+  name                  = "wordcount-terraform-example"
   on_delete             = "cancel"
   zone                  = "${var.region}-a"
   max_workers           = 1
   template_gcs_path     = "gs://dataflow-templates/latest/Word_Count"
   temp_gcs_location     = "${module.dataflow-bucket.name}"
   service_account_email = "${var.service_account_email}"
+  network_self_link     = "${module.vpc.network_self_link}"
+  subnetwork_self_link  = "${module.vpc.subnets_self_links[0]}"
+  machine_type          = "n1-standard-1"
 
   parameters = {
     inputFile = "gs://dataflow-samples/shakespeare/kinglear.txt"
@@ -54,13 +80,16 @@ module "dataflow-job" {
 module "dataflow-job-2" {
   source                = "../../"
   project_id            = "${var.project_id}"
-  name = "wordcount-terraform-example-2"
+  name                  = "wordcount-terraform-example-2"
   on_delete             = "cancel"
   zone                  = "${var.region}-a"
   max_workers           = 1
   template_gcs_path     = "gs://dataflow-templates/latest/Word_Count"
   temp_gcs_location     = "${module.dataflow-bucket.name}"
   service_account_email = "${var.service_account_email}"
+  network_self_link     = "${module.vpc.network_self_link}"
+  subnetwork_self_link  = "${module.vpc.subnets_self_links[0]}"
+  machine_type          = "n1-standard-2"
 
   parameters = {
     inputFile = "gs://dataflow-samples/shakespeare/kinglear.txt"
diff --git a/examples/simple_example/variables.tf b/examples/simple_example/variables.tf
index 532ecee..06df9db 100644
--- a/examples/simple_example/variables.tf
+++ b/examples/simple_example/variables.tf
@@ -25,3 +25,8 @@ variable "region" {
 variable "service_account_email" {
   description = "The Service Account email used to create the job."
 }
+
+variable "force_destroy" {
+  description = "When deleting a bucket, this boolean option will delete all contained objects. If you try to delete a bucket that contains objects, Terraform will fail that run."
+  default     = "false"
+}
diff --git a/main.tf b/main.tf
index e0c4e39..84eb0c7 100644
--- a/main.tf
+++ b/main.tf
@@ -25,4 +25,7 @@ resource "google_dataflow_job" "dataflow_job" {
   temp_gcs_location     = "gs://${var.temp_gcs_location}/tmp_dir"
   parameters            = "${var.parameters}"
   service_account_email = "${var.service_account_email}"
+  network               = "${replace(var.network_self_link, "/(.*)/networks/(.*)/", "$2")}"
+  subnetwork            = "${replace(var.subnetwork_self_link, "/(.*)/regions/(.*)/", "regions/$2")}"
+  machine_type          = "${var.machine_type}"
 }
diff --git a/modules/dataflow_bucket/README.md b/modules/dataflow_bucket/README.md
index fd7872c..5ee693f 100644
--- a/modules/dataflow_bucket/README.md
+++ b/modules/dataflow_bucket/README.md
@@ -20,6 +20,7 @@ See [here](../example/simple_example) for a multi jobs example.
 
 | Name | Description | Type | Default | Required |
 |------|-------------|:----:|:-----:|:-----:|
+| force\_destroy | When deleting a bucket, this boolean option will delete all contained objects. If you try to delete a bucket that contains objects, Terraform will fail that run. | string | `"false"` | no |
 | name | The name of the bucket. | string | n/a | yes |
 | project\_id | The project_id to deploy the example instance into. (e.g. "simple-sample-project-1234") | string | n/a | yes |
 | region | The GCS bucket region. This should be the same as your dataflow job's zone ot optimize performance. | string | `"us-central1"` | no |
diff --git a/modules/dataflow_bucket/main.tf b/modules/dataflow_bucket/main.tf
index 0db3446..b051cee 100644
--- a/modules/dataflow_bucket/main.tf
+++ b/modules/dataflow_bucket/main.tf
@@ -20,4 +20,5 @@ resource "google_storage_bucket" "tmp_dir_bucket" {
   location      = "${var.region}"
   storage_class = "REGIONAL"
   project       = "${var.project_id}"
+  force_destroy = "${var.force_destroy}"
 }
diff --git a/modules/dataflow_bucket/outputs.tf b/modules/dataflow_bucket/outputs.tf
index fd94c16..44b184e 100644
--- a/modules/dataflow_bucket/outputs.tf
+++ b/modules/dataflow_bucket/outputs.tf
@@ -6,4 +6,4 @@ output "name" {
 output "region" {
   description = "The bucket's region location"
   value       = "${var.region}"
-}
\ No newline at end of file
+}
diff --git a/modules/dataflow_bucket/variables.tf b/modules/dataflow_bucket/variables.tf
index f231cae..5faaa70 100644
--- a/modules/dataflow_bucket/variables.tf
+++ b/modules/dataflow_bucket/variables.tf
@@ -10,3 +10,8 @@ variable "region" {
 variable "name" {
   description = "The name of the bucket."
 }
+
+variable "force_destroy" {
+  description = "When deleting a bucket, this boolean option will delete all contained objects. If you try to delete a bucket that contains objects, Terraform will fail that run."
+ default = "false" +} diff --git a/test/fixtures/simple_example/main.tf b/test/fixtures/simple_example/main.tf index afc278a..293b8db 100644 --- a/test/fixtures/simple_example/main.tf +++ b/test/fixtures/simple_example/main.tf @@ -19,4 +19,5 @@ module "example" { project_id = "${var.project_id}" region = "${var.region}" service_account_email = "${var.service_account_email}" + force_destroy = "true" } diff --git a/variables.tf b/variables.tf index bd09066..046572b 100644 --- a/variables.tf +++ b/variables.tf @@ -58,3 +58,18 @@ variable "region" { description = "The bucket's region location" default = "us-central1" } + +variable "subnetwork_self_link" { + description = "The subnetwork self link to which VMs will be assigned." + default = "" +} + +variable "network_self_link" { + description = "The network self link to which VMs will be assigned." + default = "default" +} + +variable "machine_type" { + description = "The machine type to use for the job." + default = "" +}