From e498f3407edfb1594a419aabfc5a98a49b26f073 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Niesiob=C4=99dzki?= Date: Tue, 12 Aug 2025 18:43:27 +0000 Subject: [PATCH] Add support for GPUs in Cloud Run --- .github/pull_request_template.md | 4 + modules/cloud-run-v2/README.md | 136 +++++++++++++++--- modules/cloud-run-v2/job.tf | 18 ++- modules/cloud-run-v2/service.tf | 14 ++ modules/cloud-run-v2/variables.tf | 25 +++- modules/cloud-run-v2/workerpool.tf | 17 ++- .../cloud_run_v2/examples/gpu-job.yaml | 61 ++++++++ .../cloud_run_v2/examples/gpu-service.yaml | 71 +++++++++ .../cloud_run_v2/examples/gpu-workerpool.yaml | 61 ++++++++ 9 files changed, 379 insertions(+), 28 deletions(-) create mode 100644 tests/modules/cloud_run_v2/examples/gpu-job.yaml create mode 100644 tests/modules/cloud_run_v2/examples/gpu-service.yaml create mode 100644 tests/modules/cloud_run_v2/examples/gpu-workerpool.yaml diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 5ddfe1da0..3d187fafd 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -25,4 +25,8 @@ If your code introduces any breaking changes, uncomment and complete the section ```upgrade-note `modules/project`: example upgrade note 2. ``` +```upgrade-note +`terraform-google-provider`: version updated to X.XX, because ... 
+``` + --> diff --git a/modules/cloud-run-v2/README.md b/modules/cloud-run-v2/README.md index 9bbb4408e..5bf58b818 100644 --- a/modules/cloud-run-v2/README.md +++ b/modules/cloud-run-v2/README.md @@ -21,6 +21,7 @@ Cloud Run Services and Jobs, with support for IAM roles and Eventarc trigger cre - [Creating Cloud Run Jobs](#creating-cloud-run-jobs) - [Tag bindings](#tag-bindings) - [IAP Configuration](#iap-configuration) +- [Adding GPUs](#adding-gpus) - [Variables](#variables) - [Outputs](#outputs) - [Fixtures](#fixtures) @@ -931,31 +932,128 @@ module "cloud_run" { } # tftest modules=1 resources=2 e2e ``` + +## Adding GPUs +GPU support is available for all types of Cloud Run resources: jobs, services and worker pools. + +```hcl +module "job" { + source = "./fabric/modules/cloud-run-v2" + project_id = var.project_id + name = "job" + region = var.region + launch_stage = "BETA" + revision = { + gpu_zonal_redundancy_disabled = true + node_selector = { + accelerator = "nvidia-l4" + } + } + type = "JOB" + containers = { + hello = { + image = "us-docker.pkg.dev/cloudrun/container/hello" + resources = { + limits = { + cpu = "4000m" + memory = "16Gi" + "nvidia.com/gpu" = "1" + } + } + } + } + deletion_protection = false +} +# tftest inventory=gpu-job.yaml e2e +``` + +```hcl +module "service" { + source = "./fabric/modules/cloud-run-v2" + project_id = var.project_id + name = "service" + region = var.region + revision = { + gpu_zonal_redundancy_disabled = true + node_selector = { + accelerator = "nvidia-l4" + } + } + service_config = { + gen2_execution_environment = true + } + containers = { + hello = { + image = "us-docker.pkg.dev/cloudrun/container/hello" + resources = { + limits = { + cpu = "4000m" + memory = "16Gi" + "nvidia.com/gpu" = "1" + } + } + } + } + deletion_protection = false +} +# tftest inventory=gpu-service.yaml e2e +``` + +```hcl +module "worker" { + source = "./fabric/modules/cloud-run-v2" + project_id = var.project_id + name = "worker" + region = var.region + 
launch_stage = "ALPHA" + revision = { + gpu_zonal_redundancy_disabled = true + node_selector = { + accelerator = "nvidia-l4" + } + } + type = "WORKERPOOL" + containers = { + hello = { + image = "us-docker.pkg.dev/cloudrun/container/hello" + resources = { + limits = { + cpu = "4000m" + memory = "16Gi" + "nvidia.com/gpu" = "1" + } + } + } + } + deletion_protection = false +} +# tftest inventory=gpu-workerpool.yaml e2e +``` ## Variables | name | description | type | required | default | |---|---|:---:|:---:|:---:| -| [name](variables.tf#L143) | Name used for Cloud Run service. | string | ✓ | | -| [project_id](variables.tf#L148) | Project id used for all resources. | string | ✓ | | -| [region](variables.tf#L153) | Region used for all resources. | string | ✓ | | -| [containers](variables.tf#L17) | Containers in name => attributes format. | map(object({…})) | | {} | -| [deletion_protection](variables.tf#L80) | Deletion protection setting for this Cloud Run service. | string | | null | -| [encryption_key](variables.tf#L86) | The full resource name of the Cloud KMS CryptoKey. | string | | null | -| [iam](variables.tf#L92) | IAM bindings for Cloud Run service in {ROLE => [MEMBERS]} format. | map(list(string)) | | {} | -| [job_config](variables.tf#L98) | Cloud Run Job specific configuration. | object({…}) | | {} | -| [labels](variables.tf#L113) | Resource labels. | map(string) | | {} | -| [launch_stage](variables.tf#L119) | The launch stage as defined by Google Cloud Platform Launch Stages. | string | | null | -| [managed_revision](variables.tf#L136) | Whether the Terraform module should control the deployment of revisions. | bool | | true | -| [revision](variables.tf#L158) | Revision template configurations. | object({…}) | | {} | -| [service_account](variables.tf#L214) | Service account email. Unused if service account is auto-created. | string | | null | -| [service_account_create](variables.tf#L220) | Auto-create service account. 
| bool | | false | -| [service_config](variables.tf#L226) | Cloud Run service specific configuration options. | object({…}) | | {} | -| [tag_bindings](variables.tf#L290) | Tag bindings for this service, in key => tag value id format. | map(string) | | {} | -| [type](variables.tf#L297) | Type of Cloud Run resource to deploy: JOB, SERVICE or WORKERPOOL. | string | | "SERVICE" | -| [volumes](variables.tf#L307) | Named volumes in containers in name => attributes format. | map(object({…})) | | {} | +| [name](variables.tf#L152) | Name used for Cloud Run service. | string | ✓ | | +| [project_id](variables.tf#L157) | Project id used for all resources. | string | ✓ | | +| [region](variables.tf#L162) | Region used for all resources. | string | ✓ | | +| [containers](variables.tf#L17) | Containers in name => attributes format. | map(object({…})) | | {} | +| [deletion_protection](variables.tf#L89) | Deletion protection setting for this Cloud Run service. | string | | null | +| [encryption_key](variables.tf#L95) | The full resource name of the Cloud KMS CryptoKey. | string | | null | +| [iam](variables.tf#L101) | IAM bindings for Cloud Run service in {ROLE => [MEMBERS]} format. | map(list(string)) | | {} | +| [job_config](variables.tf#L107) | Cloud Run Job specific configuration. | object({…}) | | {} | +| [labels](variables.tf#L122) | Resource labels. | map(string) | | {} | +| [launch_stage](variables.tf#L128) | The launch stage as defined by Google Cloud Platform Launch Stages. | string | | null | +| [managed_revision](variables.tf#L145) | Whether the Terraform module should control the deployment of revisions. | bool | | true | +| [revision](variables.tf#L167) | Revision template configurations. | object({…}) | | {} | +| [service_account](variables.tf#L227) | Service account email. Unused if service account is auto-created. | string | | null | +| [service_account_create](variables.tf#L233) | Auto-create service account. 
| bool | | false | +| [service_config](variables.tf#L239) | Cloud Run service specific configuration options. | object({…}) | | {} | +| [tag_bindings](variables.tf#L303) | Tag bindings for this service, in key => tag value id format. | map(string) | | {} | +| [type](variables.tf#L310) | Type of Cloud Run resource to deploy: JOB, SERVICE or WORKERPOOL. | string | | "SERVICE" | +| [volumes](variables.tf#L320) | Named volumes in containers in name => attributes format. | map(object({…})) | | {} | | [vpc_connector_create](variables-vpcconnector.tf#L17) | Populate this to create a Serverless VPC Access connector. | object({…}) | | null | -| [workerpool_config](variables.tf#L341) | Cloud Run Worker Pool specific configuration. | object({…}) | | {} | +| [workerpool_config](variables.tf#L354) | Cloud Run Worker Pool specific configuration. | object({…}) | | {} | ## Outputs diff --git a/modules/cloud-run-v2/job.tf b/modules/cloud-run-v2/job.tf index 0cb87c2eb..106482f0d 100644 --- a/modules/cloud-run-v2/job.tf +++ b/modules/cloud-run-v2/job.tf @@ -27,7 +27,14 @@ resource "google_cloud_run_v2_job" "job" { labels = var.revision.labels task_count = var.job_config.task_count template { - encryption_key = var.encryption_key + encryption_key = var.encryption_key + gpu_zonal_redundancy_disabled = var.revision.gpu_zonal_redundancy_disabled + dynamic "node_selector" { + for_each = var.revision.node_selector == null ? [] : [""] + content { + accelerator = var.revision.node_selector.accelerator + } + } dynamic "vpc_access" { for_each = local.connector == null ? [] : [""] content { @@ -222,7 +229,14 @@ resource "google_cloud_run_v2_job" "job_unmanaged" { labels = var.revision.labels task_count = var.job_config.task_count template { - encryption_key = var.encryption_key + encryption_key = var.encryption_key + gpu_zonal_redundancy_disabled = var.revision.gpu_zonal_redundancy_disabled + dynamic "node_selector" { + for_each = var.revision.node_selector == null ? 
[] : [""] + content { + accelerator = var.revision.node_selector.accelerator + } + } dynamic "vpc_access" { for_each = local.connector == null ? [] : [""] content { diff --git a/modules/cloud-run-v2/service.tf b/modules/cloud-run-v2/service.tf index 07948963e..05074e687 100644 --- a/modules/cloud-run-v2/service.tf +++ b/modules/cloud-run-v2/service.tf @@ -36,7 +36,14 @@ resource "google_cloud_run_v2_service" "service" { var.service_config.gen2_execution_environment ? "EXECUTION_ENVIRONMENT_GEN2" : "EXECUTION_ENVIRONMENT_GEN1" ) + gpu_zonal_redundancy_disabled = var.revision.gpu_zonal_redundancy_disabled max_instance_request_concurrency = var.service_config.max_concurrency + dynamic "node_selector" { + for_each = var.revision.node_selector == null ? [] : [""] + content { + accelerator = var.revision.node_selector.accelerator + } + } dynamic "scaling" { for_each = var.service_config.scaling == null ? [] : [""] content { @@ -279,7 +286,14 @@ resource "google_cloud_run_v2_service" "service_unmanaged" { var.service_config.gen2_execution_environment ? "EXECUTION_ENVIRONMENT_GEN2" : "EXECUTION_ENVIRONMENT_GEN1" ) + gpu_zonal_redundancy_disabled = var.revision.gpu_zonal_redundancy_disabled max_instance_request_concurrency = var.service_config.max_concurrency + dynamic "node_selector" { + for_each = var.revision.node_selector == null ? [] : [""] + content { + accelerator = var.revision.node_selector.accelerator + } + } dynamic "scaling" { for_each = var.service_config.scaling == null ? 
[] : [""] content { diff --git a/modules/cloud-run-v2/variables.tf b/modules/cloud-run-v2/variables.tf index 38b800006..a540a17a9 100644 --- a/modules/cloud-run-v2/variables.tf +++ b/modules/cloud-run-v2/variables.tf @@ -46,10 +46,7 @@ variable "containers" { name = optional(string) }))) resources = optional(object({ - limits = optional(object({ - cpu = string - memory = string - })) + limits = optional(map(string)) cpu_idle = optional(bool) startup_cpu_boost = optional(bool) })) @@ -75,6 +72,18 @@ variable "containers" { })) default = {} nullable = false + + validation { + condition = alltrue([ + for c in var.containers : ( + try(c.resources.limits, null) == null ? true : 0 == length(setsubtract( + keys(c.resources.limits), + ["cpu", "memory", "nvidia.com/gpu"] + )) + ) + ]) + error_message = "Only following resource limits are available: 'cpu', 'memory' and 'nvidia.com/gpu'." + } } variable "deletion_protection" { @@ -158,8 +167,12 @@ variable "region" { variable "revision" { description = "Revision template configurations." type = object({ - labels = optional(map(string)) - name = optional(string) + gpu_zonal_redundancy_disabled = optional(bool) + labels = optional(map(string)) + name = optional(string) + node_selector = optional(object({ + accelerator = string + })) vpc_access = optional(object({ connector = optional(string) egress = optional(string) diff --git a/modules/cloud-run-v2/workerpool.tf b/modules/cloud-run-v2/workerpool.tf index 3c4f52bd5..ba8ced516 100644 --- a/modules/cloud-run-v2/workerpool.tf +++ b/modules/cloud-run-v2/workerpool.tf @@ -39,6 +39,13 @@ resource "google_cloud_run_v2_worker_pool" "default_managed" { encryption_key = var.encryption_key revision = local.revision_name + gpu_zonal_redundancy_disabled = var.revision.gpu_zonal_redundancy_disabled + dynamic "node_selector" { + for_each = var.revision.node_selector == null ? 
[] : [""] + content { + accelerator = var.revision.node_selector.accelerator + } + } # Serverless VPC connector is not supported # dynamic "vpc_access" { # for_each = local.connector == null ? [] : [""] @@ -178,7 +185,7 @@ resource "google_cloud_run_v2_worker_pool" "default_managed" { } resource "google_cloud_run_v2_worker_pool" "default_unmanaged" { - count = var.type == "WORKERPOOL" && var.managed_revision ? 1 : 0 + count = var.type == "WORKERPOOL" && !var.managed_revision ? 1 : 0 provider = google-beta project = var.project_id location = var.region @@ -202,6 +209,14 @@ resource "google_cloud_run_v2_worker_pool" "default_unmanaged" { encryption_key = var.encryption_key revision = local.revision_name + gpu_zonal_redundancy_disabled = var.revision.gpu_zonal_redundancy_disabled + dynamic "node_selector" { + for_each = var.revision.node_selector == null ? [] : [""] + content { + accelerator = var.revision.node_selector.accelerator + } + } + # Serverless VPC connector is not supported # dynamic "vpc_access" { # for_each = local.connector == null ? [] : [""] diff --git a/tests/modules/cloud_run_v2/examples/gpu-job.yaml b/tests/modules/cloud_run_v2/examples/gpu-job.yaml new file mode 100644 index 000000000..da21bdf32 --- /dev/null +++ b/tests/modules/cloud_run_v2/examples/gpu-job.yaml @@ -0,0 +1,61 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +values: + module.job.google_cloud_run_v2_job.job[0]: + annotations: null + binary_authorization: [] + client: null + client_version: null + deletion_protection: false + effective_labels: + goog-terraform-provisioned: 'true' + labels: null + location: europe-west8 + name: job + project: project-id + run_execution_token: null + start_execution_token: null + template: + - annotations: null + labels: null + template: + - containers: + - args: null + command: null + depends_on: null + env: [] + image: us-docker.pkg.dev/cloudrun/container/hello + name: hello + ports: [] + resources: + - limits: + nvidia.com/gpu: '1' + volume_mounts: [] + working_dir: null + encryption_key: null + gpu_zonal_redundancy_disabled: true + max_retries: 3 + node_selector: + - accelerator: nvidia-l4 + volumes: [] + vpc_access: [] + terraform_labels: + goog-terraform-provisioned: 'true' + timeouts: null + +counts: + google_cloud_run_v2_job: 1 + modules: 1 + resources: 1 diff --git a/tests/modules/cloud_run_v2/examples/gpu-service.yaml b/tests/modules/cloud_run_v2/examples/gpu-service.yaml new file mode 100644 index 000000000..2467695e1 --- /dev/null +++ b/tests/modules/cloud_run_v2/examples/gpu-service.yaml @@ -0,0 +1,71 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +values: + module.service.google_cloud_run_v2_service.service[0]: + annotations: null + binary_authorization: [] + build_config: [] + client: null + client_version: null + custom_audiences: null + default_uri_disabled: null + deletion_protection: false + description: null + effective_labels: + goog-terraform-provisioned: 'true' + iap_enabled: false + invoker_iam_disabled: false + labels: null + location: europe-west8 + name: service + project: project-id + scaling: [] + template: + - annotations: null + containers: + - args: null + base_image_uri: null + command: null + depends_on: null + env: [] + image: us-docker.pkg.dev/cloudrun/container/hello + liveness_probe: [] + name: hello + resources: + - cpu_idle: null + limits: + nvidia.com/gpu: '1' + startup_cpu_boost: null + volume_mounts: [] + working_dir: null + encryption_key: null + execution_environment: EXECUTION_ENVIRONMENT_GEN2 + gpu_zonal_redundancy_disabled: true + labels: null + node_selector: + - accelerator: nvidia-l4 + revision: null + service_mesh: [] + session_affinity: null + volumes: [] + vpc_access: [] + terraform_labels: + goog-terraform-provisioned: 'true' + timeouts: null + +counts: + google_cloud_run_v2_service: 1 + modules: 1 + resources: 1 diff --git a/tests/modules/cloud_run_v2/examples/gpu-workerpool.yaml b/tests/modules/cloud_run_v2/examples/gpu-workerpool.yaml new file mode 100644 index 000000000..46fa53713 --- /dev/null +++ b/tests/modules/cloud_run_v2/examples/gpu-workerpool.yaml @@ -0,0 +1,61 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +values: + module.worker.google_cloud_run_v2_worker_pool.default_managed[0]: + annotations: null + binary_authorization: [] + client: null + client_version: null + custom_audiences: null + deletion_protection: false + description: null + effective_labels: + goog-terraform-provisioned: 'true' + labels: null + location: europe-west8 + name: worker + project: project-id + template: + - annotations: null + containers: + - args: null + command: null + depends_on: null + env: [] + image: us-docker.pkg.dev/cloudrun/container/hello + name: hello + resources: + - limits: + nvidia.com/gpu: '1' + volume_mounts: [] + working_dir: null + encryption_key: null + encryption_key_revocation_action: null + encryption_key_shutdown_duration: null + gpu_zonal_redundancy_disabled: true + labels: null + node_selector: + - accelerator: nvidia-l4 + revision: null + volumes: [] + vpc_access: [] + terraform_labels: + goog-terraform-provisioned: 'true' + timeouts: null + +counts: + google_cloud_run_v2_worker_pool: 1 + modules: 1 + resources: 1