From 58301c9edaa98a23d328978427a85223e0d0956f Mon Sep 17 00:00:00 2001 From: Abhishek Date: Wed, 27 May 2026 11:00:26 +0100 Subject: [PATCH] Add containerd_config support to gke-nodepool (#3973) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add ephemeral_storage_local_ssd_config support to modules/gke-nodepool Adds ephemeral_storage_local_ssd_count to node_config variable and the corresponding dynamic ephemeral_storage_local_ssd_config block in the node pool resource, enabling use of local SSDs as ephemeral storage. * feat(gke-nodepool): add flex_start support to node_config Add `flex_start` as an optional bool to the `node_config` variable type and wire it through to the `google_container_node_pool` resource's node_config block. This enables DWS (Dynamic Workload Scheduler) flex-start mode for node pools, used for on-demand capacity access without requiring ProvisioningRequest objects (e.g. spot TPU pools). * feat(gke-nodepool): add flex_start support to node_config Add `flex_start` as an optional bool to the `node_config` variable type and wire it through to the `google_container_node_pool` resource's node_config block. This enables DWS (Dynamic Workload Scheduler) flex-start mode for node pools, which allows the Cluster Autoscaler to request capacity on-demand without requiring ProvisioningRequest objects (unlike queued_provisioning). Typical use case is spot TPU node pools. * feat(gke-nodepool): add advanced_machine_features support to node_config Add `advanced_machine_features` as an optional object to the `node_config` variable type and wire it through to the `google_container_node_pool` resource via a dynamic block. This allows callers to configure `threads_per_core` (e.g. set to 1 to disable hyperthreading) and `enable_nested_virtualization` for node pools that require fine-grained CPU threading control or nested hypervisor support. 
GKE auto-sets `advanced_machine_features` (threads_per_core=1) on ct6e/TPU machine types; exposing this field also lets consumers add it to ignore_changes in their own lifecycle blocks to avoid forced replacements. * feat(gke-nodepool): add containerd_config support to node_config Add `containerd_config` as an optional object to the `node_config` variable and wire it through to the `google_container_node_pool` resource via a dynamic block. This allows callers to configure private registry mirrors or custom containerd registry hosts per node pool — useful for air-gapped environments and internal registry proxies. The `registry_hosts` list maps each upstream server to one or more mirror hosts, with optional `capabilities`, `override_path`, and `dial_timeout` fields (all defaulting to sensible values). * refactor(gke-nodepool): use maps for containerd_config registry_hosts and hosts Convert registry_hosts and hosts from lists to maps so that the registry server and host URLs serve as stable keys, avoiding index-shifting issues with for_each. Add default values for capabilities, override_path, and dial_timeout. Update README example and test inventory accordingly. * Remove default values from containerd_config hosts fields Leave capabilities, override_path, and dial_timeout without defaults so the provider/API picks them rather than the module imposing values. 
* Refine containerd_config variable interface - Simplify header to optional(map(list(string))) - Flatten ca, client cert/key to strings with descriptive names - Derive private_registry_access_config enabled from ca domain config list - Simplify writable_cgroups to optional(bool) - Flatten gcp_secret_manager_certificate_config to string - Remove redundant defaults where try() handles null in main.tf - Fix long lines in main.tf to stay within 79-char limit - Update copyright year to 2026 in inventory files * fix(gke-nodepool): run terraform fmt to fix attribute alignment in containerd_config * docs(gke-nodepool): regenerate README with updated variable line numbers * fix(gke-nodepool): use coalesce instead of try for null header map in for_each * tests(gke-nodepool): update containerd-config inventory to match actual plan output --------- Co-authored-by: Julio Castillo --- modules/gke-nodepool/README.md | 49 +++++++--- modules/gke-nodepool/main.tf | 95 +++++++++++++++++++ modules/gke-nodepool/variables.tf | 22 +++++ .../examples/advanced-machine-features.yaml | 28 ++++++ .../examples/containerd-config.yaml | 38 ++++++++ 5 files changed, 221 insertions(+), 11 deletions(-) create mode 100644 tests/modules/gke_nodepool/examples/advanced-machine-features.yaml create mode 100644 tests/modules/gke_nodepool/examples/containerd-config.yaml diff --git a/modules/gke-nodepool/README.md b/modules/gke-nodepool/README.md index 713e924f4..dd27eb3e0 100644 --- a/modules/gke-nodepool/README.md +++ b/modules/gke-nodepool/README.md @@ -230,7 +230,34 @@ module "cluster-1-nodepool-advanced-machine-features" { } } } -# tftest modules=1 resources=1 +# tftest modules=1 resources=1 inventory=advanced-machine-features.yaml +``` + +### Containerd registry mirror configuration + +This example shows how to configure a private registry mirror for containerd on each node, useful for air-gapped environments or when pulling images through an internal registry proxy. 
+ +```hcl +module "cluster-1-nodepool-containerd" { + source = "./fabric/modules/gke-nodepool" + project_id = "myproject" + cluster_name = "cluster-1" + location = "europe-west4-a" + name = "nodepool-containerd" + node_config = { + machine_type = "n2-standard-4" + containerd_config = { + registry_hosts = { + "registry.example.com" = { + hosts = { + "mirror.example.com" = {} + } + } + } + } + } +} +# tftest modules=1 resources=1 inventory=containerd-config.yaml ``` ## Variables @@ -239,7 +266,7 @@ module "cluster-1-nodepool-advanced-machine-features" { |---|---|:---:|:---:|:---:| | [cluster_name](variables.tf#L23) | Cluster name. | string | ✓ | | | [location](variables.tf#L48) | Cluster location. | string | ✓ | | -| [project_id](variables.tf#L229) | Cluster project id. | string | ✓ | | +| [project_id](variables.tf#L251) | Cluster project id. | string | ✓ | | | [cluster_id](variables.tf#L17) | Cluster id. Optional, but providing cluster_id is recommended to prevent cluster misconfiguration in some of the edge cases. | string | | null | | [gke_version](variables.tf#L28) | Kubernetes nodes version. Ignored if auto_upgrade is set in management_config. | string | | null | | [k8s_labels](variables.tf#L34) | Kubernetes labels applied to each node. | map(string) | | {} | @@ -248,15 +275,15 @@ module "cluster-1-nodepool-advanced-machine-features" { | [name](variables.tf#L59) | Optional nodepool name. | string | | null | | [network_config](variables.tf#L65) | Network configuration. | object({…}) | | null | | [node_config](variables.tf#L89) | Node-level configuration. | object({…}) | | {} | -| [node_count](variables.tf#L175) | Number of nodes per instance group. Initial value can only be changed by recreation, current is ignored when autoscaling is used. | object({…}) | | {…} | -| [node_locations](variables.tf#L187) | Node locations. | list(string) | | null | -| [nodepool_config](variables.tf#L193) | Nodepool-level configuration. 
| object({…}) | | null | -| [reservation_affinity](variables.tf#L234) | Configuration of the desired reservation which instances could take capacity from. | object({…}) | | null | -| [resource_manager_tags](variables.tf#L244) | A map of resource manager tag keys and values to be attached to the nodes for managing Compute Engine firewalls using Network Firewall Policies. | map(string) | | null | -| [service_account](variables.tf#L250) | Nodepool service account. If this variable is set to null, the default GCE service account will be used. If set and email is null, a service account will be created. If scopes are null a default will be used. | object({…}) | | {} | -| [sole_tenant_nodegroup](variables.tf#L262) | Sole tenant node group. | string | | null | -| [tags](variables.tf#L268) | Network tags applied to nodes. | list(string) | | null | -| [taints](variables.tf#L274) | Kubernetes taints applied to all nodes. | map(object({…})) | | {} | +| [node_count](variables.tf#L197) | Number of nodes per instance group. Initial value can only be changed by recreation, current is ignored when autoscaling is used. | object({…}) | | {…} | +| [node_locations](variables.tf#L209) | Node locations. | list(string) | | null | +| [nodepool_config](variables.tf#L215) | Nodepool-level configuration. | object({…}) | | null | +| [reservation_affinity](variables.tf#L256) | Configuration of the desired reservation which instances could take capacity from. | object({…}) | | null | +| [resource_manager_tags](variables.tf#L266) | A map of resource manager tag keys and values to be attached to the nodes for managing Compute Engine firewalls using Network Firewall Policies. | map(string) | | null | +| [service_account](variables.tf#L272) | Nodepool service account. If this variable is set to null, the default GCE service account will be used. If set and email is null, a service account will be created. If scopes are null a default will be used. 
| object({…}) | | {} | +| [sole_tenant_nodegroup](variables.tf#L284) | Sole tenant node group. | string | | null | +| [tags](variables.tf#L290) | Network tags applied to nodes. | list(string) | | null | +| [taints](variables.tf#L296) | Kubernetes taints applied to all nodes. | map(object({…})) | | {} | ## Outputs diff --git a/modules/gke-nodepool/main.tf b/modules/gke-nodepool/main.tf index 37624edc8..56c41475b 100644 --- a/modules/gke-nodepool/main.tf +++ b/modules/gke-nodepool/main.tf @@ -345,5 +345,100 @@ resource "google_container_node_pool" "nodepool" { threads_per_core = var.node_config.advanced_machine_features.threads_per_core } } + dynamic "containerd_config" { + for_each = var.node_config.containerd_config != null ? [""] : [] + content { + dynamic "private_registry_access_config" { + for_each = try(var.node_config.containerd_config.private_registry_access_config, null) != null ? [""] : [] + content { + enabled = ( + length(coalesce( + var.node_config.containerd_config + .private_registry_access_config + .certificate_authority_domain_config, + [] + )) > 0 + ) + dynamic "certificate_authority_domain_config" { + for_each = coalesce( + var.node_config.containerd_config + .private_registry_access_config + .certificate_authority_domain_config, + [] + ) + content { + fqdns = certificate_authority_domain_config.value.fqdns + gcp_secret_manager_certificate_config { + secret_uri = ( + certificate_authority_domain_config.value + .gcp_secret_manager_certificate_config_secret_uri + ) + } + } + } + } + } + dynamic "writable_cgroups" { + for_each = var.node_config.containerd_config.writable_cgroups != null ? 
[""] : [] + content { + enabled = var.node_config.containerd_config.writable_cgroups + } + } + dynamic "registry_hosts" { + for_each = try(var.node_config.containerd_config.registry_hosts, {}) + content { + server = registry_hosts.key + dynamic "hosts" { + for_each = registry_hosts.value.hosts + content { + host = hosts.key + capabilities = hosts.value.capabilities + override_path = hosts.value.override_path + dial_timeout = hosts.value.dial_timeout + dynamic "header" { + for_each = coalesce(hosts.value.header, {}) + content { + key = header.key + value = header.value + } + } + dynamic "ca" { + for_each = ( + hosts.value.ca_gcp_secret_manager_secret_uri != null + ? [hosts.value.ca_gcp_secret_manager_secret_uri] + : [] + ) + content { + gcp_secret_manager_secret_uri = ca.value + } + } + dynamic "client" { + for_each = ( + hosts.value.client != null ? [hosts.value.client] : [] + ) + content { + cert { + gcp_secret_manager_secret_uri = ( + client.value.cert_gcp_secret_manager_secret_uri + ) + } + dynamic "key" { + for_each = ( + client.value.key_gcp_secret_manager_secret_uri != null + ? 
[client.value.key_gcp_secret_manager_secret_uri] + : [] + ) + content { + gcp_secret_manager_secret_uri = key.value + } + } + } + } + } + } + } + } + } + } } } diff --git a/modules/gke-nodepool/variables.tf b/modules/gke-nodepool/variables.tf index 886a95457..89319f023 100644 --- a/modules/gke-nodepool/variables.tf +++ b/modules/gke-nodepool/variables.tf @@ -149,6 +149,28 @@ variable "node_config" { enable_nested_virtualization = optional(bool) threads_per_core = optional(number) })) + containerd_config = optional(object({ + private_registry_access_config = optional(object({ + certificate_authority_domain_config = optional(list(object({ + fqdns = list(string) + gcp_secret_manager_certificate_config_secret_uri = string + }))) + })) + writable_cgroups = optional(bool) + registry_hosts = optional(map(object({ + hosts = optional(map(object({ + capabilities = optional(list(string)) + override_path = optional(bool) + dial_timeout = optional(string) + header = optional(map(list(string))) + ca_gcp_secret_manager_secret_uri = optional(string) + client = optional(object({ + cert_gcp_secret_manager_secret_uri = string + key_gcp_secret_manager_secret_uri = optional(string) + })) + })), {}) + })), {}) + })) }) default = {} nullable = false diff --git a/tests/modules/gke_nodepool/examples/advanced-machine-features.yaml b/tests/modules/gke_nodepool/examples/advanced-machine-features.yaml new file mode 100644 index 000000000..3e4b6d33c --- /dev/null +++ b/tests/modules/gke_nodepool/examples/advanced-machine-features.yaml @@ -0,0 +1,28 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +values: + module.cluster-1-nodepool-advanced-machine-features.google_container_node_pool.nodepool: + cluster: cluster-1 + location: europe-west4-a + name: nodepool-advanced-machine-features + project: myproject + node_config: + - machine_type: n2-standard-4 + advanced_machine_features: + - threads_per_core: 1 + enable_nested_virtualization: null + +counts: + google_container_node_pool: 1 diff --git a/tests/modules/gke_nodepool/examples/containerd-config.yaml b/tests/modules/gke_nodepool/examples/containerd-config.yaml new file mode 100644 index 000000000..46f18ec34 --- /dev/null +++ b/tests/modules/gke_nodepool/examples/containerd-config.yaml @@ -0,0 +1,38 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +values: + module.cluster-1-nodepool-containerd.google_container_node_pool.nodepool: + cluster: cluster-1 + location: europe-west4-a + name: nodepool-containerd + project: myproject + node_config: + - machine_type: n2-standard-4 + containerd_config: + - private_registry_access_config: [] + writable_cgroups: [] + registry_hosts: + - server: registry.example.com + hosts: + - host: mirror.example.com + capabilities: null + override_path: null + dial_timeout: null + header: [] + ca: [] + client: [] + +counts: + google_container_node_pool: 1