diff --git a/CHANGELOG.md b/CHANGELOG.md
index 23097fc16..dc57bb683 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,8 +5,14 @@ All notable changes to this project will be documented in this file.
## [Unreleased]
+### BLUEPRINTS
+
+- [[#2841](https://github.com/GoogleCloudPlatform/cloud-foundation-fabric/pull/2841)] Add cAdvisor Metrics to Autopilot/Standard GKE Cluster ([HeiglAnna](https://github.com/HeiglAnna))
+
### MODULES
+- [[#2855](https://github.com/GoogleCloudPlatform/cloud-foundation-fabric/pull/2855)] Add support for advanced machine features to compute-vm ([ludoo](https://github.com/ludoo))
+- [[#2841](https://github.com/GoogleCloudPlatform/cloud-foundation-fabric/pull/2841)] Add cAdvisor Metrics to Autopilot/Standard GKE Cluster ([HeiglAnna](https://github.com/HeiglAnna))
- [[#2852](https://github.com/GoogleCloudPlatform/cloud-foundation-fabric/pull/2852)] Allow universe-bound projects to exclude services ([juliocc](https://github.com/juliocc))
- [[#2848](https://github.com/GoogleCloudPlatform/cloud-foundation-fabric/pull/2848)] Support project creation in different universes ([juliocc](https://github.com/juliocc))
- [[#2842](https://github.com/GoogleCloudPlatform/cloud-foundation-fabric/pull/2842)] Refactor data catalog tag template module ([ludoo](https://github.com/ludoo))
diff --git a/blueprints/gke/autopilot/cluster.tf b/blueprints/gke/autopilot/cluster.tf
index 1e0cd0cd4..cfeee02d2 100644
--- a/blueprints/gke/autopilot/cluster.tf
+++ b/blueprints/gke/autopilot/cluster.tf
@@ -46,6 +46,7 @@ module "cluster" {
# enable_pod_metrics = true
# enable_statefulset_metrics = true
# enable_storage_metrics = true
+ # enable_cadvisor_metrics = true
# }
# cluster_autoscaling = {
# auto_provisioning_defaults = {
diff --git a/blueprints/gke/patterns/autopilot-cluster/cluster.tf b/blueprints/gke/patterns/autopilot-cluster/cluster.tf
index e2b94c647..59415f9fb 100644
--- a/blueprints/gke/patterns/autopilot-cluster/cluster.tf
+++ b/blueprints/gke/patterns/autopilot-cluster/cluster.tf
@@ -107,6 +107,7 @@ module "cluster" {
enable_api_server_metrics = true
enable_controller_manager_metrics = true
enable_scheduler_metrics = true
+ enable_cadvisor_metrics = true
}
logging_config = {
enable_api_server_logs = true
diff --git a/modules/compute-vm/README.md b/modules/compute-vm/README.md
index 97e653b6e..393400091 100644
--- a/modules/compute-vm/README.md
+++ b/modules/compute-vm/README.md
@@ -31,6 +31,7 @@ In both modes, an optional service account can be created and assigned to either
- [Spot VM](#spot-vm)
- [Confidential compute](#confidential-compute)
- [Disk encryption with Cloud KMS](#disk-encryption-with-cloud-kms)
+ - [Advanced machine features](#advanced-machine-features)
- [Instance template](#instance-template)
- [Instance group](#instance-group)
- [Instance Schedule](#instance-schedule)
@@ -614,6 +615,31 @@ module "kms-vm-example" {
# tftest inventory=cmek.yaml e2e
```
+### Advanced machine features
+
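+Advanced machine features (nested virtualization, turbo mode, UEFI networking, performance monitoring unit, threads per core and visible core count) can be configured via the `advanced_machine_features` attribute of the `options` variable.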
+
+```hcl
+module "simple-vm-example" {
+ source = "./fabric/modules/compute-vm"
+ project_id = var.project_id
+ zone = "${var.region}-b"
+ name = "test"
+ network_interfaces = [{
+ network = var.vpc.self_link
+ subnetwork = var.subnet.self_link
+ }]
+ options = {
+ advanced_machine_features = {
+ enable_nested_virtualization = true
+ enable_turbo_mode = true
+ threads_per_core = 2
+ }
+ }
+}
+# tftest modules=1 resources=1
+```
+
### Instance template
This example shows how to use the module to manage an instance template that defines an additional attached disk for each instance, and overrides defaults for the boot disk image and service account.
@@ -886,8 +912,8 @@ module "sole-tenancy" {
|---|---|:---:|:---:|:---:|
| [name](variables.tf#L264) | Instance name. | string | ✓ | |
| [network_interfaces](variables.tf#L276) | Network interfaces configuration. Use self links for Shared VPC, set addresses to null if not needed. | list(object({…})) | ✓ | |
-| [project_id](variables.tf#L322) | Project id. | string | ✓ | |
-| [zone](variables.tf#L420) | Compute zone. | string | ✓ | |
+| [project_id](variables.tf#L345) | Project id. | string | ✓ | |
+| [zone](variables.tf#L443) | Compute zone. | string | ✓ | |
| [attached_disk_defaults](variables.tf#L17) | Defaults for attached disks options. | object({…}) | | {…} |
| [attached_disks](variables.tf#L37) | Additional disks, if options is null defaults will be used in its place. Source type is one of 'image' (zonal disks in vms and template), 'snapshot' (vm), 'existing', and null. | list(object({…})) | | [] |
| [boot_disk](variables.tf#L83) | Boot disk properties. | object({…}) | | {…} |
@@ -907,14 +933,14 @@ module "sole-tenancy" {
| [metadata](variables.tf#L252) | Instance metadata. | map(string) | | {} |
| [min_cpu_platform](variables.tf#L258) | Minimum CPU platform. | string | | null |
| [network_attached_interfaces](variables.tf#L269) | Network interfaces using network attachments. | list(string) | | [] |
-| [options](variables.tf#L292) | Instance options. | object({…}) | | {…} |
-| [scratch_disks](variables.tf#L327) | Scratch disks configuration. | object({…}) | | {…} |
-| [service_account](variables.tf#L339) | Service account email and scopes. If email is null, the default Compute service account will be used unless auto_create is true, in which case a service account will be created. Set the variable to null to avoid attaching a service account. | object({…}) | | {} |
-| [shielded_config](variables.tf#L349) | Shielded VM configuration of the instances. | object({…}) | | null |
-| [snapshot_schedules](variables.tf#L359) | Snapshot schedule resource policies that can be attached to disks. | map(object({…})) | | {} |
-| [tag_bindings](variables.tf#L402) | Resource manager tag bindings for this instance, in tag key => tag value format. | map(string) | | null |
-| [tag_bindings_firewall](variables.tf#L408) | Firewall (network scoped) tag bindings for this instance, in tag key => tag value format. | map(string) | | null |
-| [tags](variables.tf#L414) | Instance network tags for firewall rule targets. | list(string) | | [] |
+| [options](variables.tf#L292) | Instance options. | object({…}) | | {…} |
+| [scratch_disks](variables.tf#L350) | Scratch disks configuration. | object({…}) | | {…} |
+| [service_account](variables.tf#L362) | Service account email and scopes. If email is null, the default Compute service account will be used unless auto_create is true, in which case a service account will be created. Set the variable to null to avoid attaching a service account. | object({…}) | | {} |
+| [shielded_config](variables.tf#L372) | Shielded VM configuration of the instances. | object({…}) | | null |
+| [snapshot_schedules](variables.tf#L382) | Snapshot schedule resource policies that can be attached to disks. | map(object({…})) | | {} |
+| [tag_bindings](variables.tf#L425) | Resource manager tag bindings for this instance, in tag key => tag value format. | map(string) | | null |
+| [tag_bindings_firewall](variables.tf#L431) | Firewall (network scoped) tag bindings for this instance, in tag key => tag value format. | map(string) | | null |
+| [tags](variables.tf#L437) | Instance network tags for firewall rule targets. | list(string) | | [] |
## Outputs
diff --git a/modules/compute-vm/main.tf b/modules/compute-vm/main.tf
index 1a1c97e2b..bf215af06 100644
--- a/modules/compute-vm/main.tf
+++ b/modules/compute-vm/main.tf
@@ -15,6 +15,7 @@
*/
locals {
+ advanced_mf = var.options.advanced_machine_features
attached_disks = {
for disk in var.attached_disks :
(disk.name != null ? disk.name : disk.device_name) => merge(disk, {
@@ -165,6 +166,20 @@ resource "google_compute_instance" "default" {
metadata = var.metadata
resource_policies = local.ischedule_attach
+  dynamic "advanced_machine_features" {
+    for_each = local.advanced_mf != null ? [""] : []
+    content {
+      enable_nested_virtualization = local.advanced_mf.enable_nested_virtualization
+      enable_uefi_networking       = local.advanced_mf.enable_uefi_networking
+      performance_monitoring_unit  = local.advanced_mf.performance_monitoring_unit
+      threads_per_core             = local.advanced_mf.threads_per_core
+      # enable_turbo_mode is optional with no default, so it is null when omitted;
+      # compare with true explicitly since a conditional rejects a null condition
+      turbo_mode         = local.advanced_mf.enable_turbo_mode == true ? "ALL_CORE_MAX" : null
+      visible_core_count = local.advanced_mf.visible_core_count
+    }
+  }
+
dynamic "attached_disk" {
for_each = local.attached_disks_zonal
iterator = config
@@ -369,6 +384,20 @@ resource "google_compute_instance_template" "default" {
labels = var.labels
resource_manager_tags = local.tags_combined
+  dynamic "advanced_machine_features" {
+    for_each = local.advanced_mf != null ? [""] : []
+    content {
+      enable_nested_virtualization = local.advanced_mf.enable_nested_virtualization
+      enable_uefi_networking       = local.advanced_mf.enable_uefi_networking
+      performance_monitoring_unit  = local.advanced_mf.performance_monitoring_unit
+      threads_per_core             = local.advanced_mf.threads_per_core
+      # enable_turbo_mode is optional with no default, so it is null when omitted;
+      # compare with true explicitly since a conditional rejects a null condition
+      turbo_mode         = local.advanced_mf.enable_turbo_mode == true ? "ALL_CORE_MAX" : null
+      visible_core_count = local.advanced_mf.visible_core_count
+    }
+  }
+
disk {
auto_delete = var.boot_disk.auto_delete
boot = true
diff --git a/modules/compute-vm/variables.tf b/modules/compute-vm/variables.tf
index 8d6e8bc37..6e84daa84 100644
--- a/modules/compute-vm/variables.tf
+++ b/modules/compute-vm/variables.tf
@@ -292,6 +292,14 @@ variable "network_interfaces" {
variable "options" {
description = "Instance options."
type = object({
+ advanced_machine_features = optional(object({
+ enable_nested_virtualization = optional(bool)
+ enable_turbo_mode = optional(bool)
+ enable_uefi_networking = optional(bool)
+ performance_monitoring_unit = optional(string)
+ threads_per_core = optional(number)
+ visible_core_count = optional(number)
+ }))
allow_stopping_for_update = optional(bool, true)
deletion_protection = optional(bool, false)
max_run_duration = optional(object({
@@ -312,11 +320,26 @@ variable "options" {
termination_action = null
}
validation {
- condition = (var.options.termination_action == null
+ condition = (
+ var.options.termination_action == null
||
- contains(["STOP", "DELETE"], coalesce(var.options.termination_action, "1")))
+ contains(["STOP", "DELETE"], coalesce(var.options.termination_action, "1"))
+ )
error_message = "Allowed values for options.termination_action are 'STOP', 'DELETE' and null."
}
+  validation {
+    # A null value (attribute or whole advanced_machine_features object unset)
+    # is valid; coalesce() swaps null for a placeholder before contains() runs.
+    condition = (
+      try(var.options.advanced_machine_features.performance_monitoring_unit, null) == null
+      ||
+      contains(["ARCHITECTURAL", "ENHANCED", "STANDARD"], coalesce(
+        try(var.options.advanced_machine_features.performance_monitoring_unit, null),
+        "-"
+      ))
+    )
+    error_message = "Allowed values for options.advanced_machine_features.performance_monitoring_unit are 'ARCHITECTURAL', 'ENHANCED', 'STANDARD' and null."
+  }
}
variable "project_id" {
diff --git a/modules/gke-cluster-autopilot/README.md b/modules/gke-cluster-autopilot/README.md
index e73a0b6c3..bfc0613d2 100644
--- a/modules/gke-cluster-autopilot/README.md
+++ b/modules/gke-cluster-autopilot/README.md
@@ -146,6 +146,7 @@ module "cluster-1" {
secondary_range_names = {} # use default names "pods" and "services"
}
monitoring_config = {
+ enable_cadvisor_metrics = true
enable_daemonset_metrics = true
enable_deployment_metrics = true
enable_hpa_metrics = true
@@ -205,9 +206,9 @@ module "cluster-1" {
| name | description | type | required | default |
|---|---|:---:|:---:|:---:|
| [location](variables.tf#L143) | Autopilot clusters are always regional. | string | ✓ | |
-| [name](variables.tf#L220) | Cluster name. | string | ✓ | |
-| [project_id](variables.tf#L251) | Cluster project ID. | string | ✓ | |
-| [vpc_config](variables.tf#L267) | VPC-level configuration. | object({…}) | ✓ | |
+| [name](variables.tf#L222) | Cluster name. | string | ✓ | |
+| [project_id](variables.tf#L253) | Cluster project ID. | string | ✓ | |
+| [vpc_config](variables.tf#L269) | VPC-level configuration. | object({…}) | ✓ | |
| [access_config](variables.tf#L17) | Control plane endpoint and nodes access configurations. | object({…}) | | {} |
| [backup_configs](variables.tf#L42) | Configuration for Backup for GKE. | object({…}) | | {} |
| [deletion_protection](variables.tf#L63) | Whether or not to allow Terraform to destroy the cluster. Unless this field is set to false in Terraform state, a terraform destroy or terraform apply that would delete the cluster will fail. | bool | | true |
@@ -219,10 +220,10 @@ module "cluster-1" {
| [logging_config](variables.tf#L148) | Logging configuration. | object({…}) | | {} |
| [maintenance_config](variables.tf#L159) | Maintenance window configuration. | object({…}) | | {…} |
| [min_master_version](variables.tf#L182) | Minimum version of the master, defaults to the version of the most recent official release. | string | | null |
-| [monitoring_config](variables.tf#L188) | Monitoring configuration. System metrics collection cannot be disabled. Control plane metrics are optional. Kube state metrics are optional. Google Cloud Managed Service for Prometheus is enabled by default. | object({…}) | | {} |
-| [node_config](variables.tf#L225) | Configuration for nodes and nodepools. | object({…}) | | {} |
-| [node_locations](variables.tf#L244) | Zones in which the cluster's nodes are located. | list(string) | | [] |
-| [release_channel](variables.tf#L256) | Release channel for GKE upgrades. Clusters created in the Autopilot mode must use a release channel. Choose between \"RAPID\", \"REGULAR\", and \"STABLE\". | string | | "REGULAR" |
+| [monitoring_config](variables.tf#L188) | Monitoring configuration. System metrics collection cannot be disabled. Control plane metrics are optional. Kube state metrics are optional. Google Cloud Managed Service for Prometheus is enabled by default. | object({…}) | | {} |
+| [node_config](variables.tf#L227) | Configuration for nodes and nodepools. | object({…}) | | {} |
+| [node_locations](variables.tf#L246) | Zones in which the cluster's nodes are located. | list(string) | | [] |
+| [release_channel](variables.tf#L258) | Release channel for GKE upgrades. Clusters created in the Autopilot mode must use a release channel. Choose between \"RAPID\", \"REGULAR\", and \"STABLE\". | string | | "REGULAR" |
## Outputs
diff --git a/modules/gke-cluster-autopilot/main.tf b/modules/gke-cluster-autopilot/main.tf
index af93d684a..54259b704 100644
--- a/modules/gke-cluster-autopilot/main.tf
+++ b/modules/gke-cluster-autopilot/main.tf
@@ -241,6 +241,7 @@ resource "google_container_cluster" "cluster" {
var.monitoring_config.enable_pod_metrics ? "POD" : null,
var.monitoring_config.enable_statefulset_metrics ? "STATEFULSET" : null,
var.monitoring_config.enable_storage_metrics ? "STORAGE" : null,
+ var.monitoring_config.enable_cadvisor_metrics ? "CADVISOR" : null,
]))
managed_prometheus {
enabled = var.monitoring_config.enable_managed_prometheus
diff --git a/modules/gke-cluster-autopilot/variables.tf b/modules/gke-cluster-autopilot/variables.tf
index 53122c945..17b53faf8 100644
--- a/modules/gke-cluster-autopilot/variables.tf
+++ b/modules/gke-cluster-autopilot/variables.tf
@@ -199,6 +199,7 @@ variable "monitoring_config" {
enable_pod_metrics = optional(bool, false)
enable_statefulset_metrics = optional(bool, false)
enable_storage_metrics = optional(bool, false)
+ enable_cadvisor_metrics = optional(bool, false)
# Google Cloud Managed Service for Prometheus. Autopilot clusters version >= 1.25 must have this on.
enable_managed_prometheus = optional(bool, true)
})
@@ -212,6 +213,7 @@ variable "monitoring_config" {
var.monitoring_config.enable_pod_metrics,
var.monitoring_config.enable_statefulset_metrics,
var.monitoring_config.enable_storage_metrics,
+ var.monitoring_config.enable_cadvisor_metrics,
]) ? var.monitoring_config.enable_managed_prometheus : true
error_message = "Kube state metrics collection requires Google Cloud Managed Service for Prometheus to be enabled."
}
diff --git a/modules/gke-cluster-standard/README.md b/modules/gke-cluster-standard/README.md
index c73a1555b..ec1bccaec 100644
--- a/modules/gke-cluster-standard/README.md
+++ b/modules/gke-cluster-standard/README.md
@@ -231,6 +231,7 @@ module "cluster-1" {
secondary_range_names = {} # use default names "pods" and "services"
}
monitoring_config = {
+ enable_cadvisor_metrics = true
enable_daemonset_metrics = true
enable_deployment_metrics = true
enable_hpa_metrics = true
@@ -428,9 +429,9 @@ module "cluster-1" {
| name | description | type | required | default |
|---|---|:---:|:---:|:---:|
| [location](variables.tf#L267) | Cluster zone or region. | string | ✓ | |
-| [name](variables.tf#L379) | Cluster name. | string | ✓ | |
-| [project_id](variables.tf#L412) | Cluster project id. | string | ✓ | |
-| [vpc_config](variables.tf#L423) | VPC-level configuration. | object({…}) | ✓ | |
+| [name](variables.tf#L382) | Cluster name. | string | ✓ | |
+| [project_id](variables.tf#L415) | Cluster project id. | string | ✓ | |
+| [vpc_config](variables.tf#L426) | VPC-level configuration. | object({…}) | ✓ | |
| [access_config](variables.tf#L17) | Control plane endpoint and nodes access configurations. | object({…}) | | {} |
| [backup_configs](variables.tf#L42) | Configuration for Backup for GKE. | object({…}) | | {} |
| [cluster_autoscaling](variables.tf#L64) | Enable and configure limits for Node Auto-Provisioning with Cluster Autoscaler. | object({…}) | | null |
@@ -445,10 +446,10 @@ module "cluster-1" {
| [maintenance_config](variables.tf#L293) | Maintenance window configuration. | object({…}) | | {…} |
| [max_pods_per_node](variables.tf#L316) | Maximum number of pods per node in this cluster. | number | | 110 |
| [min_master_version](variables.tf#L322) | Minimum version of the master, defaults to the version of the most recent official release. | string | | null |
-| [monitoring_config](variables.tf#L328) | Monitoring configuration. Google Cloud Managed Service for Prometheus is enabled by default. | object({…}) | | {} |
-| [node_config](variables.tf#L384) | Node-level configuration. | object({…}) | | {} |
-| [node_locations](variables.tf#L405) | Zones in which the cluster's nodes are located. | list(string) | | [] |
-| [release_channel](variables.tf#L417) | Release channel for GKE upgrades. | string | | null |
+| [monitoring_config](variables.tf#L328) | Monitoring configuration. Google Cloud Managed Service for Prometheus is enabled by default. | object({…}) | | {} |
+| [node_config](variables.tf#L387) | Node-level configuration. | object({…}) | | {} |
+| [node_locations](variables.tf#L408) | Zones in which the cluster's nodes are located. | list(string) | | [] |
+| [release_channel](variables.tf#L420) | Release channel for GKE upgrades. | string | | null |
## Outputs
diff --git a/modules/gke-cluster-standard/main.tf b/modules/gke-cluster-standard/main.tf
index e0c1ece2d..73c5bc112 100644
--- a/modules/gke-cluster-standard/main.tf
+++ b/modules/gke-cluster-standard/main.tf
@@ -426,6 +426,7 @@ resource "google_container_cluster" "cluster" {
var.monitoring_config.enable_pod_metrics ? "POD" : null,
var.monitoring_config.enable_statefulset_metrics ? "STATEFULSET" : null,
var.monitoring_config.enable_storage_metrics ? "STORAGE" : null,
+ var.monitoring_config.enable_cadvisor_metrics ? "CADVISOR" : null,
]))
managed_prometheus {
enabled = var.monitoring_config.enable_managed_prometheus
diff --git a/modules/gke-cluster-standard/variables.tf b/modules/gke-cluster-standard/variables.tf
index 20d3764d2..f63af3ebe 100644
--- a/modules/gke-cluster-standard/variables.tf
+++ b/modules/gke-cluster-standard/variables.tf
@@ -340,6 +340,7 @@ variable "monitoring_config" {
enable_pod_metrics = optional(bool, false)
enable_statefulset_metrics = optional(bool, false)
enable_storage_metrics = optional(bool, false)
+ enable_cadvisor_metrics = optional(bool, false)
# Google Cloud Managed Service for Prometheus
enable_managed_prometheus = optional(bool, true)
advanced_datapath_observability = optional(object({
@@ -360,6 +361,7 @@ variable "monitoring_config" {
var.monitoring_config.enable_pod_metrics,
var.monitoring_config.enable_statefulset_metrics,
var.monitoring_config.enable_storage_metrics,
+ var.monitoring_config.enable_cadvisor_metrics,
]) ? var.monitoring_config.enable_system_metrics : true
error_message = "System metrics are the minimum required component for enabling metrics collection."
}
@@ -371,6 +373,7 @@ variable "monitoring_config" {
var.monitoring_config.enable_pod_metrics,
var.monitoring_config.enable_statefulset_metrics,
var.monitoring_config.enable_storage_metrics,
+ var.monitoring_config.enable_cadvisor_metrics,
]) ? var.monitoring_config.enable_managed_prometheus : true
error_message = "Kube state metrics collection requires Google Cloud Managed Service for Prometheus to be enabled."
}
diff --git a/tests/modules/gke_cluster_autopilot/examples/monitoring-config-kube-state.yaml b/tests/modules/gke_cluster_autopilot/examples/monitoring-config-kube-state.yaml
index 32e5bad58..7ce922d47 100644
--- a/tests/modules/gke_cluster_autopilot/examples/monitoring-config-kube-state.yaml
+++ b/tests/modules/gke_cluster_autopilot/examples/monitoring-config-kube-state.yaml
@@ -1,4 +1,4 @@
-# Copyright 2023 Google LLC
+# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@ values:
module.cluster-1.google_container_cluster.cluster:
monitoring_config:
- enable_components:
+ - CADVISOR
- DAEMONSET
- DEPLOYMENT
- HPA
diff --git a/tests/modules/gke_cluster_standard/examples/monitoring-config-kube-state.yaml b/tests/modules/gke_cluster_standard/examples/monitoring-config-kube-state.yaml
index 32e5bad58..7ce922d47 100644
--- a/tests/modules/gke_cluster_standard/examples/monitoring-config-kube-state.yaml
+++ b/tests/modules/gke_cluster_standard/examples/monitoring-config-kube-state.yaml
@@ -1,4 +1,4 @@
-# Copyright 2023 Google LLC
+# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@ values:
module.cluster-1.google_container_cluster.cluster:
monitoring_config:
- enable_components:
+ - CADVISOR
- DAEMONSET
- DEPLOYMENT
- HPA