From efa1ef6f7e3089009844211ac492fe21dd36d5e0 Mon Sep 17 00:00:00 2001 From: HeiglAnna Date: Thu, 30 Jan 2025 14:29:07 +0100 Subject: [PATCH] Add cAdvisor Metrics to Autopilot/Standard GKE Cluster (#2841) * Add cAdvisor Metrics to Autopilot/Standard GKE Cluster * Fix tests --------- Co-authored-by: AnnaHeigl <85624888+AnnaHeigl@users.noreply.github.com> Co-authored-by: Julio Castillo --- blueprints/gke/autopilot/cluster.tf | 1 + .../gke/patterns/autopilot-cluster/cluster.tf | 1 + modules/gke-cluster-autopilot/README.md | 15 ++++++++------- modules/gke-cluster-autopilot/main.tf | 1 + modules/gke-cluster-autopilot/variables.tf | 2 ++ modules/gke-cluster-standard/README.md | 15 ++++++++------- modules/gke-cluster-standard/main.tf | 1 + modules/gke-cluster-standard/variables.tf | 3 +++ .../examples/monitoring-config-kube-state.yaml | 3 ++- .../examples/monitoring-config-kube-state.yaml | 3 ++- 10 files changed, 29 insertions(+), 16 deletions(-) diff --git a/blueprints/gke/autopilot/cluster.tf b/blueprints/gke/autopilot/cluster.tf index 1e0cd0cd4..cfeee02d2 100644 --- a/blueprints/gke/autopilot/cluster.tf +++ b/blueprints/gke/autopilot/cluster.tf @@ -46,6 +46,7 @@ module "cluster" { # enable_pod_metrics = true # enable_statefulset_metrics = true # enable_storage_metrics = true + # enable_cadvisor_metrics = true # } # cluster_autoscaling = { # auto_provisioning_defaults = { diff --git a/blueprints/gke/patterns/autopilot-cluster/cluster.tf b/blueprints/gke/patterns/autopilot-cluster/cluster.tf index e2b94c647..59415f9fb 100644 --- a/blueprints/gke/patterns/autopilot-cluster/cluster.tf +++ b/blueprints/gke/patterns/autopilot-cluster/cluster.tf @@ -107,6 +107,7 @@ module "cluster" { enable_api_server_metrics = true enable_controller_manager_metrics = true enable_scheduler_metrics = true + enable_cadvisor_metrics = true } logging_config = { enable_api_server_logs = true diff --git a/modules/gke-cluster-autopilot/README.md b/modules/gke-cluster-autopilot/README.md index e73a0b6c3..bfc0613d2 100644 --- a/modules/gke-cluster-autopilot/README.md +++ b/modules/gke-cluster-autopilot/README.md @@ -146,6 +146,7 @@ module "cluster-1" { secondary_range_names = {} # use default names "pods" and "services" } monitoring_config = { + enable_cadvisor_metrics = true enable_daemonset_metrics = true enable_deployment_metrics = true enable_hpa_metrics = true @@ -205,9 +206,9 @@ module "cluster-1" { | name | description | type | required | default | |---|---|:---:|:---:|:---:| | [location](variables.tf#L143) | Autopilot clusters are always regional. | string | ✓ | | -| [name](variables.tf#L220) | Cluster name. | string | ✓ | | -| [project_id](variables.tf#L251) | Cluster project ID. | string | ✓ | | -| [vpc_config](variables.tf#L267) | VPC-level configuration. | object({…}) | ✓ | | +| [name](variables.tf#L222) | Cluster name. | string | ✓ | | +| [project_id](variables.tf#L253) | Cluster project ID. | string | ✓ | | +| [vpc_config](variables.tf#L269) | VPC-level configuration. | object({…}) | ✓ | | | [access_config](variables.tf#L17) | Control plane endpoint and nodes access configurations. | object({…}) | | {} | | [backup_configs](variables.tf#L42) | Configuration for Backup for GKE. | object({…}) | | {} | | [deletion_protection](variables.tf#L63) | Whether or not to allow Terraform to destroy the cluster. Unless this field is set to false in Terraform state, a terraform destroy or terraform apply that would delete the cluster will fail. | bool | | true | @@ -219,10 +220,10 @@ module "cluster-1" { | [logging_config](variables.tf#L148) | Logging configuration. | object({…}) | | {} | | [maintenance_config](variables.tf#L159) | Maintenance window configuration. | object({…}) | | {…} | | [min_master_version](variables.tf#L182) | Minimum version of the master, defaults to the version of the most recent official release. | string | | null | -| [monitoring_config](variables.tf#L188) | Monitoring configuration. System metrics collection cannot be disabled. Control plane metrics are optional. Kube state metrics are optional. Google Cloud Managed Service for Prometheus is enabled by default. | object({…}) | | {} | -| [node_config](variables.tf#L225) | Configuration for nodes and nodepools. | object({…}) | | {} | -| [node_locations](variables.tf#L244) | Zones in which the cluster's nodes are located. | list(string) | | [] | -| [release_channel](variables.tf#L256) | Release channel for GKE upgrades. Clusters created in the Autopilot mode must use a release channel. Choose between \"RAPID\", \"REGULAR\", and \"STABLE\". | string | | "REGULAR" | +| [monitoring_config](variables.tf#L188) | Monitoring configuration. System metrics collection cannot be disabled. Control plane metrics are optional. Kube state metrics are optional. Google Cloud Managed Service for Prometheus is enabled by default. | object({…}) | | {} | +| [node_config](variables.tf#L227) | Configuration for nodes and nodepools. | object({…}) | | {} | +| [node_locations](variables.tf#L246) | Zones in which the cluster's nodes are located. | list(string) | | [] | +| [release_channel](variables.tf#L258) | Release channel for GKE upgrades. Clusters created in the Autopilot mode must use a release channel. Choose between \"RAPID\", \"REGULAR\", and \"STABLE\". | string | | "REGULAR" | ## Outputs diff --git a/modules/gke-cluster-autopilot/main.tf b/modules/gke-cluster-autopilot/main.tf index af93d684a..54259b704 100644 --- a/modules/gke-cluster-autopilot/main.tf +++ b/modules/gke-cluster-autopilot/main.tf @@ -241,6 +241,7 @@ resource "google_container_cluster" "cluster" { var.monitoring_config.enable_pod_metrics ? "POD" : null, var.monitoring_config.enable_statefulset_metrics ? "STATEFULSET" : null, var.monitoring_config.enable_storage_metrics ? "STORAGE" : null, + var.monitoring_config.enable_cadvisor_metrics ? "CADVISOR" : null, ])) managed_prometheus { enabled = var.monitoring_config.enable_managed_prometheus diff --git a/modules/gke-cluster-autopilot/variables.tf b/modules/gke-cluster-autopilot/variables.tf index 53122c945..17b53faf8 100644 --- a/modules/gke-cluster-autopilot/variables.tf +++ b/modules/gke-cluster-autopilot/variables.tf @@ -199,6 +199,7 @@ variable "monitoring_config" { enable_pod_metrics = optional(bool, false) enable_statefulset_metrics = optional(bool, false) enable_storage_metrics = optional(bool, false) + enable_cadvisor_metrics = optional(bool, false) # Google Cloud Managed Service for Prometheus. Autopilot clusters version >= 1.25 must have this on. enable_managed_prometheus = optional(bool, true) }) @@ -212,6 +213,7 @@ variable "monitoring_config" { var.monitoring_config.enable_pod_metrics, var.monitoring_config.enable_statefulset_metrics, var.monitoring_config.enable_storage_metrics, + var.monitoring_config.enable_cadvisor_metrics, ]) ? var.monitoring_config.enable_managed_prometheus : true error_message = "Kube state metrics collection requires Google Cloud Managed Service for Prometheus to be enabled." } diff --git a/modules/gke-cluster-standard/README.md b/modules/gke-cluster-standard/README.md index c73a1555b..ec1bccaec 100644 --- a/modules/gke-cluster-standard/README.md +++ b/modules/gke-cluster-standard/README.md @@ -231,6 +231,7 @@ module "cluster-1" { secondary_range_names = {} # use default names "pods" and "services" } monitoring_config = { + enable_cadvisor_metrics = true enable_daemonset_metrics = true enable_deployment_metrics = true enable_hpa_metrics = true @@ -428,9 +429,9 @@ module "cluster-1" { | name | description | type | required | default | |---|---|:---:|:---:|:---:| | [location](variables.tf#L267) | Cluster zone or region. | string | ✓ | | -| [name](variables.tf#L379) | Cluster name. | string | ✓ | | -| [project_id](variables.tf#L412) | Cluster project id. | string | ✓ | | -| [vpc_config](variables.tf#L423) | VPC-level configuration. | object({…}) | ✓ | | +| [name](variables.tf#L382) | Cluster name. | string | ✓ | | +| [project_id](variables.tf#L415) | Cluster project id. | string | ✓ | | +| [vpc_config](variables.tf#L426) | VPC-level configuration. | object({…}) | ✓ | | | [access_config](variables.tf#L17) | Control plane endpoint and nodes access configurations. | object({…}) | | {} | | [backup_configs](variables.tf#L42) | Configuration for Backup for GKE. | object({…}) | | {} | | [cluster_autoscaling](variables.tf#L64) | Enable and configure limits for Node Auto-Provisioning with Cluster Autoscaler. | object({…}) | | null | @@ -445,10 +446,10 @@ module "cluster-1" { | [maintenance_config](variables.tf#L293) | Maintenance window configuration. | object({…}) | | {…} | | [max_pods_per_node](variables.tf#L316) | Maximum number of pods per node in this cluster. | number | | 110 | | [min_master_version](variables.tf#L322) | Minimum version of the master, defaults to the version of the most recent official release. | string | | null | -| [monitoring_config](variables.tf#L328) | Monitoring configuration. Google Cloud Managed Service for Prometheus is enabled by default. | object({…}) | | {} | -| [node_config](variables.tf#L384) | Node-level configuration. | object({…}) | | {} | -| [node_locations](variables.tf#L405) | Zones in which the cluster's nodes are located. | list(string) | | [] | -| [release_channel](variables.tf#L417) | Release channel for GKE upgrades. | string | | null | +| [monitoring_config](variables.tf#L328) | Monitoring configuration. Google Cloud Managed Service for Prometheus is enabled by default. | object({…}) | | {} | +| [node_config](variables.tf#L387) | Node-level configuration. | object({…}) | | {} | +| [node_locations](variables.tf#L408) | Zones in which the cluster's nodes are located. | list(string) | | [] | +| [release_channel](variables.tf#L420) | Release channel for GKE upgrades. | string | | null | ## Outputs diff --git a/modules/gke-cluster-standard/main.tf b/modules/gke-cluster-standard/main.tf index e0c1ece2d..73c5bc112 100644 --- a/modules/gke-cluster-standard/main.tf +++ b/modules/gke-cluster-standard/main.tf @@ -426,6 +426,7 @@ resource "google_container_cluster" "cluster" { var.monitoring_config.enable_pod_metrics ? "POD" : null, var.monitoring_config.enable_statefulset_metrics ? "STATEFULSET" : null, var.monitoring_config.enable_storage_metrics ? "STORAGE" : null, + var.monitoring_config.enable_cadvisor_metrics ? "CADVISOR" : null, ])) managed_prometheus { enabled = var.monitoring_config.enable_managed_prometheus diff --git a/modules/gke-cluster-standard/variables.tf b/modules/gke-cluster-standard/variables.tf index 20d3764d2..f63af3ebe 100644 --- a/modules/gke-cluster-standard/variables.tf +++ b/modules/gke-cluster-standard/variables.tf @@ -340,6 +340,7 @@ variable "monitoring_config" { enable_pod_metrics = optional(bool, false) enable_statefulset_metrics = optional(bool, false) enable_storage_metrics = optional(bool, false) + enable_cadvisor_metrics = optional(bool, false) # Google Cloud Managed Service for Prometheus enable_managed_prometheus = optional(bool, true) advanced_datapath_observability = optional(object({ @@ -360,6 +361,7 @@ variable "monitoring_config" { var.monitoring_config.enable_pod_metrics, var.monitoring_config.enable_statefulset_metrics, var.monitoring_config.enable_storage_metrics, + var.monitoring_config.enable_cadvisor_metrics, ]) ? var.monitoring_config.enable_system_metrics : true error_message = "System metrics are the minimum required component for enabling metrics collection." } @@ -371,6 +373,7 @@ variable "monitoring_config" { var.monitoring_config.enable_pod_metrics, var.monitoring_config.enable_statefulset_metrics, var.monitoring_config.enable_storage_metrics, + var.monitoring_config.enable_cadvisor_metrics, ]) ? var.monitoring_config.enable_managed_prometheus : true error_message = "Kube state metrics collection requires Google Cloud Managed Service for Prometheus to be enabled." } diff --git a/tests/modules/gke_cluster_autopilot/examples/monitoring-config-kube-state.yaml b/tests/modules/gke_cluster_autopilot/examples/monitoring-config-kube-state.yaml index 32e5bad58..7ce922d47 100644 --- a/tests/modules/gke_cluster_autopilot/examples/monitoring-config-kube-state.yaml +++ b/tests/modules/gke_cluster_autopilot/examples/monitoring-config-kube-state.yaml @@ -1,4 +1,4 @@ -# Copyright 2023 Google LLC +# Copyright 2025 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ values: module.cluster-1.google_container_cluster.cluster: monitoring_config: - enable_components: + - CADVISOR - DAEMONSET - DEPLOYMENT - HPA diff --git a/tests/modules/gke_cluster_standard/examples/monitoring-config-kube-state.yaml b/tests/modules/gke_cluster_standard/examples/monitoring-config-kube-state.yaml index 32e5bad58..7ce922d47 100644 --- a/tests/modules/gke_cluster_standard/examples/monitoring-config-kube-state.yaml +++ b/tests/modules/gke_cluster_standard/examples/monitoring-config-kube-state.yaml @@ -1,4 +1,4 @@ -# Copyright 2023 Google LLC +# Copyright 2025 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ values: module.cluster-1.google_container_cluster.cluster: monitoring_config: - enable_components: + - CADVISOR - DAEMONSET - DEPLOYMENT - HPA