confgenerator/config.go (340 lines of code) (raw):
// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package confgenerator
import (
"context"
"fmt"
"io/ioutil"
"log"
"os"
"strings"
"github.com/GoogleCloudPlatform/run-gmp-sidecar/confgenerator/otel"
"github.com/prometheus/prometheus/discovery"
"github.com/prometheus/prometheus/discovery/targetgroup"
"github.com/prometheus/prometheus/model/relabel"
yaml "github.com/goccy/go-yaml"
prommodel "github.com/prometheus/common/model"
promconfig "github.com/prometheus/prometheus/config"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)
type RunMonitoringConfig struct {
metav1.TypeMeta `yaml:",inline"`
metav1.ObjectMeta `yaml:"metadata,omitempty"`
Spec RunMonitoringSpec `yaml:"spec"`
Env *CloudRunEnvironment
}
// RunMonitoringSpec contains specification parameters for RunMonitoring.
type RunMonitoringSpec struct {
// The endpoints to scrape on the selected pods.
Endpoints []ScrapeEndpoint `yaml:"endpoints"`
// Labels to add to the Prometheus target for discovered endpoints.
TargetLabels RunTargetLabels `yaml:"targetLabels,omitempty"`
// Limits to apply at scrape time.
Limits *ScrapeLimits `yaml:"limits,omitempty"`
}
// RunTargetLabels specifies the additional metadata about the target
// users can add to their metric. Allowed options are {service, revision
// , configuration}. If not specified, the sidecar defaults to adding all
// of them to every metric.
type RunTargetLabels struct {
Metadata *[]string `yaml:"metadata,omitempty"`
}
// ScrapeEndpoint specifies a Prometheus metrics endpoint to scrape.
type ScrapeEndpoint struct {
// Name or number of the port to scrape.
Port string `yaml:"port"`
// Protocol scheme to use to scrape.
Scheme string `yaml:"scheme,omitempty"`
// HTTP path to scrape metrics from. Defaults to "/metrics".
Path string `yaml:"path,omitempty"`
// HTTP GET params to use when scraping.
Params map[string][]string `yaml:"params,omitempty"`
// Proxy URL to scrape through. Encoded passwords are not supported.
ProxyURL string `yaml:"proxyUrl,omitempty"`
// Interval at which to scrape metrics. Must be a valid Prometheus duration.
Interval string `yaml:"interval,omitempty"`
// Timeout for metrics scrapes. Must be a valid Prometheus duration.
// Must not be larger then the scrape interval.
Timeout string `yaml:"timeout,omitempty"`
// Relabeling rules for metrics scraped from this endpoint. Relabeling rules
// that override protected target labels (project_id, location, cluster,
// namespace, job, instance, instanceId or __address__) are not permitted.
MetricRelabeling []RelabelingRule `yaml:"metricRelabeling,omitempty"`
}
type RelabelingRule struct {
// The source labels select values from existing labels. Their content is concatenated
// using the configured separator and matched against the configured regular expression
// for the replace, keep, and drop actions.
SourceLabels []string `yaml:"sourceLabels,omitempty"`
// Separator placed between concatenated source label values. Defaults to ';'.
Separator string `yaml:"separator,omitempty"`
// Label to which the resulting value is written in a replace action.
// It is mandatory for replace actions. Regex capture groups are available.
TargetLabel string `yaml:"targetLabel,omitempty"`
// Regular expression against which the extracted value is matched. Defaults to '(.*)'.
Regex string `yaml:"regex,omitempty"`
// Modulus to take of the hash of the source label values.
Modulus uint64 `yaml:"modulus,omitempty"`
// Replacement value against which a regex replace is performed if the
// regular expression matches. Regex capture groups are available. Defaults to '$1'.
Replacement string `yaml:"replacement,omitempty"`
// Action to perform based on regex matching. Defaults to 'replace'.
Action string `yaml:"action,omitempty"`
}
type ScrapeLimits struct {
// Maximum number of samples accepted within a single scrape.
// Uses Prometheus default if left unspecified.
Samples uint64 `yaml:"samples,omitempty"`
// Maximum number of labels accepted for a single sample.
// Uses Prometheus default if left unspecified.
Labels uint64 `yaml:"labels,omitempty"`
// Maximum label name length.
// Uses Prometheus default if left unspecified.
LabelNameLength uint64 `yaml:"labelNameLength,omitempty"`
// Maximum label value length.
// Uses Prometheus default if left unspecified.
LabelValueLength uint64 `yaml:"labelValueLength,omitempty"`
}
var allowedTargetMetadata = []string{"instance", "revision", "service", "configuration"}
const (
kind = "RunMonitoring"
apiVersion = "monitoring.googleapis.com/v1beta"
// Metric labels names that will be added to metrics based on the RunTargetLabels.Metadata
// configuration.
cloudRunInstanceLabel = "instanceId"
cloudRunServiceLabel = "service_name"
cloudRunRevisionLabel = "revision_name"
cloudRunConfigurationLabel = "configuration_name"
)
// DefaultRunMonitoringConfig creates a config that will be used by default if
// no user config (or an empty one) is found. It scrapes the default location of
// 0.0.0.0:8080/metrics for prometheus metrics.
func DefaultRunMonitoringConfig() *RunMonitoringConfig {
return &RunMonitoringConfig{
metav1.TypeMeta{
Kind: kind,
APIVersion: apiVersion,
},
metav1.ObjectMeta{
Name: "run-gmp-sidecar",
},
RunMonitoringSpec{
Endpoints: []ScrapeEndpoint{
{
Port: "8080",
Path: "/metrics",
Interval: "30s",
},
},
TargetLabels: RunTargetLabels{Metadata: &allowedTargetMetadata},
},
nil,
}
}
// ReadConfigFromFile reads the user config file and returns a RunMonitoringConfig.
// If the user config file does not exist, or is empty - it returns the default
// RunMonitoringConfig.
func ReadConfigFromFile(ctx context.Context, path string) (*RunMonitoringConfig, error) {
config := DefaultRunMonitoringConfig()
// Fetch metadata from the available environment variables.
config.Env = fetchMetadata()
if _, err := os.Stat(path); err != nil {
if os.IsNotExist(err) {
log.Println("confgenerator: no user config file found, using default config")
return config, nil
}
return nil, fmt.Errorf("failed to retrieve the user config file %q: %w", path, err)
}
data, err := ioutil.ReadFile(path)
if err != nil {
return nil, err
}
log.Printf("confgenerator: using RunMonitoring config:\n%s", string(data))
// Unmarshal the user config over the default config. If some options are unspecified
// the collector uses the default settings for those options. For example, if not specified
// targetLabels is set to {"revision", "service", "configuration"}
if err := yaml.UnmarshalContext(ctx, data, config, yaml.Strict()); err != nil {
return nil, err
}
// Validate the RunMonitoring config
if err := config.Validate(); err != nil {
return nil, err
}
return config, nil
}
// OTelReceiverPipeline creates the appropriate OTel pipeline translated from the
// RunMonitoringConfig.
func (rc *RunMonitoringConfig) OTelReceiverPipeline() (*otel.ReceiverPipeline, error) {
scrapeConfig, err := rc.scrapeConfigs()
if err != nil {
return nil, err
}
// Prefix the `instance` resource label with the faas.id.
processors := []otel.Component{
otel.GCPResourceDetector(),
otel.TransformationMetrics(otel.PrefixResourceAttribute("service.instance.id", "faas.id", ":")),
}
// If the users configure to add the instance metadata, add it as a metric label.
if rc.Spec.TargetLabels.Metadata != nil && contains(*rc.Spec.TargetLabels.Metadata, "instance") {
processors = append(processors, otel.TransformationMetrics(otel.FlattenResourceAttribute("faas.id", cloudRunInstanceLabel)))
}
// Group by the GMP attributes.
processors = append(processors, otel.GroupByGMPAttrs())
// If the user updates the `project_id` label, we need to update the gcp.project.id resource attribute
// so the exporter can pick it up.
processors = append(processors, otel.TransformationMetrics(otel.GroupByAttribute("gcp.project.id", "project_id"), otel.DeleteMetricAttribute("project_id")))
return &otel.ReceiverPipeline{
Receiver: otel.Component{
Type: "prometheus",
Config: map[string]interface{}{
"use_start_time_metric": true,
"use_collector_start_time_fallback": true,
"allow_cumulative_resets": true,
"config": map[string]interface{}{
"scrape_configs": scrapeConfig,
},
},
},
Processors: processors,
}, nil
}
// Validate validates the RunMonitoring config.
func (rc *RunMonitoringConfig) Validate() error {
if rc.APIVersion != apiVersion {
return fmt.Errorf("apiVersion must be %s", apiVersion)
}
if rc.Kind != kind {
return fmt.Errorf("kind must be %s", kind)
}
return nil
}
// scrapeConfigs converts the given RunMonitoringConfig to an equivalent set of Prometheus ScrapeConfigs.
func (rc *RunMonitoringConfig) scrapeConfigs() (res []*promconfig.ScrapeConfig, err error) {
for i := range rc.Spec.Endpoints {
c, err := rc.endpointScrapeConfig(i)
if err != nil {
return nil, fmt.Errorf("invalid definition for endpoint with index %d: %w", i, err)
}
res = append(res, c)
}
return res, nil
}
// endpointScrapeConfig creates a scrape config for the endpoint specified.
func (rc *RunMonitoringConfig) endpointScrapeConfig(index int) (*promconfig.ScrapeConfig, error) {
metadataLabels := map[string]struct{}{}
if rc.Spec.TargetLabels.Metadata != nil {
for _, l := range *rc.Spec.TargetLabels.Metadata {
if !contains(allowedTargetMetadata, l) {
return nil, fmt.Errorf("metadata label %q not allowed, must be one of %v", l, allowedTargetMetadata)
}
metadataLabels[l] = struct{}{}
}
}
relabelCfgs := relabelingsForMetadata(metadataLabels, rc.Env)
jobName := fmt.Sprintf("run-gmp-sidecar-%d", index)
return endpointScrapeConfig(
jobName,
rc.Name,
rc.Spec.Endpoints[index],
relabelCfgs,
rc.Spec.Limits,
rc.Env,
)
}
func relabelingsForMetadata(keys map[string]struct{}, env *CloudRunEnvironment) (res []*relabel.Config) {
if env == nil {
return
}
if _, ok := keys["service"]; ok {
res = append(res, &relabel.Config{
Action: relabel.Replace,
Replacement: env.Service,
TargetLabel: cloudRunServiceLabel,
})
}
if _, ok := keys["revision"]; ok {
res = append(res, &relabel.Config{
Action: relabel.Replace,
Replacement: env.Revision,
TargetLabel: cloudRunRevisionLabel,
})
}
if _, ok := keys["configuration"]; ok {
res = append(res, &relabel.Config{
Action: relabel.Replace,
Replacement: env.Configuration,
TargetLabel: cloudRunConfigurationLabel,
})
}
return res
}
func endpointScrapeConfig(id, cfgName string, ep ScrapeEndpoint, relabelCfgs []*relabel.Config, limits *ScrapeLimits, env *CloudRunEnvironment) (*promconfig.ScrapeConfig, error) {
if env == nil {
return nil, fmt.Errorf("metadata from Cloud Run was not found")
}
labelSet := make(map[prommodel.LabelName]prommodel.LabelValue)
labelSet[prommodel.AddressLabel] = prommodel.LabelValue("0.0.0.0:" + ep.Port)
discoveryCfgs := discovery.Configs{
discovery.StaticConfig{
&targetgroup.Group{Targets: []prommodel.LabelSet{labelSet}},
},
}
relabelCfgs = append(relabelCfgs,
&relabel.Config{
Action: relabel.Replace,
Replacement: cfgName,
TargetLabel: "job",
},
&relabel.Config{
Action: relabel.Replace,
TargetLabel: "cluster",
Replacement: "__run__",
},
&relabel.Config{
Action: relabel.Replace,
TargetLabel: "namespace",
Replacement: env.Service,
},
// The `instance` label will be <faas.id>:<port> in the final metric.
// But since <faas.id> is unavailable until the gcp resource detector
// runs later in the pipeline we just populate the port for now.
//
// See the usage of PrefixResourceAttribute for when the rest of the
// instance label is filled in.
&relabel.Config{
Action: relabel.Replace,
TargetLabel: "instance",
Replacement: ep.Port,
},
)
interval, err := prommodel.ParseDuration(ep.Interval)
if err != nil {
return nil, fmt.Errorf("invalid scrape interval: %w", err)
}
timeout := interval
if ep.Timeout != "" {
timeout, err = prommodel.ParseDuration(ep.Timeout)
if err != nil {
return nil, fmt.Errorf("invalid scrape timeout: %w", err)
}
if timeout > interval {
return nil, fmt.Errorf("scrape timeout %v must not be greater than scrape interval %v", timeout, interval)
}
}
metricsPath := "/metrics"
if ep.Path != "" {
metricsPath = ep.Path
}
var metricRelabelCfgs []*relabel.Config
for _, r := range ep.MetricRelabeling {
rcfg, err := convertRelabelingRule(r)
if err != nil {
return nil, err
}
metricRelabelCfgs = append(metricRelabelCfgs, rcfg)
}
scrapeCfg := &promconfig.ScrapeConfig{
JobName: id,
ServiceDiscoveryConfigs: discoveryCfgs,
MetricsPath: metricsPath,
Scheme: ep.Scheme,
Params: ep.Params,
ScrapeInterval: interval,
ScrapeTimeout: timeout,
RelabelConfigs: relabelCfgs,
MetricRelabelConfigs: metricRelabelCfgs,
ScrapeProtocols: promconfig.DefaultScrapeProtocols,
}
if limits != nil {
scrapeCfg.SampleLimit = uint(limits.Samples)
scrapeCfg.LabelLimit = uint(limits.Labels)
scrapeCfg.LabelNameLengthLimit = uint(limits.LabelNameLength)
scrapeCfg.LabelValueLengthLimit = uint(limits.LabelValueLength)
}
if err := scrapeCfg.Validate(promconfig.DefaultGlobalConfig); err != nil {
return nil, fmt.Errorf("invalid scrape config: %w", err)
}
// The Prometheus configuration structs do not generally have validation methods and embed their
// validation logic in the UnmarshalYAML methods. To keep things reasonable we don't re-validate
// everything and simply do a final marshal-unmarshal cycle at the end to run all validation
// upstream provides at the end of this method.
b, err := yaml.Marshal(scrapeCfg)
if err != nil {
return nil, fmt.Errorf("scrape config cannot be marshalled: %w", err)
}
var scrapeCfgCopy promconfig.ScrapeConfig
if err := yaml.Unmarshal(b, &scrapeCfgCopy); err != nil {
return nil, fmt.Errorf("invalid scrape configuration: %w", err)
}
return scrapeCfg, nil
}
// convertRelabelingRule converts the rule to a relabel configuration. An error is returned
// if the rule would modify one of the protected labels.
func convertRelabelingRule(r RelabelingRule) (*relabel.Config, error) {
if contains(r.SourceLabels, cloudRunInstanceLabel) {
return nil, fmt.Errorf("cannot relabel with action %q using source label %q", r.Action, cloudRunInstanceLabel)
}
rcfg := &relabel.Config{
// Upstream applies ToLower when digesting the config, so we allow the same.
Action: relabel.Action(strings.ToLower(r.Action)),
TargetLabel: r.TargetLabel,
Separator: r.Separator,
Replacement: r.Replacement,
Modulus: r.Modulus,
}
for _, n := range r.SourceLabels {
rcfg.SourceLabels = append(rcfg.SourceLabels, prommodel.LabelName(n))
}
// Instantiate the default regex Prometheus uses so that the checks below can be run
// if no explicit value is provided.
re := relabel.MustNewRegexp(`(.*)`)
// We must only set the regex if its not empty. Like in other cases, the Prometheus code does
// not setup the structs correctly and this would default to the string "null" when marshalled,
// which is then interpreted as a regex again when read by Prometheus.
if r.Regex != "" {
var err error
re, err = relabel.NewRegexp(r.Regex)
if err != nil {
return nil, fmt.Errorf("invalid regex %q: %w", r.Regex, err)
}
rcfg.Regex = re
}
// Validate that the protected target labels are not mutated by the provided relabeling rules.
switch rcfg.Action {
// Default action is "replace" per https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config.
case relabel.Replace, relabel.HashMod, "":
// These actions write into the target label and it must not be a protected one.
if isProtectedLabel(r.TargetLabel) {
return nil, fmt.Errorf("cannot relabel with action %q onto protected label %q", r.Action, r.TargetLabel)
}
case relabel.LabelDrop:
if matchesAnyProtectedLabel(re) {
return nil, fmt.Errorf("regex %s would drop at least one of the protected labels %s", r.Regex, strings.Join(protectedLabels, ", "))
}
case relabel.LabelKeep:
// Keep drops all labels that don't match the regex. So all protected labels must
// match keep.
if !matchesAllProtectedLabels(re) {
return nil, fmt.Errorf("regex %s would drop at least one of the protected labels %s", r.Regex, strings.Join(protectedLabels, ", "))
}
case relabel.LabelMap:
// It is difficult to prove for certain that labelmap does not override a protected label.
// Thus we just prohibit its use for now.
// The most feasible way to support this would probably be store all protected labels
// in __tmp_protected_<name> via a replace rule, then apply labelmap, then replace the
// __tmp label back onto the protected label.
return nil, fmt.Errorf("relabeling with action %q not allowed", r.Action)
case relabel.Keep, relabel.Drop:
// These actions don't modify a series and are OK.
default:
return nil, fmt.Errorf("unknown relabeling action %q", r.Action)
}
return rcfg, nil
}