From 7ab3215cdcea32aa02b86bceb5b17fdfa372227c Mon Sep 17 00:00:00 2001 From: Moritz Wiesinger <6901203+mowies@users.noreply.github.com> Date: Wed, 6 May 2026 15:36:10 +0200 Subject: [PATCH 1/5] chore: add large scale Prometheus examples --- .../allocator.values.yaml | 44 +++ .../prometheus-large-scale/avalanche.yaml | 65 ++++ .../prometheus-large-scale/rbac.yaml | 208 ++++++++++++ .../prometheus-large-scale/scrapeconfig.yaml | 52 +++ .../selfmon-scraper.yaml | 278 ++++++++++++++++ .../tier1-scraper.values.yaml | 188 +++++++++++ .../tier2-gateway.values.yaml | 306 ++++++++++++++++++ .../tier3-sink.values.yaml | 293 +++++++++++++++++ 8 files changed, 1434 insertions(+) create mode 100644 config_examples/prometheus-large-scale/allocator.values.yaml create mode 100644 config_examples/prometheus-large-scale/avalanche.yaml create mode 100644 config_examples/prometheus-large-scale/rbac.yaml create mode 100644 config_examples/prometheus-large-scale/scrapeconfig.yaml create mode 100644 config_examples/prometheus-large-scale/selfmon-scraper.yaml create mode 100644 config_examples/prometheus-large-scale/tier1-scraper.values.yaml create mode 100644 config_examples/prometheus-large-scale/tier2-gateway.values.yaml create mode 100644 config_examples/prometheus-large-scale/tier3-sink.values.yaml diff --git a/config_examples/prometheus-large-scale/allocator.values.yaml b/config_examples/prometheus-large-scale/allocator.values.yaml new file mode 100644 index 000000000..bc7ea2514 --- /dev/null +++ b/config_examples/prometheus-large-scale/allocator.values.yaml @@ -0,0 +1,44 @@ +nameOverride: "" +fullnameOverride: "tiered-allocator" + +replicaCount: 1 + +targetAllocator: + podAnnotations: + metrics.dynatrace.com/scrape: "true" + metrics.dynatrace.com/port: "8080" + image: + repository: ghcr.io/open-telemetry/opentelemetry-operator/target-allocator + tag: "0.150.0" + serviceAccount: + create: false + name: "tiered-otel-allocator" + service: + port: 8080 + config: + allocation_strategy: consistent-hashing + collector_namespace: ${NAMESPACE} + collector_selector: + matchlabels: + app.kubernetes.io/name: opentelemetry-collector + app.kubernetes.io/instance: otel-scraper + prometheus_cr: + enabled: true + scrapeInterval: 60s + service_monitor_selector: + prometheus.dynatrace.com: "true" + pod_monitor_selector: + prometheus.dynatrace.com: "true" + scrape_config_selector: + prometheus.dynatrace.com: "true" + +# filter_strategy: relabel-config +# config: +# scrape_configs: [] + + resources: + limits: + memory: 200Mi + requests: + cpu: 10m + memory: 150Mi diff --git a/config_examples/prometheus-large-scale/avalanche.yaml b/config_examples/prometheus-large-scale/avalanche.yaml new file mode 100644 index 000000000..189af2b08 --- /dev/null +++ b/config_examples/prometheus-large-scale/avalanche.yaml @@ -0,0 +1,65 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app: avalanche + name: avalanche + namespace: avalanche +spec: + replicas: 30 + strategy: + rollingUpdate: + maxSurge: 25% + maxUnavailable: 0 + selector: + matchLabels: + app: avalanche + template: + metadata: + annotations: + metrics.dynatrace.com/scrape: "true" + metrics.dynatrace.com/port: "9001" + labels: + app: avalanche + spec: + containers: + - image: quay.io/prometheuscommunity/avalanche:v0.7.0 + name: avalanche + args: + # Target: ~5M datapoints/min across 30 replicas (60s scrape interval) + # Per pod (~83.5k series): + # Gauges: 25 metrics × 817 series = 20,425 series + # Counters: 25 metrics × 817 series = 20,425 series + # Histograms: 4 
metrics × 817 series × 13 = 42,484 series + # (10 buckets + 1 +Inf bucket + _sum + _count) + - "--gauge-metric-count=25" + - "--counter-metric-count=25" + - "--histogram-metric-count=4" + - "--histogram-metric-bucket-count=10" + - "--native-histogram-metric-count=0" + - "--summary-metric-count=0" + - "--series-count=817" # ← halved from 1634 + - "--value-interval=300" + - "--series-interval=3600" + - "--metric-interval=0" + - "--port=9001" # Pod total: ~83,334 series → 30 pods × 83,334 = ~2,500,020 DPM + resources: + requests: + memory: 256Mi + cpu: "100m" + limits: + cpu: "500m" + memory: "384Mi" +--- +apiVersion: v1 +kind: Service +metadata: + name: avalanche + namespace: avalanche +spec: + selector: + app: avalanche + ports: + - name: metrics + port: 9001 + targetPort: 9001 \ No newline at end of file diff --git a/config_examples/prometheus-large-scale/rbac.yaml b/config_examples/prometheus-large-scale/rbac.yaml new file mode 100644 index 000000000..909867e5c --- /dev/null +++ b/config_examples/prometheus-large-scale/rbac.yaml @@ -0,0 +1,208 @@ +apiVersion: v1 +automountServiceAccountToken: true +kind: ServiceAccount +metadata: + name: tiered-otel-scraper + namespace: ${NAMESPACE} +--- +apiVersion: v1 +automountServiceAccountToken: true +kind: ServiceAccount +metadata: + name: tiered-otel-gateway + namespace: ${NAMESPACE} +--- +apiVersion: v1 +automountServiceAccountToken: true +kind: ServiceAccount +metadata: + name: tiered-otel-sink + namespace: ${NAMESPACE} +--- +apiVersion: v1 +automountServiceAccountToken: true +kind: ServiceAccount +metadata: + name: tiered-otel-allocator + namespace: ${NAMESPACE} +--- +# Scraper (tier 1): k8s resolver for loadbalancing exporter +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: tiered-otel-scraper +rules: + - apiGroups: [""] + resources: + - pods + - endpoints + - services + verbs: [get, list, watch] + - apiGroups: [discovery.k8s.io] + resources: + - endpointslices + verbs: [get, list, watch] +--- +# Gateway (tier 2): k8s_attributes processor +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: tiered-otel-gateway +rules: + - apiGroups: [""] + resources: + - pods + - namespaces + - nodes + verbs: [get, watch, list] + - apiGroups: [apps] + resources: + - replicasets + verbs: [get, watch, list] + - apiGroups: [batch] + resources: + - jobs + - cronjobs + verbs: [get, watch, list] +--- +# Sink (tier 3): k8s_attributes processor +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: tiered-otel-sink +rules: + - apiGroups: [""] + resources: + - pods + - namespaces + - nodes + verbs: [get, watch, list] + - apiGroups: [apps] + resources: + - replicasets + verbs: [get, watch, list] + - apiGroups: [batch] + resources: + - jobs + - cronjobs + verbs: [get, watch, list] +--- +# Allocator: service discovery + Prometheus CR access +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: tiered-otel-allocator +rules: + - apiGroups: [""] + resources: + - pods + - endpoints + - services + - nodes + - nodes/metrics + verbs: [get, watch, list] + - apiGroups: [discovery.k8s.io] + resources: + - endpointslices + verbs: [get, watch, list] + - apiGroups: [""] + resources: + - configmaps + verbs: [get] + - apiGroups: [networking.k8s.io] + resources: + - ingresses + verbs: [get, list, watch] + - nonResourceURLs: ["/metrics"] + verbs: [get] + - apiGroups: [monitoring.coreos.com] + resources: + - servicemonitors + - podmonitors + - scrapeconfigs + - probes + verbs: ["*"] 
+ - apiGroups: [""] + resources: + - namespaces + verbs: [get, list, watch] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: tiered-otel-scraper +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: tiered-otel-scraper +subjects: + - kind: ServiceAccount + name: tiered-otel-scraper + namespace: ${NAMESPACE} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: tiered-otel-gateway +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: tiered-otel-gateway +subjects: + - kind: ServiceAccount + name: tiered-otel-gateway + namespace: ${NAMESPACE} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: tiered-otel-sink +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: tiered-otel-sink +subjects: + - kind: ServiceAccount + name: tiered-otel-sink + namespace: ${NAMESPACE} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: tiered-otel-allocator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: tiered-otel-allocator +subjects: + - kind: ServiceAccount + name: tiered-otel-allocator + namespace: ${NAMESPACE} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: tiered-otel-sink +rules: + - apiGroups: [""] + resources: ["pods", "namespaces", "nodes"] + verbs: ["get", "list", "watch"] + - apiGroups: ["apps"] + resources: ["replicasets", "deployments", "statefulsets", "daemonsets"] + verbs: ["get", "list", "watch"] + - apiGroups: ["batch"] + resources: ["jobs", "cronjobs"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: tiered-otel-sink +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: tiered-otel-sink +subjects: + - kind: ServiceAccount + name: tiered-otel-sink + namespace: ${NAMESPACE} diff --git a/config_examples/prometheus-large-scale/scrapeconfig.yaml b/config_examples/prometheus-large-scale/scrapeconfig.yaml new file mode 100644 index 000000000..a2cd83977 --- /dev/null +++ b/config_examples/prometheus-large-scale/scrapeconfig.yaml @@ -0,0 +1,52 @@ +--- +apiVersion: monitoring.coreos.com/v1alpha1 +kind: ScrapeConfig +metadata: + name: dynatrace-com + namespace: ${NAMESPACE} + labels: + prometheus.dynatrace.com: "true" +spec: + jobName: dynatrace-com + scrapeInterval: 60s + sampleLimit: 5000000 + labelLimit: 50 + labelNameLengthLimit: 100 + labelValueLengthLimit: 1000 + kubernetesSDConfigs: + - role: Pod + namespaces: + names: + - avalanche + relabelings: + - sourceLabels: + - __meta_kubernetes_pod_annotation_metrics_dynatrace_com_scrape + - __meta_kubernetes_pod_annotationpresent_metrics_dynatrace_com_scrape + action: keep + regex: true;true + - sourceLabels: + - __meta_kubernetes_pod_annotation_metrics_dynatrace_com_secure + - __meta_kubernetes_pod_annotationpresent_metrics_dynatrace_com_secure + action: replace + regex: true;true + targetLabel: __scheme__ + replacement: https + - sourceLabels: + - __address__ + - __meta_kubernetes_pod_annotation_metrics_dynatrace_com_port + - __meta_kubernetes_pod_annotationpresent_metrics_dynatrace_com_port + action: replace + regex: (.+?)(?::\d+)?;(\d+);true + targetLabel: __address__ + replacement: $1:$2 + - sourceLabels: + - __meta_kubernetes_pod_annotation_metrics_dynatrace_com_path + - __meta_kubernetes_pod_annotationpresent_metrics_dynatrace_com_path + action: replace + regex: 
(.+);true + targetLabel: __metrics_path__ + replacement: $1 + - sourceLabels: + - __meta_kubernetes_pod_phase + action: drop + regex: (Failed|Succeeded) diff --git a/config_examples/prometheus-large-scale/selfmon-scraper.yaml b/config_examples/prometheus-large-scale/selfmon-scraper.yaml new file mode 100644 index 000000000..3dd762fca --- /dev/null +++ b/config_examples/prometheus-large-scale/selfmon-scraper.yaml @@ -0,0 +1,278 @@ +# Selfmon Scraper — scrapes collector/allocator metrics, exports direct to DT +# Deploy with: +# helm install otel-selfmon open-telemetry/opentelemetry-collector \ +# --namespace otel-ta -f selfmon-scraper.values.yaml + +fullnameOverride: "selfmon-scraper" + +mode: deployment +replicaCount: 1 + +image: + repository: ghcr.io/dynatrace/dynatrace-otel-collector/dynatrace-otel-collector + tag: 0.47.0 +command: + name: dynatrace-otel-collector + +# Don't scrape this scraper's own metrics (avoid recursion) — or set to true if you want it +podAnnotations: + metrics.dynatrace.com/scrape: "false" + +resources: + requests: + cpu: 100m + memory: 256Mi + limits: + memory: 512Mi + +service: + enabled: false + +clusterRole: + create: false + +serviceAccount: + create: false + name: tiered-otel-scraper # reuse existing SA with pod list permissions + +extraEnvsFrom: + - secretRef: + name: dynatrace-otelcol-credentials +extraEnvs: + - name: K8S_CLUSTER_NAME + value: "tiered-otelcol-ta-test" + - name: K8S_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: K8S_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: K8S_NAMESPACE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: K8S_POD_UID + valueFrom: + fieldRef: + fieldPath: metadata.uid + +alternateConfig: + extensions: + health_check: + endpoint: "${env:MY_POD_IP}:13133" + + receivers: + prometheus: + config: + scrape_configs: + - job_name: otel-selfmon + scrape_interval: 60s + kubernetes_sd_configs: + - role: pod + namespaces: + names: + - otel-ta + relabel_configs: + # Keep only pods with scrape annotation + - source_labels: [__meta_kubernetes_pod_annotation_metrics_dynatrace_com_scrape] + action: keep + regex: "true" + # Use custom port from annotation + - source_labels: [__address__, __meta_kubernetes_pod_annotation_metrics_dynatrace_com_port] + action: replace + regex: (.+?)(?::\d+)?;(\d+) + target_label: __address__ + replacement: $1:$2 + # Drop terminated pods + - source_labels: [__meta_kubernetes_pod_phase] + action: drop + regex: (Failed|Succeeded) + # Add pod name label + - source_labels: [__meta_kubernetes_pod_name] + target_label: k8s_pod_name + # Add pod IP for k8sattributes matching + - source_labels: [ __meta_kubernetes_pod_ip ] + target_label: k8s_pod_ip + # Add namespace + - source_labels: [ __meta_kubernetes_namespace ] + target_label: k8s_namespace_name + + processors: + memory_limiter: + check_interval: 1s + limit_percentage: 80 + spike_limit_percentage: 20 + metric_start_time: + cumulativetodelta: + max_staleness: 25h + batch: + send_batch_size: 1000 + timeout: 10s + k8s_attributes: + # Enable passthrough mode - doesn't require matching existing attributes + passthrough: false + + # How to find the pod - ORDER MATTERS + extract: + annotations: + - from: pod + key_regex: metadata.dynatrace.com/(.*) + tag_name: $$1 + - from: pod + key: metadata.dynatrace.com + tag_name: metadata.dynatrace.com + metadata: + - k8s.pod.name + - k8s.pod.uid + - k8s.pod.ip + - k8s.deployment.name + - k8s.replicaset.name + - k8s.statefulset.name + - k8s.daemonset.name + - 
k8s.job.name + - k8s.cronjob.name + - k8s.namespace.name + - k8s.node.name + - k8s.cluster.uid + - k8s.container.name + - k8s.deployment.uid + - k8s.replicaset.uid + - k8s.statefulset.uid + - k8s.daemonset.uid + - k8s.job.uid + - k8s.cronjob.uid + pod_association: + # First try: match by IP from scrape target + - sources: + - from: resource_attribute + name: net.peer.ip + - sources: + - from: resource_attribute + name: server.address + - sources: + - from: resource_attribute + name: k8s.pod.name + - from: resource_attribute + name: k8s.namespace.name + - sources: + - from: resource_attribute + name: k8s.pod.ip + - sources: + - from: resource_attribute + name: k8s.pod.uid + - sources: + - from: connection + transform: + metric_statements: + + - context: datapoint + statements: + - set(resource.attributes["k8s.pod.name"], attributes["k8s_pod_name"]) where attributes["k8s_pod_name"] != nil + - set(resource.attributes["k8s.namespace.name"], attributes["k8s_namespace_name"]) where attributes["k8s_namespace_name"] != nil + - set(resource.attributes["k8s.pod.ip"], attributes["k8s_pod_ip"]) where attributes["k8s_pod_ip"] != nil + + - context: resource + statements: + - set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.statefulset.name"]) where IsString(resource.attributes["k8s.statefulset.name"]) + - set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.replicaset.name"]) where IsString(resource.attributes["k8s.replicaset.name"]) + - set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.job.name"]) where IsString(resource.attributes["k8s.job.name"]) + - set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.deployment.name"]) where IsString(resource.attributes["k8s.deployment.name"]) + - set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.daemonset.name"]) where IsString(resource.attributes["k8s.daemonset.name"]) + - set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.cronjob.name"]) where IsString(resource.attributes["k8s.cronjob.name"]) + + - set(resource.attributes["k8s.workload.kind"], "statefulset") where IsString(resource.attributes["k8s.statefulset.name"]) + - set(resource.attributes["k8s.workload.kind"], "replicaset") where IsString(resource.attributes["k8s.replicaset.name"]) + - set(resource.attributes["k8s.workload.kind"], "job") where IsString(resource.attributes["k8s.job.name"]) + - set(resource.attributes["k8s.workload.kind"], "deployment") where IsString(resource.attributes["k8s.deployment.name"]) + - set(resource.attributes["k8s.workload.kind"], "daemonset") where IsString(resource.attributes["k8s.daemonset.name"]) + - set(resource.attributes["k8s.workload.kind"], "cronjob") where IsString(resource.attributes["k8s.cronjob.name"]) + + - set(resource.attributes["k8s.workload.uid"], resource.attributes["k8s.statefulset.uid"]) where IsString(resource.attributes["k8s.statefulset.uid"]) + - set(resource.attributes["k8s.workload.uid"], resource.attributes["k8s.replicaset.uid"]) where IsString(resource.attributes["k8s.replicaset.uid"]) + - set(resource.attributes["k8s.workload.uid"], resource.attributes["k8s.job.uid"]) where IsString(resource.attributes["k8s.job.uid"]) + - set(resource.attributes["k8s.workload.uid"], resource.attributes["k8s.deployment.uid"]) where IsString(resource.attributes["k8s.deployment.uid"]) + - set(resource.attributes["k8s.workload.uid"], resource.attributes["k8s.daemonset.uid"]) where IsString(resource.attributes["k8s.daemonset.uid"]) + - 
set(resource.attributes["k8s.workload.uid"], resource.attributes["k8s.cronjob.uid"]) where IsString(resource.attributes["k8s.cronjob.uid"]) + + - delete_key(resource.attributes, "k8s.statefulset.name") + - delete_key(resource.attributes, "k8s.replicaset.name") + - delete_key(resource.attributes, "k8s.job.name") + - delete_key(resource.attributes, "k8s.deployment.name") + - delete_key(resource.attributes, "k8s.daemonset.name") + - delete_key(resource.attributes, "k8s.cronjob.name") + - delete_key(resource.attributes, "k8s.statefulset.uid") + - delete_key(resource.attributes, "k8s.replicaset.uid") + - delete_key(resource.attributes, "k8s.deployment.uid") + - delete_key(resource.attributes, "k8s.daemonset.uid") + - delete_key(resource.attributes, "k8s.job.uid") + - delete_key(resource.attributes, "k8s.cronjob.uid") + + - context: resource + statements: + - delete_key(resource.attributes, "processor") + - delete_key(resource.attributes, "otel.signal") + - delete_key(resource.attributes, "otel.scope.name") + - delete_key(resource.attributes, "otel.scope.version") + + - context: resource + statements: + - set(resource.attributes["k8s.cluster.name"], "${env:K8S_CLUSTER_NAME}") where resource.attributes["k8s.cluster.name"] == nil and Len("${env:K8S_CLUSTER_NAME}") > 0 + + - context: resource + statements: + - merge_maps(resource.attributes, ParseJSON(resource.attributes["metadata.dynatrace.com"]), "upsert") where IsMatch(resource.attributes["metadata.dynatrace.com"], "^\\{") + - delete_key(resource.attributes, "metadata.dynatrace.com") + + exporters: + debug: + verbosity: basic + otlphttp/dynatrace: + endpoint: "${env:DT_ENDPOINT}" + headers: + Authorization: "Api-Token ${env:DT_API_TOKEN}" + + service: + extensions: + - health_check + pipelines: + metrics: + receivers: + - prometheus + processors: + - memory_limiter + - metric_start_time + - cumulativetodelta + - transform + - k8s_attributes + - batch + exporters: + #- debug + - otlphttp/dynatrace + telemetry: + logs: + level: INFO + metrics: + level: basic + readers: + - pull: + exporter: + prometheus: + host: "0.0.0.0" + port: 8888 +ports: + otlp: + enabled: false + otlp-http: + enabled: false + jaeger-compact: + enabled: false + jaeger-thrift: + enabled: false + jaeger-grpc: + enabled: false + zipkin: + enabled: false + metrics: + enabled: false diff --git a/config_examples/prometheus-large-scale/tier1-scraper.values.yaml b/config_examples/prometheus-large-scale/tier1-scraper.values.yaml new file mode 100644 index 000000000..cbfe5844c --- /dev/null +++ b/config_examples/prometheus-large-scale/tier1-scraper.values.yaml @@ -0,0 +1,188 @@ +# Tier 1: Scraper — scrapes Prometheus targets via TA, load-balances to tier 2. 
+# Deploy with: +# helm install otel-scraper open-telemetry/opentelemetry-collector \ +# --namespace otel-ta -f tier1-scraper.values.yaml + +fullnameOverride: "tiered-scraper" + +podAnnotations: + metrics.dynatrace.com/scrape: "true" + metrics.dynatrace.com/port: "8888" + +rollout: + rollingUpdate: + maxSurge: 25% + maxUnavailable: 0 + strategy: RollingUpdate + +resources: + requests: + cpu: 300m + memory: 3Gi + limits: + memory: 3Gi +autoscaling: + enabled: true + minReplicas: 2 + maxReplicas: 24 + targetCPUUtilizationPercentage: 80 + targetMemoryUtilizationPercentage: 80 + +image: + repository: ghcr.io/dynatrace/dynatrace-otel-collector/dynatrace-otel-collector + # renovate: datasource=docker depName=ghcr.io/dynatrace/dynatrace-otel-collector/dynatrace-otel-collector + tag: 0.47.0 +command: + name: dynatrace-otel-collector + +mode: deployment + +service: + enabled: true + type: ClusterIP + +clusterRole: + create: false + +serviceAccount: + create: false + name: tiered-otel-scraper + +extraEnvsFrom: + - secretRef: + name: dynatrace-otelcol-credentials +extraEnvs: + - name: K8S_SERVICE_NAME + value: "{{ include \"opentelemetry-collector.fullname\" . }}" + - name: K8S_CLUSTER_NAME + value: "tiered-otelcol-ta-test" + - name: K8S_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: K8S_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: K8S_NAMESPACE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: K8S_POD_UID + valueFrom: + fieldRef: + fieldPath: metadata.uid + +alternateConfig: + extensions: + health_check: + endpoint: "${env:MY_POD_IP}:13133" + + receivers: + otlp: + protocols: + grpc: + endpoint: "${env:MY_POD_IP}:4317" + http: + endpoint: "${env:MY_POD_IP}:4318" + prometheus: + config: {} + target_allocator: + collector_id: ${K8S_POD_NAME} + endpoint: http://tiered-allocator-ta + interval: 60s + + processors: + memory_limiter: + check_interval: 1s + limit_percentage: 95 + spike_limit_percentage: 20 + batch/metrics: # OTLP max is 15k datapoints or 4 MiB + send_batch_max_size: 5000 + send_batch_size: 500 + timeout: 10s + + exporters: + debug: + verbosity: basic + loadbalancing: + routing_key: "resource" + timeout: 10s + retry_on_failure: + enabled: true + initial_interval: 5s + max_interval: 30s + max_elapsed_time: 300s + sending_queue: + enabled: true + num_consumers: 10 + queue_size: 10000 + protocol: + otlp: + tls: + insecure: true + timeout: 10s + resolver: + k8s: + service: "tiered-gateway.otel-ta" + ports: + - 4317 + + service: + extensions: + - health_check + pipelines: + metrics: + receivers: + - otlp + - prometheus + processors: + - memory_limiter + - batch/metrics + exporters: + # - debug + - loadbalancing + telemetry: + resource: + k8s.cluster.name: "${env:K8S_CLUSTER_NAME}" + k8s.namespace.name: "${env:K8S_NAMESPACE_NAME}" + k8s.pod.name: "${env:K8S_POD_NAME}" + k8s.pod.uid: "${env:K8S_POD_UID}" + k8s.node.name: "${env:K8S_NODE_NAME}" + metrics: + level: detailed + readers: + - pull: + exporter: + prometheus: + host: "0.0.0.0" + port: 8888 + without_type_suffix: true + without_units: true + logs: + level: INFO + processors: + - batch: + exporter: + otlp: + protocol: http/protobuf + endpoint: "${env:DT_ENDPOINT}/v1/logs" + headers: + - name: Authorization + value: "Api-Token ${env:DT_API_TOKEN}" + +ports: + otlp: + enabled: true + otlp-http: + enabled: true + jaeger-compact: + enabled: false + jaeger-thrift: + enabled: false + jaeger-grpc: + enabled: false + zipkin: + enabled: false + metrics: + enabled: false diff 
--git a/config_examples/prometheus-large-scale/tier2-gateway.values.yaml b/config_examples/prometheus-large-scale/tier2-gateway.values.yaml new file mode 100644 index 000000000..d86f73a5d --- /dev/null +++ b/config_examples/prometheus-large-scale/tier2-gateway.values.yaml @@ -0,0 +1,306 @@ +# Tier 2: Gateway — receives OTLP from tier 1, enriches data, forwards to tier 3. +# Deploy with: +# helm install otel-gateway open-telemetry/opentelemetry-collector \ +# --namespace otel-ta -f tier2-gateway.values.yaml + +fullnameOverride: "tiered-gateway" + +podAnnotations: + metrics.dynatrace.com/scrape: "true" + metrics.dynatrace.com/port: "8888" + +rollout: + rollingUpdate: + maxSurge: 25% + maxUnavailable: 0 + strategy: RollingUpdate + +resources: + requests: + cpu: 2 + memory: 8Gi + limits: + memory: 8Gi +autoscaling: + enabled: true + minReplicas: 1 + maxReplicas: 16 + targetCPUUtilizationPercentage: 80 + targetMemoryUtilizationPercentage: 80 + +image: + repository: ghcr.io/dynatrace/dynatrace-otel-collector/dynatrace-otel-collector + # renovate: datasource=docker depName=ghcr.io/dynatrace/dynatrace-otel-collector/dynatrace-otel-collector + tag: 0.47.0 +command: + name: dynatrace-otel-collector + +mode: deployment + +service: + enabled: true + +clusterRole: + create: false + +serviceAccount: + create: false + name: tiered-otel-gateway + +extraEnvsFrom: + - secretRef: + name: dynatrace-otelcol-credentials +extraEnvs: + - name: K8S_SERVICE_NAME + value: "{{ include \"opentelemetry-collector.fullname\" . }}" + - name: K8S_CLUSTER_NAME + value: "tiered-otelcol-ta-test" + - name: K8S_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: K8S_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: K8S_NAMESPACE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: K8S_POD_UID + valueFrom: + fieldRef: + fieldPath: metadata.uid + +alternateConfig: + extensions: + health_check: + endpoint: "${env:MY_POD_IP}:13133" + + receivers: + otlp: + protocols: + grpc: + endpoint: "${env:MY_POD_IP}:4317" + http: + endpoint: "${env:MY_POD_IP}:4318" + + processors: + memory_limiter: + check_interval: 1s + limit_percentage: 95 + spike_limit_percentage: 20 + metric_start_time: + cumulativetodelta: + max_staleness: 25h + batch/metrics: # OTLP max is 15k datapoints or 4 MiB + send_batch_max_size: 5000 + send_batch_size: 500 + timeout: 10s + k8s_attributes: + extract: + annotations: + - from: pod + key_regex: metadata.dynatrace.com/(.*) + tag_name: $$1 + - from: pod + key: metadata.dynatrace.com + tag_name: metadata.dynatrace.com + metadata: + - k8s.pod.name + - k8s.pod.uid + - k8s.pod.ip + - k8s.deployment.name + - k8s.replicaset.name + - k8s.statefulset.name + - k8s.daemonset.name + - k8s.job.name + - k8s.cronjob.name + - k8s.namespace.name + - k8s.node.name + - k8s.cluster.uid + - k8s.container.name + - k8s.deployment.uid + - k8s.replicaset.uid + - k8s.statefulset.uid + - k8s.daemonset.uid + - k8s.job.uid + - k8s.cronjob.uid + pod_association: + - sources: + - from: resource_attribute + name: server.address + - sources: + - from: resource_attribute + name: k8s.pod.name + - from: resource_attribute + name: k8s.namespace.name + - sources: + - from: resource_attribute + name: k8s.pod.ip + - sources: + - from: resource_attribute + name: k8s.pod.uid + - sources: + - from: connection + transform: + metric_statements: + - context: datapoint + statements: + - set(attributes["http.request.method"], 
attributes["http_request_method"]) where attributes["http_request_method"] != nil + - delete_key(attributes, "http_request_method") + - set(attributes["http.response.status_code"], attributes["http_response_status_code"]) where attributes["http_response_status_code"] != nil + - delete_key(attributes, "http_response_status_code") + - set(attributes["network.protocol.name"], attributes["network_protocol_name"]) where attributes["network_protocol_name"] != nil + - delete_key(attributes, "network_protocol_name") + - set(attributes["network.protocol.version"], attributes["network_protocol_version"]) where attributes["network_protocol_version"] != nil + - delete_key(attributes, "network_protocol_version") + - set(attributes["rpc.method"], attributes["rpc_method"]) where attributes["rpc_method"] != nil + - delete_key(attributes, "rpc_method") + - set(attributes["rpc.response.status_code"], attributes["rpc_response_status_code"]) where attributes["rpc_response_status_code"] != nil + - delete_key(attributes, "rpc_response_status_code") + - set(attributes["rpc.system.name"], attributes["rpc_system_name"]) where attributes["rpc_system_name"] != nil + - delete_key(attributes, "rpc_system_name") + - set(attributes["server.address"], attributes["server_address"]) where attributes["server_address"] != nil + - delete_key(attributes, "server_address") + - set(attributes["server.port"], attributes["server_port"]) where attributes["server_port"] != nil + - delete_key(attributes, "server_port") + - set(attributes["url.scheme"], attributes["url_scheme"]) where attributes["url_scheme"] != nil + - delete_key(attributes, "url_scheme") + + - context: resource + statements: + - set(attributes["k8s.workload.name"], attributes["k8s.statefulset.name"]) where IsString(attributes["k8s.statefulset.name"]) + - set(attributes["k8s.workload.name"], attributes["k8s.replicaset.name"]) where IsString(attributes["k8s.replicaset.name"]) + - set(attributes["k8s.workload.name"], attributes["k8s.job.name"]) where IsString(attributes["k8s.job.name"]) + - set(attributes["k8s.workload.name"], attributes["k8s.deployment.name"]) where IsString(attributes["k8s.deployment.name"]) + - set(attributes["k8s.workload.name"], attributes["k8s.daemonset.name"]) where IsString(attributes["k8s.daemonset.name"]) + - set(attributes["k8s.workload.name"], attributes["k8s.cronjob.name"]) where IsString(attributes["k8s.cronjob.name"]) + + - set(attributes["k8s.workload.kind"], "statefulset") where IsString(attributes["k8s.statefulset.name"]) + - set(attributes["k8s.workload.kind"], "replicaset") where IsString(attributes["k8s.replicaset.name"]) + - set(attributes["k8s.workload.kind"], "job") where IsString(attributes["k8s.job.name"]) + - set(attributes["k8s.workload.kind"], "deployment") where IsString(attributes["k8s.deployment.name"]) + - set(attributes["k8s.workload.kind"], "daemonset") where IsString(attributes["k8s.daemonset.name"]) + - set(attributes["k8s.workload.kind"], "cronjob") where IsString(attributes["k8s.cronjob.name"]) + + - set(attributes["k8s.workload.uid"], attributes["k8s.statefulset.uid"]) where IsString(attributes["k8s.statefulset.uid"]) + - set(attributes["k8s.workload.uid"], attributes["k8s.replicaset.uid"]) where IsString(attributes["k8s.replicaset.uid"]) + - set(attributes["k8s.workload.uid"], attributes["k8s.job.uid"]) where IsString(attributes["k8s.job.uid"]) + - set(attributes["k8s.workload.uid"], attributes["k8s.deployment.uid"]) where IsString(attributes["k8s.deployment.uid"]) + - set(attributes["k8s.workload.uid"], 
attributes["k8s.daemonset.uid"]) where IsString(attributes["k8s.daemonset.uid"]) + - set(attributes["k8s.workload.uid"], attributes["k8s.cronjob.uid"]) where IsString(attributes["k8s.cronjob.uid"]) + + # experimental attributes + # https://docs.dynatrace.com/docs/discover-dynatrace/references/semantic-dictionary/fields#kubernetes + - delete_key(attributes, "k8s.statefulset.name") + - delete_key(attributes, "k8s.replicaset.name") + - delete_key(attributes, "k8s.job.name") + - delete_key(attributes, "k8s.deployment.name") + - delete_key(attributes, "k8s.daemonset.name") + - delete_key(attributes, "k8s.cronjob.name") + # + - delete_key(attributes, "k8s.statefulset.uid") + - delete_key(attributes, "k8s.replicaset.uid") + - delete_key(attributes, "k8s.deployment.uid") + - delete_key(attributes, "k8s.daemonset.uid") + - delete_key(attributes, "k8s.job.uid") + - delete_key(attributes, "k8s.cronjob.uid") + + - context: resource + statements: + - delete_key(attributes, "processor") + - delete_key(attributes, "otel.signal") + - delete_key(attributes, "otel.scope.name") + - delete_key(attributes, "otel.scope.version") + + - context: resource + statements: + - set(attributes["k8s.cluster.name"], "${env:K8S_CLUSTER_NAME}") where attributes["k8s.cluster.name"] == nil and Len("${env:K8S_CLUSTER_NAME}") > 0 + - set(attributes["dt.entity.kubernetes_cluster"], "${env:DT_ENTITY_KUBERNETES_CLUSTER}") where attributes["dt.entity.kubernetes_cluster"] == nil and Len("${env:DT_ENTITY_KUBERNETES_CLUSTER}") > 0 + + - context: resource + statements: + - merge_maps(attributes, ParseJSON(attributes["metadata.dynatrace.com"]), "upsert") where IsMatch(attributes["metadata.dynatrace.com"], "^\\{") + - delete_key(attributes, "metadata.dynatrace.com") + + exporters: + debug: + verbosity: basic + otlphttp/sink: + endpoint: "http://tiered-sink:4318" + sending_queue: + enabled: true + num_consumers: 4 + queue_size: 10000 + retry_on_failure: + enabled: true + initial_interval: 5s + max_interval: 30s + max_elapsed_time: 300s + + service: + extensions: + - health_check + pipelines: + metrics: + receivers: + - otlp + processors: + - memory_limiter + - metric_start_time + - cumulativetodelta + - k8s_attributes + - transform + - batch/metrics + exporters: + # - debug + - otlphttp/sink + telemetry: + resource: + k8s.cluster.name: "${env:K8S_CLUSTER_NAME}" + k8s.namespace.name: "${env:K8S_NAMESPACE_NAME}" + k8s.pod.name: "${env:K8S_POD_NAME}" + k8s.pod.uid: "${env:K8S_POD_UID}" + k8s.node.name: "${env:K8S_NODE_NAME}" + metrics: + level: detailed + readers: + - pull: + exporter: + prometheus: + host: "0.0.0.0" + port: 8888 + without_type_suffix: true + without_units: true + logs: + level: INFO + processors: + - batch: + exporter: + otlp: + protocol: http/protobuf + endpoint: "${env:DT_ENDPOINT}/v1/logs" + headers: + - name: Authorization + value: "Api-Token ${env:DT_API_TOKEN}" + +ports: + otlp: + enabled: true + otlp-http: + enabled: true + jaeger-compact: + enabled: false + jaeger-thrift: + enabled: false + jaeger-grpc: + enabled: false + zipkin: + enabled: false + metrics: + enabled: false diff --git a/config_examples/prometheus-large-scale/tier3-sink.values.yaml b/config_examples/prometheus-large-scale/tier3-sink.values.yaml new file mode 100644 index 000000000..49d55c284 --- /dev/null +++ b/config_examples/prometheus-large-scale/tier3-sink.values.yaml @@ -0,0 +1,293 @@ +# Tier 3: Sink — receives OTLP from tier 2, splits scrape metadata from +# application metrics, and exports both to Dynatrace. 
+# Deploy with: +# helm install otel-sink open-telemetry/opentelemetry-collector \ +# --namespace otel-ta -f tier3-sink.values.yaml + +fullnameOverride: "tiered-sink" + +mode: deployment + +podAnnotations: + metrics.dynatrace.com/scrape: "true" + metrics.dynatrace.com/port: "8888" + +image: + repository: ghcr.io/dynatrace/dynatrace-otel-collector/dynatrace-otel-collector + # renovate: datasource=docker depName=ghcr.io/dynatrace/dynatrace-otel-collector/dynatrace-otel-collector + tag: 0.46.0 +command: + name: dynatrace-otel-collector + +resources: + requests: + cpu: 500m + memory: 3Gi + limits: + memory: 3Gi + +clusterRole: + create: false + +serviceAccount: + create: false + name: tiered-otel-sink + +extraEnvsFrom: + - secretRef: + name: dynatrace-otelcol-credentials +extraEnvs: + - name: K8S_SERVICE_NAME + value: "{{ include \"opentelemetry-collector.fullname\" . }}" + - name: K8S_CLUSTER_NAME + value: "tiered-otelcol-ta-test" + - name: K8S_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: K8S_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: K8S_NAMESPACE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: K8S_POD_UID + valueFrom: + fieldRef: + fieldPath: metadata.uid + +alternateConfig: + extensions: + health_check: + endpoint: "${env:MY_POD_IP}:13133" + + receivers: + otlp: + protocols: + grpc: + endpoint: "${env:MY_POD_IP}:4317" + http: + endpoint: "${env:MY_POD_IP}:4318" + + processors: + memory_limiter: + check_interval: 1s + limit_percentage: 99 + spike_limit_percentage: 15 + cumulativetodelta: + max_staleness: 5m + # Keep ONLY scrape metadata metrics + filter/keep-scrape-meta: + error_mode: ignore + metrics: + metric: + - 'not (name == "up" or name == "scrape_duration_seconds" or name == "scrape_samples_scraped" or name == "scrape_series_added" or name == "scrape_samples_post_metric_relabeling")' + # Drop scrape metadata metrics from the main pipeline + filter/drop-scrape-meta: + error_mode: ignore + metrics: + metric: + - 'IsMatch(name, "^avalanche_.*")' + - 'name == "up" or name == "scrape_duration_seconds" or name == "scrape_samples_scraped" or name == "scrape_series_added" or name == "scrape_samples_post_metric_relabeling"' + k8s_attributes: + extract: + annotations: + - from: pod + key_regex: metadata.dynatrace.com/(.*) + tag_name: $$1 + - from: pod + key: metadata.dynatrace.com + tag_name: metadata.dynatrace.com + metadata: + - k8s.pod.name + - k8s.pod.uid + - k8s.pod.ip + - k8s.deployment.name + - k8s.replicaset.name + - k8s.statefulset.name + - k8s.daemonset.name + - k8s.job.name + - k8s.cronjob.name + - k8s.namespace.name + - k8s.node.name + - k8s.cluster.uid + - k8s.container.name + - k8s.deployment.uid + - k8s.replicaset.uid + - k8s.statefulset.uid + - k8s.daemonset.uid + - k8s.job.uid + - k8s.cronjob.uid + pod_association: + - sources: + - from: resource_attribute + name: server.address + - sources: + - from: resource_attribute + name: k8s.pod.name + - from: resource_attribute + name: k8s.namespace.name + - sources: + - from: resource_attribute + name: k8s.pod.ip + - sources: + - from: resource_attribute + name: k8s.pod.uid + - sources: + - from: connection + transform: + metric_statements: + - context: resource + statements: + - set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.statefulset.name"]) where IsString(resource.attributes["k8s.statefulset.name"]) + - set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.replicaset.name"]) where 
IsString(resource.attributes["k8s.replicaset.name"]) + - set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.job.name"]) where IsString(resource.attributes["k8s.job.name"]) + - set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.deployment.name"]) where IsString(resource.attributes["k8s.deployment.name"]) + - set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.daemonset.name"]) where IsString(resource.attributes["k8s.daemonset.name"]) + - set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.cronjob.name"]) where IsString(resource.attributes["k8s.cronjob.name"]) + + - set(resource.attributes["k8s.workload.kind"], "statefulset") where IsString(resource.attributes["k8s.statefulset.name"]) + - set(resource.attributes["k8s.workload.kind"], "replicaset") where IsString(resource.attributes["k8s.replicaset.name"]) + - set(resource.attributes["k8s.workload.kind"], "job") where IsString(resource.attributes["k8s.job.name"]) + - set(resource.attributes["k8s.workload.kind"], "deployment") where IsString(resource.attributes["k8s.deployment.name"]) + - set(resource.attributes["k8s.workload.kind"], "daemonset") where IsString(resource.attributes["k8s.daemonset.name"]) + - set(resource.attributes["k8s.workload.kind"], "cronjob") where IsString(resource.attributes["k8s.cronjob.name"]) + + - set(resource.attributes["k8s.workload.uid"], resource.attributes["k8s.statefulset.uid"]) where IsString(resource.attributes["k8s.statefulset.uid"]) + - set(resource.attributes["k8s.workload.uid"], resource.attributes["k8s.replicaset.uid"]) where IsString(resource.attributes["k8s.replicaset.uid"]) + - set(resource.attributes["k8s.workload.uid"], resource.attributes["k8s.job.uid"]) where IsString(resource.attributes["k8s.job.uid"]) + - set(resource.attributes["k8s.workload.uid"], resource.attributes["k8s.deployment.uid"]) where IsString(resource.attributes["k8s.deployment.uid"]) + - set(resource.attributes["k8s.workload.uid"], resource.attributes["k8s.daemonset.uid"]) where IsString(resource.attributes["k8s.daemonset.uid"]) + - set(resource.attributes["k8s.workload.uid"], resource.attributes["k8s.cronjob.uid"]) where IsString(resource.attributes["k8s.cronjob.uid"]) + + - delete_key(resource.attributes, "k8s.statefulset.name") + - delete_key(resource.attributes, "k8s.replicaset.name") + - delete_key(resource.attributes, "k8s.job.name") + - delete_key(resource.attributes, "k8s.deployment.name") + - delete_key(resource.attributes, "k8s.daemonset.name") + - delete_key(resource.attributes, "k8s.cronjob.name") + - delete_key(resource.attributes, "k8s.statefulset.uid") + - delete_key(resource.attributes, "k8s.replicaset.uid") + - delete_key(resource.attributes, "k8s.deployment.uid") + - delete_key(resource.attributes, "k8s.daemonset.uid") + - delete_key(resource.attributes, "k8s.job.uid") + - delete_key(resource.attributes, "k8s.cronjob.uid") + + - context: resource + statements: + - delete_key(resource.attributes, "processor") + - delete_key(resource.attributes, "otel.signal") + - delete_key(resource.attributes, "otel.scope.name") + - delete_key(resource.attributes, "otel.scope.version") + + - context: resource + statements: + - set(resource.attributes["k8s.cluster.name"], "${env:K8S_CLUSTER_NAME}") where resource.attributes["k8s.cluster.name"] == nil and Len("${env:K8S_CLUSTER_NAME}") > 0 + + - context: resource + statements: + - merge_maps(resource.attributes, ParseJSON(resource.attributes["metadata.dynatrace.com"]), "upsert") where 
IsMatch(resource.attributes["metadata.dynatrace.com"], "^\\{") + - delete_key(resource.attributes, "metadata.dynatrace.com") + + exporters: + debug: + verbosity: basic + otlphttp/dynatrace: + endpoint: "${env:DT_ENDPOINT}" + headers: + Authorization: "Api-Token ${env:DT_API_TOKEN}" + + service: + extensions: + - health_check + pipelines: + # Main metrics — everything except scrape metadata + metrics: + receivers: + - otlp + processors: + - memory_limiter + - filter/drop-scrape-meta + - cumulativetodelta + - k8s_attributes + - transform + exporters: + #- debug + - otlphttp/dynatrace + # Scrape metadata only (up, scrape_duration_seconds, etc.) + metrics/scrape-meta: + receivers: + - otlp + processors: + - memory_limiter + - filter/keep-scrape-meta + - k8s_attributes + - transform + exporters: + #- debug + - otlphttp/dynatrace + traces: + receivers: + - otlp + processors: + - memory_limiter + exporters: + #- debug + - otlphttp/dynatrace + logs: + receivers: + - otlp + processors: + - memory_limiter + exporters: + #- debug + - otlphttp/dynatrace + telemetry: + resource: + k8s.cluster.name: "${env:K8S_CLUSTER_NAME}" + k8s.namespace.name: "${env:K8S_NAMESPACE_NAME}" + k8s.pod.name: "${env:K8S_POD_NAME}" + k8s.pod.uid: "${env:K8S_POD_UID}" + k8s.node.name: "${env:K8S_NODE_NAME}" + metrics: + level: detailed + readers: + - pull: + exporter: + prometheus: + host: "0.0.0.0" + port: 8888 + without_type_suffix: true + without_units: true + logs: + level: INFO + processors: + - batch: + exporter: + otlp: + protocol: http/protobuf + endpoint: "${env:DT_ENDPOINT}/v1/logs" + headers: + - name: Authorization + value: "Api-Token ${env:DT_API_TOKEN}" + +ports: + otlp: + enabled: true + otlp-http: + enabled: true + jaeger-compact: + enabled: false + jaeger-thrift: + enabled: false + jaeger-grpc: + enabled: false + zipkin: + enabled: false + metrics: + enabled: false + +service: + enabled: true From 8e6cd8be7c12ad465f2176bca97a163206e07cf1 Mon Sep 17 00:00:00 2001 From: Moritz Wiesinger <6901203+mowies@users.noreply.github.com> Date: Wed, 6 May 2026 15:40:10 +0200 Subject: [PATCH 2/5] remove avalanche --- .../prometheus-large-scale/avalanche.yaml | 65 ------------------- 1 file changed, 65 deletions(-) delete mode 100644 config_examples/prometheus-large-scale/avalanche.yaml diff --git a/config_examples/prometheus-large-scale/avalanche.yaml b/config_examples/prometheus-large-scale/avalanche.yaml deleted file mode 100644 index 189af2b08..000000000 --- a/config_examples/prometheus-large-scale/avalanche.yaml +++ /dev/null @@ -1,65 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - labels: - app: avalanche - name: avalanche - namespace: avalanche -spec: - replicas: 30 - strategy: - rollingUpdate: - maxSurge: 25% - maxUnavailable: 0 - selector: - matchLabels: - app: avalanche - template: - metadata: - annotations: - metrics.dynatrace.com/scrape: "true" - metrics.dynatrace.com/port: "9001" - labels: - app: avalanche - spec: - containers: - - image: quay.io/prometheuscommunity/avalanche:v0.7.0 - name: avalanche - args: - # Target: ~5M datapoints/min across 30 replicas (60s scrape interval) - # Per pod (~83.5k series): - # Gauges: 25 metrics × 817 series = 20,425 series - # Counters: 25 metrics × 817 series = 20,425 series - # Histograms: 4 metrics × 817 series × 13 = 42,484 series - # (10 buckets + 1 +Inf bucket + _sum + _count) - - "--gauge-metric-count=25" - - "--counter-metric-count=25" - - "--histogram-metric-count=4" - - "--histogram-metric-bucket-count=10" - - "--native-histogram-metric-count=0" - - 
"--summary-metric-count=0" - - "--series-count=817" # ← halved from 1634 - - "--value-interval=300" - - "--series-interval=3600" - - "--metric-interval=0" - - "--port=9001" # Pod total: ~83,334 series → 30 pods × 83,334 = ~2,500,020 DPM - resources: - requests: - memory: 256Mi - cpu: "100m" - limits: - cpu: "500m" - memory: "384Mi" ---- -apiVersion: v1 -kind: Service -metadata: - name: avalanche - namespace: avalanche -spec: - selector: - app: avalanche - ports: - - name: metrics - port: 9001 - targetPort: 9001 \ No newline at end of file From 5cd2607a206ed9bb4fda4bdbb3b5238a576c15f9 Mon Sep 17 00:00:00 2001 From: Moritz Wiesinger <6901203+mowies@users.noreply.github.com> Date: Wed, 6 May 2026 15:43:02 +0200 Subject: [PATCH 3/5] remove sink, add dt exporter --- .../tier2-gateway.values.yaml | 17 +- .../tier3-sink.values.yaml | 293 ------------------ 2 files changed, 5 insertions(+), 305 deletions(-) delete mode 100644 config_examples/prometheus-large-scale/tier3-sink.values.yaml diff --git a/config_examples/prometheus-large-scale/tier2-gateway.values.yaml b/config_examples/prometheus-large-scale/tier2-gateway.values.yaml index d86f73a5d..de3093aa0 100644 --- a/config_examples/prometheus-large-scale/tier2-gateway.values.yaml +++ b/config_examples/prometheus-large-scale/tier2-gateway.values.yaml @@ -231,17 +231,10 @@ alternateConfig: exporters: debug: verbosity: basic - otlphttp/sink: - endpoint: "http://tiered-sink:4318" - sending_queue: - enabled: true - num_consumers: 4 - queue_size: 10000 - retry_on_failure: - enabled: true - initial_interval: 5s - max_interval: 30s - max_elapsed_time: 300s + otlp_http: + endpoint: ${env:DT_ENDPOINT} + headers: + Authorization: "Api-Token ${env:DT_API_TOKEN}" service: extensions: @@ -259,7 +252,7 @@ alternateConfig: - batch/metrics exporters: # - debug - - otlphttp/sink + - otlp_http/dynatrace telemetry: resource: k8s.cluster.name: "${env:K8S_CLUSTER_NAME}" diff --git a/config_examples/prometheus-large-scale/tier3-sink.values.yaml b/config_examples/prometheus-large-scale/tier3-sink.values.yaml deleted file mode 100644 index 49d55c284..000000000 --- a/config_examples/prometheus-large-scale/tier3-sink.values.yaml +++ /dev/null @@ -1,293 +0,0 @@ -# Tier 3: Sink — receives OTLP from tier 2, splits scrape metadata from -# application metrics, and exports both to Dynatrace. -# Deploy with: -# helm install otel-sink open-telemetry/opentelemetry-collector \ -# --namespace otel-ta -f tier3-sink.values.yaml - -fullnameOverride: "tiered-sink" - -mode: deployment - -podAnnotations: - metrics.dynatrace.com/scrape: "true" - metrics.dynatrace.com/port: "8888" - -image: - repository: ghcr.io/dynatrace/dynatrace-otel-collector/dynatrace-otel-collector - # renovate: datasource=docker depName=ghcr.io/dynatrace/dynatrace-otel-collector/dynatrace-otel-collector - tag: 0.46.0 -command: - name: dynatrace-otel-collector - -resources: - requests: - cpu: 500m - memory: 3Gi - limits: - memory: 3Gi - -clusterRole: - create: false - -serviceAccount: - create: false - name: tiered-otel-sink - -extraEnvsFrom: - - secretRef: - name: dynatrace-otelcol-credentials -extraEnvs: - - name: K8S_SERVICE_NAME - value: "{{ include \"opentelemetry-collector.fullname\" . 
}}" - - name: K8S_CLUSTER_NAME - value: "tiered-otelcol-ta-test" - - name: K8S_NODE_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - - name: K8S_POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: K8S_NAMESPACE_NAME - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: K8S_POD_UID - valueFrom: - fieldRef: - fieldPath: metadata.uid - -alternateConfig: - extensions: - health_check: - endpoint: "${env:MY_POD_IP}:13133" - - receivers: - otlp: - protocols: - grpc: - endpoint: "${env:MY_POD_IP}:4317" - http: - endpoint: "${env:MY_POD_IP}:4318" - - processors: - memory_limiter: - check_interval: 1s - limit_percentage: 99 - spike_limit_percentage: 15 - cumulativetodelta: - max_staleness: 5m - # Keep ONLY scrape metadata metrics - filter/keep-scrape-meta: - error_mode: ignore - metrics: - metric: - - 'not (name == "up" or name == "scrape_duration_seconds" or name == "scrape_samples_scraped" or name == "scrape_series_added" or name == "scrape_samples_post_metric_relabeling")' - # Drop scrape metadata metrics from the main pipeline - filter/drop-scrape-meta: - error_mode: ignore - metrics: - metric: - - 'IsMatch(name, "^avalanche_.*")' - - 'name == "up" or name == "scrape_duration_seconds" or name == "scrape_samples_scraped" or name == "scrape_series_added" or name == "scrape_samples_post_metric_relabeling"' - k8s_attributes: - extract: - annotations: - - from: pod - key_regex: metadata.dynatrace.com/(.*) - tag_name: $$1 - - from: pod - key: metadata.dynatrace.com - tag_name: metadata.dynatrace.com - metadata: - - k8s.pod.name - - k8s.pod.uid - - k8s.pod.ip - - k8s.deployment.name - - k8s.replicaset.name - - k8s.statefulset.name - - k8s.daemonset.name - - k8s.job.name - - k8s.cronjob.name - - k8s.namespace.name - - k8s.node.name - - k8s.cluster.uid - - k8s.container.name - - k8s.deployment.uid - - k8s.replicaset.uid - - k8s.statefulset.uid - - k8s.daemonset.uid - - k8s.job.uid - - k8s.cronjob.uid - pod_association: - - sources: - - from: resource_attribute - name: server.address - - sources: - - from: resource_attribute - name: k8s.pod.name - - from: resource_attribute - name: k8s.namespace.name - - sources: - - from: resource_attribute - name: k8s.pod.ip - - sources: - - from: resource_attribute - name: k8s.pod.uid - - sources: - - from: connection - transform: - metric_statements: - - context: resource - statements: - - set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.statefulset.name"]) where IsString(resource.attributes["k8s.statefulset.name"]) - - set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.replicaset.name"]) where IsString(resource.attributes["k8s.replicaset.name"]) - - set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.job.name"]) where IsString(resource.attributes["k8s.job.name"]) - - set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.deployment.name"]) where IsString(resource.attributes["k8s.deployment.name"]) - - set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.daemonset.name"]) where IsString(resource.attributes["k8s.daemonset.name"]) - - set(resource.attributes["k8s.workload.name"], resource.attributes["k8s.cronjob.name"]) where IsString(resource.attributes["k8s.cronjob.name"]) - - - set(resource.attributes["k8s.workload.kind"], "statefulset") where IsString(resource.attributes["k8s.statefulset.name"]) - - set(resource.attributes["k8s.workload.kind"], "replicaset") where IsString(resource.attributes["k8s.replicaset.name"]) - - 
set(resource.attributes["k8s.workload.kind"], "job") where IsString(resource.attributes["k8s.job.name"]) - - set(resource.attributes["k8s.workload.kind"], "deployment") where IsString(resource.attributes["k8s.deployment.name"]) - - set(resource.attributes["k8s.workload.kind"], "daemonset") where IsString(resource.attributes["k8s.daemonset.name"]) - - set(resource.attributes["k8s.workload.kind"], "cronjob") where IsString(resource.attributes["k8s.cronjob.name"]) - - - set(resource.attributes["k8s.workload.uid"], resource.attributes["k8s.statefulset.uid"]) where IsString(resource.attributes["k8s.statefulset.uid"]) - - set(resource.attributes["k8s.workload.uid"], resource.attributes["k8s.replicaset.uid"]) where IsString(resource.attributes["k8s.replicaset.uid"]) - - set(resource.attributes["k8s.workload.uid"], resource.attributes["k8s.job.uid"]) where IsString(resource.attributes["k8s.job.uid"]) - - set(resource.attributes["k8s.workload.uid"], resource.attributes["k8s.deployment.uid"]) where IsString(resource.attributes["k8s.deployment.uid"]) - - set(resource.attributes["k8s.workload.uid"], resource.attributes["k8s.daemonset.uid"]) where IsString(resource.attributes["k8s.daemonset.uid"]) - - set(resource.attributes["k8s.workload.uid"], resource.attributes["k8s.cronjob.uid"]) where IsString(resource.attributes["k8s.cronjob.uid"]) - - - delete_key(resource.attributes, "k8s.statefulset.name") - - delete_key(resource.attributes, "k8s.replicaset.name") - - delete_key(resource.attributes, "k8s.job.name") - - delete_key(resource.attributes, "k8s.deployment.name") - - delete_key(resource.attributes, "k8s.daemonset.name") - - delete_key(resource.attributes, "k8s.cronjob.name") - - delete_key(resource.attributes, "k8s.statefulset.uid") - - delete_key(resource.attributes, "k8s.replicaset.uid") - - delete_key(resource.attributes, "k8s.deployment.uid") - - delete_key(resource.attributes, "k8s.daemonset.uid") - - delete_key(resource.attributes, "k8s.job.uid") - - delete_key(resource.attributes, "k8s.cronjob.uid") - - - context: resource - statements: - - delete_key(resource.attributes, "processor") - - delete_key(resource.attributes, "otel.signal") - - delete_key(resource.attributes, "otel.scope.name") - - delete_key(resource.attributes, "otel.scope.version") - - - context: resource - statements: - - set(resource.attributes["k8s.cluster.name"], "${env:K8S_CLUSTER_NAME}") where resource.attributes["k8s.cluster.name"] == nil and Len("${env:K8S_CLUSTER_NAME}") > 0 - - - context: resource - statements: - - merge_maps(resource.attributes, ParseJSON(resource.attributes["metadata.dynatrace.com"]), "upsert") where IsMatch(resource.attributes["metadata.dynatrace.com"], "^\\{") - - delete_key(resource.attributes, "metadata.dynatrace.com") - - exporters: - debug: - verbosity: basic - otlphttp/dynatrace: - endpoint: "${env:DT_ENDPOINT}" - headers: - Authorization: "Api-Token ${env:DT_API_TOKEN}" - - service: - extensions: - - health_check - pipelines: - # Main metrics — everything except scrape metadata - metrics: - receivers: - - otlp - processors: - - memory_limiter - - filter/drop-scrape-meta - - cumulativetodelta - - k8s_attributes - - transform - exporters: - #- debug - - otlphttp/dynatrace - # Scrape metadata only (up, scrape_duration_seconds, etc.) 
- metrics/scrape-meta: - receivers: - - otlp - processors: - - memory_limiter - - filter/keep-scrape-meta - - k8s_attributes - - transform - exporters: - #- debug - - otlphttp/dynatrace - traces: - receivers: - - otlp - processors: - - memory_limiter - exporters: - #- debug - - otlphttp/dynatrace - logs: - receivers: - - otlp - processors: - - memory_limiter - exporters: - #- debug - - otlphttp/dynatrace - telemetry: - resource: - k8s.cluster.name: "${env:K8S_CLUSTER_NAME}" - k8s.namespace.name: "${env:K8S_NAMESPACE_NAME}" - k8s.pod.name: "${env:K8S_POD_NAME}" - k8s.pod.uid: "${env:K8S_POD_UID}" - k8s.node.name: "${env:K8S_NODE_NAME}" - metrics: - level: detailed - readers: - - pull: - exporter: - prometheus: - host: "0.0.0.0" - port: 8888 - without_type_suffix: true - without_units: true - logs: - level: INFO - processors: - - batch: - exporter: - otlp: - protocol: http/protobuf - endpoint: "${env:DT_ENDPOINT}/v1/logs" - headers: - - name: Authorization - value: "Api-Token ${env:DT_API_TOKEN}" - -ports: - otlp: - enabled: true - otlp-http: - enabled: true - jaeger-compact: - enabled: false - jaeger-thrift: - enabled: false - jaeger-grpc: - enabled: false - zipkin: - enabled: false - metrics: - enabled: false - -service: - enabled: true From 53154f93cb0c800ba983e80c21ba849ac0cf95df Mon Sep 17 00:00:00 2001 From: Moritz Wiesinger <6901203+mowies@users.noreply.github.com> Date: Thu, 7 May 2026 08:09:46 +0200 Subject: [PATCH 4/5] remove comment --- config_examples/prometheus-large-scale/allocator.values.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/config_examples/prometheus-large-scale/allocator.values.yaml b/config_examples/prometheus-large-scale/allocator.values.yaml index bc7ea2514..c321c4664 100644 --- a/config_examples/prometheus-large-scale/allocator.values.yaml +++ b/config_examples/prometheus-large-scale/allocator.values.yaml @@ -32,10 +32,6 @@ targetAllocator: scrape_config_selector: prometheus.dynatrace.com: "true" -# filter_strategy: relabel-config -# config: -# scrape_configs: [] - resources: limits: memory: 200Mi From 716e93d7e10a99ee5dd5b9cb36511630cf2a60c2 Mon Sep 17 00:00:00 2001 From: Moritz Wiesinger <6901203+mowies@users.noreply.github.com> Date: Thu, 7 May 2026 08:14:27 +0200 Subject: [PATCH 5/5] add readme --- config_examples/README.md | 1 + .../prometheus-large-scale/README.md | 26 +++++++++++++++++++ 2 files changed, 27 insertions(+) create mode 100644 config_examples/prometheus-large-scale/README.md diff --git a/config_examples/README.md b/config_examples/README.md index 9f4114d3d..8839c80a6 100644 --- a/config_examples/README.md +++ b/config_examples/README.md @@ -29,6 +29,7 @@ Dynatrace distribution of the OpenTelemetry Collector. - [Redaction Processor](redaction.yaml) - [Host Metrics Receiver](host-metrics.yaml) - [Dynatrace Resource Detector](resource-detection.yaml) +- [Large Scale Prometheus Scraping](./prometheus-large-scale) ## Sending data to Dynatrace diff --git a/config_examples/prometheus-large-scale/README.md b/config_examples/prometheus-large-scale/README.md new file mode 100644 index 000000000..6fdafa9ca --- /dev/null +++ b/config_examples/prometheus-large-scale/README.md @@ -0,0 +1,26 @@ +# Prometheus Large-Scale + +Tiered OTel Collector setup for scraping Prometheus targets at scale and shipping to Dynatrace. + +## Architecture + +- **Tier 1 — Scraper** (`tier1-scraper.values.yaml`): scrapes targets assigned by Target Allocator, load-balances OTLP to tier 2. 
+- **Tier 2 — Gateway** (`tier2-gateway.values.yaml`): receives OTLP from tier 1, enriches the metrics, and exports to Dynatrace.
+- **Target Allocator** (`allocator.values.yaml`): distributes scrape targets across tier 1 replicas (consistent hashing).
+- **Selfmon Scraper** (`selfmon-scraper.yaml`): scrapes collector/allocator self-metrics and exports them directly to Dynatrace.
+- **ScrapeConfig** (`scrapeconfig.yaml`): example Prometheus Operator `ScrapeConfig` CR consumed by the Target Allocator.
+- **RBAC** (`rbac.yaml`): ServiceAccounts, ClusterRoles, and bindings for the scraper, gateway, sink, and allocator.
+
+## Deploy
+
+Set `NAMESPACE` (the tier 1 and selfmon values assume `otel-ta`) and substitute it into the manifests before applying them, since plain `kubectl apply` does not expand `${NAMESPACE}`. Then install the Helm charts into the same namespace:
+
+```sh
+export NAMESPACE=otel-ta
+envsubst < rbac.yaml | kubectl apply -f -
+envsubst < scrapeconfig.yaml | kubectl apply -f -
+
+helm install otel-allocator open-telemetry/opentelemetry-target-allocator -n "$NAMESPACE" -f allocator.values.yaml
+helm install otel-scraper open-telemetry/opentelemetry-collector -n "$NAMESPACE" -f tier1-scraper.values.yaml
+helm install otel-gateway open-telemetry/opentelemetry-collector -n "$NAMESPACE" -f tier2-gateway.values.yaml
+helm install otel-selfmon open-telemetry/opentelemetry-collector -n "$NAMESPACE" -f selfmon-scraper.yaml
+```
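+
+The scraper, gateway, and selfmon values files all read Dynatrace credentials from a `dynatrace-otelcol-credentials` secret via `extraEnvsFrom`. A minimal sketch of creating it; the endpoint and token values are placeholders to replace with your own tenant URL and access token:
+
+```sh
+# DT_ENDPOINT is the tenant's OTLP base URL; DT_API_TOKEN needs the
+# relevant ingest scopes (e.g. metrics.ingest, logs.ingest).
+kubectl create secret generic dynatrace-otelcol-credentials \
+  --namespace "$NAMESPACE" \
+  --from-literal=DT_ENDPOINT="https://<tenant-id>.live.dynatrace.com/api/v2/otlp" \
+  --from-literal=DT_API_TOKEN="<your-api-token>"
+```
+
+The tier 2 values also reference an optional `DT_ENTITY_KUBERNETES_CLUSTER` variable; add it to the same secret if you want that enrichment.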