[Kubernetes] OTel(OpenTelemetry) Collector - Logging

OpenTelemetry Collector

The architecture can be split into Plan A, which combines the OTel Collector with the existing log collection tools, and Plan B, which is built from the OTel Collector alone.

OpenTelemetry Collector Deployment and Configuration

  • Add the following line to /etc/rsyslog.conf (e.g. vi /etc/rsyslog.conf)
    
    *.* action(type="omfwd" target="0.0.0.0" port="54527" protocol="tcp" action.resumeRetryCount="10" queue.type="linkedList" queue.size="10000")
    
  • syslog and container logs
    
    apiVersion: opentelemetry.io/v1beta1
    kind: OpenTelemetryCollector
    metadata:
      name: otel-log
    spec:
      mode: daemonset
      hostNetwork: true
      podSecurityContext:
        runAsUser: 0
        runAsGroup: 0
      tolerations:
        - operator: Exists
      volumes:
        # Typically the collector will want access to pod logs and container logs
        - name: varlogpods
          hostPath:
            path: /var/log/pods
        - name: varlibdockercontainers
          hostPath:
            path: /var/lib/docker/containers
        - name: applogs
          hostPath:
            path: /appdata/applog
      volumeMounts:
        # Mount the volumes to the collector container
        - name: varlogpods
          mountPath: /var/log/pods
          readOnly: true
        - name: varlibdockercontainers
          mountPath: /var/lib/docker/containers
          readOnly: true
        - name: applogs
          mountPath: /appdata/applog
          readOnly: true
      config:
        # This is a new configuration file - do not merge this with your metrics configuration!
        receivers:
          syslog:
            tcp:
              listen_address: '0.0.0.0:54527'
            protocol: rfc3164
            location: Asia/Seoul # specify the server timezone here (e.g. UTC)
            operators:
              - type: move
                from: attributes.message
                to: body
              - type: move
                from: attributes.hostname
                to: resource["hostname"]
              - type: move
                from: attributes.appname
                to: resource["daemon"]
    
          filelog/applog:
            include:
              - /appdata/applog/*/*/*.log
            operators:
              # Extract metadata from file path
              - type: regex_parser
                id: extract_metadata_from_filepath
                # Pod UID is not always 36 characters long
                regex: '^.*\/(?P<namespace>\S+)\/(?P<pod_name>\S+)\/(?P<log_file_name>\S+)\.log$'
                parse_from: attributes["log.file.path"]
                cache:
                  size: 128 # default maximum amount of Pods per Node is 110
              # Rename attributes
              - type: move
                from: attributes["log.file.path"]
                to: resource["filename"]
              - type: move
                from: attributes.namespace
                to: resource["namespace"]
              - type: move
                from: attributes.pod_name
                to: resource["pod"]
              - type: add
                field: resource["cluster"]
                value: 'your-cluster-name'
    
          filelog:
            include:
              - /var/log/pods/*/*/*.log
            exclude:
              # Exclude logs from all containers named otel-collector
              - /var/log/pods/*/otel-collector/*.log
            start_at: beginning
            include_file_path: true
            include_file_name: false
            operators:
              # Find out which format is used by kubernetes
              - type: router
                id: get-format
                routes:
                  - output: parser-docker
                    expr: 'body matches "^\\{"'
                  - output: parser-crio
                    expr: 'body matches "^[^ Z]+ "'
                  - output: parser-containerd
                    expr: 'body matches "^[^ Z]+Z"'
              # Parse CRI-O format
              - type: regex_parser
                id: parser-crio
                regex: '^(?P<time>[^ Z]+) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) ?(?P<log>.*)$'
                output: extract_metadata_from_filepath
                timestamp:
                  parse_from: attributes.time
                  layout_type: gotime
                  layout: '2006-01-02T15:04:05.999999999Z07:00'
              # Parse CRI-Containerd format
              - type: regex_parser
                id: parser-containerd
                regex: '^(?P<time>[^ ^Z]+Z) (?P<stream>stdout|stderr) (?P<logtag>[^ ]*) ?(?P<log>.*)$'
                output: extract_metadata_from_filepath
                timestamp:
                  parse_from: attributes.time
                  layout: '%Y-%m-%dT%H:%M:%S.%LZ'
              # Parse Docker format
              - type: json_parser
                id: parser-docker
                output: extract_metadata_from_filepath
                timestamp:
                  parse_from: attributes.time
                  layout: '%Y-%m-%dT%H:%M:%S.%LZ'
              # Extract metadata from file path
              - type: regex_parser
                id: extract_metadata_from_filepath
                # Pod UID is not always 36 characters long
                regex: '^.*\/(?P<namespace>[^_]+)_(?P<pod_name>[^_]+)_(?P<uid>[a-f0-9\-]{16,36})\/(?P<container_name>[^\._]+)\/(?P<restart_count>\d+)\.log$'
                parse_from: attributes["log.file.path"]
                cache:
                  size: 128 # default maximum amount of Pods per Node is 110
              # Rename attributes
              - type: move
                from: attributes["log.file.path"]
                to: resource["filename"]
              - type: move
                from: attributes.container_name
                to: resource["container"]
              - type: move
                from: attributes.namespace
                to: resource["namespace"]
              - type: move
                from: attributes.pod_name
                to: resource["pod"]
              - type: add
                field: resource["cluster"]
                value: 'your-cluster-name' # Set your cluster name here
              - type: move
                from: attributes.log
                to: body
    
        processors:
          attributes:
            actions:
            - action: insert
              key: loki.resource.labels
              value: hostname, daemon
          resource:
            attributes:
              - action: insert
                key: loki.format
                value: raw
              - action: insert
                key: loki.resource.labels
                value: pod, namespace, container, cluster, filename
    
        exporters:
          loki:
            endpoint: http://<Loki-svc>.<Loki-Namespace>.svc/loki/api/v1/push # or https://LOKI_USERNAME:ACCESS_POLICY_TOKEN@LOKI_URL/loki/api/v1/push for a hosted Loki
    
        service:
          pipelines:
            logs:
              receivers: [syslog, filelog/applog, filelog]
              processors: [attributes, resource]
              exporters: [loki]
    
  • Updated container log collection method, using the container parser operator
    
    apiVersion: opentelemetry.io/v1beta1
    kind: OpenTelemetryCollector
    metadata:
      name: otel-log
    spec:
      mode: daemonset
      hostNetwork: true
      volumes:
        # Typically the collector will want access to pod logs and container logs
        - name: varlogpods
          hostPath:
            path: /var/log/pods
        - name: varlibdockercontainers
          hostPath:
            path: /var/lib/docker/containers
      volumeMounts:
        # Mount the volumes to the collector container
        - name: varlogpods
          mountPath: /var/log/pods
          readOnly: true
        - name: varlibdockercontainers
          mountPath: /var/lib/docker/containers
          readOnly: true
      config:
        # This is a new configuration file - do not merge this with your metrics configuration!
        receivers:
          filelog:
            include_file_path: true
            include:
              - /var/log/pods/*/*/*.log
            operators:
              - id: container-parser
                type: container
    
        processors:
          resource:
            attributes:
              - action: insert
                key: loki.format
                value: raw
              - action: insert
                key: loki.resource.labels
                value: pod, namespace, container, cluster, filename
    
        exporters:
          loki:
            endpoint: http://<Loki-svc>.<Loki-Namespace>.svc/loki/api/v1/push # or https://LOKI_USERNAME:ACCESS_POLICY_TOKEN@LOKI_URL/loki/api/v1/push for a hosted Loki
    
        service:
          pipelines:
            logs:
              receivers: [filelog]
              processors: [resource]
              exporters: [loki]
    

Reference: https://opentelemetry.io/blog/2024/otel-collector-container-log-parser/

Receiver Configuration - Plan A

The receiver side uses the loki receiver as the entry point that accepts log data from Promtail and Event Exporter. By using the loki receiver, the OTel Collector can expose the same endpoint that Loki itself exposes, so the existing log collection components can keep sending logs to the OTel Collector in exactly the same way as before.
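
A minimal sketch of such an entry point, assuming the contrib loki receiver and illustrative ports (3100 mimics Loki's default listen port):

receivers:
  loki:
    protocols:
      http:
        endpoint: 0.0.0.0:3100 # Promtail / Event Exporter push to /loki/api/v1/push on this port
      grpc:
        endpoint: 0.0.0.0:3600
    use_incoming_timestamp: true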

Receiver Configuration - Plan B

Three receivers are used: a filelog receiver for container logs, a second filelog receiver for system logs, and a k8s_events receiver for Kubernetes event logs.

Container logs are collected by the filelog receiver from /var/log/pods/*/*/*.log; the file path and body are then parsed to extract the container name, pod name, namespace, and other metadata.

System logs are collected by a separate filelog receiver from the dmesg, messages, and secure files under /var/log, with a syslog_parser operator extracting the fields.

Kubernetes event logs are collected from the Kubernetes API using the k8s_events receiver.
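
For reference, a sketch of the system-log and event receivers described above; the file paths are assumptions for a typical RHEL-style node:

receivers:
  filelog/syslog:
    include:
      - /var/log/messages
      - /var/log/secure
      - /var/log/dmesg
    start_at: end
    operators:
      - type: syslog_parser # parse classic BSD-style syslog lines
        protocol: rfc3164
  k8s_events:
    auth_type: serviceAccount # read events from the Kubernetes API server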

Processor Configuration

Four processors are used: k8sattributes to attach Kubernetes attributes to the logs, resource to build the Loki labels, memory_limiter to guard against OOM, and batch to ship the logs in batches.

The k8sattributes processor takes the container logs collected by filelog and attaches the matching pod, deployment, cluster, and other metadata to the data.

The resource processor converts the attributes attached above into the labels that Loki needs for indexing.

The batch and memory_limiter processors control how the processed log data is exported.
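
Put together, the processor block described above could look roughly like this; the k8sattributes metadata keys are illustrative, and the memory_limiter and batch values mirror the ones used in the manifests later in this post:

processors:
  k8sattributes:
    auth_type: serviceAccount
    extract:
      metadata: # which Kubernetes metadata to attach to each log record
        - k8s.namespace.name
        - k8s.pod.name
        - k8s.container.name
        - k8s.deployment.name
        - k8s.node.name
  resource:
    attributes:
      - action: insert
        key: loki.resource.labels # promote these resource attributes to Loki labels
        value: pod, namespace, container, cluster, filename
  memory_limiter:
    check_interval: 1s
    limit_percentage: 80
    spike_limit_percentage: 25
  batch:
    send_batch_size: 10000
    timeout: 5s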

Exporter Configuration

The exporter side uses the loki exporter to send the logs to Loki.

Its endpoint attribute is set to the Loki address with the "/loki/api/v1/push" path appended, which is Loki's log ingestion entry point, and the collected logs are pushed there.

Pipeline Configuration

Finally, a pipeline is defined that combines the receivers, processors, and exporters defined above in the right order.

In particular, the order of the processor entries determines the order in which the logs are processed, so it is important to keep the order above.

Log data collected by the loki receiver is processed through k8sattributes, resource, memory_limiter, and batch in that order, and then sent to the Loki backend via the loki exporter.
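
As a sketch, the Plan A pipeline, assuming the receiver and processor names used above, would be wired like this:

service:
  pipelines:
    logs:
      receivers: [loki]
      processors: [k8sattributes, resource, memory_limiter, batch]
      exporters: [loki]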

Pipeline Configuration - Plan B

Log data collected by the filelog and k8s_events receivers is processed through k8sattributes, resource, memory_limiter, and batch in that order, and then sent to the Loki backend via the loki exporter.
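
And the corresponding Plan B pipeline, assuming the receiver names used in the sketches above:

service:
  pipelines:
    logs:
      receivers: [filelog, filelog/syslog, k8s_events]
      processors: [k8sattributes, resource, memory_limiter, batch]
      exporters: [loki]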

Node Collector (DaemonSet)

  • File Logs
  • Host metrics
  • Kubelet stats metrics

  • A collector that groups the receivers for which the official documentation recommends a DaemonSet deployment.

Log | Filelog

The collection targets are the Kubernetes and application logs written to stdout/stderr, so in practice it replaces Fluent Bit. Beyond log scraping and forwarding, the various processors mentioned in the Processors section should also be considered.

Metric | Kubelet Stats

It handles infrastructure resource metrics such as CPU, memory, filesystem, network I/O, and error metrics at the node, pod, container, and volume level, scraping them from the API exposed by each node's kubelet. In practice it replaces cAdvisor.

Metric | Host Metrics

The collection targets are node-level metrics (CPU, disk, CPU load, filesystem, memory, network, paging, processes, ...), so in practice it replaces the Prometheus Node Exporter. Some items overlap with the Kubelet Stats receiver, so deduplication is needed when both are run together.

# otel-node-collector service accounts are created automatically
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: otel-node-collector
rules:
  - apiGroups: [""]
    resources: ["nodes/stats", "nodes/proxy"]
    verbs: ["get", "watch", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: otel-node-collector
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: otel-node-collector
subjects:
  - kind: ServiceAccount
    name: otel-node-collector
    namespace: cluster
---
apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
  name: otel-node
  namespace: cluster
  labels:
    app: otel-node-collector
spec:
  mode: daemonset
  resources:
    # requests:
    #   cpu: 10m
    #   memory: 10Mi
    limits:
      cpu: 500m
      memory: 1000Mi
  podAnnotations:
    prometheus.io/scrape: "true"
    prometheus.io/port: "8888"
  env:
    - name: NODE_NAME
      valueFrom:
        fieldRef:
          fieldPath: spec.nodeName
  # volumes:
  #   - name: hostfs
  #     hostPath:
  #       path: /
  # volumeMounts:
  #   - name: hostfs
  #     mountPath: /hostfs
  #     readOnly: true
  #     mountPropagation: HostToContainer
  config:
    extensions:
      health_check: # for k8s liveness and readiness probes
        endpoint: 0.0.0.0:13133 # default

    processors:
      batch: # buffer up to 10000 spans, metric data points, log records for up to 5 seconds
        send_batch_size: 10000
        timeout: 5s
      memory_limiter:
        check_interval: 1s # recommended by official README
        limit_percentage: 80 # in 1Gi memory environment, hard limit is 800Mi
        spike_limit_percentage: 25 # in 1Gi memory environment, soft limit is 500Mi (800 - 250 = 550Mi)

    service:
      extensions:
        - health_check

      telemetry:
        logs:
          level: INFO
        metrics:
          address: 0.0.0.0:8888

      pipelines:
        metrics:
          receivers:
            - kubeletstats
            # - hostmetrics
          processors:
            - memory_limiter
            - batch
          exporters:
            - otlphttp/prometheus

    receivers:
      kubeletstats:
        auth_type: serviceAccount
        endpoint: https://${env:NODE_NAME}:10250
        collection_interval: 10s
        insecure_skip_verify: true
        extra_metadata_labels:
          - k8s.volume.type
        k8s_api_config:
          auth_type: serviceAccount
        metric_groups:
          - node
          - pod
          - container
          - volume

      # hostmetrics:
      #   collection_interval: 10s
      #   root_path: /hostfs
      #   scrapers:
      #     cpu:        # CPU utilization metrics
      #     load:       # CPU load metrics
      #     memory:     # Memory utilization
      #     disk:       # Disk I/O metrics
      #     filesystem: # File System utilization metrics
      #     network:    # Network interface I/O metrics & TCP connection metrics
      #     paging:     # Paging/Swap space utilization and I/O metrics
      #     processes:  # Process count metrics
      #     process:    # Per process CPU, Memory, and Disk I/O metrics
      #       # The following settings can be used to handle the error to work hostmetrics: 2024-05-12T01:06:30.683Z        error   scraperhelper/scrapercontroller.go:197  Error scraping metrics  {"kind": "receiver", "name": "hostmetrics", "data_type": "metrics", "error": "error reading process executable for pid 1: readlink /hostfs/proc/1/exe: permission denied; error reading username for process \"systemd\" (pid 1): open /etc/passwd: no such file or directory;
      #       # refer: https://github.com/open-telemetry/opentelemetry-collector-contrib/pull/28661
      #       mute_process_name_error: true
      #       mute_process_exe_error: true
      #       mute_process_io_error: true
      #       mute_process_user_error: true
      #       mute_process_cgroup_error: true

    exporters:
      debug:
        verbosity: basic # detailed, basic

      otlphttp/prometheus:
        metrics_endpoint: http://prometheus-server.cluster.svc.cluster.local:80/api/v1/otlp/v1/metrics
        tls:
          insecure: true

Cluster Collector (Single Pod)

  • k8s objects and events (logs)
  • k8s cluster (metrics)

This collector targets the receivers for which a single replica is recommended. The official documentation explains that running two or more instances of these receivers can produce duplicate data, because both receivers collect from a cluster-wide point of view. Accordingly, it is deployed as a Deployment with one replica.

Log | Kubernetes Objects

It is used mainly to collect Kubernetes events, but it can also collect any objects served by the Kubernetes API server (run kubectl api-resources for the full list).

Metric | Kubernetes Cluster

In practice a replacement for kube-state-metrics, it collects cluster-level metrics and entity events from the Kubernetes API server.

apiVersion: v1
kind: ServiceAccount
metadata:
  name: otel-collector-opentelemetry-collector
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: otel-collector-opentelemetry-collector
rules:
  - apiGroups:
      - ''
    resources:
      - events
      - namespaces
      - namespaces/status
      - nodes
      - nodes/spec
      - pods
      - pods/status
      - replicationcontrollers
      - replicationcontrollers/status
      - resourcequotas
      - services
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - apps
    resources:
      - daemonsets
      - deployments
      - replicasets
      - statefulsets
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - extensions
    resources:
      - daemonsets
      - deployments
      - replicasets
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - batch
    resources:
      - jobs
      - cronjobs
    verbs:
      - get
      - list
      - watch
  - apiGroups:
      - autoscaling
    resources:
      - horizontalpodautoscalers
    verbs:
      - get
      - list
      - watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: otel-collector-opentelemetry-collector
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: otel-collector-opentelemetry-collector
subjects:
  - kind: ServiceAccount
    name: otel-collector-opentelemetry-collector
    namespace: default
---
# otel-cluster-collector service accounts are created automatically
apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
  name: otel-cluster
  namespace: cluster
  labels:
    app: otel-cluster-collector
spec:
  mode: deployment
  replicas: 1
  podAnnotations:
    prometheus.io/scrape: "true"
    prometheus.io/port: "8888"
  config:
    extensions:
      health_check: # for k8s liveness and readiness probes
        endpoint: 0.0.0.0:13133 # default

    processors:
      batch: # buffer up to 10000 spans, metric data points, log records for up to 5 seconds
        send_batch_size: 10000
        timeout: 5s
      memory_limiter:
        check_interval: 1s # recommended by official README
        limit_percentage: 80 # in 1Gi memory environment, hard limit is 800Mi
        spike_limit_percentage: 25 # in 1Gi memory environment, soft limit is 500Mi (800 - 250 = 550Mi)
      attributes:
        actions:
          - key: elasticsearch.index.prefix
            value: otel-k8sobject
            action: insert
    service:
      extensions:
        - health_check

      telemetry:
        logs:
          level: DEBUG
        metrics:
          address: 0.0.0.0:8888

      pipelines:
        logs:
          receivers:
            - k8sobjects
          processors:
            - memory_limiter
            - batch
            - attributes
          exporters:
            - debug
            - elasticsearch

        metrics:
          receivers:
            - k8s_cluster
          processors:
            - memory_limiter
            - batch
          exporters:
            - otlphttp/prometheus

    receivers:
      k8sobjects:
        objects:
          - name: pods
            mode: pull
          - name: events
            mode: watch
      k8s_cluster:
        collection_interval: 10s
        node_conditions_to_report:
          - Ready
          - MemoryPressure
        allocatable_types_to_report:
          - cpu
          - memory
          - ephemeral-storage
          - storage

    exporters:
      debug:
        verbosity: detailed # default is basic

      otlphttp/prometheus:
        metrics_endpoint: http://prometheus-server.cluster.svc.cluster.local:80/api/v1/otlp/v1/metrics
        tls:
          insecure: true

      elasticsearch:
        endpoints:
          - http://elasticsearch-es-http.cluster.svc.cluster.local:9200
        logs_index: ""
        logs_dynamic_index:
          enabled: true
        logstash_format:
          enabled: true
        user: anyflow
        password: mycluster
apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
  name: otel-cluster-k8s-events
  namespace: cluster
  labels:
    app: otel-cluster-collector
spec:
  mode: deployment
  replicas: 1
  config:
    receivers:
      k8s_events:
        auth_type: serviceAccount

    processors:
      batch:

    exporters:
      loki:
        endpoint: http://<Loki-svc>.<Loki-Namespace>.svc/loki/api/v1/push # or https://LOKI_USERNAME:ACCESS_POLICY_TOKEN@LOKI_URL/loki/api/v1/push for a hosted Loki
    service:
      pipelines:
        logs:
          receivers: [k8s_events]
          processors: [batch]
          exporters: [loki]

Prometheus Collector (StatefulSet)

  • prometheus metrics
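
A minimal sketch of a StatefulSet collector that scrapes Prometheus targets via the target allocator; the name, scrape job, and Prometheus endpoint below are assumptions:

apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
  name: otel-prometheus
  namespace: cluster
spec:
  mode: statefulset # the mode recommended for use with the target allocator
  replicas: 1
  targetAllocator:
    enabled: true
    prometheusCR:
      enabled: true # pick up ServiceMonitor/PodMonitor resources
  config:
    receivers:
      prometheus:
        config:
          scrape_configs:
            - job_name: otel-collector # self-scrape; real jobs come from the target allocator
              scrape_interval: 30s
              static_configs:
                - targets: ['0.0.0.0:8888']
    processors:
      batch:
    exporters:
      otlphttp/prometheus:
        metrics_endpoint: http://prometheus-server.cluster.svc.cluster.local:80/api/v1/otlp/v1/metrics
        tls:
          insecure: true
    service:
      pipelines:
        metrics:
          receivers: [prometheus]
          processors: [batch]
          exporters: [otlphttp/prometheus]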

OTLP Collector (Deployment)

  • Traces(OTEL)
  • Generic OTEL Logs
  • Generic OTEL metrics

This is a shared collector for signals whose receivers and exporters all use the OTLP protocol and have no constraint on the number of replicas; since there is no constraint, it uses a Deployment, the deployment pattern that is easiest to operate. It covers all of metrics, logs, and traces (MLT).

Trace | Generic OTEL trace

Jaeger and Grafana Tempo both support receiving OTLP natively.
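
The manifest below exports traces to Jaeger; if Grafana Tempo is the backend instead, the trace exporter could be swapped for something like this (the Tempo service address is an assumption):

exporters:
  otlp/tempo:
    endpoint: tempo-distributor.tempo.svc.cluster.local:4317 # Tempo's OTLP gRPC ingest
    tls:
      insecure: true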

Metric | Generic OTEL metric

This is the endpoint for collecting other metrics not discussed above, such as application-level metrics.

Log | Generic OTEL log

This is the endpoint for collecting other logs, including Istio's OTel access logs.
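
On the Istio side, the mesh can be pointed at this collector's OTLP gRPC endpoint with an OpenTelemetry access-log provider; a sketch, assuming the collector Service name created by the operator for otel-otlp:

# MeshConfig excerpt (e.g. inside an IstioOperator spec.meshConfig)
extensionProviders:
  - name: otel-als
    envoyOtelAls:
      service: otel-otlp-collector.cluster.svc.cluster.local # assumed Service name
      port: 4317
defaultProviders:
  accessLogging:
    - otel-als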

# otel-otlp-collector service accounts are created automatically
apiVersion: opentelemetry.io/v1beta1
kind: OpenTelemetryCollector
metadata:
  name: otel-otlp
  namespace: cluster
  labels:
    app: otel-otlp-collector
spec:
  mode: deployment
  # replicas: 1
  autoscaler:
    minReplicas: 1
    maxReplicas: 2
  resources:
    # requests:
    #   cpu: 10m
    #   memory: 10Mi
    limits:
      cpu: 500m
      memory: 1000Mi
  podAnnotations:
    prometheus.io/scrape: "true"
    prometheus.io/port: "8888"
  config:
    extensions:
      health_check: # for k8s liveness and readiness probes
        endpoint: 0.0.0.0:13133 # default

    processors:
      batch: # buffer up to 10000 spans, metric data points, log records for up to 5 seconds
        send_batch_size: 10000
        timeout: 5s
      memory_limiter:
        check_interval: 1s # recommended by official README
        limit_percentage: 80 # in 1Gi memory environment, hard limit is 800Mi
        spike_limit_percentage: 25 # in 1Gi memory environment, soft limit is 500Mi (800 - 250 = 550Mi)

    service:
      extensions:
        - health_check

      telemetry:
        logs:
          level: INFO
        metrics:
          address: 0.0.0.0:8888

      pipelines:
        traces:
          receivers:
            - otlp
          processors:
            - memory_limiter
            - batch
          exporters:
            - debug
            - otlp/jaeger

        logs:
          receivers:
            - otlp
          processors:
            - memory_limiter
            - batch
          exporters:
            - debug
            - elasticsearch

        metrics:
          receivers:
            - otlp
          processors:
            - memory_limiter
            - batch
          exporters:
            - debug
            - otlphttp/prometheus

    receivers:
      otlp:
        protocols:
          grpc:
            endpoint: 0.0.0.0:4317
          http:
            endpoint: 0.0.0.0:4318

    exporters:
      debug:
        verbosity: basic # detailed, basic

      otlp/jaeger:
        endpoint: jaeger-collector.istio-system.svc.cluster.local:4317
        tls:
          insecure: true

      otlphttp/prometheus:
        metrics_endpoint: http://prometheus-server.cluster.svc.cluster.local:80/api/v1/otlp/v1/metrics
        tls:
          insecure: true

      elasticsearch:
        endpoints:
          - http://elasticsearch-es-http.cluster.svc.cluster.local:9200
        logs_index: "istio-access-log"
        logs_dynamic_index:
          enabled: true
        logstash_format:
          enabled: true
        user: anyflow
        password: mycluster
This post is licensed under CC BY 4.0 by the author.