Sample configuration for AWS EC2 handling 25k entities and 10k metrics/sec (small) with NGINX Ingress controller
Download this sample configuration for AWS EC2 handling 25k entities and 10k metrics/sec (small), provided by ITRS.
# Example ITRS Analytics configuration for AWS EC2 handling 25k Obcerv entities, 10k metrics/sec, and 5k OpenTelemetry
# spans/sec (pre-sampling).
#
# Nodes: (4) c5.4xlarge (16CPU, 32GB)
#
# The resource requests total ~43 cores and ~107 GiB of memory (assuming the collection-agent DaemonSet runs on 4 nodes)
# and include Linkerd resources.
#
# HA CONFIGURATION NOTE:
# This configuration provides seamless HA for service layer workloads (2 replicas minimum for stateless services).
# However, timescale.clusterSize is set to 2 (1 primary + 1 read replica) due to resource impact.
#
# With clusterSize: 2, losing the read replica will cause query failures until it recovers (~2-5 minutes).
# For complete seamless HA including database reads, set timescale.clusterSize: 3 (adds 1 additional replica).
# Resource tradeoff: clusterSize 3 adds significant storage (4 x 512 GiB + 50 GiB + 50 GiB per replica) and compute
# (6 CPU cores + 14 GiB memory per replica). Weigh HA requirements against resource cost when choosing.
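#
# For example, a seamless-HA variant of this file changes only the cluster size (illustrative override; all
# other timescale settings below stay the same):
#
#   timescale:
#     clusterSize: 3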
#
# Disk requirements:
#   - Timescale:
#     - 4 x 512 GiB timeseries data disk for each replica (x2)
#     - 50 GiB data disk for each replica (x2)
#     - 50 GiB WAL disk for each replica (x2)
#   - Kafka broker: 100 GiB for each replica (x3)
#   - Kafka controller: 1 GiB for each replica (x3)
#   - Postgres: 3 GiB for each replica (x2)
#   - ClickHouse Keeper: 2 GiB for each replica (x3)
#   - ClickHouse Traces: 40 GiB for each replica (x2)
#   - Loki: 30 GiB
#   - etcd: 1 GiB for each replica (x3)
#   - Downsampled Metrics:
#     - Raw: 5 GiB for each replica (x1)
#     - Bucketed: 5 GiB for each replica (x1)
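#
# As a rough total (simple arithmetic over the sizes above): Timescale needs 2 x (4x512 + 50 + 50) = 4,296 GiB,
# and the remaining workloads add ~438 GiB, i.e. roughly 4.6 TiB of persistent storage overall.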
#
# The configuration references a default storage class named `gp3`, which uses EBS gp3 volumes. This storage class
# should be configured with the default minimum gp3 settings of 3000 IOPS and 125 MiB/s throughput; you can create
# this class yourself or change the config to use a class of your own, provided it offers similar performance.
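#
# For reference, a matching storage class could look like the following sketch. It assumes the AWS EBS CSI
# driver is installed; the parameters shown are the gp3 defaults noted above, and the name must match
# `defaultStorageClass` below:
#
#   apiVersion: storage.k8s.io/v1
#   kind: StorageClass
#   metadata:
#     name: gp3
#   provisioner: ebs.csi.aws.com
#   parameters:
#     type: gp3
#     iops: "3000"
#     throughput: "125"
#   volumeBindingMode: WaitForFirstConsumer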
#
# This configuration is based upon a certain number of Obcerv entities, average metrics per entity, and
# average metrics collection interval. The following formula can be used to estimate the expected load:
#
#   metrics/sec = (Obcerv entities * metrics/entity) / average metrics collection interval
#
# In this example configuration, we have the following:
#
#   10,000 metrics/sec = (25,000 Obcerv entities * 4 metrics/entity) / 10 seconds average metrics collection interval
#
# NOTE: Ingestion, storage, and retrieval of OpenTelemetry spans is a beta feature.
#
# Additionally, the configuration is based upon a certain number of OpenTelemetry spans per second, sampled
# according to the following rules:
#   - Error traces are always sampled.
#   - Target sampling probability per endpoint (corresponding to the name of the root span) is 0.01.
#   - Target sampling rate / second / endpoint (corresponding to the name of the root span) is 0.5.
#   - Root span duration outlier quantile is 0.95: the durations of all root spans are tracked and used to flag
#     abnormally long spans.
#
# For higher-volume installations, it is recommended to use a storage class with increased IOPS for the Timescale workload.
defaultStorageClass: "gp3"
apps:
  externalHostname: "obcerv.mydomain.internal"
  ingress:
    className: "nginx"
ingestion:
  externalHostname: "obcerv-ingestion.mydomain.internal"
  replicas: 2
  ingress:
    className: "nginx"
    annotations:
      nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
      nginx.ingress.kubernetes.io/use-regex: "true"
    usePathRegex: true
  resources:
    requests:
      memory: "512Mi"
      cpu: "500m"
    limits:
      memory: "768Mi"
      cpu: "750m"
  traces:
    jvmOpts: "-XX:MaxDirectMemorySize=120M"
    resources:
      requests:
        memory: "2500Mi"
        cpu: "750m"
      limits:
        memory: "4500Mi"
        cpu: "1"
iam:
  keycloak:
    replicas: 2
    ingress:
      className: "nginx"
kafka:
  replicas: 3
  diskSize: "100Gi"
  resources:
    requests:
      memory: "3Gi"
      cpu: "1"
    limits:
      memory: "3Gi"
      cpu: "2"
  controller:
    replicas: 3
  producer:
    bufferMemory: 67108864
timescale:
  sharedBuffersPercentage: 40
  clusterSize: 2
  dataDiskSize: "50Gi"
  timeseriesDiskCount: 4
  timeseriesDiskSize: "512Gi"
  walDiskSize: "50Gi"
  resources:
    requests:
      memory: "14Gi"
      cpu: "6"
    limits:
      memory: "14Gi"
      cpu: "6"
  retention:
    metrics:
      chunkSize: 8h
      retention: 30d
    metrics_5m:
      chunkSize: 1d
      retention: 90d
    metrics_1h:
      chunkSize: 5d
      retention: 180d
    metrics_1d:
      chunkSize: 20d
      retention: 1y
    statuses:
      chunkSize: 7d
      retention: 1y
    signal_details:
      chunkSize: 1d
      retention: 30d
loki:
  diskSize: "30Gi"
sinkd:
  timeseriesCacheMaxSize: 1000000
  jvmOpts: "-XX:InitialRAMPercentage=50 -XX:MaxRAMPercentage=75 -XX:MaxDirectMemorySize=100M"
  rawJvmOpts: "-XX:InitialRAMPercentage=50 -XX:MaxRAMPercentage=75"
  replicas: 2
  rawReplicas: 2
  resources:
    requests:
      memory: "1280Mi"
      cpu: "500m"
    limits:
      memory: "2Gi"
      cpu: "2"
  rawResources:
    requests:
      memory: "1Gi"
      cpu: "250m"
    limits:
      memory: "2Gi"
      cpu: "2"
  metrics:
    consumerProperties:
      max.partition.fetch.bytes: 524288
  dsMetrics:
    consumerProperties:
      max.partition.fetch.bytes: 1048576
  loki:
    consumerProperties:
      max.partition.fetch.bytes: 1048576
  entities:
    consumerProperties:
      max.partition.fetch.bytes: 1048576
      max.poll.records: 75000
  signals:
    consumerProperties:
      max.partition.fetch.bytes: 1048576
  traces:
    consumerProperties:
      max.poll.records: 20000
    resources:
      requests:
        memory: "756Mi"
        cpu: "100m"
      limits:
        memory: "1200Mi"
        cpu: "1"
platformd:
  replicas: 2
  resources:
    requests:
      memory: "1536Mi"
      cpu: "1"
    limits:
      memory: "2Gi"
      cpu: "2250m"
dpd:
  replicas: 2
  jvmOpts: "-XX:MaxRAMPercentage=75"
  maxEntitySerdeCacheEntries: 25000
  consumerProperties:
    fetch.min.bytes: 524288
  metricsMultiplexer:
    maxFilterResultCacheSize: 500000
    maxConcurrentOps: 100
    localParallelism: 6
  selfMonitoringThresholds:
    metrics_partition_lag_warn: 100000
    metrics_partition_lag_critical: 500000
  resources:
    requests:
      memory: "5Gi"
      cpu: "2"
    limits:
      memory: "5500Mi"
      cpu: "3"
downsampledMetricsStream:
  consumerProperties:
    fetch.min.bytes: 524288
    max.partition.fetch.bytes: 1048576
  resources:
    requests:
      memory: "3Gi"
      cpu: "1"
    limits:
      memory: "3Gi"
      cpu: "3"
  bucketedConsumerProperties:
    fetch.min.bytes: 524288
    max.partition.fetch.bytes: 1048576
  bucketedJvmOpts: "-XX:InitialRAMPercentage=75 -XX:MaxRAMPercentage=75"
  bucketedResources:
    requests:
      memory: "3Gi"
      cpu: "1"
    limits:
      memory: "6Gi"
      cpu: "4"
  rocksdb:
    raw:
      indexAndFilterRatio: 0.5
      memoryMib: 100
      writeBufferMib: 8
      writeBufferRatio: 0.25
    bucketed:
      indexAndFilterRatio: 0.5
      memoryMib: 100
      writeBufferMib: 8
      writeBufferRatio: 0.25
entityStream:
  intermediate:
    consumerProperties:
      max.partition.fetch.bytes: 1048576
    storedEntitiesCacheSize: 1000
  final:
    consumerProperties:
      max.partition.fetch.bytes: 1048576
  resources:
    requests:
      memory: "1350Mi"
      cpu: "300m"
    limits:
      memory: "2Gi"
      cpu: "3"
signalsStream:
  consumerProperties:
    max.partition.fetch.bytes: 1048576
  resources:
    requests:
      memory: "768Mi"
      cpu: "150m"
    limits:
      memory: "1536Mi"
      cpu: "1200m"
etcd:
  replicas: 3
kvStore:
  replicas: 2
licenced:
  replicas: 2
latestMetricsService:
  replicas: 2
clickhouse:
  traces:
    replicas: 2
    diskSize: "40Gi"
    resources:
      limits:
        cpu: "2"
        memory: "8Gi"
      requests:
        cpu: "2"
        memory: "8Gi"
  keeper:
    replicas: 3
postgres:
  clusterSize: 2
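
Once downloaded, this file is typically supplied as a Helm values file when installing or upgrading ITRS Analytics. The command below is illustrative only: the repository alias, chart name, release name, namespace, and file name are placeholders, so substitute the exact chart reference from the installation guide.

# Illustrative install command; chart reference, namespace, and file name are placeholders.
helm upgrade --install obcerv itrs/obcerv \
  --namespace itrs --create-namespace \
  -f obcerv-small-aws-ec2.yaml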
["ITRS Analytics"]
["User Guide", "Technical Reference"]