Skip to content

Commit 967e9d4

Browse files
committed
Emit duration metrics for TACS connect/disconnect
1 parent 606da52 commit 967e9d4

5 files changed

Lines changed: 124 additions & 6 deletions

File tree

agent/vendor/github.com/aws/amazon-ecs-agent/ecs-agent/metrics/constants.go

Lines changed: 5 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

agent/vendor/github.com/aws/amazon-ecs-agent/ecs-agent/tcs/handler/handler.go

Lines changed: 15 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

ecs-agent/metrics/constants.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,9 +61,11 @@ const (
6161
ACSDisconnectedDurationName = acsSessionNamespace + ".ACSDisconnectedDuration"
6262

6363
// TACS Connection Metrics
64-
tacsConnectionNamespace = "TACSConnection"
65-
TACSConnectionFailure = tacsConnectionNamespace + ".Failure"
66-
TACSPublishMetricFailure = tacsConnectionNamespace + ".PublishMetricFailure"
64+
tacsConnectionNamespace = "TACSConnection"
65+
TACSConnectionFailure = tacsConnectionNamespace + ".Failure"
66+
TACSPublishMetricFailure = tacsConnectionNamespace + ".PublishMetricFailure"
67+
TACSSessionCallDurationName = tacsConnectionNamespace + ".ConnectDuration"
68+
TACSDisconnectedDurationName = tacsConnectionNamespace + ".DisconnectedDuration"
6769

6870
// ECS Client Metrics
6971
ecsClientNamespace = "ECSClient"

ecs-agent/tcs/handler/handler.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ type telemetrySession struct {
7575
instanceStatusChannel <-chan ecstcs.InstanceStatusMessage
7676
doctor *doctor.Doctor
7777
ecsClient TcsEcsClient
78+
lastDisconnectedTime time.Time
7879
}
7980

8081
func NewTelemetrySession(
@@ -118,6 +119,7 @@ func NewTelemetrySession(
118119
metricsFactory: metricsFactory,
119120
doctor: doctor,
120121
ecsClient: ecsClient,
122+
lastDisconnectedTime: time.Time{},
121123
}
122124
}
123125

@@ -132,6 +134,7 @@ func (session *telemetrySession) Start(ctx context.Context) error {
132134
default:
133135
}
134136
tcsError := session.StartTelemetrySession(ctx)
137+
session.lastDisconnectedTime = time.Now()
135138
switch tcsError {
136139
case context.Canceled, context.DeadlineExceeded:
137140
return tcsError
@@ -175,6 +178,7 @@ func (session *telemetrySession) StartTelemetrySession(ctx context.Context) erro
175178
defer session.deregisterInstanceEventStream.Unsubscribe(deregisterContainerInstanceHandler)
176179
}
177180

181+
tacsConnectionStartTime := time.Now()
178182
disconnectTimer, err := client.Connect(metrics.TCSDisconnectTimeoutMetricName,
179183
session.disconnectTimeout,
180184
session.disconnectJitterMax)
@@ -184,6 +188,12 @@ func (session *telemetrySession) StartTelemetrySession(ctx context.Context) erro
184188
})
185189
return err
186190
}
191+
session.metricsFactory.New(metrics.TACSSessionCallDurationName).WithGauge(time.Since(tacsConnectionStartTime).
192+
Milliseconds()).Done(nil)
193+
if !session.GetLastDisconnectedTime().IsZero() {
194+
session.metricsFactory.New(metrics.TACSDisconnectedDurationName).WithGauge(time.Since(
195+
session.GetLastDisconnectedTime()).Milliseconds()).Done(nil)
196+
}
187197
defer disconnectTimer.Stop()
188198
logger.Info("Connected to TCS endpoint")
189199
// start a timer and listens for tcs heartbeats/acks. The timer is reset when
@@ -295,3 +305,8 @@ func formatURL(endpoint, cluster, containerInstance, agentVersion, agentHash, co
295305
}
296306
return tcsURL + "ws?" + query.Encode()
297307
}
308+
309+
// GetLastDisconnectedTime returns the timestamp that the last time Agent was disconnected from TACS.
310+
func (s *telemetrySession) GetLastDisconnectedTime() time.Time {
311+
return s.lastDisconnectedTime
312+
}

ecs-agent/tcs/handler/handler_test.go

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,90 @@ func TestConnectionInactiveTimeout(t *testing.T) {
387387
closeSocket(closeWS)
388388
}
389389

390+
// TestTACSDurationMetrics tests that the TACSSessionCallDurationName and TACSDisconnectedDuration and metrics are
391+
// emitted when they should be.
392+
func TestTACSDurationMetrics(t *testing.T) {
393+
ctrl := gomock.NewController(t)
394+
defer ctrl.Finish()
395+
396+
numTimesConnectToTACS := 2
397+
mockMetricsFactory := mock_metrics.NewMockEntryFactory(ctrl)
398+
399+
// TACSSessionCallDuration is emitted on all connections to TACS.
400+
mockDurationEntry := mock_metrics.NewMockEntry(ctrl)
401+
mockDurationEntry.EXPECT().WithGauge(gomock.Any()).Return(mockDurationEntry).Times(numTimesConnectToTACS)
402+
mockDurationEntry.EXPECT().Done(gomock.Any()).Times(numTimesConnectToTACS)
403+
mockMetricsFactory.EXPECT().New(metrics.TACSSessionCallDurationName).Return(mockDurationEntry).
404+
Times(numTimesConnectToTACS)
405+
406+
// TACSDisconnectedDuration should be emitted upon reconnection to TACS only (i.e., all connections to TACS except
407+
// the very first one).
408+
mockDisconnectedEntry := mock_metrics.NewMockEntry(ctrl)
409+
mockDisconnectedEntry.EXPECT().WithGauge(gomock.Any()).Return(mockDisconnectedEntry).Times(numTimesConnectToTACS - 1)
410+
mockDisconnectedEntry.EXPECT().Done(gomock.Any()).Times(numTimesConnectToTACS - 1)
411+
mockMetricsFactory.EXPECT().New(metrics.TACSDisconnectedDurationName).Return(mockDisconnectedEntry).
412+
Times(numTimesConnectToTACS - 1)
413+
414+
// Start test server.
415+
closeWS := make(chan []byte)
416+
server, _, requestChan, _, err := wsmock.GetMockServer(closeWS)
417+
if err != nil {
418+
t.Fatal(err)
419+
}
420+
server.StartTLS()
421+
defer server.Close()
422+
423+
go func() {
424+
for {
425+
select {
426+
case <-requestChan:
427+
}
428+
}
429+
}()
430+
431+
testecsclient := &wsmock.TestECSClient{
432+
TCSurl: server.URL,
433+
}
434+
435+
ctx, cancel := context.WithCancel(context.Background())
436+
defer cancel()
437+
438+
deregisterInstanceEventStream := eventstream.NewEventStream("Deregister_Instance", ctx)
439+
deregisterInstanceEventStream.StartListening()
440+
441+
telemetryMessages := make(chan ecstcs.TelemetryMessage, testTelemetryChannelDefaultBufferSize)
442+
healthMessages := make(chan ecstcs.HealthMessage, testTelemetryChannelDefaultBufferSize)
443+
instanceStatusMessages := make(chan ecstcs.InstanceStatusMessage, testTelemetryChannelDefaultBufferSize)
444+
445+
tacsSession := telemetrySession{
446+
containerInstanceArn: testInstanceArn,
447+
cluster: testClusterArn,
448+
agentVersion: testAgentVersion,
449+
agentHash: testAgentHash,
450+
containerRuntimeVersion: testContainerRuntimeVersion,
451+
disableMetrics: false,
452+
credentialsCache: aws.NewCredentialsCache(testCreds),
453+
cfg: testCfg,
454+
deregisterInstanceEventStream: deregisterInstanceEventStream,
455+
heartbeatTimeout: 50 * time.Millisecond, // use smaller value than testHeartbeatTimeout
456+
heartbeatJitterMax: 10 * time.Millisecond, // use smaller value than testHeartbeatJitter
457+
disconnectTimeout: testDisconnectionTimeout,
458+
disconnectJitterMax: testDisconnectionJitter,
459+
metricsFactory: mockMetricsFactory, // use the mock metrics factory
460+
metricsChannel: telemetryMessages,
461+
healthChannel: healthMessages,
462+
instanceStatusChannel: instanceStatusMessages,
463+
doctor: emptyDoctor,
464+
ecsClient: testecsclient,
465+
lastDisconnectedTime: time.Time{},
466+
}
467+
468+
for i := 0; i < numTimesConnectToTACS; i++ {
469+
tacsSession.StartTelemetrySession(ctx)
470+
tacsSession.lastDisconnectedTime = time.Now()
471+
}
472+
}
473+
390474
// TestTACSConnectionFailureMetric tests that the TACSConnectionFailure metric is recorded when there's a connection error
391475
func TestTACSConnectionFailureMetric(t *testing.T) {
392476
ctrl := gomock.NewController(t)

0 commit comments

Comments
 (0)