Skip to content

Commit 7f996e0

Browse files
committed
feat(activation): add reconnection retries
1 parent d5be589 commit 7f996e0

2 files changed

Lines changed: 75 additions & 15 deletions

File tree

internal/activation/activation.go

Lines changed: 73 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"github.com/nix-community/nixos-cli/internal/logger"
1919
"github.com/nix-community/nixos-cli/internal/settings"
2020
"github.com/nix-community/nixos-cli/internal/system"
21+
"golang.org/x/crypto/ssh"
2122
)
2223

2324
const (
@@ -348,6 +349,10 @@ func RunActivationSupervisor(
348349
action SwitchToConfigurationAction,
349350
opts *RunActivationSupervisorOptions,
350351
) error {
352+
if _, ok := s.(*system.SSHSystem); !ok {
353+
panic("RunActivationSupervisor() called with a non-SSH system")
354+
}
355+
351356
log := s.Logger()
352357

353358
argv := []string{
@@ -417,6 +422,9 @@ func RunActivationSupervisor(
417422

418423
activationComplete := make(chan error, 1)
419424
go func() {
425+
// FIXME: there are times where if a connection is taken
426+
// down using `ip link set down`, then the whole process
427+
// hangs. Figure out a way to detect this condition.
420428
_, activationErr := s.Run(cmd)
421429
activationComplete <- activationErr
422430
}()
@@ -425,6 +433,7 @@ func RunActivationSupervisor(
425433
defer successTriggerCheckTimer.Stop()
426434

427435
successDetected := false
436+
activationChConsumed := false
428437
for !successDetected {
429438
select {
430439
case err := <-activationComplete:
@@ -434,19 +443,38 @@ func RunActivationSupervisor(
434443
// final check is here.
435444
if _, statErr := s.FS().Stat(SWITCH_SUCCESS_PATH); statErr == nil {
436445
successDetected = true
446+
activationChConsumed = true
437447
break
438448
}
439449

440450
// At this point, the supervisor exited before the success file appeared,
441451
// so the switch has either failed, or a transport error has occurred
442452
// and we need to re-initiate the SSH connection.
443-
// FIXME: we need to still try and create the ACK file
444-
// in case the connection was terminated, with a new
445-
// SSH client.
453+
454+
if _, ok := err.(*ssh.ExitMissingError); ok {
455+
log.Warn("lost connection to target host, attempting to reconnect")
456+
err = s.(*system.SSHSystem).Reconnect()
457+
if err != nil {
458+
log.Errorf("%v", err)
459+
log.Info("the target host should rollback soon")
460+
return err
461+
}
462+
463+
log.Debug("attempting to acknowledge success after reconnecting")
464+
if err = s.FS().CreateFile(successTrigger); err != nil {
465+
log.Errorf("failed to create %v on remote system: %v", successTrigger, err)
466+
log.Info("the target host should rollback soon")
467+
return err
468+
}
469+
return nil
470+
}
471+
472+
// At this point, the SSH command has failed to run and
473+
// we cannot do anything more, so exit with an error.
446474
if err != nil {
447475
return fmt.Errorf("activation supervisor exited early: %w", err)
448476
}
449-
return fmt.Errorf("activation supervisor exited without success signal")
477+
return errors.New("activation supervisor exited without success signal")
450478
case <-successTriggerCheckTimer.C:
451479
// Check for the existence of the switch success trigger.
452480
// If it exists, then the switch has completed
@@ -463,20 +491,50 @@ func RunActivationSupervisor(
463491
}
464492
}
465493

466-
// TODO: reconnect system connection first if SSH was restarted
494+
// Create a new target host connection, since SSH sessions
495+
// will not terminate if sshd itself has terminated.
496+
reconnectCh := make(chan *system.SSHSystem, 1)
467497

468-
err := s.FS().CreateFile(successTrigger)
469-
if err != nil {
470-
log.Errorf("failed to create %v on remote system: %v", successTrigger, err)
498+
go func() {
499+
log.Debug("attempting reconnect")
500+
s2, err := s.(*system.SSHSystem).Clone()
501+
if err != nil {
502+
log.Errorf("%v", err)
503+
log.Warnf("it is very likely that SSH access cannot be re-established")
504+
log.Warnf("the target host should rollback soon")
505+
reconnectCh <- nil
506+
return
507+
}
508+
reconnectCh <- s2
509+
}()
510+
511+
select {
512+
case s2 := <-reconnectCh:
513+
if s2 == nil {
514+
break
515+
}
516+
517+
defer s2.Close()
518+
err := s2.FS().CreateFile(successTrigger)
519+
if err != nil {
520+
log.Errorf("failed to create %v on remote system: %v", successTrigger, err)
521+
}
522+
case <-time.After(opts.AckTimeout):
523+
// FIXME: fix race condition where s2 is never closed if this is reached first.
524+
break
471525
}
472526

473-
// Wait for the watchdog to either rollback or finish.
474-
// If the SSH connection is broken, then this case
475-
// cannot be hit because the connection was
476-
// already terminated with a broken pipe.
477-
err = <-activationComplete
478-
if err != nil {
479-
return fmt.Errorf("activation supervisor exited with error: %v", err)
527+
log.Debug("waiting for activation process to complete")
528+
529+
if !activationChConsumed {
530+
// Wait for the watchdog to either rollback or finish.
531+
// If the SSH connection is broken, then this case
532+
// cannot be hit because the connection was
533+
// already terminated with a broken pipe.
534+
err := <-activationComplete
535+
if err != nil {
536+
return fmt.Errorf("activation supervisor exited with error: %v", err)
537+
}
480538
}
481539

482540
return nil

internal/activation/supervisor.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ rollback() {
6666
# `nixos apply --no-boot` or `nixos-rebuild test` to another
6767
# one.
6868
if [ "$ROLLBACK_PROFILE_ON_FAILURE" != "" ]; then
69+
# FIXME: --rollback may not be the correct solution for the system profile!
70+
# may need to specify explicit generation number
6971
nix-env -p "$PROFILE" --rollback "$verbose_flag"
7072
fi
7173

0 commit comments

Comments
 (0)