@@ -18,6 +18,7 @@ import (
1818 "github.com/nix-community/nixos-cli/internal/logger"
1919 "github.com/nix-community/nixos-cli/internal/settings"
2020 "github.com/nix-community/nixos-cli/internal/system"
21+ "golang.org/x/crypto/ssh"
2122)
2223
2324const (
@@ -348,6 +349,10 @@ func RunActivationSupervisor(
348349 action SwitchToConfigurationAction ,
349350 opts * RunActivationSupervisorOptions ,
350351) error {
352+ if _ , ok := s .(* system.SSHSystem ); ! ok {
353+ panic ("RunActivationSupervisor() called with a non-SSH system" )
354+ }
355+
351356 log := s .Logger ()
352357
353358 argv := []string {
@@ -417,6 +422,9 @@ func RunActivationSupervisor(
417422
418423 activationComplete := make (chan error , 1 )
419424 go func () {
425+ // FIXME: there are times where if a connection is taken
426+ // down using `ip link set down`, then the whole process
427+ // hangs. Figure out a way to detect this condition.
420428 _ , activationErr := s .Run (cmd )
421429 activationComplete <- activationErr
422430 }()
@@ -425,6 +433,7 @@ func RunActivationSupervisor(
425433 defer successTriggerCheckTimer .Stop ()
426434
427435 successDetected := false
436+ activationChConsumed := false
428437 for ! successDetected {
429438 select {
430439 case err := <- activationComplete :
@@ -434,19 +443,38 @@ func RunActivationSupervisor(
434443 // final check is here.
435444 if _ , statErr := s .FS ().Stat (SWITCH_SUCCESS_PATH ); statErr == nil {
436445 successDetected = true
446+ activationChConsumed = true
437447 break
438448 }
439449
440450 // At this point, the supervisor exited before the success file appeared,
441451 // so the switch has either failed, or a transport error has occurred
442452 // and we need to re-initiate the SSH connection.
443- // FIXME: we need to still try and create the ACK file
444- // in case the connection was terminated, with a new
445- // SSH client.
453+
454+ if _ , ok := err .(* ssh.ExitMissingError ); ok {
455+ log .Warn ("lost connection to target host, attempting to reconnect" )
456+ err = s .(* system.SSHSystem ).Reconnect ()
457+ if err != nil {
458+ log .Errorf ("%v" , err )
459+ log .Info ("the target host should rollback soon" )
460+ return err
461+ }
462+
463+ log .Debug ("attempting to acknowledge success after reconnecting" )
464+ if err = s .FS ().CreateFile (successTrigger ); err != nil {
465+ log .Errorf ("failed to create %v on remote system: %v" , successTrigger , err )
466+ log .Info ("the target host should rollback soon" )
467+ return err
468+ }
469+ return nil
470+ }
471+
472+ // At this point, the SSH command has failed to run and
473+ // we cannot do anything more, so exit with an error.
446474 if err != nil {
447475 return fmt .Errorf ("activation supervisor exited early: %w" , err )
448476 }
449- return fmt . Errorf ("activation supervisor exited without success signal" )
477+ return errors . New ("activation supervisor exited without success signal" )
450478 case <- successTriggerCheckTimer .C :
451479 // Check for the existence of the switch success trigger.
452480 // If it exists, then the switch has completed
@@ -463,20 +491,50 @@ func RunActivationSupervisor(
463491 }
464492 }
465493
466- // TODO: reconnect system connection first if SSH was restarted
494+ // Create a new target host connection, since SSH sessions
495+ // will not terminate if sshd itself has terminated.
496+ reconnectCh := make (chan * system.SSHSystem , 1 )
467497
468- err := s .FS ().CreateFile (successTrigger )
469- if err != nil {
470- log .Errorf ("failed to create %v on remote system: %v" , successTrigger , err )
498+ go func () {
499+ log .Debug ("attempting reconnect" )
500+ s2 , err := s .(* system.SSHSystem ).Clone ()
501+ if err != nil {
502+ log .Errorf ("%v" , err )
503+ log .Warnf ("it is very likely that SSH access cannot be re-established" )
504+ log .Warnf ("the target host should rollback soon" )
505+ reconnectCh <- nil
506+ return
507+ }
508+ reconnectCh <- s2
509+ }()
510+
511+ select {
512+ case s2 := <- reconnectCh :
513+ if s2 == nil {
514+ break
515+ }
516+
517+ defer s2 .Close ()
518+ err := s2 .FS ().CreateFile (successTrigger )
519+ if err != nil {
520+ log .Errorf ("failed to create %v on remote system: %v" , successTrigger , err )
521+ }
522+ case <- time .After (opts .AckTimeout ):
523+ // FIXME: fix race condition where s2 is never closed if this is reached first.
524+ break
471525 }
472526
473- // Wait for the watchdog to either rollback or finish.
474- // If the SSH connection is broken, then this case
475- // cannot be hit because the connection was
476- // already terminated with a broken pipe.
477- err = <- activationComplete
478- if err != nil {
479- return fmt .Errorf ("activation supervisor exited with error: %v" , err )
527+ log .Debug ("waiting for activation process to complete" )
528+
529+ if ! activationChConsumed {
530+ // Wait for the watchdog to either rollback or finish.
531+ // If the SSH connection is broken, then this case
532+ // cannot be hit because the connection was
533+ // already terminated with a broken pipe.
534+ err := <- activationComplete
535+ if err != nil {
536+ return fmt .Errorf ("activation supervisor exited with error: %v" , err )
537+ }
480538 }
481539
482540 return nil
0 commit comments