|
7 | 7 |
|
8 | 8 | # Version is set at build time via git describe --tags |
9 | 9 | # Format: "4.3.0" for tagged releases, "4.3.0-5-gabcdef1" for dev builds |
10 | | -__version__ = "4.5.0-rc2" |
| 10 | +__version__ = "4.5.0-rc4" |
11 | 11 |
|
12 | 12 | import subprocess |
13 | 13 | import sys |
@@ -154,6 +154,7 @@ class RemoteServerConfig: |
154 | 154 | shutdown_command: str = "sudo shutdown -h now" |
155 | 155 | ssh_options: List[str] = field(default_factory=list) |
156 | 156 | pre_shutdown_commands: List[RemoteCommandConfig] = field(default_factory=list) |
| 157 | + parallel: bool = True # If False, server is shutdown sequentially before parallel batch |
157 | 158 |
|
158 | 159 |
|
159 | 160 | @dataclass |
@@ -492,6 +493,7 @@ def _parse_config(cls, data: Dict[str, Any]) -> Config: |
492 | 493 | shutdown_command=server_data.get('shutdown_command', 'sudo shutdown -h now'), |
493 | 494 | ssh_options=server_data.get('ssh_options', []), |
494 | 495 | pre_shutdown_commands=pre_cmds, |
| 496 | + parallel=server_data.get('parallel', True), |
495 | 497 | )) |
496 | 498 | config.remote_servers = servers |
497 | 499 |
|
@@ -1676,71 +1678,104 @@ def _unmount_filesystems(self): |
1676 | 1678 | self._log_message(f" ℹ️ {mount_point} was likely not mounted.") |
1677 | 1679 |
|
1678 | 1680 | def _shutdown_remote_servers(self): |
1679 | | - """Shutdown all enabled remote servers via SSH in parallel. |
| 1681 | + """Shutdown all enabled remote servers via SSH. |
1680 | 1682 |
|
1681 | | - All remote servers are shut down concurrently using threads to avoid |
1682 | | - sequential timeouts blocking the shutdown sequence. This is critical |
1683 | | - when a server is unreachable - waiting 60+ seconds per dead host could |
1684 | | - mean the UPS battery dies before reaching other servers. |
| 1683 | + Servers are processed in two phases: |
| 1684 | + 1. Sequential phase: Servers with parallel=False are shutdown one by one |
| 1685 | + in config order. Use this for servers with dependencies (e.g., a server |
| 1686 | + that hosts storage used by other servers should be shutdown last). |
| 1687 | + 2. Parallel phase: Remaining servers (parallel=True, the default) are |
| 1688 | + shutdown concurrently using threads to avoid sequential timeouts. |
| 1689 | +
|
| 1690 | + This hybrid approach ensures dependency order while still benefiting from |
| 1691 | + parallel execution for independent servers. |
1685 | 1692 | """ |
1686 | 1693 | enabled_servers = [s for s in self.config.remote_servers if s.enabled] |
1687 | 1694 |
|
1688 | 1695 | if not enabled_servers: |
1689 | 1696 | return |
1690 | 1697 |
|
| 1698 | + # Separate servers into sequential and parallel groups |
| 1699 | + sequential_servers = [s for s in enabled_servers if not s.parallel] |
| 1700 | + parallel_servers = [s for s in enabled_servers if s.parallel] |
| 1701 | + |
1691 | 1702 | server_count = len(enabled_servers) |
1692 | | - self._log_message(f"🌐 Shutting down {server_count} remote server(s) in parallel...") |
| 1703 | + seq_count = len(sequential_servers) |
| 1704 | + par_count = len(parallel_servers) |
1693 | 1705 |
|
1694 | | - # Calculate max timeout for all servers (for the join timeout) |
1695 | | - # Each server's max time = sum of pre_shutdown timeouts + shutdown timeout + buffer |
1696 | | - def calc_server_timeout(server: RemoteServerConfig) -> int: |
1697 | | - pre_cmd_time = sum( |
1698 | | - (cmd.timeout or server.command_timeout) for cmd in server.pre_shutdown_commands |
| 1706 | + if seq_count > 0 and par_count > 0: |
| 1707 | + self._log_message( |
| 1708 | + f"🌐 Shutting down {server_count} remote server(s) " |
| 1709 | + f"({seq_count} sequential, {par_count} parallel)..." |
1699 | 1710 | ) |
1700 | | - return pre_cmd_time + server.command_timeout + server.connect_timeout + 60 |
1701 | | - |
1702 | | - max_timeout = max(calc_server_timeout(s) for s in enabled_servers) |
| 1711 | + elif seq_count > 0: |
| 1712 | + self._log_message(f"🌐 Shutting down {server_count} remote server(s) sequentially...") |
| 1713 | + else: |
| 1714 | + self._log_message(f"🌐 Shutting down {server_count} remote server(s) in parallel...") |
1703 | 1715 |
|
1704 | | - # Track results for logging |
1705 | | - results: Dict[str, Tuple[bool, str]] = {} |
1706 | | - results_lock = threading.Lock() |
| 1716 | + completed = 0 |
1707 | 1717 |
|
1708 | | - def shutdown_server_thread(server: RemoteServerConfig): |
1709 | | - """Thread worker for shutting down a single server.""" |
| 1718 | + # Phase 1: Sequential servers (in config order) |
| 1719 | + for server in sequential_servers: |
1710 | 1720 | display_name = server.name or server.host |
1711 | 1721 | try: |
1712 | 1722 | self._shutdown_remote_server(server) |
1713 | | - with results_lock: |
1714 | | - results[display_name] = (True, "") |
| 1723 | + completed += 1 |
1715 | 1724 | except Exception as e: |
1716 | | - with results_lock: |
1717 | | - results[display_name] = (False, str(e)) |
1718 | | - |
1719 | | - # Start all threads |
1720 | | - threads: List[threading.Thread] = [] |
1721 | | - for server in enabled_servers: |
1722 | | - t = threading.Thread( |
1723 | | - target=shutdown_server_thread, |
1724 | | - args=(server,), |
1725 | | - name=f"remote-shutdown-{server.name or server.host}" |
1726 | | - ) |
1727 | | - t.start() |
1728 | | - threads.append(t) |
| 1725 | + self._log_message(f" ❌ {display_name} shutdown failed: {e}") |
| 1726 | + |
| 1727 | + # Phase 2: Parallel servers |
| 1728 | + if parallel_servers: |
| 1729 | + # Calculate max timeout for parallel servers (for the join timeout) |
| 1730 | + def calc_server_timeout(server: RemoteServerConfig) -> int: |
| 1731 | + pre_cmd_time = sum( |
| 1732 | + (cmd.timeout or server.command_timeout) for cmd in server.pre_shutdown_commands |
| 1733 | + ) |
| 1734 | + return pre_cmd_time + server.command_timeout + server.connect_timeout + 60 |
1729 | 1735 |
|
1730 | | - # Wait for all threads to complete with global timeout |
1731 | | - for t in threads: |
1732 | | - t.join(timeout=max_timeout) |
| 1736 | + max_timeout = max(calc_server_timeout(s) for s in parallel_servers) |
1733 | 1737 |
|
1734 | | - # Check for any threads that are still running (timed out) |
1735 | | - still_running = [t for t in threads if t.is_alive()] |
1736 | | - if still_running: |
1737 | | - self._log_message( |
1738 | | - f" ⚠️ {len(still_running)} remote shutdown(s) still in progress " |
1739 | | - "(continuing with local shutdown)" |
1740 | | - ) |
| 1738 | + # Track results for logging |
| 1739 | + results: Dict[str, Tuple[bool, str]] = {} |
| 1740 | + results_lock = threading.Lock() |
| 1741 | + |
| 1742 | + def shutdown_server_thread(server: RemoteServerConfig): |
| 1743 | + """Thread worker for shutting down a single server.""" |
| 1744 | + display_name = server.name or server.host |
| 1745 | + try: |
| 1746 | + self._shutdown_remote_server(server) |
| 1747 | + with results_lock: |
| 1748 | + results[display_name] = (True, "") |
| 1749 | + except Exception as e: |
| 1750 | + with results_lock: |
| 1751 | + results[display_name] = (False, str(e)) |
| 1752 | + |
| 1753 | + # Start all threads |
| 1754 | + threads: List[threading.Thread] = [] |
| 1755 | + for server in parallel_servers: |
| 1756 | + t = threading.Thread( |
| 1757 | + target=shutdown_server_thread, |
| 1758 | + args=(server,), |
| 1759 | + name=f"remote-shutdown-{server.name or server.host}" |
| 1760 | + ) |
| 1761 | + t.start() |
| 1762 | + threads.append(t) |
| 1763 | + |
| 1764 | + # Wait for all threads to complete with global timeout |
| 1765 | + for t in threads: |
| 1766 | + t.join(timeout=max_timeout) |
| 1767 | + |
| 1768 | + # Check for any threads that are still running (timed out) |
| 1769 | + still_running = [t for t in threads if t.is_alive()] |
| 1770 | + if still_running: |
| 1771 | + self._log_message( |
| 1772 | + f" ⚠️ {len(still_running)} remote shutdown(s) still in progress " |
| 1773 | + "(continuing with local shutdown)" |
| 1774 | + ) |
| 1775 | + |
| 1776 | + completed += par_count - len(still_running) |
1741 | 1777 |
|
1742 | 1778 | # Log summary |
1743 | | - completed = server_count - len(still_running) |
1744 | 1779 | self._log_message(f" ✅ Remote shutdown complete ({completed}/{server_count} servers)") |
1745 | 1780 |
|
1746 | 1781 | def _run_remote_command( |
|
0 commit comments