diff --git a/pkg/container/kubernetes/client.go b/pkg/container/kubernetes/client.go index 71155680c5..d01c5548e6 100644 --- a/pkg/container/kubernetes/client.go +++ b/pkg/container/kubernetes/client.go @@ -169,26 +169,31 @@ func (c *Client) AttachToWorkload(ctx context.Context, workloadName string) (io. TTY: false, } - // Set up the attach request + // Set up the attach request URL (used to create fresh SPDY executors for each retry) req := c.client.CoreV1().RESTClient().Post(). Resource("pods"). Name(podName). Namespace(c.getCurrentNamespace()). SubResource("attach"). VersionedParams(attachOpts, scheme.ParameterCodec) - - config := c.config - // Create a SPDY executor - exec, err := remotecommand.NewSPDYExecutor(config, "POST", req.URL()) - if err != nil { - return nil, nil, fmt.Errorf("failed to create SPDY executor: %w", err) - } + attachURL := req.URL() logger.Infof("Attaching to pod %s workload %s...", podName, workloadName) stdinReader, stdinWriter := io.Pipe() stdoutReader, stdoutWriter := io.Pipe() go func() { + // Close pipes when this goroutine exits to signal the transport layer. + // This ensures processStdout() sees EOF and can attempt re-attachment or exit. + defer func() { + if err := stdoutWriter.Close(); err != nil { + logger.Debugf("Error closing stdout writer: %v", err) + } + if err := stdinReader.Close(); err != nil { + logger.Debugf("Error closing stdin reader: %v", err) + } + }() + // Create exponential backoff with extended retry window to handle pod restarts // in both local and CI environments. expBackoff := backoff.NewExponentialBackOff() @@ -196,6 +201,15 @@ func (c *Client) AttachToWorkload(ctx context.Context, workloadName string) (io. expBackoff.InitialInterval = attachInitialRetryInterval _, err := backoff.Retry(ctx, func() (any, error) { + // Create a fresh SPDY executor for each retry attempt. + // This is critical because the SPDY connection state becomes corrupted + // after certain failures (e.g., EOF from idle timeout), and reusing + // a stale executor prevents recovery. + exec, execErr := remotecommand.NewSPDYExecutor(c.config, "POST", attachURL) + if execErr != nil { + return nil, fmt.Errorf("failed to create SPDY executor: %w", execErr) + } + return nil, exec.StreamWithContext(ctx, remotecommand.StreamOptions{ Stdin: stdinReader, Stdout: stdoutWriter, diff --git a/pkg/container/kubernetes/client_test.go b/pkg/container/kubernetes/client_test.go index 956f2eb3c2..4566c275ba 100644 --- a/pkg/container/kubernetes/client_test.go +++ b/pkg/container/kubernetes/client_test.go @@ -8,6 +8,7 @@ import ( "encoding/json" "strings" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -861,6 +862,25 @@ func TestAttachToWorkloadNoPodFound(t *testing.T) { assert.Contains(t, err.Error(), "no pods found") } +// TestAttachRetryConstants verifies the retry configuration constants are set +// to reasonable values for handling pod restarts. +func TestAttachRetryConstants(t *testing.T) { + t.Parallel() + + // The retry timeout should be long enough to accommodate pod restarts + // (including image pulls) but not so long that failures take forever to detect + assert.Equal(t, 90*time.Second, attachRetryTimeout, + "attachRetryTimeout should be 90 seconds to accommodate pod restarts") + + // Initial retry interval should be short for quick recovery from transient errors + assert.Equal(t, 1*time.Second, attachInitialRetryInterval, + "attachInitialRetryInterval should be 1 second for quick recovery") + + // Max retry interval caps the backoff to prevent excessive delays + assert.Equal(t, 15*time.Second, attachMaxRetryInterval, + "attachMaxRetryInterval should be 15 seconds to cap backoff") +} + // TestApplyPodTemplatePatchAnnotations tests that annotations are correctly applied // from the pod template patch to the base template. // This is a regression test for the bug where annotations were not being applied.