Content-Length: 78710 | pFad | http://github.com/coder/coder/pull/18245.diff
thub.com diff --git a/agent/agent.go b/agent/agent.go index 74cf305c9434a..17298e7aa5772 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -1188,7 +1188,7 @@ func (a *agent) handleManifest(manifestOK *checkpoint) func(ctx context.Context, // createOrUpdateNetwork waits for the manifest to be set using manifestOK, then creates or updates // the tailnet using the information in the manifest func (a *agent) createOrUpdateNetwork(manifestOK, networkOK *checkpoint) func(context.Context, proto.DRPCAgentClient26) error { - return func(ctx context.Context, _ proto.DRPCAgentClient26) (retErr error) { + return func(ctx context.Context, aAPI proto.DRPCAgentClient26) (retErr error) { if err := manifestOK.wait(ctx); err != nil { return xerrors.Errorf("no manifest: %w", err) } @@ -1208,6 +1208,7 @@ func (a *agent) createOrUpdateNetwork(manifestOK, networkOK *checkpoint) func(co // agent API. network, err = a.createTailnet( a.gracefulCtx, + aAPI, manifest.AgentID, manifest.DERPMap, manifest.DERPForceWebSockets, @@ -1355,6 +1356,7 @@ func (a *agent) trackGoroutine(fn func()) error { func (a *agent) createTailnet( ctx context.Context, + aAPI proto.DRPCAgentClient26, agentID uuid.UUID, derpMap *tailcfg.DERPMap, derpForceWebSockets, disableDirectConnections bool, @@ -1487,7 +1489,7 @@ func (a *agent) createTailnet( }() if err = a.trackGoroutine(func() { defer apiListener.Close() - apiHandler, closeAPIHAndler := a.apiHandler() + apiHandler, closeAPIHAndler := a.apiHandler(aAPI) defer func() { _ = closeAPIHAndler() }() diff --git a/agent/agent_test.go b/agent/agent_test.go index 3a2562237b603..3ef9e4f4c75ba 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -48,6 +48,7 @@ import ( "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/v2/agent" + "github.com/coder/coder/v2/agent/agentcontainers" "github.com/coder/coder/v2/agent/agentssh" "github.com/coder/coder/v2/agent/agenttest" "github.com/coder/coder/v2/agent/proto" @@ -60,9 +61,16 @@ import ( "github.com/coder/coder/v2/tailnet" "github.com/coder/coder/v2/tailnet/tailnettest" "github.com/coder/coder/v2/testutil" + "github.com/coder/quartz" ) func TestMain(m *testing.M) { + if os.Getenv("CODER_TEST_RUN_SUB_AGENT_MAIN") == "1" { + // If we're running as a subagent, we don't want to run the main tests. + // Instead, we just run the subagent tests. + exit := runSubAgentMain() + os.Exit(exit) + } goleak.VerifyTestMain(m, testutil.GoleakOptions...) } @@ -1930,6 +1938,9 @@ func TestAgent_ReconnectingPTYContainer(t *testing.T) { if os.Getenv("CODER_TEST_USE_DOCKER") != "1" { t.Skip("Set CODER_TEST_USE_DOCKER=1 to run this test") } + if _, err := exec.LookPath("devcontainer"); err != nil { + t.Skip("This test requires the devcontainer CLI: npm install -g @devcontainers/cli") + } pool, err := dockertest.NewPool("") require.NoError(t, err, "Could not connect to docker") @@ -1955,6 +1966,9 @@ func TestAgent_ReconnectingPTYContainer(t *testing.T) { // nolint: dogsled conn, _, _, _, _ := setupAgent(t, agentsdk.Manifest{}, 0, func(_ *agenttest.Client, o *agent.Options) { o.ExperimentalDevcontainersEnabled = true + o.ContainerAPIOptions = append(o.ContainerAPIOptions, + agentcontainers.WithContainerLabelIncludeFilter("this.label.does.not.exist.ignore.devcontainers", "true"), + ) }) ctx := testutil.Context(t, testutil.WaitLong) ac, err := conn.ReconnectingPTY(ctx, uuid.New(), 80, 80, "/bin/sh", func(arp *workspacesdk.AgentReconnectingPTYInit) { @@ -1986,6 +2000,60 @@ func TestAgent_ReconnectingPTYContainer(t *testing.T) { require.ErrorIs(t, tr.ReadUntil(ctx, nil), io.EOF) } +type subAgentRequestPayload struct { + Token string `json:"token"` + Directory string `json:"directory"` +} + +// runSubAgentMain is the main function for the sub-agent that connects +// to the control plane. It reads the CODER_AGENT_URL and +// CODER_AGENT_TOKEN environment variables, sends the token, and exits +// with a status code based on the response. +func runSubAgentMain() int { + url := os.Getenv("CODER_AGENT_URL") + token := os.Getenv("CODER_AGENT_TOKEN") + if url == "" || token == "" { + _, _ = fmt.Fprintln(os.Stderr, "CODER_AGENT_URL and CODER_AGENT_TOKEN must be set") + return 10 + } + + dir, err := os.Getwd() + if err != nil { + _, _ = fmt.Fprintf(os.Stderr, "failed to get current working directory: %v\n", err) + return 1 + } + payload := subAgentRequestPayload{ + Token: token, + Directory: dir, + } + b, err := json.Marshal(payload) + if err != nil { + _, _ = fmt.Fprintf(os.Stderr, "failed to marshal payload: %v\n", err) + return 1 + } + + req, err := http.NewRequest("POST", url, bytes.NewReader(b)) + if err != nil { + _, _ = fmt.Fprintf(os.Stderr, "failed to create request: %v\n", err) + return 1 + } + ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong) + defer cancel() + req = req.WithContext(ctx) + resp, err := http.DefaultClient.Do(req) + if err != nil { + _, _ = fmt.Fprintf(os.Stderr, "agent connection failed: %v\n", err) + return 11 + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + _, _ = fmt.Fprintf(os.Stderr, "agent exiting with non-zero exit code %d\n", resp.StatusCode) + return 12 + } + _, _ = fmt.Println("sub-agent connected successfully") + return 0 +} + // This tests end-to-end functionality of auto-starting a devcontainer. // It runs "devcontainer up" which creates a real Docker container. As // such, it does not run by default in CI. @@ -1999,6 +2067,56 @@ func TestAgent_DevcontainerAutostart(t *testing.T) { if os.Getenv("CODER_TEST_USE_DOCKER") != "1" { t.Skip("Set CODER_TEST_USE_DOCKER=1 to run this test") } + if _, err := exec.LookPath("devcontainer"); err != nil { + t.Skip("This test requires the devcontainer CLI: npm install -g @devcontainers/cli") + } + + // This HTTP handler handles requests from runSubAgentMain which + // acts as a fake sub-agent. We want to verify that the sub-agent + // connects and sends its token. We use a channel to signal + // that the sub-agent has connected successfully and then we wait + // until we receive another signal to return from the handler. This + // keeps the agent "alive" for as long as we want. + subAgentConnected := make(chan subAgentRequestPayload, 1) + subAgentReady := make(chan struct{}, 1) + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + t.Logf("Sub-agent request received: %s %s", r.Method, r.URL.Path) + + if r.Method != http.MethodPost { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + // Read the token from the request body. + var payload subAgentRequestPayload + if err := json.NewDecoder(r.Body).Decode(&payload); err != nil { + http.Error(w, "Failed to read token", http.StatusBadRequest) + t.Logf("Failed to read token: %v", err) + return + } + defer r.Body.Close() + + t.Logf("Sub-agent request payload received: %+v", payload) + + // Signal that the sub-agent has connected successfully. + select { + case <-t.Context().Done(): + t.Logf("Test context done, not processing sub-agent request") + return + case subAgentConnected <- payload: + } + + // Wait for the signal to return from the handler. + select { + case <-t.Context().Done(): + t.Logf("Test context done, not waiting for sub-agent ready") + return + case <-subAgentReady: + } + + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() pool, err := dockertest.NewPool("") require.NoError(t, err, "Could not connect to docker") @@ -2016,9 +2134,10 @@ func TestAgent_DevcontainerAutostart(t *testing.T) { require.NoError(t, err, "create devcontainer directory") devcontainerFile := filepath.Join(devcontainerPath, "devcontainer.json") err = os.WriteFile(devcontainerFile, []byte(`{ - "name": "mywork", - "image": "busybox:latest", - "cmd": ["sleep", "infinity"] + "name": "mywork", + "image": "ubuntu:latest", + "cmd": ["sleep", "infinity"], + "runArgs": ["--network=host"] }`), 0o600) require.NoError(t, err, "write devcontainer.json") @@ -2043,9 +2162,24 @@ func TestAgent_DevcontainerAutostart(t *testing.T) { }, }, } + mClock := quartz.NewMock(t) + mClock.Set(time.Now()) + tickerFuncTrap := mClock.Trap().TickerFunc("agentcontainers") + //nolint:dogsled - conn, _, _, _, _ := setupAgent(t, manifest, 0, func(_ *agenttest.Client, o *agent.Options) { + _, agentClient, _, _, _ := setupAgent(t, manifest, 0, func(_ *agenttest.Client, o *agent.Options) { o.ExperimentalDevcontainersEnabled = true + o.ContainerAPIOptions = append( + o.ContainerAPIOptions, + // Only match this specific dev container. + agentcontainers.WithClock(mClock), + agentcontainers.WithContainerLabelIncludeFilter("devcontainer.local_folder", tempWorkspaceFolder), + agentcontainers.WithSubAgentURL(srv.URL), + // The agent will copy "itself", but in the case of this test, the + // agent is actually this test binary. So we'll tell the test binary + // to execute the sub-agent main function via this env. + agentcontainers.WithSubAgentEnv("CODER_TEST_RUN_SUB_AGENT_MAIN=1"), + ) }) t.Logf("Waiting for container with label: devcontainer.local_folder=%s", tempWorkspaceFolder) @@ -2089,32 +2223,34 @@ func TestAgent_DevcontainerAutostart(t *testing.T) { ctx := testutil.Context(t, testutil.WaitLong) - ac, err := conn.ReconnectingPTY(ctx, uuid.New(), 80, 80, "", func(opts *workspacesdk.AgentReconnectingPTYInit) { - opts.Container = container.ID - }) - require.NoError(t, err, "failed to create ReconnectingPTY") - defer ac.Close() + // Ensure the container update routine runs. + tickerFuncTrap.MustWait(ctx).MustRelease(ctx) + tickerFuncTrap.Close() + _, next := mClock.AdvanceNext() + next.MustWait(ctx) - // Use terminal reader so we can see output in case somethin goes wrong. - tr := testutil.NewTerminalReader(t, ac) + // Verify that a subagent was created. + subAgents := agentClient.GetSubAgents() + require.Len(t, subAgents, 1, "expected one sub agent") - require.NoError(t, tr.ReadUntil(ctx, func(line string) bool { - return strings.Contains(line, "#") || strings.Contains(line, "$") - }), "find prompt") + subAgent := subAgents[0] + subAgentID, err := uuid.FromBytes(subAgent.GetId()) + require.NoError(t, err, "failed to parse sub-agent ID") + t.Logf("Connecting to sub-agent: %s (ID: %s)", subAgent.Name, subAgentID) - wantFileName := "file-from-devcontainer" - wantFile := filepath.Join(tempWorkspaceFolder, wantFileName) + gotDir, err := agentClient.GetSubAgentDirectory(subAgentID) + require.NoError(t, err, "failed to get sub-agent directory") + require.Equal(t, "/workspaces/mywork", gotDir, "sub-agent directory should match") - require.NoError(t, json.NewEncoder(ac).Encode(workspacesdk.ReconnectingPTYRequest{ - // NOTE(mafredri): We must use absolute path here for some reason. - Data: fmt.Sprintf("touch /workspaces/mywork/%s; exit\r", wantFileName), - }), "create file inside devcontainer") + subAgentToken, err := uuid.FromBytes(subAgent.GetAuthToken()) + require.NoError(t, err, "failed to parse sub-agent token") - // Wait for the connection to close to ensure the touch was executed. - require.ErrorIs(t, tr.ReadUntil(ctx, nil), io.EOF) + payload := testutil.RequireReceive(ctx, t, subAgentConnected) + require.Equal(t, subAgentToken.String(), payload.Token, "sub-agent token should match") + require.Equal(t, "/workspaces/mywork", payload.Directory, "sub-agent directory should match") - _, err = os.Stat(wantFile) - require.NoError(t, err, "file should exist outside devcontainer") + // Allow the subagent to exit. + close(subAgentReady) } // TestAgent_DevcontainerRecreate tests that RecreateDevcontainer @@ -2173,6 +2309,9 @@ func TestAgent_DevcontainerRecreate(t *testing.T) { //nolint:dogsled conn, client, _, _, _ := setupAgent(t, manifest, 0, func(_ *agenttest.Client, o *agent.Options) { o.ExperimentalDevcontainersEnabled = true + o.ContainerAPIOptions = append(o.ContainerAPIOptions, + agentcontainers.WithContainerLabelIncludeFilter("devcontainer.local_folder", workspaceFolder), + ) }) ctx := testutil.Context(t, testutil.WaitLong) diff --git a/agent/agentcontainers/api.go b/agent/agentcontainers/api.go index d826cb23cbc1f..301553c651048 100644 --- a/agent/agentcontainers/api.go +++ b/agent/agentcontainers/api.go @@ -1,11 +1,16 @@ package agentcontainers import ( + "bytes" "context" "errors" "fmt" + "io" "net/http" + "os" "path" + "path/filepath" + "runtime" "slices" "strings" "sync" @@ -26,27 +31,37 @@ import ( ) const ( - defaultUpdateInterval = 10 * time.Second - listContainersTimeout = 15 * time.Second + defaultUpdateInterval = 10 * time.Second + defaultOperationTimeout = 15 * time.Second + + // Destination path inside the container, we store it in a fixed location + // under /.coder-agent/coder to avoid conflicts and avoid being shadowed + // by tmpfs or other mounts. This assumes the container root filesystem is + // read-write, which seems sensible for dev containers. + coderPathInsideContainer = "/.coder-agent/coder" ) // API is responsible for container-related operations in the agent. // It provides methods to list and manage containers. type API struct { - ctx context.Context - cancel context.CancelFunc - watcherDone chan struct{} - updaterDone chan struct{} - initialUpdateDone chan struct{} // Closed after first update in updaterLoop. - updateTrigger chan chan error // Channel to trigger manual refresh. - updateInterval time.Duration // Interval for periodic container updates. - logger slog.Logger - watcher watcher.Watcher - execer agentexec.Execer - ccli ContainerCLI - dccli DevcontainerCLI - clock quartz.Clock - scriptLogger func(logSourceID uuid.UUID) ScriptLogger + ctx context.Context + cancel context.CancelFunc + watcherDone chan struct{} + updaterDone chan struct{} + initialUpdateDone chan struct{} // Closed after first update in updaterLoop. + updateTrigger chan chan error // Channel to trigger manual refresh. + updateInterval time.Duration // Interval for periodic container updates. + logger slog.Logger + watcher watcher.Watcher + execer agentexec.Execer + ccli ContainerCLI + containerLabelIncludeFilter map[string]string // Labels to filter containers by. + dccli DevcontainerCLI + clock quartz.Clock + scriptLogger func(logSourceID uuid.UUID) ScriptLogger + subAgentClient SubAgentClient + subAgentURL string + subAgentEnv []string mu sync.RWMutex closed bool @@ -57,11 +72,18 @@ type API struct { configFileModifiedTimes map[string]time.Time // By config file path. recreateSuccessTimes map[string]time.Time // By workspace folder. recreateErrorTimes map[string]time.Time // By workspace folder. - recreateWg sync.WaitGroup + injectedSubAgentProcs map[string]subAgentProcess // By container ID. + asyncWg sync.WaitGroup devcontainerLogSourceIDs map[string]uuid.UUID // By workspace folder. } +type subAgentProcess struct { + agent SubAgent + ctx context.Context + stop context.CancelFunc +} + // Option is a functional option for API. type Option func(*API) @@ -88,6 +110,16 @@ func WithContainerCLI(ccli ContainerCLI) Option { } } +// WithContainerLabelIncludeFilter sets a label filter for containers. +// This option can be given multiple times to filter by multiple labels. +// The behavior is such that only containers matching one or more of the +// provided labels will be included. +func WithContainerLabelIncludeFilter(label, value string) Option { + return func(api *API) { + api.containerLabelIncludeFilter[label] = value + } +} + // WithDevcontainerCLI sets the DevcontainerCLI implementation to use. // This can be used in tests to modify @devcontainer/cli behavior. func WithDevcontainerCLI(dccli DevcontainerCLI) Option { @@ -96,6 +128,29 @@ func WithDevcontainerCLI(dccli DevcontainerCLI) Option { } } +// WithSubAgentClient sets the SubAgentClient implementation to use. +// This is used to list, create and delete Dev Container agents. +func WithSubAgentClient(client SubAgentClient) Option { + return func(api *API) { + api.subAgentClient = client + } +} + +// WithSubAgentURL sets the agent URL for the sub-agent for +// communicating with the control plane. +func WithSubAgentURL(url string) Option { + return func(api *API) { + api.subAgentURL = url + } +} + +// WithSubAgent sets the environment variables for the sub-agent. +func WithSubAgentEnv(env ...string) Option { + return func(api *API) { + api.subAgentEnv = env + } +} + // WithDevcontainers sets the known devcontainers for the API. This // allows the API to be aware of devcontainers defined in the workspace // agent manifest. @@ -164,22 +219,25 @@ func WithScriptLogger(scriptLogger func(logSourceID uuid.UUID) ScriptLogger) Opt func NewAPI(logger slog.Logger, options ...Option) *API { ctx, cancel := context.WithCancel(context.Background()) api := &API{ - ctx: ctx, - cancel: cancel, - watcherDone: make(chan struct{}), - updaterDone: make(chan struct{}), - initialUpdateDone: make(chan struct{}), - updateTrigger: make(chan chan error), - updateInterval: defaultUpdateInterval, - logger: logger, - clock: quartz.NewReal(), - execer: agentexec.DefaultExecer, - devcontainerNames: make(map[string]bool), - knownDevcontainers: make(map[string]codersdk.WorkspaceAgentDevcontainer), - configFileModifiedTimes: make(map[string]time.Time), - recreateSuccessTimes: make(map[string]time.Time), - recreateErrorTimes: make(map[string]time.Time), - scriptLogger: func(uuid.UUID) ScriptLogger { return noopScriptLogger{} }, + ctx: ctx, + cancel: cancel, + watcherDone: make(chan struct{}), + updaterDone: make(chan struct{}), + initialUpdateDone: make(chan struct{}), + updateTrigger: make(chan chan error), + updateInterval: defaultUpdateInterval, + logger: logger, + clock: quartz.NewReal(), + execer: agentexec.DefaultExecer, + subAgentClient: noopSubAgentClient{}, + containerLabelIncludeFilter: make(map[string]string), + devcontainerNames: make(map[string]bool), + knownDevcontainers: make(map[string]codersdk.WorkspaceAgentDevcontainer), + configFileModifiedTimes: make(map[string]time.Time), + recreateSuccessTimes: make(map[string]time.Time), + recreateErrorTimes: make(map[string]time.Time), + scriptLogger: func(uuid.UUID) ScriptLogger { return noopScriptLogger{} }, + injectedSubAgentProcs: make(map[string]subAgentProcess), } // The ctx and logger must be set before applying options to avoid // nil pointer dereference. @@ -230,7 +288,7 @@ func (api *API) watcherLoop() { continue } - now := api.clock.Now("watcherLoop") + now := api.clock.Now("agentcontainers", "watcherLoop") switch { case event.Has(fsnotify.Create | fsnotify.Write): api.logger.Debug(api.ctx, "devcontainer config file changed", slog.F("file", event.Name)) @@ -254,6 +312,15 @@ func (api *API) updaterLoop() { defer api.logger.Debug(api.ctx, "updater loop stopped") api.logger.Debug(api.ctx, "updater loop started") + // Make sure we clean up any subagents not tracked by this process + // before starting the update loop and creating new ones. + api.logger.Debug(api.ctx, "cleaning up subagents") + if err := api.cleanupSubAgents(api.ctx); err != nil { + api.logger.Error(api.ctx, "cleanup subagents failed", slog.Error(err)) + } else { + api.logger.Debug(api.ctx, "cleanup subagents complete") + } + // Perform an initial update to populate the container list, this // gives us a guarantee that the API has loaded the initial state // before returning any responses. This is useful for both tests @@ -288,9 +355,9 @@ func (api *API) updaterLoop() { } return nil // Always nil to keep the ticker going. - }, "updaterLoop") + }, "agentcontainers", "updaterLoop") defer func() { - if err := ticker.Wait("updaterLoop"); err != nil && !errors.Is(err, context.Canceled) { + if err := ticker.Wait("agentcontainers", "updaterLoop"); err != nil && !errors.Is(err, context.Canceled) { api.logger.Error(api.ctx, "updater loop ticker failed", slog.Error(err)) } }() @@ -360,7 +427,7 @@ func (api *API) handleList(rw http.ResponseWriter, r *http.Request) { // updateContainers fetches the latest container list, processes it, and // updates the cache. It performs locking for updating shared API state. func (api *API) updateContainers(ctx context.Context) error { - listCtx, listCancel := context.WithTimeout(ctx, listContainersTimeout) + listCtx, listCancel := context.WithTimeout(ctx, defaultOperationTimeout) defer listCancel() updated, err := api.ccli.List(listCtx) @@ -395,6 +462,20 @@ func (api *API) updateContainers(ctx context.Context) error { // on the latest list of containers. This method assumes that api.mu is // held. func (api *API) processUpdatedContainersLocked(ctx context.Context, updated codersdk.WorkspaceAgentListContainersResponse) { + dcFields := func(dc codersdk.WorkspaceAgentDevcontainer) []slog.Field { + f := []slog.Field{ + slog.F("devcontainer_id", dc.ID), + slog.F("devcontainer_name", dc.Name), + slog.F("workspace_folder", dc.WorkspaceFolder), + slog.F("config_path", dc.ConfigPath), + } + if dc.Container != nil { + f = append(f, slog.F("container_id", dc.Container.ID)) + f = append(f, slog.F("container_name", dc.Container.FriendlyName)) + } + return f + } + // Reset the container links in known devcontainers to detect if // they still exist. for _, dc := range api.knownDevcontainers { @@ -415,6 +496,29 @@ func (api *API) processUpdatedContainersLocked(ctx context.Context, updated code continue } + logger := api.logger.With( + slog.F("container_id", updated.Containers[i].ID), + slog.F("container_name", updated.Containers[i].FriendlyName), + slog.F("workspace_folder", workspaceFolder), + slog.F("config_file", configFile), + ) + + if len(api.containerLabelIncludeFilter) > 0 { + var ok bool + for label, value := range api.containerLabelIncludeFilter { + if v, found := container.Labels[label]; found && v == value { + ok = true + } + } + // Verbose debug logging is fine here since typically filters + // are only used in development or testing environments. + if !ok { + logger.Debug(ctx, "container does not match include filter, ignoring dev container", slog.F("container_labels", container.Labels), slog.F("include_filter", api.containerLabelIncludeFilter)) + continue + } + logger.Debug(ctx, "container matches include filter, processing dev container", slog.F("container_labels", container.Labels), slog.F("include_filter", api.containerLabelIncludeFilter)) + } + if dc, ok := api.knownDevcontainers[workspaceFolder]; ok { // If no config path is set, this devcontainer was defined // in Terraform without the optional config file. Assume the @@ -423,7 +527,7 @@ func (api *API) processUpdatedContainersLocked(ctx context.Context, updated code if dc.ConfigPath == "" && configFile != "" { dc.ConfigPath = configFile if err := api.watcher.Add(configFile); err != nil { - api.logger.Error(ctx, "watch devcontainer config file failed", slog.Error(err), slog.F("file", configFile)) + logger.With(dcFields(dc)...).Error(ctx, "watch devcontainer config file failed", slog.Error(err)) } } @@ -432,53 +536,47 @@ func (api *API) processUpdatedContainersLocked(ctx context.Context, updated code continue } - // NOTE(mafredri): This name impl. may change to accommodate devcontainer agents RFC. - // If not in our known list, add as a runtime detected entry. - name := path.Base(workspaceFolder) - if api.devcontainerNames[name] { - // Try to find a unique name by appending a number. - for i := 2; ; i++ { - newName := fmt.Sprintf("%s-%d", name, i) - if !api.devcontainerNames[newName] { - name = newName - break - } - } - } - api.devcontainerNames[name] = true - if configFile != "" { - if err := api.watcher.Add(configFile); err != nil { - api.logger.Error(ctx, "watch devcontainer config file failed", slog.Error(err), slog.F("file", configFile)) - } - } - - api.knownDevcontainers[workspaceFolder] = codersdk.WorkspaceAgentDevcontainer{ + dc := codersdk.WorkspaceAgentDevcontainer{ ID: uuid.New(), - Name: name, + Name: "", // Updated later based on container state. WorkspaceFolder: workspaceFolder, ConfigPath: configFile, Status: "", // Updated later based on container state. Dirty: false, // Updated later based on config file changes. Container: container, } + + if configFile != "" { + if err := api.watcher.Add(configFile); err != nil { + logger.With(dcFields(dc)...).Error(ctx, "watch devcontainer config file failed", slog.Error(err)) + } + } + + api.knownDevcontainers[workspaceFolder] = dc } // Iterate through all known devcontainers and update their status // based on the current state of the containers. for _, dc := range api.knownDevcontainers { + logger := api.logger.With(dcFields(dc)...) + + if dc.Container != nil { + if !api.devcontainerNames[dc.Name] { + // If the devcontainer name wasn't set via terraform, we + // use the containers friendly name as a fallback which + // will keep changing as the dev container is recreated. + // TODO(mafredri): Parse the container label (i.e. devcontainer.json) for customization. + dc.Name = safeFriendlyName(dc.Container.FriendlyName) + } + dc.Container.DevcontainerStatus = dc.Status + dc.Container.DevcontainerDirty = dc.Dirty + } + switch { case dc.Status == codersdk.WorkspaceAgentDevcontainerStatusStarting: - if dc.Container != nil { - dc.Container.DevcontainerStatus = dc.Status - dc.Container.DevcontainerDirty = dc.Dirty - } continue // This state is handled by the recreation routine. case dc.Status == codersdk.WorkspaceAgentDevcontainerStatusError && (dc.Container == nil || dc.Container.CreatedAt.Before(api.recreateErrorTimes[dc.WorkspaceFolder])): - if dc.Container != nil { - dc.Container.DevcontainerStatus = dc.Status - dc.Container.DevcontainerDirty = dc.Dirty - } continue // The devcontainer needs to be recreated. case dc.Container != nil: @@ -494,7 +592,17 @@ func (api *API) processUpdatedContainersLocked(ctx context.Context, updated code } dc.Container.DevcontainerDirty = dc.Dirty + if _, injected := api.injectedSubAgentProcs[dc.Container.ID]; !injected && dc.Status == codersdk.WorkspaceAgentDevcontainerStatusRunning { + err := api.injectSubAgentIntoContainerLocked(ctx, dc) + if err != nil { + logger.Error(ctx, "inject subagent into container failed", slog.Error(err)) + } + } + case dc.Container == nil: + if !api.devcontainerNames[dc.Name] { + dc.Name = "" + } dc.Status = codersdk.WorkspaceAgentDevcontainerStatusStopped dc.Dirty = false } @@ -507,6 +615,18 @@ func (api *API) processUpdatedContainersLocked(ctx context.Context, updated code api.containersErr = nil } +// safeFriendlyName returns a API safe version of the container's +// friendly name. +// +// See provisioner/regexes.go for the regex used to validate +// the friendly name on the API side. +func safeFriendlyName(name string) string { + name = strings.ToLower(name) + name = strings.ReplaceAll(name, "_", "-") + + return name +} + // refreshContainers triggers an immediate update of the container list // and waits for it to complete. func (api *API) refreshContainers(ctx context.Context) (err error) { @@ -624,7 +744,7 @@ func (api *API) handleDevcontainerRecreate(w http.ResponseWriter, r *http.Reques dc.Container.DevcontainerStatus = dc.Status } api.knownDevcontainers[dc.WorkspaceFolder] = dc - api.recreateWg.Add(1) + api.asyncWg.Add(1) go api.recreateDevcontainer(dc, configPath) api.mu.Unlock() @@ -640,10 +760,10 @@ func (api *API) handleDevcontainerRecreate(w http.ResponseWriter, r *http.Reques // It updates the devcontainer status and logs the process. The configPath is // passed as a parameter for the odd chance that the container being recreated // has a different config file than the one stored in the devcontainer state. -// The devcontainer state must be set to starting and the recreateWg must be +// The devcontainer state must be set to starting and the asyncWg must be // incremented before calling this function. func (api *API) recreateDevcontainer(dc codersdk.WorkspaceAgentDevcontainer, configPath string) { - defer api.recreateWg.Done() + defer api.asyncWg.Done() var ( err error @@ -699,7 +819,7 @@ func (api *API) recreateDevcontainer(dc codersdk.WorkspaceAgentDevcontainer, con dc.Container.DevcontainerStatus = dc.Status } api.knownDevcontainers[dc.WorkspaceFolder] = dc - api.recreateErrorTimes[dc.WorkspaceFolder] = api.clock.Now("recreate", "errorTimes") + api.recreateErrorTimes[dc.WorkspaceFolder] = api.clock.Now("agentcontainers", "recreate", "errorTimes") api.mu.Unlock() return } @@ -721,7 +841,7 @@ func (api *API) recreateDevcontainer(dc codersdk.WorkspaceAgentDevcontainer, con dc.Container.DevcontainerStatus = dc.Status } dc.Dirty = false - api.recreateSuccessTimes[dc.WorkspaceFolder] = api.clock.Now("recreate", "successTimes") + api.recreateSuccessTimes[dc.WorkspaceFolder] = api.clock.Now("agentcontainers", "recreate", "successTimes") api.knownDevcontainers[dc.WorkspaceFolder] = dc api.mu.Unlock() @@ -803,6 +923,269 @@ func (api *API) markDevcontainerDirty(configPath string, modifiedAt time.Time) { } } +// cleanupSubAgents removes subagents that are no longer managed by +// this agent. This is usually only run at startup to ensure a clean +// slate. This method has an internal timeout to prevent blocking +// indefinitely if something goes wrong with the subagent deletion. +func (api *API) cleanupSubAgents(ctx context.Context) error { + agents, err := api.subAgentClient.List(ctx) + if err != nil { + return xerrors.Errorf("list agents: %w", err) + } + if len(agents) == 0 { + return nil + } + + api.mu.Lock() + defer api.mu.Unlock() + + injected := make(map[uuid.UUID]bool, len(api.injectedSubAgentProcs)) + for _, proc := range api.injectedSubAgentProcs { + injected[proc.agent.ID] = true + } + + ctx, cancel := context.WithTimeout(ctx, defaultOperationTimeout) + defer cancel() + + for _, agent := range agents { + if injected[agent.ID] { + continue + } + err := api.subAgentClient.Delete(ctx, agent.ID) + if err != nil { + api.logger.Error(ctx, "failed to delete agent", + slog.Error(err), + slog.F("agent_id", agent.ID), + slog.F("agent_name", agent.Name), + ) + } + } + + return nil +} + +// injectSubAgentIntoContainerLocked injects a subagent into a dev +// container and starts the subagent process. This method assumes that +// api.mu is held. +// +// This method uses an internal timeout to prevent blocking indefinitely +// if something goes wrong with the injection. +func (api *API) injectSubAgentIntoContainerLocked(ctx context.Context, dc codersdk.WorkspaceAgentDevcontainer) (err error) { + ctx, cancel := context.WithTimeout(ctx, defaultOperationTimeout) + defer cancel() + + container := dc.Container + if container == nil { + return xerrors.New("container is nil, cannot inject subagent") + } + + // Skip if subagent already exists for this container. + if _, injected := api.injectedSubAgentProcs[container.ID]; injected || api.closed { + return nil + } + + // Mark subagent as being injected immediately with a placeholder. + subAgent := subAgentProcess{ + ctx: context.Background(), + stop: func() {}, + } + api.injectedSubAgentProcs[container.ID] = subAgent + + // This is used to track the goroutine that will run the subagent + // process inside the container. It will be decremented when the + // subagent process completes or if an error occurs before we can + // start the subagent. + api.asyncWg.Add(1) + ranSubAgent := false + + // Clean up if injection fails. + defer func() { + if !ranSubAgent { + api.asyncWg.Done() + } + if err != nil { + // Mutex is held (defer re-lock). + delete(api.injectedSubAgentProcs, container.ID) + } + }() + + // Unlock the mutex to allow other operations while we + // inject the subagent into the container. + api.mu.Unlock() + defer api.mu.Lock() // Re-lock. + + logger := api.logger.With( + slog.F("devcontainer_id", dc.ID), + slog.F("devcontainer_name", dc.Name), + slog.F("workspace_folder", dc.WorkspaceFolder), + slog.F("config_path", dc.ConfigPath), + ) + + arch, err := api.ccli.DetectArchitecture(ctx, container.ID) + if err != nil { + return xerrors.Errorf("detect architecture: %w", err) + } + + logger.Info(ctx, "detected container architecture", slog.F("architecture", arch)) + + // For now, only support injecting if the architecture matches the host. + hostArch := runtime.GOARCH + + // TODO(mafredri): Add support for downloading agents for supported architectures. + if arch != hostArch { + logger.Warn(ctx, "skipping subagent injection for unsupported architecture", + slog.F("container_arch", arch), + slog.F("host_arch", hostArch)) + return nil + } + agentBinaryPath, err := os.Executable() + if err != nil { + return xerrors.Errorf("get agent binary path: %w", err) + } + agentBinaryPath, err = filepath.EvalSymlinks(agentBinaryPath) + if err != nil { + return xerrors.Errorf("resolve agent binary path: %w", err) + } + + // If we scripted this as a `/bin/sh` script, we could reduce these + // steps to one instruction, speeding up the injection process. + // + // Note: We use `path` instead of `filepath` here because we are + // working with Unix-style paths inside the container. + if _, err := api.ccli.ExecAs(ctx, container.ID, "root", "mkdir", "-p", path.Dir(coderPathInsideContainer)); err != nil { + return xerrors.Errorf("create agent directory in container: %w", err) + } + + if err := api.ccli.Copy(ctx, container.ID, agentBinaryPath, coderPathInsideContainer); err != nil { + return xerrors.Errorf("copy agent binary: %w", err) + } + + logger.Info(ctx, "copied agent binary to container") + + // Make sure the agent binary is executable so we can run it. + if _, err := api.ccli.ExecAs(ctx, container.ID, "root", "chmod", "0755", path.Dir(coderPathInsideContainer), coderPathInsideContainer); err != nil { + return xerrors.Errorf("set agent binary executable: %w", err) + } + // Set the owner of the agent binary to root:root (UID 0, GID 0). + if _, err := api.ccli.ExecAs(ctx, container.ID, "root", "chown", "0:0", path.Dir(coderPathInsideContainer), coderPathInsideContainer); err != nil { + return xerrors.Errorf("set agent binary owner: %w", err) + } + + // Attempt to add CAP_NET_ADMIN to the binary to improve network + // performance (optional, allow to fail). See `bootstrap_linux.sh`. + if _, err := api.ccli.ExecAs(ctx, container.ID, "root", "setcap", "cap_net_admin+ep", coderPathInsideContainer); err != nil { + logger.Warn(ctx, "set CAP_NET_ADMIN on agent binary failed", slog.Error(err)) + } + + // Detect workspace folder by executing `pwd` in the container. + // NOTE(mafredri): This is a quick and dirty way to detect the + // workspace folder inside the container. In the future we will + // rely more on `devcontainer read-configuration`. + var pwdBuf bytes.Buffer + err = api.dccli.Exec(ctx, dc.WorkspaceFolder, dc.ConfigPath, "pwd", []string{}, + WithExecOutput(&pwdBuf, io.Discard), + WithExecContainerID(container.ID), + ) + if err != nil { + return xerrors.Errorf("check workspace folder in container: %w", err) + } + directory := strings.TrimSpace(pwdBuf.String()) + if directory == "" { + logger.Warn(ctx, "detected workspace folder is empty, using default workspace folder", + slog.F("default_workspace_folder", DevcontainerDefaultContainerWorkspaceFolder)) + directory = DevcontainerDefaultContainerWorkspaceFolder + } + + // The preparation of the subagent is done, now we can create the + // subagent record in the database to receive the auth token. + createdAgent, err := api.subAgentClient.Create(ctx, SubAgent{ + Name: dc.Name, + Directory: directory, + OperatingSystem: "linux", // Assuming Linux for dev containers. + Architecture: arch, + }) + if err != nil { + return xerrors.Errorf("create agent: %w", err) + } + + logger.Info(ctx, "created subagent record", slog.F("agent_id", createdAgent.ID)) + + // Start the subagent in the container in a new goroutine to avoid + // blocking. Note that we pass the api.ctx to the subagent process + // so that it isn't affected by the timeout. + go api.runSubAgentInContainer(api.ctx, dc, createdAgent, coderPathInsideContainer) + ranSubAgent = true + + return nil +} + +// runSubAgentInContainer runs the subagent process inside a dev +// container. The api.asyncWg must be incremented before calling this +// function, and it will be decremented when the subagent process +// completes or if an error occurs. +func (api *API) runSubAgentInContainer(ctx context.Context, dc codersdk.WorkspaceAgentDevcontainer, agent SubAgent, agentPath string) { + container := dc.Container // Must not be nil. + logger := api.logger.With( + slog.F("container_name", container.FriendlyName), + slog.F("agent_id", agent.ID), + ) + + agentCtx, agentStop := context.WithCancel(ctx) + defer func() { + agentStop() + + // Best effort cleanup of the agent record after the process + // completes. Note that we use the background context here + // because the api.ctx will be canceled when the API is closed. + // This may delay shutdown of the agent by the given timeout. + deleteCtx, cancel := context.WithTimeout(context.Background(), defaultOperationTimeout) + defer cancel() + err := api.subAgentClient.Delete(deleteCtx, agent.ID) + if err != nil { + logger.Error(deleteCtx, "failed to delete agent record after process completion", slog.Error(err)) + } + + api.mu.Lock() + delete(api.injectedSubAgentProcs, container.ID) + api.mu.Unlock() + + logger.Debug(ctx, "agent process cleanup complete") + api.asyncWg.Done() + }() + + api.mu.Lock() + if api.closed { + api.mu.Unlock() + // If the API is closed, we should not run the agent. + logger.Debug(ctx, "the API is closed, not running subagent in container") + return + } + // Update the placeholder with a valid subagent, context and stop. + api.injectedSubAgentProcs[container.ID] = subAgentProcess{ + agent: agent, + ctx: agentCtx, + stop: agentStop, + } + api.mu.Unlock() + + logger.Info(ctx, "starting subagent in dev container") + + env := []string{ + "CODER_AGENT_URL=" + api.subAgentURL, + "CODER_AGENT_TOKEN=" + agent.AuthToken.String(), + } + env = append(env, api.subAgentEnv...) + err := api.dccli.Exec(agentCtx, dc.WorkspaceFolder, dc.ConfigPath, agentPath, []string{"agent"}, + WithExecContainerID(container.ID), + WithRemoteEnv(env...), + ) + if err != nil && !errors.Is(err, context.Canceled) { + logger.Error(ctx, "subagent process failed", slog.Error(err)) + } else { + logger.Info(ctx, "subagent process finished") + } +} + func (api *API) Close() error { api.mu.Lock() if api.closed { @@ -811,6 +1194,12 @@ func (api *API) Close() error { } api.logger.Debug(api.ctx, "closing API") api.closed = true + + for _, proc := range api.injectedSubAgentProcs { + api.logger.Debug(api.ctx, "canceling subagent process", slog.F("agent_name", proc.agent.Name), slog.F("agent_id", proc.agent.ID)) + proc.stop() + } + api.cancel() // Interrupt all routines. api.mu.Unlock() // Release lock before waiting for goroutines. @@ -821,8 +1210,8 @@ func (api *API) Close() error { <-api.watcherDone <-api.updaterDone - // Wait for all devcontainer recreation tasks to complete. - api.recreateWg.Wait() + // Wait for all async tasks to complete. + api.asyncWg.Wait() api.logger.Debug(api.ctx, "closed API") return err diff --git a/agent/agentcontainers/api_test.go b/agent/agentcontainers/api_test.go index 313da6f9f615f..59b0461c7948a 100644 --- a/agent/agentcontainers/api_test.go +++ b/agent/agentcontainers/api_test.go @@ -6,6 +6,8 @@ import ( "math/rand" "net/http" "net/http/httptest" + "os" + "runtime" "strings" "testing" "time" @@ -62,7 +64,7 @@ type fakeDevcontainerCLI struct { upErr error upErrC chan error // If set, send to return err, close to return upErr. execErr error - execErrC chan error // If set, send to return err, close to return execErr. + execErrC chan func(cmd string, args ...string) error // If set, send fn to return err, nil or close to return execErr. } func (f *fakeDevcontainerCLI) Up(ctx context.Context, _, _ string, _ ...agentcontainers.DevcontainerCLIUpOptions) (string, error) { @@ -79,14 +81,14 @@ func (f *fakeDevcontainerCLI) Up(ctx context.Context, _, _ string, _ ...agentcon return f.upID, f.upErr } -func (f *fakeDevcontainerCLI) Exec(ctx context.Context, _, _ string, _ string, _ []string, _ ...agentcontainers.DevcontainerCLIExecOptions) error { +func (f *fakeDevcontainerCLI) Exec(ctx context.Context, _, _ string, cmd string, args []string, _ ...agentcontainers.DevcontainerCLIExecOptions) error { if f.execErrC != nil { select { case <-ctx.Done(): return ctx.Err() - case err, ok := <-f.execErrC: - if ok { - return err + case fn, ok := <-f.execErrC: + if ok && fn != nil { + return fn(cmd, args...) } } } @@ -190,6 +192,80 @@ func (w *fakeWatcher) sendEventWaitNextCalled(ctx context.Context, event fsnotif w.waitNext(ctx) } +// fakeSubAgentClient implements SubAgentClient for testing purposes. +type fakeSubAgentClient struct { + agents map[uuid.UUID]agentcontainers.SubAgent + nextID int + + listErrC chan error // If set, send to return error, close to return nil. + created []agentcontainers.SubAgent + createErrC chan error // If set, send to return error, close to return nil. + deleted []uuid.UUID + deleteErrC chan error // If set, send to return error, close to return nil. +} + +func (m *fakeSubAgentClient) List(ctx context.Context) ([]agentcontainers.SubAgent, error) { + var listErr error + if m.listErrC != nil { + select { + case <-ctx.Done(): + return nil, ctx.Err() + case err, ok := <-m.listErrC: + if ok { + listErr = err + } + } + } + var agents []agentcontainers.SubAgent + for _, agent := range m.agents { + agents = append(agents, agent) + } + return agents, listErr +} + +func (m *fakeSubAgentClient) Create(ctx context.Context, agent agentcontainers.SubAgent) (agentcontainers.SubAgent, error) { + var createErr error + if m.createErrC != nil { + select { + case <-ctx.Done(): + return agentcontainers.SubAgent{}, ctx.Err() + case err, ok := <-m.createErrC: + if ok { + createErr = err + } + } + } + m.nextID++ + agent.ID = uuid.New() + agent.AuthToken = uuid.New() + if m.agents == nil { + m.agents = make(map[uuid.UUID]agentcontainers.SubAgent) + } + m.agents[agent.ID] = agent + m.created = append(m.created, agent) + return agent, createErr +} + +func (m *fakeSubAgentClient) Delete(ctx context.Context, id uuid.UUID) error { + var deleteErr error + if m.deleteErrC != nil { + select { + case <-ctx.Done(): + return ctx.Err() + case err, ok := <-m.deleteErrC: + if ok { + deleteErr = err + } + } + } + if m.agents == nil { + m.agents = make(map[uuid.UUID]agentcontainers.SubAgent) + } + delete(m.agents, id) + m.deleted = append(m.deleted, id) + return deleteErr +} + func TestAPI(t *testing.T) { t.Parallel() @@ -286,6 +362,7 @@ func TestAPI(t *testing.T) { api := agentcontainers.NewAPI(logger, agentcontainers.WithClock(mClock), agentcontainers.WithContainerCLI(mLister), + agentcontainers.WithContainerLabelIncludeFilter("this.label.does.not.exist.ignore.devcontainers", "true"), ) defer api.Close() r.Mount("/", api.Routes()) @@ -347,7 +424,7 @@ func TestAPI(t *testing.T) { FriendlyName: "container-name", Running: true, Labels: map[string]string{ - agentcontainers.DevcontainerLocalFolderLabel: "/workspace", + agentcontainers.DevcontainerLocalFolderLabel: "/workspaces", agentcontainers.DevcontainerConfigFileLabel: "/workspace/.devcontainer/devcontainer.json", }, } @@ -415,6 +492,7 @@ func TestAPI(t *testing.T) { containers: codersdk.WorkspaceAgentListContainersResponse{ Containers: []codersdk.WorkspaceAgentContainer{validContainer}, }, + arch: "Fetched URL: http://github.com/coder/coder/pull/18245.diff
Alternative Proxies: