Skip to content

Commit c1816e3

Browse files
authored
fix(agent): fix deadlock if closed while starting listeners (#17329)
fixes #17328 Fixes a deadlock if we close the Agent in the middle of starting listeners on the tailnet.
1 parent 8faaa14 commit c1816e3

File tree

2 files changed

+78
-19
lines changed

2 files changed

+78
-19
lines changed

agent/agent.go

Lines changed: 30 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -229,13 +229,21 @@ type agent struct {
229229
// we track 2 contexts and associated cancel functions: "graceful" which is Done when it is time
230230
// to start gracefully shutting down and "hard" which is Done when it is time to close
231231
// everything down (regardless of whether graceful shutdown completed).
232-
gracefulCtx context.Context
233-
gracefulCancel context.CancelFunc
234-
hardCtx context.Context
235-
hardCancel context.CancelFunc
236-
closeWaitGroup sync.WaitGroup
232+
gracefulCtx context.Context
233+
gracefulCancel context.CancelFunc
234+
hardCtx context.Context
235+
hardCancel context.CancelFunc
236+
237+
// closeMutex protects the following:
237238
closeMutex sync.Mutex
239+
closeWaitGroup sync.WaitGroup
238240
coordDisconnected chan struct{}
241+
closing bool
242+
// note that once the network is set to non-nil, it is never modified, as with the statsReporter. So, routines
243+
// that run after createOrUpdateNetwork and check the networkOK checkpoint do not need to hold the lock to use them.
244+
network *tailnet.Conn
245+
statsReporter *statsReporter
246+
// end fields protected by closeMutex
239247

240248
environmentVariables map[string]string
241249

@@ -259,9 +267,7 @@ type agent struct {
259267
reportConnectionsMu sync.Mutex
260268
reportConnections []*proto.ReportConnectionRequest
261269

262-
network *tailnet.Conn
263-
statsReporter *statsReporter
264-
logSender *agentsdk.LogSender
270+
logSender *agentsdk.LogSender
265271

266272
prometheusRegistry *prometheus.Registry
267273
// metrics are prometheus registered metrics that will be collected and
@@ -274,6 +280,8 @@ type agent struct {
274280
}
275281

276282
func (a *agent) TailnetConn() *tailnet.Conn {
283+
a.closeMutex.Lock()
284+
defer a.closeMutex.Unlock()
277285
return a.network
278286
}
279287

@@ -1205,15 +1213,15 @@ func (a *agent) createOrUpdateNetwork(manifestOK, networkOK *checkpoint) func(co
12051213
}
12061214
a.closeMutex.Lock()
12071215
// Re-check if agent was closed while initializing the network.
1208-
closed := a.isClosed()
1209-
if !closed {
1216+
closing := a.closing
1217+
if !closing {
12101218
a.network = network
12111219
a.statsReporter = newStatsReporter(a.logger, network, a)
12121220
}
12131221
a.closeMutex.Unlock()
1214-
if closed {
1222+
if closing {
12151223
_ = network.Close()
1216-
return xerrors.New("agent is closed")
1224+
return xerrors.New("agent is closing")
12171225
}
12181226
} else {
12191227
// Update the wireguard IPs if the agent ID changed.
@@ -1328,8 +1336,8 @@ func (*agent) wireguardAddresses(agentID uuid.UUID) []netip.Prefix {
13281336
func (a *agent) trackGoroutine(fn func()) error {
13291337
a.closeMutex.Lock()
13301338
defer a.closeMutex.Unlock()
1331-
if a.isClosed() {
1332-
return xerrors.New("track conn goroutine: agent is closed")
1339+
if a.closing {
1340+
return xerrors.New("track conn goroutine: agent is closing")
13331341
}
13341342
a.closeWaitGroup.Add(1)
13351343
go func() {
@@ -1547,7 +1555,7 @@ func (a *agent) runCoordinator(ctx context.Context, tClient tailnetproto.DRPCTai
15471555
func (a *agent) setCoordDisconnected() chan struct{} {
15481556
a.closeMutex.Lock()
15491557
defer a.closeMutex.Unlock()
1550-
if a.isClosed() {
1558+
if a.closing {
15511559
return nil
15521560
}
15531561
disconnected := make(chan struct{})
@@ -1772,7 +1780,10 @@ func (a *agent) HTTPDebug() http.Handler {
17721780

17731781
func (a *agent) Close() error {
17741782
a.closeMutex.Lock()
1775-
defer a.closeMutex.Unlock()
1783+
network := a.network
1784+
coordDisconnected := a.coordDisconnected
1785+
a.closing = true
1786+
a.closeMutex.Unlock()
17761787
if a.isClosed() {
17771788
return nil
17781789
}
@@ -1849,7 +1860,7 @@ lifecycleWaitLoop:
18491860
select {
18501861
case <-a.hardCtx.Done():
18511862
a.logger.Warn(context.Background(), "timed out waiting for Coordinator RPC disconnect")
1852-
case <-a.coordDisconnected:
1863+
case <-coordDisconnected:
18531864
a.logger.Debug(context.Background(), "coordinator RPC disconnected")
18541865
}
18551866

@@ -1860,8 +1871,8 @@ lifecycleWaitLoop:
18601871
}
18611872

18621873
a.hardCancel()
1863-
if a.network != nil {
1864-
_ = a.network.Close()
1874+
if network != nil {
1875+
_ = network.Close()
18651876
}
18661877
a.closeWaitGroup.Wait()
18671878

agent/agent_test.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,54 @@ func TestMain(m *testing.M) {
6868

6969
var sshPorts = []uint16{workspacesdk.AgentSSHPort, workspacesdk.AgentStandardSSHPort}
7070

71+
// TestAgent_CloseWhileStarting is a regression test for https://github.com/coder/coder/issues/17328
72+
func TestAgent_ImmediateClose(t *testing.T) {
73+
t.Parallel()
74+
75+
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
76+
defer cancel()
77+
78+
logger := slogtest.Make(t, &slogtest.Options{
79+
// Agent can drop errors when shutting down, and some, like the
80+
// fasthttplistener connection closed error, are unexported.
81+
IgnoreErrors: true,
82+
}).Leveled(slog.LevelDebug)
83+
manifest := agentsdk.Manifest{
84+
AgentID: uuid.New(),
85+
AgentName: "test-agent",
86+
WorkspaceName: "test-workspace",
87+
WorkspaceID: uuid.New(),
88+
}
89+
90+
coordinator := tailnet.NewCoordinator(logger)
91+
t.Cleanup(func() {
92+
_ = coordinator.Close()
93+
})
94+
statsCh := make(chan *proto.Stats, 50)
95+
fs := afero.NewMemMapFs()
96+
client := agenttest.NewClient(t, logger.Named("agenttest"), manifest.AgentID, manifest, statsCh, coordinator)
97+
t.Cleanup(client.Close)
98+
99+
options := agent.Options{
100+
Client: client,
101+
Filesystem: fs,
102+
Logger: logger.Named("agent"),
103+
ReconnectingPTYTimeout: 0,
104+
EnvironmentVariables: map[string]string{},
105+
}
106+
107+
agentUnderTest := agent.New(options)
108+
t.Cleanup(func() {
109+
_ = agentUnderTest.Close()
110+
})
111+
112+
// wait until the agent has connected and is starting to find races in the startup code
113+
_ = testutil.RequireRecvCtx(ctx, t, client.GetStartup())
114+
t.Log("Closing Agent")
115+
err := agentUnderTest.Close()
116+
require.NoError(t, err)
117+
}
118+
71119
// NOTE: These tests only work when your default shell is bash for some reason.
72120

73121
func TestAgent_Stats_SSH(t *testing.T) {

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy