Skip to content

Commit d22ac1c

Browse files
authored
chore: don't cache errors in file cache (#18555)
1 parent ab254ad commit d22ac1c

File tree

3 files changed

+273
-102
lines changed

3 files changed

+273
-102
lines changed

coderd/files/cache.go

Lines changed: 150 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -25,60 +25,61 @@ type FileAcquirer interface {
2525

2626
// New returns a file cache that will fetch files from a database
2727
func New(registerer prometheus.Registerer, authz rbac.Authorizer) *Cache {
28-
return (&Cache{
29-
lock: sync.Mutex{},
30-
data: make(map[uuid.UUID]*cacheEntry),
31-
authz: authz,
32-
}).registerMetrics(registerer)
28+
return &Cache{
29+
lock: sync.Mutex{},
30+
data: make(map[uuid.UUID]*cacheEntry),
31+
authz: authz,
32+
cacheMetrics: newCacheMetrics(registerer),
33+
}
3334
}
3435

35-
func (c *Cache) registerMetrics(registerer prometheus.Registerer) *Cache {
36+
func newCacheMetrics(registerer prometheus.Registerer) cacheMetrics {
3637
subsystem := "file_cache"
3738
f := promauto.With(registerer)
3839

39-
c.currentCacheSize = f.NewGauge(prometheus.GaugeOpts{
40-
Namespace: "coderd",
41-
Subsystem: subsystem,
42-
Name: "open_files_size_bytes_current",
43-
Help: "The current amount of memory of all files currently open in the file cache.",
44-
})
45-
46-
c.totalCacheSize = f.NewCounter(prometheus.CounterOpts{
47-
Namespace: "coderd",
48-
Subsystem: subsystem,
49-
Name: "open_files_size_bytes_total",
50-
Help: "The total amount of memory ever opened in the file cache. This number never decrements.",
51-
})
52-
53-
c.currentOpenFiles = f.NewGauge(prometheus.GaugeOpts{
54-
Namespace: "coderd",
55-
Subsystem: subsystem,
56-
Name: "open_files_current",
57-
Help: "The count of unique files currently open in the file cache.",
58-
})
59-
60-
c.totalOpenedFiles = f.NewCounter(prometheus.CounterOpts{
61-
Namespace: "coderd",
62-
Subsystem: subsystem,
63-
Name: "open_files_total",
64-
Help: "The total count of unique files ever opened in the file cache.",
65-
})
66-
67-
c.currentOpenFileReferences = f.NewGauge(prometheus.GaugeOpts{
68-
Namespace: "coderd",
69-
Subsystem: subsystem,
70-
Name: "open_file_refs_current",
71-
Help: "The count of file references currently open in the file cache. Multiple references can be held for the same file.",
72-
})
73-
74-
c.totalOpenFileReferences = f.NewCounterVec(prometheus.CounterOpts{
75-
Namespace: "coderd",
76-
Subsystem: subsystem,
77-
Name: "open_file_refs_total",
78-
Help: "The total number of file references ever opened in the file cache. The 'hit' label indicates if the file was loaded from the cache.",
79-
}, []string{"hit"})
80-
81-
return c
40+
return cacheMetrics{
41+
currentCacheSize: f.NewGauge(prometheus.GaugeOpts{
42+
Namespace: "coderd",
43+
Subsystem: subsystem,
44+
Name: "open_files_size_bytes_current",
45+
Help: "The current amount of memory of all files currently open in the file cache.",
46+
}),
47+
48+
totalCacheSize: f.NewCounter(prometheus.CounterOpts{
49+
Namespace: "coderd",
50+
Subsystem: subsystem,
51+
Name: "open_files_size_bytes_total",
52+
Help: "The total amount of memory ever opened in the file cache. This number never decrements.",
53+
}),
54+
55+
currentOpenFiles: f.NewGauge(prometheus.GaugeOpts{
56+
Namespace: "coderd",
57+
Subsystem: subsystem,
58+
Name: "open_files_current",
59+
Help: "The count of unique files currently open in the file cache.",
60+
}),
61+
62+
totalOpenedFiles: f.NewCounter(prometheus.CounterOpts{
63+
Namespace: "coderd",
64+
Subsystem: subsystem,
65+
Name: "open_files_total",
66+
Help: "The total count of unique files ever opened in the file cache.",
67+
}),
68+
69+
currentOpenFileReferences: f.NewGauge(prometheus.GaugeOpts{
70+
Namespace: "coderd",
71+
Subsystem: subsystem,
72+
Name: "open_file_refs_current",
73+
Help: "The count of file references currently open in the file cache. Multiple references can be held for the same file.",
74+
}),
75+
76+
totalOpenFileReferences: f.NewCounterVec(prometheus.CounterOpts{
77+
Namespace: "coderd",
78+
Subsystem: subsystem,
79+
Name: "open_file_refs_total",
80+
Help: "The total number of file references ever opened in the file cache. The 'hit' label indicates if the file was loaded from the cache.",
81+
}, []string{"hit"}),
82+
}
8283
}
8384

8485
// Cache persists the files for template versions, and is used by dynamic
@@ -106,18 +107,23 @@ type cacheMetrics struct {
106107
totalCacheSize prometheus.Counter
107108
}
108109

110+
type cacheEntry struct {
111+
// Safety: refCount must only be accessed while the Cache lock is held.
112+
refCount int
113+
value *lazy.ValueWithError[CacheEntryValue]
114+
115+
// Safety: close must only be called while the Cache lock is held
116+
close func()
117+
// Safety: purge must only be called while the Cache lock is held
118+
purge func()
119+
}
120+
109121
type CacheEntryValue struct {
110122
fs.FS
111123
Object rbac.Object
112124
Size int64
113125
}
114126

115-
type cacheEntry struct {
116-
// refCount must only be accessed while the Cache lock is held.
117-
refCount int
118-
value *lazy.ValueWithError[CacheEntryValue]
119-
}
120-
121127
var _ fs.FS = (*CloseFS)(nil)
122128

123129
// CloseFS is a wrapper around fs.FS that implements io.Closer. The Close()
@@ -129,106 +135,141 @@ type CloseFS struct {
129135
close func()
130136
}
131137

132-
func (f *CloseFS) Close() { f.close() }
138+
func (f *CloseFS) Close() {
139+
f.close()
140+
}
133141

134142
// Acquire will load the fs.FS for the given file. It guarantees that parallel
135143
// calls for the same fileID will only result in one fetch, and that parallel
136144
// calls for distinct fileIDs will fetch in parallel.
137145
//
138-
// Safety: Every call to Acquire that does not return an error must have a
139-
// matching call to Release.
146+
// Safety: Every call to Acquire that does not return an error must call close
147+
// on the returned value when it is done being used.
140148
func (c *Cache) Acquire(ctx context.Context, db database.Store, fileID uuid.UUID) (*CloseFS, error) {
141149
// It's important that this `Load` call occurs outside `prepare`, after the
142150
// mutex has been released, or we would continue to hold the lock until the
143151
// entire file has been fetched, which may be slow, and would prevent other
144152
// files from being fetched in parallel.
145-
it, err := c.prepare(ctx, db, fileID).Load()
153+
e := c.prepare(db, fileID)
154+
ev, err := e.value.Load()
146155
if err != nil {
147-
c.release(fileID)
156+
c.lock.Lock()
157+
defer c.lock.Unlock()
158+
e.close()
159+
e.purge()
148160
return nil, err
149161
}
150162

163+
cleanup := func() {
164+
c.lock.Lock()
165+
defer c.lock.Unlock()
166+
e.close()
167+
}
168+
169+
// We always run the fetch under a system context and actor, so we need to
170+
// check the caller's context (including the actor) manually before returning.
171+
172+
// Check if the caller's context was canceled. Even though `Authorize` takes
173+
// a context, we still check it manually first because none of our mock
174+
// database implementations check for context cancellation.
175+
if err := ctx.Err(); err != nil {
176+
cleanup()
177+
return nil, err
178+
}
179+
180+
// Check that the caller is authorized to access the file
151181
subject, ok := dbauthz.ActorFromContext(ctx)
152182
if !ok {
183+
cleanup()
153184
return nil, dbauthz.ErrNoActor
154185
}
155-
// Always check the caller can actually read the file.
156-
if err := c.authz.Authorize(ctx, subject, policy.ActionRead, it.Object); err != nil {
157-
c.release(fileID)
186+
if err := c.authz.Authorize(ctx, subject, policy.ActionRead, ev.Object); err != nil {
187+
cleanup()
158188
return nil, err
159189
}
160190

161-
var once sync.Once
191+
var closeOnce sync.Once
162192
return &CloseFS{
163-
FS: it.FS,
193+
FS: ev.FS,
164194
close: func() {
165195
// sync.Once makes the Close() idempotent, so we can call it
166196
// multiple times without worrying about double-releasing.
167-
once.Do(func() { c.release(fileID) })
197+
closeOnce.Do(func() {
198+
c.lock.Lock()
199+
defer c.lock.Unlock()
200+
e.close()
201+
})
168202
},
169203
}, nil
170204
}
171205

172-
func (c *Cache) prepare(ctx context.Context, db database.Store, fileID uuid.UUID) *lazy.ValueWithError[CacheEntryValue] {
206+
func (c *Cache) prepare(db database.Store, fileID uuid.UUID) *cacheEntry {
173207
c.lock.Lock()
174208
defer c.lock.Unlock()
175209

176210
hitLabel := "true"
177211
entry, ok := c.data[fileID]
178212
if !ok {
179-
value := lazy.NewWithError(func() (CacheEntryValue, error) {
180-
val, err := fetch(ctx, db, fileID)
213+
hitLabel = "false"
214+
215+
var purgeOnce sync.Once
216+
entry = &cacheEntry{
217+
value: lazy.NewWithError(func() (CacheEntryValue, error) {
218+
val, err := fetch(db, fileID)
219+
if err != nil {
220+
return val, err
221+
}
181222

182-
// Always add to the cache size the bytes of the file loaded.
183-
if err == nil {
223+
// Add the size of the file to the cache size metrics.
184224
c.currentCacheSize.Add(float64(val.Size))
185225
c.totalCacheSize.Add(float64(val.Size))
186-
}
187226

188-
return val, err
189-
})
227+
return val, err
228+
}),
190229

191-
entry = &cacheEntry{
192-
value: value,
193-
refCount: 0,
230+
close: func() {
231+
entry.refCount--
232+
c.currentOpenFileReferences.Dec()
233+
if entry.refCount > 0 {
234+
return
235+
}
236+
237+
entry.purge()
238+
},
239+
240+
purge: func() {
241+
purgeOnce.Do(func() {
242+
c.purge(fileID)
243+
})
244+
},
194245
}
195246
c.data[fileID] = entry
247+
196248
c.currentOpenFiles.Inc()
197249
c.totalOpenedFiles.Inc()
198-
hitLabel = "false"
199250
}
200251

201252
c.currentOpenFileReferences.Inc()
202253
c.totalOpenFileReferences.WithLabelValues(hitLabel).Inc()
203254
entry.refCount++
204-
return entry.value
255+
return entry
205256
}
206257

207-
// release decrements the reference count for the given fileID, and frees the
208-
// backing data if there are no further references being held.
209-
//
210-
// release should only be called after a successful call to Acquire using the Release()
211-
// method on the returned *CloseFS.
212-
func (c *Cache) release(fileID uuid.UUID) {
213-
c.lock.Lock()
214-
defer c.lock.Unlock()
215-
258+
// purge immediately removes an entry from the cache, even if it has open
259+
// references.
260+
// Safety: Must only be called while the Cache lock is held
261+
func (c *Cache) purge(fileID uuid.UUID) {
216262
entry, ok := c.data[fileID]
217263
if !ok {
218-
// If we land here, it's almost certainly because a bug already happened,
219-
// and we're freeing something that's already been freed, or we're calling
220-
// this function with an incorrect ID. Should this function return an error?
221-
return
222-
}
223-
224-
c.currentOpenFileReferences.Dec()
225-
entry.refCount--
226-
if entry.refCount > 0 {
264+
// If we land here, it's probably because of a fetch attempt that
265+
// resulted in an error, and got purged already. It may also be an
266+
// erroneous extra close, but we can't really distinguish between those
267+
// two cases currently.
227268
return
228269
}
229270

271+
// Purge the file from the cache.
230272
c.currentOpenFiles.Dec()
231-
232273
ev, err := entry.value.Load()
233274
if err == nil {
234275
c.currentCacheSize.Add(-1 * float64(ev.Size))
@@ -246,11 +287,18 @@ func (c *Cache) Count() int {
246287
return len(c.data)
247288
}
248289

249-
func fetch(ctx context.Context, store database.Store, fileID uuid.UUID) (CacheEntryValue, error) {
250-
// Make sure the read does not fail due to authorization issues.
251-
// Authz is checked on the Acquire call, so this is safe.
290+
func fetch(store database.Store, fileID uuid.UUID) (CacheEntryValue, error) {
291+
// Because many callers can be waiting on the same file fetch concurrently, we
292+
// want to prevent any failures that would cause them all to receive errors
293+
// because the caller who initiated the fetch would fail.
294+
// - We always run the fetch with an uncancelable context, and then check
295+
// context cancellation for each acquirer afterwards.
296+
// - We always run the fetch as a system user, and then check authorization
297+
// for each acquirer afterwards.
298+
// This prevents a canceled context or an unauthorized user from "holding up
299+
// the queue".
252300
//nolint:gocritic
253-
file, err := store.GetFileByID(dbauthz.AsFileReader(ctx), fileID)
301+
file, err := store.GetFileByID(dbauthz.AsFileReader(context.Background()), fileID)
254302
if err != nil {
255303
return CacheEntryValue{}, xerrors.Errorf("failed to read file from database: %w", err)
256304
}

coderd/files/cache_internal_test.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
package files
2+
3+
import (
4+
"context"
5+
6+
"github.com/google/uuid"
7+
8+
"github.com/coder/coder/v2/coderd/database"
9+
)
10+
11+
// LeakCache prevents entries from even being released to enable testing certain
12+
// behaviors.
13+
type LeakCache struct {
14+
*Cache
15+
}
16+
17+
func (c *LeakCache) Acquire(ctx context.Context, db database.Store, fileID uuid.UUID) (*CloseFS, error) {
18+
// We need to call prepare first to both 1. leak a reference and 2. prevent
19+
// the behavior of immediately closing on an error (as implemented in Acquire)
20+
// from freeing the file.
21+
c.prepare(db, fileID)
22+
return c.Cache.Acquire(ctx, db, fileID)
23+
}

0 commit comments

Comments (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy