Skip to content

Commit 25f5d09

Browse files
committed
Improve shutdown logic: wait until no requests are made
Pods behind Kubernetes endpoints are expected to shut down 'gracefully' after receiving SIGTERM: they should keep accepting new connections for a while, because Kubernetes updates Service endpoints and sends SIGTERM to pods *in parallel*. See kubernetes/kubernetes#106476 for more detail.
1 parent e76ea47 commit 25f5d09

File tree

3 files changed

+151
-1
lines changed

3 files changed

+151
-1
lines changed

internal/ingress/controller/nginx.go

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"errors"
2424
"fmt"
2525
"io/fs"
26+
"k8s.io/ingress-nginx/internal/ingress/metric/collectors"
2627
"net"
2728
"net/http"
2829
"os"
@@ -377,6 +378,63 @@ func (n *NGINXController) Start() {
377378
}
378379
}
379380

381+
// stopWait waits until no more connections are made to nginx.
382+
//
383+
// This waits until all of following conditions are met:
384+
// - No more requests are made to nginx for the last 5 seconds.
385+
// - 'shutdown-grace-period' seconds have passed after calling this method.
386+
//
387+
// Pods in Kubernetes endpoints are expected to shut-down 'gracefully' after receiving SIGTERM -
388+
// we should keep accepting new connections for a while. This is because Kubernetes updates Service endpoints
389+
// and sends SIGTERM to pods *in parallel*.
390+
// If we don't see new requests for 5 seconds, then we assume that this pod was removed from the upstream endpoints
391+
// (AWS ALB endpoints for example), and proceed with shutdown.
392+
//
393+
// See https://github.com/kubernetes/kubernetes/issues/106476 for more detail on this issue.
394+
func (n *NGINXController) stopWait() {
395+
const checkFrequency = time.Second
396+
const waitUntilNoConnectionsFor = int((5 * time.Second) / checkFrequency)
397+
waitAtLeastUntil := time.Now().Add(time.Duration(n.cfg.ShutdownGracePeriod) * time.Second)
398+
399+
var scraper collectors.NginxStatusScraper
400+
lastRequests := 0
401+
noChangeTimes := 0
402+
403+
for ; ; time.Sleep(checkFrequency) {
404+
st, err := scraper.Scrape()
405+
if err != nil {
406+
klog.Warningf("failed to scrape nginx status: %v", err)
407+
noChangeTimes = 0
408+
continue
409+
}
410+
411+
diff := st.Requests - lastRequests
412+
// We assume that there were no client requests to nginx, if and only if
413+
// there were 0 to 2 increase in handled requests from the last scrape.
414+
// 1 is to account for our own stub_status request from this method,
415+
// and the other 1 is to account for the readinessProbe.
416+
// Note that readinessProbe DO happen even when the pod is terminating.
417+
// See: https://github.com/kubernetes/kubernetes/issues/122824#issuecomment-1899224434
418+
noChange := 0 <= diff && diff <= 2
419+
if noChange {
420+
noChangeTimes++
421+
if noChangeTimes >= waitUntilNoConnectionsFor {
422+
// Safe to proceed shutdown, we are seeing no more client request.
423+
break
424+
}
425+
} else {
426+
noChangeTimes = 0
427+
}
428+
lastRequests = st.Requests
429+
}
430+
431+
// Wait at least for the configured duration, if any
432+
delay := waitAtLeastUntil.Sub(time.Now())
433+
if delay > 0 {
434+
time.Sleep(delay)
435+
}
436+
}
437+
380438
// Stop gracefully stops the NGINX master process.
381439
func (n *NGINXController) Stop() error {
382440
n.isShuttingDown = true
@@ -388,7 +446,8 @@ func (n *NGINXController) Stop() error {
388446
return fmt.Errorf("shutdown already in progress")
389447
}
390448

391-
time.Sleep(time.Duration(n.cfg.ShutdownGracePeriod) * time.Second)
449+
klog.InfoS("Graceful shutdown - waiting until no more requests are made")
450+
n.stopWait()
392451

393452
klog.InfoS("Shutting down controller queues")
394453
close(n.stopCh)

test/e2e/framework/deployment.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -624,6 +624,10 @@ func (f *Framework) ScaleDeploymentToZero(name string) {
624624
assert.Nil(ginkgo.GinkgoT(), err, "getting deployment")
625625
assert.NotNil(ginkgo.GinkgoT(), d, "expected a deployment but none returned")
626626

627+
err = waitForPodsDeleted(f.KubeClientSet, 2*time.Minute, f.Namespace, &metav1.ListOptions{
628+
LabelSelector: labelSelectorToString(d.Spec.Selector.MatchLabels),
629+
})
630+
assert.Nil(ginkgo.GinkgoT(), err, "waiting for no pods")
627631
err = WaitForEndpoints(f.KubeClientSet, DefaultTimeout, name, f.Namespace, 0)
628632
assert.Nil(ginkgo.GinkgoT(), err, "waiting for no endpoints")
629633
}
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
/*
2+
Copyright 2020 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package gracefulshutdown
18+
19+
import (
20+
"context"
21+
"fmt"
22+
"github.com/onsi/ginkgo/v2"
23+
"github.com/stretchr/testify/assert"
24+
appsv1 "k8s.io/api/apps/v1"
25+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
26+
"k8s.io/ingress-nginx/test/e2e/framework"
27+
"net/http"
28+
"strings"
29+
"time"
30+
)
31+
32+
var _ = framework.IngressNginxDescribe("[Shutdown] Asynchronous shutdown", func() {
33+
f := framework.NewDefaultFramework("k8s-async-shutdown", func(f *framework.Framework) {
34+
f.Namespace = "k8s-async-shutdown"
35+
})
36+
37+
host := "async-shutdown"
38+
39+
ginkgo.BeforeEach(func() {
40+
f.NewSlowEchoDeployment()
41+
})
42+
43+
ginkgo.It("should not shut down while still receiving traffic", func() {
44+
defer ginkgo.GinkgoRecover()
45+
46+
err := f.UpdateIngressControllerDeployment(func(deployment *appsv1.Deployment) error {
47+
// Note: e2e's default terminationGracePeriodSeconds is 1 for some reason, so extend it
48+
grace := int64(300)
49+
deployment.Spec.Template.Spec.TerminationGracePeriodSeconds = &grace
50+
_, err := f.KubeClientSet.AppsV1().Deployments(f.Namespace).Update(context.TODO(), deployment, metav1.UpdateOptions{})
51+
return err
52+
})
53+
assert.Nil(ginkgo.GinkgoT(), err, "updating ingress controller deployment")
54+
55+
f.EnsureIngress(framework.NewSingleIngress(host, "/", host, f.Namespace, framework.SlowEchoService, 80, nil))
56+
57+
f.WaitForNginxServer(host,
58+
func(server string) bool {
59+
return strings.Contains(server, "server_name "+host)
60+
})
61+
62+
// We need to get pod IP first because after the pod becomes terminating,
63+
// it is removed from Service endpoints, and becomes unable to be discovered by "f.HTTPTestClient()".
64+
ip := f.GetNginxPodIP()
65+
66+
// Assume that the upstream takes 30 seconds to update its endpoints,
67+
// therefore we are still receiving traffic while shutting down
68+
go func() {
69+
defer ginkgo.GinkgoRecover()
70+
for i := 0; i < 120; i++ {
71+
f.HTTPDumbTestClient().
72+
GET("/").
73+
WithURL(fmt.Sprintf("http://%s/", ip)).
74+
WithHeader("Host", host).
75+
Expect().
76+
Status(http.StatusOK)
77+
78+
framework.Sleep(250 * time.Millisecond)
79+
}
80+
}()
81+
82+
start := time.Now()
83+
f.ScaleDeploymentToZero("nginx-ingress-controller")
84+
assert.GreaterOrEqualf(ginkgo.GinkgoT(), int(time.Since(start).Seconds()), 35,
85+
"should take more than 30 + 5 seconds for graceful shutdown")
86+
})
87+
})

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy