Skip to content

Commit 3f552bd

Browse files
author
Valentin Obst
committed
net/tcp: add Rust implementation of BIC
Reimplement the Binary Increase Congestion (BIC) control algorithm in Rust. BIC is one of the smallest CCAs in the kernel and this mainly serves as a minimal example for a real-world algorithm. Signed-off-by: Valentin Obst <kernel@valentinobst.de>
1 parent b4aaf14 commit 3f552bd

File tree

3 files changed

+314
-0
lines changed

3 files changed

+314
-0
lines changed

net/ipv4/Kconfig

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,15 @@ config TCP_CONG_BIC
510510
increase provides TCP friendliness.
511511
See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
512512

513+
config TCP_CONG_BIC_RUST
514+
tristate "Binary Increase Congestion (BIC) control (Rust rewrite)"
515+
depends on RUST_TCP_ABSTRACTIONS
516+
help
517+
Rust rewrite of the original implementation of Binary Increase
518+
Congestion (BIC) control.
519+
520+
If unsure, say N.
521+
513522
config TCP_CONG_CUBIC
514523
tristate "CUBIC TCP"
515524
default y
@@ -705,6 +714,9 @@ choice
705714
config DEFAULT_BIC
706715
bool "Bic" if TCP_CONG_BIC=y
707716

717+
config DEFAULT_BIC_RUST
718+
bool "Bic (Rust)" if TCP_CONG_BIC_RUST=y
719+
708720
config DEFAULT_CUBIC
709721
bool "Cubic" if TCP_CONG_CUBIC=y
710722

@@ -746,6 +758,7 @@ config TCP_CONG_CUBIC
746758
config DEFAULT_TCP_CONG
747759
string
748760
default "bic" if DEFAULT_BIC
761+
default "bic_rust" if DEFAULT_BIC_RUST
749762
default "cubic" if DEFAULT_CUBIC
750763
default "htcp" if DEFAULT_HTCP
751764
default "hybla" if DEFAULT_HYBLA

net/ipv4/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
4646
obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o
4747
obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
4848
obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
49+
obj-$(CONFIG_TCP_CONG_BIC_RUST) += tcp_bic_rust.o
4950
obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
5051
obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
5152
obj-$(CONFIG_TCP_CONG_DCTCP) += tcp_dctcp.o

net/ipv4/tcp_bic_rust.rs

Lines changed: 300 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,300 @@
1+
// SPDX-License-Identifier: GPL-2.0
2+
//!
3+
//! Binary Increase Congestion control (BIC). Based on:
4+
//! Binary Increase Congestion Control (BIC) for Fast Long-Distance
5+
//! Networks - Lisong Xu, Khaled Harfoush, and Injong Rhee
6+
//! IEEE INFOCOM 2004, Hong Kong, China, 2004, pp. 2514-2524 vol.4
7+
//! doi: 10.1109/INFCOM.2004.1354672
8+
//! Link: https://doi.org/10.1109/INFCOM.2004.1354672
9+
//! Link: https://web.archive.org/web/20160417213452/http://netsrv.csc.ncsu.edu/export/bitcp.pdf
10+
11+
use core::cmp::{max, min};
12+
use core::num::NonZeroU32;
13+
use kernel::net::tcp::cong;
14+
use kernel::prelude::*;
15+
use kernel::time;
16+
use kernel::{c_str, module_cca};
17+
18+
/// Fixed-point shift applied to the packets-per-ACK estimate: `delayed_ack`
/// holds the ratio multiplied by `1 << ACK_RATIO_SHIFT`.
const ACK_RATIO_SHIFT: u32 = 4;

// TODO: Convert to module parameters once they are available.
/// Value of ssthresh for new connections.
const INITIAL_SSTHRESH: Option<u32> = None;
/// If cwnd is larger than this threshold, BIC engages; otherwise normal TCP
/// increase/decrease will be performed.
// NOTE: cwnd is expressed in units of full-sized segments.
const LOW_WINDOW: u32 = 14;
/// In binary search, go to point: `cwnd + (W_max - cwnd) / BICTCP_B`.
// Passing zero fails the build: the `panic!` is evaluated in const context.
// TODO: Convert to `NonZeroU32::new(x).unwrap()` once 'const_option' is stabilised.
const BICTCP_B: NonZeroU32 = match NonZeroU32::new(4) {
    Some(value) => value,
    None => panic!("BICTCP_B must be non-zero"),
};
/// The maximum increment, i.e., `S_max`. This is used during additive increase.
/// After crossing `W_max`, slow start is performed until passing
/// `MAX_INCREMENT * (BICTCP_B - 1)`.
// Passing zero fails the build: the `panic!` is evaluated in const context.
const MAX_INCREMENT: NonZeroU32 = match NonZeroU32::new(16) {
    Some(value) => value,
    None => panic!("MAX_INCREMENT must be non-zero"),
};
/// The number of RTT it takes to get from `W_max - BICTCP_B` to `W_max` (and
/// from `W_max` to `W_max + BICTCP_B`). This is not part of the original paper
/// and results in a slow additive increase across `W_max`.
const SMOOTH_PART: u32 = 20;
/// Enable or disable fast convergence.
const FAST_CONVERGENCE: bool = true;
/// Factor for multiplicative decrease. In fast retransmit we have:
///     `cwnd = cwnd * BETA/BETA_SCALE`
/// and if fast convergence is active:
///     `W_max = cwnd * (1 + BETA/BETA_SCALE)/2`
/// instead of `W_max = cwnd`.
const BETA: u32 = 819;
/// Used to calculate beta in [0, 1] with integer arithmetic.
// Passing zero fails the build: the `panic!` is evaluated in const context.
const BETA_SCALE: NonZeroU32 = match NonZeroU32::new(1024) {
    Some(value) => value,
    None => panic!("BETA_SCALE must be non-zero"),
};
51+
/// The minimum amount of time that has to pass between two updates of the cwnd.
52+
const MIN_UPDATE_INTERVAL: time::Nsecs = 31250000;
53+
54+
// Module declaration for this congestion control algorithm, with `Bic` as the
// implementing type.
// NOTE(review): presumably `module_cca!` also takes care of registering and
// unregistering the algorithm on module load/unload; confirm against the macro
// definition.
module_cca! {
    type: Bic,
    name: "tcp_bic_rust",
    author: "Rust for Linux Contributors",
    description: "Binary Increase Congestion (BIC) control algorithm, Rust implementation",
    license: "GPL v2",
}
61+
62+
/// Field-less type on which the BIC implementation of `cong::Algorithm` is
/// defined; the per-connection state is kept in `BicState`.
struct Bic {}
63+
64+
#[vtable]
65+
impl cong::Algorithm for Bic {
66+
type Data = BicState;
67+
68+
const NAME: &'static CStr = c_str!("bic_rust");
69+
70+
fn pkts_acked(sk: &mut cong::Sock<'_, Self>, sample: &cong::AckSample) {
71+
if let Ok(cong::State::Open) = sk.inet_csk().ca_state() {
72+
let ca = sk.inet_csk_ca_mut();
73+
74+
// This is supposed to wrap.
75+
ca.delayed_ack = ca.delayed_ack.wrapping_add(
76+
sample
77+
.pkts_acked()
78+
.wrapping_sub(ca.delayed_ack >> ACK_RATIO_SHIFT),
79+
);
80+
}
81+
}
82+
83+
fn ssthresh(sk: &mut cong::Sock<'_, Self>) -> u32 {
84+
let cwnd = sk.tcp_sk().snd_cwnd();
85+
let ca = sk.inet_csk_ca_mut();
86+
87+
pr_info!(
88+
// TODO: remove
89+
"Enter fast retransmit: time {}, start {}",
90+
time::ktime_get_boot_fast_ns(),
91+
ca.start_time
92+
);
93+
94+
// Epoch has ended.
95+
ca.epoch_start = 0;
96+
ca.last_max_cwnd = if cwnd < ca.last_max_cwnd && FAST_CONVERGENCE {
97+
(cwnd * (BETA_SCALE.get() + BETA)) / (2 * BETA_SCALE.get())
98+
} else {
99+
cwnd
100+
};
101+
102+
if cwnd <= LOW_WINDOW {
103+
max(cwnd >> 1, 2)
104+
} else {
105+
max((cwnd * BETA) / BETA_SCALE, 2)
106+
}
107+
}
108+
109+
fn cong_avoid(sk: &mut cong::Sock<'_, Self>, _ack: u32, mut acked: u32) {
110+
if !sk.tcp_is_cwnd_limited() {
111+
return;
112+
}
113+
114+
let tp = sk.tcp_sk_mut();
115+
116+
if tp.in_slow_start() {
117+
acked = tp.slow_start(acked);
118+
if acked == 0 {
119+
pr_info!(
120+
// TODO: remove
121+
"New cwnd {}, time {}, ssthresh {}, start {}, ss 1",
122+
sk.tcp_sk().snd_cwnd(),
123+
time::ktime_get_boot_fast_ns(),
124+
sk.tcp_sk().snd_ssthresh(),
125+
sk.inet_csk_ca().start_time
126+
);
127+
return;
128+
}
129+
}
130+
131+
let cwnd = tp.snd_cwnd();
132+
let cnt = sk.inet_csk_ca_mut().update(cwnd);
133+
sk.tcp_sk_mut().cong_avoid_ai(cnt, acked);
134+
135+
pr_info!(
136+
// TODO: remove
137+
"New cwnd {}, time {}, ssthresh {}, start {}, ss 0",
138+
sk.tcp_sk().snd_cwnd(),
139+
time::ktime_get_boot_fast_ns(),
140+
sk.tcp_sk().snd_ssthresh(),
141+
sk.inet_csk_ca().start_time
142+
);
143+
}
144+
145+
fn set_state(sk: &mut cong::Sock<'_, Self>, new_state: cong::State) {
146+
if matches!(new_state, cong::State::Loss) {
147+
pr_info!(
148+
// TODO: remove
149+
"Retransmission timeout fired: time {}, start {}",
150+
time::ktime_get_boot_fast_ns(),
151+
sk.inet_csk_ca().start_time
152+
);
153+
sk.inet_csk_ca_mut().reset()
154+
}
155+
}
156+
157+
fn undo_cwnd(sk: &mut cong::Sock<'_, Self>) -> u32 {
158+
pr_info!(
159+
// TODO: remove
160+
"Undo cwnd reduction: time {}, start {}",
161+
time::ktime_get_boot_fast_ns(),
162+
sk.inet_csk_ca().start_time
163+
);
164+
165+
cong::reno::undo_cwnd(sk)
166+
}
167+
168+
fn init(sk: &mut cong::Sock<'_, Self>) {
169+
if let Some(ssthresh) = INITIAL_SSTHRESH {
170+
sk.tcp_sk_mut().set_snd_ssthresh(ssthresh);
171+
}
172+
173+
// TODO: remove
174+
pr_info!("Socket created: start {}", sk.inet_csk_ca().start_time);
175+
}
176+
177+
// TODO: remove
178+
fn release(sk: &mut cong::Sock<'_, Self>) {
179+
pr_info!(
180+
"Socket destroyed: start {}, end {}",
181+
sk.inet_csk_ca().start_time,
182+
time::ktime_get_boot_fast_ns()
183+
);
184+
}
185+
}
186+
187+
/// Internal state of each instance of the algorithm.
struct BicState {
    /// During congestion avoidance, cwnd is increased at most every `cnt`
    /// acknowledged packets, i.e., the average increase per acknowledged packet
    /// is proportional to `1 / cnt`.
    // NOTE: The C impl initialises this to zero. It then ensures that zero is
    // never passed to `cong_avoid_ai`, which could divide by it. Make it
    // explicit in the types that zero is not a valid value.
    cnt: NonZeroU32,
    /// Last maximum `snd_cwnd`, i.e., `W_max`. Zero until the first window
    /// reduction has been recorded.
    last_max_cwnd: u32,
    /// The last `snd_cwnd`.
    last_cwnd: u32,
    /// Time when `last_cwnd` was updated.
    last_time: time::Nsecs,
    /// Records the beginning of an epoch. Zero means that no epoch is in
    /// progress; the next update starts one.
    epoch_start: time::Nsecs,
    /// Estimates the ratio of `packets/ACK << 4`. This allows us to adjust cwnd
    /// per packet when a receiver is sending a single ACK for multiple received
    /// packets.
    // NOTE: The shift of 4 is `ACK_RATIO_SHIFT`.
    delayed_ack: u32,
    /// Time when algorithm was initialised.
    // TODO: remove
    start_time: time::Nsecs,
}
212+
213+
impl Default for BicState {
214+
fn default() -> Self {
215+
Self {
216+
// NOTE: Initializing this to 1 deviates from the C code. It does
217+
// not change the behavior.
218+
cnt: NonZeroU32::MIN,
219+
last_max_cwnd: 0,
220+
last_cwnd: 0,
221+
last_time: 0,
222+
epoch_start: 0,
223+
delayed_ack: 2 << ACK_RATIO_SHIFT,
224+
// TODO: remove
225+
start_time: time::ktime_get_boot_fast_ns(),
226+
}
227+
}
228+
}
229+
230+
impl BicState {
231+
/// Compute congestion window to use. Returns the new `cnt`.
232+
///
233+
/// This governs the behavior of the algorithm during congestion avoidance.
234+
fn update(&mut self, cwnd: u32) -> NonZeroU32 {
235+
let timestamp = time::ktime_get_boot_fast_ns();
236+
237+
// Do nothing if we are invoked too frequently.
238+
if self.last_cwnd == cwnd && (timestamp - self.last_time) <= MIN_UPDATE_INTERVAL {
239+
return self.cnt;
240+
}
241+
242+
self.last_cwnd = cwnd;
243+
self.last_time = timestamp;
244+
245+
// Record the beginning of an epoch.
246+
if self.epoch_start == 0 {
247+
self.epoch_start = timestamp;
248+
}
249+
250+
// Start off like normal TCP.
251+
if cwnd <= LOW_WINDOW {
252+
self.cnt = NonZeroU32::new(cwnd).unwrap_or(NonZeroU32::MIN);
253+
return self.cnt;
254+
}
255+
256+
let mut new_cnt = if cwnd < self.last_max_cwnd {
257+
// binary increase
258+
let dist: u32 = (self.last_max_cwnd - cwnd) / BICTCP_B;
259+
260+
if dist > MAX_INCREMENT.get() {
261+
cwnd / MAX_INCREMENT // additive increase
262+
} else if dist <= 1 {
263+
(cwnd * SMOOTH_PART) / BICTCP_B // careful additive increase
264+
} else {
265+
cwnd / dist // binary search
266+
}
267+
} else {
268+
if cwnd < self.last_max_cwnd + BICTCP_B.get() {
269+
(cwnd * SMOOTH_PART) / BICTCP_B // careful additive increase
270+
} else if cwnd < self.last_max_cwnd + MAX_INCREMENT.get() * (BICTCP_B.get() - 1) {
271+
(cwnd * (BICTCP_B.get() - 1)) / (cwnd - self.last_max_cwnd) // slow start
272+
} else {
273+
cwnd / MAX_INCREMENT // linear increase
274+
}
275+
};
276+
277+
// If in initial slow start or link utilization is very low.
278+
if self.last_max_cwnd == 0 {
279+
new_cnt = min(new_cnt, 20);
280+
}
281+
282+
// Account for estimated packets/ACK to ensure that we increase per
283+
// packet.
284+
new_cnt = (new_cnt << ACK_RATIO_SHIFT) / self.delayed_ack;
285+
286+
self.cnt = NonZeroU32::new(new_cnt).unwrap_or(NonZeroU32::MIN);
287+
288+
self.cnt
289+
}
290+
291+
fn reset(&mut self) {
292+
// TODO: remove
293+
let tmp = self.start_time;
294+
295+
*self = Self::default();
296+
297+
// TODO: remove
298+
self.start_time = tmp;
299+
}
300+
}

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy