aboutsummaryrefslogtreecommitdiffstats
path: root/src/include
diff options
context:
space:
mode:
authorMichael Brown <mcb30@ipxe.org>2015-03-11 17:53:29 +0000
committerMichael Brown <mcb30@ipxe.org>2015-03-11 23:14:39 +0000
commite0fc8fe781b81083298dcddb08744c4c1ddf41ff (patch)
treebc59126d610fed247365b466eb57451bcae0360d /src/include
parentbc985ca08970951e2ba0ca16cb6eff17eedc1848 (diff)
downloadipxe-e0fc8fe781b81083298dcddb08744c4c1ddf41ff.tar.gz
[tcp] Implement support for TCP Selective Acknowledgements (SACK)
The TCP Selective Acknowledgement option (specified in RFC2018) provides a mechanism for the receiver to indicate packets that have been received out of order (e.g. due to earlier dropped packets). iPXE often operates in environments in which there is a high probability of packet loss. For example, the legacy USB keyboard emulation in some BIOSes involves polling the USB bus from within a system management interrupt: this introduces an invisible delay of around 500us which is long enough for around 40 full-length packets to be dropped. Similarly, almost all 1Gbps USB2 devices will eventually end up dropping packets because the USB2 bus does not provide enough bandwidth to sustain a 1Gbps stream, and most devices will not provide enough internal buffering to hold a full TCP window's worth of received packets. Add support for sending TCP Selective Acknowledgements. This provides the sender with more detailed information about which packets have been lost, and so allows for a more efficient retransmission strategy. We include a SACK-permitted option in our SYN packet, since experimentation shows that at least Linux peers will not include a SACK-permitted option in the SYN-ACK packet if one was not present in the initial SYN. (RFC2018 does not seem to mandate this behaviour, but it is consistent with the approach taken in RFC1323.) We ignore any received SACK options; this is safe to do since SACK is only ever advisory and we never have to send non-trivial amounts of data. Since our TCP receive queue is a candidate for cache discarding under low memory conditions, we may end up discarding data that has been reported as received via a SACK option. This is permitted by RFC2018. We follow the stricture that SACK blocks must not report data which is no longer held by the receiver: previously-reported blocks are validated against the current receive queue before being included within the current SACK block list. Experiments in a qemu VM using forced packet drops (by setting NETDEV_DISCARD_RATE to 32) show that implementing SACK improves throughput by around 400%. Experiments with a USB2 NIC (an SMSC7500) show that implementing SACK improves throughput by around 700%, increasing the download rate from 35Mbps up to 250Mbps (which is approximately the usable bandwidth limit for USB2). Signed-off-by: Michael Brown <mcb30@ipxe.org>
Diffstat (limited to 'src/include')
-rw-r--r--src/include/ipxe/tcp.h44
1 files changed, 44 insertions, 0 deletions
diff --git a/src/include/ipxe/tcp.h b/src/include/ipxe/tcp.h
index f12b4283..65fee85e 100644
--- a/src/include/ipxe/tcp.h
+++ b/src/include/ipxe/tcp.h
@@ -79,6 +79,48 @@ struct tcp_window_scale_padded_option {
*/
#define TCP_RX_WINDOW_SCALE 9
+/** TCP selective acknowledgement permitted option */
+struct tcp_sack_permitted_option {
+ uint8_t kind;
+ uint8_t length;
+} __attribute__ (( packed ));
+
+/** Padded TCP selective acknowledgement permitted option (used for sending) */
+struct tcp_sack_permitted_padded_option {
+ uint8_t nop[2];
+ struct tcp_sack_permitted_option spopt;
+} __attribute__ (( packed ));
+
+/** Code for the TCP selective acknowledgement permitted option */
+#define TCP_OPTION_SACK_PERMITTED 4
+
+/** TCP selective acknowledgement option */
+struct tcp_sack_option {
+ uint8_t kind;
+ uint8_t length;
+} __attribute__ (( packed ));
+
+/** TCP selective acknowledgement block */
+struct tcp_sack_block {
+ uint32_t left;
+ uint32_t right;
+} __attribute__ (( packed ));
+
+/** Maximum number of selective acknowledgement blocks
+ *
+ * This allows for the presence of the TCP timestamp option.
+ */
+#define TCP_SACK_MAX 3
+
+/** Padded TCP selective acknowledgement option (used for sending) */
+struct tcp_sack_padded_option {
+ uint8_t nop[2];
+ struct tcp_sack_option sackopt;
+} __attribute__ (( packed ));
+
+/** Code for the TCP selective acknowledgement option */
+#define TCP_OPTION_SACK 5
+
/** TCP timestamp option */
struct tcp_timestamp_option {
uint8_t kind;
@@ -102,6 +144,8 @@ struct tcp_options {
const struct tcp_mss_option *mssopt;
/** Window scale option, if present */
const struct tcp_window_scale_option *wsopt;
+ /** SACK permitted option, if present */
+ const struct tcp_sack_permitted_option *spopt;
/** Timestamp option, if present */
const struct tcp_timestamp_option *tsopt;
};