diff options
author | Michael Brown <mcb30@ipxe.org> | 2021-04-14 16:33:41 +0100 |
---|---|---|
committer | Michael Brown <mcb30@ipxe.org> | 2021-04-14 16:33:41 +0100 |
commit | 85d179f2c65d0a2afe9122b844a90c011d551ae1 (patch) | |
tree | 4e28fe6260f2df4911089d7747408d9f0c359688 | |
parent | 8ca43ccbc1984d60e50711ea326ca59ac03985d2 (diff) | |
download | ipxe-xen-sg.tar.gz |
[xen] Support scatter-gather to allow for jumbo framesxen-sg
The use of jumbo frames for the Xen netfront virtual NIC requires the
use of scatter-gather ("feature-sg"), with the receive descriptor ring
becoming a list of page-sized buffers and the backend using as many
page buffers as required for each packet.
Since iPXE's abstraction of an I/O buffer does not include any sort of
scatter-gather list, this requires an extra allocation and copy on the
receive datapath for any packet that spans more than a single page.
This support is required in order to successfully boot an AWS EC2
virtual machine (with non-enhanced networking) via iSCSI if jumbo
frames are enabled, since the netback driver used in EC2 seems not to
allow "feature-sg" to be renegotiated once the Linux kernel driver
takes over.
Signed-off-by: Michael Brown <mcb30@ipxe.org>
-rw-r--r-- | src/drivers/net/netfront.c | 188 | ||||
-rw-r--r-- | src/drivers/net/netfront.h | 16 | ||||
-rw-r--r-- | src/include/ipxe/xengrant.h | 7 |
3 files changed, 154 insertions, 57 deletions
diff --git a/src/drivers/net/netfront.c b/src/drivers/net/netfront.c index be210850a..1203e585c 100644 --- a/src/drivers/net/netfront.c +++ b/src/drivers/net/netfront.c @@ -56,7 +56,7 @@ FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL ); __einfo_uniqify ( EINFO_EIO, -NETIF_RSP_DROPPED, \ "Packet dropped" ) #define EIO_NETIF_RSP( status ) \ - EUNIQ ( EINFO_EIO, -(status), \ + EUNIQ ( EINFO_EIO, ( -(status) & 0x1f ), \ EIO_NETIF_RSP_ERROR, EIO_NETIF_RSP_DROPPED ) /****************************************************************************** @@ -326,6 +326,7 @@ static int netfront_create_ring ( struct netfront_nic *netfront, struct netfront_ring *ring ) { struct xen_device *xendev = netfront->xendev; struct xen_hypervisor *xen = xendev->xen; + physaddr_t addr; unsigned int i; int rc; @@ -345,11 +346,11 @@ static int netfront_create_ring ( struct netfront_nic *netfront, } /* Grant access to shared ring */ + addr = virt_to_phys ( ring->sring.raw ); if ( ( rc = xengrant_permit_access ( xen, ring->ref, xendev->backend_id, - 0, ring->sring.raw ) ) != 0 ) { + 0, addr ) ) != 0 ) { DBGC ( netfront, "NETFRONT %s could not permit access to " - "%#08lx: %s\n", xendev->key, - virt_to_phys ( ring->sring.raw ), strerror ( rc ) ); + "%#08lx: %s\n", xendev->key, addr, strerror ( rc ) ); goto err_permit_access; } @@ -358,10 +359,8 @@ static int netfront_create_ring ( struct netfront_nic *netfront, ring->ref ) ) != 0 ) goto err_write_num; - DBGC ( netfront, "NETFRONT %s %s=\"%d\" [%08lx,%08lx)\n", - xendev->key, ring->ref_key, ring->ref, - virt_to_phys ( ring->sring.raw ), - ( virt_to_phys ( ring->sring.raw ) + PAGE_SIZE ) ); + DBGC ( netfront, "NETFRONT %s %s=\"%d\" [%08lx,%08lx)\n", xendev->key, + ring->ref_key, ring->ref, addr, ( addr + PAGE_SIZE ) ); return 0; netfront_rm ( netfront, ring->ref_key ); @@ -378,7 +377,8 @@ static int netfront_create_ring ( struct netfront_nic *netfront, * * @v netfront Netfront device * @v ring Descriptor ring - * @v iobuf I/O buffer + * @v addr Physical address + * @v iobuf Associated I/O buffer, or NULL * @v id Buffer ID to fill in * @v ref Grant reference to fill in * @ret rc Return status code @@ -387,8 +387,9 @@ static int netfront_create_ring ( struct netfront_nic *netfront, * ring. */ static int netfront_push ( struct netfront_nic *netfront, - struct netfront_ring *ring, struct io_buffer *iobuf, - uint16_t *id, grant_ref_t *ref ) { + struct netfront_ring *ring, physaddr_t addr, + struct io_buffer *iobuf, uint16_t *id, + grant_ref_t *ref ) { struct xen_device *xendev = netfront->xendev; struct xen_hypervisor *xen = xendev->xen; unsigned int next_id; @@ -402,19 +403,15 @@ static int netfront_push ( struct netfront_nic *netfront, next_id = ring->ids[ ring->id_prod & ( ring->count - 1 ) ]; next_ref = ring->refs[next_id]; - /* Grant access to I/O buffer page. I/O buffers are naturally - * aligned, so we never need to worry about crossing a page - * boundary. - */ + /* Grant access to page containing address */ if ( ( rc = xengrant_permit_access ( xen, next_ref, xendev->backend_id, - 0, iobuf->data ) ) != 0 ) { + 0, addr ) ) != 0 ) { DBGC ( netfront, "NETFRONT %s could not permit access to " - "%#08lx: %s\n", xendev->key, - virt_to_phys ( iobuf->data ), strerror ( rc ) ); + "%#08lx: %s\n", xendev->key, addr, strerror ( rc ) ); return rc; } - /* Store I/O buffer */ + /* Store associated I/O buffer, if any */ assert ( ring->iobufs[next_id] == NULL ); ring->iobufs[next_id] = iobuf; @@ -434,7 +431,7 @@ static int netfront_push ( struct netfront_nic *netfront, * @v netfront Netfront device * @v ring Descriptor ring * @v id Buffer ID - * @ret iobuf I/O buffer + * @ret iobuf Associated I/O buffer, if any */ static struct io_buffer * netfront_pull ( struct netfront_nic *netfront, struct netfront_ring *ring, @@ -451,7 +448,6 @@ static struct io_buffer * netfront_pull ( struct netfront_nic *netfront, /* Retrieve I/O buffer */ iobuf = ring->iobufs[id]; - assert ( iobuf != NULL ); ring->iobufs[id] = NULL; /* Free buffer ID */ @@ -494,6 +490,22 @@ static void netfront_destroy_ring ( struct netfront_nic *netfront, ring->sring.raw = NULL; } +/** + * Discard partially received I/O buffers + * + * @v netfront Netfront device + */ +static void netfront_discard ( struct netfront_nic *netfront ) { + struct io_buffer *iobuf; + struct io_buffer *tmp; + + /* Discard all buffers in the list */ + list_for_each_entry_safe ( iobuf, tmp, &netfront->rx_partial, list ) { + list_del ( &iobuf->list ); + free_iob ( iobuf ); + } +} + /****************************************************************************** * * Network device interface @@ -512,6 +524,7 @@ static void netfront_refill_rx ( struct net_device *netdev ) { struct io_buffer *iobuf; struct netif_rx_request *request; unsigned int refilled = 0; + physaddr_t addr; int notify; int rc; @@ -524,24 +537,24 @@ static void netfront_refill_rx ( struct net_device *netdev ) { /* Wait for next refill */ break; } + addr = virt_to_phys ( iobuf->data ); /* Add to descriptor ring */ request = RING_GET_REQUEST ( &netfront->rx_fring, netfront->rx_fring.req_prod_pvt ); - if ( ( rc = netfront_push ( netfront, &netfront->rx, + if ( ( rc = netfront_push ( netfront, &netfront->rx, addr, iobuf, &request->id, &request->gref ) ) != 0 ) { netdev_rx_err ( netdev, iobuf, rc ); break; } DBGC2 ( netfront, "NETFRONT %s RX id %d ref %d is %#08lx+%zx\n", - xendev->key, request->id, request->gref, - virt_to_phys ( iobuf->data ), iob_tailroom ( iobuf ) ); + xendev->key, request->id, request->gref, addr, + iob_tailroom ( iobuf ) ); /* Move to next descriptor */ netfront->rx_fring.req_prod_pvt++; refilled++; - } /* Push new descriptors and notify backend if applicable */ @@ -593,6 +606,10 @@ static int netfront_open ( struct net_device *netdev ) { if ( ( rc = netfront_write_flag ( netfront, "request-rx-copy" ) ) != 0 ) goto err_request_rx_copy; + /* Inform backend that we can support scatter-gather */ + if ( ( rc = netfront_write_flag ( netfront, "feature-sg" ) ) != 0 ) + goto err_feature_sg; + /* Disable checksum offload, since we will always do the work anyway */ if ( ( rc = netfront_write_flag ( netfront, "feature-no-csum-offload" ) ) != 0 ) @@ -632,6 +649,8 @@ static int netfront_open ( struct net_device *netdev ) { err_feature_rx_notify: netfront_rm ( netfront, "feature-no-csum-offload" ); err_feature_no_csum_offload: + netfront_rm ( netfront, "feature-sg" ); + err_feature_sg: netfront_rm ( netfront, "request-rx-copy" ); err_request_rx_copy: netfront_destroy_event ( netfront ); @@ -675,11 +694,15 @@ static void netfront_close ( struct net_device *netdev ) { /* Delete flags */ netfront_rm ( netfront, "feature-rx-notify" ); netfront_rm ( netfront, "feature-no-csum-offload" ); + netfront_rm ( netfront, "feature-sg" ); netfront_rm ( netfront, "request-rx-copy" ); /* Destroy event channel */ netfront_destroy_event ( netfront ); + /* Discard any partially received I/O buffers */ + netfront_discard ( netfront ); + /* Destroy receive descriptor ring, freeing any outstanding * I/O buffers. */ @@ -703,34 +726,66 @@ static int netfront_transmit ( struct net_device *netdev, struct netfront_nic *netfront = netdev->priv; struct xen_device *xendev = netfront->xendev; struct netif_tx_request *request; + physaddr_t addr; + size_t len; + size_t remaining; + size_t frag_len; + unsigned int offset; + unsigned int count; + unsigned int more; int notify; int rc; + /* Calculate number of page buffers required */ + addr = virt_to_phys ( iobuf->data ); + len = iob_len ( iobuf ); + offset = ( addr & ( PAGE_SIZE - 1 ) ); + count = ( ( offset + len + PAGE_SIZE - 1 ) / PAGE_SIZE ); + /* Check that we have space in the ring */ - if ( netfront_ring_is_full ( &netfront->tx ) ) { + if ( netfront_ring_space ( &netfront->tx ) < count ) { DBGC ( netfront, "NETFRONT %s out of transmit descriptors\n", xendev->key ); return -ENOBUFS; } /* Add to descriptor ring */ - request = RING_GET_REQUEST ( &netfront->tx_fring, - netfront->tx_fring.req_prod_pvt ); - if ( ( rc = netfront_push ( netfront, &netfront->tx, iobuf, - &request->id, &request->gref ) ) != 0 ) { - return rc; - } - request->offset = ( virt_to_phys ( iobuf->data ) & ( PAGE_SIZE - 1 ) ); - request->flags = NETTXF_data_validated; - request->size = iob_len ( iobuf ); - DBGC2 ( netfront, "NETFRONT %s TX id %d ref %d is %#08lx+%zx\n", - xendev->key, request->id, request->gref, - virt_to_phys ( iobuf->data ), iob_len ( iobuf ) ); + remaining = len; + while ( remaining ) { + + /* Calculate length of this fragment */ + frag_len = ( PAGE_SIZE - offset ); + if ( frag_len >= remaining ) { + frag_len = remaining; + more = 0; + } else { + more = NETTXF_more_data; + } - /* Consume descriptor */ - netfront->tx_fring.req_prod_pvt++; + /* Populate request */ + request = RING_GET_REQUEST ( &netfront->tx_fring, + netfront->tx_fring.req_prod_pvt ); + if ( ( rc = netfront_push ( netfront, &netfront->tx, addr, + ( more ? NULL : iobuf ), + &request->id, + &request->gref ) ) != 0 ) { + return rc; + } + request->flags = ( NETTXF_data_validated | more ); + request->offset = offset; + request->size = ( ( remaining == len ) ? len : frag_len ); + DBGC2 ( netfront, "NETFRONT %s TX id %d ref %d is " + "%#08lx+%zx%s\n", xendev->key, request->id, + request->gref, addr, frag_len, ( more ? "..." : "" ) ); + + /* Move to next descriptor */ + netfront->tx_fring.req_prod_pvt++; + addr += frag_len; + remaining -= frag_len; + offset = 0; + } - /* Push new descriptor and notify backend if applicable */ + /* Push new descriptors and notify backend if applicable */ RING_PUSH_REQUESTS_AND_CHECK_NOTIFY ( &netfront->tx_fring, notify ); if ( notify ) netfront_send_event ( netfront ); @@ -748,7 +803,7 @@ static void netfront_poll_tx ( struct net_device *netdev ) { struct xen_device *xendev = netfront->xendev; struct netif_tx_response *response; struct io_buffer *iobuf; - unsigned int status; + int status; int rc; /* Consume any unconsumed responses */ @@ -761,10 +816,11 @@ static void netfront_poll_tx ( struct net_device *netdev ) { /* Retrieve from descriptor ring */ iobuf = netfront_pull ( netfront, &netfront->tx, response->id ); status = response->status; - if ( status == NETIF_RSP_OKAY ) { + if ( status >= NETIF_RSP_OKAY ) { DBGC2 ( netfront, "NETFRONT %s TX id %d complete\n", xendev->key, response->id ); - netdev_tx_complete ( netdev, iobuf ); + if ( iobuf ) + netdev_tx_complete ( netdev, iobuf ); } else { rc = -EIO_NETIF_RSP ( status ); DBGC2 ( netfront, "NETFRONT %s TX id %d error %d: %s\n", @@ -786,6 +842,7 @@ static void netfront_poll_rx ( struct net_device *netdev ) { struct netif_rx_response *response; struct io_buffer *iobuf; int status; + int more; size_t len; int rc; @@ -799,21 +856,45 @@ static void netfront_poll_rx ( struct net_device *netdev ) { /* Retrieve from descriptor ring */ iobuf = netfront_pull ( netfront, &netfront->rx, response->id ); status = response->status; - if ( status >= 0 ) { - len = status; - iob_reserve ( iobuf, response->offset ); - iob_put ( iobuf, len ); - DBGC2 ( netfront, "NETFRONT %s RX id %d complete " - "%#08lx+%zx\n", xendev->key, response->id, - virt_to_phys ( iobuf->data ), len ); - netdev_rx ( netdev, iobuf ); - } else { + more = ( response->flags & NETRXF_more_data ); + + /* Report errors */ + if ( status < 0 ) { rc = -EIO_NETIF_RSP ( status ); DBGC2 ( netfront, "NETFRONT %s RX id %d error %d: %s\n", xendev->key, response->id, status, strerror ( rc ) ); + netfront_discard ( netfront ); netdev_rx_err ( netdev, iobuf, rc ); + continue; } + + /* Add to partial receive list */ + len = status; + iob_reserve ( iobuf, response->offset ); + iob_put ( iobuf, len ); + DBGC2 ( netfront, "NETFRONT %s RX id %d complete " + "%#08lx+%zx%s\n", xendev->key, response->id, + virt_to_phys ( iobuf->data ), len, + ( more ? "..." : "" ) ); + list_add_tail ( &iobuf->list, &netfront->rx_partial ); + + /* Wait until complete packet has been received */ + if ( more ) + continue; + + /* Reassemble complete packet */ + iobuf = iob_concatenate ( &netfront->rx_partial ); + if ( ! iobuf ) { + DBGC2 ( netfront, "NETFRONT %s RX reassembly failed\n", + xendev->key ); + netfront_discard ( netfront ); + netdev_rx_err ( netdev, NULL, -ENOMEM ); + continue; + } + + /* Hand off to network stack */ + netdev_rx ( netdev, iobuf ); } } @@ -871,6 +952,7 @@ static int netfront_probe ( struct xen_device *xendev ) { netdev->dev = &xendev->dev; netfront = netdev->priv; netfront->xendev = xendev; + INIT_LIST_HEAD ( &netfront->rx_partial ); DBGC ( netfront, "NETFRONT %s backend=\"%s\" in domain %ld\n", xendev->key, xendev->backend, xendev->backend_id ); diff --git a/src/drivers/net/netfront.h b/src/drivers/net/netfront.h index c95ed2645..dca3ff1c5 100644 --- a/src/drivers/net/netfront.h +++ b/src/drivers/net/netfront.h @@ -65,7 +65,7 @@ struct netfront_ring { size_t count; /** I/O buffers, indexed by buffer ID */ struct io_buffer **iobufs; - /** I/O buffer grant references, indexed by buffer ID */ + /** Grant references, indexed by buffer ID */ grant_ref_t *refs; /** Buffer ID ring */ @@ -117,6 +117,18 @@ netfront_ring_fill ( struct netfront_ring *ring ) { } /** + * Calculate descriptor ring remaining space + * + * @v ring Descriptor ring + * @v space Number of unused entries + */ +static inline __attribute__ (( always_inline )) unsigned int +netfront_ring_space ( struct netfront_ring *ring ) { + + return ( ring->count - netfront_ring_fill ( ring ) ); +} + +/** * Check whether or not descriptor ring is full * * @v ring Descriptor ring @@ -164,6 +176,8 @@ struct netfront_nic { struct io_buffer *rx_iobufs[NETFRONT_NUM_RX_DESC]; /** Receive I/O buffer IDs */ uint8_t rx_ids[NETFRONT_NUM_RX_DESC]; + /** Partial receive I/O buffer list */ + struct list_head rx_partial; /** Event channel */ struct evtchn_send event; diff --git a/src/include/ipxe/xengrant.h b/src/include/ipxe/xengrant.h index 451a3ceee..fcb7a7157 100644 --- a/src/include/ipxe/xengrant.h +++ b/src/include/ipxe/xengrant.h @@ -166,16 +166,17 @@ xengrant_invalidate ( struct xen_hypervisor *xen, grant_ref_t ref ) { * @v ref Grant reference * @v domid Domain ID * @v subflags Additional flags - * @v page Page start + * @v addr Physical address within page * @ret rc Return status code */ static inline __attribute__ (( always_inline )) int xengrant_permit_access ( struct xen_hypervisor *xen, grant_ref_t ref, - domid_t domid, unsigned int subflags, void *page ) { + domid_t domid, unsigned int subflags, + physaddr_t addr ) { struct grant_entry_header *hdr = xengrant_header ( xen, ref ); struct grant_entry_v1 *v1 = xengrant_v1 ( hdr ); union grant_entry_v2 *v2 = xengrant_v2 ( hdr ); - unsigned long frame = ( virt_to_phys ( page ) / PAGE_SIZE ); + unsigned long frame = ( addr / PAGE_SIZE ); /* Fail (for test purposes) if applicable */ if ( ( XENGRANT_FAIL_RATE > 0 ) && |