Diffstat (limited to 'src/drivers/net/gve.h')
-rw-r--r-- | src/drivers/net/gve.h | 702
1 file changed, 702 insertions, 0 deletions
diff --git a/src/drivers/net/gve.h b/src/drivers/net/gve.h
new file mode 100644
index 000000000..2845699ac
--- /dev/null
+++ b/src/drivers/net/gve.h
@@ -0,0 +1,702 @@
+#ifndef _GVE_H
+#define _GVE_H
+
+/** @file
+ *
+ * Google Virtual Ethernet network driver
+ *
+ * The Google Virtual Ethernet NIC (GVE or gVNIC) is found only in
+ * Google Cloud instances. There is essentially zero documentation
+ * available beyond the mostly uncommented source code in the Linux
+ * kernel.
+ */
+
+FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
+
+#include <stdint.h>
+#include <ipxe/dma.h>
+#include <ipxe/pci.h>
+#include <ipxe/in.h>
+#include <ipxe/uaccess.h>
+#include <ipxe/process.h>
+#include <ipxe/retry.h>
+
+struct gve_nic;
+
+/**
+ * A Google Cloud MAC address
+ *
+ * Google Cloud locally assigned MAC addresses encode the local IPv4
+ * address in the trailing 32 bits, presumably as a performance
+ * optimisation to allow ARP resolution to be skipped by a suitably
+ * aware network stack.
+ */
+struct google_mac {
+	/** Reserved */
+	uint8_t reserved[2];
+	/** Local IPv4 address */
+	struct in_addr in;
+} __attribute__ (( packed ));
+
+/** Page size */
+#define GVE_PAGE_SIZE 0x1000
+
+/**
+ * Address alignment
+ *
+ * All DMA data structure base addresses seem to need to be aligned to
+ * a page boundary. (This is not documented anywhere, but is inferred
+ * from existing source code and experimentation.)
+ */
+#define GVE_ALIGN GVE_PAGE_SIZE
+
+/**
+ * Length alignment
+ *
+ * All DMA data structure lengths seem to need to be aligned to a
+ * multiple of 64 bytes. (This is not documented anywhere, but is
+ * inferred from existing source code and experimentation.)
+ */
+#define GVE_LEN_ALIGN 64
+
+/** Maximum number of pages per queue (must be a power of two) */
+#define GVE_QPL_MAX 16
+
+/** Configuration BAR */
+#define GVE_CFG_BAR PCI_BASE_ADDRESS_0
+
+/**
+ * Configuration BAR size
+ *
+ * All registers within the configuration BAR are big-endian.
+ */
+#define GVE_CFG_SIZE 0x1000
+
+/** Device status */
+#define GVE_CFG_DEVSTAT 0x0000
+#define GVE_CFG_DEVSTAT_RESET 0x00000010UL /**< Device is reset */
+
+/** Driver status */
+#define GVE_CFG_DRVSTAT 0x0004
+#define GVE_CFG_DRVSTAT_RUN 0x00000001UL /**< Run admin queue */
+
+/** Maximum time to wait for reset */
+#define GVE_RESET_MAX_WAIT_MS 500
+
+/** Admin queue page frame number (for older devices) */
+#define GVE_CFG_ADMIN_PFN 0x0010
+
+/** Admin queue doorbell */
+#define GVE_CFG_ADMIN_DB 0x0014
+
+/** Admin queue event counter */
+#define GVE_CFG_ADMIN_EVT 0x0018
+
+/** Driver version (8-bit register) */
+#define GVE_CFG_VERSION 0x001f
+
+/** Admin queue base address high 32 bits */
+#define GVE_CFG_ADMIN_BASE_HI 0x0020
+
+/** Admin queue base address low 32 bits */
+#define GVE_CFG_ADMIN_BASE_LO 0x0024
+
+/** Admin queue base address length (16-bit register) */
+#define GVE_CFG_ADMIN_LEN 0x0028
+
+/** Doorbell BAR */
+#define GVE_DB_BAR PCI_BASE_ADDRESS_2
+
+/**
+ * Admin queue entry header
+ *
+ * All values within admin queue entries are big-endian.
+ */
+struct gve_admin_header {
+	/** Reserved */
+	uint8_t reserved[3];
+	/** Operation code */
+	uint8_t opcode;
+	/** Status */
+	uint32_t status;
+} __attribute__ (( packed ));
+
+/** Command succeeded */
+#define GVE_ADMIN_STATUS_OK 0x00000001
+
+/** Simple admin command */
+struct gve_admin_simple {
+	/** Header */
+	struct gve_admin_header hdr;
+	/** ID */
+	uint32_t id;
+} __attribute__ (( packed ));
+
+/** Describe device command */
+#define GVE_ADMIN_DESCRIBE 0x0001
+
+/** Describe device command */
+struct gve_admin_describe {
+	/** Header */
+	struct gve_admin_header hdr;
+	/** Descriptor buffer address */
+	uint64_t addr;
+	/** Descriptor version */
+	uint32_t ver;
+	/** Descriptor maximum length */
+	uint32_t len;
+} __attribute__ (( packed ));
+
+/** Device descriptor version */
+#define GVE_ADMIN_DESCRIBE_VER 1
+
+/** Device descriptor */
+struct gve_device_descriptor {
+	/** Reserved */
+	uint8_t reserved_a[10];
+	/** Number of transmit queue entries */
+	uint16_t tx_count;
+	/** Number of receive queue entries */
+	uint16_t rx_count;
+	/** Reserved */
+	uint8_t reserved_b[2];
+	/** Maximum transmit unit */
+	uint16_t mtu;
+	/** Number of event counters */
+	uint16_t counters;
+	/** Reserved */
+	uint8_t reserved_c[4];
+	/** MAC address */
+	struct google_mac mac;
+	/** Reserved */
+	uint8_t reserved_d[10];
+} __attribute__ (( packed ));
+
+/** Configure device resources command */
+#define GVE_ADMIN_CONFIGURE 0x0002
+
+/** Configure device resources command */
+struct gve_admin_configure {
+	/** Header */
+	struct gve_admin_header hdr;
+	/** Event counter array */
+	uint64_t events;
+	/** IRQ doorbell address */
+	uint64_t irqs;
+	/** Number of event counters */
+	uint32_t num_events;
+	/** Number of IRQ doorbells */
+	uint32_t num_irqs;
+	/** IRQ doorbell stride */
+	uint32_t irq_stride;
+} __attribute__ (( packed ));
+
+/** Register page list command */
+#define GVE_ADMIN_REGISTER 0x0003
+
+/** Register page list command */
+struct gve_admin_register {
+	/** Header */
+	struct gve_admin_header hdr;
+	/** Page list ID */
+	uint32_t id;
+	/** Number of pages */
+	uint32_t count;
+	/** Address list address */
+	uint64_t addr;
+	/** Page size */
+	uint64_t size;
+} __attribute__ (( packed ));
+
+/** Page list */
+struct gve_pages {
+	/** Page address */
+	uint64_t addr[GVE_QPL_MAX];
+} __attribute__ (( packed ));
+
+/** Unregister page list command */
+#define GVE_ADMIN_UNREGISTER 0x0004
+
+/** Create transmit queue command */
+#define GVE_ADMIN_CREATE_TX 0x0005
+
+/** Create transmit queue command */
+struct gve_admin_create_tx {
+	/** Header */
+	struct gve_admin_header hdr;
+	/** Queue ID */
+	uint32_t id;
+	/** Reserved */
+	uint8_t reserved_a[4];
+	/** Queue resources address */
+	uint64_t res;
+	/** Descriptor ring address */
+	uint64_t desc;
+	/** Queue page list ID */
+	uint32_t qpl_id;
+	/** Notification channel ID */
+	uint32_t notify_id;
+} __attribute__ (( packed ));
+
+/** Create receive queue command */
+#define GVE_ADMIN_CREATE_RX 0x0006
+
+/** Create receive queue command */
+struct gve_admin_create_rx {
+	/** Header */
+	struct gve_admin_header hdr;
+	/** Queue ID */
+	uint32_t id;
+	/** Index */
+	uint32_t index;
+	/** Reserved */
+	uint8_t reserved_a[4];
+	/** Notification channel ID */
+	uint32_t notify_id;
+	/** Queue resources address */
+	uint64_t res;
+	/** Completion ring address */
+	uint64_t cmplt;
+	/** Descriptor ring address */
+	uint64_t desc;
+	/** Queue page list ID */
+	uint32_t qpl_id;
+	/** Reserved */
+	uint8_t reserved_b[2];
+	/** Packet buffer size */
+	uint16_t bufsz;
+} __attribute__ (( packed ));
+
+/** Destroy transmit queue command */
+#define GVE_ADMIN_DESTROY_TX 0x0007
+
+/** Destroy receive queue command */
+#define GVE_ADMIN_DESTROY_RX 0x0008
+
+/** Deconfigure device resources command */
+#define GVE_ADMIN_DECONFIGURE 0x0009
+
+/** An admin queue command */
+union gve_admin_command {
+	/** Header */
+	struct gve_admin_header hdr;
+	/** Simple command */
+	struct gve_admin_simple simple;
+	/** Describe device */
+	struct gve_admin_describe desc;
+	/** Configure device resources */
+	struct gve_admin_configure conf;
+	/** Register page list */
+	struct gve_admin_register reg;
+	/** Create transmit queue */
+	struct gve_admin_create_tx create_tx;
+	/** Create receive queue */
+	struct gve_admin_create_rx create_rx;
+	/** Padding */
+	uint8_t pad[64];
+};
+
+/**
+ * Number of admin queue commands
+ *
+ * This is theoretically a policy decision. However, older revisions
+ * of the hardware seem to have only the "admin queue page frame
+ * number" register and no "admin queue length" register, with the
+ * implication that the admin queue must be exactly one page in
+ * length.
+ *
+ * Choose to use a one page (4kB) admin queue for both older and newer
+ * versions of the hardware, to minimise variability.
+ */
+#define GVE_ADMIN_COUNT ( GVE_PAGE_SIZE / sizeof ( union gve_admin_command ) )
+
+/** Admin queue */
+struct gve_admin {
+	/** Commands */
+	union gve_admin_command *cmd;
+	/** Producer counter */
+	uint32_t prod;
+	/** DMA mapping */
+	struct dma_mapping map;
+};
+
+/** Scratch buffer for admin queue commands */
+struct gve_scratch {
+	/** Buffer contents */
+	union {
+		/** Device descriptor */
+		struct gve_device_descriptor desc;
+		/** Page address list */
+		struct gve_pages pages;
+	} *buf;
+	/** DMA mapping */
+	struct dma_mapping map;
+};
+
+/**
+ * An event counter
+ *
+ * Written by the device to indicate completions. The device chooses
+ * which counter to use for each transmit queue, and stores the index
+ * of the chosen counter in the queue resources.
+ */
+struct gve_event {
+	/** Number of events that have occurred */
+	volatile uint32_t count;
+} __attribute__ (( packed ));
+
+/**
+ * Maximum number of event counters
+ *
+ * We tell the device how many event counters we have provided via the
+ * "configure device resources" admin queue command. The device will
+ * accept being given only a single counter, but will subsequently
+ * fail to create a receive queue.
+ *
+ * There is, of course, no documentation indicating how many event
+ * counters actually need to be provided. In the absence of evidence
+ * to the contrary, assume that 16 counters (i.e. the smallest number
+ * we can allocate, given the length alignment constraint on
+ * allocations) will be sufficient.
+ */
+#define GVE_EVENT_MAX ( GVE_LEN_ALIGN / sizeof ( struct gve_event ) )
+
+/** Event counter array */
+struct gve_events {
+	/** Event counters */
+	struct gve_event *event;
+	/** DMA mapping */
+	struct dma_mapping map;
+	/** Actual number of event counters */
+	unsigned int count;
+};
+
+/** An interrupt channel */
+struct gve_irq {
+	/** Interrupt doorbell index (within doorbell BAR) */
+	uint32_t db_idx;
+	/** Reserved */
+	uint8_t reserved[60];
+} __attribute__ (( packed ));
+
+/**
+ * Number of interrupt channels
+ *
+ * We tell the device how many interrupt channels we have provided via
+ * the "configure device resources" admin queue command. The device
+ * will accept being given zero interrupt channels, but will
+ * subsequently fail to create more than a single queue (either
+ * transmit or receive).
+ *
+ * There is, of course, no documentation indicating how many interrupt
+ * channels actually need to be provided. In the absence of evidence
+ * to the contrary, assume that two channels (one for transmit, one
+ * for receive) will be sufficient.
+ */
+#define GVE_IRQ_COUNT 2
+
+/** Interrupt channel array */
+struct gve_irqs {
+	/** Interrupt channels */
+	struct gve_irq *irq;
+	/** DMA mapping */
+	struct dma_mapping map;
+	/** Interrupt doorbells */
+	volatile uint32_t *db[GVE_IRQ_COUNT];
+};
+
+/** Disable interrupts */
+#define GVE_IRQ_DISABLE 0x40000000UL
+
+/**
+ * Queue resources
+ *
+ * Written by the device to indicate the indices of the chosen event
+ * counter and descriptor doorbell register.
+ *
+ * This appears to be a largely pointless data structure: the relevant
+ * information is static for the lifetime of the queue and could
+ * trivially have been returned in the response for the "create
+ * transmit/receive queue" command, instead of requiring yet another
+ * page-aligned coherent DMA buffer allocation.
+ */
+struct gve_resources {
+	/** Descriptor doorbell index (within doorbell BAR) */
+	uint32_t db_idx;
+	/** Event counter index (within event counter array) */
+	uint32_t evt_idx;
+	/** Reserved */
+	uint8_t reserved[56];
+} __attribute__ (( packed ));
+
+/**
+ * Queue data buffer size
+ *
+ * In theory, we may specify the size of receive buffers. However,
+ * the original version of the device seems not to have a parameter
+ * for this, and assumes the use of half-page (2kB) buffers. Choose
+ * to use this as the buffer size, on the assumption that older
+ * devices will not support any other buffer size.
+ */
+#define GVE_BUF_SIZE ( GVE_PAGE_SIZE / 2 )
+
+/** Number of data buffers per page */
+#define GVE_BUF_PER_PAGE ( GVE_PAGE_SIZE / GVE_BUF_SIZE )
+
+/**
+ * Queue page list
+ *
+ * The device uses preregistered pages for fast-path DMA operations
+ * (i.e. transmit and receive buffers). A list of device addresses
+ * for each page must be registered before the transmit or receive
+ * queue is created, and cannot subsequently be modified.
+ *
+ * The Linux driver allocates pages as DMA_TO_DEVICE or
+ * DMA_FROM_DEVICE as appropriate, and uses dma_sync_single_for_cpu()
+ * etc to ensure that data is copied to/from bounce buffers as needed.
+ *
+ * Unfortunately there is no such sync operation available within our
+ * DMA API, since we are constrained by the limitations imposed by
+ * EFI_PCI_IO_PROTOCOL. There is no way to synchronise a buffer
+ * without also [un]mapping it, and no way to force the reuse of the
+ * same device address for a subsequent remapping. We are therefore
+ * constrained to use only DMA-coherent buffers, since this is the
+ * only way we can repeatedly reuse the same device address.
+ *
+ * Newer versions of the gVNIC device support "raw DMA addressing
+ * (RDA)", which is essentially a prebuilt queue page list covering
+ * the whole of the guest address space. Unfortunately we cannot rely
+ * on this, since older versions will not support it.
+ *
+ * Experimentation suggests that the device will accept a request to
+ * create a queue page list covering the whole of the guest address
+ * space via two giant "pages" of 2^63 bytes each. However,
+ * experimentation also suggests that the device will accept any old
+ * garbage value as the "page size". In the total absence of any
+ * documentation, it is probably unsafe to conclude that the device is
+ * bothering to look at or respect the "page size" parameter: it is
+ * most likely just presuming the use of 4kB pages.
+ */
+struct gve_qpl {
+	/** Page addresses */
+	userptr_t data;
+	/** Page mapping */
+	struct dma_mapping map;
+	/** Number of pages */
+	unsigned int count;
+	/** Queue page list ID */
+	unsigned int id;
+};
+
+/**
+ * Maximum number of transmit buffers
+ *
+ * This is a policy decision.
+ */
+#define GVE_TX_FILL 8
+
+/** Transmit queue page list ID */
+#define GVE_TX_QPL 0x18ae5458
+
+/** Transmit queue interrupt channel */
+#define GVE_TX_IRQ 0
+
+/** A transmit or receive buffer descriptor */
+struct gve_buffer {
+	/** Address (within queue page list address space) */
+	uint64_t addr;
+} __attribute__ (( packed ));
+
+/** A transmit packet descriptor */
+struct gve_tx_packet {
+	/** Type */
+	uint8_t type;
+	/** Reserved */
+	uint8_t reserved_a[2];
+	/** Number of descriptors in this packet */
+	uint8_t count;
+	/** Total length of this packet */
+	uint16_t total;
+	/** Length of this descriptor */
+	uint16_t len;
+} __attribute__ (( packed ));
+
+/** A transmit descriptor */
+struct gve_tx_descriptor {
+	/** Packet descriptor */
+	struct gve_tx_packet pkt;
+	/** Buffer descriptor */
+	struct gve_buffer buf;
+} __attribute__ (( packed ));
+
+/** Start of packet transmit descriptor type */
+#define GVE_TX_TYPE_START 0x00
+
+/** Continuation of packet transmit descriptor type */
+#define GVE_TX_TYPE_CONT 0x20
+
+/**
+ * Maximum number of receive buffers
+ *
+ * This is a policy decision.
+ */
+#define GVE_RX_FILL 16
+
+/** Receive queue page list ID */
+#define GVE_RX_QPL 0x18ae5258
+
+/** Receive queue interrupt channel */
+#define GVE_RX_IRQ 1
+
+/** A receive descriptor */
+struct gve_rx_descriptor {
+	/** Buffer descriptor */
+	struct gve_buffer buf;
+} __attribute__ (( packed ));
+
+/** A receive packet descriptor */
+struct gve_rx_packet {
+	/** Length */
+	uint16_t len;
+	/** Flags */
+	uint8_t flags;
+	/** Sequence number */
+	uint8_t seq;
+} __attribute__ (( packed ));
+
+/** Receive error */
+#define GVE_RXF_ERROR 0x08
+
+/** Receive packet continues into next descriptor */
+#define GVE_RXF_MORE 0x20
+
+/** Receive sequence number mask */
+#define GVE_RX_SEQ_MASK 0x07
+
+/** A receive completion descriptor */
+struct gve_rx_completion {
+	/** Reserved */
+	uint8_t reserved[60];
+	/** Packet descriptor */
+	struct gve_rx_packet pkt;
+} __attribute__ (( packed ));
+
+/** Padding at the start of all received packets */
+#define GVE_RX_PAD 2
+
+/** A descriptor queue */
+struct gve_queue {
+	/** Descriptor ring */
+	userptr_t desc;
+	/** Completion ring */
+	userptr_t cmplt;
+	/** Queue resources */
+	struct gve_resources *res;
+
+	/** Queue type */
+	const struct gve_queue_type *type;
+	/** Number of descriptors (must be a power of two) */
+	unsigned int count;
+	/** Maximum fill level (must be a power of two) */
+	unsigned int fill;
+
+	/** Descriptor mapping */
+	struct dma_mapping desc_map;
+	/** Completion mapping */
+	struct dma_mapping cmplt_map;
+	/** Queue resources mapping */
+	struct dma_mapping res_map;
+
+	/** Doorbell register */
+	volatile uint32_t *db;
+	/** Event counter */
+	struct gve_event *event;
+
+	/** Producer counter */
+	uint32_t prod;
+	/** Consumer counter */
+	uint32_t cons;
+
+	/** Queue page list */
+	struct gve_qpl qpl;
+};
+
+/** A descriptor queue type */
+struct gve_queue_type {
+	/** Name */
+	const char *name;
+	/**
+	 * Populate command parameters to create queue
+	 *
+	 * @v queue		Descriptor queue
+	 * @v cmd		Admin queue command
+	 */
+	void ( * param ) ( struct gve_queue *queue,
+			   union gve_admin_command *cmd );
+	/** Queue page list ID */
+	uint32_t qpl;
+	/** Interrupt channel */
+	uint8_t irq;
+	/** Maximum fill level */
+	uint8_t fill;
+	/** Descriptor size */
+	uint8_t desc_len;
+	/** Completion size */
+	uint8_t cmplt_len;
+	/** Command to create queue */
+	uint8_t create;
+	/** Command to destroy queue */
+	uint8_t destroy;
+};
+
+/** A Google Virtual Ethernet NIC */
+struct gve_nic {
+	/** Configuration registers */
+	void *cfg;
+	/** Doorbell registers */
+	void *db;
+	/** PCI revision */
+	uint8_t revision;
+	/** Network device */
+	struct net_device *netdev;
+	/** DMA device */
+	struct dma_device *dma;
+
+	/** Admin queue */
+	struct gve_admin admin;
+	/** Interrupt channels */
+	struct gve_irqs irqs;
+	/** Event counters */
+	struct gve_events events;
+	/** Scratch buffer */
+	struct gve_scratch scratch;
+
+	/** Transmit queue */
+	struct gve_queue tx;
+	/** Receive queue */
+	struct gve_queue rx;
+	/** Transmit I/O buffers */
+	struct io_buffer *tx_iobuf[GVE_TX_FILL];
+	/** Receive sequence number */
+	unsigned int seq;
+
+	/** Startup process */
+	struct process startup;
+	/** Startup process retry counter */
+	unsigned int retries;
+	/** Reset recovery watchdog timer */
+	struct retry_timer watchdog;
+	/** Reset recovery recorded activity counter */
+	uint32_t activity;
+};
+
+/** Maximum time to wait for admin queue commands */
+#define GVE_ADMIN_MAX_WAIT_MS 500
+
+/** Maximum number of times to reattempt device reset */
+#define GVE_RESET_MAX_RETRY 5
+
+/** Time between reset recovery checks */
+#define GVE_WATCHDOG_TIMEOUT ( 1 * TICKS_PER_SEC )
+
+#endif /* _GVE_H */
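
The struct google_mac layout above implies that a gVNIC-aware stack can read the local IPv4 address straight out of the last four bytes of the MAC address. The following standalone sketch (not part of the driver; the mirrored struct and the sample MAC value are invented here purely for illustration) shows that extraction:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <netinet/in.h>
#include <arpa/inet.h>

/* Mirror of the header's layout: two reserved bytes followed by the
 * locally assigned IPv4 address in the trailing 32 bits.
 */
struct google_mac_example {
	uint8_t reserved[2];
	struct in_addr in;
} __attribute__ (( packed ));

int main ( void ) {
	/* Hypothetical locally administered MAC 42:01:0a:00:00:02,
	 * which would encode the IPv4 address 10.0.0.2.
	 */
	uint8_t raw[6] = { 0x42, 0x01, 0x0a, 0x00, 0x00, 0x02 };
	struct google_mac_example mac;

	memcpy ( &mac, raw, sizeof ( mac ) );
	/* The trailing 32 bits are already in network byte order */
	printf ( "Local IPv4 address: %s\n", inet_ntoa ( mac.in ) );
	return 0;
}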
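
The sizing macros follow directly from the structure layouts: the 64-byte pad member makes union gve_admin_command exactly 64 bytes, so a one-page (4kB) admin queue holds 4096 / 64 = 64 commands, and the 64-byte length alignment divided by the 4-byte event counter gives the 16 counters mentioned in the GVE_EVENT_MAX comment. A minimal compile-time check of that arithmetic (assuming C11 and that gve.h is included from within the iPXE tree, since it pulls in iPXE-specific headers) might look like:

#include <assert.h>
#include "gve.h"

/* Compile-time checks of the sizing arithmetic described in the
 * comments above: a 4kB admin queue holds 64 commands, and one
 * 64-byte allocation holds 16 event counters.
 */
static_assert ( sizeof ( union gve_admin_command ) == 64,
		"admin command should be padded to 64 bytes" );
static_assert ( GVE_ADMIN_COUNT == 64,
		"one 4kB page should hold 64 admin commands" );
static_assert ( GVE_EVENT_MAX == 16,
		"one 64-byte allocation should hold 16 event counters" );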
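
The register and structure definitions also suggest the overall admin queue flow: fill the next union gve_admin_command slot in the ring, advance the producer counter, write that counter (big-endian, like all configuration registers) to GVE_CFG_ADMIN_DB, then poll GVE_CFG_ADMIN_EVT until the device's event counter catches up, and finally check the status field against GVE_ADMIN_STATUS_OK. The routine below is a speculative sketch of that flow, inferred only from the definitions in this header rather than from any documented behaviour; the helper name and the exact error handling are illustrative, not the driver's actual implementation.

#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <byteswap.h>
#include <ipxe/io.h>
#include "gve.h"

/* Speculative sketch: issue one admin queue command and wait for the
 * device to acknowledge it via the event counter register.
 */
static int gve_admin_issue_sketch ( struct gve_nic *gve,
				    union gve_admin_command *cmd ) {
	struct gve_admin *admin = &gve->admin;
	unsigned int index = ( admin->prod % GVE_ADMIN_COUNT );
	union gve_admin_command *slot = &admin->cmd[index];
	unsigned int i;

	/* Copy command into the next ring slot and ring the doorbell
	 * (configuration registers are big-endian).
	 */
	memcpy ( slot, cmd, sizeof ( *slot ) );
	admin->prod++;
	writel ( cpu_to_be32 ( admin->prod ), gve->cfg + GVE_CFG_ADMIN_DB );

	/* Poll the event counter until it catches up with the producer */
	for ( i = 0 ; i < GVE_ADMIN_MAX_WAIT_MS ; i++ ) {
		if ( be32_to_cpu ( readl ( gve->cfg + GVE_CFG_ADMIN_EVT ) )
		     == admin->prod ) {
			/* Completion status is written back into the slot */
			if ( slot->hdr.status ==
			     cpu_to_be32 ( GVE_ADMIN_STATUS_OK ) )
				return 0;
			return -EIO;
		}
		mdelay ( 1 );
	}
	return -ETIMEDOUT;
}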