Pileus Git - ~andy/linux/commitdiff
Merge branch 'for-davem' of git://git.kernel.org/pub/scm/linux/kernel/git/bwh/sfc...
author David S. Miller <davem@davemloft.net>
Mon, 23 Sep 2013 20:09:36 +0000 (16:09 -0400)
committer David S. Miller <davem@davemloft.net>
Mon, 23 Sep 2013 20:09:36 +0000 (16:09 -0400)
Ben Hutchings says:

====================
1. Some cleanup from Fengguang Wu and his kbuild robot.
2. Support for ethtool register dump on EF10.
3. Change soft-TSO to take advantage of firmware assistance on EF10.
4. Support for PIO TX buffers and descriptors on EF10, enabled on
architectures that support write-combining mappings.
5. Accelerated RFS support for TCP/IPv6 and UDP/IPv6 on EF10.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
drivers/net/ethernet/sfc/ef10.c
drivers/net/ethernet/sfc/ef10_regs.h
drivers/net/ethernet/sfc/efx.h
drivers/net/ethernet/sfc/ethtool.c
drivers/net/ethernet/sfc/io.h
drivers/net/ethernet/sfc/net_driver.h
drivers/net/ethernet/sfc/nic.c
drivers/net/ethernet/sfc/nic.h
drivers/net/ethernet/sfc/rx.c
drivers/net/ethernet/sfc/tx.c

index 9f18ae984f9ed38386b16e5284d4d10864687c4b..844ee7539d3350427e0802e4b745b2aee1a9cb99 100644 (file)
@@ -285,6 +285,181 @@ static int efx_ef10_free_vis(struct efx_nic *efx)
        return rc;
 }
 
+#ifdef EFX_USE_PIO
+
+static void efx_ef10_free_piobufs(struct efx_nic *efx)
+{
+       struct efx_ef10_nic_data *nic_data = efx->nic_data;
+       MCDI_DECLARE_BUF(inbuf, MC_CMD_FREE_PIOBUF_IN_LEN);
+       unsigned int i;
+       int rc;
+
+       BUILD_BUG_ON(MC_CMD_FREE_PIOBUF_OUT_LEN != 0);
+
+       for (i = 0; i < nic_data->n_piobufs; i++) {
+               MCDI_SET_DWORD(inbuf, FREE_PIOBUF_IN_PIOBUF_HANDLE,
+                              nic_data->piobuf_handle[i]);
+               rc = efx_mcdi_rpc(efx, MC_CMD_FREE_PIOBUF, inbuf, sizeof(inbuf),
+                                 NULL, 0, NULL);
+               WARN_ON(rc);
+       }
+
+       nic_data->n_piobufs = 0;
+}
+
+static int efx_ef10_alloc_piobufs(struct efx_nic *efx, unsigned int n)
+{
+       struct efx_ef10_nic_data *nic_data = efx->nic_data;
+       MCDI_DECLARE_BUF(outbuf, MC_CMD_ALLOC_PIOBUF_OUT_LEN);
+       unsigned int i;
+       size_t outlen;
+       int rc = 0;
+
+       BUILD_BUG_ON(MC_CMD_ALLOC_PIOBUF_IN_LEN != 0);
+
+       for (i = 0; i < n; i++) {
+               rc = efx_mcdi_rpc(efx, MC_CMD_ALLOC_PIOBUF, NULL, 0,
+                                 outbuf, sizeof(outbuf), &outlen);
+               if (rc)
+                       break;
+               if (outlen < MC_CMD_ALLOC_PIOBUF_OUT_LEN) {
+                       rc = -EIO;
+                       break;
+               }
+               nic_data->piobuf_handle[i] =
+                       MCDI_DWORD(outbuf, ALLOC_PIOBUF_OUT_PIOBUF_HANDLE);
+               netif_dbg(efx, probe, efx->net_dev,
+                         "allocated PIO buffer %u handle %x\n", i,
+                         nic_data->piobuf_handle[i]);
+       }
+
+       nic_data->n_piobufs = i;
+       if (rc)
+               efx_ef10_free_piobufs(efx);
+       return rc;
+}
+
+static int efx_ef10_link_piobufs(struct efx_nic *efx)
+{
+       struct efx_ef10_nic_data *nic_data = efx->nic_data;
+       MCDI_DECLARE_BUF(inbuf,
+                        max(MC_CMD_LINK_PIOBUF_IN_LEN,
+                            MC_CMD_UNLINK_PIOBUF_IN_LEN));
+       struct efx_channel *channel;
+       struct efx_tx_queue *tx_queue;
+       unsigned int offset, index;
+       int rc;
+
+       BUILD_BUG_ON(MC_CMD_LINK_PIOBUF_OUT_LEN != 0);
+       BUILD_BUG_ON(MC_CMD_UNLINK_PIOBUF_OUT_LEN != 0);
+
+       /* Link a buffer to each VI in the write-combining mapping */
+       for (index = 0; index < nic_data->n_piobufs; ++index) {
+               MCDI_SET_DWORD(inbuf, LINK_PIOBUF_IN_PIOBUF_HANDLE,
+                              nic_data->piobuf_handle[index]);
+               MCDI_SET_DWORD(inbuf, LINK_PIOBUF_IN_TXQ_INSTANCE,
+                              nic_data->pio_write_vi_base + index);
+               rc = efx_mcdi_rpc(efx, MC_CMD_LINK_PIOBUF,
+                                 inbuf, MC_CMD_LINK_PIOBUF_IN_LEN,
+                                 NULL, 0, NULL);
+               if (rc) {
+                       netif_err(efx, drv, efx->net_dev,
+                                 "failed to link VI %u to PIO buffer %u (%d)\n",
+                                 nic_data->pio_write_vi_base + index, index,
+                                 rc);
+                       goto fail;
+               }
+               netif_dbg(efx, probe, efx->net_dev,
+                         "linked VI %u to PIO buffer %u\n",
+                         nic_data->pio_write_vi_base + index, index);
+       }
+
+       /* Link a buffer to each TX queue */
+       efx_for_each_channel(channel, efx) {
+               efx_for_each_channel_tx_queue(tx_queue, channel) {
+                       /* We assign the PIO buffers to queues in
+                        * reverse order to allow for the following
+                        * special case.
+                        */
+                       offset = ((efx->tx_channel_offset + efx->n_tx_channels -
+                                  tx_queue->channel->channel - 1) *
+                                 efx_piobuf_size);
+                       index = offset / ER_DZ_TX_PIOBUF_SIZE;
+                       offset = offset % ER_DZ_TX_PIOBUF_SIZE;
+
+                       /* When the host page size is 4K, the first
+                        * host page in the WC mapping may be within
+                        * the same VI page as the last TX queue.  We
+                        * can only link one buffer to each VI.
+                        */
+                       if (tx_queue->queue == nic_data->pio_write_vi_base) {
+                               BUG_ON(index != 0);
+                               rc = 0;
+                       } else {
+                               MCDI_SET_DWORD(inbuf,
+                                              LINK_PIOBUF_IN_PIOBUF_HANDLE,
+                                              nic_data->piobuf_handle[index]);
+                               MCDI_SET_DWORD(inbuf,
+                                              LINK_PIOBUF_IN_TXQ_INSTANCE,
+                                              tx_queue->queue);
+                               rc = efx_mcdi_rpc(efx, MC_CMD_LINK_PIOBUF,
+                                                 inbuf, MC_CMD_LINK_PIOBUF_IN_LEN,
+                                                 NULL, 0, NULL);
+                       }
+
+                       if (rc) {
+                               /* This is non-fatal; the TX path just
+                                * won't use PIO for this queue
+                                */
+                               netif_err(efx, drv, efx->net_dev,
+                                         "failed to link VI %u to PIO buffer %u (%d)\n",
+                                         tx_queue->queue, index, rc);
+                               tx_queue->piobuf = NULL;
+                       } else {
+                               tx_queue->piobuf =
+                                       nic_data->pio_write_base +
+                                       index * EFX_VI_PAGE_SIZE + offset;
+                               tx_queue->piobuf_offset = offset;
+                               netif_dbg(efx, probe, efx->net_dev,
+                                         "linked VI %u to PIO buffer %u offset %x addr %p\n",
+                                         tx_queue->queue, index,
+                                         tx_queue->piobuf_offset,
+                                         tx_queue->piobuf);
+                       }
+               }
+       }
+
+       return 0;
+
+fail:
+       while (index--) {
+               MCDI_SET_DWORD(inbuf, UNLINK_PIOBUF_IN_TXQ_INSTANCE,
+                              nic_data->pio_write_vi_base + index);
+               efx_mcdi_rpc(efx, MC_CMD_UNLINK_PIOBUF,
+                            inbuf, MC_CMD_UNLINK_PIOBUF_IN_LEN,
+                            NULL, 0, NULL);
+       }
+       return rc;
+}
+
+#else /* !EFX_USE_PIO */
+
+static int efx_ef10_alloc_piobufs(struct efx_nic *efx, unsigned int n)
+{
+       return n == 0 ? 0 : -ENOBUFS;
+}
+
+static int efx_ef10_link_piobufs(struct efx_nic *efx)
+{
+       return 0;
+}
+
+static void efx_ef10_free_piobufs(struct efx_nic *efx)
+{
+}
+
+#endif /* EFX_USE_PIO */
+
 static void efx_ef10_remove(struct efx_nic *efx)
 {
        struct efx_ef10_nic_data *nic_data = efx->nic_data;
@@ -295,9 +470,15 @@ static void efx_ef10_remove(struct efx_nic *efx)
        /* This needs to be after efx_ptp_remove_channel() with no filters */
        efx_ef10_rx_free_indir_table(efx);
 
+       if (nic_data->wc_membase)
+               iounmap(nic_data->wc_membase);
+
        rc = efx_ef10_free_vis(efx);
        WARN_ON(rc != 0);
 
+       if (!nic_data->must_restore_piobufs)
+               efx_ef10_free_piobufs(efx);
+
        efx_mcdi_fini(efx);
        efx_nic_free_buffer(efx, &nic_data->mcdi_buf);
        kfree(nic_data);
@@ -330,12 +511,126 @@ static int efx_ef10_alloc_vis(struct efx_nic *efx,
        return 0;
 }
 
+/* Note that the failure path of this function does not free
+ * resources, as this will be done by efx_ef10_remove().
+ */
 static int efx_ef10_dimension_resources(struct efx_nic *efx)
 {
-       unsigned int n_vis =
-               max(efx->n_channels, efx->n_tx_channels * EFX_TXQ_TYPES);
+       struct efx_ef10_nic_data *nic_data = efx->nic_data;
+       unsigned int uc_mem_map_size, wc_mem_map_size;
+       unsigned int min_vis, pio_write_vi_base, max_vis;
+       void __iomem *membase;
+       int rc;
+
+       min_vis = max(efx->n_channels, efx->n_tx_channels * EFX_TXQ_TYPES);
+
+#ifdef EFX_USE_PIO
+       /* Try to allocate PIO buffers if wanted and if the full
+        * number of PIO buffers would be sufficient to allocate one
+        * copy-buffer per TX channel.  Failure is non-fatal, as there
+        * are only a small number of PIO buffers shared between all
+        * functions of the controller.
+        */
+       if (efx_piobuf_size != 0 &&
+           ER_DZ_TX_PIOBUF_SIZE / efx_piobuf_size * EF10_TX_PIOBUF_COUNT >=
+           efx->n_tx_channels) {
+               unsigned int n_piobufs =
+                       DIV_ROUND_UP(efx->n_tx_channels,
+                                    ER_DZ_TX_PIOBUF_SIZE / efx_piobuf_size);
+
+               rc = efx_ef10_alloc_piobufs(efx, n_piobufs);
+               if (rc)
+                       netif_err(efx, probe, efx->net_dev,
+                                 "failed to allocate PIO buffers (%d)\n", rc);
+               else
+                       netif_dbg(efx, probe, efx->net_dev,
+                                 "allocated %u PIO buffers\n", n_piobufs);
+       }
+#else
+       nic_data->n_piobufs = 0;
+#endif
 
-       return efx_ef10_alloc_vis(efx, n_vis, n_vis);
+       /* PIO buffers should be mapped with write-combining enabled,
+        * and we want to make single UC and WC mappings rather than
+        * several of each (in fact that's the only option if host
+        * page size is >4K).  So we may allocate some extra VIs just
+        * for writing PIO buffers through.
+        */
+       uc_mem_map_size = PAGE_ALIGN((min_vis - 1) * EFX_VI_PAGE_SIZE +
+                                    ER_DZ_TX_PIOBUF);
+       if (nic_data->n_piobufs) {
+               pio_write_vi_base = uc_mem_map_size / EFX_VI_PAGE_SIZE;
+               wc_mem_map_size = (PAGE_ALIGN((pio_write_vi_base +
+                                              nic_data->n_piobufs) *
+                                             EFX_VI_PAGE_SIZE) -
+                                  uc_mem_map_size);
+               max_vis = pio_write_vi_base + nic_data->n_piobufs;
+       } else {
+               pio_write_vi_base = 0;
+               wc_mem_map_size = 0;
+               max_vis = min_vis;
+       }
+
+       /* In case the last attached driver failed to free VIs, do it now */
+       rc = efx_ef10_free_vis(efx);
+       if (rc != 0)
+               return rc;
+
+       rc = efx_ef10_alloc_vis(efx, min_vis, max_vis);
+       if (rc != 0)
+               return rc;
+
+       /* If we didn't get enough VIs to map all the PIO buffers, free the
+        * PIO buffers
+        */
+       if (nic_data->n_piobufs &&
+           nic_data->n_allocated_vis <
+           pio_write_vi_base + nic_data->n_piobufs) {
+               netif_dbg(efx, probe, efx->net_dev,
+                         "%u VIs are not sufficient to map %u PIO buffers\n",
+                         nic_data->n_allocated_vis, nic_data->n_piobufs);
+               efx_ef10_free_piobufs(efx);
+       }
+
+       /* Shrink the original UC mapping of the memory BAR */
+       membase = ioremap_nocache(efx->membase_phys, uc_mem_map_size);
+       if (!membase) {
+               netif_err(efx, probe, efx->net_dev,
+                         "could not shrink memory BAR to %x\n",
+                         uc_mem_map_size);
+               return -ENOMEM;
+       }
+       iounmap(efx->membase);
+       efx->membase = membase;
+
+       /* Set up the WC mapping if needed */
+       if (wc_mem_map_size) {
+               nic_data->wc_membase = ioremap_wc(efx->membase_phys +
+                                                 uc_mem_map_size,
+                                                 wc_mem_map_size);
+               if (!nic_data->wc_membase) {
+                       netif_err(efx, probe, efx->net_dev,
+                                 "could not allocate WC mapping of size %x\n",
+                                 wc_mem_map_size);
+                       return -ENOMEM;
+               }
+               nic_data->pio_write_vi_base = pio_write_vi_base;
+               nic_data->pio_write_base =
+                       nic_data->wc_membase +
+                       (pio_write_vi_base * EFX_VI_PAGE_SIZE + ER_DZ_TX_PIOBUF -
+                        uc_mem_map_size);
+
+               rc = efx_ef10_link_piobufs(efx);
+               if (rc)
+                       efx_ef10_free_piobufs(efx);
+       }
+
+       netif_dbg(efx, probe, efx->net_dev,
+                 "memory BAR at %pa (virtual %p+%x UC, %p+%x WC)\n",
+                 &efx->membase_phys, efx->membase, uc_mem_map_size,
+                 nic_data->wc_membase, wc_mem_map_size);
+
+       return 0;
 }
 
 static int efx_ef10_init_nic(struct efx_nic *efx)
@@ -359,6 +654,21 @@ static int efx_ef10_init_nic(struct efx_nic *efx)
                nic_data->must_realloc_vis = false;
        }
 
+       if (nic_data->must_restore_piobufs && nic_data->n_piobufs) {
+               rc = efx_ef10_alloc_piobufs(efx, nic_data->n_piobufs);
+               if (rc == 0) {
+                       rc = efx_ef10_link_piobufs(efx);
+                       if (rc)
+                               efx_ef10_free_piobufs(efx);
+               }
+
+               /* Log an error on failure, but this is non-fatal */
+               if (rc)
+                       netif_err(efx, drv, efx->net_dev,
+                                 "failed to restore PIO buffers (%d)\n", rc);
+               nic_data->must_restore_piobufs = false;
+       }
+
        efx_ef10_rx_push_indir_table(efx);
        return 0;
 }
@@ -716,6 +1026,7 @@ static int efx_ef10_mcdi_poll_reboot(struct efx_nic *efx)
        /* All our allocations have been reset */
        nic_data->must_realloc_vis = true;
        nic_data->must_restore_filters = true;
+       nic_data->must_restore_piobufs = true;
        nic_data->rx_rss_context = EFX_EF10_RSS_CONTEXT_INVALID;
 
        /* The datapath firmware might have been changed */
@@ -2137,7 +2448,7 @@ out_unlock:
        return rc;
 }
 
-void efx_ef10_filter_update_rx_scatter(struct efx_nic *efx)
+static void efx_ef10_filter_update_rx_scatter(struct efx_nic *efx)
 {
        /* no need to do anything here on EF10 */
 }
index b3f4e3755fd97cd7c38130597d54e22c20b591d6..207ac9a1e3de989d0f5e4d588c27b6035eec31ec 100644 (file)
 #define        ESF_DZ_TX_PIO_TYPE_WIDTH 1
 #define        ESF_DZ_TX_PIO_OPT_LBN 60
 #define        ESF_DZ_TX_PIO_OPT_WIDTH 3
+#define        ESE_DZ_TX_OPTION_DESC_PIO 1
 #define        ESF_DZ_TX_PIO_CONT_LBN 59
 #define        ESF_DZ_TX_PIO_CONT_WIDTH 1
 #define        ESF_DZ_TX_PIO_BYTE_CNT_LBN 32
index 34d00f5771fe4c521177f17061acb9abac846c50..31d01284e333c80aeceb3d31083339593e0fabaf 100644 (file)
@@ -30,6 +30,7 @@ efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb);
 extern void efx_xmit_done(struct efx_tx_queue *tx_queue, unsigned int index);
 extern int efx_setup_tc(struct net_device *net_dev, u8 num_tc);
 extern unsigned int efx_tx_max_skb_descs(struct efx_nic *efx);
+extern unsigned int efx_piobuf_size;
 
 /* RX */
 extern void efx_rx_config_page_split(struct efx_nic *efx);
index 5b471cf5c323a2aabae6ae3c55404194e6f90c42..1f529fa2edb10008a5c03944bea2fb0d995e1e61 100644 (file)
@@ -70,6 +70,7 @@ static const struct efx_sw_stat_desc efx_sw_stat_desc[] = {
        EFX_ETHTOOL_UINT_TXQ_STAT(tso_long_headers),
        EFX_ETHTOOL_UINT_TXQ_STAT(tso_packets),
        EFX_ETHTOOL_UINT_TXQ_STAT(pushes),
+       EFX_ETHTOOL_UINT_TXQ_STAT(pio_packets),
        EFX_ETHTOOL_ATOMIC_NIC_ERROR_STAT(rx_reset),
        EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_tobe_disc),
        EFX_ETHTOOL_UINT_CHANNEL_STAT(rx_ip_hdr_chksum_err),
@@ -1035,8 +1036,8 @@ static int efx_ethtool_set_rxfh_indir(struct net_device *net_dev,
        return 0;
 }
 
-int efx_ethtool_get_ts_info(struct net_device *net_dev,
-                           struct ethtool_ts_info *ts_info)
+static int efx_ethtool_get_ts_info(struct net_device *net_dev,
+                                  struct ethtool_ts_info *ts_info)
 {
        struct efx_nic *efx = netdev_priv(net_dev);
 
index 96ce507d8602eb4a8f40a7cfa034f5cf3dede3c3..4d3f119b67b38ec35719ebac40d59f49ea9b844f 100644 (file)
 #define EFX_USE_QWORD_IO 1
 #endif
 
+/* PIO is a win only if write-combining is possible */
+#ifdef ARCH_HAS_IOREMAP_WC
+#define EFX_USE_PIO 1
+#endif
+
 #ifdef EFX_USE_QWORD_IO
 static inline void _efx_writeq(struct efx_nic *efx, __le64 value,
                                  unsigned int reg)
index b172ed13305554cf6009fc445ac4c09123b97550..aac22a1e85b8e081f5c83723721c637161ba1e46 100644 (file)
@@ -182,6 +182,9 @@ struct efx_tx_buffer {
  * @tsoh_page: Array of pages of TSO header buffers
  * @txd: The hardware descriptor ring
  * @ptr_mask: The size of the ring minus 1.
+ * @piobuf: PIO buffer region for this TX queue (shared with its partner).
+ *     Size of the region is efx_piobuf_size.
+ * @piobuf_offset: Buffer offset to be specified in PIO descriptors
  * @initialised: Has hardware queue been initialised?
  * @read_count: Current read pointer.
  *     This is the number of buffers that have been removed from both rings.
@@ -209,6 +212,7 @@ struct efx_tx_buffer {
  *     blocks
  * @tso_packets: Number of packets via the TSO xmit path
  * @pushes: Number of times the TX push feature has been used
+ * @pio_packets: Number of times the TX PIO feature has been used
  * @empty_read_count: If the completion path has seen the queue as empty
  *     and the transmission path has not yet checked this, the value of
  *     @read_count bitwise-added to %EFX_EMPTY_COUNT_VALID; otherwise 0.
@@ -223,6 +227,8 @@ struct efx_tx_queue {
        struct efx_buffer *tsoh_page;
        struct efx_special_buffer txd;
        unsigned int ptr_mask;
+       void __iomem *piobuf;
+       unsigned int piobuf_offset;
        bool initialised;
 
        /* Members used mainly on the completion path */
@@ -238,6 +244,7 @@ struct efx_tx_queue {
        unsigned int tso_long_headers;
        unsigned int tso_packets;
        unsigned int pushes;
+       unsigned int pio_packets;
 
        /* Members shared between paths and sometimes updated */
        unsigned int empty_read_count ____cacheline_aligned_in_smp;
index e7dbd2dd202e8bb7332b83560f1fa93d56071674..c75009b8c0d9d1d9e8572d457585ded67cbdc55d 100644 (file)
@@ -19,6 +19,7 @@
 #include "bitfield.h"
 #include "efx.h"
 #include "nic.h"
+#include "ef10_regs.h"
 #include "farch_regs.h"
 #include "io.h"
 #include "workarounds.h"
@@ -166,26 +167,30 @@ void efx_nic_fini_interrupt(struct efx_nic *efx)
 
 /* Register dump */
 
-#define REGISTER_REVISION_A    1
-#define REGISTER_REVISION_B    2
-#define REGISTER_REVISION_C    3
-#define REGISTER_REVISION_Z    3       /* latest revision */
+#define REGISTER_REVISION_FA   1
+#define REGISTER_REVISION_FB   2
+#define REGISTER_REVISION_FC   3
+#define REGISTER_REVISION_FZ   3       /* last Falcon arch revision */
+#define REGISTER_REVISION_ED   4
+#define REGISTER_REVISION_EZ   4       /* latest EF10 revision */
 
 struct efx_nic_reg {
        u32 offset:24;
-       u32 min_revision:2, max_revision:2;
+       u32 min_revision:3, max_revision:3;
 };
 
-#define REGISTER(name, min_rev, max_rev) {                             \
-       FR_ ## min_rev ## max_rev ## _ ## name,                         \
-       REGISTER_REVISION_ ## min_rev, REGISTER_REVISION_ ## max_rev    \
+#define REGISTER(name, arch, min_rev, max_rev) {                       \
+       arch ## R_ ## min_rev ## max_rev ## _ ## name,                  \
+       REGISTER_REVISION_ ## arch ## min_rev,                          \
+       REGISTER_REVISION_ ## arch ## max_rev                           \
 }
-#define REGISTER_AA(name) REGISTER(name, A, A)
-#define REGISTER_AB(name) REGISTER(name, A, B)
-#define REGISTER_AZ(name) REGISTER(name, A, Z)
-#define REGISTER_BB(name) REGISTER(name, B, B)
-#define REGISTER_BZ(name) REGISTER(name, B, Z)
-#define REGISTER_CZ(name) REGISTER(name, C, Z)
+#define REGISTER_AA(name) REGISTER(name, F, A, A)
+#define REGISTER_AB(name) REGISTER(name, F, A, B)
+#define REGISTER_AZ(name) REGISTER(name, F, A, Z)
+#define REGISTER_BB(name) REGISTER(name, F, B, B)
+#define REGISTER_BZ(name) REGISTER(name, F, B, Z)
+#define REGISTER_CZ(name) REGISTER(name, F, C, Z)
+#define REGISTER_DZ(name) REGISTER(name, E, D, Z)
 
 static const struct efx_nic_reg efx_nic_regs[] = {
        REGISTER_AZ(ADR_REGION),
@@ -292,37 +297,42 @@ static const struct efx_nic_reg efx_nic_regs[] = {
        REGISTER_AB(XX_TXDRV_CTL),
        /* XX_PRBS_CTL, XX_PRBS_CHK and XX_PRBS_ERR are not used */
        /* XX_CORE_STAT is partly RC */
+       REGISTER_DZ(BIU_HW_REV_ID),
+       REGISTER_DZ(MC_DB_LWRD),
+       REGISTER_DZ(MC_DB_HWRD),
 };
 
 struct efx_nic_reg_table {
        u32 offset:24;
-       u32 min_revision:2, max_revision:2;
+       u32 min_revision:3, max_revision:3;
        u32 step:6, rows:21;
 };
 
-#define REGISTER_TABLE_DIMENSIONS(_, offset, min_rev, max_rev, step, rows) { \
+#define REGISTER_TABLE_DIMENSIONS(_, offset, arch, min_rev, max_rev, step, rows) { \
        offset,                                                         \
-       REGISTER_REVISION_ ## min_rev, REGISTER_REVISION_ ## max_rev,   \
+       REGISTER_REVISION_ ## arch ## min_rev,                          \
+       REGISTER_REVISION_ ## arch ## max_rev,                          \
        step, rows                                                      \
 }
-#define REGISTER_TABLE(name, min_rev, max_rev)                         \
+#define REGISTER_TABLE(name, arch, min_rev, max_rev)                   \
        REGISTER_TABLE_DIMENSIONS(                                      \
-               name, FR_ ## min_rev ## max_rev ## _ ## name,           \
-               min_rev, max_rev,                                       \
-               FR_ ## min_rev ## max_rev ## _ ## name ## _STEP,        \
-               FR_ ## min_rev ## max_rev ## _ ## name ## _ROWS)
-#define REGISTER_TABLE_AA(name) REGISTER_TABLE(name, A, A)
-#define REGISTER_TABLE_AZ(name) REGISTER_TABLE(name, A, Z)
-#define REGISTER_TABLE_BB(name) REGISTER_TABLE(name, B, B)
-#define REGISTER_TABLE_BZ(name) REGISTER_TABLE(name, B, Z)
+               name, arch ## R_ ## min_rev ## max_rev ## _ ## name,    \
+               arch, min_rev, max_rev,                                 \
+               arch ## R_ ## min_rev ## max_rev ## _ ## name ## _STEP, \
+               arch ## R_ ## min_rev ## max_rev ## _ ## name ## _ROWS)
+#define REGISTER_TABLE_AA(name) REGISTER_TABLE(name, F, A, A)
+#define REGISTER_TABLE_AZ(name) REGISTER_TABLE(name, F, A, Z)
+#define REGISTER_TABLE_BB(name) REGISTER_TABLE(name, F, B, B)
+#define REGISTER_TABLE_BZ(name) REGISTER_TABLE(name, F, B, Z)
 #define REGISTER_TABLE_BB_CZ(name)                                     \
-       REGISTER_TABLE_DIMENSIONS(name, FR_BZ_ ## name, B, B,           \
+       REGISTER_TABLE_DIMENSIONS(name, FR_BZ_ ## name, F, B, B,        \
                                  FR_BZ_ ## name ## _STEP,              \
                                  FR_BB_ ## name ## _ROWS),             \
-       REGISTER_TABLE_DIMENSIONS(name, FR_BZ_ ## name, C, Z,           \
+       REGISTER_TABLE_DIMENSIONS(name, FR_BZ_ ## name, F, C, Z,        \
                                  FR_BZ_ ## name ## _STEP,              \
                                  FR_CZ_ ## name ## _ROWS)
-#define REGISTER_TABLE_CZ(name) REGISTER_TABLE(name, C, Z)
+#define REGISTER_TABLE_CZ(name) REGISTER_TABLE(name, F, C, Z)
+#define REGISTER_TABLE_DZ(name) REGISTER_TABLE(name, E, D, Z)
 
 static const struct efx_nic_reg_table efx_nic_reg_tables[] = {
        /* DRIVER is not used */
@@ -340,9 +350,9 @@ static const struct efx_nic_reg_table efx_nic_reg_tables[] = {
         * 1K entries allows for some expansion of queue count and
         * size before we need to change the version. */
        REGISTER_TABLE_DIMENSIONS(BUF_FULL_TBL_KER, FR_AA_BUF_FULL_TBL_KER,
-                                 A, A, 8, 1024),
+                                 F, A, A, 8, 1024),
        REGISTER_TABLE_DIMENSIONS(BUF_FULL_TBL, FR_BZ_BUF_FULL_TBL,
-                                 B, Z, 8, 1024),
+                                 F, B, Z, 8, 1024),
        REGISTER_TABLE_CZ(RX_MAC_FILTER_TBL0),
        REGISTER_TABLE_BB_CZ(TIMER_TBL),
        REGISTER_TABLE_BB_CZ(TX_PACE_TBL),
@@ -353,6 +363,7 @@ static const struct efx_nic_reg_table efx_nic_reg_tables[] = {
        /* MSIX_PBA_TABLE is not mapped */
        /* SRM_DBG is not mapped (and is redundant with BUF_FLL_TBL) */
        REGISTER_TABLE_BZ(RX_FILTER_TBL0),
+       REGISTER_TABLE_DZ(BIU_MC_SFT_STATUS),
 };
 
 size_t efx_nic_get_regs_len(struct efx_nic *efx)
index fda29d39032f422d2c395a4fe68a4c5091654006..609f06769245395da24220320e8944ee01c24aa7 100644 (file)
@@ -71,6 +71,26 @@ efx_tx_desc(struct efx_tx_queue *tx_queue, unsigned int index)
        return ((efx_qword_t *) (tx_queue->txd.buf.addr)) + index;
 }
 
+/* Report whether the NIC considers this TX queue empty, given the
+ * write_count used for the last doorbell push.  May return false
+ * negative.
+ */
+static inline bool __efx_nic_tx_is_empty(struct efx_tx_queue *tx_queue,
+                                        unsigned int write_count)
+{
+       unsigned int empty_read_count = ACCESS_ONCE(tx_queue->empty_read_count);
+
+       if (empty_read_count == 0)
+               return false;
+
+       return ((empty_read_count ^ write_count) & ~EFX_EMPTY_COUNT_VALID) == 0;
+}
+
+static inline bool efx_nic_tx_is_empty(struct efx_tx_queue *tx_queue)
+{
+       return __efx_nic_tx_is_empty(tx_queue, tx_queue->write_count);
+}
+
 /* Decide whether to push a TX descriptor to the NIC vs merely writing
  * the doorbell.  This can reduce latency when we are adding a single
  * descriptor to an empty queue, but is otherwise pointless.  Further,
@@ -80,14 +100,10 @@ efx_tx_desc(struct efx_tx_queue *tx_queue, unsigned int index)
 static inline bool efx_nic_may_push_tx_desc(struct efx_tx_queue *tx_queue,
                                            unsigned int write_count)
 {
-       unsigned empty_read_count = ACCESS_ONCE(tx_queue->empty_read_count);
-
-       if (empty_read_count == 0)
-               return false;
+       bool was_empty = __efx_nic_tx_is_empty(tx_queue, write_count);
 
        tx_queue->empty_read_count = 0;
-       return ((empty_read_count ^ write_count) & ~EFX_EMPTY_COUNT_VALID) == 0
-               && tx_queue->write_count - write_count == 1;
+       return was_empty && tx_queue->write_count - write_count == 1;
 }
 
 /* Returns a pointer to the specified descriptor in the RX descriptor queue */
@@ -389,6 +405,12 @@ enum {
        EF10_STAT_COUNT
 };
 
+/* Maximum number of TX PIO buffers we may allocate to a function.
+ * This matches the total number of buffers on each SFC9100-family
+ * controller.
+ */
+#define EF10_TX_PIOBUF_COUNT 16
+
 /**
  * struct efx_ef10_nic_data - EF10 architecture NIC state
  * @mcdi_buf: DMA buffer for MCDI
@@ -397,6 +419,13 @@ enum {
  * @n_allocated_vis: Number of VIs allocated to this function
  * @must_realloc_vis: Flag: VIs have yet to be reallocated after MC reboot
  * @must_restore_filters: Flag: filters have yet to be restored after MC reboot
+ * @n_piobufs: Number of PIO buffers allocated to this function
+ * @wc_membase: Base address of write-combining mapping of the memory BAR
+ * @pio_write_base: Base address for writing PIO buffers
+ * @pio_write_vi_base: Relative VI number for @pio_write_base
+ * @piobuf_handle: Handle of each PIO buffer allocated
+ * @must_restore_piobufs: Flag: PIO buffers have yet to be restored after MC
+ *     reboot
  * @rx_rss_context: Firmware handle for our RSS context
  * @stats: Hardware statistics
  * @workaround_35388: Flag: firmware supports workaround for bug 35388
@@ -412,6 +441,11 @@ struct efx_ef10_nic_data {
        unsigned int n_allocated_vis;
        bool must_realloc_vis;
        bool must_restore_filters;
+       unsigned int n_piobufs;
+       void __iomem *wc_membase, *pio_write_base;
+       unsigned int pio_write_vi_base;
+       unsigned int piobuf_handle[EF10_TX_PIOBUF_COUNT];
+       bool must_restore_piobufs;
        u32 rx_rss_context;
        u64 stats[EF10_STAT_COUNT];
        bool workaround_35388;
index 4a596725023f07284ef66b3bb2cc337e92a0d69e..8f09e686fc2392a80f56610c78a61c6374b4a410 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/in.h>
 #include <linux/slab.h>
 #include <linux/ip.h>
+#include <linux/ipv6.h>
 #include <linux/tcp.h>
 #include <linux/udp.h>
 #include <linux/prefetch.h>
@@ -818,44 +819,70 @@ int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
        struct efx_nic *efx = netdev_priv(net_dev);
        struct efx_channel *channel;
        struct efx_filter_spec spec;
-       const struct iphdr *ip;
        const __be16 *ports;
+       __be16 ether_type;
        int nhoff;
        int rc;
 
-       nhoff = skb_network_offset(skb);
+       /* The core RPS/RFS code has already parsed and validated
+        * VLAN, IP and transport headers.  We assume they are in the
+        * header area.
+        */
 
        if (skb->protocol == htons(ETH_P_8021Q)) {
-               EFX_BUG_ON_PARANOID(skb_headlen(skb) <
-                                   nhoff + sizeof(struct vlan_hdr));
-               if (((const struct vlan_hdr *)skb->data + nhoff)->
-                   h_vlan_encapsulated_proto != htons(ETH_P_IP))
-                       return -EPROTONOSUPPORT;
+               const struct vlan_hdr *vh =
+                       (const struct vlan_hdr *)skb->data;
 
-               /* This is IP over 802.1q VLAN.  We can't filter on the
-                * IP 5-tuple and the vlan together, so just strip the
-                * vlan header and filter on the IP part.
+               /* We can't filter on the IP 5-tuple and the vlan
+                * together, so just strip the vlan header and filter
+                * on the IP part.
                 */
-               nhoff += sizeof(struct vlan_hdr);
-       } else if (skb->protocol != htons(ETH_P_IP)) {
-               return -EPROTONOSUPPORT;
+               EFX_BUG_ON_PARANOID(skb_headlen(skb) < sizeof(*vh));
+               ether_type = vh->h_vlan_encapsulated_proto;
+               nhoff = sizeof(struct vlan_hdr);
+       } else {
+               ether_type = skb->protocol;
+               nhoff = 0;
        }
 
-       /* RFS must validate the IP header length before calling us */
-       EFX_BUG_ON_PARANOID(skb_headlen(skb) < nhoff + sizeof(*ip));
-       ip = (const struct iphdr *)(skb->data + nhoff);
-       if (ip_is_fragment(ip))
+       if (ether_type != htons(ETH_P_IP) && ether_type != htons(ETH_P_IPV6))
                return -EPROTONOSUPPORT;
-       EFX_BUG_ON_PARANOID(skb_headlen(skb) < nhoff + 4 * ip->ihl + 4);
-       ports = (const __be16 *)(skb->data + nhoff + 4 * ip->ihl);
 
        efx_filter_init_rx(&spec, EFX_FILTER_PRI_HINT,
                           efx->rx_scatter ? EFX_FILTER_FLAG_RX_SCATTER : 0,
                           rxq_index);
-       rc = efx_filter_set_ipv4_full(&spec, ip->protocol,
-                                     ip->daddr, ports[1], ip->saddr, ports[0]);
-       if (rc)
-               return rc;
+       spec.match_flags =
+               EFX_FILTER_MATCH_ETHER_TYPE | EFX_FILTER_MATCH_IP_PROTO |
+               EFX_FILTER_MATCH_LOC_HOST | EFX_FILTER_MATCH_LOC_PORT |
+               EFX_FILTER_MATCH_REM_HOST | EFX_FILTER_MATCH_REM_PORT;
+       spec.ether_type = ether_type;
+
+       if (ether_type == htons(ETH_P_IP)) {
+               const struct iphdr *ip =
+                       (const struct iphdr *)(skb->data + nhoff);
+
+               EFX_BUG_ON_PARANOID(skb_headlen(skb) < nhoff + sizeof(*ip));
+               if (ip_is_fragment(ip))
+                       return -EPROTONOSUPPORT;
+               spec.ip_proto = ip->protocol;
+               spec.rem_host[0] = ip->saddr;
+               spec.loc_host[0] = ip->daddr;
+               EFX_BUG_ON_PARANOID(skb_headlen(skb) < nhoff + 4 * ip->ihl + 4);
+               ports = (const __be16 *)(skb->data + nhoff + 4 * ip->ihl);
+       } else {
+               const struct ipv6hdr *ip6 =
+                       (const struct ipv6hdr *)(skb->data + nhoff);
+
+               EFX_BUG_ON_PARANOID(skb_headlen(skb) <
+                                   nhoff + sizeof(*ip6) + 4);
+               spec.ip_proto = ip6->nexthdr;
+               memcpy(spec.rem_host, &ip6->saddr, sizeof(ip6->saddr));
+               memcpy(spec.loc_host, &ip6->daddr, sizeof(ip6->daddr));
+               ports = (const __be16 *)(ip6 + 1);
+       }
+
+       spec.rem_port = ports[0];
+       spec.loc_port = ports[1];
 
        rc = efx->type->filter_rfs_insert(efx, &spec);
        if (rc < 0)
@@ -866,11 +893,18 @@ int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
        channel = efx_get_channel(efx, skb_get_rx_queue(skb));
        ++channel->rfs_filters_added;
 
-       netif_info(efx, rx_status, efx->net_dev,
-                  "steering %s %pI4:%u:%pI4:%u to queue %u [flow %u filter %d]\n",
-                  (ip->protocol == IPPROTO_TCP) ? "TCP" : "UDP",
-                  &ip->saddr, ntohs(ports[0]), &ip->daddr, ntohs(ports[1]),
-                  rxq_index, flow_id, rc);
+       if (ether_type == htons(ETH_P_IP))
+               netif_info(efx, rx_status, efx->net_dev,
+                          "steering %s %pI4:%u:%pI4:%u to queue %u [flow %u filter %d]\n",
+                          (spec.ip_proto == IPPROTO_TCP) ? "TCP" : "UDP",
+                          spec.rem_host, ntohs(ports[0]), spec.loc_host,
+                          ntohs(ports[1]), rxq_index, flow_id, rc);
+       else
+               netif_info(efx, rx_status, efx->net_dev,
+                          "steering %s [%pI6]:%u:[%pI6]:%u to queue %u [flow %u filter %d]\n",
+                          (spec.ip_proto == IPPROTO_TCP) ? "TCP" : "UDP",
+                          spec.rem_host, ntohs(ports[0]), spec.loc_host,
+                          ntohs(ports[1]), rxq_index, flow_id, rc);
 
        return rc;
 }
index 2ac91c5b5eeae4e7a210e5b3693c9f01a04c9f9c..282692c48e6b6de94949e814ecdadf848186edc6 100644 (file)
 #include <net/ipv6.h>
 #include <linux/if_ether.h>
 #include <linux/highmem.h>
+#include <linux/cache.h>
 #include "net_driver.h"
 #include "efx.h"
+#include "io.h"
 #include "nic.h"
 #include "workarounds.h"
+#include "ef10_regs.h"
+
+#ifdef EFX_USE_PIO
+
+#define EFX_PIOBUF_SIZE_MAX ER_DZ_TX_PIOBUF_SIZE
+#define EFX_PIOBUF_SIZE_DEF ALIGN(256, L1_CACHE_BYTES)
+unsigned int efx_piobuf_size __read_mostly = EFX_PIOBUF_SIZE_DEF;
+
+#endif /* EFX_USE_PIO */
+
+static inline unsigned int
+efx_tx_queue_get_insert_index(const struct efx_tx_queue *tx_queue)
+{
+       return tx_queue->insert_count & tx_queue->ptr_mask;
+}
+
+static inline struct efx_tx_buffer *
+__efx_tx_queue_get_insert_buffer(const struct efx_tx_queue *tx_queue)
+{
+       return &tx_queue->buffer[efx_tx_queue_get_insert_index(tx_queue)];
+}
+
+static inline struct efx_tx_buffer *
+efx_tx_queue_get_insert_buffer(const struct efx_tx_queue *tx_queue)
+{
+       struct efx_tx_buffer *buffer =
+               __efx_tx_queue_get_insert_buffer(tx_queue);
+
+       EFX_BUG_ON_PARANOID(buffer->len);
+       EFX_BUG_ON_PARANOID(buffer->flags);
+       EFX_BUG_ON_PARANOID(buffer->unmap_len);
+
+       return buffer;
+}
 
 static void efx_dequeue_buffer(struct efx_tx_queue *tx_queue,
                               struct efx_tx_buffer *buffer,
@@ -83,8 +119,10 @@ unsigned int efx_tx_max_skb_descs(struct efx_nic *efx)
         */
        unsigned int max_descs = EFX_TSO_MAX_SEGS * 2 + MAX_SKB_FRAGS;
 
-       /* Possibly one more per segment for the alignment workaround */
-       if (EFX_WORKAROUND_5391(efx))
+       /* Possibly one more per segment for the alignment workaround,
+        * or for option descriptors
+        */
+       if (EFX_WORKAROUND_5391(efx) || efx_nic_rev(efx) >= EFX_REV_HUNT_A0)
                max_descs += EFX_TSO_MAX_SEGS;
 
        /* Possibly more for PCIe page boundaries within input fragments */
@@ -145,6 +183,145 @@ static void efx_tx_maybe_stop_queue(struct efx_tx_queue *txq1)
        }
 }
 
+#ifdef EFX_USE_PIO
+
+struct efx_short_copy_buffer {
+       int used;
+       u8 buf[L1_CACHE_BYTES];
+};
+
+/* Copy to PIO, respecting that writes to PIO buffers must be dword aligned.
+ * Advances piobuf pointer. Leaves additional data in the copy buffer.
+ */
+static void efx_memcpy_toio_aligned(struct efx_nic *efx, u8 __iomem **piobuf,
+                                   u8 *data, int len,
+                                   struct efx_short_copy_buffer *copy_buf)
+{
+       int block_len = len & ~(sizeof(copy_buf->buf) - 1);
+
+       memcpy_toio(*piobuf, data, block_len);
+       *piobuf += block_len;
+       len -= block_len;
+
+       if (len) {
+               data += block_len;
+               BUG_ON(copy_buf->used);
+               BUG_ON(len > sizeof(copy_buf->buf));
+               memcpy(copy_buf->buf, data, len);
+               copy_buf->used = len;
+       }
+}
+
+/* Copy to PIO, respecting dword alignment, popping data from copy buffer first.
+ * Advances piobuf pointer. Leaves additional data in the copy buffer.
+ */
+static void efx_memcpy_toio_aligned_cb(struct efx_nic *efx, u8 __iomem **piobuf,
+                                      u8 *data, int len,
+                                      struct efx_short_copy_buffer *copy_buf)
+{
+       if (copy_buf->used) {
+               /* if the copy buffer is partially full, fill it up and write */
+               int copy_to_buf =
+                       min_t(int, sizeof(copy_buf->buf) - copy_buf->used, len);
+
+               memcpy(copy_buf->buf + copy_buf->used, data, copy_to_buf);
+               copy_buf->used += copy_to_buf;
+
+               /* if we didn't fill it up then we're done for now */
+               if (copy_buf->used < sizeof(copy_buf->buf))
+                       return;
+
+               memcpy_toio(*piobuf, copy_buf->buf, sizeof(copy_buf->buf));
+               *piobuf += sizeof(copy_buf->buf);
+               data += copy_to_buf;
+               len -= copy_to_buf;
+               copy_buf->used = 0;
+       }
+
+       efx_memcpy_toio_aligned(efx, piobuf, data, len, copy_buf);
+}
+
+static void efx_flush_copy_buffer(struct efx_nic *efx, u8 __iomem *piobuf,
+                                 struct efx_short_copy_buffer *copy_buf)
+{
+       /* if there's anything in it, write the whole buffer, including junk */
+       if (copy_buf->used)
+               memcpy_toio(piobuf, copy_buf->buf, sizeof(copy_buf->buf));
+}
+
+/* Traverse skb structure and copy fragments into the PIO buffer.
+ * Advances piobuf pointer.
+ */
+static void efx_skb_copy_bits_to_pio(struct efx_nic *efx, struct sk_buff *skb,
+                                    u8 __iomem **piobuf,
+                                    struct efx_short_copy_buffer *copy_buf)
+{
+       int i;
+
+       efx_memcpy_toio_aligned(efx, piobuf, skb->data, skb_headlen(skb),
+                               copy_buf);
+
+       for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
+               skb_frag_t *f = &skb_shinfo(skb)->frags[i];
+               u8 *vaddr;
+
+               vaddr = kmap_atomic(skb_frag_page(f));
+
+               efx_memcpy_toio_aligned_cb(efx, piobuf, vaddr + f->page_offset,
+                                          skb_frag_size(f), copy_buf);
+               kunmap_atomic(vaddr);
+       }
+
+       EFX_BUG_ON_PARANOID(skb_shinfo(skb)->frag_list);
+}
+
+static struct efx_tx_buffer *
+efx_enqueue_skb_pio(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
+{
+       struct efx_tx_buffer *buffer =
+               efx_tx_queue_get_insert_buffer(tx_queue);
+       u8 __iomem *piobuf = tx_queue->piobuf;
+
+       /* Copy to PIO buffer. Ensure the writes are padded to the end
+        * of a cache line, as this is required for write-combining to be
+        * effective on at least x86.
+        */
+
+       if (skb_shinfo(skb)->nr_frags) {
+               /* The size of the copy buffer will ensure all writes
+                * are the size of a cache line.
+                */
+               struct efx_short_copy_buffer copy_buf;
+
+               copy_buf.used = 0;
+
+               efx_skb_copy_bits_to_pio(tx_queue->efx, skb,
+                                        &piobuf, &copy_buf);
+               efx_flush_copy_buffer(tx_queue->efx, piobuf, &copy_buf);
+       } else {
+               /* Pad the write to the size of a cache line.
+                * We can do this because we know the skb_shared_info struct is
+                * after the source, and the destination buffer is big enough.
+                */
+               BUILD_BUG_ON(L1_CACHE_BYTES >
+                            SKB_DATA_ALIGN(sizeof(struct skb_shared_info)));
+               memcpy_toio(tx_queue->piobuf, skb->data,
+                           ALIGN(skb->len, L1_CACHE_BYTES));
+       }
+
+       EFX_POPULATE_QWORD_5(buffer->option,
+                            ESF_DZ_TX_DESC_IS_OPT, 1,
+                            ESF_DZ_TX_OPTION_TYPE, ESE_DZ_TX_OPTION_DESC_PIO,
+                            ESF_DZ_TX_PIO_CONT, 0,
+                            ESF_DZ_TX_PIO_BYTE_CNT, skb->len,
+                            ESF_DZ_TX_PIO_BUF_ADDR,
+                            tx_queue->piobuf_offset);
+       ++tx_queue->pio_packets;
+       ++tx_queue->insert_count;
+       return buffer;
+}
+#endif /* EFX_USE_PIO */
+
 /*
  * Add a socket buffer to a TX queue
  *
@@ -167,7 +344,7 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
        struct device *dma_dev = &efx->pci_dev->dev;
        struct efx_tx_buffer *buffer;
        skb_frag_t *fragment;
-       unsigned int len, unmap_len = 0, insert_ptr;
+       unsigned int len, unmap_len = 0;
        dma_addr_t dma_addr, unmap_addr = 0;
        unsigned int dma_len;
        unsigned short dma_flags;
@@ -189,6 +366,17 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
                        return NETDEV_TX_OK;
        }
 
+       /* Consider using PIO for short packets */
+#ifdef EFX_USE_PIO
+       if (skb->len <= efx_piobuf_size && tx_queue->piobuf &&
+           efx_nic_tx_is_empty(tx_queue) &&
+           efx_nic_tx_is_empty(efx_tx_queue_partner(tx_queue))) {
+               buffer = efx_enqueue_skb_pio(tx_queue, skb);
+               dma_flags = EFX_TX_BUF_OPTION;
+               goto finish_packet;
+       }
+#endif
+
        /* Map for DMA.  Use dma_map_single rather than dma_map_page
         * since this is more efficient on machines with sparse
         * memory.
@@ -208,11 +396,7 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
 
                /* Add to TX queue, splitting across DMA boundaries */
                do {
-                       insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask;
-                       buffer = &tx_queue->buffer[insert_ptr];
-                       EFX_BUG_ON_PARANOID(buffer->flags);
-                       EFX_BUG_ON_PARANOID(buffer->len);
-                       EFX_BUG_ON_PARANOID(buffer->unmap_len);
+                       buffer = efx_tx_queue_get_insert_buffer(tx_queue);
 
                        dma_len = efx_max_tx_len(efx, dma_addr);
                        if (likely(dma_len >= len))
@@ -245,6 +429,7 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
        }
 
        /* Transfer ownership of the skb to the final buffer */
+finish_packet:
        buffer->skb = skb;
        buffer->flags = EFX_TX_BUF_SKB | dma_flags;
 
@@ -270,8 +455,7 @@ netdev_tx_t efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb)
        while (tx_queue->insert_count != tx_queue->write_count) {
                unsigned int pkts_compl = 0, bytes_compl = 0;
                --tx_queue->insert_count;
-               insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask;
-               buffer = &tx_queue->buffer[insert_ptr];
+               buffer = __efx_tx_queue_get_insert_buffer(tx_queue);
                efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl);
        }
 
@@ -628,6 +812,9 @@ void efx_remove_tx_queue(struct efx_tx_queue *tx_queue)
  * @tcp_off: Offset of TCP header
  * @header_len: Number of bytes of header
  * @ip_base_len: IPv4 tot_len or IPv6 payload_len, before TCP payload
+ * @header_dma_addr: Header DMA address, when using option descriptors
+ * @header_unmap_len: Header DMA mapped length, or 0 if not using option
+ *     descriptors
  *
  * The state used during segmentation.  It is put into this data structure
  * just to make it easy to pass into inline functions.
@@ -636,7 +823,7 @@ struct tso_state {
        /* Output position */
        unsigned out_len;
        unsigned seqnum;
-       unsigned ipv4_id;
+       u16 ipv4_id;
        unsigned packet_space;
 
        /* Input position */
@@ -651,6 +838,8 @@ struct tso_state {
        unsigned int tcp_off;
        unsigned header_len;
        unsigned int ip_base_len;
+       dma_addr_t header_dma_addr;
+       unsigned int header_unmap_len;
 };
 
 
@@ -737,23 +926,18 @@ static void efx_tx_queue_insert(struct efx_tx_queue *tx_queue,
 {
        struct efx_tx_buffer *buffer;
        struct efx_nic *efx = tx_queue->efx;
-       unsigned dma_len, insert_ptr;
+       unsigned dma_len;
 
        EFX_BUG_ON_PARANOID(len <= 0);
 
        while (1) {
-               insert_ptr = tx_queue->insert_count & tx_queue->ptr_mask;
-               buffer = &tx_queue->buffer[insert_ptr];
+               buffer = efx_tx_queue_get_insert_buffer(tx_queue);
                ++tx_queue->insert_count;
 
                EFX_BUG_ON_PARANOID(tx_queue->insert_count -
                                    tx_queue->read_count >=
                                    efx->txq_entries);
 
-               EFX_BUG_ON_PARANOID(buffer->len);
-               EFX_BUG_ON_PARANOID(buffer->unmap_len);
-               EFX_BUG_ON_PARANOID(buffer->flags);
-
                buffer->dma_addr = dma_addr;
 
                dma_len = efx_max_tx_len(efx, dma_addr);
@@ -814,19 +998,27 @@ static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue)
        /* Work backwards until we hit the original insert pointer value */
        while (tx_queue->insert_count != tx_queue->write_count) {
                --tx_queue->insert_count;
-               buffer = &tx_queue->buffer[tx_queue->insert_count &
-                                          tx_queue->ptr_mask];
+               buffer = __efx_tx_queue_get_insert_buffer(tx_queue);
                efx_dequeue_buffer(tx_queue, buffer, NULL, NULL);
        }
 }
 
 
 /* Parse the SKB header and initialise state. */
-static void tso_start(struct tso_state *st, const struct sk_buff *skb)
+static int tso_start(struct tso_state *st, struct efx_nic *efx,
+                    const struct sk_buff *skb)
 {
+       bool use_options = efx_nic_rev(efx) >= EFX_REV_HUNT_A0;
+       struct device *dma_dev = &efx->pci_dev->dev;
+       unsigned int header_len, in_len;
+       dma_addr_t dma_addr;
+
        st->ip_off = skb_network_header(skb) - skb->data;
        st->tcp_off = skb_transport_header(skb) - skb->data;
-       st->header_len = st->tcp_off + (tcp_hdr(skb)->doff << 2u);
+       header_len = st->tcp_off + (tcp_hdr(skb)->doff << 2u);
+       in_len = skb_headlen(skb) - header_len;
+       st->header_len = header_len;
+       st->in_len = in_len;
        if (st->protocol == htons(ETH_P_IP)) {
                st->ip_base_len = st->header_len - st->ip_off;
                st->ipv4_id = ntohs(ip_hdr(skb)->id);
@@ -840,9 +1032,34 @@ static void tso_start(struct tso_state *st, const struct sk_buff *skb)
        EFX_BUG_ON_PARANOID(tcp_hdr(skb)->syn);
        EFX_BUG_ON_PARANOID(tcp_hdr(skb)->rst);
 
-       st->out_len = skb->len - st->header_len;
-       st->unmap_len = 0;
-       st->dma_flags = 0;
+       st->out_len = skb->len - header_len;
+
+       if (!use_options) {
+               st->header_unmap_len = 0;
+
+               if (likely(in_len == 0)) {
+                       st->dma_flags = 0;
+                       st->unmap_len = 0;
+                       return 0;
+               }
+
+               dma_addr = dma_map_single(dma_dev, skb->data + header_len,
+                                         in_len, DMA_TO_DEVICE);
+               st->dma_flags = EFX_TX_BUF_MAP_SINGLE;
+               st->dma_addr = dma_addr;
+               st->unmap_addr = dma_addr;
+               st->unmap_len = in_len;
+       } else {
+               dma_addr = dma_map_single(dma_dev, skb->data,
+                                         skb_headlen(skb), DMA_TO_DEVICE);
+               st->header_dma_addr = dma_addr;
+               st->header_unmap_len = skb_headlen(skb);
+               st->dma_flags = 0;
+               st->dma_addr = dma_addr + header_len;
+               st->unmap_len = 0;
+       }
+
+       return unlikely(dma_mapping_error(dma_dev, dma_addr)) ? -ENOMEM : 0;
 }
 
 static int tso_get_fragment(struct tso_state *st, struct efx_nic *efx,
@@ -860,24 +1077,6 @@ static int tso_get_fragment(struct tso_state *st, struct efx_nic *efx,
        return -ENOMEM;
 }
 
-static int tso_get_head_fragment(struct tso_state *st, struct efx_nic *efx,
-                                const struct sk_buff *skb)
-{
-       int hl = st->header_len;
-       int len = skb_headlen(skb) - hl;
-
-       st->unmap_addr = dma_map_single(&efx->pci_dev->dev, skb->data + hl,
-                                       len, DMA_TO_DEVICE);
-       if (likely(!dma_mapping_error(&efx->pci_dev->dev, st->unmap_addr))) {
-               st->dma_flags = EFX_TX_BUF_MAP_SINGLE;
-               st->unmap_len = len;
-               st->in_len = len;
-               st->dma_addr = st->unmap_addr;
-               return 0;
-       }
-       return -ENOMEM;
-}
-
 
 /**
  * tso_fill_packet_with_fragment - form descriptors for the current fragment
@@ -944,55 +1143,97 @@ static int tso_start_new_packet(struct efx_tx_queue *tx_queue,
                                struct tso_state *st)
 {
        struct efx_tx_buffer *buffer =
-               &tx_queue->buffer[tx_queue->insert_count & tx_queue->ptr_mask];
-       struct tcphdr *tsoh_th;
-       unsigned ip_length;
-       u8 *header;
-       int rc;
+               efx_tx_queue_get_insert_buffer(tx_queue);
+       bool is_last = st->out_len <= skb_shinfo(skb)->gso_size;
+       u8 tcp_flags_clear;
 
-       /* Allocate and insert a DMA-mapped header buffer. */
-       header = efx_tsoh_get_buffer(tx_queue, buffer, st->header_len);
-       if (!header)
-               return -ENOMEM;
-
-       tsoh_th = (struct tcphdr *)(header + st->tcp_off);
-
-       /* Copy and update the headers. */
-       memcpy(header, skb->data, st->header_len);
-
-       tsoh_th->seq = htonl(st->seqnum);
-       st->seqnum += skb_shinfo(skb)->gso_size;
-       if (st->out_len > skb_shinfo(skb)->gso_size) {
-               /* This packet will not finish the TSO burst. */
+       if (!is_last) {
                st->packet_space = skb_shinfo(skb)->gso_size;
-               tsoh_th->fin = 0;
-               tsoh_th->psh = 0;
+               tcp_flags_clear = 0x09; /* mask out FIN and PSH */
        } else {
-               /* This packet will be the last in the TSO burst. */
                st->packet_space = st->out_len;
-               tsoh_th->fin = tcp_hdr(skb)->fin;
-               tsoh_th->psh = tcp_hdr(skb)->psh;
+               tcp_flags_clear = 0x00;
        }
-       ip_length = st->ip_base_len + st->packet_space;
 
-       if (st->protocol == htons(ETH_P_IP)) {
-               struct iphdr *tsoh_iph = (struct iphdr *)(header + st->ip_off);
+       if (!st->header_unmap_len) {
+               /* Allocate and insert a DMA-mapped header buffer. */
+               struct tcphdr *tsoh_th;
+               unsigned ip_length;
+               u8 *header;
+               int rc;
+
+               header = efx_tsoh_get_buffer(tx_queue, buffer, st->header_len);
+               if (!header)
+                       return -ENOMEM;
 
-               tsoh_iph->tot_len = htons(ip_length);
+               tsoh_th = (struct tcphdr *)(header + st->tcp_off);
+
+               /* Copy and update the headers. */
+               memcpy(header, skb->data, st->header_len);
+
+               tsoh_th->seq = htonl(st->seqnum);
+               ((u8 *)tsoh_th)[13] &= ~tcp_flags_clear;
+
+               ip_length = st->ip_base_len + st->packet_space;
+
+               if (st->protocol == htons(ETH_P_IP)) {
+                       struct iphdr *tsoh_iph =
+                               (struct iphdr *)(header + st->ip_off);
+
+                       tsoh_iph->tot_len = htons(ip_length);
+                       tsoh_iph->id = htons(st->ipv4_id);
+               } else {
+                       struct ipv6hdr *tsoh_iph =
+                               (struct ipv6hdr *)(header + st->ip_off);
+
+                       tsoh_iph->payload_len = htons(ip_length);
+               }
 
-               /* Linux leaves suitable gaps in the IP ID space for us to fill. */
-               tsoh_iph->id = htons(st->ipv4_id);
-               st->ipv4_id++;
+               rc = efx_tso_put_header(tx_queue, buffer, header);
+               if (unlikely(rc))
+                       return rc;
        } else {
-               struct ipv6hdr *tsoh_iph =
-                       (struct ipv6hdr *)(header + st->ip_off);
+               /* Send the original headers with a TSO option descriptor
+                * in front
+                */
+               u8 tcp_flags = ((u8 *)tcp_hdr(skb))[13] & ~tcp_flags_clear;
 
-               tsoh_iph->payload_len = htons(ip_length);
+               buffer->flags = EFX_TX_BUF_OPTION;
+               buffer->len = 0;
+               buffer->unmap_len = 0;
+               EFX_POPULATE_QWORD_5(buffer->option,
+                                    ESF_DZ_TX_DESC_IS_OPT, 1,
+                                    ESF_DZ_TX_OPTION_TYPE,
+                                    ESE_DZ_TX_OPTION_DESC_TSO,
+                                    ESF_DZ_TX_TSO_TCP_FLAGS, tcp_flags,
+                                    ESF_DZ_TX_TSO_IP_ID, st->ipv4_id,
+                                    ESF_DZ_TX_TSO_TCP_SEQNO, st->seqnum);
+               ++tx_queue->insert_count;
+
+               /* We mapped the headers in tso_start().  Unmap them
+                * when the last segment is completed.
+                */
+               buffer = efx_tx_queue_get_insert_buffer(tx_queue);
+               buffer->dma_addr = st->header_dma_addr;
+               buffer->len = st->header_len;
+               if (is_last) {
+                       buffer->flags = EFX_TX_BUF_CONT | EFX_TX_BUF_MAP_SINGLE;
+                       buffer->unmap_len = st->header_unmap_len;
+                       /* Ensure we only unmap them once in case of a
+                        * later DMA mapping error and rollback
+                        */
+                       st->header_unmap_len = 0;
+               } else {
+                       buffer->flags = EFX_TX_BUF_CONT;
+                       buffer->unmap_len = 0;
+               }
+               ++tx_queue->insert_count;
        }
 
-       rc = efx_tso_put_header(tx_queue, buffer, header);
-       if (unlikely(rc))
-               return rc;
+       st->seqnum += skb_shinfo(skb)->gso_size;
+
+       /* Linux leaves suitable gaps in the IP ID space for us to fill. */
+       ++st->ipv4_id;
 
        ++tx_queue->tso_packets;
 
@@ -1023,12 +1264,11 @@ static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue,
 
        EFX_BUG_ON_PARANOID(tx_queue->write_count != tx_queue->insert_count);
 
-       tso_start(&state, skb);
+       rc = tso_start(&state, efx, skb);
+       if (rc)
+               goto mem_err;
 
-       /* Assume that skb header area contains exactly the headers, and
-        * all payload is in the frag list.
-        */
-       if (skb_headlen(skb) == state.header_len) {
+       if (likely(state.in_len == 0)) {
                /* Grab the first payload fragment. */
                EFX_BUG_ON_PARANOID(skb_shinfo(skb)->nr_frags < 1);
                frag_i = 0;
@@ -1037,9 +1277,7 @@ static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue,
                if (rc)
                        goto mem_err;
        } else {
-               rc = tso_get_head_fragment(&state, efx, skb);
-               if (rc)
-                       goto mem_err;
+               /* Payload starts in the header area. */
                frag_i = -1;
        }
 
@@ -1091,6 +1329,11 @@ static int efx_enqueue_skb_tso(struct efx_tx_queue *tx_queue,
                                       state.unmap_len, DMA_TO_DEVICE);
        }
 
+       /* Free the header DMA mapping, if using option descriptors */
+       if (state.header_unmap_len)
+               dma_unmap_single(&efx->pci_dev->dev, state.header_dma_addr,
+                                state.header_unmap_len, DMA_TO_DEVICE);
+
        efx_enqueue_unwind(tx_queue);
        return NETDEV_TX_OK;
 }