Index: sys/compat/linuxkpi/common/include/linux/netdevice.h
===================================================================
--- sys/compat/linuxkpi/common/include/linux/netdevice.h
+++ sys/compat/linuxkpi/common/include/linux/netdevice.h
@@ -4,6 +4,11 @@
  * Copyright (c) 2010 Panasas, Inc.
  * Copyright (c) 2013-2019 Mellanox Technologies, Ltd.
  * All rights reserved.
+ * Copyright (c) 2020-2021 The FreeBSD Foundation
+ * Copyright (c) 2020-2021 Bjoern A. Zeeb
+ *
+ * Portions of this software were developed by Björn Zeeb
+ * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -28,20 +33,32 @@
  *
  * $FreeBSD$
  */
-#ifndef _LINUX_NETDEVICE_H_
-#define _LINUX_NETDEVICE_H_
+#ifndef __LINUXKPI_LINUX_NETDEVICE_H_
+#define __LINUXKPI_LINUX_NETDEVICE_H_
 
 #include
+#include
+#include
+#include
+#include
+#include
 
 #include
+#include
 
 #include
 #include
 #include
 #include
 
+#include
+#include
+#include
 #include
+#include
 #include
+#include
+#include
 
 #ifdef VIMAGE
 #define init_net *vnet0
@@ -49,13 +66,92 @@
 #define init_net *((struct vnet *)0)
 #endif
 
+struct sk_buff;
+struct net_device;
+struct wireless_dev;		/* net/cfg80211.h */
+
 #define MAX_ADDR_LEN 20
 
-#define net_device ifnet
+#define NET_NAME_UNKNOWN 0
+
+enum netdev_tx {
+	NETDEV_TX_OK = 0,
+};
+typedef enum netdev_tx netdev_tx_t;
+
+struct netdev_hw_addr {
+	struct list_head	addr_list;
+	uint8_t			addr[MAX_ADDR_LEN];
+};
+
+struct netdev_hw_addr_list {
+	struct list_head	addr_list;
+	int			count;
+};
+
+enum net_device_reg_state {
+	NETREG_DUMMY	= 1,
+	NETREG_REGISTERED,
+};
+
+struct net_device_ops {
+	int (*ndo_open)(struct net_device *);
+	int (*ndo_stop)(struct net_device *);
+	int (*ndo_set_mac_address)(struct net_device *, void *);
+	netdev_tx_t (*ndo_start_xmit)(struct sk_buff *, struct net_device *);
+	void (*ndo_set_rx_mode)(struct net_device *);
+};
+
+struct net_device {
+	/* BSD specific for compat. */
+	struct ifnet		bsdifp;
+
+	/* net_device fields seen publicly. */
+	/* XXX can we later make some aliases to ifnet? */
+	char			name[IFNAMSIZ];
+	struct wireless_dev	*ieee80211_ptr;
+	uint8_t			dev_addr[ETH_ALEN];
+	struct netdev_hw_addr_list	mc;
+	netdev_features_t	features;
+	struct {
+		unsigned long	multicast;
+
+		unsigned long	rx_bytes;
+		unsigned long	rx_errors;
+		unsigned long	rx_packets;
+		unsigned long	tx_bytes;
+		unsigned long	tx_dropped;
+		unsigned long	tx_errors;
+		unsigned long	tx_packets;
+	} stats;
+	enum net_device_reg_state	reg_state;
+	const struct ethtool_ops	*ethtool_ops;
+	const struct net_device_ops	*netdev_ops;
 
-#define rtnl_lock()
-#define rtnl_unlock()
+	bool			needs_free_netdev;
+	/* Not properly typed as-of now. */
+	int			flags, type;
+	int			name_assign_type, needed_headroom;
+	void (*priv_destructor)(struct net_device *);
+
+	/* net_device internal. */
+	struct device		dev;
+
+	/*
+	 * In case we delete the net_device we need to be able to clear all
+	 * NAPI consumers.  XXX which lock?
+	 */
+	TAILQ_HEAD(, napi_struct)	napi_head;
+	struct taskqueue	*napi_tq;
+
+	/* Must stay last. */
+	uint8_t			drv_priv[0] __aligned(CACHE_LINE_SIZE);
+};
+
+#define	SET_NETDEV_DEV(_ndev, _dev)	(_ndev)->dev.parent = _dev;
+
+/* -------------------------------------------------------------------------- */
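
The trailing zero-length drv_priv[] member is why it "must stay last": a driver's private state is co-allocated directly behind the struct net_device. A minimal consumer sketch, not part of this patch (the mydrv_* names are hypothetical), using alloc_netdev()/netdev_priv() as declared further down in this header:

struct mydrv_priv {
	int	irq;			/* example driver state */
};

static void
mydrv_setup(struct net_device *ndev)
{
	ndev->needs_free_netdev = true;
}

static struct net_device *
mydrv_create(struct device *parent)
{
	struct net_device *ndev;
	struct mydrv_priv *priv;

	/* The private area is allocated right behind the net_device. */
	ndev = alloc_netdev(sizeof(*priv), "mydrv0", NET_NAME_UNKNOWN,
	    mydrv_setup);
	if (ndev == NULL)
		return (NULL);
	priv = netdev_priv(ndev);
	priv->irq = -1;
	SET_NETDEV_DEV(ndev, parent);
	return (ndev);
}
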
 
 /* According to linux::ipoib_main.c. */
 struct netdev_notifier_info {
 	struct net_device	*dev;
@@ -79,4 +175,134 @@
 int	unregister_netdevice_notifier(struct notifier_block *);
 int	unregister_inetaddr_notifier(struct notifier_block *);
 
-#endif	/* _LINUX_NETDEVICE_H_ */
+/* -------------------------------------------------------------------------- */
+
+#define	NAPI_POLL_WEIGHT	64	/* budget */
+
+struct napi_struct {
+	TAILQ_ENTRY(napi_struct)	entry;
+
+	/* XXX TODO */
+	struct list_head	rx_list;
+	struct net_device	*dev;
+	int			(*poll)(struct napi_struct *, int);
+	int			budget;
+	int			rx_count;
+
+	/*
+	 * These flags mostly need to be checked/changed atomically
+	 * (multiple together in some cases).
+	 */
+	volatile unsigned long	_flags;
+
+	/* FreeBSD internal. */
+	/* Use task for now, so we can easily switch between direct and task. */
+	struct task		napi_task;
+};
+
+void linuxkpi_init_dummy_netdev(struct net_device *);
+void linuxkpi_netif_napi_add(struct net_device *, struct napi_struct *,
+    int(*napi_poll)(struct napi_struct *, int), int);
+void linuxkpi_netif_napi_del(struct napi_struct *);
+bool linuxkpi_napi_schedule_prep(struct napi_struct *);
+void linuxkpi___napi_schedule(struct napi_struct *);
+void linuxkpi_napi_schedule(struct napi_struct *);
+void linuxkpi_napi_reschedule(struct napi_struct *);
+bool linuxkpi_napi_complete_done(struct napi_struct *, int);
+bool linuxkpi_napi_complete(struct napi_struct *);
+void linuxkpi_napi_disable(struct napi_struct *);
+void linuxkpi_napi_enable(struct napi_struct *);
+void linuxkpi_napi_synchronize(struct napi_struct *);
+
+#define	init_dummy_netdev(_n)						\
+	linuxkpi_init_dummy_netdev(_n)
+#define	netif_napi_add(_nd, _ns, _p, _b)				\
+	linuxkpi_netif_napi_add(_nd, _ns, _p, _b)
+#define	netif_napi_del(_n)						\
+	linuxkpi_netif_napi_del(_n)
+#define	napi_schedule_prep(_n)						\
+	linuxkpi_napi_schedule_prep(_n)
+#define	__napi_schedule(_n)						\
+	linuxkpi___napi_schedule(_n)
+#define	napi_schedule(_n)						\
+	linuxkpi_napi_schedule(_n)
+#define	napi_reschedule(_n)						\
+	linuxkpi_napi_reschedule(_n)
+#define	napi_complete_done(_n, _r)					\
+	linuxkpi_napi_complete_done(_n, _r)
+#define	napi_complete(_n)						\
+	linuxkpi_napi_complete(_n)
+#define	napi_disable(_n)						\
+	linuxkpi_napi_disable(_n)
+#define	napi_enable(_n)							\
+	linuxkpi_napi_enable(_n)
+#define	napi_synchronize(_n)						\
+	linuxkpi_napi_synchronize(_n)
+
+/* -------------------------------------------------------------------------- */
+
+static inline void
+netdev_rss_key_fill(uint32_t *buf, size_t len)
+{
+
+	/*
+	 * Remembering from a previous life, there were discussions on what
+	 * makes a good RSS hash key.  See the end of rss_init() in
+	 * net/rss_config.c.  iwlwifi is looking for a 10 byte "secret" so
+	 * stay with random for now.
+	 */
+	get_random_bytes(buf, len);
+}
+
+static inline int
+netdev_hw_addr_list_count(struct netdev_hw_addr_list *list)
+{
+
+	return (list->count);
+}
+
+static inline int
+netdev_mc_count(struct net_device *ndev)
+{
+
+	return (netdev_hw_addr_list_count(&ndev->mc));
+}
+
+#define	netdev_hw_addr_list_for_each(_addr, _list)			\
+	list_for_each_entry((_addr), &(_list)->addr_list, addr_list)
+
+#define	netdev_for_each_mc_addr(na, ndev)				\
+	netdev_hw_addr_list_for_each(na, &(ndev)->mc)
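
As a usage sketch, not part of this patch (mydrv_set_rx_mode is a hypothetical ndo_set_rx_mode implementation and the hardware programming is only hinted at in comments), the multicast helpers above are intended to be used like this:

static void
mydrv_set_rx_mode(struct net_device *ndev)
{
	struct netdev_hw_addr *ha;
	int n;

	n = 0;
	netdev_for_each_mc_addr(ha, ndev) {
		/* ha->addr holds up to MAX_ADDR_LEN bytes (ETH_ALEN used). */
		/* Program ha->addr into the next hardware filter slot. */
		n++;
	}
	KASSERT(n == netdev_mc_count(ndev),
	    ("%s: mc list count mismatch", __func__));
}
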
+
+static __inline void
+synchronize_net(void)
+{
+
+	/* At some point we likely cannot do this unconditionally anymore. */
+	synchronize_rcu();
+}
+
+/* -------------------------------------------------------------------------- */
+
+struct net_device *linuxkpi_alloc_netdev(size_t, const char *, uint32_t,
+    void(*)(struct net_device *));
+void linuxkpi_free_netdev(struct net_device *);
+
+#define	alloc_netdev(_l, _n, _f, _func)					\
+	linuxkpi_alloc_netdev(_l, _n, _f, _func)
+#define	free_netdev(_n)							\
+	linuxkpi_free_netdev(_n)
+
+static inline void *
+netdev_priv(const struct net_device *ndev)
+{
+
+	return (__DECONST(void *, ndev->drv_priv));
+}
+
+/* -------------------------------------------------------------------------- */
+/* This is really rtnetlink and probably belongs elsewhere. */
+
+#define	rtnl_lock()		do { } while (0)
+#define	rtnl_unlock()		do { } while (0)
+
+#endif	/* __LINUXKPI_LINUX_NETDEVICE_H_ */
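
Before the implementation file, a rough consumer-side sketch of the NAPI KPI declared above; not part of this patch (mydrv_softc, mydrv_rx_one, and mydrv_intr are hypothetical, and interrupt masking is only indicated by comments):

struct mydrv_softc {
	struct net_device	*ndev;
	struct napi_struct	napi;
};

static bool mydrv_rx_one(struct net_device *);	/* hypothetical RX harvester */

static int
mydrv_poll(struct napi_struct *napi, int budget)
{
	int work;

	/* Harvest up to "budget" RX descriptors. */
	for (work = 0; work < budget; work++) {
		if (!mydrv_rx_one(napi->dev))
			break;
	}

	/* Less than budget means we are done; re-enable the interrupt. */
	if (work < budget && napi_complete_done(napi, work)) {
		/* ... unmask the RX interrupt here ... */
	}
	return (work);
}

static void
mydrv_intr(void *arg)
{
	struct mydrv_softc *sc = arg;

	/* ... mask the RX interrupt here ... */
	napi_schedule(&sc->napi);
}

static void
mydrv_napi_attach(struct mydrv_softc *sc)
{

	netif_napi_add(sc->ndev, &sc->napi, mydrv_poll, NAPI_POLL_WEIGHT);
	napi_enable(&sc->napi);
}
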
Index: sys/compat/linuxkpi/common/src/linux_netdev.c
===================================================================
--- /dev/null
+++ sys/compat/linuxkpi/common/src/linux_netdev.c
@@ -0,0 +1,384 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2021 The FreeBSD Foundation
+ *
+ * This software was developed by Björn Zeeb under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+MALLOC_DEFINE(M_NETDEV, "lkpindev", "Linux KPI netdevice compat");
+
+/* -------------------------------------------------------------------------- */
+
+static int debug_napi;
+SYSCTL_INT(_compat_linuxkpi, OID_AUTO, debug_napi, CTLFLAG_RWTUN,
+    &debug_napi, 0, "NAPI debug level");
+
+/* Do not schedule new work while we are waiting for pending work to clear. */
+#define	LKPI_NAPI_FLAG_DISABLE_PENDING		0
+/* To synchronise so that only one poll is ever running. */
+#define	LKPI_NAPI_FLAG_IS_SCHEDULED		1
+/* Set if we tried to schedule while a poll was running; we need to re-schedule. */
+#define	LKPI_NAPI_FLAG_LOST_RACE_TRY_AGAIN	2
+
+#define	LKPI_NAPI_FLAGS \
+    "\20\1DISABLE_PENDING\2IS_SCHEDULED\3LOST_RACE_TRY_AGAIN"
+
+#define	NAPI_DEBUG
+#ifdef NAPI_DEBUG
+
+#define	DNAPI_TODO		0x01
+#define	DNAPI_IMPROVE		0x02
+#define	DNAPI_TRACE		0x10
+#define	DNAPI_TRACE_TASK	0x20
+#define	DNAPI_DIRECT_DISPATCH	0x1000
+
+#define	NAPI_TRACE(_n)		if (debug_napi & DNAPI_TRACE)		\
+    printf("NAPI_TRACE %s:%d %u %p (%#jx %b)\n", __func__, __LINE__,	\
+	(unsigned int)ticks, _n, (uintmax_t)(_n)->_flags,		\
+	(int)(_n)->_flags, LKPI_NAPI_FLAGS)
+#define	NAPI_TRACE2D(_n, _d)	if (debug_napi & DNAPI_TRACE)		\
+    printf("NAPI_TRACE %s:%d %u %p (%#jx %b) %d\n", __func__, __LINE__, \
+	(unsigned int)ticks, _n, (uintmax_t)(_n)->_flags,		\
+	(int)(_n)->_flags, LKPI_NAPI_FLAGS, _d)
+#define	NAPI_TRACE_TASK(_n, _p, _c) if (debug_napi & DNAPI_TRACE_TASK)	\
+    printf("NAPI_TRACE %s:%d %u %p (%#jx %b) pending %d count %d "	\
+	"rx_count %d\n", __func__, __LINE__,				\
+	(unsigned int)ticks, _n, (uintmax_t)(_n)->_flags,		\
+	(int)(_n)->_flags, LKPI_NAPI_FLAGS, _p, _c, (_n)->rx_count)
+#define	NAPI_TODO()		if (debug_napi & DNAPI_TODO)		\
+    printf("NAPI_TODO %s:%d %d\n", __func__, __LINE__, ticks)
+#define	NAPI_IMPROVE()		if (debug_napi & DNAPI_IMPROVE)		\
+    printf("NAPI_IMPROVE %s:%d %d\n", __func__, __LINE__, ticks)
+#else
+#define	NAPI_TRACE(_n)			do { } while (0)
+#define	NAPI_TRACE2D(_n, _d)		do { } while (0)
+#define	NAPI_TRACE_TASK(_n, _p, _c)	do { } while (0)
+#define	NAPI_TODO()			do { } while (0)
+#define	NAPI_IMPROVE()			do { } while (0)
+#endif
+
+static void
+lkpi_napi_task(void *ctx, int pending)
+{
+	struct napi_struct *napi;
+	int count;
+
+	KASSERT(ctx != NULL, ("%s: napi %p, pending %d\n",
+	    __func__, ctx, pending));
+	napi = ctx;
+	KASSERT(napi->poll != NULL, ("%s: napi %p poll is NULL\n",
+	    __func__, napi));
+
+	NAPI_TRACE_TASK(napi, pending, napi->budget);
+	count = napi->poll(napi, napi->budget);
+	napi->rx_count += count;
+	NAPI_TRACE_TASK(napi, pending, count);
+
+	/*
+	 * We must not check count < pending here.  There are situations
+	 * where a driver may "poll" without having any work to do, and
+	 * that would make us re-schedule ourselves forever.
+	 */
+	if (count >= napi->budget) {
+		/* Have to re-schedule ourselves. */
+		__napi_schedule(napi);
+	}
+}
+
+void
+linuxkpi_init_dummy_netdev(struct net_device *ndev)
+{
+
+	memset(ndev, 0, sizeof(*ndev));
+
+	ndev->reg_state = NETREG_DUMMY;
+	TAILQ_INIT(&ndev->napi_head);
+	/* Anything else? */
+
+	ndev->napi_tq = taskqueue_create("tq_ndev_napi", M_WAITOK,
+	    taskqueue_thread_enqueue, &ndev->napi_tq);
+	/* One thread for now. */
+	(void) taskqueue_start_threads(&ndev->napi_tq, 1, PWAIT,
+	    "ndev napi taskq");
+}
+
+void
+linuxkpi_netif_napi_add(struct net_device *ndev, struct napi_struct *napi,
+    int(*napi_poll)(struct napi_struct *, int), int budget)
+{
+
+	napi->dev = ndev;
+	napi->poll = napi_poll;
+	napi->budget = budget;
+
+	INIT_LIST_HEAD(&napi->rx_list);
+	napi->rx_count = 0;
+
+	TASK_INIT(&napi->napi_task, 0, lkpi_napi_task, napi);
+
+	/* XXX locking? */
+	TAILQ_INSERT_TAIL(&ndev->napi_head, napi, entry);
+
+	/* What else?  Anything to do on ndev? */
+}
+
+void
+linuxkpi_netif_napi_del(struct napi_struct *napi)
+{
+
+	/* XXX locking? */
+	TAILQ_REMOVE(&napi->dev->napi_head, napi, entry);
+
+	/* XXX TODO */
+}
+
+/*
+ * Check if a poll is running or can run, and if the latter mark us as
+ * running.  That way we ensure that only one poll can ever run at the
+ * same time.  Returns true if no poll was scheduled yet.
+ */
+bool
+linuxkpi_napi_schedule_prep(struct napi_struct *napi)
+{
+	unsigned long old, new;
+
+	NAPI_TRACE(napi);
+
+	/* We can only update/return if all flags agree. */
+	do {
+		old = READ_ONCE(napi->_flags);
+
+		/* If we are stopping, we cannot run again. */
+		if ((old & BIT(LKPI_NAPI_FLAG_DISABLE_PENDING)) != 0) {
+			NAPI_TRACE(napi);
+			return (false);
+		}
+
+		new = old;
+		/* If we were already scheduled, mark that we need to try again. */
+		if ((old & BIT(LKPI_NAPI_FLAG_IS_SCHEDULED)) != 0)
+			new |= BIT(LKPI_NAPI_FLAG_LOST_RACE_TRY_AGAIN);
+		new |= BIT(LKPI_NAPI_FLAG_IS_SCHEDULED);
+
+	} while (atomic_cmpset_acq_long(&napi->_flags, old, new) == 0);
+
+	NAPI_TRACE(napi);
+
+	/* XXX TODO */
+	return ((old & BIT(LKPI_NAPI_FLAG_IS_SCHEDULED)) == 0);
+}
+
+void
+linuxkpi___napi_schedule(struct napi_struct *napi)
+{
+	unsigned long old, new;
+	int rc;
+
+if ((debug_napi & DNAPI_DIRECT_DISPATCH) != 0) {
+	/* XXX TODO */
+	rc = 0;
+again:
+	NAPI_TRACE2D(napi, rc);
+	if (napi->poll != NULL)
+		rc = napi->poll(napi, napi->budget);
+	napi->rx_count += rc;
+
+	/* If interrupts are still disabled, there may be more work to do. */
+	/* Bandaid for now. */
+	if (rc >= napi->budget)
+		goto again;
+
+	/* Bandaid for now. */
+	if (test_bit(LKPI_NAPI_FLAG_LOST_RACE_TRY_AGAIN, &napi->_flags))
+		goto again;
+
+	do {
+		new = old = READ_ONCE(napi->_flags);
+		clear_bit(LKPI_NAPI_FLAG_LOST_RACE_TRY_AGAIN, &new);
+		clear_bit(LKPI_NAPI_FLAG_IS_SCHEDULED, &new);
+	} while (atomic_cmpset_acq_long(&napi->_flags, old, new) == 0);
+
+} else {
+	rc = taskqueue_enqueue(napi->dev->napi_tq, &napi->napi_task);
+	if (rc != 0) {
+		/* Should we assert EPIPE? */
+		return;
+	}
+}
+	NAPI_TRACE2D(napi, rc);
+}
+
+void
+linuxkpi_napi_schedule(struct napi_struct *napi)
+{
+
+	NAPI_TRACE(napi);
+
+	/*
+	 * iwlwifi calls this sequence instead of napi_schedule()
+	 * to be able to test the prep result.
+	 */
+	if (napi_schedule_prep(napi))
+		__napi_schedule(napi);
+}
+
+void
+linuxkpi_napi_reschedule(struct napi_struct *napi)
+{
+
+	NAPI_TRACE(napi);
+
+	/* XXX Not sure what is different to napi_schedule yet. */
+	if (napi_schedule_prep(napi))
+		__napi_schedule(napi);
+}
+
+bool
+linuxkpi_napi_complete_done(struct napi_struct *napi, int ret)
+{
+	unsigned long old, new;
+
+	/* XXX TODO */
+
+	NAPI_TRACE(napi);
+
+if ((debug_napi & DNAPI_DIRECT_DISPATCH) == 0) {
+	do {
+		new = old = READ_ONCE(napi->_flags);
+
+		/*
+		 * If we lost a race before, we need to re-schedule.
+		 * Leave IS_SCHEDULED set essentially doing "_prep".
+		 */
+		if (!test_bit(LKPI_NAPI_FLAG_LOST_RACE_TRY_AGAIN, &old))
+			clear_bit(LKPI_NAPI_FLAG_IS_SCHEDULED, &new);
+		clear_bit(LKPI_NAPI_FLAG_LOST_RACE_TRY_AGAIN, &new);
+	} while (atomic_cmpset_acq_long(&napi->_flags, old, new) == 0);
+
+	/* Someone tried to schedule while the poll was running.  Re-schedule. */
+	if (test_bit(LKPI_NAPI_FLAG_LOST_RACE_TRY_AGAIN, &old)) {
+		__napi_schedule(napi);
+		return (false);
+	}
+}
+
+	NAPI_TRACE(napi);
+
+	return (true);
+}
+
+bool
+linuxkpi_napi_complete(struct napi_struct *napi)
+{
+
+	NAPI_TRACE(napi);
+	return (napi_complete_done(napi, 0));
+}
+
+void
+linuxkpi_napi_disable(struct napi_struct *napi)
+{
+	NAPI_TRACE(napi);
+	set_bit(LKPI_NAPI_FLAG_DISABLE_PENDING, &napi->_flags);
+	while (test_bit(LKPI_NAPI_FLAG_IS_SCHEDULED, &napi->_flags))
+		pause_sbt("napidslp", SBT_1MS, 0, C_HARDCLOCK);
+	clear_bit(LKPI_NAPI_FLAG_DISABLE_PENDING, &napi->_flags);
+}
+
+void
+linuxkpi_napi_enable(struct napi_struct *napi)
+{
+
+	NAPI_TRACE(napi);
+	KASSERT(!test_bit(LKPI_NAPI_FLAG_IS_SCHEDULED, &napi->_flags),
+	    ("%s: enabling napi %p already scheduled\n", __func__, napi));
+	mb();
+	/* Let us be scheduled. */
+	clear_bit(LKPI_NAPI_FLAG_IS_SCHEDULED, &napi->_flags);
+}
+
+void
+linuxkpi_napi_synchronize(struct napi_struct *napi)
+{
+	NAPI_TRACE(napi);
+#if defined(SMP)
+	/* Check & sleep while a napi is scheduled. */
+	while (test_bit(LKPI_NAPI_FLAG_IS_SCHEDULED, &napi->_flags))
+		pause_sbt("napisslp", SBT_1MS, 0, C_HARDCLOCK);
+#else
+	mb();
+#endif
+}
+
+/* -------------------------------------------------------------------------- */
+
+struct net_device *
+linuxkpi_alloc_netdev(size_t len, const char *name, uint32_t flags,
+    void(*setup_func)(struct net_device *))
+{
+	struct net_device *ndev;
+
+	ndev = malloc(sizeof(*ndev) + len, M_NETDEV, M_NOWAIT | M_ZERO);
+	if (ndev == NULL)
+		return (ndev);
+
+	TAILQ_INIT(&ndev->napi_head);
+	strlcpy(ndev->name, name, sizeof(ndev->name));
+
+	/* This needs extending as we support more. */
+
+	setup_func(ndev);
+
+	return (ndev);
+}
+
+void
+linuxkpi_free_netdev(struct net_device *ndev)
+{
+	struct napi_struct *napi, *temp;
+
+	/* XXX locking? */
+	TAILQ_FOREACH_SAFE(napi, &ndev->napi_head, entry, temp) {
+		linuxkpi_netif_napi_del(napi);
+	}
+
+	/* This needs extending as we support more. */
+
+	free(ndev, M_NETDEV);
+}
Index: sys/conf/files
===================================================================
--- sys/conf/files
+++ sys/conf/files
@@ -4581,6 +4581,8 @@
 	compile-with "${LINUXKPI_C}"
 compat/linuxkpi/common/src/linux_lock.c		optional compat_linuxkpi \
 	compile-with "${LINUXKPI_C}"
+compat/linuxkpi/common/src/linux_netdev.c	optional compat_linuxkpi \
+	compile-with "${LINUXKPI_C}"
 compat/linuxkpi/common/src/linux_page.c		optional compat_linuxkpi \
 	compile-with "${LINUXKPI_C}"
 compat/linuxkpi/common/src/linux_pci.c		optional compat_linuxkpi pci \
Index: sys/modules/linuxkpi/Makefile
===================================================================
--- sys/modules/linuxkpi/Makefile
+++ sys/modules/linuxkpi/Makefile
@@ -15,6 +15,7 @@
 	linux_kmod.c \
 	linux_kthread.c \
 	linux_lock.c \
+	linux_netdev.c \
 	linux_page.c \
 	linux_pci.c \
 	linux_radix.c \
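
A final sketch, not part of this patch (it reuses the hypothetical mydrv_softc and mydrv_poll from the earlier example): the dummy-netdev path that wireless drivers use pairs init_dummy_netdev()/netif_napi_add()/napi_enable() on attach with napi_disable()/napi_synchronize()/netif_napi_del() on detach, so linuxkpi_free_netdev() never has to clean up a live NAPI consumer:

static struct net_device mydrv_dummy_ndev;	/* hypothetical; static for simplicity */

static void
mydrv_napi_start(struct mydrv_softc *sc)
{

	init_dummy_netdev(&mydrv_dummy_ndev);
	netif_napi_add(&mydrv_dummy_ndev, &sc->napi, mydrv_poll,
	    NAPI_POLL_WEIGHT);
	napi_enable(&sc->napi);
}

static void
mydrv_napi_stop(struct mydrv_softc *sc)
{

	napi_disable(&sc->napi);	/* waits for a running poll to finish */
	napi_synchronize(&sc->napi);
	netif_napi_del(&sc->napi);
}
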