Index: head/sys/kern/kern_alq.c =================================================================== --- head/sys/kern/kern_alq.c (revision 296687) +++ head/sys/kern/kern_alq.c (revision 296688) @@ -1,973 +1,973 @@ /*- * Copyright (c) 2002, Jeffrey Roberson * Copyright (c) 2008-2009, Lawrence Stewart * Copyright (c) 2009-2010, The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by Lawrence Stewart under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_mac.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Async. Logging Queue */ struct alq { char *aq_entbuf; /* Buffer for stored entries */ int aq_entmax; /* Max entries */ int aq_entlen; /* Entry length */ int aq_freebytes; /* Bytes available in buffer */ int aq_buflen; /* Total length of our buffer */ int aq_writehead; /* Location for next write */ int aq_writetail; /* Flush starts at this location */ int aq_wrapearly; /* # bytes left blank at end of buf */ int aq_flags; /* Queue flags */ int aq_waiters; /* Num threads waiting for resources * NB: Used as a wait channel so must * not be first field in the alq struct */ struct ale aq_getpost; /* ALE for use by get/post */ struct mtx aq_mtx; /* Queue lock */ struct vnode *aq_vp; /* Open vnode handle */ struct ucred *aq_cred; /* Credentials of the opening thread */ LIST_ENTRY(alq) aq_act; /* List of active queues */ LIST_ENTRY(alq) aq_link; /* List of all queues */ }; #define AQ_WANTED 0x0001 /* Wakeup sleeper when io is done */ #define AQ_ACTIVE 0x0002 /* on the active list */ #define AQ_FLUSHING 0x0004 /* doing IO */ #define AQ_SHUTDOWN 0x0008 /* Queue no longer valid */ #define AQ_ORDERED 0x0010 /* Queue enforces ordered writes */ #define AQ_LEGACY 0x0020 /* Legacy queue (fixed length writes) */ #define ALQ_LOCK(alq) mtx_lock_spin(&(alq)->aq_mtx) #define ALQ_UNLOCK(alq) mtx_unlock_spin(&(alq)->aq_mtx) #define HAS_PENDING_DATA(alq) ((alq)->aq_freebytes != (alq)->aq_buflen) static MALLOC_DEFINE(M_ALD, "ALD", "ALD"); /* * The ald_mtx protects the ald_queues list and the ald_active list. 
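/*
 * The aq_* fields above implement a byte-oriented ring buffer: aq_writehead
 * is where the next write lands, aq_writetail is where the next flush starts,
 * and aq_freebytes tracks remaining space. Below is a minimal standalone
 * model of that bookkeeping and the HAS_PENDING_DATA() test (illustrative
 * only; the ring_* names are not part of the ALQ API and this is not kernel
 * code).
 */
#include <assert.h>
#include <stdio.h>

struct ring {
	int buflen;	/* total buffer size (aq_buflen) */
	int freebytes;	/* bytes still available (aq_freebytes) */
	int writehead;	/* next byte to be written (aq_writehead) */
	int writetail;	/* next byte to be flushed (aq_writetail) */
};

#define	RING_HAS_PENDING_DATA(r) ((r)->freebytes != (r)->buflen)

/* Account for 'len' bytes appended at writehead (wrap handling omitted). */
static void
ring_commit(struct ring *r, int len)
{
	assert(len <= r->freebytes);
	r->writehead = (r->writehead + len) % r->buflen;
	r->freebytes -= len;
}

/* Account for 'len' bytes flushed from writetail. */
static void
ring_flush(struct ring *r, int len)
{
	r->writetail = (r->writetail + len) % r->buflen;
	r->freebytes += len;
	/* Reset indexes when empty, as alq_doio() does, to minimise wraps. */
	if (!RING_HAS_PENDING_DATA(r))
		r->writehead = r->writetail = 0;
}

int
main(void)
{
	struct ring r = { .buflen = 16, .freebytes = 16 };

	ring_commit(&r, 10);
	printf("pending=%d free=%d\n", RING_HAS_PENDING_DATA(&r), r.freebytes);
	ring_flush(&r, 10);
	printf("pending=%d free=%d\n", RING_HAS_PENDING_DATA(&r), r.freebytes);
	return (0);
}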
*/ static struct mtx ald_mtx; static LIST_HEAD(, alq) ald_queues; static LIST_HEAD(, alq) ald_active; static int ald_shutingdown = 0; struct thread *ald_thread; static struct proc *ald_proc; static eventhandler_tag alq_eventhandler_tag = NULL; #define ALD_LOCK() mtx_lock(&ald_mtx) #define ALD_UNLOCK() mtx_unlock(&ald_mtx) /* Daemon functions */ static int ald_add(struct alq *); static int ald_rem(struct alq *); static void ald_startup(void *); static void ald_daemon(void); static void ald_shutdown(void *, int); static void ald_activate(struct alq *); static void ald_deactivate(struct alq *); /* Internal queue functions */ static void alq_shutdown(struct alq *); static void alq_destroy(struct alq *); static int alq_doio(struct alq *); /* * Add a new queue to the global list. Fail if we're shutting down. */ static int ald_add(struct alq *alq) { int error; error = 0; ALD_LOCK(); if (ald_shutingdown) { error = EBUSY; goto done; } LIST_INSERT_HEAD(&ald_queues, alq, aq_link); done: ALD_UNLOCK(); return (error); } /* * Remove a queue from the global list unless we're shutting down. If so, * the ald will take care of cleaning up it's resources. */ static int ald_rem(struct alq *alq) { int error; error = 0; ALD_LOCK(); if (ald_shutingdown) { error = EBUSY; goto done; } LIST_REMOVE(alq, aq_link); done: ALD_UNLOCK(); return (error); } /* * Put a queue on the active list. This will schedule it for writing. */ static void ald_activate(struct alq *alq) { LIST_INSERT_HEAD(&ald_active, alq, aq_act); wakeup(&ald_active); } static void ald_deactivate(struct alq *alq) { LIST_REMOVE(alq, aq_act); alq->aq_flags &= ~AQ_ACTIVE; } static void ald_startup(void *unused) { mtx_init(&ald_mtx, "ALDmtx", NULL, MTX_DEF|MTX_QUIET); LIST_INIT(&ald_queues); LIST_INIT(&ald_active); } static void ald_daemon(void) { int needwakeup; struct alq *alq; ald_thread = FIRST_THREAD_IN_PROC(ald_proc); alq_eventhandler_tag = EVENTHANDLER_REGISTER(shutdown_pre_sync, ald_shutdown, NULL, SHUTDOWN_PRI_FIRST); ALD_LOCK(); for (;;) { while ((alq = LIST_FIRST(&ald_active)) == NULL && !ald_shutingdown) mtx_sleep(&ald_active, &ald_mtx, PWAIT, "aldslp", 0); /* Don't shutdown until all active ALQs are flushed. */ if (ald_shutingdown && alq == NULL) { ALD_UNLOCK(); break; } ALQ_LOCK(alq); ald_deactivate(alq); ALD_UNLOCK(); needwakeup = alq_doio(alq); ALQ_UNLOCK(alq); if (needwakeup) wakeup_one(alq); ALD_LOCK(); } kproc_exit(0); } static void ald_shutdown(void *arg, int howto) { struct alq *alq; ALD_LOCK(); /* Ensure no new queues can be created. */ ald_shutingdown = 1; /* Shutdown all ALQs prior to terminating the ald_daemon. */ while ((alq = LIST_FIRST(&ald_queues)) != NULL) { LIST_REMOVE(alq, aq_link); ALD_UNLOCK(); alq_shutdown(alq); ALD_LOCK(); } /* At this point, all ALQs are flushed and shutdown. */ /* * Wake ald_daemon so that it exits. It won't be able to do * anything until we mtx_sleep because we hold the ald_mtx. */ wakeup(&ald_active); /* Wait for ald_daemon to exit. */ mtx_sleep(ald_proc, &ald_mtx, PWAIT, "aldslp", 0); ALD_UNLOCK(); } static void alq_shutdown(struct alq *alq) { ALQ_LOCK(alq); /* Stop any new writers. */ alq->aq_flags |= AQ_SHUTDOWN; /* * If the ALQ isn't active but has unwritten data (possible if * the ALQ_NOACTIVATE flag has been used), explicitly activate the * ALQ here so that the pending data gets flushed by the ald_daemon. 
*/ if (!(alq->aq_flags & AQ_ACTIVE) && HAS_PENDING_DATA(alq)) { alq->aq_flags |= AQ_ACTIVE; ALQ_UNLOCK(alq); ALD_LOCK(); ald_activate(alq); ALD_UNLOCK(); ALQ_LOCK(alq); } /* Drain IO */ while (alq->aq_flags & AQ_ACTIVE) { alq->aq_flags |= AQ_WANTED; msleep_spin(alq, &alq->aq_mtx, "aldclose", 0); } ALQ_UNLOCK(alq); vn_close(alq->aq_vp, FWRITE, alq->aq_cred, curthread); crfree(alq->aq_cred); } void alq_destroy(struct alq *alq) { /* Drain all pending IO. */ alq_shutdown(alq); mtx_destroy(&alq->aq_mtx); free(alq->aq_entbuf, M_ALD); free(alq, M_ALD); } /* * Flush all pending data to disk. This operation will block. */ static int alq_doio(struct alq *alq) { struct thread *td; struct mount *mp; struct vnode *vp; struct uio auio; struct iovec aiov[2]; int totlen; int iov; int wrapearly; KASSERT((HAS_PENDING_DATA(alq)), ("%s: queue empty!", __func__)); vp = alq->aq_vp; td = curthread; totlen = 0; iov = 1; wrapearly = alq->aq_wrapearly; bzero(&aiov, sizeof(aiov)); bzero(&auio, sizeof(auio)); /* Start the write from the location of our buffer tail pointer. */ aiov[0].iov_base = alq->aq_entbuf + alq->aq_writetail; if (alq->aq_writetail < alq->aq_writehead) { /* Buffer not wrapped. */ totlen = aiov[0].iov_len = alq->aq_writehead - alq->aq_writetail; } else if (alq->aq_writehead == 0) { /* Buffer not wrapped (special case to avoid an empty iov). */ totlen = aiov[0].iov_len = alq->aq_buflen - alq->aq_writetail - wrapearly; } else { /* * Buffer wrapped, requires 2 aiov entries: * - first is from writetail to end of buffer * - second is from start of buffer to writehead */ aiov[0].iov_len = alq->aq_buflen - alq->aq_writetail - wrapearly; iov++; aiov[1].iov_base = alq->aq_entbuf; aiov[1].iov_len = alq->aq_writehead; totlen = aiov[0].iov_len + aiov[1].iov_len; } alq->aq_flags |= AQ_FLUSHING; ALQ_UNLOCK(alq); auio.uio_iov = &aiov[0]; auio.uio_offset = 0; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_iovcnt = iov; auio.uio_resid = totlen; auio.uio_td = td; /* * Do all of the junk required to write now. */ vn_start_write(vp, &mp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* * XXX: VOP_WRITE error checks are ignored. */ #ifdef MAC if (mac_vnode_check_write(alq->aq_cred, NOCRED, vp) == 0) #endif VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, alq->aq_cred); VOP_UNLOCK(vp, 0); vn_finished_write(mp); ALQ_LOCK(alq); alq->aq_flags &= ~AQ_FLUSHING; /* Adjust writetail as required, taking into account wrapping. */ alq->aq_writetail = (alq->aq_writetail + totlen + wrapearly) % alq->aq_buflen; alq->aq_freebytes += totlen + wrapearly; /* * If we just flushed part of the buffer which wrapped, reset the * wrapearly indicator. */ if (wrapearly) alq->aq_wrapearly = 0; /* * If we just flushed the buffer completely, reset indexes to 0 to * minimise buffer wraps. * This is also required to ensure alq_getn() can't wedge itself. */ if (!HAS_PENDING_DATA(alq)) alq->aq_writehead = alq->aq_writetail = 0; KASSERT((alq->aq_writetail >= 0 && alq->aq_writetail < alq->aq_buflen), ("%s: aq_writetail < 0 || aq_writetail >= aq_buflen", __func__)); if (alq->aq_flags & AQ_WANTED) { alq->aq_flags &= ~AQ_WANTED; return (1); } return(0); } static struct kproc_desc ald_kp = { "ALQ Daemon", ald_daemon, &ald_proc }; SYSINIT(aldthread, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &ald_kp); SYSINIT(ald, SI_SUB_LOCK, SI_ORDER_ANY, ald_startup, NULL); /* User visible queue functions */ /* * Create the queue data structure, allocate the buffer, and open the file. 
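/*
 * Standalone sketch of the one-or-two iovec split that alq_doio() above
 * performs when flushing a possibly wrapped buffer (illustrative only;
 * flush_iovecs is not part of the ALQ code, struct iovec is the usual
 * <sys/uio.h> definition).
 */
#include <sys/uio.h>
#include <stdio.h>

static int
flush_iovecs(char *buf, int buflen, int head, int tail, int wrapearly,
    struct iovec iov[2])
{
	int niov = 1;

	iov[0].iov_base = buf + tail;
	if (tail < head) {
		/* No wrap: a single contiguous region [tail, head). */
		iov[0].iov_len = head - tail;
	} else if (head == 0) {
		/* Data runs to the end of the buffer; skip the early-wrap hole. */
		iov[0].iov_len = buflen - tail - wrapearly;
	} else {
		/* Wrapped: tail..end (minus the hole), then start..head. */
		iov[0].iov_len = buflen - tail - wrapearly;
		iov[1].iov_base = buf;
		iov[1].iov_len = head;
		niov = 2;
	}
	return (niov);
}

int
main(void)
{
	char buf[32];
	struct iovec iov[2];
	int n;

	n = flush_iovecs(buf, sizeof(buf), 8, 24, 2, iov);
	printf("%d iovec(s): %zu then %zu bytes\n", n,
	    iov[0].iov_len, n == 2 ? iov[1].iov_len : 0);
	return (0);
}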
*/ int alq_open_flags(struct alq **alqp, const char *file, struct ucred *cred, int cmode, int size, int flags) { struct thread *td; struct nameidata nd; struct alq *alq; int oflags; int error; KASSERT((size > 0), ("%s: size <= 0", __func__)); *alqp = NULL; td = curthread; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, file, td); oflags = FWRITE | O_NOFOLLOW | O_CREAT; error = vn_open_cred(&nd, &oflags, cmode, 0, cred, NULL); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); /* We just unlock so we hold a reference */ VOP_UNLOCK(nd.ni_vp, 0); alq = malloc(sizeof(*alq), M_ALD, M_WAITOK|M_ZERO); alq->aq_vp = nd.ni_vp; alq->aq_cred = crhold(cred); mtx_init(&alq->aq_mtx, "ALD Queue", NULL, MTX_SPIN|MTX_QUIET); alq->aq_buflen = size; alq->aq_entmax = 0; alq->aq_entlen = 0; alq->aq_freebytes = alq->aq_buflen; alq->aq_entbuf = malloc(alq->aq_buflen, M_ALD, M_WAITOK|M_ZERO); alq->aq_writehead = alq->aq_writetail = 0; if (flags & ALQ_ORDERED) alq->aq_flags |= AQ_ORDERED; if ((error = ald_add(alq)) != 0) { alq_destroy(alq); return (error); } *alqp = alq; return (0); } int alq_open(struct alq **alqp, const char *file, struct ucred *cred, int cmode, int size, int count) { int ret; KASSERT((count >= 0), ("%s: count < 0", __func__)); if (count > 0) { if ((ret = alq_open_flags(alqp, file, cred, cmode, size*count, 0)) == 0) { (*alqp)->aq_flags |= AQ_LEGACY; (*alqp)->aq_entmax = count; (*alqp)->aq_entlen = size; } } else ret = alq_open_flags(alqp, file, cred, cmode, size, 0); return (ret); } /* * Copy a new entry into the queue. If the operation would block either * wait or return an error depending on the value of waitok. */ int alq_writen(struct alq *alq, void *data, int len, int flags) { int activate, copy, ret; void *waitchan; KASSERT((len > 0 && len <= alq->aq_buflen), ("%s: len <= 0 || len > aq_buflen", __func__)); activate = ret = 0; copy = len; waitchan = NULL; ALQ_LOCK(alq); /* * Fail to perform the write and return EWOULDBLOCK if: * - The message is larger than our underlying buffer. * - The ALQ is being shutdown. * - There is insufficient free space in our underlying buffer * to accept the message and the user can't wait for space. * - There is insufficient free space in our underlying buffer * to accept the message and the alq is inactive due to prior * use of the ALQ_NOACTIVATE flag (which would lead to deadlock). */ if (len > alq->aq_buflen || alq->aq_flags & AQ_SHUTDOWN || (((flags & ALQ_NOWAIT) || (!(alq->aq_flags & AQ_ACTIVE) && HAS_PENDING_DATA(alq))) && alq->aq_freebytes < len)) { ALQ_UNLOCK(alq); return (EWOULDBLOCK); } /* * If we want ordered writes and there is already at least one thread * waiting for resources to become available, sleep until we're woken. */ if (alq->aq_flags & AQ_ORDERED && alq->aq_waiters > 0) { KASSERT(!(flags & ALQ_NOWAIT), ("%s: ALQ_NOWAIT set but incorrectly ignored!", __func__)); alq->aq_waiters++; msleep_spin(&alq->aq_waiters, &alq->aq_mtx, "alqwnord", 0); alq->aq_waiters--; } /* * (ALQ_WAITOK && aq_freebytes < len) or aq_freebytes >= len, either * enter while loop and sleep until we have enough free bytes (former) * or skip (latter). If AQ_ORDERED is set, only 1 thread at a time will * be in this loop. Otherwise, multiple threads may be sleeping here * competing for ALQ resources. 
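/*
 * Hedged usage sketch: how a kernel consumer might drive the variable-length
 * ALQ interface defined above (alq_open_flags/alq_writen/alq_close). Kernel
 * context only; the example_* names, the file path, mode and buffer size are
 * arbitrary illustrations, not part of this commit.
 */
#include <sys/param.h>
#include <sys/alq.h>
#include <sys/proc.h>

static struct alq *example_alq;

static int
example_alq_start(void)
{
	/* 64KB ring buffer, variable-length writes, no ALQ_ORDERED. */
	return (alq_open_flags(&example_alq, "/var/log/example.log",
	    curthread->td_ucred, 0644, 65536, 0));
}

static void
example_alq_log(const char *msg, int len)
{
	/* May sleep waiting for buffer space because of ALQ_WAITOK. */
	(void)alq_writen(example_alq, __DECONST(void *, msg), len, ALQ_WAITOK);
}

static void
example_alq_stop(void)
{
	/* Flushes pending data, closes the vnode and frees the queue. */
	alq_close(example_alq);
}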
*/ while (alq->aq_freebytes < len && !(alq->aq_flags & AQ_SHUTDOWN)) { KASSERT(!(flags & ALQ_NOWAIT), ("%s: ALQ_NOWAIT set but incorrectly ignored!", __func__)); alq->aq_flags |= AQ_WANTED; alq->aq_waiters++; if (waitchan) wakeup(waitchan); msleep_spin(alq, &alq->aq_mtx, "alqwnres", 0); alq->aq_waiters--; /* * If we're the first thread to wake after an AQ_WANTED wakeup * but there isn't enough free space for us, we're going to loop * and sleep again. If there are other threads waiting in this * loop, schedule a wakeup so that they can see if the space * they require is available. */ if (alq->aq_waiters > 0 && !(alq->aq_flags & AQ_ORDERED) && alq->aq_freebytes < len && !(alq->aq_flags & AQ_WANTED)) waitchan = alq; else waitchan = NULL; } /* * If there are waiters, we need to signal the waiting threads after we * complete our work. The alq ptr is used as a wait channel for threads * requiring resources to be freed up. In the AQ_ORDERED case, threads * are not allowed to concurrently compete for resources in the above * while loop, so we use a different wait channel in this case. */ if (alq->aq_waiters > 0) { if (alq->aq_flags & AQ_ORDERED) waitchan = &alq->aq_waiters; else waitchan = alq; } else waitchan = NULL; /* Bail if we're shutting down. */ if (alq->aq_flags & AQ_SHUTDOWN) { ret = EWOULDBLOCK; goto unlock; } /* * If we need to wrap the buffer to accommodate the write, * we'll need 2 calls to bcopy. */ if ((alq->aq_buflen - alq->aq_writehead) < len) copy = alq->aq_buflen - alq->aq_writehead; /* Copy message (or part thereof if wrap required) to the buffer. */ bcopy(data, alq->aq_entbuf + alq->aq_writehead, copy); alq->aq_writehead += copy; if (alq->aq_writehead >= alq->aq_buflen) { KASSERT((alq->aq_writehead == alq->aq_buflen), ("%s: alq->aq_writehead (%d) > alq->aq_buflen (%d)", __func__, alq->aq_writehead, alq->aq_buflen)); alq->aq_writehead = 0; } if (copy != len) { /* * Wrap the buffer by copying the remainder of our message * to the start of the buffer and resetting aq_writehead. */ bcopy(((uint8_t *)data)+copy, alq->aq_entbuf, len - copy); alq->aq_writehead = len - copy; } KASSERT((alq->aq_writehead >= 0 && alq->aq_writehead < alq->aq_buflen), ("%s: aq_writehead < 0 || aq_writehead >= aq_buflen", __func__)); alq->aq_freebytes -= len; if (!(alq->aq_flags & AQ_ACTIVE) && !(flags & ALQ_NOACTIVATE)) { alq->aq_flags |= AQ_ACTIVE; activate = 1; } KASSERT((HAS_PENDING_DATA(alq)), ("%s: queue empty!", __func__)); unlock: ALQ_UNLOCK(alq); if (activate) { ALD_LOCK(); ald_activate(alq); ALD_UNLOCK(); } /* NB: We rely on wakeup_one waking threads in a FIFO manner. */ if (waitchan != NULL) wakeup_one(waitchan); return (ret); } int alq_write(struct alq *alq, void *data, int flags) { /* Should only be called in fixed length message (legacy) mode. */ KASSERT((alq->aq_flags & AQ_LEGACY), ("%s: fixed length write on variable length queue", __func__)); return (alq_writen(alq, data, alq->aq_entlen, flags)); } /* * Retrieve a pointer for the ALQ to write directly into, avoiding bcopy. */ struct ale * alq_getn(struct alq *alq, int len, int flags) { int contigbytes; void *waitchan; KASSERT((len > 0 && len <= alq->aq_buflen), ("%s: len <= 0 || len > alq->aq_buflen", __func__)); waitchan = NULL; ALQ_LOCK(alq); /* * Determine the number of free contiguous bytes. * We ensure elsewhere that if aq_writehead == aq_writetail because * the buffer is empty, they will both be set to 0 and therefore * aq_freebytes == aq_buflen and is fully contiguous. 
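/*
 * Standalone model of the split copy alq_writen() performs below when a
 * message has to wrap past the end of the buffer (illustrative only; not
 * kernel code, memcpy stands in for bcopy).
 */
#include <string.h>
#include <stdio.h>

static int	/* returns the new writehead */
ring_write(char *buf, int buflen, int writehead, const char *data, int len)
{
	int copy = len;

	/* If the message does not fit contiguously, split it in two. */
	if (buflen - writehead < len)
		copy = buflen - writehead;

	memcpy(buf + writehead, data, copy);
	writehead += copy;
	if (writehead >= buflen)
		writehead = 0;
	if (copy != len) {
		/* The second part lands at the start of the buffer. */
		memcpy(buf, data + copy, len - copy);
		writehead = len - copy;
	}
	return (writehead);
}

int
main(void)
{
	char buf[8];
	int head;

	head = ring_write(buf, sizeof(buf), 5, "abcdef", 6);
	printf("new head=%d, tail part=%.3s, wrapped part=%.3s\n",
	    head, buf + 5, buf);
	return (0);
}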
* If they are equal and the buffer is not empty, aq_freebytes will * be 0 indicating the buffer is full. */ if (alq->aq_writehead <= alq->aq_writetail) contigbytes = alq->aq_freebytes; else { contigbytes = alq->aq_buflen - alq->aq_writehead; if (contigbytes < len) { /* * Insufficient space at end of buffer to handle a * contiguous write. Wrap early if there's space at * the beginning. This will leave a hole at the end * of the buffer which we will have to skip over when * flushing the buffer to disk. */ if (alq->aq_writetail >= len || flags & ALQ_WAITOK) { /* Keep track of # bytes left blank. */ alq->aq_wrapearly = contigbytes; /* Do the wrap and adjust counters. */ contigbytes = alq->aq_freebytes = alq->aq_writetail; alq->aq_writehead = 0; } } } /* * Return a NULL ALE if: * - The message is larger than our underlying buffer. * - The ALQ is being shutdown. * - There is insufficient free space in our underlying buffer * to accept the message and the user can't wait for space. * - There is insufficient free space in our underlying buffer * to accept the message and the alq is inactive due to prior * use of the ALQ_NOACTIVATE flag (which would lead to deadlock). */ if (len > alq->aq_buflen || alq->aq_flags & AQ_SHUTDOWN || (((flags & ALQ_NOWAIT) || (!(alq->aq_flags & AQ_ACTIVE) && HAS_PENDING_DATA(alq))) && contigbytes < len)) { ALQ_UNLOCK(alq); return (NULL); } /* * If we want ordered writes and there is already at least one thread * waiting for resources to become available, sleep until we're woken. */ if (alq->aq_flags & AQ_ORDERED && alq->aq_waiters > 0) { KASSERT(!(flags & ALQ_NOWAIT), ("%s: ALQ_NOWAIT set but incorrectly ignored!", __func__)); alq->aq_waiters++; msleep_spin(&alq->aq_waiters, &alq->aq_mtx, "alqgnord", 0); alq->aq_waiters--; } /* * (ALQ_WAITOK && contigbytes < len) or contigbytes >= len, either enter * while loop and sleep until we have enough contiguous free bytes * (former) or skip (latter). If AQ_ORDERED is set, only 1 thread at a * time will be in this loop. Otherwise, multiple threads may be * sleeping here competing for ALQ resources. */ while (contigbytes < len && !(alq->aq_flags & AQ_SHUTDOWN)) { KASSERT(!(flags & ALQ_NOWAIT), ("%s: ALQ_NOWAIT set but incorrectly ignored!", __func__)); alq->aq_flags |= AQ_WANTED; alq->aq_waiters++; if (waitchan) wakeup(waitchan); msleep_spin(alq, &alq->aq_mtx, "alqgnres", 0); alq->aq_waiters--; if (alq->aq_writehead <= alq->aq_writetail) contigbytes = alq->aq_freebytes; else contigbytes = alq->aq_buflen - alq->aq_writehead; /* * If we're the first thread to wake after an AQ_WANTED wakeup * but there isn't enough free space for us, we're going to loop * and sleep again. If there are other threads waiting in this * loop, schedule a wakeup so that they can see if the space * they require is available. */ if (alq->aq_waiters > 0 && !(alq->aq_flags & AQ_ORDERED) && contigbytes < len && !(alq->aq_flags & AQ_WANTED)) waitchan = alq; else waitchan = NULL; } /* * If there are waiters, we need to signal the waiting threads after we * complete our work. The alq ptr is used as a wait channel for threads * requiring resources to be freed up. In the AQ_ORDERED case, threads * are not allowed to concurrently compete for resources in the above * while loop, so we use a different wait channel in this case. */ if (alq->aq_waiters > 0) { if (alq->aq_flags & AQ_ORDERED) waitchan = &alq->aq_waiters; else waitchan = alq; } else waitchan = NULL; /* Bail if we're shutting down. 
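/*
 * Standalone model of the "wrap early" decision in alq_getn() below: when
 * the tail end of the buffer cannot hold a contiguous message, the writer
 * skips it, records the skipped bytes in wrapearly and restarts at offset 0
 * (illustrative only; the ring_* names are not part of the ALQ API).
 */
#include <stdio.h>

struct ring {
	int buflen, freebytes, writehead, writetail, wrapearly;
};

static int	/* returns contiguous bytes now available at writehead */
ring_contig(struct ring *r, int len, int waitok)
{
	int contig;

	if (r->writehead <= r->writetail)
		return (r->freebytes);	/* free space already contiguous */

	contig = r->buflen - r->writehead;
	if (contig < len && (r->writetail >= len || waitok)) {
		/* Leave a hole at the end; the flush path skips it. */
		r->wrapearly = contig;
		contig = r->freebytes = r->writetail;
		r->writehead = 0;
	}
	return (contig);
}

int
main(void)
{
	struct ring r = { .buflen = 16, .freebytes = 9,
	    .writehead = 12, .writetail = 5 };

	printf("contig=%d head=%d wrapearly=%d\n",
	    ring_contig(&r, 5, 0), r.writehead, r.wrapearly);
	return (0);
}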
*/ if (alq->aq_flags & AQ_SHUTDOWN) { ALQ_UNLOCK(alq); if (waitchan != NULL) wakeup_one(waitchan); return (NULL); } /* * If we are here, we have a contiguous number of bytes >= len * available in our buffer starting at aq_writehead. */ alq->aq_getpost.ae_data = alq->aq_entbuf + alq->aq_writehead; alq->aq_getpost.ae_bytesused = len; return (&alq->aq_getpost); } struct ale * alq_get(struct alq *alq, int flags) { /* Should only be called in fixed length message (legacy) mode. */ KASSERT((alq->aq_flags & AQ_LEGACY), ("%s: fixed length get on variable length queue", __func__)); return (alq_getn(alq, alq->aq_entlen, flags)); } void alq_post_flags(struct alq *alq, struct ale *ale, int flags) { int activate; void *waitchan; activate = 0; if (ale->ae_bytesused > 0) { if (!(alq->aq_flags & AQ_ACTIVE) && !(flags & ALQ_NOACTIVATE)) { alq->aq_flags |= AQ_ACTIVE; activate = 1; } alq->aq_writehead += ale->ae_bytesused; alq->aq_freebytes -= ale->ae_bytesused; /* Wrap aq_writehead if we filled to the end of the buffer. */ if (alq->aq_writehead == alq->aq_buflen) alq->aq_writehead = 0; KASSERT((alq->aq_writehead >= 0 && alq->aq_writehead < alq->aq_buflen), ("%s: aq_writehead < 0 || aq_writehead >= aq_buflen", __func__)); KASSERT((HAS_PENDING_DATA(alq)), ("%s: queue empty!", __func__)); } /* * If there are waiters, we need to signal the waiting threads after we * complete our work. The alq ptr is used as a wait channel for threads * requiring resources to be freed up. In the AQ_ORDERED case, threads * are not allowed to concurrently compete for resources in the * alq_getn() while loop, so we use a different wait channel in this case. */ if (alq->aq_waiters > 0) { if (alq->aq_flags & AQ_ORDERED) waitchan = &alq->aq_waiters; else waitchan = alq; } else waitchan = NULL; ALQ_UNLOCK(alq); if (activate) { ALD_LOCK(); ald_activate(alq); ALD_UNLOCK(); } /* NB: We rely on wakeup_one waking threads in a FIFO manner. */ if (waitchan != NULL) wakeup_one(waitchan); } void alq_flush(struct alq *alq) { int needwakeup = 0; ALD_LOCK(); ALQ_LOCK(alq); /* * Pull the lever iff there is data to flush and we're * not already in the middle of a flush operation. */ if (HAS_PENDING_DATA(alq) && !(alq->aq_flags & AQ_FLUSHING)) { if (alq->aq_flags & AQ_ACTIVE) ald_deactivate(alq); ALD_UNLOCK(); needwakeup = alq_doio(alq); } else ALD_UNLOCK(); ALQ_UNLOCK(alq); if (needwakeup) wakeup_one(alq); } /* * Flush remaining data, close the file and free all resources. */ void alq_close(struct alq *alq) { /* Only flush and destroy alq if not already shutting down. */ if (ald_rem(alq) == 0) alq_destroy(alq); } static int alq_load_handler(module_t mod, int what, void *arg) { int ret; ret = 0; switch (what) { case MOD_LOAD: case MOD_SHUTDOWN: break; case MOD_QUIESCE: ALD_LOCK(); /* Only allow unload if there are no open queues. */ if (LIST_FIRST(&ald_queues) == NULL) { ald_shutingdown = 1; ALD_UNLOCK(); EVENTHANDLER_DEREGISTER(shutdown_pre_sync, alq_eventhandler_tag); ald_shutdown(NULL, 0); mtx_destroy(&ald_mtx); } else { ALD_UNLOCK(); ret = EBUSY; } break; case MOD_UNLOAD: /* If MOD_QUIESCE failed we must fail here too. 
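/*
 * Hedged usage sketch of the zero-copy path above: reserve buffer space with
 * alq_getn(), format directly into the returned ALE, then hand it back with
 * alq_post_flags(). This mirrors what siftr_process_pkt() does further down.
 * Kernel context only; EXAMPLE_MAX_MSG_LEN and example_alq_log_value are
 * illustrative names, not part of this commit.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/alq.h>

#define	EXAMPLE_MAX_MSG_LEN	128

static void
example_alq_log_value(struct alq *alq, int value)
{
	struct ale *ale;

	/* May sleep for space; NULL only if the ALQ is shutting down. */
	ale = alq_getn(alq, EXAMPLE_MAX_MSG_LEN, ALQ_WAITOK);
	if (ale == NULL)
		return;

	/* Write straight into the queue's buffer and record the length used. */
	ale->ae_bytesused = snprintf(ale->ae_data, EXAMPLE_MAX_MSG_LEN,
	    "value=%d\n", value);

	alq_post_flags(alq, ale, 0);
}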
*/ if (ald_shutingdown == 0) ret = EBUSY; break; default: ret = EINVAL; break; } return (ret); } static moduledata_t alq_mod = { "alq", alq_load_handler, NULL }; -DECLARE_MODULE(alq, alq_mod, SI_SUB_SMP, SI_ORDER_ANY); +DECLARE_MODULE(alq, alq_mod, SI_SUB_LAST, SI_ORDER_ANY); MODULE_VERSION(alq, 1); Index: head/sys/netinet/siftr.c =================================================================== --- head/sys/netinet/siftr.c (revision 296687) +++ head/sys/netinet/siftr.c (revision 296688) @@ -1,1566 +1,1566 @@ /*- * Copyright (c) 2007-2009 * Swinburne University of Technology, Melbourne, Australia. * Copyright (c) 2009-2010, The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by Lawrence Stewart under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /****************************************************** * Statistical Information For TCP Research (SIFTR) * * A FreeBSD kernel module that adds very basic intrumentation to the * TCP stack, allowing internal stats to be recorded to a log file * for experimental, debugging and performance analysis purposes. * * SIFTR was first released in 2007 by James Healy and Lawrence Stewart whilst * working on the NewTCP research project at Swinburne University of * Technology's Centre for Advanced Internet Architectures, Melbourne, * Australia, which was made possible in part by a grant from the Cisco * University Research Program Fund at Community Foundation Silicon Valley. * More details are available at: * http://caia.swin.edu.au/urp/newtcp/ * * Work on SIFTR v1.2.x was sponsored by the FreeBSD Foundation as part of * the "Enhancing the FreeBSD TCP Implementation" project 2008-2009. 
* More details are available at: * http://www.freebsdfoundation.org/ * http://caia.swin.edu.au/freebsd/etcp09/ * * Lawrence Stewart is the current maintainer, and all contact regarding * SIFTR should be directed to him via email: lastewart@swin.edu.au * * Initial release date: June 2007 * Most recent update: September 2010 ******************************************************/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SIFTR_IPV6 #include #include #endif /* SIFTR_IPV6 */ #include /* * Three digit version number refers to X.Y.Z where: * X is the major version number * Y is bumped to mark backwards incompatible changes * Z is bumped to mark backwards compatible changes */ #define V_MAJOR 1 #define V_BACKBREAK 2 #define V_BACKCOMPAT 4 #define MODVERSION __CONCAT(V_MAJOR, __CONCAT(V_BACKBREAK, V_BACKCOMPAT)) #define MODVERSION_STR __XSTRING(V_MAJOR) "." __XSTRING(V_BACKBREAK) "." \ __XSTRING(V_BACKCOMPAT) #define HOOK 0 #define UNHOOK 1 #define SIFTR_EXPECTED_MAX_TCP_FLOWS 65536 #define SYS_NAME "FreeBSD" #define PACKET_TAG_SIFTR 100 #define PACKET_COOKIE_SIFTR 21749576 #define SIFTR_LOG_FILE_MODE 0644 #define SIFTR_DISABLE 0 #define SIFTR_ENABLE 1 /* * Hard upper limit on the length of log messages. Bump this up if you add new * data fields such that the line length could exceed the below value. */ #define MAX_LOG_MSG_LEN 200 /* XXX: Make this a sysctl tunable. */ #define SIFTR_ALQ_BUFLEN (1000*MAX_LOG_MSG_LEN) /* * 1 byte for IP version * IPv4: src/dst IP (4+4) + src/dst port (2+2) = 12 bytes * IPv6: src/dst IP (16+16) + src/dst port (2+2) = 36 bytes */ #ifdef SIFTR_IPV6 #define FLOW_KEY_LEN 37 #else #define FLOW_KEY_LEN 13 #endif #ifdef SIFTR_IPV6 #define SIFTR_IPMODE 6 #else #define SIFTR_IPMODE 4 #endif /* useful macros */ #define CAST_PTR_INT(X) (*((int*)(X))) #define UPPER_SHORT(X) (((X) & 0xFFFF0000) >> 16) #define LOWER_SHORT(X) ((X) & 0x0000FFFF) #define FIRST_OCTET(X) (((X) & 0xFF000000) >> 24) #define SECOND_OCTET(X) (((X) & 0x00FF0000) >> 16) #define THIRD_OCTET(X) (((X) & 0x0000FF00) >> 8) #define FOURTH_OCTET(X) ((X) & 0x000000FF) static MALLOC_DEFINE(M_SIFTR, "siftr", "dynamic memory used by SIFTR"); static MALLOC_DEFINE(M_SIFTR_PKTNODE, "siftr_pktnode", "SIFTR pkt_node struct"); static MALLOC_DEFINE(M_SIFTR_HASHNODE, "siftr_hashnode", "SIFTR flow_hash_node struct"); /* Used as links in the pkt manager queue. */ struct pkt_node { /* Timestamp of pkt as noted in the pfil hook. */ struct timeval tval; /* Direction pkt is travelling; either PFIL_IN or PFIL_OUT. */ uint8_t direction; /* IP version pkt_node relates to; either INP_IPV4 or INP_IPV6. */ uint8_t ipver; /* Hash of the pkt which triggered the log message. */ uint32_t hash; /* Local/foreign IP address. */ #ifdef SIFTR_IPV6 uint32_t ip_laddr[4]; uint32_t ip_faddr[4]; #else uint8_t ip_laddr[4]; uint8_t ip_faddr[4]; #endif /* Local TCP port. */ uint16_t tcp_localport; /* Foreign TCP port. */ uint16_t tcp_foreignport; /* Congestion Window (bytes). */ u_long snd_cwnd; /* Sending Window (bytes). */ u_long snd_wnd; /* Receive Window (bytes). */ u_long rcv_wnd; /* Unused (was: Bandwidth Controlled Window (bytes)). */ u_long snd_bwnd; /* Slow Start Threshold (bytes). */ u_long snd_ssthresh; /* Current state of the TCP FSM. */ int conn_state; /* Max Segment Size (bytes). 
*/ u_int max_seg_size; /* * Smoothed RTT stored as found in the TCP control block * in units of (TCP_RTT_SCALE*hz). */ int smoothed_rtt; /* Is SACK enabled? */ u_char sack_enabled; /* Window scaling for snd window. */ u_char snd_scale; /* Window scaling for recv window. */ u_char rcv_scale; /* TCP control block flags. */ u_int flags; /* Retransmit timeout length. */ int rxt_length; /* Size of the TCP send buffer in bytes. */ u_int snd_buf_hiwater; /* Current num bytes in the send socket buffer. */ u_int snd_buf_cc; /* Size of the TCP receive buffer in bytes. */ u_int rcv_buf_hiwater; /* Current num bytes in the receive socket buffer. */ u_int rcv_buf_cc; /* Number of bytes inflight that we are waiting on ACKs for. */ u_int sent_inflight_bytes; /* Number of segments currently in the reassembly queue. */ int t_segqlen; /* Flowid for the connection. */ u_int flowid; /* Flow type for the connection. */ u_int flowtype; /* Link to next pkt_node in the list. */ STAILQ_ENTRY(pkt_node) nodes; }; struct flow_hash_node { uint16_t counter; uint8_t key[FLOW_KEY_LEN]; LIST_ENTRY(flow_hash_node) nodes; }; struct siftr_stats { /* # TCP pkts seen by the SIFTR PFIL hooks, including any skipped. */ uint64_t n_in; uint64_t n_out; /* # pkts skipped due to failed malloc calls. */ uint32_t nskip_in_malloc; uint32_t nskip_out_malloc; /* # pkts skipped due to failed mtx acquisition. */ uint32_t nskip_in_mtx; uint32_t nskip_out_mtx; /* # pkts skipped due to failed inpcb lookups. */ uint32_t nskip_in_inpcb; uint32_t nskip_out_inpcb; /* # pkts skipped due to failed tcpcb lookups. */ uint32_t nskip_in_tcpcb; uint32_t nskip_out_tcpcb; /* # pkts skipped due to stack reinjection. */ uint32_t nskip_in_dejavu; uint32_t nskip_out_dejavu; }; static DPCPU_DEFINE(struct siftr_stats, ss); static volatile unsigned int siftr_exit_pkt_manager_thread = 0; static unsigned int siftr_enabled = 0; static unsigned int siftr_pkts_per_log = 1; static unsigned int siftr_generate_hashes = 0; /* static unsigned int siftr_binary_log = 0; */ static char siftr_logfile[PATH_MAX] = "/var/log/siftr.log"; static char siftr_logfile_shadow[PATH_MAX] = "/var/log/siftr.log"; static u_long siftr_hashmask; STAILQ_HEAD(pkthead, pkt_node) pkt_queue = STAILQ_HEAD_INITIALIZER(pkt_queue); LIST_HEAD(listhead, flow_hash_node) *counter_hash; static int wait_for_pkt; static struct alq *siftr_alq = NULL; static struct mtx siftr_pkt_queue_mtx; static struct mtx siftr_pkt_mgr_mtx; static struct thread *siftr_pkt_manager_thr = NULL; /* * pfil.h defines PFIL_IN as 1 and PFIL_OUT as 2, * which we use as an index into this array. */ static char direction[3] = {'\0', 'i','o'}; /* Required function prototypes. */ static int siftr_sysctl_enabled_handler(SYSCTL_HANDLER_ARGS); static int siftr_sysctl_logfile_name_handler(SYSCTL_HANDLER_ARGS); /* Declare the net.inet.siftr sysctl tree and populate it. 
*/ SYSCTL_DECL(_net_inet_siftr); SYSCTL_NODE(_net_inet, OID_AUTO, siftr, CTLFLAG_RW, NULL, "siftr related settings"); SYSCTL_PROC(_net_inet_siftr, OID_AUTO, enabled, CTLTYPE_UINT|CTLFLAG_RW, &siftr_enabled, 0, &siftr_sysctl_enabled_handler, "IU", "switch siftr module operations on/off"); SYSCTL_PROC(_net_inet_siftr, OID_AUTO, logfile, CTLTYPE_STRING|CTLFLAG_RW, &siftr_logfile_shadow, sizeof(siftr_logfile_shadow), &siftr_sysctl_logfile_name_handler, "A", "file to save siftr log messages to"); SYSCTL_UINT(_net_inet_siftr, OID_AUTO, ppl, CTLFLAG_RW, &siftr_pkts_per_log, 1, "number of packets between generating a log message"); SYSCTL_UINT(_net_inet_siftr, OID_AUTO, genhashes, CTLFLAG_RW, &siftr_generate_hashes, 0, "enable packet hash generation"); /* XXX: TODO SYSCTL_UINT(_net_inet_siftr, OID_AUTO, binary, CTLFLAG_RW, &siftr_binary_log, 0, "write log files in binary instead of ascii"); */ /* Begin functions. */ static void siftr_process_pkt(struct pkt_node * pkt_node) { struct flow_hash_node *hash_node; struct listhead *counter_list; struct siftr_stats *ss; struct ale *log_buf; uint8_t key[FLOW_KEY_LEN]; uint8_t found_match, key_offset; hash_node = NULL; ss = DPCPU_PTR(ss); found_match = 0; key_offset = 1; /* * Create the key that will be used to create a hash index * into our hash table. Our key consists of: * ipversion, localip, localport, foreignip, foreignport */ key[0] = pkt_node->ipver; memcpy(key + key_offset, &pkt_node->ip_laddr, sizeof(pkt_node->ip_laddr)); key_offset += sizeof(pkt_node->ip_laddr); memcpy(key + key_offset, &pkt_node->tcp_localport, sizeof(pkt_node->tcp_localport)); key_offset += sizeof(pkt_node->tcp_localport); memcpy(key + key_offset, &pkt_node->ip_faddr, sizeof(pkt_node->ip_faddr)); key_offset += sizeof(pkt_node->ip_faddr); memcpy(key + key_offset, &pkt_node->tcp_foreignport, sizeof(pkt_node->tcp_foreignport)); counter_list = counter_hash + (hash32_buf(key, sizeof(key), 0) & siftr_hashmask); /* * If the list is not empty i.e. the hash index has * been used by another flow previously. */ if (LIST_FIRST(counter_list) != NULL) { /* * Loop through the hash nodes in the list. * There should normally only be 1 hash node in the list, * except if there have been collisions at the hash index * computed by hash32_buf(). */ LIST_FOREACH(hash_node, counter_list, nodes) { /* * Check if the key for the pkt we are currently * processing is the same as the key stored in the * hash node we are currently processing. * If they are the same, then we've found the * hash node that stores the counter for the flow * the pkt belongs to. */ if (memcmp(hash_node->key, key, sizeof(key)) == 0) { found_match = 1; break; } } } /* If this flow hash hasn't been seen before or we have a collision. */ if (hash_node == NULL || !found_match) { /* Create a new hash node to store the flow's counter. */ hash_node = malloc(sizeof(struct flow_hash_node), M_SIFTR_HASHNODE, M_WAITOK); if (hash_node != NULL) { /* Initialise our new hash node list entry. */ hash_node->counter = 0; memcpy(hash_node->key, key, sizeof(key)); LIST_INSERT_HEAD(counter_list, hash_node, nodes); } else { /* Malloc failed. */ if (pkt_node->direction == PFIL_IN) ss->nskip_in_malloc++; else ss->nskip_out_malloc++; return; } } else if (siftr_pkts_per_log > 1) { /* * Taking the remainder of the counter divided * by the current value of siftr_pkts_per_log * and storing that in counter provides a neat * way to modulate the frequency of log * messages being written to the log file. 
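/*
 * Standalone sketch of the flow key layout built above in siftr_process_pkt()
 * (IPv4 case, FLOW_KEY_LEN == 13): 1 byte IP version, then local IP, local
 * port, foreign IP, foreign port. Illustrative only: ex_hash32_buf is a
 * stand-in for the kernel's hash32_buf(), and the kernel passes INP_IPV4/
 * INP_IPV6 rather than a literal version number.
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define	EX_FLOW_KEY_LEN	13

static uint32_t
ex_hash32_buf(const void *buf, size_t len, uint32_t seed)
{
	const uint8_t *p = buf;
	uint32_t h = seed;

	while (len--)			/* simple FNV-1a style stand-in */
		h = (h ^ *p++) * 0x01000193;
	return (h);
}

static uint32_t
ex_flow_bucket(uint8_t ipver, const uint8_t laddr[4], uint16_t lport,
    const uint8_t faddr[4], uint16_t fport, uint32_t hashmask)
{
	uint8_t key[EX_FLOW_KEY_LEN];
	int off = 0;

	key[off++] = ipver;
	memcpy(key + off, laddr, 4);	off += 4;
	memcpy(key + off, &lport, 2);	off += 2;
	memcpy(key + off, faddr, 4);	off += 4;
	memcpy(key + off, &fport, 2);

	return (ex_hash32_buf(key, sizeof(key), 0) & hashmask);
}

int
main(void)
{
	uint8_t laddr[4] = { 10, 0, 0, 1 }, faddr[4] = { 10, 0, 0, 2 };

	printf("bucket=%u\n", ex_flow_bucket(4, laddr, 12345, faddr, 80, 0xffff));
	return (0);
}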
*/ hash_node->counter = (hash_node->counter + 1) % siftr_pkts_per_log; /* * If we have not seen enough packets since the last time * we wrote a log message for this connection, return. */ if (hash_node->counter > 0) return; } log_buf = alq_getn(siftr_alq, MAX_LOG_MSG_LEN, ALQ_WAITOK); if (log_buf == NULL) return; /* Should only happen if the ALQ is shutting down. */ #ifdef SIFTR_IPV6 pkt_node->ip_laddr[3] = ntohl(pkt_node->ip_laddr[3]); pkt_node->ip_faddr[3] = ntohl(pkt_node->ip_faddr[3]); if (pkt_node->ipver == INP_IPV6) { /* IPv6 packet */ pkt_node->ip_laddr[0] = ntohl(pkt_node->ip_laddr[0]); pkt_node->ip_laddr[1] = ntohl(pkt_node->ip_laddr[1]); pkt_node->ip_laddr[2] = ntohl(pkt_node->ip_laddr[2]); pkt_node->ip_faddr[0] = ntohl(pkt_node->ip_faddr[0]); pkt_node->ip_faddr[1] = ntohl(pkt_node->ip_faddr[1]); pkt_node->ip_faddr[2] = ntohl(pkt_node->ip_faddr[2]); /* Construct an IPv6 log message. */ log_buf->ae_bytesused = snprintf(log_buf->ae_data, MAX_LOG_MSG_LEN, "%c,0x%08x,%zd.%06ld,%x:%x:%x:%x:%x:%x:%x:%x,%u,%x:%x:%x:" "%x:%x:%x:%x:%x,%u,%ld,%ld,%ld,%ld,%ld,%u,%u,%u,%u,%u,%u," "%u,%d,%u,%u,%u,%u,%u,%u,%u,%u\n", direction[pkt_node->direction], pkt_node->hash, pkt_node->tval.tv_sec, pkt_node->tval.tv_usec, UPPER_SHORT(pkt_node->ip_laddr[0]), LOWER_SHORT(pkt_node->ip_laddr[0]), UPPER_SHORT(pkt_node->ip_laddr[1]), LOWER_SHORT(pkt_node->ip_laddr[1]), UPPER_SHORT(pkt_node->ip_laddr[2]), LOWER_SHORT(pkt_node->ip_laddr[2]), UPPER_SHORT(pkt_node->ip_laddr[3]), LOWER_SHORT(pkt_node->ip_laddr[3]), ntohs(pkt_node->tcp_localport), UPPER_SHORT(pkt_node->ip_faddr[0]), LOWER_SHORT(pkt_node->ip_faddr[0]), UPPER_SHORT(pkt_node->ip_faddr[1]), LOWER_SHORT(pkt_node->ip_faddr[1]), UPPER_SHORT(pkt_node->ip_faddr[2]), LOWER_SHORT(pkt_node->ip_faddr[2]), UPPER_SHORT(pkt_node->ip_faddr[3]), LOWER_SHORT(pkt_node->ip_faddr[3]), ntohs(pkt_node->tcp_foreignport), pkt_node->snd_ssthresh, pkt_node->snd_cwnd, pkt_node->snd_bwnd, pkt_node->snd_wnd, pkt_node->rcv_wnd, pkt_node->snd_scale, pkt_node->rcv_scale, pkt_node->conn_state, pkt_node->max_seg_size, pkt_node->smoothed_rtt, pkt_node->sack_enabled, pkt_node->flags, pkt_node->rxt_length, pkt_node->snd_buf_hiwater, pkt_node->snd_buf_cc, pkt_node->rcv_buf_hiwater, pkt_node->rcv_buf_cc, pkt_node->sent_inflight_bytes, pkt_node->t_segqlen, pkt_node->flowid, pkt_node->flowtype); } else { /* IPv4 packet */ pkt_node->ip_laddr[0] = FIRST_OCTET(pkt_node->ip_laddr[3]); pkt_node->ip_laddr[1] = SECOND_OCTET(pkt_node->ip_laddr[3]); pkt_node->ip_laddr[2] = THIRD_OCTET(pkt_node->ip_laddr[3]); pkt_node->ip_laddr[3] = FOURTH_OCTET(pkt_node->ip_laddr[3]); pkt_node->ip_faddr[0] = FIRST_OCTET(pkt_node->ip_faddr[3]); pkt_node->ip_faddr[1] = SECOND_OCTET(pkt_node->ip_faddr[3]); pkt_node->ip_faddr[2] = THIRD_OCTET(pkt_node->ip_faddr[3]); pkt_node->ip_faddr[3] = FOURTH_OCTET(pkt_node->ip_faddr[3]); #endif /* SIFTR_IPV6 */ /* Construct an IPv4 log message. 
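/*
 * Standalone sketch of the per-flow rate limiting above: keeping the counter
 * modulo siftr_pkts_per_log means a log line is emitted only when the counter
 * wraps back to zero, i.e. for every Nth packet of the flow (illustrative
 * only).
 */
#include <stdio.h>

int
main(void)
{
	unsigned counter = 0, pkts_per_log = 3, pkt;

	for (pkt = 1; pkt <= 9; pkt++) {
		counter = (counter + 1) % pkts_per_log;
		if (counter == 0)	/* kernel: skip when counter > 0 */
			printf("packet %u: logged\n", pkt);
	}
	return (0);
}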
*/ log_buf->ae_bytesused = snprintf(log_buf->ae_data, MAX_LOG_MSG_LEN, "%c,0x%08x,%jd.%06ld,%u.%u.%u.%u,%u,%u.%u.%u.%u,%u,%ld,%ld," "%ld,%ld,%ld,%u,%u,%u,%u,%u,%u,%u,%d,%u,%u,%u,%u,%u,%u,%u,%u\n", direction[pkt_node->direction], pkt_node->hash, (intmax_t)pkt_node->tval.tv_sec, pkt_node->tval.tv_usec, pkt_node->ip_laddr[0], pkt_node->ip_laddr[1], pkt_node->ip_laddr[2], pkt_node->ip_laddr[3], ntohs(pkt_node->tcp_localport), pkt_node->ip_faddr[0], pkt_node->ip_faddr[1], pkt_node->ip_faddr[2], pkt_node->ip_faddr[3], ntohs(pkt_node->tcp_foreignport), pkt_node->snd_ssthresh, pkt_node->snd_cwnd, pkt_node->snd_bwnd, pkt_node->snd_wnd, pkt_node->rcv_wnd, pkt_node->snd_scale, pkt_node->rcv_scale, pkt_node->conn_state, pkt_node->max_seg_size, pkt_node->smoothed_rtt, pkt_node->sack_enabled, pkt_node->flags, pkt_node->rxt_length, pkt_node->snd_buf_hiwater, pkt_node->snd_buf_cc, pkt_node->rcv_buf_hiwater, pkt_node->rcv_buf_cc, pkt_node->sent_inflight_bytes, pkt_node->t_segqlen, pkt_node->flowid, pkt_node->flowtype); #ifdef SIFTR_IPV6 } #endif alq_post_flags(siftr_alq, log_buf, 0); } static void siftr_pkt_manager_thread(void *arg) { STAILQ_HEAD(pkthead, pkt_node) tmp_pkt_queue = STAILQ_HEAD_INITIALIZER(tmp_pkt_queue); struct pkt_node *pkt_node, *pkt_node_temp; uint8_t draining; draining = 2; mtx_lock(&siftr_pkt_mgr_mtx); /* draining == 0 when queue has been flushed and it's safe to exit. */ while (draining) { /* * Sleep until we are signalled to wake because thread has * been told to exit or until 1 tick has passed. */ mtx_sleep(&wait_for_pkt, &siftr_pkt_mgr_mtx, PWAIT, "pktwait", 1); /* Gain exclusive access to the pkt_node queue. */ mtx_lock(&siftr_pkt_queue_mtx); /* * Move pkt_queue to tmp_pkt_queue, which leaves * pkt_queue empty and ready to receive more pkt_nodes. */ STAILQ_CONCAT(&tmp_pkt_queue, &pkt_queue); /* * We've finished making changes to the list. Unlock it * so the pfil hooks can continue queuing pkt_nodes. */ mtx_unlock(&siftr_pkt_queue_mtx); /* * We can't hold a mutex whilst calling siftr_process_pkt * because ALQ might sleep waiting for buffer space. */ mtx_unlock(&siftr_pkt_mgr_mtx); /* Flush all pkt_nodes to the log file. */ STAILQ_FOREACH_SAFE(pkt_node, &tmp_pkt_queue, nodes, pkt_node_temp) { siftr_process_pkt(pkt_node); STAILQ_REMOVE_HEAD(&tmp_pkt_queue, nodes); free(pkt_node, M_SIFTR_PKTNODE); } KASSERT(STAILQ_EMPTY(&tmp_pkt_queue), ("SIFTR tmp_pkt_queue not empty after flush")); mtx_lock(&siftr_pkt_mgr_mtx); /* * If siftr_exit_pkt_manager_thread gets set during the window * where we are draining the tmp_pkt_queue above, there might * still be pkts in pkt_queue that need to be drained. * Allow one further iteration to occur after * siftr_exit_pkt_manager_thread has been set to ensure * pkt_queue is completely empty before we kill the thread. * * siftr_exit_pkt_manager_thread is set only after the pfil * hooks have been removed, so only 1 extra iteration * is needed to drain the queue. */ if (siftr_exit_pkt_manager_thread) draining--; } mtx_unlock(&siftr_pkt_mgr_mtx); /* Calls wakeup on this thread's struct thread ptr. */ kthread_exit(); } static uint32_t hash_pkt(struct mbuf *m, uint32_t offset) { uint32_t hash; hash = 0; while (m != NULL && offset > m->m_len) { /* * The IP packet payload does not start in this mbuf, so * need to figure out which mbuf it starts in and what offset * into the mbuf's data region the payload starts at. 
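/*
 * Standalone model of the drain pattern used by siftr_pkt_manager_thread()
 * above: steal the whole producer queue while holding the queue lock, then
 * process the stolen list with no locks held, so producers are blocked only
 * for the pointer swap. Illustrative only: the kernel code uses
 * STAILQ_CONCAT() under siftr_pkt_queue_mtx and preserves FIFO order; this
 * sketch swaps in a pthread mutex and a plain LIFO list for brevity.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	int		 val;
	struct node	*next;
};

static struct node *queue_head;			/* producer side */
static pthread_mutex_t queue_mtx = PTHREAD_MUTEX_INITIALIZER;

static void
produce(int val)
{
	struct node *n = malloc(sizeof(*n));

	if (n == NULL)
		return;
	n->val = val;
	pthread_mutex_lock(&queue_mtx);
	n->next = queue_head;
	queue_head = n;
	pthread_mutex_unlock(&queue_mtx);
}

static void
drain(void)
{
	struct node *n, *stolen;

	pthread_mutex_lock(&queue_mtx);
	stolen = queue_head;			/* steal the whole list */
	queue_head = NULL;
	pthread_mutex_unlock(&queue_mtx);

	while ((n = stolen) != NULL) {		/* process with no lock held */
		stolen = n->next;
		printf("processed %d\n", n->val);
		free(n);
	}
}

int
main(void)
{
	produce(1);
	produce(2);
	drain();
	return (0);
}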
*/ offset -= m->m_len; m = m->m_next; } while (m != NULL) { /* Ensure there is data in the mbuf */ if ((m->m_len - offset) > 0) hash = hash32_buf(m->m_data + offset, m->m_len - offset, hash); m = m->m_next; offset = 0; } return (hash); } /* * Check if a given mbuf has the SIFTR mbuf tag. If it does, log the fact that * it's a reinjected packet and return. If it doesn't, tag the mbuf and return. * Return value >0 means the caller should skip processing this mbuf. */ static inline int siftr_chkreinject(struct mbuf *m, int dir, struct siftr_stats *ss) { if (m_tag_locate(m, PACKET_COOKIE_SIFTR, PACKET_TAG_SIFTR, NULL) != NULL) { if (dir == PFIL_IN) ss->nskip_in_dejavu++; else ss->nskip_out_dejavu++; return (1); } else { struct m_tag *tag = m_tag_alloc(PACKET_COOKIE_SIFTR, PACKET_TAG_SIFTR, 0, M_NOWAIT); if (tag == NULL) { if (dir == PFIL_IN) ss->nskip_in_malloc++; else ss->nskip_out_malloc++; return (1); } m_tag_prepend(m, tag); } return (0); } /* * Look up an inpcb for a packet. Return the inpcb pointer if found, or NULL * otherwise. */ static inline struct inpcb * siftr_findinpcb(int ipver, struct ip *ip, struct mbuf *m, uint16_t sport, uint16_t dport, int dir, struct siftr_stats *ss) { struct inpcb *inp; /* We need the tcbinfo lock. */ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); if (dir == PFIL_IN) inp = (ipver == INP_IPV4 ? in_pcblookup(&V_tcbinfo, ip->ip_src, sport, ip->ip_dst, dport, INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif) : #ifdef SIFTR_IPV6 in6_pcblookup(&V_tcbinfo, &((struct ip6_hdr *)ip)->ip6_src, sport, &((struct ip6_hdr *)ip)->ip6_dst, dport, INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif) #else NULL #endif ); else inp = (ipver == INP_IPV4 ? in_pcblookup(&V_tcbinfo, ip->ip_dst, dport, ip->ip_src, sport, INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif) : #ifdef SIFTR_IPV6 in6_pcblookup(&V_tcbinfo, &((struct ip6_hdr *)ip)->ip6_dst, dport, &((struct ip6_hdr *)ip)->ip6_src, sport, INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif) #else NULL #endif ); /* If we can't find the inpcb, bail. */ if (inp == NULL) { if (dir == PFIL_IN) ss->nskip_in_inpcb++; else ss->nskip_out_inpcb++; } return (inp); } static inline void siftr_siftdata(struct pkt_node *pn, struct inpcb *inp, struct tcpcb *tp, int ipver, int dir, int inp_locally_locked) { #ifdef SIFTR_IPV6 if (ipver == INP_IPV4) { pn->ip_laddr[3] = inp->inp_laddr.s_addr; pn->ip_faddr[3] = inp->inp_faddr.s_addr; #else *((uint32_t *)pn->ip_laddr) = inp->inp_laddr.s_addr; *((uint32_t *)pn->ip_faddr) = inp->inp_faddr.s_addr; #endif #ifdef SIFTR_IPV6 } else { pn->ip_laddr[0] = inp->in6p_laddr.s6_addr32[0]; pn->ip_laddr[1] = inp->in6p_laddr.s6_addr32[1]; pn->ip_laddr[2] = inp->in6p_laddr.s6_addr32[2]; pn->ip_laddr[3] = inp->in6p_laddr.s6_addr32[3]; pn->ip_faddr[0] = inp->in6p_faddr.s6_addr32[0]; pn->ip_faddr[1] = inp->in6p_faddr.s6_addr32[1]; pn->ip_faddr[2] = inp->in6p_faddr.s6_addr32[2]; pn->ip_faddr[3] = inp->in6p_faddr.s6_addr32[3]; } #endif pn->tcp_localport = inp->inp_lport; pn->tcp_foreignport = inp->inp_fport; pn->snd_cwnd = tp->snd_cwnd; pn->snd_wnd = tp->snd_wnd; pn->rcv_wnd = tp->rcv_wnd; pn->snd_bwnd = 0; /* Unused, kept for compat. 
*/ pn->snd_ssthresh = tp->snd_ssthresh; pn->snd_scale = tp->snd_scale; pn->rcv_scale = tp->rcv_scale; pn->conn_state = tp->t_state; pn->max_seg_size = tp->t_maxseg; pn->smoothed_rtt = tp->t_srtt; pn->sack_enabled = (tp->t_flags & TF_SACK_PERMIT) != 0; pn->flags = tp->t_flags; pn->rxt_length = tp->t_rxtcur; pn->snd_buf_hiwater = inp->inp_socket->so_snd.sb_hiwat; pn->snd_buf_cc = sbused(&inp->inp_socket->so_snd); pn->rcv_buf_hiwater = inp->inp_socket->so_rcv.sb_hiwat; pn->rcv_buf_cc = sbused(&inp->inp_socket->so_rcv); pn->sent_inflight_bytes = tp->snd_max - tp->snd_una; pn->t_segqlen = tp->t_segqlen; pn->flowid = inp->inp_flowid; pn->flowtype = inp->inp_flowtype; /* We've finished accessing the tcb so release the lock. */ if (inp_locally_locked) INP_RUNLOCK(inp); pn->ipver = ipver; pn->direction = dir; /* * Significantly more accurate than using getmicrotime(), but slower! * Gives true microsecond resolution at the expense of a hit to * maximum pps throughput processing when SIFTR is loaded and enabled. */ microtime(&pn->tval); TCP_PROBE1(siftr, &pn); } /* * pfil hook that is called for each IPv4 packet making its way through the * stack in either direction. * The pfil subsystem holds a non-sleepable mutex somewhere when * calling our hook function, so we can't sleep at all. * It's very important to use the M_NOWAIT flag with all function calls * that support it so that they won't sleep, otherwise you get a panic. */ static int siftr_chkpkt(void *arg, struct mbuf **m, struct ifnet *ifp, int dir, struct inpcb *inp) { struct pkt_node *pn; struct ip *ip; struct tcphdr *th; struct tcpcb *tp; struct siftr_stats *ss; unsigned int ip_hl; int inp_locally_locked; inp_locally_locked = 0; ss = DPCPU_PTR(ss); /* * m_pullup is not required here because ip_{input|output} * already do the heavy lifting for us. */ ip = mtod(*m, struct ip *); /* Only continue processing if the packet is TCP. */ if (ip->ip_p != IPPROTO_TCP) goto ret; /* * If a kernel subsystem reinjects packets into the stack, our pfil * hook will be called multiple times for the same packet. * Make sure we only process unique packets. */ if (siftr_chkreinject(*m, dir, ss)) goto ret; if (dir == PFIL_IN) ss->n_in++; else ss->n_out++; /* * Create a tcphdr struct starting at the correct offset * in the IP packet. ip->ip_hl gives the ip header length * in 4-byte words, so multiply it to get the size in bytes. */ ip_hl = (ip->ip_hl << 2); th = (struct tcphdr *)((caddr_t)ip + ip_hl); /* * If the pfil hooks don't provide a pointer to the * inpcb, we need to find it ourselves and lock it. */ if (!inp) { /* Find the corresponding inpcb for this pkt. 
*/ inp = siftr_findinpcb(INP_IPV4, ip, *m, th->th_sport, th->th_dport, dir, ss); if (inp == NULL) goto ret; else inp_locally_locked = 1; } INP_LOCK_ASSERT(inp); /* Find the TCP control block that corresponds with this packet */ tp = intotcpcb(inp); /* * If we can't find the TCP control block (happens occasionaly for a * packet sent during the shutdown phase of a TCP connection), * or we're in the timewait state, bail */ if (tp == NULL || inp->inp_flags & INP_TIMEWAIT) { if (dir == PFIL_IN) ss->nskip_in_tcpcb++; else ss->nskip_out_tcpcb++; goto inp_unlock; } pn = malloc(sizeof(struct pkt_node), M_SIFTR_PKTNODE, M_NOWAIT|M_ZERO); if (pn == NULL) { if (dir == PFIL_IN) ss->nskip_in_malloc++; else ss->nskip_out_malloc++; goto inp_unlock; } siftr_siftdata(pn, inp, tp, INP_IPV4, dir, inp_locally_locked); if (siftr_generate_hashes) { if ((*m)->m_pkthdr.csum_flags & CSUM_TCP) { /* * For outbound packets, the TCP checksum isn't * calculated yet. This is a problem for our packet * hashing as the receiver will calc a different hash * to ours if we don't include the correct TCP checksum * in the bytes being hashed. To work around this * problem, we manually calc the TCP checksum here in * software. We unset the CSUM_TCP flag so the lower * layers don't recalc it. */ (*m)->m_pkthdr.csum_flags &= ~CSUM_TCP; /* * Calculate the TCP checksum in software and assign * to correct TCP header field, which will follow the * packet mbuf down the stack. The trick here is that * tcp_output() sets th->th_sum to the checksum of the * pseudo header for us already. Because of the nature * of the checksumming algorithm, we can sum over the * entire IP payload (i.e. TCP header and data), which * will include the already calculated pseduo header * checksum, thus giving us the complete TCP checksum. * * To put it in simple terms, if checksum(1,2,3,4)=10, * then checksum(1,2,3,4,5) == checksum(10,5). * This property is what allows us to "cheat" and * checksum only the IP payload which has the TCP * th_sum field populated with the pseudo header's * checksum, and not need to futz around checksumming * pseudo header bytes and TCP header/data in one hit. * Refer to RFC 1071 for more info. * * NB: in_cksum_skip(struct mbuf *m, int len, int skip) * in_cksum_skip 2nd argument is NOT the number of * bytes to read from the mbuf at "skip" bytes offset * from the start of the mbuf (very counter intuitive!). * The number of bytes to read is calculated internally * by the function as len-skip i.e. to sum over the IP * payload (TCP header + data) bytes, it is INCORRECT * to call the function like this: * in_cksum_skip(at, ip->ip_len - offset, offset) * Rather, it should be called like this: * in_cksum_skip(at, ip->ip_len, offset) * which means read "ip->ip_len - offset" bytes from * the mbuf cluster "at" at offset "offset" bytes from * the beginning of the "at" mbuf's data pointer. */ th->th_sum = in_cksum_skip(*m, ntohs(ip->ip_len), ip_hl); } /* * XXX: Having to calculate the checksum in software and then * hash over all bytes is really inefficient. Would be nice to * find a way to create the hash and checksum in the same pass * over the bytes. 
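/*
 * Standalone demonstration of the RFC 1071 property relied on above: the
 * ones'-complement sum is associative, so folding a partial sum together
 * with the remaining words gives the same result as summing everything in
 * one pass. This is why a TCP segment whose th_sum already holds the
 * pseudo-header sum can simply be summed in full to obtain the final
 * checksum (illustrative only; ones_sum is not the kernel's in_cksum_skip).
 */
#include <stdint.h>
#include <stdio.h>

static uint16_t
ones_sum(const uint16_t *words, int n, uint32_t sum)
{
	while (n--) {
		sum += *words++;
		sum = (sum & 0xffff) + (sum >> 16);	/* fold the carry */
	}
	return ((uint16_t)sum);
}

int
main(void)
{
	uint16_t data[] = { 0x1234, 0xabcd, 0x00ff, 0xff00, 0x4567 };
	uint16_t partial, folded, all_at_once;

	/* Sum the first four words, then fold the fifth into that result. */
	partial = ones_sum(data, 4, 0);
	folded = ones_sum(&data[4], 1, partial);

	/* Sum all five words in a single pass. */
	all_at_once = ones_sum(data, 5, 0);

	printf("folded=0x%04x one-pass=0x%04x %s\n", folded, all_at_once,
	    folded == all_at_once ? "(equal)" : "(mismatch)");
	return (0);
}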
*/ pn->hash = hash_pkt(*m, ip_hl); } mtx_lock(&siftr_pkt_queue_mtx); STAILQ_INSERT_TAIL(&pkt_queue, pn, nodes); mtx_unlock(&siftr_pkt_queue_mtx); goto ret; inp_unlock: if (inp_locally_locked) INP_RUNLOCK(inp); ret: /* Returning 0 ensures pfil will not discard the pkt */ return (0); } #ifdef SIFTR_IPV6 static int siftr_chkpkt6(void *arg, struct mbuf **m, struct ifnet *ifp, int dir, struct inpcb *inp) { struct pkt_node *pn; struct ip6_hdr *ip6; struct tcphdr *th; struct tcpcb *tp; struct siftr_stats *ss; unsigned int ip6_hl; int inp_locally_locked; inp_locally_locked = 0; ss = DPCPU_PTR(ss); /* * m_pullup is not required here because ip6_{input|output} * already do the heavy lifting for us. */ ip6 = mtod(*m, struct ip6_hdr *); /* * Only continue processing if the packet is TCP * XXX: We should follow the next header fields * as shown on Pg 6 RFC 2460, but right now we'll * only check pkts that have no extension headers. */ if (ip6->ip6_nxt != IPPROTO_TCP) goto ret6; /* * If a kernel subsystem reinjects packets into the stack, our pfil * hook will be called multiple times for the same packet. * Make sure we only process unique packets. */ if (siftr_chkreinject(*m, dir, ss)) goto ret6; if (dir == PFIL_IN) ss->n_in++; else ss->n_out++; ip6_hl = sizeof(struct ip6_hdr); /* * Create a tcphdr struct starting at the correct offset * in the ipv6 packet. ip->ip_hl gives the ip header length * in 4-byte words, so multiply it to get the size in bytes. */ th = (struct tcphdr *)((caddr_t)ip6 + ip6_hl); /* * For inbound packets, the pfil hooks don't provide a pointer to the * inpcb, so we need to find it ourselves and lock it. */ if (!inp) { /* Find the corresponding inpcb for this pkt. */ inp = siftr_findinpcb(INP_IPV6, (struct ip *)ip6, *m, th->th_sport, th->th_dport, dir, ss); if (inp == NULL) goto ret6; else inp_locally_locked = 1; } /* Find the TCP control block that corresponds with this packet. */ tp = intotcpcb(inp); /* * If we can't find the TCP control block (happens occasionaly for a * packet sent during the shutdown phase of a TCP connection), * or we're in the timewait state, bail. */ if (tp == NULL || inp->inp_flags & INP_TIMEWAIT) { if (dir == PFIL_IN) ss->nskip_in_tcpcb++; else ss->nskip_out_tcpcb++; goto inp_unlock6; } pn = malloc(sizeof(struct pkt_node), M_SIFTR_PKTNODE, M_NOWAIT|M_ZERO); if (pn == NULL) { if (dir == PFIL_IN) ss->nskip_in_malloc++; else ss->nskip_out_malloc++; goto inp_unlock6; } siftr_siftdata(pn, inp, tp, INP_IPV6, dir, inp_locally_locked); /* XXX: Figure out how to generate hashes for IPv6 packets. */ mtx_lock(&siftr_pkt_queue_mtx); STAILQ_INSERT_TAIL(&pkt_queue, pn, nodes); mtx_unlock(&siftr_pkt_queue_mtx); goto ret6; inp_unlock6: if (inp_locally_locked) INP_RUNLOCK(inp); ret6: /* Returning 0 ensures pfil will not discard the pkt. 
*/ return (0); } #endif /* #ifdef SIFTR_IPV6 */ static int siftr_pfil(int action) { struct pfil_head *pfh_inet; #ifdef SIFTR_IPV6 struct pfil_head *pfh_inet6; #endif VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); pfh_inet = pfil_head_get(PFIL_TYPE_AF, AF_INET); #ifdef SIFTR_IPV6 pfh_inet6 = pfil_head_get(PFIL_TYPE_AF, AF_INET6); #endif if (action == HOOK) { pfil_add_hook(siftr_chkpkt, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh_inet); #ifdef SIFTR_IPV6 pfil_add_hook(siftr_chkpkt6, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh_inet6); #endif } else if (action == UNHOOK) { pfil_remove_hook(siftr_chkpkt, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh_inet); #ifdef SIFTR_IPV6 pfil_remove_hook(siftr_chkpkt6, NULL, PFIL_IN | PFIL_OUT | PFIL_WAITOK, pfh_inet6); #endif } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); return (0); } static int siftr_sysctl_logfile_name_handler(SYSCTL_HANDLER_ARGS) { struct alq *new_alq; int error; error = sysctl_handle_string(oidp, arg1, arg2, req); /* Check for error or same filename */ if (error != 0 || req->newptr == NULL || strncmp(siftr_logfile, arg1, arg2) == 0) goto done; /* Filname changed */ error = alq_open(&new_alq, arg1, curthread->td_ucred, SIFTR_LOG_FILE_MODE, SIFTR_ALQ_BUFLEN, 0); if (error != 0) goto done; /* * If disabled, siftr_alq == NULL so we simply close * the alq as we've proved it can be opened. * If enabled, close the existing alq and switch the old * for the new. */ if (siftr_alq == NULL) { alq_close(new_alq); } else { alq_close(siftr_alq); siftr_alq = new_alq; } /* Update filename upon success */ strlcpy(siftr_logfile, arg1, arg2); done: return (error); } static int siftr_manage_ops(uint8_t action) { struct siftr_stats totalss; struct timeval tval; struct flow_hash_node *counter, *tmp_counter; struct sbuf *s; int i, key_index, ret, error; uint32_t bytes_to_write, total_skipped_pkts; uint16_t lport, fport; uint8_t *key, ipver; #ifdef SIFTR_IPV6 uint32_t laddr[4]; uint32_t faddr[4]; #else uint8_t laddr[4]; uint8_t faddr[4]; #endif error = 0; total_skipped_pkts = 0; /* Init an autosizing sbuf that initially holds 200 chars. */ if ((s = sbuf_new(NULL, NULL, 200, SBUF_AUTOEXTEND)) == NULL) return (-1); if (action == SIFTR_ENABLE) { /* * Create our alq * XXX: We should abort if alq_open fails! */ alq_open(&siftr_alq, siftr_logfile, curthread->td_ucred, SIFTR_LOG_FILE_MODE, SIFTR_ALQ_BUFLEN, 0); STAILQ_INIT(&pkt_queue); DPCPU_ZERO(ss); siftr_exit_pkt_manager_thread = 0; ret = kthread_add(&siftr_pkt_manager_thread, NULL, NULL, &siftr_pkt_manager_thr, RFNOWAIT, 0, "siftr_pkt_manager_thr"); siftr_pfil(HOOK); microtime(&tval); sbuf_printf(s, "enable_time_secs=%jd\tenable_time_usecs=%06ld\t" "siftrver=%s\thz=%u\ttcp_rtt_scale=%u\tsysname=%s\t" "sysver=%u\tipmode=%u\n", (intmax_t)tval.tv_sec, tval.tv_usec, MODVERSION_STR, hz, TCP_RTT_SCALE, SYS_NAME, __FreeBSD_version, SIFTR_IPMODE); sbuf_finish(s); alq_writen(siftr_alq, sbuf_data(s), sbuf_len(s), ALQ_WAITOK); } else if (action == SIFTR_DISABLE && siftr_pkt_manager_thr != NULL) { /* * Remove the pfil hook functions. All threads currently in * the hook functions are allowed to exit before siftr_pfil() * returns. */ siftr_pfil(UNHOOK); /* This will block until the pkt manager thread unlocks it. */ mtx_lock(&siftr_pkt_mgr_mtx); /* Tell the pkt manager thread that it should exit now. */ siftr_exit_pkt_manager_thread = 1; /* * Wake the pkt_manager thread so it realises that * siftr_exit_pkt_manager_thread == 1 and exits gracefully. 
* The wakeup won't be delivered until we unlock * siftr_pkt_mgr_mtx so this isn't racy. */ wakeup(&wait_for_pkt); /* Wait for the pkt_manager thread to exit. */ mtx_sleep(siftr_pkt_manager_thr, &siftr_pkt_mgr_mtx, PWAIT, "thrwait", 0); siftr_pkt_manager_thr = NULL; mtx_unlock(&siftr_pkt_mgr_mtx); totalss.n_in = DPCPU_VARSUM(ss, n_in); totalss.n_out = DPCPU_VARSUM(ss, n_out); totalss.nskip_in_malloc = DPCPU_VARSUM(ss, nskip_in_malloc); totalss.nskip_out_malloc = DPCPU_VARSUM(ss, nskip_out_malloc); totalss.nskip_in_mtx = DPCPU_VARSUM(ss, nskip_in_mtx); totalss.nskip_out_mtx = DPCPU_VARSUM(ss, nskip_out_mtx); totalss.nskip_in_tcpcb = DPCPU_VARSUM(ss, nskip_in_tcpcb); totalss.nskip_out_tcpcb = DPCPU_VARSUM(ss, nskip_out_tcpcb); totalss.nskip_in_inpcb = DPCPU_VARSUM(ss, nskip_in_inpcb); totalss.nskip_out_inpcb = DPCPU_VARSUM(ss, nskip_out_inpcb); total_skipped_pkts = totalss.nskip_in_malloc + totalss.nskip_out_malloc + totalss.nskip_in_mtx + totalss.nskip_out_mtx + totalss.nskip_in_tcpcb + totalss.nskip_out_tcpcb + totalss.nskip_in_inpcb + totalss.nskip_out_inpcb; microtime(&tval); sbuf_printf(s, "disable_time_secs=%jd\tdisable_time_usecs=%06ld\t" "num_inbound_tcp_pkts=%ju\tnum_outbound_tcp_pkts=%ju\t" "total_tcp_pkts=%ju\tnum_inbound_skipped_pkts_malloc=%u\t" "num_outbound_skipped_pkts_malloc=%u\t" "num_inbound_skipped_pkts_mtx=%u\t" "num_outbound_skipped_pkts_mtx=%u\t" "num_inbound_skipped_pkts_tcpcb=%u\t" "num_outbound_skipped_pkts_tcpcb=%u\t" "num_inbound_skipped_pkts_inpcb=%u\t" "num_outbound_skipped_pkts_inpcb=%u\t" "total_skipped_tcp_pkts=%u\tflow_list=", (intmax_t)tval.tv_sec, tval.tv_usec, (uintmax_t)totalss.n_in, (uintmax_t)totalss.n_out, (uintmax_t)(totalss.n_in + totalss.n_out), totalss.nskip_in_malloc, totalss.nskip_out_malloc, totalss.nskip_in_mtx, totalss.nskip_out_mtx, totalss.nskip_in_tcpcb, totalss.nskip_out_tcpcb, totalss.nskip_in_inpcb, totalss.nskip_out_inpcb, total_skipped_pkts); /* * Iterate over the flow hash, printing a summary of each * flow seen and freeing any malloc'd memory. * The hash consists of an array of LISTs (man 3 queue). 
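 * Each node's key is a packed byte string which the loop below decodes
 * in order:
 *
 *	key[0]		IP version (ipver)
 *	key[1]...	local address, local port, foreign address,
 *			foreign port, copied out via memcpy()
 *
 * The address width depends on whether SIFTR_IPV6 is defined (compare
 * the laddr/faddr declarations above), which is why sizeof(laddr) and
 * friends are used to advance key_index.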
*/ for (i = 0; i <= siftr_hashmask; i++) { LIST_FOREACH_SAFE(counter, counter_hash + i, nodes, tmp_counter) { key = counter->key; key_index = 1; ipver = key[0]; memcpy(laddr, key + key_index, sizeof(laddr)); key_index += sizeof(laddr); memcpy(&lport, key + key_index, sizeof(lport)); key_index += sizeof(lport); memcpy(faddr, key + key_index, sizeof(faddr)); key_index += sizeof(faddr); memcpy(&fport, key + key_index, sizeof(fport)); #ifdef SIFTR_IPV6 laddr[3] = ntohl(laddr[3]); faddr[3] = ntohl(faddr[3]); if (ipver == INP_IPV6) { laddr[0] = ntohl(laddr[0]); laddr[1] = ntohl(laddr[1]); laddr[2] = ntohl(laddr[2]); faddr[0] = ntohl(faddr[0]); faddr[1] = ntohl(faddr[1]); faddr[2] = ntohl(faddr[2]); sbuf_printf(s, "%x:%x:%x:%x:%x:%x:%x:%x;%u-" "%x:%x:%x:%x:%x:%x:%x:%x;%u,", UPPER_SHORT(laddr[0]), LOWER_SHORT(laddr[0]), UPPER_SHORT(laddr[1]), LOWER_SHORT(laddr[1]), UPPER_SHORT(laddr[2]), LOWER_SHORT(laddr[2]), UPPER_SHORT(laddr[3]), LOWER_SHORT(laddr[3]), ntohs(lport), UPPER_SHORT(faddr[0]), LOWER_SHORT(faddr[0]), UPPER_SHORT(faddr[1]), LOWER_SHORT(faddr[1]), UPPER_SHORT(faddr[2]), LOWER_SHORT(faddr[2]), UPPER_SHORT(faddr[3]), LOWER_SHORT(faddr[3]), ntohs(fport)); } else { laddr[0] = FIRST_OCTET(laddr[3]); laddr[1] = SECOND_OCTET(laddr[3]); laddr[2] = THIRD_OCTET(laddr[3]); laddr[3] = FOURTH_OCTET(laddr[3]); faddr[0] = FIRST_OCTET(faddr[3]); faddr[1] = SECOND_OCTET(faddr[3]); faddr[2] = THIRD_OCTET(faddr[3]); faddr[3] = FOURTH_OCTET(faddr[3]); #endif sbuf_printf(s, "%u.%u.%u.%u;%u-%u.%u.%u.%u;%u,", laddr[0], laddr[1], laddr[2], laddr[3], ntohs(lport), faddr[0], faddr[1], faddr[2], faddr[3], ntohs(fport)); #ifdef SIFTR_IPV6 } #endif free(counter, M_SIFTR_HASHNODE); } LIST_INIT(counter_hash + i); } sbuf_printf(s, "\n"); sbuf_finish(s); i = 0; do { bytes_to_write = min(SIFTR_ALQ_BUFLEN, sbuf_len(s)-i); alq_writen(siftr_alq, sbuf_data(s)+i, bytes_to_write, ALQ_WAITOK); i += bytes_to_write; } while (i < sbuf_len(s)); alq_close(siftr_alq); siftr_alq = NULL; } sbuf_delete(s); /* * XXX: Should be using ret to check if any functions fail * and set error appropriately */ return (error); } static int siftr_sysctl_enabled_handler(SYSCTL_HANDLER_ARGS) { if (req->newptr == NULL) goto skip; /* If the value passed in isn't 0 or 1, return an error. */ if (CAST_PTR_INT(req->newptr) != 0 && CAST_PTR_INT(req->newptr) != 1) return (1); /* If we are changing state (0 to 1 or 1 to 0). */ if (CAST_PTR_INT(req->newptr) != siftr_enabled ) if (siftr_manage_ops(CAST_PTR_INT(req->newptr))) { siftr_manage_ops(SIFTR_DISABLE); return (1); } skip: return (sysctl_handle_int(oidp, arg1, arg2, req)); } static void siftr_shutdown_handler(void *arg) { siftr_manage_ops(SIFTR_DISABLE); } /* * Module is being unloaded or machine is shutting down. Take care of cleanup. */ static int deinit_siftr(void) { /* Cleanup. */ siftr_manage_ops(SIFTR_DISABLE); hashdestroy(counter_hash, M_SIFTR, siftr_hashmask); mtx_destroy(&siftr_pkt_queue_mtx); mtx_destroy(&siftr_pkt_mgr_mtx); return (0); } /* * Module has just been loaded into the kernel. */ static int init_siftr(void) { EVENTHANDLER_REGISTER(shutdown_pre_sync, siftr_shutdown_handler, NULL, SHUTDOWN_PRI_FIRST); /* Initialise our flow counter hash table. */ counter_hash = hashinit(SIFTR_EXPECTED_MAX_TCP_FLOWS, M_SIFTR, &siftr_hashmask); mtx_init(&siftr_pkt_queue_mtx, "siftr_pkt_queue_mtx", NULL, MTX_DEF); mtx_init(&siftr_pkt_mgr_mtx, "siftr_pkt_mgr_mtx", NULL, MTX_DEF); /* Print message to the user's current terminal. 
*/ uprintf("\nStatistical Information For TCP Research (SIFTR) %s\n" " http://caia.swin.edu.au/urp/newtcp\n\n", MODVERSION_STR); return (0); } /* * This is the function that is called to load and unload the module. * When the module is loaded, this function is called once with * "what" == MOD_LOAD * When the module is unloaded, this function is called twice with * "what" = MOD_QUIESCE first, followed by "what" = MOD_UNLOAD second * When the system is shut down e.g. CTRL-ALT-DEL or using the shutdown command, * this function is called once with "what" = MOD_SHUTDOWN * When the system is shut down, the handler isn't called until the very end * of the shutdown sequence i.e. after the disks have been synced. */ static int siftr_load_handler(module_t mod, int what, void *arg) { int ret; switch (what) { case MOD_LOAD: ret = init_siftr(); break; case MOD_QUIESCE: case MOD_SHUTDOWN: ret = deinit_siftr(); break; case MOD_UNLOAD: ret = 0; break; default: ret = EINVAL; break; } return (ret); } static moduledata_t siftr_mod = { .name = "siftr", .evhand = siftr_load_handler, }; /* * Param 1: name of the kernel module * Param 2: moduledata_t struct containing info about the kernel module * and the execution entry point for the module * Param 3: From sysinit_sub_id enumeration in /usr/include/sys/kernel.h * Defines the module initialisation order * Param 4: From sysinit_elem_order enumeration in /usr/include/sys/kernel.h * Defines the initialisation order of this kld relative to others * within the same subsystem as defined by param 3 */ -DECLARE_MODULE(siftr, siftr_mod, SI_SUB_SMP, SI_ORDER_ANY); +DECLARE_MODULE(siftr, siftr_mod, SI_SUB_LAST, SI_ORDER_ANY); MODULE_DEPEND(siftr, alq, 1, 1, 1); MODULE_VERSION(siftr, MODVERSION); Index: head/sys/ofed/drivers/infiniband/core/device.c =================================================================== --- head/sys/ofed/drivers/infiniband/core/device.c (revision 296687) +++ head/sys/ofed/drivers/infiniband/core/device.c (revision 296688) @@ -1,793 +1,793 @@ /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include #include #include #include #include #include #include "core_priv.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("core kernel InfiniBand API"); MODULE_LICENSE("Dual BSD/GPL"); struct ib_client_data { struct list_head list; struct ib_client *client; void * data; }; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); static LIST_HEAD(device_list); static LIST_HEAD(client_list); /* * device_mutex protects access to both device_list and client_list. * There's no real point to using multiple locks or something fancier * like an rwsem: we always access both lists, and we're always * modifying one list or the other list. In any case this is not a * hot path so there's no point in trying to optimize. */ static DEFINE_MUTEX(device_mutex); static int ib_device_check_mandatory(struct ib_device *device) { #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device, x), #x } static const struct { size_t offset; char *name; } mandatory_table[] = { IB_MANDATORY_FUNC(query_device), IB_MANDATORY_FUNC(query_port), IB_MANDATORY_FUNC(query_pkey), IB_MANDATORY_FUNC(query_gid), IB_MANDATORY_FUNC(alloc_pd), IB_MANDATORY_FUNC(dealloc_pd), IB_MANDATORY_FUNC(create_ah), IB_MANDATORY_FUNC(destroy_ah), IB_MANDATORY_FUNC(create_qp), IB_MANDATORY_FUNC(modify_qp), IB_MANDATORY_FUNC(destroy_qp), IB_MANDATORY_FUNC(post_send), IB_MANDATORY_FUNC(post_recv), IB_MANDATORY_FUNC(create_cq), IB_MANDATORY_FUNC(destroy_cq), IB_MANDATORY_FUNC(poll_cq), IB_MANDATORY_FUNC(req_notify_cq), IB_MANDATORY_FUNC(get_dma_mr), IB_MANDATORY_FUNC(dereg_mr) }; int i; for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { if (!*(void **) ((void *) device + mandatory_table[i].offset)) { printk(KERN_WARNING "Device %s is missing mandatory function %s\n", device->name, mandatory_table[i].name); return -EINVAL; } } return 0; } static struct ib_device *__ib_device_get_by_name(const char *name) { struct ib_device *device; list_for_each_entry(device, &device_list, core_list) if (!strncmp(name, device->name, IB_DEVICE_NAME_MAX)) return device; return NULL; } static int alloc_name(char *name) { unsigned long *inuse; char buf[IB_DEVICE_NAME_MAX]; struct ib_device *device; int i; inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL); if (!inuse) return -ENOMEM; list_for_each_entry(device, &device_list, core_list) { if (!sscanf(device->name, name, &i)) continue; if (i < 0 || i >= PAGE_SIZE * 8) continue; snprintf(buf, sizeof buf, name, i); if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX)) set_bit(i, inuse); } i = find_first_zero_bit(inuse, PAGE_SIZE * 8); free_page((unsigned long) inuse); snprintf(buf, sizeof buf, name, i); if (__ib_device_get_by_name(buf)) return -ENFILE; strlcpy(name, buf, IB_DEVICE_NAME_MAX); return 0; } static int start_port(struct ib_device *device) { return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1; } static int end_port(struct ib_device *device) { return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : device->phys_port_cnt; } /** * ib_alloc_device - allocate an IB device struct * @size:size of structure to allocate * * Low-level drivers should use ib_alloc_device() to allocate &struct * ib_device. @size is the size of the structure to be allocated, * including any private data used by the low-level driver. * ib_dealloc_device() must be used to free structures allocated with * ib_alloc_device(). 
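 *
 * A typical call from a hardware driver looks roughly like the sketch
 * below; "struct my_hw_dev" and its "ib_dev" member are hypothetical
 * stand-ins for a driver-private structure that embeds struct ib_device
 * as its first member:
 *
 *	struct my_hw_dev *hwdev;
 *
 *	hwdev = (struct my_hw_dev *)ib_alloc_device(sizeof(*hwdev));
 *	if (!hwdev)
 *		return -ENOMEM;
 *	...
 *	ib_dealloc_device(&hwdev->ib_dev);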
*/ struct ib_device *ib_alloc_device(size_t size) { struct ib_device *dev; BUG_ON(size < sizeof (struct ib_device)); dev = kzalloc(size, GFP_KERNEL); spin_lock_init(&dev->cmd_perf_lock); return dev; } EXPORT_SYMBOL(ib_alloc_device); /** * ib_dealloc_device - free an IB device struct * @device:structure to free * * Free a structure allocated with ib_alloc_device(). */ void ib_dealloc_device(struct ib_device *device) { if (device->reg_state == IB_DEV_UNINITIALIZED) { kfree(device); return; } BUG_ON(device->reg_state != IB_DEV_UNREGISTERED); kobject_put(&device->dev.kobj); } EXPORT_SYMBOL(ib_dealloc_device); static int add_client_context(struct ib_device *device, struct ib_client *client) { struct ib_client_data *context; unsigned long flags; context = kmalloc(sizeof *context, GFP_KERNEL); if (!context) { printk(KERN_WARNING "Couldn't allocate client context for %s/%s\n", device->name, client->name); return -ENOMEM; } context->client = client; context->data = NULL; spin_lock_irqsave(&device->client_data_lock, flags); list_add(&context->list, &device->client_data_list); spin_unlock_irqrestore(&device->client_data_lock, flags); return 0; } static int read_port_table_lengths(struct ib_device *device) { struct ib_port_attr *tprops = NULL; int num_ports, ret = -ENOMEM; u8 port_index; tprops = kmalloc(sizeof *tprops, GFP_KERNEL); if (!tprops) goto out; num_ports = end_port(device) - start_port(device) + 1; device->pkey_tbl_len = kmalloc(sizeof *device->pkey_tbl_len * num_ports, GFP_KERNEL); device->gid_tbl_len = kmalloc(sizeof *device->gid_tbl_len * num_ports, GFP_KERNEL); if (!device->pkey_tbl_len || !device->gid_tbl_len) goto err; for (port_index = 0; port_index < num_ports; ++port_index) { ret = ib_query_port(device, port_index + start_port(device), tprops); if (ret) goto err; device->pkey_tbl_len[port_index] = tprops->pkey_tbl_len; device->gid_tbl_len[port_index] = tprops->gid_tbl_len; } ret = 0; goto out; err: kfree(device->gid_tbl_len); kfree(device->pkey_tbl_len); out: kfree(tprops); return ret; } /** * ib_register_device - Register an IB device with IB core * @device:Device to register * * Low-level drivers use ib_register_device() to register their * devices with the IB core. All registered clients will receive a * callback for each device that is added. @device must be allocated * with ib_alloc_device(). 
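 *
 * Illustrative only: a driver that does not need per-port sysfs entries
 * of its own can pass NULL for the port_callback argument:
 *
 *	ret = ib_register_device(device, NULL);
 *	if (ret)
 *		ib_dealloc_device(device);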
*/ int ib_register_device(struct ib_device *device, int (*port_callback)(struct ib_device *, u8, struct kobject *)) { int ret; mutex_lock(&device_mutex); if (strchr(device->name, '%')) { ret = alloc_name(device->name); if (ret) goto out; } if (ib_device_check_mandatory(device)) { ret = -EINVAL; goto out; } INIT_LIST_HEAD(&device->event_handler_list); INIT_LIST_HEAD(&device->client_data_list); spin_lock_init(&device->event_handler_lock); spin_lock_init(&device->client_data_lock); ret = read_port_table_lengths(device); if (ret) { printk(KERN_WARNING "Couldn't create table lengths cache for device %s\n", device->name); goto out; } ret = ib_device_register_sysfs(device, port_callback); if (ret) { printk(KERN_WARNING "Couldn't register device %s with driver model\n", device->name); kfree(device->gid_tbl_len); kfree(device->pkey_tbl_len); goto out; } list_add_tail(&device->core_list, &device_list); device->reg_state = IB_DEV_REGISTERED; { struct ib_client *client; list_for_each_entry(client, &client_list, list) if (client->add && !add_client_context(device, client)) client->add(device); } out: mutex_unlock(&device_mutex); return ret; } EXPORT_SYMBOL(ib_register_device); /** * ib_unregister_device - Unregister an IB device * @device:Device to unregister * * Unregister an IB device. All clients will receive a remove callback. */ void ib_unregister_device(struct ib_device *device) { struct ib_client *client; struct ib_client_data *context, *tmp; unsigned long flags; mutex_lock(&device_mutex); list_for_each_entry_reverse(client, &client_list, list) if (client->remove) client->remove(device); list_del(&device->core_list); kfree(device->gid_tbl_len); kfree(device->pkey_tbl_len); mutex_unlock(&device_mutex); ib_device_unregister_sysfs(device); spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry_safe(context, tmp, &device->client_data_list, list) kfree(context); spin_unlock_irqrestore(&device->client_data_lock, flags); device->reg_state = IB_DEV_UNREGISTERED; } EXPORT_SYMBOL(ib_unregister_device); /** * ib_register_client - Register an IB client * @client:Client to register * * Upper level users of the IB drivers can use ib_register_client() to * register callbacks for IB device addition and removal. When an IB * device is added, each registered client's add method will be called * (in the order the clients were registered), and when a device is * removed, each client's remove method will be called (in the reverse * order that clients were registered). In addition, when * ib_register_client() is called, the client will receive an add * callback for all devices already registered. */ int ib_register_client(struct ib_client *client) { struct ib_device *device; mutex_lock(&device_mutex); list_add_tail(&client->list, &client_list); list_for_each_entry(device, &device_list, core_list) if (client->add && !add_client_context(device, client)) client->add(device); mutex_unlock(&device_mutex); return 0; } EXPORT_SYMBOL(ib_register_client); /** * ib_unregister_client - Unregister an IB client * @client:Client to unregister * * Upper level users use ib_unregister_client() to remove their client * registration. When ib_unregister_client() is called, the client * will receive a remove callback for each IB device still registered. 
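 *
 * A minimal client might be registered as follows (hypothetical names,
 * for illustration only):
 *
 *	static void my_add_one(struct ib_device *device);
 *	static void my_remove_one(struct ib_device *device);
 *
 *	static struct ib_client my_client = {
 *		.name	= "my_client",
 *		.add	= my_add_one,
 *		.remove	= my_remove_one
 *	};
 *
 *	ib_register_client(&my_client);
 *	...
 *	ib_unregister_client(&my_client);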
*/ void ib_unregister_client(struct ib_client *client) { struct ib_client_data *context, *tmp; struct ib_device *device; unsigned long flags; mutex_lock(&device_mutex); list_for_each_entry(device, &device_list, core_list) { if (client->remove) client->remove(device); spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry_safe(context, tmp, &device->client_data_list, list) if (context->client == client) { list_del(&context->list); kfree(context); } spin_unlock_irqrestore(&device->client_data_lock, flags); } list_del(&client->list); mutex_unlock(&device_mutex); } EXPORT_SYMBOL(ib_unregister_client); /** * ib_get_client_data - Get IB client context * @device:Device to get context for * @client:Client to get context for * * ib_get_client_data() returns client context set with * ib_set_client_data(). */ void *ib_get_client_data(struct ib_device *device, struct ib_client *client) { struct ib_client_data *context; void *ret = NULL; unsigned long flags; spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { ret = context->data; break; } spin_unlock_irqrestore(&device->client_data_lock, flags); return ret; } EXPORT_SYMBOL(ib_get_client_data); /** * ib_set_client_data - Set IB client context * @device:Device to set context for * @client:Client to set context for * @data:Context to set * * ib_set_client_data() sets client context that can be retrieved with * ib_get_client_data(). */ void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data) { struct ib_client_data *context; unsigned long flags; spin_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { context->data = data; goto out; } printk(KERN_WARNING "No client context found for %s/%s\n", device->name, client->name); out: spin_unlock_irqrestore(&device->client_data_lock, flags); } EXPORT_SYMBOL(ib_set_client_data); /** * ib_register_event_handler - Register an IB event handler * @event_handler:Handler to register * * ib_register_event_handler() registers an event handler that will be * called back when asynchronous IB events occur (as defined in * chapter 11 of the InfiniBand Architecture Specification). This * callback may occur in interrupt context. */ int ib_register_event_handler (struct ib_event_handler *event_handler) { unsigned long flags; spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); list_add_tail(&event_handler->list, &event_handler->device->event_handler_list); spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); return 0; } EXPORT_SYMBOL(ib_register_event_handler); /** * ib_unregister_event_handler - Unregister an event handler * @event_handler:Handler to unregister * * Unregister an event handler registered with * ib_register_event_handler(). */ int ib_unregister_event_handler(struct ib_event_handler *event_handler) { unsigned long flags; spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); list_del(&event_handler->list); spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); return 0; } EXPORT_SYMBOL(ib_unregister_event_handler); /** * ib_dispatch_event - Dispatch an asynchronous event * @event:Event to dispatch * * Low-level drivers must call ib_dispatch_event() to dispatch the * event to all registered event handlers when an asynchronous event * occurs. 
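 *
 * For example (illustrative; field names follow struct ib_event), a
 * driver reporting that a port came up would do something like:
 *
 *	struct ib_event event;
 *
 *	event.device = ibdev;
 *	event.event = IB_EVENT_PORT_ACTIVE;
 *	event.element.port_num = port;
 *	ib_dispatch_event(&event);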
*/ void ib_dispatch_event(struct ib_event *event) { unsigned long flags; struct ib_event_handler *handler; spin_lock_irqsave(&event->device->event_handler_lock, flags); list_for_each_entry(handler, &event->device->event_handler_list, list) handler->handler(handler, event); spin_unlock_irqrestore(&event->device->event_handler_lock, flags); } EXPORT_SYMBOL(ib_dispatch_event); /** * ib_query_device - Query IB device attributes * @device:Device to query * @device_attr:Device attributes * * ib_query_device() returns the attributes of a device through the * @device_attr pointer. */ int ib_query_device(struct ib_device *device, struct ib_device_attr *device_attr) { return device->query_device(device, device_attr); } EXPORT_SYMBOL(ib_query_device); /** * ib_query_port - Query IB port attributes * @device:Device to query * @port_num:Port number to query * @port_attr:Port attributes * * ib_query_port() returns the attributes of a port through the * @port_attr pointer. */ int ib_query_port(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr) { if (port_num < start_port(device) || port_num > end_port(device)) return -EINVAL; return device->query_port(device, port_num, port_attr); } EXPORT_SYMBOL(ib_query_port); /** * ib_query_gid - Get GID table entry * @device:Device to query * @port_num:Port number to query * @index:GID table index to query * @gid:Returned GID * * ib_query_gid() fetches the specified GID table entry. */ int ib_query_gid(struct ib_device *device, u8 port_num, int index, union ib_gid *gid) { return device->query_gid(device, port_num, index, gid); } EXPORT_SYMBOL(ib_query_gid); /** * ib_query_pkey - Get P_Key table entry * @device:Device to query * @port_num:Port number to query * @index:P_Key table index to query * @pkey:Returned P_Key * * ib_query_pkey() fetches the specified P_Key table entry. */ int ib_query_pkey(struct ib_device *device, u8 port_num, u16 index, u16 *pkey) { return device->query_pkey(device, port_num, index, pkey); } EXPORT_SYMBOL(ib_query_pkey); /** * ib_modify_device - Change IB device attributes * @device:Device to modify * @device_modify_mask:Mask of attributes to change * @device_modify:New attribute values * * ib_modify_device() changes a device's attributes as specified by * the @device_modify_mask and @device_modify structure. */ int ib_modify_device(struct ib_device *device, int device_modify_mask, struct ib_device_modify *device_modify) { if (!device->modify_device) return -ENOSYS; return device->modify_device(device, device_modify_mask, device_modify); } EXPORT_SYMBOL(ib_modify_device); /** * ib_modify_port - Modifies the attributes for the specified port. * @device: The device to modify. * @port_num: The number of the port to modify. * @port_modify_mask: Mask used to specify which attributes of the port * to change. * @port_modify: New attribute values for the port. * * ib_modify_port() changes a port's attributes as specified by the * @port_modify_mask and @port_modify structure. */ int ib_modify_port(struct ib_device *device, u8 port_num, int port_modify_mask, struct ib_port_modify *port_modify) { if (!device->modify_port) return -ENOSYS; if (port_num < start_port(device) || port_num > end_port(device)) return -EINVAL; return device->modify_port(device, port_num, port_modify_mask, port_modify); } EXPORT_SYMBOL(ib_modify_port); /** * ib_find_gid - Returns the port number and GID table index where * a specified GID value occurs. * @device: The device to query. * @gid: The GID value to search for. 
* @port_num: The port number of the device where the GID value was found. * @index: The index into the GID table where the GID was found. This * parameter may be NULL. */ int ib_find_gid(struct ib_device *device, union ib_gid *gid, u8 *port_num, u16 *index) { union ib_gid tmp_gid; int ret, port, i; for (port = start_port(device); port <= end_port(device); ++port) { for (i = 0; i < device->gid_tbl_len[port - start_port(device)]; ++i) { ret = ib_query_gid(device, port, i, &tmp_gid); if (ret) return ret; if (!memcmp(&tmp_gid, gid, sizeof *gid)) { *port_num = port; if (index) *index = i; return 0; } } } return -ENOENT; } EXPORT_SYMBOL(ib_find_gid); /** * ib_find_pkey - Returns the PKey table index where a specified * PKey value occurs. * @device: The device to query. * @port_num: The port number of the device to search for the PKey. * @pkey: The PKey value to search for. * @index: The index into the PKey table where the PKey was found. */ int ib_find_pkey(struct ib_device *device, u8 port_num, u16 pkey, u16 *index) { int ret, i; u16 tmp_pkey; int partial_ix = -1; for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) { ret = ib_query_pkey(device, port_num, i, &tmp_pkey); if (ret) return ret; if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { /* if there is full-member pkey take it.*/ if (tmp_pkey & 0x8000) { *index = i; return 0; } if (partial_ix < 0) partial_ix = i; } } /*no full-member, if exists take the limited*/ if (partial_ix >= 0) { *index = partial_ix; return 0; } return -ENOENT; } EXPORT_SYMBOL(ib_find_pkey); static int __init ib_core_init(void) { int ret; ib_wq = create_workqueue("infiniband"); if (!ib_wq) return -ENOMEM; ret = ib_sysfs_setup(); if (ret) { printk(KERN_WARNING "Couldn't create InfiniBand device class\n"); goto err; } ret = ib_cache_setup(); if (ret) { printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n"); goto err_sysfs; } return 0; err_sysfs: ib_sysfs_cleanup(); err: destroy_workqueue(ib_wq); return ret; } static void __exit ib_core_cleanup(void) { ib_cache_cleanup(); ib_sysfs_cleanup(); /* Make sure that any pending umem accounting work is done. */ destroy_workqueue(ib_wq); } module_init(ib_core_init); module_exit(ib_core_cleanup); static int ibcore_evhand(module_t mod, int event, void *arg) { return (0); } static moduledata_t ibcore_mod = { .name = "ibcore", .evhand = ibcore_evhand, }; MODULE_VERSION(ibcore, 1); MODULE_DEPEND(ibcore, linuxkpi, 1, 1, 1); -DECLARE_MODULE(ibcore, ibcore_mod, SI_SUB_SMP, SI_ORDER_ANY); +DECLARE_MODULE(ibcore, ibcore_mod, SI_SUB_LAST, SI_ORDER_ANY); Index: head/sys/ofed/drivers/infiniband/hw/mlx4/main.c =================================================================== --- head/sys/ofed/drivers/infiniband/hw/mlx4/main.c (revision 296687) +++ head/sys/ofed/drivers/infiniband/hw/mlx4/main.c (revision 296688) @@ -1,2887 +1,2887 @@ /* * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved. * * This software is available to you under a choice of one of two * licenses. 
You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mlx4_ib.h" #include "mlx4_exp.h" #include "user.h" #include "wc.h" #define DRV_NAME MLX4_IB_DRV_NAME #define DRV_VERSION "1.0" #define DRV_RELDATE __DATE__ #define MLX4_IB_DRIVER_PROC_DIR_NAME "driver/mlx4_ib" #define MLX4_IB_MRS_PROC_DIR_NAME "mrs" #define MLX4_IB_FLOW_MAX_PRIO 0xFFF #define MLX4_IB_FLOW_QPN_MASK 0xFFFFFF MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver"); MODULE_LICENSE("Dual BSD/GPL"); #ifdef __linux__ MODULE_VERSION(DRV_VERSION); #endif int mlx4_ib_sm_guid_assign = 1; module_param_named(sm_guid_assign, mlx4_ib_sm_guid_assign, int, 0444); MODULE_PARM_DESC(sm_guid_assign, "Enable SM alias_GUID assignment if sm_guid_assign > 0 (Default: 1)"); enum { MAX_NUM_STR_BITMAP = 1 << 15, DEFAULT_TBL_VAL = -1 }; static struct mlx4_dbdf2val_lst dev_assign_str = { .name = "dev_assign_str param", .num_vals = 1, .def_val = {DEFAULT_TBL_VAL}, .range = {0, MAX_NUM_STR_BITMAP - 1} }; module_param_string(dev_assign_str, dev_assign_str.str, sizeof(dev_assign_str.str), 0444); MODULE_PARM_DESC(dev_assign_str, "Map device function numbers to IB device numbers (e.g. '0000:04:00.0-0,002b:1c:0b.a-1,...').\n" "\t\tHexadecimal digits for the device function (e.g. 002b:1c:0b.a) and decimal for IB device numbers (e.g. 
1).\n" "\t\tMax supported devices - 32"); static unsigned long *dev_num_str_bitmap; static spinlock_t dev_num_str_lock; static const char mlx4_ib_version[] = DRV_NAME ": Mellanox ConnectX InfiniBand driver v" DRV_VERSION " (" DRV_RELDATE ")\n"; struct update_gid_work { struct work_struct work; union ib_gid gids[128]; struct mlx4_ib_dev *dev; int port; }; struct dev_rec { int bus; int dev; int func; int nr; }; static int dr_active; static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init); static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, struct net_device*, unsigned long); static u8 mlx4_ib_get_dev_port(struct net_device *dev, struct mlx4_ib_dev *ibdev); static struct workqueue_struct *wq; static void init_query_mad(struct ib_smp *mad) { mad->base_version = 1; mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED; mad->class_version = 1; mad->method = IB_MGMT_METHOD_GET; } static union ib_gid zgid; static int check_flow_steering_support(struct mlx4_dev *dev) { int eth_num_ports = 0; int ib_num_ports = 0; int dmfs = dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED; if (dmfs) { int i; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH) eth_num_ports++; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) ib_num_ports++; dmfs &= (!ib_num_ports || (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_IPOIB)) && (!eth_num_ports || (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN)); if (ib_num_ports && mlx4_is_mfunc(dev)) { dmfs = 0; } } return dmfs; } int mlx4_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props) { struct mlx4_ib_dev *dev = to_mdev(ibdev); struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; err = mlx4_MAD_IFC(to_mdev(ibdev), MLX4_MAD_IFC_IGNORE_KEYS, 1, NULL, NULL, in_mad, out_mad); if (err) goto out; memset(props, 0, sizeof *props); props->fw_ver = dev->dev->caps.fw_ver; props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT | IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN | IB_DEVICE_BLOCK_MULTICAST_LOOPBACK | IB_DEVICE_SHARED_MR; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR) props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR) props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_APM) props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UD_AV_PORT) props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM) props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM; if (dev->dev->caps.max_gso_sz && dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BLH) props->device_cap_flags |= IB_DEVICE_UD_TSO; if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_RESERVED_LKEY) props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY; if ((dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_LOCAL_INV) && (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_REMOTE_INV) && (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_FAST_REG_WR)) props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) props->device_cap_flags |= IB_DEVICE_XRC; if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_CROSS_CHANNEL) props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL; if (check_flow_steering_support(dev->dev)) props->device_cap_flags |= 
IB_DEVICE_MANAGED_FLOW_STEERING; props->device_cap_flags |= IB_DEVICE_QPG; if (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS) { props->device_cap_flags |= IB_DEVICE_UD_RSS; props->max_rss_tbl_sz = dev->dev->caps.max_rss_tbl_sz; } if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW) props->device_cap_flags |= IB_DEVICE_MEM_WINDOW; if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) { if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_WIN_TYPE_2B) props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2B; else props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2A; } props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) & 0xffffff; props->vendor_part_id = dev->dev->pdev->device; props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&props->sys_image_guid, out_mad->data + 4, 8); props->max_mr_size = ~0ull; props->page_size_cap = dev->dev->caps.page_size_cap; props->max_qp = dev->dev->quotas.qp; props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE; props->max_sge = min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg); props->max_cq = dev->dev->quotas.cq; props->max_cqe = dev->dev->caps.max_cqes; props->max_mr = dev->dev->quotas.mpt; props->max_pd = dev->dev->caps.num_pds - dev->dev->caps.reserved_pds; props->max_qp_rd_atom = dev->dev->caps.max_qp_dest_rdma; props->max_qp_init_rd_atom = dev->dev->caps.max_qp_init_rdma; props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; props->max_srq = dev->dev->quotas.srq; props->max_srq_wr = dev->dev->caps.max_srq_wqes - 1; props->max_srq_sge = dev->dev->caps.max_srq_sge; props->max_fast_reg_page_list_len = MLX4_MAX_FAST_REG_PAGES; props->local_ca_ack_delay = dev->dev->caps.local_ca_ack_delay; props->atomic_cap = dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_ATOMIC ? IB_ATOMIC_HCA : IB_ATOMIC_NONE; props->masked_atomic_cap = props->atomic_cap; props->max_pkeys = dev->dev->caps.pkey_table_len[1]; props->max_mcast_grp = dev->dev->caps.num_mgms + dev->dev->caps.num_amgms; props->max_mcast_qp_attach = dev->dev->caps.num_qp_per_mgm; props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * props->max_mcast_grp; props->max_map_per_fmr = dev->dev->caps.max_fmr_maps; props->hca_core_clock = dev->dev->caps.hca_core_clock; if (dev->dev->caps.hca_core_clock > 0) props->comp_mask |= IB_DEVICE_ATTR_WITH_HCA_CORE_CLOCK; if (dev->dev->caps.cq_timestamp) { props->timestamp_mask = 0xFFFFFFFFFFFF; props->comp_mask |= IB_DEVICE_ATTR_WITH_TIMESTAMP_MASK; } out: kfree(in_mad); kfree(out_mad); return err; } static enum rdma_link_layer mlx4_ib_port_link_layer(struct ib_device *device, u8 port_num) { struct mlx4_dev *dev = to_mdev(device)->dev; return dev->caps.port_mask[port_num] == MLX4_PORT_TYPE_IB ? 
IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET; } static int ib_link_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props, int netw_view) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int ext_active_speed; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; props->lid = be16_to_cpup((__be16 *) (out_mad->data + 16)); props->lmc = out_mad->data[34] & 0x7; props->sm_lid = be16_to_cpup((__be16 *) (out_mad->data + 18)); props->sm_sl = out_mad->data[36] & 0xf; props->state = out_mad->data[32] & 0xf; props->phys_state = out_mad->data[33] >> 4; props->port_cap_flags = be32_to_cpup((__be32 *) (out_mad->data + 20)); if (netw_view) props->gid_tbl_len = out_mad->data[50]; else props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port]; props->max_msg_sz = to_mdev(ibdev)->dev->caps.max_msg_sz; props->pkey_tbl_len = to_mdev(ibdev)->dev->caps.pkey_table_len[port]; props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46)); props->qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad->data + 48)); props->active_width = out_mad->data[31] & 0xf; props->active_speed = out_mad->data[35] >> 4; props->max_mtu = out_mad->data[41] & 0xf; props->active_mtu = out_mad->data[36] >> 4; props->subnet_timeout = out_mad->data[51] & 0x1f; props->max_vl_num = out_mad->data[37] >> 4; props->init_type_reply = out_mad->data[41] >> 4; /* Check if extended speeds (EDR/FDR/...) are supported */ if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) { ext_active_speed = out_mad->data[62] >> 4; switch (ext_active_speed) { case 1: props->active_speed = IB_SPEED_FDR; break; case 2: props->active_speed = IB_SPEED_EDR; break; } } /* If reported active speed is QDR, check if is FDR-10 */ if (props->active_speed == IB_SPEED_QDR) { init_query_mad(in_mad); in_mad->attr_id = MLX4_ATTR_EXTENDED_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; /* Checking LinkSpeedActive for FDR-10 */ if (out_mad->data[15] & 0x1) props->active_speed = IB_SPEED_FDR10; } /* Avoid wrong speed value returned by FW if the IB link is down. */ if (props->state == IB_PORT_DOWN) props->active_speed = IB_SPEED_SDR; out: kfree(in_mad); kfree(out_mad); return err; } static u8 state_to_phys_state(enum ib_port_state state) { return state == IB_PORT_ACTIVE ? 5 : 3; } static int eth_link_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props, int netw_view) { struct mlx4_ib_dev *mdev = to_mdev(ibdev); struct mlx4_ib_iboe *iboe = &mdev->iboe; struct net_device *ndev; enum ib_mtu tmp; struct mlx4_cmd_mailbox *mailbox; unsigned long flags; int err = 0; mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, port, 0, MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); if (err) goto out; props->active_width = (((u8 *)mailbox->buf)[5] == 0x40) ? 
IB_WIDTH_4X : IB_WIDTH_1X; props->active_speed = IB_SPEED_QDR; props->port_cap_flags = IB_PORT_CM_SUP; if (netw_view) props->gid_tbl_len = MLX4_ROCE_MAX_GIDS; else props->gid_tbl_len = mdev->dev->caps.gid_table_len[port]; props->max_msg_sz = mdev->dev->caps.max_msg_sz; props->pkey_tbl_len = 1; props->max_mtu = IB_MTU_4096; props->max_vl_num = 2; props->state = IB_PORT_DOWN; props->phys_state = state_to_phys_state(props->state); props->active_mtu = IB_MTU_256; spin_lock_irqsave(&iboe->lock, flags); ndev = iboe->netdevs[port - 1]; if (!ndev) goto out_unlock; tmp = iboe_get_mtu(ndev->if_mtu); props->active_mtu = tmp ? min(props->max_mtu, tmp) : IB_MTU_256; props->state = (netif_running(ndev) && netif_carrier_ok(ndev)) ? IB_PORT_ACTIVE : IB_PORT_DOWN; props->phys_state = state_to_phys_state(props->state); out_unlock: spin_unlock_irqrestore(&iboe->lock, flags); out: mlx4_free_cmd_mailbox(mdev->dev, mailbox); return err; } int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props, int netw_view) { int err; memset(props, 0, sizeof *props); err = mlx4_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND ? ib_link_query_port(ibdev, port, props, netw_view) : eth_link_query_port(ibdev, port, props, netw_view); return err; } static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props) { /* returns host view */ return __mlx4_ib_query_port(ibdev, port, props, 0); } int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid, int netw_view) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int err = -ENOMEM; struct mlx4_ib_dev *dev = to_mdev(ibdev); int clear = 0; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; in_mad->attr_mod = cpu_to_be32(port); if (mlx4_is_mfunc(dev->dev) && netw_view) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; memcpy(gid->raw, out_mad->data + 8, 8); if (mlx4_is_mfunc(dev->dev) && !netw_view) { if (index) { /* For any index > 0, return the null guid */ err = 0; clear = 1; goto out; } } init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_GUID_INFO; in_mad->attr_mod = cpu_to_be32(index / 8); err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8); out: if (clear) memset(gid->raw + 8, 0, 8); kfree(in_mad); kfree(out_mad); return err; } static int iboe_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { struct mlx4_ib_dev *dev = to_mdev(ibdev); *gid = dev->iboe.gid_table[port - 1][index]; return 0; } static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index, union ib_gid *gid) { if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND) return __mlx4_ib_query_gid(ibdev, port, index, gid, 0); else return iboe_query_gid(ibdev, port, index, gid); } int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey, int netw_view) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE; 
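	/*
	 * Each P_Key table MAD carries a block of 32 entries, so attr_mod
	 * selects the block containing the requested index and the matching
	 * entry is picked out of the response below with (index % 32).
	 */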
in_mad->attr_mod = cpu_to_be32(index / 32); if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL, in_mad, out_mad); if (err) goto out; *pkey = be16_to_cpu(((__be16 *) out_mad->data)[index % 32]); out: kfree(in_mad); kfree(out_mad); return err; } static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey) { return __mlx4_ib_query_pkey(ibdev, port, index, pkey, 0); } static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask, struct ib_device_modify *props) { struct mlx4_cmd_mailbox *mailbox; unsigned long flags; if (mask & ~IB_DEVICE_MODIFY_NODE_DESC) return -EOPNOTSUPP; if (!(mask & IB_DEVICE_MODIFY_NODE_DESC)) return 0; if (mlx4_is_slave(to_mdev(ibdev)->dev)) return -EOPNOTSUPP; spin_lock_irqsave(&to_mdev(ibdev)->sm_lock, flags); memcpy(ibdev->node_desc, props->node_desc, 64); spin_unlock_irqrestore(&to_mdev(ibdev)->sm_lock, flags); /* * If possible, pass node desc to FW, so it can generate * a 144 trap. If cmd fails, just ignore. */ mailbox = mlx4_alloc_cmd_mailbox(to_mdev(ibdev)->dev); if (IS_ERR(mailbox)) return 0; memset(mailbox->buf, 0, 256); memcpy(mailbox->buf, props->node_desc, 64); mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0, MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(to_mdev(ibdev)->dev, mailbox); return 0; } static int mlx4_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols, u32 cap_mask) { struct mlx4_cmd_mailbox *mailbox; int err; u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH; mailbox = mlx4_alloc_cmd_mailbox(dev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); memset(mailbox->buf, 0, 256); if (dev->dev->flags & MLX4_FLAG_OLD_PORT_CMDS) { *(u8 *) mailbox->buf = !!reset_qkey_viols << 6; ((__be32 *) mailbox->buf)[2] = cpu_to_be32(cap_mask); } else { ((u8 *) mailbox->buf)[3] = !!reset_qkey_viols; ((__be32 *) mailbox->buf)[1] = cpu_to_be32(cap_mask); } err = mlx4_cmd(dev->dev, mailbox->dma, port, is_eth, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); mlx4_free_cmd_mailbox(dev->dev, mailbox); return err; } static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, struct ib_port_modify *props) { struct ib_port_attr attr; u32 cap_mask; int err; mutex_lock(&to_mdev(ibdev)->cap_mask_mutex); err = mlx4_ib_query_port(ibdev, port, &attr); if (err) goto out; cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) & ~props->clr_port_cap_mask; err = mlx4_SET_PORT(to_mdev(ibdev), port, !!(mask & IB_PORT_RESET_QKEY_CNTR), cap_mask); out: mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex); return err; } static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { struct mlx4_ib_dev *dev = to_mdev(ibdev); struct mlx4_ib_ucontext *context; struct mlx4_ib_alloc_ucontext_resp_v3 resp_v3; struct mlx4_ib_alloc_ucontext_resp resp; int err; if (!dev->ib_active) return ERR_PTR(-EAGAIN); if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) { resp_v3.qp_tab_size = dev->dev->caps.num_qps; if (mlx4_wc_enabled()) { resp_v3.bf_reg_size = dev->dev->caps.bf_reg_size; resp_v3.bf_regs_per_page = dev->dev->caps.bf_regs_per_page; } else { resp_v3.bf_reg_size = 0; resp_v3.bf_regs_per_page = 0; } } else { resp.dev_caps = dev->dev->caps.userspace_caps; resp.qp_tab_size = dev->dev->caps.num_qps; if (mlx4_wc_enabled()) { resp.bf_reg_size = dev->dev->caps.bf_reg_size; resp.bf_regs_per_page = 
dev->dev->caps.bf_regs_per_page; } else { resp.bf_reg_size = 0; resp.bf_regs_per_page = 0; } resp.cqe_size = dev->dev->caps.cqe_size; } context = kmalloc(sizeof *context, GFP_KERNEL); if (!context) return ERR_PTR(-ENOMEM); err = mlx4_uar_alloc(to_mdev(ibdev)->dev, &context->uar); if (err) { kfree(context); return ERR_PTR(err); } INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3)); else err = ib_copy_to_udata(udata, &resp, sizeof(resp)); if (err) { mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar); kfree(context); return ERR_PTR(-EFAULT); } return &context->ibucontext; } static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) { struct mlx4_ib_ucontext *context = to_mucontext(ibcontext); mlx4_uar_free(to_mdev(ibcontext->device)->dev, &context->uar); kfree(context); return 0; } /* XXX FBSD has no support for get_unmapped_area function */ #if 0 static unsigned long mlx4_ib_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct mm_struct *mm; struct vm_area_struct *vma; unsigned long start_addr; unsigned long page_size_order; unsigned long command; mm = current->mm; if (addr) return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); /* Last 8 bits hold the command others are data per that command */ command = pgoff & MLX4_IB_MMAP_CMD_MASK; if (command != MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES) return current->mm->get_unmapped_area(file, addr, len, pgoff, flags); page_size_order = pgoff >> MLX4_IB_MMAP_CMD_BITS; /* code is based on the huge-pages get_unmapped_area code */ start_addr = mm->free_area_cache; if (len <= mm->cached_hole_size) start_addr = TASK_UNMAPPED_BASE; full_search: addr = ALIGN(start_addr, 1 << page_size_order); for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { /* At this point: (!vma || addr < vma->vm_end). */ if (TASK_SIZE - len < addr) { /* * Start a new search - just in case we missed * some holes. 
*/ if (start_addr != TASK_UNMAPPED_BASE) { start_addr = TASK_UNMAPPED_BASE; goto full_search; } return -ENOMEM; } if (!vma || addr + len <= vma->vm_start) return addr; addr = ALIGN(vma->vm_end, 1 << page_size_order); } } #endif static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) { struct mlx4_ib_dev *dev = to_mdev(context->device); /* Last 8 bits hold the command others are data per that command */ unsigned long command = vma->vm_pgoff & MLX4_IB_MMAP_CMD_MASK; if (command < MLX4_IB_MMAP_GET_CONTIGUOUS_PAGES) { /* compatability handling for commands 0 & 1*/ if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; } if (command == MLX4_IB_MMAP_UAR_PAGE) { vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); if (io_remap_pfn_range(vma, vma->vm_start, to_mucontext(context)->uar.pfn, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; } else if (command == MLX4_IB_MMAP_BLUE_FLAME_PAGE && dev->dev->caps.bf_reg_size != 0) { vma->vm_page_prot = pgprot_wc(vma->vm_page_prot); if (io_remap_pfn_range(vma, vma->vm_start, to_mucontext(context)->uar.pfn + dev->dev->caps.num_uars, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; } else if (command == MLX4_IB_MMAP_GET_HW_CLOCK) { struct mlx4_clock_params params; int ret; ret = mlx4_get_internal_clock_params(dev->dev, ¶ms); if (ret) return ret; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); if (io_remap_pfn_range(vma, vma->vm_start, (pci_resource_start(dev->dev->pdev, params.bar) + params.offset) >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot)) return -EAGAIN; } else return -EINVAL; return 0; } static int mlx4_ib_ioctl(struct ib_ucontext *context, unsigned int cmd, unsigned long arg) { struct mlx4_ib_dev *dev = to_mdev(context->device); int ret; int offset; switch (cmd) { case MLX4_IOCHWCLOCKOFFSET: { struct mlx4_clock_params params; int ret; ret = mlx4_get_internal_clock_params(dev->dev, ¶ms); if (!ret) { offset = params.offset % PAGE_SIZE; ret = put_user(offset, (int *)arg); return sizeof(int); } else { return ret; } } default: { pr_err("mlx4_ib: invalid ioctl %u command with arg %lX\n", cmd, arg); return -ENOTTY; } } return ret; } static int mlx4_ib_query_values(struct ib_device *device, int q_values, struct ib_device_values *values) { struct mlx4_ib_dev *dev = to_mdev(device); cycle_t cycles; values->values_mask = 0; if (q_values & IBV_VALUES_HW_CLOCK) { cycles = mlx4_read_clock(dev->dev); if (cycles < 0) { values->hwclock = cycles & CORE_CLOCK_MASK; values->values_mask |= IBV_VALUES_HW_CLOCK; } q_values &= ~IBV_VALUES_HW_CLOCK; } if (q_values) return -ENOTTY; return 0; } static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) { struct mlx4_ib_pd *pd; int err; pd = kmalloc(sizeof *pd, GFP_KERNEL); if (!pd) return ERR_PTR(-ENOMEM); err = mlx4_pd_alloc(to_mdev(ibdev)->dev, &pd->pdn); if (err) { kfree(pd); return ERR_PTR(err); } if (context) if (ib_copy_to_udata(udata, &pd->pdn, sizeof (__u32))) { mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn); kfree(pd); return ERR_PTR(-EFAULT); } return &pd->ibpd; } static int mlx4_ib_dealloc_pd(struct ib_pd *pd) { mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn); kfree(pd); return 0; } static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev, struct ib_ucontext *context, struct ib_udata *udata) { struct mlx4_ib_xrcd *xrcd; int err; if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)) return ERR_PTR(-ENOSYS); xrcd = kmalloc(sizeof *xrcd, GFP_KERNEL); if (!xrcd) return ERR_PTR(-ENOMEM); err = 
mlx4_xrcd_alloc(to_mdev(ibdev)->dev, &xrcd->xrcdn); if (err) goto err1; xrcd->pd = ib_alloc_pd(ibdev); if (IS_ERR(xrcd->pd)) { err = PTR_ERR(xrcd->pd); goto err2; } xrcd->cq = ib_create_cq(ibdev, NULL, NULL, xrcd, 1, 0); if (IS_ERR(xrcd->cq)) { err = PTR_ERR(xrcd->cq); goto err3; } return &xrcd->ibxrcd; err3: ib_dealloc_pd(xrcd->pd); err2: mlx4_xrcd_free(to_mdev(ibdev)->dev, xrcd->xrcdn); err1: kfree(xrcd); return ERR_PTR(err); } static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd) { ib_destroy_cq(to_mxrcd(xrcd)->cq); ib_dealloc_pd(to_mxrcd(xrcd)->pd); mlx4_xrcd_free(to_mdev(xrcd->device)->dev, to_mxrcd(xrcd)->xrcdn); kfree(xrcd); return 0; } static int add_gid_entry(struct ib_qp *ibqp, union ib_gid *gid) { struct mlx4_ib_qp *mqp = to_mqp(ibqp); struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_gid_entry *ge; ge = kzalloc(sizeof *ge, GFP_KERNEL); if (!ge) return -ENOMEM; ge->gid = *gid; if (mlx4_ib_add_mc(mdev, mqp, gid)) { ge->port = mqp->port; ge->added = 1; } mutex_lock(&mqp->mutex); list_add_tail(&ge->list, &mqp->gid_list); mutex_unlock(&mqp->mutex); return 0; } int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, union ib_gid *gid) { u8 mac[6]; struct net_device *ndev; int ret = 0; if (!mqp->port) return 0; spin_lock(&mdev->iboe.lock); ndev = mdev->iboe.netdevs[mqp->port - 1]; if (ndev) dev_hold(ndev); spin_unlock(&mdev->iboe.lock); if (ndev) { rdma_get_mcast_mac((struct in6_addr *)gid, mac); rtnl_lock(); dev_mc_add(mdev->iboe.netdevs[mqp->port - 1], mac, 6, 0); ret = 1; rtnl_unlock(); dev_put(ndev); } return ret; } struct mlx4_ib_steering { struct list_head list; u64 reg_id; union ib_gid gid; }; static int parse_flow_attr(struct mlx4_dev *dev, union ib_flow_spec *ib_spec, struct _rule_hw *mlx4_spec) { enum mlx4_net_trans_rule_id type; switch (ib_spec->type) { case IB_FLOW_SPEC_ETH: type = MLX4_NET_TRANS_RULE_ID_ETH; memcpy(mlx4_spec->eth.dst_mac, ib_spec->eth.val.dst_mac, ETH_ALEN); memcpy(mlx4_spec->eth.dst_mac_msk, ib_spec->eth.mask.dst_mac, ETH_ALEN); mlx4_spec->eth.vlan_tag = ib_spec->eth.val.vlan_tag; mlx4_spec->eth.vlan_tag_msk = ib_spec->eth.mask.vlan_tag; break; case IB_FLOW_SPEC_IB: type = MLX4_NET_TRANS_RULE_ID_IB; mlx4_spec->ib.l3_qpn = ib_spec->ib.val.l3_type_qpn; mlx4_spec->ib.qpn_mask = ib_spec->ib.mask.l3_type_qpn; memcpy(&mlx4_spec->ib.dst_gid, ib_spec->ib.val.dst_gid, 16); memcpy(&mlx4_spec->ib.dst_gid_msk, ib_spec->ib.mask.dst_gid, 16); break; case IB_FLOW_SPEC_IPV4: type = MLX4_NET_TRANS_RULE_ID_IPV4; mlx4_spec->ipv4.src_ip = ib_spec->ipv4.val.src_ip; mlx4_spec->ipv4.src_ip_msk = ib_spec->ipv4.mask.src_ip; mlx4_spec->ipv4.dst_ip = ib_spec->ipv4.val.dst_ip; mlx4_spec->ipv4.dst_ip_msk = ib_spec->ipv4.mask.dst_ip; break; case IB_FLOW_SPEC_TCP: case IB_FLOW_SPEC_UDP: type = ib_spec->type == IB_FLOW_SPEC_TCP ? 
MLX4_NET_TRANS_RULE_ID_TCP : MLX4_NET_TRANS_RULE_ID_UDP; mlx4_spec->tcp_udp.dst_port = ib_spec->tcp_udp.val.dst_port; mlx4_spec->tcp_udp.dst_port_msk = ib_spec->tcp_udp.mask.dst_port; mlx4_spec->tcp_udp.src_port = ib_spec->tcp_udp.val.src_port; mlx4_spec->tcp_udp.src_port_msk = ib_spec->tcp_udp.mask.src_port; break; default: return -EINVAL; } if (map_sw_to_hw_steering_id(dev, type) < 0 || hw_rule_sz(dev, type) < 0) return -EINVAL; mlx4_spec->id = cpu_to_be16(map_sw_to_hw_steering_id(dev, type)); mlx4_spec->size = hw_rule_sz(dev, type) >> 2; return hw_rule_sz(dev, type); } static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain, enum mlx4_net_trans_promisc_mode flow_type, u64 *reg_id) { int ret, i; int size = 0; void *ib_flow; struct mlx4_ib_dev *mdev = to_mdev(qp->device); struct mlx4_cmd_mailbox *mailbox; struct mlx4_net_trans_rule_hw_ctrl *ctrl; size_t rule_size = sizeof(struct mlx4_net_trans_rule_hw_ctrl) + (sizeof(struct _rule_hw) * flow_attr->num_of_specs); static const u16 __mlx4_domain[] = { [IB_FLOW_DOMAIN_USER] = MLX4_DOMAIN_UVERBS, [IB_FLOW_DOMAIN_ETHTOOL] = MLX4_DOMAIN_ETHTOOL, [IB_FLOW_DOMAIN_RFS] = MLX4_DOMAIN_RFS, [IB_FLOW_DOMAIN_NIC] = MLX4_DOMAIN_NIC, }; if (flow_attr->priority > MLX4_IB_FLOW_MAX_PRIO) { pr_err("Invalid priority value.\n"); return -EINVAL; } if (domain >= IB_FLOW_DOMAIN_NUM) { pr_err("Invalid domain value.\n"); return -EINVAL; } if (map_sw_to_hw_steering_mode(mdev->dev, flow_type) < 0) return -EINVAL; mailbox = mlx4_alloc_cmd_mailbox(mdev->dev); if (IS_ERR(mailbox)) return PTR_ERR(mailbox); memset(mailbox->buf, 0, rule_size); ctrl = mailbox->buf; ctrl->prio = cpu_to_be16(__mlx4_domain[domain] | flow_attr->priority); ctrl->type = map_sw_to_hw_steering_mode(mdev->dev, flow_type); ctrl->port = flow_attr->port; ctrl->qpn = cpu_to_be32(qp->qp_num); if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK) ctrl->flags = (1 << 3); ib_flow = flow_attr + 1; size += sizeof(struct mlx4_net_trans_rule_hw_ctrl); for (i = 0; i < flow_attr->num_of_specs; i++) { ret = parse_flow_attr(mdev->dev, ib_flow, mailbox->buf + size); if (ret < 0) { mlx4_free_cmd_mailbox(mdev->dev, mailbox); return -EINVAL; } ib_flow += ((union ib_flow_spec *)ib_flow)->size; size += ret; } ret = mlx4_cmd_imm(mdev->dev, mailbox->dma, reg_id, size >> 2, 0, MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (ret == -ENOMEM) pr_err("mcg table is full. Fail to register network rule.\n"); else if (ret == -ENXIO) pr_err("Device managed flow steering is disabled. Fail to register network rule.\n"); else if (ret) pr_err("Invalid argumant. Fail to register network rule.\n"); mlx4_free_cmd_mailbox(mdev->dev, mailbox); return ret; } static int __mlx4_ib_destroy_flow(struct mlx4_dev *dev, u64 reg_id) { int err; err = mlx4_cmd(dev, reg_id, 0, 0, MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE); if (err) pr_err("Fail to detach network rule. 
registration id = 0x%llx\n", (unsigned long long)reg_id); return err; } static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain) { int err = 0, i = 0; struct mlx4_ib_flow *mflow; enum mlx4_net_trans_promisc_mode type[2]; memset(type, 0, sizeof(type)); mflow = kzalloc(sizeof(struct mlx4_ib_flow), GFP_KERNEL); if (!mflow) { err = -ENOMEM; goto err_free; } switch (flow_attr->type) { case IB_FLOW_ATTR_NORMAL: type[0] = MLX4_FS_REGULAR; break; case IB_FLOW_ATTR_ALL_DEFAULT: type[0] = MLX4_FS_ALL_DEFAULT; break; case IB_FLOW_ATTR_MC_DEFAULT: type[0] = MLX4_FS_MC_DEFAULT; break; case IB_FLOW_ATTR_SNIFFER: type[0] = MLX4_FS_UC_SNIFFER; type[1] = MLX4_FS_MC_SNIFFER; break; default: err = -EINVAL; goto err_free; } while (i < ARRAY_SIZE(type) && type[i]) { err = __mlx4_ib_create_flow(qp, flow_attr, domain, type[i], &mflow->reg_id[i]); if (err) goto err_free; i++; } return &mflow->ibflow; err_free: kfree(mflow); return ERR_PTR(err); } static int mlx4_ib_destroy_flow(struct ib_flow *flow_id) { int err, ret = 0; int i = 0; struct mlx4_ib_dev *mdev = to_mdev(flow_id->qp->device); struct mlx4_ib_flow *mflow = to_mflow(flow_id); while (i < ARRAY_SIZE(mflow->reg_id) && mflow->reg_id[i]) { err = __mlx4_ib_destroy_flow(mdev->dev, mflow->reg_id[i]); if (err) ret = err; i++; } kfree(mflow); return ret; } static struct mlx4_ib_gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw) { struct mlx4_ib_gid_entry *ge; struct mlx4_ib_gid_entry *tmp; struct mlx4_ib_gid_entry *ret = NULL; list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) { if (!memcmp(raw, ge->gid.raw, 16)) { ret = ge; break; } } return ret; } static int del_gid_entry(struct ib_qp *ibqp, union ib_gid *gid) { struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_qp *mqp = to_mqp(ibqp); struct mlx4_ib_gid_entry *ge; struct net_device *ndev; u8 mac[6]; mutex_lock(&mqp->mutex); ge = find_gid_entry(mqp, gid->raw); if (ge) { spin_lock(&mdev->iboe.lock); ndev = ge->added ? mdev->iboe.netdevs[ge->port - 1] : NULL; if (ndev) dev_hold(ndev); spin_unlock(&mdev->iboe.lock); rdma_get_mcast_mac((struct in6_addr *)gid, mac); if (ndev) { rtnl_lock(); dev_mc_delete(mdev->iboe.netdevs[ge->port - 1], mac, 6, 0); rtnl_unlock(); dev_put(ndev); } list_del(&ge->list); kfree(ge); } else pr_warn("could not find mgid entry\n"); mutex_unlock(&mqp->mutex); return ge != 0 ? 0 : -EINVAL; } static int _mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid, int count) { int err; struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_qp *mqp = to_mqp(ibqp); u64 reg_id = 0; int record_err = 0; if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { struct mlx4_ib_steering *ib_steering; struct mlx4_ib_steering *tmp; LIST_HEAD(temp); mutex_lock(&mqp->mutex); list_for_each_entry_safe(ib_steering, tmp, &mqp->steering_rules, list) { if (memcmp(ib_steering->gid.raw, gid->raw, 16)) continue; if (--count < 0) break; list_del(&ib_steering->list); list_add(&ib_steering->list, &temp); } mutex_unlock(&mqp->mutex); list_for_each_entry_safe(ib_steering, tmp, &temp, list) { reg_id = ib_steering->reg_id; err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, (ibqp->qp_type == IB_QPT_RAW_PACKET) ? 
MLX4_PROT_ETH : MLX4_PROT_IB_IPV6, reg_id); if (err) { record_err = record_err ?: err; continue; } err = del_gid_entry(ibqp, gid); if (err) { record_err = record_err ?: err; continue; } list_del(&ib_steering->list); kfree(ib_steering); } mutex_lock(&mqp->mutex); list_for_each_entry(ib_steering, &temp, list) { list_add(&ib_steering->list, &mqp->steering_rules); } mutex_unlock(&mqp->mutex); if (count) { pr_warn("Couldn't release all reg_ids for mgid. Steering rule is left attached\n"); return -EINVAL; } } else { if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_B0 && ibqp->qp_type == IB_QPT_RAW_PACKET) gid->raw[5] = mqp->port; err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, (ibqp->qp_type == IB_QPT_RAW_PACKET) ? MLX4_PROT_ETH : MLX4_PROT_IB_IPV6, reg_id); if (err) return err; err = del_gid_entry(ibqp, gid); if (err) return err; } return record_err; } static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); int count = (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) ? mdev->dev->caps.num_ports : 1; return _mlx4_ib_mcg_detach(ibqp, gid, lid, count); } static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { int err = -ENODEV; struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); struct mlx4_ib_qp *mqp = to_mqp(ibqp); DECLARE_BITMAP(ports, MLX4_MAX_PORTS); int i = 0; if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_B0 && ibqp->qp_type == IB_QPT_RAW_PACKET) gid->raw[5] = mqp->port; if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { bitmap_fill(ports, mdev->dev->caps.num_ports); } else { if (mqp->port <= mdev->dev->caps.num_ports) { bitmap_zero(ports, mdev->dev->caps.num_ports); set_bit(0, ports); } else { return -EINVAL; } } for (; i < mdev->dev->caps.num_ports; i++) { u64 reg_id; struct mlx4_ib_steering *ib_steering = NULL; if (!test_bit(i, ports)) continue; if (mdev->dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { ib_steering = kmalloc(sizeof(*ib_steering), GFP_KERNEL); if (!ib_steering) goto err_add; } err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, i + 1, !!(mqp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK), (ibqp->qp_type == IB_QPT_RAW_PACKET) ? 
MLX4_PROT_ETH : MLX4_PROT_IB_IPV6, &reg_id); if (err) { kfree(ib_steering); goto err_add; } err = add_gid_entry(ibqp, gid); if (err) { mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw, MLX4_PROT_IB_IPV6, reg_id); kfree(ib_steering); goto err_add; } if (ib_steering) { memcpy(ib_steering->gid.raw, gid->raw, 16); mutex_lock(&mqp->mutex); list_add(&ib_steering->list, &mqp->steering_rules); mutex_unlock(&mqp->mutex); ib_steering->reg_id = reg_id; } } return 0; err_add: if (i > 0) _mlx4_ib_mcg_detach(ibqp, gid, lid, i); return err; } static int init_node_data(struct mlx4_ib_dev *dev) { struct ib_smp *in_mad = NULL; struct ib_smp *out_mad = NULL; int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS; int err = -ENOMEM; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); if (!in_mad || !out_mad) goto out; init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_NODE_DESC; if (mlx4_is_master(dev->dev)) mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW; err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad); if (err) goto out; memcpy(dev->ib_dev.node_desc, out_mad->data, 64); in_mad->attr_id = IB_SMP_ATTR_NODE_INFO; err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad); if (err) goto out; dev->dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32)); memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8); out: kfree(in_mad); kfree(out_mad); return err; } static ssize_t show_hca(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "MT%d\n", dev->dev->pdev->device); } static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "%d.%d.%d\n", (int) (dev->dev->caps.fw_ver >> 32), (int) (dev->dev->caps.fw_ver >> 16) & 0xffff, (int) dev->dev->caps.fw_ver & 0xffff); } static ssize_t show_rev(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "%x\n", dev->dev->rev_id); } static ssize_t show_board(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN, dev->dev->board_id); } static ssize_t show_vsd(struct device *device, struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); ssize_t len = MLX4_VSD_LEN; if (dev->dev->vsd_vendor_id == PCI_VENDOR_ID_MELLANOX) len = sprintf(buf, "%.*s\n", MLX4_VSD_LEN, dev->dev->vsd); else memcpy(buf, dev->dev->vsd, MLX4_VSD_LEN); return len; } static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); static DEVICE_ATTR(vsd, S_IRUGO, show_vsd, NULL); static struct device_attribute *mlx4_class_attributes[] = { &dev_attr_hw_rev, &dev_attr_fw_ver, &dev_attr_hca_type, &dev_attr_board_id, &dev_attr_vsd }; static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id, struct net_device *dev, u8 port) { memcpy(eui, IF_LLADDR(dev), 3); memcpy(eui + 5, IF_LLADDR(dev) + 3, 3); if (vlan_id < 0x1000) { eui[3] = vlan_id >> 8; eui[4] = vlan_id & 0xff; } else { eui[3] = 0xff; eui[4] = 0xfe; } eui[0] ^= 2; } static void
update_gids_task(struct work_struct *work) { struct update_gid_work *gw = container_of(work, struct update_gid_work, work); struct mlx4_cmd_mailbox *mailbox; union ib_gid *gids; int err; struct mlx4_dev *dev = gw->dev->dev; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) { pr_warn("update gid table failed %ld\n", PTR_ERR(mailbox)); goto free; } gids = mailbox->buf; memcpy(gids, gw->gids, sizeof gw->gids); if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, gw->port) == IB_LINK_LAYER_ETHERNET) { err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | gw->port, 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); if (err) pr_warn("set port command failed\n"); else mlx4_ib_dispatch_event(gw->dev, gw->port, IB_EVENT_GID_CHANGE); } mlx4_free_cmd_mailbox(dev, mailbox); free: kfree(gw); } static void reset_gids_task(struct work_struct *work) { struct update_gid_work *gw = container_of(work, struct update_gid_work, work); struct mlx4_cmd_mailbox *mailbox; union ib_gid *gids; int err; struct mlx4_dev *dev = gw->dev->dev; mailbox = mlx4_alloc_cmd_mailbox(dev); if (IS_ERR(mailbox)) { pr_warn("reset gid table failed\n"); goto free; } gids = mailbox->buf; memcpy(gids, gw->gids, sizeof(gw->gids)); if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET && dev->caps.num_ports > 0) { err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | 1, 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); if (err) pr_warn("set port 1 command failed\n"); } if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, 2) == IB_LINK_LAYER_ETHERNET && dev->caps.num_ports > 1) { err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | 2, 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); if (err) pr_warn("set port 2 command failed\n"); } mlx4_free_cmd_mailbox(dev, mailbox); free: kfree(gw); } static int update_gid_table(struct mlx4_ib_dev *dev, int port, union ib_gid *gid, int clear, int default_gid) { struct update_gid_work *work; int i; int need_update = 0; int free = -1; int found = -1; int max_gids; int start_index = !default_gid; max_gids = dev->dev->caps.gid_table_len[port]; for (i = start_index; i < max_gids; ++i) { if (!memcmp(&dev->iboe.gid_table[port - 1][i], gid, sizeof(*gid))) found = i; if (clear) { if (found >= 0) { need_update = 1; dev->iboe.gid_table[port - 1][found] = zgid; break; } } else { if (found >= 0) break; if (free < 0 && !memcmp(&dev->iboe.gid_table[port - 1][i], &zgid, sizeof(*gid))) free = i; } } if (found == -1 && !clear && free < 0) { pr_err("GID table of port %d is full. 
Can't add "GID_PRINT_FMT"\n", port, GID_PRINT_ARGS(gid)); return -ENOMEM; } if (found == -1 && clear) { pr_err(GID_PRINT_FMT" is not in GID table of port %d\n", GID_PRINT_ARGS(gid), port); return -EINVAL; } if (found == -1 && !clear && free >= 0) { dev->iboe.gid_table[port - 1][free] = *gid; need_update = 1; } if (!need_update) return 0; work = kzalloc(sizeof *work, GFP_ATOMIC); if (!work) return -ENOMEM; memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof(work->gids)); INIT_WORK(&work->work, update_gids_task); work->port = port; work->dev = dev; queue_work(wq, &work->work); return 0; } static int reset_gid_table(struct mlx4_ib_dev *dev) { struct update_gid_work *work; work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) return -ENOMEM; memset(dev->iboe.gid_table, 0, sizeof(dev->iboe.gid_table)); memset(work->gids, 0, sizeof(work->gids)); INIT_WORK(&work->work, reset_gids_task); work->dev = dev; queue_work(wq, &work->work); return 0; } /* XXX BOND Related - stub (no support for these flags in FBSD)*/ static inline int netif_is_bond_master(struct net_device *dev) { #if 0 return (dev->flags & IFF_MASTER) && (dev->priv_flags & IFF_BONDING); #endif return 0; } static void mlx4_make_default_gid(struct net_device *dev, union ib_gid *gid, u8 port) { gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL); mlx4_addrconf_ifid_eui48(&gid->raw[8], 0xffff, dev, port); } static u8 mlx4_ib_get_dev_port(struct net_device *dev, struct mlx4_ib_dev *ibdev) { u8 port = 0; struct mlx4_ib_iboe *iboe; struct net_device *real_dev = rdma_vlan_dev_real_dev(dev) ? rdma_vlan_dev_real_dev(dev) : dev; iboe = &ibdev->iboe; for (port = 1; port <= MLX4_MAX_PORTS; ++port) if ((netif_is_bond_master(real_dev) && (real_dev == iboe->masters[port - 1])) || (!netif_is_bond_master(real_dev) && (real_dev == iboe->netdevs[port - 1]))) break; return port > MLX4_MAX_PORTS ? 
0 : port; } static void mlx4_ib_get_dev_addr(struct net_device *dev, struct mlx4_ib_dev *ibdev, u8 port) { struct ifaddr *ifa; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) struct inet6_dev *in6_dev; union ib_gid *pgid; struct inet6_ifaddr *ifp; #endif union ib_gid gid; if ((port == 0) || (port > MLX4_MAX_PORTS)) return; /* IPv4 gids */ TAILQ_FOREACH(ifa, &dev->if_addrhead, ifa_link) { if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET){ ipv6_addr_set_v4mapped( ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr.s_addr, (struct in6_addr *)&gid); update_gid_table(ibdev, port, &gid, 0, 0); } } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) /* IPv6 gids */ in6_dev = in6_dev_get(dev); if (in6_dev) { read_lock_bh(&in6_dev->lock); list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { pgid = (union ib_gid *)&ifp->addr; update_gid_table(ibdev, port, pgid, 0, 0); } read_unlock_bh(&in6_dev->lock); in6_dev_put(in6_dev); } #endif } static void mlx4_set_default_gid(struct mlx4_ib_dev *ibdev, struct net_device *dev, u8 port) { union ib_gid gid; mlx4_make_default_gid(dev, &gid, port); update_gid_table(ibdev, port, &gid, 0, 1); } static int mlx4_ib_init_gid_table(struct mlx4_ib_dev *ibdev) { struct net_device *dev; if (reset_gid_table(ibdev)) return -1; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(dev, &V_ifnet, if_link) { u8 port = mlx4_ib_get_dev_port(dev, ibdev); if (port) { if (!rdma_vlan_dev_real_dev(dev) && !netif_is_bond_master(dev)) mlx4_set_default_gid(ibdev, dev, port); mlx4_ib_get_dev_addr(dev, ibdev, port); } } IFNET_RUNLOCK_NOSLEEP(); return 0; } static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev, struct net_device *dev, unsigned long event) { struct mlx4_ib_iboe *iboe; int port; int init = 0; unsigned long flags; iboe = &ibdev->iboe; spin_lock_irqsave(&iboe->lock, flags); mlx4_foreach_ib_transport_port(port, ibdev->dev) { struct net_device *old_netdev = iboe->netdevs[port - 1]; /* XXX BOND related */ #if 0 struct net_device *old_master = iboe->masters[port - 1]; #endif iboe->masters[port - 1] = NULL; iboe->netdevs[port - 1] = mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port); if (old_netdev != iboe->netdevs[port - 1]) init = 1; if (dev == iboe->netdevs[port - 1] && event == NETDEV_CHANGEADDR) init = 1; /* XXX BOND related */ #if 0 if (iboe->netdevs[port - 1] && netif_is_bond_slave(iboe->netdevs[port - 1])) iboe->masters[port - 1] = iboe->netdevs[port - 1]->master; /* if bonding is used it is possible that we add it to masters only after IP address is assigned to the net bonding interface */ if (old_master != iboe->masters[port - 1]) init = 1; #endif } spin_unlock_irqrestore(&iboe->lock, flags); if (init) if (mlx4_ib_init_gid_table(ibdev)) pr_warn("Fail to reset gid table\n"); } static int mlx4_ib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = ptr; struct mlx4_ib_dev *ibdev; ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb); mlx4_ib_scan_netdevs(ibdev, dev, event); return NOTIFY_DONE; } /* This function initializes the gid table only if the event_netdev real device is an iboe * device, will be invoked by the inet/inet6 events */ static int mlx4_ib_inet_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *event_netdev = ptr; struct mlx4_ib_dev *ibdev; struct mlx4_ib_iboe *ibdev_iboe; int port = 0; ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet); struct net_device *real_dev = rdma_vlan_dev_real_dev(event_netdev) ? 
rdma_vlan_dev_real_dev(event_netdev) : event_netdev; ibdev_iboe = &ibdev->iboe; port = mlx4_ib_get_dev_port(real_dev, ibdev); /* Initialize the GID table only if the event's real_dev is the net_device * that represents this port; otherwise the event is unrelated and ignored. */ if (port && (real_dev == ibdev_iboe->netdevs[port - 1])) if (mlx4_ib_init_gid_table(ibdev)) pr_warn("Fail to reset gid table\n"); return NOTIFY_DONE; } static void init_pkeys(struct mlx4_ib_dev *ibdev) { int port; int slave; int i; if (mlx4_is_master(ibdev->dev)) { for (slave = 0; slave <= ibdev->dev->num_vfs; ++slave) { for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { for (i = 0; i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; ++i) { ibdev->pkeys.virt2phys_pkey[slave][port - 1][i] = /* master has the identity virt2phys pkey mapping */ (slave == mlx4_master_func_num(ibdev->dev) || !i) ? i : ibdev->dev->phys_caps.pkey_phys_table_len[port] - 1; mlx4_sync_pkey_table(ibdev->dev, slave, port, i, ibdev->pkeys.virt2phys_pkey[slave][port - 1][i]); } } } /* initialize pkey cache */ for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) { for (i = 0; i < ibdev->dev->phys_caps.pkey_phys_table_len[port]; ++i) ibdev->pkeys.phys_pkey_cache[port-1][i] = (i) ? 0 : 0xFFFF; } } } static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) { char name[32]; int eq_per_port = 0; int added_eqs = 0; int total_eqs = 0; int i, j, eq; /* Legacy mode or comp_pool is not large enough */ if (dev->caps.comp_pool == 0 || dev->caps.num_ports > dev->caps.comp_pool) return; eq_per_port = rounddown_pow_of_two(dev->caps.comp_pool/ dev->caps.num_ports); /* Init eq table */ added_eqs = 0; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) added_eqs += eq_per_port; total_eqs = dev->caps.num_comp_vectors + added_eqs; ibdev->eq_table = kzalloc(total_eqs * sizeof(int), GFP_KERNEL); if (!ibdev->eq_table) return; ibdev->eq_added = added_eqs; eq = 0; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) { for (j = 0; j < eq_per_port; j++) { sprintf(name, "mlx4-ib-%d-%d@%d:%d:%d:%d", i, j, pci_get_domain(dev->pdev->dev.bsddev), pci_get_bus(dev->pdev->dev.bsddev), PCI_SLOT(dev->pdev->devfn), PCI_FUNC(dev->pdev->devfn)); /* Set IRQ for specific name (per ring) */ if (mlx4_assign_eq(dev, name, &ibdev->eq_table[eq])) { /* Use legacy (same as mlx4_en driver) */ pr_warn("Can't allocate EQ %d; reverting to legacy\n", eq); ibdev->eq_table[eq] = (eq % dev->caps.num_comp_vectors); } eq++; } } /* Fill the rest of the vector with legacy EQs */ for (i = 0, eq = added_eqs; i < dev->caps.num_comp_vectors; i++) ibdev->eq_table[eq++] = i; /* Advertise the new number of EQs to clients */ ibdev->ib_dev.num_comp_vectors = total_eqs; } static void mlx4_ib_free_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev) { int i; /* no additional eqs were added */ if (!ibdev->eq_table) return; /* Reset the advertised EQ number */ ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; /* Free only the added eqs */ for (i = 0; i < ibdev->eq_added; i++) { /* Don't free legacy eqs if used */ if (ibdev->eq_table[i] <= dev->caps.num_comp_vectors) continue; mlx4_release_eq(dev, ibdev->eq_table[i]); } kfree(ibdev->eq_table); } /* * create show function and a device_attribute struct pointing to * the function for _name */ #define DEVICE_DIAG_RPRT_ATTR(_name, _offset, _op_mod) \ static ssize_t show_rprt_##_name(struct device *dev, \ struct device_attribute *attr, \ char *buf){ \ return show_diag_rprt(dev, buf, _offset, _op_mod); \ } \ static DEVICE_ATTR(_name,
S_IRUGO, show_rprt_##_name, NULL); #define MLX4_DIAG_RPRT_CLEAR_DIAGS 3 static size_t show_diag_rprt(struct device *device, char *buf, u32 offset, u8 op_modifier) { size_t ret; u32 counter_offset = offset; u32 diag_counter = 0; struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); ret = mlx4_query_diag_counters(dev->dev, 1, op_modifier, &counter_offset, &diag_counter); if (ret) return ret; return sprintf(buf, "%d\n", diag_counter); } static ssize_t clear_diag_counters(struct device *device, struct device_attribute *attr, const char *buf, size_t length) { size_t ret; struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); ret = mlx4_query_diag_counters(dev->dev, 0, MLX4_DIAG_RPRT_CLEAR_DIAGS, NULL, NULL); if (ret) return ret; return length; } DEVICE_DIAG_RPRT_ATTR(rq_num_lle , 0x00, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_lle , 0x04, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_lqpoe , 0x08, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_lqpoe , 0x0C, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_lpe , 0x18, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_lpe , 0x1C, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_wrfe , 0x20, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_wrfe , 0x24, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_mwbe , 0x2C, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_bre , 0x34, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_lae , 0x38, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_rire , 0x44, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_rire , 0x48, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_rae , 0x4C, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_rae , 0x50, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_roe , 0x54, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_tree , 0x5C, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_rree , 0x64, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_rnr , 0x68, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_rnr , 0x6C, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_oos , 0x100, 2); DEVICE_DIAG_RPRT_ATTR(sq_num_oos , 0x104, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_mce , 0x108, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_udsdprd , 0x118, 2); DEVICE_DIAG_RPRT_ATTR(rq_num_ucsdprd , 0x120, 2); DEVICE_DIAG_RPRT_ATTR(num_cqovf , 0x1A0, 2); DEVICE_DIAG_RPRT_ATTR(num_eqovf , 0x1A4, 2); DEVICE_DIAG_RPRT_ATTR(num_baddb , 0x1A8, 2); static DEVICE_ATTR(clear_diag, S_IWUSR, NULL, clear_diag_counters); static struct attribute *diag_rprt_attrs[] = { &dev_attr_rq_num_lle.attr, &dev_attr_sq_num_lle.attr, &dev_attr_rq_num_lqpoe.attr, &dev_attr_sq_num_lqpoe.attr, &dev_attr_rq_num_lpe.attr, &dev_attr_sq_num_lpe.attr, &dev_attr_rq_num_wrfe.attr, &dev_attr_sq_num_wrfe.attr, &dev_attr_sq_num_mwbe.attr, &dev_attr_sq_num_bre.attr, &dev_attr_rq_num_lae.attr, &dev_attr_sq_num_rire.attr, &dev_attr_rq_num_rire.attr, &dev_attr_sq_num_rae.attr, &dev_attr_rq_num_rae.attr, &dev_attr_sq_num_roe.attr, &dev_attr_sq_num_tree.attr, &dev_attr_sq_num_rree.attr, &dev_attr_rq_num_rnr.attr, &dev_attr_sq_num_rnr.attr, &dev_attr_rq_num_oos.attr, &dev_attr_sq_num_oos.attr, &dev_attr_rq_num_mce.attr, &dev_attr_rq_num_udsdprd.attr, &dev_attr_rq_num_ucsdprd.attr, &dev_attr_num_cqovf.attr, &dev_attr_num_eqovf.attr, &dev_attr_num_baddb.attr, &dev_attr_clear_diag.attr, NULL }; static struct attribute_group diag_counters_group = { .name = "diag_counters", .attrs = diag_rprt_attrs }; static void init_dev_assign(void) { int i = 1; spin_lock_init(&dev_num_str_lock); if (mlx4_fill_dbdf2val_tbl(&dev_assign_str)) return; dev_num_str_bitmap = kmalloc(BITS_TO_LONGS(MAX_NUM_STR_BITMAP) * sizeof(long), GFP_KERNEL); if (!dev_num_str_bitmap) { pr_warn("bitmap alloc failed -- cannot apply dev_assign_str parameter\n"); return; } bitmap_zero(dev_num_str_bitmap, MAX_NUM_STR_BITMAP); while ((i < MLX4_DEVS_TBL_SIZE) && 
(dev_assign_str.tbl[i].dbdf != MLX4_ENDOF_TBL)) { if (bitmap_allocate_region(dev_num_str_bitmap, dev_assign_str.tbl[i].val[0], 0)) goto err; i++; } dr_active = 1; return; err: kfree(dev_num_str_bitmap); dev_num_str_bitmap = NULL; pr_warn("mlx4_ib: The value of 'dev_assign_str' parameter " "is incorrect. The parameter value is discarded!"); } static int mlx4_ib_dev_idx(struct mlx4_dev *dev) { int i, val; if (!dr_active) return -1; if (!dev) return -1; if (mlx4_get_val(dev_assign_str.tbl, dev->pdev, 0, &val)) return -1; if (val != DEFAULT_TBL_VAL) { dev->flags |= MLX4_FLAG_DEV_NUM_STR; return val; } spin_lock(&dev_num_str_lock); i = bitmap_find_free_region(dev_num_str_bitmap, MAX_NUM_STR_BITMAP, 0); spin_unlock(&dev_num_str_lock); if (i >= 0) return i; return -1; } static void *mlx4_ib_add(struct mlx4_dev *dev) { struct mlx4_ib_dev *ibdev; int num_ports = 0; int i, j; int err; struct mlx4_ib_iboe *iboe; int dev_idx; pr_info_once("%s", mlx4_ib_version); mlx4_foreach_ib_transport_port(i, dev) num_ports++; /* No point in registering a device with no ports... */ if (num_ports == 0) return NULL; ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev); if (!ibdev) { dev_err(&dev->pdev->dev, "Device struct alloc failed\n"); return NULL; } iboe = &ibdev->iboe; if (mlx4_pd_alloc(dev, &ibdev->priv_pdn)) goto err_dealloc; if (mlx4_uar_alloc(dev, &ibdev->priv_uar)) goto err_pd; ibdev->priv_uar.map = ioremap(ibdev->priv_uar.pfn << PAGE_SHIFT, PAGE_SIZE); if (!ibdev->priv_uar.map) goto err_uar; MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock); ibdev->dev = dev; dev_idx = mlx4_ib_dev_idx(dev); if (dev_idx >= 0) sprintf(ibdev->ib_dev.name, "mlx4_%d", dev_idx); else strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX); ibdev->ib_dev.owner = THIS_MODULE; ibdev->ib_dev.node_type = RDMA_NODE_IB_CA; ibdev->ib_dev.local_dma_lkey = dev->caps.reserved_lkey; ibdev->num_ports = num_ports; ibdev->ib_dev.phys_port_cnt = ibdev->num_ports; ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors; ibdev->ib_dev.dma_device = &dev->pdev->dev; if (dev->caps.userspace_caps) ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION; else ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION; ibdev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | (1ull << IB_USER_VERBS_CMD_REG_MR) | (1ull << IB_USER_VERBS_CMD_DEREG_MR) | (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | (1ull << IB_USER_VERBS_CMD_CREATE_QP) | (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | (1ull << IB_USER_VERBS_CMD_QUERY_QP) | (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) | (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) | (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) | (1ull << IB_USER_VERBS_CMD_OPEN_QP); ibdev->ib_dev.query_device = mlx4_ib_query_device; ibdev->ib_dev.query_port = mlx4_ib_query_port; ibdev->ib_dev.get_link_layer = mlx4_ib_port_link_layer; ibdev->ib_dev.query_gid = mlx4_ib_query_gid; ibdev->ib_dev.query_pkey = mlx4_ib_query_pkey; ibdev->ib_dev.modify_device = mlx4_ib_modify_device; ibdev->ib_dev.modify_port = mlx4_ib_modify_port; 
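/* The remaining ib_device verbs below (ucontext, PD, AH, SRQ, QP, CQ, MR, multicast attach/detach and MAD processing) are likewise wired to their mlx4 implementations. */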
ibdev->ib_dev.alloc_ucontext = mlx4_ib_alloc_ucontext; ibdev->ib_dev.dealloc_ucontext = mlx4_ib_dealloc_ucontext; ibdev->ib_dev.mmap = mlx4_ib_mmap; /* XXX FBSD has no support for get_unmapped_area function */ #if 0 ibdev->ib_dev.get_unmapped_area = mlx4_ib_get_unmapped_area; #endif ibdev->ib_dev.alloc_pd = mlx4_ib_alloc_pd; ibdev->ib_dev.dealloc_pd = mlx4_ib_dealloc_pd; ibdev->ib_dev.create_ah = mlx4_ib_create_ah; ibdev->ib_dev.query_ah = mlx4_ib_query_ah; ibdev->ib_dev.destroy_ah = mlx4_ib_destroy_ah; ibdev->ib_dev.create_srq = mlx4_ib_create_srq; ibdev->ib_dev.modify_srq = mlx4_ib_modify_srq; ibdev->ib_dev.query_srq = mlx4_ib_query_srq; ibdev->ib_dev.destroy_srq = mlx4_ib_destroy_srq; ibdev->ib_dev.post_srq_recv = mlx4_ib_post_srq_recv; ibdev->ib_dev.create_qp = mlx4_ib_create_qp; ibdev->ib_dev.modify_qp = mlx4_ib_modify_qp; ibdev->ib_dev.query_qp = mlx4_ib_query_qp; ibdev->ib_dev.destroy_qp = mlx4_ib_destroy_qp; ibdev->ib_dev.post_send = mlx4_ib_post_send; ibdev->ib_dev.post_recv = mlx4_ib_post_recv; ibdev->ib_dev.create_cq = mlx4_ib_create_cq; ibdev->ib_dev.modify_cq = mlx4_ib_modify_cq; ibdev->ib_dev.resize_cq = mlx4_ib_resize_cq; ibdev->ib_dev.destroy_cq = mlx4_ib_destroy_cq; ibdev->ib_dev.poll_cq = mlx4_ib_poll_cq; ibdev->ib_dev.req_notify_cq = mlx4_ib_arm_cq; ibdev->ib_dev.get_dma_mr = mlx4_ib_get_dma_mr; ibdev->ib_dev.reg_user_mr = mlx4_ib_reg_user_mr; ibdev->ib_dev.dereg_mr = mlx4_ib_dereg_mr; ibdev->ib_dev.alloc_fast_reg_mr = mlx4_ib_alloc_fast_reg_mr; ibdev->ib_dev.alloc_fast_reg_page_list = mlx4_ib_alloc_fast_reg_page_list; ibdev->ib_dev.free_fast_reg_page_list = mlx4_ib_free_fast_reg_page_list; ibdev->ib_dev.attach_mcast = mlx4_ib_mcg_attach; ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach; ibdev->ib_dev.process_mad = mlx4_ib_process_mad; ibdev->ib_dev.ioctl = mlx4_ib_ioctl; ibdev->ib_dev.query_values = mlx4_ib_query_values; if (!mlx4_is_slave(ibdev->dev)) { ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc; ibdev->ib_dev.map_phys_fmr = mlx4_ib_map_phys_fmr; ibdev->ib_dev.unmap_fmr = mlx4_ib_unmap_fmr; ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc; } if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW) { ibdev->ib_dev.alloc_mw = mlx4_ib_alloc_mw; ibdev->ib_dev.bind_mw = mlx4_ib_bind_mw; ibdev->ib_dev.dealloc_mw = mlx4_ib_dealloc_mw; ibdev->ib_dev.uverbs_cmd_mask |= (1ull << IB_USER_VERBS_CMD_ALLOC_MW) | (1ull << IB_USER_VERBS_CMD_DEALLOC_MW); } if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) { ibdev->ib_dev.alloc_xrcd = mlx4_ib_alloc_xrcd; ibdev->ib_dev.dealloc_xrcd = mlx4_ib_dealloc_xrcd; ibdev->ib_dev.uverbs_cmd_mask |= (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) | (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); } /* * Set experimental data */ ibdev->ib_dev.uverbs_exp_cmd_mask = (1ull << IB_USER_VERBS_EXP_CMD_CREATE_QP) | (1ull << IB_USER_VERBS_EXP_CMD_MODIFY_CQ) | (1ull << IB_USER_VERBS_EXP_CMD_QUERY_DEVICE) | (1ull << IB_USER_VERBS_EXP_CMD_CREATE_CQ); ibdev->ib_dev.exp_create_qp = mlx4_ib_exp_create_qp; ibdev->ib_dev.exp_query_device = mlx4_ib_exp_query_device; if (check_flow_steering_support(dev)) { ibdev->ib_dev.uverbs_ex_cmd_mask |= (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); ibdev->ib_dev.create_flow = mlx4_ib_create_flow; ibdev->ib_dev.destroy_flow = mlx4_ib_destroy_flow; } else { pr_debug("Device managed flow steering is unavailable for this configuration.\n"); } /* * End of experimental data */ mlx4_ib_alloc_eqs(dev, ibdev); spin_lock_init(&iboe->lock); if (init_node_data(ibdev)) goto err_map; for (i = 0; i < ibdev->num_ports; ++i) { if 
(mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) == IB_LINK_LAYER_ETHERNET) { if (mlx4_is_slave(dev)) { ibdev->counters[i].status = mlx4_counter_alloc(ibdev->dev, i + 1, &ibdev->counters[i].counter_index); } else {/* allocating the PF IB default counter indices reserved in mlx4_init_counters_table */ ibdev->counters[i].counter_index = ((i + 1) << 1) - 1; ibdev->counters[i].status = 0; } dev_info(&dev->pdev->dev, "%s: allocated counter index %d for port %d\n", __func__, ibdev->counters[i].counter_index, i+1); } else { ibdev->counters[i].counter_index = MLX4_SINK_COUNTER_INDEX; ibdev->counters[i].status = -ENOSPC; } } spin_lock_init(&ibdev->sm_lock); mutex_init(&ibdev->cap_mask_mutex); if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED && !mlx4_is_mfunc(dev)) { ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS; err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count, MLX4_IB_UC_STEER_QPN_ALIGN, &ibdev->steer_qpn_base, 0); if (err) goto err_counter; ibdev->ib_uc_qpns_bitmap = kmalloc(BITS_TO_LONGS(ibdev->steer_qpn_count) * sizeof(long), GFP_KERNEL); if (!ibdev->ib_uc_qpns_bitmap) { dev_err(&dev->pdev->dev, "bit map alloc failed\n"); goto err_steer_qp_release; } bitmap_zero(ibdev->ib_uc_qpns_bitmap, ibdev->steer_qpn_count); err = mlx4_FLOW_STEERING_IB_UC_QP_RANGE(dev, ibdev->steer_qpn_base, ibdev->steer_qpn_base + ibdev->steer_qpn_count - 1); if (err) goto err_steer_free_bitmap; } if (ib_register_device(&ibdev->ib_dev, NULL)) goto err_steer_free_bitmap; if (mlx4_ib_mad_init(ibdev)) goto err_reg; if (mlx4_ib_init_sriov(ibdev)) goto err_mad; if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) { if (!iboe->nb.notifier_call) { iboe->nb.notifier_call = mlx4_ib_netdev_event; err = register_netdevice_notifier(&iboe->nb); if (err) { iboe->nb.notifier_call = NULL; goto err_notify; } } if (!iboe->nb_inet.notifier_call) { iboe->nb_inet.notifier_call = mlx4_ib_inet_event; err = register_inetaddr_notifier(&iboe->nb_inet); if (err) { iboe->nb_inet.notifier_call = NULL; goto err_notify; } } mlx4_ib_scan_netdevs(ibdev, NULL, 0); } for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { if (device_create_file(&ibdev->ib_dev.dev, mlx4_class_attributes[j])) goto err_notify; } if (sysfs_create_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group)) goto err_notify; ibdev->ib_active = true; if (mlx4_is_mfunc(ibdev->dev)) init_pkeys(ibdev); /* create paravirt contexts for any VFs which are active */ if (mlx4_is_master(ibdev->dev)) { for (j = 0; j < MLX4_MFUNC_MAX; j++) { if (j == mlx4_master_func_num(ibdev->dev)) continue; if (mlx4_is_slave_active(ibdev->dev, j)) do_slave_init(ibdev, j, 1); } } return ibdev; err_notify: for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { device_remove_file(&ibdev->ib_dev.dev, mlx4_class_attributes[j]); } if (ibdev->iboe.nb.notifier_call) { if (unregister_netdevice_notifier(&ibdev->iboe.nb)) pr_warn("failure unregistering notifier\n"); ibdev->iboe.nb.notifier_call = NULL; } if (ibdev->iboe.nb_inet.notifier_call) { if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet)) pr_warn("failure unregistering notifier\n"); ibdev->iboe.nb_inet.notifier_call = NULL; } flush_workqueue(wq); mlx4_ib_close_sriov(ibdev); err_mad: mlx4_ib_mad_cleanup(ibdev); err_reg: ib_unregister_device(&ibdev->ib_dev); err_steer_free_bitmap: kfree(ibdev->ib_uc_qpns_bitmap); err_steer_qp_release: if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) mlx4_qp_release_range(dev, ibdev->steer_qpn_base, ibdev->steer_qpn_count); err_counter: for (; i; --i) { if 
(mlx4_ib_port_link_layer(&ibdev->ib_dev, i) == IB_LINK_LAYER_ETHERNET) { mlx4_counter_free(ibdev->dev, i, ibdev->counters[i - 1].counter_index); } } err_map: iounmap(ibdev->priv_uar.map); mlx4_ib_free_eqs(dev, ibdev); err_uar: mlx4_uar_free(dev, &ibdev->priv_uar); err_pd: mlx4_pd_free(dev, ibdev->priv_pdn); err_dealloc: ib_dealloc_device(&ibdev->ib_dev); return NULL; } int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn) { int offset; WARN_ON(!dev->ib_uc_qpns_bitmap); offset = bitmap_find_free_region(dev->ib_uc_qpns_bitmap, dev->steer_qpn_count, get_count_order(count)); if (offset < 0) return offset; *qpn = dev->steer_qpn_base + offset; return 0; } void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count) { if (!qpn || dev->dev->caps.steering_mode != MLX4_STEERING_MODE_DEVICE_MANAGED) return; BUG_ON(qpn < dev->steer_qpn_base); bitmap_release_region(dev->ib_uc_qpns_bitmap, qpn - dev->steer_qpn_base, get_count_order(count)); } int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, int is_attach) { int err; size_t flow_size; struct ib_flow_attr *flow = NULL; struct ib_flow_spec_ib *ib_spec; if (is_attach) { flow_size = sizeof(struct ib_flow_attr) + sizeof(struct ib_flow_spec_ib); flow = kzalloc(flow_size, GFP_KERNEL); if (!flow) return -ENOMEM; flow->port = mqp->port; flow->num_of_specs = 1; flow->size = flow_size; ib_spec = (struct ib_flow_spec_ib *)(flow + 1); ib_spec->type = IB_FLOW_SPEC_IB; ib_spec->size = sizeof(struct ib_flow_spec_ib); ib_spec->val.l3_type_qpn = mqp->ibqp.qp_num; ib_spec->mask.l3_type_qpn = MLX4_IB_FLOW_QPN_MASK; err = __mlx4_ib_create_flow(&mqp->ibqp, flow, IB_FLOW_DOMAIN_NIC, MLX4_FS_REGULAR, &mqp->reg_id); } else { err = __mlx4_ib_destroy_flow(mdev->dev, mqp->reg_id); } kfree(flow); return err; } static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr) { struct mlx4_ib_dev *ibdev = ibdev_ptr; int p, j; int dev_idx, ret; if (ibdev->iboe.nb_inet.notifier_call) { if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet)) pr_warn("failure unregistering notifier\n"); ibdev->iboe.nb_inet.notifier_call = NULL; } mlx4_ib_close_sriov(ibdev); sysfs_remove_group(&ibdev->ib_dev.dev.kobj, &diag_counters_group); mlx4_ib_mad_cleanup(ibdev); for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { device_remove_file(&ibdev->ib_dev.dev, mlx4_class_attributes[j]); } dev_idx = -1; if (dr_active && !(ibdev->dev->flags & MLX4_FLAG_DEV_NUM_STR)) { ret = sscanf(ibdev->ib_dev.name, "mlx4_%d", &dev_idx); if (ret != 1) dev_idx = -1; } ib_unregister_device(&ibdev->ib_dev); if (dev_idx >= 0) { spin_lock(&dev_num_str_lock); bitmap_release_region(dev_num_str_bitmap, dev_idx, 0); spin_unlock(&dev_num_str_lock); } if (dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED) { mlx4_qp_release_range(dev, ibdev->steer_qpn_base, ibdev->steer_qpn_count); kfree(ibdev->ib_uc_qpns_bitmap); } if (ibdev->iboe.nb.notifier_call) { if (unregister_netdevice_notifier(&ibdev->iboe.nb)) pr_warn("failure unregistering notifier\n"); ibdev->iboe.nb.notifier_call = NULL; } iounmap(ibdev->priv_uar.map); for (p = 0; p < ibdev->num_ports; ++p) { if (mlx4_ib_port_link_layer(&ibdev->ib_dev, p + 1) == IB_LINK_LAYER_ETHERNET) { mlx4_counter_free(ibdev->dev, p + 1, ibdev->counters[p].counter_index); } } mlx4_foreach_port(p, dev, MLX4_PORT_TYPE_IB) mlx4_CLOSE_PORT(dev, p); mlx4_ib_free_eqs(dev, ibdev); mlx4_uar_free(dev, &ibdev->priv_uar); mlx4_pd_free(dev, ibdev->priv_pdn); ib_dealloc_device(&ibdev->ib_dev); } static void do_slave_init(struct mlx4_ib_dev *ibdev, 
int slave, int do_init) { struct mlx4_ib_demux_work **dm = NULL; struct mlx4_dev *dev = ibdev->dev; int i; unsigned long flags; if (!mlx4_is_master(dev)) return; dm = kcalloc(dev->caps.num_ports, sizeof *dm, GFP_ATOMIC); if (!dm) { pr_err("failed to allocate memory for tunneling qp update\n"); goto out; } for (i = 0; i < dev->caps.num_ports; i++) { dm[i] = kmalloc(sizeof (struct mlx4_ib_demux_work), GFP_ATOMIC); if (!dm[i]) { pr_err("failed to allocate memory for tunneling qp update work struct\n"); for (i = 0; i < dev->caps.num_ports; i++) { if (dm[i]) kfree(dm[i]); } goto out; } } /* initialize or tear down tunnel QPs for the slave */ for (i = 0; i < dev->caps.num_ports; i++) { INIT_WORK(&dm[i]->work, mlx4_ib_tunnels_update_work); dm[i]->port = i + 1; dm[i]->slave = slave; dm[i]->do_init = do_init; dm[i]->dev = ibdev; spin_lock_irqsave(&ibdev->sriov.going_down_lock, flags); if (!ibdev->sriov.is_going_down) queue_work(ibdev->sriov.demux[i].ud_wq, &dm[i]->work); spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags); } out: if (dm) kfree(dm); return; } static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr, enum mlx4_dev_event event, unsigned long param) { struct ib_event ibev; struct mlx4_ib_dev *ibdev = to_mdev((struct ib_device *) ibdev_ptr); struct mlx4_eqe *eqe = NULL; struct ib_event_work *ew; int p = 0; if (event == MLX4_DEV_EVENT_PORT_MGMT_CHANGE) eqe = (struct mlx4_eqe *)param; else p = (int) param; switch (event) { case MLX4_DEV_EVENT_PORT_UP: if (p > ibdev->num_ports) return; if (mlx4_is_master(dev) && rdma_port_get_link_layer(&ibdev->ib_dev, p) == IB_LINK_LAYER_INFINIBAND) { mlx4_ib_invalidate_all_guid_record(ibdev, p); } mlx4_ib_info((struct ib_device *) ibdev_ptr, "Port %d logical link is up\n", p); ibev.event = IB_EVENT_PORT_ACTIVE; break; case MLX4_DEV_EVENT_PORT_DOWN: if (p > ibdev->num_ports) return; mlx4_ib_info((struct ib_device *) ibdev_ptr, "Port %d logical link is down\n", p); ibev.event = IB_EVENT_PORT_ERR; break; case MLX4_DEV_EVENT_CATASTROPHIC_ERROR: ibdev->ib_active = false; ibev.event = IB_EVENT_DEVICE_FATAL; break; case MLX4_DEV_EVENT_PORT_MGMT_CHANGE: ew = kmalloc(sizeof *ew, GFP_ATOMIC); if (!ew) { pr_err("failed to allocate memory for events work\n"); break; } INIT_WORK(&ew->work, handle_port_mgmt_change_event); memcpy(&ew->ib_eqe, eqe, sizeof *eqe); ew->ib_dev = ibdev; /* need to queue only for port owner, which uses GEN_EQE */ if (mlx4_is_master(dev)) queue_work(wq, &ew->work); else handle_port_mgmt_change_event(&ew->work); return; case MLX4_DEV_EVENT_SLAVE_INIT: /* here, p is the slave id */ do_slave_init(ibdev, p, 1); return; case MLX4_DEV_EVENT_SLAVE_SHUTDOWN: /* here, p is the slave id */ do_slave_init(ibdev, p, 0); return; default: return; } ibev.device = ibdev_ptr; ibev.element.port_num = (u8) p; ib_dispatch_event(&ibev); } static struct mlx4_interface mlx4_ib_interface = { .add = mlx4_ib_add, .remove = mlx4_ib_remove, .event = mlx4_ib_event, .protocol = MLX4_PROT_IB_IPV6 }; static int __init mlx4_ib_init(void) { int err; wq = create_singlethread_workqueue("mlx4_ib"); if (!wq) return -ENOMEM; err = mlx4_ib_mcg_init(); if (err) goto clean_proc; init_dev_assign(); err = mlx4_register_interface(&mlx4_ib_interface); if (err) goto clean_mcg; return 0; clean_mcg: mlx4_ib_mcg_destroy(); clean_proc: destroy_workqueue(wq); return err; } static void __exit mlx4_ib_cleanup(void) { mlx4_unregister_interface(&mlx4_ib_interface); mlx4_ib_mcg_destroy(); destroy_workqueue(wq); kfree(dev_num_str_bitmap); } module_init_order(mlx4_ib_init, 
SI_ORDER_MIDDLE); module_exit(mlx4_ib_cleanup); static int mlx4ib_evhand(module_t mod, int event, void *arg) { return (0); } static moduledata_t mlx4ib_mod = { .name = "mlx4ib", .evhand = mlx4ib_evhand, }; -DECLARE_MODULE(mlx4ib, mlx4ib_mod, SI_SUB_SMP, SI_ORDER_ANY); +DECLARE_MODULE(mlx4ib, mlx4ib_mod, SI_SUB_LAST, SI_ORDER_ANY); MODULE_DEPEND(mlx4ib, mlx4, 1, 1, 1); MODULE_DEPEND(mlx4ib, ibcore, 1, 1, 1); MODULE_DEPEND(mlx4ib, linuxkpi, 1, 1, 1); Index: head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c =================================================================== --- head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c (revision 296687) +++ head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c (revision 296688) @@ -1,1540 +1,1540 @@ /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2004 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ #include "ipoib.h" static int ipoib_resolvemulti(struct ifnet *, struct sockaddr **, struct sockaddr *); #include #include #include #include #include /* For ARPHRD_xxx */ #include #include #include MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); MODULE_LICENSE("Dual BSD/GPL"); int ipoib_sendq_size = IPOIB_TX_RING_SIZE; int ipoib_recvq_size = IPOIB_RX_RING_SIZE; module_param_named(send_queue_size, ipoib_sendq_size, int, 0444); MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue"); module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG int ipoib_debug_level = 1; module_param_named(debug_level, ipoib_debug_level, int, 0644); MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); #endif struct ipoib_path_iter { struct ipoib_dev_priv *priv; struct ipoib_path path; }; static const u8 ipv4_bcast_addr[] = { 0x00, 0xff, 0xff, 0xff, 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff }; struct workqueue_struct *ipoib_workqueue; struct ib_sa_client ipoib_sa_client; static void ipoib_add_one(struct ib_device *device); static void ipoib_remove_one(struct ib_device *device); static void ipoib_start(struct ifnet *dev); static int ipoib_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, struct route *ro); static int ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data); static void ipoib_input(struct ifnet *ifp, struct mbuf *m); #define IPOIB_MTAP(_ifp, _m) \ do { \ if (bpf_peers_present((_ifp)->if_bpf)) { \ M_ASSERTVALID(_m); \ ipoib_mtap_mb((_ifp), (_m)); \ } \ } while (0) /* * This is for clients that have an ipoib_header in the mbuf. 
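 * The IPoIB header is temporarily stripped and a synthetic Ethernet header is passed to bpf_mtap2(), so BPF consumers see an Ethernet-style frame.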
*/ static void ipoib_mtap_mb(struct ifnet *ifp, struct mbuf *mb) { struct ipoib_header *ih; struct ether_header eh; ih = mtod(mb, struct ipoib_header *); eh.ether_type = ih->proto; bcopy(ih->hwaddr, &eh.ether_dhost, ETHER_ADDR_LEN); bzero(&eh.ether_shost, ETHER_ADDR_LEN); mb->m_data += sizeof(struct ipoib_header); mb->m_len -= sizeof(struct ipoib_header); bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb); mb->m_data -= sizeof(struct ipoib_header); mb->m_len += sizeof(struct ipoib_header); } void ipoib_mtap_proto(struct ifnet *ifp, struct mbuf *mb, uint16_t proto) { struct ether_header eh; eh.ether_type = proto; bzero(&eh.ether_shost, ETHER_ADDR_LEN); bzero(&eh.ether_dhost, ETHER_ADDR_LEN); bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb); } static struct ib_client ipoib_client = { .name = "ipoib", .add = ipoib_add_one, .remove = ipoib_remove_one }; int ipoib_open(struct ipoib_dev_priv *priv) { struct ifnet *dev = priv->dev; ipoib_dbg(priv, "bringing up interface\n"); set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); if (ipoib_pkey_dev_delay_open(priv)) return 0; if (ipoib_ib_dev_open(priv)) goto err_disable; if (ipoib_ib_dev_up(priv)) goto err_stop; if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring up any child interfaces too */ mutex_lock(&priv->vlan_mutex); list_for_each_entry(cpriv, &priv->child_intfs, list) if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) == 0) ipoib_open(cpriv); mutex_unlock(&priv->vlan_mutex); } dev->if_drv_flags |= IFF_DRV_RUNNING; dev->if_drv_flags &= ~IFF_DRV_OACTIVE; return 0; err_stop: ipoib_ib_dev_stop(priv, 1); err_disable: clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); return -EINVAL; } static void ipoib_init(void *arg) { struct ifnet *dev; struct ipoib_dev_priv *priv; priv = arg; dev = priv->dev; if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0) ipoib_open(priv); queue_work(ipoib_workqueue, &priv->flush_light); } static int ipoib_stop(struct ipoib_dev_priv *priv) { struct ifnet *dev = priv->dev; ipoib_dbg(priv, "stopping interface\n"); clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); dev->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); ipoib_ib_dev_down(priv, 0); ipoib_ib_dev_stop(priv, 0); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring down any child interfaces too */ mutex_lock(&priv->vlan_mutex); list_for_each_entry(cpriv, &priv->child_intfs, list) if ((cpriv->dev->if_drv_flags & IFF_DRV_RUNNING) != 0) ipoib_stop(cpriv); mutex_unlock(&priv->vlan_mutex); } return 0; } int ipoib_change_mtu(struct ipoib_dev_priv *priv, int new_mtu) { struct ifnet *dev = priv->dev; /* dev->if_mtu > 2K ==> connected mode */ if (ipoib_cm_admin_enabled(priv)) { if (new_mtu > IPOIB_CM_MTU(ipoib_cm_max_mtu(priv))) return -EINVAL; if (new_mtu > priv->mcast_mtu) ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n", priv->mcast_mtu); dev->if_mtu = new_mtu; return 0; } if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu)) return -EINVAL; priv->admin_mtu = new_mtu; dev->if_mtu = min(priv->mcast_mtu, priv->admin_mtu); queue_work(ipoib_workqueue, &priv->flush_light); return 0; } static int ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data) { struct ipoib_dev_priv *priv = ifp->if_softc; struct ifaddr *ifa = (struct ifaddr *) data; struct ifreq *ifr = (struct ifreq *) data; int error = 0; switch (command) { case SIOCSIFFLAGS: if (ifp->if_flags & IFF_UP) { if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) error = -ipoib_open(priv); } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) ipoib_stop(priv); break; 
case SIOCADDMULTI: case SIOCDELMULTI: if (ifp->if_drv_flags & IFF_DRV_RUNNING) queue_work(ipoib_workqueue, &priv->restart_task); break; case SIOCSIFADDR: ifp->if_flags |= IFF_UP; switch (ifa->ifa_addr->sa_family) { #ifdef INET case AF_INET: ifp->if_init(ifp->if_softc); /* before arpwhohas */ arp_ifinit(ifp, ifa); break; #endif default: ifp->if_init(ifp->if_softc); break; } break; case SIOCGIFADDR: { struct sockaddr *sa; sa = (struct sockaddr *) & ifr->ifr_data; bcopy(IF_LLADDR(ifp), (caddr_t) sa->sa_data, INFINIBAND_ALEN); } break; case SIOCSIFMTU: /* * Set the interface MTU. */ error = -ipoib_change_mtu(priv, ifr->ifr_mtu); break; default: error = EINVAL; break; } return (error); } static struct ipoib_path * __path_find(struct ipoib_dev_priv *priv, void *gid) { struct rb_node *n = priv->path_tree.rb_node; struct ipoib_path *path; int ret; while (n) { path = rb_entry(n, struct ipoib_path, rb_node); ret = memcmp(gid, path->pathrec.dgid.raw, sizeof (union ib_gid)); if (ret < 0) n = n->rb_left; else if (ret > 0) n = n->rb_right; else return path; } return NULL; } static int __path_add(struct ipoib_dev_priv *priv, struct ipoib_path *path) { struct rb_node **n = &priv->path_tree.rb_node; struct rb_node *pn = NULL; struct ipoib_path *tpath; int ret; while (*n) { pn = *n; tpath = rb_entry(pn, struct ipoib_path, rb_node); ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw, sizeof (union ib_gid)); if (ret < 0) n = &pn->rb_left; else if (ret > 0) n = &pn->rb_right; else return -EEXIST; } rb_link_node(&path->rb_node, pn, n); rb_insert_color(&path->rb_node, &priv->path_tree); list_add_tail(&path->list, &priv->path_list); return 0; } void ipoib_path_free(struct ipoib_dev_priv *priv, struct ipoib_path *path) { _IF_DRAIN(&path->queue); if (path->ah) ipoib_put_ah(path->ah); if (ipoib_cm_get(path)) ipoib_cm_destroy_tx(ipoib_cm_get(path)); kfree(path); } #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG struct ipoib_path_iter * ipoib_path_iter_init(struct ipoib_dev_priv *priv) { struct ipoib_path_iter *iter; iter = kmalloc(sizeof *iter, GFP_KERNEL); if (!iter) return NULL; iter->priv = priv; memset(iter->path.pathrec.dgid.raw, 0, 16); if (ipoib_path_iter_next(iter)) { kfree(iter); return NULL; } return iter; } int ipoib_path_iter_next(struct ipoib_path_iter *iter) { struct ipoib_dev_priv *priv = iter->priv; struct rb_node *n; struct ipoib_path *path; int ret = 1; spin_lock_irq(&priv->lock); n = rb_first(&priv->path_tree); while (n) { path = rb_entry(n, struct ipoib_path, rb_node); if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw, sizeof (union ib_gid)) < 0) { iter->path = *path; ret = 0; break; } n = rb_next(n); } spin_unlock_irq(&priv->lock); return ret; } void ipoib_path_iter_read(struct ipoib_path_iter *iter, struct ipoib_path *path) { *path = iter->path; } #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ void ipoib_mark_paths_invalid(struct ipoib_dev_priv *priv) { struct ipoib_path *path, *tp; spin_lock_irq(&priv->lock); list_for_each_entry_safe(path, tp, &priv->path_list, list) { ipoib_dbg(priv, "mark path LID 0x%04x GID %16D invalid\n", be16_to_cpu(path->pathrec.dlid), path->pathrec.dgid.raw, ":"); path->valid = 0; } spin_unlock_irq(&priv->lock); } void ipoib_flush_paths(struct ipoib_dev_priv *priv) { struct ipoib_path *path, *tp; LIST_HEAD(remove_list); unsigned long flags; spin_lock_irqsave(&priv->lock, flags); list_splice_init(&priv->path_list, &remove_list); list_for_each_entry(path, &remove_list, list) rb_erase(&path->rb_node, &priv->path_tree); list_for_each_entry_safe(path, tp, 
&remove_list, list) { if (path->query) ib_sa_cancel_query(path->query_id, path->query); spin_unlock_irqrestore(&priv->lock, flags); wait_for_completion(&path->done); ipoib_path_free(priv, path); spin_lock_irqsave(&priv->lock, flags); } spin_unlock_irqrestore(&priv->lock, flags); } static void path_rec_completion(int status, struct ib_sa_path_rec *pathrec, void *path_ptr) { struct ipoib_path *path = path_ptr; struct ipoib_dev_priv *priv = path->priv; struct ifnet *dev = priv->dev; struct ipoib_ah *ah = NULL; struct ipoib_ah *old_ah = NULL; struct ifqueue mbqueue; struct mbuf *mb; unsigned long flags; if (!status) ipoib_dbg(priv, "PathRec LID 0x%04x for GID %16D\n", be16_to_cpu(pathrec->dlid), pathrec->dgid.raw, ":"); else ipoib_dbg(priv, "PathRec status %d for GID %16D\n", status, path->pathrec.dgid.raw, ":"); bzero(&mbqueue, sizeof(mbqueue)); if (!status) { struct ib_ah_attr av; if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av)) ah = ipoib_create_ah(priv, priv->pd, &av); } spin_lock_irqsave(&priv->lock, flags); if (ah) { path->pathrec = *pathrec; old_ah = path->ah; path->ah = ah; ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n", ah, be16_to_cpu(pathrec->dlid), pathrec->sl); for (;;) { _IF_DEQUEUE(&path->queue, mb); if (mb == NULL) break; _IF_ENQUEUE(&mbqueue, mb); } #ifdef CONFIG_INFINIBAND_IPOIB_CM if (ipoib_cm_enabled(priv, path->hwaddr) && !ipoib_cm_get(path)) ipoib_cm_set(path, ipoib_cm_create_tx(priv, path)); #endif path->valid = 1; } path->query = NULL; complete(&path->done); spin_unlock_irqrestore(&priv->lock, flags); if (old_ah) ipoib_put_ah(old_ah); for (;;) { _IF_DEQUEUE(&mbqueue, mb); if (mb == NULL) break; mb->m_pkthdr.rcvif = dev; if (dev->if_transmit(dev, mb)) ipoib_warn(priv, "dev_queue_xmit failed " "to requeue packet\n"); } } static struct ipoib_path * path_rec_create(struct ipoib_dev_priv *priv, uint8_t *hwaddr) { struct ipoib_path *path; if (!priv->broadcast) return NULL; path = kzalloc(sizeof *path, GFP_ATOMIC); if (!path) return NULL; path->priv = priv; bzero(&path->queue, sizeof(path->queue)); #ifdef CONFIG_INFINIBAND_IPOIB_CM memcpy(&path->hwaddr, hwaddr, INFINIBAND_ALEN); #endif memcpy(path->pathrec.dgid.raw, &hwaddr[4], sizeof (union ib_gid)); path->pathrec.sgid = priv->local_gid; path->pathrec.pkey = cpu_to_be16(priv->pkey); path->pathrec.numb_path = 1; path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class; return path; } static int path_rec_start(struct ipoib_dev_priv *priv, struct ipoib_path *path) { struct ifnet *dev = priv->dev; ib_sa_comp_mask comp_mask = IB_SA_PATH_REC_MTU_SELECTOR | IB_SA_PATH_REC_MTU; struct ib_sa_path_rec p_rec; p_rec = path->pathrec; p_rec.mtu_selector = IB_SA_GT; switch (roundup_pow_of_two(dev->if_mtu + IPOIB_ENCAP_LEN)) { case 512: p_rec.mtu = IB_MTU_256; break; case 1024: p_rec.mtu = IB_MTU_512; break; case 2048: p_rec.mtu = IB_MTU_1024; break; case 4096: p_rec.mtu = IB_MTU_2048; break; default: /* Wildcard everything */ comp_mask = 0; p_rec.mtu = 0; p_rec.mtu_selector = 0; } ipoib_dbg(priv, "Start path record lookup for %16D MTU > %d\n", p_rec.dgid.raw, ":", comp_mask ? 
ib_mtu_enum_to_int(p_rec.mtu) : 0); init_completion(&path->done); path->query_id = ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port, &p_rec, comp_mask | IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_TRAFFIC_CLASS | IB_SA_PATH_REC_PKEY, 1000, GFP_ATOMIC, path_rec_completion, path, &path->query); if (path->query_id < 0) { ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id); path->query = NULL; complete(&path->done); return path->query_id; } return 0; } static void ipoib_unicast_send(struct mbuf *mb, struct ipoib_dev_priv *priv, struct ipoib_header *eh) { struct ipoib_path *path; path = __path_find(priv, eh->hwaddr + 4); if (!path || !path->valid) { int new_path = 0; if (!path) { path = path_rec_create(priv, eh->hwaddr); new_path = 1; } if (path) { _IF_ENQUEUE(&path->queue, mb); if (!path->query && path_rec_start(priv, path)) { spin_unlock_irqrestore(&priv->lock, flags); if (new_path) ipoib_path_free(priv, path); return; } else __path_add(priv, path); } else { if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1); m_freem(mb); } return; } if (ipoib_cm_get(path) && ipoib_cm_up(path)) { ipoib_cm_send(priv, mb, ipoib_cm_get(path)); } else if (path->ah) { ipoib_send(priv, mb, path->ah, IPOIB_QPN(eh->hwaddr)); } else if ((path->query || !path_rec_start(priv, path)) && path->queue.ifq_len < IPOIB_MAX_PATH_REC_QUEUE) { _IF_ENQUEUE(&path->queue, mb); } else { if_inc_counter(priv->dev, IFCOUNTER_OERRORS, 1); m_freem(mb); } } static int ipoib_send_one(struct ipoib_dev_priv *priv, struct mbuf *mb) { struct ipoib_header *eh; eh = mtod(mb, struct ipoib_header *); if (IPOIB_IS_MULTICAST(eh->hwaddr)) { /* Add in the P_Key for multicast*/ eh->hwaddr[8] = (priv->pkey >> 8) & 0xff; eh->hwaddr[9] = priv->pkey & 0xff; ipoib_mcast_send(priv, eh->hwaddr + 4, mb); } else ipoib_unicast_send(mb, priv, eh); return 0; } static void _ipoib_start(struct ifnet *dev, struct ipoib_dev_priv *priv) { struct mbuf *mb; if ((dev->if_drv_flags & (IFF_DRV_RUNNING|IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return; spin_lock(&priv->lock); while (!IFQ_DRV_IS_EMPTY(&dev->if_snd) && (dev->if_drv_flags & IFF_DRV_OACTIVE) == 0) { IFQ_DRV_DEQUEUE(&dev->if_snd, mb); if (mb == NULL) break; IPOIB_MTAP(dev, mb); ipoib_send_one(priv, mb); } spin_unlock(&priv->lock); } static void ipoib_start(struct ifnet *dev) { _ipoib_start(dev, dev->if_softc); } static void ipoib_vlan_start(struct ifnet *dev) { struct ipoib_dev_priv *priv; struct mbuf *mb; priv = VLAN_COOKIE(dev); if (priv != NULL) return _ipoib_start(dev, priv); while (!IFQ_DRV_IS_EMPTY(&dev->if_snd)) { IFQ_DRV_DEQUEUE(&dev->if_snd, mb); if (mb == NULL) break; m_freem(mb); if_inc_counter(dev, IFCOUNTER_OERRORS, 1); } } int ipoib_dev_init(struct ipoib_dev_priv *priv, struct ib_device *ca, int port) { /* Allocate RX/TX "rings" to hold queued mbs */ priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring, GFP_KERNEL); if (!priv->rx_ring) { printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n", ca->name, ipoib_recvq_size); goto out; } priv->tx_ring = kzalloc(ipoib_sendq_size * sizeof *priv->tx_ring, GFP_KERNEL); if (!priv->tx_ring) { printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n", ca->name, ipoib_sendq_size); goto out_rx_ring_cleanup; } memset(priv->tx_ring, 0, ipoib_sendq_size * sizeof *priv->tx_ring); /* priv->tx_head, tx_tail & tx_outstanding are already 0 */ if (ipoib_ib_dev_init(priv, ca, port)) goto out_tx_ring_cleanup; return 0; out_tx_ring_cleanup: kfree(priv->tx_ring); out_rx_ring_cleanup: 
	kfree(priv->rx_ring);

out:
	return -ENOMEM;
}

static void
ipoib_detach(struct ipoib_dev_priv *priv)
{
	struct ifnet *dev;

	dev = priv->dev;
	if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
		bpfdetach(dev);
		if_detach(dev);
		if_free(dev);
	} else
		VLAN_SETCOOKIE(priv->dev, NULL);

	free(priv, M_TEMP);
}

void
ipoib_dev_cleanup(struct ipoib_dev_priv *priv)
{
	struct ipoib_dev_priv *cpriv, *tcpriv;

	/* Delete any child interfaces first */
	list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
		ipoib_dev_cleanup(cpriv);
		ipoib_detach(cpriv);
	}

	ipoib_ib_dev_cleanup(priv);

	kfree(priv->rx_ring);
	kfree(priv->tx_ring);

	priv->rx_ring = NULL;
	priv->tx_ring = NULL;
}

static volatile int ipoib_unit;

static struct ipoib_dev_priv *
ipoib_priv_alloc(void)
{
	struct ipoib_dev_priv *priv;

	priv = malloc(sizeof(struct ipoib_dev_priv), M_TEMP, M_ZERO|M_WAITOK);
	spin_lock_init(&priv->lock);
	spin_lock_init(&priv->drain_lock);
	mutex_init(&priv->vlan_mutex);
	INIT_LIST_HEAD(&priv->path_list);
	INIT_LIST_HEAD(&priv->child_intfs);
	INIT_LIST_HEAD(&priv->dead_ahs);
	INIT_LIST_HEAD(&priv->multicast_list);
	INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
	INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task);
	INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
	INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light);
	INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal);
	INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy);
	INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
	INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
	memcpy(priv->broadcastaddr, ipv4_bcast_addr, INFINIBAND_ALEN);

	return (priv);
}

struct ipoib_dev_priv *
ipoib_intf_alloc(const char *name)
{
	struct ipoib_dev_priv *priv;
	struct sockaddr_dl *sdl;
	struct ifnet *dev;

	priv = ipoib_priv_alloc();
	dev = priv->dev = if_alloc(IFT_INFINIBAND);
	if (!dev) {
		free(priv, M_TEMP);
		return NULL;
	}
	dev->if_softc = priv;
	if_initname(dev, name, atomic_fetchadd_int(&ipoib_unit, 1));
	dev->if_flags = IFF_BROADCAST | IFF_MULTICAST;
	dev->if_addrlen = INFINIBAND_ALEN;
	dev->if_hdrlen = IPOIB_HEADER_LEN;
	if_attach(dev);
	dev->if_init = ipoib_init;
	dev->if_ioctl = ipoib_ioctl;
	dev->if_start = ipoib_start;
	dev->if_output = ipoib_output;
	dev->if_input = ipoib_input;
	dev->if_resolvemulti = ipoib_resolvemulti;
	dev->if_baudrate = IF_Gbps(10);
	dev->if_broadcastaddr = priv->broadcastaddr;
	dev->if_snd.ifq_maxlen = ipoib_sendq_size * 2;
	sdl = (struct sockaddr_dl *)dev->if_addr->ifa_addr;
	sdl->sdl_type = IFT_INFINIBAND;
	sdl->sdl_alen = dev->if_addrlen;
	priv->dev = dev;
	if_link_state_change(dev, LINK_STATE_DOWN);
	bpfattach(dev, DLT_EN10MB, ETHER_HDR_LEN);

	return dev->if_softc;
}

int
ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
{
	struct ib_device_attr *device_attr;
	int result = -ENOMEM;

	device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL);
	if (!device_attr) {
		printk(KERN_WARNING "%s: allocation of %zu bytes failed\n",
		       hca->name, sizeof *device_attr);
		return result;
	}

	result = ib_query_device(hca, device_attr);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n",
		       hca->name, result);
		kfree(device_attr);
		return result;
	}
	priv->hca_caps = device_attr->device_cap_flags;

	kfree(device_attr);

	priv->dev->if_hwassist = 0;
	priv->dev->if_capabilities = 0;

#ifndef CONFIG_INFINIBAND_IPOIB_CM
	if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
		set_bit(IPOIB_FLAG_CSUM, &priv->flags);
		priv->dev->if_hwassist = CSUM_IP | CSUM_TCP | CSUM_UDP;
		priv->dev->if_capabilities = IFCAP_HWCSUM | IFCAP_VLAN_HWCSUM;
	}
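	/*
	 * The block below would additionally advertise TSO4/CSUM_TSO when
	 * the HCA reports IB_DEVICE_UD_TSO, but it is compiled out here.
	 */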
#if 0
	if (priv->dev->features & NETIF_F_SG &&
	    priv->hca_caps & IB_DEVICE_UD_TSO) {
		priv->dev->if_capabilities |= IFCAP_TSO4;
		priv->dev->if_hwassist |= CSUM_TSO;
	}
#endif
#endif

	priv->dev->if_capabilities |=
	    IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_LINKSTATE;
	priv->dev->if_capenable = priv->dev->if_capabilities;

	return 0;
}

static struct ifnet *
ipoib_add_port(const char *format, struct ib_device *hca, u8 port)
{
	struct ipoib_dev_priv *priv;
	struct ib_port_attr attr;
	int result = -ENOMEM;

	priv = ipoib_intf_alloc(format);
	if (!priv)
		goto alloc_mem_failed;

	if (!ib_query_port(hca, port, &attr))
		priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
	else {
		printk(KERN_WARNING "%s: ib_query_port %d failed\n",
		       hca->name, port);
		goto device_init_failed;
	}

	/* MTU will be reset when mcast join happens */
	priv->dev->if_mtu = IPOIB_UD_MTU(priv->max_ib_mtu);
	priv->mcast_mtu = priv->admin_mtu = priv->dev->if_mtu;

	result = ib_query_pkey(hca, port, 0, &priv->pkey);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
		       hca->name, port, result);
		goto device_init_failed;
	}

	if (ipoib_set_dev_features(priv, hca))
		goto device_init_failed;

	/*
	 * Set the full membership bit, so that we join the right
	 * broadcast group, etc.
	 */
	priv->pkey |= 0x8000;

	priv->broadcastaddr[8] = priv->pkey >> 8;
	priv->broadcastaddr[9] = priv->pkey & 0xff;

	result = ib_query_gid(hca, port, 0, &priv->local_gid);
	if (result) {
		printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
		       hca->name, port, result);
		goto device_init_failed;
	}
	memcpy(IF_LLADDR(priv->dev) + 4, priv->local_gid.raw,
	    sizeof (union ib_gid));

	result = ipoib_dev_init(priv, hca, port);
	if (result < 0) {
		printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
		       hca->name, port, result);
		goto device_init_failed;
	}
	if (ipoib_cm_admin_enabled(priv))
		priv->dev->if_mtu = IPOIB_CM_MTU(ipoib_cm_max_mtu(priv));

	INIT_IB_EVENT_HANDLER(&priv->event_handler, priv->ca, ipoib_event);
	result = ib_register_event_handler(&priv->event_handler);
	if (result < 0) {
		printk(KERN_WARNING "%s: ib_register_event_handler failed for "
		       "port %d (ret = %d)\n",
		       hca->name, port, result);
		goto event_failed;
	}
	if_printf(priv->dev, "Attached to %s port %d\n", hca->name, port);

	return priv->dev;

event_failed:
	ipoib_dev_cleanup(priv);

device_init_failed:
	ipoib_detach(priv);

alloc_mem_failed:
	return ERR_PTR(result);
}

static void
ipoib_add_one(struct ib_device *device)
{
	struct list_head *dev_list;
	struct ifnet *dev;
	struct ipoib_dev_priv *priv;
	int s, e, p;

	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
		return;

	dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
	if (!dev_list)
		return;

	INIT_LIST_HEAD(dev_list);

	if (device->node_type == RDMA_NODE_IB_SWITCH) {
		s = 0;
		e = 0;
	} else {
		s = 1;
		e = device->phys_port_cnt;
	}

	for (p = s; p <= e; ++p) {
		if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND)
			continue;
		dev = ipoib_add_port("ib", device, p);
		if (!IS_ERR(dev)) {
			priv = dev->if_softc;
			list_add_tail(&priv->list, dev_list);
		}
	}

	ib_set_client_data(device, &ipoib_client, dev_list);
}

static void
ipoib_remove_one(struct ib_device *device)
{
	struct ipoib_dev_priv *priv, *tmp;
	struct list_head *dev_list;

	if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
		return;

	dev_list = ib_get_client_data(device, &ipoib_client);

	list_for_each_entry_safe(priv, tmp, dev_list, list) {
		if (rdma_port_get_link_layer(device, priv->port) != IB_LINK_LAYER_INFINIBAND)
			continue;

		ipoib_stop(priv);

		ib_unregister_event_handler(&priv->event_handler);
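		/*
		 * Quiesce the interface and stop event delivery before the
		 * workqueue flush below, so no deferred work runs against a
		 * device that is about to be cleaned up and detached.
		 */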
		/* dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP); */

		flush_workqueue(ipoib_workqueue);

		ipoib_dev_cleanup(priv);
		ipoib_detach(priv);
	}

	kfree(dev_list);
}

static void
ipoib_config_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
{
	struct ipoib_dev_priv *parent;
	struct ipoib_dev_priv *priv;
	struct ifnet *dev;
	uint16_t pkey;
	int error;

	if (ifp->if_type != IFT_INFINIBAND)
		return;
	dev = VLAN_DEVAT(ifp, vtag);
	if (dev == NULL)
		return;
	priv = NULL;
	error = 0;
	parent = ifp->if_softc;
	/* We only support 15 bits of pkey. */
	if (vtag & 0x8000)
		return;
	pkey = vtag | 0x8000;	/* Set full membership bit. */
	if (pkey == parent->pkey)
		return;
	/* Check for dups */
	mutex_lock(&parent->vlan_mutex);
	list_for_each_entry(priv, &parent->child_intfs, list) {
		if (priv->pkey == pkey) {
			priv = NULL;
			error = EBUSY;
			goto out;
		}
	}
	priv = ipoib_priv_alloc();
	priv->dev = dev;
	priv->max_ib_mtu = parent->max_ib_mtu;
	priv->mcast_mtu = priv->admin_mtu = parent->dev->if_mtu;
	set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);
	error = ipoib_set_dev_features(priv, parent->ca);
	if (error)
		goto out;
	priv->pkey = pkey;
	priv->broadcastaddr[8] = pkey >> 8;
	priv->broadcastaddr[9] = pkey & 0xff;
	dev->if_broadcastaddr = priv->broadcastaddr;
	error = ipoib_dev_init(priv, parent->ca, parent->port);
	if (error)
		goto out;
	priv->parent = parent->dev;
	list_add_tail(&priv->list, &parent->child_intfs);
	VLAN_SETCOOKIE(dev, priv);
	dev->if_start = ipoib_vlan_start;
	dev->if_drv_flags &= ~IFF_DRV_RUNNING;
	dev->if_hdrlen = IPOIB_HEADER_LEN;
	if (ifp->if_drv_flags & IFF_DRV_RUNNING)
		ipoib_open(priv);
	mutex_unlock(&parent->vlan_mutex);
	return;
out:
	mutex_unlock(&parent->vlan_mutex);
	if (priv)
		free(priv, M_TEMP);
	if (error)
		ipoib_warn(parent,
		    "failed to initialize subinterface: device %s, port %d vtag 0x%X",
		    parent->ca->name, parent->port, vtag);
	return;
}

static void
ipoib_unconfig_vlan(void *arg, struct ifnet *ifp, u_int16_t vtag)
{
	struct ipoib_dev_priv *parent;
	struct ipoib_dev_priv *priv;
	struct ifnet *dev;
	uint16_t pkey;

	if (ifp->if_type != IFT_INFINIBAND)
		return;

	dev = VLAN_DEVAT(ifp, vtag);
	if (dev)
		VLAN_SETCOOKIE(dev, NULL);
	pkey = vtag | 0x8000;
	parent = ifp->if_softc;
	mutex_lock(&parent->vlan_mutex);
	list_for_each_entry(priv, &parent->child_intfs, list) {
		if (priv->pkey == pkey) {
			ipoib_dev_cleanup(priv);
			list_del(&priv->list);
			break;
		}
	}
	mutex_unlock(&parent->vlan_mutex);
}

eventhandler_tag ipoib_vlan_attach;
eventhandler_tag ipoib_vlan_detach;

static int __init
ipoib_init_module(void)
{
	int ret;

	ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
	ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
	ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);

	ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
	ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
	ipoib_sendq_size = max(ipoib_sendq_size, max(2 * MAX_SEND_CQE,
	    IPOIB_MIN_QUEUE_SIZE));
#ifdef CONFIG_INFINIBAND_IPOIB_CM
	ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
#endif

	ipoib_vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
	    ipoib_config_vlan, NULL, EVENTHANDLER_PRI_FIRST);
	ipoib_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
	    ipoib_unconfig_vlan, NULL, EVENTHANDLER_PRI_FIRST);

	/*
	 * We create our own workqueue mainly because we want to be
	 * able to flush it when devices are being removed. We can't
	 * use schedule_work()/flush_scheduled_work() because both
	 * unregister_netdev() and linkwatch_event take the rtnl lock,
	 * so flush_scheduled_work() can deadlock during device
	 * removal.
	 */
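	/*
	 * The deferred IPoIB work items initialized in ipoib_priv_alloc()
	 * are queued to this private workqueue, which ipoib_remove_one()
	 * flushes before tearing a device down.
	 */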
	ipoib_workqueue = create_singlethread_workqueue("ipoib");
	if (!ipoib_workqueue) {
		ret = -ENOMEM;
		goto err_fs;
	}

	ib_sa_register_client(&ipoib_sa_client);

	ret = ib_register_client(&ipoib_client);
	if (ret)
		goto err_sa;

	return 0;

err_sa:
	ib_sa_unregister_client(&ipoib_sa_client);
	destroy_workqueue(ipoib_workqueue);

err_fs:
	return ret;
}

static void __exit
ipoib_cleanup_module(void)
{
	EVENTHANDLER_DEREGISTER(vlan_config, ipoib_vlan_attach);
	EVENTHANDLER_DEREGISTER(vlan_unconfig, ipoib_vlan_detach);
	ib_unregister_client(&ipoib_client);
	ib_sa_unregister_client(&ipoib_sa_client);
	destroy_workqueue(ipoib_workqueue);
}

/*
 * Infiniband output routine.
 */
static int
ipoib_output(struct ifnet *ifp, struct mbuf *m,
	const struct sockaddr *dst, struct route *ro)
{
	u_char edst[INFINIBAND_ALEN];
#if defined(INET) || defined(INET6)
	struct llentry *lle = NULL;
#endif
	struct ipoib_header *eh;
	int error = 0, is_gw = 0;
	short type;

	if (ro != NULL)
		is_gw = (ro->ro_flags & RT_HAS_GW) != 0;
#ifdef MAC
	error = mac_ifnet_check_transmit(ifp, m);
	if (error)
		goto bad;
#endif

	M_PROFILE(m);
	if (ifp->if_flags & IFF_MONITOR) {
		error = ENETDOWN;
		goto bad;
	}
	if (!((ifp->if_flags & IFF_UP) &&
	    (ifp->if_drv_flags & IFF_DRV_RUNNING))) {
		error = ENETDOWN;
		goto bad;
	}

	switch (dst->sa_family) {
#ifdef INET
	case AF_INET:
		if (lle != NULL && (lle->la_flags & LLE_VALID))
			memcpy(edst, lle->ll_addr, sizeof(edst));
		else if (m->m_flags & M_MCAST)
			ip_ib_mc_map(((struct sockaddr_in *)dst)->sin_addr.s_addr,
			    ifp->if_broadcastaddr, edst);
		else
			error = arpresolve(ifp, is_gw, m, dst, edst, NULL);
		if (error)
			return (error == EWOULDBLOCK ? 0 : error);
		type = htons(ETHERTYPE_IP);
		break;
	case AF_ARP:
	    {
		struct arphdr *ah;
		ah = mtod(m, struct arphdr *);
		ah->ar_hrd = htons(ARPHRD_INFINIBAND);

		switch(ntohs(ah->ar_op)) {
		case ARPOP_REVREQUEST:
		case ARPOP_REVREPLY:
			type = htons(ETHERTYPE_REVARP);
			break;
		case ARPOP_REQUEST:
		case ARPOP_REPLY:
		default:
			type = htons(ETHERTYPE_ARP);
			break;
		}

		if (m->m_flags & M_BCAST)
			bcopy(ifp->if_broadcastaddr, edst, INFINIBAND_ALEN);
		else
			bcopy(ar_tha(ah), edst, INFINIBAND_ALEN);

	    }
	    break;
#endif
#ifdef INET6
	case AF_INET6:
		if (lle != NULL && (lle->la_flags & LLE_VALID))
			memcpy(edst, lle->ll_addr, sizeof(edst));
		else if (m->m_flags & M_MCAST)
			ipv6_ib_mc_map(&((struct sockaddr_in6 *)dst)->sin6_addr,
			    ifp->if_broadcastaddr, edst);
		else
			error = nd6_resolve(ifp, is_gw, m, dst, edst, NULL);
		if (error)
			return error;
		type = htons(ETHERTYPE_IPV6);
		break;
#endif

	default:
		if_printf(ifp, "can't handle af%d\n", dst->sa_family);
		error = EAFNOSUPPORT;
		goto bad;
	}

	/*
	 * Add local net header. If no space in first mbuf,
	 * allocate another.
	 */
	M_PREPEND(m, IPOIB_HEADER_LEN, M_NOWAIT);
	if (m == NULL) {
		error = ENOBUFS;
		goto bad;
	}
	eh = mtod(m, struct ipoib_header *);
	(void)memcpy(&eh->proto, &type, sizeof(eh->proto));
	(void)memcpy(&eh->hwaddr, edst, sizeof (edst));

	/*
	 * Queue message on interface, update output statistics if
	 * successful, and start output if interface not yet active.
	 */
	return ((ifp->if_transmit)(ifp, m));
bad:
	if (m != NULL)
		m_freem(m);
	return (error);
}

/*
 * Upper layer processing for a received Infiniband packet.
 */
void
ipoib_demux(struct ifnet *ifp, struct mbuf *m, u_short proto)
{
	int isr;

#ifdef MAC
	/*
	 * Tag the mbuf with an appropriate MAC label before any other
	 * consumers can get to it.
	 */
	mac_ifnet_create_mbuf(ifp, m);
#endif

	/* Allow monitor mode to claim this frame, after stats are updated. */
	if (ifp->if_flags & IFF_MONITOR) {
		if_printf(ifp, "discard frame at IFF_MONITOR\n");
		m_freem(m);
		return;
	}

	/*
	 * Dispatch frame to upper layer.
	 */
	switch (proto) {
#ifdef INET
	case ETHERTYPE_IP:
		isr = NETISR_IP;
		break;

	case ETHERTYPE_ARP:
		if (ifp->if_flags & IFF_NOARP) {
			/* Discard packet if ARP is disabled on interface */
			m_freem(m);
			return;
		}
		isr = NETISR_ARP;
		break;
#endif
#ifdef INET6
	case ETHERTYPE_IPV6:
		isr = NETISR_IPV6;
		break;
#endif
	default:
		goto discard;
	}
	netisr_dispatch(isr, m);
	return;

discard:
	m_freem(m);
}

/*
 * Process a received Infiniband packet.
 */
static void
ipoib_input(struct ifnet *ifp, struct mbuf *m)
{
	struct ipoib_header *eh;

	if ((ifp->if_flags & IFF_UP) == 0) {
		m_freem(m);
		return;
	}
	CURVNET_SET_QUIET(ifp->if_vnet);

	/* Let BPF have it before we strip the header. */
	IPOIB_MTAP(ifp, m);

	eh = mtod(m, struct ipoib_header *);

	/*
	 * Reset layer specific mbuf flags to avoid confusing upper layers.
	 * Strip off Infiniband header.
	 */
	m->m_flags &= ~M_VLANTAG;
	m_clrprotoflags(m);
	m_adj(m, IPOIB_HEADER_LEN);

	if (IPOIB_IS_MULTICAST(eh->hwaddr)) {
		if (memcmp(eh->hwaddr, ifp->if_broadcastaddr,
		    ifp->if_addrlen) == 0)
			m->m_flags |= M_BCAST;
		else
			m->m_flags |= M_MCAST;
		if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1);
	}

	ipoib_demux(ifp, m, ntohs(eh->proto));
	CURVNET_RESTORE();
}

static int
ipoib_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa,
	struct sockaddr *sa)
{
	struct sockaddr_dl *sdl;
#ifdef INET
	struct sockaddr_in *sin;
#endif
#ifdef INET6
	struct sockaddr_in6 *sin6;
#endif
	u_char *e_addr;

	switch(sa->sa_family) {
	case AF_LINK:
		/*
		 * No mapping needed. Just check that it's a valid MC address.
		 */
		sdl = (struct sockaddr_dl *)sa;
		e_addr = LLADDR(sdl);
		if (!IPOIB_IS_MULTICAST(e_addr))
			return EADDRNOTAVAIL;
		*llsa = 0;
		return 0;

#ifdef INET
	case AF_INET:
		sin = (struct sockaddr_in *)sa;
		if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
			return EADDRNOTAVAIL;
		sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND);
		sdl->sdl_alen = INFINIBAND_ALEN;
		e_addr = LLADDR(sdl);
		ip_ib_mc_map(sin->sin_addr.s_addr, ifp->if_broadcastaddr,
		    e_addr);
		*llsa = (struct sockaddr *)sdl;
		return 0;
#endif
#ifdef INET6
	case AF_INET6:
		sin6 = (struct sockaddr_in6 *)sa;
		/*
		 * An IP6 address of 0 means listen to all
		 * of the multicast address used for IP6.
		 * This has no meaning in ipoib.
		 */
		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
			return EADDRNOTAVAIL;
		if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
			return EADDRNOTAVAIL;
		sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND);
		sdl->sdl_alen = INFINIBAND_ALEN;
		e_addr = LLADDR(sdl);
		ipv6_ib_mc_map(&sin6->sin6_addr, ifp->if_broadcastaddr, e_addr);
		*llsa = (struct sockaddr *)sdl;
		return 0;
#endif

	default:
		return EAFNOSUPPORT;
	}
}

module_init(ipoib_init_module);
module_exit(ipoib_cleanup_module);

static int
ipoib_evhand(module_t mod, int event, void *arg)
{
	return (0);
}

static moduledata_t ipoib_mod = {
	.name = "ipoib",
	.evhand = ipoib_evhand,
};

-DECLARE_MODULE(ipoib, ipoib_mod, SI_SUB_SMP, SI_ORDER_ANY);
+DECLARE_MODULE(ipoib, ipoib_mod, SI_SUB_LAST, SI_ORDER_ANY);
MODULE_DEPEND(ipoib, ibcore, 1, 1, 1);
MODULE_DEPEND(ipoib, linuxkpi, 1, 1, 1);