Index: sys/dev/ntb/test/ntb_perf.c
===================================================================
--- /dev/null
+++ sys/dev/ntb/test/ntb_perf.c
@@ -0,0 +1,1306 @@
+/*-
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2019 Shreyank Amartya
+ * Copyright(c) 2019 Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 AMD Corporation. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of AMD Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * PCIe NTB Perf FreeBSD driver
+ */
+
+/*
+ * How to use this tool, by example.
+ *
+ * Suppose that, aside from the local device, at least one remote device
+ * is connected to the NTB, with peer index 0.
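+ *
+ * A test run fills a local buffer with random data and copies
+ * 2^total_order bytes of it into the peer's memory window in chunks of
+ * 2^chunk_order bytes, then reports the measured per-thread throughput.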
+ *-----------------------------------------------------------------------------
+ * Eg: load the driver (chunk/total orders and the DMA flag are set below)
+ *
+ * root@freebsd:~ # kldload ntb_perf
+ *-----------------------------------------------------------------------------
+ * Eg: set the test parameter total order using the sysctl total_order
+ *
+ * root@freebsd:~ # sysctl dev.ntb_perf.0.total_order=20
+ *-----------------------------------------------------------------------------
+ * Eg: set the test parameter chunk order using the sysctl data_order
+ *
+ * root@freebsd:~ # sysctl dev.ntb_perf.0.data_order=20
+ *-----------------------------------------------------------------------------
+ * Eg: start a performance test with peer (index 0) and get the test metrics
+ *
+ * root@freebsd:~ # sysctl dev.ntb_perf.0.run=0
+ * root@freebsd:~ # sysctl dev.ntb_perf.0.read_stats
+ *-----------------------------------------------------------------------------
+ * Eg: check NTB ports (index) and MW mapping information
+ *
+ * root@freebsd:~ # sysctl dev.ntb_perf.0.info
+ *-----------------------------------------------------------------------------
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/module.h>
+#include <sys/malloc.h>
+#include <sys/sysctl.h>
+#include <sys/sbuf.h>
+#include <sys/taskqueue.h>
+#include <sys/callout.h>
+#include <sys/interrupt.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/libkern.h>
+#include <sys/limits.h>
+
+#include <machine/bus.h>
+#include <machine/atomic.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <linux/ktime.h>
+#include <linux/bitops.h>
+
+#include <dev/pci/pcireg.h>
+#include <dev/pci/pcivar.h>
+
+#include "../ntb.h"
+
+#define DRIVER_NAME		"ntb_perf"
+#define DRIVER_VERSION		"1.0"
+
+#define MAX_THREADS_CNT		32
+#define DEF_THREADS_CNT		1
+#define MAX_CHUNK_SIZE		0x100000
+#define MAX_CHUNK_ORDER		20	/* no larger than 1M */
+
+#define MSG_TRIES		500
+
+#define lower_32_bits(n)	((uint32_t)(n))
+#define upper_32_bits(n)	((uint32_t)((n) >> 32))
+
+MALLOC_DEFINE(M_PERF, "ntb_perf", "ntb perf data");
+
+SYSCTL_NODE(_debug, OID_AUTO, ntb_perf, CTLFLAG_RWTUN, NULL,
+    "NTB Perf Debugging");
+
+static unsigned g_ntb_perf_debug_level;
+SYSCTL_UINT(_debug_ntb_perf, OID_AUTO, debug_level, CTLFLAG_RWTUN,
+    &g_ntb_perf_debug_level, 0, "NTB Perf log level -- higher is more verbose");
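+
+/*
+ * Command/doorbell tracing can be enabled at run time through the debug
+ * sysctl tree declared above, e.g.:
+ *
+ * root@freebsd:~ # sysctl debug.ntb_perf.debug_level=1
+ */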
+
+/* Note: the macro expects a local "struct perf_ctx *perf" to be in scope. */
+#define ntb_perf_printf(lvl, ...) do {					\
+	if (lvl <= g_ntb_perf_debug_level)				\
+		device_printf(perf->dev, __VA_ARGS__);			\
+} while (0)
+
+/*
+ *==============================================================================
+ *                         Static data declarations
+ *==============================================================================
+ */
+
+static unsigned int max_mw_size = 0x100000;
+static unsigned int chunk_order = 19;
+static unsigned int total_order = 30;
+static bool use_dma = false;
+static struct taskqueue *work_queue;
+
+/*
+ *==============================================================================
+ *                        Perf driver data definition
+ *==============================================================================
+ */
+
+enum perf_cmd {
+	PERF_CMD_INVAL = -1,	/* invalid spad command */
+	PERF_CMD_SSIZE = 0,	/* send out buffer size */
+	PERF_CMD_RSIZE = 1,	/* recv in buffer size */
+	PERF_CMD_SXLAT = 2,	/* send in buffer xlat */
+	PERF_CMD_RXLAT = 3,	/* recv out buffer xlat */
+	PERF_CMD_CLEAR = 4,	/* clear allocated memory */
+	PERF_STS_DONE  = 5,	/* init is done */
+	PERF_STS_LNKUP = 6,	/* link up state flag */
+};
+
+struct perf_ctx;
+
+struct perf_peer {
+	struct perf_ctx	*perf;
+	int		pidx;
+	int		gidx;
+
+	/* Outbound MW params */
+	bus_addr_t	addr_limit;
+	caddr_t		outbuf;
+	size_t		outbuf_size;
+	size_t		xlat_align_size;
+	size_t		xlat_align;
+	uint64_t	outbuf_xlat;
+
+	/* Inbound MW params */
+	bus_dmamap_t	dma_map;
+	bus_dma_tag_t	dma_tag;
+	caddr_t		inbuf;
+	size_t		inbuf_size;
+	bus_addr_t	inbuf_xlat;
+
+	/* NTB connection setup service */
+	unsigned long	sts;
+	struct task	service_task;
+};
+
+struct perf_thread {
+	struct perf_ctx	*perf;
+	int		tidx;
+
+	/* Data source and measured statistics */
+	ktime_t		duration;
+	void		*src;
+	int		status;
+	uint64_t	copied;
+	struct task	work_task;
+};
+
+struct perf_ctx {
+	device_t	dev;
+
+	/* Global device index and peers descriptors */
+	int		gidx;
+	int		pcnt;
+	struct perf_peer *peers;
+
+	/* Performance measuring work-threads interface */
+	size_t		tcnt;
+	unsigned int	tsync;
+	unsigned long	busy_flag;
+	struct callout	clout;
+	struct perf_peer *test_peer;
+	struct perf_thread threads[MAX_THREADS_CNT];
+
+	/* Scratchpad/Message IO operations */
+	int	(*cmd_send)(struct perf_peer *peer, enum perf_cmd cmd,
+	    uint64_t data);
+	int	(*cmd_recv)(struct perf_ctx *perf, int *pidx,
+	    enum perf_cmd *cmd, uint64_t *data);
+};
+
+struct ntb_load_cb_args {
+	bus_addr_t addr;
+	int error;
+};
+
+/*
+ * Scratchpad-based command interface
+ */
+#define PERF_SPAD_CNT(_pcnt)	(3 * ((_pcnt) + 1))
+#define PERF_SPAD_CMD(_gidx)	(3 * (_gidx))
+#define PERF_SPAD_LDATA(_gidx)	(3 * (_gidx) + 1)
+#define PERF_SPAD_HDATA(_gidx)	(3 * (_gidx) + 2)
+#define PERF_SPAD_NOTIFY(_gidx)	(BIT_ULL(_gidx))
+
+#define BIT_ULL_MASK(nr)	(1ULL << ((nr) % BITS_PER_LONG_LONG))
+
+/*
+ *==============================================================================
+ *                 NTB cross-link commands execution service
+ *==============================================================================
+ */
+
+static void perf_terminate_test(struct perf_ctx *perf);
+
+static inline bool
+perf_link_is_up(struct perf_peer *peer)
+{
+	uint64_t link;
+
+	link = ntb_link_is_up(peer->perf->dev, NULL, NULL);
+	return (!!(link & BIT_ULL_MASK(peer->pidx)));
+}
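+
+/*
+ * A command travels through three peer scratchpads plus a doorbell: the
+ * sender waits for the peer's CMD scratchpad to read PERF_CMD_INVAL,
+ * writes the 64-bit payload into the LDATA/HDATA scratchpads, publishes
+ * the command code in the CMD scratchpad and finally rings the peer's
+ * notify doorbell bit.  For example, for global index 1 the command
+ * occupies scratchpads 3..5 and uses notify bit 1.
+ */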
+
+static int
+perf_spad_cmd_send(struct perf_peer *peer, enum perf_cmd cmd,
+    uint64_t data)
+{
+	struct perf_ctx *perf = peer->perf;
+	int try, ret;
+	uint32_t val = 0;
+
+	for (try = 0; try < MSG_TRIES; try++) {
+		if (!perf_link_is_up(peer))
+			return (ENOLINK);
+
+		ret = ntb_peer_spad_read(perf->dev, PERF_SPAD_CMD(perf->gidx),
+		    &val);
+		if (ret != 0 || val != PERF_CMD_INVAL) {
+			DELAY(2000);
+			continue;
+		}
+
+		ntb_peer_spad_write(perf->dev,
+		    PERF_SPAD_LDATA(perf->gidx), lower_32_bits(data));
+		ntb_peer_spad_write(perf->dev,
+		    PERF_SPAD_HDATA(perf->gidx), upper_32_bits(data));
+
+		barrier();
+
+		ntb_peer_spad_write(perf->dev, PERF_SPAD_CMD(perf->gidx), cmd);
+
+		barrier();
+
+		ntb_peer_db_set(perf->dev, PERF_SPAD_NOTIFY(peer->gidx));
+
+		ntb_perf_printf(1, "%s: DB ring peer %#llx\n", __func__,
+		    PERF_SPAD_NOTIFY(peer->gidx));
+		break;
+	}
+
+	return (try < MSG_TRIES ? 0 : EAGAIN);
+}
+
+static int
+perf_spad_cmd_recv(struct perf_ctx *perf, int *pidx,
+    enum perf_cmd *cmd, uint64_t *data)
+{
+	struct perf_peer *peer;
+	uint32_t val;
+	int ret;
+
+	ntb_db_clear(perf->dev, PERF_SPAD_NOTIFY(perf->gidx));
+
+	for (*pidx = 0; *pidx < perf->pcnt; (*pidx)++) {
+		peer = &perf->peers[*pidx];
+
+		if (!perf_link_is_up(peer))
+			continue;
+
+		ret = ntb_spad_read(perf->dev, PERF_SPAD_CMD(peer->gidx),
+		    &val);
+		if (ret != 0 || val == PERF_CMD_INVAL)
+			continue;
+
+		*cmd = val;
+
+		ret = ntb_spad_read(perf->dev, PERF_SPAD_LDATA(peer->gidx),
+		    &val);
+		*data = val;
+
+		ret = ntb_spad_read(perf->dev, PERF_SPAD_HDATA(peer->gidx),
+		    &val);
+		*data |= (uint64_t)val << 32;
+
+		/* Next command can be retrieved from now on */
+		ntb_spad_write(perf->dev, PERF_SPAD_CMD(peer->gidx),
+		    PERF_CMD_INVAL);
+
+		ntb_perf_printf(1, "%s: CMD recv: %d 0x%llx\n",
+		    __func__, *cmd, (unsigned long long)*data);
+
+		return (0);
+	}
+
+	return (EINVAL);
+}
+
+static int
+perf_cmd_send(struct perf_peer *peer, enum perf_cmd cmd, uint64_t data)
+{
+	struct perf_ctx *perf = peer->perf;
+
+	if (cmd == PERF_CMD_SSIZE || cmd == PERF_CMD_SXLAT)
+		return (perf->cmd_send(peer, cmd, data));
+
+	return (EINVAL);
+}
+
+static int
+perf_cmd_exec(struct perf_peer *peer, enum perf_cmd cmd)
+{
+	struct perf_ctx *perf = peer->perf;
+
+	switch (cmd) {
+	case PERF_CMD_SSIZE:
+	case PERF_CMD_RSIZE:
+	case PERF_CMD_SXLAT:
+	case PERF_CMD_RXLAT:
+	case PERF_CMD_CLEAR:
+		break;
+	default:
+		ntb_perf_printf(1, "%s: Exec invalid command\n", __func__);
+		return (EINVAL);
+	}
+
+	set_bit(cmd, &peer->sts);
+	ntb_perf_printf(1, "%s: CMD exec: %d\n", __func__, cmd);
+	taskqueue_enqueue(taskqueue_swi, &peer->service_task);
+
+	return (0);
+}
+
+static int
+perf_cmd_recv(struct perf_ctx *perf)
+{
+	struct perf_peer *peer;
+	enum perf_cmd cmd;
+	uint64_t data;
+	int ret, pidx;
+
+	/* Spin until the link is reported up before draining commands */
+	while (!ntb_link_is_up(perf->dev, NULL, NULL))
+		;
+
+	while (!(ret = perf->cmd_recv(perf, &pidx, &cmd, &data))) {
+		peer = &perf->peers[pidx];
+
+		switch (cmd) {
+		case PERF_CMD_SSIZE:
+			peer->inbuf_size = data;
+			return (perf_cmd_exec(peer, PERF_CMD_RSIZE));
+		case PERF_CMD_SXLAT:
+			peer->outbuf_xlat = data;
+			return (perf_cmd_exec(peer, PERF_CMD_RXLAT));
+		default:
+			ntb_perf_printf(1, "%s: Received invalid command\n",
+			    __func__);
+			return (EINVAL);
+		}
+	}
+
+	return (0);
+}
+
+static void
+perf_link_event(void *ctx)
+{
+	struct perf_ctx *perf = ctx;
+	struct perf_peer *peer;
+	bool lnk_up;
+	int pidx;
+
+	for (pidx = 0; pidx < perf->pcnt; pidx++) {
+		peer = &perf->peers[pidx];
+
+		lnk_up = perf_link_is_up(peer);
+
+		ntb_perf_printf(1, "%s: Link status: %d\n", __func__, lnk_up);
+		if (lnk_up && !test_and_set_bit(PERF_STS_LNKUP, &peer->sts)) {
+			perf_cmd_exec(peer, PERF_CMD_SSIZE);
+		} else if (!lnk_up &&
+		    test_and_clear_bit(PERF_STS_LNKUP, &peer->sts)) {
+			perf_cmd_exec(peer, PERF_CMD_CLEAR);
+		}
+	}
+}
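+
+/*
+ * Buffer negotiation runs as a two-step handshake on each link-up event:
+ * the local side sends its outbuf size (SSIZE), the peer allocates a
+ * matching inbuf and answers with its translation address (SXLAT), which
+ * the local side then programs as its outbuf translation (RXLAT).  Both
+ * directions perform this exchange independently, driven by
+ * perf_service_work() below.
+ */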
+
+static void
+perf_db_event(void *ctx, uint32_t vec)
+{
+	struct perf_ctx *perf = ctx;
+
+	ntb_perf_printf(1, "%s: DB vec %d mask %#llx bits %#llx\n", __func__,
+	    vec, (unsigned long long)ntb_db_vector_mask(perf->dev, vec),
+	    (unsigned long long)ntb_db_read(perf->dev));
+
+	/* Just receive all available commands */
+	perf_cmd_recv(perf);
+}
+
+static const struct ntb_ctx_ops perf_ops = {
+	.link_event = perf_link_event,
+	.db_event = perf_db_event,
+};
+
+static int
+perf_setup_outbuf(struct perf_peer *peer)
+{
+	/* Initialization is finally done */
+	set_bit(PERF_STS_DONE, &peer->sts);
+
+	return (0);
+}
+
+static void
+perf_free_inbuf(struct perf_peer *peer)
+{
+	if (peer->inbuf == NULL)
+		return;
+
+	ntb_mw_clear_trans(peer->perf->dev, peer->gidx);
+	if (peer->dma_tag != NULL) {
+		bus_dmamap_unload(peer->dma_tag, peer->dma_map);
+		bus_dmamem_free(peer->dma_tag, peer->inbuf, peer->dma_map);
+		bus_dma_tag_destroy(peer->dma_tag);
+		peer->dma_tag = NULL;
+	}
+	peer->inbuf_size = 0;
+	peer->inbuf = NULL;
+}
+
+static void
+ntb_load_cb(void *xsc, bus_dma_segment_t *segs, int nsegs, int error)
+{
+	struct ntb_load_cb_args *cba = (struct ntb_load_cb_args *)xsc;
+
+	if (!(cba->error = error))
+		cba->addr = segs[0].ds_addr;
+}
+
+static int
+perf_setup_inbuf(struct perf_peer *peer)
+{
+	struct perf_ctx *perf = peer->perf;
+	struct ntb_load_cb_args cba;
+	int ret;
+
+	if (peer->inbuf_size > peer->outbuf_size) {
+		ntb_perf_printf(1, "%s: Too big inbuf size %zu > %zu\n",
+		    __func__, peer->inbuf_size, peer->outbuf_size);
+		return (EINVAL);
+	}
+
+	perf_free_inbuf(peer);
+
+	if (bus_dma_tag_create(bus_get_dma_tag(perf->dev), peer->xlat_align,
+	    0, peer->addr_limit, BUS_SPACE_MAXADDR,
+	    NULL, NULL, peer->inbuf_size, 1, peer->inbuf_size,
+	    0, NULL, NULL, &peer->dma_tag)) {
+		ntb_perf_printf(1,
+		    "%s: Unable to create MW tag of size %zu/%zu\n",
+		    __func__, peer->inbuf_size, peer->outbuf_size);
+		peer->outbuf_size = 0;
+		peer->inbuf_size = 0;
+		return (ENOMEM);
+	}
+	if (bus_dmamem_alloc(peer->dma_tag, (void **)&peer->inbuf,
+	    BUS_DMA_WAITOK | BUS_DMA_ZERO, &peer->dma_map)) {
+		bus_dma_tag_destroy(peer->dma_tag);
+		peer->dma_tag = NULL;
+		ntb_perf_printf(1,
+		    "%s: Unable to allocate MW buffer of size %zu/%zu\n",
+		    __func__, peer->inbuf_size, peer->outbuf_size);
+		peer->outbuf_size = 0;
+		peer->inbuf_size = 0;
+		return (ENOMEM);
+	}
+	if (bus_dmamap_load(peer->dma_tag, peer->dma_map, peer->inbuf,
+	    peer->inbuf_size, ntb_load_cb, &cba, BUS_DMA_NOWAIT) != 0 ||
+	    cba.error != 0) {
+		bus_dmamem_free(peer->dma_tag, peer->inbuf, peer->dma_map);
+		bus_dma_tag_destroy(peer->dma_tag);
+		peer->dma_tag = NULL;
+		peer->inbuf = NULL;
+		ntb_perf_printf(1,
+		    "%s: Unable to load MW buffer of size %zu/%zu\n",
+		    __func__, peer->inbuf_size, peer->outbuf_size);
+		peer->outbuf_size = 0;
+		peer->inbuf_size = 0;
+		return (ENOMEM);
+	}
+
+	peer->inbuf_xlat = cba.addr;
+	ret = ntb_mw_set_trans(perf->dev, peer->gidx, peer->inbuf_xlat,
+	    peer->inbuf_size);
+	if (ret) {
+		ntb_perf_printf(1, "%s: Failed to set inbuf translation\n",
+		    __func__);
+		goto err_free_inbuf;
+	}
+
+	perf_cmd_exec(peer, PERF_CMD_SXLAT);
+
+	return (0);
+
+err_free_inbuf:
+	perf_free_inbuf(peer);
+	return (ret);
+}
+
+static int
+perf_init_service(struct perf_ctx *perf)
+{
+	uint64_t mask;
+
+	/* Check the scratchpad count and doorbell mask */
+	mask = GENMASK_ULL(perf->pcnt, 0);
+	if (ntb_spad_count(perf->dev) >= PERF_SPAD_CNT(perf->pcnt) &&
+	    (ntb_db_valid_mask(perf->dev) & mask) == mask) {
+		perf->cmd_send = perf_spad_cmd_send;
+		perf->cmd_recv = perf_spad_cmd_recv;
+
+		ntb_perf_printf(1, "%s: Scratchpad service initialized\n",
+		    __func__);
+		return (0);
+	}
+
+	ntb_perf_printf(1, "%s: Scratchpad service unsupported\n", __func__);
+
+	return (EINVAL);
+}
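+
+/*
+ * With pcnt peers the scratchpad service needs 3 * (pcnt + 1) scratchpads
+ * (a CMD/LDATA/HDATA triplet per global index) and one doorbell bit per
+ * port.  E.g. a two-port setup (pcnt = 1) requires at least 6 scratchpads
+ * and doorbell bits 0-1.
+ */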
+
+static int
+perf_enable_service(struct perf_ctx *perf)
+{
+	uint64_t mask, incmd_bit;
+	int ret, sidx, scnt;
+
+	mask = ntb_db_valid_mask(perf->dev);
+	ntb_db_set_mask(perf->dev, mask);
+
+	ret = ntb_set_ctx(perf->dev, perf, &perf_ops);
+	if (ret)
+		return (ret);
+
+	if (perf->cmd_send == perf_spad_cmd_send) {
+		scnt = ntb_spad_count(perf->dev);
+		for (sidx = 0; sidx < scnt; sidx++)
+			ntb_spad_write(perf->dev, sidx, PERF_CMD_INVAL);
+		incmd_bit = PERF_SPAD_NOTIFY(perf->gidx);
+		ntb_db_clear_mask(perf->dev, incmd_bit);
+
+		ntb_perf_printf(1, "%s: DB bits unmasked %#llx\n",
+		    __func__, (unsigned long long)incmd_bit);
+	}
+
+	ntb_link_enable(perf->dev, NTB_SPEED_AUTO, NTB_WIDTH_AUTO);
+	ntb_perf_printf(1, "%s: Service Enabled\n", __func__);
+
+	return (0);
+}
+
+static void
+perf_service_work(void *arg, int npending)
+{
+	struct perf_peer *peer = arg;
+	struct perf_ctx *perf = peer->perf;
+
+	if (test_and_clear_bit(PERF_CMD_SSIZE, &peer->sts))
+		perf_cmd_send(peer, PERF_CMD_SSIZE, peer->outbuf_size);
+
+	if (test_and_clear_bit(PERF_CMD_RSIZE, &peer->sts))
+		perf_setup_inbuf(peer);
+
+	if (test_and_clear_bit(PERF_CMD_SXLAT, &peer->sts))
+		perf_cmd_send(peer, PERF_CMD_SXLAT, peer->inbuf_xlat);
+
+	if (test_and_clear_bit(PERF_CMD_RXLAT, &peer->sts))
+		perf_setup_outbuf(peer);
+
+	if (test_and_clear_bit(PERF_CMD_CLEAR, &peer->sts)) {
+		clear_bit(PERF_STS_DONE, &peer->sts);
+		if (test_bit(0, &peer->perf->busy_flag) &&
+		    peer == peer->perf->test_peer) {
+			ntb_perf_printf(1, "%s: Freeing while test on-fly\n",
+			    __func__);
+			perf_terminate_test(peer->perf);
+		}
+		perf_free_inbuf(peer);
+	}
+}
+
+static void
+perf_disable_service(struct perf_ctx *perf)
+{
+	int pidx;
+
+	ntb_link_disable(perf->dev);
+
+	ntb_db_set_mask(perf->dev, PERF_SPAD_NOTIFY(perf->gidx));
+
+	ntb_clear_ctx(perf->dev);
+
+	for (pidx = 0; pidx < perf->pcnt; pidx++)
+		perf_cmd_exec(&perf->peers[pidx], PERF_CMD_CLEAR);
+
+	for (pidx = 0; pidx < perf->pcnt; pidx++)
+		taskqueue_drain(taskqueue_swi,
+		    &perf->peers[pidx].service_task);
+}
+
+/*
+ *==============================================================================
+ *                     Performance measuring work-thread
+ *==============================================================================
+ */
+
+static int
+perf_copy_chunk(struct perf_thread *pthr,
+    void *dst, void *src, size_t len)
+{
+	if (!use_dma) {
+		memcpy(dst, src, len);
+		goto ret_check_tsync;
+	}
+
+	/*
+	 * XXX: DMA engine support is not implemented yet; with use_dma
+	 * set no data is actually copied.
+	 */
+
+ret_check_tsync:
+	return (likely(atomic_load_int(&pthr->perf->tsync) > 0) ? 0 : EINTR);
+}
+
+static int
+perf_init_test(struct perf_thread *pthr)
+{
+	struct perf_ctx *perf = pthr->perf;
+
+	pthr->src = malloc(perf->test_peer->outbuf_size, M_PERF, M_WAITOK);
+	if (pthr->src == NULL)
+		return (ENOMEM);
+	arc4rand(pthr->src, perf->test_peer->outbuf_size, 1);
+	ntb_perf_printf(1, "%s: Test init, alloc %llu random bytes\n",
+	    __func__, (unsigned long long)perf->test_peer->outbuf_size);
+
+	return (0);
+}
+
+static int
+perf_run_test(struct perf_thread *pthr)
+{
+	struct perf_peer *peer = pthr->perf->test_peer;
+	struct perf_ctx *perf = pthr->perf;
+	void *flt_dst, *bnd_dst;
+	uint64_t total_size, chunk_size;
+	void *flt_src;
+	int ret = 0;
+
+	total_size = 1ULL << total_order;
+	chunk_size = 1ULL << chunk_order;
+	chunk_size = min(peer->outbuf_size, chunk_size);
+
+	flt_src = pthr->src;
+	bnd_dst = peer->outbuf + peer->outbuf_size;
+	flt_dst = peer->outbuf;
+
+	pthr->duration = ktime_get();
+
+	/* Copied field is cleared on test launch stage */
+	while (pthr->copied < total_size) {
+		ret = perf_copy_chunk(pthr, flt_dst, flt_src, chunk_size);
+		if (ret) {
+			ntb_perf_printf(1, "%s: %d: Got error %d on test\n",
+			    __func__, pthr->tidx, ret);
+			return (ret);
+		}
+		pthr->copied += chunk_size;
+
+		/* Wrap the source and destination back to the buffer start */
+		flt_dst = (char *)flt_dst + chunk_size;
+		flt_src = (char *)flt_src + chunk_size;
+		if (flt_dst >= bnd_dst || flt_dst < (void *)peer->outbuf) {
+			flt_dst = peer->outbuf;
+			flt_src = pthr->src;
+		}
+		sched_relinquish(curthread);
+	}
+	ntb_perf_printf(1, "%s: Data transfer complete\n", __func__);
+
+	return (0);
+}
+
+static int
+perf_sync_test(struct perf_thread *pthr)
+{
+	struct perf_ctx *perf = pthr->perf;
+
+	pthr->duration = ktime_sub(ktime_get(), pthr->duration);
+
+	ntb_perf_printf(1, "%s: %d: copied %llu bytes\n",
+	    __func__, pthr->tidx, (unsigned long long)pthr->copied);
+
+	ntb_perf_printf(1, "%s: %d: lasted %llu usecs\n",
+	    __func__, pthr->tidx,
+	    (unsigned long long)ktime_to_us(pthr->duration));
+
+	ntb_perf_printf(1, "%s: %d: %llu MBytes/s\n", __func__, pthr->tidx,
+	    (unsigned long long)(pthr->copied / ktime_to_us(pthr->duration)));
+
+	return (0);
+}
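+
+/*
+ * Note on units: bytes divided by microseconds equals MBytes/s exactly
+ * (10^6 bytes per 10^6 usecs), e.g. 2^30 bytes copied in 500000 usecs is
+ * reported as 1073741824 / 500000 = 2147 MBytes/s.
+ */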
+
+static void
+perf_clear_test(struct perf_thread *pthr)
+{
+	atomic_subtract_int(&pthr->perf->tsync, 1);
+	free(pthr->src, M_PERF);
+	if (atomic_load_int(&pthr->perf->tsync) == 0)
+		clear_bit(0, &pthr->perf->busy_flag);
+}
+
+static void
+perf_thread_work(void *arg, int npending)
+{
+	struct perf_thread *pthr = arg;
+	struct perf_ctx *perf = pthr->perf;
+	int ret;
+
+	ntb_perf_printf(1, "%s: Perf thread work tidx: %d\n",
+	    __func__, pthr->tidx);
+
+	/*
+	 * Perform stages in compliance with use_dma flag value.
+	 * Test status is changed only if an error happened, otherwise
+	 * status -EINVAL is kept while the test is on-fly.  Results
+	 * synchronization is performed only if the test finished
+	 * without an error or interruption.
+	 */
+	ret = perf_init_test(pthr);
+	if (ret) {
+		pthr->status = ret;
+		return;
+	}
+
+	ret = perf_run_test(pthr);
+	if (ret) {
+		pthr->status = ret;
+		goto err_clear_test;
+	}
+
+	pthr->status = perf_sync_test(pthr);
+
+err_clear_test:
+	perf_clear_test(pthr);
+}
+
+static int
+perf_submit_test(struct perf_peer *peer)
+{
+	struct perf_ctx *perf = peer->perf;
+	struct perf_thread *pthr;
+	int tidx;
+
+	ntb_perf_printf(1, "%s: Perf test submitted\n", __func__);
+
+	callout_init(&perf->clout, 1);
+	taskqueue_start_threads(&work_queue, perf->tcnt,
+	    PI_DISK, "ntbtqthread");
+
+	if (!test_bit(PERF_STS_DONE, &peer->sts))
+		return (ENOLINK);
+
+	if (test_and_set_bit(0, &perf->busy_flag))
+		return (EBUSY);
+
+	perf->test_peer = peer;
+	atomic_set_int(&perf->tsync, perf->tcnt);
+
+	for (tidx = 0; tidx < MAX_THREADS_CNT; tidx++) {
+		pthr = &perf->threads[tidx];
+
+		pthr->status = -EINVAL;
+		pthr->copied = 0;
+		pthr->duration = ktime_set(0, 0);
+		if (tidx < perf->tcnt)
+			taskqueue_enqueue(work_queue, &pthr->work_task);
+	}
+
+	return (0);
+}
+
+static void
+perf_terminate_test(struct perf_ctx *perf)
+{
+	atomic_set_int(&perf->tsync, -1);
+}
+
+static int
+perf_read_stats(struct perf_ctx *perf, struct sysctl_req *req)
+{
+	struct perf_thread *pthr;
+	struct sbuf *sb;
+	int tidx;
+	int rc;
+	size_t size = 1024;
+
+	if (perf->test_peer == NULL)
+		return (0);
+
+	if (test_bit(0, &perf->busy_flag))
+		return (EBUSY);
+
+	sb = sbuf_new_for_sysctl(NULL, NULL, size, req);
+	if (sb == NULL)
+		return (ENOMEM);
+
+	sbuf_printf(sb, "\nPeer %d test statistics:\n", perf->test_peer->pidx);
+
+	for (tidx = 0; tidx < MAX_THREADS_CNT; tidx++) {
+		pthr = &perf->threads[tidx];
+
+		if (pthr->status == -EINVAL)
+			continue;
+
+		if (pthr->status) {
+			sbuf_printf(sb, "%d: error status %d\n",
+			    tidx, pthr->status);
+			continue;
+		}
+
+		sbuf_printf(sb,
+		    "%d: copied %llu bytes in %llu usecs, %llu MBytes/s\n",
+		    tidx, (unsigned long long)pthr->copied,
+		    (unsigned long long)ktime_to_us(pthr->duration),
+		    (unsigned long long)(pthr->copied /
+		    ktime_to_us(pthr->duration)));
+	}
+	rc = sbuf_finish(sb);
+	sbuf_delete(sb);
+
+	return (rc);
+}
+
+static void
+perf_init_threads(struct perf_ctx *perf)
+{
+	struct perf_thread *pthr;
+	int tidx;
+
+	perf->tcnt = DEF_THREADS_CNT;
+	perf->test_peer = &perf->peers[0];
+
+	for (tidx = 0; tidx < MAX_THREADS_CNT; tidx++) {
+		pthr = &perf->threads[tidx];
+		pthr->perf = perf;
+		pthr->tidx = tidx;
+		pthr->status = -EINVAL;
+		TASK_INIT(&pthr->work_task, 0, perf_thread_work, pthr);
+	}
+}
+
+static void
+perf_clear_threads(struct perf_ctx *perf)
+{
+	perf_terminate_test(perf);
+}
+
+/*
+ *==============================================================================
+ *                             Sysctl functions
+ *==============================================================================
+ */
+
+static int
+ntb_sysctl_info_handler(SYSCTL_HANDLER_ARGS)
+{
+	struct perf_ctx *perf = (struct perf_ctx *)arg1;
+	struct perf_peer *peer;
+	struct sbuf *sb;
+	int rc, pidx;
+	uint32_t size = 4096;
+
+	rc = sysctl_wire_old_buffer(req, 0);
+	if (rc != 0)
+		goto error;
+
+	sb = sbuf_new_for_sysctl(NULL, NULL, size, req);
+	if (sb == NULL) {
+		rc = ENOMEM;
+		goto error;
+	}
+
+	sbuf_printf(sb, "NTB Performance measuring tool info:\n\n");
+	sbuf_printf(sb, "Local Port %d, Global Index %d\n",
+	    ntb_port_number(perf->dev), perf->gidx);
+	sbuf_printf(sb, "Test status: ");
+	if (test_bit(0, &perf->busy_flag)) {
+		sbuf_printf(sb, "on-fly with port %d (%d)\n",
+		    ntb_peer_port_number(perf->dev, perf->test_peer->pidx),
+		    perf->test_peer->pidx);
+	} else {
+		sbuf_printf(sb, "idle\n");
+	}
+
+	for (pidx = 0; pidx < perf->pcnt; pidx++) {
+		peer = &perf->peers[pidx];
+
+		sbuf_printf(sb, "Port %d (%d), Global index %d:\n",
+		    ntb_peer_port_number(perf->dev, peer->pidx),
+		    peer->pidx, peer->gidx);
+
+		sbuf_printf(sb, "\tLink Status: %s\n",
+		    test_bit(PERF_STS_LNKUP, &peer->sts) ? "up" : "down");
+
+		sbuf_printf(sb, "\tOut buffer addr %p\n", peer->outbuf);
+
+		sbuf_printf(sb, "\tOut buffer size 0x%zx\n",
+		    peer->outbuf_size);
+
+		sbuf_printf(sb, "\tOut buffer xlat 0x%jx\n",
+		    (uintmax_t)peer->outbuf_xlat);
+
+		if (peer->inbuf == NULL) {
+			sbuf_printf(sb, "\tIn buffer addr: unallocated\n");
+			continue;
+		}
+
+		sbuf_printf(sb, "\tIn buffer addr %p\n", peer->inbuf);
+
+		sbuf_printf(sb, "\tIn buffer size 0x%zx\n",
+		    peer->inbuf_size);
+
+		sbuf_printf(sb, "\tIn buffer xlat 0x%jx\n",
+		    (uintmax_t)peer->inbuf_xlat);
+	}
+	rc = sbuf_finish(sb);
+	sbuf_delete(sb);
+error:
+	return (rc);
+}
+
+static int
+ntb_sysctl_run_handler(SYSCTL_HANDLER_ARGS)
+{
+	struct perf_ctx *perf = (struct perf_ctx *)arg1;
+	struct perf_peer *peer = perf->test_peer;
+	int ret;
+	int pidx = -1;
+
+	if (peer != NULL)
+		pidx = peer->pidx;
+
+	ret = sysctl_handle_int(oidp, &pidx, 0, req);
+	if (ret != 0 || req->newptr == NULL)
+		goto error;
+
+	if (pidx < 0 || pidx >= perf->pcnt) {
+		ret = EINVAL;
+		goto error;
+	}
+	peer = &perf->peers[pidx];
+	ret = perf_submit_test(peer);
+error:
+	return (ret);
+}
+
+static int
+ntb_sysctl_tcount_handler(SYSCTL_HANDLER_ARGS)
+{
+	struct perf_ctx *perf = (struct perf_ctx *)arg1;
+	u_int tcnt;
+	int ret = 0;
+
+	tcnt = perf->tcnt;
+
+	ret = sysctl_handle_int(oidp, &tcnt, 0, req);
+	if (ret != 0 || req->newptr == NULL)
+		goto error;
+
+	if (tcnt >= 1 && tcnt <= MAX_THREADS_CNT) {
+		perf->tcnt = tcnt;
+	} else {
+		ret = EINVAL;
+		goto error;
+	}
+	ntb_perf_printf(1, "%s: Thread count set to: %zu\n",
+	    __func__, perf->tcnt);
+error:
+	return (ret);
+}
+
+static int
+ntb_sysctl_mwsize_handler(SYSCTL_HANDLER_ARGS)
+{
+	struct perf_ctx *perf = (struct perf_ctx *)arg1;
+	int ret = 0;
+
+	ret = sysctl_handle_int(oidp, &max_mw_size, 0, req);
+	if (ret != 0 || req->newptr == NULL)
+		goto error;
+
+	ntb_perf_printf(1, "%s: Max MW size set to: %u\n",
+	    __func__, max_mw_size);
+error:
+	return (ret);
+}
+
+static int
+ntb_sysctl_corder_handler(SYSCTL_HANDLER_ARGS)
+{
+	struct perf_ctx *perf = (struct perf_ctx *)arg1;
+	int ret = 0;
+
+	ret = sysctl_handle_int(oidp, &chunk_order, 0, req);
+	if (ret != 0 || req->newptr == NULL)
+		goto error;
+
+	if (chunk_order > MAX_CHUNK_ORDER)
+		chunk_order = MAX_CHUNK_ORDER;
+
+	ntb_perf_printf(1, "%s: Chunk order set to: %u\n",
+	    __func__, chunk_order);
+error:
+	return (ret);
+}
+
+static int
+ntb_sysctl_torder_handler(SYSCTL_HANDLER_ARGS)
+{
+	struct perf_ctx *perf = (struct perf_ctx *)arg1;
+	int ret = 0;
+
+	ret = sysctl_handle_int(oidp, &total_order, 0, req);
+	if (ret != 0 || req->newptr == NULL)
+		goto error;
+
+	if (total_order < chunk_order)
+		total_order = chunk_order;
+
+	ntb_perf_printf(1, "%s: Total order set to: %u\n",
+	    __func__, total_order);
+error:
+	return (ret);
+}
"True" : "False"); +error: + return (ret); +} + +static int +ntb_sysctl_read_stats_handler(SYSCTL_HANDLER_ARGS) +{ + struct perf_ctx *perf = (struct perf_ctx*)arg1; + int ret; + + ret = perf_read_stats(perf, req); + if (ret != 0 || req->newptr == NULL) + goto error; +error: + return (ret); +} + +static void +perf_setup_sysctl(struct perf_ctx *perf) +{ + struct sysctl_oid_list *globals; + struct sysctl_ctx_list *ctx; + + ctx = device_get_sysctl_ctx(perf->dev); + globals = SYSCTL_CHILDREN(device_get_sysctl_tree(perf->dev)); + + SYSCTL_ADD_PROC(ctx, globals, OID_AUTO, "info", + CTLTYPE_STRING | CTLFLAG_RD, perf, 0, + ntb_sysctl_info_handler, "A", + "NTB performance information" ); + SYSCTL_ADD_PROC(ctx, globals, OID_AUTO, "run", + CTLTYPE_INT | CTLFLAG_RW, perf, 0, + ntb_sysctl_run_handler, "I", "NTB run" ); + SYSCTL_ADD_PROC(ctx, globals, OID_AUTO, "thread_count", + CTLTYPE_UINT | CTLFLAG_RW, perf, 0, + ntb_sysctl_tcount_handler, "IU", "NTB thread count" ); + SYSCTL_ADD_PROC(ctx, globals, OID_AUTO, "mw_size", + CTLTYPE_UINT | CTLFLAG_RW, perf, 0, + ntb_sysctl_mwsize_handler, "IU", + "Upper limit of memory window size" ); + SYSCTL_ADD_PROC(ctx, globals, OID_AUTO, "data_order", + CTLTYPE_UINT | CTLFLAG_RW, perf, 0, + ntb_sysctl_corder_handler, "IU", + "Data chunk order [2^n] to transfer" ); + SYSCTL_ADD_PROC(ctx, globals, OID_AUTO, "total_order", + CTLTYPE_UINT | CTLFLAG_RW, perf, 0, + ntb_sysctl_torder_handler, "IU", + "Total data order [2^n] to transfer" ); + SYSCTL_ADD_PROC(ctx, globals, OID_AUTO, "use_dma", + CTLTYPE_U8 | CTLFLAG_RW, perf, 0, + ntb_sysctl_usedma_handler, "CU", + "Use DMA engine to measure performance" ); + SYSCTL_ADD_PROC(ctx, globals, OID_AUTO, "read_stats", + CTLTYPE_STRING | CTLFLAG_RW, perf, 0, + ntb_sysctl_read_stats_handler, "A", + "NTB Perf test statistics" ); +} + +/* + *============================================================================== + * Basic driver initialization + *============================================================================== + */ + +static int +perf_init(struct perf_ctx* perf) +{ + work_queue = taskqueue_create("perf_wq", M_WAITOK | M_ZERO, + taskqueue_thread_enqueue, &work_queue); + return (0); +} + +static int +perf_setup_peer_mw(struct perf_peer *peer) +{ + struct perf_ctx *perf = peer->perf; + vm_paddr_t bus_addr; + int ret; + + /* Get outbound MW parameters and map it */ + ret = ntb_mw_get_range(perf->dev, peer->gidx, &bus_addr, + &peer->outbuf, &peer->outbuf_size, &peer->xlat_align, + &peer->xlat_align_size, &peer->addr_limit); + + if (ret) + return (ret); + + if (!peer->outbuf) + return (ENOMEM); + + ret = ntb_mw_set_wc(perf->dev, peer->gidx, VM_MEMATTR_WRITE_COMBINING); + if (ret) + return (ret); + + if (max_mw_size && peer->outbuf_size > max_mw_size) { + peer->outbuf_size = max_mw_size; + ntb_perf_printf(1, + "%s: Warning: Peer %d outbuf reduced to %zx\n", + __func__, peer->pidx, peer->outbuf_size); + } + return (0); +} + +static int +perf_init_peers(struct perf_ctx *perf) +{ + struct perf_peer *peer; + int pidx, lport, ret; + + perf->test_peer = NULL; + perf->pcnt = ntb_peer_port_count(perf->dev); + perf->peers = malloc(perf->pcnt*sizeof(*perf->peers), M_PERF, M_WAITOK); + if (!perf->peers) + return (ENOMEM); + + lport = ntb_port_number(perf->dev); + perf->gidx = -1; + for (pidx = 0; pidx < perf->pcnt; pidx++) { + peer = &perf->peers[pidx]; + + peer->perf = perf; + peer->pidx = pidx; + if (lport < ntb_peer_port_number(perf->dev, pidx)) { + if (perf->gidx == -1) + perf->gidx = pidx; + peer->gidx = pidx + 1; + } else { + 
+
+static int
+perf_init_peers(struct perf_ctx *perf)
+{
+	struct perf_peer *peer;
+	int pidx, lport, ret;
+
+	perf->test_peer = NULL;
+	perf->pcnt = ntb_peer_port_count(perf->dev);
+	perf->peers = malloc(perf->pcnt * sizeof(*perf->peers), M_PERF,
+	    M_WAITOK | M_ZERO);
+	if (perf->peers == NULL)
+		return (ENOMEM);
+
+	lport = ntb_port_number(perf->dev);
+	perf->gidx = -1;
+	for (pidx = 0; pidx < perf->pcnt; pidx++) {
+		peer = &perf->peers[pidx];
+
+		peer->perf = perf;
+		peer->pidx = pidx;
+		if (lport < ntb_peer_port_number(perf->dev, pidx)) {
+			if (perf->gidx == -1)
+				perf->gidx = pidx;
+			peer->gidx = pidx + 1;
+		} else {
+			peer->gidx = pidx;
+		}
+
+		peer->sts = 0;
+		TASK_INIT(&peer->service_task, 0, perf_service_work, peer);
+	}
+	if (perf->gidx == -1)
+		perf->gidx = pidx;
+
+	ntb_perf_printf(1, "%s: Peer Count: %d\n", __func__, perf->pcnt);
+	for (pidx = 0; pidx < perf->pcnt; pidx++) {
+		ret = perf_setup_peer_mw(&perf->peers[pidx]);
+		if (ret)
+			return (ret);
+	}
+
+	ntb_perf_printf(1, "%s: Global port index %d\n", __func__, perf->gidx);
+
+	return (0);
+}
+
+static int
+ntb_perf_probe(device_t dev)
+{
+	device_set_desc(dev, "NTB Perf");
+	return (0);
+}
+
+static int
+ntb_perf_attach(device_t dev)
+{
+	struct perf_ctx *perf = device_get_softc(dev);
+	int ret = 0;
+
+	perf->dev = dev;
+
+	perf_init(perf);
+
+	ret = perf_init_peers(perf);
+	if (ret)
+		return (ret);
+
+	perf_init_threads(perf);
+
+	ret = perf_init_service(perf);
+	if (ret)
+		return (ret);
+
+	ret = perf_enable_service(perf);
+	if (ret)
+		return (ret);
+
+	perf_setup_sysctl(perf);
+	return (0);
+}
+
+static int
+ntb_perf_detach(device_t dev)
+{
+	struct perf_ctx *perf = device_get_softc(dev);
+
+	perf_disable_service(perf);
+	perf_clear_threads(perf);
+	taskqueue_free(work_queue);
+	free(perf->peers, M_PERF);
+
+	return (0);
+}
+
+static device_method_t ntb_perf_methods[] = {
+	/* Device interface */
+	DEVMETHOD(device_probe,		ntb_perf_probe),
+	DEVMETHOD(device_attach,	ntb_perf_attach),
+	DEVMETHOD(device_detach,	ntb_perf_detach),
+	DEVMETHOD_END
+};
+
+devclass_t ntb_perf_devclass;
+static DEFINE_CLASS_0(ntb_perf, ntb_perf_driver,
+    ntb_perf_methods, sizeof(struct perf_ctx));
+DRIVER_MODULE(ntb_perf, ntb_hw, ntb_perf_driver,
+    ntb_perf_devclass, NULL, NULL);
+MODULE_DEPEND(ntb_perf, ntb, 1, 1, 1);
+MODULE_VERSION(ntb_perf, 1);
Index: sys/modules/ntb/Makefile
===================================================================
--- sys/modules/ntb/Makefile
+++ sys/modules/ntb/Makefile
@@ -1,5 +1,5 @@
 # $FreeBSD$
 
-SUBDIR= ntb ntb_hw_amd ntb_hw_intel ntb_hw_plx ntb_transport ntb_tool if_ntb
+SUBDIR= ntb ntb_hw_amd ntb_hw_intel ntb_hw_plx ntb_transport ntb_perf ntb_tool if_ntb
 
 .include <bsd.subdir.mk>
Index: sys/modules/ntb/ntb_perf/Makefile
===================================================================
--- /dev/null
+++ sys/modules/ntb/ntb_perf/Makefile
@@ -0,0 +1,10 @@
+# $FreeBSD$
+
+.PATH: ${SRCTOP}/sys/dev/ntb/test
+
+KMOD = ntb_perf
+SRCS = ntb_perf.c
+SRCS += device_if.h bus_if.h pci_if.h ntb_if.h
+CFLAGS+= -I${SRCTOP}/sys/compat/linuxkpi/common/include
+
+.include <bsd.kmod.mk>