Index: head/etc/mtree/BSD.tests.dist =================================================================== --- head/etc/mtree/BSD.tests.dist (revision 299089) +++ head/etc/mtree/BSD.tests.dist (revision 299090) @@ -1,664 +1,666 @@ # $FreeBSD$ # # Please see the file src/etc/mtree/README before making changes to this file. # /set type=dir uname=root gname=wheel mode=0755 . bin cat .. chown .. date .. dd .. expr .. ls .. mv .. pax .. pkill .. sh builtins .. errors .. execution .. expansion .. parameters .. parser .. set-e .. .. sleep .. test .. .. cddl lib .. sbin .. usr.bin .. usr.sbin dtrace common aggs .. arithmetic .. arrays .. assocs .. begin .. bitfields .. buffering .. builtinvar .. cg .. clauses .. cpc .. decls .. drops .. dtraceUtil .. end .. enum .. error .. exit .. fbtprovider .. funcs .. grammar .. include .. inline .. io .. ip .. java_api .. json .. lexer .. llquantize .. mdb .. mib .. misc .. multiaggs .. offsetof .. operators .. pid .. plockstat .. pointers .. pragma .. predicates .. preprocessor .. print .. printa .. printf .. privs .. probes .. proc .. profile-n .. providers .. raise .. rates .. safety .. scalars .. sched .. scripting .. sdt .. sizeof .. speculation .. stability .. stack .. stackdepth .. stop .. strlen .. strtoll .. struct .. syscall .. sysevent .. tick-n .. trace .. tracemem .. translators .. typedef .. types .. uctf .. union .. usdt .. ustack .. vars .. version .. .. .. .. .. etc rc.d .. .. games .. gnu lib .. usr.bin diff .. .. .. lib atf libatf-c detail .. .. libatf-c++ detail .. .. test-programs .. .. libarchive .. libc c063 .. db .. gen execve .. posix_spawn .. .. hash data .. .. inet .. locale .. net getaddrinfo data .. .. .. nss .. regex data .. .. resolv .. rpc .. ssp .. setjmp .. stdio .. stdlib .. string .. sys .. time .. tls dso .. .. termios .. ttyio .. .. libcrypt .. libmp .. libnv .. libpam .. libproc .. librt .. libthr dlopen .. .. libutil .. libxo .. msun .. .. libexec atf atf-check .. atf-sh .. .. rtld-elf .. .. sbin dhclient .. devd .. growfs .. ifconfig .. mdconfig .. .. secure lib .. libexec .. usr.bin .. usr.sbin .. .. share examples tests atf .. plain .. .. .. .. sys acl .. aio .. fifo .. file .. geom class concat .. eli .. gate .. gpt .. mirror .. nop .. raid3 .. shsec .. stripe .. uzip etalon .. .. .. .. kern acct .. execve .. pipe .. .. kqueue .. mac bsdextended .. portacl .. .. mqueue .. netinet .. opencrypto .. pjdfstest chflags .. chmod .. chown .. ftruncate .. granular .. link .. mkdir .. mkfifo .. mknod .. open .. rename .. rmdir .. symlink .. truncate .. unlink .. .. posixshm .. + sys + .. vfs .. vm .. .. usr.bin apply .. basename .. bmake archives fmt_44bsd .. fmt_44bsd_mod .. fmt_oldbsd .. .. basic t0 .. t1 .. t2 .. t3 .. .. execution ellipsis .. empty .. joberr .. plus .. .. shell builtin .. meta .. path .. path_select .. replace .. select .. .. suffixes basic .. src_wild1 .. src_wild2 .. .. syntax directive-t0 .. enl .. funny-targets .. semi .. .. sysmk t0 2 1 .. .. mk .. .. t1 2 1 .. .. mk .. .. t2 2 1 .. .. mk .. .. .. variables modifier_M .. modifier_t .. opt_V .. t0 .. .. .. calendar .. cmp .. cpio .. col .. comm .. cut .. dirname .. file2c .. grep .. gzip .. ident .. join .. jot .. lastcomm .. limits .. m4 .. mkimg .. ncal .. opensm .. printf .. sdiff .. sed regress.multitest.out .. .. soelim .. tar .. timeout .. tr .. truncate .. units .. uudecode .. uuencode .. xargs .. xo .. yacc yacc .. .. .. usr.sbin etcupdate .. extattr .. fstyp .. makefs .. newsyslog .. nmtree .. pw .. rpcbind .. sa .. .. .. 
# vim: set expandtab ts=4 sw=4: Index: head/include/bitstring.h =================================================================== --- head/include/bitstring.h (revision 299089) +++ head/include/bitstring.h (revision 299090) @@ -1,35 +1,37 @@ /*- * Copyright (c) 2003 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _BITSTRING_H_ #define _BITSTRING_H_ +#include +#include #include #endif /* _BITSTRING_H_ */ Index: head/lib/libbluetooth/bluetooth.h =================================================================== --- head/lib/libbluetooth/bluetooth.h (revision 299089) +++ head/lib/libbluetooth/bluetooth.h (revision 299090) @@ -1,216 +1,218 @@ /* * bluetooth.h */ /*- * Copyright (c) 2001-2009 Maksim Yevmenkin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $Id: bluetooth.h,v 1.5 2003/09/14 23:28:42 max Exp $ * $FreeBSD$ */ #ifndef _BLUETOOTH_H_ #define _BLUETOOTH_H_ #include -#include #include #include #include #include #include + #include #include +#include + #include #include #include #include #include __BEGIN_DECLS /* * Linux BlueZ compatibility */ #define bacmp(ba1, ba2) memcmp((ba1), (ba2), sizeof(bdaddr_t)) #define bacpy(dst, src) memcpy((dst), (src), sizeof(bdaddr_t)) #define ba2str(ba, str) bt_ntoa((ba), (str)) #define str2ba(str, ba) (bt_aton((str), (ba)) == 1? 0 : -1) #define htobs(d) htole16(d) #define htobl(d) htole32(d) #define btohs(d) le16toh(d) #define btohl(d) le32toh(d) /* * Interface to the outside world */ struct hostent * bt_gethostbyname (char const *name); struct hostent * bt_gethostbyaddr (char const *addr, int len, int type); struct hostent * bt_gethostent (void); void bt_sethostent (int stayopen); void bt_endhostent (void); struct protoent * bt_getprotobyname (char const *name); struct protoent * bt_getprotobynumber (int proto); struct protoent * bt_getprotoent (void); void bt_setprotoent (int stayopen); void bt_endprotoent (void); char const * bt_ntoa (bdaddr_t const *ba, char *str); int bt_aton (char const *str, bdaddr_t *ba); /* bt_devXXXX() functions (inspired by NetBSD) */ int bt_devaddr (char const *devname, bdaddr_t *addr); int bt_devname (char *devname, bdaddr_t const *addr); /* * Bluetooth HCI functions */ #define HCI_DEVMAX 32 /* arbitrary */ #define HCI_DEVNAME_SIZE NG_NODESIZ #define HCI_DEVFEATURES_SIZE NG_HCI_FEATURES_SIZE struct bt_devinfo { char devname[HCI_DEVNAME_SIZE]; uint32_t state; /* device/implementation specific */ bdaddr_t bdaddr; uint16_t _reserved0; uint8_t features[HCI_DEVFEATURES_SIZE]; /* buffer info */ uint16_t _reserved1; uint16_t cmd_free; uint16_t sco_size; uint16_t sco_pkts; uint16_t sco_free; uint16_t acl_size; uint16_t acl_pkts; uint16_t acl_free; /* stats */ uint32_t cmd_sent; uint32_t evnt_recv; uint32_t acl_recv; uint32_t acl_sent; uint32_t sco_recv; uint32_t sco_sent; uint32_t bytes_recv; uint32_t bytes_sent; /* misc/specific */ uint16_t link_policy_info; uint16_t packet_type_info; uint16_t role_switch_info; uint16_t debug; uint8_t _padding[20]; /* leave space for future additions */ }; struct bt_devreq { uint16_t opcode; uint8_t event; void *cparam; size_t clen; void *rparam; size_t rlen; }; struct bt_devfilter { bitstr_t bit_decl(packet_mask, 8); bitstr_t bit_decl(event_mask, 256); }; struct bt_devinquiry { bdaddr_t bdaddr; uint8_t pscan_rep_mode; uint8_t pscan_period_mode; uint8_t dev_class[3]; uint16_t clock_offset; int8_t rssi; uint8_t data[240]; }; typedef int (bt_devenum_cb_t)(int, struct bt_devinfo const *, void *); int bt_devopen (char const *devname); int bt_devclose(int s); int bt_devsend (int s, uint16_t opcode, void *param, size_t plen); ssize_t bt_devrecv (int s, void *buf, size_t size, time_t to); int bt_devreq (int s, struct bt_devreq *r, time_t to); int bt_devfilter(int s, struct bt_devfilter const *newp, struct bt_devfilter *oldp); void bt_devfilter_pkt_set(struct bt_devfilter *filter, uint8_t type); void bt_devfilter_pkt_clr(struct bt_devfilter *filter, uint8_t type); int bt_devfilter_pkt_tst(struct bt_devfilter const *filter, uint8_t type); void bt_devfilter_evt_set(struct bt_devfilter *filter, uint8_t event); void bt_devfilter_evt_clr(struct bt_devfilter *filter, uint8_t event); int bt_devfilter_evt_tst(struct bt_devfilter const *filter, uint8_t event); int bt_devinquiry(char const *devname, time_t length, int num_rsp, struct bt_devinquiry **ii); int 
bt_devinfo (struct bt_devinfo *di); int bt_devenum (bt_devenum_cb_t cb, void *arg); /* * bdaddr utility functions (from NetBSD) */ static __inline int bdaddr_same(const bdaddr_t *a, const bdaddr_t *b) { return (a->b[0] == b->b[0] && a->b[1] == b->b[1] && a->b[2] == b->b[2] && a->b[3] == b->b[3] && a->b[4] == b->b[4] && a->b[5] == b->b[5]); } static __inline int bdaddr_any(const bdaddr_t *a) { return (a->b[0] == 0 && a->b[1] == 0 && a->b[2] == 0 && a->b[3] == 0 && a->b[4] == 0 && a->b[5] == 0); } static __inline void bdaddr_copy(bdaddr_t *d, const bdaddr_t *s) { d->b[0] = s->b[0]; d->b[1] = s->b[1]; d->b[2] = s->b[2]; d->b[3] = s->b[3]; d->b[4] = s->b[4]; d->b[5] = s->b[5]; } __END_DECLS #endif /* ndef _BLUETOOTH_H_ */ Index: head/sbin/hastd/activemap.c =================================================================== --- head/sbin/hastd/activemap.c (revision 299089) +++ head/sbin/hastd/activemap.c (revision 299090) @@ -1,701 +1,701 @@ /*- * Copyright (c) 2009-2010 The FreeBSD Foundation * All rights reserved. * * This software was developed by Pawel Jakub Dawidek under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include /* powerof2() */ #include #include #include #include #include #include #include #include #include "activemap.h" #ifndef PJDLOG_ASSERT #include #define PJDLOG_ASSERT(...) assert(__VA_ARGS__) #endif #define ACTIVEMAP_MAGIC 0xac71e4 struct activemap { int am_magic; /* Magic value. */ off_t am_mediasize; /* Media size in bytes. */ uint32_t am_extentsize; /* Extent size in bytes, must be power of 2. */ uint8_t am_extentshift;/* 2 ^ extentbits == extentsize */ int am_nextents; /* Number of extents. */ size_t am_mapsize; /* Bitmap size in bytes. */ uint16_t *am_memtab; /* An array that holds number of pending writes per extent. */ bitstr_t *am_diskmap; /* On-disk bitmap of dirty extents. */ bitstr_t *am_memmap; /* In-memory bitmap of dirty extents. */ size_t am_diskmapsize; /* Map size rounded up to sector size. */ uint64_t am_ndirty; /* Number of dirty regions. */ bitstr_t *am_syncmap; /* Bitmap of extents to sync. */ off_t am_syncoff; /* Next synchronization offset. */ TAILQ_HEAD(skeepdirty, keepdirty) am_keepdirty; /* List of extents that we keep dirty to reduce bitmap updates. 
*/ int am_nkeepdirty; /* Number of am_keepdirty elements. */ int am_nkeepdirty_limit; /* Maximum number of am_keepdirty elements. */ }; struct keepdirty { int kd_extent; TAILQ_ENTRY(keepdirty) kd_next; }; /* * Helper function taken from sys/systm.h to calculate extentshift. */ static uint32_t bitcount32(uint32_t x) { x = (x & 0x55555555) + ((x & 0xaaaaaaaa) >> 1); x = (x & 0x33333333) + ((x & 0xcccccccc) >> 2); x = (x + (x >> 4)) & 0x0f0f0f0f; x = (x + (x >> 8)); x = (x + (x >> 16)) & 0x000000ff; return (x); } static __inline int off2ext(const struct activemap *amp, off_t offset) { int extent; PJDLOG_ASSERT(offset >= 0 && offset < amp->am_mediasize); extent = (offset >> amp->am_extentshift); PJDLOG_ASSERT(extent >= 0 && extent < amp->am_nextents); return (extent); } static __inline off_t ext2off(const struct activemap *amp, int extent) { off_t offset; PJDLOG_ASSERT(extent >= 0 && extent < amp->am_nextents); offset = ((off_t)extent << amp->am_extentshift); PJDLOG_ASSERT(offset >= 0 && offset < amp->am_mediasize); return (offset); } /* * Function calculates number of requests needed to synchronize the given * extent. */ static __inline int ext2reqs(const struct activemap *amp, int ext) { off_t left; if (ext < amp->am_nextents - 1) return (((amp->am_extentsize - 1) / MAXPHYS) + 1); PJDLOG_ASSERT(ext == amp->am_nextents - 1); left = amp->am_mediasize % amp->am_extentsize; if (left == 0) left = amp->am_extentsize; return (((left - 1) / MAXPHYS) + 1); } /* * Initialize activemap structure and allocate memory for internal needs. * Function returns 0 on success and -1 if any of the allocations failed. */ int activemap_init(struct activemap **ampp, uint64_t mediasize, uint32_t extentsize, uint32_t sectorsize, uint32_t keepdirty) { struct activemap *amp; PJDLOG_ASSERT(ampp != NULL); PJDLOG_ASSERT(mediasize > 0); PJDLOG_ASSERT(extentsize > 0); PJDLOG_ASSERT(powerof2(extentsize)); PJDLOG_ASSERT(sectorsize > 0); PJDLOG_ASSERT(powerof2(sectorsize)); PJDLOG_ASSERT(keepdirty > 0); amp = malloc(sizeof(*amp)); if (amp == NULL) return (-1); amp->am_mediasize = mediasize; amp->am_nkeepdirty_limit = keepdirty; amp->am_extentsize = extentsize; amp->am_extentshift = bitcount32(extentsize - 1); amp->am_nextents = ((mediasize - 1) / extentsize) + 1; - amp->am_mapsize = sizeof(bitstr_t) * bitstr_size(amp->am_nextents); + amp->am_mapsize = bitstr_size(amp->am_nextents); amp->am_diskmapsize = roundup2(amp->am_mapsize, sectorsize); amp->am_ndirty = 0; amp->am_syncoff = -2; TAILQ_INIT(&->am_keepdirty); amp->am_nkeepdirty = 0; amp->am_memtab = calloc(amp->am_nextents, sizeof(amp->am_memtab[0])); amp->am_diskmap = calloc(1, amp->am_diskmapsize); amp->am_memmap = bit_alloc(amp->am_nextents); amp->am_syncmap = bit_alloc(amp->am_nextents); /* * Check to see if any of the allocations above failed. 
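
The bitstr_size() hunk above is the substance of this file's change: with the reworked bitstring(3) API, bitstr_size() returns a size in bytes rather than a count of bitstr_t elements, so the sizeof(bitstr_t) factor is dropped. A minimal standalone sketch of sizing and copying a bitmap under the new semantics (copy_bitmap() is an illustrative name, not part of hastd):

        #include <bitstring.h>
        #include <stdlib.h>
        #include <string.h>

        /* Illustrative only: duplicate a bitmap of nbits bits. */
        static bitstr_t *
        copy_bitmap(const bitstr_t *src, int nbits)
        {
                size_t mapsize = bitstr_size(nbits);    /* now a byte count */
                bitstr_t *dst = malloc(mapsize);

                if (dst != NULL)
                        memcpy(dst, src, mapsize);
                return (dst);
        }
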
*/ if (amp->am_memtab == NULL || amp->am_diskmap == NULL || amp->am_memmap == NULL || amp->am_syncmap == NULL) { if (amp->am_memtab != NULL) free(amp->am_memtab); if (amp->am_diskmap != NULL) free(amp->am_diskmap); if (amp->am_memmap != NULL) free(amp->am_memmap); if (amp->am_syncmap != NULL) free(amp->am_syncmap); amp->am_magic = 0; free(amp); errno = ENOMEM; return (-1); } amp->am_magic = ACTIVEMAP_MAGIC; *ampp = amp; return (0); } static struct keepdirty * keepdirty_find(struct activemap *amp, int extent) { struct keepdirty *kd; TAILQ_FOREACH(kd, &->am_keepdirty, kd_next) { if (kd->kd_extent == extent) break; } return (kd); } static bool keepdirty_add(struct activemap *amp, int extent) { struct keepdirty *kd; kd = keepdirty_find(amp, extent); if (kd != NULL) { /* * Only move element at the beginning. */ TAILQ_REMOVE(&->am_keepdirty, kd, kd_next); TAILQ_INSERT_HEAD(&->am_keepdirty, kd, kd_next); return (false); } /* * Add new element, but first remove the most unused one if * we have too many. */ if (amp->am_nkeepdirty >= amp->am_nkeepdirty_limit) { kd = TAILQ_LAST(&->am_keepdirty, skeepdirty); PJDLOG_ASSERT(kd != NULL); TAILQ_REMOVE(&->am_keepdirty, kd, kd_next); amp->am_nkeepdirty--; PJDLOG_ASSERT(amp->am_nkeepdirty > 0); } if (kd == NULL) kd = malloc(sizeof(*kd)); /* We can ignore allocation failure. */ if (kd != NULL) { kd->kd_extent = extent; amp->am_nkeepdirty++; TAILQ_INSERT_HEAD(&->am_keepdirty, kd, kd_next); } return (true); } static void keepdirty_fill(struct activemap *amp) { struct keepdirty *kd; TAILQ_FOREACH(kd, &->am_keepdirty, kd_next) bit_set(amp->am_diskmap, kd->kd_extent); } static void keepdirty_free(struct activemap *amp) { struct keepdirty *kd; while ((kd = TAILQ_FIRST(&->am_keepdirty)) != NULL) { TAILQ_REMOVE(&->am_keepdirty, kd, kd_next); amp->am_nkeepdirty--; free(kd); } PJDLOG_ASSERT(amp->am_nkeepdirty == 0); } /* * Function frees resources allocated by activemap_init() function. */ void activemap_free(struct activemap *amp) { PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); amp->am_magic = 0; keepdirty_free(amp); free(amp->am_memtab); free(amp->am_diskmap); free(amp->am_memmap); free(amp->am_syncmap); } /* * Function should be called before we handle write requests. It updates * internal structures and returns true if on-disk metadata should be updated. */ bool activemap_write_start(struct activemap *amp, off_t offset, off_t length) { bool modified; off_t end; int ext; PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); PJDLOG_ASSERT(length > 0); modified = false; end = offset + length - 1; for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) { /* * If the number of pending writes is increased from 0, * we have to mark the extent as dirty also in on-disk bitmap. * By returning true we inform the caller that on-disk bitmap * was modified and has to be flushed to disk. */ if (amp->am_memtab[ext]++ == 0) { PJDLOG_ASSERT(!bit_test(amp->am_memmap, ext)); bit_set(amp->am_memmap, ext); amp->am_ndirty++; } if (keepdirty_add(amp, ext)) modified = true; } return (modified); } /* * Function should be called after receiving write confirmation. It updates * internal structures and returns true if on-disk metadata should be updated. 
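
For context, activemap_write_start() and activemap_write_complete() are meant to bracket the actual I/O, and the caller flushes the on-disk map whenever either returns true. A hedged sketch of that pattern; flush_metadata() and backend_write() are hypothetical helpers, not part of this file:

        #include "activemap.h"

        /* Hypothetical helpers, assumed to exist elsewhere in the daemon. */
        static void flush_metadata(struct activemap *amp, int fd);
        static void backend_write(int fd, off_t offset, off_t length);

        /* Hypothetical write path built on the two hooks. */
        static void
        handle_write(struct activemap *amp, int fd, off_t offset, off_t length)
        {
                if (activemap_write_start(amp, offset, length))
                        flush_metadata(amp, fd);        /* extents became dirty */

                backend_write(fd, offset, length);      /* the data I/O itself */

                if (activemap_write_complete(amp, offset, length))
                        flush_metadata(amp, fd);        /* extents became clean */
        }
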
*/ bool activemap_write_complete(struct activemap *amp, off_t offset, off_t length) { bool modified; off_t end; int ext; PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); PJDLOG_ASSERT(length > 0); modified = false; end = offset + length - 1; for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) { /* * If the number of pending writes goes down to 0, we have to * mark the extent as clean also in on-disk bitmap. * By returning true we inform the caller that on-disk bitmap * was modified and has to be flushed to disk. */ PJDLOG_ASSERT(amp->am_memtab[ext] > 0); PJDLOG_ASSERT(bit_test(amp->am_memmap, ext)); if (--amp->am_memtab[ext] == 0) { bit_clear(amp->am_memmap, ext); amp->am_ndirty--; if (keepdirty_find(amp, ext) == NULL) modified = true; } } return (modified); } /* * Function should be called after finishing synchronization of one extent. * It returns true if on-disk metadata should be updated. */ bool activemap_extent_complete(struct activemap *amp, int extent) { bool modified; int reqs; PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); PJDLOG_ASSERT(extent >= 0 && extent < amp->am_nextents); modified = false; reqs = ext2reqs(amp, extent); PJDLOG_ASSERT(amp->am_memtab[extent] >= reqs); amp->am_memtab[extent] -= reqs; PJDLOG_ASSERT(bit_test(amp->am_memmap, extent)); if (amp->am_memtab[extent] == 0) { bit_clear(amp->am_memmap, extent); amp->am_ndirty--; modified = true; } return (modified); } /* * Function returns number of dirty regions. */ uint64_t activemap_ndirty(const struct activemap *amp) { PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); return (amp->am_ndirty); } /* * Function compare on-disk bitmap and in-memory bitmap and returns true if * they differ and should be flushed to the disk. */ bool activemap_differ(const struct activemap *amp) { PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); return (memcmp(amp->am_diskmap, amp->am_memmap, amp->am_mapsize) != 0); } /* * Function returns number of bytes used by bitmap. */ size_t activemap_size(const struct activemap *amp) { PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); return (amp->am_mapsize); } /* * Function returns number of bytes needed for storing on-disk bitmap. * This is the same as activemap_size(), but rounded up to sector size. */ size_t activemap_ondisk_size(const struct activemap *amp) { PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); return (amp->am_diskmapsize); } /* * Function copies the given buffer read from disk to the internal bitmap. */ void activemap_copyin(struct activemap *amp, const unsigned char *buf, size_t size) { int ext; PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); PJDLOG_ASSERT(size >= amp->am_mapsize); memcpy(amp->am_diskmap, buf, amp->am_mapsize); memcpy(amp->am_memmap, buf, amp->am_mapsize); memcpy(amp->am_syncmap, buf, amp->am_mapsize); bit_ffs(amp->am_memmap, amp->am_nextents, &ext); if (ext == -1) { /* There are no dirty extents, so we can leave now. */ return; } /* * Set synchronization offset to the first dirty extent. */ activemap_sync_rewind(amp); /* * We have dirty extents and we want them to stay that way until * we synchronize, so we set number of pending writes to number * of requests needed to synchronize one extent. */ amp->am_ndirty = 0; for (; ext < amp->am_nextents; ext++) { if (bit_test(amp->am_memmap, ext)) { amp->am_memtab[ext] = ext2reqs(amp, ext); amp->am_ndirty++; } } } /* * Function merges the given bitmap with existing one. 
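
activemap_differ() together with activemap_bitmap() (the latter appears a little further below) are the pieces such a flush helper would be built from. A minimal sketch, assuming the caller owns a metadata file descriptor and that the bitmap lives at offset 0 of that file:

        #include <unistd.h>
        #include "activemap.h"

        /* Illustrative flush: write the bitmap only when it actually changed. */
        static void
        flush_metadata(struct activemap *amp, int fd)
        {
                const unsigned char *buf;
                size_t size;

                if (!activemap_differ(amp))
                        return;
                buf = activemap_bitmap(amp, &size);     /* sector-rounded size */
                (void)pwrite(fd, buf, size, 0);         /* offset 0 is assumed */
        }
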
*/ void activemap_merge(struct activemap *amp, const unsigned char *buf, size_t size) { bitstr_t *remmap = __DECONST(bitstr_t *, buf); int ext; PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); PJDLOG_ASSERT(size >= amp->am_mapsize); bit_ffs(remmap, amp->am_nextents, &ext); if (ext == -1) { /* There are no dirty extents, so we can leave now. */ return; } /* * We have dirty extents and we want them to stay that way until * we synchronize, so we set number of pending writes to number * of requests needed to synchronize one extent. */ for (; ext < amp->am_nextents; ext++) { /* Local extent already dirty. */ if (bit_test(amp->am_syncmap, ext)) continue; /* Remote extent isn't dirty. */ if (!bit_test(remmap, ext)) continue; bit_set(amp->am_syncmap, ext); bit_set(amp->am_memmap, ext); bit_set(amp->am_diskmap, ext); if (amp->am_memtab[ext] == 0) amp->am_ndirty++; amp->am_memtab[ext] = ext2reqs(amp, ext); } /* * Set synchronization offset to the first dirty extent. */ activemap_sync_rewind(amp); } /* * Function returns pointer to internal bitmap that should be written to disk. */ const unsigned char * activemap_bitmap(struct activemap *amp, size_t *sizep) { PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); if (sizep != NULL) *sizep = amp->am_diskmapsize; memcpy(amp->am_diskmap, amp->am_memmap, amp->am_mapsize); keepdirty_fill(amp); return ((const unsigned char *)amp->am_diskmap); } /* * Function calculates size needed to store bitmap on disk. */ size_t activemap_calc_ondisk_size(uint64_t mediasize, uint32_t extentsize, uint32_t sectorsize) { uint64_t nextents, mapsize; PJDLOG_ASSERT(mediasize > 0); PJDLOG_ASSERT(extentsize > 0); PJDLOG_ASSERT(powerof2(extentsize)); PJDLOG_ASSERT(sectorsize > 0); PJDLOG_ASSERT(powerof2(sectorsize)); nextents = ((mediasize - 1) / extentsize) + 1; - mapsize = sizeof(bitstr_t) * bitstr_size(nextents); + mapsize = bitstr_size(nextents); return (roundup2(mapsize, sectorsize)); } /* * Set synchronization offset to the first dirty extent. */ void activemap_sync_rewind(struct activemap *amp) { int ext; PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); bit_ffs(amp->am_syncmap, amp->am_nextents, &ext); if (ext == -1) { /* There are no extents to synchronize. */ amp->am_syncoff = -2; return; } /* * Mark that we want to start synchronization from the beginning. */ amp->am_syncoff = -1; } /* * Return next offset of where we should synchronize. */ off_t activemap_sync_offset(struct activemap *amp, off_t *lengthp, int *syncextp) { off_t syncoff, left; int ext; PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); PJDLOG_ASSERT(lengthp != NULL); PJDLOG_ASSERT(syncextp != NULL); *syncextp = -1; if (amp->am_syncoff == -2) return (-1); if (amp->am_syncoff >= 0 && (amp->am_syncoff + MAXPHYS >= amp->am_mediasize || off2ext(amp, amp->am_syncoff) != off2ext(amp, amp->am_syncoff + MAXPHYS))) { /* * We are about to change extent, so mark previous one as clean. */ ext = off2ext(amp, amp->am_syncoff); bit_clear(amp->am_syncmap, ext); *syncextp = ext; amp->am_syncoff = -1; } if (amp->am_syncoff == -1) { /* * Let's find first extent to synchronize. */ bit_ffs(amp->am_syncmap, amp->am_nextents, &ext); if (ext == -1) { amp->am_syncoff = -2; return (-1); } amp->am_syncoff = ext2off(amp, ext); } else { /* * We don't change extent, so just increase offset. 
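
Taken together, activemap_sync_rewind(), activemap_sync_offset(), and activemap_extent_complete() drive resynchronization from the caller's side. A hedged caller-side sketch, with copy_range() and flush_metadata() as hypothetical stand-ins (the latter in the spirit of the sketch above):

        #include "activemap.h"

        /* Hypothetical stand-ins for the daemon's own I/O and metadata code. */
        static void copy_range(int fd, off_t offset, off_t length);
        static void flush_metadata(struct activemap *amp, int fd);

        static void
        synchronize(struct activemap *amp, int fd)
        {
                off_t offset, length;
                int syncext;

                activemap_sync_rewind(amp);
                for (;;) {
                        offset = activemap_sync_offset(amp, &length, &syncext);
                        if (syncext != -1 &&
                            activemap_extent_complete(amp, syncext))
                                flush_metadata(amp, fd); /* extent is now clean */
                        if (offset == -1)
                                break;
                        copy_range(fd, offset, length);
                }
        }
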
*/ amp->am_syncoff += MAXPHYS; if (amp->am_syncoff >= amp->am_mediasize) { amp->am_syncoff = -2; return (-1); } } syncoff = amp->am_syncoff; left = ext2off(amp, off2ext(amp, syncoff)) + amp->am_extentsize - syncoff; if (syncoff + left > amp->am_mediasize) left = amp->am_mediasize - syncoff; if (left > MAXPHYS) left = MAXPHYS; PJDLOG_ASSERT(left >= 0 && left <= MAXPHYS); PJDLOG_ASSERT(syncoff >= 0 && syncoff < amp->am_mediasize); PJDLOG_ASSERT(syncoff + left >= 0 && syncoff + left <= amp->am_mediasize); *lengthp = left; return (syncoff); } /* * Mark extent(s) containing the given region for synchronization. * Most likely one of the components is unavailable. */ bool activemap_need_sync(struct activemap *amp, off_t offset, off_t length) { bool modified; off_t end; int ext; PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); modified = false; end = offset + length - 1; for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) { if (bit_test(amp->am_syncmap, ext)) { /* Already marked for synchronization. */ PJDLOG_ASSERT(bit_test(amp->am_memmap, ext)); continue; } bit_set(amp->am_syncmap, ext); if (!bit_test(amp->am_memmap, ext)) { bit_set(amp->am_memmap, ext); amp->am_ndirty++; } amp->am_memtab[ext] += ext2reqs(amp, ext); modified = true; } return (modified); } void activemap_dump(const struct activemap *amp) { int bit; printf("M: "); for (bit = 0; bit < amp->am_nextents; bit++) printf("%d", bit_test(amp->am_memmap, bit) ? 1 : 0); printf("\n"); printf("D: "); for (bit = 0; bit < amp->am_nextents; bit++) printf("%d", bit_test(amp->am_diskmap, bit) ? 1 : 0); printf("\n"); printf("S: "); for (bit = 0; bit < amp->am_nextents; bit++) printf("%d", bit_test(amp->am_syncmap, bit) ? 1 : 0); printf("\n"); } Index: head/share/man/man3/bitstring.3 =================================================================== --- head/share/man/man3/bitstring.3 (revision 299089) +++ head/share/man/man3/bitstring.3 (revision 299090) @@ -1,187 +1,263 @@ .\" Copyright (c) 1989, 1991, 1993 .\" The Regents of the University of California. All rights reserved. .\" .\" This code is derived from software contributed to Berkeley by .\" Paul Vixie. .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 3. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" +.\" Copyright (c) 2014 Spectra Logic Corporation +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions, and the following disclaimer, +.\" without modification. +.\" 2. Redistributions in binary form must reproduce at minimum a disclaimer +.\" substantially similar to the "NO WARRANTY" disclaimer below +.\" ("Disclaimer") and any redistribution must be conditioned upon +.\" including a substantially similar Disclaimer requirement for further +.\" binary redistribution. +.\" +.\" NO WARRANTY +.\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +.\" "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +.\" LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR +.\" A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +.\" HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +.\" STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +.\" IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +.\" POSSIBILITY OF SUCH DAMAGES. +.\" .\" @(#)bitstring.3 8.1 (Berkeley) 7/19/93 .\" $FreeBSD$ .\" -.Dd October 17, 2015 +.Dd May 4, 2016 .Dt BITSTRING 3 .Os .Sh NAME .Nm bit_alloc , .Nm bit_clear , .Nm bit_decl , +.Nm bit_ffc , .Nm bit_ffs , +.Nm bit_ffc_at , +.Nm bit_ffs_at , .Nm bit_nclear , .Nm bit_nset , .Nm bit_set , -.Nm bitstr_size , -.Nm bit_test -.Nd bit-string manipulation macros +.Nm bit_test , +.Nm bitstr_size +.Nd bit-string manipulation functions and macros .Sh SYNOPSIS .In bitstring.h .Ft bitstr_t * .Fn bit_alloc "int nbits" .Ft void .Fn bit_decl "bitstr_t *name" "int nbits" .Ft void .Fn bit_clear "bitstr_t *name" "int bit" .Ft void .Fn bit_ffc "bitstr_t *name" "int nbits" "int *value" .Ft void .Fn bit_ffs "bitstr_t *name" "int nbits" "int *value" .Ft void +.Fn bit_ffc_at "bitstr_t *name" "int start" "int nbits" "int *value" +.Ft void +.Fn bit_ffs_at "bitstr_t *name" "int start" "int nbits" "int *value" +.Ft void .Fn bit_nclear "bitstr_t *name" "int start" "int stop" .Ft void .Fn bit_nset "bitstr_t *name" "int start" "int stop" .Ft void .Fn bit_set "bitstr_t *name" "int bit" .Ft int .Fn bitstr_size "int nbits" .Ft int .Fn bit_test "bitstr_t *name" "int bit" .Sh DESCRIPTION These macros operate on strings of bits. .Pp -The macro +The function .Fn bit_alloc returns a pointer of type .Dq Fa "bitstr_t *" to sufficient space to store .Fa nbits bits, or .Dv NULL if no space is available. +If successful, the returned bit string is initialized with all bits cleared. 
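
As a quick illustration of the new bit_ffs_at() and bit_ffc_at() variants named in the SYNOPSIS above, here is a standalone sketch (separate from the manual page's own EXAMPLES section); the argument order follows the prototypes shown above:

        #include <bitstring.h>
        #include <stdio.h>
        #include <stdlib.h>

        int
        main(void)
        {
                bitstr_t *bits;
                int bit;

                bits = bit_alloc(64);           /* returned with all bits clear */
                if (bits == NULL)
                        return (1);

                bit_set(bits, 10);
                bit_set(bits, 11);

                bit_ffs_at(bits, 11, 64, &bit); /* first set bit >= 11: 11 */
                printf("%d\n", bit);

                bit_ffc_at(bits, 10, 64, &bit); /* first clear bit >= 10: 12 */
                printf("%d\n", bit);

                free(bits);
                return (0);
        }
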
.Pp The macro .Fn bit_decl -allocates sufficient space to store +declares a bit string with sufficient space to store .Fa nbits -bits on the stack. +bits. +.Fn bit_decl +may be used to include statically sized bit strings in structure +definitions or to create bit strings on the stack. +Users of this macro are responsible for initialization of the bit string, +typically via a global initialization of the containing struct or use of the +.Fn bit_nset +or +.Fn bin_nclear +functions. .Pp The macro .Fn bitstr_size -returns the number of elements of type -.Fa bitstr_t -necessary to store +returns the number of bytes necessary to store .Fa nbits bits. This is useful for copying bit strings. .Pp -The macros +The functions .Fn bit_clear and .Fn bit_set clear or set the zero-based numbered bit .Fa bit , in the bit string .Ar name . .Pp The .Fn bit_nset and .Fn bit_nclear -macros +functions set or clear the zero-based numbered bits from .Fa start through .Fa stop in the bit string .Ar name . .Pp The .Fn bit_test -macro +function evaluates to non-zero if the zero-based numbered bit .Fa bit of bit string .Fa name is set, and zero otherwise. .Pp +The function +.Fn bit_ffc +stores in the location referenced by +.Fa value +the zero-based number of the first bit not set in the array of +.Fa nbits +bits referenced by +.Fa name . +If all bits are set, the location referenced by +.Fa value +is set to \-1. +.Pp The .Fn bit_ffs -macro +function stores in the location referenced by .Fa value the zero-based number of the first bit set in the array of .Fa nbits bits referenced by .Fa name . If no bits are set, the location referenced by .Fa value is set to \-1. .Pp -The macro -.Fn bit_ffc +The function +.Fn bit_ffc_at stores in the location referenced by .Fa value the zero-based number of the first bit not set in the array of .Fa nbits bits referenced by -.Fa name . -If all bits are set, the location referenced by +.Fa name , +at or after the zero-based bit index +.Fa start . +If all bits at or after +.Fa start +are set, the location referenced by .Fa value is set to \-1. .Pp -The arguments to these macros are evaluated only once and may safely +The +.Fn bit_ffs_at +function +stores in the location referenced by +.Fa value +the zero-based number of the first bit set in the array of +.Fa nbits +bits referenced by +.Fa name , +at or after the zero-based bit index +.Fa start . +If no bits are set after +.Fa start , +the location referenced by +.Fa value +is set to \-1. +.Pp +The arguments in bit string macros are evaluated only once and may safely have side effects. .Sh EXAMPLES .Bd -literal -offset indent #include #include \&... #define LPR_BUSY_BIT 0 #define LPR_FORMAT_BIT 1 #define LPR_DOWNLOAD_BIT 2 \&... #define LPR_AVAILABLE_BIT 9 #define LPR_MAX_BITS 10 make_lpr_available() { bitstr_t bit_decl(bitlist, LPR_MAX_BITS); ... bit_nclear(bitlist, 0, LPR_MAX_BITS - 1); ... if (!bit_test(bitlist, LPR_BUSY_BIT)) { bit_clear(bitlist, LPR_FORMAT_BIT); bit_clear(bitlist, LPR_DOWNLOAD_BIT); bit_set(bitlist, LPR_AVAILABLE_BIT); } } .Ed .Sh SEE ALSO .Xr malloc 3 , .Xr bitset 9 .Sh HISTORY The .Nm bitstring functions first appeared in .Bx 4.4 . Index: head/sys/dev/xen/blkback/blkback.c =================================================================== --- head/sys/dev/xen/blkback/blkback.c (revision 299089) +++ head/sys/dev/xen/blkback/blkback.c (revision 299090) @@ -1,3891 +1,3887 @@ /*- * Copyright (c) 2009-2012 Spectra Logic Corporation * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * * Authors: Justin T. Gibbs (Spectra Logic Corporation) * Ken Merry (Spectra Logic Corporation) */ #include __FBSDID("$FreeBSD$"); /** * \file blkback.c * * \brief Device driver supporting the vending of block storage from * a FreeBSD domain to other domains. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /*--------------------------- Compile-time Tunables --------------------------*/ /** * The maximum number of shared memory ring pages we will allow in a * negotiated block-front/back communication channel. Allow enough * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd. */ #define XBB_MAX_RING_PAGES 32 /** * The maximum number of outstanding request blocks (request headers plus * additional segment blocks) we will allow in a negotiated block-front/back * communication channel. */ #define XBB_MAX_REQUESTS \ __CONST_RING_SIZE(blkif, PAGE_SIZE * XBB_MAX_RING_PAGES) /** * \brief Define to force all I/O to be performed on memory owned by the * backend device, with a copy-in/out to the remote domain's memory. * * \note This option is currently required when this driver's domain is * operating in HVM mode on a system using an IOMMU. * * This driver uses Xen's grant table API to gain access to the memory of * the remote domains it serves. When our domain is operating in PV mode, * the grant table mechanism directly updates our domain's page table entries * to point to the physical pages of the remote domain. This scheme guarantees * that blkback and the backing devices it uses can safely perform DMA * operations to satisfy requests. In HVM mode, Xen may use a HW IOMMU to * insure that our domain cannot DMA to pages owned by another domain. As * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant * table API. 
For this reason, in HVM mode, we must bounce all requests into * memory that is mapped into our domain at domain startup and thus has * valid IOMMU mappings. */ #define XBB_USE_BOUNCE_BUFFERS /** * \brief Define to enable rudimentary request logging to the console. */ #undef XBB_DEBUG /*---------------------------------- Macros ----------------------------------*/ /** * Custom malloc type for all driver allocations. */ static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data"); #ifdef XBB_DEBUG #define DPRINTF(fmt, args...) \ printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) #else #define DPRINTF(fmt, args...) do {} while(0) #endif /** * The maximum mapped region size per request we will allow in a negotiated * block-front/back communication channel. */ #define XBB_MAX_REQUEST_SIZE \ MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) /** * The maximum number of segments (within a request header and accompanying * segment blocks) per request we will allow in a negotiated block-front/back * communication channel. */ #define XBB_MAX_SEGMENTS_PER_REQUEST \ (MIN(UIO_MAXIOV, \ MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \ (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1))) /** * The maximum number of ring pages that we can allow per request list. * We limit this to the maximum number of segments per request, because * that is already a reasonable number of segments to aggregate. This * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST, * because that would leave situations where we can't dispatch even one * large request. */ #define XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST /*--------------------------- Forward Declarations ---------------------------*/ struct xbb_softc; struct xbb_xen_req; static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...) __attribute__((format(printf, 3, 4))); static int xbb_shutdown(struct xbb_softc *xbb); static int xbb_detach(device_t dev); /*------------------------------ Data Structures -----------------------------*/ STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req); typedef enum { XBB_REQLIST_NONE = 0x00, XBB_REQLIST_MAPPED = 0x01 } xbb_reqlist_flags; struct xbb_xen_reqlist { /** * Back reference to the parent block back instance for this * request. Used during bio_done handling. */ struct xbb_softc *xbb; /** * BLKIF_OP code for this request. */ int operation; /** * Set to BLKIF_RSP_* to indicate request status. * * This field allows an error status to be recorded even if the * delivery of this status must be deferred. Deferred reporting * is necessary, for example, when an error is detected during * completion processing of one bio when other bios for this * request are still outstanding. */ int status; /** * Number of 512 byte sectors not transferred. */ int residual_512b_sectors; /** * Starting sector number of the first request in the list. */ off_t starting_sector_number; /** * If we're going to coalesce, the next contiguous sector would be * this one. */ off_t next_contig_sector; /** * Number of child requests in the list. */ int num_children; /** * Number of I/O requests still pending on the backend. */ int pendcnt; /** * Total number of segments for requests in the list. */ int nr_segments; /** * Flags for this particular request list. */ xbb_reqlist_flags flags; /** * Kernel virtual address space reserved for this request * list structure and used to map the remote domain's pages for * this I/O, into our domain's address space. 
*/ uint8_t *kva; /** * Base, psuedo-physical address, corresponding to the start * of this request's kva region. */ uint64_t gnt_base; #ifdef XBB_USE_BOUNCE_BUFFERS /** * Pre-allocated domain local memory used to proxy remote * domain memory during I/O operations. */ uint8_t *bounce; #endif /** * Array of grant handles (one per page) used to map this request. */ grant_handle_t *gnt_handles; /** * Device statistics request ordering type (ordered or simple). */ devstat_tag_type ds_tag_type; /** * Device statistics request type (read, write, no_data). */ devstat_trans_flags ds_trans_type; /** * The start time for this request. */ struct bintime ds_t0; /** * Linked list of contiguous requests with the same operation type. */ struct xbb_xen_req_list contig_req_list; /** * Linked list links used to aggregate idle requests in the * request list free pool (xbb->reqlist_free_stailq) and pending * requests waiting for execution (xbb->reqlist_pending_stailq). */ STAILQ_ENTRY(xbb_xen_reqlist) links; }; STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist); /** * \brief Object tracking an in-flight I/O from a Xen VBD consumer. */ struct xbb_xen_req { /** * Linked list links used to aggregate requests into a reqlist * and to store them in the request free pool. */ STAILQ_ENTRY(xbb_xen_req) links; /** * The remote domain's identifier for this I/O request. */ uint64_t id; /** * The number of pages currently mapped for this request. */ int nr_pages; /** * The number of 512 byte sectors comprising this requests. */ int nr_512b_sectors; /** * BLKIF_OP code for this request. */ int operation; /** * Storage used for non-native ring requests. */ blkif_request_t ring_req_storage; /** * Pointer to the Xen request in the ring. */ blkif_request_t *ring_req; /** * Consumer index for this request. */ RING_IDX req_ring_idx; /** * The start time for this request. */ struct bintime ds_t0; /** * Pointer back to our parent request list. */ struct xbb_xen_reqlist *reqlist; }; SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req); /** * \brief Configuration data for the shared memory request ring * used to communicate with the front-end client of this * this driver. */ struct xbb_ring_config { /** KVA address where ring memory is mapped. */ vm_offset_t va; /** The pseudo-physical address where ring memory is mapped.*/ uint64_t gnt_addr; /** * Grant table handles, one per-ring page, returned by the * hyperpervisor upon mapping of the ring and required to * unmap it when a connection is torn down. */ grant_handle_t handle[XBB_MAX_RING_PAGES]; /** * The device bus address returned by the hypervisor when * mapping the ring and required to unmap it when a connection * is torn down. */ uint64_t bus_addr[XBB_MAX_RING_PAGES]; /** The number of ring pages mapped for the current connection. */ u_int ring_pages; /** * The grant references, one per-ring page, supplied by the * front-end, allowing us to reference the ring pages in the * front-end's domain and to map these pages into our own domain. */ grant_ref_t ring_ref[XBB_MAX_RING_PAGES]; /** The interrupt driven even channel used to signal ring events. */ evtchn_port_t evtchn; }; /** * Per-instance connection state flags. */ typedef enum { /** * The front-end requested a read-only mount of the * back-end device/file. */ XBBF_READ_ONLY = 0x01, /** Communication with the front-end has been established. */ XBBF_RING_CONNECTED = 0x02, /** * Front-end requests exist in the ring and are waiting for * xbb_xen_req objects to free up. */ XBBF_RESOURCE_SHORTAGE = 0x04, /** Connection teardown in progress. 
*/ XBBF_SHUTDOWN = 0x08, /** A thread is already performing shutdown processing. */ XBBF_IN_SHUTDOWN = 0x10 } xbb_flag_t; /** Backend device type. */ typedef enum { /** Backend type unknown. */ XBB_TYPE_NONE = 0x00, /** * Backend type disk (access via cdev switch * strategy routine). */ XBB_TYPE_DISK = 0x01, /** Backend type file (access vnode operations.). */ XBB_TYPE_FILE = 0x02 } xbb_type; /** * \brief Structure used to memoize information about a per-request * scatter-gather list. * * The chief benefit of using this data structure is it avoids having * to reparse the possibly discontiguous S/G list in the original * request. Due to the way that the mapping of the memory backing an * I/O transaction is handled by Xen, a second pass is unavoidable. * At least this way the second walk is a simple array traversal. * * \note A single Scatter/Gather element in the block interface covers * at most 1 machine page. In this context a sector (blkif * nomenclature, not what I'd choose) is a 512b aligned unit * of mapping within the machine page referenced by an S/G * element. */ struct xbb_sg { /** The number of 512b data chunks mapped in this S/G element. */ int16_t nsect; /** * The index (0 based) of the first 512b data chunk mapped * in this S/G element. */ uint8_t first_sect; /** * The index (0 based) of the last 512b data chunk mapped * in this S/G element. */ uint8_t last_sect; }; /** * Character device backend specific configuration data. */ struct xbb_dev_data { /** Cdev used for device backend access. */ struct cdev *cdev; /** Cdev switch used for device backend access. */ struct cdevsw *csw; /** Used to hold a reference on opened cdev backend devices. */ int dev_ref; }; /** * File backend specific configuration data. */ struct xbb_file_data { /** Credentials to use for vnode backed (file based) I/O. */ struct ucred *cred; /** * \brief Array of io vectors used to process file based I/O. * * Only a single file based request is outstanding per-xbb instance, * so we only need one of these. */ struct iovec xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST]; #ifdef XBB_USE_BOUNCE_BUFFERS /** * \brief Array of io vectors used to handle bouncing of file reads. * * Vnode operations are free to modify uio data during their * exectuion. In the case of a read with bounce buffering active, * we need some of the data from the original uio in order to * bounce-out the read data. This array serves as the temporary * storage for this saved data. */ struct iovec saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST]; /** * \brief Array of memoized bounce buffer kva offsets used * in the file based backend. * * Due to the way that the mapping of the memory backing an * I/O transaction is handled by Xen, a second pass through * the request sg elements is unavoidable. We memoize the computed * bounce address here to reduce the cost of the second walk. */ void *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST]; #endif /* XBB_USE_BOUNCE_BUFFERS */ }; /** * Collection of backend type specific data. */ union xbb_backend_data { struct xbb_dev_data dev; struct xbb_file_data file; }; /** * Function signature of backend specific I/O handlers. */ typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, int operation, int flags); /** * Per-instance configuration data. */ struct xbb_softc { /** * Task-queue used to process I/O requests. */ struct taskqueue *io_taskqueue; /** * Single "run the request queue" task enqueued * on io_taskqueue. */ struct task io_task; /** Device type for this instance. 
*/ xbb_type device_type; /** NewBus device corresponding to this instance. */ device_t dev; /** Backend specific dispatch routine for this instance. */ xbb_dispatch_t dispatch_io; /** The number of requests outstanding on the backend device/file. */ int active_request_count; /** Free pool of request tracking structures. */ struct xbb_xen_req_list request_free_stailq; /** Array, sized at connection time, of request tracking structures. */ struct xbb_xen_req *requests; /** Free pool of request list structures. */ struct xbb_xen_reqlist_list reqlist_free_stailq; /** List of pending request lists awaiting execution. */ struct xbb_xen_reqlist_list reqlist_pending_stailq; /** Array, sized at connection time, of request list structures. */ struct xbb_xen_reqlist *request_lists; /** * Global pool of kva used for mapping remote domain ring * and I/O transaction data. */ vm_offset_t kva; /** Psuedo-physical address corresponding to kva. */ uint64_t gnt_base_addr; /** The size of the global kva pool. */ int kva_size; /** The size of the KVA area used for request lists. */ int reqlist_kva_size; /** The number of pages of KVA used for request lists */ int reqlist_kva_pages; /** Bitmap of free KVA pages */ bitstr_t *kva_free; /** * \brief Cached value of the front-end's domain id. * * This value is used at once for each mapped page in * a transaction. We cache it to avoid incuring the * cost of an ivar access every time this is needed. */ domid_t otherend_id; /** * \brief The blkif protocol abi in effect. * * There are situations where the back and front ends can * have a different, native abi (e.g. intel x86_64 and * 32bit x86 domains on the same machine). The back-end * always accommodates the front-end's native abi. That * value is pulled from the XenStore and recorded here. */ int abi; /** * \brief The maximum number of requests and request lists allowed * to be in flight at a time. * * This value is negotiated via the XenStore. */ u_int max_requests; /** * \brief The maximum number of segments (1 page per segment) * that can be mapped by a request. * * This value is negotiated via the XenStore. */ u_int max_request_segments; /** * \brief Maximum number of segments per request list. * * This value is derived from and will generally be larger than * max_request_segments. */ u_int max_reqlist_segments; /** * The maximum size of any request to this back-end * device. * * This value is negotiated via the XenStore. */ u_int max_request_size; /** * The maximum size of any request list. This is derived directly * from max_reqlist_segments. */ u_int max_reqlist_size; /** Various configuration and state bit flags. */ xbb_flag_t flags; /** Ring mapping and interrupt configuration data. */ struct xbb_ring_config ring_config; /** Runtime, cross-abi safe, structures for ring access. */ blkif_back_rings_t rings; /** IRQ mapping for the communication ring event channel. */ xen_intr_handle_t xen_intr_handle; /** * \brief Backend access mode flags (e.g. write, or read-only). * * This value is passed to us by the front-end via the XenStore. */ char *dev_mode; /** * \brief Backend device type (e.g. "disk", "cdrom", "floppy"). * * This value is passed to us by the front-end via the XenStore. * Currently unused. */ char *dev_type; /** * \brief Backend device/file identifier. * * This value is passed to us by the front-end via the XenStore. * We expect this to be a POSIX path indicating the file or * device to open. */ char *dev_name; /** * Vnode corresponding to the backend device node or file * we are acessing. 
*/ struct vnode *vn; union xbb_backend_data backend; /** The native sector size of the backend. */ u_int sector_size; /** log2 of sector_size. */ u_int sector_size_shift; /** Size in bytes of the backend device or file. */ off_t media_size; /** * \brief media_size expressed in terms of the backend native * sector size. * * (e.g. xbb->media_size >> xbb->sector_size_shift). */ uint64_t media_num_sectors; /** * \brief Array of memoized scatter gather data computed during the * conversion of blkif ring requests to internal xbb_xen_req * structures. * * Ring processing is serialized so we only need one of these. */ struct xbb_sg xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST]; /** * Temporary grant table map used in xbb_dispatch_io(). When * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the * stack could cause a stack overflow. */ struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQLIST]; /** Mutex protecting per-instance data. */ struct mtx lock; /** * Resource representing allocated physical address space * associated with our per-instance kva region. */ struct resource *pseudo_phys_res; /** Resource id for allocated physical address space. */ int pseudo_phys_res_id; /** * I/O statistics from BlockBack dispatch down. These are * coalesced requests, and we start them right before execution. */ struct devstat *xbb_stats; /** * I/O statistics coming into BlockBack. These are the requests as * we get them from BlockFront. They are started as soon as we * receive a request, and completed when the I/O is complete. */ struct devstat *xbb_stats_in; /** Disable sending flush to the backend */ int disable_flush; /** Send a real flush for every N flush requests */ int flush_interval; /** Count of flush requests in the interval */ int flush_count; /** Don't coalesce requests if this is set */ int no_coalesce_reqs; /** Number of requests we have received */ uint64_t reqs_received; /** Number of requests we have completed*/ uint64_t reqs_completed; /** Number of requests we queued but not pushed*/ uint64_t reqs_queued_for_completion; /** Number of requests we completed with an error status*/ uint64_t reqs_completed_with_error; /** How many forced dispatches (i.e. without coalescing) have happened */ uint64_t forced_dispatch; /** How many normal dispatches have happened */ uint64_t normal_dispatch; /** How many total dispatches have happened */ uint64_t total_dispatch; /** How many times we have run out of KVA */ uint64_t kva_shortages; /** How many times we have run out of request structures */ uint64_t request_shortages; }; /*---------------------------- Request Processing ----------------------------*/ /** * Allocate an internal transaction tracking structure from the free pool. * * \param xbb Per-instance xbb configuration structure. * * \return On success, a pointer to the allocated xbb_xen_req structure. * Otherwise NULL. */ static inline struct xbb_xen_req * xbb_get_req(struct xbb_softc *xbb) { struct xbb_xen_req *req; req = NULL; mtx_assert(&xbb->lock, MA_OWNED); if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) { STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links); xbb->active_request_count++; } return (req); } /** * Return an allocated transaction tracking structure to the free pool. * * \param xbb Per-instance xbb configuration structure. * \param req The request structure to free. 
*/ static inline void xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req) { mtx_assert(&xbb->lock, MA_OWNED); STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links); xbb->active_request_count--; KASSERT(xbb->active_request_count >= 0, ("xbb_release_req: negative active count")); } /** * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool. * * \param xbb Per-instance xbb configuration structure. * \param req_list The list of requests to free. * \param nreqs The number of items in the list. */ static inline void xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list, int nreqs) { mtx_assert(&xbb->lock, MA_OWNED); STAILQ_CONCAT(&xbb->request_free_stailq, req_list); xbb->active_request_count -= nreqs; KASSERT(xbb->active_request_count >= 0, ("xbb_release_reqs: negative active count")); } /** * Given a page index and 512b sector offset within that page, * calculate an offset into a request's kva region. * * \param reqlist The request structure whose kva region will be accessed. * \param pagenr The page index used to compute the kva offset. * \param sector The 512b sector index used to compute the page relative * kva offset. * * \return The computed global KVA offset. */ static inline uint8_t * xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) { return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9)); } #ifdef XBB_USE_BOUNCE_BUFFERS /** * Given a page index and 512b sector offset within that page, * calculate an offset into a request's local bounce memory region. * * \param reqlist The request structure whose bounce region will be accessed. * \param pagenr The page index used to compute the bounce offset. * \param sector The 512b sector index used to compute the page relative * bounce offset. * * \return The computed global bounce buffer address. */ static inline uint8_t * xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) { return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9)); } #endif /** * Given a page number and 512b sector offset within that page, * calculate an offset into the request's memory region that the * underlying backend device/file should use for I/O. * * \param reqlist The request structure whose I/O region will be accessed. * \param pagenr The page index used to compute the I/O offset. * \param sector The 512b sector index used to compute the page relative * I/O offset. * * \return The computed global I/O address. * * Depending on configuration, this will either be a local bounce buffer * or a pointer to the memory mapped in from the front-end domain for * this request. */ static inline uint8_t * xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) { #ifdef XBB_USE_BOUNCE_BUFFERS return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector)); #else return (xbb_reqlist_vaddr(reqlist, pagenr, sector)); #endif } /** * Given a page index and 512b sector offset within that page, calculate * an offset into the local pseudo-physical address space used to map a * front-end's request data into a request. * * \param reqlist The request list structure whose pseudo-physical region * will be accessed. * \param pagenr The page index used to compute the pseudo-physical offset. * \param sector The 512b sector index used to compute the page relative * pseudo-physical offset. * * \return The computed global pseudo-physical address.
* * Depending on configuration, this will either be a local bounce buffer * or a pointer to the memory mapped in from the front-end domain for * this request. */ static inline uintptr_t xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) { struct xbb_softc *xbb; xbb = reqlist->xbb; return ((uintptr_t)(xbb->gnt_base_addr + (uintptr_t)(reqlist->kva - xbb->kva) + (PAGE_SIZE * pagenr) + (sector << 9))); } /** * Get Kernel Virtual Address space for mapping requests. * * \param xbb Per-instance xbb configuration structure. * \param nr_pages Number of pages needed. * \param check_only If set, check for free KVA but don't allocate it. * \param have_lock If set, xbb lock is already held. * * \return On success, a pointer to the allocated KVA region. Otherwise NULL. * * Note: This should be unnecessary once we have either chaining or * scatter/gather support for struct bio. At that point we'll be able to * put multiple addresses and lengths in one bio/bio chain and won't need * to map everything into one virtual segment. */ static uint8_t * xbb_get_kva(struct xbb_softc *xbb, int nr_pages) { - intptr_t first_clear; - intptr_t num_clear; + int first_clear; + int num_clear; uint8_t *free_kva; int i; KASSERT(nr_pages != 0, ("xbb_get_kva of zero length")); first_clear = 0; free_kva = NULL; mtx_lock(&xbb->lock); /* * Look for the first available page. If there are none, we're done. */ bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear); if (first_clear == -1) goto bailout; /* * Starting at the first available page, look for consecutive free * pages that will satisfy the user's request. */ for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) { /* * If this is true, the page is used, so we have to reset * the number of clear pages and the first clear page * (since it pointed to a region with an insufficient number * of clear pages). */ if (bit_test(xbb->kva_free, i)) { num_clear = 0; first_clear = -1; continue; } if (first_clear == -1) first_clear = i; /* * If this is true, we've found a large enough free region * to satisfy the request. */ if (++num_clear == nr_pages) { bit_nset(xbb->kva_free, first_clear, first_clear + nr_pages - 1); free_kva = xbb->kva + - (uint8_t *)(first_clear * PAGE_SIZE); + (uint8_t *)((intptr_t)first_clear * PAGE_SIZE); KASSERT(free_kva >= (uint8_t *)xbb->kva && free_kva + (nr_pages * PAGE_SIZE) <= (uint8_t *)xbb->ring_config.va, ("Free KVA %p len %d out of range, " "kva = %#jx, ring VA = %#jx\n", free_kva, nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva, (uintmax_t)xbb->ring_config.va)); break; } } bailout: if (free_kva == NULL) { xbb->flags |= XBBF_RESOURCE_SHORTAGE; xbb->kva_shortages++; } mtx_unlock(&xbb->lock); return (free_kva); } /** * Free allocated KVA. * * \param xbb Per-instance xbb configuration structure. * \param kva_ptr Pointer to allocated KVA region. * \param nr_pages Number of pages in the KVA region. */ static void xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages) { intptr_t start_page; mtx_assert(&xbb->lock, MA_OWNED); start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT; bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1); } /** * Unmap the front-end pages associated with this I/O request. * * \param req The request structure to unmap. 
*/ static void xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist) { struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST]; u_int i; u_int invcount; int error; invcount = 0; for (i = 0; i < reqlist->nr_segments; i++) { if (reqlist->gnt_handles[i] == GRANT_REF_INVALID) continue; unmap[invcount].host_addr = xbb_get_gntaddr(reqlist, i, 0); unmap[invcount].dev_bus_addr = 0; unmap[invcount].handle = reqlist->gnt_handles[i]; reqlist->gnt_handles[i] = GRANT_REF_INVALID; invcount++; } error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap, invcount); KASSERT(error == 0, ("Grant table operation failed")); } /** * Allocate an internal transaction tracking structure from the free pool. * * \param xbb Per-instance xbb configuration structure. * * \return On success, a pointer to the allocated xbb_xen_reqlist structure. * Otherwise NULL. */ static inline struct xbb_xen_reqlist * xbb_get_reqlist(struct xbb_softc *xbb) { struct xbb_xen_reqlist *reqlist; reqlist = NULL; mtx_assert(&xbb->lock, MA_OWNED); if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) { STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links); reqlist->flags = XBB_REQLIST_NONE; reqlist->kva = NULL; reqlist->status = BLKIF_RSP_OKAY; reqlist->residual_512b_sectors = 0; reqlist->num_children = 0; reqlist->nr_segments = 0; STAILQ_INIT(&reqlist->contig_req_list); } return (reqlist); } /** * Return an allocated transaction tracking structure to the free pool. * * \param xbb Per-instance xbb configuration structure. * \param req The request list structure to free. * \param wakeup If set, wakeup the work thread if freeing this reqlist * during a resource shortage condition. */ static inline void xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, int wakeup) { mtx_assert(&xbb->lock, MA_OWNED); if (wakeup) { wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE; xbb->flags &= ~XBBF_RESOURCE_SHORTAGE; } if (reqlist->kva != NULL) xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments); xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children); STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); if ((xbb->flags & XBBF_SHUTDOWN) != 0) { /* * Shutdown is in progress. See if we can * progress further now that one more request * has completed and been returned to the * free pool. */ xbb_shutdown(xbb); } if (wakeup != 0) taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); } /** * Request resources and do basic request setup. * * \param xbb Per-instance xbb configuration structure. * \param reqlist Pointer to reqlist pointer. * \param ring_req Pointer to a block ring request. * \param ring_index The ring index of this request. * * \return 0 for success, non-zero for failure. */ static int xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist, blkif_request_t *ring_req, RING_IDX ring_idx) { struct xbb_xen_reqlist *nreqlist; struct xbb_xen_req *nreq; nreqlist = NULL; nreq = NULL; mtx_lock(&xbb->lock); /* * We don't allow new resources to be allocated if we're in the * process of shutting down. */ if ((xbb->flags & XBBF_SHUTDOWN) != 0) { mtx_unlock(&xbb->lock); return (1); } /* * Allocate a reqlist if the caller doesn't have one already. */ if (*reqlist == NULL) { nreqlist = xbb_get_reqlist(xbb); if (nreqlist == NULL) goto bailout_error; } /* We always allocate a request. 
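* Even when this ring entry is coalesced into an existing request list, it gets its own xbb_xen_req below so that its id, ring index, and devstat start time are tracked individually.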
*/ nreq = xbb_get_req(xbb); if (nreq == NULL) goto bailout_error; mtx_unlock(&xbb->lock); if (*reqlist == NULL) { *reqlist = nreqlist; nreqlist->operation = ring_req->operation; nreqlist->starting_sector_number = ring_req->sector_number; STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist, links); } nreq->reqlist = *reqlist; nreq->req_ring_idx = ring_idx; nreq->id = ring_req->id; nreq->operation = ring_req->operation; if (xbb->abi != BLKIF_PROTOCOL_NATIVE) { bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req)); nreq->ring_req = &nreq->ring_req_storage; } else { nreq->ring_req = ring_req; } binuptime(&nreq->ds_t0); devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0); STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links); (*reqlist)->num_children++; (*reqlist)->nr_segments += ring_req->nr_segments; return (0); bailout_error: /* * We're out of resources, so set the shortage flag. The next time * a request is released, we'll try waking up the work thread to * see if we can allocate more resources. */ xbb->flags |= XBBF_RESOURCE_SHORTAGE; xbb->request_shortages++; if (nreq != NULL) xbb_release_req(xbb, nreq); if (nreqlist != NULL) xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0); mtx_unlock(&xbb->lock); return (1); } /** * Create and queue a response to a blkif request. * * \param xbb Per-instance xbb configuration structure. * \param req The request structure to which to respond. * \param status The status code to report. See BLKIF_RSP_* * in sys/xen/interface/io/blkif.h. */ static void xbb_queue_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status) { blkif_response_t *resp; /* * The mutex is required here, and should be held across this call * until after the subsequent call to xbb_push_responses(). This * is to guarantee that another context won't queue responses and * push them while we're active. * * That could lead to the other end being notified of responses * before the resources have been freed on this end. The other end * would then be able to queue additional I/O, and we may run out * of resources because we haven't freed them all yet. */ mtx_assert(&xbb->lock, MA_OWNED); /* * Place on the response ring for the relevant domain. * For now, only the spacing between entries is different * in the different ABIs, not the response entry layout. */ switch (xbb->abi) { case BLKIF_PROTOCOL_NATIVE: resp = RING_GET_RESPONSE(&xbb->rings.native, xbb->rings.native.rsp_prod_pvt); break; case BLKIF_PROTOCOL_X86_32: resp = (blkif_response_t *) RING_GET_RESPONSE(&xbb->rings.x86_32, xbb->rings.x86_32.rsp_prod_pvt); break; case BLKIF_PROTOCOL_X86_64: resp = (blkif_response_t *) RING_GET_RESPONSE(&xbb->rings.x86_64, xbb->rings.x86_64.rsp_prod_pvt); break; default: panic("Unexpected blkif protocol ABI."); } resp->id = req->id; resp->operation = req->operation; resp->status = status; if (status != BLKIF_RSP_OKAY) xbb->reqs_completed_with_error++; xbb->rings.common.rsp_prod_pvt++; xbb->reqs_queued_for_completion++; } /** * Send queued responses to blkif requests. * * \param xbb Per-instance xbb configuration structure. * \param run_taskqueue Flag that is set to 1 if the taskqueue * should be run, 0 if it does not need to be run. * \param notify Flag that is set to 1 if the other end should be * notified via irq, 0 if the other end should not be * notified. */ static void xbb_push_responses(struct xbb_softc *xbb, int *run_taskqueue, int *notify) { int more_to_do; /* * The mutex is required here. 
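* It serializes updates to the shared ring's response producer index and to the reqs_completed/reqs_queued_for_completion counters adjusted below.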
*/ mtx_assert(&xbb->lock, MA_OWNED); more_to_do = 0; RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, *notify); if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) { /* * Tail check for pending requests. Allows frontend to avoid * notifications if requests are already in flight (lower * overheads and promotes batching). */ RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do); } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) { more_to_do = 1; } xbb->reqs_completed += xbb->reqs_queued_for_completion; xbb->reqs_queued_for_completion = 0; *run_taskqueue = more_to_do; } /** * Complete a request list. * * \param xbb Per-instance xbb configuration structure. * \param reqlist Allocated internal request list structure. */ static void xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) { struct xbb_xen_req *nreq; off_t sectors_sent; int notify, run_taskqueue; sectors_sent = 0; if (reqlist->flags & XBB_REQLIST_MAPPED) xbb_unmap_reqlist(reqlist); mtx_lock(&xbb->lock); /* * All I/O is done, send the response. A lock is not necessary * to protect the request list, because all requests have * completed. Therefore this is the only context accessing this * reqlist right now. However, in order to make sure that no one * else queues responses onto the queue or pushes them to the other * side while we're active, we need to hold the lock across the * calls to xbb_queue_response() and xbb_push_responses(). */ STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { off_t cur_sectors_sent; /* Put this response on the ring, but don't push yet */ xbb_queue_response(xbb, nreq, reqlist->status); /* We don't report bytes sent if there is an error. */ if (reqlist->status == BLKIF_RSP_OKAY) cur_sectors_sent = nreq->nr_512b_sectors; else cur_sectors_sent = 0; sectors_sent += cur_sectors_sent; devstat_end_transaction(xbb->xbb_stats_in, /*bytes*/cur_sectors_sent << 9, reqlist->ds_tag_type, reqlist->ds_trans_type, /*now*/NULL, /*then*/&nreq->ds_t0); } /* * Take out any sectors not sent. If we wind up negative (which * might happen if an error is reported as well as a residual), just * report 0 sectors sent. */ sectors_sent -= reqlist->residual_512b_sectors; if (sectors_sent < 0) sectors_sent = 0; devstat_end_transaction(xbb->xbb_stats, /*bytes*/ sectors_sent << 9, reqlist->ds_tag_type, reqlist->ds_trans_type, /*now*/NULL, /*then*/&reqlist->ds_t0); xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1); xbb_push_responses(xbb, &run_taskqueue, ¬ify); mtx_unlock(&xbb->lock); if (run_taskqueue) taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); if (notify) xen_intr_signal(xbb->xen_intr_handle); } /** * Completion handler for buffer I/O requests issued by the device * backend driver. * * \param bio The buffer I/O request on which to perform completion * processing. */ static void xbb_bio_done(struct bio *bio) { struct xbb_softc *xbb; struct xbb_xen_reqlist *reqlist; reqlist = bio->bio_caller1; xbb = reqlist->xbb; reqlist->residual_512b_sectors += bio->bio_resid >> 9; /* * This is a bit imprecise. With aggregated I/O a single * request list can contain multiple front-end requests and * a multiple bios may point to a single request. By carefully * walking the request list, we could map residuals and errors * back to the original front-end request, but the interface * isn't sufficiently rich for us to properly report the error. * So, we just treat the entire request list as having failed if an * error occurs on any part. 
And, if an error occurs, we treat * the amount of data transferred as 0. * * For residuals, we report it on the overall aggregated device, * but not on the individual requests, since we don't currently * do the work to determine which front-end request to which the * residual applies. */ if (bio->bio_error) { DPRINTF("BIO returned error %d for operation on device %s\n", bio->bio_error, xbb->dev_name); reqlist->status = BLKIF_RSP_ERROR; if (bio->bio_error == ENXIO && xenbus_get_state(xbb->dev) == XenbusStateConnected) { /* * Backend device has disappeared. Signal the * front-end that we (the device proxy) want to * go away. */ xenbus_set_state(xbb->dev, XenbusStateClosing); } } #ifdef XBB_USE_BOUNCE_BUFFERS if (bio->bio_cmd == BIO_READ) { vm_offset_t kva_offset; kva_offset = (vm_offset_t)bio->bio_data - (vm_offset_t)reqlist->bounce; memcpy((uint8_t *)reqlist->kva + kva_offset, bio->bio_data, bio->bio_bcount); } #endif /* XBB_USE_BOUNCE_BUFFERS */ /* * Decrement the pending count for the request list. When we're * done with the requests, send status back for all of them. */ if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1) xbb_complete_reqlist(xbb, reqlist); g_destroy_bio(bio); } /** * Parse a blkif request into an internal request structure and send * it to the backend for processing. * * \param xbb Per-instance xbb configuration structure. * \param reqlist Allocated internal request list structure. * * \return On success, 0. For resource shortages, non-zero. * * This routine performs the backend common aspects of request parsing * including compiling an internal request structure, parsing the S/G * list and any secondary ring requests in which they may reside, and * the mapping of front-end I/O pages into our domain. */ static int xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) { struct xbb_sg *xbb_sg; struct gnttab_map_grant_ref *map; struct blkif_request_segment *sg; struct blkif_request_segment *last_block_sg; struct xbb_xen_req *nreq; u_int nseg; u_int seg_idx; u_int block_segs; int nr_sects; int total_sects; int operation; uint8_t bio_flags; int error; reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE; bio_flags = 0; total_sects = 0; nr_sects = 0; /* * First determine whether we have enough free KVA to satisfy this * request list. If not, tell xbb_run_queue() so it can go to * sleep until we have more KVA. */ reqlist->kva = NULL; if (reqlist->nr_segments != 0) { reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments); if (reqlist->kva == NULL) { /* * If we're out of KVA, return ENOMEM. */ return (ENOMEM); } } binuptime(&reqlist->ds_t0); devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0); switch (reqlist->operation) { case BLKIF_OP_WRITE_BARRIER: bio_flags |= BIO_ORDERED; reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; /* FALLTHROUGH */ case BLKIF_OP_WRITE: operation = BIO_WRITE; reqlist->ds_trans_type = DEVSTAT_WRITE; if ((xbb->flags & XBBF_READ_ONLY) != 0) { DPRINTF("Attempt to write to read only device %s\n", xbb->dev_name); reqlist->status = BLKIF_RSP_ERROR; goto send_response; } break; case BLKIF_OP_READ: operation = BIO_READ; reqlist->ds_trans_type = DEVSTAT_READ; break; case BLKIF_OP_FLUSH_DISKCACHE: /* * If this is true, the user has requested that we disable * flush support. So we just complete the requests * successfully. */ if (xbb->disable_flush != 0) { goto send_response; } /* * The user has requested that we only send a real flush * for every N flush requests. So keep count, and either * complete the request immediately or queue it for the * backend. 
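* For example, with a flush_interval of 4, every fourth BLKIF_OP_FLUSH_DISKCACHE is dispatched to the backend as a BIO_FLUSH and the other three are completed immediately.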
*/ if (xbb->flush_interval != 0) { if (++(xbb->flush_count) < xbb->flush_interval) { goto send_response; } else xbb->flush_count = 0; } operation = BIO_FLUSH; reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; reqlist->ds_trans_type = DEVSTAT_NO_DATA; goto do_dispatch; /*NOTREACHED*/ default: DPRINTF("error: unknown block io operation [%d]\n", reqlist->operation); reqlist->status = BLKIF_RSP_ERROR; goto send_response; } reqlist->xbb = xbb; xbb_sg = xbb->xbb_sgs; map = xbb->maps; seg_idx = 0; STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { blkif_request_t *ring_req; RING_IDX req_ring_idx; u_int req_seg_idx; ring_req = nreq->ring_req; req_ring_idx = nreq->req_ring_idx; nr_sects = 0; nseg = ring_req->nr_segments; nreq->nr_pages = nseg; nreq->nr_512b_sectors = 0; req_seg_idx = 0; sg = NULL; /* Check that number of segments is sane. */ if (__predict_false(nseg == 0) || __predict_false(nseg > xbb->max_request_segments)) { DPRINTF("Bad number of segments in request (%d)\n", nseg); reqlist->status = BLKIF_RSP_ERROR; goto send_response; } block_segs = nseg; sg = ring_req->seg; last_block_sg = sg + block_segs; while (sg < last_block_sg) { KASSERT(seg_idx < XBB_MAX_SEGMENTS_PER_REQLIST, ("seg_idx %d is too large, max " "segs %d\n", seg_idx, XBB_MAX_SEGMENTS_PER_REQLIST)); xbb_sg->first_sect = sg->first_sect; xbb_sg->last_sect = sg->last_sect; xbb_sg->nsect = (int8_t)(sg->last_sect - sg->first_sect + 1); if ((sg->last_sect >= (PAGE_SIZE >> 9)) || (xbb_sg->nsect <= 0)) { reqlist->status = BLKIF_RSP_ERROR; goto send_response; } nr_sects += xbb_sg->nsect; map->host_addr = xbb_get_gntaddr(reqlist, seg_idx, /*sector*/0); KASSERT(map->host_addr + PAGE_SIZE <= xbb->ring_config.gnt_addr, ("Host address %#jx len %d overlaps " "ring address %#jx\n", (uintmax_t)map->host_addr, PAGE_SIZE, (uintmax_t)xbb->ring_config.gnt_addr)); map->flags = GNTMAP_host_map; map->ref = sg->gref; map->dom = xbb->otherend_id; if (operation == BIO_WRITE) map->flags |= GNTMAP_readonly; sg++; map++; xbb_sg++; seg_idx++; req_seg_idx++; } /* Convert to the disk's sector size */ nreq->nr_512b_sectors = nr_sects; nr_sects = (nr_sects << 9) >> xbb->sector_size_shift; total_sects += nr_sects; if ((nreq->nr_512b_sectors & ((xbb->sector_size >> 9) - 1)) != 0) { device_printf(xbb->dev, "%s: I/O size (%d) is not " "a multiple of the backing store sector " "size (%d)\n", __func__, nreq->nr_512b_sectors << 9, xbb->sector_size); reqlist->status = BLKIF_RSP_ERROR; goto send_response; } } error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, xbb->maps, reqlist->nr_segments); if (error != 0) panic("Grant table operation failed (%d)", error); reqlist->flags |= XBB_REQLIST_MAPPED; for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments; seg_idx++, map++){ if (__predict_false(map->status != 0)) { DPRINTF("invalid buffer -- could not remap " "it (%d)\n", map->status); DPRINTF("Mapping(%d): Host Addr 0x%"PRIx64", flags " "0x%x ref 0x%x, dom %d\n", seg_idx, map->host_addr, map->flags, map->ref, map->dom); reqlist->status = BLKIF_RSP_ERROR; goto send_response; } reqlist->gnt_handles[seg_idx] = map->handle; } if (reqlist->starting_sector_number + total_sects > xbb->media_num_sectors) { DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] " "extends past end of device %s\n", operation == BIO_READ ? 
"read" : "write", reqlist->starting_sector_number, reqlist->starting_sector_number + total_sects, xbb->dev_name); reqlist->status = BLKIF_RSP_ERROR; goto send_response; } do_dispatch: error = xbb->dispatch_io(xbb, reqlist, operation, bio_flags); if (error != 0) { reqlist->status = BLKIF_RSP_ERROR; goto send_response; } return (0); send_response: xbb_complete_reqlist(xbb, reqlist); return (0); } static __inline int xbb_count_sects(blkif_request_t *ring_req) { int i; int cur_size = 0; for (i = 0; i < ring_req->nr_segments; i++) { int nsect; nsect = (int8_t)(ring_req->seg[i].last_sect - ring_req->seg[i].first_sect + 1); if (nsect <= 0) break; cur_size += nsect; } return (cur_size); } /** * Process incoming requests from the shared communication ring in response * to a signal on the ring's event channel. * * \param context Callback argument registerd during task initialization - * the xbb_softc for this instance. * \param pending The number of taskqueue_enqueue events that have * occurred since this handler was last run. */ static void xbb_run_queue(void *context, int pending) { struct xbb_softc *xbb; blkif_back_rings_t *rings; RING_IDX rp; uint64_t cur_sector; int cur_operation; struct xbb_xen_reqlist *reqlist; xbb = (struct xbb_softc *)context; rings = &xbb->rings; /* * Work gather and dispatch loop. Note that we have a bias here * towards gathering I/O sent by blockfront. We first gather up * everything in the ring, as long as we have resources. Then we * dispatch one request, and then attempt to gather up any * additional requests that have come in while we were dispatching * the request. * * This allows us to get a clearer picture (via devstat) of how * many requests blockfront is queueing to us at any given time. */ for (;;) { int retval; /* * Initialize reqlist to the last element in the pending * queue, if there is one. This allows us to add more * requests to that request list, if we have room. */ reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq, xbb_xen_reqlist, links); if (reqlist != NULL) { cur_sector = reqlist->next_contig_sector; cur_operation = reqlist->operation; } else { cur_operation = 0; cur_sector = 0; } /* * Cache req_prod to avoid accessing a cache line shared * with the frontend. */ rp = rings->common.sring->req_prod; /* Ensure we see queued requests up to 'rp'. */ rmb(); /** * Run so long as there is work to consume and the generation * of a response will not overflow the ring. * * @note There's a 1 to 1 relationship between requests and * responses, so an overflow should never occur. This * test is to protect our domain from digesting bogus * data. Shouldn't we log this? 
*/ while (rings->common.req_cons != rp && RING_REQUEST_CONS_OVERFLOW(&rings->common, rings->common.req_cons) == 0){ blkif_request_t ring_req_storage; blkif_request_t *ring_req; int cur_size; switch (xbb->abi) { case BLKIF_PROTOCOL_NATIVE: ring_req = RING_GET_REQUEST(&xbb->rings.native, rings->common.req_cons); break; case BLKIF_PROTOCOL_X86_32: { struct blkif_x86_32_request *ring_req32; ring_req32 = RING_GET_REQUEST( &xbb->rings.x86_32, rings->common.req_cons); blkif_get_x86_32_req(&ring_req_storage, ring_req32); ring_req = &ring_req_storage; break; } case BLKIF_PROTOCOL_X86_64: { struct blkif_x86_64_request *ring_req64; ring_req64 =RING_GET_REQUEST(&xbb->rings.x86_64, rings->common.req_cons); blkif_get_x86_64_req(&ring_req_storage, ring_req64); ring_req = &ring_req_storage; break; } default: panic("Unexpected blkif protocol ABI."); /* NOTREACHED */ } /* * Check for situations that would require closing * off this I/O for further coalescing: * - Coalescing is turned off. * - Current I/O is out of sequence with the previous * I/O. * - Coalesced I/O would be too large. */ if ((reqlist != NULL) && ((xbb->no_coalesce_reqs != 0) || ((xbb->no_coalesce_reqs == 0) && ((ring_req->sector_number != cur_sector) || (ring_req->operation != cur_operation) || ((ring_req->nr_segments + reqlist->nr_segments) > xbb->max_reqlist_segments))))) { reqlist = NULL; } /* * Grab and check for all resources in one shot. * If we can't get all of the resources we need, * the shortage is noted and the thread will get * woken up when more resources are available. */ retval = xbb_get_resources(xbb, &reqlist, ring_req, xbb->rings.common.req_cons); if (retval != 0) { /* * Resource shortage has been recorded. * We'll be scheduled to run once a request * object frees up due to a completion. */ break; } /* * Signify that we can overwrite this request with * a response by incrementing our consumer index. * The response won't be generated until after * we've already consumed all necessary data out * of the version of the request in the ring buffer * (for native mode). We must update the consumer * index before issuing back-end I/O so there is * no possibility that it will complete and a * response be generated before we make room in * the queue for that response. */ xbb->rings.common.req_cons++; xbb->reqs_received++; cur_size = xbb_count_sects(ring_req); cur_sector = ring_req->sector_number + cur_size; reqlist->next_contig_sector = cur_sector; cur_operation = ring_req->operation; } /* Check for I/O to dispatch */ reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); if (reqlist == NULL) { /* * We're out of work to do, put the task queue to * sleep. */ break; } /* * Grab the first request off the queue and attempt * to dispatch it. */ STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links); retval = xbb_dispatch_io(xbb, reqlist); if (retval != 0) { /* * xbb_dispatch_io() returns non-zero only when * there is a resource shortage. If that's the * case, re-queue this request on the head of the * queue, and go to sleep until we have more * resources. */ STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq, reqlist, links); break; } else { /* * If we still have anything on the queue after * removing the head entry, that is because we * met one of the criteria to create a new * request list (outlined above), and we'll call * that a forced dispatch for statistical purposes. * * Otherwise, if there is only one element on the * queue, we coalesced everything available on * the ring and we'll call that a normal dispatch. 
*/ reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); if (reqlist != NULL) xbb->forced_dispatch++; else xbb->normal_dispatch++; xbb->total_dispatch++; } } } /** * Interrupt handler bound to the shared ring's event channel. * * \param arg Callback argument registerd during event channel * binding - the xbb_softc for this instance. */ static int xbb_filter(void *arg) { struct xbb_softc *xbb; /* Defer to taskqueue thread. */ xbb = (struct xbb_softc *)arg; taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); return (FILTER_HANDLED); } SDT_PROVIDER_DEFINE(xbb); SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, "int"); SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, "int", "uint64_t", "uint64_t"); SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, "int", "uint64_t", "uint64_t"); /*----------------------------- Backend Handlers -----------------------------*/ /** * Backend handler for character device access. * * \param xbb Per-instance xbb configuration structure. * \param reqlist Allocated internal request list structure. * \param operation BIO_* I/O operation code. * \param bio_flags Additional bio_flag data to pass to any generated * bios (e.g. BIO_ORDERED).. * * \return 0 for success, errno codes for failure. */ static int xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, int operation, int bio_flags) { struct xbb_dev_data *dev_data; struct bio *bios[XBB_MAX_SEGMENTS_PER_REQLIST]; off_t bio_offset; struct bio *bio; struct xbb_sg *xbb_sg; u_int nbio; u_int bio_idx; u_int nseg; u_int seg_idx; int error; dev_data = &xbb->backend.dev; bio_offset = (off_t)reqlist->starting_sector_number << xbb->sector_size_shift; error = 0; nbio = 0; bio_idx = 0; if (operation == BIO_FLUSH) { bio = g_new_bio(); if (__predict_false(bio == NULL)) { DPRINTF("Unable to allocate bio for BIO_FLUSH\n"); error = ENOMEM; return (error); } bio->bio_cmd = BIO_FLUSH; bio->bio_flags |= BIO_ORDERED; bio->bio_dev = dev_data->cdev; bio->bio_offset = 0; bio->bio_data = 0; bio->bio_done = xbb_bio_done; bio->bio_caller1 = reqlist; bio->bio_pblkno = 0; reqlist->pendcnt = 1; SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush, device_get_unit(xbb->dev)); (*dev_data->csw->d_strategy)(bio); return (0); } xbb_sg = xbb->xbb_sgs; bio = NULL; nseg = reqlist->nr_segments; for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { /* * KVA will not be contiguous, so any additional * I/O will need to be represented in a new bio. */ if ((bio != NULL) && (xbb_sg->first_sect != 0)) { if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { printf("%s: Discontiguous I/O request " "from domain %d ends on " "non-sector boundary\n", __func__, xbb->otherend_id); error = EINVAL; goto fail_free_bios; } bio = NULL; } if (bio == NULL) { /* * Make sure that the start of this bio is * aligned to a device sector. 
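* (e.g. with a 4096 byte sector_size, bio_offset must be a multiple of 4096; the mask check below assumes sector_size is a power of 2.)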
*/ if ((bio_offset & (xbb->sector_size - 1)) != 0){ printf("%s: Misaligned I/O request " "from domain %d\n", __func__, xbb->otherend_id); error = EINVAL; goto fail_free_bios; } bio = bios[nbio++] = g_new_bio(); if (__predict_false(bio == NULL)) { error = ENOMEM; goto fail_free_bios; } bio->bio_cmd = operation; bio->bio_flags |= bio_flags; bio->bio_dev = dev_data->cdev; bio->bio_offset = bio_offset; bio->bio_data = xbb_reqlist_ioaddr(reqlist, seg_idx, xbb_sg->first_sect); bio->bio_done = xbb_bio_done; bio->bio_caller1 = reqlist; bio->bio_pblkno = bio_offset >> xbb->sector_size_shift; } bio->bio_length += xbb_sg->nsect << 9; bio->bio_bcount = bio->bio_length; bio_offset += xbb_sg->nsect << 9; if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) { if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { printf("%s: Discontiguous I/O request " "from domain %d ends on " "non-sector boundary\n", __func__, xbb->otherend_id); error = EINVAL; goto fail_free_bios; } /* * KVA will not be contiguous, so any additional * I/O will need to be represented in a new bio. */ bio = NULL; } } reqlist->pendcnt = nbio; for (bio_idx = 0; bio_idx < nbio; bio_idx++) { #ifdef XBB_USE_BOUNCE_BUFFERS vm_offset_t kva_offset; kva_offset = (vm_offset_t)bios[bio_idx]->bio_data - (vm_offset_t)reqlist->bounce; if (operation == BIO_WRITE) { memcpy(bios[bio_idx]->bio_data, (uint8_t *)reqlist->kva + kva_offset, bios[bio_idx]->bio_bcount); } #endif if (operation == BIO_READ) { SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read, device_get_unit(xbb->dev), bios[bio_idx]->bio_offset, bios[bio_idx]->bio_length); } else if (operation == BIO_WRITE) { SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write, device_get_unit(xbb->dev), bios[bio_idx]->bio_offset, bios[bio_idx]->bio_length); } (*dev_data->csw->d_strategy)(bios[bio_idx]); } return (error); fail_free_bios: for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++) g_destroy_bio(bios[bio_idx]); return (error); } SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, "int"); SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, "int", "uint64_t", "uint64_t"); SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, "int", "uint64_t", "uint64_t"); /** * Backend handler for file access. * * \param xbb Per-instance xbb configuration structure. * \param reqlist Allocated internal request list. * \param operation BIO_* I/O operation code. * \param flags Additional bio_flag data to pass to any generated bios * (e.g. BIO_ORDERED).. * * \return 0 for success, errno codes for failure. 
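* \note Unlike xbb_dispatch_dev(), this handler performs the I/O synchronously via VOP_READ()/VOP_WRITE() and completes the request list before returning.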
*/ static int xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, int operation, int flags) { struct xbb_file_data *file_data; u_int seg_idx; u_int nseg; struct uio xuio; struct xbb_sg *xbb_sg; struct iovec *xiovec; #ifdef XBB_USE_BOUNCE_BUFFERS void **p_vaddr; int saved_uio_iovcnt; #endif /* XBB_USE_BOUNCE_BUFFERS */ int error; file_data = &xbb->backend.file; error = 0; bzero(&xuio, sizeof(xuio)); switch (operation) { case BIO_READ: xuio.uio_rw = UIO_READ; break; case BIO_WRITE: xuio.uio_rw = UIO_WRITE; break; case BIO_FLUSH: { struct mount *mountpoint; SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush, device_get_unit(xbb->dev)); (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT); vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread); VOP_UNLOCK(xbb->vn, 0); vn_finished_write(mountpoint); goto bailout_send_response; /* NOTREACHED */ } default: panic("invalid operation %d", operation); /* NOTREACHED */ } xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number << xbb->sector_size_shift; xuio.uio_segflg = UIO_SYSSPACE; xuio.uio_iov = file_data->xiovecs; xuio.uio_iovcnt = 0; xbb_sg = xbb->xbb_sgs; nseg = reqlist->nr_segments; for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { /* * If the first sector is not 0, the KVA will * not be contiguous and we'll need to go on * to another segment. */ if (xbb_sg->first_sect != 0) xiovec = NULL; if (xiovec == NULL) { xiovec = &file_data->xiovecs[xuio.uio_iovcnt]; xiovec->iov_base = xbb_reqlist_ioaddr(reqlist, seg_idx, xbb_sg->first_sect); #ifdef XBB_USE_BOUNCE_BUFFERS /* * Store the address of the incoming * buffer at this particular offset * as well, so we can do the copy * later without having to do more * work to recalculate this address. */ p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt]; *p_vaddr = xbb_reqlist_vaddr(reqlist, seg_idx, xbb_sg->first_sect); #endif /* XBB_USE_BOUNCE_BUFFERS */ xiovec->iov_len = 0; xuio.uio_iovcnt++; } xiovec->iov_len += xbb_sg->nsect << 9; xuio.uio_resid += xbb_sg->nsect << 9; /* * If the last sector is not the full page * size count, the next segment will not be * contiguous in KVA and we need a new iovec. */ if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) xiovec = NULL; } xuio.uio_td = curthread; #ifdef XBB_USE_BOUNCE_BUFFERS saved_uio_iovcnt = xuio.uio_iovcnt; if (operation == BIO_WRITE) { /* Copy the write data to the local buffer. */ for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt; seg_idx++, xiovec++, p_vaddr++) { memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len); } } else { /* * We only need to save off the iovecs in the case of a * read, because the copy for the read happens after the * VOP_READ(). (The uio will get modified in that call * sequence.) */ memcpy(file_data->saved_xiovecs, xuio.uio_iov, xuio.uio_iovcnt * sizeof(xuio.uio_iov[0])); } #endif /* XBB_USE_BOUNCE_BUFFERS */ switch (operation) { case BIO_READ: SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read, device_get_unit(xbb->dev), xuio.uio_offset, xuio.uio_resid); vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); /* * UFS pays attention to IO_DIRECT for reads. If the * DIRECTIO option is configured into the kernel, it calls * ffs_rawread(). But that only works for single-segment * uios with user space addresses. In our case, with a * kernel uio, it still reads into the buffer cache, but it * will just try to release the buffer from the cache later * on in ffs_read(). 
* * ZFS does not pay attention to IO_DIRECT for reads. * * UFS does not pay attention to IO_SYNC for reads. * * ZFS pays attention to IO_SYNC (which translates into the * Solaris define FRSYNC for zfs_read()) for reads. It * attempts to sync the file before reading. * * So, to attempt to provide some barrier semantics in the * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC. */ error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? (IO_DIRECT|IO_SYNC) : 0, file_data->cred); VOP_UNLOCK(xbb->vn, 0); break; case BIO_WRITE: { struct mount *mountpoint; SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write, device_get_unit(xbb->dev), xuio.uio_offset, xuio.uio_resid); (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT); vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); /* * UFS pays attention to IO_DIRECT for writes. The write * is done asynchronously. (Normally the write would just * get put into cache. * * UFS pays attention to IO_SYNC for writes. It will * attempt to write the buffer out synchronously if that * flag is set. * * ZFS does not pay attention to IO_DIRECT for writes. * * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC) * for writes. It will flush the transaction from the * cache before returning. * * So if we've got the BIO_ORDERED flag set, we want * IO_SYNC in either the UFS or ZFS case. */ error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ? IO_SYNC : 0, file_data->cred); VOP_UNLOCK(xbb->vn, 0); vn_finished_write(mountpoint); break; } default: panic("invalid operation %d", operation); /* NOTREACHED */ } #ifdef XBB_USE_BOUNCE_BUFFERS /* We only need to copy here for read operations */ if (operation == BIO_READ) { for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, xiovec = file_data->saved_xiovecs; seg_idx < saved_uio_iovcnt; seg_idx++, xiovec++, p_vaddr++) { /* * Note that we have to use the copy of the * io vector we made above. uiomove() modifies * the uio and its referenced vector as uiomove * performs the copy, so we can't rely on any * state from the original uio. */ memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len); } } #endif /* XBB_USE_BOUNCE_BUFFERS */ bailout_send_response: if (error != 0) reqlist->status = BLKIF_RSP_ERROR; xbb_complete_reqlist(xbb, reqlist); return (0); } /*--------------------------- Backend Configuration --------------------------*/ /** * Close and cleanup any backend device/file specific state for this * block back instance. * * \param xbb Per-instance xbb configuration structure. */ static void xbb_close_backend(struct xbb_softc *xbb) { DROP_GIANT(); DPRINTF("closing dev=%s\n", xbb->dev_name); if (xbb->vn) { int flags = FREAD; if ((xbb->flags & XBBF_READ_ONLY) == 0) flags |= FWRITE; switch (xbb->device_type) { case XBB_TYPE_DISK: if (xbb->backend.dev.csw) { dev_relthread(xbb->backend.dev.cdev, xbb->backend.dev.dev_ref); xbb->backend.dev.csw = NULL; xbb->backend.dev.cdev = NULL; } break; case XBB_TYPE_FILE: break; case XBB_TYPE_NONE: default: panic("Unexpected backend type."); break; } (void)vn_close(xbb->vn, flags, NOCRED, curthread); xbb->vn = NULL; switch (xbb->device_type) { case XBB_TYPE_DISK: break; case XBB_TYPE_FILE: if (xbb->backend.file.cred != NULL) { crfree(xbb->backend.file.cred); xbb->backend.file.cred = NULL; } break; case XBB_TYPE_NONE: default: panic("Unexpected backend type."); break; } } PICKUP_GIANT(); } /** * Open a character device to be used for backend I/O. * * \param xbb Per-instance xbb configuration structure. * * \return 0 for success, errno codes for failure. 
*/ static int xbb_open_dev(struct xbb_softc *xbb) { struct vattr vattr; struct cdev *dev; struct cdevsw *devsw; int error; xbb->device_type = XBB_TYPE_DISK; xbb->dispatch_io = xbb_dispatch_dev; xbb->backend.dev.cdev = xbb->vn->v_rdev; xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev, &xbb->backend.dev.dev_ref); if (xbb->backend.dev.csw == NULL) panic("Unable to retrieve device switch"); error = VOP_GETATTR(xbb->vn, &vattr, NOCRED); if (error) { xenbus_dev_fatal(xbb->dev, error, "error getting " "vnode attributes for device %s", xbb->dev_name); return (error); } dev = xbb->vn->v_rdev; devsw = dev->si_devsw; if (!devsw->d_ioctl) { xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for " "device %s!", xbb->dev_name); return (ENODEV); } error = devsw->d_ioctl(dev, DIOCGSECTORSIZE, (caddr_t)&xbb->sector_size, FREAD, curthread); if (error) { xenbus_dev_fatal(xbb->dev, error, "error calling ioctl DIOCGSECTORSIZE " "for device %s", xbb->dev_name); return (error); } error = devsw->d_ioctl(dev, DIOCGMEDIASIZE, (caddr_t)&xbb->media_size, FREAD, curthread); if (error) { xenbus_dev_fatal(xbb->dev, error, "error calling ioctl DIOCGMEDIASIZE " "for device %s", xbb->dev_name); return (error); } return (0); } /** * Open a file to be used for backend I/O. * * \param xbb Per-instance xbb configuration structure. * * \return 0 for success, errno codes for failure. */ static int xbb_open_file(struct xbb_softc *xbb) { struct xbb_file_data *file_data; struct vattr vattr; int error; file_data = &xbb->backend.file; xbb->device_type = XBB_TYPE_FILE; xbb->dispatch_io = xbb_dispatch_file; error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred); if (error != 0) { xenbus_dev_fatal(xbb->dev, error, "error calling VOP_GETATTR()" "for file %s", xbb->dev_name); return (error); } /* * Verify that we have the ability to upgrade to exclusive * access on this file so we can trap errors at open instead * of reporting them during first access. */ if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) { vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY); if (xbb->vn->v_iflag & VI_DOOMED) { error = EBADF; xenbus_dev_fatal(xbb->dev, error, "error locking file %s", xbb->dev_name); return (error); } } file_data->cred = crhold(curthread->td_ucred); xbb->media_size = vattr.va_size; /* * XXX KDM vattr.va_blocksize may be larger than 512 bytes here. * With ZFS, it is 131072 bytes. Block sizes that large don't work * with disklabel and UFS on FreeBSD at least. Large block sizes * may not work with other OSes as well. So just export a sector * size of 512 bytes, which should work with any OS or * application. Since our backing is a file, any block size will * work fine for the backing store. */ #if 0 xbb->sector_size = vattr.va_blocksize; #endif xbb->sector_size = 512; /* * Sanity check. The media size has to be at least one * sector long. */ if (xbb->media_size < xbb->sector_size) { error = EINVAL; xenbus_dev_fatal(xbb->dev, error, "file %s size %ju < block size %u", xbb->dev_name, (uintmax_t)xbb->media_size, xbb->sector_size); } return (error); } /** * Open the backend provider for this connection. * * \param xbb Per-instance xbb configuration structure. * * \return 0 for success, errno codes for failure. 
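* \note If dev_name is not an absolute path and the initial open fails, "/dev/" is prepended to the name and the open is retried.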
*/ static int xbb_open_backend(struct xbb_softc *xbb) { struct nameidata nd; int flags; int error; flags = FREAD; error = 0; DPRINTF("opening dev=%s\n", xbb->dev_name); if (rootvnode == NULL) { xenbus_dev_fatal(xbb->dev, ENOENT, "Root file system not mounted"); return (ENOENT); } if ((xbb->flags & XBBF_READ_ONLY) == 0) flags |= FWRITE; pwd_ensure_dirs(); again: NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread); error = vn_open(&nd, &flags, 0, NULL); if (error) { /* * This is the only reasonable guess we can make as far as * path if the user doesn't give us a fully qualified path. * If they want to specify a file, they need to specify the * full path. */ if (xbb->dev_name[0] != '/') { char *dev_path = "/dev/"; char *dev_name; /* Try adding device path at beginning of name */ dev_name = malloc(strlen(xbb->dev_name) + strlen(dev_path) + 1, M_XENBLOCKBACK, M_NOWAIT); if (dev_name) { sprintf(dev_name, "%s%s", dev_path, xbb->dev_name); free(xbb->dev_name, M_XENBLOCKBACK); xbb->dev_name = dev_name; goto again; } } xenbus_dev_fatal(xbb->dev, error, "error opening device %s", xbb->dev_name); return (error); } NDFREE(&nd, NDF_ONLY_PNBUF); xbb->vn = nd.ni_vp; /* We only support disks and files. */ if (vn_isdisk(xbb->vn, &error)) { error = xbb_open_dev(xbb); } else if (xbb->vn->v_type == VREG) { error = xbb_open_file(xbb); } else { error = EINVAL; xenbus_dev_fatal(xbb->dev, error, "%s is not a disk " "or file", xbb->dev_name); } VOP_UNLOCK(xbb->vn, 0); if (error != 0) { xbb_close_backend(xbb); return (error); } xbb->sector_size_shift = fls(xbb->sector_size) - 1; xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift; DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n", (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file", xbb->dev_name, xbb->sector_size, xbb->media_size); return (0); } /*------------------------ Inter-Domain Communication ------------------------*/ /** * Free dynamically allocated KVA or pseudo-physical address allocations. * * \param xbb Per-instance xbb configuration structure. */ static void xbb_free_communication_mem(struct xbb_softc *xbb) { if (xbb->kva != 0) { if (xbb->pseudo_phys_res != NULL) { xenmem_free(xbb->dev, xbb->pseudo_phys_res_id, xbb->pseudo_phys_res); xbb->pseudo_phys_res = NULL; } } xbb->kva = 0; xbb->gnt_base_addr = 0; if (xbb->kva_free != NULL) { free(xbb->kva_free, M_XENBLOCKBACK); xbb->kva_free = NULL; } } /** * Cleanup all inter-domain communication mechanisms. * * \param xbb Per-instance xbb configuration structure. */ static int xbb_disconnect(struct xbb_softc *xbb) { struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES]; struct gnttab_unmap_grant_ref *op; u_int ring_idx; int error; DPRINTF("\n"); if ((xbb->flags & XBBF_RING_CONNECTED) == 0) return (0); xen_intr_unbind(&xbb->xen_intr_handle); mtx_unlock(&xbb->lock); taskqueue_drain(xbb->io_taskqueue, &xbb->io_task); mtx_lock(&xbb->lock); /* * No new interrupts can generate work, but we must wait * for all currently active requests to drain. 
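* Returning EAGAIN below lets xbb_shutdown() retry the disconnect once the last outstanding request has been released.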
*/ if (xbb->active_request_count != 0) return (EAGAIN); for (ring_idx = 0, op = ops; ring_idx < xbb->ring_config.ring_pages; ring_idx++, op++) { op->host_addr = xbb->ring_config.gnt_addr + (ring_idx * PAGE_SIZE); op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx]; op->handle = xbb->ring_config.handle[ring_idx]; } error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops, xbb->ring_config.ring_pages); if (error != 0) panic("Grant table op failed (%d)", error); xbb_free_communication_mem(xbb); if (xbb->requests != NULL) { free(xbb->requests, M_XENBLOCKBACK); xbb->requests = NULL; } if (xbb->request_lists != NULL) { struct xbb_xen_reqlist *reqlist; int i; /* There is one request list for ever allocated request. */ for (i = 0, reqlist = xbb->request_lists; i < xbb->max_requests; i++, reqlist++){ #ifdef XBB_USE_BOUNCE_BUFFERS if (reqlist->bounce != NULL) { free(reqlist->bounce, M_XENBLOCKBACK); reqlist->bounce = NULL; } #endif if (reqlist->gnt_handles != NULL) { free(reqlist->gnt_handles, M_XENBLOCKBACK); reqlist->gnt_handles = NULL; } } free(xbb->request_lists, M_XENBLOCKBACK); xbb->request_lists = NULL; } xbb->flags &= ~XBBF_RING_CONNECTED; return (0); } /** * Map shared memory ring into domain local address space, initialize * ring control structures, and bind an interrupt to the event channel * used to notify us of ring changes. * * \param xbb Per-instance xbb configuration structure. */ static int xbb_connect_ring(struct xbb_softc *xbb) { struct gnttab_map_grant_ref gnts[XBB_MAX_RING_PAGES]; struct gnttab_map_grant_ref *gnt; u_int ring_idx; int error; if ((xbb->flags & XBBF_RING_CONNECTED) != 0) return (0); /* * Kva for our ring is at the tail of the region of kva allocated * by xbb_alloc_communication_mem(). */ xbb->ring_config.va = xbb->kva + (xbb->kva_size - (xbb->ring_config.ring_pages * PAGE_SIZE)); xbb->ring_config.gnt_addr = xbb->gnt_base_addr + (xbb->kva_size - (xbb->ring_config.ring_pages * PAGE_SIZE)); for (ring_idx = 0, gnt = gnts; ring_idx < xbb->ring_config.ring_pages; ring_idx++, gnt++) { gnt->host_addr = xbb->ring_config.gnt_addr + (ring_idx * PAGE_SIZE); gnt->flags = GNTMAP_host_map; gnt->ref = xbb->ring_config.ring_ref[ring_idx]; gnt->dom = xbb->otherend_id; } error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts, xbb->ring_config.ring_pages); if (error) panic("blkback: Ring page grant table op failed (%d)", error); for (ring_idx = 0, gnt = gnts; ring_idx < xbb->ring_config.ring_pages; ring_idx++, gnt++) { if (gnt->status != 0) { xbb->ring_config.va = 0; xenbus_dev_fatal(xbb->dev, EACCES, "Ring shared page mapping failed. " "Status %d.", gnt->status); return (EACCES); } xbb->ring_config.handle[ring_idx] = gnt->handle; xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr; } /* Initialize the ring based on ABI. 
*/ switch (xbb->abi) { case BLKIF_PROTOCOL_NATIVE: { blkif_sring_t *sring; sring = (blkif_sring_t *)xbb->ring_config.va; BACK_RING_INIT(&xbb->rings.native, sring, xbb->ring_config.ring_pages * PAGE_SIZE); break; } case BLKIF_PROTOCOL_X86_32: { blkif_x86_32_sring_t *sring_x86_32; sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va; BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32, xbb->ring_config.ring_pages * PAGE_SIZE); break; } case BLKIF_PROTOCOL_X86_64: { blkif_x86_64_sring_t *sring_x86_64; sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va; BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64, xbb->ring_config.ring_pages * PAGE_SIZE); break; } default: panic("Unexpected blkif protocol ABI."); } xbb->flags |= XBBF_RING_CONNECTED; error = xen_intr_bind_remote_port(xbb->dev, xbb->otherend_id, xbb->ring_config.evtchn, xbb_filter, /*ithread_handler*/NULL, /*arg*/xbb, INTR_TYPE_BIO | INTR_MPSAFE, &xbb->xen_intr_handle); if (error) { (void)xbb_disconnect(xbb); xenbus_dev_fatal(xbb->dev, error, "binding event channel"); return (error); } DPRINTF("rings connected!\n"); return 0; } -/* Needed to make bit_alloc() macro work */ -#define calloc(count, size) malloc((count)*(size), M_XENBLOCKBACK, \ - M_NOWAIT|M_ZERO); - /** * Size KVA and pseudo-physical address allocations based on negotiated * values for the size and number of I/O requests, and the size of our * communication ring. * * \param xbb Per-instance xbb configuration structure. * * These address spaces are used to dynamically map pages in the * front-end's domain into our own. */ static int xbb_alloc_communication_mem(struct xbb_softc *xbb) { xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments; xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE; xbb->kva_size = xbb->reqlist_kva_size + (xbb->ring_config.ring_pages * PAGE_SIZE); - xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages); + xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages, M_XENBLOCKBACK, M_NOWAIT); if (xbb->kva_free == NULL) return (ENOMEM); DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n", device_get_nameunit(xbb->dev), xbb->kva_size, xbb->reqlist_kva_size); /* * Reserve a range of pseudo physical memory that we can map * into kva. These pages will only be backed by machine * pages ("real memory") during the lifetime of front-end requests * via grant table operations. */ xbb->pseudo_phys_res_id = 0; xbb->pseudo_phys_res = xenmem_alloc(xbb->dev, &xbb->pseudo_phys_res_id, xbb->kva_size); if (xbb->pseudo_phys_res == NULL) { xbb->kva = 0; return (ENOMEM); } xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res); xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res); DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n", device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva, (uintmax_t)xbb->gnt_base_addr); return (0); } /** * Collect front-end information from the XenStore. * * \param xbb Per-instance xbb configuration structure. */ static int xbb_collect_frontend_info(struct xbb_softc *xbb) { char protocol_abi[64]; const char *otherend_path; int error; u_int ring_idx; u_int ring_page_order; size_t ring_size; otherend_path = xenbus_get_otherend_path(xbb->dev); /* * Protocol defaults valid even if all negotiation fails. */ xbb->ring_config.ring_pages = 1; xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST; xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE; /* * Mandatory data (used in all versions of the protocol) first. 
*/ error = xs_scanf(XST_NIL, otherend_path, "event-channel", NULL, "%" PRIu32, &xbb->ring_config.evtchn); if (error != 0) { xenbus_dev_fatal(xbb->dev, error, "Unable to retrieve event-channel information " "from frontend %s. Unable to connect.", xenbus_get_otherend_path(xbb->dev)); return (error); } /* * These fields are initialized to legacy protocol defaults * so we only need to fail if reading the updated value succeeds * and the new value is outside of its allowed range. * * \note xs_gather() returns on the first encountered error, so * we must use independent calls in order to guarantee * we don't miss information in a sparsely populated front-end * tree. * * \note xs_scanf() does not update variables for unmatched * fields. */ ring_page_order = 0; xbb->max_requests = 32; (void)xs_scanf(XST_NIL, otherend_path, "ring-page-order", NULL, "%u", &ring_page_order); xbb->ring_config.ring_pages = 1 << ring_page_order; ring_size = PAGE_SIZE * xbb->ring_config.ring_pages; xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size); if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) { xenbus_dev_fatal(xbb->dev, EINVAL, "Front-end specified ring-pages of %u " "exceeds backend limit of %u. " "Unable to connect.", xbb->ring_config.ring_pages, XBB_MAX_RING_PAGES); return (EINVAL); } if (xbb->ring_config.ring_pages == 1) { error = xs_gather(XST_NIL, otherend_path, "ring-ref", "%" PRIu32, &xbb->ring_config.ring_ref[0], NULL); if (error != 0) { xenbus_dev_fatal(xbb->dev, error, "Unable to retrieve ring information " "from frontend %s. Unable to " "connect.", xenbus_get_otherend_path(xbb->dev)); return (error); } } else { /* Multi-page ring format. */ for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages; ring_idx++) { char ring_ref_name[] = "ring_refXX"; snprintf(ring_ref_name, sizeof(ring_ref_name), "ring-ref%u", ring_idx); error = xs_scanf(XST_NIL, otherend_path, ring_ref_name, NULL, "%" PRIu32, &xbb->ring_config.ring_ref[ring_idx]); if (error != 0) { xenbus_dev_fatal(xbb->dev, error, "Failed to retrieve grant " "reference for page %u of " "shared ring. Unable " "to connect.", ring_idx); return (error); } } } error = xs_gather(XST_NIL, otherend_path, "protocol", "%63s", protocol_abi, NULL); if (error != 0 || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) { /* * Assume native if the frontend has not * published ABI data or it has published and * matches our own ABI. */ xbb->abi = BLKIF_PROTOCOL_NATIVE; } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) { xbb->abi = BLKIF_PROTOCOL_X86_32; } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) { xbb->abi = BLKIF_PROTOCOL_X86_64; } else { xenbus_dev_fatal(xbb->dev, EINVAL, "Unknown protocol ABI (%s) published by " "frontend. Unable to connect.", protocol_abi); return (EINVAL); } return (0); } /** * Allocate per-request data structures given request size and number * information negotiated with the front-end. * * \param xbb Per-instance xbb configuration structure. */ static int xbb_alloc_requests(struct xbb_softc *xbb) { struct xbb_xen_req *req; struct xbb_xen_req *last_req; /* * Allocate request bookkeeping data structures.
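* One xbb_xen_req is preallocated for each of the max_requests ring slots negotiated with the front-end, so no request allocations are needed in the I/O path.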
*/ xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); if (xbb->requests == NULL) { xenbus_dev_fatal(xbb->dev, ENOMEM, "Unable to allocate request structures"); return (ENOMEM); } req = xbb->requests; last_req = &xbb->requests[xbb->max_requests - 1]; STAILQ_INIT(&xbb->request_free_stailq); while (req <= last_req) { STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links); req++; } return (0); } static int xbb_alloc_request_lists(struct xbb_softc *xbb) { struct xbb_xen_reqlist *reqlist; int i; /* * If no requests can be merged, we need 1 request list per * in flight request. */ xbb->request_lists = malloc(xbb->max_requests * sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); if (xbb->request_lists == NULL) { xenbus_dev_fatal(xbb->dev, ENOMEM, "Unable to allocate request list structures"); return (ENOMEM); } STAILQ_INIT(&xbb->reqlist_free_stailq); STAILQ_INIT(&xbb->reqlist_pending_stailq); for (i = 0; i < xbb->max_requests; i++) { int seg; reqlist = &xbb->request_lists[i]; reqlist->xbb = xbb; #ifdef XBB_USE_BOUNCE_BUFFERS reqlist->bounce = malloc(xbb->max_reqlist_size, M_XENBLOCKBACK, M_NOWAIT); if (reqlist->bounce == NULL) { xenbus_dev_fatal(xbb->dev, ENOMEM, "Unable to allocate request " "bounce buffers"); return (ENOMEM); } #endif /* XBB_USE_BOUNCE_BUFFERS */ reqlist->gnt_handles = malloc(xbb->max_reqlist_segments * sizeof(*reqlist->gnt_handles), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); if (reqlist->gnt_handles == NULL) { xenbus_dev_fatal(xbb->dev, ENOMEM, "Unable to allocate request " "grant references"); return (ENOMEM); } for (seg = 0; seg < xbb->max_reqlist_segments; seg++) reqlist->gnt_handles[seg] = GRANT_REF_INVALID; STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); } return (0); } /** * Supply information about the physical device to the frontend * via XenBus. * * \param xbb Per-instance xbb configuration structure. */ static int xbb_publish_backend_info(struct xbb_softc *xbb) { struct xs_transaction xst; const char *our_path; const char *leaf; int error; our_path = xenbus_get_node(xbb->dev); while (1) { error = xs_transaction_start(&xst); if (error != 0) { xenbus_dev_fatal(xbb->dev, error, "Error publishing backend info " "(start transaction)"); return (error); } leaf = "sectors"; error = xs_printf(xst, our_path, leaf, "%"PRIu64, xbb->media_num_sectors); if (error != 0) break; /* XXX Support all VBD attributes here. */ leaf = "info"; error = xs_printf(xst, our_path, leaf, "%u", xbb->flags & XBBF_READ_ONLY ? VDISK_READONLY : 0); if (error != 0) break; leaf = "sector-size"; error = xs_printf(xst, our_path, leaf, "%u", xbb->sector_size); if (error != 0) break; error = xs_transaction_end(xst, 0); if (error == 0) { return (0); } else if (error != EAGAIN) { xenbus_dev_fatal(xbb->dev, error, "ending transaction"); return (error); } } xenbus_dev_fatal(xbb->dev, error, "writing %s/%s", our_path, leaf); xs_transaction_end(xst, 1); return (error); } /** * Connect to our blkfront peer now that it has completed publishing * its configuration into the XenStore. * * \param xbb Per-instance xbb configuration structure. */ static void xbb_connect(struct xbb_softc *xbb) { int error; if (xenbus_get_state(xbb->dev) == XenbusStateConnected) return; if (xbb_collect_frontend_info(xbb) != 0) return; xbb->flags &= ~XBBF_SHUTDOWN; /* * We limit the maximum number of reqlist segments to the maximum * number of segments in the ring, or our absolute maximum, * whichever is smaller. 
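Editor's note: xbb_alloc_requests() and xbb_alloc_request_lists() above both seed STAILQ free lists that the I/O path later pops from. A self-contained sketch of that sys/queue.h pattern, with an illustrative element type:

#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct item {
	int			id;
	STAILQ_ENTRY(item)	links;
};
STAILQ_HEAD(item_list, item);

int
main(void)
{
	struct item_list free_list;
	struct item *items, *it;
	int i, n = 4;

	items = calloc(n, sizeof(*items));
	if (items == NULL)
		return (1);

	/* Seed the free list, mirroring xbb_alloc_requests(). */
	STAILQ_INIT(&free_list);
	for (i = 0; i < n; i++) {
		items[i].id = i;
		STAILQ_INSERT_TAIL(&free_list, &items[i], links);
	}

	/* Pop one element, as the I/O path would. */
	it = STAILQ_FIRST(&free_list);
	STAILQ_REMOVE_HEAD(&free_list, links);
	printf("allocated item %d\n", it->id);

	free(items);
	return (0);
}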
*/ xbb->max_reqlist_segments = MIN(xbb->max_request_segments * xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST); /* * The maximum size is simply a function of the number of segments * we can handle. */ xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE; /* Allocate resources whose size depends on front-end configuration. */ error = xbb_alloc_communication_mem(xbb); if (error != 0) { xenbus_dev_fatal(xbb->dev, error, "Unable to allocate communication memory"); return; } error = xbb_alloc_requests(xbb); if (error != 0) { /* Specific errors are reported by xbb_alloc_requests(). */ return; } error = xbb_alloc_request_lists(xbb); if (error != 0) { /* Specific errors are reported by xbb_alloc_request_lists(). */ return; } /* * Connect communication channel. */ error = xbb_connect_ring(xbb); if (error != 0) { /* Specific errors are reported by xbb_connect_ring(). */ return; } if (xbb_publish_backend_info(xbb) != 0) { /* * If we can't publish our data, we cannot participate * in this connection, and waiting for a front-end state * change will not help the situation. */ (void)xbb_disconnect(xbb); return; } /* Ready for I/O. */ xenbus_set_state(xbb->dev, XenbusStateConnected); } /*-------------------------- Device Teardown Support -------------------------*/ /** * Perform device shutdown functions. * * \param xbb Per-instance xbb configuration structure. * * Mark this instance as shutting down, wait for any active I/O on the * backend device/file to drain, disconnect from the front-end, and notify * any waiters (e.g. a thread invoking our detach method) that detach can * now proceed. */ static int xbb_shutdown(struct xbb_softc *xbb) { XenbusState frontState; int error; DPRINTF("\n"); /* * Due to the need to drop our mutex during some * xenbus operations, it is possible for two threads * to attempt to close out shutdown processing at * the same time. Tell the caller that hits this * race to try back later. */ if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0) return (EAGAIN); xbb->flags |= XBBF_IN_SHUTDOWN; mtx_unlock(&xbb->lock); if (xenbus_get_state(xbb->dev) < XenbusStateClosing) xenbus_set_state(xbb->dev, XenbusStateClosing); frontState = xenbus_get_otherend_state(xbb->dev); mtx_lock(&xbb->lock); xbb->flags &= ~XBBF_IN_SHUTDOWN; /* The front can submit I/O until entering the closed state. */ if (frontState < XenbusStateClosed) return (EAGAIN); DPRINTF("\n"); /* Indicate shutdown is in progress. */ xbb->flags |= XBBF_SHUTDOWN; /* Disconnect from the front-end. */ error = xbb_disconnect(xbb); if (error != 0) { /* * Requests still outstanding. We'll be called again * once they complete. */ KASSERT(error == EAGAIN, ("%s: Unexpected xbb_disconnect() failure %d", __func__, error)); return (error); } DPRINTF("\n"); /* Indicate to xbb_detach() that is it safe to proceed. */ wakeup(xbb); return (0); } /** * Report an attach time error to the console and Xen, and cleanup * this instance by forcing immediate detach processing. * * \param xbb Per-instance xbb configuration structure. * \param err Errno describing the error. * \param fmt Printf style format and arguments */ static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...) 
{ va_list ap; va_list ap_hotplug; va_start(ap, fmt); va_copy(ap_hotplug, ap); xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev), "hotplug-error", fmt, ap_hotplug); va_end(ap_hotplug); xs_printf(XST_NIL, xenbus_get_node(xbb->dev), "hotplug-status", "error"); xenbus_dev_vfatal(xbb->dev, err, fmt, ap); va_end(ap); xs_printf(XST_NIL, xenbus_get_node(xbb->dev), "online", "0"); xbb_detach(xbb->dev); } /*---------------------------- NewBus Entrypoints ----------------------------*/ /** * Inspect a XenBus device and claim it if is of the appropriate type. * * \param dev NewBus device object representing a candidate XenBus device. * * \return 0 for success, errno codes for failure. */ static int xbb_probe(device_t dev) { if (!strcmp(xenbus_get_type(dev), "vbd")) { device_set_desc(dev, "Backend Virtual Block Device"); device_quiet(dev); return (0); } return (ENXIO); } /** * Setup sysctl variables to control various Block Back parameters. * * \param xbb Xen Block Back softc. * */ static void xbb_setup_sysctl(struct xbb_softc *xbb) { struct sysctl_ctx_list *sysctl_ctx = NULL; struct sysctl_oid *sysctl_tree = NULL; sysctl_ctx = device_get_sysctl_ctx(xbb->dev); if (sysctl_ctx == NULL) return; sysctl_tree = device_get_sysctl_tree(xbb->dev); if (sysctl_tree == NULL) return; SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0, "fake the flush command"); SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0, "send a real flush for N flush requests"); SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0, "Don't coalesce contiguous requests"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "reqs_received", CTLFLAG_RW, &xbb->reqs_received, "how many I/O requests we have received"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed, "how many I/O requests have been completed"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "reqs_queued_for_completion", CTLFLAG_RW, &xbb->reqs_queued_for_completion, "how many I/O requests queued but not yet pushed"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "reqs_completed_with_error", CTLFLAG_RW, &xbb->reqs_completed_with_error, "how many I/O requests completed with error status"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch, "how many I/O dispatches were forced"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch, "how many I/O dispatches were normal"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch, "total number of I/O dispatches"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages, "how many times we have run out of KVA"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "request_shortages", CTLFLAG_RW, &xbb->request_shortages, "how many times we have run out of requests"); SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "max_requests", CTLFLAG_RD, &xbb->max_requests, 0, "maximum outstanding requests (negotiated)"); SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "max_request_segments", 
CTLFLAG_RD, &xbb->max_request_segments, 0, "maximum number of pages per requests (negotiated)"); SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "max_request_size", CTLFLAG_RD, &xbb->max_request_size, 0, "maximum size in bytes of a request (negotiated)"); SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "ring_pages", CTLFLAG_RD, &xbb->ring_config.ring_pages, 0, "communication channel pages (negotiated)"); } /** * Attach to a XenBus device that has been claimed by our probe routine. * * \param dev NewBus device object representing this Xen Block Back instance. * * \return 0 for success, errno codes for failure. */ static int xbb_attach(device_t dev) { struct xbb_softc *xbb; int error; u_int max_ring_page_order; DPRINTF("Attaching to %s\n", xenbus_get_node(dev)); /* * Basic initialization. * After this block it is safe to call xbb_detach() * to clean up any allocated data for this instance. */ xbb = device_get_softc(dev); xbb->dev = dev; xbb->otherend_id = xenbus_get_otherend_id(dev); TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb); mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF); /* * Publish protocol capabilities for consumption by the * front-end. */ error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), "feature-barrier", "1"); if (error) { xbb_attach_failed(xbb, error, "writing %s/feature-barrier", xenbus_get_node(xbb->dev)); return (error); } error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), "feature-flush-cache", "1"); if (error) { xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache", xenbus_get_node(xbb->dev)); return (error); } max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1; error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), "max-ring-page-order", "%u", max_ring_page_order); if (error) { xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order", xenbus_get_node(xbb->dev)); return (error); } /* Collect physical device information. */ error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev), "device-type", NULL, &xbb->dev_type, NULL); if (error != 0) xbb->dev_type = NULL; error = xs_gather(XST_NIL, xenbus_get_node(dev), "mode", NULL, &xbb->dev_mode, "params", NULL, &xbb->dev_name, NULL); if (error != 0) { xbb_attach_failed(xbb, error, "reading backend fields at %s", xenbus_get_node(dev)); return (ENXIO); } /* Parse fopen style mode flags. */ if (strchr(xbb->dev_mode, 'w') == NULL) xbb->flags |= XBBF_READ_ONLY; /* * Verify the physical device is present and can support * the desired I/O mode. */ DROP_GIANT(); error = xbb_open_backend(xbb); PICKUP_GIANT(); if (error != 0) { xbb_attach_failed(xbb, error, "Unable to open %s", xbb->dev_name); return (ENXIO); } /* Use devstat(9) for recording statistics. */ xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev), xbb->sector_size, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER, DEVSTAT_PRIORITY_OTHER); xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev), xbb->sector_size, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER, DEVSTAT_PRIORITY_OTHER); /* * Setup sysctl variables. */ xbb_setup_sysctl(xbb); /* * Create a taskqueue for doing work that must occur from a * thread context. 
*/ xbb->io_taskqueue = taskqueue_create_fast(device_get_nameunit(dev), M_NOWAIT, taskqueue_thread_enqueue, /*contxt*/&xbb->io_taskqueue); if (xbb->io_taskqueue == NULL) { xbb_attach_failed(xbb, error, "Unable to create taskqueue"); return (ENOMEM); } taskqueue_start_threads(&xbb->io_taskqueue, /*num threads*/1, /*priority*/PWAIT, /*thread name*/ "%s taskq", device_get_nameunit(dev)); /* Update hot-plug status to satisfy xend. */ error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), "hotplug-status", "connected"); if (error) { xbb_attach_failed(xbb, error, "writing %s/hotplug-status", xenbus_get_node(xbb->dev)); return (error); } /* Tell the front end that we are ready to connect. */ xenbus_set_state(dev, XenbusStateInitWait); return (0); } /** * Detach from a block back device instance. * * \param dev NewBus device object representing this Xen Block Back instance. * * \return 0 for success, errno codes for failure. * * \note A block back device may be detached at any time in its life-cycle, * including part way through the attach process. For this reason, * initialization order and the initialization state checks in this * routine must be carefully coupled so that attach time failures * are gracefully handled. */ static int xbb_detach(device_t dev) { struct xbb_softc *xbb; DPRINTF("\n"); xbb = device_get_softc(dev); mtx_lock(&xbb->lock); while (xbb_shutdown(xbb) == EAGAIN) { msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0, "xbb_shutdown", 0); } mtx_unlock(&xbb->lock); DPRINTF("\n"); if (xbb->io_taskqueue != NULL) taskqueue_free(xbb->io_taskqueue); if (xbb->xbb_stats != NULL) devstat_remove_entry(xbb->xbb_stats); if (xbb->xbb_stats_in != NULL) devstat_remove_entry(xbb->xbb_stats_in); xbb_close_backend(xbb); if (xbb->dev_mode != NULL) { free(xbb->dev_mode, M_XENSTORE); xbb->dev_mode = NULL; } if (xbb->dev_type != NULL) { free(xbb->dev_type, M_XENSTORE); xbb->dev_type = NULL; } if (xbb->dev_name != NULL) { free(xbb->dev_name, M_XENSTORE); xbb->dev_name = NULL; } mtx_destroy(&xbb->lock); return (0); } /** * Prepare this block back device for suspension of this VM. * * \param dev NewBus device object representing this Xen Block Back instance. * * \return 0 for success, errno codes for failure. */ static int xbb_suspend(device_t dev) { #ifdef NOT_YET struct xbb_softc *sc = device_get_softc(dev); /* Prevent new requests being issued until we fix things up. */ mtx_lock(&sc->xb_io_lock); sc->connected = BLKIF_STATE_SUSPENDED; mtx_unlock(&sc->xb_io_lock); #endif return (0); } /** * Perform any processing required to recover from a suspended state. * * \param dev NewBus device object representing this Xen Block Back instance. * * \return 0 for success, errno codes for failure. */ static int xbb_resume(device_t dev) { return (0); } /** * Handle state changes expressed via the XenStore by our front-end peer. * * \param dev NewBus device object representing this Xen * Block Back instance. * \param frontend_state The new state of the front-end. * * \return 0 for success, errno codes for failure. 
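Editor's note: xbb_detach() above loops in msleep(9) for as long as xbb_shutdown() reports EAGAIN, and xbb_shutdown() ends with wakeup() once the last reference drains. A hedged sketch of that sleep/wakeup rendezvous; the softc, field and wait-message names are illustrative, and in a real driver the completion path would re-run the shutdown check once the outstanding work count reaches zero.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/lock.h>
#include <sys/mutex.h>

struct example_softc {
	struct mtx	lock;
	int		busy_io;	/* outstanding work */
};

/* Called with sc->lock held; returns EAGAIN while work is still draining. */
static int
example_shutdown(struct example_softc *sc)
{
	if (sc->busy_io != 0)
		return (EAGAIN);
	wakeup(sc);			/* release anyone parked in detach */
	return (0);
}

static void
example_detach(struct example_softc *sc)
{
	mtx_lock(&sc->lock);
	/* Sleep until a later example_shutdown() call issues the wakeup. */
	while (example_shutdown(sc) == EAGAIN)
		msleep(sc, &sc->lock, /*priority*/0, "exshut", /*timo*/0);
	mtx_unlock(&sc->lock);
}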
*/ static void xbb_frontend_changed(device_t dev, XenbusState frontend_state) { struct xbb_softc *xbb = device_get_softc(dev); DPRINTF("frontend_state=%s, xbb_state=%s\n", xenbus_strstate(frontend_state), xenbus_strstate(xenbus_get_state(xbb->dev))); switch (frontend_state) { case XenbusStateInitialising: break; case XenbusStateInitialised: case XenbusStateConnected: xbb_connect(xbb); break; case XenbusStateClosing: case XenbusStateClosed: mtx_lock(&xbb->lock); xbb_shutdown(xbb); mtx_unlock(&xbb->lock); if (frontend_state == XenbusStateClosed) xenbus_set_state(xbb->dev, XenbusStateClosed); break; default: xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend", frontend_state); break; } } /*---------------------------- NewBus Registration ---------------------------*/ static device_method_t xbb_methods[] = { /* Device interface */ DEVMETHOD(device_probe, xbb_probe), DEVMETHOD(device_attach, xbb_attach), DEVMETHOD(device_detach, xbb_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, xbb_suspend), DEVMETHOD(device_resume, xbb_resume), /* Xenbus interface */ DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed), { 0, 0 } }; static driver_t xbb_driver = { "xbbd", xbb_methods, sizeof(struct xbb_softc), }; devclass_t xbb_devclass; DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0); Index: head/sys/kern/subr_unit.c =================================================================== --- head/sys/kern/subr_unit.c (revision 299089) +++ head/sys/kern/subr_unit.c (revision 299090) @@ -1,1060 +1,1061 @@ /*- * Copyright (c) 2004 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * * * Unit number allocation functions. * * These functions implement a mixed run-length/bitmap management of unit * number spaces in a very memory efficient manner. * * Allocation policy is always lowest free number first. * * A return value of -1 signals that no more unit numbers are available. * * There is no cost associated with the range of unitnumbers, so unless * the resource really is finite, specify INT_MAX to new_unrhdr() and * forget about checking the return value. * * If a mutex is not provided when the unit number space is created, a * default global mutex is used. 
The advantage to passing a mutex in, is * that the alloc_unrl() function can be called with the mutex already * held (it will not be released by alloc_unrl()). * * The allocation function alloc_unr{l}() never sleeps (but it may block on * the mutex of course). * * Freeing a unit number may require allocating memory, and can therefore * sleep so the free_unr() function does not come in a pre-locked variant. * * A userland test program is included. * * Memory usage is a very complex function of the exact allocation * pattern, but always very compact: * * For the very typical case where a single unbroken run of unit * numbers are allocated 44 bytes are used on i386. * * For a unit number space of 1000 units and the random pattern * in the usermode test program included, the worst case usage * was 252 bytes on i386 for 500 allocated and 500 free units. * * For a unit number space of 10000 units and the random pattern * in the usermode test program included, the worst case usage * was 798 bytes on i386 for 5000 allocated and 5000 free units. * * The worst case is where every other unit number is allocated and * the rest are free. In that case 44 + N/4 bytes are used where * N is the number of the highest unit allocated. */ +#include #include #include #ifdef _KERNEL #include -#include #include #include #include #include #include #include /* * In theory it would be smarter to allocate the individual blocks * with the zone allocator, but at this time the expectation is that * there will typically not even be enough allocations to fill a single * page, so we stick with malloc for now. */ static MALLOC_DEFINE(M_UNIT, "Unitno", "Unit number allocation"); #define Malloc(foo) malloc(foo, M_UNIT, M_WAITOK | M_ZERO) #define Free(foo) free(foo, M_UNIT) static struct mtx unitmtx; MTX_SYSINIT(unit, &unitmtx, "unit# allocation", MTX_DEF); #else /* ...USERLAND */ #include #include #include #include #include #include #include #include #define KASSERT(cond, arg) \ do { \ if (!(cond)) { \ printf arg; \ abort(); \ } \ } while (0) static int no_alloc; #define Malloc(foo) _Malloc(foo, __LINE__) static void * _Malloc(size_t foo, int line) { KASSERT(no_alloc == 0, ("malloc in wrong place() line %d", line)); return (calloc(foo, 1)); } #define Free(foo) free(foo) struct unrhdr; struct mtx { int state; } unitmtx; static void mtx_lock(struct mtx *mp) { KASSERT(mp->state == 0, ("mutex already locked")); mp->state = 1; } static void mtx_unlock(struct mtx *mp) { KASSERT(mp->state == 1, ("mutex not locked")); mp->state = 0; } #define MA_OWNED 9 static void mtx_assert(struct mtx *mp, int flag) { if (flag == MA_OWNED) { KASSERT(mp->state == 1, ("mtx_assert(MA_OWNED) not true")); } } #define CTASSERT(foo) #define WITNESS_WARN(flags, lock, fmt, ...) (void)0 #endif /* USERLAND */ /* * This is our basic building block. * * It can be used in three different ways depending on the value of the ptr * element: * If ptr is NULL, it represents a run of free items. * If ptr points to the unrhdr it represents a run of allocated items. - * Otherwise it points to an bitstring of allocated items. + * Otherwise it points to a bitstring of allocated items. * * For runs the len field is the length of the run. * For bitmaps the len field represents the number of allocated items. * * The bitmap is the same size as struct unr to optimize memory management. 
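Editor's note: the block comment above spells out the unr(9) contract: allocation is lowest-free-first, -1 means the space is exhausted, alloc_unr() never sleeps, and free_unr() may. A minimal usage sketch of that KPI as implemented in this file; the variable and function names are illustrative.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/limits.h>

static struct unrhdr *example_unit_space;

static void
example_init(void)
{
	/* Range is effectively unbounded; allocation is lowest-free-first. */
	example_unit_space = new_unrhdr(0, INT_MAX, NULL);
}

static int
example_new_instance(void)
{
	int unit;

	unit = alloc_unr(example_unit_space);	/* -1 when exhausted */
	return (unit);
}

static void
example_destroy_instance(int unit)
{
	free_unr(example_unit_space, unit);	/* may sleep */
}

Passing NULL as the mutex selects the shared global unitmtx, as described in the comment above.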
*/ struct unr { TAILQ_ENTRY(unr) list; u_int len; void *ptr; }; struct unrb { - u_char busy; - bitstr_t map[sizeof(struct unr) - 1]; + bitstr_t map[sizeof(struct unr) / sizeof(bitstr_t)]; }; -CTASSERT(sizeof(struct unr) == sizeof(struct unrb)); +CTASSERT((sizeof(struct unr) % sizeof(bitstr_t)) == 0); -/* Number of bits in the bitmap */ -#define NBITS ((int)sizeof(((struct unrb *)NULL)->map) * 8) +/* Number of bits we can store in the bitmap */ +#define NBITS (8 * sizeof(((struct unrb*)NULL)->map)) +/* Is the unrb empty in at least the first len bits? */ +static inline bool +ub_empty(struct unrb *ub, int len) { + int first_set; + + bit_ffs(ub->map, len, &first_set); + return (first_set == -1); +} + +/* Is the unrb full? That is, is the number of set elements equal to len? */ +static inline bool +ub_full(struct unrb *ub, int len) +{ + int first_clear; + + bit_ffc(ub->map, len, &first_clear); + return (first_clear == -1); +} + + #if defined(DIAGNOSTIC) || !defined(_KERNEL) /* * Consistency check function. * * Checks the internal consistency as well as we can. * * Called at all boundaries of this API. */ static void check_unrhdr(struct unrhdr *uh, int line) { struct unr *up; struct unrb *ub; u_int x, y, z, w; y = uh->first; z = 0; TAILQ_FOREACH(up, &uh->head, list) { z++; if (up->ptr != uh && up->ptr != NULL) { ub = up->ptr; KASSERT (up->len <= NBITS, - ("UNR inconsistency: len %u max %d (line %d)\n", + ("UNR inconsistency: len %u max %zd (line %d)\n", up->len, NBITS, line)); z++; w = 0; for (x = 0; x < up->len; x++) if (bit_test(ub->map, x)) w++; - KASSERT (w == ub->busy, - ("UNR inconsistency: busy %u found %u (line %d)\n", - ub->busy, w, line)); y += w; } else if (up->ptr != NULL) y += up->len; } KASSERT (y == uh->busy, ("UNR inconsistency: items %u found %u (line %d)\n", uh->busy, y, line)); KASSERT (z == uh->alloc, ("UNR inconsistency: chunks %u found %u (line %d)\n", uh->alloc, z, line)); } #else static __inline void -check_unrhdr(struct unrhdr *uh, int line) +check_unrhdr(struct unrhdr *uh __unused, int line __unused) { } #endif /* * Userland memory management. Just use calloc and keep track of how * many elements we have allocated for check_unrhdr(). */ static __inline void * new_unr(struct unrhdr *uh, void **p1, void **p2) { void *p; uh->alloc++; KASSERT(*p1 != NULL || *p2 != NULL, ("Out of cached memory")); if (*p1 != NULL) { p = *p1; *p1 = NULL; return (p); } else { p = *p2; *p2 = NULL; return (p); } } static __inline void delete_unr(struct unrhdr *uh, void *ptr) { struct unr *up; uh->alloc--; up = ptr; TAILQ_INSERT_TAIL(&uh->ppfree, up, list); } void clean_unrhdrl(struct unrhdr *uh) { struct unr *up; mtx_assert(uh->mtx, MA_OWNED); while ((up = TAILQ_FIRST(&uh->ppfree)) != NULL) { TAILQ_REMOVE(&uh->ppfree, up, list); mtx_unlock(uh->mtx); Free(up); mtx_lock(uh->mtx); } } void clean_unrhdr(struct unrhdr *uh) { mtx_lock(uh->mtx); clean_unrhdrl(uh); mtx_unlock(uh->mtx); } void init_unrhdr(struct unrhdr *uh, int low, int high, struct mtx *mutex) { KASSERT(low >= 0 && low <= high, ("UNR: use error: new_unrhdr(%d, %d)", low, high)); if (mutex != NULL) uh->mtx = mutex; else uh->mtx = &unitmtx; TAILQ_INIT(&uh->head); TAILQ_INIT(&uh->ppfree); uh->low = low; uh->high = high; uh->first = 0; uh->last = 1 + (high - low); check_unrhdr(uh, __LINE__); } /* * Allocate a new unrheader set. * * Highest and lowest valid values given as parameters. 
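Editor's note: the new ub_empty()/ub_full() helpers above replace the removed per-bitmap busy counter with a scan of the map via bit_ffs(3)/bit_ffc(3). The same idea, demonstrated standalone on a stack-declared bitstring in userland; the helper names below are local to the sketch.

#include <sys/types.h>
#include <bitstring.h>
#include <stdbool.h>
#include <stdio.h>

/* Empty iff no bit is set in the first len bits. */
static bool
map_empty(bitstr_t *map, int len)
{
	int first_set;

	bit_ffs(map, len, &first_set);
	return (first_set == -1);
}

/* Full iff no bit is clear in the first len bits. */
static bool
map_full(bitstr_t *map, int len)
{
	int first_clear;

	bit_ffc(map, len, &first_clear);
	return (first_clear == -1);
}

int
main(void)
{
	bitstr_t bit_decl(map, 8);

	bit_nclear(map, 0, 7);
	printf("empty=%d full=%d\n", map_empty(map, 8), map_full(map, 8));
	bit_nset(map, 0, 7);
	printf("empty=%d full=%d\n", map_empty(map, 8), map_full(map, 8));
	return (0);
}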
*/ struct unrhdr * new_unrhdr(int low, int high, struct mtx *mutex) { struct unrhdr *uh; uh = Malloc(sizeof *uh); init_unrhdr(uh, low, high, mutex); return (uh); } void delete_unrhdr(struct unrhdr *uh) { check_unrhdr(uh, __LINE__); KASSERT(uh->busy == 0, ("unrhdr has %u allocations", uh->busy)); KASSERT(uh->alloc == 0, ("UNR memory leak in delete_unrhdr")); KASSERT(TAILQ_FIRST(&uh->ppfree) == NULL, ("unrhdr has postponed item for free")); Free(uh); } static __inline int is_bitmap(struct unrhdr *uh, struct unr *up) { return (up->ptr != uh && up->ptr != NULL); } /* * Look for sequence of items which can be combined into a bitmap, if * multiple are present, take the one which saves most memory. * * Return (1) if a sequence was found to indicate that another call * might be able to do more. Return (0) if we found no suitable sequence. * * NB: called from alloc_unr(), no new memory allocation allowed. */ static int optimize_unr(struct unrhdr *uh) { struct unr *up, *uf, *us; struct unrb *ub, *ubf; u_int a, l, ba; /* * Look for the run of items (if any) which when collapsed into * a bitmap would save most memory. */ us = NULL; ba = 0; TAILQ_FOREACH(uf, &uh->head, list) { if (uf->len >= NBITS) continue; a = 1; if (is_bitmap(uh, uf)) a++; l = uf->len; up = uf; while (1) { up = TAILQ_NEXT(up, list); if (up == NULL) break; if ((up->len + l) > NBITS) break; a++; if (is_bitmap(uh, up)) a++; l += up->len; } if (a > ba) { ba = a; us = uf; } } if (ba < 3) return (0); /* * If the first element is not a bitmap, make it one. * Trying to do so without allocating more memory complicates things * a bit */ if (!is_bitmap(uh, us)) { uf = TAILQ_NEXT(us, list); TAILQ_REMOVE(&uh->head, us, list); a = us->len; l = us->ptr == uh ? 1 : 0; ub = (void *)us; - ub->busy = 0; - if (l) { + bit_nclear(ub->map, 0, NBITS - 1); + if (l) bit_nset(ub->map, 0, a); - ub->busy += a; - } else { - bit_nclear(ub->map, 0, a); - } if (!is_bitmap(uh, uf)) { - if (uf->ptr == NULL) { + if (uf->ptr == NULL) bit_nclear(ub->map, a, a + uf->len - 1); - } else { + else bit_nset(ub->map, a, a + uf->len - 1); - ub->busy += uf->len; - } uf->ptr = ub; uf->len += a; us = uf; } else { ubf = uf->ptr; for (l = 0; l < uf->len; l++, a++) { - if (bit_test(ubf->map, l)) { + if (bit_test(ubf->map, l)) bit_set(ub->map, a); - ub->busy++; - } else { + else bit_clear(ub->map, a); - } } uf->len = a; delete_unr(uh, uf->ptr); uf->ptr = ub; us = uf; } } ub = us->ptr; while (1) { uf = TAILQ_NEXT(us, list); if (uf == NULL) return (1); if (uf->len + us->len > NBITS) return (1); if (uf->ptr == NULL) { bit_nclear(ub->map, us->len, us->len + uf->len - 1); us->len += uf->len; TAILQ_REMOVE(&uh->head, uf, list); delete_unr(uh, uf); } else if (uf->ptr == uh) { bit_nset(ub->map, us->len, us->len + uf->len - 1); - ub->busy += uf->len; us->len += uf->len; TAILQ_REMOVE(&uh->head, uf, list); delete_unr(uh, uf); } else { ubf = uf->ptr; for (l = 0; l < uf->len; l++, us->len++) { - if (bit_test(ubf->map, l)) { + if (bit_test(ubf->map, l)) bit_set(ub->map, us->len); - ub->busy++; - } else { + else bit_clear(ub->map, us->len); - } } TAILQ_REMOVE(&uh->head, uf, list); delete_unr(uh, ubf); delete_unr(uh, uf); } } } /* * See if a given unr should be collapsed with a neighbor. * * NB: called from alloc_unr(), no new memory allocation allowed. 
*/ static void collapse_unr(struct unrhdr *uh, struct unr *up) { struct unr *upp; struct unrb *ub; /* If bitmap is all set or clear, change it to runlength */ if (is_bitmap(uh, up)) { ub = up->ptr; - if (ub->busy == up->len) { + if (ub_full(ub, up->len)) { delete_unr(uh, up->ptr); up->ptr = uh; - } else if (ub->busy == 0) { + } else if (ub_empty(ub, up->len)) { delete_unr(uh, up->ptr); up->ptr = NULL; } } /* If nothing left in runlength, delete it */ if (up->len == 0) { upp = TAILQ_PREV(up, unrhd, list); if (upp == NULL) upp = TAILQ_NEXT(up, list); TAILQ_REMOVE(&uh->head, up, list); delete_unr(uh, up); up = upp; } /* If we have "hot-spot" still, merge with neighbor if possible */ if (up != NULL) { upp = TAILQ_PREV(up, unrhd, list); if (upp != NULL && up->ptr == upp->ptr) { up->len += upp->len; TAILQ_REMOVE(&uh->head, upp, list); delete_unr(uh, upp); } upp = TAILQ_NEXT(up, list); if (upp != NULL && up->ptr == upp->ptr) { up->len += upp->len; TAILQ_REMOVE(&uh->head, upp, list); delete_unr(uh, upp); } } /* Merge into ->first if possible */ upp = TAILQ_FIRST(&uh->head); if (upp != NULL && upp->ptr == uh) { uh->first += upp->len; TAILQ_REMOVE(&uh->head, upp, list); delete_unr(uh, upp); if (up == upp) up = NULL; } /* Merge into ->last if possible */ upp = TAILQ_LAST(&uh->head, unrhd); if (upp != NULL && upp->ptr == NULL) { uh->last += upp->len; TAILQ_REMOVE(&uh->head, upp, list); delete_unr(uh, upp); if (up == upp) up = NULL; } /* Try to make bitmaps */ while (optimize_unr(uh)) continue; } /* * Allocate a free unr. */ int alloc_unrl(struct unrhdr *uh) { struct unr *up; struct unrb *ub; u_int x; int y; mtx_assert(uh->mtx, MA_OWNED); check_unrhdr(uh, __LINE__); x = uh->low + uh->first; up = TAILQ_FIRST(&uh->head); /* * If we have an ideal split, just adjust the first+last */ if (up == NULL && uh->last > 0) { uh->first++; uh->last--; uh->busy++; return (x); } /* * We can always allocate from the first list element, so if we have * nothing on the list, we must have run out of unit numbers. */ if (up == NULL) return (-1); KASSERT(up->ptr != uh, ("UNR first element is allocated")); if (up->ptr == NULL) { /* free run */ uh->first++; up->len--; } else { /* bitmap */ ub = up->ptr; - KASSERT(ub->busy < up->len, ("UNR bitmap confusion")); bit_ffc(ub->map, up->len, &y); KASSERT(y != -1, ("UNR corruption: No clear bit in bitmap.")); bit_set(ub->map, y); - ub->busy++; x += y; } uh->busy++; collapse_unr(uh, up); return (x); } int alloc_unr(struct unrhdr *uh) { int i; mtx_lock(uh->mtx); i = alloc_unrl(uh); clean_unrhdrl(uh); mtx_unlock(uh->mtx); return (i); } static int alloc_unr_specificl(struct unrhdr *uh, u_int item, void **p1, void **p2) { struct unr *up, *upn; struct unrb *ub; u_int i, last, tl; mtx_assert(uh->mtx, MA_OWNED); if (item < uh->low + uh->first || item > uh->high) return (-1); up = TAILQ_FIRST(&uh->head); /* Ideal split. */ if (up == NULL && item - uh->low == uh->first) { uh->first++; uh->last--; uh->busy++; check_unrhdr(uh, __LINE__); return (item); } i = item - uh->low - uh->first; if (up == NULL) { up = new_unr(uh, p1, p2); up->ptr = NULL; up->len = i; TAILQ_INSERT_TAIL(&uh->head, up, list); up = new_unr(uh, p1, p2); up->ptr = uh; up->len = 1; TAILQ_INSERT_TAIL(&uh->head, up, list); uh->last = uh->high - uh->low - i; uh->busy++; check_unrhdr(uh, __LINE__); return (item); } else { /* Find the item which contains the unit we want to allocate. 
*/ TAILQ_FOREACH(up, &uh->head, list) { if (up->len > i) break; i -= up->len; } } if (up == NULL) { if (i > 0) { up = new_unr(uh, p1, p2); up->ptr = NULL; up->len = i; TAILQ_INSERT_TAIL(&uh->head, up, list); } up = new_unr(uh, p1, p2); up->ptr = uh; up->len = 1; TAILQ_INSERT_TAIL(&uh->head, up, list); goto done; } if (is_bitmap(uh, up)) { ub = up->ptr; if (bit_test(ub->map, i) == 0) { bit_set(ub->map, i); - ub->busy++; goto done; } else return (-1); } else if (up->ptr == uh) return (-1); KASSERT(up->ptr == NULL, ("alloc_unr_specificl: up->ptr != NULL (up=%p)", up)); /* Split off the tail end, if any. */ tl = up->len - (1 + i); if (tl > 0) { upn = new_unr(uh, p1, p2); upn->ptr = NULL; upn->len = tl; TAILQ_INSERT_AFTER(&uh->head, up, upn, list); } /* Split off head end, if any */ if (i > 0) { upn = new_unr(uh, p1, p2); upn->len = i; upn->ptr = NULL; TAILQ_INSERT_BEFORE(up, upn, list); } up->len = 1; up->ptr = uh; done: last = uh->high - uh->low - (item - uh->low); if (uh->last > last) uh->last = last; uh->busy++; collapse_unr(uh, up); check_unrhdr(uh, __LINE__); return (item); } int alloc_unr_specific(struct unrhdr *uh, u_int item) { void *p1, *p2; int i; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "alloc_unr_specific"); p1 = Malloc(sizeof(struct unr)); p2 = Malloc(sizeof(struct unr)); mtx_lock(uh->mtx); i = alloc_unr_specificl(uh, item, &p1, &p2); mtx_unlock(uh->mtx); if (p1 != NULL) Free(p1); if (p2 != NULL) Free(p2); return (i); } /* * Free a unr. * * If we can save unrs by using a bitmap, do so. */ static void free_unrl(struct unrhdr *uh, u_int item, void **p1, void **p2) { struct unr *up, *upp, *upn; struct unrb *ub; u_int pl; KASSERT(item >= uh->low && item <= uh->high, ("UNR: free_unr(%u) out of range [%u...%u]", item, uh->low, uh->high)); check_unrhdr(uh, __LINE__); item -= uh->low; upp = TAILQ_FIRST(&uh->head); /* * Freeing in the ideal split case */ if (item + 1 == uh->first && upp == NULL) { uh->last++; uh->first--; uh->busy--; check_unrhdr(uh, __LINE__); return; } /* * Freeing in the ->first section. Create a run starting at the * freed item. The code below will subdivide it. */ if (item < uh->first) { up = new_unr(uh, p1, p2); up->ptr = uh; up->len = uh->first - item; TAILQ_INSERT_HEAD(&uh->head, up, list); uh->first -= up->len; } item -= uh->first; /* Find the item which contains the unit we want to free */ TAILQ_FOREACH(up, &uh->head, list) { if (up->len > item) break; item -= up->len; } /* Handle bitmap items */ if (is_bitmap(uh, up)) { ub = up->ptr; KASSERT(bit_test(ub->map, item) != 0, ("UNR: Freeing free item %d (bitmap)\n", item)); bit_clear(ub->map, item); uh->busy--; - ub->busy--; collapse_unr(uh, up); return; } KASSERT(up->ptr == uh, ("UNR Freeing free item %d (run))\n", item)); /* Just this one left, reap it */ if (up->len == 1) { up->ptr = NULL; uh->busy--; collapse_unr(uh, up); return; } /* Check if we can shift the item into the previous 'free' run */ upp = TAILQ_PREV(up, unrhd, list); if (item == 0 && upp != NULL && upp->ptr == NULL) { upp->len++; up->len--; uh->busy--; collapse_unr(uh, up); return; } /* Check if we can shift the item to the next 'free' run */ upn = TAILQ_NEXT(up, list); if (item == up->len - 1 && upn != NULL && upn->ptr == NULL) { upn->len++; up->len--; uh->busy--; collapse_unr(uh, up); return; } /* Split off the tail end, if any. 
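Editor's note: alloc_unr_specific() above (and free_unr() just below) pre-allocates the two struct unr chunks it might need with a sleepable Malloc() before taking the mutex, hands them to the locked worker, and frees whatever was not consumed. A hedged sketch of that pre-allocation pattern with illustrative names:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>

static MALLOC_DEFINE(M_EXSPARE, "exspare", "pre-allocation pattern sketch");

struct example_node { int payload; };

/* Locked worker: may consume *spare (setting it to NULL), never sleeps. */
static void
example_locked_op(struct mtx *lock, struct example_node **spare)
{
	mtx_assert(lock, MA_OWNED);
	/* ... use or ignore *spare here ... */
}

static void
example_op(struct mtx *lock)
{
	struct example_node *spare;

	/* The sleepable allocation happens before the mutex is taken. */
	spare = malloc(sizeof(*spare), M_EXSPARE, M_WAITOK | M_ZERO);

	mtx_lock(lock);
	example_locked_op(lock, &spare);
	mtx_unlock(lock);

	/* Return whatever the locked path did not consume. */
	if (spare != NULL)
		free(spare, M_EXSPARE);
}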
*/ pl = up->len - (1 + item); if (pl > 0) { upp = new_unr(uh, p1, p2); upp->ptr = uh; upp->len = pl; TAILQ_INSERT_AFTER(&uh->head, up, upp, list); } /* Split off head end, if any */ if (item > 0) { upp = new_unr(uh, p1, p2); upp->len = item; upp->ptr = uh; TAILQ_INSERT_BEFORE(up, upp, list); } up->len = 1; up->ptr = NULL; uh->busy--; collapse_unr(uh, up); } void free_unr(struct unrhdr *uh, u_int item) { void *p1, *p2; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "free_unr"); p1 = Malloc(sizeof(struct unr)); p2 = Malloc(sizeof(struct unr)); mtx_lock(uh->mtx); free_unrl(uh, item, &p1, &p2); clean_unrhdrl(uh); mtx_unlock(uh->mtx); if (p1 != NULL) Free(p1); if (p2 != NULL) Free(p2); } #ifndef _KERNEL /* USERLAND test driver */ /* * Simple stochastic test driver for the above functions. The code resides * here so that it can access static functions and structures. */ static bool verbose; #define VPRINTF(...) {if (verbose) printf(__VA_ARGS__);} static void print_unr(struct unrhdr *uh, struct unr *up) { u_int x; struct unrb *ub; printf(" %p len = %5u ", up, up->len); if (up->ptr == NULL) printf("free\n"); else if (up->ptr == uh) printf("alloc\n"); else { ub = up->ptr; - printf("bitmap(%d) [", ub->busy); + printf("bitmap ["); for (x = 0; x < up->len; x++) { if (bit_test(ub->map, x)) printf("#"); else printf(" "); } printf("]\n"); } } static void print_unrhdr(struct unrhdr *uh) { struct unr *up; u_int x; printf( "%p low = %u high = %u first = %u last = %u busy %u chunks = %u\n", uh, uh->low, uh->high, uh->first, uh->last, uh->busy, uh->alloc); x = uh->low + uh->first; TAILQ_FOREACH(up, &uh->head, list) { printf(" from = %5u", x); print_unr(uh, up); if (up->ptr == NULL || up->ptr == uh) x += up->len; else x += NBITS; } } static void test_alloc_unr(struct unrhdr *uh, u_int i, char a[]) { int j; if (a[i]) { VPRINTF("F %u\n", i); free_unr(uh, i); a[i] = 0; } else { no_alloc = 1; j = alloc_unr(uh); if (j != -1) { a[j] = 1; VPRINTF("A %d\n", j); } no_alloc = 0; } } static void test_alloc_unr_specific(struct unrhdr *uh, u_int i, char a[]) { int j; j = alloc_unr_specific(uh, i); if (j == -1) { VPRINTF("F %u\n", i); a[i] = 0; free_unr(uh, i); } else { a[i] = 1; VPRINTF("A %d\n", j); } } static void usage(char** argv) { printf("%s [-h] [-r REPETITIONS] [-v]\n", argv[0]); } int main(int argc, char **argv) { struct unrhdr *uh; char *a; long count = 10000; /* Number of unrs to test */ long reps = 1; int ch; u_int i, x, m, j; verbose = false; while ((ch = getopt(argc, argv, "hr:v")) != -1) { switch (ch) { case 'r': errno = 0; reps = strtol(optarg, NULL, 0); if (errno == ERANGE || errno == EINVAL) { usage(argv); exit(2); } break; case 'v': verbose = true; break; case 'h': default: usage(argv); exit(2); } } setbuf(stdout, NULL); uh = new_unrhdr(0, count - 1, NULL); print_unrhdr(uh); a = calloc(count, sizeof(char)); if (a == NULL) err(1, "calloc failed"); srandomdev(); printf("sizeof(struct unr) %zu\n", sizeof(struct unr)); printf("sizeof(struct unrb) %zu\n", sizeof(struct unrb)); printf("sizeof(struct unrhdr) %zu\n", sizeof(struct unrhdr)); - printf("NBITS %d\n", NBITS); + printf("NBITS %lu\n", NBITS); x = 1; for (m = 0; m < count * reps; m++) { j = random(); i = (j >> 1) % count; #if 0 if (a[i] && (j & 1)) continue; #endif if ((random() & 1) != 0) test_alloc_unr(uh, i, a); else test_alloc_unr_specific(uh, i, a); if (verbose) print_unrhdr(uh); check_unrhdr(uh, __LINE__); } for (i = 0; i < count; i++) { if (a[i]) { if (verbose) { printf("C %u\n", i); print_unrhdr(uh); } free_unr(uh, i); } } print_unrhdr(uh); 
delete_unrhdr(uh); free(a); return (0); } #endif Index: head/sys/net/flowtable.c =================================================================== --- head/sys/net/flowtable.c (revision 299089) +++ head/sys/net/flowtable.c (revision 299090) @@ -1,1194 +1,1189 @@ /*- * Copyright (c) 2014 Gleb Smirnoff * Copyright (c) 2008-2010, BitGravity Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Neither the name of the BitGravity Corporation nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "opt_route.h" #include "opt_mpath.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #ifdef FLOWTABLE_HASH_ALL #include #include #include #endif #include #ifdef FLOWTABLE_HASH_ALL #define KEY_PORTS (sizeof(uint16_t) * 2) #define KEY_ADDRS 2 #else #define KEY_PORTS 0 #define KEY_ADDRS 1 #endif #ifdef INET6 #define KEY_ADDR_LEN sizeof(struct in6_addr) #else #define KEY_ADDR_LEN sizeof(struct in_addr) #endif #define KEYLEN ((KEY_ADDR_LEN * KEY_ADDRS + KEY_PORTS) / sizeof(uint32_t)) struct flentry { uint32_t f_hash; /* hash flowing forward */ uint32_t f_key[KEYLEN]; /* address(es and ports) */ uint32_t f_uptime; /* uptime at last access */ uint16_t f_fibnum; /* fib index */ #ifdef FLOWTABLE_HASH_ALL uint8_t f_proto; /* protocol */ uint8_t f_flags; /* stale? */ #define FL_STALE 1 #endif SLIST_ENTRY(flentry) f_next; /* pointer to collision entry */ struct rtentry *f_rt; /* rtentry for flow */ struct llentry *f_lle; /* llentry for flow */ }; #undef KEYLEN SLIST_HEAD(flist, flentry); /* Make sure we can use pcpu_zone_ptr for struct flist. */ CTASSERT(sizeof(struct flist) == sizeof(void *)); struct flowtable { counter_u64_t *ft_stat; int ft_size; /* * ft_table is a malloc(9)ed array of pointers. Pointers point to * memory from UMA_ZONE_PCPU zone. * ft_masks is per-cpu pointer itself. Each instance points * to a malloc(9)ed bitset, that is private to corresponding CPU. 
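Editor's note: the KEY_* arithmetic above sizes f_key[] in 32-bit words; it is worth confirming that it matches the key buffers the lookup routines below actually build (key[3] for IPv4 with FLOWTABLE_HASH_ALL, key[4] and key[9] for IPv6). A small standalone check of those combinations:

#include <sys/types.h>
#include <netinet/in.h>		/* struct in_addr, struct in6_addr */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Words of key per configuration: (addr_len * naddrs + port_bytes) / 4. */
static size_t
keylen_words(size_t addr_len, unsigned naddrs, size_t port_bytes)
{
	return ((addr_len * naddrs + port_bytes) / sizeof(uint32_t));
}

int
main(void)
{
	/* INET, destination-only hashing: 1 word (lookup passes &ip_dst). */
	assert(keylen_words(sizeof(struct in_addr), 1, 0) == 1);
	/* INET, FLOWTABLE_HASH_ALL: dst + src + ports = 3 words (key[3]). */
	assert(keylen_words(sizeof(struct in_addr), 2, 4) == 3);
	/* INET6, destination-only hashing: 4 words (key[4]). */
	assert(keylen_words(sizeof(struct in6_addr), 1, 0) == 4);
	/* INET6, FLOWTABLE_HASH_ALL: dst + src + ports = 9 words (key[9]). */
	assert(keylen_words(sizeof(struct in6_addr), 2, 4) == 9);
	printf("key sizes match the lookup buffers\n");
	return (0);
}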
*/ struct flist **ft_table; bitstr_t **ft_masks; bitstr_t *ft_tmpmask; }; #define FLOWSTAT_ADD(ft, name, v) \ counter_u64_add((ft)->ft_stat[offsetof(struct flowtable_stat, name) / sizeof(uint64_t)], (v)) #define FLOWSTAT_INC(ft, name) FLOWSTAT_ADD(ft, name, 1) static struct proc *flowcleanerproc; static uint32_t flow_hashjitter; static struct cv flowclean_f_cv; static struct cv flowclean_c_cv; static struct mtx flowclean_lock; static uint32_t flowclean_cycles; /* * TODO: * - add sysctls to resize && flush flow tables * - Add per flowtable sysctls for statistics and configuring timeouts * - add saturation counter to rtentry to support per-packet load-balancing * add flag to indicate round-robin flow, add list lookup from head for flows * - add sysctl / device node / syscall to support exporting and importing * of flows with flag to indicate that a flow was imported so should * not be considered for auto-cleaning * - support explicit connection state (currently only ad-hoc for DSR) * - idetach() cleanup for options VIMAGE builds. */ #ifdef INET static VNET_DEFINE(struct flowtable, ip4_ft); #define V_ip4_ft VNET(ip4_ft) #endif #ifdef INET6 static VNET_DEFINE(struct flowtable, ip6_ft); #define V_ip6_ft VNET(ip6_ft) #endif static uma_zone_t flow_zone; static VNET_DEFINE(int, flowtable_enable) = 1; #define V_flowtable_enable VNET(flowtable_enable) static SYSCTL_NODE(_net, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable"); SYSCTL_INT(_net_flowtable, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(flowtable_enable), 0, "enable flowtable caching."); SYSCTL_UMA_MAX(_net_flowtable, OID_AUTO, maxflows, CTLFLAG_RW, &flow_zone, "Maximum number of flows allowed"); static MALLOC_DEFINE(M_FTABLE, "flowtable", "flowtable hashes and bitstrings"); static struct flentry * flowtable_lookup_common(struct flowtable *, uint32_t *, int, uint32_t); #ifdef INET static struct flentry * flowtable_lookup_ipv4(struct mbuf *m, struct route *ro) { struct flentry *fle; struct sockaddr_in *sin; struct ip *ip; uint32_t fibnum; #ifdef FLOWTABLE_HASH_ALL uint32_t key[3]; int iphlen; uint16_t sport, dport; uint8_t proto; #endif ip = mtod(m, struct ip *); if (ip->ip_src.s_addr == ip->ip_dst.s_addr || (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) return (NULL); fibnum = M_GETFIB(m); #ifdef FLOWTABLE_HASH_ALL iphlen = ip->ip_hl << 2; proto = ip->ip_p; switch (proto) { case IPPROTO_TCP: { struct tcphdr *th; th = (struct tcphdr *)((char *)ip + iphlen); sport = th->th_sport; dport = th->th_dport; if (th->th_flags & (TH_RST|TH_FIN)) fibnum |= (FL_STALE << 24); break; } case IPPROTO_UDP: { struct udphdr *uh; uh = (struct udphdr *)((char *)ip + iphlen); sport = uh->uh_sport; dport = uh->uh_dport; break; } case IPPROTO_SCTP: { struct sctphdr *sh; sh = (struct sctphdr *)((char *)ip + iphlen); sport = sh->src_port; dport = sh->dest_port; /* XXXGL: handle stale? 
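Editor's note: FLOWSTAT_ADD() above converts a field name of struct flowtable_stat into an index into the counter array by dividing its offsetof() by sizeof(uint64_t). A tiny standalone illustration of that trick, with plain uint64_t counters standing in for counter(9):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for struct flowtable_stat: consecutive uint64_t-sized fields. */
struct example_stat {
	uint64_t	lookups;
	uint64_t	hits;
	uint64_t	misses;
};

/* Name-to-slot translation, as FLOWSTAT_ADD() does with counter_u64_add(). */
#define	EXAMPLE_STAT_ADD(array, name, v) \
	((array)[offsetof(struct example_stat, name) / sizeof(uint64_t)] += (v))

int
main(void)
{
	uint64_t stats[sizeof(struct example_stat) / sizeof(uint64_t)] = { 0 };

	EXAMPLE_STAT_ADD(stats, lookups, 1);
	EXAMPLE_STAT_ADD(stats, misses, 1);
	printf("lookups=%ju misses=%ju\n",
	    (uintmax_t)stats[0], (uintmax_t)stats[2]);
	return (0);
}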
*/ break; } default: sport = dport = 0; break; } key[0] = ip->ip_dst.s_addr; key[1] = ip->ip_src.s_addr; key[2] = (dport << 16) | sport; fibnum |= proto << 16; fle = flowtable_lookup_common(&V_ip4_ft, key, 3 * sizeof(uint32_t), fibnum); #else /* !FLOWTABLE_HASH_ALL */ fle = flowtable_lookup_common(&V_ip4_ft, (uint32_t *)&ip->ip_dst, sizeof(struct in_addr), fibnum); #endif /* FLOWTABLE_HASH_ALL */ if (fle == NULL) return (NULL); sin = (struct sockaddr_in *)&ro->ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = ip->ip_dst; return (fle); } #endif /* INET */ #ifdef INET6 /* * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, * then it sets p to point at the offset "len" in the mbuf. WARNING: the * pointer might become stale after other pullups (but we never use it * this way). */ #define PULLUP_TO(_len, p, T) \ do { \ int x = (_len) + sizeof(T); \ if ((m)->m_len < x) \ return (NULL); \ p = (mtod(m, char *) + (_len)); \ } while (0) #define TCP(p) ((struct tcphdr *)(p)) #define SCTP(p) ((struct sctphdr *)(p)) #define UDP(p) ((struct udphdr *)(p)) static struct flentry * flowtable_lookup_ipv6(struct mbuf *m, struct route *ro) { struct flentry *fle; struct sockaddr_in6 *sin6; struct ip6_hdr *ip6; uint32_t fibnum; #ifdef FLOWTABLE_HASH_ALL uint32_t key[9]; void *ulp; int hlen; uint16_t sport, dport; u_short offset; uint8_t proto; #else uint32_t key[4]; #endif ip6 = mtod(m, struct ip6_hdr *); if (in6_localaddr(&ip6->ip6_dst)) return (NULL); fibnum = M_GETFIB(m); #ifdef FLOWTABLE_HASH_ALL hlen = sizeof(struct ip6_hdr); proto = ip6->ip6_nxt; offset = sport = dport = 0; ulp = NULL; while (ulp == NULL) { switch (proto) { case IPPROTO_ICMPV6: case IPPROTO_OSPFIGP: case IPPROTO_PIM: case IPPROTO_CARP: case IPPROTO_ESP: case IPPROTO_NONE: ulp = ip6; break; case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dport = TCP(ulp)->th_dport; sport = TCP(ulp)->th_sport; if (TCP(ulp)->th_flags & (TH_RST|TH_FIN)) fibnum |= (FL_STALE << 24); break; case IPPROTO_SCTP: PULLUP_TO(hlen, ulp, struct sctphdr); dport = SCTP(ulp)->src_port; sport = SCTP(ulp)->dest_port; /* XXXGL: handle stale? 
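Editor's note: the IPv4 path above packs both (network-order) ports into one key word as (dport << 16) | sport and folds the protocol number into bits 16-23 of the fibnum argument, which flow_matches() later unpacks with (fibnum >> 16) & 0xff. A minimal round-trip check of that packing:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint16_t sport = 0x1234, dport = 0xabcd;	/* already in net order */
	uint8_t proto = 6;				/* IPPROTO_TCP */
	uint16_t fib = 3;
	uint32_t key_word, fibnum;

	/* Pack, as flowtable_lookup_ipv4() does. */
	key_word = ((uint32_t)dport << 16) | sport;
	fibnum = fib | ((uint32_t)proto << 16);

	/* Unpack, as flow_matches() does. */
	assert(((fibnum >> 16) & 0xff) == proto);
	assert((fibnum & 0xffff) == fib);
	assert((key_word >> 16) == dport);
	assert((key_word & 0xffff) == sport);
	printf("round trip ok\n");
	return (0);
}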
*/ break; case IPPROTO_UDP: PULLUP_TO(hlen, ulp, struct udphdr); dport = UDP(ulp)->uh_dport; sport = UDP(ulp)->uh_sport; break; case IPPROTO_HOPOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_ROUTING: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_rthdr); hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; ulp = NULL; break; case IPPROTO_FRAGMENT: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_frag); hlen += sizeof (struct ip6_frag); proto = ((struct ip6_frag *)ulp)->ip6f_nxt; offset = ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_OFF_MASK; ulp = NULL; break; case IPPROTO_DSTOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_AH: /* RFC 2402 */ PULLUP_TO(hlen, ulp, struct ip6_ext); hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; proto = ((struct ip6_ext *)ulp)->ip6e_nxt; ulp = NULL; break; default: PULLUP_TO(hlen, ulp, struct ip6_ext); break; } } bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr)); bcopy(&ip6->ip6_src, &key[4], sizeof(struct in6_addr)); key[8] = (dport << 16) | sport; fibnum |= proto << 16; fle = flowtable_lookup_common(&V_ip6_ft, key, 9 * sizeof(uint32_t), fibnum); #else /* !FLOWTABLE_HASH_ALL */ bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr)); fle = flowtable_lookup_common(&V_ip6_ft, key, sizeof(struct in6_addr), fibnum); #endif /* FLOWTABLE_HASH_ALL */ if (fle == NULL) return (NULL); sin6 = (struct sockaddr_in6 *)&ro->ro_dst; sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(struct in6_addr)); return (fle); } #endif /* INET6 */ static bitstr_t * flowtable_mask(struct flowtable *ft) { /* * flowtable_free_stale() calls w/o critical section, but * with sched_bind(). Since pointer is stable throughout * ft lifetime, it is safe, otherwise... * * CRITICAL_ASSERT(curthread); */ return (*(bitstr_t **)zpcpu_get(ft->ft_masks)); } static struct flist * flowtable_list(struct flowtable *ft, uint32_t hash) { CRITICAL_ASSERT(curthread); return (zpcpu_get(ft->ft_table[hash % ft->ft_size])); } static int flow_stale(struct flowtable *ft, struct flentry *fle, int maxidle) { if (((fle->f_rt->rt_flags & RTF_UP) == 0) || (fle->f_rt->rt_ifp == NULL) || !RT_LINK_IS_UP(fle->f_rt->rt_ifp) || (fle->f_lle->la_flags & LLE_VALID) == 0) return (1); if (time_uptime - fle->f_uptime > maxidle) return (1); #ifdef FLOWTABLE_HASH_ALL if (fle->f_flags & FL_STALE) return (1); #endif return (0); } static int flow_full(void) { int count, max; count = uma_zone_get_cur(flow_zone); max = uma_zone_get_max(flow_zone); return (count > (max - (max >> 3))); } static int flow_matches(struct flentry *fle, uint32_t *key, int keylen, uint32_t fibnum) { #ifdef FLOWTABLE_HASH_ALL uint8_t proto; proto = (fibnum >> 16) & 0xff; fibnum &= 0xffff; #endif CRITICAL_ASSERT(curthread); /* Microoptimization for IPv4: don't use bcmp(). 
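Editor's note: flow_full() above treats the zone as full once the live flow count exceeds max - (max >> 3), i.e. 7/8 of the configured limit; the cleaner thread further below uses the same predicate to drop its idle threshold from 30 to 5 seconds and its wake interval from 20*hz to 4*hz. A one-line worked check of that threshold:

#include <assert.h>
#include <stdio.h>

int
main(void)
{
	int max = 1000;
	int threshold = max - (max >> 3);	/* 1000 - 125 = 875 */

	assert(threshold == 875);
	assert(!(875 > threshold));	/* 875 live flows: not yet "full" */
	assert(876 > threshold);	/* 876 live flows: considered full */
	printf("flowtable considered full above %d of %d flows\n",
	    threshold, max);
	return (0);
}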
*/ if (((keylen == sizeof(uint32_t) && (fle->f_key[0] == key[0])) || (bcmp(fle->f_key, key, keylen) == 0)) && fibnum == fle->f_fibnum && #ifdef FLOWTABLE_HASH_ALL proto == fle->f_proto && #endif (fle->f_rt->rt_flags & RTF_UP) && fle->f_rt->rt_ifp != NULL && (fle->f_lle->la_flags & LLE_VALID)) return (1); return (0); } static struct flentry * flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key, int keylen, uint32_t fibnum0) { #ifdef INET6 struct route_in6 sro6; #endif #ifdef INET struct route sro; #endif struct route *ro = NULL; struct rtentry *rt; struct lltable *lt = NULL; struct llentry *lle; struct sockaddr_storage *l3addr; struct ifnet *ifp; struct flist *flist; struct flentry *fle, *iter; bitstr_t *mask; uint16_t fibnum = fibnum0; #ifdef FLOWTABLE_HASH_ALL uint8_t proto; proto = (fibnum0 >> 16) & 0xff; fibnum = fibnum0 & 0xffff; #endif /* * This bit of code ends up locking the * same route 3 times (just like ip_output + ether_output) * - at lookup * - in rt_check when called by arpresolve * - dropping the refcount for the rtentry * * This could be consolidated to one if we wrote a variant * of arpresolve with an rt_check variant that expected to * receive the route locked */ #ifdef INET if (ft == &V_ip4_ft) { struct sockaddr_in *sin; ro = &sro; bzero(&sro.ro_dst, sizeof(sro.ro_dst)); sin = (struct sockaddr_in *)&sro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr.s_addr = key[0]; } #endif #ifdef INET6 if (ft == &V_ip6_ft) { struct sockaddr_in6 *sin6; ro = (struct route *)&sro6; sin6 = &sro6.ro_dst; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); bcopy(key, &sin6->sin6_addr, sizeof(struct in6_addr)); } #endif ro->ro_rt = NULL; #ifdef RADIX_MPATH rtalloc_mpath_fib(ro, hash, fibnum); #else rtalloc_ign_fib(ro, 0, fibnum); #endif if (ro->ro_rt == NULL) return (NULL); rt = ro->ro_rt; ifp = rt->rt_ifp; if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) { RTFREE(rt); return (NULL); } #ifdef INET if (ft == &V_ip4_ft) lt = LLTABLE(ifp); #endif #ifdef INET6 if (ft == &V_ip6_ft) lt = LLTABLE6(ifp); #endif if (rt->rt_flags & RTF_GATEWAY) l3addr = (struct sockaddr_storage *)rt->rt_gateway; else l3addr = (struct sockaddr_storage *)&ro->ro_dst; lle = llentry_alloc(ifp, lt, l3addr); if (lle == NULL) { RTFREE(rt); return (NULL); } /* Don't insert the entry if the ARP hasn't yet finished resolving. */ if ((lle->la_flags & LLE_VALID) == 0) { RTFREE(rt); LLE_FREE(lle); FLOWSTAT_INC(ft, ft_fail_lle_invalid); return (NULL); } fle = uma_zalloc(flow_zone, M_NOWAIT | M_ZERO); if (fle == NULL) { RTFREE(rt); LLE_FREE(lle); return (NULL); } fle->f_hash = hash; bcopy(key, &fle->f_key, keylen); fle->f_rt = rt; fle->f_lle = lle; fle->f_fibnum = fibnum; fle->f_uptime = time_uptime; #ifdef FLOWTABLE_HASH_ALL fle->f_proto = proto; fle->f_flags = fibnum0 >> 24; #endif critical_enter(); mask = flowtable_mask(ft); flist = flowtable_list(ft, hash); if (SLIST_EMPTY(flist)) { bit_set(mask, (hash % ft->ft_size)); SLIST_INSERT_HEAD(flist, fle, f_next); goto skip; } /* * find end of list and make sure that we were not * preempted by another thread handling this flow */ SLIST_FOREACH(iter, flist, f_next) { KASSERT(iter->f_hash % ft->ft_size == hash % ft->ft_size, ("%s: wrong hash", __func__)); if (flow_matches(iter, key, keylen, fibnum)) { /* * We probably migrated to an other CPU after * lookup in flowtable_lookup_common() failed. * It appeared that this CPU already has flow * entry. 
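Editor's note: flowtable_insert() above sets a bit in the per-CPU mask when a bucket receives its first entry, and flowtable_free_stale() below walks a scratch copy of that mask with bit_ffs(3), clearing the real bit only when a bucket drains. A standalone sketch of that summary-bitmap-over-a-hash-table idea using the same bitstring(3) primitives; the bucket counts are illustrative.

#include <sys/types.h>
#include <bitstring.h>
#include <stdio.h>
#include <string.h>

#define	NBUCKETS	16

int
main(void)
{
	bitstr_t bit_decl(mask, NBUCKETS);	/* which buckets are non-empty */
	bitstr_t bit_decl(tmpmask, NBUCKETS);	/* scratch copy for one sweep */
	int bucket_len[NBUCKETS] = { 0 };
	int b;

	bit_nclear(mask, 0, NBUCKETS - 1);

	/* Insert: the first entry in a bucket sets that bucket's bit. */
	if (bucket_len[3]++ == 0)
		bit_set(mask, 3);
	if (bucket_len[9]++ == 0)
		bit_set(mask, 9);
	bucket_len[9]++;			/* second entry; bit already set */

	/* Sweep only the non-empty buckets, as flowtable_free_stale() does. */
	memcpy(tmpmask, mask, bitstr_size(NBUCKETS));
	bit_ffs(tmpmask, NBUCKETS, &b);
	while (b != -1) {
		printf("visiting bucket %d with %d entries\n", b, bucket_len[b]);
		bucket_len[b]--;		/* pretend one entry was stale */
		if (bucket_len[b] == 0)
			bit_clear(mask, b);	/* bucket drained for real */
		bit_clear(tmpmask, b);		/* never revisit this sweep */
		bit_ffs(tmpmask, NBUCKETS, &b);
	}

	/* Bucket 9 still holds an entry, so its bit survives the sweep. */
	bit_ffs(mask, NBUCKETS, &b);
	printf("first still-populated bucket after sweep: %d\n", b);
	return (0);
}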
*/ iter->f_uptime = time_uptime; #ifdef FLOWTABLE_HASH_ALL iter->f_flags |= fibnum >> 24; #endif critical_exit(); FLOWSTAT_INC(ft, ft_collisions); uma_zfree(flow_zone, fle); return (iter); } } SLIST_INSERT_HEAD(flist, fle, f_next); skip: critical_exit(); FLOWSTAT_INC(ft, ft_inserts); return (fle); } int flowtable_lookup(sa_family_t sa, struct mbuf *m, struct route *ro) { struct flentry *fle; struct llentry *lle; if (V_flowtable_enable == 0) return (ENXIO); switch (sa) { #ifdef INET case AF_INET: fle = flowtable_lookup_ipv4(m, ro); break; #endif #ifdef INET6 case AF_INET6: fle = flowtable_lookup_ipv6(m, ro); break; #endif default: panic("%s: sa %d", __func__, sa); } if (fle == NULL) return (EHOSTUNREACH); if (M_HASHTYPE_GET(m) == M_HASHTYPE_NONE) { M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); m->m_pkthdr.flowid = fle->f_hash; } ro->ro_rt = fle->f_rt; ro->ro_flags |= RT_NORTREF; lle = fle->f_lle; if (lle != NULL && (lle->la_flags & LLE_VALID)) { ro->ro_prepend = lle->r_linkdata; ro->ro_plen = lle->r_hdrlen; ro->ro_flags |= RT_MAY_LOOP; if (lle->la_flags & LLE_IFADDR) ro->ro_flags |= RT_L2_ME; } return (0); } static struct flentry * flowtable_lookup_common(struct flowtable *ft, uint32_t *key, int keylen, uint32_t fibnum) { struct flist *flist; struct flentry *fle; uint32_t hash; FLOWSTAT_INC(ft, ft_lookups); hash = jenkins_hash32(key, keylen / sizeof(uint32_t), flow_hashjitter); critical_enter(); flist = flowtable_list(ft, hash); SLIST_FOREACH(fle, flist, f_next) { KASSERT(fle->f_hash % ft->ft_size == hash % ft->ft_size, ("%s: wrong hash", __func__)); if (flow_matches(fle, key, keylen, fibnum)) { fle->f_uptime = time_uptime; #ifdef FLOWTABLE_HASH_ALL fle->f_flags |= fibnum >> 24; #endif critical_exit(); FLOWSTAT_INC(ft, ft_hits); return (fle); } } critical_exit(); FLOWSTAT_INC(ft, ft_misses); return (flowtable_insert(ft, hash, key, keylen, fibnum)); } -/* - * used by the bit_alloc macro - */ -#define calloc(count, size) malloc((count)*(size), M_FTABLE, M_WAITOK | M_ZERO) static void flowtable_alloc(struct flowtable *ft) { ft->ft_table = malloc(ft->ft_size * sizeof(struct flist), M_FTABLE, M_WAITOK); for (int i = 0; i < ft->ft_size; i++) ft->ft_table[i] = uma_zalloc(pcpu_zone_ptr, M_WAITOK | M_ZERO); ft->ft_masks = uma_zalloc(pcpu_zone_ptr, M_WAITOK); for (int i = 0; i < mp_ncpus; i++) { bitstr_t **b; b = zpcpu_get_cpu(ft->ft_masks, i); - *b = bit_alloc(ft->ft_size); + *b = bit_alloc(ft->ft_size, M_FTABLE, M_WAITOK); } - ft->ft_tmpmask = bit_alloc(ft->ft_size); + ft->ft_tmpmask = bit_alloc(ft->ft_size, M_FTABLE, M_WAITOK); } -#undef calloc static void flowtable_free_stale(struct flowtable *ft, struct rtentry *rt, int maxidle) { struct flist *flist, freelist; struct flentry *fle, *fle1, *fleprev; bitstr_t *mask, *tmpmask; int curbit, tmpsize; SLIST_INIT(&freelist); mask = flowtable_mask(ft); tmpmask = ft->ft_tmpmask; tmpsize = ft->ft_size; memcpy(tmpmask, mask, ft->ft_size/8); curbit = 0; fleprev = NULL; /* pacify gcc */ /* * XXX Note to self, bit_ffs operates at the byte level * and thus adds gratuitous overhead */ bit_ffs(tmpmask, ft->ft_size, &curbit); while (curbit != -1) { if (curbit >= ft->ft_size || curbit < -1) { log(LOG_ALERT, "warning: bad curbit value %d \n", curbit); break; } FLOWSTAT_INC(ft, ft_free_checks); critical_enter(); flist = flowtable_list(ft, curbit); #ifdef DIAGNOSTIC if (SLIST_EMPTY(flist) && curbit > 0) { log(LOG_ALERT, "warning bit=%d set, but no fle found\n", curbit); } #endif SLIST_FOREACH_SAFE(fle, flist, f_next, fle1) { if (rt != NULL && fle->f_rt != rt) { fleprev = fle; 
continue; } if (!flow_stale(ft, fle, maxidle)) { fleprev = fle; continue; } if (fle == SLIST_FIRST(flist)) SLIST_REMOVE_HEAD(flist, f_next); else SLIST_REMOVE_AFTER(fleprev, f_next); SLIST_INSERT_HEAD(&freelist, fle, f_next); } if (SLIST_EMPTY(flist)) bit_clear(mask, curbit); critical_exit(); bit_clear(tmpmask, curbit); bit_ffs(tmpmask, tmpsize, &curbit); } SLIST_FOREACH_SAFE(fle, &freelist, f_next, fle1) { FLOWSTAT_INC(ft, ft_frees); if (fle->f_rt != NULL) RTFREE(fle->f_rt); if (fle->f_lle != NULL) LLE_FREE(fle->f_lle); uma_zfree(flow_zone, fle); } } static void flowtable_clean_vnet(struct flowtable *ft, struct rtentry *rt, int maxidle) { int i; CPU_FOREACH(i) { if (smp_started == 1) { thread_lock(curthread); sched_bind(curthread, i); thread_unlock(curthread); } flowtable_free_stale(ft, rt, maxidle); if (smp_started == 1) { thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); } } } void flowtable_route_flush(sa_family_t sa, struct rtentry *rt) { struct flowtable *ft; switch (sa) { #ifdef INET case AF_INET: ft = &V_ip4_ft; break; #endif #ifdef INET6 case AF_INET6: ft = &V_ip6_ft; break; #endif default: panic("%s: sa %d", __func__, sa); } flowtable_clean_vnet(ft, rt, 0); } static void flowtable_cleaner(void) { VNET_ITERATOR_DECL(vnet_iter); struct thread *td; if (bootverbose) log(LOG_INFO, "flowtable cleaner started\n"); td = curthread; while (1) { uint32_t flowclean_freq, maxidle; /* * The maximum idle time, as well as frequency are arbitrary. */ if (flow_full()) maxidle = 5; else maxidle = 30; VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); #ifdef INET flowtable_clean_vnet(&V_ip4_ft, NULL, maxidle); #endif #ifdef INET6 flowtable_clean_vnet(&V_ip6_ft, NULL, maxidle); #endif CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); if (flow_full()) flowclean_freq = 4*hz; else flowclean_freq = 20*hz; mtx_lock(&flowclean_lock); thread_lock(td); sched_prio(td, PPAUSE); thread_unlock(td); flowclean_cycles++; cv_broadcast(&flowclean_f_cv); cv_timedwait(&flowclean_c_cv, &flowclean_lock, flowclean_freq); mtx_unlock(&flowclean_lock); } } static void flowtable_flush(void *unused __unused) { uint64_t start; mtx_lock(&flowclean_lock); start = flowclean_cycles; while (start == flowclean_cycles) { cv_broadcast(&flowclean_c_cv); cv_wait(&flowclean_f_cv, &flowclean_lock); } mtx_unlock(&flowclean_lock); } static struct kproc_desc flow_kp = { "flowcleaner", flowtable_cleaner, &flowcleanerproc }; SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp); static int flowtable_get_size(char *name) { int size; if (TUNABLE_INT_FETCH(name, &size)) { if (size < 256) size = 256; if (!powerof2(size)) { printf("%s must be power of 2\n", name); size = 2048; } } else { /* * round up to the next power of 2 */ size = 1 << fls((1024 + maxusers * 64) - 1); } return (size); } static void flowtable_init(const void *unused __unused) { flow_hashjitter = arc4random(); flow_zone = uma_zcreate("flows", sizeof(struct flentry), NULL, NULL, NULL, NULL, (64-1), UMA_ZONE_MAXBUCKET); uma_zone_set_max(flow_zone, 1024 + maxusers * 64 * mp_ncpus); cv_init(&flowclean_c_cv, "c_flowcleanwait"); cv_init(&flowclean_f_cv, "f_flowcleanwait"); mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF); EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL, EVENTHANDLER_PRI_ANY); } SYSINIT(flowtable_init, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST, flowtable_init, NULL); #ifdef INET static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip4, CTLFLAG_RD, NULL, "Flowtable for IPv4"); static 
VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip4_ftstat); VNET_PCPUSTAT_SYSINIT(ip4_ftstat); VNET_PCPUSTAT_SYSUNINIT(ip4_ftstat); SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip4, OID_AUTO, stat, struct flowtable_stat, ip4_ftstat, "Flowtable statistics for IPv4 " "(struct flowtable_stat, net/flowtable.h)"); static void flowtable_init_vnet_v4(const void *unused __unused) { V_ip4_ft.ft_size = flowtable_get_size("net.flowtable.ip4.size"); V_ip4_ft.ft_stat = VNET(ip4_ftstat); flowtable_alloc(&V_ip4_ft); } VNET_SYSINIT(ft_vnet_v4, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, flowtable_init_vnet_v4, NULL); #endif /* INET */ #ifdef INET6 static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip6, CTLFLAG_RD, NULL, "Flowtable for IPv6"); static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip6_ftstat); VNET_PCPUSTAT_SYSINIT(ip6_ftstat); VNET_PCPUSTAT_SYSUNINIT(ip6_ftstat); SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip6, OID_AUTO, stat, struct flowtable_stat, ip6_ftstat, "Flowtable statistics for IPv6 " "(struct flowtable_stat, net/flowtable.h)"); static void flowtable_init_vnet_v6(const void *unused __unused) { V_ip6_ft.ft_size = flowtable_get_size("net.flowtable.ip6.size"); V_ip6_ft.ft_stat = VNET(ip6_ftstat); flowtable_alloc(&V_ip6_ft); } VNET_SYSINIT(flowtable_init_vnet_v6, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, flowtable_init_vnet_v6, NULL); #endif /* INET6 */ #ifdef DDB static bitstr_t * flowtable_mask_pcpu(struct flowtable *ft, int cpuid) { return (zpcpu_get_cpu(*ft->ft_masks, cpuid)); } static struct flist * flowtable_list_pcpu(struct flowtable *ft, uint32_t hash, int cpuid) { return (zpcpu_get_cpu(&ft->ft_table[hash % ft->ft_size], cpuid)); } static void flow_show(struct flowtable *ft, struct flentry *fle) { int idle_time; int rt_valid, ifp_valid; volatile struct rtentry *rt; struct ifnet *ifp = NULL; uint32_t *hashkey = fle->f_key; idle_time = (int)(time_uptime - fle->f_uptime); rt = fle->f_rt; rt_valid = rt != NULL; if (rt_valid) ifp = rt->rt_ifp; ifp_valid = ifp != NULL; #ifdef INET if (ft == &V_ip4_ft) { char daddr[4*sizeof "123"]; #ifdef FLOWTABLE_HASH_ALL char saddr[4*sizeof "123"]; uint16_t sport, dport; #endif inet_ntoa_r(*(struct in_addr *) &hashkey[0], daddr); #ifdef FLOWTABLE_HASH_ALL inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr); dport = ntohs((uint16_t)(hashkey[2] >> 16)); sport = ntohs((uint16_t)(hashkey[2] & 0xffff)); db_printf("%s:%d->%s:%d", saddr, sport, daddr, dport); #else db_printf("%s ", daddr); #endif } #endif /* INET */ #ifdef INET6 if (ft == &V_ip6_ft) { #ifdef FLOWTABLE_HASH_ALL db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x", hashkey[0], hashkey[1], hashkey[2], hashkey[3], hashkey[4], hashkey[5], hashkey[6], hashkey[7], hashkey[8]); #else db_printf("\n\tkey=%08x:%08x:%08x ", hashkey[0], hashkey[1], hashkey[2]); #endif } #endif /* INET6 */ db_printf("hash=%08x idle_time=%03d" "\n\tfibnum=%02d rt=%p", fle->f_hash, idle_time, fle->f_fibnum, fle->f_rt); #ifdef FLOWTABLE_HASH_ALL if (fle->f_flags & FL_STALE) db_printf(" FL_STALE "); #endif if (rt_valid) { if (rt->rt_flags & RTF_UP) db_printf(" RTF_UP "); } if (ifp_valid) { if (ifp->if_flags & IFF_LOOPBACK) db_printf(" IFF_LOOPBACK "); if (ifp->if_flags & IFF_UP) db_printf(" IFF_UP "); if (ifp->if_flags & IFF_POINTOPOINT) db_printf(" IFF_POINTOPOINT "); } db_printf("\n"); } static void flowtable_show(struct flowtable *ft, int cpuid) { int curbit = 0; bitstr_t *mask, *tmpmask; if (cpuid != -1) db_printf("cpu: %d\n", cpuid); mask = flowtable_mask_pcpu(ft, cpuid); tmpmask = ft->ft_tmpmask; memcpy(tmpmask, mask, ft->ft_size/8); 
/* * XXX Note to self, bit_ffs operates at the byte level * and thus adds gratuitous overhead */ bit_ffs(tmpmask, ft->ft_size, &curbit); while (curbit != -1) { struct flist *flist; struct flentry *fle; if (curbit >= ft->ft_size || curbit < -1) { db_printf("warning: bad curbit value %d \n", curbit); break; } flist = flowtable_list_pcpu(ft, curbit, cpuid); SLIST_FOREACH(fle, flist, f_next) flow_show(ft, fle); bit_clear(tmpmask, curbit); bit_ffs(tmpmask, ft->ft_size, &curbit); } } static void flowtable_show_vnet(struct flowtable *ft) { int i; CPU_FOREACH(i) flowtable_show(ft, i); } DB_SHOW_COMMAND(flowtables, db_show_flowtables) { VNET_ITERATOR_DECL(vnet_iter); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); #ifdef VIMAGE db_printf("vnet %p\n", vnet_iter); #endif #ifdef INET printf("IPv4:\n"); flowtable_show_vnet(&V_ip4_ft); #endif #ifdef INET6 printf("IPv6:\n"); flowtable_show_vnet(&V_ip6_ft); #endif CURVNET_RESTORE(); } } #endif Index: head/sys/sys/bitstring.h =================================================================== --- head/sys/sys/bitstring.h (revision 299089) +++ head/sys/sys/bitstring.h (revision 299090) @@ -1,146 +1,259 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Paul Vixie. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * + * Copyright (c) 2014 Spectra Logic Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. 
+ * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * * $FreeBSD$ */ - #ifndef _SYS_BITSTRING_H_ #define _SYS_BITSTRING_H_ -typedef unsigned char bitstr_t; +#ifdef _KERNEL +#include +#include +#endif -/* internal macros */ - /* byte of the bitstring bit is in */ -#define _bit_byte(bit) \ - ((bit) >> 3) +typedef unsigned long bitstr_t; - /* mask for the bit within its byte */ -#define _bit_mask(bit) \ - (1 << ((bit)&0x7)) +/*---------------------- Private Implementation Details ----------------------*/ +#define _BITSTR_MASK (~0UL) +#define _BITSTR_BITS (sizeof(bitstr_t) * 8) -/* external macros */ - /* bytes in a bitstring of nbits bits */ -#define bitstr_size(nbits) \ - (((nbits) + 7) >> 3) +/* bitstr_t in bit string containing the bit. */ +static inline int +_bit_idx(int _bit) +{ + return (_bit / _BITSTR_BITS); +} - /* allocate a bitstring */ -#define bit_alloc(nbits) \ - (bitstr_t *)calloc((size_t)bitstr_size(nbits), sizeof(bitstr_t)) +/* bit number within bitstr_t at _bit_idx(_bit). */ +static inline int +_bit_offset(int _bit) +{ + return (_bit % _BITSTR_BITS); +} - /* allocate a bitstring on the stack */ +/* Mask for the bit within its long. */ +static inline bitstr_t +_bit_mask(int _bit) +{ + return (1UL << _bit_offset(_bit)); +} + +static inline bitstr_t +_bit_make_mask(int _start, int _stop) +{ + return ((_BITSTR_MASK << _bit_offset(_start)) & + (_BITSTR_MASK >> (_BITSTR_BITS - _bit_offset(_stop) - 1))); +} + +/*----------------------------- Public Interface -----------------------------*/ +/* Number of bytes consumed by a bit string of nbits bits */ +#define bitstr_size(_nbits) \ + (((_nbits) + _BITSTR_BITS - 1) / 8) + +/* Allocate a bit string initialized with no bits set. */ +#ifdef _KERNEL +static inline bitstr_t * +bit_alloc(int _nbits, struct malloc_type *type, int flags) +{ + return ((bitstr_t *)malloc(bitstr_size(_nbits), type, flags | M_ZERO)); +} +#else +static inline bitstr_t * +bit_alloc(int _nbits) +{ + return ((bitstr_t *)calloc(bitstr_size(_nbits), 1)); +} +#endif + +/* Allocate a bit string on the stack with no bits set. */ #define bit_decl(name, nbits) \ - ((name)[bitstr_size(nbits)]) + ((name)[bitstr_size(nbits) / sizeof(bitstr_t)]) - /* is bit N of bitstring name set? */ -#define bit_test(name, bit) \ - ((name)[_bit_byte(bit)] & _bit_mask(bit)) +/* Is bit N of bit string set? */ +static inline int +bit_test(const bitstr_t *_bitstr, int _bit) +{ + return ((_bitstr[_bit_idx(_bit)] & _bit_mask(_bit)) != 0); +} - /* set bit N of bitstring name */ -#define bit_set(name, bit) \ - ((name)[_bit_byte(bit)] |= _bit_mask(bit)) +/* Set bit N of bit string. 
*/ +static inline void +bit_set(bitstr_t *_bitstr, int _bit) +{ + _bitstr[_bit_idx(_bit)] |= _bit_mask(_bit); +} - /* clear bit N of bitstring name */ -#define bit_clear(name, bit) \ - ((name)[_bit_byte(bit)] &= ~_bit_mask(bit)) +/* clear bit N of bit string name */ +static inline void +bit_clear(bitstr_t *_bitstr, int _bit) +{ + _bitstr[_bit_idx(_bit)] &= ~_bit_mask(_bit); +} - /* clear bits start ... stop in bitstring */ -#define bit_nclear(name, start, stop) do { \ - register bitstr_t *_name = (name); \ - register int _start = (start), _stop = (stop); \ - register int _startbyte = _bit_byte(_start); \ - register int _stopbyte = _bit_byte(_stop); \ - if (_startbyte == _stopbyte) { \ - _name[_startbyte] &= ((0xff >> (8 - (_start&0x7))) | \ - (0xff << ((_stop&0x7) + 1))); \ - } else { \ - _name[_startbyte] &= 0xff >> (8 - (_start&0x7)); \ - while (++_startbyte < _stopbyte) \ - _name[_startbyte] = 0; \ - _name[_stopbyte] &= 0xff << ((_stop&0x7) + 1); \ - } \ -} while (0) +/* Set bits start ... stop inclusive in bit string. */ +static inline void +bit_nset(bitstr_t *_bitstr, int _start, int _stop) +{ + bitstr_t *_stopbitstr; - /* set bits start ... stop in bitstring */ -#define bit_nset(name, start, stop) do { \ - register bitstr_t *_name = (name); \ - register int _start = (start), _stop = (stop); \ - register int _startbyte = _bit_byte(_start); \ - register int _stopbyte = _bit_byte(_stop); \ - if (_startbyte == _stopbyte) { \ - _name[_startbyte] |= ((0xff << (_start&0x7)) & \ - (0xff >> (7 - (_stop&0x7)))); \ - } else { \ - _name[_startbyte] |= 0xff << ((_start)&0x7); \ - while (++_startbyte < _stopbyte) \ - _name[_startbyte] = 0xff; \ - _name[_stopbyte] |= 0xff >> (7 - (_stop&0x7)); \ - } \ -} while (0) + _stopbitstr = _bitstr + _bit_idx(_stop); + _bitstr += _bit_idx(_start); - /* find first bit clear in name */ -#define bit_ffc(name, nbits, value) do { \ - register bitstr_t *_name = (name); \ - register int _byte, _nbits = (nbits); \ - register int _stopbyte = _bit_byte(_nbits - 1), _value = -1; \ - if (_nbits > 0) \ - for (_byte = 0; _byte <= _stopbyte; ++_byte) \ - if (_name[_byte] != 0xff) { \ - bitstr_t _lb; \ - _value = _byte << 3; \ - for (_lb = _name[_byte]; (_lb&0x1); \ - ++_value, _lb >>= 1); \ - break; \ - } \ - if (_value >= nbits) \ - _value = -1; \ - *(value) = _value; \ -} while (0) + if (_bitstr == _stopbitstr) { + *_bitstr |= _bit_make_mask(_start, _stop); + } else { + *_bitstr |= _bit_make_mask(_start, _BITSTR_BITS - 1); + while (++_bitstr < _stopbitstr) + *_bitstr = _BITSTR_MASK; + *_stopbitstr |= _bit_make_mask(0, _stop); + } +} - /* find first bit set in name */ -#define bit_ffs(name, nbits, value) do { \ - register bitstr_t *_name = (name); \ - register int _byte, _nbits = (nbits); \ - register int _stopbyte = _bit_byte(_nbits - 1), _value = -1; \ - if (_nbits > 0) \ - for (_byte = 0; _byte <= _stopbyte; ++_byte) \ - if (_name[_byte]) { \ - bitstr_t _lb; \ - _value = _byte << 3; \ - for (_lb = _name[_byte]; !(_lb&0x1); \ - ++_value, _lb >>= 1); \ - break; \ - } \ - if (_value >= nbits) \ - _value = -1; \ - *(value) = _value; \ -} while (0) +/* Clear bits start ... stop inclusive in bit string. 
*/ +static inline void +bit_nclear(bitstr_t *_bitstr, int _start, int _stop) +{ + bitstr_t *_stopbitstr; -#endif /* !_SYS_BITSTRING_H_ */ + _stopbitstr = _bitstr + _bit_idx(_stop); + _bitstr += _bit_idx(_start); + + if (_bitstr == _stopbitstr) { + *_bitstr &= ~_bit_make_mask(_start, _stop); + } else { + *_bitstr &= ~_bit_make_mask(_start, _BITSTR_BITS - 1); + while (++_bitstr < _stopbitstr) + *_bitstr = 0; + *_stopbitstr &= ~_bit_make_mask(0, _stop); + } +} + +/* Find the first bit set in bit string at or after bit start. */ +static inline void +bit_ffs_at(bitstr_t *_bitstr, int _start, int _nbits, int *_result) +{ + bitstr_t *_curbitstr; + bitstr_t *_stopbitstr; + bitstr_t _test; + int _value, _offset; + + if (_nbits > 0) { + _curbitstr = _bitstr + _bit_idx(_start); + _stopbitstr = _bitstr + _bit_idx(_nbits - 1); + + _test = *_curbitstr; + if (_bit_offset(_start) != 0) + _test &= _bit_make_mask(_start, _BITSTR_BITS - 1); + while (_test == 0 && _curbitstr < _stopbitstr) + _test = *(++_curbitstr); + + _offset = ffsl(_test); + _value = ((_curbitstr - _bitstr) * _BITSTR_BITS) + _offset - 1; + if (_offset == 0 || _value >= _nbits) + _value = -1; + } else { + _value = -1; + } + *_result = _value; +} + +/* Find the first bit clear in bit string at or after bit start. */ +static inline void +bit_ffc_at(bitstr_t *_bitstr, int _start, int _nbits, int *_result) +{ + bitstr_t *_curbitstr; + bitstr_t *_stopbitstr; + bitstr_t _test; + int _value, _offset; + + if (_nbits > 0) { + _curbitstr = _bitstr + _bit_idx(_start); + _stopbitstr = _bitstr + _bit_idx(_nbits - 1); + + _test = *_curbitstr; + if (_bit_offset(_start) != 0) + _test |= _bit_make_mask(0, _start - 1); + while (_test == _BITSTR_MASK && _curbitstr < _stopbitstr) + _test = *(++_curbitstr); + + _offset = ffsl(~_test); + _value = ((_curbitstr - _bitstr) * _BITSTR_BITS) + _offset - 1; + if (_offset == 0 || _value >= _nbits) + _value = -1; + } else { + _value = -1; + } + *_result = _value; +} + +/* Find the first bit set in bit string. */ +static inline void +bit_ffs(bitstr_t *_bitstr, int _nbits, int *_result) +{ + bit_ffs_at(_bitstr, /*start*/0, _nbits, _result); +} + +/* Find the first bit clear in bit string. */ +static inline void +bit_ffc(bitstr_t *_bitstr, int _nbits, int *_result) +{ + bit_ffc_at(_bitstr, /*start*/0, _nbits, _result); +} + +#endif /* _SYS_BITSTRING_H_ */ Index: head/sys/sys/param.h =================================================================== --- head/sys/sys/param.h (revision 299089) +++ head/sys/sys/param.h (revision 299090) @@ -1,363 +1,363 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)param.h 8.3 (Berkeley) 4/4/95 * $FreeBSD$ */ #ifndef _SYS_PARAM_H_ #define _SYS_PARAM_H_ #include #define BSD 199506 /* System version (year & month). */ #define BSD4_3 1 #define BSD4_4 1 /* * __FreeBSD_version numbers are documented in the Porter's Handbook. * If you bump the version for any reason, you should update the documentation * there. * Currently this lives here in the doc/ repository: * * head/en_US.ISO8859-1/books/porters-handbook/versions/chapter.xml * * scheme is: Rxx * 'R' is in the range 0 to 4 if this is a release branch or * x.0-CURRENT before RELENG_*_0 is created, otherwise 'R' is * in the range 5 to 9. */ #undef __FreeBSD_version -#define __FreeBSD_version 1100106 /* Master, propagated to newvers */ +#define __FreeBSD_version 1100107 /* Master, propagated to newvers */ /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, * which by definition is always true on FreeBSD. This macro is also defined * on other systems that use the kernel of FreeBSD, such as GNU/kFreeBSD. * * It is tempting to use this macro in userland code when we want to enable * kernel-specific routines, and in fact it's fine to do this in code that * is part of FreeBSD itself. However, be aware that as presence of this * macro is still not widespread (e.g. older FreeBSD versions, 3rd party * compilers, etc), it is STRONGLY DISCOURAGED to check for this macro in * external applications without also checking for __FreeBSD__ as an * alternative. */ #undef __FreeBSD_kernel__ #define __FreeBSD_kernel__ #ifdef _KERNEL #define P_OSREL_SIGWAIT 700000 #define P_OSREL_SIGSEGV 700004 #define P_OSREL_MAP_ANON 800104 #define P_OSREL_MAP_FSTRICT 1100036 #define P_OSREL_SHUTDOWN_ENOTCONN 1100077 #define P_OSREL_MAJOR(x) ((x) / 100000) #endif #ifndef LOCORE #include #endif /* * Machine-independent constants (some used in following include files). * Redefined constants are from POSIX 1003.1 limits file. * * MAXCOMLEN should be >= sizeof(ac_comm) (see ) */ #include #define MAXCOMLEN 19 /* max command name remembered */ #define MAXINTERP PATH_MAX /* max interpreter file name length */ #define MAXLOGNAME 33 /* max login name length (incl. 
NUL) */ #define MAXUPRC CHILD_MAX /* max simultaneous processes */ #define NCARGS ARG_MAX /* max bytes for an exec function */ #define NGROUPS (NGROUPS_MAX+1) /* max number groups */ #define NOFILE OPEN_MAX /* max open files per process */ #define NOGROUP 65535 /* marker for empty group set member */ #define MAXHOSTNAMELEN 256 /* max hostname size */ #define SPECNAMELEN 63 /* max length of devicename */ /* More types and definitions used throughout the kernel. */ #ifdef _KERNEL #include #include #ifndef LOCORE #include #include #endif #ifndef FALSE #define FALSE 0 #endif #ifndef TRUE #define TRUE 1 #endif #endif #ifndef _KERNEL /* Signals. */ #include #endif /* Machine type dependent parameters. */ #include #ifndef _KERNEL #include #endif #ifndef DEV_BSHIFT #define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ #endif #define DEV_BSIZE (1<>PAGE_SHIFT) #endif /* * btodb() is messy and perhaps slow because `bytes' may be an off_t. We * want to shift an unsigned type to avoid sign extension and we don't * want to widen `bytes' unnecessarily. Assume that the result fits in * a daddr_t. */ #ifndef btodb #define btodb(bytes) /* calculates (bytes / DEV_BSIZE) */ \ (sizeof (bytes) > sizeof(long) \ ? (daddr_t)((unsigned long long)(bytes) >> DEV_BSHIFT) \ : (daddr_t)((unsigned long)(bytes) >> DEV_BSHIFT)) #endif #ifndef dbtob #define dbtob(db) /* calculates (db * DEV_BSIZE) */ \ ((off_t)(db) << DEV_BSHIFT) #endif #define PRIMASK 0x0ff #define PCATCH 0x100 /* OR'd with pri for tsleep to check signals */ #define PDROP 0x200 /* OR'd with pri to stop re-entry of interlock mutex */ #define NZERO 0 /* default "nice" */ #define NBBY 8 /* number of bits in a byte */ #define NBPW sizeof(int) /* number of bytes per word (integer) */ #define CMASK 022 /* default file mask: S_IWGRP|S_IWOTH */ #define NODEV (dev_t)(-1) /* non-existent device */ /* * File system parameters and macros. * * MAXBSIZE - Filesystems are made out of blocks of at most MAXBSIZE bytes * per block. MAXBSIZE may be made larger without effecting * any existing filesystems as long as it does not exceed MAXPHYS, * and may be made smaller at the risk of not being able to use * filesystems which require a block size exceeding MAXBSIZE. * * MAXBCACHEBUF - Maximum size of a buffer in the buffer cache. This must * be >= MAXBSIZE and can be set differently for different * architectures by defining it in . * Making this larger allows NFS to do larger reads/writes. * * BKVASIZE - Nominal buffer space per buffer, in bytes. BKVASIZE is the * minimum KVM memory reservation the kernel is willing to make. * Filesystems can of course request smaller chunks. Actual * backing memory uses a chunk size of a page (PAGE_SIZE). * The default value here can be overridden on a per-architecture * basis by defining it in . This should * probably be done to increase its value, when MAXBCACHEBUF is * defined as a larger value in . * * If you make BKVASIZE too small you risk seriously fragmenting * the buffer KVM map which may slow things down a bit. If you * make it too big the kernel will not be able to optimally use * the KVM memory reserved for the buffer cache and will wind * up with too-few buffers. * * The default is 16384, roughly 2x the block size used by a * normal UFS filesystem. 
*/ #define MAXBSIZE 65536 /* must be power of 2 */ #ifndef MAXBCACHEBUF #define MAXBCACHEBUF MAXBSIZE /* must be a power of 2 >= MAXBSIZE */ #endif #ifndef BKVASIZE #define BKVASIZE 16384 /* must be power of 2 */ #endif #define BKVAMASK (BKVASIZE-1) /* * MAXPATHLEN defines the longest permissible path length after expanding * symbolic links. It is used to allocate a temporary buffer from the buffer * pool in which to do the name expansion, hence should be a power of two, * and must be less than or equal to MAXBSIZE. MAXSYMLINKS defines the * maximum number of symbolic links that may be expanded in a path name. * It should be set high enough to allow all legitimate uses, but halt * infinite loops reasonably quickly. */ #define MAXPATHLEN PATH_MAX #define MAXSYMLINKS 32 /* Bit map related macros. */ #define setbit(a,i) (((unsigned char *)(a))[(i)/NBBY] |= 1<<((i)%NBBY)) #define clrbit(a,i) (((unsigned char *)(a))[(i)/NBBY] &= ~(1<<((i)%NBBY))) #define isset(a,i) \ (((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) #define isclr(a,i) \ ((((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) == 0) /* Macros for counting and rounding. */ #ifndef howmany #define howmany(x, y) (((x)+((y)-1))/(y)) #endif #define nitems(x) (sizeof((x)) / sizeof((x)[0])) #define rounddown(x, y) (((x)/(y))*(y)) #define rounddown2(x, y) ((x)&(~((y)-1))) /* if y is power of two */ #define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */ #define roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */ #define powerof2(x) ((((x)-1)&(x))==0) /* Macros for min/max. */ #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) #ifdef _KERNEL /* * Basic byte order function prototypes for non-inline functions. */ #ifndef LOCORE #ifndef _BYTEORDER_PROTOTYPED #define _BYTEORDER_PROTOTYPED __BEGIN_DECLS __uint32_t htonl(__uint32_t); __uint16_t htons(__uint16_t); __uint32_t ntohl(__uint32_t); __uint16_t ntohs(__uint16_t); __END_DECLS #endif #endif #ifndef lint #ifndef _BYTEORDER_FUNC_DEFINED #define _BYTEORDER_FUNC_DEFINED #define htonl(x) __htonl(x) #define htons(x) __htons(x) #define ntohl(x) __ntohl(x) #define ntohs(x) __ntohs(x) #endif /* !_BYTEORDER_FUNC_DEFINED */ #endif /* lint */ #endif /* _KERNEL */ /* * Scale factor for scaled integers used to count %cpu time and load avgs. * * The number of CPU `tick's that map to a unique `%age' can be expressed * by the formula (1 / (2 ^ (FSHIFT - 11))). The maximum load average that * can be calculated (assuming 32 bits) can be closely approximated using * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15). * * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age', * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024. */ #define FSHIFT 11 /* bits to right of fixed binary point */ #define FSCALE (1<> (PAGE_SHIFT - DEV_BSHIFT)) #define ctodb(db) /* calculates pages to devblks */ \ ((db) << (PAGE_SHIFT - DEV_BSHIFT)) /* * Old spelling of __containerof(). */ #define member2struct(s, m, x) \ ((struct s *)(void *)((char *)(x) - offsetof(struct s, m))) /* * Access a variable length array that has been declared as a fixed * length array. 
*/ #define __PAST_END(array, offset) (((__typeof__(*(array)) *)(array))[offset]) #endif /* _SYS_PARAM_H_ */ Index: head/tests/sys/Makefile =================================================================== --- head/tests/sys/Makefile (revision 299089) +++ head/tests/sys/Makefile (revision 299090) @@ -1,28 +1,29 @@ # $FreeBSD$ .include PACKAGE= tests FILESGROUPS= TESTS TESTSPACKAGE= ${PACKAGE} TESTSDIR= ${TESTSBASE}/sys TESTS_SUBDIRS+= acl TESTS_SUBDIRS+= aio TESTS_SUBDIRS+= fifo TESTS_SUBDIRS+= file TESTS_SUBDIRS+= geom TESTS_SUBDIRS+= kern TESTS_SUBDIRS+= kqueue TESTS_SUBDIRS+= mac TESTS_SUBDIRS+= mqueue TESTS_SUBDIRS+= netinet TESTS_SUBDIRS+= opencrypto TESTS_SUBDIRS+= posixshm +TESTS_SUBDIRS+= sys TESTS_SUBDIRS+= vfs TESTS_SUBDIRS+= vm # Items not integrated into kyua runs by default SUBDIR+= pjdfstest .include Index: head/tests/sys/sys/Makefile =================================================================== --- head/tests/sys/sys/Makefile (nonexistent) +++ head/tests/sys/sys/Makefile (revision 299090) @@ -0,0 +1,13 @@ +# $FreeBSD$ + +PACKAGE= tests +FILESGROUPS= TESTS +TESTSPACKAGE= ${PACKAGE} + +TESTSDIR= ${TESTSBASE}/sys/sys + +ATF_TESTS_C= bitstring_test + +WARNS?= 5 + +.include Property changes on: head/tests/sys/sys/Makefile ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: head/tests/sys/sys/bitstring_test.c =================================================================== --- head/tests/sys/sys/bitstring_test.c (nonexistent) +++ head/tests/sys/sys/bitstring_test.c (revision 299090) @@ -0,0 +1,359 @@ +/*- + * Copyright (c) 2014 Spectra Logic Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. 
+ * + * $FreeBSD$ + */ +#include + +#include +#include + +#include + +typedef void (testfunc_t)(bitstr_t *bstr, int nbits, const char *memloc); + +static void +bitstring_run_stack_test(testfunc_t *test, int nbits) +{ + bitstr_t bit_decl(bitstr, nbits); + + test(bitstr, nbits, "stack"); +} + +static void +bitstring_run_heap_test(testfunc_t *test, int nbits) +{ + bitstr_t *bitstr = bit_alloc(nbits); + + test(bitstr, nbits, "heap"); +} + +static void +bitstring_test_runner(testfunc_t *test) +{ + const int bitstr_sizes[] = { + 0, + 1, + _BITSTR_BITS - 1, + _BITSTR_BITS, + _BITSTR_BITS + 1, + 2 * _BITSTR_BITS - 1, + 2 * _BITSTR_BITS, + 1023, + 1024 + }; + + for (unsigned long i = 0; i < nitems(bitstr_sizes); i++) { + bitstring_run_stack_test(test, bitstr_sizes[i]); + bitstring_run_heap_test(test, bitstr_sizes[i]); + } +} + +#define BITSTRING_TC_DEFINE(name) \ +ATF_TC_WITHOUT_HEAD(name); \ +static testfunc_t name ## _test; \ + \ +ATF_TC_BODY(name, tc) \ +{ \ + bitstring_test_runner(name ## _test); \ +} \ + \ +static void \ +name ## _test(bitstr_t *bitstr, int nbits, const char *memloc) + +#define BITSTRING_TC_ADD(tp, name) \ +do { \ + ATF_TP_ADD_TC(tp, name); \ +} while (0) + +ATF_TC_WITHOUT_HEAD(bitstr_in_struct); +ATF_TC_BODY(bitstr_in_struct, tc) +{ + struct bitstr_containing_struct { + bitstr_t bit_decl(bitstr, 8); + } test_struct; + + bit_nclear(test_struct.bitstr, 0, 8); +} + +BITSTRING_TC_DEFINE(bit_set) +/* bitstr_t *bitstr, int nbits, const char *memloc */ +{ + memset(bitstr, 0, bitstr_size(nbits)); + + for (int i = 0; i < nbits; i++) { + bit_set(bitstr, i); + + for (int j = 0; j < nbits; j++) { + ATF_REQUIRE_MSG(bit_test(bitstr, j) == (j == i) ? 1 : 0, + "bit_set_%d_%s: Failed on bit %d", + nbits, memloc, i); + } + + bit_clear(bitstr, i); + } +} + +BITSTRING_TC_DEFINE(bit_clear) +/* bitstr_t *bitstr, int nbits, const char *memloc */ +{ + int i, j; + + memset(bitstr, 0xFF, bitstr_size(nbits)); + for (i = 0; i < nbits; i++) { + bit_clear(bitstr, i); + + for (j = 0; j < nbits; j++) { + ATF_REQUIRE_MSG(bit_test(bitstr, j) == (j == i) ? 
0 : 1, + "bit_clear_%d_%s: Failed on bit %d", + nbits, memloc, i); + } + + bit_set(bitstr, i); + } +} + +BITSTRING_TC_DEFINE(bit_ffs) +/* bitstr_t *bitstr, int nbits, const char *memloc */ +{ + int i; + int found_set_bit; + + memset(bitstr, 0, bitstr_size(nbits)); + bit_ffs(bitstr, nbits, &found_set_bit); + ATF_REQUIRE_MSG(found_set_bit == -1, + "bit_ffs_%d_%s: Failed all clear bits.", nbits, memloc); + + for (i = 0; i < nbits; i++) { + memset(bitstr, 0xFF, bitstr_size(nbits)); + if (i > 0) + bit_nclear(bitstr, 0, i - 1); + + bit_ffs(bitstr, nbits, &found_set_bit); + ATF_REQUIRE_MSG(found_set_bit == i, + "bit_ffs_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_set_bit); + } +} + +BITSTRING_TC_DEFINE(bit_ffc) +/* bitstr_t *bitstr, int nbits, const char *memloc */ +{ + int i; + int found_clear_bit; + + memset(bitstr, 0xFF, bitstr_size(nbits)); + bit_ffc(bitstr, nbits, &found_clear_bit); + ATF_REQUIRE_MSG(found_clear_bit == -1, + "bit_ffc_%d_%s: Failed all set bits.", nbits, memloc); + + for (i = 0; i < nbits; i++) { + memset(bitstr, 0, bitstr_size(nbits)); + if (i > 0) + bit_nset(bitstr, 0, i - 1); + + bit_ffc(bitstr, nbits, &found_clear_bit); + ATF_REQUIRE_MSG(found_clear_bit == i, + "bit_ffc_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_clear_bit); + } +} + +BITSTRING_TC_DEFINE(bit_ffs_at) +/* bitstr_t *bitstr, int nbits, const char *memloc */ +{ + int i; + int found_set_bit; + + memset(bitstr, 0xFF, bitstr_size(nbits)); + for (i = 0; i < nbits; i++) { + bit_ffs_at(bitstr, i, nbits, &found_set_bit); + ATF_REQUIRE_MSG(found_set_bit == i, + "bit_ffs_at_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_set_bit); + } + + memset(bitstr, 0, bitstr_size(nbits)); + for (i = 0; i < nbits; i++) { + bit_ffs_at(bitstr, i, nbits, &found_set_bit); + ATF_REQUIRE_MSG(found_set_bit == -1, + "bit_ffs_at_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_set_bit); + } + + memset(bitstr, 0x55, bitstr_size(nbits)); + for (i = 0; i < nbits; i++) { + bit_ffs_at(bitstr, i, nbits, &found_set_bit); + if (i == nbits - 1 && (nbits & 1) == 0) { + ATF_REQUIRE_MSG(found_set_bit == -1, + "bit_ffs_at_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_set_bit); + } else { + ATF_REQUIRE_MSG(found_set_bit == i + (i & 1), + "bit_ffs_at_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_set_bit); + } + } + + memset(bitstr, 0xAA, bitstr_size(nbits)); + for (i = 0; i < nbits; i++) { + bit_ffs_at(bitstr, i, nbits, &found_set_bit); + if (i == nbits - 1 && (nbits & 1) != 0) { + ATF_REQUIRE_MSG(found_set_bit == -1, + "bit_ffs_at_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_set_bit); + } else { + ATF_REQUIRE_MSG( + found_set_bit == i + ((i & 1) ? 
0 : 1), + "bit_ffs_at_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_set_bit); + } + } +} + +BITSTRING_TC_DEFINE(bit_ffc_at) +/* bitstr_t *bitstr, int nbits, const char *memloc */ +{ + int i, found_clear_bit; + + memset(bitstr, 0, bitstr_size(nbits)); + for (i = 0; i < nbits; i++) { + bit_ffc_at(bitstr, i, nbits, &found_clear_bit); + ATF_REQUIRE_MSG(found_clear_bit == i, + "bit_ffc_at_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_clear_bit); + } + + memset(bitstr, 0xFF, bitstr_size(nbits)); + for (i = 0; i < nbits; i++) { + bit_ffc_at(bitstr, i, nbits, &found_clear_bit); + ATF_REQUIRE_MSG(found_clear_bit == -1, + "bit_ffc_at_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_clear_bit); + } + + memset(bitstr, 0x55, bitstr_size(nbits)); + for (i = 0; i < nbits; i++) { + bit_ffc_at(bitstr, i, nbits, &found_clear_bit); + if (i == nbits - 1 && (nbits & 1) != 0) { + ATF_REQUIRE_MSG(found_clear_bit == -1, + "bit_ffc_at_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_clear_bit); + } else { + ATF_REQUIRE_MSG( + found_clear_bit == i + ((i & 1) ? 0 : 1), + "bit_ffc_at_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_clear_bit); + } + } + + memset(bitstr, 0xAA, bitstr_size(nbits)); + for (i = 0; i < nbits; i++) { + bit_ffc_at(bitstr, i, nbits, &found_clear_bit); + if (i == nbits - 1 && (nbits & 1) == 0) { + ATF_REQUIRE_MSG(found_clear_bit == -1, + "bit_ffc_at_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_clear_bit); + } else { + ATF_REQUIRE_MSG(found_clear_bit == i + (i & 1), + "bit_ffc_at_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_clear_bit); + } + } +} + +BITSTRING_TC_DEFINE(bit_nclear) +/* bitstr_t *bitstr, int nbits, const char *memloc */ +{ + int i, j; + int found_set_bit; + int found_clear_bit; + + for (i = 0; i < nbits; i++) { + for (j = i; j < nbits; j++) { + memset(bitstr, 0xFF, bitstr_size(nbits)); + bit_nclear(bitstr, i, j); + + bit_ffc(bitstr, nbits, &found_clear_bit); + ATF_REQUIRE_MSG( + found_clear_bit == i, + "bit_nclear_%d_%d_%d%s: Failed with result %d", + nbits, i, j, memloc, found_clear_bit); + + bit_ffs_at(bitstr, i, nbits, &found_set_bit); + ATF_REQUIRE_MSG( + (j + 1 < nbits) ? found_set_bit == j + 1 : -1, + "bit_nset_%d_%d_%d%s: Failed with result %d", + nbits, i, j, memloc, found_set_bit); + } + } +} + +BITSTRING_TC_DEFINE(bit_nset) +/* bitstr_t *bitstr, int nbits, const char *memloc */ +{ + int i, j; + int found_set_bit; + int found_clear_bit; + + for (i = 0; i < nbits; i++) { + for (j = i; j < nbits; j++) { + memset(bitstr, 0, bitstr_size(nbits)); + bit_nset(bitstr, i, j); + + bit_ffs(bitstr, nbits, &found_set_bit); + ATF_REQUIRE_MSG( + found_set_bit == i, + "bit_nset_%d_%d_%d%s: Failed with result %d", + nbits, i, j, memloc, found_set_bit); + + bit_ffc_at(bitstr, i, nbits, &found_clear_bit); + ATF_REQUIRE_MSG( + (j + 1 < nbits) ? 
found_clear_bit == j + 1 : -1, + "bit_nset_%d_%d_%d%s: Failed with result %d", + nbits, i, j, memloc, found_clear_bit); + } + } +} + +ATF_TP_ADD_TCS(tp) +{ + + ATF_TP_ADD_TC(tp, bitstr_in_struct); + BITSTRING_TC_ADD(tp, bit_set); + BITSTRING_TC_ADD(tp, bit_clear); + BITSTRING_TC_ADD(tp, bit_ffs); + BITSTRING_TC_ADD(tp, bit_ffc); + BITSTRING_TC_ADD(tp, bit_ffs_at); + BITSTRING_TC_ADD(tp, bit_ffc_at); + BITSTRING_TC_ADD(tp, bit_nclear); + BITSTRING_TC_ADD(tp, bit_nset); + + return (atf_no_error()); +} Property changes on: head/tests/sys/sys/bitstring_test.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: head/usr.sbin/bluetooth/hccontrol/le.c =================================================================== --- head/usr.sbin/bluetooth/hccontrol/le.c (revision 299089) +++ head/usr.sbin/bluetooth/hccontrol/le.c (revision 299090) @@ -1,356 +1,356 @@ /* * le.c * * Copyright (c) 2015 Takanori Watanabe * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $Id: hccontrol.c,v 1.5 2003/09/05 00:38:24 max Exp $ * $FreeBSD$ */ #include #include #include -#include #include #include +#include #include #include #include #include #include #include #include #include #include #define L2CAP_SOCKET_CHECKED #include #include "hccontrol.h" static int le_set_scan_param(int s, int argc, char *argv[]); static int le_set_scan_enable(int s, int argc, char *argv[]); static int parse_param(int argc, char *argv[], char *buf, int *len); static int le_set_scan_response(int s, int argc, char *argv[]); static int le_read_supported_status(int s, int argc, char *argv[]); static int le_read_local_supported_features(int s, int argc ,char *argv[]); static int set_le_event_mask(int s, uint64_t mask); static int set_event_mask(int s, uint64_t mask); static int le_enable(int s, int argc, char *argv[]); static int le_set_scan_param(int s, int argc, char *argv[]) { int type; int interval; int window; int adrtype; int policy; int e, n; ng_hci_le_set_scan_parameters_cp cp; ng_hci_le_set_scan_parameters_rp rp; if (argc != 5) return USAGE; if (strcmp(argv[0], "active") == 0) type = 1; else if (strcmp(argv[0], "passive") == 0) type = 0; else return USAGE; interval = (int)(atof(argv[1])/0.625); interval = (interval < 4)? 4: interval; window = (int)(atof(argv[2])/0.625); window = (window < 4) ? 4 : interval; if (strcmp(argv[3], "public") == 0) adrtype = 0; else if (strcmp(argv[3], "random") == 0) adrtype = 1; else return USAGE; if (strcmp(argv[4], "all") == 0) policy = 0; else if (strcmp(argv[4], "whitelist") == 0) policy = 1; else return USAGE; cp.le_scan_type = type; cp.le_scan_interval = interval; cp.own_address_type = adrtype; cp.le_scan_window = window; cp.scanning_filter_policy = policy; n = sizeof(rp); e = hci_request(s, NG_HCI_OPCODE(NG_HCI_OGF_LE, NG_HCI_OCF_LE_SET_SCAN_PARAMETERS), (void *)&cp, sizeof(cp), (void *)&rp, &n); return 0; } static int le_set_scan_enable(int s, int argc, char *argv[]) { ng_hci_le_set_scan_enable_cp cp; ng_hci_le_set_scan_enable_rp rp; int e, n, enable = 0; if (argc != 1) return USAGE; if (strcmp(argv[0], "enable") == 0) enable = 1; else if (strcmp(argv[0], "disable") != 0) return USAGE; n = sizeof(rp); cp.le_scan_enable = enable; cp.filter_duplicates = 0; e = hci_request(s, NG_HCI_OPCODE(NG_HCI_OGF_LE, NG_HCI_OCF_LE_SET_SCAN_ENABLE), (void *)&cp, sizeof(cp), (void *)&rp, &n); if (e != 0 || rp.status != 0) return ERROR; return OK; } static int parse_param(int argc, char *argv[], char *buf, int *len) { char *buflast = buf + (*len); char *curbuf = buf; char *token,*lenpos; int ch; int datalen; uint16_t value; optreset = 1; optind = 0; while ((ch = getopt(argc, argv , "n:f:u:")) != -1) { switch(ch){ case 'n': datalen = strlen(optarg); if ((curbuf + datalen + 2) >= buflast) goto done; curbuf[0] = datalen + 1; curbuf[1] = 8; curbuf += 2; memcpy(curbuf, optarg, datalen); curbuf += datalen; break; case 'f': if (curbuf+3 > buflast) goto done; curbuf[0] = 2; curbuf[1] = 1; curbuf[2] = atoi(optarg); curbuf += 3; break; case 'u': lenpos = buf; if ((buf+2) >= buflast) goto done; curbuf[1] = 2; *lenpos = 1; curbuf += 2; while ((token = strsep(&optarg, ",")) != NULL) { value = strtol(token, NULL, 16); if ((curbuf+2) >= buflast) break; curbuf[0] = value &0xff; curbuf[1] = (value>>8)&0xff; curbuf += 2; } } } done: *len = curbuf - buf; return OK; } static int le_set_scan_response(int s, int argc, char *argv[]) { ng_hci_le_set_scan_response_data_cp cp; ng_hci_le_set_scan_response_data_rp rp; int n; int e; int len; char buf[NG_HCI_ADVERTISING_DATA_SIZE]; len = 
sizeof(buf); parse_param(argc, argv, buf, &len); memset(cp.scan_response_data, 0, sizeof(cp.scan_response_data)); cp.scan_response_data_length = len; memcpy(cp.scan_response_data, buf, len); n = sizeof(rp); e = hci_request(s, NG_HCI_OPCODE(NG_HCI_OGF_LE, NG_HCI_OCF_LE_SET_SCAN_RESPONSE_DATA), (void *)&cp, sizeof(cp), (void *)&rp, &n); printf("SET SCAN RESPONSE %d %d %d\n", e, rp.status, n); return OK; } static int le_read_local_supported_features(int s, int argc ,char *argv[]) { ng_hci_le_read_local_supported_features_rp rp; int e; int n = sizeof(rp); e = hci_simple_request(s, NG_HCI_OPCODE(NG_HCI_OGF_LE, NG_HCI_OCF_LE_READ_LOCAL_SUPPORTED_FEATURES), (void *)&rp, &n); printf("LOCAL SUPPORTED: %d %d %jx\n", e, rp.status, (uintmax_t) rp.le_features); return 0; } static int le_read_supported_status(int s, int argc, char *argv[]) { ng_hci_le_read_supported_status_rp rp; int e; int n = sizeof(rp); e = hci_simple_request(s, NG_HCI_OPCODE( NG_HCI_OGF_LE, NG_HCI_OCF_LE_READ_SUPPORTED_STATUS), (void *)&rp, &n); printf("LE_STATUS: %d %d %jx\n", e, rp.status, (uintmax_t)rp.le_status); return 0; } static int set_le_event_mask(int s, uint64_t mask) { ng_hci_le_set_event_mask_cp semc; ng_hci_le_set_event_mask_rp rp; int i, n ,e; n = sizeof(rp); for (i=0; i < NG_HCI_LE_EVENT_MASK_SIZE; i++) { semc.event_mask[i] = mask&0xff; mask >>= 8; } e = hci_request(s, NG_HCI_OPCODE(NG_HCI_OGF_LE, NG_HCI_OCF_LE_SET_EVENT_MASK), (void *)&semc, sizeof(semc), (void *)&rp, &n); return 0; } static int set_event_mask(int s, uint64_t mask) { ng_hci_set_event_mask_cp semc; ng_hci_set_event_mask_rp rp; int i, n, e; n = sizeof(rp); for (i=0; i < NG_HCI_EVENT_MASK_SIZE; i++) { semc.event_mask[i] = mask&0xff; mask >>= 8; } e = hci_request(s, NG_HCI_OPCODE(NG_HCI_OGF_HC_BASEBAND, NG_HCI_OCF_SET_EVENT_MASK), (void *)&semc, sizeof(semc), (void *)&rp, &n); return 0; } static int le_enable(int s, int argc, char *argv[]) { if (argc != 1) return USAGE; if (strcasecmp(argv[0], "enable") == 0) { set_event_mask(s, NG_HCI_EVENT_MASK_DEFAULT | NG_HCI_EVENT_MASK_LE); set_le_event_mask(s, NG_HCI_LE_EVENT_MASK_ALL); } else if (strcasecmp(argv[0], "disble") == 0) set_event_mask(s, NG_HCI_EVENT_MASK_DEFAULT); else return USAGE; return OK; } struct hci_command le_commands[] = { { "le_enable", "le_enable [enable|disable] \n" "Enable LE event ", &le_enable, }, { "le_read_local_supported_features", "le_read_local_supported_features\n" "read local supported features mask", &le_read_local_supported_features, }, { "le_read_supported_status", "le_read_supported_status\n" "read supported status" , &le_read_supported_status, }, { "le_set_scan_response", "le_set_scan_response -n $name -f $flag -u $uuid16,$uuid16 \n" "set LE scan response data" , &le_set_scan_response, }, { "le_set_scan_enable", "le_set_scan_enable [enable|disable] \n" "enable or disable LE device scan", &le_set_scan_enable }, { "le_set_scan_param", "le_set_scan_param [active|passive] interval(ms) window(ms) [public|random] [all|whitelist] \n" "set LE device scan parameter", &le_set_scan_param }, };
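The rewritten sys/sys/bitstring.h above replaces the old byte-wide macro interface with inline functions that operate on longs; userland keeps a one-argument, calloc()-backed bit_alloc(), while the kernel variant now takes a malloc type and flags, as the flowtable.c hunk that calls bit_alloc(ft->ft_size, M_FTABLE, M_WAITOK) shows. The short program below is a minimal usage sketch of the userland interface only, assuming headers containing this revision are installed; the program, its file name, the bit indices and the 130-bit size are illustrative and are not part of the commit.

/*
 * bitstring_demo.c (hypothetical): exercises bit_alloc(), bit_set(),
 * bit_ffs_at(), bit_nclear(), bit_ffs() and bit_ffc() as declared in the
 * revised header.  Indices and sizes are arbitrary illustration values.
 */
#include <bitstring.h>
#include <stdio.h>
#include <stdlib.h>	/* calloc()/free(); may already be pulled in */
#include <strings.h>	/* ffsl(); may already be pulled in */

int
main(void)
{
	const int nbits = 130;		/* more than 128, so several underlying words */
	int first_set, first_clear;
	bitstr_t *bs;

	bs = bit_alloc(nbits);		/* userland form: one argument, zero-filled */
	if (bs == NULL)
		return (1);

	bit_set(bs, 0);
	bit_set(bs, 64);		/* past the first 64-bit word on LP64 */
	bit_set(bs, 129);

	bit_ffs_at(bs, 1, nbits, &first_set);
	printf("first set bit at or after 1: %d\n", first_set);	/* 64 */

	bit_nclear(bs, 0, 64);		/* stop index is inclusive, so bit 64 is cleared too */
	bit_ffs(bs, nbits, &first_set);
	printf("first set bit: %d\n", first_set);		/* 129 */

	bit_ffc(bs, nbits, &first_clear);
	printf("first clear bit: %d\n", first_clear);		/* 0 */

	free(bs);
	return (0);
}

Built against headers that contain this revision, the sketch prints 64, 129 and 0.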
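The stack-declared bit_decl() form can be sketched the same way: the string is cleared explicitly with memset() and bitstr_size() before use, following the pattern of the new bitstring_test.c cases, and bit_nset() with bit_ffc_at() then shows that the range operations treat the stop index as inclusive. As before, this is an illustrative sketch under the assumption that the revised headers are installed; the file name, sizes and indices are made up.

/*
 * bit_decl_demo.c (hypothetical): stack-allocated bit string, cleared
 * with memset()/bitstr_size(), then exercised with bit_nset(),
 * bit_ffc_at() and bit_test().
 */
#include <bitstring.h>
#include <stdio.h>
#include <string.h>	/* memset() */
#include <strings.h>	/* ffsl(); may already be pulled in */

int
main(void)
{
	const int nbits = 70;			/* spans more than one underlying long */
	bitstr_t bit_decl(bs, 70);		/* stack storage; cleared below before use */
	int first_clear;

	memset(bs, 0, bitstr_size(nbits));	/* start with every bit clear */

	bit_nset(bs, 3, 66);			/* set bits 3..66 inclusive */

	bit_ffc_at(bs, 3, nbits, &first_clear);
	printf("first clear bit at or after 3: %d\n", first_clear);	/* 67 */

	if (bit_test(bs, 66) && !bit_test(bs, 67))
		printf("stop index 66 was set, 67 left untouched\n");

	return (0);
}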