Index: projects/building-blocks/etc/rc.d/routing =================================================================== --- projects/building-blocks/etc/rc.d/routing (revision 278311) +++ projects/building-blocks/etc/rc.d/routing (revision 278312) @@ -1,379 +1,380 @@ #!/bin/sh # # Configure routing and miscellaneous network tunables # # $FreeBSD$ # # PROVIDE: routing # REQUIRE: netif # KEYWORD: nojailvnet . /etc/rc.subr . /etc/network.subr name="routing" start_cmd="routing_start doall" stop_cmd="routing_stop" extra_commands="options static" static_cmd="routing_start static" options_cmd="routing_start options" ROUTE_CMD="/sbin/route" routing_start() { local _cmd _af _if _a _ret _cmd=$1 _af=$2 _if=$3 _ret=0 case $_if in ""|[Aa][Ll][Ll]|[Aa][Nn][Yy]) _if="" ;; esac case $_af in ""|[Aa][Ll][Ll]|[Aa][Nn][Yy]) for _a in inet inet6 atm; do afexists $_a || continue setroutes $_cmd $_a $_if || _ret=1 done ;; *) if afexists $_af; then setroutes $_cmd $_af $_if || _ret=1 else err 1 "Unsupported address family: $_af." fi ;; esac return $_ret } routing_stop() { local _af _if _a _af=$1 _if=$2 case $_if in ""|[Aa][Ll][Ll]|[Aa][Nn][Yy]) _if="" ;; esac case $_af in ""|[Aa][Ll][Ll]|[Aa][Nn][Yy]) for _a in inet inet6 atm; do afexists $_a || continue eval static_${_a} delete $_if # When $_if is specified, do not flush routes. if ! [ -n "$_if" ]; then eval routing_stop_${_a} fi done ;; *) if afexists $_af; then eval static_${_af} delete $_if # When $_if is specified, do not flush routes. if ! [ -n "$_if" ]; then eval routing_stop_${_af} fi else err 1 "Unsupported address family: $_af." fi ;; esac } setroutes() { case $1 in static) static_$2 add $3 ;; options) options_$2 ;; doall) static_$2 add $3 options_$2 ;; esac } routing_stop_inet() { ${ROUTE_CMD} -n flush -inet } routing_stop_inet6() { local i ${ROUTE_CMD} -n flush -inet6 for i in `list_net_interfaces`; do if ipv6if $i; then ifconfig $i inet6 -defaultif fi done } routing_stop_atm() { return 0 } static_inet() { local _action _if _skip _action=$1 _if=$2 # Add default route. case ${defaultrouter} in [Nn][Oo] | '') ;; *) static_routes="${static_routes} _default" route__default="default ${defaultrouter}" ;; esac # Install configured routes. if [ -n "${static_routes}" ]; then for i in ${static_routes}; do _skip=0 if [ -n "$_if" ]; then case $i in *:$_if) ;; *) _skip=1 ;; esac fi if [ $_skip = 0 ]; then route_args=`get_if_var ${i%:*} route_IF` if [ -n "$route_args" ]; then ${ROUTE_CMD} ${_action} ${route_args} else warn "route_${i%:*} not found." fi fi done fi } static_inet6() { - local _action _if _skip fibmod fibs + local _action _if _skip fibmod fibs allfibs _action=$1 _if=$2 # get the number of FIBs supported. fibs=$((`${SYSCTL_N} net.fibs` - 1)) - if [ "$fibs" -gt 0 ]; then + allfibs=`${SYSCTL_N} net.add_addr_allfibs` + if [ "$fibs" -gt 0 ] && [ "$allfibs" -ne 0 ]; then fibmod="-fib 0-$fibs" else fibmod= fi # Add pre-defined static routes first. ipv6_static_routes="_v4mapped _v4compat ${ipv6_static_routes}" ipv6_static_routes="_lla _llma ${ipv6_static_routes}" # disallow "internal" addresses to appear on the wire ipv6_route__v4mapped="::ffff:0.0.0.0 -prefixlen 96 ::1 -reject ${fibmod}" ipv6_route__v4compat="::0.0.0.0 -prefixlen 96 ::1 -reject ${fibmod}" # Disallow link-local unicast packets without outgoing scope # identifiers. However, if you set "ipv6_default_interface", # for the host case, you will allow to omit the identifiers. # Under this configuration, the packets will go to the default # interface. ipv6_route__lla="fe80:: -prefixlen 10 ::1 -reject ${fibmod}" ipv6_route__llma="ff02:: -prefixlen 16 ::1 -reject ${fibmod}" # Add default route. case ${ipv6_defaultrouter} in [Nn][Oo] | '') ;; *) ipv6_static_routes="${ipv6_static_routes} _default" ipv6_route__default="default ${ipv6_defaultrouter}" ;; esac # Install configured routes. if [ -n "${ipv6_static_routes}" ]; then for i in ${ipv6_static_routes}; do _skip=0 if [ -n "$_if" ]; then case $i in *:$_if) ;; *) _skip=1 ;; esac fi if [ $_skip = 0 ]; then ipv6_route_args=`get_if_var ${i%:*} ipv6_route_IF` if [ -n "$ipv6_route_args" ]; then ${ROUTE_CMD} ${_action} \ -inet6 ${ipv6_route_args} else warn "route_${i%:*} not found" fi fi done fi # Install the "default interface" to kernel, which will be used # as the default route when there's no router. # Disable installing the default interface when we act # as router to avoid conflict between the default # router list and the manual configured default route. if checkyesno ipv6_gateway_enable; then return fi case "${ipv6_default_interface}" in [Nn][Oo] | [Nn][Oo][Nn][Ee]) return ;; [Aa][Uu][Tt][Oo] | "") for i in ${ipv6_network_interfaces}; do case $i in [Nn][Oo][Nn][Ee]) return ;; lo0) continue ;; esac laddr=`network6_getladdr $i exclude_tentative` case ${laddr} in '') ;; *) ipv6_default_interface=$i break ;; esac done ;; esac ifconfig ${ipv6_default_interface} inet6 defaultif sysctl net.inet6.ip6.use_defaultzone=1 } static_atm() { local _action i route_args _action=$1 if [ -n "${natm_static_routes}" ]; then for i in ${natm_static_routes}; do route_args=`get_if_var $i route_IF` if [ -n "$route_args" ]; then atmconfig natm ${_action} ${route_args} else warn "route_${i} not found." fi done fi } ropts_init() { if [ -z "${_ropts_initdone}" ]; then echo -n "Additional $1 routing options:" _ropts_initdone=yes fi } options_inet() { _ropts_initdone= if checkyesno icmp_bmcastecho; then ropts_init inet echo -n ' broadcast ping responses=YES' ${SYSCTL} net.inet.icmp.bmcastecho=1 > /dev/null else ${SYSCTL} net.inet.icmp.bmcastecho=0 > /dev/null fi if checkyesno icmp_drop_redirect; then ropts_init inet echo -n ' ignore ICMP redirect=YES' ${SYSCTL} net.inet.icmp.drop_redirect=1 > /dev/null else ${SYSCTL} net.inet.icmp.drop_redirect=0 > /dev/null fi if checkyesno icmp_log_redirect; then ropts_init inet echo -n ' log ICMP redirect=YES' ${SYSCTL} net.inet.icmp.log_redirect=1 > /dev/null else ${SYSCTL} net.inet.icmp.log_redirect=0 > /dev/null fi if checkyesno gateway_enable; then ropts_init inet echo -n ' gateway=YES' ${SYSCTL} net.inet.ip.forwarding=1 > /dev/null else ${SYSCTL} net.inet.ip.forwarding=0 > /dev/null fi if checkyesno forward_sourceroute; then ropts_init inet echo -n ' do source routing=YES' ${SYSCTL} net.inet.ip.sourceroute=1 > /dev/null else ${SYSCTL} net.inet.ip.sourceroute=0 > /dev/null fi if checkyesno accept_sourceroute; then ropts_init inet echo -n ' accept source routing=YES' ${SYSCTL} net.inet.ip.accept_sourceroute=1 > /dev/null else ${SYSCTL} net.inet.ip.accept_sourceroute=0 > /dev/null fi if checkyesno arpproxy_all; then ropts_init inet echo -n ' ARP proxyall=YES' ${SYSCTL} net.link.ether.inet.proxyall=1 > /dev/null else ${SYSCTL} net.link.ether.inet.proxyall=0 > /dev/null fi [ -n "${_ropts_initdone}" ] && echo '.' } options_inet6() { _ropts_initdone= if checkyesno ipv6_gateway_enable; then ropts_init inet6 echo -n ' gateway=YES' ${SYSCTL} net.inet6.ip6.forwarding=1 > /dev/null else ${SYSCTL} net.inet6.ip6.forwarding=0 > /dev/null fi [ -n "${_ropts_initdone}" ] && echo '.' } options_atm() { _ropts_initdone= [ -n "${_ropts_initdone}" ] && echo '.' } load_rc_config $name run_rc_command "$@" Index: projects/building-blocks/etc =================================================================== --- projects/building-blocks/etc (revision 278311) +++ projects/building-blocks/etc (revision 278312) Property changes on: projects/building-blocks/etc ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/etc:r278256-278311 Index: projects/building-blocks/lib/csu/powerpc64/Makefile =================================================================== --- projects/building-blocks/lib/csu/powerpc64/Makefile (revision 278311) +++ projects/building-blocks/lib/csu/powerpc64/Makefile (revision 278312) @@ -1,48 +1,52 @@ # $FreeBSD$ .PATH: ${.CURDIR}/../common SRCS= crt1.c crti.S crtn.S OBJS= ${SRCS:N*.h:R:S/$/.o/g} OBJS+= Scrt1.o gcrt1.o CFLAGS+= -I${.CURDIR}/../common \ -I${.CURDIR}/../../libc/include \ -mlongcall +# XXX: See the log for r232932 as to why the above -mlongcall is needed. Since +# clang doesn't support -mlongcall, and testing shows a clang linked with a +# clang-built csu segfaults, this must currently be compiled with gcc. Once +# clang supports -mlongcall, or we get a fixed ld, this can be revisited. CC:= gcc COMPILER_TYPE:= gcc all: ${OBJS} CLEANFILES= ${OBJS} CLEANFILES+= crt1.s gcrt1.s Scrt1.s # See the comment in lib/csu/common/crtbrand.c for the reason crt1.c is not # directly compiled to .o files. crt1.s: crt1.c ${CC} ${CFLAGS} -S -o ${.TARGET} ${.CURDIR}/crt1.c sed ${SED_FIX_NOTE} ${.TARGET} crt1.o: crt1.s ${CC} ${ACFLAGS} -c -o ${.TARGET} crt1.s gcrt1.s: crt1.c ${CC} ${CFLAGS} -DGCRT -S -o ${.TARGET} ${.CURDIR}/crt1.c sed ${SED_FIX_NOTE} ${.TARGET} gcrt1.o: gcrt1.s ${CC} ${ACFLAGS} -c -o ${.TARGET} gcrt1.s Scrt1.s: crt1.c ${CC} ${CFLAGS} -fPIC -DPIC -S -o ${.TARGET} ${.CURDIR}/crt1.c sed ${SED_FIX_NOTE} ${.TARGET} Scrt1.o: Scrt1.s ${CC} ${ACFLAGS} -c -o ${.TARGET} Scrt1.s realinstall: ${INSTALL} -o ${LIBOWN} -g ${LIBGRP} -m ${LIBMODE} \ ${OBJS} ${DESTDIR}${LIBDIR} .include Index: projects/building-blocks/lib/libc/gen/disklabel.c =================================================================== --- projects/building-blocks/lib/libc/gen/disklabel.c (revision 278311) +++ projects/building-blocks/lib/libc/gen/disklabel.c (revision 278312) @@ -1,159 +1,167 @@ /* * Copyright (c) 1983, 1987, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if defined(LIBC_SCCS) && !defined(lint) static char sccsid[] = "@(#)disklabel.c 8.2 (Berkeley) 5/3/95"; #endif /* LIBC_SCCS and not lint */ #include __FBSDID("$FreeBSD$"); #include #define DKTYPENAMES #define FSTYPENAMES #include #include #include #include #include #include #include static int gettype(char *t, const char **names) { const char **nm; for (nm = names; *nm; nm++) if (strcasecmp(t, *nm) == 0) return (nm - names); if (isdigit((unsigned char)*t)) return (atoi(t)); return (0); } struct disklabel * getdiskbyname(const char *name) { static struct disklabel disk; struct disklabel *dp = &disk; struct partition *pp; char *buf; char *db_array[2] = { _PATH_DISKTAB, 0 }; char *cp, *cq; /* can't be register */ char p, max, psize[3], pbsize[3], pfsize[3], poffset[3], ptype[3]; u_int32_t *dx; if (cgetent(&buf, db_array, (char *) name) < 0) return NULL; bzero((char *)&disk, sizeof(disk)); /* * typename */ cq = dp->d_typename; cp = buf; while (cq < dp->d_typename + sizeof(dp->d_typename) - 1 && (*cq = *cp) && *cq != '|' && *cq != ':') cq++, cp++; *cq = '\0'; - if (cgetstr(buf, "ty", &cq) > 0 && strcmp(cq, "removable") == 0) - dp->d_flags |= D_REMOVABLE; - else if (cq && strcmp(cq, "simulated") == 0) - dp->d_flags |= D_RAMDISK; + if (cgetstr(buf, "ty", &cq) > 0) { + if (strcmp(cq, "removable") == 0) + dp->d_flags |= D_REMOVABLE; + else if (cq && strcmp(cq, "simulated") == 0) + dp->d_flags |= D_RAMDISK; + free(cq); + } if (cgetcap(buf, "sf", ':') != NULL) dp->d_flags |= D_BADSECT; #define getnumdflt(field, dname, dflt) \ { long f; (field) = (cgetnum(buf, dname, &f) == -1) ? (dflt) : f; } getnumdflt(dp->d_secsize, "se", DEV_BSIZE); getnumdflt(dp->d_ntracks, "nt", 0); getnumdflt(dp->d_nsectors, "ns", 0); getnumdflt(dp->d_ncylinders, "nc", 0); - if (cgetstr(buf, "dt", &cq) > 0) + if (cgetstr(buf, "dt", &cq) > 0) { dp->d_type = gettype(cq, dktypenames); - else + free(cq); + } else getnumdflt(dp->d_type, "dt", 0); getnumdflt(dp->d_secpercyl, "sc", dp->d_nsectors * dp->d_ntracks); getnumdflt(dp->d_secperunit, "su", dp->d_secpercyl * dp->d_ncylinders); getnumdflt(dp->d_rpm, "rm", 3600); getnumdflt(dp->d_interleave, "il", 1); getnumdflt(dp->d_trackskew, "sk", 0); getnumdflt(dp->d_cylskew, "cs", 0); getnumdflt(dp->d_headswitch, "hs", 0); getnumdflt(dp->d_trkseek, "ts", 0); getnumdflt(dp->d_bbsize, "bs", BBSIZE); getnumdflt(dp->d_sbsize, "sb", 0); strcpy(psize, "px"); strcpy(pbsize, "bx"); strcpy(pfsize, "fx"); strcpy(poffset, "ox"); strcpy(ptype, "tx"); max = 'a' - 1; pp = &dp->d_partitions[0]; for (p = 'a'; p < 'a' + MAXPARTITIONS; p++, pp++) { long l; psize[1] = pbsize[1] = pfsize[1] = poffset[1] = ptype[1] = p; if (cgetnum(buf, psize, &l) == -1) pp->p_size = 0; else { pp->p_size = l; cgetnum(buf, poffset, &l); pp->p_offset = l; getnumdflt(pp->p_fsize, pfsize, 0); if (pp->p_fsize) { long bsize; if (cgetnum(buf, pbsize, &bsize) == 0) pp->p_frag = bsize / pp->p_fsize; else pp->p_frag = 8; } getnumdflt(pp->p_fstype, ptype, 0); - if (pp->p_fstype == 0 && cgetstr(buf, ptype, &cq) > 0) - pp->p_fstype = gettype(cq, fstypenames); + if (pp->p_fstype == 0) + if (cgetstr(buf, ptype, &cq) >= 0) { + pp->p_fstype = gettype(cq, fstypenames); + free(cq); + } max = p; } } dp->d_npartitions = max + 1 - 'a'; (void)strcpy(psize, "dx"); dx = dp->d_drivedata; for (p = '0'; p < '0' + NDDATA; p++, dx++) { psize[1] = p; getnumdflt(*dx, psize, 0); } dp->d_magic = DISKMAGIC; dp->d_magic2 = DISKMAGIC; free(buf); + (void)cgetclose(); return (dp); } Index: projects/building-blocks/lib/libc/stdlib/tdelete.c =================================================================== --- projects/building-blocks/lib/libc/stdlib/tdelete.c (revision 278311) +++ projects/building-blocks/lib/libc/stdlib/tdelete.c (revision 278312) @@ -1,71 +1,72 @@ /* $NetBSD: tdelete.c,v 1.2 1999/09/16 11:45:37 lukem Exp $ */ /* * Tree search generalized from Knuth (6.2.2) Algorithm T just like * the AT&T man page says. * * The node_t structure is for internal use only, lint doesn't grok it. * * Written by reading the System V Interface Definition, not the code. * * Totally public domain. */ #include #if 0 #if defined(LIBC_SCCS) && !defined(lint) __RCSID("$NetBSD: tdelete.c,v 1.2 1999/09/16 11:45:37 lukem Exp $"); #endif /* LIBC_SCCS and not lint */ #endif __FBSDID("$FreeBSD$"); #define _SEARCH_PRIVATE #include #include /* * delete node with given key * * vkey: key to be deleted * vrootp: address of the root of the tree * compar: function to carry out node comparisons */ void * tdelete(const void * __restrict vkey, void ** __restrict vrootp, int (*compar)(const void *, const void *)) { node_t **rootp = (node_t **)vrootp; node_t *p, *q, *r; int cmp; if (rootp == NULL || (p = *rootp) == NULL) return NULL; while ((cmp = (*compar)(vkey, (*rootp)->key)) != 0) { p = *rootp; rootp = (cmp < 0) ? &(*rootp)->llink : /* follow llink branch */ &(*rootp)->rlink; /* follow rlink branch */ if (*rootp == NULL) return NULL; /* key not found */ } r = (*rootp)->rlink; /* D1: */ if ((q = (*rootp)->llink) == NULL) /* Left NULL? */ q = r; else if (r != NULL) { /* Right link is NULL? */ if (r->llink == NULL) { /* D2: Find successor */ r->llink = q; q = r; } else { /* D3: Find NULL link */ for (q = r->llink; q->llink != NULL; q = r->llink) r = q; r->llink = q->rlink; q->llink = (*rootp)->llink; q->rlink = (*rootp)->rlink; } } - free(*rootp); /* D4: Free node */ + if (p != *rootp) + free(*rootp); /* D4: Free node */ *rootp = q; /* link parent to new node */ return p; } Index: projects/building-blocks/lib/libc =================================================================== --- projects/building-blocks/lib/libc (revision 278311) +++ projects/building-blocks/lib/libc (revision 278312) Property changes on: projects/building-blocks/lib/libc ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/lib/libc:r278050-278311 Index: projects/building-blocks/lib/libdevinfo/devinfo.h =================================================================== --- projects/building-blocks/lib/libdevinfo/devinfo.h (revision 278311) +++ projects/building-blocks/lib/libdevinfo/devinfo.h (revision 278312) @@ -1,139 +1,139 @@ /*- * Copyright (c) 2000 Michael Smith * Copyright (c) 2000 BSDi * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _DEVINFO_H_INCLUDED #define _DEVINFO_H_INCLUDED #include #include #include typedef __uintptr_t devinfo_handle_t; #define DEVINFO_ROOT_DEVICE ((devinfo_handle_t)0) typedef enum device_state devinfo_state_t; struct devinfo_dev { devinfo_handle_t dd_handle; /* device handle */ devinfo_handle_t dd_parent; /* parent handle */ char *dd_name; /* name of device */ char *dd_desc; /* device description */ char *dd_drivername; /* name of attached driver*/ char *dd_pnpinfo; /* pnp info from parent bus */ char *dd_location; /* Where bus thinks dev at */ uint32_t dd_devflags; /* API flags */ uint16_t dd_flags; /* internal dev flags */ - devinfo_state_t dd_state; /* attacement state of dev */ + devinfo_state_t dd_state; /* attachment state of dev */ }; struct devinfo_rman { devinfo_handle_t dm_handle; /* resource manager handle */ unsigned long dm_start; /* resource start */ unsigned long dm_size; /* resource size */ char *dm_desc; /* resource description */ }; struct devinfo_res { devinfo_handle_t dr_handle; /* resource handle */ devinfo_handle_t dr_rman; /* resource manager handle */ devinfo_handle_t dr_device; /* owning device */ unsigned long dr_start; /* region start */ unsigned long dr_size; /* region size */ /* XXX add flags */ }; __BEGIN_DECLS /* * Acquire a coherent copy of the kernel's device and resource tables. * This must return success (zero) before any other interfaces will * function. Sets errno on failure. */ extern int devinfo_init(void); /* * Release the storage associated with the internal copy of the device * and resource tables. devinfo_init must be called before any attempt * is made to use any other interfaces. */ extern void devinfo_free(void); /* * Find a device/resource/resource manager by its handle. */ extern struct devinfo_dev *devinfo_handle_to_device(devinfo_handle_t handle); extern struct devinfo_res *devinfo_handle_to_resource(devinfo_handle_t handle); extern struct devinfo_rman *devinfo_handle_to_rman(devinfo_handle_t handle); /* * Iterate over the children of a device, calling (fn) on each. If * (fn) returns nonzero, abort the scan and return. */ extern int devinfo_foreach_device_child(struct devinfo_dev *parent, int (* fn)(struct devinfo_dev *child, void *arg), void *arg); /* * Iterate over all the resources owned by a device, calling (fn) on each. * If (fn) returns nonzero, abort the scan and return. */ extern int devinfo_foreach_device_resource(struct devinfo_dev *dev, int (* fn)(struct devinfo_dev *dev, struct devinfo_res *res, void *arg), void *arg); /* * Iterate over all the resources owned by a resource manager, calling (fn) * on each. If (fn) returns nonzero, abort the scan and return. */ extern int devinfo_foreach_rman_resource(struct devinfo_rman *rman, int (* fn)(struct devinfo_res *res, void *arg), void *arg); /* * Iterate over all the resource managers, calling (fn) on each. If (fn) * returns nonzero, abort the scan and return. */ extern int devinfo_foreach_rman(int (* fn)(struct devinfo_rman *rman, void *arg), void *arg); __END_DECLS #endif /* ! _DEVINFO_H_INCLUDED */ Index: projects/building-blocks/lib/libnetgraph/debug.c =================================================================== --- projects/building-blocks/lib/libnetgraph/debug.c (revision 278311) +++ projects/building-blocks/lib/libnetgraph/debug.c (revision 278312) @@ -1,344 +1,359 @@ /* * debug.c * * Copyright (c) 1996-1999 Whistle Communications, Inc. * All rights reserved. * * Subject to the following obligations and disclaimer of warranty, use and * redistribution of this software, in source or object code forms, with or * without modifications are expressly permitted by Whistle Communications; * provided, however, that: * 1. Any and all reproductions of the source or object code must include the * copyright notice above and the following disclaimer of warranties; and * 2. No rights are granted, in any manner or form, to use Whistle * Communications, Inc. trademarks, including the mark "WHISTLE * COMMUNICATIONS" on advertising, endorsements, or otherwise except as * such appears in the above copyright notice or in the software. * * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE, * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE. * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY * OF SUCH DAMAGE. * * Author: Archie Cobbs * * $Whistle: debug.c,v 1.24 1999/01/24 01:15:33 archie Exp $ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include "netgraph.h" #include "internal.h" #include #include #include #include #include +#include #include +#include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include +#include +#include #include #include #include +#include #include #include #include #include #include +#include #include #include #include #include #include #ifdef WHISTLE #include #include #include #include #include #include #endif /* Global debug level */ int _gNgDebugLevel = 0; /* Debug printing functions */ void (*_NgLog) (const char *fmt,...) = warn; void (*_NgLogx) (const char *fmt,...) = warnx; /* Internal functions */ static const char *NgCookie(int cookie); /* Known typecookie list */ struct ng_cookie { int cookie; const char *type; }; #define COOKIE(c) { NGM_ ## c ## _COOKIE, #c } /* List of known cookies */ static const struct ng_cookie cookies[] = { COOKIE(UI), COOKIE(ASYNC), COOKIE(ATMLLC), COOKIE(BPF), COOKIE(BRIDGE), + COOKIE(CAR), COOKIE(CISCO), + COOKIE(DEFLATE), COOKIE(DEVICE), COOKIE(ECHO), COOKIE(EIFACE), COOKIE(ETF), COOKIE(ETHER), + COOKIE(ETHER_ECHO), COOKIE(FRAMERELAY), COOKIE(GIF), COOKIE(GIF_DEMUX), COOKIE(GENERIC), COOKIE(HOLE), COOKIE(HUB), COOKIE(IFACE), COOKIE(IP_INPUT), COOKIE(IPFW), COOKIE(KSOCKET), COOKIE(L2TP), COOKIE(LMI), COOKIE(MPPC), COOKIE(NAT), + COOKIE(NETFLOW), COOKIE(ONE2MANY), + COOKIE(PATCH), + COOKIE(PIPE), COOKIE(PPP), COOKIE(PPPOE), COOKIE(PPTPGRE), + COOKIE(PRED1), COOKIE(RFC1490), COOKIE(SOCKET), COOKIE(SOURCE), COOKIE(SPLIT), COOKIE(SPPP), + COOKIE(TAG), COOKIE(TCPMSS), COOKIE(TEE), COOKIE(TTY), COOKIE(VJC), COOKIE(VLAN), #ifdef WHISTLE COOKIE(DF), COOKIE(IPAC), COOKIE(TN), COOKIE(WFRA), #endif { 0, NULL } }; /* * Set debug level, ie, verbosity, if "level" is non-negative. * Returns old debug level. */ int NgSetDebug(int level) { int old = _gNgDebugLevel; - if (level < 0) - level = old; - _gNgDebugLevel = level; + if (level >= 0) + _gNgDebugLevel = level; return (old); } /* * Set debug logging functions. */ void NgSetErrLog(void (*log) (const char *fmt,...), void (*logx) (const char *fmt,...)) { _NgLog = log; _NgLogx = logx; } /* * Display a netgraph sockaddr */ void _NgDebugSockaddr(const struct sockaddr_ng *sg) { NGLOGX("SOCKADDR: { fam=%d len=%d addr=\"%s\" }", sg->sg_family, sg->sg_len, sg->sg_data); } #define ARGS_BUFSIZE 2048 #define RECURSIVE_DEBUG_ADJUST 4 /* * Display a negraph message */ void _NgDebugMsg(const struct ng_mesg *msg, const char *path) { u_char buf[2 * sizeof(struct ng_mesg) + ARGS_BUFSIZE]; struct ng_mesg *const req = (struct ng_mesg *)buf; struct ng_mesg *const bin = (struct ng_mesg *)req->data; int arglen, csock = -1; /* Display header stuff */ NGLOGX("NG_MESG :"); NGLOGX(" vers %d", msg->header.version); - NGLOGX(" arglen %d", msg->header.arglen); - NGLOGX(" flags %ld", msg->header.flags); - NGLOGX(" token %lu", (u_long)msg->header.token); - NGLOGX(" cookie %s (%d)", + NGLOGX(" arglen %u", msg->header.arglen); + NGLOGX(" flags %x", msg->header.flags); + NGLOGX(" token %u", msg->header.token); + NGLOGX(" cookie %s (%u)", NgCookie(msg->header.typecookie), msg->header.typecookie); /* At lower debugging levels, skip ASCII translation */ if (_gNgDebugLevel <= 2) goto fail2; /* If path is not absolute, don't bother trying to use relative address on a different socket for the ASCII translation */ if (strchr(path, ':') == NULL) goto fail2; /* Get a temporary socket */ if (NgMkSockNode(NULL, &csock, NULL) < 0) goto fail; /* Copy binary message into request message payload */ arglen = msg->header.arglen; if (arglen > ARGS_BUFSIZE) arglen = ARGS_BUFSIZE; memcpy(bin, msg, sizeof(*msg) + arglen); bin->header.arglen = arglen; /* Lower debugging to avoid infinite recursion */ _gNgDebugLevel -= RECURSIVE_DEBUG_ADJUST; /* Ask the node to translate the binary message to ASCII for us */ if (NgSendMsg(csock, path, NGM_GENERIC_COOKIE, NGM_BINARY2ASCII, bin, sizeof(*bin) + bin->header.arglen) < 0) { _gNgDebugLevel += RECURSIVE_DEBUG_ADJUST; goto fail; } if (NgRecvMsg(csock, req, sizeof(buf), NULL) < 0) { _gNgDebugLevel += RECURSIVE_DEBUG_ADJUST; goto fail; } /* Restore debugging level */ _gNgDebugLevel += RECURSIVE_DEBUG_ADJUST; /* Display command string and arguments */ NGLOGX(" cmd %s (%d)", bin->header.cmdstr, bin->header.cmd); NGLOGX(" args %s", bin->data); goto done; fail: /* Just display binary version */ NGLOGX(" [error decoding message: %s]", strerror(errno)); fail2: NGLOGX(" cmd %d", msg->header.cmd); NGLOGX(" args (%d bytes)", msg->header.arglen); _NgDebugBytes((u_char *)msg->data, msg->header.arglen); done: if (csock != -1) (void)close(csock); } /* * Return the name of the node type corresponding to the cookie */ static const char * NgCookie(int cookie) { int k; for (k = 0; cookies[k].cookie != 0; k++) { if (cookies[k].cookie == cookie) return cookies[k].type; } return "??"; } /* * Dump bytes in hex */ void _NgDebugBytes(const u_char *ptr, int len) { char buf[100]; int k, count; #define BYPERLINE 16 for (count = 0; count < len; ptr += BYPERLINE, count += BYPERLINE) { /* Do hex */ snprintf(buf, sizeof(buf), "%04x: ", count); for (k = 0; k < BYPERLINE; k++, count++) if (count < len) snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%02x ", ptr[k]); else snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " "); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " "); count -= BYPERLINE; /* Do ASCII */ for (k = 0; k < BYPERLINE; k++, count++) if (count < len) snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%c", isprint(ptr[k]) ? ptr[k] : '.'); else snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), " "); count -= BYPERLINE; /* Print it */ NGLOGX("%s", buf); } } Index: projects/building-blocks/release/doc/en_US.ISO8859-1/errata/article.xml =================================================================== --- projects/building-blocks/release/doc/en_US.ISO8859-1/errata/article.xml (revision 278311) +++ projects/building-blocks/release/doc/en_US.ISO8859-1/errata/article.xml (revision 278312) @@ -1,93 +1,101 @@ %release; + + ]>
&os; &release; Errata The &os; Project $FreeBSD$ - 2014 + 2015 The &os; Documentation Project &tm-attrib.freebsd; &tm-attrib.intel; &tm-attrib.sparc; &tm-attrib.general; This document lists errata items for &os; &release;, containing significant information discovered after the release or too late in the release cycle to be otherwise included in the release documentation. This information includes security advisories, as well as news relating to the software or documentation that could affect its operation or usability. An up-to-date version of this document should always be consulted before installing this version of &os;. This errata document for &os; &release; will be maintained until the release of &os; &release.next;. Introduction This errata document contains late-breaking news about &os; &release; Before installing this version, it is important to consult this document to learn about any post-release discoveries or problems that may already have been found and fixed. Any version of this errata document actually distributed with the release (for example, on a CDROM distribution) will be out of date by definition, but other copies are kept updated on the Internet and should be consulted as the current errata for this release. These other copies of the errata are located at , plus any sites which keep up-to-date mirrors of this location. Source and binary snapshots of &os; &release.branch; also contain up-to-date copies of this document (as of the time of the snapshot). For a list of all &os; CERT security advisories, see or . Security Advisories - No advisory. + &security; + + + + Errata Notices + + &errata; Open Issues No open issues. Late-Breaking News No news.
Index: projects/building-blocks/release/doc/en_US.ISO8859-1/relnotes/article.xml =================================================================== --- projects/building-blocks/release/doc/en_US.ISO8859-1/relnotes/article.xml (revision 278311) +++ projects/building-blocks/release/doc/en_US.ISO8859-1/relnotes/article.xml (revision 278312) @@ -1,967 +1,967 @@ %release; %sponsor; %vendor; + + ]>
&os; &release.current; Release Notes The &os; Project $FreeBSD$ 2015 The &os; Documentation Project &tm-attrib.freebsd; &tm-attrib.ibm; &tm-attrib.ieee; &tm-attrib.intel; &tm-attrib.sparc; &tm-attrib.general; The release notes for &os; &release.current; contain a summary of the changes made to the &os; base system on the &release.branch; development line. This document lists applicable security advisories that were issued since the last release, as well as significant changes to the &os; kernel and userland. Some brief remarks on upgrading are also presented. Introduction This document contains the release notes for &os; &release.current;. It describes recently added, changed, or deleted features of &os;. It also provides some notes on upgrading from previous versions of &os;. The &release.type; distribution to which these release notes apply represents the latest point along the &release.branch; development branch since &release.branch; was created. Information regarding pre-built, binary &release.type; distributions along this branch can be found at &release.url;. The &release.type; distribution to which these release notes apply represents a point along the &release.branch; development branch between &release.prev; and the future &release.next;. Information regarding pre-built, binary &release.type; distributions along this branch can be found at &release.url;. This distribution of &os; &release.current; is a &release.type; distribution. It can be found at &release.url; or any of its mirrors. More information on obtaining this (or other) &release.type; distributions of &os; can be found in the Obtaining &os; appendix to the &os; Handbook. All users are encouraged to consult the release errata before installing &os;. The errata document is updated with late-breaking information discovered late in the release cycle or after the release. Typically, it contains information on known bugs, security advisories, and corrections to documentation. An up-to-date copy of the errata for &os; &release.current; can be found on the &os; Web site. This document describes the most user-visible new or changed features in &os; since &release.prev;. In general, changes described here are unique to the &release.branch; branch unless specifically marked as &merged; features. Typical release note items document recent security advisories issued after &release.prev;, new drivers or hardware support, new commands or options, major bug fixes, or contributed software upgrades. They may also list changes to major ports/packages or release engineering practices. Clearly the release notes cannot list every single change made to &os; between releases; this document focuses primarily on security advisories, user-visible changes, and major architectural improvements. Upgrading from Previous Releases of &os; Binary upgrades between RELEASE versions (and snapshots of the various security branches) are supported using the &man.freebsd-update.8; utility. The binary upgrade procedure will update unmodified userland utilities, as well as unmodified GENERIC kernels distributed as a part of an official &os; release. The &man.freebsd-update.8; utility requires that the host being upgraded have Internet connectivity. Source-based upgrades (those based on recompiling the &os; base system from source code) from previous versions are supported, according to the instructions in /usr/src/UPDATING. Upgrading &os; should only be attempted after backing up all data and configuration files. Security and Errata This section lists the various Security Advisories and Errata Notices since &release.prev;. Security Advisories - No advisories. - + &security; Errata Notices - No errata notices. - + &errata; Userland This section covers changes and additions to userland applications, contributed software, and system utilities. Userland Configuration Changes The default &man.newsyslog.conf.5; now includes files in the /etc/newsyslog.conf.d/ and /usr/local/etc/newsyslog.conf.d/ directories by default for &man.newsyslog.8;. The &man.mailwrapper.8; utility has been updated to use &man.mailer.conf.5; from the LOCALBASE environment variable, which defaults to /usr/local if unset. The MK_ARM_EABI &man.src.conf.5; option has been removed. Userland Application Changes The &man.casperd.8; daemon has been added, which provides access to functionality that is not available in the capability mode sandbox. When unable to load a kernel module with &man.kldload.8;, a message informing to view output of &man.dmesg.8; is now printed, opposed to the previous output Exec format error.. Allow &man.pciconf.8; to identify PCI devices that are attached to a driver to be identified by their device name instead of just the selector. Additionally, an optional device argument to the -l flag to restrict the output to only listing details about a single device. A new flag, onifconsole has been added to /etc/ttys. This allows the system to provide a login prompt via serial console if the device is an active kernel console, otherwise it is equivalent to off. Support for displaying VPD for PCI devices via &man.pciconf.8; has been added. &man.ping.8; protects against malicious network packets using the Capsicum framework to drop privileges. The &man.ps.1; utility has been updated to include the -J flag, used to filter output by matching &man.jail.8; IDs and names. Additionally, argument 0 can be used to -J to only list processes running on the host system. The &man.top.1; utility has been updated to filter by &man.jail.8; ID or name, in followup to the &man.ps.1; change in r265229. The Blowfish &man.crypt.3; default format has been changed to $2b$. The &man.pmcstat.8; utility has been updated to include a new flag, -l, which ends event collection after the specified number of seconds. The &man.ps.1; utility has been updated to include a new keyword, tracer, which displays the PID of the tracing process. Support for adding empty partitions has been added to the &man.mkimg.1; utility. The &man.primes.6; utility has been updated to correctly enumerate prime numbers between 4295098369 and 3825123056546413050, which prior to this change, it would be possible for returned values to be incorrectly identified as prime numbers. The &man.mkimg.1; utility has been updated to include three options used to print information about &man.mkimg.1; itself: Option Output --version The current version of the &man.mkimg.1; utility --formats The disk image file formats supported by &man.mkimg.1; --schemes The partition schemes supported by &man.mkimg.1; Userland &man.ctf.5; support in &man.dtrace.1; has been added. With this change, &man.dtrace.1; is able to resolve type info for function and USDT probe arguments, and function return values. The &man.elfdump.1; utility has been updated to support capability mode provided by &man.capsicum.4;. The &man.fstyp.8; utility has been added, which is used to determine the filesystem on a specified device. The libedit library has been updated to support UTF-8, which additionally provides unicode support to &man.sh.1;. The &man.ptrace.2; system call has been updated include support for Altivec registers on &os;/&arch.powerpc;. Contributed Software &man.lldb.1; has been updated to upstream snapshot version r196259. Timezone data files have been updated to version 2013i. &man.byacc.1; has been updated to version 20140101. &man.jemalloc.3; has been updated to version 3.5.0. bmake has been updated to version 20140101. libc++ has been updated to version 3.4. OpenSSH has been updated to 6.5p1. mdocml has been updated to version 1.12.3. LLVM and Clang have been updated to version 3.4. Sendmail has been updated from 8.14.7 to 8.14.9. file has been updated to version 5.22. The binutils suite of utilities has been updated to include upstream patches that add new relocations for &arch.powerpc; support. The ELF Tool Chain has been updated to upstream revision r3136. The texinfo utility and info pages were removed from the base system. The print/texinfo port should be installed on systems where info pages are needed. The ELF object manipulation tools addr2line, elfcopy (strip), nm, size, and strings were switched to the versions from the ELF Tool Chain project. The libedit library has been updated to include UTF-8 support, adding UTF-8 support to the &man.sh.1; shell. OpenSSL has been updated to version 1.0.1l. Installation and Configuration Tools The &man.bsdinstall.8; partition editor and &man.sade.8; utility have been updated to include native ZFS support. The &os; installation utility, &man.bsdinstall.8;, has been updated to set the canmount &man.zfs.8; property to off for the /var dataset, preventing the contents of directories within /var from conflicting when using multiple boot environments, such as that provided by sysutils/beadm. The &man.bsdconfig.8; utility has been updated to skip the initial &man.tzsetup.8; UTC versus wall-clock time prompt when run in a virtual machine, determined when the kern.vm_guest &man.sysctl.8; is set to 1. The &man.bsdinstall.8; utility has been updated to use the new &man.dpv.3; library to display progress when extracting the &os; distributions. <filename class="directory">/etc/rc.d</filename> Scripts The &man.rc.8; subsystem has been updated to allow configuring services in ${LOCALBASE}/etc/rc.conf.d/. If LOCALBASE is unset, it defaults to /usr/local. The mrouted &man.rc.8; script has been removed from the base system. An equivalent script is available from the net/mrouted port. <filename class="directory">/etc/periodic</filename> Scripts The daily &man.periodic.8; script 110.clean-tmps has been updated to avoid crossing filesystem mount boundaries when cleaning files in /tmp. Runtime Libraries and API The &man.readline.3; library is now statically linked in software within the base system, and the shared library is no longer installed, allowing the Ports Collection to use a modern version of the library. The &man.strptime.3; library has been updated to add support for POSIX-2001 features %U and %W. The &man.dl.iterate.phdr.3; library has been changed to always return the path name of the ELF object in the dlpi_name structure member. A userland library for Chelsio Terminator 5 based iWARP cards has been added, allowing userland RDMA applications to work over compatible NICs. The &man.gpio.3; library has been added, providing a wrapper around the &man.gpio.4; kernel interface. ABI Compatibility The &linux; compatibility version has been updated to 2.6.18. The compat.linux.osrelease &man.sysctl.8; is evaluated when building the emulators/linux-c6 and related ports. Kernel This section covers changes to kernel configurations, system tuning, and system control parameters that are not otherwise categorized. Kernel Bug Fixes A kernel bug that inhibited proper functionality of the dev.cpu.0.freq &man.sysctl.8; on &intel; processors with Turbo Boost ™ enabled has been fixed. Support for &man.dtrace.1; stack tracing has been fixed for &os;/&arch.powerpc;, using the trapexit() and asttrapexit() functions instead of checking within addressed kernel space. A bug in &man.ipfw.4; that could potentially lead to a kernel panic when using &man.dummynet.4; at layer 2 has been fixed. Kernel Configuration The IMAGACT_BINMISC kernel configuration option has been enabled by default, which enables application execution through emulators, such as Qemu. The VT kernel configuration file has been removed, and the &man.vt.4; driver is included in the GENERIC kernel. To enable &man.vt.4;, enter set kern.vty=vt at the &man.loader.8; prompt during boot, or add kern.vty=vt to &man.loader.conf.5; and reboot the system. System Tuning and Controls   Devices and Drivers This section covers changes and additions to devices and device drivers since &release.prev;. Device Drivers Support for GPS ports has been added to &man.uhso.4;. The &man.full.4; device has been added, and the lindev(4) device has been removed. Prior to this change, lindev(4) provided only the /dev/full character device, returning ENOSPC on write attempts. As this device is not specific to &linux;, a native &os; version has been added. Hardware context support has been added to the drm/i915 driver, adding support for Mesa 9.2 and later. The &man.vt.4; driver has been updated, replacing the bitmapped kern.vt.spclkeys &man.sysctl.8; with individual kern.vt.kbd_* variants. The &man.hpet.4; driver has been updated to create a /dev/hpetN device, providing access to HPET from userspace. Storage Drivers The &man.mpr.4; device has been added, providing support for LSI Fusion-MPT 3 12Gb SCSI/SATA controllers. The &man.mrsas.4; driver has been added, providing support for LSI MegaRAID SAS controllers. The &man.mfi.4; driver will attach to the controller, by default. To enable &man.mrsas.4; add hw.mfi.mrsas_enable=1 to /boot/loader.conf, which turns off &man.mfi.4; device probing. At this time, the &man.mfiutil.8; utility and the &os; version of MegaCLI and StorCli do not work with &man.mrsas.4;. The &man.ctl.4; subsystem has been updated, increasing the ports limit from 128 to 256, and LUN limit from 256 to 1024. The asr(4) driver has been removed, and is no longer supported. Network Drivers Support for Broadcom chipsets BCM57764, BCM57767, BCM57782, BCM57786 and BCM57787 has been added to &man.bge.4;. Support for the &intel; Centrino™ Wireless-N 135 chipset has been added. Firmware for &intel; Centrino™ Wireless-N 105 devices has been added to the base system. The deprecated nve(4) driver has been removed. Users of NVIDIA nForce MCP network adapters are advised to use the &man.nfe.4; driver instead, which has been the default driver for this hardware since &os; 7.0. The if_nf10bmac(4) device has been added, providing support for NetFPGA-10G Embedded CPU Ethernet Core. The if_nf10bmac(4) driver operates on the FPGA, and is not suited for the PCI host interface. The &man.ath.hal.4; driver has been updated to support the Atheros AR1111 chipset. Support for the &intel; Centrino™ Wireless-N 105 chipset has been added. Support for the &man.cxgbe.4; Terminator 5 (T5) 10G/40G cards has been added to &man.netmap.4;. The &man.alc.4; driver has been updated to support AR816x and AR817x ethernet controllers. The &man.vxlan.4; driver has been added, which creates a virtual Layer 2 (Ethernet) network overlaid in a Layer 3 (IP/UDP) network. The &man.vxlan.4; driver is analogous to &man.vlan.4;, but is designed to be better suited for large, multiple-tenant datacenter environments. The &man.gre.4; driver has been significantly overhauled, and has been split into two separate modules, &man.gre.4; and &man.me.4;. Hardware Support This section covers general hardware support for physical machines, hypervisors, and virtualization environments, as well as hardware changes and updates that do not otherwise fit in other sections of this document. Hardware Support The &man.asmc.4; driver has been updated to support the &apple; MacMini 3,1. Support for &os;/ia64 has been dropped as of &os; 11. An issue that could cause a system to hang when entering ACPI S3 state (suspend to RAM) has been corrected in the &man.acpi.4; and &man.pci.4; drivers. The power management unit subsystem has been updated to support power button events on certain &arch.powerpc; hardware, such as aluminum PowerBook ®. The &man.hwpmc.4; driver has been updated to correct performance counter sampling on G4 (MPC74xxx) and G5 class processors. The OpenCrypto framework has been updated to include AES-ICM and AES-GCM modes, both of which have also been added to the &man.aesni.4; driver. Virtualization Support Support for the Virtual Interrupt Delivery feature of &intel; VT-x is enabled if supported by the CPU. This feature can be disabled by running sysctl hw.vmm.vmx.use_apic_vid=0. Additionally, to persist this setting across reboots, add hw.vmm.vmx.use_apic_vid=0 to /etc/sysctl.conf. Support for Posted Interrupt Processing is enabled if supported by the CPU. This feature can be disabled by running sysctl hw.vmm.vmx.use_apic_pir=0. Additionally, to persist this setting across reboots, add hw.vmm.vmx.use_apic_pir=0 to /etc/sysctl.conf. Unmapped IO support has been added to &man.virtio_blk.4;. Unmapped IO support has been added to &man.virtio_scsi.4;. The &man.virtio_random.4; driver has been added to harvest entropy from the host system. &os;/&arch.i386; guests can be run under bhyve. Support for running a &os;/&arch.amd64; Xen guest instance as PVH guest has been added. PVH mode, short for Para-Virtualized Hardware, uses para-virtualized drivers for boot and I/O, and uses hardware virtualization extensions for all other tasks, without the need for emulation. The &man.virtio.console.4; driver has been added, which provides an interface to VirtIO console devices through a &man.tty.4; device. ARM Support The &man.nand.4; device is enabled for ARM devices by default. An issue that could cause instability when detecting SD cards on the Raspberry Pi SOC has been fixed. The bcm2835_cpufreq driver has been added, which supports CPU frequency and voltage control on the Raspberry Pi SOC. Storage This section covers changes and additions to file systems and other storage subsystems, both local and networked. Networked Storage The new filesystem automount facility, &man.autofs.5;, has been added. The new &man.autofs.5; facility is similar to that found in other &unix;-like operating systems, such as OS X™ and Solaris™. The &man.autofs.5; facility uses a &sun;-compatible &man.auto.master.5; configuration file, and is administered with the &man.automount.8; userland utility, and the &man.automountd.8; and &man.autounmountd.8; daemons. ZFS The arc_meta_limit statistics are now visible through the kstat &man.sysctl.8;. As a result of this change, the vfs.zfs.arc_meta_used &man.sysctl.8; has been removed, and replaced with the kstat.zfs.misc.arcstats.arc_meta_used &man.sysctl.8;. &man.geom.4; Support for the disklabel64 partitioning scheme has been added to &man.gpart.8;. Boot Loader Changes This section covers the boot loader, boot menu, and other boot-related changes. Boot Loader Changes The &man.vt.4; driver has been made the default system console driver. The &man.syscons.4; driver is still available, and can be enabled by adding kern.vty=sc in &man.loader.conf.5;. Alternatively, &man.syscons.4; can be enabled at boot time by entering set kern.vty=sc at the &man.loader.8; prompt. Boot Menu Changes   Networking This section describes changes that affect networking in &os;. Network Procols Support for the IPX network transport protocol has been removed, and will not be supported in &os; 11 and later releases. Support for PLPMTUD blackhole detection (RFC 4821) has been added to the &man.tcp.4; stack, disabled by default. New control tunables have been added: Tunable Description net.inet.tcp.pmtud_blackhole_detection Enables or disables PLPMTUD blackhole detection net.inet.tcp.pmtud_blackhole_mss MSS to try for IPv4 net.inet.tcp.v6pmtud_blackhole_mss MSS to try for IPv6 New monitoring &man.sysctl.8;s haven been added: Tunable Description net.inet.tcp.pmtud_blackhole_activated Number of times the code was activated to attempt downshifting the MSS net.inet.tcp.pmtud_blackhole_min_activated Number of times the blackhole MSS was used in an attempt to downshift net.inet.tcp.pmtud_blackhole_failed Number of times that the blackhole failed to connect after downshifting the MSS Ports Collection and Package Infrastructure This section covers changes to the &os; Ports Collection, package infrastructure, and package maintenance and installation tools. Infrastructure Changes   Packaging Changes   Documentation This section covers changes to the &os; Documentation Project sources and toolchain. Documentation Source Changes   Documentation Toolchain Changes   Release Engineering and Integration This section convers changes that are specific to the &os; Release Engineering processes. Integration Changes The Release Engineering build tools have been updated to include support for producing virtual machine disk images for various cloud hosting providers.
Index: projects/building-blocks/release/doc/share/xml/errata.xml =================================================================== --- projects/building-blocks/release/doc/share/xml/errata.xml (nonexistent) +++ projects/building-blocks/release/doc/share/xml/errata.xml (revision 278312) @@ -0,0 +1,28 @@ + + + + + + + + + + Errata + Date + Topic + + + + + + No errata notices. +   +   + + + + Property changes on: projects/building-blocks/release/doc/share/xml/errata.xml ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/xml \ No newline at end of property Index: projects/building-blocks/release/doc/share/xml/release.ent =================================================================== --- projects/building-blocks/release/doc/share/xml/release.ent (revision 278311) +++ projects/building-blocks/release/doc/share/xml/release.ent (revision 278312) @@ -1,78 +1,81 @@ - + - + + + + ]]> ]]> ]]> - + Index: projects/building-blocks/release/doc/share/xml/security.xml =================================================================== --- projects/building-blocks/release/doc/share/xml/security.xml (nonexistent) +++ projects/building-blocks/release/doc/share/xml/security.xml (revision 278312) @@ -0,0 +1,28 @@ + + + + + + + + + + Advisory + Date + Topic + + + + + + No advisories. +   +   + + + + Property changes on: projects/building-blocks/release/doc/share/xml/security.xml ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/xml \ No newline at end of property Index: projects/building-blocks/sbin/ipfw/ipfw2.c =================================================================== --- projects/building-blocks/sbin/ipfw/ipfw2.c (revision 278311) +++ projects/building-blocks/sbin/ipfw/ipfw2.c (revision 278312) @@ -1,4969 +1,4971 @@ /* * Copyright (c) 2002-2003 Luigi Rizzo * Copyright (c) 1996 Alex Nash, Paul Traina, Poul-Henning Kamp * Copyright (c) 1994 Ugen J.S.Antsilevich * * Idea and grammar partially left from: * Copyright (c) 1993 Daniel Boulet * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. * * Redistribution in binary form may occur without any restrictions. * Obviously, it would be nice if you gave credit where credit is due * but requiring it would be too onerous. * * This software is provided ``AS IS'' without any warranties of any kind. * * NEW command line interface for IP firewall facility * * $FreeBSD$ */ #include #include #include #include #include #include "ipfw2.h" #include #include #include #include #include #include #include #include #include #include #include #include /* ctime */ #include /* _long_to_time */ #include #include #include /* offsetof */ #include #include /* only IFNAMSIZ */ #include #include /* only n_short, n_long */ #include #include #include #include #include struct cmdline_opts co; /* global options */ struct format_opts { int bcwidth; int pcwidth; int show_counters; + int show_time; /* show timestamp */ uint32_t set_mask; /* enabled sets mask */ uint32_t flags; /* request flags */ uint32_t first; /* first rule to request */ uint32_t last; /* last rule to request */ uint32_t dcnt; /* number of dynamic states */ ipfw_obj_ctlv *tstate; /* table state data */ }; int resvd_set_number = RESVD_SET; int ipfw_socket = -1; #define CHECK_LENGTH(v, len) do { \ if ((v) < (len)) \ errx(EX_DATAERR, "Rule too long"); \ } while (0) /* * Check if we have enough space in cmd buffer. Note that since * first 8? u32 words are reserved by reserved header, full cmd * buffer can't be used, so we need to protect from buffer overrun * only. At the beginnig, cblen is less than actual buffer size by * size of ipfw_insn_u32 instruction + 1 u32 work. This eliminates need * for checking small instructions fitting in given range. * We also (ab)use the fact that ipfw_insn is always the first field * for any custom instruction. */ #define CHECK_CMDLEN CHECK_LENGTH(cblen, F_LEN((ipfw_insn *)cmd)) #define GET_UINT_ARG(arg, min, max, tok, s_x) do { \ if (!av[0]) \ errx(EX_USAGE, "%s: missing argument", match_value(s_x, tok)); \ if (_substrcmp(*av, "tablearg") == 0) { \ arg = IP_FW_TARG; \ break; \ } \ \ { \ long _xval; \ char *end; \ \ _xval = strtol(*av, &end, 10); \ \ if (!isdigit(**av) || *end != '\0' || (_xval == 0 && errno == EINVAL)) \ errx(EX_DATAERR, "%s: invalid argument: %s", \ match_value(s_x, tok), *av); \ \ if (errno == ERANGE || _xval < min || _xval > max) \ errx(EX_DATAERR, "%s: argument is out of range (%u..%u): %s", \ match_value(s_x, tok), min, max, *av); \ \ if (_xval == IP_FW_TARG) \ errx(EX_DATAERR, "%s: illegal argument value: %s", \ match_value(s_x, tok), *av); \ arg = _xval; \ } \ } while (0) static struct _s_x f_tcpflags[] = { { "syn", TH_SYN }, { "fin", TH_FIN }, { "ack", TH_ACK }, { "psh", TH_PUSH }, { "rst", TH_RST }, { "urg", TH_URG }, { "tcp flag", 0 }, { NULL, 0 } }; static struct _s_x f_tcpopts[] = { { "mss", IP_FW_TCPOPT_MSS }, { "maxseg", IP_FW_TCPOPT_MSS }, { "window", IP_FW_TCPOPT_WINDOW }, { "sack", IP_FW_TCPOPT_SACK }, { "ts", IP_FW_TCPOPT_TS }, { "timestamp", IP_FW_TCPOPT_TS }, { "cc", IP_FW_TCPOPT_CC }, { "tcp option", 0 }, { NULL, 0 } }; /* * IP options span the range 0 to 255 so we need to remap them * (though in fact only the low 5 bits are significant). */ static struct _s_x f_ipopts[] = { { "ssrr", IP_FW_IPOPT_SSRR}, { "lsrr", IP_FW_IPOPT_LSRR}, { "rr", IP_FW_IPOPT_RR}, { "ts", IP_FW_IPOPT_TS}, { "ip option", 0 }, { NULL, 0 } }; static struct _s_x f_iptos[] = { { "lowdelay", IPTOS_LOWDELAY}, { "throughput", IPTOS_THROUGHPUT}, { "reliability", IPTOS_RELIABILITY}, { "mincost", IPTOS_MINCOST}, { "congestion", IPTOS_ECN_CE}, { "ecntransport", IPTOS_ECN_ECT0}, { "ip tos option", 0}, { NULL, 0 } }; struct _s_x f_ipdscp[] = { { "af11", IPTOS_DSCP_AF11 >> 2 }, /* 001010 */ { "af12", IPTOS_DSCP_AF12 >> 2 }, /* 001100 */ { "af13", IPTOS_DSCP_AF13 >> 2 }, /* 001110 */ { "af21", IPTOS_DSCP_AF21 >> 2 }, /* 010010 */ { "af22", IPTOS_DSCP_AF22 >> 2 }, /* 010100 */ { "af23", IPTOS_DSCP_AF23 >> 2 }, /* 010110 */ { "af31", IPTOS_DSCP_AF31 >> 2 }, /* 011010 */ { "af32", IPTOS_DSCP_AF32 >> 2 }, /* 011100 */ { "af33", IPTOS_DSCP_AF33 >> 2 }, /* 011110 */ { "af41", IPTOS_DSCP_AF41 >> 2 }, /* 100010 */ { "af42", IPTOS_DSCP_AF42 >> 2 }, /* 100100 */ { "af43", IPTOS_DSCP_AF43 >> 2 }, /* 100110 */ { "be", IPTOS_DSCP_CS0 >> 2 }, /* 000000 */ { "ef", IPTOS_DSCP_EF >> 2 }, /* 101110 */ { "cs0", IPTOS_DSCP_CS0 >> 2 }, /* 000000 */ { "cs1", IPTOS_DSCP_CS1 >> 2 }, /* 001000 */ { "cs2", IPTOS_DSCP_CS2 >> 2 }, /* 010000 */ { "cs3", IPTOS_DSCP_CS3 >> 2 }, /* 011000 */ { "cs4", IPTOS_DSCP_CS4 >> 2 }, /* 100000 */ { "cs5", IPTOS_DSCP_CS5 >> 2 }, /* 101000 */ { "cs6", IPTOS_DSCP_CS6 >> 2 }, /* 110000 */ { "cs7", IPTOS_DSCP_CS7 >> 2 }, /* 100000 */ { NULL, 0 } }; static struct _s_x limit_masks[] = { {"all", DYN_SRC_ADDR|DYN_SRC_PORT|DYN_DST_ADDR|DYN_DST_PORT}, {"src-addr", DYN_SRC_ADDR}, {"src-port", DYN_SRC_PORT}, {"dst-addr", DYN_DST_ADDR}, {"dst-port", DYN_DST_PORT}, {NULL, 0} }; /* * we use IPPROTO_ETHERTYPE as a fake protocol id to call the print routines * This is only used in this code. */ #define IPPROTO_ETHERTYPE 0x1000 static struct _s_x ether_types[] = { /* * Note, we cannot use "-:&/" in the names because they are field * separators in the type specifications. Also, we use s = NULL as * end-delimiter, because a type of 0 can be legal. */ { "ip", 0x0800 }, { "ipv4", 0x0800 }, { "ipv6", 0x86dd }, { "arp", 0x0806 }, { "rarp", 0x8035 }, { "vlan", 0x8100 }, { "loop", 0x9000 }, { "trail", 0x1000 }, { "at", 0x809b }, { "atalk", 0x809b }, { "aarp", 0x80f3 }, { "pppoe_disc", 0x8863 }, { "pppoe_sess", 0x8864 }, { "ipx_8022", 0x00E0 }, { "ipx_8023", 0x0000 }, { "ipx_ii", 0x8137 }, { "ipx_snap", 0x8137 }, { "ipx", 0x8137 }, { "ns", 0x0600 }, { NULL, 0 } }; static struct _s_x rule_actions[] = { { "accept", TOK_ACCEPT }, { "pass", TOK_ACCEPT }, { "allow", TOK_ACCEPT }, { "permit", TOK_ACCEPT }, { "count", TOK_COUNT }, { "pipe", TOK_PIPE }, { "queue", TOK_QUEUE }, { "divert", TOK_DIVERT }, { "tee", TOK_TEE }, { "netgraph", TOK_NETGRAPH }, { "ngtee", TOK_NGTEE }, { "fwd", TOK_FORWARD }, { "forward", TOK_FORWARD }, { "skipto", TOK_SKIPTO }, { "deny", TOK_DENY }, { "drop", TOK_DENY }, { "reject", TOK_REJECT }, { "reset6", TOK_RESET6 }, { "reset", TOK_RESET }, { "unreach6", TOK_UNREACH6 }, { "unreach", TOK_UNREACH }, { "check-state", TOK_CHECKSTATE }, { "//", TOK_COMMENT }, { "nat", TOK_NAT }, { "reass", TOK_REASS }, { "setfib", TOK_SETFIB }, { "setdscp", TOK_SETDSCP }, { "call", TOK_CALL }, { "return", TOK_RETURN }, { NULL, 0 } /* terminator */ }; static struct _s_x rule_action_params[] = { { "altq", TOK_ALTQ }, { "log", TOK_LOG }, { "tag", TOK_TAG }, { "untag", TOK_UNTAG }, { NULL, 0 } /* terminator */ }; /* * The 'lookup' instruction accepts one of the following arguments. * -1 is a terminator for the list. * Arguments are passed as v[1] in O_DST_LOOKUP options. */ static int lookup_key[] = { TOK_DSTIP, TOK_SRCIP, TOK_DSTPORT, TOK_SRCPORT, TOK_UID, TOK_JAIL, TOK_DSCP, -1 }; static struct _s_x rule_options[] = { { "tagged", TOK_TAGGED }, { "uid", TOK_UID }, { "gid", TOK_GID }, { "jail", TOK_JAIL }, { "in", TOK_IN }, { "limit", TOK_LIMIT }, { "keep-state", TOK_KEEPSTATE }, { "bridged", TOK_LAYER2 }, { "layer2", TOK_LAYER2 }, { "out", TOK_OUT }, { "diverted", TOK_DIVERTED }, { "diverted-loopback", TOK_DIVERTEDLOOPBACK }, { "diverted-output", TOK_DIVERTEDOUTPUT }, { "xmit", TOK_XMIT }, { "recv", TOK_RECV }, { "via", TOK_VIA }, { "fragment", TOK_FRAG }, { "frag", TOK_FRAG }, { "fib", TOK_FIB }, { "ipoptions", TOK_IPOPTS }, { "ipopts", TOK_IPOPTS }, { "iplen", TOK_IPLEN }, { "ipid", TOK_IPID }, { "ipprecedence", TOK_IPPRECEDENCE }, { "dscp", TOK_DSCP }, { "iptos", TOK_IPTOS }, { "ipttl", TOK_IPTTL }, { "ipversion", TOK_IPVER }, { "ipver", TOK_IPVER }, { "estab", TOK_ESTAB }, { "established", TOK_ESTAB }, { "setup", TOK_SETUP }, { "sockarg", TOK_SOCKARG }, { "tcpdatalen", TOK_TCPDATALEN }, { "tcpflags", TOK_TCPFLAGS }, { "tcpflgs", TOK_TCPFLAGS }, { "tcpoptions", TOK_TCPOPTS }, { "tcpopts", TOK_TCPOPTS }, { "tcpseq", TOK_TCPSEQ }, { "tcpack", TOK_TCPACK }, { "tcpwin", TOK_TCPWIN }, { "icmptype", TOK_ICMPTYPES }, { "icmptypes", TOK_ICMPTYPES }, { "dst-ip", TOK_DSTIP }, { "src-ip", TOK_SRCIP }, { "dst-port", TOK_DSTPORT }, { "src-port", TOK_SRCPORT }, { "proto", TOK_PROTO }, { "MAC", TOK_MAC }, { "mac", TOK_MAC }, { "mac-type", TOK_MACTYPE }, { "verrevpath", TOK_VERREVPATH }, { "versrcreach", TOK_VERSRCREACH }, { "antispoof", TOK_ANTISPOOF }, { "ipsec", TOK_IPSEC }, { "icmp6type", TOK_ICMP6TYPES }, { "icmp6types", TOK_ICMP6TYPES }, { "ext6hdr", TOK_EXT6HDR}, { "flow-id", TOK_FLOWID}, { "ipv6", TOK_IPV6}, { "ip6", TOK_IPV6}, { "ipv4", TOK_IPV4}, { "ip4", TOK_IPV4}, { "dst-ipv6", TOK_DSTIP6}, { "dst-ip6", TOK_DSTIP6}, { "src-ipv6", TOK_SRCIP6}, { "src-ip6", TOK_SRCIP6}, { "lookup", TOK_LOOKUP}, { "flow", TOK_FLOW}, { "//", TOK_COMMENT }, { "not", TOK_NOT }, /* pseudo option */ { "!", /* escape ? */ TOK_NOT }, /* pseudo option */ { "or", TOK_OR }, /* pseudo option */ { "|", /* escape */ TOK_OR }, /* pseudo option */ { "{", TOK_STARTBRACE }, /* pseudo option */ { "(", TOK_STARTBRACE }, /* pseudo option */ { "}", TOK_ENDBRACE }, /* pseudo option */ { ")", TOK_ENDBRACE }, /* pseudo option */ { NULL, 0 } /* terminator */ }; void bprint_uint_arg(struct buf_pr *bp, const char *str, uint32_t arg); static int ipfw_get_config(struct cmdline_opts *co, struct format_opts *fo, ipfw_cfg_lheader **pcfg, size_t *psize); static int ipfw_show_config(struct cmdline_opts *co, struct format_opts *fo, ipfw_cfg_lheader *cfg, size_t sz, int ac, char **av); static void ipfw_list_tifaces(void); /* * Simple string buffer API. * Used to simplify buffer passing between function and for * transparent overrun handling. */ /* * Allocates new buffer of given size @sz. * * Returns 0 on success. */ int bp_alloc(struct buf_pr *b, size_t size) { memset(b, 0, sizeof(struct buf_pr)); if ((b->buf = calloc(1, size)) == NULL) return (ENOMEM); b->ptr = b->buf; b->size = size; b->avail = b->size; return (0); } void bp_free(struct buf_pr *b) { free(b->buf); } /* * Flushes buffer so new writer start from beginning. */ void bp_flush(struct buf_pr *b) { b->ptr = b->buf; b->avail = b->size; } /* * Print message specified by @format and args. * Automatically manage buffer space and transparently handle * buffer overruns. * * Returns number of bytes that should have been printed. */ int bprintf(struct buf_pr *b, char *format, ...) { va_list args; int i; va_start(args, format); i = vsnprintf(b->ptr, b->avail, format, args); va_end(args); if (i > b->avail || i < 0) { /* Overflow or print error */ b->avail = 0; } else { b->ptr += i; b->avail -= i; } b->needed += i; return (i); } /* * Special values printer for tablearg-aware opcodes. */ void bprint_uint_arg(struct buf_pr *bp, const char *str, uint32_t arg) { if (str != NULL) bprintf(bp, "%s", str); if (arg == IP_FW_TARG) bprintf(bp, "tablearg"); else bprintf(bp, "%u", arg); } /* * Helper routine to print a possibly unaligned uint64_t on * various platform. If width > 0, print the value with * the desired width, followed by a space; * otherwise, return the required width. */ int pr_u64(struct buf_pr *b, uint64_t *pd, int width) { #ifdef TCC #define U64_FMT "I64" #else #define U64_FMT "llu" #endif uint64_t u; unsigned long long d; bcopy (pd, &u, sizeof(u)); d = u; return (width > 0) ? bprintf(b, "%*" U64_FMT " ", width, d) : snprintf(NULL, 0, "%" U64_FMT, d) ; #undef U64_FMT } void * safe_calloc(size_t number, size_t size) { void *ret = calloc(number, size); if (ret == NULL) err(EX_OSERR, "calloc"); return ret; } void * safe_realloc(void *ptr, size_t size) { void *ret = realloc(ptr, size); if (ret == NULL) err(EX_OSERR, "realloc"); return ret; } /* * Compare things like interface or table names. */ int stringnum_cmp(const char *a, const char *b) { int la, lb; la = strlen(a); lb = strlen(b); if (la > lb) return (1); else if (la < lb) return (-01); return (strcmp(a, b)); } /* * conditionally runs the command. * Selected options or negative -> getsockopt */ int do_cmd(int optname, void *optval, uintptr_t optlen) { int i; if (co.test_only) return 0; if (ipfw_socket == -1) ipfw_socket = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); if (ipfw_socket < 0) err(EX_UNAVAILABLE, "socket"); if (optname == IP_FW_GET || optname == IP_DUMMYNET_GET || optname == IP_FW_ADD || optname == IP_FW3 || optname == IP_FW_NAT_GET_CONFIG || optname < 0 || optname == IP_FW_NAT_GET_LOG) { if (optname < 0) optname = -optname; i = getsockopt(ipfw_socket, IPPROTO_IP, optname, optval, (socklen_t *)optlen); } else { i = setsockopt(ipfw_socket, IPPROTO_IP, optname, optval, optlen); } return i; } /* * do_set3 - pass ipfw control cmd to kernel * @optname: option name * @optval: pointer to option data * @optlen: option length * * Assumes op3 header is already embedded. * Calls setsockopt() with IP_FW3 as kernel-visible opcode. * Returns 0 on success or errno otherwise. */ int do_set3(int optname, ip_fw3_opheader *op3, uintptr_t optlen) { if (co.test_only) return (0); if (ipfw_socket == -1) ipfw_socket = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); if (ipfw_socket < 0) err(EX_UNAVAILABLE, "socket"); op3->opcode = optname; return (setsockopt(ipfw_socket, IPPROTO_IP, IP_FW3, op3, optlen)); } /* * do_get3 - pass ipfw control cmd to kernel * @optname: option name * @optval: pointer to option data * @optlen: pointer to option length * * Assumes op3 header is already embedded. * Calls getsockopt() with IP_FW3 as kernel-visible opcode. * Returns 0 on success or errno otherwise. */ int do_get3(int optname, ip_fw3_opheader *op3, size_t *optlen) { int error; if (co.test_only) return (0); if (ipfw_socket == -1) ipfw_socket = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); if (ipfw_socket < 0) err(EX_UNAVAILABLE, "socket"); op3->opcode = optname; error = getsockopt(ipfw_socket, IPPROTO_IP, IP_FW3, op3, (socklen_t *)optlen); return (error); } /** * match_token takes a table and a string, returns the value associated * with the string (-1 in case of failure). */ int match_token(struct _s_x *table, char *string) { struct _s_x *pt; uint i = strlen(string); for (pt = table ; i && pt->s != NULL ; pt++) if (strlen(pt->s) == i && !bcmp(string, pt->s, i)) return pt->x; return (-1); } /** * match_token takes a table and a string, returns the value associated * with the string for the best match. * * Returns: * value from @table for matched records * -1 for non-matched records * -2 if more than one records match @string. */ int match_token_relaxed(struct _s_x *table, char *string) { struct _s_x *pt, *m; int i, c; i = strlen(string); c = 0; for (pt = table ; i != 0 && pt->s != NULL ; pt++) { if (strncmp(pt->s, string, i) != 0) continue; m = pt; c++; } if (c == 1) return (m->x); return (c > 0 ? -2: -1); } /** * match_value takes a table and a value, returns the string associated * with the value (NULL in case of failure). */ char const * match_value(struct _s_x *p, int value) { for (; p->s != NULL; p++) if (p->x == value) return p->s; return NULL; } size_t concat_tokens(char *buf, size_t bufsize, struct _s_x *table, char *delimiter) { struct _s_x *pt; int l; size_t sz; for (sz = 0, pt = table ; pt->s != NULL; pt++) { l = snprintf(buf + sz, bufsize - sz, "%s%s", (sz == 0) ? "" : delimiter, pt->s); sz += l; bufsize += l; if (sz > bufsize) return (bufsize); } return (sz); } /* * helper function to process a set of flags and set bits in the * appropriate masks. */ int fill_flags(struct _s_x *flags, char *p, char **e, uint32_t *set, uint32_t *clear) { char *q; /* points to the separator */ int val; uint32_t *which; /* mask we are working on */ while (p && *p) { if (*p == '!') { p++; which = clear; } else which = set; q = strchr(p, ','); if (q) *q++ = '\0'; val = match_token(flags, p); if (val <= 0) { if (e != NULL) *e = p; return (-1); } *which |= (uint32_t)val; p = q; } return (0); } void print_flags_buffer(char *buf, size_t sz, struct _s_x *list, uint32_t set) { char const *comma = ""; int i, l; for (i = 0; list[i].x != 0; i++) { if ((set & list[i].x) == 0) continue; set &= ~list[i].x; l = snprintf(buf, sz, "%s%s", comma, list[i].s); if (l >= sz) return; comma = ","; buf += l; sz -=l; } } /* * _substrcmp takes two strings and returns 1 if they do not match, * and 0 if they match exactly or the first string is a sub-string * of the second. A warning is printed to stderr in the case that the * first string is a sub-string of the second. * * This function will be removed in the future through the usual * deprecation process. */ int _substrcmp(const char *str1, const char* str2) { if (strncmp(str1, str2, strlen(str1)) != 0) return 1; if (strlen(str1) != strlen(str2)) warnx("DEPRECATED: '%s' matched '%s' as a sub-string", str1, str2); return 0; } /* * _substrcmp2 takes three strings and returns 1 if the first two do not match, * and 0 if they match exactly or the second string is a sub-string * of the first. A warning is printed to stderr in the case that the * first string does not match the third. * * This function exists to warn about the bizarre construction * strncmp(str, "by", 2) which is used to allow people to use a shortcut * for "bytes". The problem is that in addition to accepting "by", * "byt", "byte", and "bytes", it also excepts "by_rabid_dogs" and any * other string beginning with "by". * * This function will be removed in the future through the usual * deprecation process. */ int _substrcmp2(const char *str1, const char* str2, const char* str3) { if (strncmp(str1, str2, strlen(str2)) != 0) return 1; if (strcmp(str1, str3) != 0) warnx("DEPRECATED: '%s' matched '%s'", str1, str3); return 0; } /* * prints one port, symbolic or numeric */ static void print_port(struct buf_pr *bp, int proto, uint16_t port) { if (proto == IPPROTO_ETHERTYPE) { char const *s; if (co.do_resolv && (s = match_value(ether_types, port)) ) bprintf(bp, "%s", s); else bprintf(bp, "0x%04x", port); } else { struct servent *se = NULL; if (co.do_resolv) { struct protoent *pe = getprotobynumber(proto); se = getservbyport(htons(port), pe ? pe->p_name : NULL); } if (se) bprintf(bp, "%s", se->s_name); else bprintf(bp, "%d", port); } } static struct _s_x _port_name[] = { {"dst-port", O_IP_DSTPORT}, {"src-port", O_IP_SRCPORT}, {"ipid", O_IPID}, {"iplen", O_IPLEN}, {"ipttl", O_IPTTL}, {"mac-type", O_MAC_TYPE}, {"tcpdatalen", O_TCPDATALEN}, {"tcpwin", O_TCPWIN}, {"tagged", O_TAGGED}, {NULL, 0} }; /* * Print the values in a list 16-bit items of the types above. * XXX todo: add support for mask. */ static void print_newports(struct buf_pr *bp, ipfw_insn_u16 *cmd, int proto, int opcode) { uint16_t *p = cmd->ports; int i; char const *sep; if (opcode != 0) { sep = match_value(_port_name, opcode); if (sep == NULL) sep = "???"; bprintf(bp, " %s", sep); } sep = " "; for (i = F_LEN((ipfw_insn *)cmd) - 1; i > 0; i--, p += 2) { bprintf(bp, "%s", sep); print_port(bp, proto, p[0]); if (p[0] != p[1]) { bprintf(bp, "-"); print_port(bp, proto, p[1]); } sep = ","; } } /* * Like strtol, but also translates service names into port numbers * for some protocols. * In particular: * proto == -1 disables the protocol check; * proto == IPPROTO_ETHERTYPE looks up an internal table * proto == matches the values there. * Returns *end == s in case the parameter is not found. */ static int strtoport(char *s, char **end, int base, int proto) { char *p, *buf; char *s1; int i; *end = s; /* default - not found */ if (*s == '\0') return 0; /* not found */ if (isdigit(*s)) return strtol(s, end, base); /* * find separator. '\\' escapes the next char. */ for (s1 = s; *s1 && (isalnum(*s1) || *s1 == '\\') ; s1++) if (*s1 == '\\' && s1[1] != '\0') s1++; buf = safe_calloc(s1 - s + 1, 1); /* * copy into a buffer skipping backslashes */ for (p = s, i = 0; p != s1 ; p++) if (*p != '\\') buf[i++] = *p; buf[i++] = '\0'; if (proto == IPPROTO_ETHERTYPE) { i = match_token(ether_types, buf); free(buf); if (i != -1) { /* found */ *end = s1; return i; } } else { struct protoent *pe = NULL; struct servent *se; if (proto != 0) pe = getprotobynumber(proto); setservent(1); se = getservbyname(buf, pe ? pe->p_name : NULL); free(buf); if (se != NULL) { *end = s1; return ntohs(se->s_port); } } return 0; /* not found */ } /* * Fill the body of the command with the list of port ranges. */ static int fill_newports(ipfw_insn_u16 *cmd, char *av, int proto, int cblen) { uint16_t a, b, *p = cmd->ports; int i = 0; char *s = av; while (*s) { a = strtoport(av, &s, 0, proto); if (s == av) /* empty or invalid argument */ return (0); CHECK_LENGTH(cblen, i + 2); switch (*s) { case '-': /* a range */ av = s + 1; b = strtoport(av, &s, 0, proto); /* Reject expressions like '1-abc' or '1-2-3'. */ if (s == av || (*s != ',' && *s != '\0')) return (0); p[0] = a; p[1] = b; break; case ',': /* comma separated list */ case '\0': p[0] = p[1] = a; break; default: warnx("port list: invalid separator <%c> in <%s>", *s, av); return (0); } i++; p += 2; av = s + 1; } if (i > 0) { if (i + 1 > F_LEN_MASK) errx(EX_DATAERR, "too many ports/ranges\n"); cmd->o.len |= i + 1; /* leave F_NOT and F_OR untouched */ } return (i); } /* * Fill the body of the command with the list of DiffServ codepoints. */ static void fill_dscp(ipfw_insn *cmd, char *av, int cblen) { uint32_t *low, *high; char *s = av, *a; int code; cmd->opcode = O_DSCP; cmd->len |= F_INSN_SIZE(ipfw_insn_u32) + 1; CHECK_CMDLEN; low = (uint32_t *)(cmd + 1); high = low + 1; *low = 0; *high = 0; while (s != NULL) { a = strchr(s, ','); if (a != NULL) *a++ = '\0'; if (isalpha(*s)) { if ((code = match_token(f_ipdscp, s)) == -1) errx(EX_DATAERR, "Unknown DSCP code"); } else { code = strtoul(s, NULL, 10); if (code < 0 || code > 63) errx(EX_DATAERR, "Invalid DSCP value"); } if (code > 32) *high |= 1 << (code - 32); else *low |= 1 << code; s = a; } } static struct _s_x icmpcodes[] = { { "net", ICMP_UNREACH_NET }, { "host", ICMP_UNREACH_HOST }, { "protocol", ICMP_UNREACH_PROTOCOL }, { "port", ICMP_UNREACH_PORT }, { "needfrag", ICMP_UNREACH_NEEDFRAG }, { "srcfail", ICMP_UNREACH_SRCFAIL }, { "net-unknown", ICMP_UNREACH_NET_UNKNOWN }, { "host-unknown", ICMP_UNREACH_HOST_UNKNOWN }, { "isolated", ICMP_UNREACH_ISOLATED }, { "net-prohib", ICMP_UNREACH_NET_PROHIB }, { "host-prohib", ICMP_UNREACH_HOST_PROHIB }, { "tosnet", ICMP_UNREACH_TOSNET }, { "toshost", ICMP_UNREACH_TOSHOST }, { "filter-prohib", ICMP_UNREACH_FILTER_PROHIB }, { "host-precedence", ICMP_UNREACH_HOST_PRECEDENCE }, { "precedence-cutoff", ICMP_UNREACH_PRECEDENCE_CUTOFF }, { NULL, 0 } }; static void fill_reject_code(u_short *codep, char *str) { int val; char *s; val = strtoul(str, &s, 0); if (s == str || *s != '\0' || val >= 0x100) val = match_token(icmpcodes, str); if (val < 0) errx(EX_DATAERR, "unknown ICMP unreachable code ``%s''", str); *codep = val; return; } static void print_reject_code(struct buf_pr *bp, uint16_t code) { char const *s; if ((s = match_value(icmpcodes, code)) != NULL) bprintf(bp, "unreach %s", s); else bprintf(bp, "unreach %u", code); } /* * Returns the number of bits set (from left) in a contiguous bitmask, * or -1 if the mask is not contiguous. * XXX this needs a proper fix. * This effectively works on masks in big-endian (network) format. * when compiled on little endian architectures. * * First bit is bit 7 of the first byte -- note, for MAC addresses, * the first bit on the wire is bit 0 of the first byte. * len is the max length in bits. */ int contigmask(uint8_t *p, int len) { int i, n; for (i=0; iarg1 & 0xff; uint8_t clear = (cmd->arg1 >> 8) & 0xff; if (list == f_tcpflags && set == TH_SYN && clear == TH_ACK) { bprintf(bp, " setup"); return; } bprintf(bp, " %s ", name); for (i=0; list[i].x != 0; i++) { if (set & list[i].x) { set &= ~list[i].x; bprintf(bp, "%s%s", comma, list[i].s); comma = ","; } if (clear & list[i].x) { clear &= ~list[i].x; bprintf(bp, "%s!%s", comma, list[i].s); comma = ","; } } } /* * Print the ip address contained in a command. */ static void print_ip(struct buf_pr *bp, struct format_opts *fo, ipfw_insn_ip *cmd, char const *s) { struct hostent *he = NULL; struct in_addr *ia; uint32_t len = F_LEN((ipfw_insn *)cmd); uint32_t *a = ((ipfw_insn_u32 *)cmd)->d; char *t; if (cmd->o.opcode == O_IP_DST_LOOKUP && len > F_INSN_SIZE(ipfw_insn_u32)) { uint32_t d = a[1]; const char *arg = ""; if (d < sizeof(lookup_key)/sizeof(lookup_key[0])) arg = match_value(rule_options, lookup_key[d]); t = table_search_ctlv(fo->tstate, ((ipfw_insn *)cmd)->arg1); bprintf(bp, "%s lookup %s %s", cmd->o.len & F_NOT ? " not": "", arg, t); return; } bprintf(bp, "%s%s ", cmd->o.len & F_NOT ? " not": "", s); if (cmd->o.opcode == O_IP_SRC_ME || cmd->o.opcode == O_IP_DST_ME) { bprintf(bp, "me"); return; } if (cmd->o.opcode == O_IP_SRC_LOOKUP || cmd->o.opcode == O_IP_DST_LOOKUP) { t = table_search_ctlv(fo->tstate, ((ipfw_insn *)cmd)->arg1); bprintf(bp, "table(%s", t); if (len == F_INSN_SIZE(ipfw_insn_u32)) bprintf(bp, ",%u", *a); bprintf(bp, ")"); return; } if (cmd->o.opcode == O_IP_SRC_SET || cmd->o.opcode == O_IP_DST_SET) { uint32_t x, *map = (uint32_t *)&(cmd->mask); int i, j; char comma = '{'; x = cmd->o.arg1 - 1; x = htonl( ~x ); cmd->addr.s_addr = htonl(cmd->addr.s_addr); bprintf(bp, "%s/%d", inet_ntoa(cmd->addr), contigmask((uint8_t *)&x, 32)); x = cmd->addr.s_addr = htonl(cmd->addr.s_addr); x &= 0xff; /* base */ /* * Print bits and ranges. * Locate first bit set (i), then locate first bit unset (j). * If we have 3+ consecutive bits set, then print them as a * range, otherwise only print the initial bit and rescan. */ for (i=0; i < cmd->o.arg1; i++) if (map[i/32] & (1<<(i & 31))) { for (j=i+1; j < cmd->o.arg1; j++) if (!(map[ j/32] & (1<<(j & 31)))) break; bprintf(bp, "%c%d", comma, i+x); if (j>i+2) { /* range has at least 3 elements */ bprintf(bp, "-%d", j-1+x); i = j-1; } comma = ','; } bprintf(bp, "}"); return; } /* * len == 2 indicates a single IP, whereas lists of 1 or more * addr/mask pairs have len = (2n+1). We convert len to n so we * use that to count the number of entries. */ for (len = len / 2; len > 0; len--, a += 2) { int mb = /* mask length */ (cmd->o.opcode == O_IP_SRC || cmd->o.opcode == O_IP_DST) ? 32 : contigmask((uint8_t *)&(a[1]), 32); if (mb == 32 && co.do_resolv) he = gethostbyaddr((char *)&(a[0]), sizeof(u_long), AF_INET); if (he != NULL) /* resolved to name */ bprintf(bp, "%s", he->h_name); else if (mb == 0) /* any */ bprintf(bp, "any"); else { /* numeric IP followed by some kind of mask */ ia = (struct in_addr *)&a[0]; bprintf(bp, "%s", inet_ntoa(*ia)); if (mb < 0) { ia = (struct in_addr *)&a[1]; bprintf(bp, ":%s", inet_ntoa(*ia)); } else if (mb < 32) bprintf(bp, "/%d", mb); } if (len > 1) bprintf(bp, ","); } } /* * prints a MAC address/mask pair */ static void print_mac(struct buf_pr *bp, uint8_t *addr, uint8_t *mask) { int l = contigmask(mask, 48); if (l == 0) bprintf(bp, " any"); else { bprintf(bp, " %02x:%02x:%02x:%02x:%02x:%02x", addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]); if (l == -1) bprintf(bp, "&%02x:%02x:%02x:%02x:%02x:%02x", mask[0], mask[1], mask[2], mask[3], mask[4], mask[5]); else if (l < 48) bprintf(bp, "/%d", l); } } static void fill_icmptypes(ipfw_insn_u32 *cmd, char *av) { uint8_t type; cmd->d[0] = 0; while (*av) { if (*av == ',') av++; type = strtoul(av, &av, 0); if (*av != ',' && *av != '\0') errx(EX_DATAERR, "invalid ICMP type"); if (type > 31) errx(EX_DATAERR, "ICMP type out of range"); cmd->d[0] |= 1 << type; } cmd->o.opcode = O_ICMPTYPE; cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32); } static void print_icmptypes(struct buf_pr *bp, ipfw_insn_u32 *cmd) { int i; char sep= ' '; bprintf(bp, " icmptypes"); for (i = 0; i < 32; i++) { if ( (cmd->d[0] & (1 << (i))) == 0) continue; bprintf(bp, "%c%d", sep, i); sep = ','; } } static void print_dscp(struct buf_pr *bp, ipfw_insn_u32 *cmd) { int i, c; uint32_t *v; char sep= ' '; const char *code; bprintf(bp, " dscp"); i = 0; c = 0; v = cmd->d; while (i < 64) { if (*v & (1 << i)) { if ((code = match_value(f_ipdscp, i)) != NULL) bprintf(bp, "%c%s", sep, code); else bprintf(bp, "%c%d", sep, i); sep = ','; } if ((++i % 32) == 0) v++; } } /* * show_ipfw() prints the body of an ipfw rule. * Because the standard rule has at least proto src_ip dst_ip, we use * a helper function to produce these entries if not provided explicitly. * The first argument is the list of fields we have, the second is * the list of fields we want to be printed. * * Special cases if we have provided a MAC header: * + if the rule does not contain IP addresses/ports, do not print them; * + if the rule does not contain an IP proto, print "all" instead of "ip"; * * Once we have 'have_options', IP header fields are printed as options. */ #define HAVE_PROTO 0x0001 #define HAVE_SRCIP 0x0002 #define HAVE_DSTIP 0x0004 #define HAVE_PROTO4 0x0008 #define HAVE_PROTO6 0x0010 #define HAVE_IP 0x0100 #define HAVE_OPTIONS 0x8000 static void show_prerequisites(struct buf_pr *bp, int *flags, int want, int cmd) { (void)cmd; /* UNUSED */ if (co.comment_only) return; if ( (*flags & HAVE_IP) == HAVE_IP) *flags |= HAVE_OPTIONS; if ( !(*flags & HAVE_OPTIONS)) { if ( !(*flags & HAVE_PROTO) && (want & HAVE_PROTO)) { if ( (*flags & HAVE_PROTO4)) bprintf(bp, " ip4"); else if ( (*flags & HAVE_PROTO6)) bprintf(bp, " ip6"); else bprintf(bp, " ip"); } if ( !(*flags & HAVE_SRCIP) && (want & HAVE_SRCIP)) bprintf(bp, " from any"); if ( !(*flags & HAVE_DSTIP) && (want & HAVE_DSTIP)) bprintf(bp, " to any"); } *flags |= want; } static void show_static_rule(struct cmdline_opts *co, struct format_opts *fo, struct buf_pr *bp, struct ip_fw_rule *rule, struct ip_fw_bcounter *cntr) { static int twidth = 0; int l; ipfw_insn *cmd, *tagptr = NULL; const char *comment = NULL; /* ptr to comment if we have one */ int proto = 0; /* default */ int flags = 0; /* prerequisites */ ipfw_insn_log *logptr = NULL; /* set if we find an O_LOG */ ipfw_insn_altq *altqptr = NULL; /* set if we find an O_ALTQ */ int or_block = 0; /* we are in an or block */ uint32_t uval; if ((fo->set_mask & (1 << rule->set)) == 0) { /* disabled mask */ if (!co->show_sets) return; else bprintf(bp, "# DISABLED "); } bprintf(bp, "%05u ", rule->rulenum); /* Print counters if enabled */ if (fo->pcwidth > 0 || fo->bcwidth > 0) { pr_u64(bp, &cntr->pcnt, fo->pcwidth); pr_u64(bp, &cntr->bcnt, fo->bcwidth); } if (co->do_time == 2) bprintf(bp, "%10u ", cntr->timestamp); else if (co->do_time == 1) { char timestr[30]; time_t t = (time_t)0; if (twidth == 0) { strcpy(timestr, ctime(&t)); *strchr(timestr, '\n') = '\0'; twidth = strlen(timestr); } if (cntr->timestamp > 0) { t = _long_to_time(cntr->timestamp); strcpy(timestr, ctime(&t)); *strchr(timestr, '\n') = '\0'; bprintf(bp, "%s ", timestr); } else { bprintf(bp, "%*s", twidth, " "); } } if (co->show_sets) bprintf(bp, "set %d ", rule->set); /* * print the optional "match probability" */ if (rule->cmd_len > 0) { cmd = rule->cmd ; if (cmd->opcode == O_PROB) { ipfw_insn_u32 *p = (ipfw_insn_u32 *)cmd; double d = 1.0 * p->d[0]; d = (d / 0x7fffffff); bprintf(bp, "prob %f ", d); } } /* * first print actions */ for (l = rule->cmd_len - rule->act_ofs, cmd = ACTION_PTR(rule); l > 0 ; l -= F_LEN(cmd), cmd += F_LEN(cmd)) { switch(cmd->opcode) { case O_CHECK_STATE: bprintf(bp, "check-state"); /* avoid printing anything else */ flags = HAVE_PROTO | HAVE_SRCIP | HAVE_DSTIP | HAVE_IP; break; case O_ACCEPT: bprintf(bp, "allow"); break; case O_COUNT: bprintf(bp, "count"); break; case O_DENY: bprintf(bp, "deny"); break; case O_REJECT: if (cmd->arg1 == ICMP_REJECT_RST) bprintf(bp, "reset"); else if (cmd->arg1 == ICMP_UNREACH_HOST) bprintf(bp, "reject"); else print_reject_code(bp, cmd->arg1); break; case O_UNREACH6: if (cmd->arg1 == ICMP6_UNREACH_RST) bprintf(bp, "reset6"); else print_unreach6_code(cmd->arg1); break; case O_SKIPTO: bprint_uint_arg(bp, "skipto ", cmd->arg1); break; case O_PIPE: bprint_uint_arg(bp, "pipe ", cmd->arg1); break; case O_QUEUE: bprint_uint_arg(bp, "queue ", cmd->arg1); break; case O_DIVERT: bprint_uint_arg(bp, "divert ", cmd->arg1); break; case O_TEE: bprint_uint_arg(bp, "tee ", cmd->arg1); break; case O_NETGRAPH: bprint_uint_arg(bp, "netgraph ", cmd->arg1); break; case O_NGTEE: bprint_uint_arg(bp, "ngtee ", cmd->arg1); break; case O_FORWARD_IP: { ipfw_insn_sa *s = (ipfw_insn_sa *)cmd; if (s->sa.sin_addr.s_addr == INADDR_ANY) { bprintf(bp, "fwd tablearg"); } else { bprintf(bp, "fwd %s",inet_ntoa(s->sa.sin_addr)); } if (s->sa.sin_port) bprintf(bp, ",%d", s->sa.sin_port); } break; case O_FORWARD_IP6: { char buf[4 + INET6_ADDRSTRLEN + 1]; ipfw_insn_sa6 *s = (ipfw_insn_sa6 *)cmd; bprintf(bp, "fwd %s", inet_ntop(AF_INET6, &s->sa.sin6_addr, buf, sizeof(buf))); if (s->sa.sin6_port) bprintf(bp, ",%d", s->sa.sin6_port); } break; case O_LOG: /* O_LOG is printed last */ logptr = (ipfw_insn_log *)cmd; break; case O_ALTQ: /* O_ALTQ is printed after O_LOG */ altqptr = (ipfw_insn_altq *)cmd; break; case O_TAG: tagptr = cmd; break; case O_NAT: if (cmd->arg1 != 0) bprint_uint_arg(bp, "nat ", cmd->arg1); else bprintf(bp, "nat global"); break; case O_SETFIB: bprint_uint_arg(bp, "setfib ", cmd->arg1 & 0x7FFF); break; case O_SETDSCP: { const char *code; if (cmd->arg1 == IP_FW_TARG) { bprint_uint_arg(bp, "setdscp ", cmd->arg1); break; } uval = cmd->arg1 & 0x3F; if ((code = match_value(f_ipdscp, uval)) != NULL) bprintf(bp, "setdscp %s", code); else bprint_uint_arg(bp, "setdscp ", uval); } break; case O_REASS: bprintf(bp, "reass"); break; case O_CALLRETURN: if (cmd->len & F_NOT) bprintf(bp, "return"); else bprint_uint_arg(bp, "call ", cmd->arg1); break; default: bprintf(bp, "** unrecognized action %d len %d ", cmd->opcode, cmd->len); } } if (logptr) { if (logptr->max_log > 0) bprintf(bp, " log logamount %d", logptr->max_log); else bprintf(bp, " log"); } #ifndef NO_ALTQ if (altqptr) { print_altq_cmd(bp, altqptr); } #endif if (tagptr) { if (tagptr->len & F_NOT) bprint_uint_arg(bp, " untag ", tagptr->arg1); else bprint_uint_arg(bp, " tag ", tagptr->arg1); } /* * then print the body. */ for (l = rule->act_ofs, cmd = rule->cmd; l > 0 ; l -= F_LEN(cmd) , cmd += F_LEN(cmd)) { if ((cmd->len & F_OR) || (cmd->len & F_NOT)) continue; if (cmd->opcode == O_IP4) { flags |= HAVE_PROTO4; break; } else if (cmd->opcode == O_IP6) { flags |= HAVE_PROTO6; break; } } if (rule->flags & IPFW_RULE_NOOPT) { /* empty rules before options */ if (!co->do_compact) { show_prerequisites(bp, &flags, HAVE_PROTO, 0); bprintf(bp, " from any to any"); } flags |= HAVE_IP | HAVE_OPTIONS | HAVE_PROTO | HAVE_SRCIP | HAVE_DSTIP; } if (co->comment_only) comment = "..."; for (l = rule->act_ofs, cmd = rule->cmd; l > 0 ; l -= F_LEN(cmd) , cmd += F_LEN(cmd)) { /* useful alias */ ipfw_insn_u32 *cmd32 = (ipfw_insn_u32 *)cmd; if (co->comment_only) { if (cmd->opcode != O_NOP) continue; bprintf(bp, " // %s\n", (char *)(cmd + 1)); return; } show_prerequisites(bp, &flags, 0, cmd->opcode); switch(cmd->opcode) { case O_PROB: break; /* done already */ case O_PROBE_STATE: break; /* no need to print anything here */ case O_IP_SRC: case O_IP_SRC_LOOKUP: case O_IP_SRC_MASK: case O_IP_SRC_ME: case O_IP_SRC_SET: show_prerequisites(bp, &flags, HAVE_PROTO, 0); if (!(flags & HAVE_SRCIP)) bprintf(bp, " from"); if ((cmd->len & F_OR) && !or_block) bprintf(bp, " {"); print_ip(bp, fo, (ipfw_insn_ip *)cmd, (flags & HAVE_OPTIONS) ? " src-ip" : ""); flags |= HAVE_SRCIP; break; case O_IP_DST: case O_IP_DST_LOOKUP: case O_IP_DST_MASK: case O_IP_DST_ME: case O_IP_DST_SET: show_prerequisites(bp, &flags, HAVE_PROTO|HAVE_SRCIP, 0); if (!(flags & HAVE_DSTIP)) bprintf(bp, " to"); if ((cmd->len & F_OR) && !or_block) bprintf(bp, " {"); print_ip(bp, fo, (ipfw_insn_ip *)cmd, (flags & HAVE_OPTIONS) ? " dst-ip" : ""); flags |= HAVE_DSTIP; break; case O_IP6_SRC: case O_IP6_SRC_MASK: case O_IP6_SRC_ME: show_prerequisites(bp, &flags, HAVE_PROTO, 0); if (!(flags & HAVE_SRCIP)) bprintf(bp, " from"); if ((cmd->len & F_OR) && !or_block) bprintf(bp, " {"); print_ip6(bp, (ipfw_insn_ip6 *)cmd, (flags & HAVE_OPTIONS) ? " src-ip6" : ""); flags |= HAVE_SRCIP | HAVE_PROTO; break; case O_IP6_DST: case O_IP6_DST_MASK: case O_IP6_DST_ME: show_prerequisites(bp, &flags, HAVE_PROTO|HAVE_SRCIP, 0); if (!(flags & HAVE_DSTIP)) bprintf(bp, " to"); if ((cmd->len & F_OR) && !or_block) bprintf(bp, " {"); print_ip6(bp, (ipfw_insn_ip6 *)cmd, (flags & HAVE_OPTIONS) ? " dst-ip6" : ""); flags |= HAVE_DSTIP; break; case O_FLOW6ID: print_flow6id(bp, (ipfw_insn_u32 *) cmd ); flags |= HAVE_OPTIONS; break; case O_IP_DSTPORT: show_prerequisites(bp, &flags, HAVE_PROTO | HAVE_SRCIP | HAVE_DSTIP | HAVE_IP, 0); case O_IP_SRCPORT: if (flags & HAVE_DSTIP) flags |= HAVE_IP; show_prerequisites(bp, &flags, HAVE_PROTO | HAVE_SRCIP, 0); if ((cmd->len & F_OR) && !or_block) bprintf(bp, " {"); if (cmd->len & F_NOT) bprintf(bp, " not"); print_newports(bp, (ipfw_insn_u16 *)cmd, proto, (flags & HAVE_OPTIONS) ? cmd->opcode : 0); break; case O_PROTO: { struct protoent *pe = NULL; if ((cmd->len & F_OR) && !or_block) bprintf(bp, " {"); if (cmd->len & F_NOT) bprintf(bp, " not"); proto = cmd->arg1; pe = getprotobynumber(cmd->arg1); if ((flags & (HAVE_PROTO4 | HAVE_PROTO6)) && !(flags & HAVE_PROTO)) show_prerequisites(bp, &flags, HAVE_PROTO | HAVE_IP | HAVE_SRCIP | HAVE_DSTIP | HAVE_OPTIONS, 0); if (flags & HAVE_OPTIONS) bprintf(bp, " proto"); if (pe) bprintf(bp, " %s", pe->p_name); else bprintf(bp, " %u", cmd->arg1); } flags |= HAVE_PROTO; break; default: /*options ... */ if (!(cmd->len & (F_OR|F_NOT))) if (((cmd->opcode == O_IP6) && (flags & HAVE_PROTO6)) || ((cmd->opcode == O_IP4) && (flags & HAVE_PROTO4))) break; show_prerequisites(bp, &flags, HAVE_PROTO | HAVE_SRCIP | HAVE_DSTIP | HAVE_IP | HAVE_OPTIONS, 0); if ((cmd->len & F_OR) && !or_block) bprintf(bp, " {"); if (cmd->len & F_NOT && cmd->opcode != O_IN) bprintf(bp, " not"); switch(cmd->opcode) { case O_MACADDR2: { ipfw_insn_mac *m = (ipfw_insn_mac *)cmd; bprintf(bp, " MAC"); print_mac(bp, m->addr, m->mask); print_mac(bp, m->addr + 6, m->mask + 6); } break; case O_MAC_TYPE: print_newports(bp, (ipfw_insn_u16 *)cmd, IPPROTO_ETHERTYPE, cmd->opcode); break; case O_FRAG: bprintf(bp, " frag"); break; case O_FIB: bprintf(bp, " fib %u", cmd->arg1 ); break; case O_SOCKARG: bprintf(bp, " sockarg"); break; case O_IN: bprintf(bp, cmd->len & F_NOT ? " out" : " in"); break; case O_DIVERTED: switch (cmd->arg1) { case 3: bprintf(bp, " diverted"); break; case 1: bprintf(bp, " diverted-loopback"); break; case 2: bprintf(bp, " diverted-output"); break; default: bprintf(bp, " diverted-?<%u>", cmd->arg1); break; } break; case O_LAYER2: bprintf(bp, " layer2"); break; case O_XMIT: case O_RECV: case O_VIA: { char const *s, *t; ipfw_insn_if *cmdif = (ipfw_insn_if *)cmd; if (cmd->opcode == O_XMIT) s = "xmit"; else if (cmd->opcode == O_RECV) s = "recv"; else /* if (cmd->opcode == O_VIA) */ s = "via"; if (cmdif->name[0] == '\0') bprintf(bp, " %s %s", s, inet_ntoa(cmdif->p.ip)); else if (cmdif->name[0] == '\1') { /* interface table */ t = table_search_ctlv(fo->tstate, cmdif->p.kidx); bprintf(bp, " %s table(%s)", s, t); } else bprintf(bp, " %s %s", s, cmdif->name); break; } case O_IP_FLOW_LOOKUP: { char *t; t = table_search_ctlv(fo->tstate, cmd->arg1); bprintf(bp, " flow table(%s", t); if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn_u32)) bprintf(bp, ",%u", ((ipfw_insn_u32 *)cmd)->d[0]); bprintf(bp, ")"); break; } case O_IPID: if (F_LEN(cmd) == 1) bprintf(bp, " ipid %u", cmd->arg1 ); else print_newports(bp, (ipfw_insn_u16 *)cmd, 0, O_IPID); break; case O_IPTTL: if (F_LEN(cmd) == 1) bprintf(bp, " ipttl %u", cmd->arg1 ); else print_newports(bp, (ipfw_insn_u16 *)cmd, 0, O_IPTTL); break; case O_IPVER: bprintf(bp, " ipver %u", cmd->arg1 ); break; case O_IPPRECEDENCE: bprintf(bp, " ipprecedence %u", cmd->arg1 >> 5); break; case O_DSCP: print_dscp(bp, (ipfw_insn_u32 *)cmd); break; case O_IPLEN: if (F_LEN(cmd) == 1) bprintf(bp, " iplen %u", cmd->arg1 ); else print_newports(bp, (ipfw_insn_u16 *)cmd, 0, O_IPLEN); break; case O_IPOPT: print_flags(bp, "ipoptions", cmd, f_ipopts); break; case O_IPTOS: print_flags(bp, "iptos", cmd, f_iptos); break; case O_ICMPTYPE: print_icmptypes(bp, (ipfw_insn_u32 *)cmd); break; case O_ESTAB: bprintf(bp, " established"); break; case O_TCPDATALEN: if (F_LEN(cmd) == 1) bprintf(bp, " tcpdatalen %u", cmd->arg1 ); else print_newports(bp, (ipfw_insn_u16 *)cmd, 0, O_TCPDATALEN); break; case O_TCPFLAGS: print_flags(bp, "tcpflags", cmd, f_tcpflags); break; case O_TCPOPTS: print_flags(bp, "tcpoptions", cmd, f_tcpopts); break; case O_TCPWIN: if (F_LEN(cmd) == 1) bprintf(bp, " tcpwin %u", cmd->arg1); else print_newports(bp, (ipfw_insn_u16 *)cmd, 0, O_TCPWIN); break; case O_TCPACK: bprintf(bp, " tcpack %d", ntohl(cmd32->d[0])); break; case O_TCPSEQ: bprintf(bp, " tcpseq %d", ntohl(cmd32->d[0])); break; case O_UID: { struct passwd *pwd = getpwuid(cmd32->d[0]); if (pwd) bprintf(bp, " uid %s", pwd->pw_name); else bprintf(bp, " uid %u", cmd32->d[0]); } break; case O_GID: { struct group *grp = getgrgid(cmd32->d[0]); if (grp) bprintf(bp, " gid %s", grp->gr_name); else bprintf(bp, " gid %u", cmd32->d[0]); } break; case O_JAIL: bprintf(bp, " jail %d", cmd32->d[0]); break; case O_VERREVPATH: bprintf(bp, " verrevpath"); break; case O_VERSRCREACH: bprintf(bp, " versrcreach"); break; case O_ANTISPOOF: bprintf(bp, " antispoof"); break; case O_IPSEC: bprintf(bp, " ipsec"); break; case O_NOP: comment = (char *)(cmd + 1); break; case O_KEEP_STATE: bprintf(bp, " keep-state"); break; case O_LIMIT: { struct _s_x *p = limit_masks; ipfw_insn_limit *c = (ipfw_insn_limit *)cmd; uint8_t x = c->limit_mask; char const *comma = " "; bprintf(bp, " limit"); for (; p->x != 0 ; p++) if ((x & p->x) == p->x) { x &= ~p->x; bprintf(bp, "%s%s", comma,p->s); comma = ","; } bprint_uint_arg(bp, " ", c->conn_limit); break; } case O_IP6: bprintf(bp, " ip6"); break; case O_IP4: bprintf(bp, " ip4"); break; case O_ICMP6TYPE: print_icmp6types(bp, (ipfw_insn_u32 *)cmd); break; case O_EXT_HDR: print_ext6hdr(bp, (ipfw_insn *)cmd); break; case O_TAGGED: if (F_LEN(cmd) == 1) bprint_uint_arg(bp, " tagged ", cmd->arg1); else print_newports(bp, (ipfw_insn_u16 *)cmd, 0, O_TAGGED); break; default: bprintf(bp, " [opcode %d len %d]", cmd->opcode, cmd->len); } } if (cmd->len & F_OR) { bprintf(bp, " or"); or_block = 1; } else if (or_block) { bprintf(bp, " }"); or_block = 0; } } show_prerequisites(bp, &flags, HAVE_PROTO | HAVE_SRCIP | HAVE_DSTIP | HAVE_IP, 0); if (comment) bprintf(bp, " // %s", comment); bprintf(bp, "\n"); } static void show_dyn_state(struct cmdline_opts *co, struct format_opts *fo, struct buf_pr *bp, ipfw_dyn_rule *d) { struct protoent *pe; struct in_addr a; uint16_t rulenum; char buf[INET6_ADDRSTRLEN]; if (!co->do_expired) { if (!d->expire && !(d->dyn_type == O_LIMIT_PARENT)) return; } bcopy(&d->rule, &rulenum, sizeof(rulenum)); bprintf(bp, "%05d", rulenum); if (fo->pcwidth > 0 || fo->bcwidth > 0) { bprintf(bp, " "); pr_u64(bp, &d->pcnt, fo->pcwidth); pr_u64(bp, &d->bcnt, fo->bcwidth); bprintf(bp, "(%ds)", d->expire); } switch (d->dyn_type) { case O_LIMIT_PARENT: bprintf(bp, " PARENT %d", d->count); break; case O_LIMIT: bprintf(bp, " LIMIT"); break; case O_KEEP_STATE: /* bidir, no mask */ bprintf(bp, " STATE"); break; } if ((pe = getprotobynumber(d->id.proto)) != NULL) bprintf(bp, " %s", pe->p_name); else bprintf(bp, " proto %u", d->id.proto); if (d->id.addr_type == 4) { a.s_addr = htonl(d->id.src_ip); bprintf(bp, " %s %d", inet_ntoa(a), d->id.src_port); a.s_addr = htonl(d->id.dst_ip); bprintf(bp, " <-> %s %d", inet_ntoa(a), d->id.dst_port); } else if (d->id.addr_type == 6) { bprintf(bp, " %s %d", inet_ntop(AF_INET6, &d->id.src_ip6, buf, sizeof(buf)), d->id.src_port); bprintf(bp, " <-> %s %d", inet_ntop(AF_INET6, &d->id.dst_ip6, buf, sizeof(buf)), d->id.dst_port); } else bprintf(bp, " UNKNOWN <-> UNKNOWN\n"); } static int do_range_cmd(int cmd, ipfw_range_tlv *rt) { ipfw_range_header rh; size_t sz; memset(&rh, 0, sizeof(rh)); memcpy(&rh.range, rt, sizeof(*rt)); rh.range.head.length = sizeof(*rt); rh.range.head.type = IPFW_TLV_RANGE; sz = sizeof(rh); if (do_get3(cmd, &rh.opheader, &sz) != 0) return (-1); /* Save number of matched objects */ rt->new_set = rh.range.new_set; return (0); } /* * This one handles all set-related commands * ipfw set { show | enable | disable } * ipfw set swap X Y * ipfw set move X to Y * ipfw set move rule X to Y */ void ipfw_sets_handler(char *av[]) { uint32_t masks[2]; int i; uint8_t cmd, rulenum; ipfw_range_tlv rt; char *msg; size_t size; av++; memset(&rt, 0, sizeof(rt)); if (av[0] == NULL) errx(EX_USAGE, "set needs command"); if (_substrcmp(*av, "show") == 0) { struct format_opts fo; ipfw_cfg_lheader *cfg; memset(&fo, 0, sizeof(fo)); if (ipfw_get_config(&co, &fo, &cfg, &size) != 0) err(EX_OSERR, "requesting config failed"); for (i = 0, msg = "disable"; i < RESVD_SET; i++) if ((cfg->set_mask & (1<set_mask != (uint32_t)-1) ? " enable" : "enable"; for (i = 0; i < RESVD_SET; i++) if ((cfg->set_mask & (1< RESVD_SET) errx(EX_DATAERR, "invalid set number %s\n", av[0]); if (!isdigit(*(av[1])) || rt.new_set > RESVD_SET) errx(EX_DATAERR, "invalid set number %s\n", av[1]); i = do_range_cmd(IP_FW_SET_SWAP, &rt); } else if (_substrcmp(*av, "move") == 0) { av++; if (av[0] && _substrcmp(*av, "rule") == 0) { rt.flags = IPFW_RCFLAG_RANGE; /* move rules to new set */ cmd = IP_FW_XMOVE; av++; } else cmd = IP_FW_SET_MOVE; /* Move set to new one */ if (av[0] == NULL || av[1] == NULL || av[2] == NULL || av[3] != NULL || _substrcmp(av[1], "to") != 0) errx(EX_USAGE, "syntax: set move [rule] X to Y\n"); rulenum = atoi(av[0]); rt.new_set = atoi(av[2]); if (cmd == IP_FW_XMOVE) { rt.start_rule = rulenum; rt.end_rule = rulenum; } else rt.set = rulenum; rt.new_set = atoi(av[2]); if (!isdigit(*(av[0])) || (cmd == 3 && rt.set > RESVD_SET) || (cmd == 2 && rt.start_rule == IPFW_DEFAULT_RULE) ) errx(EX_DATAERR, "invalid source number %s\n", av[0]); if (!isdigit(*(av[2])) || rt.new_set > RESVD_SET) errx(EX_DATAERR, "invalid dest. set %s\n", av[1]); i = do_range_cmd(cmd, &rt); } else if (_substrcmp(*av, "disable") == 0 || _substrcmp(*av, "enable") == 0 ) { int which = _substrcmp(*av, "enable") == 0 ? 1 : 0; av++; masks[0] = masks[1] = 0; while (av[0]) { if (isdigit(**av)) { i = atoi(*av); if (i < 0 || i > RESVD_SET) errx(EX_DATAERR, "invalid set number %d\n", i); masks[which] |= (1<dcnt++; if (fo->show_counters == 0) return; if (co->use_set) { /* skip states from another set */ bcopy((char *)&d->rule + sizeof(uint16_t), &set, sizeof(uint8_t)); if (set != co->use_set - 1) return; } width = pr_u64(NULL, &d->pcnt, 0); if (width > fo->pcwidth) fo->pcwidth = width; width = pr_u64(NULL, &d->bcnt, 0); if (width > fo->bcwidth) fo->bcwidth = width; } static int foreach_state(struct cmdline_opts *co, struct format_opts *fo, caddr_t base, size_t sz, state_cb dyn_bc, void *dyn_arg) { int ttype; state_cb *fptr; void *farg; ipfw_obj_tlv *tlv; ipfw_obj_ctlv *ctlv; fptr = NULL; ttype = 0; while (sz > 0) { ctlv = (ipfw_obj_ctlv *)base; switch (ctlv->head.type) { case IPFW_TLV_DYNSTATE_LIST: base += sizeof(*ctlv); sz -= sizeof(*ctlv); ttype = IPFW_TLV_DYN_ENT; fptr = dyn_bc; farg = dyn_arg; break; default: return (sz); } while (sz > 0) { tlv = (ipfw_obj_tlv *)base; if (tlv->type != ttype) break; fptr(co, fo, farg, tlv + 1); sz -= tlv->length; base += tlv->length; } } return (sz); } static void prepare_format_opts(struct cmdline_opts *co, struct format_opts *fo, ipfw_obj_tlv *rtlv, int rcnt, caddr_t dynbase, size_t dynsz) { int bcwidth, pcwidth, width; int n; struct ip_fw_bcounter *cntr; struct ip_fw_rule *r; bcwidth = 0; pcwidth = 0; if (fo->show_counters != 0) { for (n = 0; n < rcnt; n++, rtlv = (ipfw_obj_tlv *)((caddr_t)rtlv + rtlv->length)) { cntr = (struct ip_fw_bcounter *)(rtlv + 1); r = (struct ip_fw_rule *)((caddr_t)cntr + cntr->size); /* skip rules from another set */ if (co->use_set && r->set != co->use_set - 1) continue; /* packet counter */ width = pr_u64(NULL, &cntr->pcnt, 0); if (width > pcwidth) pcwidth = width; /* byte counter */ width = pr_u64(NULL, &cntr->bcnt, 0); if (width > bcwidth) bcwidth = width; } } fo->bcwidth = bcwidth; fo->pcwidth = pcwidth; fo->dcnt = 0; if (co->do_dynamic && dynsz > 0) foreach_state(co, fo, dynbase, dynsz, prepare_format_dyn, NULL); } static int list_static_range(struct cmdline_opts *co, struct format_opts *fo, struct buf_pr *bp, ipfw_obj_tlv *rtlv, int rcnt) { int n, seen; struct ip_fw_rule *r; struct ip_fw_bcounter *cntr; int c = 0; for (n = seen = 0; n < rcnt; n++, rtlv = (ipfw_obj_tlv *)((caddr_t)rtlv + rtlv->length)) { - if (fo->show_counters != 0) { + if ((fo->show_counters | fo->show_time) != 0) { cntr = (struct ip_fw_bcounter *)(rtlv + 1); r = (struct ip_fw_rule *)((caddr_t)cntr + cntr->size); } else { cntr = NULL; r = (struct ip_fw_rule *)(rtlv + 1); } if (r->rulenum > fo->last) break; if (co->use_set && r->set != co->use_set - 1) continue; if (r->rulenum >= fo->first && r->rulenum <= fo->last) { show_static_rule(co, fo, bp, r, cntr); printf("%s", bp->buf); c += rtlv->length; bp_flush(bp); seen++; } } return (seen); } static void list_dyn_state(struct cmdline_opts *co, struct format_opts *fo, void *_arg, void *_state) { uint16_t rulenum; uint8_t set; ipfw_dyn_rule *d; struct buf_pr *bp; d = (ipfw_dyn_rule *)_state; bp = (struct buf_pr *)_arg; bcopy(&d->rule, &rulenum, sizeof(rulenum)); if (rulenum > fo->last) return; if (co->use_set) { bcopy((char *)&d->rule + sizeof(uint16_t), &set, sizeof(uint8_t)); if (set != co->use_set - 1) return; } if (rulenum >= fo->first) { show_dyn_state(co, fo, bp, d); printf("%s\n", bp->buf); bp_flush(bp); } } static int list_dyn_range(struct cmdline_opts *co, struct format_opts *fo, struct buf_pr *bp, caddr_t base, size_t sz) { sz = foreach_state(co, fo, base, sz, list_dyn_state, bp); return (sz); } void ipfw_list(int ac, char *av[], int show_counters) { ipfw_cfg_lheader *cfg; struct format_opts sfo; size_t sz; int error; int lac; char **lav; uint32_t rnum; char *endptr; if (co.test_only) { fprintf(stderr, "Testing only, list disabled\n"); return; } if (co.do_pipe) { dummynet_list(ac, av, show_counters); return; } ac--; av++; memset(&sfo, 0, sizeof(sfo)); /* Determine rule range to request */ if (ac > 0) { for (lac = ac, lav = av; lac != 0; lac--) { rnum = strtoul(*lav++, &endptr, 10); if (sfo.first == 0 || rnum < sfo.first) sfo.first = rnum; if (*endptr == '-') rnum = strtoul(endptr + 1, &endptr, 10); if (sfo.last == 0 || rnum > sfo.last) sfo.last = rnum; } } /* get configuraion from kernel */ cfg = NULL; sfo.show_counters = show_counters; + sfo.show_time = co.do_time; sfo.flags = IPFW_CFG_GET_STATIC; if (co.do_dynamic != 0) sfo.flags |= IPFW_CFG_GET_STATES; - if (sfo.show_counters != 0) + if ((sfo.show_counters | sfo.show_time) != 0) sfo.flags |= IPFW_CFG_GET_COUNTERS; if (ipfw_get_config(&co, &sfo, &cfg, &sz) != 0) err(EX_OSERR, "retrieving config failed"); error = ipfw_show_config(&co, &sfo, cfg, sz, ac, av); free(cfg); if (error != EX_OK) exit(error); } static int ipfw_show_config(struct cmdline_opts *co, struct format_opts *fo, ipfw_cfg_lheader *cfg, size_t sz, int ac, char *av[]) { caddr_t dynbase; size_t dynsz; int rcnt; int exitval = EX_OK; int lac; char **lav; char *endptr; size_t readsz; struct buf_pr bp; ipfw_obj_ctlv *ctlv, *tstate; ipfw_obj_tlv *rbase; /* * Handle tablenames TLV first, if any */ tstate = NULL; rbase = NULL; dynbase = NULL; dynsz = 0; readsz = sizeof(*cfg); rcnt = 0; fo->set_mask = cfg->set_mask; ctlv = (ipfw_obj_ctlv *)(cfg + 1); if (cfg->flags & IPFW_CFG_GET_STATIC) { /* We've requested static rules */ if (ctlv->head.type == IPFW_TLV_TBLNAME_LIST) { fo->tstate = ctlv; readsz += ctlv->head.length; ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length); } if (ctlv->head.type == IPFW_TLV_RULE_LIST) { rbase = (ipfw_obj_tlv *)(ctlv + 1); rcnt = ctlv->count; readsz += ctlv->head.length; ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length); } } if ((cfg->flags & IPFW_CFG_GET_STATES) && (readsz != sz)) { /* We may have some dynamic states */ dynsz = sz - readsz; /* Skip empty header */ if (dynsz != sizeof(ipfw_obj_ctlv)) dynbase = (caddr_t)ctlv; else dynsz = 0; } prepare_format_opts(co, fo, rbase, rcnt, dynbase, dynsz); bp_alloc(&bp, 4096); /* if no rule numbers were specified, list all rules */ if (ac == 0) { fo->first = 0; fo->last = IPFW_DEFAULT_RULE; list_static_range(co, fo, &bp, rbase, rcnt); if (co->do_dynamic && dynsz > 0) { printf("## Dynamic rules (%d %zu):\n", fo->dcnt, dynsz); list_dyn_range(co, fo, &bp, dynbase, dynsz); } bp_free(&bp); return (EX_OK); } /* display specific rules requested on command line */ for (lac = ac, lav = av; lac != 0; lac--) { /* convert command line rule # */ fo->last = fo->first = strtoul(*lav++, &endptr, 10); if (*endptr == '-') fo->last = strtoul(endptr + 1, &endptr, 10); if (*endptr) { exitval = EX_USAGE; warnx("invalid rule number: %s", *(lav - 1)); continue; } if (list_static_range(co, fo, &bp, rbase, rcnt) == 0) { /* give precedence to other error(s) */ if (exitval == EX_OK) exitval = EX_UNAVAILABLE; if (fo->first == fo->last) warnx("rule %u does not exist", fo->first); else warnx("no rules in range %u-%u", fo->first, fo->last); } } if (co->do_dynamic && dynsz > 0) { printf("## Dynamic rules:\n"); for (lac = ac, lav = av; lac != 0; lac--) { fo->last = fo->first = strtoul(*lav++, &endptr, 10); if (*endptr == '-') fo->last = strtoul(endptr+1, &endptr, 10); if (*endptr) /* already warned */ continue; list_dyn_range(co, fo, &bp, dynbase, dynsz); } } bp_free(&bp); return (exitval); } /* * Retrieves current ipfw configuration of given type * and stores its pointer to @pcfg. * * Caller is responsible for freeing @pcfg. * * Returns 0 on success. */ static int ipfw_get_config(struct cmdline_opts *co, struct format_opts *fo, ipfw_cfg_lheader **pcfg, size_t *psize) { ipfw_cfg_lheader *cfg; size_t sz; int i; if (co->test_only != 0) { fprintf(stderr, "Testing only, list disabled\n"); return (0); } /* Start with some data size */ sz = 4096; cfg = NULL; for (i = 0; i < 16; i++) { if (cfg != NULL) free(cfg); if ((cfg = calloc(1, sz)) == NULL) return (ENOMEM); cfg->flags = fo->flags; cfg->start_rule = fo->first; cfg->end_rule = fo->last; if (do_get3(IP_FW_XGET, &cfg->opheader, &sz) != 0) { if (errno != ENOMEM) { free(cfg); return (errno); } /* Buffer size is not enough. Try to increase */ sz = sz * 2; if (sz < cfg->size) sz = cfg->size; continue; } *pcfg = cfg; *psize = sz; return (0); } free(cfg); return (ENOMEM); } static int lookup_host (char *host, struct in_addr *ipaddr) { struct hostent *he; if (!inet_aton(host, ipaddr)) { if ((he = gethostbyname(host)) == NULL) return(-1); *ipaddr = *(struct in_addr *)he->h_addr_list[0]; } return(0); } struct tidx { ipfw_obj_ntlv *idx; uint32_t count; uint32_t size; uint16_t counter; uint8_t set; }; static uint16_t pack_table(struct tidx *tstate, char *name) { int i; ipfw_obj_ntlv *ntlv; if (table_check_name(name) != 0) return (0); for (i = 0; i < tstate->count; i++) { if (strcmp(tstate->idx[i].name, name) != 0) continue; if (tstate->idx[i].set != tstate->set) continue; return (tstate->idx[i].idx); } if (tstate->count + 1 > tstate->size) { tstate->size += 4; tstate->idx = realloc(tstate->idx, tstate->size * sizeof(ipfw_obj_ntlv)); if (tstate->idx == NULL) return (0); } ntlv = &tstate->idx[i]; memset(ntlv, 0, sizeof(ipfw_obj_ntlv)); strlcpy(ntlv->name, name, sizeof(ntlv->name)); ntlv->head.type = IPFW_TLV_TBL_NAME; ntlv->head.length = sizeof(ipfw_obj_ntlv); ntlv->set = tstate->set; ntlv->idx = ++tstate->counter; tstate->count++; return (ntlv->idx); } static void fill_table(ipfw_insn *cmd, char *av, uint8_t opcode, struct tidx *tstate) { uint32_t *d = ((ipfw_insn_u32 *)cmd)->d; uint16_t uidx; char *p; if ((p = strchr(av + 6, ')')) == NULL) errx(EX_DATAERR, "forgotten parenthesis: '%s'", av); *p = '\0'; p = strchr(av + 6, ','); if (p) *p++ = '\0'; if ((uidx = pack_table(tstate, av + 6)) == 0) errx(EX_DATAERR, "Invalid table name: %s", av + 6); cmd->opcode = opcode; cmd->arg1 = uidx; if (p) { cmd->len |= F_INSN_SIZE(ipfw_insn_u32); d[0] = strtoul(p, NULL, 0); } else cmd->len |= F_INSN_SIZE(ipfw_insn); } /* * fills the addr and mask fields in the instruction as appropriate from av. * Update length as appropriate. * The following formats are allowed: * me returns O_IP_*_ME * 1.2.3.4 single IP address * 1.2.3.4:5.6.7.8 address:mask * 1.2.3.4/24 address/mask * 1.2.3.4/26{1,6,5,4,23} set of addresses in a subnet * We can have multiple comma-separated address/mask entries. */ static void fill_ip(ipfw_insn_ip *cmd, char *av, int cblen, struct tidx *tstate) { int len = 0; uint32_t *d = ((ipfw_insn_u32 *)cmd)->d; cmd->o.len &= ~F_LEN_MASK; /* zero len */ if (_substrcmp(av, "any") == 0) return; if (_substrcmp(av, "me") == 0) { cmd->o.len |= F_INSN_SIZE(ipfw_insn); return; } if (strncmp(av, "table(", 6) == 0) { fill_table(&cmd->o, av, O_IP_DST_LOOKUP, tstate); return; } while (av) { /* * After the address we can have '/' or ':' indicating a mask, * ',' indicating another address follows, '{' indicating a * set of addresses of unspecified size. */ char *t = NULL, *p = strpbrk(av, "/:,{"); int masklen; char md, nd = '\0'; CHECK_LENGTH(cblen, F_INSN_SIZE(ipfw_insn) + 2 + len); if (p) { md = *p; *p++ = '\0'; if ((t = strpbrk(p, ",{")) != NULL) { nd = *t; *t = '\0'; } } else md = '\0'; if (lookup_host(av, (struct in_addr *)&d[0]) != 0) errx(EX_NOHOST, "hostname ``%s'' unknown", av); switch (md) { case ':': if (!inet_aton(p, (struct in_addr *)&d[1])) errx(EX_DATAERR, "bad netmask ``%s''", p); break; case '/': masklen = atoi(p); if (masklen == 0) d[1] = htonl(0); /* mask */ else if (masklen > 32) errx(EX_DATAERR, "bad width ``%s''", p); else d[1] = htonl(~0 << (32 - masklen)); break; case '{': /* no mask, assume /24 and put back the '{' */ d[1] = htonl(~0 << (32 - 24)); *(--p) = md; break; case ',': /* single address plus continuation */ *(--p) = md; /* FALLTHROUGH */ case 0: /* initialization value */ default: d[1] = htonl(~0); /* force /32 */ break; } d[0] &= d[1]; /* mask base address with mask */ if (t) *t = nd; /* find next separator */ if (p) p = strpbrk(p, ",{"); if (p && *p == '{') { /* * We have a set of addresses. They are stored as follows: * arg1 is the set size (powers of 2, 2..256) * addr is the base address IN HOST FORMAT * mask.. is an array of arg1 bits (rounded up to * the next multiple of 32) with bits set * for each host in the map. */ uint32_t *map = (uint32_t *)&cmd->mask; int low, high; int i = contigmask((uint8_t *)&(d[1]), 32); if (len > 0) errx(EX_DATAERR, "address set cannot be in a list"); if (i < 24 || i > 31) errx(EX_DATAERR, "invalid set with mask %d\n", i); cmd->o.arg1 = 1<<(32-i); /* map length */ d[0] = ntohl(d[0]); /* base addr in host format */ cmd->o.opcode = O_IP_DST_SET; /* default */ cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32) + (cmd->o.arg1+31)/32; for (i = 0; i < (cmd->o.arg1+31)/32 ; i++) map[i] = 0; /* clear map */ av = p + 1; low = d[0] & 0xff; high = low + cmd->o.arg1 - 1; /* * Here, i stores the previous value when we specify a range * of addresses within a mask, e.g. 45-63. i = -1 means we * have no previous value. */ i = -1; /* previous value in a range */ while (isdigit(*av)) { char *s; int a = strtol(av, &s, 0); if (s == av) { /* no parameter */ if (*av != '}') errx(EX_DATAERR, "set not closed\n"); if (i != -1) errx(EX_DATAERR, "incomplete range %d-", i); break; } if (a < low || a > high) errx(EX_DATAERR, "addr %d out of range [%d-%d]\n", a, low, high); a -= low; if (i == -1) /* no previous in range */ i = a; else { /* check that range is valid */ if (i > a) errx(EX_DATAERR, "invalid range %d-%d", i+low, a+low); if (*s == '-') errx(EX_DATAERR, "double '-' in range"); } for (; i <= a; i++) map[i/32] |= 1<<(i & 31); i = -1; if (*s == '-') i = a; else if (*s == '}') break; av = s+1; } return; } av = p; if (av) /* then *av must be a ',' */ av++; /* Check this entry */ if (d[1] == 0) { /* "any", specified as x.x.x.x/0 */ /* * 'any' turns the entire list into a NOP. * 'not any' never matches, so it is removed from the * list unless it is the only item, in which case we * report an error. */ if (cmd->o.len & F_NOT) { /* "not any" never matches */ if (av == NULL && len == 0) /* only this entry */ errx(EX_DATAERR, "not any never matches"); } /* else do nothing and skip this entry */ return; } /* A single IP can be stored in an optimized format */ if (d[1] == (uint32_t)~0 && av == NULL && len == 0) { cmd->o.len |= F_INSN_SIZE(ipfw_insn_u32); return; } len += 2; /* two words... */ d += 2; } /* end while */ if (len + 1 > F_LEN_MASK) errx(EX_DATAERR, "address list too long"); cmd->o.len |= len+1; } /* n2mask sets n bits of the mask */ void n2mask(struct in6_addr *mask, int n) { static int minimask[9] = { 0x00, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff }; u_char *p; memset(mask, 0, sizeof(struct in6_addr)); p = (u_char *) mask; for (; n > 0; p++, n -= 8) { if (n >= 8) *p = 0xff; else *p = minimask[n]; } return; } static void fill_flags_cmd(ipfw_insn *cmd, enum ipfw_opcodes opcode, struct _s_x *flags, char *p) { char *e; uint32_t set = 0, clear = 0; if (fill_flags(flags, p, &e, &set, &clear) != 0) errx(EX_DATAERR, "invalid flag %s", e); cmd->opcode = opcode; cmd->len = (cmd->len & (F_NOT | F_OR)) | 1; cmd->arg1 = (set & 0xff) | ( (clear & 0xff) << 8); } void ipfw_delete(char *av[]) { int i; int exitval = EX_OK; int do_set = 0; ipfw_range_tlv rt; av++; NEED1("missing rule specification"); memset(&rt, 0, sizeof(rt)); if ( *av && _substrcmp(*av, "set") == 0) { /* Do not allow using the following syntax: * ipfw set N delete set M */ if (co.use_set) errx(EX_DATAERR, "invalid syntax"); do_set = 1; /* delete set */ av++; } /* Rule number */ while (*av && isdigit(**av)) { i = atoi(*av); av++; if (co.do_nat) { exitval = do_cmd(IP_FW_NAT_DEL, &i, sizeof i); if (exitval) { exitval = EX_UNAVAILABLE; warn("rule %u not available", i); } } else if (co.do_pipe) { exitval = ipfw_delete_pipe(co.do_pipe, i); } else { if (do_set != 0) { rt.set = i & 31; rt.flags = IPFW_RCFLAG_SET; } else { rt.start_rule = i & 0xffff; rt.end_rule = i & 0xffff; if (rt.start_rule == 0 && rt.end_rule == 0) rt.flags |= IPFW_RCFLAG_ALL; else rt.flags |= IPFW_RCFLAG_RANGE; if (co.use_set != 0) { rt.set = co.use_set - 1; rt.flags |= IPFW_RCFLAG_SET; } } i = do_range_cmd(IP_FW_XDEL, &rt); if (i != 0) { exitval = EX_UNAVAILABLE; warn("rule %u: setsockopt(IP_FW_XDEL)", rt.start_rule); } else if (rt.new_set == 0) { exitval = EX_UNAVAILABLE; if (rt.start_rule != rt.end_rule) warnx("no rules rules in %u-%u range", rt.start_rule, rt.end_rule); else warnx("rule %u not found", rt.start_rule); } } } if (exitval != EX_OK) exit(exitval); } /* * fill the interface structure. We do not check the name as we can * create interfaces dynamically, so checking them at insert time * makes relatively little sense. * Interface names containing '*', '?', or '[' are assumed to be shell * patterns which match interfaces. */ static void fill_iface(ipfw_insn_if *cmd, char *arg, int cblen, struct tidx *tstate) { char *p; uint16_t uidx; cmd->name[0] = '\0'; cmd->o.len |= F_INSN_SIZE(ipfw_insn_if); CHECK_CMDLEN; /* Parse the interface or address */ if (strcmp(arg, "any") == 0) cmd->o.len = 0; /* effectively ignore this command */ else if (strncmp(arg, "table(", 6) == 0) { if ((p = strchr(arg + 6, ')')) == NULL) errx(EX_DATAERR, "forgotten parenthesis: '%s'", arg); *p = '\0'; p = strchr(arg + 6, ','); if (p) *p++ = '\0'; if ((uidx = pack_table(tstate, arg + 6)) == 0) errx(EX_DATAERR, "Invalid table name: %s", arg + 6); cmd->name[0] = '\1'; /* Special value indicating table */ cmd->p.kidx = uidx; } else if (!isdigit(*arg)) { strlcpy(cmd->name, arg, sizeof(cmd->name)); cmd->p.glob = strpbrk(arg, "*?[") != NULL ? 1 : 0; } else if (!inet_aton(arg, &cmd->p.ip)) errx(EX_DATAERR, "bad ip address ``%s''", arg); } static void get_mac_addr_mask(const char *p, uint8_t *addr, uint8_t *mask) { int i; size_t l; char *ap, *ptr, *optr; struct ether_addr *mac; const char *macset = "0123456789abcdefABCDEF:"; if (strcmp(p, "any") == 0) { for (i = 0; i < ETHER_ADDR_LEN; i++) addr[i] = mask[i] = 0; return; } optr = ptr = strdup(p); if ((ap = strsep(&ptr, "&/")) != NULL && *ap != 0) { l = strlen(ap); if (strspn(ap, macset) != l || (mac = ether_aton(ap)) == NULL) errx(EX_DATAERR, "Incorrect MAC address"); bcopy(mac, addr, ETHER_ADDR_LEN); } else errx(EX_DATAERR, "Incorrect MAC address"); if (ptr != NULL) { /* we have mask? */ if (p[ptr - optr - 1] == '/') { /* mask len */ long ml = strtol(ptr, &ap, 10); if (*ap != 0 || ml > ETHER_ADDR_LEN * 8 || ml < 0) errx(EX_DATAERR, "Incorrect mask length"); for (i = 0; ml > 0 && i < ETHER_ADDR_LEN; ml -= 8, i++) mask[i] = (ml >= 8) ? 0xff: (~0) << (8 - ml); } else { /* mask */ l = strlen(ptr); if (strspn(ptr, macset) != l || (mac = ether_aton(ptr)) == NULL) errx(EX_DATAERR, "Incorrect mask"); bcopy(mac, mask, ETHER_ADDR_LEN); } } else { /* default mask: ff:ff:ff:ff:ff:ff */ for (i = 0; i < ETHER_ADDR_LEN; i++) mask[i] = 0xff; } for (i = 0; i < ETHER_ADDR_LEN; i++) addr[i] &= mask[i]; free(optr); } /* * helper function, updates the pointer to cmd with the length * of the current command, and also cleans up the first word of * the new command in case it has been clobbered before. */ static ipfw_insn * next_cmd(ipfw_insn *cmd, int *len) { *len -= F_LEN(cmd); CHECK_LENGTH(*len, 0); cmd += F_LEN(cmd); bzero(cmd, sizeof(*cmd)); return cmd; } /* * Takes arguments and copies them into a comment */ static void fill_comment(ipfw_insn *cmd, char **av, int cblen) { int i, l; char *p = (char *)(cmd + 1); cmd->opcode = O_NOP; cmd->len = (cmd->len & (F_NOT | F_OR)); /* Compute length of comment string. */ for (i = 0, l = 0; av[i] != NULL; i++) l += strlen(av[i]) + 1; if (l == 0) return; if (l > 84) errx(EX_DATAERR, "comment too long (max 80 chars)"); l = 1 + (l+3)/4; cmd->len = (cmd->len & (F_NOT | F_OR)) | l; CHECK_CMDLEN; for (i = 0; av[i] != NULL; i++) { strcpy(p, av[i]); p += strlen(av[i]); *p++ = ' '; } *(--p) = '\0'; } /* * A function to fill simple commands of size 1. * Existing flags are preserved. */ static void fill_cmd(ipfw_insn *cmd, enum ipfw_opcodes opcode, int flags, uint16_t arg) { cmd->opcode = opcode; cmd->len = ((cmd->len | flags) & (F_NOT | F_OR)) | 1; cmd->arg1 = arg; } /* * Fetch and add the MAC address and type, with masks. This generates one or * two microinstructions, and returns the pointer to the last one. */ static ipfw_insn * add_mac(ipfw_insn *cmd, char *av[], int cblen) { ipfw_insn_mac *mac; if ( ( av[0] == NULL ) || ( av[1] == NULL ) ) errx(EX_DATAERR, "MAC dst src"); cmd->opcode = O_MACADDR2; cmd->len = (cmd->len & (F_NOT | F_OR)) | F_INSN_SIZE(ipfw_insn_mac); CHECK_CMDLEN; mac = (ipfw_insn_mac *)cmd; get_mac_addr_mask(av[0], mac->addr, mac->mask); /* dst */ get_mac_addr_mask(av[1], &(mac->addr[ETHER_ADDR_LEN]), &(mac->mask[ETHER_ADDR_LEN])); /* src */ return cmd; } static ipfw_insn * add_mactype(ipfw_insn *cmd, char *av, int cblen) { if (!av) errx(EX_DATAERR, "missing MAC type"); if (strcmp(av, "any") != 0) { /* we have a non-null type */ fill_newports((ipfw_insn_u16 *)cmd, av, IPPROTO_ETHERTYPE, cblen); cmd->opcode = O_MAC_TYPE; return cmd; } else return NULL; } static ipfw_insn * add_proto0(ipfw_insn *cmd, char *av, u_char *protop) { struct protoent *pe; char *ep; int proto; proto = strtol(av, &ep, 10); if (*ep != '\0' || proto <= 0) { if ((pe = getprotobyname(av)) == NULL) return NULL; proto = pe->p_proto; } fill_cmd(cmd, O_PROTO, 0, proto); *protop = proto; return cmd; } static ipfw_insn * add_proto(ipfw_insn *cmd, char *av, u_char *protop) { u_char proto = IPPROTO_IP; if (_substrcmp(av, "all") == 0 || strcmp(av, "ip") == 0) ; /* do not set O_IP4 nor O_IP6 */ else if (strcmp(av, "ip4") == 0) /* explicit "just IPv4" rule */ fill_cmd(cmd, O_IP4, 0, 0); else if (strcmp(av, "ip6") == 0) { /* explicit "just IPv6" rule */ proto = IPPROTO_IPV6; fill_cmd(cmd, O_IP6, 0, 0); } else return add_proto0(cmd, av, protop); *protop = proto; return cmd; } static ipfw_insn * add_proto_compat(ipfw_insn *cmd, char *av, u_char *protop) { u_char proto = IPPROTO_IP; if (_substrcmp(av, "all") == 0 || strcmp(av, "ip") == 0) ; /* do not set O_IP4 nor O_IP6 */ else if (strcmp(av, "ipv4") == 0 || strcmp(av, "ip4") == 0) /* explicit "just IPv4" rule */ fill_cmd(cmd, O_IP4, 0, 0); else if (strcmp(av, "ipv6") == 0 || strcmp(av, "ip6") == 0) { /* explicit "just IPv6" rule */ proto = IPPROTO_IPV6; fill_cmd(cmd, O_IP6, 0, 0); } else return add_proto0(cmd, av, protop); *protop = proto; return cmd; } static ipfw_insn * add_srcip(ipfw_insn *cmd, char *av, int cblen, struct tidx *tstate) { fill_ip((ipfw_insn_ip *)cmd, av, cblen, tstate); if (cmd->opcode == O_IP_DST_SET) /* set */ cmd->opcode = O_IP_SRC_SET; else if (cmd->opcode == O_IP_DST_LOOKUP) /* table */ cmd->opcode = O_IP_SRC_LOOKUP; else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) /* me */ cmd->opcode = O_IP_SRC_ME; else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn_u32)) /* one IP */ cmd->opcode = O_IP_SRC; else /* addr/mask */ cmd->opcode = O_IP_SRC_MASK; return cmd; } static ipfw_insn * add_dstip(ipfw_insn *cmd, char *av, int cblen, struct tidx *tstate) { fill_ip((ipfw_insn_ip *)cmd, av, cblen, tstate); if (cmd->opcode == O_IP_DST_SET) /* set */ ; else if (cmd->opcode == O_IP_DST_LOOKUP) /* table */ ; else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn)) /* me */ cmd->opcode = O_IP_DST_ME; else if (F_LEN(cmd) == F_INSN_SIZE(ipfw_insn_u32)) /* one IP */ cmd->opcode = O_IP_DST; else /* addr/mask */ cmd->opcode = O_IP_DST_MASK; return cmd; } static struct _s_x f_reserved_keywords[] = { { "altq", TOK_OR }, { "//", TOK_OR }, { "diverted", TOK_OR }, { "dst-port", TOK_OR }, { "src-port", TOK_OR }, { "established", TOK_OR }, { "keep-state", TOK_OR }, { "frag", TOK_OR }, { "icmptypes", TOK_OR }, { "in", TOK_OR }, { "out", TOK_OR }, { "ip6", TOK_OR }, { "any", TOK_OR }, { "to", TOK_OR }, { "via", TOK_OR }, { "{", TOK_OR }, { NULL, 0 } /* terminator */ }; static ipfw_insn * add_ports(ipfw_insn *cmd, char *av, u_char proto, int opcode, int cblen) { if (match_token(f_reserved_keywords, av) != -1) return (NULL); if (fill_newports((ipfw_insn_u16 *)cmd, av, proto, cblen)) { /* XXX todo: check that we have a protocol with ports */ cmd->opcode = opcode; return cmd; } return NULL; } static ipfw_insn * add_src(ipfw_insn *cmd, char *av, u_char proto, int cblen, struct tidx *tstate) { struct in6_addr a; char *host, *ch, buf[INET6_ADDRSTRLEN]; ipfw_insn *ret = NULL; int len; /* Copy first address in set if needed */ if ((ch = strpbrk(av, "/,")) != NULL) { len = ch - av; strlcpy(buf, av, sizeof(buf)); if (len < sizeof(buf)) buf[len] = '\0'; host = buf; } else host = av; if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 || inet_pton(AF_INET6, host, &a) == 1) ret = add_srcip6(cmd, av, cblen); /* XXX: should check for IPv4, not !IPv6 */ if (ret == NULL && (proto == IPPROTO_IP || strcmp(av, "me") == 0 || inet_pton(AF_INET6, host, &a) != 1)) ret = add_srcip(cmd, av, cblen, tstate); if (ret == NULL && strcmp(av, "any") != 0) ret = cmd; return ret; } static ipfw_insn * add_dst(ipfw_insn *cmd, char *av, u_char proto, int cblen, struct tidx *tstate) { struct in6_addr a; char *host, *ch, buf[INET6_ADDRSTRLEN]; ipfw_insn *ret = NULL; int len; /* Copy first address in set if needed */ if ((ch = strpbrk(av, "/,")) != NULL) { len = ch - av; strlcpy(buf, av, sizeof(buf)); if (len < sizeof(buf)) buf[len] = '\0'; host = buf; } else host = av; if (proto == IPPROTO_IPV6 || strcmp(av, "me6") == 0 || inet_pton(AF_INET6, host, &a) == 1) ret = add_dstip6(cmd, av, cblen); /* XXX: should check for IPv4, not !IPv6 */ if (ret == NULL && (proto == IPPROTO_IP || strcmp(av, "me") == 0 || inet_pton(AF_INET6, host, &a) != 1)) ret = add_dstip(cmd, av, cblen, tstate); if (ret == NULL && strcmp(av, "any") != 0) ret = cmd; return ret; } /* * Parse arguments and assemble the microinstructions which make up a rule. * Rules are added into the 'rulebuf' and then copied in the correct order * into the actual rule. * * The syntax for a rule starts with the action, followed by * optional action parameters, and the various match patterns. * In the assembled microcode, the first opcode must be an O_PROBE_STATE * (generated if the rule includes a keep-state option), then the * various match patterns, log/altq actions, and the actual action. * */ void compile_rule(char *av[], uint32_t *rbuf, int *rbufsize, struct tidx *tstate) { /* * rules are added into the 'rulebuf' and then copied in * the correct order into the actual rule. * Some things that need to go out of order (prob, action etc.) * go into actbuf[]. */ static uint32_t actbuf[255], cmdbuf[255]; int rblen, ablen, cblen; ipfw_insn *src, *dst, *cmd, *action, *prev=NULL; ipfw_insn *first_cmd; /* first match pattern */ struct ip_fw_rule *rule; /* * various flags used to record that we entered some fields. */ ipfw_insn *have_state = NULL; /* check-state or keep-state */ ipfw_insn *have_log = NULL, *have_altq = NULL, *have_tag = NULL; size_t len; int i; int open_par = 0; /* open parenthesis ( */ /* proto is here because it is used to fetch ports */ u_char proto = IPPROTO_IP; /* default protocol */ double match_prob = 1; /* match probability, default is always match */ bzero(actbuf, sizeof(actbuf)); /* actions go here */ bzero(cmdbuf, sizeof(cmdbuf)); bzero(rbuf, *rbufsize); rule = (struct ip_fw_rule *)rbuf; cmd = (ipfw_insn *)cmdbuf; action = (ipfw_insn *)actbuf; rblen = *rbufsize / sizeof(uint32_t); rblen -= sizeof(struct ip_fw_rule) / sizeof(uint32_t); ablen = sizeof(actbuf) / sizeof(actbuf[0]); cblen = sizeof(cmdbuf) / sizeof(cmdbuf[0]); cblen -= F_INSN_SIZE(ipfw_insn_u32) + 1; #define CHECK_RBUFLEN(len) { CHECK_LENGTH(rblen, len); rblen -= len; } #define CHECK_ACTLEN CHECK_LENGTH(ablen, action->len) av++; /* [rule N] -- Rule number optional */ if (av[0] && isdigit(**av)) { rule->rulenum = atoi(*av); av++; } /* [set N] -- set number (0..RESVD_SET), optional */ if (av[0] && av[1] && _substrcmp(*av, "set") == 0) { int set = strtoul(av[1], NULL, 10); if (set < 0 || set > RESVD_SET) errx(EX_DATAERR, "illegal set %s", av[1]); rule->set = set; tstate->set = set; av += 2; } /* [prob D] -- match probability, optional */ if (av[0] && av[1] && _substrcmp(*av, "prob") == 0) { match_prob = strtod(av[1], NULL); if (match_prob <= 0 || match_prob > 1) errx(EX_DATAERR, "illegal match prob. %s", av[1]); av += 2; } /* action -- mandatory */ NEED1("missing action"); i = match_token(rule_actions, *av); av++; action->len = 1; /* default */ CHECK_ACTLEN; switch(i) { case TOK_CHECKSTATE: have_state = action; action->opcode = O_CHECK_STATE; break; case TOK_ACCEPT: action->opcode = O_ACCEPT; break; case TOK_DENY: action->opcode = O_DENY; action->arg1 = 0; break; case TOK_REJECT: action->opcode = O_REJECT; action->arg1 = ICMP_UNREACH_HOST; break; case TOK_RESET: action->opcode = O_REJECT; action->arg1 = ICMP_REJECT_RST; break; case TOK_RESET6: action->opcode = O_UNREACH6; action->arg1 = ICMP6_UNREACH_RST; break; case TOK_UNREACH: action->opcode = O_REJECT; NEED1("missing reject code"); fill_reject_code(&action->arg1, *av); av++; break; case TOK_UNREACH6: action->opcode = O_UNREACH6; NEED1("missing unreach code"); fill_unreach6_code(&action->arg1, *av); av++; break; case TOK_COUNT: action->opcode = O_COUNT; break; case TOK_NAT: action->opcode = O_NAT; action->len = F_INSN_SIZE(ipfw_insn_nat); CHECK_ACTLEN; if (_substrcmp(*av, "global") == 0) { action->arg1 = 0; av++; break; } else goto chkarg; case TOK_QUEUE: action->opcode = O_QUEUE; goto chkarg; case TOK_PIPE: action->opcode = O_PIPE; goto chkarg; case TOK_SKIPTO: action->opcode = O_SKIPTO; goto chkarg; case TOK_NETGRAPH: action->opcode = O_NETGRAPH; goto chkarg; case TOK_NGTEE: action->opcode = O_NGTEE; goto chkarg; case TOK_DIVERT: action->opcode = O_DIVERT; goto chkarg; case TOK_TEE: action->opcode = O_TEE; goto chkarg; case TOK_CALL: action->opcode = O_CALLRETURN; chkarg: if (!av[0]) errx(EX_USAGE, "missing argument for %s", *(av - 1)); if (isdigit(**av)) { action->arg1 = strtoul(*av, NULL, 10); if (action->arg1 <= 0 || action->arg1 >= IP_FW_TABLEARG) errx(EX_DATAERR, "illegal argument for %s", *(av - 1)); } else if (_substrcmp(*av, "tablearg") == 0) { action->arg1 = IP_FW_TARG; } else if (i == TOK_DIVERT || i == TOK_TEE) { struct servent *s; setservent(1); s = getservbyname(av[0], "divert"); if (s != NULL) action->arg1 = ntohs(s->s_port); else errx(EX_DATAERR, "illegal divert/tee port"); } else errx(EX_DATAERR, "illegal argument for %s", *(av - 1)); av++; break; case TOK_FORWARD: { /* * Locate the address-port separator (':' or ','). * Could be one of the following: * hostname:port * IPv4 a.b.c.d,port * IPv4 a.b.c.d:port * IPv6 w:x:y::z,port * The ':' can only be used with hostname and IPv4 address. * XXX-BZ Should we also support [w:x:y::z]:port? */ struct sockaddr_storage result; struct addrinfo *res; char *s, *end; int family; u_short port_number; NEED1("missing forward address[:port]"); /* * locate the address-port separator (':' or ',') */ s = strchr(*av, ','); if (s == NULL) { /* Distinguish between IPv4:port and IPv6 cases. */ s = strchr(*av, ':'); if (s && strchr(s+1, ':')) s = NULL; /* no port */ } port_number = 0; if (s != NULL) { /* Terminate host portion and set s to start of port. */ *(s++) = '\0'; i = strtoport(s, &end, 0 /* base */, 0 /* proto */); if (s == end) errx(EX_DATAERR, "illegal forwarding port ``%s''", s); port_number = (u_short)i; } if (_substrcmp(*av, "tablearg") == 0) { family = PF_INET; ((struct sockaddr_in*)&result)->sin_addr.s_addr = INADDR_ANY; } else { /* * Resolve the host name or address to a family and a * network representation of the address. */ if (getaddrinfo(*av, NULL, NULL, &res)) errx(EX_DATAERR, NULL); /* Just use the first host in the answer. */ family = res->ai_family; memcpy(&result, res->ai_addr, res->ai_addrlen); freeaddrinfo(res); } if (family == PF_INET) { ipfw_insn_sa *p = (ipfw_insn_sa *)action; action->opcode = O_FORWARD_IP; action->len = F_INSN_SIZE(ipfw_insn_sa); CHECK_ACTLEN; /* * In the kernel we assume AF_INET and use only * sin_port and sin_addr. Remember to set sin_len as * the routing code seems to use it too. */ p->sa.sin_len = sizeof(struct sockaddr_in); p->sa.sin_family = AF_INET; p->sa.sin_port = port_number; p->sa.sin_addr.s_addr = ((struct sockaddr_in *)&result)->sin_addr.s_addr; } else if (family == PF_INET6) { ipfw_insn_sa6 *p = (ipfw_insn_sa6 *)action; action->opcode = O_FORWARD_IP6; action->len = F_INSN_SIZE(ipfw_insn_sa6); CHECK_ACTLEN; p->sa.sin6_len = sizeof(struct sockaddr_in6); p->sa.sin6_family = AF_INET6; p->sa.sin6_port = port_number; p->sa.sin6_flowinfo = 0; p->sa.sin6_scope_id = 0; /* No table support for v6 yet. */ bcopy(&((struct sockaddr_in6*)&result)->sin6_addr, &p->sa.sin6_addr, sizeof(p->sa.sin6_addr)); } else { errx(EX_DATAERR, "Invalid address family in forward action"); } av++; break; } case TOK_COMMENT: /* pretend it is a 'count' rule followed by the comment */ action->opcode = O_COUNT; av--; /* go back... */ break; case TOK_SETFIB: { int numfibs; size_t intsize = sizeof(int); action->opcode = O_SETFIB; NEED1("missing fib number"); if (_substrcmp(*av, "tablearg") == 0) { action->arg1 = IP_FW_TARG; } else { action->arg1 = strtoul(*av, NULL, 10); if (sysctlbyname("net.fibs", &numfibs, &intsize, NULL, 0) == -1) errx(EX_DATAERR, "fibs not suported.\n"); if (action->arg1 >= numfibs) /* Temporary */ errx(EX_DATAERR, "fib too large.\n"); /* Add high-order bit to fib to make room for tablearg*/ action->arg1 |= 0x8000; } av++; break; } case TOK_SETDSCP: { int code; action->opcode = O_SETDSCP; NEED1("missing DSCP code"); if (_substrcmp(*av, "tablearg") == 0) { action->arg1 = IP_FW_TARG; } else if (isalpha(*av[0])) { if ((code = match_token(f_ipdscp, *av)) == -1) errx(EX_DATAERR, "Unknown DSCP code"); action->arg1 = code; } else action->arg1 = strtoul(*av, NULL, 10); /* Add high-order bit to DSCP to make room for tablearg */ if (action->arg1 != IP_FW_TARG) action->arg1 |= 0x8000; av++; break; } case TOK_REASS: action->opcode = O_REASS; break; case TOK_RETURN: fill_cmd(action, O_CALLRETURN, F_NOT, 0); break; default: errx(EX_DATAERR, "invalid action %s\n", av[-1]); } action = next_cmd(action, &ablen); /* * [altq queuename] -- altq tag, optional * [log [logamount N]] -- log, optional * * If they exist, it go first in the cmdbuf, but then it is * skipped in the copy section to the end of the buffer. */ while (av[0] != NULL && (i = match_token(rule_action_params, *av)) != -1) { av++; switch (i) { case TOK_LOG: { ipfw_insn_log *c = (ipfw_insn_log *)cmd; int l; if (have_log) errx(EX_DATAERR, "log cannot be specified more than once"); have_log = (ipfw_insn *)c; cmd->len = F_INSN_SIZE(ipfw_insn_log); CHECK_CMDLEN; cmd->opcode = O_LOG; if (av[0] && _substrcmp(*av, "logamount") == 0) { av++; NEED1("logamount requires argument"); l = atoi(*av); if (l < 0) errx(EX_DATAERR, "logamount must be positive"); c->max_log = l; av++; } else { len = sizeof(c->max_log); if (sysctlbyname("net.inet.ip.fw.verbose_limit", &c->max_log, &len, NULL, 0) == -1) { if (co.test_only) { c->max_log = 0; break; } errx(1, "sysctlbyname(\"%s\")", "net.inet.ip.fw.verbose_limit"); } } } break; #ifndef NO_ALTQ case TOK_ALTQ: { ipfw_insn_altq *a = (ipfw_insn_altq *)cmd; NEED1("missing altq queue name"); if (have_altq) errx(EX_DATAERR, "altq cannot be specified more than once"); have_altq = (ipfw_insn *)a; cmd->len = F_INSN_SIZE(ipfw_insn_altq); CHECK_CMDLEN; cmd->opcode = O_ALTQ; a->qid = altq_name_to_qid(*av); av++; } break; #endif case TOK_TAG: case TOK_UNTAG: { uint16_t tag; if (have_tag) errx(EX_USAGE, "tag and untag cannot be " "specified more than once"); GET_UINT_ARG(tag, IPFW_ARG_MIN, IPFW_ARG_MAX, i, rule_action_params); have_tag = cmd; fill_cmd(cmd, O_TAG, (i == TOK_TAG) ? 0: F_NOT, tag); av++; break; } default: abort(); } cmd = next_cmd(cmd, &cblen); } if (have_state) /* must be a check-state, we are done */ goto done; #define OR_START(target) \ if (av[0] && (*av[0] == '(' || *av[0] == '{')) { \ if (open_par) \ errx(EX_USAGE, "nested \"(\" not allowed\n"); \ prev = NULL; \ open_par = 1; \ if ( (av[0])[1] == '\0') { \ av++; \ } else \ (*av)++; \ } \ target: \ #define CLOSE_PAR \ if (open_par) { \ if (av[0] && ( \ strcmp(*av, ")") == 0 || \ strcmp(*av, "}") == 0)) { \ prev = NULL; \ open_par = 0; \ av++; \ } else \ errx(EX_USAGE, "missing \")\"\n"); \ } #define NOT_BLOCK \ if (av[0] && _substrcmp(*av, "not") == 0) { \ if (cmd->len & F_NOT) \ errx(EX_USAGE, "double \"not\" not allowed\n"); \ cmd->len |= F_NOT; \ av++; \ } #define OR_BLOCK(target) \ if (av[0] && _substrcmp(*av, "or") == 0) { \ if (prev == NULL || open_par == 0) \ errx(EX_DATAERR, "invalid OR block"); \ prev->len |= F_OR; \ av++; \ goto target; \ } \ CLOSE_PAR; first_cmd = cmd; #if 0 /* * MAC addresses, optional. * If we have this, we skip the part "proto from src to dst" * and jump straight to the option parsing. */ NOT_BLOCK; NEED1("missing protocol"); if (_substrcmp(*av, "MAC") == 0 || _substrcmp(*av, "mac") == 0) { av++; /* the "MAC" keyword */ add_mac(cmd, av); /* exits in case of errors */ cmd = next_cmd(cmd); av += 2; /* dst-mac and src-mac */ NOT_BLOCK; NEED1("missing mac type"); if (add_mactype(cmd, av[0])) cmd = next_cmd(cmd); av++; /* any or mac-type */ goto read_options; } #endif /* * protocol, mandatory */ OR_START(get_proto); NOT_BLOCK; NEED1("missing protocol"); if (add_proto_compat(cmd, *av, &proto)) { av++; if (F_LEN(cmd) != 0) { prev = cmd; cmd = next_cmd(cmd, &cblen); } } else if (first_cmd != cmd) { errx(EX_DATAERR, "invalid protocol ``%s''", *av); } else goto read_options; OR_BLOCK(get_proto); /* * "from", mandatory */ if ((av[0] == NULL) || _substrcmp(*av, "from") != 0) errx(EX_USAGE, "missing ``from''"); av++; /* * source IP, mandatory */ OR_START(source_ip); NOT_BLOCK; /* optional "not" */ NEED1("missing source address"); if (add_src(cmd, *av, proto, cblen, tstate)) { av++; if (F_LEN(cmd) != 0) { /* ! any */ prev = cmd; cmd = next_cmd(cmd, &cblen); } } else errx(EX_USAGE, "bad source address %s", *av); OR_BLOCK(source_ip); /* * source ports, optional */ NOT_BLOCK; /* optional "not" */ if ( av[0] != NULL ) { if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_SRCPORT, cblen)) { av++; if (F_LEN(cmd) != 0) cmd = next_cmd(cmd, &cblen); } } /* * "to", mandatory */ if ( (av[0] == NULL) || _substrcmp(*av, "to") != 0 ) errx(EX_USAGE, "missing ``to''"); av++; /* * destination, mandatory */ OR_START(dest_ip); NOT_BLOCK; /* optional "not" */ NEED1("missing dst address"); if (add_dst(cmd, *av, proto, cblen, tstate)) { av++; if (F_LEN(cmd) != 0) { /* ! any */ prev = cmd; cmd = next_cmd(cmd, &cblen); } } else errx( EX_USAGE, "bad destination address %s", *av); OR_BLOCK(dest_ip); /* * dest. ports, optional */ NOT_BLOCK; /* optional "not" */ if (av[0]) { if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_DSTPORT, cblen)) { av++; if (F_LEN(cmd) != 0) cmd = next_cmd(cmd, &cblen); } } read_options: if (av[0] && first_cmd == cmd) { /* * nothing specified so far, store in the rule to ease * printout later. */ rule->flags |= IPFW_RULE_NOOPT; } prev = NULL; while ( av[0] != NULL ) { char *s; ipfw_insn_u32 *cmd32; /* alias for cmd */ s = *av; cmd32 = (ipfw_insn_u32 *)cmd; if (*s == '!') { /* alternate syntax for NOT */ if (cmd->len & F_NOT) errx(EX_USAGE, "double \"not\" not allowed\n"); cmd->len = F_NOT; s++; } i = match_token(rule_options, s); av++; switch(i) { case TOK_NOT: if (cmd->len & F_NOT) errx(EX_USAGE, "double \"not\" not allowed\n"); cmd->len = F_NOT; break; case TOK_OR: if (open_par == 0 || prev == NULL) errx(EX_USAGE, "invalid \"or\" block\n"); prev->len |= F_OR; break; case TOK_STARTBRACE: if (open_par) errx(EX_USAGE, "+nested \"(\" not allowed\n"); open_par = 1; break; case TOK_ENDBRACE: if (!open_par) errx(EX_USAGE, "+missing \")\"\n"); open_par = 0; prev = NULL; break; case TOK_IN: fill_cmd(cmd, O_IN, 0, 0); break; case TOK_OUT: cmd->len ^= F_NOT; /* toggle F_NOT */ fill_cmd(cmd, O_IN, 0, 0); break; case TOK_DIVERTED: fill_cmd(cmd, O_DIVERTED, 0, 3); break; case TOK_DIVERTEDLOOPBACK: fill_cmd(cmd, O_DIVERTED, 0, 1); break; case TOK_DIVERTEDOUTPUT: fill_cmd(cmd, O_DIVERTED, 0, 2); break; case TOK_FRAG: fill_cmd(cmd, O_FRAG, 0, 0); break; case TOK_LAYER2: fill_cmd(cmd, O_LAYER2, 0, 0); break; case TOK_XMIT: case TOK_RECV: case TOK_VIA: NEED1("recv, xmit, via require interface name" " or address"); fill_iface((ipfw_insn_if *)cmd, av[0], cblen, tstate); av++; if (F_LEN(cmd) == 0) /* not a valid address */ break; if (i == TOK_XMIT) cmd->opcode = O_XMIT; else if (i == TOK_RECV) cmd->opcode = O_RECV; else if (i == TOK_VIA) cmd->opcode = O_VIA; break; case TOK_ICMPTYPES: NEED1("icmptypes requires list of types"); fill_icmptypes((ipfw_insn_u32 *)cmd, *av); av++; break; case TOK_ICMP6TYPES: NEED1("icmptypes requires list of types"); fill_icmp6types((ipfw_insn_icmp6 *)cmd, *av, cblen); av++; break; case TOK_IPTTL: NEED1("ipttl requires TTL"); if (strpbrk(*av, "-,")) { if (!add_ports(cmd, *av, 0, O_IPTTL, cblen)) errx(EX_DATAERR, "invalid ipttl %s", *av); } else fill_cmd(cmd, O_IPTTL, 0, strtoul(*av, NULL, 0)); av++; break; case TOK_IPID: NEED1("ipid requires id"); if (strpbrk(*av, "-,")) { if (!add_ports(cmd, *av, 0, O_IPID, cblen)) errx(EX_DATAERR, "invalid ipid %s", *av); } else fill_cmd(cmd, O_IPID, 0, strtoul(*av, NULL, 0)); av++; break; case TOK_IPLEN: NEED1("iplen requires length"); if (strpbrk(*av, "-,")) { if (!add_ports(cmd, *av, 0, O_IPLEN, cblen)) errx(EX_DATAERR, "invalid ip len %s", *av); } else fill_cmd(cmd, O_IPLEN, 0, strtoul(*av, NULL, 0)); av++; break; case TOK_IPVER: NEED1("ipver requires version"); fill_cmd(cmd, O_IPVER, 0, strtoul(*av, NULL, 0)); av++; break; case TOK_IPPRECEDENCE: NEED1("ipprecedence requires value"); fill_cmd(cmd, O_IPPRECEDENCE, 0, (strtoul(*av, NULL, 0) & 7) << 5); av++; break; case TOK_DSCP: NEED1("missing DSCP code"); fill_dscp(cmd, *av, cblen); av++; break; case TOK_IPOPTS: NEED1("missing argument for ipoptions"); fill_flags_cmd(cmd, O_IPOPT, f_ipopts, *av); av++; break; case TOK_IPTOS: NEED1("missing argument for iptos"); fill_flags_cmd(cmd, O_IPTOS, f_iptos, *av); av++; break; case TOK_UID: NEED1("uid requires argument"); { char *end; uid_t uid; struct passwd *pwd; cmd->opcode = O_UID; uid = strtoul(*av, &end, 0); pwd = (*end == '\0') ? getpwuid(uid) : getpwnam(*av); if (pwd == NULL) errx(EX_DATAERR, "uid \"%s\" nonexistent", *av); cmd32->d[0] = pwd->pw_uid; cmd->len |= F_INSN_SIZE(ipfw_insn_u32); av++; } break; case TOK_GID: NEED1("gid requires argument"); { char *end; gid_t gid; struct group *grp; cmd->opcode = O_GID; gid = strtoul(*av, &end, 0); grp = (*end == '\0') ? getgrgid(gid) : getgrnam(*av); if (grp == NULL) errx(EX_DATAERR, "gid \"%s\" nonexistent", *av); cmd32->d[0] = grp->gr_gid; cmd->len |= F_INSN_SIZE(ipfw_insn_u32); av++; } break; case TOK_JAIL: NEED1("jail requires argument"); { char *end; int jid; cmd->opcode = O_JAIL; jid = (int)strtol(*av, &end, 0); if (jid < 0 || *end != '\0') errx(EX_DATAERR, "jail requires prison ID"); cmd32->d[0] = (uint32_t)jid; cmd->len |= F_INSN_SIZE(ipfw_insn_u32); av++; } break; case TOK_ESTAB: fill_cmd(cmd, O_ESTAB, 0, 0); break; case TOK_SETUP: fill_cmd(cmd, O_TCPFLAGS, 0, (TH_SYN) | ( (TH_ACK) & 0xff) <<8 ); break; case TOK_TCPDATALEN: NEED1("tcpdatalen requires length"); if (strpbrk(*av, "-,")) { if (!add_ports(cmd, *av, 0, O_TCPDATALEN, cblen)) errx(EX_DATAERR, "invalid tcpdata len %s", *av); } else fill_cmd(cmd, O_TCPDATALEN, 0, strtoul(*av, NULL, 0)); av++; break; case TOK_TCPOPTS: NEED1("missing argument for tcpoptions"); fill_flags_cmd(cmd, O_TCPOPTS, f_tcpopts, *av); av++; break; case TOK_TCPSEQ: case TOK_TCPACK: NEED1("tcpseq/tcpack requires argument"); cmd->len = F_INSN_SIZE(ipfw_insn_u32); cmd->opcode = (i == TOK_TCPSEQ) ? O_TCPSEQ : O_TCPACK; cmd32->d[0] = htonl(strtoul(*av, NULL, 0)); av++; break; case TOK_TCPWIN: NEED1("tcpwin requires length"); if (strpbrk(*av, "-,")) { if (!add_ports(cmd, *av, 0, O_TCPWIN, cblen)) errx(EX_DATAERR, "invalid tcpwin len %s", *av); } else fill_cmd(cmd, O_TCPWIN, 0, strtoul(*av, NULL, 0)); av++; break; case TOK_TCPFLAGS: NEED1("missing argument for tcpflags"); cmd->opcode = O_TCPFLAGS; fill_flags_cmd(cmd, O_TCPFLAGS, f_tcpflags, *av); av++; break; case TOK_KEEPSTATE: if (open_par) errx(EX_USAGE, "keep-state cannot be part " "of an or block"); if (have_state) errx(EX_USAGE, "only one of keep-state " "and limit is allowed"); have_state = cmd; fill_cmd(cmd, O_KEEP_STATE, 0, 0); break; case TOK_LIMIT: { ipfw_insn_limit *c = (ipfw_insn_limit *)cmd; int val; if (open_par) errx(EX_USAGE, "limit cannot be part of an or block"); if (have_state) errx(EX_USAGE, "only one of keep-state and " "limit is allowed"); have_state = cmd; cmd->len = F_INSN_SIZE(ipfw_insn_limit); CHECK_CMDLEN; cmd->opcode = O_LIMIT; c->limit_mask = c->conn_limit = 0; while ( av[0] != NULL ) { if ((val = match_token(limit_masks, *av)) <= 0) break; c->limit_mask |= val; av++; } if (c->limit_mask == 0) errx(EX_USAGE, "limit: missing limit mask"); GET_UINT_ARG(c->conn_limit, IPFW_ARG_MIN, IPFW_ARG_MAX, TOK_LIMIT, rule_options); av++; break; } case TOK_PROTO: NEED1("missing protocol"); if (add_proto(cmd, *av, &proto)) { av++; } else errx(EX_DATAERR, "invalid protocol ``%s''", *av); break; case TOK_SRCIP: NEED1("missing source IP"); if (add_srcip(cmd, *av, cblen, tstate)) { av++; } break; case TOK_DSTIP: NEED1("missing destination IP"); if (add_dstip(cmd, *av, cblen, tstate)) { av++; } break; case TOK_SRCIP6: NEED1("missing source IP6"); if (add_srcip6(cmd, *av, cblen)) { av++; } break; case TOK_DSTIP6: NEED1("missing destination IP6"); if (add_dstip6(cmd, *av, cblen)) { av++; } break; case TOK_SRCPORT: NEED1("missing source port"); if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_SRCPORT, cblen)) { av++; } else errx(EX_DATAERR, "invalid source port %s", *av); break; case TOK_DSTPORT: NEED1("missing destination port"); if (_substrcmp(*av, "any") == 0 || add_ports(cmd, *av, proto, O_IP_DSTPORT, cblen)) { av++; } else errx(EX_DATAERR, "invalid destination port %s", *av); break; case TOK_MAC: if (add_mac(cmd, av, cblen)) av += 2; break; case TOK_MACTYPE: NEED1("missing mac type"); if (!add_mactype(cmd, *av, cblen)) errx(EX_DATAERR, "invalid mac type %s", *av); av++; break; case TOK_VERREVPATH: fill_cmd(cmd, O_VERREVPATH, 0, 0); break; case TOK_VERSRCREACH: fill_cmd(cmd, O_VERSRCREACH, 0, 0); break; case TOK_ANTISPOOF: fill_cmd(cmd, O_ANTISPOOF, 0, 0); break; case TOK_IPSEC: fill_cmd(cmd, O_IPSEC, 0, 0); break; case TOK_IPV6: fill_cmd(cmd, O_IP6, 0, 0); break; case TOK_IPV4: fill_cmd(cmd, O_IP4, 0, 0); break; case TOK_EXT6HDR: fill_ext6hdr( cmd, *av ); av++; break; case TOK_FLOWID: if (proto != IPPROTO_IPV6 ) errx( EX_USAGE, "flow-id filter is active " "only for ipv6 protocol\n"); fill_flow6( (ipfw_insn_u32 *) cmd, *av, cblen); av++; break; case TOK_COMMENT: fill_comment(cmd, av, cblen); av[0]=NULL; break; case TOK_TAGGED: if (av[0] && strpbrk(*av, "-,")) { if (!add_ports(cmd, *av, 0, O_TAGGED, cblen)) errx(EX_DATAERR, "tagged: invalid tag" " list: %s", *av); } else { uint16_t tag; GET_UINT_ARG(tag, IPFW_ARG_MIN, IPFW_ARG_MAX, TOK_TAGGED, rule_options); fill_cmd(cmd, O_TAGGED, 0, tag); } av++; break; case TOK_FIB: NEED1("fib requires fib number"); fill_cmd(cmd, O_FIB, 0, strtoul(*av, NULL, 0)); av++; break; case TOK_SOCKARG: fill_cmd(cmd, O_SOCKARG, 0, 0); break; case TOK_LOOKUP: { ipfw_insn_u32 *c = (ipfw_insn_u32 *)cmd; int j; if (!av[0] || !av[1]) errx(EX_USAGE, "format: lookup argument tablenum"); cmd->opcode = O_IP_DST_LOOKUP; cmd->len |= F_INSN_SIZE(ipfw_insn) + 2; i = match_token(rule_options, *av); for (j = 0; lookup_key[j] >= 0 ; j++) { if (i == lookup_key[j]) break; } if (lookup_key[j] <= 0) errx(EX_USAGE, "format: cannot lookup on %s", *av); __PAST_END(c->d, 1) = j; // i converted to option av++; if ((j = pack_table(tstate, *av)) == 0) errx(EX_DATAERR, "Invalid table name: %s", *av); cmd->arg1 = j; av++; } break; case TOK_FLOW: NEED1("missing table name"); if (strncmp(*av, "table(", 6) != 0) errx(EX_DATAERR, "enclose table name into \"table()\""); fill_table(cmd, *av, O_IP_FLOW_LOOKUP, tstate); av++; break; default: errx(EX_USAGE, "unrecognised option [%d] %s\n", i, s); } if (F_LEN(cmd) > 0) { /* prepare to advance */ prev = cmd; cmd = next_cmd(cmd, &cblen); } } done: /* * Now copy stuff into the rule. * If we have a keep-state option, the first instruction * must be a PROBE_STATE (which is generated here). * If we have a LOG option, it was stored as the first command, * and now must be moved to the top of the action part. */ dst = (ipfw_insn *)rule->cmd; /* * First thing to write into the command stream is the match probability. */ if (match_prob != 1) { /* 1 means always match */ dst->opcode = O_PROB; dst->len = 2; *((int32_t *)(dst+1)) = (int32_t)(match_prob * 0x7fffffff); dst += dst->len; } /* * generate O_PROBE_STATE if necessary */ if (have_state && have_state->opcode != O_CHECK_STATE) { fill_cmd(dst, O_PROBE_STATE, 0, 0); dst = next_cmd(dst, &rblen); } /* copy all commands but O_LOG, O_KEEP_STATE, O_LIMIT, O_ALTQ, O_TAG */ for (src = (ipfw_insn *)cmdbuf; src != cmd; src += i) { i = F_LEN(src); CHECK_RBUFLEN(i); switch (src->opcode) { case O_LOG: case O_KEEP_STATE: case O_LIMIT: case O_ALTQ: case O_TAG: break; default: bcopy(src, dst, i * sizeof(uint32_t)); dst += i; } } /* * put back the have_state command as last opcode */ if (have_state && have_state->opcode != O_CHECK_STATE) { i = F_LEN(have_state); CHECK_RBUFLEN(i); bcopy(have_state, dst, i * sizeof(uint32_t)); dst += i; } /* * start action section */ rule->act_ofs = dst - rule->cmd; /* put back O_LOG, O_ALTQ, O_TAG if necessary */ if (have_log) { i = F_LEN(have_log); CHECK_RBUFLEN(i); bcopy(have_log, dst, i * sizeof(uint32_t)); dst += i; } if (have_altq) { i = F_LEN(have_altq); CHECK_RBUFLEN(i); bcopy(have_altq, dst, i * sizeof(uint32_t)); dst += i; } if (have_tag) { i = F_LEN(have_tag); CHECK_RBUFLEN(i); bcopy(have_tag, dst, i * sizeof(uint32_t)); dst += i; } /* * copy all other actions */ for (src = (ipfw_insn *)actbuf; src != action; src += i) { i = F_LEN(src); CHECK_RBUFLEN(i); bcopy(src, dst, i * sizeof(uint32_t)); dst += i; } rule->cmd_len = (uint32_t *)dst - (uint32_t *)(rule->cmd); *rbufsize = (char *)dst - (char *)rule; } /* * Adds one or more rules to ipfw chain. * Data layout: * Request: * [ * ip_fw3_opheader * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional *1) * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) [ ip_fw_rule ip_fw_insn ] x N ] (*2) (*3) * ] * Reply: * [ * ip_fw3_opheader * [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional) * [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) [ ip_fw_rule ip_fw_insn ] x N ] * ] * * Rules in reply are modified to store their actual ruleset number. * * (*1) TLVs inside IPFW_TLV_TBL_LIST needs to be sorted ascending * accoring to their idx field and there has to be no duplicates. * (*2) Numbered rules inside IPFW_TLV_RULE_LIST needs to be sorted ascending. * (*3) Each ip_fw structure needs to be aligned to u64 boundary. */ void ipfw_add(char *av[]) { uint32_t rulebuf[1024]; int rbufsize, default_off, tlen, rlen; size_t sz; struct tidx ts; struct ip_fw_rule *rule; caddr_t tbuf; ip_fw3_opheader *op3; ipfw_obj_ctlv *ctlv, *tstate; rbufsize = sizeof(rulebuf); memset(rulebuf, 0, rbufsize); memset(&ts, 0, sizeof(ts)); /* Optimize case with no tables */ default_off = sizeof(ipfw_obj_ctlv) + sizeof(ip_fw3_opheader); op3 = (ip_fw3_opheader *)rulebuf; ctlv = (ipfw_obj_ctlv *)(op3 + 1); rule = (struct ip_fw_rule *)(ctlv + 1); rbufsize -= default_off; compile_rule(av, (uint32_t *)rule, &rbufsize, &ts); /* Align rule size to u64 boundary */ rlen = roundup2(rbufsize, sizeof(uint64_t)); tbuf = NULL; sz = 0; tstate = NULL; if (ts.count != 0) { /* Some tables. We have to alloc more data */ tlen = ts.count * sizeof(ipfw_obj_ntlv); sz = default_off + sizeof(ipfw_obj_ctlv) + tlen + rlen; if ((tbuf = calloc(1, sz)) == NULL) err(EX_UNAVAILABLE, "malloc() failed for IP_FW_ADD"); op3 = (ip_fw3_opheader *)tbuf; /* Tables first */ ctlv = (ipfw_obj_ctlv *)(op3 + 1); ctlv->head.type = IPFW_TLV_TBLNAME_LIST; ctlv->head.length = sizeof(ipfw_obj_ctlv) + tlen; ctlv->count = ts.count; ctlv->objsize = sizeof(ipfw_obj_ntlv); memcpy(ctlv + 1, ts.idx, tlen); table_sort_ctlv(ctlv); tstate = ctlv; /* Rule next */ ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length); ctlv->head.type = IPFW_TLV_RULE_LIST; ctlv->head.length = sizeof(ipfw_obj_ctlv) + rlen; ctlv->count = 1; memcpy(ctlv + 1, rule, rbufsize); } else { /* Simply add header */ sz = rlen + default_off; memset(ctlv, 0, sizeof(*ctlv)); ctlv->head.type = IPFW_TLV_RULE_LIST; ctlv->head.length = sizeof(ipfw_obj_ctlv) + rlen; ctlv->count = 1; } if (do_get3(IP_FW_XADD, op3, &sz) != 0) err(EX_UNAVAILABLE, "getsockopt(%s)", "IP_FW_XADD"); if (!co.do_quiet) { struct format_opts sfo; struct buf_pr bp; memset(&sfo, 0, sizeof(sfo)); sfo.tstate = tstate; sfo.set_mask = (uint32_t)(-1); bp_alloc(&bp, 4096); show_static_rule(&co, &sfo, &bp, rule, NULL); printf("%s", bp.buf); bp_free(&bp); } if (tbuf != NULL) free(tbuf); if (ts.idx != NULL) free(ts.idx); } /* * clear the counters or the log counters. * optname has the following values: * 0 (zero both counters and logging) * 1 (zero logging only) */ void ipfw_zero(int ac, char *av[], int optname) { ipfw_range_tlv rt; uint32_t arg; int failed = EX_OK; char const *errstr; char const *name = optname ? "RESETLOG" : "ZERO"; optname = optname ? IP_FW_XRESETLOG : IP_FW_XZERO; memset(&rt, 0, sizeof(rt)); av++; ac--; if (ac == 0) { /* clear all entries */ rt.flags = IPFW_RCFLAG_ALL; if (do_range_cmd(optname, &rt) < 0) err(EX_UNAVAILABLE, "setsockopt(IP_FW_X%s)", name); if (!co.do_quiet) printf("%s.\n", optname == IP_FW_XZERO ? "Accounting cleared":"Logging counts reset"); return; } while (ac) { /* Rule number */ if (isdigit(**av)) { arg = strtonum(*av, 0, 0xffff, &errstr); if (errstr) errx(EX_DATAERR, "invalid rule number %s\n", *av); rt.start_rule = arg; rt.end_rule = arg; rt.flags |= IPFW_RCFLAG_RANGE; if (co.use_set != 0) { rt.set = co.use_set - 1; rt.flags |= IPFW_RCFLAG_SET; } if (do_range_cmd(optname, &rt) != 0) { warn("rule %u: setsockopt(IP_FW_X%s)", arg, name); failed = EX_UNAVAILABLE; } else if (rt.new_set == 0) { printf("Entry %d not found\n", arg); failed = EX_UNAVAILABLE; } else if (!co.do_quiet) printf("Entry %d %s.\n", arg, optname == IP_FW_XZERO ? "cleared" : "logging count reset"); } else { errx(EX_USAGE, "invalid rule number ``%s''", *av); } av++; ac--; } if (failed != EX_OK) exit(failed); } void ipfw_flush(int force) { ipfw_range_tlv rt; if (!force && !co.do_quiet) { /* need to ask user */ int c; printf("Are you sure? [yn] "); fflush(stdout); do { c = toupper(getc(stdin)); while (c != '\n' && getc(stdin) != '\n') if (feof(stdin)) return; /* and do not flush */ } while (c != 'Y' && c != 'N'); printf("\n"); if (c == 'N') /* user said no */ return; } if (co.do_pipe) { dummynet_flush(); return; } /* `ipfw set N flush` - is the same that `ipfw delete set N` */ memset(&rt, 0, sizeof(rt)); if (co.use_set != 0) { rt.set = co.use_set - 1; rt.flags = IPFW_RCFLAG_SET; } else rt.flags = IPFW_RCFLAG_ALL; if (do_range_cmd(IP_FW_XDEL, &rt) != 0) err(EX_UNAVAILABLE, "setsockopt(IP_FW_XDEL)"); if (!co.do_quiet) printf("Flushed all %s.\n", co.do_pipe ? "pipes" : "rules"); } static struct _s_x intcmds[] = { { "talist", TOK_TALIST }, { "iflist", TOK_IFLIST }, { "vlist", TOK_VLIST }, { NULL, 0 } }; void ipfw_internal_handler(int ac, char *av[]) { int tcmd; ac--; av++; NEED1("internal cmd required"); if ((tcmd = match_token(intcmds, *av)) == -1) errx(EX_USAGE, "invalid internal sub-cmd: %s", *av); switch (tcmd) { case TOK_IFLIST: ipfw_list_tifaces(); break; case TOK_TALIST: ipfw_list_ta(ac, av); break; case TOK_VLIST: ipfw_list_values(ac, av); break; } } static int ipfw_get_tracked_ifaces(ipfw_obj_lheader **polh) { ipfw_obj_lheader req, *olh; size_t sz; memset(&req, 0, sizeof(req)); sz = sizeof(req); if (do_get3(IP_FW_XIFLIST, &req.opheader, &sz) != 0) { if (errno != ENOMEM) return (errno); } sz = req.size; if ((olh = calloc(1, sz)) == NULL) return (ENOMEM); olh->size = sz; if (do_get3(IP_FW_XIFLIST, &olh->opheader, &sz) != 0) { free(olh); return (errno); } *polh = olh; return (0); } static int ifinfo_cmp(const void *a, const void *b) { ipfw_iface_info *ia, *ib; ia = (ipfw_iface_info *)a; ib = (ipfw_iface_info *)b; return (stringnum_cmp(ia->ifname, ib->ifname)); } /* * Retrieves table list from kernel, * optionally sorts it and calls requested function for each table. * Returns 0 on success. */ static void ipfw_list_tifaces() { ipfw_obj_lheader *olh; ipfw_iface_info *info; int i, error; if ((error = ipfw_get_tracked_ifaces(&olh)) != 0) err(EX_OSERR, "Unable to request ipfw tracked interface list"); qsort(olh + 1, olh->count, olh->objsize, ifinfo_cmp); info = (ipfw_iface_info *)(olh + 1); for (i = 0; i < olh->count; i++) { if (info->flags & IPFW_IFFLAG_RESOLVED) printf("%s ifindex: %d refcount: %u changes: %u\n", info->ifname, info->ifindex, info->refcnt, info->gencnt); else printf("%s ifindex: unresolved refcount: %u changes: %u\n", info->ifname, info->refcnt, info->gencnt); info = (ipfw_iface_info *)((caddr_t)info + olh->objsize); } free(olh); } Index: projects/building-blocks/sbin/ipfw =================================================================== --- projects/building-blocks/sbin/ipfw (revision 278311) +++ projects/building-blocks/sbin/ipfw (revision 278312) Property changes on: projects/building-blocks/sbin/ipfw ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sbin/ipfw:r277724-278311 Index: projects/building-blocks/sbin =================================================================== --- projects/building-blocks/sbin (revision 278311) +++ projects/building-blocks/sbin (revision 278312) Property changes on: projects/building-blocks/sbin ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sbin:r278195-278311 Index: projects/building-blocks/sys/arm/broadcom/bcm2835/files.bcm2835 =================================================================== --- projects/building-blocks/sys/arm/broadcom/bcm2835/files.bcm2835 (revision 278311) +++ projects/building-blocks/sys/arm/broadcom/bcm2835/files.bcm2835 (revision 278312) @@ -1,29 +1,49 @@ # $FreeBSD$ arm/broadcom/bcm2835/bcm2835_bsc.c optional bcm2835_bsc arm/broadcom/bcm2835/bcm2835_common.c optional fdt arm/broadcom/bcm2835/bcm2835_cpufreq.c standard arm/broadcom/bcm2835/bcm2835_dma.c standard arm/broadcom/bcm2835/bcm2835_fb.c optional sc arm/broadcom/bcm2835/bcm2835_fbd.c optional vt arm/broadcom/bcm2835/bcm2835_gpio.c optional gpio arm/broadcom/bcm2835/bcm2835_intr.c standard arm/broadcom/bcm2835/bcm2835_machdep.c standard arm/broadcom/bcm2835/bcm2835_mbox.c standard arm/broadcom/bcm2835/bcm2835_sdhci.c optional sdhci arm/broadcom/bcm2835/bcm2835_spi.c optional bcm2835_spi arm/broadcom/bcm2835/bcm2835_systimer.c standard arm/broadcom/bcm2835/bcm2835_wdog.c standard arm/arm/bus_space_base.c standard arm/arm/bus_space_generic.c standard arm/arm/bus_space_asm_generic.S standard arm/arm/cpufunc_asm_arm11.S standard arm/arm/cpufunc_asm_arm11x6.S standard arm/arm/cpufunc_asm_armv5.S standard arm/arm/cpufunc_asm_armv6.S standard kern/kern_clocksource.c standard dev/mbox/mbox_if.m standard dev/ofw/ofw_cpu.c standard + +# VideoCore driver +contrib/vchiq/interface/compat/vchi_bsd.c standard \ + compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" +contrib/vchiq/interface/vchiq_arm/vchiq_2835_arm.c standard \ + compile-with "${NORMAL_C} -Wno-unused -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" +contrib/vchiq/interface/vchiq_arm/vchiq_arm.c standard \ + compile-with "${NORMAL_C} -Wno-unused -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" +contrib/vchiq/interface/vchiq_arm/vchiq_connected.c standard \ + compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" +contrib/vchiq/interface/vchiq_arm/vchiq_core.c standard \ + compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" +contrib/vchiq/interface/vchiq_arm/vchiq_kern_lib.c standard \ + compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" +contrib/vchiq/interface/vchiq_arm/vchiq_kmod.c standard \ + compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" +contrib/vchiq/interface/vchiq_arm/vchiq_shim.c standard \ + compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" +contrib/vchiq/interface/vchiq_arm/vchiq_util.c standard \ + compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq" Index: projects/building-blocks/sys/boot/amd64/boot1.efi/fat.tmpl.bz2.uu =================================================================== --- projects/building-blocks/sys/boot/amd64/boot1.efi/fat.tmpl.bz2.uu (revision 278311) +++ projects/building-blocks/sys/boot/amd64/boot1.efi/fat.tmpl.bz2.uu (revision 278312) @@ -1,21 +1,20 @@ FAT template boot filesystem created by generate-fat.sh DO NOT EDIT $FreeBSD$ begin 644 fat.tmpl.bz2 -M0EIH.3%!62936=AO?&0`&J9____[ZZKJJ_^N_ZO^Z_^[OO_\`5`(0!0&#$D" -M0$)$2&(H:`81HT#)D!H-#U`T#31IH-&(``R8$9_I)6[MY/, -M(H=/()+4&!(3V0"20C3J5$L5@2`219,"T6JI,@0"2*2\=LAD6=>N6(8QSW'U+N42P^'5X@7X``23=EA``#Z,O)^-VTX@ -M`+E!=,&6PV11C:*D8K#^<%FTG-%!@PR72@\ZU0BF1Y] -MF-FPGL2L>4QCU&O/>89^#H$6^<;&WKC9W52KUX."CM6+GD;(=1!MUD,,?Y[] -MTLAG0];,:B^]M%BH0J1":_C-*2I9R3AS#,&0>$RCY'T/R?HR!?'5$MILQ:!" -M+;10A*!&^<(_/8>D8I-DTU,)ZAZ0VA-!M0T'J`>H#"9 +M'I#0-H&HQI&0&3&FH>H>*`JHHU3V]1%/4/2``T#0`!H``#0`````#1H,@``6 +M'1&G'&@?$6[T#A)?X8$A160"20BO#")0J4TB1*4GXF$B4I,&>43+=_?K=#3* +M6]R"ZNKJZI,9*68E8*E2Q +M4J5*E3'(1830A"$(12A-"<(0A#]VD)H0A"$,>I0FA"$(0I\>P^=F5:M6K5JU +M:DI3:64UN;[7%5B]Y-^\]@_K@B:N\/,5F%&H<\G#IXQXAEFC&D?![6%0'6MR +MX1@@%FC"FD`M7,/SXFNG:2`'-0<-C$8^+$N.7M1B,^6)9,DV9,0A\OL<:C"L +ML1V&,<\9YRB>XV#BG")'6NKRK^("UF2XO?_L!#29">MGDF$R3).!PX&%E,4C +M''=(FL1.`_3?CN@-IB2PI3!FF\<8X.X@D,>CA90I)#M$XRPNDFJELL<3=1?8 +M2B7\5Z64,!7Z;EEBW-MXN-4IJ@W$462]-*\YCR,-B,5[W?=3&L/U>SX,WV#\ +M\B`:I"'0Z)5"$1B.E)(K[5I4RS`%R$>Y\D0NR*,;<9CZ:^V3P(I?D Makefile.fat echo '# $FreeBSD$' >> Makefile.fat echo "BOOT1_OFFSET=0x$BOOT1_OFFSET" >> Makefile.fat bzip2 $OUTPUT_FILE echo 'FAT template boot filesystem created by generate-fat.sh' > $OUTPUT_FILE.bz2.uu echo 'DO NOT EDIT' >> $OUTPUT_FILE.bz2.uu echo '$FreeBSD$' >> $OUTPUT_FILE.bz2.uu uuencode $OUTPUT_FILE.bz2 $OUTPUT_FILE.bz2 >> $OUTPUT_FILE.bz2.uu rm $OUTPUT_FILE.bz2 Index: projects/building-blocks/sys/boot =================================================================== --- projects/building-blocks/sys/boot (revision 278311) +++ projects/building-blocks/sys/boot (revision 278312) Property changes on: projects/building-blocks/sys/boot ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys/boot:r278256-278311 Index: projects/building-blocks/sys/conf/kern.pre.mk =================================================================== --- projects/building-blocks/sys/conf/kern.pre.mk (revision 278311) +++ projects/building-blocks/sys/conf/kern.pre.mk (revision 278312) @@ -1,202 +1,205 @@ # $FreeBSD$ # Part of a unified Makefile for building kernels. This part contains all # of the definitions that need to be before %BEFORE_DEPEND. # Allow user to configure things that only effect src tree builds. # Note: This is duplicated from src.sys.mk to ensure that we include # /etc/src.conf when building the kernel. Kernels can be built without # the rest of /usr/src, but they still always process SRCCONF even though # the normal mechanisms to prevent that (compiling out of tree) won't # work. To ensure they do work, we have to duplicate thee few lines here. SRCCONF?= /etc/src.conf .if (exists(${SRCCONF}) || ${SRCCONF} != "/etc/src.conf") && !target(_srcconf_included_) .include "${SRCCONF}" _srcconf_included_: .endif .include .include .include "kern.opts.mk" # Can be overridden by makeoptions or /etc/make.conf KERNEL_KO?= kernel KERNEL?= kernel KODIR?= /boot/${KERNEL} LDSCRIPT_NAME?= ldscript.$M LDSCRIPT?= $S/conf/${LDSCRIPT_NAME} M= ${MACHINE_CPUARCH} AWK?= awk CP?= cp LINT?= lint NM?= nm OBJCOPY?= objcopy SIZE?= size .if defined(DEBUG) _MINUS_O= -O CTFFLAGS+= -g .else .if ${MACHINE_CPUARCH} == "powerpc" _MINUS_O= -O # gcc miscompiles some code at -O2 .else _MINUS_O= -O2 .endif .endif .if ${MACHINE_CPUARCH} == "amd64" .if ${COMPILER_TYPE} == "clang" COPTFLAGS?=-O2 -pipe .else COPTFLAGS?=-O2 -frename-registers -pipe .endif .else COPTFLAGS?=${_MINUS_O} -pipe .endif .if !empty(COPTFLAGS:M-O[23s]) && empty(COPTFLAGS:M-fno-strict-aliasing) COPTFLAGS+= -fno-strict-aliasing .endif .if !defined(NO_CPU_COPTFLAGS) COPTFLAGS+= ${_CPUCFLAGS} .endif NOSTDINC= -nostdinc INCLUDES= ${NOSTDINC} ${INCLMAGIC} -I. -I$S # This hack lets us use the OpenBSD altq code without spamming a new # include path into contrib'ed source files. INCLUDES+= -I$S/contrib/altq .if make(depend) || make(kernel-depend) # ... and the same for ipfilter INCLUDES+= -I$S/contrib/ipfilter # ... and the same for ath INCLUDES+= -I$S/dev/ath -I$S/dev/ath/ath_hal -I$S/contrib/dev/ath/ath_hal # ... and the same for the NgATM stuff INCLUDES+= -I$S/contrib/ngatm +# ... and the same for vchiq +INCLUDES+= -I$S/contrib/vchiq + # ... and the same for twa INCLUDES+= -I$S/dev/twa # ... and the same for cxgb and cxgbe INCLUDES+= -I$S/dev/cxgb -I$S/dev/cxgbe .endif CFLAGS= ${COPTFLAGS} ${DEBUG} ${CWARNFLAGS} CFLAGS+= ${INCLUDES} -D_KERNEL -DHAVE_KERNEL_OPTION_HEADERS -include opt_global.h CFLAGS_PARAM_INLINE_UNIT_GROWTH?=100 CFLAGS_PARAM_LARGE_FUNCTION_GROWTH?=1000 .if ${MACHINE_CPUARCH} == "mips" CFLAGS_ARCH_PARAMS?=--param max-inline-insns-single=1000 .endif CFLAGS.gcc+= -fno-common -finline-limit=${INLINE_LIMIT} CFLAGS.gcc+= --param inline-unit-growth=${CFLAGS_PARAM_INLINE_UNIT_GROWTH} CFLAGS.gcc+= --param large-function-growth=${CFLAGS_PARAM_LARGE_FUNCTION_GROWTH} .if defined(CFLAGS_ARCH_PARAMS) CFLAGS.gcc+=${CFLAGS_ARCH_PARAMS} .endif WERROR?= -Werror # XXX LOCORE means "don't declare C stuff" not "for locore.s". ASM_CFLAGS= -x assembler-with-cpp -DLOCORE ${CFLAGS} ${ASM_CFLAGS.${.IMPSRC:T}} .if defined(PROFLEVEL) && ${PROFLEVEL} >= 1 CFLAGS+= -DGPROF CFLAGS.gcc+= -falign-functions=16 .if ${PROFLEVEL} >= 2 CFLAGS+= -DGPROF4 -DGUPROF PROF= -pg .if ${COMPILER_TYPE} == "gcc" PROF+= -mprofiler-epilogue .endif .else PROF= -pg .endif .endif DEFINED_PROF= ${PROF} # Put configuration-specific C flags last (except for ${PROF}) so that they # can override the others. CFLAGS+= ${CONF_CFLAGS} # Optional linting. This can be overridden in /etc/make.conf. LINTFLAGS= ${LINTOBJKERNFLAGS} NORMAL_C= ${CC} -c ${CFLAGS} ${WERROR} ${PROF} ${.IMPSRC} NORMAL_S= ${CC} -c ${ASM_CFLAGS} ${WERROR} ${.IMPSRC} PROFILE_C= ${CC} -c ${CFLAGS} ${WERROR} ${.IMPSRC} NORMAL_C_NOWERROR= ${CC} -c ${CFLAGS} ${PROF} ${.IMPSRC} NORMAL_M= ${AWK} -f $S/tools/makeobjops.awk ${.IMPSRC} -c ; \ ${CC} -c ${CFLAGS} ${WERROR} ${PROF} ${.PREFIX}.c NORMAL_FW= uudecode -o ${.TARGET} ${.ALLSRC} NORMAL_FWO= ${LD} -b binary --no-warn-mismatch -d -warn-common -r \ -o ${.TARGET} ${.ALLSRC:M*.fw} # Special flags for managing the compat compiles for ZFS ZFS_CFLAGS= -DFREEBSD_NAMECACHE -DBUILDING_ZFS -nostdinc -I$S/cddl/compat/opensolaris -I$S/cddl/contrib/opensolaris/uts/common/fs/zfs -I$S/cddl/contrib/opensolaris/uts/common/zmod -I$S/cddl/contrib/opensolaris/uts/common -I$S -I$S/cddl/contrib/opensolaris/common/zfs -I$S/cddl/contrib/opensolaris/common ${CFLAGS} -Wno-unknown-pragmas -Wno-missing-prototypes -Wno-undef -Wno-strict-prototypes -Wno-cast-qual -Wno-parentheses -Wno-redundant-decls -Wno-missing-braces -Wno-uninitialized -Wno-unused -Wno-inline -Wno-switch -Wno-pointer-arith -Wno-unknown-pragmas ZFS_CFLAGS+= -include $S/cddl/compat/opensolaris/sys/debug_compat.h ZFS_ASM_CFLAGS= -x assembler-with-cpp -DLOCORE ${ZFS_CFLAGS} ZFS_C= ${CC} -c ${ZFS_CFLAGS} ${WERROR} ${PROF} ${.IMPSRC} ZFS_S= ${CC} -c ${ZFS_ASM_CFLAGS} ${WERROR} ${.IMPSRC} .if ${MK_CTF} != "no" NORMAL_CTFCONVERT= ${CTFCONVERT} ${CTFFLAGS} ${.TARGET} .elif ${MAKE_VERSION} >= 5201111300 NORMAL_CTFCONVERT= .else NORMAL_CTFCONVERT= @: .endif NORMAL_LINT= ${LINT} ${LINTFLAGS} ${CFLAGS:M-[DIU]*} ${.IMPSRC} # Infiniband C flags. Correct include paths and omit errors that linux # does not honor. OFEDINCLUDES= -I$S/ofed/include/ OFEDNOERR= -Wno-cast-qual -Wno-pointer-arith ${GCC_MS_EXTENSIONS} OFEDCFLAGS= ${CFLAGS:N-I*} ${OFEDINCLUDES} ${CFLAGS:M-I*} ${OFEDNOERR} OFED_C_NOIMP= ${CC} -c -o ${.TARGET} ${OFEDCFLAGS} ${WERROR} ${PROF} OFED_C= ${OFED_C_NOIMP} ${.IMPSRC} GEN_CFILES= $S/$M/$M/genassym.c ${MFILES:T:S/.m$/.c/} SYSTEM_CFILES= config.c env.c hints.c vnode_if.c SYSTEM_DEP= Makefile ${SYSTEM_OBJS} SYSTEM_OBJS= locore.o ${MDOBJS} ${OBJS} SYSTEM_OBJS+= ${SYSTEM_CFILES:.c=.o} SYSTEM_OBJS+= hack.So SYSTEM_LD= @${LD} -Bdynamic -T ${LDSCRIPT} ${_LDFLAGS} --no-warn-mismatch \ - -warn-common -export-dynamic -dynamic-linker /red/herring \ + --warn-common --export-dynamic --dynamic-linker /red/herring \ -o ${.TARGET} -X ${SYSTEM_OBJS} vers.o SYSTEM_LD_TAIL= @${OBJCOPY} --strip-symbol gcc2_compiled. ${.TARGET} ; \ ${SIZE} ${.TARGET} ; chmod 755 ${.TARGET} SYSTEM_DEP+= ${LDSCRIPT} # MKMODULESENV is set here so that port makefiles can augment # them. MKMODULESENV+= MAKEOBJDIRPREFIX=${.OBJDIR}/modules KMODDIR=${KODIR} MKMODULESENV+= MACHINE_CPUARCH=${MACHINE_CPUARCH} MKMODULESENV+= MACHINE=${MACHINE} MACHINE_ARCH=${MACHINE_ARCH} .if (${KERN_IDENT} == LINT) MKMODULESENV+= ALL_MODULES=LINT .endif .if defined(MODULES_OVERRIDE) MKMODULESENV+= MODULES_OVERRIDE="${MODULES_OVERRIDE}" .endif .if defined(WITHOUT_MODULES) MKMODULESENV+= WITHOUT_MODULES="${WITHOUT_MODULES}" .endif .if defined(DEBUG) MKMODULESENV+= DEBUG_FLAGS="${DEBUG}" .endif # Detect kernel config options that force stack frames to be turned on. DDB_ENABLED!= grep DDB opt_ddb.h || true ; echo DTR_ENABLED!= grep KDTRACE_FRAME opt_kdtrace.h || true ; echo HWPMC_ENABLED!= grep HWPMC opt_hwpmc_hooks.h || true ; echo Index: projects/building-blocks/sys/conf =================================================================== --- projects/building-blocks/sys/conf (revision 278311) +++ projects/building-blocks/sys/conf (revision 278312) Property changes on: projects/building-blocks/sys/conf ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys/conf:r278256-278311 Index: projects/building-blocks/sys/contrib/vchiq/interface/compat/list.h =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/compat/list.h (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/compat/list.h (revision 278312) @@ -0,0 +1,256 @@ +/* $NetBSD: list.h,v 1.5 2014/08/20 15:26:52 riastradh Exp $ */ + +/*- + * Copyright (c) 2013 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Taylor R. Campbell. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Notes on porting: + * + * - LIST_HEAD(x) means a declaration `struct list_head x = + * LIST_HEAD_INIT(x)' in Linux, but something else in NetBSD. + * Replace by the expansion. + * + * - The `_rcu' routines here are not actually pserialize(9)-safe. + * They need dependent read memory barriers added. Please fix this + * if you need to use them with pserialize(9). + */ + +#ifndef _LINUX_LIST_H_ +#define _LINUX_LIST_H_ + +#include + +#define container_of(ptr, type, member) \ +({ \ + __typeof(((type *)0)->member) *_p = (ptr); \ + (type *)((char *)_p - offsetof(type, member)); \ +}) + +/* + * Doubly-linked lists. + */ + +struct list_head { + struct list_head *prev; + struct list_head *next; +}; + +#define LIST_HEAD_INIT(name) { .prev = &(name), .next = &(name) } + +static inline void +INIT_LIST_HEAD(struct list_head *head) +{ + head->prev = head; + head->next = head; +} + +static inline struct list_head * +list_first(const struct list_head *head) +{ + return head->next; +} + +static inline struct list_head * +list_last(const struct list_head *head) +{ + return head->prev; +} + +static inline struct list_head * +list_next(const struct list_head *node) +{ + return node->next; +} + +static inline struct list_head * +list_prev(const struct list_head *node) +{ + return node->prev; +} + +static inline int +list_empty(const struct list_head *head) +{ + return (head->next == head); +} + +static inline int +list_is_singular(const struct list_head *head) +{ + + if (list_empty(head)) + return false; + if (head->next != head->prev) + return false; + return true; +} + +static inline void +__list_add_between(struct list_head *prev, struct list_head *node, + struct list_head *next) +{ + prev->next = node; + node->prev = prev; + node->next = next; + next->prev = node; +} + +static inline void +list_add(struct list_head *node, struct list_head *head) +{ + __list_add_between(head, node, head->next); +} + +static inline void +list_add_tail(struct list_head *node, struct list_head *head) +{ + __list_add_between(head->prev, node, head); +} + +static inline void +list_del(struct list_head *entry) +{ + entry->prev->next = entry->next; + entry->next->prev = entry->prev; +} + +static inline void +__list_splice_between(struct list_head *prev, const struct list_head *list, + struct list_head *next) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + + first->prev = prev; + prev->next = first; + + last->next = next; + next->prev = last; +} + +static inline void +list_splice(const struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) + __list_splice_between(head, list, head->next); +} + +static inline void +list_splice_tail(const struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) + __list_splice_between(head->prev, list, head); +} + +static inline void +list_move(struct list_head *node, struct list_head *head) +{ + list_del(node); + list_add(node, head); +} + +static inline void +list_move_tail(struct list_head *node, struct list_head *head) +{ + list_del(node); + list_add_tail(node, head); +} + +static inline void +list_replace(struct list_head *old, struct list_head *new) +{ + new->prev = old->prev; + old->prev->next = new; + new->next = old->next; + old->next->prev = new; +} + +static inline void +list_del_init(struct list_head *node) +{ + list_del(node); + INIT_LIST_HEAD(node); +} + +#define list_entry(PTR, TYPE, FIELD) container_of(PTR, TYPE, FIELD) +#define list_first_entry(PTR, TYPE, FIELD) \ + list_entry(list_first((PTR)), TYPE, FIELD) +#define list_last_entry(PTR, TYPE, FIELD) \ + list_entry(list_last((PTR)), TYPE, FIELD) +#define list_next_entry(ENTRY, FIELD) \ + list_entry(list_next(&(ENTRY)->FIELD), typeof(*(ENTRY)), FIELD) +#define list_prev_entry(ENTRY, FIELD) \ + list_entry(list_prev(&(ENTRY)->FIELD), typeof(*(ENTRY)), FIELD) + +#define list_for_each(VAR, HEAD) \ + for ((VAR) = list_first((HEAD)); \ + (VAR) != (HEAD); \ + (VAR) = list_next((VAR))) + +#define list_for_each_safe(VAR, NEXT, HEAD) \ + for ((VAR) = list_first((HEAD)); \ + ((VAR) != (HEAD)) && ((NEXT) = list_next((VAR)), 1); \ + (VAR) = (NEXT)) + +#define list_for_each_entry(VAR, HEAD, FIELD) \ + for ((VAR) = list_entry(list_first((HEAD)), typeof(*(VAR)), FIELD); \ + &(VAR)->FIELD != (HEAD); \ + (VAR) = list_entry(list_next(&(VAR)->FIELD), typeof(*(VAR)), \ + FIELD)) + +#define list_for_each_entry_reverse(VAR, HEAD, FIELD) \ + for ((VAR) = list_entry(list_last((HEAD)), typeof(*(VAR)), FIELD); \ + &(VAR)->FIELD != (HEAD); \ + (VAR) = list_entry(list_prev(&(VAR)->FIELD), typeof(*(VAR)), \ + FIELD)) + +#define list_for_each_entry_safe(VAR, NEXT, HEAD, FIELD) \ + for ((VAR) = list_entry(list_first((HEAD)), typeof(*(VAR)), FIELD); \ + (&(VAR)->FIELD != (HEAD)) && \ + ((NEXT) = list_entry(list_next(&(VAR)->FIELD), \ + typeof(*(VAR)), FIELD), 1); \ + (VAR) = (NEXT)) + +#define list_for_each_entry_continue(VAR, HEAD, FIELD) \ + for ((VAR) = list_next_entry((VAR), FIELD); \ + &(VAR)->FIELD != (HEAD); \ + (VAR) = list_next_entry((VAR), FIELD)) + +#define list_for_each_entry_continue_reverse(VAR, HEAD, FIELD) \ + for ((VAR) = list_prev_entry((VAR), FIELD); \ + &(VAR)->FIELD != (HEAD); \ + (VAR) = list_prev_entry((VAR), FIELD)) + +#define list_for_each_entry_safe_from(VAR, NEXT, HEAD, FIELD) \ + for (; \ + (&(VAR)->FIELD != (HEAD)) && \ + ((NEXT) = list_next_entry((VAR), FIELD)); \ + (VAR) = (NEXT)) + +#endif /* _LINUX_LIST_H_ */ Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/compat/list.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/compat/vchi_bsd.c =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/compat/vchi_bsd.c (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/compat/vchi_bsd.c (revision 278312) @@ -0,0 +1,532 @@ +/*- + * Copyright (c) 2010 Max Khon + * All rights reserved. + * + * This software was developed by Max Khon under sponsorship from + * the FreeBSD Foundation and Ethon Technologies GmbH. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: bsd-compat.c 9253 2010-09-02 10:12:09Z fjoe $ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "mbox_if.h" + +#include + +MALLOC_DEFINE(M_VCHI, "VCHI", "VCHI"); + +/* + * Timer API + */ +static void +run_timer(void *arg) +{ + struct timer_list *t = (struct timer_list *) arg; + void (*function)(unsigned long); + + mtx_lock_spin(&t->mtx); + if (callout_pending(&t->callout)) { + /* callout was reset */ + mtx_unlock_spin(&t->mtx); + return; + } + if (!callout_active(&t->callout)) { + /* callout was stopped */ + mtx_unlock_spin(&t->mtx); + return; + } + callout_deactivate(&t->callout); + + function = t->function; + mtx_unlock_spin(&t->mtx); + + function(t->data); +} + +void +init_timer(struct timer_list *t) +{ + mtx_init(&t->mtx, "dahdi timer lock", NULL, MTX_SPIN); + callout_init(&t->callout, CALLOUT_MPSAFE); + t->expires = 0; + /* + * function and data are not initialized intentionally: + * they are not initialized by Linux implementation too + */ +} + +void +setup_timer(struct timer_list *t, void (*function)(unsigned long), unsigned long data) +{ + t->function = function; + t->data = data; + init_timer(t); +} + +void +mod_timer(struct timer_list *t, unsigned long expires) +{ + mtx_lock_spin(&t->mtx); + callout_reset(&t->callout, expires - jiffies, run_timer, t); + mtx_unlock_spin(&t->mtx); +} + +void +add_timer(struct timer_list *t) +{ + mod_timer(t, t->expires); +} + +int +del_timer_sync(struct timer_list *t) +{ + mtx_lock_spin(&t->mtx); + callout_stop(&t->callout); + mtx_unlock_spin(&t->mtx); + + mtx_destroy(&t->mtx); + return 0; +} + +int +del_timer(struct timer_list *t) +{ + del_timer_sync(t); + return 0; +} + +/* + * Completion API + */ +void +init_completion(struct completion *c) +{ + cv_init(&c->cv, "VCHI completion cv"); + mtx_init(&c->lock, "VCHI completion lock", "condvar", MTX_DEF); + c->done = 0; +} + +void +destroy_completion(struct completion *c) +{ + cv_destroy(&c->cv); + mtx_destroy(&c->lock); +} + +void +complete(struct completion *c) +{ + mtx_lock(&c->lock); + + if (c->done >= 0) { + KASSERT(c->done < INT_MAX, ("c->done overflow")); /* XXX check */ + c->done++; + cv_signal(&c->cv); + } else { + KASSERT(c->done == -1, ("Invalid value of c->done: %d", c->done)); + } + + mtx_unlock(&c->lock); +} + +void +complete_all(struct completion *c) +{ + mtx_lock(&c->lock); + + if (c->done >= 0) { + KASSERT(c->done < INT_MAX, ("c->done overflow")); /* XXX check */ + c->done = -1; + cv_broadcast(&c->cv); + } else { + KASSERT(c->done == -1, ("Invalid value of c->done: %d", c->done)); + } + + mtx_unlock(&c->lock); +} + +void +INIT_COMPLETION_locked(struct completion *c) +{ + mtx_lock(&c->lock); + + c->done = 0; + + mtx_unlock(&c->lock); +} + +static void +_completion_claim(struct completion *c) +{ + + KASSERT(mtx_owned(&c->lock), + ("_completion_claim should be called with acquired lock")); + KASSERT(c->done != 0, ("_completion_claim on non-waited completion")); + if (c->done > 0) + c->done--; + else + KASSERT(c->done == -1, ("Invalid value of c->done: %d", c->done)); +} + +void +wait_for_completion(struct completion *c) +{ + mtx_lock(&c->lock); + if (!c->done) + cv_wait(&c->cv, &c->lock); + c->done--; + mtx_unlock(&c->lock); +} + +int +try_wait_for_completion(struct completion *c) +{ + int res = 0; + + mtx_lock(&c->lock); + if (!c->done) + res = 1; + else + c->done--; + mtx_unlock(&c->lock); + return res == 0; +} + +int +wait_for_completion_interruptible_timeout(struct completion *c, unsigned long timeout) +{ + int res = 0; + unsigned long start, now; + start = jiffies; + + mtx_lock(&c->lock); + while (c->done == 0) { + res = cv_timedwait_sig(&c->cv, &c->lock, timeout); + if (res) + goto out; + now = jiffies; + if (timeout < (now - start)) { + res = EWOULDBLOCK; + goto out; + } + + timeout -= (now - start); + start = now; + } + + _completion_claim(c); + res = 0; + +out: + mtx_unlock(&c->lock); + + if (res == EWOULDBLOCK) { + return 0; + } else if ((res == EINTR) || (res == ERESTART)) { + return -ERESTART; + } else { + KASSERT((res == 0), ("res = %d", res)); + return timeout; + } +} + +int +wait_for_completion_interruptible(struct completion *c) +{ + int res = 0; + + mtx_lock(&c->lock); + while (c->done == 0) { + res = cv_wait_sig(&c->cv, &c->lock); + if (res) + goto out; + } + + _completion_claim(c); + +out: + mtx_unlock(&c->lock); + + if ((res == EINTR) || (res == ERESTART)) + res = -ERESTART; + return res; +} + +int +wait_for_completion_killable(struct completion *c) +{ + + return wait_for_completion_interruptible(c); +} + +/* + * Semaphore API + */ + +void sema_sysinit(void *arg) +{ + struct semaphore *s = arg; + + printf("sema_sysinit\n"); + _sema_init(s, 1); +} + +void +_sema_init(struct semaphore *s, int value) +{ + bzero(s, sizeof(*s)); + mtx_init(&s->mtx, "sema lock", "VCHIQ sepmaphore backing lock", + MTX_DEF | MTX_NOWITNESS | MTX_QUIET); + cv_init(&s->cv, "sema cv"); + s->value = value; +} + +void +_sema_destroy(struct semaphore *s) +{ + mtx_destroy(&s->mtx); + cv_destroy(&s->cv); +} + +void +down(struct semaphore *s) +{ + + mtx_lock(&s->mtx); + while (s->value == 0) { + s->waiters++; + cv_wait(&s->cv, &s->mtx); + s->waiters--; + } + + s->value--; + mtx_unlock(&s->mtx); +} + +int +down_interruptible(struct semaphore *s) +{ + int ret ; + + ret = 0; + + mtx_lock(&s->mtx); + + while (s->value == 0) { + s->waiters++; + ret = cv_wait_sig(&s->cv, &s->mtx); + s->waiters--; + + if (ret == EINTR) { + mtx_unlock(&s->mtx); + return (-EINTR); + } + + if (ret == ERESTART) + continue; + } + + s->value--; + mtx_unlock(&s->mtx); + + return (0); +} + +int +down_trylock(struct semaphore *s) +{ + int ret; + + ret = 0; + + mtx_lock(&s->mtx); + + if (s->value > 0) { + /* Success. */ + s->value--; + ret = 0; + } else { + ret = -EAGAIN; + } + + mtx_unlock(&s->mtx); + + return (ret); +} + +void +up(struct semaphore *s) +{ + mtx_lock(&s->mtx); + s->value++; + if (s->waiters && s->value > 0) + cv_signal(&s->cv); + + mtx_unlock(&s->mtx); +} + +/* + * Logging API + */ +void +rlprintf(int pps, const char *fmt, ...) +{ + va_list ap; + static struct timeval last_printf; + static int count; + + if (ppsratecheck(&last_printf, &count, pps)) { + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); + } +} + +void +device_rlprintf(int pps, device_t dev, const char *fmt, ...) +{ + va_list ap; + static struct timeval last_printf; + static int count; + + if (ppsratecheck(&last_printf, &count, pps)) { + va_start(ap, fmt); + device_print_prettyname(dev); + vprintf(fmt, ap); + va_end(ap); + } +} + +/* + * Signals API + */ + +void +flush_signals(VCHIQ_THREAD_T thr) +{ + printf("Implement ME: %s\n", __func__); +} + +int +fatal_signal_pending(VCHIQ_THREAD_T thr) +{ + printf("Implement ME: %s\n", __func__); + return (0); +} + +/* + * kthread API + */ + +/* + * This is a hack to avoid memory leak + */ +#define MAX_THREAD_DATA_SLOTS 32 +static int thread_data_slot = 0; + +struct thread_data { + void *data; + int (*threadfn)(void *); +}; + +static struct thread_data thread_slots[MAX_THREAD_DATA_SLOTS]; + +static void +kthread_wrapper(void *data) +{ + struct thread_data *slot; + + slot = data; + slot->threadfn(slot->data); +} + +VCHIQ_THREAD_T +vchiq_thread_create(int (*threadfn)(void *data), + void *data, + const char namefmt[], ...) +{ + VCHIQ_THREAD_T newp; + va_list ap; + char name[MAXCOMLEN+1]; + struct thread_data *slot; + + if (thread_data_slot >= MAX_THREAD_DATA_SLOTS) { + printf("kthread_create: out of thread data slots\n"); + return (NULL); + } + + slot = &thread_slots[thread_data_slot]; + slot->data = data; + slot->threadfn = threadfn; + + va_start(ap, namefmt); + vsnprintf(name, sizeof(name), namefmt, ap); + va_end(ap); + + newp = NULL; + if (kproc_create(kthread_wrapper, (void*)slot, &newp, 0, 0, + "%s", name) != 0) { + /* Just to be sure */ + newp = NULL; + } + else + thread_data_slot++; + + return newp; +} + +void +set_user_nice(VCHIQ_THREAD_T thr, int nice) +{ + /* NOOP */ +} + +void +wake_up_process(VCHIQ_THREAD_T thr) +{ + /* NOOP */ +} + +void +bcm_mbox_write(int channel, uint32_t data) +{ + device_t mbox; + + mbox = devclass_get_device(devclass_find("mbox"), 0); + + if (mbox) + MBOX_WRITE(mbox, channel, data); +} Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/compat/vchi_bsd.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/compat/vchi_bsd.h =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/compat/vchi_bsd.h (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/compat/vchi_bsd.h (revision 278312) @@ -0,0 +1,434 @@ +/*- + * Copyright (c) 2010 Max Khon + * Copyright (c) 2012 Oleksandr Tymoshenko + * Copyright (c) 2013 Jared D. McNeill + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#ifndef __VCHI_BSD_H__ +#define __VCHI_BSD_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Copy from/to user API + */ +#define copy_from_user(to, from, n) copyin((from), (to), (n)) +#define copy_to_user(to, from, n) copyout((from), (to), (n)) + +/* + * Bit API + */ + +static __inline int +test_and_set_bit(int nr, volatile void *addr) +{ + int val; + + do { + val = *(volatile int *) addr; + } while (atomic_cmpset_int(addr, val, val | (1 << nr)) == 0); + return (val & (1 << nr)); +} + +static __inline__ +int test_and_clear_bit(int nr, volatile void *addr) +{ + int val; + + do { + val = *(volatile int *) addr; + } while (atomic_cmpset_int(addr, val, val & ~(1 << nr)) == 0); + return (val & (1 << nr)); +} + +/* + * Atomic API + */ +typedef volatile unsigned atomic_t; + +#define atomic_set(p, v) (*(p) = (v)) +#define atomic_read(p) (*(p)) +#define atomic_inc(p) atomic_add_int(p, 1) +#define atomic_dec(p) atomic_subtract_int(p, 1) +#define atomic_dec_and_test(p) (atomic_fetchadd_int(p, -1) == 1) +#define atomic_inc_return(v) atomic_add_return(1, (v)) +#define atomic_dec_return(v) atomic_sub_return(1, (v)) +#define atomic_add(v, p) atomic_add_int(p, v) +#define atomic_sub(v, p) atomic_subtract_int(p, v) + +#define ATOMIC_INIT(v) (v) + +static inline int +atomic_add_return(int i, atomic_t *v) +{ + return i + atomic_fetchadd_int(v, i); +} + +static inline int +atomic_sub_return(int i, atomic_t *v) +{ + return atomic_fetchadd_int(v, -i) - i; +} + +static inline int +atomic_cmpxchg(atomic_t *v, int oldv, int newv) +{ + if (atomic_cmpset_rel_int(v, oldv, newv)) + return newv; + else + return *v; +} + +static inline int +atomic_xchg(atomic_t *v, int newv) +{ + int oldv; + if (newv == 0) + return atomic_readandclear_int(v); + else { + do { + oldv = atomic_load_acq_int(v); + } while (!atomic_cmpset_rel_int(v, oldv, newv)); + } + + return (oldv); +} + +/* + * Spinlock API + */ +typedef struct mtx spinlock_t; + +#define DEFINE_SPINLOCK(name) \ + struct mtx name +#define spin_lock_init(lock) mtx_init(lock, "VCHI spinlock " # lock, NULL, MTX_DEF) +#define spin_lock_destroy(lock) mtx_destroy(lock) +#define spin_lock(lock) mtx_lock(lock) +#define spin_unlock(lock) mtx_unlock(lock) +#define spin_lock_bh(lock) spin_lock(lock) +#define spin_unlock_bh(lock) spin_unlock(lock) + +/* + * Mutex API + */ +struct mutex { + struct mtx mtx; +}; + +#define lmutex_init(lock) mtx_init(&(lock)->mtx, #lock, NULL, MTX_DEF) +#define lmutex_lock(lock) mtx_lock(&(lock)->mtx) +#define lmutex_lock_interruptible(lock) (mtx_lock(&(lock)->mtx),0) +#define lmutex_unlock(lock) mtx_unlock(&(lock)->mtx) +#define lmutex_destroy(lock) mtx_destroy(&(lock)->mtx) + +/* + * Rwlock API + */ +typedef struct sx rwlock_t; + +#if defined(SX_ADAPTIVESPIN) && !defined(SX_NOADAPTIVE) +#define SX_NOADAPTIVE SX_ADAPTIVESPIN +#endif + +#define DEFINE_RWLOCK(name) \ + struct sx name; \ + SX_SYSINIT(name, &name, #name) +#define rwlock_init(rwlock) sx_init_flags(rwlock, "VCHI rwlock", SX_NOADAPTIVE) +#define read_lock(rwlock) sx_slock(rwlock) +#define read_unlock(rwlock) sx_sunlock(rwlock) + +#define write_lock(rwlock) sx_xlock(rwlock) +#define write_unlock(rwlock) sx_xunlock(rwlock) +#define write_lock_irqsave(rwlock, flags) \ + do { \ + sx_xlock(rwlock); \ + (void) &(flags); \ + } while (0) +#define write_unlock_irqrestore(rwlock, flags) \ + sx_xunlock(rwlock) + +#define read_lock_bh(rwlock) sx_slock(rwlock) +#define read_unlock_bh(rwlock) sx_sunlock(rwlock) +#define write_lock_bh(rwlock) sx_xlock(rwlock) +#define write_unlock_bh(rwlock) sx_xunlock(rwlock) + +/* + * Timer API + */ +struct timer_list { + struct mtx mtx; + struct callout callout; + + unsigned long expires; + void (*function)(unsigned long); + unsigned long data; +}; + +void init_timer(struct timer_list *t); +void setup_timer(struct timer_list *t, void (*function)(unsigned long), unsigned long data); +void mod_timer(struct timer_list *t, unsigned long expires); +void add_timer(struct timer_list *t); +int del_timer(struct timer_list *t); +int del_timer_sync(struct timer_list *t); + +/* + * Completion API + */ +struct completion { + struct cv cv; + struct mtx lock; + int done; +}; + +void init_completion(struct completion *c); +void destroy_completion(struct completion *c); +int try_wait_for_completion(struct completion *); +int wait_for_completion_interruptible(struct completion *); +int wait_for_completion_interruptible_timeout(struct completion *, unsigned long ticks); +int wait_for_completion_killable(struct completion *); +void wait_for_completion(struct completion *c); +void complete(struct completion *c); +void complete_all(struct completion *c); +void INIT_COMPLETION_locked(struct completion *c); + +#define INIT_COMPLETION(x) INIT_COMPLETION_locked(&(x)) + +/* + * Semaphore API + */ +struct semaphore { + struct mtx mtx; + struct cv cv; + int value; + int waiters; +}; + +#define DEFINE_SEMAPHORE(name) \ + struct semaphore name; \ + SYSINIT(name##_sema_sysinit, SI_SUB_LOCK, SI_ORDER_MIDDLE, \ + sema_sysinit, &name); \ + SYSUNINIT(name##_sema_sysuninit, SI_SUB_LOCK, SI_ORDER_MIDDLE, \ + _sema_destroy, __DEVOLATILE(void *, &(name))) + +void sema_sysinit(void *arg); +void _sema_init(struct semaphore *s, int value); +void _sema_destroy(struct semaphore *s); +void down(struct semaphore *s); +int down_interruptible(struct semaphore *s); +int down_trylock(struct semaphore *s); +void up(struct semaphore *s); + +/* + * Logging and assertions API + */ +void rlprintf(int pps, const char *fmt, ...) + __printflike(2, 3); + +void +device_rlprintf(int pps, device_t dev, const char *fmt, ...) + __printflike(3, 4); + +#define might_sleep() + +#define WARN(condition, msg) \ +({ \ + int __ret_warn_on = !!(condition); \ + if (unlikely(__ret_warn_on)) \ + printf((msg)); \ + unlikely(__ret_warn_on); \ +}) + + + +#define WARN_ON(condition) \ +({ \ + int __ret_warn_on = !!(condition); \ + if (unlikely(__ret_warn_on)) \ + printf("WARN_ON: " #condition "\n"); \ + unlikely(__ret_warn_on); \ +}) + +#define WARN_ON_ONCE(condition) ({ \ + static int __warned; \ + int __ret_warn_once = !!(condition); \ + \ + if (unlikely(__ret_warn_once)) \ + if (WARN_ON(!__warned)) \ + __warned = 1; \ + unlikely(__ret_warn_once); \ +}) + +#define BUG_ON(cond) \ + do { \ + if (cond) \ + panic("BUG_ON: " #cond); \ + } while (0) + +#define BUG() \ + do { \ + panic("BUG: %s:%d", __FILE__, __LINE__); \ + } while (0) + +#define vchiq_static_assert(cond) CTASSERT(cond) + +#define KERN_EMERG "<0>" /* system is unusable */ +#define KERN_ALERT "<1>" /* action must be taken immediately */ +#define KERN_CRIT "<2>" /* critical conditions */ +#define KERN_ERR "<3>" /* error conditions */ +#define KERN_WARNING "<4>" /* warning conditions */ +#define KERN_NOTICE "<5>" /* normal but significant condition */ +#define KERN_INFO "<6>" /* informational */ +#define KERN_DEBUG "<7>" /* debug-level messages */ +#define KERN_CONT "" + +#define printk(fmt, args...) printf(fmt, ##args) +#define vprintk(fmt, args) vprintf(fmt, args) + +/* + * Malloc API + */ +#define GFP_KERNEL 0 +#define GFP_ATOMIC 0 + +MALLOC_DECLARE(M_VCHI); + +#define kmalloc(size, flags) malloc((size), M_VCHI, M_NOWAIT | M_ZERO) +#define kcalloc(n, size, flags) malloc((n) * (size), M_VCHI, M_NOWAIT | M_ZERO) +#define kzalloc(a, b) kcalloc(1, (a), (b)) +#define kfree(p) free(p, M_VCHI) + +/* + * Kernel module API + */ +#define __init +#define __exit +#define __devinit +#define __devexit +#define __devinitdata + +/* + * Time API + */ +#if 1 +/* emulate jiffies */ +static inline unsigned long +_jiffies(void) +{ + struct timeval tv; + + microuptime(&tv); + return tvtohz(&tv); +} + +static inline unsigned long +msecs_to_jiffies(unsigned long msecs) +{ + struct timeval tv; + + tv.tv_sec = msecs / 1000000UL; + tv.tv_usec = msecs % 1000000UL; + return tvtohz(&tv); +} + +#define jiffies _jiffies() +#else +#define jiffies ticks +#endif +#define HZ hz + +#define udelay(usec) DELAY(usec) +#define mdelay(msec) DELAY((msec) * 1000) + +#define schedule_timeout(jiff) pause("dhdslp", jiff) + +#if defined(msleep) +#undef msleep +#endif +#define msleep(msec) mdelay(msec) + +#define time_after(a, b) ((a) > (b)) +#define time_after_eq(a, b) ((a) >= (b)) +#define time_before(a, b) time_after((b), (a)) + +/* + * kthread API (we use proc) + */ +typedef struct proc * VCHIQ_THREAD_T; + +VCHIQ_THREAD_T vchiq_thread_create(int (*threadfn)(void *data), + void *data, + const char namefmt[], ...); +void set_user_nice(VCHIQ_THREAD_T p, int nice); +void wake_up_process(VCHIQ_THREAD_T p); + +/* + * Proc APIs + */ +void flush_signals(VCHIQ_THREAD_T); +int fatal_signal_pending(VCHIQ_THREAD_T); + +/* + * mbox API + */ +void bcm_mbox_write(int channel, uint32_t data); + +/* + * Misc API + */ + +#define ENODATA EINVAL + +#define __user + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#define current curproc +#define EXPORT_SYMBOL(x) +#define PAGE_ALIGN(addr) round_page(addr) + +typedef void irqreturn_t; +typedef off_t loff_t; + +#define BCM2835_MBOX_CHAN_VCHIQ 3 + +#define smp_mb wmb +#define smp_rmb rmb +#define smp_wmb wmb + +#define device_print_prettyname(dev) device_printf((dev), "") + +#endif /* __VCHI_BSD_H__ */ Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/compat/vchi_bsd.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchi/connections/connection.h =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchi/connections/connection.h (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchi/connections/connection.h (revision 278312) @@ -0,0 +1,324 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef CONNECTION_H_ +#define CONNECTION_H_ + +#include "interface/vchi/vchi_cfg_internal.h" +#include "interface/vchi/vchi_common.h" +#include "interface/vchi/message_drivers/message.h" + +/****************************************************************************** + Global defs + *****************************************************************************/ + +// Opaque handle for a connection / service pair +typedef struct opaque_vchi_connection_connected_service_handle_t *VCHI_CONNECTION_SERVICE_HANDLE_T; + +// opaque handle to the connection state information +typedef struct opaque_vchi_connection_info_t VCHI_CONNECTION_STATE_T; + +typedef struct vchi_connection_t VCHI_CONNECTION_T; + + +/****************************************************************************** + API + *****************************************************************************/ + +// Routine to init a connection with a particular low level driver +typedef VCHI_CONNECTION_STATE_T * (*VCHI_CONNECTION_INIT_T)( struct vchi_connection_t * connection, + const VCHI_MESSAGE_DRIVER_T * driver ); + +// Routine to control CRC enabling at a connection level +typedef int32_t (*VCHI_CONNECTION_CRC_CONTROL_T)( VCHI_CONNECTION_STATE_T *state_handle, + VCHI_CRC_CONTROL_T control ); + +// Routine to create a service +typedef int32_t (*VCHI_CONNECTION_SERVICE_CONNECT_T)( VCHI_CONNECTION_STATE_T *state_handle, + int32_t service_id, + uint32_t rx_fifo_size, + uint32_t tx_fifo_size, + int server, + VCHI_CALLBACK_T callback, + void *callback_param, + int32_t want_crc, + int32_t want_unaligned_bulk_rx, + int32_t want_unaligned_bulk_tx, + VCHI_CONNECTION_SERVICE_HANDLE_T *service_handle ); + +// Routine to close a service +typedef int32_t (*VCHI_CONNECTION_SERVICE_DISCONNECT_T)( VCHI_CONNECTION_SERVICE_HANDLE_T service_handle ); + +// Routine to queue a message +typedef int32_t (*VCHI_CONNECTION_SERVICE_QUEUE_MESSAGE_T)( VCHI_CONNECTION_SERVICE_HANDLE_T service_handle, + const void *data, + uint32_t data_size, + VCHI_FLAGS_T flags, + void *msg_handle ); + +// scatter-gather (vector) message queueing +typedef int32_t (*VCHI_CONNECTION_SERVICE_QUEUE_MESSAGEV_T)( VCHI_CONNECTION_SERVICE_HANDLE_T service_handle, + VCHI_MSG_VECTOR_T *vector, + uint32_t count, + VCHI_FLAGS_T flags, + void *msg_handle ); + +// Routine to dequeue a message +typedef int32_t (*VCHI_CONNECTION_SERVICE_DEQUEUE_MESSAGE_T)( VCHI_CONNECTION_SERVICE_HANDLE_T service_handle, + void *data, + uint32_t max_data_size_to_read, + uint32_t *actual_msg_size, + VCHI_FLAGS_T flags ); + +// Routine to peek at a message +typedef int32_t (*VCHI_CONNECTION_SERVICE_PEEK_MESSAGE_T)( VCHI_CONNECTION_SERVICE_HANDLE_T service_handle, + void **data, + uint32_t *msg_size, + VCHI_FLAGS_T flags ); + +// Routine to hold a message +typedef int32_t (*VCHI_CONNECTION_SERVICE_HOLD_MESSAGE_T)( VCHI_CONNECTION_SERVICE_HANDLE_T service_handle, + void **data, + uint32_t *msg_size, + VCHI_FLAGS_T flags, + void **message_handle ); + +// Routine to initialise a received message iterator +typedef int32_t (*VCHI_CONNECTION_SERVICE_LOOKAHEAD_MESSAGE_T)( VCHI_CONNECTION_SERVICE_HANDLE_T service_handle, + VCHI_MSG_ITER_T *iter, + VCHI_FLAGS_T flags ); + +// Routine to release a held message +typedef int32_t (*VCHI_CONNECTION_HELD_MSG_RELEASE_T)( VCHI_CONNECTION_SERVICE_HANDLE_T service_handle, + void *message_handle ); + +// Routine to get info on a held message +typedef int32_t (*VCHI_CONNECTION_HELD_MSG_INFO_T)( VCHI_CONNECTION_SERVICE_HANDLE_T service_handle, + void *message_handle, + void **data, + int32_t *msg_size, + uint32_t *tx_timestamp, + uint32_t *rx_timestamp ); + +// Routine to check whether the iterator has a next message +typedef int32_t (*VCHI_CONNECTION_MSG_ITER_HAS_NEXT_T)( VCHI_CONNECTION_SERVICE_HANDLE_T service, + const VCHI_MSG_ITER_T *iter ); + +// Routine to advance the iterator +typedef int32_t (*VCHI_CONNECTION_MSG_ITER_NEXT_T)( VCHI_CONNECTION_SERVICE_HANDLE_T service, + VCHI_MSG_ITER_T *iter, + void **data, + uint32_t *msg_size ); + +// Routine to remove the last message returned by the iterator +typedef int32_t (*VCHI_CONNECTION_MSG_ITER_REMOVE_T)( VCHI_CONNECTION_SERVICE_HANDLE_T service, + VCHI_MSG_ITER_T *iter ); + +// Routine to hold the last message returned by the iterator +typedef int32_t (*VCHI_CONNECTION_MSG_ITER_HOLD_T)( VCHI_CONNECTION_SERVICE_HANDLE_T service, + VCHI_MSG_ITER_T *iter, + void **msg_handle ); + +// Routine to transmit bulk data +typedef int32_t (*VCHI_CONNECTION_BULK_QUEUE_TRANSMIT_T)( VCHI_CONNECTION_SERVICE_HANDLE_T service_handle, + const void *data_src, + uint32_t data_size, + VCHI_FLAGS_T flags, + void *bulk_handle ); + +// Routine to receive data +typedef int32_t (*VCHI_CONNECTION_BULK_QUEUE_RECEIVE_T)( VCHI_CONNECTION_SERVICE_HANDLE_T service_handle, + void *data_dst, + uint32_t data_size, + VCHI_FLAGS_T flags, + void *bulk_handle ); + +// Routine to report if a server is available +typedef int32_t (*VCHI_CONNECTION_SERVER_PRESENT)( VCHI_CONNECTION_STATE_T *state, int32_t service_id, int32_t peer_flags ); + +// Routine to report the number of RX slots available +typedef int (*VCHI_CONNECTION_RX_SLOTS_AVAILABLE)( const VCHI_CONNECTION_STATE_T *state ); + +// Routine to report the RX slot size +typedef uint32_t (*VCHI_CONNECTION_RX_SLOT_SIZE)( const VCHI_CONNECTION_STATE_T *state ); + +// Callback to indicate that the other side has added a buffer to the rx bulk DMA FIFO +typedef void (*VCHI_CONNECTION_RX_BULK_BUFFER_ADDED)(VCHI_CONNECTION_STATE_T *state, + int32_t service, + uint32_t length, + MESSAGE_TX_CHANNEL_T channel, + uint32_t channel_params, + uint32_t data_length, + uint32_t data_offset); + +// Callback to inform a service that a Xon or Xoff message has been received +typedef void (*VCHI_CONNECTION_FLOW_CONTROL)(VCHI_CONNECTION_STATE_T *state, int32_t service_id, int32_t xoff); + +// Callback to inform a service that a server available reply message has been received +typedef void (*VCHI_CONNECTION_SERVER_AVAILABLE_REPLY)(VCHI_CONNECTION_STATE_T *state, int32_t service_id, uint32_t flags); + +// Callback to indicate that bulk auxiliary messages have arrived +typedef void (*VCHI_CONNECTION_BULK_AUX_RECEIVED)(VCHI_CONNECTION_STATE_T *state); + +// Callback to indicate that bulk auxiliary messages have arrived +typedef void (*VCHI_CONNECTION_BULK_AUX_TRANSMITTED)(VCHI_CONNECTION_STATE_T *state, void *handle); + +// Callback with all the connection info you require +typedef void (*VCHI_CONNECTION_INFO)(VCHI_CONNECTION_STATE_T *state, uint32_t protocol_version, uint32_t slot_size, uint32_t num_slots, uint32_t min_bulk_size); + +// Callback to inform of a disconnect +typedef void (*VCHI_CONNECTION_DISCONNECT)(VCHI_CONNECTION_STATE_T *state, uint32_t flags); + +// Callback to inform of a power control request +typedef void (*VCHI_CONNECTION_POWER_CONTROL)(VCHI_CONNECTION_STATE_T *state, MESSAGE_TX_CHANNEL_T channel, int32_t enable); + +// allocate memory suitably aligned for this connection +typedef void * (*VCHI_BUFFER_ALLOCATE)(VCHI_CONNECTION_SERVICE_HANDLE_T service_handle, uint32_t * length); + +// free memory allocated by buffer_allocate +typedef void (*VCHI_BUFFER_FREE)(VCHI_CONNECTION_SERVICE_HANDLE_T service_handle, void * address); + + +/****************************************************************************** + System driver struct + *****************************************************************************/ + +struct opaque_vchi_connection_api_t +{ + // Routine to init the connection + VCHI_CONNECTION_INIT_T init; + + // Connection-level CRC control + VCHI_CONNECTION_CRC_CONTROL_T crc_control; + + // Routine to connect to or create service + VCHI_CONNECTION_SERVICE_CONNECT_T service_connect; + + // Routine to disconnect from a service + VCHI_CONNECTION_SERVICE_DISCONNECT_T service_disconnect; + + // Routine to queue a message + VCHI_CONNECTION_SERVICE_QUEUE_MESSAGE_T service_queue_msg; + + // scatter-gather (vector) message queue + VCHI_CONNECTION_SERVICE_QUEUE_MESSAGEV_T service_queue_msgv; + + // Routine to dequeue a message + VCHI_CONNECTION_SERVICE_DEQUEUE_MESSAGE_T service_dequeue_msg; + + // Routine to peek at a message + VCHI_CONNECTION_SERVICE_PEEK_MESSAGE_T service_peek_msg; + + // Routine to hold a message + VCHI_CONNECTION_SERVICE_HOLD_MESSAGE_T service_hold_msg; + + // Routine to initialise a received message iterator + VCHI_CONNECTION_SERVICE_LOOKAHEAD_MESSAGE_T service_look_ahead_msg; + + // Routine to release a message + VCHI_CONNECTION_HELD_MSG_RELEASE_T held_msg_release; + + // Routine to get information on a held message + VCHI_CONNECTION_HELD_MSG_INFO_T held_msg_info; + + // Routine to check for next message on iterator + VCHI_CONNECTION_MSG_ITER_HAS_NEXT_T msg_iter_has_next; + + // Routine to get next message on iterator + VCHI_CONNECTION_MSG_ITER_NEXT_T msg_iter_next; + + // Routine to remove the last message returned by iterator + VCHI_CONNECTION_MSG_ITER_REMOVE_T msg_iter_remove; + + // Routine to hold the last message returned by iterator + VCHI_CONNECTION_MSG_ITER_HOLD_T msg_iter_hold; + + // Routine to transmit bulk data + VCHI_CONNECTION_BULK_QUEUE_TRANSMIT_T bulk_queue_transmit; + + // Routine to receive data + VCHI_CONNECTION_BULK_QUEUE_RECEIVE_T bulk_queue_receive; + + // Routine to report the available servers + VCHI_CONNECTION_SERVER_PRESENT server_present; + + // Routine to report the number of RX slots available + VCHI_CONNECTION_RX_SLOTS_AVAILABLE connection_rx_slots_available; + + // Routine to report the RX slot size + VCHI_CONNECTION_RX_SLOT_SIZE connection_rx_slot_size; + + // Callback to indicate that the other side has added a buffer to the rx bulk DMA FIFO + VCHI_CONNECTION_RX_BULK_BUFFER_ADDED rx_bulk_buffer_added; + + // Callback to inform a service that a Xon or Xoff message has been received + VCHI_CONNECTION_FLOW_CONTROL flow_control; + + // Callback to inform a service that a server available reply message has been received + VCHI_CONNECTION_SERVER_AVAILABLE_REPLY server_available_reply; + + // Callback to indicate that bulk auxiliary messages have arrived + VCHI_CONNECTION_BULK_AUX_RECEIVED bulk_aux_received; + + // Callback to indicate that a bulk auxiliary message has been transmitted + VCHI_CONNECTION_BULK_AUX_TRANSMITTED bulk_aux_transmitted; + + // Callback to provide information about the connection + VCHI_CONNECTION_INFO connection_info; + + // Callback to notify that peer has requested disconnect + VCHI_CONNECTION_DISCONNECT disconnect; + + // Callback to notify that peer has requested power change + VCHI_CONNECTION_POWER_CONTROL power_control; + + // allocate memory suitably aligned for this connection + VCHI_BUFFER_ALLOCATE buffer_allocate; + + // free memory allocated by buffer_allocate + VCHI_BUFFER_FREE buffer_free; + +}; + +struct vchi_connection_t { + const VCHI_CONNECTION_API_T *api; + VCHI_CONNECTION_STATE_T *state; +#ifdef VCHI_COARSE_LOCKING + struct semaphore sem; +#endif +}; + + +#endif /* CONNECTION_H_ */ + +/****************************** End of file **********************************/ Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchi/connections/connection.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchi/message_drivers/message.h =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchi/message_drivers/message.h (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchi/message_drivers/message.h (revision 278312) @@ -0,0 +1,200 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VCHI_MESSAGE_H_ +#define _VCHI_MESSAGE_H_ + +#include "interface/vchi/vchi_cfg_internal.h" +#include "interface/vchi/vchi_common.h" + + +typedef enum message_event_type { + MESSAGE_EVENT_NONE, + MESSAGE_EVENT_NOP, + MESSAGE_EVENT_MESSAGE, + MESSAGE_EVENT_SLOT_COMPLETE, + MESSAGE_EVENT_RX_BULK_PAUSED, + MESSAGE_EVENT_RX_BULK_COMPLETE, + MESSAGE_EVENT_TX_COMPLETE, + MESSAGE_EVENT_MSG_DISCARDED +} MESSAGE_EVENT_TYPE_T; + +typedef enum vchi_msg_flags +{ + VCHI_MSG_FLAGS_NONE = 0x0, + VCHI_MSG_FLAGS_TERMINATE_DMA = 0x1 +} VCHI_MSG_FLAGS_T; + +typedef enum message_tx_channel +{ + MESSAGE_TX_CHANNEL_MESSAGE = 0, + MESSAGE_TX_CHANNEL_BULK = 1 // drivers may provide multiple bulk channels, from 1 upwards +} MESSAGE_TX_CHANNEL_T; + +// Macros used for cycling through bulk channels +#define MESSAGE_TX_CHANNEL_BULK_PREV(c) (MESSAGE_TX_CHANNEL_BULK+((c)-MESSAGE_TX_CHANNEL_BULK+VCHI_MAX_BULK_TX_CHANNELS_PER_CONNECTION-1)%VCHI_MAX_BULK_TX_CHANNELS_PER_CONNECTION) +#define MESSAGE_TX_CHANNEL_BULK_NEXT(c) (MESSAGE_TX_CHANNEL_BULK+((c)-MESSAGE_TX_CHANNEL_BULK+1)%VCHI_MAX_BULK_TX_CHANNELS_PER_CONNECTION) + +typedef enum message_rx_channel +{ + MESSAGE_RX_CHANNEL_MESSAGE = 0, + MESSAGE_RX_CHANNEL_BULK = 1 // drivers may provide multiple bulk channels, from 1 upwards +} MESSAGE_RX_CHANNEL_T; + +// Message receive slot information +typedef struct rx_msg_slot_info { + + struct rx_msg_slot_info *next; + //struct slot_info *prev; +#if !defined VCHI_COARSE_LOCKING + struct semaphore sem; +#endif + + uint8_t *addr; // base address of slot + uint32_t len; // length of slot in bytes + + uint32_t write_ptr; // hardware causes this to advance + uint32_t read_ptr; // this module does the reading + int active; // is this slot in the hardware dma fifo? + uint32_t msgs_parsed; // count how many messages are in this slot + uint32_t msgs_released; // how many messages have been released + void *state; // connection state information + uint8_t ref_count[VCHI_MAX_SERVICES_PER_CONNECTION]; // reference count for slots held by services +} RX_MSG_SLOTINFO_T; + +// The message driver no longer needs to know about the fields of RX_BULK_SLOTINFO_T - sort this out. +// In particular, it mustn't use addr and len - they're the client buffer, but the message +// driver will be tasked with sending the aligned core section. +typedef struct rx_bulk_slotinfo_t { + struct rx_bulk_slotinfo_t *next; + + struct semaphore *blocking; + + // needed by DMA + void *addr; + uint32_t len; + + // needed for the callback + void *service; + void *handle; + VCHI_FLAGS_T flags; +} RX_BULK_SLOTINFO_T; + + +/* ---------------------------------------------------------------------- + * each connection driver will have a pool of the following struct. + * + * the pool will be managed by vchi_qman_* + * this means there will be multiple queues (single linked lists) + * a given struct message_info will be on exactly one of these queues + * at any one time + * -------------------------------------------------------------------- */ +typedef struct rx_message_info { + + struct message_info *next; + //struct message_info *prev; + + uint8_t *addr; + uint32_t len; + RX_MSG_SLOTINFO_T *slot; // points to whichever slot contains this message + uint32_t tx_timestamp; + uint32_t rx_timestamp; + +} RX_MESSAGE_INFO_T; + +typedef struct { + MESSAGE_EVENT_TYPE_T type; + + struct { + // for messages + void *addr; // address of message + uint16_t slot_delta; // whether this message indicated slot delta + uint32_t len; // length of message + RX_MSG_SLOTINFO_T *slot; // slot this message is in + int32_t service; // service id this message is destined for + uint32_t tx_timestamp; // timestamp from the header + uint32_t rx_timestamp; // timestamp when we parsed it + } message; + + // FIXME: cleanup slot reporting... + RX_MSG_SLOTINFO_T *rx_msg; + RX_BULK_SLOTINFO_T *rx_bulk; + void *tx_handle; + MESSAGE_TX_CHANNEL_T tx_channel; + +} MESSAGE_EVENT_T; + + +// callbacks +typedef void VCHI_MESSAGE_DRIVER_EVENT_CALLBACK_T( void *state ); + +typedef struct { + VCHI_MESSAGE_DRIVER_EVENT_CALLBACK_T *event_callback; +} VCHI_MESSAGE_DRIVER_OPEN_T; + + +// handle to this instance of message driver (as returned by ->open) +typedef struct opaque_mhandle_t *VCHI_MDRIVER_HANDLE_T; + +struct opaque_vchi_message_driver_t { + VCHI_MDRIVER_HANDLE_T *(*open)( VCHI_MESSAGE_DRIVER_OPEN_T *params, void *state ); + int32_t (*suspending)( VCHI_MDRIVER_HANDLE_T *handle ); + int32_t (*resumed)( VCHI_MDRIVER_HANDLE_T *handle ); + int32_t (*power_control)( VCHI_MDRIVER_HANDLE_T *handle, MESSAGE_TX_CHANNEL_T, int32_t enable ); + int32_t (*add_msg_rx_slot)( VCHI_MDRIVER_HANDLE_T *handle, RX_MSG_SLOTINFO_T *slot ); // rx message + int32_t (*add_bulk_rx)( VCHI_MDRIVER_HANDLE_T *handle, void *data, uint32_t len, RX_BULK_SLOTINFO_T *slot ); // rx data (bulk) + int32_t (*send)( VCHI_MDRIVER_HANDLE_T *handle, MESSAGE_TX_CHANNEL_T channel, const void *data, uint32_t len, VCHI_MSG_FLAGS_T flags, void *send_handle ); // tx (message & bulk) + void (*next_event)( VCHI_MDRIVER_HANDLE_T *handle, MESSAGE_EVENT_T *event ); // get the next event from message_driver + int32_t (*enable)( VCHI_MDRIVER_HANDLE_T *handle ); + int32_t (*form_message)( VCHI_MDRIVER_HANDLE_T *handle, int32_t service_id, VCHI_MSG_VECTOR_T *vector, uint32_t count, void + *address, uint32_t length_avail, uint32_t max_total_length, int32_t pad_to_fill, int32_t allow_partial ); + + int32_t (*update_message)( VCHI_MDRIVER_HANDLE_T *handle, void *dest, int16_t *slot_count ); + int32_t (*buffer_aligned)( VCHI_MDRIVER_HANDLE_T *handle, int tx, int uncached, const void *address, const uint32_t length ); + void * (*allocate_buffer)( VCHI_MDRIVER_HANDLE_T *handle, uint32_t *length ); + void (*free_buffer)( VCHI_MDRIVER_HANDLE_T *handle, void *address ); + int (*rx_slot_size)( VCHI_MDRIVER_HANDLE_T *handle, int msg_size ); + int (*tx_slot_size)( VCHI_MDRIVER_HANDLE_T *handle, int msg_size ); + + int32_t (*tx_supports_terminate)( const VCHI_MDRIVER_HANDLE_T *handle, MESSAGE_TX_CHANNEL_T channel ); + uint32_t (*tx_bulk_chunk_size)( const VCHI_MDRIVER_HANDLE_T *handle, MESSAGE_TX_CHANNEL_T channel ); + int (*tx_alignment)( const VCHI_MDRIVER_HANDLE_T *handle, MESSAGE_TX_CHANNEL_T channel ); + int (*rx_alignment)( const VCHI_MDRIVER_HANDLE_T *handle, MESSAGE_RX_CHANNEL_T channel ); + void (*form_bulk_aux)( VCHI_MDRIVER_HANDLE_T *handle, MESSAGE_TX_CHANNEL_T channel, const void *data, uint32_t len, uint32_t chunk_size, const void **aux_data, int32_t *aux_len ); + void (*debug)( VCHI_MDRIVER_HANDLE_T *handle ); +}; + + +#endif // _VCHI_MESSAGE_H_ + +/****************************** End of file ***********************************/ Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchi/message_drivers/message.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchi/vchi.h =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchi/vchi.h (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchi/vchi.h (revision 278312) @@ -0,0 +1,373 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef VCHI_H_ +#define VCHI_H_ + +#include "interface/vchi/vchi_cfg.h" +#include "interface/vchi/vchi_common.h" +#include "interface/vchi/connections/connection.h" +#include "vchi_mh.h" + + +/****************************************************************************** + Global defs + *****************************************************************************/ + +#define VCHI_BULK_ROUND_UP(x) ((((unsigned long)(x))+VCHI_BULK_ALIGN-1) & ~(VCHI_BULK_ALIGN-1)) +#define VCHI_BULK_ROUND_DOWN(x) (((unsigned long)(x)) & ~(VCHI_BULK_ALIGN-1)) +#define VCHI_BULK_ALIGN_NBYTES(x) (VCHI_BULK_ALIGNED(x) ? 0 : (VCHI_BULK_ALIGN - ((unsigned long)(x) & (VCHI_BULK_ALIGN-1)))) + +#ifdef USE_VCHIQ_ARM +#define VCHI_BULK_ALIGNED(x) 1 +#else +#define VCHI_BULK_ALIGNED(x) (((unsigned long)(x) & (VCHI_BULK_ALIGN-1)) == 0) +#endif + +struct vchi_version { + uint32_t version; + uint32_t version_min; +}; +#define VCHI_VERSION(v_) { v_, v_ } +#define VCHI_VERSION_EX(v_, m_) { v_, m_ } + +typedef enum +{ + VCHI_VEC_POINTER, + VCHI_VEC_HANDLE, + VCHI_VEC_LIST +} VCHI_MSG_VECTOR_TYPE_T; + +typedef struct vchi_msg_vector_ex { + + VCHI_MSG_VECTOR_TYPE_T type; + union + { + // a memory handle + struct + { + VCHI_MEM_HANDLE_T handle; + uint32_t offset; + int32_t vec_len; + } handle; + + // an ordinary data pointer + struct + { + const void *vec_base; + int32_t vec_len; + } ptr; + + // a nested vector list + struct + { + struct vchi_msg_vector_ex *vec; + uint32_t vec_len; + } list; + } u; +} VCHI_MSG_VECTOR_EX_T; + + +// Construct an entry in a msg vector for a pointer (p) of length (l) +#define VCHI_VEC_POINTER(p,l) VCHI_VEC_POINTER, { { (VCHI_MEM_HANDLE_T)(p), (l) } } + +// Construct an entry in a msg vector for a message handle (h), starting at offset (o) of length (l) +#define VCHI_VEC_HANDLE(h,o,l) VCHI_VEC_HANDLE, { { (h), (o), (l) } } + +// Macros to manipulate 'FOURCC' values +#define MAKE_FOURCC(x) ((int32_t)( (x[0] << 24) | (x[1] << 16) | (x[2] << 8) | x[3] )) +#define FOURCC_TO_CHAR(x) (x >> 24) & 0xFF,(x >> 16) & 0xFF,(x >> 8) & 0xFF, x & 0xFF + + +// Opaque service information +struct opaque_vchi_service_t; + +// Descriptor for a held message. Allocated by client, initialised by vchi_msg_hold, +// vchi_msg_iter_hold or vchi_msg_iter_hold_next. Fields are for internal VCHI use only. +typedef struct +{ + struct opaque_vchi_service_t *service; + void *message; +} VCHI_HELD_MSG_T; + + + +// structure used to provide the information needed to open a server or a client +typedef struct { + struct vchi_version version; + int32_t service_id; + VCHI_CONNECTION_T *connection; + uint32_t rx_fifo_size; + uint32_t tx_fifo_size; + VCHI_CALLBACK_T callback; + void *callback_param; + /* client intends to receive bulk transfers of + odd lengths or into unaligned buffers */ + int32_t want_unaligned_bulk_rx; + /* client intends to transmit bulk transfers of + odd lengths or out of unaligned buffers */ + int32_t want_unaligned_bulk_tx; + /* client wants to check CRCs on (bulk) xfers. + Only needs to be set at 1 end - will do both directions. */ + int32_t want_crc; +} SERVICE_CREATION_T; + +// Opaque handle for a VCHI instance +typedef struct opaque_vchi_instance_handle_t *VCHI_INSTANCE_T; + +// Opaque handle for a server or client +typedef struct opaque_vchi_service_handle_t *VCHI_SERVICE_HANDLE_T; + +// Service registration & startup +typedef void (*VCHI_SERVICE_INIT)(VCHI_INSTANCE_T initialise_instance, VCHI_CONNECTION_T **connections, uint32_t num_connections); + +typedef struct service_info_tag { + const char * const vll_filename; /* VLL to load to start this service. This is an empty string if VLL is "static" */ + VCHI_SERVICE_INIT init; /* Service initialisation function */ + void *vll_handle; /* VLL handle; NULL when unloaded or a "static VLL" in build */ +} SERVICE_INFO_T; + +/****************************************************************************** + Global funcs - implementation is specific to which side you are on (local / remote) + *****************************************************************************/ + +#ifdef __cplusplus +extern "C" { +#endif + +extern /*@observer@*/ VCHI_CONNECTION_T * vchi_create_connection( const VCHI_CONNECTION_API_T * function_table, + const VCHI_MESSAGE_DRIVER_T * low_level); + + +// Routine used to initialise the vchi on both local + remote connections +extern int32_t vchi_initialise( VCHI_INSTANCE_T *instance_handle ); + +extern int32_t vchi_exit( void ); + +extern int32_t vchi_connect( VCHI_CONNECTION_T **connections, + const uint32_t num_connections, + VCHI_INSTANCE_T instance_handle ); + +//When this is called, ensure that all services have no data pending. +//Bulk transfers can remain 'queued' +extern int32_t vchi_disconnect( VCHI_INSTANCE_T instance_handle ); + +// Global control over bulk CRC checking +extern int32_t vchi_crc_control( VCHI_CONNECTION_T *connection, + VCHI_CRC_CONTROL_T control ); + +// helper functions +extern void * vchi_allocate_buffer(VCHI_SERVICE_HANDLE_T handle, uint32_t *length); +extern void vchi_free_buffer(VCHI_SERVICE_HANDLE_T handle, void *address); +extern uint32_t vchi_current_time(VCHI_INSTANCE_T instance_handle); + + +/****************************************************************************** + Global service API + *****************************************************************************/ +// Routine to create a named service +extern int32_t vchi_service_create( VCHI_INSTANCE_T instance_handle, + SERVICE_CREATION_T *setup, + VCHI_SERVICE_HANDLE_T *handle ); + +// Routine to destory a service +extern int32_t vchi_service_destroy( const VCHI_SERVICE_HANDLE_T handle ); + +// Routine to open a named service +extern int32_t vchi_service_open( VCHI_INSTANCE_T instance_handle, + SERVICE_CREATION_T *setup, + VCHI_SERVICE_HANDLE_T *handle); + +extern int32_t vchi_get_peer_version( const VCHI_SERVICE_HANDLE_T handle, + short *peer_version ); + +// Routine to close a named service +extern int32_t vchi_service_close( const VCHI_SERVICE_HANDLE_T handle ); + +// Routine to increment ref count on a named service +extern int32_t vchi_service_use( const VCHI_SERVICE_HANDLE_T handle ); + +// Routine to decrement ref count on a named service +extern int32_t vchi_service_release( const VCHI_SERVICE_HANDLE_T handle ); + +// Routine to send a message accross a service +extern int32_t vchi_msg_queue( VCHI_SERVICE_HANDLE_T handle, + const void *data, + uint32_t data_size, + VCHI_FLAGS_T flags, + void *msg_handle ); + +// scatter-gather (vector) and send message +int32_t vchi_msg_queuev_ex( VCHI_SERVICE_HANDLE_T handle, + VCHI_MSG_VECTOR_EX_T *vector, + uint32_t count, + VCHI_FLAGS_T flags, + void *msg_handle ); + +// legacy scatter-gather (vector) and send message, only handles pointers +int32_t vchi_msg_queuev( VCHI_SERVICE_HANDLE_T handle, + VCHI_MSG_VECTOR_T *vector, + uint32_t count, + VCHI_FLAGS_T flags, + void *msg_handle ); + +// Routine to receive a msg from a service +// Dequeue is equivalent to hold, copy into client buffer, release +extern int32_t vchi_msg_dequeue( VCHI_SERVICE_HANDLE_T handle, + void *data, + uint32_t max_data_size_to_read, + uint32_t *actual_msg_size, + VCHI_FLAGS_T flags ); + +// Routine to look at a message in place. +// The message is not dequeued, so a subsequent call to peek or dequeue +// will return the same message. +extern int32_t vchi_msg_peek( VCHI_SERVICE_HANDLE_T handle, + void **data, + uint32_t *msg_size, + VCHI_FLAGS_T flags ); + +// Routine to remove a message after it has been read in place with peek +// The first message on the queue is dequeued. +extern int32_t vchi_msg_remove( VCHI_SERVICE_HANDLE_T handle ); + +// Routine to look at a message in place. +// The message is dequeued, so the caller is left holding it; the descriptor is +// filled in and must be released when the user has finished with the message. +extern int32_t vchi_msg_hold( VCHI_SERVICE_HANDLE_T handle, + void **data, // } may be NULL, as info can be + uint32_t *msg_size, // } obtained from HELD_MSG_T + VCHI_FLAGS_T flags, + VCHI_HELD_MSG_T *message_descriptor ); + +// Initialise an iterator to look through messages in place +extern int32_t vchi_msg_look_ahead( VCHI_SERVICE_HANDLE_T handle, + VCHI_MSG_ITER_T *iter, + VCHI_FLAGS_T flags ); + +/****************************************************************************** + Global service support API - operations on held messages and message iterators + *****************************************************************************/ + +// Routine to get the address of a held message +extern void *vchi_held_msg_ptr( const VCHI_HELD_MSG_T *message ); + +// Routine to get the size of a held message +extern int32_t vchi_held_msg_size( const VCHI_HELD_MSG_T *message ); + +// Routine to get the transmit timestamp as written into the header by the peer +extern uint32_t vchi_held_msg_tx_timestamp( const VCHI_HELD_MSG_T *message ); + +// Routine to get the reception timestamp, written as we parsed the header +extern uint32_t vchi_held_msg_rx_timestamp( const VCHI_HELD_MSG_T *message ); + +// Routine to release a held message after it has been processed +extern int32_t vchi_held_msg_release( VCHI_HELD_MSG_T *message ); + +// Indicates whether the iterator has a next message. +extern int32_t vchi_msg_iter_has_next( const VCHI_MSG_ITER_T *iter ); + +// Return the pointer and length for the next message and advance the iterator. +extern int32_t vchi_msg_iter_next( VCHI_MSG_ITER_T *iter, + void **data, + uint32_t *msg_size ); + +// Remove the last message returned by vchi_msg_iter_next. +// Can only be called once after each call to vchi_msg_iter_next. +extern int32_t vchi_msg_iter_remove( VCHI_MSG_ITER_T *iter ); + +// Hold the last message returned by vchi_msg_iter_next. +// Can only be called once after each call to vchi_msg_iter_next. +extern int32_t vchi_msg_iter_hold( VCHI_MSG_ITER_T *iter, + VCHI_HELD_MSG_T *message ); + +// Return information for the next message, and hold it, advancing the iterator. +extern int32_t vchi_msg_iter_hold_next( VCHI_MSG_ITER_T *iter, + void **data, // } may be NULL + uint32_t *msg_size, // } + VCHI_HELD_MSG_T *message ); + + +/****************************************************************************** + Global bulk API + *****************************************************************************/ + +// Routine to prepare interface for a transfer from the other side +extern int32_t vchi_bulk_queue_receive( VCHI_SERVICE_HANDLE_T handle, + void *data_dst, + uint32_t data_size, + VCHI_FLAGS_T flags, + void *transfer_handle ); + + +// Prepare interface for a transfer from the other side into relocatable memory. +int32_t vchi_bulk_queue_receive_reloc( const VCHI_SERVICE_HANDLE_T handle, + VCHI_MEM_HANDLE_T h_dst, + uint32_t offset, + uint32_t data_size, + const VCHI_FLAGS_T flags, + void * const bulk_handle ); + +// Routine to queue up data ready for transfer to the other (once they have signalled they are ready) +extern int32_t vchi_bulk_queue_transmit( VCHI_SERVICE_HANDLE_T handle, + void *data_src, + uint32_t data_size, + VCHI_FLAGS_T flags, + void *transfer_handle ); + + +/****************************************************************************** + Configuration plumbing + *****************************************************************************/ + +// function prototypes for the different mid layers (the state info gives the different physical connections) +extern const VCHI_CONNECTION_API_T *single_get_func_table( void ); +//extern const VCHI_CONNECTION_API_T *local_server_get_func_table( void ); +//extern const VCHI_CONNECTION_API_T *local_client_get_func_table( void ); + +// declare all message drivers here +const VCHI_MESSAGE_DRIVER_T *vchi_mphi_message_driver_func_table( void ); + +#ifdef __cplusplus +} +#endif + +extern int32_t vchi_bulk_queue_transmit_reloc( VCHI_SERVICE_HANDLE_T handle, + VCHI_MEM_HANDLE_T h_src, + uint32_t offset, + uint32_t data_size, + VCHI_FLAGS_T flags, + void *transfer_handle ); +#endif /* VCHI_H_ */ + +/****************************** End of file **********************************/ Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchi/vchi.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchi/vchi_cfg.h =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchi/vchi_cfg.h (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchi/vchi_cfg.h (revision 278312) @@ -0,0 +1,224 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef VCHI_CFG_H_ +#define VCHI_CFG_H_ + +/**************************************************************************************** + * Defines in this first section are part of the VCHI API and may be examined by VCHI + * services. + ***************************************************************************************/ + +/* Required alignment of base addresses for bulk transfer, if unaligned transfers are not enabled */ +/* Really determined by the message driver, and should be available from a run-time call. */ +#ifndef VCHI_BULK_ALIGN +# if __VCCOREVER__ >= 0x04000000 +# define VCHI_BULK_ALIGN 32 // Allows for the need to do cache cleans +# else +# define VCHI_BULK_ALIGN 16 +# endif +#endif + +/* Required length multiple for bulk transfers, if unaligned transfers are not enabled */ +/* May be less than or greater than VCHI_BULK_ALIGN */ +/* Really determined by the message driver, and should be available from a run-time call. */ +#ifndef VCHI_BULK_GRANULARITY +# if __VCCOREVER__ >= 0x04000000 +# define VCHI_BULK_GRANULARITY 32 // Allows for the need to do cache cleans +# else +# define VCHI_BULK_GRANULARITY 16 +# endif +#endif + +/* The largest possible message to be queued with vchi_msg_queue. */ +#ifndef VCHI_MAX_MSG_SIZE +# if defined VCHI_LOCAL_HOST_PORT +# define VCHI_MAX_MSG_SIZE 16384 // makes file transfers fast, but should they be using bulk? +# else +# define VCHI_MAX_MSG_SIZE 4096 // NOTE: THIS MUST BE LARGER THAN OR EQUAL TO THE SIZE OF THE KHRONOS MERGE BUFFER!! +# endif +#endif + +/****************************************************************************************** + * Defines below are system configuration options, and should not be used by VCHI services. + *****************************************************************************************/ + +/* How many connections can we support? A localhost implementation uses 2 connections, + * 1 for host-app, 1 for VMCS, and these are hooked together by a loopback MPHI VCFW + * driver. */ +#ifndef VCHI_MAX_NUM_CONNECTIONS +# define VCHI_MAX_NUM_CONNECTIONS 3 +#endif + +/* How many services can we open per connection? Extending this doesn't cost processing time, just a small + * amount of static memory. */ +#ifndef VCHI_MAX_SERVICES_PER_CONNECTION +# define VCHI_MAX_SERVICES_PER_CONNECTION 36 +#endif + +/* Adjust if using a message driver that supports more logical TX channels */ +#ifndef VCHI_MAX_BULK_TX_CHANNELS_PER_CONNECTION +# define VCHI_MAX_BULK_TX_CHANNELS_PER_CONNECTION 9 // 1 MPHI + 8 CCP2 logical channels +#endif + +/* Adjust if using a message driver that supports more logical RX channels */ +#ifndef VCHI_MAX_BULK_RX_CHANNELS_PER_CONNECTION +# define VCHI_MAX_BULK_RX_CHANNELS_PER_CONNECTION 1 // 1 MPHI +#endif + +/* How many receive slots do we use. This times VCHI_MAX_MSG_SIZE gives the effective + * receive queue space, less message headers. */ +#ifndef VCHI_NUM_READ_SLOTS +# if defined(VCHI_LOCAL_HOST_PORT) +# define VCHI_NUM_READ_SLOTS 4 +# else +# define VCHI_NUM_READ_SLOTS 48 +# endif +#endif + +/* Do we utilise overrun facility for receive message slots? Can aid peer transmit + * performance. Only define on VideoCore end, talking to host. + */ +//#define VCHI_MSG_RX_OVERRUN + +/* How many transmit slots do we use. Generally don't need many, as the hardware driver + * underneath VCHI will usually have its own buffering. */ +#ifndef VCHI_NUM_WRITE_SLOTS +# define VCHI_NUM_WRITE_SLOTS 4 +#endif + +/* If a service has held or queued received messages in VCHI_XOFF_THRESHOLD or more slots, + * then it's taking up too much buffer space, and the peer service will be told to stop + * transmitting with an XOFF message. For this to be effective, the VCHI_NUM_READ_SLOTS + * needs to be considerably bigger than VCHI_NUM_WRITE_SLOTS, or the transmit latency + * is too high. */ +#ifndef VCHI_XOFF_THRESHOLD +# define VCHI_XOFF_THRESHOLD (VCHI_NUM_READ_SLOTS / 2) +#endif + +/* After we've sent an XOFF, the peer will be told to resume transmission once the local + * service has dequeued/released enough messages that it's now occupying + * VCHI_XON_THRESHOLD slots or fewer. */ +#ifndef VCHI_XON_THRESHOLD +# define VCHI_XON_THRESHOLD (VCHI_NUM_READ_SLOTS / 4) +#endif + +/* A size below which a bulk transfer omits the handshake completely and always goes + * via the message channel, if bulk auxiliary is being sent on that service. (The user + * can guarantee this by enabling unaligned transmits). + * Not API. */ +#ifndef VCHI_MIN_BULK_SIZE +# define VCHI_MIN_BULK_SIZE ( VCHI_MAX_MSG_SIZE / 2 < 4096 ? VCHI_MAX_MSG_SIZE / 2 : 4096 ) +#endif + +/* Maximum size of bulk transmission chunks, for each interface type. A trade-off between + * speed and latency; the smaller the chunk size the better change of messages and other + * bulk transmissions getting in when big bulk transfers are happening. Set to 0 to not + * break transmissions into chunks. + */ +#ifndef VCHI_MAX_BULK_CHUNK_SIZE_MPHI +# define VCHI_MAX_BULK_CHUNK_SIZE_MPHI (16 * 1024) +#endif + +/* NB Chunked CCP2 transmissions violate the letter of the CCP2 spec by using "JPEG8" mode + * with multiple-line frames. Only use if the receiver can cope. */ +#ifndef VCHI_MAX_BULK_CHUNK_SIZE_CCP2 +# define VCHI_MAX_BULK_CHUNK_SIZE_CCP2 0 +#endif + +/* How many TX messages can we have pending in our transmit slots. Once exhausted, + * vchi_msg_queue will be blocked. */ +#ifndef VCHI_TX_MSG_QUEUE_SIZE +# define VCHI_TX_MSG_QUEUE_SIZE 256 +#endif + +/* How many RX messages can we have parsed in the receive slots. Once exhausted, parsing + * will be suspended until older messages are dequeued/released. */ +#ifndef VCHI_RX_MSG_QUEUE_SIZE +# define VCHI_RX_MSG_QUEUE_SIZE 256 +#endif + +/* Really should be able to cope if we run out of received message descriptors, by + * suspending parsing as the comment above says, but we don't. This sweeps the issue + * under the carpet. */ +#if VCHI_RX_MSG_QUEUE_SIZE < (VCHI_MAX_MSG_SIZE/16 + 1) * VCHI_NUM_READ_SLOTS +# undef VCHI_RX_MSG_QUEUE_SIZE +# define VCHI_RX_MSG_QUEUE_SIZE (VCHI_MAX_MSG_SIZE/16 + 1) * VCHI_NUM_READ_SLOTS +#endif + +/* How many bulk transmits can we have pending. Once exhausted, vchi_bulk_queue_transmit + * will be blocked. */ +#ifndef VCHI_TX_BULK_QUEUE_SIZE +# define VCHI_TX_BULK_QUEUE_SIZE 64 +#endif + +/* How many bulk receives can we have pending. Once exhausted, vchi_bulk_queue_receive + * will be blocked. */ +#ifndef VCHI_RX_BULK_QUEUE_SIZE +# define VCHI_RX_BULK_QUEUE_SIZE 64 +#endif + +/* A limit on how many outstanding bulk requests we expect the peer to give us. If + * the peer asks for more than this, VCHI will fail and assert. The number is determined + * by the peer's hardware - it's the number of outstanding requests that can be queued + * on all bulk channels. VC3's MPHI peripheral allows 16. */ +#ifndef VCHI_MAX_PEER_BULK_REQUESTS +# define VCHI_MAX_PEER_BULK_REQUESTS 32 +#endif + +/* Define VCHI_CCP2TX_MANUAL_POWER if the host tells us when to turn the CCP2 + * transmitter on and off. + */ +/*#define VCHI_CCP2TX_MANUAL_POWER*/ + +#ifndef VCHI_CCP2TX_MANUAL_POWER + +/* Timeout (in milliseconds) for putting the CCP2TX interface into IDLE state. Set + * negative for no IDLE. + */ +# ifndef VCHI_CCP2TX_IDLE_TIMEOUT +# define VCHI_CCP2TX_IDLE_TIMEOUT 5 +# endif + +/* Timeout (in milliseconds) for putting the CCP2TX interface into OFF state. Set + * negative for no OFF. + */ +# ifndef VCHI_CCP2TX_OFF_TIMEOUT +# define VCHI_CCP2TX_OFF_TIMEOUT 1000 +# endif + +#endif /* VCHI_CCP2TX_MANUAL_POWER */ + +#endif /* VCHI_CFG_H_ */ + +/****************************** End of file **********************************/ Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchi/vchi_cfg.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchi/vchi_cfg_internal.h =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchi/vchi_cfg_internal.h (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchi/vchi_cfg_internal.h (revision 278312) @@ -0,0 +1,71 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef VCHI_CFG_INTERNAL_H_ +#define VCHI_CFG_INTERNAL_H_ + +/**************************************************************************************** + * Control optimisation attempts. + ***************************************************************************************/ + +// Don't use lots of short-term locks - use great long ones, reducing the overall locks-per-second +#define VCHI_COARSE_LOCKING + +// Avoid lock then unlock on exit from blocking queue operations (msg tx, bulk rx/tx) +// (only relevant if VCHI_COARSE_LOCKING) +#define VCHI_ELIDE_BLOCK_EXIT_LOCK + +// Avoid lock on non-blocking peek +// (only relevant if VCHI_COARSE_LOCKING) +#define VCHI_AVOID_PEEK_LOCK + +// Use one slot-handler thread per connection, rather than 1 thread dealing with all connections in rotation. +#define VCHI_MULTIPLE_HANDLER_THREADS + +// Put free descriptors onto the head of the free queue, rather than the tail, so that we don't thrash +// our way through the pool of descriptors. +#define VCHI_PUSH_FREE_DESCRIPTORS_ONTO_HEAD + +// Don't issue a MSG_AVAILABLE callback for every single message. Possibly only safe if VCHI_COARSE_LOCKING. +#define VCHI_FEWER_MSG_AVAILABLE_CALLBACKS + +// Don't use message descriptors for TX messages that don't need them +#define VCHI_MINIMISE_TX_MSG_DESCRIPTORS + +// Nano-locks for multiqueue +//#define VCHI_MQUEUE_NANOLOCKS + +// Lock-free(er) dequeuing +//#define VCHI_RX_NANOLOCKS + +#endif /*VCHI_CFG_INTERNAL_H_*/ Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchi/vchi_cfg_internal.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchi/vchi_common.h =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchi/vchi_common.h (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchi/vchi_common.h (revision 278312) @@ -0,0 +1,163 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef VCHI_COMMON_H_ +#define VCHI_COMMON_H_ + + +//flags used when sending messages (must be bitmapped) +typedef enum +{ + VCHI_FLAGS_NONE = 0x0, + VCHI_FLAGS_BLOCK_UNTIL_OP_COMPLETE = 0x1, // waits for message to be received, or sent (NB. not the same as being seen on other side) + VCHI_FLAGS_CALLBACK_WHEN_OP_COMPLETE = 0x2, // run a callback when message sent + VCHI_FLAGS_BLOCK_UNTIL_QUEUED = 0x4, // return once the transfer is in a queue ready to go + VCHI_FLAGS_ALLOW_PARTIAL = 0x8, + VCHI_FLAGS_BLOCK_UNTIL_DATA_READ = 0x10, + VCHI_FLAGS_CALLBACK_WHEN_DATA_READ = 0x20, + + VCHI_FLAGS_ALIGN_SLOT = 0x000080, // internal use only + VCHI_FLAGS_BULK_AUX_QUEUED = 0x010000, // internal use only + VCHI_FLAGS_BULK_AUX_COMPLETE = 0x020000, // internal use only + VCHI_FLAGS_BULK_DATA_QUEUED = 0x040000, // internal use only + VCHI_FLAGS_BULK_DATA_COMPLETE = 0x080000, // internal use only + VCHI_FLAGS_INTERNAL = 0xFF0000 +} VCHI_FLAGS_T; + +// constants for vchi_crc_control() +typedef enum { + VCHI_CRC_NOTHING = -1, + VCHI_CRC_PER_SERVICE = 0, + VCHI_CRC_EVERYTHING = 1, +} VCHI_CRC_CONTROL_T; + +//callback reasons when an event occurs on a service +typedef enum +{ + VCHI_CALLBACK_REASON_MIN, + + //This indicates that there is data available + //handle is the msg id that was transmitted with the data + // When a message is received and there was no FULL message available previously, send callback + // Tasks get kicked by the callback, reset their event and try and read from the fifo until it fails + VCHI_CALLBACK_MSG_AVAILABLE, + VCHI_CALLBACK_MSG_SENT, + VCHI_CALLBACK_MSG_SPACE_AVAILABLE, // XXX not yet implemented + + // This indicates that a transfer from the other side has completed + VCHI_CALLBACK_BULK_RECEIVED, + //This indicates that data queued up to be sent has now gone + //handle is the msg id that was used when sending the data + VCHI_CALLBACK_BULK_SENT, + VCHI_CALLBACK_BULK_RX_SPACE_AVAILABLE, // XXX not yet implemented + VCHI_CALLBACK_BULK_TX_SPACE_AVAILABLE, // XXX not yet implemented + + VCHI_CALLBACK_SERVICE_CLOSED, + + // this side has sent XOFF to peer due to lack of data consumption by service + // (suggests the service may need to take some recovery action if it has + // been deliberately holding off consuming data) + VCHI_CALLBACK_SENT_XOFF, + VCHI_CALLBACK_SENT_XON, + + // indicates that a bulk transfer has finished reading the source buffer + VCHI_CALLBACK_BULK_DATA_READ, + + // power notification events (currently host side only) + VCHI_CALLBACK_PEER_OFF, + VCHI_CALLBACK_PEER_SUSPENDED, + VCHI_CALLBACK_PEER_ON, + VCHI_CALLBACK_PEER_RESUMED, + VCHI_CALLBACK_FORCED_POWER_OFF, + +#ifdef USE_VCHIQ_ARM + // some extra notifications provided by vchiq_arm + VCHI_CALLBACK_SERVICE_OPENED, + VCHI_CALLBACK_BULK_RECEIVE_ABORTED, + VCHI_CALLBACK_BULK_TRANSMIT_ABORTED, +#endif + + VCHI_CALLBACK_REASON_MAX +} VCHI_CALLBACK_REASON_T; + +//Calback used by all services / bulk transfers +typedef void (*VCHI_CALLBACK_T)( void *callback_param, //my service local param + VCHI_CALLBACK_REASON_T reason, + void *handle ); //for transmitting msg's only + + + +/* + * Define vector struct for scatter-gather (vector) operations + * Vectors can be nested - if a vector element has negative length, then + * the data pointer is treated as pointing to another vector array, with + * '-vec_len' elements. Thus to append a header onto an existing vector, + * you can do this: + * + * void foo(const VCHI_MSG_VECTOR_T *v, int n) + * { + * VCHI_MSG_VECTOR_T nv[2]; + * nv[0].vec_base = my_header; + * nv[0].vec_len = sizeof my_header; + * nv[1].vec_base = v; + * nv[1].vec_len = -n; + * ... + * + */ +typedef struct vchi_msg_vector { + const void *vec_base; + int32_t vec_len; +} VCHI_MSG_VECTOR_T; + +// Opaque type for a connection API +typedef struct opaque_vchi_connection_api_t VCHI_CONNECTION_API_T; + +// Opaque type for a message driver +typedef struct opaque_vchi_message_driver_t VCHI_MESSAGE_DRIVER_T; + + +// Iterator structure for reading ahead through received message queue. Allocated by client, +// initialised by vchi_msg_look_ahead. Fields are for internal VCHI use only. +// Iterates over messages in queue at the instant of the call to vchi_msg_lookahead - +// will not proceed to messages received since. Behaviour is undefined if an iterator +// is used again after messages for that service are removed/dequeued by any +// means other than vchi_msg_iter_... calls on the iterator itself. +typedef struct { + struct opaque_vchi_service_t *service; + void *last; + void *next; + void *remove; +} VCHI_MSG_ITER_T; + + +#endif // VCHI_COMMON_H_ Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchi/vchi_common.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchi/vchi_mh.h =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchi/vchi_mh.h (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchi/vchi_mh.h (revision 278312) @@ -0,0 +1,42 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef VCHI_MH_H_ +#define VCHI_MH_H_ + +#include + +typedef int32_t VCHI_MEM_HANDLE_T; +#define VCHI_MEM_HANDLE_INVALID 0 + +#endif Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchi/vchi_mh.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq.h =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq.h (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq.h (revision 278312) @@ -0,0 +1,41 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef VCHIQ_VCHIQ_H +#define VCHIQ_VCHIQ_H + +#include "vchiq_if.h" +#include "vchiq_util.h" + +#endif + Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_2835.h =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_2835.h (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_2835.h (revision 278312) @@ -0,0 +1,42 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef VCHIQ_2835_H +#define VCHIQ_2835_H + +#include "vchiq_pagelist.h" + +#define VCHIQ_PLATFORM_FRAGMENTS_OFFSET_IDX 0 +#define VCHIQ_PLATFORM_FRAGMENTS_COUNT_IDX 1 + +#endif /* VCHIQ_2835_H */ Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_2835.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_2835_arm.c =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_2835_arm.c (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_2835_arm.c (revision 278312) @@ -0,0 +1,580 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +MALLOC_DEFINE(M_VCPAGELIST, "vcpagelist", "VideoCore pagelist memory"); + +#define TOTAL_SLOTS (VCHIQ_SLOT_ZERO_SLOTS + 2 * 32) + +#define VCHIQ_DOORBELL_IRQ IRQ_ARM_DOORBELL_0 +#define VCHIQ_ARM_ADDRESS(x) ((void *)PHYS_TO_VCBUS(pmap_kextract((vm_offset_t)(x)))) + +#include "vchiq_arm.h" +#include "vchiq_2835.h" +#include "vchiq_connected.h" + +#define MAX_FRAGMENTS (VCHIQ_NUM_CURRENT_BULKS * 2) + +typedef struct vchiq_2835_state_struct { + int inited; + VCHIQ_ARM_STATE_T arm_state; +} VCHIQ_2835_ARM_STATE_T; + +static char *g_slot_mem; +static int g_slot_mem_size; +vm_paddr_t g_slot_phys; +/* BSD DMA */ +bus_dma_tag_t bcm_slots_dma_tag; +bus_dmamap_t bcm_slots_dma_map; + +static FRAGMENTS_T *g_fragments_base; +static FRAGMENTS_T *g_free_fragments; +struct semaphore g_free_fragments_sema; + +static DEFINE_SEMAPHORE(g_free_fragments_mutex); + +typedef struct bulkinfo_struct { + PAGELIST_T *pagelist; + bus_dma_tag_t pagelist_dma_tag; + bus_dmamap_t pagelist_dma_map; + void *buf; + size_t size; +} BULKINFO_T; + +static int +create_pagelist(char __user *buf, size_t count, unsigned short type, + struct proc *p, BULKINFO_T *bi); + +static void +free_pagelist(BULKINFO_T *bi, int actual); + +static void +vchiq_dmamap_cb(void *arg, bus_dma_segment_t *segs, int nseg, int err) +{ + bus_addr_t *addr; + + if (err) + return; + + addr = (bus_addr_t*)arg; + *addr = PHYS_TO_VCBUS(segs[0].ds_addr); +} + +int __init +vchiq_platform_init(VCHIQ_STATE_T *state) +{ + VCHIQ_SLOT_ZERO_T *vchiq_slot_zero; + int frag_mem_size; + int err; + int i; + + /* Allocate space for the channels in coherent memory */ + g_slot_mem_size = PAGE_ALIGN(TOTAL_SLOTS * VCHIQ_SLOT_SIZE); + frag_mem_size = PAGE_ALIGN(sizeof(FRAGMENTS_T) * MAX_FRAGMENTS); + + err = bus_dma_tag_create( + NULL, + PAGE_SIZE, 0, /* alignment, boundary */ + BUS_SPACE_MAXADDR_32BIT, /* lowaddr */ + BUS_SPACE_MAXADDR, /* highaddr */ + NULL, NULL, /* filter, filterarg */ + g_slot_mem_size + frag_mem_size, 1, /* maxsize, nsegments */ + g_slot_mem_size + frag_mem_size, 0, /* maxsegsize, flags */ + NULL, NULL, /* lockfunc, lockarg */ + &bcm_slots_dma_tag); + + err = bus_dmamem_alloc(bcm_slots_dma_tag, (void **)&g_slot_mem, + BUS_DMA_COHERENT | BUS_DMA_WAITOK, &bcm_slots_dma_map); + if (err) { + vchiq_log_error(vchiq_core_log_level, "Unable to allocate channel memory"); + err = -ENOMEM; + goto failed_alloc; + } + + err = bus_dmamap_load(bcm_slots_dma_tag, bcm_slots_dma_map, g_slot_mem, + g_slot_mem_size + frag_mem_size, vchiq_dmamap_cb, + &g_slot_phys, 0); + + if (err) { + vchiq_log_error(vchiq_core_log_level, "cannot load DMA map"); + err = -ENOMEM; + goto failed_load; + } + + WARN_ON(((int)g_slot_mem & (PAGE_SIZE - 1)) != 0); + + vchiq_slot_zero = vchiq_init_slots(g_slot_mem, g_slot_mem_size); + if (!vchiq_slot_zero) { + err = -EINVAL; + goto failed_init_slots; + } + + vchiq_slot_zero->platform_data[VCHIQ_PLATFORM_FRAGMENTS_OFFSET_IDX] = + (int)g_slot_phys + g_slot_mem_size; + vchiq_slot_zero->platform_data[VCHIQ_PLATFORM_FRAGMENTS_COUNT_IDX] = + MAX_FRAGMENTS; + + g_fragments_base = (FRAGMENTS_T *)(g_slot_mem + g_slot_mem_size); + g_slot_mem_size += frag_mem_size; + + g_free_fragments = g_fragments_base; + for (i = 0; i < (MAX_FRAGMENTS - 1); i++) { + *(FRAGMENTS_T **)&g_fragments_base[i] = + &g_fragments_base[i + 1]; + } + *(FRAGMENTS_T **)&g_fragments_base[i] = NULL; + _sema_init(&g_free_fragments_sema, MAX_FRAGMENTS); + + if (vchiq_init_state(state, vchiq_slot_zero, 0/*slave*/) != + VCHIQ_SUCCESS) { + err = -EINVAL; + goto failed_vchiq_init; + } + + bcm_mbox_write(BCM2835_MBOX_CHAN_VCHIQ, (unsigned int)g_slot_phys); + + vchiq_log_info(vchiq_arm_log_level, + "vchiq_init - done (slots %x, phys %x)", + (unsigned int)vchiq_slot_zero, g_slot_phys); + + vchiq_call_connected_callbacks(); + + return 0; + +failed_vchiq_init: +failed_init_slots: +failed_load: + bus_dmamap_unload(bcm_slots_dma_tag, bcm_slots_dma_map); +failed_alloc: + bus_dmamap_destroy(bcm_slots_dma_tag, bcm_slots_dma_map); + bus_dma_tag_destroy(bcm_slots_dma_tag); + + return err; +} + +void __exit +vchiq_platform_exit(VCHIQ_STATE_T *state) +{ + + bus_dmamap_unload(bcm_slots_dma_tag, bcm_slots_dma_map); + bus_dmamap_destroy(bcm_slots_dma_tag, bcm_slots_dma_map); + bus_dma_tag_destroy(bcm_slots_dma_tag); +} + +VCHIQ_STATUS_T +vchiq_platform_init_state(VCHIQ_STATE_T *state) +{ + VCHIQ_STATUS_T status = VCHIQ_SUCCESS; + state->platform_state = kzalloc(sizeof(VCHIQ_2835_ARM_STATE_T), GFP_KERNEL); + ((VCHIQ_2835_ARM_STATE_T*)state->platform_state)->inited = 1; + status = vchiq_arm_init_state(state, &((VCHIQ_2835_ARM_STATE_T*)state->platform_state)->arm_state); + if(status != VCHIQ_SUCCESS) + { + ((VCHIQ_2835_ARM_STATE_T*)state->platform_state)->inited = 0; + } + return status; +} + +VCHIQ_ARM_STATE_T* +vchiq_platform_get_arm_state(VCHIQ_STATE_T *state) +{ + if(!((VCHIQ_2835_ARM_STATE_T*)state->platform_state)->inited) + { + BUG(); + } + return &((VCHIQ_2835_ARM_STATE_T*)state->platform_state)->arm_state; +} + +int +vchiq_copy_from_user(void *dst, const void *src, int size) +{ + + if (((vm_offset_t)(src)) < VM_MIN_KERNEL_ADDRESS) { + int error = copyin(src, dst, size); + return error ? VCHIQ_ERROR : VCHIQ_SUCCESS; + } + else + bcopy(src, dst, size); + + return 0; +} + +VCHIQ_STATUS_T +vchiq_prepare_bulk_data(VCHIQ_BULK_T *bulk, VCHI_MEM_HANDLE_T memhandle, + void *offset, int size, int dir) +{ + PAGELIST_T *pagelist; + BULKINFO_T *bi; + int ret; + + WARN_ON(memhandle != VCHI_MEM_HANDLE_INVALID); + bi = malloc(sizeof(*bi), M_VCPAGELIST, M_WAITOK | M_ZERO); + if (bi == NULL) + return VCHIQ_ERROR; + + ret = create_pagelist((char __user *)offset, size, + (dir == VCHIQ_BULK_RECEIVE) + ? PAGELIST_READ + : PAGELIST_WRITE, + current, + bi); + if (ret != 0) + return VCHIQ_ERROR; + + bulk->handle = memhandle; + bulk->data = VCHIQ_ARM_ADDRESS(bi->pagelist); + + /* Store the pagelist address in remote_data, which isn't used by the + slave. */ + bulk->remote_data = bi; + + return VCHIQ_SUCCESS; +} + +void +vchiq_complete_bulk(VCHIQ_BULK_T *bulk) +{ + if (bulk && bulk->remote_data && bulk->actual) + free_pagelist((BULKINFO_T *)bulk->remote_data, bulk->actual); +} + +void +vchiq_transfer_bulk(VCHIQ_BULK_T *bulk) +{ + /* + * This should only be called on the master (VideoCore) side, but + * provide an implementation to avoid the need for ifdefery. + */ + BUG(); +} + +void +vchiq_dump_platform_state(void *dump_context) +{ + char buf[80]; + int len; + len = snprintf(buf, sizeof(buf), + " Platform: 2835 (VC master)"); + vchiq_dump(dump_context, buf, len + 1); +} + +VCHIQ_STATUS_T +vchiq_platform_suspend(VCHIQ_STATE_T *state) +{ + return VCHIQ_ERROR; +} + +VCHIQ_STATUS_T +vchiq_platform_resume(VCHIQ_STATE_T *state) +{ + return VCHIQ_SUCCESS; +} + +void +vchiq_platform_paused(VCHIQ_STATE_T *state) +{ +} + +void +vchiq_platform_resumed(VCHIQ_STATE_T *state) +{ +} + +int +vchiq_platform_videocore_wanted(VCHIQ_STATE_T* state) +{ + return 1; // autosuspend not supported - videocore always wanted +} + +int +vchiq_platform_use_suspend_timer(void) +{ + return 0; +} +void +vchiq_dump_platform_use_state(VCHIQ_STATE_T *state) +{ + vchiq_log_info(vchiq_arm_log_level, "Suspend timer not in use"); +} +void +vchiq_platform_handle_timeout(VCHIQ_STATE_T *state) +{ + (void)state; +} +/* + * Local functions + */ + +/* There is a potential problem with partial cache lines (pages?) +** at the ends of the block when reading. If the CPU accessed anything in +** the same line (page?) then it may have pulled old data into the cache, +** obscuring the new data underneath. We can solve this by transferring the +** partial cache lines separately, and allowing the ARM to copy into the +** cached area. + +** N.B. This implementation plays slightly fast and loose with the Linux +** driver programming rules, e.g. its use of __virt_to_bus instead of +** dma_map_single, but it isn't a multi-platform driver and it benefits +** from increased speed as a result. +*/ + +static int +create_pagelist(char __user *buf, size_t count, unsigned short type, + struct proc *p, BULKINFO_T *bi) +{ + PAGELIST_T *pagelist; + vm_page_t* pages; + unsigned long *addrs; + unsigned int num_pages, i; + vm_offset_t offset; + int pagelist_size; + char *addr, *base_addr, *next_addr; + int run, addridx, actual_pages; + int err; + vm_paddr_t pagelist_phys; + + offset = (vm_offset_t)buf & (PAGE_SIZE - 1); + num_pages = (count + offset + PAGE_SIZE - 1) / PAGE_SIZE; + + bi->pagelist = NULL; + bi->buf = buf; + bi->size = count; + + /* Allocate enough storage to hold the page pointers and the page + ** list + */ + pagelist_size = sizeof(PAGELIST_T) + + (num_pages * sizeof(unsigned long)) + + (num_pages * sizeof(pages[0])); + + err = bus_dma_tag_create( + NULL, + PAGE_SIZE, 0, /* alignment, boundary */ + BUS_SPACE_MAXADDR_32BIT, /* lowaddr */ + BUS_SPACE_MAXADDR, /* highaddr */ + NULL, NULL, /* filter, filterarg */ + pagelist_size, 1, /* maxsize, nsegments */ + pagelist_size, 0, /* maxsegsize, flags */ + NULL, NULL, /* lockfunc, lockarg */ + &bi->pagelist_dma_tag); + + + + err = bus_dmamem_alloc(bi->pagelist_dma_tag, (void **)&pagelist, + BUS_DMA_COHERENT | BUS_DMA_WAITOK, &bi->pagelist_dma_map); + if (err) { + vchiq_log_error(vchiq_core_log_level, "Unable to allocate pagelist memory"); + err = -ENOMEM; + goto failed_alloc; + } + + err = bus_dmamap_load(bi->pagelist_dma_tag, bi->pagelist_dma_map, pagelist, + pagelist_size, vchiq_dmamap_cb, + &pagelist_phys, 0); + + if (err) { + vchiq_log_error(vchiq_core_log_level, "cannot load DMA map for pagelist memory"); + err = -ENOMEM; + goto failed_load; + } + + vchiq_log_trace(vchiq_arm_log_level, + "create_pagelist - %x", (unsigned int)pagelist); + if (!pagelist) + return -ENOMEM; + + addrs = pagelist->addrs; + pages = (vm_page_t*)(addrs + num_pages); + + actual_pages = vm_fault_quick_hold_pages(&p->p_vmspace->vm_map, + (vm_offset_t)buf, count, + (type == PAGELIST_READ ? VM_PROT_WRITE : 0 ) | VM_PROT_READ, pages, num_pages); + + if (actual_pages != num_pages) { + vm_page_unhold_pages(pages, actual_pages); + free(pagelist, M_VCPAGELIST); + return (-ENOMEM); + } + + pagelist->length = count; + pagelist->type = type; + pagelist->offset = offset; + + /* Group the pages into runs of contiguous pages */ + + base_addr = (void *)PHYS_TO_VCBUS(VM_PAGE_TO_PHYS(pages[0])); + next_addr = base_addr + PAGE_SIZE; + addridx = 0; + run = 0; + + for (i = 1; i < num_pages; i++) { + addr = (void *)PHYS_TO_VCBUS(VM_PAGE_TO_PHYS(pages[i])); + if ((addr == next_addr) && (run < (PAGE_SIZE - 1))) { + next_addr += PAGE_SIZE; + run++; + } else { + addrs[addridx] = (unsigned long)base_addr + run; + addridx++; + base_addr = addr; + next_addr = addr + PAGE_SIZE; + run = 0; + } + } + + addrs[addridx] = (unsigned long)base_addr + run; + addridx++; + + /* Partial cache lines (fragments) require special measures */ + if ((type == PAGELIST_READ) && + ((pagelist->offset & (CACHE_LINE_SIZE - 1)) || + ((pagelist->offset + pagelist->length) & + (CACHE_LINE_SIZE - 1)))) { + FRAGMENTS_T *fragments; + + if (down_interruptible(&g_free_fragments_sema) != 0) { + free(pagelist, M_VCPAGELIST); + return -EINTR; + } + + WARN_ON(g_free_fragments == NULL); + + down(&g_free_fragments_mutex); + fragments = (FRAGMENTS_T *) g_free_fragments; + WARN_ON(fragments == NULL); + g_free_fragments = *(FRAGMENTS_T **) g_free_fragments; + up(&g_free_fragments_mutex); + pagelist->type = + PAGELIST_READ_WITH_FRAGMENTS + (fragments - + g_fragments_base); + } + + /* XXX: optimize? INV operation for read WBINV for write? */ + cpu_dcache_wbinv_range((vm_offset_t)buf, count); + + bi->pagelist = pagelist; + + return 0; + +failed_load: + bus_dmamap_unload(bi->pagelist_dma_tag, bi->pagelist_dma_map); +failed_alloc: + bus_dmamap_destroy(bi->pagelist_dma_tag, bi->pagelist_dma_map); + bus_dma_tag_destroy(bi->pagelist_dma_tag); + + return err; +} + +static void +free_pagelist(BULKINFO_T *bi, int actual) +{ + vm_page_t*pages; + unsigned int num_pages, i; + void *page_address; + PAGELIST_T *pagelist; + + pagelist = bi->pagelist; + + vchiq_log_trace(vchiq_arm_log_level, + "free_pagelist - %x, %d", (unsigned int)pagelist, actual); + + num_pages = + (pagelist->length + pagelist->offset + PAGE_SIZE - 1) / + PAGE_SIZE; + + pages = (vm_page_t*)(pagelist->addrs + num_pages); + + /* Deal with any partial cache lines (fragments) */ + if (pagelist->type >= PAGELIST_READ_WITH_FRAGMENTS) { + FRAGMENTS_T *fragments = g_fragments_base + + (pagelist->type - PAGELIST_READ_WITH_FRAGMENTS); + int head_bytes, tail_bytes; + head_bytes = (CACHE_LINE_SIZE - pagelist->offset) & + (CACHE_LINE_SIZE - 1); + tail_bytes = (pagelist->offset + actual) & + (CACHE_LINE_SIZE - 1); + + if ((actual >= 0) && (head_bytes != 0)) { + if (head_bytes > actual) + head_bytes = actual; + + memcpy((char *)bi->buf, + fragments->headbuf, + head_bytes); + } + + if ((actual >= 0) && (head_bytes < actual) && + (tail_bytes != 0)) { + memcpy((char *)bi->buf + actual - tail_bytes, + fragments->tailbuf, tail_bytes); + } + + down(&g_free_fragments_mutex); + *(FRAGMENTS_T **) fragments = g_free_fragments; + g_free_fragments = fragments; + up(&g_free_fragments_mutex); + up(&g_free_fragments_sema); + } + + for (i = 0; i < num_pages; i++) { + if (pagelist->type != PAGELIST_WRITE) + vm_page_dirty(pages[i]); + } + + vm_page_unhold_pages(pages, num_pages); + + bus_dmamap_unload(bi->pagelist_dma_tag, bi->pagelist_dma_map); + bus_dmamem_free(bi->pagelist_dma_tag, bi->pagelist, bi->pagelist_dma_map); + bus_dmamap_destroy(bi->pagelist_dma_tag, bi->pagelist_dma_map); + bus_dma_tag_destroy(bi->pagelist_dma_tag); + + free(bi, M_VCPAGELIST); +} Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_2835_arm.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_arm.c =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_arm.c (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_arm.c (revision 278312) @@ -0,0 +1,2810 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#include "vchiq_core.h" +#include "vchiq_ioctl.h" +#include "vchiq_arm.h" + +#define DEVICE_NAME "vchiq" + +/* Override the default prefix, which would be vchiq_arm (from the filename) */ +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX DEVICE_NAME "." + +#define VCHIQ_MINOR 0 + +/* Some per-instance constants */ +#define MAX_COMPLETIONS 16 +#define MAX_SERVICES 64 +#define MAX_ELEMENTS 8 +#define MSG_QUEUE_SIZE 64 + +#define KEEPALIVE_VER 1 +#define KEEPALIVE_VER_MIN KEEPALIVE_VER + +MALLOC_DEFINE(M_VCHIQ, "vchiq_cdev", "VideoCore cdev memroy"); + +/* Run time control of log level, based on KERN_XXX level. */ +int vchiq_arm_log_level = VCHIQ_LOG_DEFAULT; +int vchiq_susp_log_level = VCHIQ_LOG_ERROR; + +#define SUSPEND_TIMER_TIMEOUT_MS 100 +#define SUSPEND_RETRY_TIMER_TIMEOUT_MS 1000 + +#define VC_SUSPEND_NUM_OFFSET 3 /* number of values before idle which are -ve */ +static const char *const suspend_state_names[] = { + "VC_SUSPEND_FORCE_CANCELED", + "VC_SUSPEND_REJECTED", + "VC_SUSPEND_FAILED", + "VC_SUSPEND_IDLE", + "VC_SUSPEND_REQUESTED", + "VC_SUSPEND_IN_PROGRESS", + "VC_SUSPEND_SUSPENDED" +}; +#define VC_RESUME_NUM_OFFSET 1 /* number of values before idle which are -ve */ +static const char *const resume_state_names[] = { + "VC_RESUME_FAILED", + "VC_RESUME_IDLE", + "VC_RESUME_REQUESTED", + "VC_RESUME_IN_PROGRESS", + "VC_RESUME_RESUMED" +}; +/* The number of times we allow force suspend to timeout before actually +** _forcing_ suspend. This is to cater for SW which fails to release vchiq +** correctly - we don't want to prevent ARM suspend indefinitely in this case. +*/ +#define FORCE_SUSPEND_FAIL_MAX 8 + +/* The time in ms allowed for videocore to go idle when force suspend has been + * requested */ +#define FORCE_SUSPEND_TIMEOUT_MS 200 + + +static void suspend_timer_callback(unsigned long context); +#ifdef notyet +static int vchiq_proc_add_instance(VCHIQ_INSTANCE_T instance); +static void vchiq_proc_remove_instance(VCHIQ_INSTANCE_T instance); +#endif + + +typedef struct user_service_struct { + VCHIQ_SERVICE_T *service; + void *userdata; + VCHIQ_INSTANCE_T instance; + int is_vchi; + int dequeue_pending; + int message_available_pos; + int msg_insert; + int msg_remove; + struct semaphore insert_event; + struct semaphore remove_event; + VCHIQ_HEADER_T * msg_queue[MSG_QUEUE_SIZE]; +} USER_SERVICE_T; + +struct bulk_waiter_node { + struct bulk_waiter bulk_waiter; + int pid; + struct list_head list; +}; + +struct vchiq_instance_struct { + VCHIQ_STATE_T *state; + VCHIQ_COMPLETION_DATA_T completions[MAX_COMPLETIONS]; + int completion_insert; + int completion_remove; + struct semaphore insert_event; + struct semaphore remove_event; + struct mutex completion_mutex; + + int connected; + int closing; + int pid; + int mark; + + struct list_head bulk_waiter_list; + struct mutex bulk_waiter_list_mutex; + + struct proc_dir_entry *proc_entry; +}; + +typedef struct dump_context_struct { + char __user *buf; + size_t actual; + size_t space; + loff_t offset; +} DUMP_CONTEXT_T; + +static struct cdev * vchiq_cdev; +VCHIQ_STATE_T g_state; +static DEFINE_SPINLOCK(msg_queue_spinlock); + +static const char *const ioctl_names[] = { + "CONNECT", + "SHUTDOWN", + "CREATE_SERVICE", + "REMOVE_SERVICE", + "QUEUE_MESSAGE", + "QUEUE_BULK_TRANSMIT", + "QUEUE_BULK_RECEIVE", + "AWAIT_COMPLETION", + "DEQUEUE_MESSAGE", + "GET_CLIENT_ID", + "GET_CONFIG", + "CLOSE_SERVICE", + "USE_SERVICE", + "RELEASE_SERVICE", + "SET_SERVICE_OPTION", + "DUMP_PHYS_MEM" +}; + +vchiq_static_assert((sizeof(ioctl_names)/sizeof(ioctl_names[0])) == + (VCHIQ_IOC_MAX + 1)); + +static eventhandler_tag vchiq_ehtag = NULL; +static d_open_t vchiq_open; +static d_close_t vchiq_close; +static d_ioctl_t vchiq_ioctl; + +static struct cdevsw vchiq_cdevsw = { + .d_version = D_VERSION, + .d_ioctl = vchiq_ioctl, + .d_open = vchiq_open, + .d_close = vchiq_close, + .d_name = DEVICE_NAME, +}; + +#if 0 +static void +dump_phys_mem(void *virt_addr, uint32_t num_bytes); +#endif + +/**************************************************************************** +* +* add_completion +* +***************************************************************************/ + +static VCHIQ_STATUS_T +add_completion(VCHIQ_INSTANCE_T instance, VCHIQ_REASON_T reason, + VCHIQ_HEADER_T *header, USER_SERVICE_T *user_service, + void *bulk_userdata) +{ + VCHIQ_COMPLETION_DATA_T *completion; + DEBUG_INITIALISE(g_state.local) + + while (instance->completion_insert == + (instance->completion_remove + MAX_COMPLETIONS)) { + /* Out of space - wait for the client */ + DEBUG_TRACE(SERVICE_CALLBACK_LINE); + vchiq_log_trace(vchiq_arm_log_level, + "add_completion - completion queue full"); + DEBUG_COUNT(COMPLETION_QUEUE_FULL_COUNT); + if (down_interruptible(&instance->remove_event) != 0) { + vchiq_log_info(vchiq_arm_log_level, + "service_callback interrupted"); + return VCHIQ_RETRY; + } else if (instance->closing) { + vchiq_log_info(vchiq_arm_log_level, + "service_callback closing"); + return VCHIQ_ERROR; + } + DEBUG_TRACE(SERVICE_CALLBACK_LINE); + } + + completion = + &instance->completions[instance->completion_insert & + (MAX_COMPLETIONS - 1)]; + + completion->header = header; + completion->reason = reason; + /* N.B. service_userdata is updated while processing AWAIT_COMPLETION */ + completion->service_userdata = user_service->service; + completion->bulk_userdata = bulk_userdata; + + if (reason == VCHIQ_SERVICE_CLOSED) + /* Take an extra reference, to be held until + this CLOSED notification is delivered. */ + lock_service(user_service->service); + + /* A write barrier is needed here to ensure that the entire completion + record is written out before the insert point. */ + wmb(); + + if (reason == VCHIQ_MESSAGE_AVAILABLE) + user_service->message_available_pos = + instance->completion_insert; + instance->completion_insert++; + + up(&instance->insert_event); + + return VCHIQ_SUCCESS; +} + +/**************************************************************************** +* +* service_callback +* +***************************************************************************/ + +static VCHIQ_STATUS_T +service_callback(VCHIQ_REASON_T reason, VCHIQ_HEADER_T *header, + VCHIQ_SERVICE_HANDLE_T handle, void *bulk_userdata) +{ + /* How do we ensure the callback goes to the right client? + ** The service_user data points to a USER_SERVICE_T record containing + ** the original callback and the user state structure, which contains a + ** circular buffer for completion records. + */ + USER_SERVICE_T *user_service; + VCHIQ_SERVICE_T *service; + VCHIQ_INSTANCE_T instance; + DEBUG_INITIALISE(g_state.local) + + DEBUG_TRACE(SERVICE_CALLBACK_LINE); + + service = handle_to_service(handle); + BUG_ON(!service); + user_service = (USER_SERVICE_T *)service->base.userdata; + instance = user_service->instance; + + if (!instance || instance->closing) + return VCHIQ_SUCCESS; + + vchiq_log_trace(vchiq_arm_log_level, + "service_callback - service %lx(%d), handle %x, reason %d, header %lx, " + "instance %lx, bulk_userdata %lx", + (unsigned long)user_service, + service->localport, service->handle, + reason, (unsigned long)header, + (unsigned long)instance, (unsigned long)bulk_userdata); + + if (header && user_service->is_vchi) { + spin_lock(&msg_queue_spinlock); + while (user_service->msg_insert == + (user_service->msg_remove + MSG_QUEUE_SIZE)) { + spin_unlock(&msg_queue_spinlock); + DEBUG_TRACE(SERVICE_CALLBACK_LINE); + DEBUG_COUNT(MSG_QUEUE_FULL_COUNT); + vchiq_log_trace(vchiq_arm_log_level, + "service_callback - msg queue full"); + /* If there is no MESSAGE_AVAILABLE in the completion + ** queue, add one + */ + if ((user_service->message_available_pos - + instance->completion_remove) < 0) { + VCHIQ_STATUS_T status; + vchiq_log_info(vchiq_arm_log_level, + "Inserting extra MESSAGE_AVAILABLE"); + DEBUG_TRACE(SERVICE_CALLBACK_LINE); + status = add_completion(instance, reason, + NULL, user_service, bulk_userdata); + if (status != VCHIQ_SUCCESS) { + DEBUG_TRACE(SERVICE_CALLBACK_LINE); + return status; + } + } + + DEBUG_TRACE(SERVICE_CALLBACK_LINE); + if (down_interruptible(&user_service->remove_event) + != 0) { + vchiq_log_info(vchiq_arm_log_level, + "service_callback interrupted"); + DEBUG_TRACE(SERVICE_CALLBACK_LINE); + return VCHIQ_RETRY; + } else if (instance->closing) { + vchiq_log_info(vchiq_arm_log_level, + "service_callback closing"); + DEBUG_TRACE(SERVICE_CALLBACK_LINE); + return VCHIQ_ERROR; + } + DEBUG_TRACE(SERVICE_CALLBACK_LINE); + spin_lock(&msg_queue_spinlock); + } + + user_service->msg_queue[user_service->msg_insert & + (MSG_QUEUE_SIZE - 1)] = header; + user_service->msg_insert++; + spin_unlock(&msg_queue_spinlock); + + up(&user_service->insert_event); + + /* If there is a thread waiting in DEQUEUE_MESSAGE, or if + ** there is a MESSAGE_AVAILABLE in the completion queue then + ** bypass the completion queue. + */ + if (((user_service->message_available_pos - + instance->completion_remove) >= 0) || + user_service->dequeue_pending) { + DEBUG_TRACE(SERVICE_CALLBACK_LINE); + user_service->dequeue_pending = 0; + return VCHIQ_SUCCESS; + } + + header = NULL; + } + DEBUG_TRACE(SERVICE_CALLBACK_LINE); + + return add_completion(instance, reason, header, user_service, + bulk_userdata); +} + +/**************************************************************************** +* +* user_service_free +* +***************************************************************************/ +static void +user_service_free(void *userdata) +{ + USER_SERVICE_T *user_service = userdata; + + _sema_destroy(&user_service->insert_event); + _sema_destroy(&user_service->remove_event); + + kfree(user_service); +} + +/**************************************************************************** +* +* vchiq_ioctl +* +***************************************************************************/ + +static int +vchiq_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int fflag, + struct thread *td) +{ + VCHIQ_INSTANCE_T instance; + VCHIQ_STATUS_T status = VCHIQ_SUCCESS; + VCHIQ_SERVICE_T *service = NULL; + int ret = 0; + int i, rc; + DEBUG_INITIALISE(g_state.local) + + if ((ret = devfs_get_cdevpriv((void**)&instance))) { + printf("vchiq_ioctl: devfs_get_cdevpriv failed: error %d\n", ret); + return (ret); + } + +/* XXXBSD: HACK! */ +#define _IOC_NR(x) ((x) & 0xff) +#define _IOC_TYPE(x) IOCGROUP(x) + + vchiq_log_trace(vchiq_arm_log_level, + "vchiq_ioctl - instance %x, cmd %s, arg %p", + (unsigned int)instance, + ((_IOC_TYPE(cmd) == VCHIQ_IOC_MAGIC) && + (_IOC_NR(cmd) <= VCHIQ_IOC_MAX)) ? + ioctl_names[_IOC_NR(cmd)] : "", arg); + + switch (cmd) { + case VCHIQ_IOC_SHUTDOWN: + if (!instance->connected) + break; + + /* Remove all services */ + i = 0; + while ((service = next_service_by_instance(instance->state, + instance, &i)) != NULL) { + status = vchiq_remove_service(service->handle); + unlock_service(service); + if (status != VCHIQ_SUCCESS) + break; + } + service = NULL; + + if (status == VCHIQ_SUCCESS) { + /* Wake the completion thread and ask it to exit */ + instance->closing = 1; + up(&instance->insert_event); + } + + break; + + case VCHIQ_IOC_CONNECT: + if (instance->connected) { + ret = -EINVAL; + break; + } + rc = lmutex_lock_interruptible(&instance->state->mutex); + if (rc != 0) { + vchiq_log_error(vchiq_arm_log_level, + "vchiq: connect: could not lock mutex for " + "state %d: %d", + instance->state->id, rc); + ret = -EINTR; + break; + } + status = vchiq_connect_internal(instance->state, instance); + lmutex_unlock(&instance->state->mutex); + + if (status == VCHIQ_SUCCESS) + instance->connected = 1; + else + vchiq_log_error(vchiq_arm_log_level, + "vchiq: could not connect: %d", status); + break; + + case VCHIQ_IOC_CREATE_SERVICE: { + VCHIQ_CREATE_SERVICE_T args; + USER_SERVICE_T *user_service = NULL; + void *userdata; + int srvstate; + + memcpy(&args, (const void*)arg, sizeof(args)); + + user_service = kmalloc(sizeof(USER_SERVICE_T), GFP_KERNEL); + if (!user_service) { + ret = -ENOMEM; + break; + } + + if (args.is_open) { + if (!instance->connected) { + ret = -ENOTCONN; + kfree(user_service); + break; + } + srvstate = VCHIQ_SRVSTATE_OPENING; + } else { + srvstate = + instance->connected ? + VCHIQ_SRVSTATE_LISTENING : + VCHIQ_SRVSTATE_HIDDEN; + } + + userdata = args.params.userdata; + args.params.callback = service_callback; + args.params.userdata = user_service; + service = vchiq_add_service_internal( + instance->state, + &args.params, srvstate, + instance, user_service_free); + + if (service != NULL) { + user_service->service = service; + user_service->userdata = userdata; + user_service->instance = instance; + user_service->is_vchi = args.is_vchi; + user_service->dequeue_pending = 0; + user_service->message_available_pos = + instance->completion_remove - 1; + user_service->msg_insert = 0; + user_service->msg_remove = 0; + _sema_init(&user_service->insert_event, 0); + _sema_init(&user_service->remove_event, 0); + + if (args.is_open) { + status = vchiq_open_service_internal + (service, instance->pid); + if (status != VCHIQ_SUCCESS) { + vchiq_remove_service(service->handle); + service = NULL; + ret = (status == VCHIQ_RETRY) ? + -EINTR : -EIO; + break; + } + } + +#ifdef VCHIQ_IOCTL_DEBUG + printf("%s: [CREATE SERVICE] handle = %08x\n", __func__, service->handle); +#endif + memcpy((void *) + &(((VCHIQ_CREATE_SERVICE_T*) + arg)->handle), + (const void *)&service->handle, + sizeof(service->handle)); + + service = NULL; + } else { + ret = -EEXIST; + kfree(user_service); + } + } break; + + case VCHIQ_IOC_CLOSE_SERVICE: { + VCHIQ_SERVICE_HANDLE_T handle; + + memcpy(&handle, (const void*)arg, sizeof(handle)); + +#ifdef VCHIQ_IOCTL_DEBUG + printf("%s: [CLOSE SERVICE] handle = %08x\n", __func__, handle); +#endif + + service = find_service_for_instance(instance, handle); + if (service != NULL) + status = vchiq_close_service(service->handle); + else + ret = -EINVAL; + } break; + + case VCHIQ_IOC_REMOVE_SERVICE: { + VCHIQ_SERVICE_HANDLE_T handle; + + memcpy(&handle, (const void*)arg, sizeof(handle)); + +#ifdef VCHIQ_IOCTL_DEBUG + printf("%s: [REMOVE SERVICE] handle = %08x\n", __func__, handle); +#endif + + service = find_service_for_instance(instance, handle); + if (service != NULL) + status = vchiq_remove_service(service->handle); + else + ret = -EINVAL; + } break; + + case VCHIQ_IOC_USE_SERVICE: + case VCHIQ_IOC_RELEASE_SERVICE: { + VCHIQ_SERVICE_HANDLE_T handle; + + memcpy(&handle, (const void*)arg, sizeof(handle)); + +#ifdef VCHIQ_IOCTL_DEBUG + printf("%s: [%s SERVICE] handle = %08x\n", __func__, + cmd == VCHIQ_IOC_USE_SERVICE ? "USE" : "RELEASE", handle); +#endif + + service = find_service_for_instance(instance, handle); + if (service != NULL) { + status = (cmd == VCHIQ_IOC_USE_SERVICE) ? + vchiq_use_service_internal(service) : + vchiq_release_service_internal(service); + if (status != VCHIQ_SUCCESS) { + vchiq_log_error(vchiq_susp_log_level, + "%s: cmd %s returned error %d for " + "service %c%c%c%c:%8x", + __func__, + (cmd == VCHIQ_IOC_USE_SERVICE) ? + "VCHIQ_IOC_USE_SERVICE" : + "VCHIQ_IOC_RELEASE_SERVICE", + status, + VCHIQ_FOURCC_AS_4CHARS( + service->base.fourcc), + service->client_id); + ret = -EINVAL; + } + } else + ret = -EINVAL; + } break; + + case VCHIQ_IOC_QUEUE_MESSAGE: { + VCHIQ_QUEUE_MESSAGE_T args; + memcpy(&args, (const void*)arg, sizeof(args)); + +#ifdef VCHIQ_IOCTL_DEBUG + printf("%s: [QUEUE MESSAGE] handle = %08x\n", __func__, args.handle); +#endif + + service = find_service_for_instance(instance, args.handle); + + if ((service != NULL) && (args.count <= MAX_ELEMENTS)) { + /* Copy elements into kernel space */ + VCHIQ_ELEMENT_T elements[MAX_ELEMENTS]; + if (copy_from_user(elements, args.elements, + args.count * sizeof(VCHIQ_ELEMENT_T)) == 0) + status = vchiq_queue_message + (args.handle, + elements, args.count); + else + ret = -EFAULT; + } else { + ret = -EINVAL; + } + } break; + + case VCHIQ_IOC_QUEUE_BULK_TRANSMIT: + case VCHIQ_IOC_QUEUE_BULK_RECEIVE: { + VCHIQ_QUEUE_BULK_TRANSFER_T args; + struct bulk_waiter_node *waiter = NULL; + VCHIQ_BULK_DIR_T dir = + (cmd == VCHIQ_IOC_QUEUE_BULK_TRANSMIT) ? + VCHIQ_BULK_TRANSMIT : VCHIQ_BULK_RECEIVE; + + memcpy(&args, (const void*)arg, sizeof(args)); + + service = find_service_for_instance(instance, args.handle); + if (!service) { + ret = -EINVAL; + break; + } + + if (args.mode == VCHIQ_BULK_MODE_BLOCKING) { + waiter = kzalloc(sizeof(struct bulk_waiter_node), + GFP_KERNEL); + if (!waiter) { + ret = -ENOMEM; + break; + } + args.userdata = &waiter->bulk_waiter; + } else if (args.mode == VCHIQ_BULK_MODE_WAITING) { + struct list_head *pos; + lmutex_lock(&instance->bulk_waiter_list_mutex); + list_for_each(pos, &instance->bulk_waiter_list) { + if (list_entry(pos, struct bulk_waiter_node, + list)->pid == current->p_pid) { + waiter = list_entry(pos, + struct bulk_waiter_node, + list); + list_del(pos); + break; + } + + } + lmutex_unlock(&instance->bulk_waiter_list_mutex); + if (!waiter) { + vchiq_log_error(vchiq_arm_log_level, + "no bulk_waiter found for pid %d", + current->p_pid); + ret = -ESRCH; + break; + } + vchiq_log_info(vchiq_arm_log_level, + "found bulk_waiter %x for pid %d", + (unsigned int)waiter, current->p_pid); + args.userdata = &waiter->bulk_waiter; + } + status = vchiq_bulk_transfer + (args.handle, + VCHI_MEM_HANDLE_INVALID, + args.data, args.size, + args.userdata, args.mode, + dir); + if (!waiter) + break; + if ((status != VCHIQ_RETRY) || fatal_signal_pending(current) || + !waiter->bulk_waiter.bulk) { + if (waiter->bulk_waiter.bulk) { + /* Cancel the signal when the transfer + ** completes. */ + spin_lock(&bulk_waiter_spinlock); + waiter->bulk_waiter.bulk->userdata = NULL; + spin_unlock(&bulk_waiter_spinlock); + } + _sema_destroy(&waiter->bulk_waiter.event); + kfree(waiter); + } else { + const VCHIQ_BULK_MODE_T mode_waiting = + VCHIQ_BULK_MODE_WAITING; + waiter->pid = current->p_pid; + lmutex_lock(&instance->bulk_waiter_list_mutex); + list_add(&waiter->list, &instance->bulk_waiter_list); + lmutex_unlock(&instance->bulk_waiter_list_mutex); + vchiq_log_info(vchiq_arm_log_level, + "saved bulk_waiter %x for pid %d", + (unsigned int)waiter, current->p_pid); + + memcpy((void *) + &(((VCHIQ_QUEUE_BULK_TRANSFER_T *) + arg)->mode), + (const void *)&mode_waiting, + sizeof(mode_waiting)); + } + } break; + + case VCHIQ_IOC_AWAIT_COMPLETION: { + VCHIQ_AWAIT_COMPLETION_T args; + int count = 0; + + DEBUG_TRACE(AWAIT_COMPLETION_LINE); + if (!instance->connected) { + ret = -ENOTCONN; + break; + } + + memcpy(&args, (const void*)arg, sizeof(args)); + + lmutex_lock(&instance->completion_mutex); + + DEBUG_TRACE(AWAIT_COMPLETION_LINE); + while ((instance->completion_remove == + instance->completion_insert) + && !instance->closing) { + DEBUG_TRACE(AWAIT_COMPLETION_LINE); + lmutex_unlock(&instance->completion_mutex); + rc = down_interruptible(&instance->insert_event); + lmutex_lock(&instance->completion_mutex); + if (rc != 0) { + DEBUG_TRACE(AWAIT_COMPLETION_LINE); + vchiq_log_info(vchiq_arm_log_level, + "AWAIT_COMPLETION interrupted"); + ret = -EINTR; + break; + } + } + DEBUG_TRACE(AWAIT_COMPLETION_LINE); + + /* A read memory barrier is needed to stop prefetch of a stale + ** completion record + */ + rmb(); + + if (ret == 0) { + int msgbufcount = args.msgbufcount; + for (count = 0; count < args.count; count++) { + VCHIQ_COMPLETION_DATA_T *completion; + VCHIQ_SERVICE_T *service1; + USER_SERVICE_T *user_service; + VCHIQ_HEADER_T *header; + if (instance->completion_remove == + instance->completion_insert) + break; + completion = &instance->completions[ + instance->completion_remove & + (MAX_COMPLETIONS - 1)]; + + service1 = completion->service_userdata; + user_service = service1->base.userdata; + completion->service_userdata = + user_service->userdata; + + header = completion->header; + if (header) { + void __user *msgbuf; + int msglen; + + msglen = header->size + + sizeof(VCHIQ_HEADER_T); + /* This must be a VCHIQ-style service */ + if (args.msgbufsize < msglen) { + vchiq_log_error( + vchiq_arm_log_level, + "header %x: msgbufsize" + " %x < msglen %x", + (unsigned int)header, + args.msgbufsize, + msglen); + WARN(1, "invalid message " + "size\n"); + if (count == 0) + ret = -EMSGSIZE; + break; + } + if (msgbufcount <= 0) + /* Stall here for lack of a + ** buffer for the message. */ + break; + /* Get the pointer from user space */ + msgbufcount--; + if (copy_from_user(&msgbuf, + (const void __user *) + &args.msgbufs[msgbufcount], + sizeof(msgbuf)) != 0) { + if (count == 0) + ret = -EFAULT; + break; + } + + /* Copy the message to user space */ + if (copy_to_user(msgbuf, header, + msglen) != 0) { + if (count == 0) + ret = -EFAULT; + break; + } + + /* Now it has been copied, the message + ** can be released. */ + vchiq_release_message(service1->handle, + header); + + /* The completion must point to the + ** msgbuf. */ + completion->header = msgbuf; + } + + if (completion->reason == + VCHIQ_SERVICE_CLOSED) + unlock_service(service1); + + if (copy_to_user((void __user *)( + (size_t)args.buf + + count * sizeof(VCHIQ_COMPLETION_DATA_T)), + completion, + sizeof(VCHIQ_COMPLETION_DATA_T)) != 0) { + if (ret == 0) + ret = -EFAULT; + break; + } + + instance->completion_remove++; + } + + if (msgbufcount != args.msgbufcount) { + memcpy((void __user *) + &((VCHIQ_AWAIT_COMPLETION_T *)arg)-> + msgbufcount, + &msgbufcount, + sizeof(msgbufcount)); + } + + if (count != args.count) + { + memcpy((void __user *) + &((VCHIQ_AWAIT_COMPLETION_T *)arg)->count, + &count, sizeof(count)); + } + } + + if (count != 0) + up(&instance->remove_event); + + if ((ret == 0) && instance->closing) + ret = -ENOTCONN; + /* + * XXXBSD: ioctl return codes are not negative as in linux, so + * we can not indicate success with positive number of passed + * messages + */ + if (ret > 0) + ret = 0; + + lmutex_unlock(&instance->completion_mutex); + DEBUG_TRACE(AWAIT_COMPLETION_LINE); + } break; + + case VCHIQ_IOC_DEQUEUE_MESSAGE: { + VCHIQ_DEQUEUE_MESSAGE_T args; + USER_SERVICE_T *user_service; + VCHIQ_HEADER_T *header; + + DEBUG_TRACE(DEQUEUE_MESSAGE_LINE); + memcpy(&args, (const void*)arg, sizeof(args)); + service = find_service_for_instance(instance, args.handle); + if (!service) { + ret = -EINVAL; + break; + } + user_service = (USER_SERVICE_T *)service->base.userdata; + if (user_service->is_vchi == 0) { + ret = -EINVAL; + break; + } + + spin_lock(&msg_queue_spinlock); + if (user_service->msg_remove == user_service->msg_insert) { + if (!args.blocking) { + spin_unlock(&msg_queue_spinlock); + DEBUG_TRACE(DEQUEUE_MESSAGE_LINE); + ret = -EWOULDBLOCK; + break; + } + user_service->dequeue_pending = 1; + do { + spin_unlock(&msg_queue_spinlock); + DEBUG_TRACE(DEQUEUE_MESSAGE_LINE); + if (down_interruptible( + &user_service->insert_event) != 0) { + vchiq_log_info(vchiq_arm_log_level, + "DEQUEUE_MESSAGE interrupted"); + ret = -EINTR; + break; + } + spin_lock(&msg_queue_spinlock); + } while (user_service->msg_remove == + user_service->msg_insert); + + if (ret) + break; + } + + BUG_ON((int)(user_service->msg_insert - + user_service->msg_remove) < 0); + + header = user_service->msg_queue[user_service->msg_remove & + (MSG_QUEUE_SIZE - 1)]; + user_service->msg_remove++; + spin_unlock(&msg_queue_spinlock); + + up(&user_service->remove_event); + if (header == NULL) + ret = -ENOTCONN; + else if (header->size <= args.bufsize) { + /* Copy to user space if msgbuf is not NULL */ + if ((args.buf == NULL) || + (copy_to_user((void __user *)args.buf, + header->data, + header->size) == 0)) { + args.bufsize = header->size; + memcpy((void *)arg, &args, + sizeof(args)); + vchiq_release_message( + service->handle, + header); + } else + ret = -EFAULT; + } else { + vchiq_log_error(vchiq_arm_log_level, + "header %x: bufsize %x < size %x", + (unsigned int)header, args.bufsize, + header->size); + WARN(1, "invalid size\n"); + ret = -EMSGSIZE; + } + DEBUG_TRACE(DEQUEUE_MESSAGE_LINE); + } break; + + case VCHIQ_IOC_GET_CLIENT_ID: { + VCHIQ_SERVICE_HANDLE_T handle; + + memcpy(&handle, (const void*)arg, sizeof(handle)); + + ret = vchiq_get_client_id(handle); + } break; + + case VCHIQ_IOC_GET_CONFIG: { + VCHIQ_GET_CONFIG_T args; + VCHIQ_CONFIG_T config; + + memcpy(&args, (const void*)arg, sizeof(args)); + if (args.config_size > sizeof(config)) { + ret = -EINVAL; + break; + } + status = vchiq_get_config(instance, args.config_size, &config); + if (status == VCHIQ_SUCCESS) { + if (copy_to_user((void __user *)args.pconfig, + &config, args.config_size) != 0) { + ret = -EFAULT; + break; + } + } + } break; + + case VCHIQ_IOC_SET_SERVICE_OPTION: { + VCHIQ_SET_SERVICE_OPTION_T args; + + memcpy(&args, (const void*)arg, sizeof(args)); + + service = find_service_for_instance(instance, args.handle); + if (!service) { + ret = -EINVAL; + break; + } + + status = vchiq_set_service_option( + args.handle, args.option, args.value); + } break; + + case VCHIQ_IOC_DUMP_PHYS_MEM: { + VCHIQ_DUMP_MEM_T args; + + memcpy(&args, (const void*)arg, sizeof(args)); + printf("IMPLEMENT ME: %s:%d\n", __FILE__, __LINE__); +#if 0 + dump_phys_mem(args.virt_addr, args.num_bytes); +#endif + } break; + + default: + ret = -ENOTTY; + break; + } + + if (service) + unlock_service(service); + + if (ret == 0) { + if (status == VCHIQ_ERROR) + ret = -EIO; + else if (status == VCHIQ_RETRY) + ret = -EINTR; + } + + if ((status == VCHIQ_SUCCESS) && (ret < 0) && (ret != -EINTR) && + (ret != -EWOULDBLOCK)) + vchiq_log_info(vchiq_arm_log_level, + " ioctl instance %lx, cmd %s -> status %d, %d", + (unsigned long)instance, + (_IOC_NR(cmd) <= VCHIQ_IOC_MAX) ? + ioctl_names[_IOC_NR(cmd)] : + "", + status, ret); + else + vchiq_log_trace(vchiq_arm_log_level, + " ioctl instance %lx, cmd %s -> status %d, %d", + (unsigned long)instance, + (_IOC_NR(cmd) <= VCHIQ_IOC_MAX) ? + ioctl_names[_IOC_NR(cmd)] : + "", + status, ret); + + /* XXXBSD: report BSD-style error to userland */ + if (ret < 0) + ret = -ret; + + return ret; +} + +static void +instance_dtr(void *data) +{ + + free(data, M_VCHIQ); +} + +/**************************************************************************** +* +* vchiq_open +* +***************************************************************************/ + +static int +vchiq_open(struct cdev *dev, int flags, int fmt __unused, struct thread *td) +{ + vchiq_log_info(vchiq_arm_log_level, "vchiq_open"); + /* XXXBSD: do we really need this check? */ + if (1) { + VCHIQ_STATE_T *state = vchiq_get_state(); + VCHIQ_INSTANCE_T instance; + + if (!state) { + vchiq_log_error(vchiq_arm_log_level, + "vchiq has no connection to VideoCore"); + return -ENOTCONN; + } + + instance = kmalloc(sizeof(*instance), GFP_KERNEL); + if (!instance) + return -ENOMEM; + + instance->state = state; + /* XXXBSD: PID or thread ID? */ + instance->pid = td->td_proc->p_pid; + +#ifdef notyet + ret = vchiq_proc_add_instance(instance); + if (ret != 0) { + kfree(instance); + return ret; + } +#endif + + _sema_init(&instance->insert_event, 0); + _sema_init(&instance->remove_event, 0); + lmutex_init(&instance->completion_mutex); + lmutex_init(&instance->bulk_waiter_list_mutex); + INIT_LIST_HEAD(&instance->bulk_waiter_list); + + devfs_set_cdevpriv(instance, instance_dtr); + } + else { + vchiq_log_error(vchiq_arm_log_level, + "Unknown minor device"); + return -ENXIO; + } + + return 0; +} + +/**************************************************************************** +* +* vchiq_release +* +***************************************************************************/ + +static int +vchiq_close(struct cdev *dev, int flags __unused, int fmt __unused, + struct thread *td) +{ + int ret = 0; + if (1) { + VCHIQ_INSTANCE_T instance; + VCHIQ_STATE_T *state = vchiq_get_state(); + VCHIQ_SERVICE_T *service; + int i; + + if ((ret = devfs_get_cdevpriv((void**)&instance))) { + printf("devfs_get_cdevpriv failed: error %d\n", ret); + return (ret); + } + + vchiq_log_info(vchiq_arm_log_level, + "vchiq_release: instance=%lx", + (unsigned long)instance); + + if (!state) { + ret = -EPERM; + goto out; + } + + /* Ensure videocore is awake to allow termination. */ + vchiq_use_internal(instance->state, NULL, + USE_TYPE_VCHIQ); + + lmutex_lock(&instance->completion_mutex); + + /* Wake the completion thread and ask it to exit */ + instance->closing = 1; + up(&instance->insert_event); + + lmutex_unlock(&instance->completion_mutex); + + /* Wake the slot handler if the completion queue is full. */ + up(&instance->remove_event); + + /* Mark all services for termination... */ + i = 0; + while ((service = next_service_by_instance(state, instance, + &i)) != NULL) { + USER_SERVICE_T *user_service = service->base.userdata; + + /* Wake the slot handler if the msg queue is full. */ + up(&user_service->remove_event); + + vchiq_terminate_service_internal(service); + unlock_service(service); + } + + /* ...and wait for them to die */ + i = 0; + while ((service = next_service_by_instance(state, instance, &i)) + != NULL) { + USER_SERVICE_T *user_service = service->base.userdata; + + down(&service->remove_event); + + BUG_ON(service->srvstate != VCHIQ_SRVSTATE_FREE); + + spin_lock(&msg_queue_spinlock); + + while (user_service->msg_remove != + user_service->msg_insert) { + VCHIQ_HEADER_T *header = user_service-> + msg_queue[user_service->msg_remove & + (MSG_QUEUE_SIZE - 1)]; + user_service->msg_remove++; + spin_unlock(&msg_queue_spinlock); + + if (header) + vchiq_release_message( + service->handle, + header); + spin_lock(&msg_queue_spinlock); + } + + spin_unlock(&msg_queue_spinlock); + + unlock_service(service); + } + + /* Release any closed services */ + while (instance->completion_remove != + instance->completion_insert) { + VCHIQ_COMPLETION_DATA_T *completion; + VCHIQ_SERVICE_T *service1; + completion = &instance->completions[ + instance->completion_remove & + (MAX_COMPLETIONS - 1)]; + service1 = completion->service_userdata; + if (completion->reason == VCHIQ_SERVICE_CLOSED) + unlock_service(service1); + instance->completion_remove++; + } + + /* Release the PEER service count. */ + vchiq_release_internal(instance->state, NULL); + + { + struct list_head *pos, *next; + list_for_each_safe(pos, next, + &instance->bulk_waiter_list) { + struct bulk_waiter_node *waiter; + waiter = list_entry(pos, + struct bulk_waiter_node, + list); + list_del(pos); + vchiq_log_info(vchiq_arm_log_level, + "bulk_waiter - cleaned up %x " + "for pid %d", + (unsigned int)waiter, waiter->pid); + _sema_destroy(&waiter->bulk_waiter.event); + kfree(waiter); + } + } + + } + else { + vchiq_log_error(vchiq_arm_log_level, + "Unknown minor device"); + ret = -ENXIO; + } + +out: + return ret; +} + +/**************************************************************************** +* +* vchiq_dump +* +***************************************************************************/ + +void +vchiq_dump(void *dump_context, const char *str, int len) +{ + DUMP_CONTEXT_T *context = (DUMP_CONTEXT_T *)dump_context; + + if (context->actual < context->space) { + int copy_bytes; + if (context->offset > 0) { + int skip_bytes = min(len, (int)context->offset); + str += skip_bytes; + len -= skip_bytes; + context->offset -= skip_bytes; + if (context->offset > 0) + return; + } + copy_bytes = min(len, (int)(context->space - context->actual)); + if (copy_bytes == 0) + return; + memcpy(context->buf + context->actual, str, copy_bytes); + context->actual += copy_bytes; + len -= copy_bytes; + + /* If tne terminating NUL is included in the length, then it + ** marks the end of a line and should be replaced with a + ** carriage return. */ + if ((len == 0) && (str[copy_bytes - 1] == '\0')) { + char cr = '\n'; + memcpy(context->buf + context->actual - 1, &cr, 1); + } + } +} + +/**************************************************************************** +* +* vchiq_dump_platform_instance_state +* +***************************************************************************/ + +void +vchiq_dump_platform_instances(void *dump_context) +{ + VCHIQ_STATE_T *state = vchiq_get_state(); + char buf[80]; + int len; + int i; + + /* There is no list of instances, so instead scan all services, + marking those that have been dumped. */ + + for (i = 0; i < state->unused_service; i++) { + VCHIQ_SERVICE_T *service = state->services[i]; + VCHIQ_INSTANCE_T instance; + + if (service && (service->base.callback == service_callback)) { + instance = service->instance; + if (instance) + instance->mark = 0; + } + } + + for (i = 0; i < state->unused_service; i++) { + VCHIQ_SERVICE_T *service = state->services[i]; + VCHIQ_INSTANCE_T instance; + + if (service && (service->base.callback == service_callback)) { + instance = service->instance; + if (instance && !instance->mark) { + len = snprintf(buf, sizeof(buf), + "Instance %x: pid %d,%s completions " + "%d/%d", + (unsigned int)instance, instance->pid, + instance->connected ? " connected, " : + "", + instance->completion_insert - + instance->completion_remove, + MAX_COMPLETIONS); + + vchiq_dump(dump_context, buf, len + 1); + + instance->mark = 1; + } + } + } +} + +/**************************************************************************** +* +* vchiq_dump_platform_service_state +* +***************************************************************************/ + +void +vchiq_dump_platform_service_state(void *dump_context, VCHIQ_SERVICE_T *service) +{ + USER_SERVICE_T *user_service = (USER_SERVICE_T *)service->base.userdata; + char buf[80]; + int len; + + len = snprintf(buf, sizeof(buf), " instance %x", + (unsigned int)service->instance); + + if ((service->base.callback == service_callback) && + user_service->is_vchi) { + len += snprintf(buf + len, sizeof(buf) - len, + ", %d/%d messages", + user_service->msg_insert - user_service->msg_remove, + MSG_QUEUE_SIZE); + + if (user_service->dequeue_pending) + len += snprintf(buf + len, sizeof(buf) - len, + " (dequeue pending)"); + } + + vchiq_dump(dump_context, buf, len + 1); +} + +#ifdef notyet +/**************************************************************************** +* +* dump_user_mem +* +***************************************************************************/ + +static void +dump_phys_mem(void *virt_addr, uint32_t num_bytes) +{ + int rc; + uint8_t *end_virt_addr = virt_addr + num_bytes; + int num_pages; + int offset; + int end_offset; + int page_idx; + int prev_idx; + struct page *page; + struct page **pages; + uint8_t *kmapped_virt_ptr; + + /* Align virtAddr and endVirtAddr to 16 byte boundaries. */ + + virt_addr = (void *)((unsigned long)virt_addr & ~0x0fuL); + end_virt_addr = (void *)(((unsigned long)end_virt_addr + 15uL) & + ~0x0fuL); + + offset = (int)(long)virt_addr & (PAGE_SIZE - 1); + end_offset = (int)(long)end_virt_addr & (PAGE_SIZE - 1); + + num_pages = (offset + num_bytes + PAGE_SIZE - 1) / PAGE_SIZE; + + pages = kmalloc(sizeof(struct page *) * num_pages, GFP_KERNEL); + if (pages == NULL) { + vchiq_log_error(vchiq_arm_log_level, + "Unable to allocation memory for %d pages\n", + num_pages); + return; + } + + down_read(¤t->mm->mmap_sem); + rc = get_user_pages(current, /* task */ + current->mm, /* mm */ + (unsigned long)virt_addr, /* start */ + num_pages, /* len */ + 0, /* write */ + 0, /* force */ + pages, /* pages (array of page pointers) */ + NULL); /* vmas */ + up_read(¤t->mm->mmap_sem); + + prev_idx = -1; + page = NULL; + + while (offset < end_offset) { + + int page_offset = offset % PAGE_SIZE; + page_idx = offset / PAGE_SIZE; + + if (page_idx != prev_idx) { + + if (page != NULL) + kunmap(page); + page = pages[page_idx]; + kmapped_virt_ptr = kmap(page); + + prev_idx = page_idx; + } + + if (vchiq_arm_log_level >= VCHIQ_LOG_TRACE) + vchiq_log_dump_mem("ph", + (uint32_t)(unsigned long)&kmapped_virt_ptr[ + page_offset], + &kmapped_virt_ptr[page_offset], 16); + + offset += 16; + } + if (page != NULL) + kunmap(page); + + for (page_idx = 0; page_idx < num_pages; page_idx++) + page_cache_release(pages[page_idx]); + + kfree(pages); +} + +/**************************************************************************** +* +* vchiq_read +* +***************************************************************************/ + +static ssize_t +vchiq_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + DUMP_CONTEXT_T context; + context.buf = buf; + context.actual = 0; + context.space = count; + context.offset = *ppos; + + vchiq_dump_state(&context, &g_state); + + *ppos += context.actual; + + return context.actual; +} +#endif + +VCHIQ_STATE_T * +vchiq_get_state(void) +{ + + if (g_state.remote == NULL) + printk(KERN_ERR "%s: g_state.remote == NULL\n", __func__); + else if (g_state.remote->initialised != 1) + printk(KERN_NOTICE "%s: g_state.remote->initialised != 1 (%d)\n", + __func__, g_state.remote->initialised); + + return ((g_state.remote != NULL) && + (g_state.remote->initialised == 1)) ? &g_state : NULL; +} + +/* + * Autosuspend related functionality + */ + +int +vchiq_videocore_wanted(VCHIQ_STATE_T *state) +{ + VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); + if (!arm_state) + /* autosuspend not supported - always return wanted */ + return 1; + else if (arm_state->blocked_count) + return 1; + else if (!arm_state->videocore_use_count) + /* usage count zero - check for override unless we're forcing */ + if (arm_state->resume_blocked) + return 0; + else + return vchiq_platform_videocore_wanted(state); + else + /* non-zero usage count - videocore still required */ + return 1; +} + +static VCHIQ_STATUS_T +vchiq_keepalive_vchiq_callback(VCHIQ_REASON_T reason, + VCHIQ_HEADER_T *header, + VCHIQ_SERVICE_HANDLE_T service_user, + void *bulk_user) +{ + vchiq_log_error(vchiq_susp_log_level, + "%s callback reason %d", __func__, reason); + return 0; +} + +static int +vchiq_keepalive_thread_func(void *v) +{ + VCHIQ_STATE_T *state = (VCHIQ_STATE_T *) v; + VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); + + VCHIQ_STATUS_T status; + VCHIQ_INSTANCE_T instance; + VCHIQ_SERVICE_HANDLE_T ka_handle; + + VCHIQ_SERVICE_PARAMS_T params = { + .fourcc = VCHIQ_MAKE_FOURCC('K', 'E', 'E', 'P'), + .callback = vchiq_keepalive_vchiq_callback, + .version = KEEPALIVE_VER, + .version_min = KEEPALIVE_VER_MIN + }; + + status = vchiq_initialise(&instance); + if (status != VCHIQ_SUCCESS) { + vchiq_log_error(vchiq_susp_log_level, + "%s vchiq_initialise failed %d", __func__, status); + goto exit; + } + + status = vchiq_connect(instance); + if (status != VCHIQ_SUCCESS) { + vchiq_log_error(vchiq_susp_log_level, + "%s vchiq_connect failed %d", __func__, status); + goto shutdown; + } + + status = vchiq_add_service(instance, ¶ms, &ka_handle); + if (status != VCHIQ_SUCCESS) { + vchiq_log_error(vchiq_susp_log_level, + "%s vchiq_open_service failed %d", __func__, status); + goto shutdown; + } + + while (1) { + long rc = 0, uc = 0; + if (wait_for_completion_interruptible(&arm_state->ka_evt) + != 0) { + vchiq_log_error(vchiq_susp_log_level, + "%s interrupted", __func__); + flush_signals(current); + continue; + } + + /* read and clear counters. Do release_count then use_count to + * prevent getting more releases than uses */ + rc = atomic_xchg(&arm_state->ka_release_count, 0); + uc = atomic_xchg(&arm_state->ka_use_count, 0); + + /* Call use/release service the requisite number of times. + * Process use before release so use counts don't go negative */ + while (uc--) { + atomic_inc(&arm_state->ka_use_ack_count); + status = vchiq_use_service(ka_handle); + if (status != VCHIQ_SUCCESS) { + vchiq_log_error(vchiq_susp_log_level, + "%s vchiq_use_service error %d", + __func__, status); + } + } + while (rc--) { + status = vchiq_release_service(ka_handle); + if (status != VCHIQ_SUCCESS) { + vchiq_log_error(vchiq_susp_log_level, + "%s vchiq_release_service error %d", + __func__, status); + } + } + } + +shutdown: + vchiq_shutdown(instance); +exit: + return 0; +} + +VCHIQ_STATUS_T +vchiq_arm_init_state(VCHIQ_STATE_T *state, VCHIQ_ARM_STATE_T *arm_state) +{ + VCHIQ_STATUS_T status = VCHIQ_SUCCESS; + + if (arm_state) { + rwlock_init(&arm_state->susp_res_lock); + + init_completion(&arm_state->ka_evt); + atomic_set(&arm_state->ka_use_count, 0); + atomic_set(&arm_state->ka_use_ack_count, 0); + atomic_set(&arm_state->ka_release_count, 0); + + init_completion(&arm_state->vc_suspend_complete); + + init_completion(&arm_state->vc_resume_complete); + /* Initialise to 'done' state. We only want to block on resume + * completion while videocore is suspended. */ + set_resume_state(arm_state, VC_RESUME_RESUMED); + + init_completion(&arm_state->resume_blocker); + /* Initialise to 'done' state. We only want to block on this + * completion while resume is blocked */ + complete_all(&arm_state->resume_blocker); + + init_completion(&arm_state->blocked_blocker); + /* Initialise to 'done' state. We only want to block on this + * completion while things are waiting on the resume blocker */ + complete_all(&arm_state->blocked_blocker); + + arm_state->suspend_timer_timeout = SUSPEND_TIMER_TIMEOUT_MS; + arm_state->suspend_timer_running = 0; + init_timer(&arm_state->suspend_timer); + arm_state->suspend_timer.data = (unsigned long)(state); + arm_state->suspend_timer.function = suspend_timer_callback; + + arm_state->first_connect = 0; + + } + return status; +} + +/* +** Functions to modify the state variables; +** set_suspend_state +** set_resume_state +** +** There are more state variables than we might like, so ensure they remain in +** step. Suspend and resume state are maintained separately, since most of +** these state machines can operate independently. However, there are a few +** states where state transitions in one state machine cause a reset to the +** other state machine. In addition, there are some completion events which +** need to occur on state machine reset and end-state(s), so these are also +** dealt with in these functions. +** +** In all states we set the state variable according to the input, but in some +** cases we perform additional steps outlined below; +** +** VC_SUSPEND_IDLE - Initialise the suspend completion at the same time. +** The suspend completion is completed after any suspend +** attempt. When we reset the state machine we also reset +** the completion. This reset occurs when videocore is +** resumed, and also if we initiate suspend after a suspend +** failure. +** +** VC_SUSPEND_IN_PROGRESS - This state is considered the point of no return for +** suspend - ie from this point on we must try to suspend +** before resuming can occur. We therefore also reset the +** resume state machine to VC_RESUME_IDLE in this state. +** +** VC_SUSPEND_SUSPENDED - Suspend has completed successfully. Also call +** complete_all on the suspend completion to notify +** anything waiting for suspend to happen. +** +** VC_SUSPEND_REJECTED - Videocore rejected suspend. Videocore will also +** initiate resume, so no need to alter resume state. +** We call complete_all on the suspend completion to notify +** of suspend rejection. +** +** VC_SUSPEND_FAILED - We failed to initiate videocore suspend. We notify the +** suspend completion and reset the resume state machine. +** +** VC_RESUME_IDLE - Initialise the resume completion at the same time. The +** resume completion is in it's 'done' state whenever +** videcore is running. Therfore, the VC_RESUME_IDLE state +** implies that videocore is suspended. +** Hence, any thread which needs to wait until videocore is +** running can wait on this completion - it will only block +** if videocore is suspended. +** +** VC_RESUME_RESUMED - Resume has completed successfully. Videocore is running. +** Call complete_all on the resume completion to unblock +** any threads waiting for resume. Also reset the suspend +** state machine to it's idle state. +** +** VC_RESUME_FAILED - Currently unused - no mechanism to fail resume exists. +*/ + +inline void +set_suspend_state(VCHIQ_ARM_STATE_T *arm_state, + enum vc_suspend_status new_state) +{ + /* set the state in all cases */ + arm_state->vc_suspend_state = new_state; + + /* state specific additional actions */ + switch (new_state) { + case VC_SUSPEND_FORCE_CANCELED: + complete_all(&arm_state->vc_suspend_complete); + break; + case VC_SUSPEND_REJECTED: + complete_all(&arm_state->vc_suspend_complete); + break; + case VC_SUSPEND_FAILED: + complete_all(&arm_state->vc_suspend_complete); + arm_state->vc_resume_state = VC_RESUME_RESUMED; + complete_all(&arm_state->vc_resume_complete); + break; + case VC_SUSPEND_IDLE: + INIT_COMPLETION(arm_state->vc_suspend_complete); + break; + case VC_SUSPEND_REQUESTED: + break; + case VC_SUSPEND_IN_PROGRESS: + set_resume_state(arm_state, VC_RESUME_IDLE); + break; + case VC_SUSPEND_SUSPENDED: + complete_all(&arm_state->vc_suspend_complete); + break; + default: + BUG(); + break; + } +} + +inline void +set_resume_state(VCHIQ_ARM_STATE_T *arm_state, + enum vc_resume_status new_state) +{ + /* set the state in all cases */ + arm_state->vc_resume_state = new_state; + + /* state specific additional actions */ + switch (new_state) { + case VC_RESUME_FAILED: + break; + case VC_RESUME_IDLE: + INIT_COMPLETION(arm_state->vc_resume_complete); + break; + case VC_RESUME_REQUESTED: + break; + case VC_RESUME_IN_PROGRESS: + break; + case VC_RESUME_RESUMED: + complete_all(&arm_state->vc_resume_complete); + set_suspend_state(arm_state, VC_SUSPEND_IDLE); + break; + default: + BUG(); + break; + } +} + + +/* should be called with the write lock held */ +inline void +start_suspend_timer(VCHIQ_ARM_STATE_T *arm_state) +{ + del_timer(&arm_state->suspend_timer); + arm_state->suspend_timer.expires = jiffies + + msecs_to_jiffies(arm_state-> + suspend_timer_timeout); + add_timer(&arm_state->suspend_timer); + arm_state->suspend_timer_running = 1; +} + +/* should be called with the write lock held */ +static inline void +stop_suspend_timer(VCHIQ_ARM_STATE_T *arm_state) +{ + if (arm_state->suspend_timer_running) { + del_timer(&arm_state->suspend_timer); + arm_state->suspend_timer_running = 0; + } +} + +static inline int +need_resume(VCHIQ_STATE_T *state) +{ + VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); + return (arm_state->vc_suspend_state > VC_SUSPEND_IDLE) && + (arm_state->vc_resume_state < VC_RESUME_REQUESTED) && + vchiq_videocore_wanted(state); +} + +static int +block_resume(VCHIQ_ARM_STATE_T *arm_state) +{ + int status = VCHIQ_SUCCESS; + const unsigned long timeout_val = + msecs_to_jiffies(FORCE_SUSPEND_TIMEOUT_MS); + int resume_count = 0; + + /* Allow any threads which were blocked by the last force suspend to + * complete if they haven't already. Only give this one shot; if + * blocked_count is incremented after blocked_blocker is completed + * (which only happens when blocked_count hits 0) then those threads + * will have to wait until next time around */ + if (arm_state->blocked_count) { + INIT_COMPLETION(arm_state->blocked_blocker); + write_unlock_bh(&arm_state->susp_res_lock); + vchiq_log_info(vchiq_susp_log_level, "%s wait for previously " + "blocked clients", __func__); + if (wait_for_completion_interruptible_timeout( + &arm_state->blocked_blocker, timeout_val) + <= 0) { + vchiq_log_error(vchiq_susp_log_level, "%s wait for " + "previously blocked clients failed" , __func__); + status = VCHIQ_ERROR; + write_lock_bh(&arm_state->susp_res_lock); + goto out; + } + vchiq_log_info(vchiq_susp_log_level, "%s previously blocked " + "clients resumed", __func__); + write_lock_bh(&arm_state->susp_res_lock); + } + + /* We need to wait for resume to complete if it's in process */ + while (arm_state->vc_resume_state != VC_RESUME_RESUMED && + arm_state->vc_resume_state > VC_RESUME_IDLE) { + if (resume_count > 1) { + status = VCHIQ_ERROR; + vchiq_log_error(vchiq_susp_log_level, "%s waited too " + "many times for resume" , __func__); + goto out; + } + write_unlock_bh(&arm_state->susp_res_lock); + vchiq_log_info(vchiq_susp_log_level, "%s wait for resume", + __func__); + if (wait_for_completion_interruptible_timeout( + &arm_state->vc_resume_complete, timeout_val) + <= 0) { + vchiq_log_error(vchiq_susp_log_level, "%s wait for " + "resume failed (%s)", __func__, + resume_state_names[arm_state->vc_resume_state + + VC_RESUME_NUM_OFFSET]); + status = VCHIQ_ERROR; + write_lock_bh(&arm_state->susp_res_lock); + goto out; + } + vchiq_log_info(vchiq_susp_log_level, "%s resumed", __func__); + write_lock_bh(&arm_state->susp_res_lock); + resume_count++; + } + INIT_COMPLETION(arm_state->resume_blocker); + arm_state->resume_blocked = 1; + +out: + return status; +} + +static inline void +unblock_resume(VCHIQ_ARM_STATE_T *arm_state) +{ + complete_all(&arm_state->resume_blocker); + arm_state->resume_blocked = 0; +} + +/* Initiate suspend via slot handler. Should be called with the write lock + * held */ +VCHIQ_STATUS_T +vchiq_arm_vcsuspend(VCHIQ_STATE_T *state) +{ + VCHIQ_STATUS_T status = VCHIQ_ERROR; + VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); + + if (!arm_state) + goto out; + + vchiq_log_trace(vchiq_susp_log_level, "%s", __func__); + status = VCHIQ_SUCCESS; + + + switch (arm_state->vc_suspend_state) { + case VC_SUSPEND_REQUESTED: + vchiq_log_info(vchiq_susp_log_level, "%s: suspend already " + "requested", __func__); + break; + case VC_SUSPEND_IN_PROGRESS: + vchiq_log_info(vchiq_susp_log_level, "%s: suspend already in " + "progress", __func__); + break; + + default: + /* We don't expect to be in other states, so log but continue + * anyway */ + vchiq_log_error(vchiq_susp_log_level, + "%s unexpected suspend state %s", __func__, + suspend_state_names[arm_state->vc_suspend_state + + VC_SUSPEND_NUM_OFFSET]); + /* fall through */ + case VC_SUSPEND_REJECTED: + case VC_SUSPEND_FAILED: + /* Ensure any idle state actions have been run */ + set_suspend_state(arm_state, VC_SUSPEND_IDLE); + /* fall through */ + case VC_SUSPEND_IDLE: + vchiq_log_info(vchiq_susp_log_level, + "%s: suspending", __func__); + set_suspend_state(arm_state, VC_SUSPEND_REQUESTED); + /* kick the slot handler thread to initiate suspend */ + request_poll(state, NULL, 0); + break; + } + +out: + vchiq_log_trace(vchiq_susp_log_level, "%s exit %d", __func__, status); + return status; +} + +void +vchiq_platform_check_suspend(VCHIQ_STATE_T *state) +{ + VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); + int susp = 0; + + if (!arm_state) + goto out; + + vchiq_log_trace(vchiq_susp_log_level, "%s", __func__); + + write_lock_bh(&arm_state->susp_res_lock); + if (arm_state->vc_suspend_state == VC_SUSPEND_REQUESTED && + arm_state->vc_resume_state == VC_RESUME_RESUMED) { + set_suspend_state(arm_state, VC_SUSPEND_IN_PROGRESS); + susp = 1; + } + write_unlock_bh(&arm_state->susp_res_lock); + + if (susp) + vchiq_platform_suspend(state); + +out: + vchiq_log_trace(vchiq_susp_log_level, "%s exit", __func__); + return; +} + + +static void +output_timeout_error(VCHIQ_STATE_T *state) +{ + VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); + char service_err[50] = ""; + int vc_use_count = arm_state->videocore_use_count; + int active_services = state->unused_service; + int i; + + if (!arm_state->videocore_use_count) { + snprintf(service_err, 50, " Videocore usecount is 0"); + goto output_msg; + } + for (i = 0; i < active_services; i++) { + VCHIQ_SERVICE_T *service_ptr = state->services[i]; + if (service_ptr && service_ptr->service_use_count && + (service_ptr->srvstate != VCHIQ_SRVSTATE_FREE)) { + snprintf(service_err, 50, " %c%c%c%c(%8x) service has " + "use count %d%s", VCHIQ_FOURCC_AS_4CHARS( + service_ptr->base.fourcc), + service_ptr->client_id, + service_ptr->service_use_count, + service_ptr->service_use_count == + vc_use_count ? "" : " (+ more)"); + break; + } + } + +output_msg: + vchiq_log_error(vchiq_susp_log_level, + "timed out waiting for vc suspend (%d).%s", + arm_state->autosuspend_override, service_err); + +} + +/* Try to get videocore into suspended state, regardless of autosuspend state. +** We don't actually force suspend, since videocore may get into a bad state +** if we force suspend at a bad time. Instead, we wait for autosuspend to +** determine a good point to suspend. If this doesn't happen within 100ms we +** report failure. +** +** Returns VCHIQ_SUCCESS if videocore suspended successfully, VCHIQ_RETRY if +** videocore failed to suspend in time or VCHIQ_ERROR if interrupted. +*/ +VCHIQ_STATUS_T +vchiq_arm_force_suspend(VCHIQ_STATE_T *state) +{ + VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); + VCHIQ_STATUS_T status = VCHIQ_ERROR; + long rc = 0; + int repeat = -1; + + if (!arm_state) + goto out; + + vchiq_log_trace(vchiq_susp_log_level, "%s", __func__); + + write_lock_bh(&arm_state->susp_res_lock); + + status = block_resume(arm_state); + if (status != VCHIQ_SUCCESS) + goto unlock; + if (arm_state->vc_suspend_state == VC_SUSPEND_SUSPENDED) { + /* Already suspended - just block resume and exit */ + vchiq_log_info(vchiq_susp_log_level, "%s already suspended", + __func__); + status = VCHIQ_SUCCESS; + goto unlock; + } else if (arm_state->vc_suspend_state <= VC_SUSPEND_IDLE) { + /* initiate suspend immediately in the case that we're waiting + * for the timeout */ + stop_suspend_timer(arm_state); + if (!vchiq_videocore_wanted(state)) { + vchiq_log_info(vchiq_susp_log_level, "%s videocore " + "idle, initiating suspend", __func__); + status = vchiq_arm_vcsuspend(state); + } else if (arm_state->autosuspend_override < + FORCE_SUSPEND_FAIL_MAX) { + vchiq_log_info(vchiq_susp_log_level, "%s letting " + "videocore go idle", __func__); + status = VCHIQ_SUCCESS; + } else { + vchiq_log_warning(vchiq_susp_log_level, "%s failed too " + "many times - attempting suspend", __func__); + status = vchiq_arm_vcsuspend(state); + } + } else { + vchiq_log_info(vchiq_susp_log_level, "%s videocore suspend " + "in progress - wait for completion", __func__); + status = VCHIQ_SUCCESS; + } + + /* Wait for suspend to happen due to system idle (not forced..) */ + if (status != VCHIQ_SUCCESS) + goto unblock_resume; + + do { + write_unlock_bh(&arm_state->susp_res_lock); + + rc = wait_for_completion_interruptible_timeout( + &arm_state->vc_suspend_complete, + msecs_to_jiffies(FORCE_SUSPEND_TIMEOUT_MS)); + + write_lock_bh(&arm_state->susp_res_lock); + if (rc < 0) { + vchiq_log_warning(vchiq_susp_log_level, "%s " + "interrupted waiting for suspend", __func__); + status = VCHIQ_ERROR; + goto unblock_resume; + } else if (rc == 0) { + if (arm_state->vc_suspend_state > VC_SUSPEND_IDLE) { + /* Repeat timeout once if in progress */ + if (repeat < 0) { + repeat = 1; + continue; + } + } + arm_state->autosuspend_override++; + output_timeout_error(state); + + status = VCHIQ_RETRY; + goto unblock_resume; + } + } while (0 < (repeat--)); + + /* Check and report state in case we need to abort ARM suspend */ + if (arm_state->vc_suspend_state != VC_SUSPEND_SUSPENDED) { + status = VCHIQ_RETRY; + vchiq_log_error(vchiq_susp_log_level, + "%s videocore suspend failed (state %s)", __func__, + suspend_state_names[arm_state->vc_suspend_state + + VC_SUSPEND_NUM_OFFSET]); + /* Reset the state only if it's still in an error state. + * Something could have already initiated another suspend. */ + if (arm_state->vc_suspend_state < VC_SUSPEND_IDLE) + set_suspend_state(arm_state, VC_SUSPEND_IDLE); + + goto unblock_resume; + } + + /* successfully suspended - unlock and exit */ + goto unlock; + +unblock_resume: + /* all error states need to unblock resume before exit */ + unblock_resume(arm_state); + +unlock: + write_unlock_bh(&arm_state->susp_res_lock); + +out: + vchiq_log_trace(vchiq_susp_log_level, "%s exit %d", __func__, status); + return status; +} + +void +vchiq_check_suspend(VCHIQ_STATE_T *state) +{ + VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); + + if (!arm_state) + goto out; + + vchiq_log_trace(vchiq_susp_log_level, "%s", __func__); + + write_lock_bh(&arm_state->susp_res_lock); + if (arm_state->vc_suspend_state != VC_SUSPEND_SUSPENDED && + arm_state->first_connect && + !vchiq_videocore_wanted(state)) { + vchiq_arm_vcsuspend(state); + } + write_unlock_bh(&arm_state->susp_res_lock); + +out: + vchiq_log_trace(vchiq_susp_log_level, "%s exit", __func__); + return; +} + + +int +vchiq_arm_allow_resume(VCHIQ_STATE_T *state) +{ + VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); + int resume = 0; + int ret = -1; + + if (!arm_state) + goto out; + + vchiq_log_trace(vchiq_susp_log_level, "%s", __func__); + + write_lock_bh(&arm_state->susp_res_lock); + unblock_resume(arm_state); + resume = vchiq_check_resume(state); + write_unlock_bh(&arm_state->susp_res_lock); + + if (resume) { + if (wait_for_completion_interruptible( + &arm_state->vc_resume_complete) < 0) { + vchiq_log_error(vchiq_susp_log_level, + "%s interrupted", __func__); + /* failed, cannot accurately derive suspend + * state, so exit early. */ + goto out; + } + } + + read_lock_bh(&arm_state->susp_res_lock); + if (arm_state->vc_suspend_state == VC_SUSPEND_SUSPENDED) { + vchiq_log_info(vchiq_susp_log_level, + "%s: Videocore remains suspended", __func__); + } else { + vchiq_log_info(vchiq_susp_log_level, + "%s: Videocore resumed", __func__); + ret = 0; + } + read_unlock_bh(&arm_state->susp_res_lock); +out: + vchiq_log_trace(vchiq_susp_log_level, "%s exit %d", __func__, ret); + return ret; +} + +/* This function should be called with the write lock held */ +int +vchiq_check_resume(VCHIQ_STATE_T *state) +{ + VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); + int resume = 0; + + if (!arm_state) + goto out; + + vchiq_log_trace(vchiq_susp_log_level, "%s", __func__); + + if (need_resume(state)) { + set_resume_state(arm_state, VC_RESUME_REQUESTED); + request_poll(state, NULL, 0); + resume = 1; + } + +out: + vchiq_log_trace(vchiq_susp_log_level, "%s exit", __func__); + return resume; +} + +#ifdef notyet +void +vchiq_platform_check_resume(VCHIQ_STATE_T *state) +{ + VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); + int res = 0; + + if (!arm_state) + goto out; + + vchiq_log_trace(vchiq_susp_log_level, "%s", __func__); + + write_lock_bh(&arm_state->susp_res_lock); + if (arm_state->wake_address == 0) { + vchiq_log_info(vchiq_susp_log_level, + "%s: already awake", __func__); + goto unlock; + } + if (arm_state->vc_resume_state == VC_RESUME_IN_PROGRESS) { + vchiq_log_info(vchiq_susp_log_level, + "%s: already resuming", __func__); + goto unlock; + } + + if (arm_state->vc_resume_state == VC_RESUME_REQUESTED) { + set_resume_state(arm_state, VC_RESUME_IN_PROGRESS); + res = 1; + } else + vchiq_log_trace(vchiq_susp_log_level, + "%s: not resuming (resume state %s)", __func__, + resume_state_names[arm_state->vc_resume_state + + VC_RESUME_NUM_OFFSET]); + +unlock: + write_unlock_bh(&arm_state->susp_res_lock); + + if (res) + vchiq_platform_resume(state); + +out: + vchiq_log_trace(vchiq_susp_log_level, "%s exit", __func__); + return; + +} +#endif + + + +VCHIQ_STATUS_T +vchiq_use_internal(VCHIQ_STATE_T *state, VCHIQ_SERVICE_T *service, + enum USE_TYPE_E use_type) +{ + VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); + VCHIQ_STATUS_T ret = VCHIQ_SUCCESS; + char entity[16]; + int *entity_uc; + int local_uc, local_entity_uc; + + if (!arm_state) + goto out; + + vchiq_log_trace(vchiq_susp_log_level, "%s", __func__); + + if (use_type == USE_TYPE_VCHIQ) { + snprintf(entity, sizeof(entity), "VCHIQ: "); + entity_uc = &arm_state->peer_use_count; + } else if (service) { + snprintf(entity, sizeof(entity), "%c%c%c%c:%8x", + VCHIQ_FOURCC_AS_4CHARS(service->base.fourcc), + service->client_id); + entity_uc = &service->service_use_count; + } else { + vchiq_log_error(vchiq_susp_log_level, "%s null service " + "ptr", __func__); + ret = VCHIQ_ERROR; + goto out; + } + + write_lock_bh(&arm_state->susp_res_lock); + while (arm_state->resume_blocked) { + /* If we call 'use' while force suspend is waiting for suspend, + * then we're about to block the thread which the force is + * waiting to complete, so we're bound to just time out. In this + * case, set the suspend state such that the wait will be + * canceled, so we can complete as quickly as possible. */ + if (arm_state->resume_blocked && arm_state->vc_suspend_state == + VC_SUSPEND_IDLE) { + set_suspend_state(arm_state, VC_SUSPEND_FORCE_CANCELED); + break; + } + /* If suspend is already in progress then we need to block */ + if (!try_wait_for_completion(&arm_state->resume_blocker)) { + /* Indicate that there are threads waiting on the resume + * blocker. These need to be allowed to complete before + * a _second_ call to force suspend can complete, + * otherwise low priority threads might never actually + * continue */ + arm_state->blocked_count++; + write_unlock_bh(&arm_state->susp_res_lock); + vchiq_log_info(vchiq_susp_log_level, "%s %s resume " + "blocked - waiting...", __func__, entity); + if (wait_for_completion_killable( + &arm_state->resume_blocker) != 0) { + vchiq_log_error(vchiq_susp_log_level, "%s %s " + "wait for resume blocker interrupted", + __func__, entity); + ret = VCHIQ_ERROR; + write_lock_bh(&arm_state->susp_res_lock); + arm_state->blocked_count--; + write_unlock_bh(&arm_state->susp_res_lock); + goto out; + } + vchiq_log_info(vchiq_susp_log_level, "%s %s resume " + "unblocked", __func__, entity); + write_lock_bh(&arm_state->susp_res_lock); + if (--arm_state->blocked_count == 0) + complete_all(&arm_state->blocked_blocker); + } + } + + stop_suspend_timer(arm_state); + + local_uc = ++arm_state->videocore_use_count; + local_entity_uc = ++(*entity_uc); + + /* If there's a pending request which hasn't yet been serviced then + * just clear it. If we're past VC_SUSPEND_REQUESTED state then + * vc_resume_complete will block until we either resume or fail to + * suspend */ + if (arm_state->vc_suspend_state <= VC_SUSPEND_REQUESTED) + set_suspend_state(arm_state, VC_SUSPEND_IDLE); + + if ((use_type != USE_TYPE_SERVICE_NO_RESUME) && need_resume(state)) { + set_resume_state(arm_state, VC_RESUME_REQUESTED); + vchiq_log_info(vchiq_susp_log_level, + "%s %s count %d, state count %d", + __func__, entity, local_entity_uc, local_uc); + request_poll(state, NULL, 0); + } else + vchiq_log_trace(vchiq_susp_log_level, + "%s %s count %d, state count %d", + __func__, entity, *entity_uc, local_uc); + + + write_unlock_bh(&arm_state->susp_res_lock); + + /* Completion is in a done state when we're not suspended, so this won't + * block for the non-suspended case. */ + if (!try_wait_for_completion(&arm_state->vc_resume_complete)) { + vchiq_log_info(vchiq_susp_log_level, "%s %s wait for resume", + __func__, entity); + if (wait_for_completion_killable( + &arm_state->vc_resume_complete) != 0) { + vchiq_log_error(vchiq_susp_log_level, "%s %s wait for " + "resume interrupted", __func__, entity); + ret = VCHIQ_ERROR; + goto out; + } + vchiq_log_info(vchiq_susp_log_level, "%s %s resumed", __func__, + entity); + } + + if (ret == VCHIQ_SUCCESS) { + VCHIQ_STATUS_T status = VCHIQ_SUCCESS; + long ack_cnt = atomic_xchg(&arm_state->ka_use_ack_count, 0); + while (ack_cnt && (status == VCHIQ_SUCCESS)) { + /* Send the use notify to videocore */ + status = vchiq_send_remote_use_active(state); + if (status == VCHIQ_SUCCESS) + ack_cnt--; + else + atomic_add(ack_cnt, + &arm_state->ka_use_ack_count); + } + } + +out: + vchiq_log_trace(vchiq_susp_log_level, "%s exit %d", __func__, ret); + return ret; +} + +VCHIQ_STATUS_T +vchiq_release_internal(VCHIQ_STATE_T *state, VCHIQ_SERVICE_T *service) +{ + VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); + VCHIQ_STATUS_T ret = VCHIQ_SUCCESS; + char entity[16]; + int *entity_uc; + int local_uc, local_entity_uc; + + if (!arm_state) + goto out; + + vchiq_log_trace(vchiq_susp_log_level, "%s", __func__); + + if (service) { + snprintf(entity, sizeof(entity), "%c%c%c%c:%8x", + VCHIQ_FOURCC_AS_4CHARS(service->base.fourcc), + service->client_id); + entity_uc = &service->service_use_count; + } else { + snprintf(entity, sizeof(entity), "PEER: "); + entity_uc = &arm_state->peer_use_count; + } + + write_lock_bh(&arm_state->susp_res_lock); + if (!arm_state->videocore_use_count || !(*entity_uc)) { + /* Don't use BUG_ON - don't allow user thread to crash kernel */ + WARN_ON(!arm_state->videocore_use_count); + WARN_ON(!(*entity_uc)); + ret = VCHIQ_ERROR; + goto unlock; + } + --arm_state->videocore_use_count; + --(*entity_uc); + + if (!vchiq_videocore_wanted(state)) { + if (vchiq_platform_use_suspend_timer() && + !arm_state->resume_blocked) { + /* Only use the timer if we're not trying to force + * suspend (=> resume_blocked) */ + start_suspend_timer(arm_state); + } else { + vchiq_log_info(vchiq_susp_log_level, + "%s %s count %d, state count %d - suspending", + __func__, entity, *entity_uc, + arm_state->videocore_use_count); + vchiq_arm_vcsuspend(state); + } + } else + vchiq_log_trace(vchiq_susp_log_level, + "%s %s count %d, state count %d", + __func__, entity, *entity_uc, + arm_state->videocore_use_count); + +unlock: + write_unlock_bh(&arm_state->susp_res_lock); + +out: + vchiq_log_trace(vchiq_susp_log_level, "%s exit %d", __func__, ret); + return ret; +} + +void +vchiq_on_remote_use(VCHIQ_STATE_T *state) +{ + VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); + vchiq_log_trace(vchiq_susp_log_level, "%s", __func__); + atomic_inc(&arm_state->ka_use_count); + complete(&arm_state->ka_evt); +} + +void +vchiq_on_remote_release(VCHIQ_STATE_T *state) +{ + VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); + vchiq_log_trace(vchiq_susp_log_level, "%s", __func__); + atomic_inc(&arm_state->ka_release_count); + complete(&arm_state->ka_evt); +} + +VCHIQ_STATUS_T +vchiq_use_service_internal(VCHIQ_SERVICE_T *service) +{ + return vchiq_use_internal(service->state, service, USE_TYPE_SERVICE); +} + +VCHIQ_STATUS_T +vchiq_release_service_internal(VCHIQ_SERVICE_T *service) +{ + return vchiq_release_internal(service->state, service); +} + +static void suspend_timer_callback(unsigned long context) +{ + VCHIQ_STATE_T *state = (VCHIQ_STATE_T *)context; + VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); + if (!arm_state) + goto out; + vchiq_log_info(vchiq_susp_log_level, + "%s - suspend timer expired - check suspend", __func__); + vchiq_check_suspend(state); +out: + return; +} + +VCHIQ_STATUS_T +vchiq_use_service_no_resume(VCHIQ_SERVICE_HANDLE_T handle) +{ + VCHIQ_STATUS_T ret = VCHIQ_ERROR; + VCHIQ_SERVICE_T *service = find_service_by_handle(handle); + if (service) { + ret = vchiq_use_internal(service->state, service, + USE_TYPE_SERVICE_NO_RESUME); + unlock_service(service); + } + return ret; +} + +VCHIQ_STATUS_T +vchiq_use_service(VCHIQ_SERVICE_HANDLE_T handle) +{ + VCHIQ_STATUS_T ret = VCHIQ_ERROR; + VCHIQ_SERVICE_T *service = find_service_by_handle(handle); + if (service) { + ret = vchiq_use_internal(service->state, service, + USE_TYPE_SERVICE); + unlock_service(service); + } + return ret; +} + +VCHIQ_STATUS_T +vchiq_release_service(VCHIQ_SERVICE_HANDLE_T handle) +{ + VCHIQ_STATUS_T ret = VCHIQ_ERROR; + VCHIQ_SERVICE_T *service = find_service_by_handle(handle); + if (service) { + ret = vchiq_release_internal(service->state, service); + unlock_service(service); + } + return ret; +} + +void +vchiq_dump_service_use_state(VCHIQ_STATE_T *state) +{ + VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); + int i, j = 0; + /* Only dump 64 services */ + static const int local_max_services = 64; + /* If there's more than 64 services, only dump ones with + * non-zero counts */ + int only_nonzero = 0; + static const char *nz = "<-- preventing suspend"; + + enum vc_suspend_status vc_suspend_state; + enum vc_resume_status vc_resume_state; + int peer_count; + int vc_use_count; + int active_services; + struct service_data_struct { + int fourcc; + int clientid; + int use_count; + } service_data[local_max_services]; + + if (!arm_state) + return; + + read_lock_bh(&arm_state->susp_res_lock); + vc_suspend_state = arm_state->vc_suspend_state; + vc_resume_state = arm_state->vc_resume_state; + peer_count = arm_state->peer_use_count; + vc_use_count = arm_state->videocore_use_count; + active_services = state->unused_service; + if (active_services > local_max_services) + only_nonzero = 1; + + for (i = 0; (i < active_services) && (j < local_max_services); i++) { + VCHIQ_SERVICE_T *service_ptr = state->services[i]; + if (!service_ptr) + continue; + + if (only_nonzero && !service_ptr->service_use_count) + continue; + + if (service_ptr->srvstate != VCHIQ_SRVSTATE_FREE) { + service_data[j].fourcc = service_ptr->base.fourcc; + service_data[j].clientid = service_ptr->client_id; + service_data[j++].use_count = service_ptr-> + service_use_count; + } + } + + read_unlock_bh(&arm_state->susp_res_lock); + + vchiq_log_warning(vchiq_susp_log_level, + "-- Videcore suspend state: %s --", + suspend_state_names[vc_suspend_state + VC_SUSPEND_NUM_OFFSET]); + vchiq_log_warning(vchiq_susp_log_level, + "-- Videcore resume state: %s --", + resume_state_names[vc_resume_state + VC_RESUME_NUM_OFFSET]); + + if (only_nonzero) + vchiq_log_warning(vchiq_susp_log_level, "Too many active " + "services (%d). Only dumping up to first %d services " + "with non-zero use-count", active_services, + local_max_services); + + for (i = 0; i < j; i++) { + vchiq_log_warning(vchiq_susp_log_level, + "----- %c%c%c%c:%d service count %d %s", + VCHIQ_FOURCC_AS_4CHARS(service_data[i].fourcc), + service_data[i].clientid, + service_data[i].use_count, + service_data[i].use_count ? nz : ""); + } + vchiq_log_warning(vchiq_susp_log_level, + "----- VCHIQ use count count %d", peer_count); + vchiq_log_warning(vchiq_susp_log_level, + "--- Overall vchiq instance use count %d", vc_use_count); + + vchiq_dump_platform_use_state(state); +} + +VCHIQ_STATUS_T +vchiq_check_service(VCHIQ_SERVICE_T *service) +{ + VCHIQ_ARM_STATE_T *arm_state; + VCHIQ_STATUS_T ret = VCHIQ_ERROR; + + if (!service || !service->state) + goto out; + + vchiq_log_trace(vchiq_susp_log_level, "%s", __func__); + + arm_state = vchiq_platform_get_arm_state(service->state); + + read_lock_bh(&arm_state->susp_res_lock); + if (service->service_use_count) + ret = VCHIQ_SUCCESS; + read_unlock_bh(&arm_state->susp_res_lock); + + if (ret == VCHIQ_ERROR) { + vchiq_log_error(vchiq_susp_log_level, + "%s ERROR - %c%c%c%c:%8x service count %d, " + "state count %d, videocore suspend state %s", __func__, + VCHIQ_FOURCC_AS_4CHARS(service->base.fourcc), + service->client_id, service->service_use_count, + arm_state->videocore_use_count, + suspend_state_names[arm_state->vc_suspend_state + + VC_SUSPEND_NUM_OFFSET]); + vchiq_dump_service_use_state(service->state); + } +out: + return ret; +} + +/* stub functions */ +void vchiq_on_remote_use_active(VCHIQ_STATE_T *state) +{ + (void)state; +} + +void vchiq_platform_conn_state_changed(VCHIQ_STATE_T *state, + VCHIQ_CONNSTATE_T oldstate, VCHIQ_CONNSTATE_T newstate) +{ + VCHIQ_ARM_STATE_T *arm_state = vchiq_platform_get_arm_state(state); + vchiq_log_info(vchiq_susp_log_level, "%d: %s->%s", state->id, + get_conn_state_name(oldstate), get_conn_state_name(newstate)); + if (state->conn_state == VCHIQ_CONNSTATE_CONNECTED) { + write_lock_bh(&arm_state->susp_res_lock); + if (!arm_state->first_connect) { + char threadname[10]; + arm_state->first_connect = 1; + write_unlock_bh(&arm_state->susp_res_lock); + snprintf(threadname, sizeof(threadname), "VCHIQka-%d", + state->id); + arm_state->ka_thread = vchiq_thread_create( + &vchiq_keepalive_thread_func, + (void *)state, + threadname); + if (arm_state->ka_thread == NULL) { + vchiq_log_error(vchiq_susp_log_level, + "vchiq: FATAL: couldn't create thread %s", + threadname); + } else { + wake_up_process(arm_state->ka_thread); + } + } else + write_unlock_bh(&arm_state->susp_res_lock); + } +} + +/**************************************************************************** +* +* vchiq_init - called when the module is loaded. +* +***************************************************************************/ + +int __init vchiq_init(void); +int __init +vchiq_init(void) +{ + int err; + +#ifdef notyet + /* create proc entries */ + err = vchiq_proc_init(); + if (err != 0) + goto failed_proc_init; +#endif + + vchiq_cdev = make_dev(&vchiq_cdevsw, 0, + UID_ROOT, GID_WHEEL, 0600, "vchiq"); + if (!vchiq_cdev) { + printf("Failed to create /dev/vchiq"); + return (-ENXIO); + } + + spin_lock_init(&msg_queue_spinlock); + + err = vchiq_platform_init(&g_state); + if (err != 0) + goto failed_platform_init; + + vchiq_log_info(vchiq_arm_log_level, + "vchiq: initialised - version %d (min %d)", + VCHIQ_VERSION, VCHIQ_VERSION_MIN); + + return 0; + +failed_platform_init: + if (vchiq_cdev) { + destroy_dev(vchiq_cdev); + vchiq_cdev = NULL; + } + vchiq_log_warning(vchiq_arm_log_level, "could not load vchiq"); + return err; +} + +#ifdef notyet +static int vchiq_instance_get_use_count(VCHIQ_INSTANCE_T instance) +{ + VCHIQ_SERVICE_T *service; + int use_count = 0, i; + i = 0; + while ((service = next_service_by_instance(instance->state, + instance, &i)) != NULL) { + use_count += service->service_use_count; + unlock_service(service); + } + return use_count; +} + +/* read the per-process use-count */ +static int proc_read_use_count(char *page, char **start, + off_t off, int count, + int *eof, void *data) +{ + VCHIQ_INSTANCE_T instance = data; + int len, use_count; + + use_count = vchiq_instance_get_use_count(instance); + len = snprintf(page+off, count, "%d\n", use_count); + + return len; +} + +/* add an instance (process) to the proc entries */ +static int vchiq_proc_add_instance(VCHIQ_INSTANCE_T instance) +{ + char pidstr[32]; + struct proc_dir_entry *top, *use_count; + struct proc_dir_entry *clients = vchiq_clients_top(); + int pid = instance->pid; + + snprintf(pidstr, sizeof(pidstr), "%d", pid); + top = proc_mkdir(pidstr, clients); + if (!top) + goto fail_top; + + use_count = create_proc_read_entry("use_count", + 0444, top, + proc_read_use_count, + instance); + if (!use_count) + goto fail_use_count; + + instance->proc_entry = top; + + return 0; + +fail_use_count: + remove_proc_entry(top->name, clients); +fail_top: + return -ENOMEM; +} + +static void vchiq_proc_remove_instance(VCHIQ_INSTANCE_T instance) +{ + struct proc_dir_entry *clients = vchiq_clients_top(); + remove_proc_entry("use_count", instance->proc_entry); + remove_proc_entry(instance->proc_entry->name, clients); +} + +#endif + +/**************************************************************************** +* +* vchiq_exit - called when the module is unloaded. +* +***************************************************************************/ + +void vchiq_exit(void); +void +vchiq_exit(void) +{ + if (vchiq_ehtag == NULL) + EVENTHANDLER_DEREGISTER(dev_clone, vchiq_ehtag); + vchiq_ehtag = NULL; + + vchiq_platform_exit(&g_state); + if (vchiq_cdev) { + destroy_dev(vchiq_cdev); + vchiq_cdev = NULL; + } +} Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_arm.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_arm.h =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_arm.h (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_arm.h (revision 278312) @@ -0,0 +1,200 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef VCHIQ_ARM_H +#define VCHIQ_ARM_H + +#include "vchiq_core.h" + + +enum vc_suspend_status { + VC_SUSPEND_FORCE_CANCELED = -3, /* Force suspend canceled, too busy */ + VC_SUSPEND_REJECTED = -2, /* Videocore rejected suspend request */ + VC_SUSPEND_FAILED = -1, /* Videocore suspend failed */ + VC_SUSPEND_IDLE = 0, /* VC active, no suspend actions */ + VC_SUSPEND_REQUESTED, /* User has requested suspend */ + VC_SUSPEND_IN_PROGRESS, /* Slot handler has recvd suspend request */ + VC_SUSPEND_SUSPENDED /* Videocore suspend succeeded */ +}; + +enum vc_resume_status { + VC_RESUME_FAILED = -1, /* Videocore resume failed */ + VC_RESUME_IDLE = 0, /* VC suspended, no resume actions */ + VC_RESUME_REQUESTED, /* User has requested resume */ + VC_RESUME_IN_PROGRESS, /* Slot handler has received resume request */ + VC_RESUME_RESUMED /* Videocore resumed successfully (active) */ +}; + + +enum USE_TYPE_E { + USE_TYPE_SERVICE, + USE_TYPE_SERVICE_NO_RESUME, + USE_TYPE_VCHIQ +}; + + + +typedef struct vchiq_arm_state_struct { + /* Keepalive-related data */ + VCHIQ_THREAD_T ka_thread; + struct completion ka_evt; + atomic_t ka_use_count; + atomic_t ka_use_ack_count; + atomic_t ka_release_count; + + struct completion vc_suspend_complete; + struct completion vc_resume_complete; + + rwlock_t susp_res_lock; + enum vc_suspend_status vc_suspend_state; + enum vc_resume_status vc_resume_state; + + unsigned int wake_address; + + struct timer_list suspend_timer; + int suspend_timer_timeout; + int suspend_timer_running; + + /* Global use count for videocore. + ** This is equal to the sum of the use counts for all services. When + ** this hits zero the videocore suspend procedure will be initiated. + */ + int videocore_use_count; + + /* Use count to track requests from videocore peer. + ** This use count is not associated with a service, so needs to be + ** tracked separately with the state. + */ + int peer_use_count; + + /* Flag to indicate whether resume is blocked. This happens when the + ** ARM is suspending + */ + struct completion resume_blocker; + int resume_blocked; + struct completion blocked_blocker; + int blocked_count; + + int autosuspend_override; + + /* Flag to indicate that the first vchiq connect has made it through. + ** This means that both sides should be fully ready, and we should + ** be able to suspend after this point. + */ + int first_connect; + + unsigned long long suspend_start_time; + unsigned long long sleep_start_time; + unsigned long long resume_start_time; + unsigned long long last_wake_time; + +} VCHIQ_ARM_STATE_T; + +extern int vchiq_arm_log_level; +extern int vchiq_susp_log_level; + +extern int __init +vchiq_platform_init(VCHIQ_STATE_T *state); + +extern void __exit +vchiq_platform_exit(VCHIQ_STATE_T *state); + +extern VCHIQ_STATE_T * +vchiq_get_state(void); + +extern VCHIQ_STATUS_T +vchiq_arm_vcsuspend(VCHIQ_STATE_T *state); + +extern VCHIQ_STATUS_T +vchiq_arm_force_suspend(VCHIQ_STATE_T *state); + +extern int +vchiq_arm_allow_resume(VCHIQ_STATE_T *state); + +extern VCHIQ_STATUS_T +vchiq_arm_vcresume(VCHIQ_STATE_T *state); + +extern VCHIQ_STATUS_T +vchiq_arm_init_state(VCHIQ_STATE_T *state, VCHIQ_ARM_STATE_T *arm_state); + +extern int +vchiq_check_resume(VCHIQ_STATE_T *state); + +extern void +vchiq_check_suspend(VCHIQ_STATE_T *state); + +extern VCHIQ_STATUS_T +vchiq_platform_suspend(VCHIQ_STATE_T *state); + +extern int +vchiq_platform_videocore_wanted(VCHIQ_STATE_T *state); + +extern int +vchiq_platform_use_suspend_timer(void); + +extern void +vchiq_dump_platform_use_state(VCHIQ_STATE_T *state); + +extern void +vchiq_dump_service_use_state(VCHIQ_STATE_T *state); + +extern VCHIQ_ARM_STATE_T* +vchiq_platform_get_arm_state(VCHIQ_STATE_T *state); + +extern int +vchiq_videocore_wanted(VCHIQ_STATE_T *state); + +extern VCHIQ_STATUS_T +vchiq_use_internal(VCHIQ_STATE_T *state, VCHIQ_SERVICE_T *service, + enum USE_TYPE_E use_type); +extern VCHIQ_STATUS_T +vchiq_release_internal(VCHIQ_STATE_T *state, VCHIQ_SERVICE_T *service); + +void +set_suspend_state(VCHIQ_ARM_STATE_T *arm_state, + enum vc_suspend_status new_state); + +void +set_resume_state(VCHIQ_ARM_STATE_T *arm_state, + enum vc_resume_status new_state); + +void +start_suspend_timer(VCHIQ_ARM_STATE_T *arm_state); + +extern int vchiq_proc_init(void); +extern void vchiq_proc_deinit(void); +extern struct proc_dir_entry *vchiq_proc_top(void); +extern struct proc_dir_entry *vchiq_clients_top(void); + + +#endif /* VCHIQ_ARM_H */ Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_arm.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_build_info.h =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_build_info.h (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_build_info.h (revision 278312) @@ -0,0 +1,37 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +const char *vchiq_get_build_hostname(void); +const char *vchiq_get_build_version(void); +const char *vchiq_get_build_time(void); +const char *vchiq_get_build_date(void); Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_build_info.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_cfg.h =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_cfg.h (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_cfg.h (revision 278312) @@ -0,0 +1,60 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef VCHIQ_CFG_H +#define VCHIQ_CFG_H + +#define VCHIQ_MAGIC VCHIQ_MAKE_FOURCC('V', 'C', 'H', 'I') +/* The version of VCHIQ - change with any non-trivial change */ +#define VCHIQ_VERSION 6 +/* The minimum compatible version - update to match VCHIQ_VERSION with any +** incompatible change */ +#define VCHIQ_VERSION_MIN 3 + +#define VCHIQ_MAX_STATES 1 +#define VCHIQ_MAX_SERVICES 4096 +#define VCHIQ_MAX_SLOTS 128 +#define VCHIQ_MAX_SLOTS_PER_SIDE 64 + +#define VCHIQ_NUM_CURRENT_BULKS 32 +#define VCHIQ_NUM_SERVICE_BULKS 4 + +#ifndef VCHIQ_ENABLE_DEBUG +#define VCHIQ_ENABLE_DEBUG 1 +#endif + +#ifndef VCHIQ_ENABLE_STATS +#define VCHIQ_ENABLE_STATS 1 +#endif + +#endif /* VCHIQ_CFG_H */ Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_cfg.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_connected.c =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_connected.c (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_connected.c (revision 278312) @@ -0,0 +1,117 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "vchiq_connected.h" +#include "vchiq_core.h" + +#define MAX_CALLBACKS 10 + +static int g_connected; +static int g_num_deferred_callbacks; +static VCHIQ_CONNECTED_CALLBACK_T g_deferred_callback[MAX_CALLBACKS]; +static int g_once_init; +static struct mutex g_connected_mutex; + +/**************************************************************************** +* +* Function to initialize our lock. +* +***************************************************************************/ + +static void connected_init(void) +{ + if (!g_once_init) { + lmutex_init(&g_connected_mutex); + g_once_init = 1; + } +} + +/**************************************************************************** +* +* This function is used to defer initialization until the vchiq stack is +* initialized. If the stack is already initialized, then the callback will +* be made immediately, otherwise it will be deferred until +* vchiq_call_connected_callbacks is called. +* +***************************************************************************/ + +void vchiq_add_connected_callback(VCHIQ_CONNECTED_CALLBACK_T callback) +{ + connected_init(); + + if (lmutex_lock_interruptible(&g_connected_mutex) != 0) + return; + + if (g_connected) + /* We're already connected. Call the callback immediately. */ + + callback(); + else { + if (g_num_deferred_callbacks >= MAX_CALLBACKS) + vchiq_log_error(vchiq_core_log_level, + "There are already %d callbacks registered - " + "please increase MAX_CALLBACKS", + g_num_deferred_callbacks); + else { + g_deferred_callback[g_num_deferred_callbacks] = + callback; + g_num_deferred_callbacks++; + } + } + lmutex_unlock(&g_connected_mutex); +} + +/**************************************************************************** +* +* This function is called by the vchiq stack once it has been connected to +* the videocore and clients can start to use the stack. +* +***************************************************************************/ + +void vchiq_call_connected_callbacks(void) +{ + int i; + + connected_init(); + + if (lmutex_lock_interruptible(&g_connected_mutex) != 0) + return; + + for (i = 0; i < g_num_deferred_callbacks; i++) + g_deferred_callback[i](); + + g_num_deferred_callbacks = 0; + g_connected = 1; + lmutex_unlock(&g_connected_mutex); +} +EXPORT_SYMBOL(vchiq_add_connected_callback); Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_connected.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_connected.h =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_connected.h (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_connected.h (revision 278312) @@ -0,0 +1,51 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef VCHIQ_CONNECTED_H +#define VCHIQ_CONNECTED_H + +/* ---- Include Files ----------------------------------------------------- */ + +/* ---- Constants and Types ---------------------------------------------- */ + +typedef void (*VCHIQ_CONNECTED_CALLBACK_T)(void); + +/* ---- Variable Externs ------------------------------------------------- */ + +/* ---- Function Prototypes ---------------------------------------------- */ + +void vchiq_add_connected_callback(VCHIQ_CONNECTED_CALLBACK_T callback); +void vchiq_call_connected_callbacks(void); + +#endif /* VCHIQ_CONNECTED_H */ + Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_connected.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_core.c =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_core.c (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_core.c (revision 278312) @@ -0,0 +1,3842 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "vchiq_core.h" + +#define VCHIQ_SLOT_HANDLER_STACK 8192 + +#define HANDLE_STATE_SHIFT 12 + +#define SLOT_INFO_FROM_INDEX(state, index) (state->slot_info + (index)) +#define SLOT_DATA_FROM_INDEX(state, index) (state->slot_data + (index)) +#define SLOT_INDEX_FROM_DATA(state, data) \ + (((unsigned int)((char *)data - (char *)state->slot_data)) / \ + VCHIQ_SLOT_SIZE) +#define SLOT_INDEX_FROM_INFO(state, info) \ + ((unsigned int)(info - state->slot_info)) +#define SLOT_QUEUE_INDEX_FROM_POS(pos) \ + ((int)((unsigned int)(pos) / VCHIQ_SLOT_SIZE)) + + +#define BULK_INDEX(x) (x & (VCHIQ_NUM_SERVICE_BULKS - 1)) + + +struct vchiq_open_payload { + int fourcc; + int client_id; + short version; + short version_min; +}; + +struct vchiq_openack_payload { + short version; +}; + +/* we require this for consistency between endpoints */ +vchiq_static_assert(sizeof(VCHIQ_HEADER_T) == 8); +vchiq_static_assert(IS_POW2(sizeof(VCHIQ_HEADER_T))); +vchiq_static_assert(IS_POW2(VCHIQ_NUM_CURRENT_BULKS)); +vchiq_static_assert(IS_POW2(VCHIQ_NUM_SERVICE_BULKS)); +vchiq_static_assert(IS_POW2(VCHIQ_MAX_SERVICES)); +vchiq_static_assert(VCHIQ_VERSION >= VCHIQ_VERSION_MIN); + +/* Run time control of log level, based on KERN_XXX level. */ +int vchiq_core_log_level = VCHIQ_LOG_DEFAULT; +int vchiq_core_msg_log_level = VCHIQ_LOG_DEFAULT; +int vchiq_sync_log_level = VCHIQ_LOG_DEFAULT; + +static atomic_t pause_bulks_count = ATOMIC_INIT(0); + +static DEFINE_SPINLOCK(service_spinlock); +DEFINE_SPINLOCK(bulk_waiter_spinlock); +DEFINE_SPINLOCK(quota_spinlock); + +void +vchiq_core_initialize(void) +{ + spin_lock_init(&service_spinlock); + spin_lock_init(&bulk_waiter_spinlock); + spin_lock_init("a_spinlock); +} + +VCHIQ_STATE_T *vchiq_states[VCHIQ_MAX_STATES]; +static unsigned int handle_seq; + +static const char *const srvstate_names[] = { + "FREE", + "HIDDEN", + "LISTENING", + "OPENING", + "OPEN", + "OPENSYNC", + "CLOSESENT", + "CLOSERECVD", + "CLOSEWAIT", + "CLOSED" +}; + +static const char *const reason_names[] = { + "SERVICE_OPENED", + "SERVICE_CLOSED", + "MESSAGE_AVAILABLE", + "BULK_TRANSMIT_DONE", + "BULK_RECEIVE_DONE", + "BULK_TRANSMIT_ABORTED", + "BULK_RECEIVE_ABORTED" +}; + +static const char *const conn_state_names[] = { + "DISCONNECTED", + "CONNECTING", + "CONNECTED", + "PAUSING", + "PAUSE_SENT", + "PAUSED", + "RESUMING", + "PAUSE_TIMEOUT", + "RESUME_TIMEOUT" +}; + + +static void +release_message_sync(VCHIQ_STATE_T *state, VCHIQ_HEADER_T *header); + +static const char *msg_type_str(unsigned int msg_type) +{ + switch (msg_type) { + case VCHIQ_MSG_PADDING: return "PADDING"; + case VCHIQ_MSG_CONNECT: return "CONNECT"; + case VCHIQ_MSG_OPEN: return "OPEN"; + case VCHIQ_MSG_OPENACK: return "OPENACK"; + case VCHIQ_MSG_CLOSE: return "CLOSE"; + case VCHIQ_MSG_DATA: return "DATA"; + case VCHIQ_MSG_BULK_RX: return "BULK_RX"; + case VCHIQ_MSG_BULK_TX: return "BULK_TX"; + case VCHIQ_MSG_BULK_RX_DONE: return "BULK_RX_DONE"; + case VCHIQ_MSG_BULK_TX_DONE: return "BULK_TX_DONE"; + case VCHIQ_MSG_PAUSE: return "PAUSE"; + case VCHIQ_MSG_RESUME: return "RESUME"; + case VCHIQ_MSG_REMOTE_USE: return "REMOTE_USE"; + case VCHIQ_MSG_REMOTE_RELEASE: return "REMOTE_RELEASE"; + case VCHIQ_MSG_REMOTE_USE_ACTIVE: return "REMOTE_USE_ACTIVE"; + } + return "???"; +} + +static inline void +vchiq_set_service_state(VCHIQ_SERVICE_T *service, int newstate) +{ + vchiq_log_info(vchiq_core_log_level, "%d: srv:%d %s->%s", + service->state->id, service->localport, + srvstate_names[service->srvstate], + srvstate_names[newstate]); + service->srvstate = newstate; +} + +VCHIQ_SERVICE_T * +find_service_by_handle(VCHIQ_SERVICE_HANDLE_T handle) +{ + VCHIQ_SERVICE_T *service; + + spin_lock(&service_spinlock); + service = handle_to_service(handle); + if (service && (service->srvstate != VCHIQ_SRVSTATE_FREE) && + (service->handle == handle)) { + BUG_ON(service->ref_count == 0); + service->ref_count++; + } else + service = NULL; + spin_unlock(&service_spinlock); + + if (!service) + vchiq_log_info(vchiq_core_log_level, + "Invalid service handle 0x%x", handle); + + return service; +} + +VCHIQ_SERVICE_T * +find_service_by_port(VCHIQ_STATE_T *state, int localport) +{ + VCHIQ_SERVICE_T *service = NULL; + if ((unsigned int)localport <= VCHIQ_PORT_MAX) { + spin_lock(&service_spinlock); + service = state->services[localport]; + if (service && (service->srvstate != VCHIQ_SRVSTATE_FREE)) { + BUG_ON(service->ref_count == 0); + service->ref_count++; + } else + service = NULL; + spin_unlock(&service_spinlock); + } + + if (!service) + vchiq_log_info(vchiq_core_log_level, + "Invalid port %d", localport); + + return service; +} + +VCHIQ_SERVICE_T * +find_service_for_instance(VCHIQ_INSTANCE_T instance, + VCHIQ_SERVICE_HANDLE_T handle) { + VCHIQ_SERVICE_T *service; + + spin_lock(&service_spinlock); + service = handle_to_service(handle); + if (service && (service->srvstate != VCHIQ_SRVSTATE_FREE) && + (service->handle == handle) && + (service->instance == instance)) { + BUG_ON(service->ref_count == 0); + service->ref_count++; + } else + service = NULL; + spin_unlock(&service_spinlock); + + if (!service) + vchiq_log_info(vchiq_core_log_level, + "Invalid service handle 0x%x", handle); + + return service; +} + +VCHIQ_SERVICE_T * +next_service_by_instance(VCHIQ_STATE_T *state, VCHIQ_INSTANCE_T instance, + int *pidx) +{ + VCHIQ_SERVICE_T *service = NULL; + int idx = *pidx; + + spin_lock(&service_spinlock); + while (idx < state->unused_service) { + VCHIQ_SERVICE_T *srv = state->services[idx++]; + if (srv && (srv->srvstate != VCHIQ_SRVSTATE_FREE) && + (srv->instance == instance)) { + service = srv; + BUG_ON(service->ref_count == 0); + service->ref_count++; + break; + } + } + spin_unlock(&service_spinlock); + + *pidx = idx; + + return service; +} + +void +lock_service(VCHIQ_SERVICE_T *service) +{ + spin_lock(&service_spinlock); + BUG_ON(!service || (service->ref_count == 0)); + if (service) + service->ref_count++; + spin_unlock(&service_spinlock); +} + +void +unlock_service(VCHIQ_SERVICE_T *service) +{ + VCHIQ_STATE_T *state = service->state; + spin_lock(&service_spinlock); + BUG_ON(!service || (service->ref_count == 0)); + if (service && service->ref_count) { + service->ref_count--; + if (!service->ref_count) { + BUG_ON(service->srvstate != VCHIQ_SRVSTATE_FREE); + state->services[service->localport] = NULL; + + _sema_destroy(&service->remove_event); + _sema_destroy(&service->bulk_remove_event); + lmutex_destroy(&service->bulk_mutex); + } else + service = NULL; + } + spin_unlock(&service_spinlock); + + if (service && service->userdata_term) + service->userdata_term(service->base.userdata); + + kfree(service); +} + +int +vchiq_get_client_id(VCHIQ_SERVICE_HANDLE_T handle) +{ + VCHIQ_SERVICE_T *service = find_service_by_handle(handle); + int id; + + id = service ? service->client_id : 0; + if (service) + unlock_service(service); + + return id; +} + +void * +vchiq_get_service_userdata(VCHIQ_SERVICE_HANDLE_T handle) +{ + VCHIQ_SERVICE_T *service = handle_to_service(handle); + + return service ? service->base.userdata : NULL; +} + +int +vchiq_get_service_fourcc(VCHIQ_SERVICE_HANDLE_T handle) +{ + VCHIQ_SERVICE_T *service = handle_to_service(handle); + + return service ? service->base.fourcc : 0; +} + +static void +mark_service_closing_internal(VCHIQ_SERVICE_T *service, int sh_thread) +{ + VCHIQ_STATE_T *state = service->state; + VCHIQ_SERVICE_QUOTA_T *service_quota; + + service->closing = 1; + + /* Synchronise with other threads. */ + lmutex_lock(&state->recycle_mutex); + lmutex_unlock(&state->recycle_mutex); + if (!sh_thread || (state->conn_state != VCHIQ_CONNSTATE_PAUSE_SENT)) { + /* If we're pausing then the slot_mutex is held until resume + * by the slot handler. Therefore don't try to acquire this + * mutex if we're the slot handler and in the pause sent state. + * We don't need to in this case anyway. */ + lmutex_lock(&state->slot_mutex); + lmutex_unlock(&state->slot_mutex); + } + + /* Unblock any sending thread. */ + service_quota = &state->service_quotas[service->localport]; + up(&service_quota->quota_event); +} + +static void +mark_service_closing(VCHIQ_SERVICE_T *service) +{ + mark_service_closing_internal(service, 0); +} + +static inline VCHIQ_STATUS_T +make_service_callback(VCHIQ_SERVICE_T *service, VCHIQ_REASON_T reason, + VCHIQ_HEADER_T *header, void *bulk_userdata) +{ + VCHIQ_STATUS_T status; + vchiq_log_trace(vchiq_core_log_level, "%d: callback:%d (%s, %x, %x)", + service->state->id, service->localport, reason_names[reason], + (unsigned int)header, (unsigned int)bulk_userdata); + status = service->base.callback(reason, header, service->handle, + bulk_userdata); + if (status == VCHIQ_ERROR) { + vchiq_log_warning(vchiq_core_log_level, + "%d: ignoring ERROR from callback to service %x", + service->state->id, service->handle); + status = VCHIQ_SUCCESS; + } + return status; +} + +inline void +vchiq_set_conn_state(VCHIQ_STATE_T *state, VCHIQ_CONNSTATE_T newstate) +{ + VCHIQ_CONNSTATE_T oldstate = state->conn_state; + vchiq_log_info(vchiq_core_log_level, "%d: %s->%s", state->id, + conn_state_names[oldstate], + conn_state_names[newstate]); + state->conn_state = newstate; + vchiq_platform_conn_state_changed(state, oldstate, newstate); +} + +static inline void +remote_event_create(REMOTE_EVENT_T *event) +{ + event->armed = 0; + /* Don't clear the 'fired' flag because it may already have been set + ** by the other side. */ + _sema_init(event->event, 0); +} + +__unused static inline void +remote_event_destroy(REMOTE_EVENT_T *event) +{ + (void)event; +} + +static inline int +remote_event_wait(REMOTE_EVENT_T *event) +{ + if (!event->fired) { + event->armed = 1; + dsb(); + if (!event->fired) { + if (down_interruptible(event->event) != 0) { + event->armed = 0; + return 0; + } + } + event->armed = 0; + wmb(); + } + + event->fired = 0; + return 1; +} + +static inline void +remote_event_signal_local(REMOTE_EVENT_T *event) +{ + event->armed = 0; + up(event->event); +} + +static inline void +remote_event_poll(REMOTE_EVENT_T *event) +{ + if (event->fired && event->armed) + remote_event_signal_local(event); +} + +void +remote_event_pollall(VCHIQ_STATE_T *state) +{ + remote_event_poll(&state->local->sync_trigger); + remote_event_poll(&state->local->sync_release); + remote_event_poll(&state->local->trigger); + remote_event_poll(&state->local->recycle); +} + +/* Round up message sizes so that any space at the end of a slot is always big +** enough for a header. This relies on header size being a power of two, which +** has been verified earlier by a static assertion. */ + +static inline unsigned int +calc_stride(unsigned int size) +{ + /* Allow room for the header */ + size += sizeof(VCHIQ_HEADER_T); + + /* Round up */ + return (size + sizeof(VCHIQ_HEADER_T) - 1) & ~(sizeof(VCHIQ_HEADER_T) + - 1); +} + +/* Called by the slot handler thread */ +static VCHIQ_SERVICE_T * +get_listening_service(VCHIQ_STATE_T *state, int fourcc) +{ + int i; + + WARN_ON(fourcc == VCHIQ_FOURCC_INVALID); + + for (i = 0; i < state->unused_service; i++) { + VCHIQ_SERVICE_T *service = state->services[i]; + if (service && + (service->public_fourcc == fourcc) && + ((service->srvstate == VCHIQ_SRVSTATE_LISTENING) || + ((service->srvstate == VCHIQ_SRVSTATE_OPEN) && + (service->remoteport == VCHIQ_PORT_FREE)))) { + lock_service(service); + return service; + } + } + + return NULL; +} + +/* Called by the slot handler thread */ +static VCHIQ_SERVICE_T * +get_connected_service(VCHIQ_STATE_T *state, unsigned int port) +{ + int i; + for (i = 0; i < state->unused_service; i++) { + VCHIQ_SERVICE_T *service = state->services[i]; + if (service && (service->srvstate == VCHIQ_SRVSTATE_OPEN) + && (service->remoteport == port)) { + lock_service(service); + return service; + } + } + return NULL; +} + +inline void +request_poll(VCHIQ_STATE_T *state, VCHIQ_SERVICE_T *service, int poll_type) +{ + uint32_t value; + + if (service) { + do { + value = atomic_read(&service->poll_flags); + } while (atomic_cmpxchg(&service->poll_flags, value, + value | (1 << poll_type)) != value); + + do { + value = atomic_read(&state->poll_services[ + service->localport>>5]); + } while (atomic_cmpxchg( + &state->poll_services[service->localport>>5], + value, value | (1 << (service->localport & 0x1f))) + != value); + } + + state->poll_needed = 1; + wmb(); + + /* ... and ensure the slot handler runs. */ + remote_event_signal_local(&state->local->trigger); +} + +/* Called from queue_message, by the slot handler and application threads, +** with slot_mutex held */ +static VCHIQ_HEADER_T * +reserve_space(VCHIQ_STATE_T *state, int space, int is_blocking) +{ + VCHIQ_SHARED_STATE_T *local = state->local; + int tx_pos = state->local_tx_pos; + int slot_space = VCHIQ_SLOT_SIZE - (tx_pos & VCHIQ_SLOT_MASK); + + if (space > slot_space) { + VCHIQ_HEADER_T *header; + /* Fill the remaining space with padding */ + WARN_ON(state->tx_data == NULL); + header = (VCHIQ_HEADER_T *) + (state->tx_data + (tx_pos & VCHIQ_SLOT_MASK)); + header->msgid = VCHIQ_MSGID_PADDING; + header->size = slot_space - sizeof(VCHIQ_HEADER_T); + + tx_pos += slot_space; + } + + /* If necessary, get the next slot. */ + if ((tx_pos & VCHIQ_SLOT_MASK) == 0) { + int slot_index; + + /* If there is no free slot... */ + + if (down_trylock(&state->slot_available_event) != 0) { + /* ...wait for one. */ + + VCHIQ_STATS_INC(state, slot_stalls); + + /* But first, flush through the last slot. */ + state->local_tx_pos = tx_pos; + local->tx_pos = tx_pos; + remote_event_signal(&state->remote->trigger); + + if (!is_blocking || + (down_interruptible( + &state->slot_available_event) != 0)) + return NULL; /* No space available */ + } + + BUG_ON(tx_pos == + (state->slot_queue_available * VCHIQ_SLOT_SIZE)); + + slot_index = local->slot_queue[ + SLOT_QUEUE_INDEX_FROM_POS(tx_pos) & + VCHIQ_SLOT_QUEUE_MASK]; + state->tx_data = + (char *)SLOT_DATA_FROM_INDEX(state, slot_index); + } + + state->local_tx_pos = tx_pos + space; + + return (VCHIQ_HEADER_T *)(state->tx_data + (tx_pos & VCHIQ_SLOT_MASK)); +} + +/* Called by the recycle thread. */ +static void +process_free_queue(VCHIQ_STATE_T *state) +{ + VCHIQ_SHARED_STATE_T *local = state->local; + BITSET_T service_found[BITSET_SIZE(VCHIQ_MAX_SERVICES)]; + int slot_queue_available; + + /* Use a read memory barrier to ensure that any state that may have + ** been modified by another thread is not masked by stale prefetched + ** values. */ + rmb(); + + /* Find slots which have been freed by the other side, and return them + ** to the available queue. */ + slot_queue_available = state->slot_queue_available; + + while (slot_queue_available != local->slot_queue_recycle) { + unsigned int pos; + int slot_index = local->slot_queue[slot_queue_available++ & + VCHIQ_SLOT_QUEUE_MASK]; + char *data = (char *)SLOT_DATA_FROM_INDEX(state, slot_index); + int data_found = 0; + + vchiq_log_trace(vchiq_core_log_level, "%d: pfq %d=%x %x %x", + state->id, slot_index, (unsigned int)data, + local->slot_queue_recycle, slot_queue_available); + + /* Initialise the bitmask for services which have used this + ** slot */ + BITSET_ZERO(service_found); + + pos = 0; + + while (pos < VCHIQ_SLOT_SIZE) { + VCHIQ_HEADER_T *header = + (VCHIQ_HEADER_T *)(data + pos); + int msgid = header->msgid; + if (VCHIQ_MSG_TYPE(msgid) == VCHIQ_MSG_DATA) { + int port = VCHIQ_MSG_SRCPORT(msgid); + VCHIQ_SERVICE_QUOTA_T *service_quota = + &state->service_quotas[port]; + int count; + spin_lock("a_spinlock); + count = service_quota->message_use_count; + if (count > 0) + service_quota->message_use_count = + count - 1; + spin_unlock("a_spinlock); + + if (count == service_quota->message_quota) + /* Signal the service that it + ** has dropped below its quota + */ + up(&service_quota->quota_event); + else if (count == 0) { + vchiq_log_error(vchiq_core_log_level, + "service %d " + "message_use_count=%d " + "(header %x, msgid %x, " + "header->msgid %x, " + "header->size %x)", + port, + service_quota-> + message_use_count, + (unsigned int)header, msgid, + header->msgid, + header->size); + WARN(1, "invalid message use count\n"); + } + if (!BITSET_IS_SET(service_found, port)) { + /* Set the found bit for this service */ + BITSET_SET(service_found, port); + + spin_lock("a_spinlock); + count = service_quota->slot_use_count; + if (count > 0) + service_quota->slot_use_count = + count - 1; + spin_unlock("a_spinlock); + + if (count > 0) { + /* Signal the service in case + ** it has dropped below its + ** quota */ + up(&service_quota->quota_event); + vchiq_log_trace( + vchiq_core_log_level, + "%d: pfq:%d %x@%x - " + "slot_use->%d", + state->id, port, + header->size, + (unsigned int)header, + count - 1); + } else { + vchiq_log_error( + vchiq_core_log_level, + "service %d " + "slot_use_count" + "=%d (header %x" + ", msgid %x, " + "header->msgid" + " %x, header->" + "size %x)", + port, count, + (unsigned int)header, + msgid, + header->msgid, + header->size); + WARN(1, "bad slot use count\n"); + } + } + + data_found = 1; + } + + pos += calc_stride(header->size); + if (pos > VCHIQ_SLOT_SIZE) { + vchiq_log_error(vchiq_core_log_level, + "pfq - pos %x: header %x, msgid %x, " + "header->msgid %x, header->size %x", + pos, (unsigned int)header, msgid, + header->msgid, header->size); + WARN(1, "invalid slot position\n"); + } + } + + if (data_found) { + int count; + spin_lock("a_spinlock); + count = state->data_use_count; + if (count > 0) + state->data_use_count = + count - 1; + spin_unlock("a_spinlock); + if (count == state->data_quota) + up(&state->data_quota_event); + } + + state->slot_queue_available = slot_queue_available; + up(&state->slot_available_event); + } +} + +/* Called by the slot handler and application threads */ +static VCHIQ_STATUS_T +queue_message(VCHIQ_STATE_T *state, VCHIQ_SERVICE_T *service, + int msgid, const VCHIQ_ELEMENT_T *elements, + int count, int size, int is_blocking) +{ + VCHIQ_SHARED_STATE_T *local; + VCHIQ_SERVICE_QUOTA_T *service_quota = NULL; + VCHIQ_HEADER_T *header; + int type = VCHIQ_MSG_TYPE(msgid); + + unsigned int stride; + + local = state->local; + + stride = calc_stride(size); + + WARN_ON(!(stride <= VCHIQ_SLOT_SIZE)); + + if ((type != VCHIQ_MSG_RESUME) && + (lmutex_lock_interruptible(&state->slot_mutex) != 0)) + return VCHIQ_RETRY; + + if (type == VCHIQ_MSG_DATA) { + int tx_end_index; + + BUG_ON(!service); + + if (service->closing) { + /* The service has been closed */ + lmutex_unlock(&state->slot_mutex); + return VCHIQ_ERROR; + } + + service_quota = &state->service_quotas[service->localport]; + + spin_lock("a_spinlock); + + /* Ensure this service doesn't use more than its quota of + ** messages or slots */ + tx_end_index = SLOT_QUEUE_INDEX_FROM_POS( + state->local_tx_pos + stride - 1); + + /* Ensure data messages don't use more than their quota of + ** slots */ + while ((tx_end_index != state->previous_data_index) && + (state->data_use_count == state->data_quota)) { + VCHIQ_STATS_INC(state, data_stalls); + spin_unlock("a_spinlock); + lmutex_unlock(&state->slot_mutex); + + if (down_interruptible(&state->data_quota_event) + != 0) + return VCHIQ_RETRY; + + lmutex_lock(&state->slot_mutex); + spin_lock("a_spinlock); + tx_end_index = SLOT_QUEUE_INDEX_FROM_POS( + state->local_tx_pos + stride - 1); + if ((tx_end_index == state->previous_data_index) || + (state->data_use_count < state->data_quota)) { + /* Pass the signal on to other waiters */ + up(&state->data_quota_event); + break; + } + } + + while ((service_quota->message_use_count == + service_quota->message_quota) || + ((tx_end_index != service_quota->previous_tx_index) && + (service_quota->slot_use_count == + service_quota->slot_quota))) { + spin_unlock("a_spinlock); + vchiq_log_trace(vchiq_core_log_level, + "%d: qm:%d %s,%x - quota stall " + "(msg %d, slot %d)", + state->id, service->localport, + msg_type_str(type), size, + service_quota->message_use_count, + service_quota->slot_use_count); + VCHIQ_SERVICE_STATS_INC(service, quota_stalls); + lmutex_unlock(&state->slot_mutex); + if (down_interruptible(&service_quota->quota_event) + != 0) + return VCHIQ_RETRY; + if (service->closing) + return VCHIQ_ERROR; + if (lmutex_lock_interruptible(&state->slot_mutex) != 0) + return VCHIQ_RETRY; + if (service->srvstate != VCHIQ_SRVSTATE_OPEN) { + /* The service has been closed */ + lmutex_unlock(&state->slot_mutex); + return VCHIQ_ERROR; + } + spin_lock("a_spinlock); + tx_end_index = SLOT_QUEUE_INDEX_FROM_POS( + state->local_tx_pos + stride - 1); + } + + spin_unlock("a_spinlock); + } + + header = reserve_space(state, stride, is_blocking); + + if (!header) { + if (service) + VCHIQ_SERVICE_STATS_INC(service, slot_stalls); + lmutex_unlock(&state->slot_mutex); + return VCHIQ_RETRY; + } + + if (type == VCHIQ_MSG_DATA) { + int i, pos; + int tx_end_index; + int slot_use_count; + + vchiq_log_info(vchiq_core_log_level, + "%d: qm %s@%x,%x (%d->%d)", + state->id, + msg_type_str(VCHIQ_MSG_TYPE(msgid)), + (unsigned int)header, size, + VCHIQ_MSG_SRCPORT(msgid), + VCHIQ_MSG_DSTPORT(msgid)); + + BUG_ON(!service); + + for (i = 0, pos = 0; i < (unsigned int)count; + pos += elements[i++].size) + if (elements[i].size) { + if (vchiq_copy_from_user + (header->data + pos, elements[i].data, + (size_t) elements[i].size) != + VCHIQ_SUCCESS) { + lmutex_unlock(&state->slot_mutex); + VCHIQ_SERVICE_STATS_INC(service, + error_count); + return VCHIQ_ERROR; + } + if (i == 0) { + if (vchiq_core_msg_log_level >= + VCHIQ_LOG_INFO) + vchiq_log_dump_mem("Sent", 0, + header->data + pos, + min(64, + elements[0].size)); + } + } + + spin_lock("a_spinlock); + service_quota->message_use_count++; + + tx_end_index = + SLOT_QUEUE_INDEX_FROM_POS(state->local_tx_pos - 1); + + /* If this transmission can't fit in the last slot used by any + ** service, the data_use_count must be increased. */ + if (tx_end_index != state->previous_data_index) { + state->previous_data_index = tx_end_index; + state->data_use_count++; + } + + /* If this isn't the same slot last used by this service, + ** the service's slot_use_count must be increased. */ + if (tx_end_index != service_quota->previous_tx_index) { + service_quota->previous_tx_index = tx_end_index; + slot_use_count = ++service_quota->slot_use_count; + } else { + slot_use_count = 0; + } + + spin_unlock("a_spinlock); + + if (slot_use_count) + vchiq_log_trace(vchiq_core_log_level, + "%d: qm:%d %s,%x - slot_use->%d (hdr %p)", + state->id, service->localport, + msg_type_str(VCHIQ_MSG_TYPE(msgid)), size, + slot_use_count, header); + + VCHIQ_SERVICE_STATS_INC(service, ctrl_tx_count); + VCHIQ_SERVICE_STATS_ADD(service, ctrl_tx_bytes, size); + } else { + vchiq_log_info(vchiq_core_log_level, + "%d: qm %s@%x,%x (%d->%d)", state->id, + msg_type_str(VCHIQ_MSG_TYPE(msgid)), + (unsigned int)header, size, + VCHIQ_MSG_SRCPORT(msgid), + VCHIQ_MSG_DSTPORT(msgid)); + if (size != 0) { + WARN_ON(!((count == 1) && (size == elements[0].size))); + memcpy(header->data, elements[0].data, + elements[0].size); + } + VCHIQ_STATS_INC(state, ctrl_tx_count); + } + + header->msgid = msgid; + header->size = size; + + { + int svc_fourcc; + + svc_fourcc = service + ? service->base.fourcc + : VCHIQ_MAKE_FOURCC('?', '?', '?', '?'); + + vchiq_log_info(vchiq_core_msg_log_level, + "Sent Msg %s(%u) to %c%c%c%c s:%u d:%d len:%d", + msg_type_str(VCHIQ_MSG_TYPE(msgid)), + VCHIQ_MSG_TYPE(msgid), + VCHIQ_FOURCC_AS_4CHARS(svc_fourcc), + VCHIQ_MSG_SRCPORT(msgid), + VCHIQ_MSG_DSTPORT(msgid), + size); + } + + /* Make sure the new header is visible to the peer. */ + wmb(); + + /* Make the new tx_pos visible to the peer. */ + local->tx_pos = state->local_tx_pos; + wmb(); + + if (service && (type == VCHIQ_MSG_CLOSE)) + vchiq_set_service_state(service, VCHIQ_SRVSTATE_CLOSESENT); + + if (VCHIQ_MSG_TYPE(msgid) != VCHIQ_MSG_PAUSE) + lmutex_unlock(&state->slot_mutex); + + remote_event_signal(&state->remote->trigger); + + return VCHIQ_SUCCESS; +} + +/* Called by the slot handler and application threads */ +static VCHIQ_STATUS_T +queue_message_sync(VCHIQ_STATE_T *state, VCHIQ_SERVICE_T *service, + int msgid, const VCHIQ_ELEMENT_T *elements, + int count, int size, int is_blocking) +{ + VCHIQ_SHARED_STATE_T *local; + VCHIQ_HEADER_T *header; + + local = state->local; + + if ((VCHIQ_MSG_TYPE(msgid) != VCHIQ_MSG_RESUME) && + (lmutex_lock_interruptible(&state->sync_mutex) != 0)) + return VCHIQ_RETRY; + + remote_event_wait(&local->sync_release); + + rmb(); + + header = (VCHIQ_HEADER_T *)SLOT_DATA_FROM_INDEX(state, + local->slot_sync); + + { + int oldmsgid = header->msgid; + if (oldmsgid != VCHIQ_MSGID_PADDING) + vchiq_log_error(vchiq_core_log_level, + "%d: qms - msgid %x, not PADDING", + state->id, oldmsgid); + } + + if (service) { + int i, pos; + + vchiq_log_info(vchiq_sync_log_level, + "%d: qms %s@%x,%x (%d->%d)", state->id, + msg_type_str(VCHIQ_MSG_TYPE(msgid)), + (unsigned int)header, size, + VCHIQ_MSG_SRCPORT(msgid), + VCHIQ_MSG_DSTPORT(msgid)); + + for (i = 0, pos = 0; i < (unsigned int)count; + pos += elements[i++].size) + if (elements[i].size) { + if (vchiq_copy_from_user + (header->data + pos, elements[i].data, + (size_t) elements[i].size) != + VCHIQ_SUCCESS) { + lmutex_unlock(&state->sync_mutex); + VCHIQ_SERVICE_STATS_INC(service, + error_count); + return VCHIQ_ERROR; + } + if (i == 0) { + if (vchiq_sync_log_level >= + VCHIQ_LOG_TRACE) + vchiq_log_dump_mem("Sent Sync", + 0, header->data + pos, + min(64, + elements[0].size)); + } + } + + VCHIQ_SERVICE_STATS_INC(service, ctrl_tx_count); + VCHIQ_SERVICE_STATS_ADD(service, ctrl_tx_bytes, size); + } else { + vchiq_log_info(vchiq_sync_log_level, + "%d: qms %s@%x,%x (%d->%d)", state->id, + msg_type_str(VCHIQ_MSG_TYPE(msgid)), + (unsigned int)header, size, + VCHIQ_MSG_SRCPORT(msgid), + VCHIQ_MSG_DSTPORT(msgid)); + if (size != 0) { + WARN_ON(!((count == 1) && (size == elements[0].size))); + memcpy(header->data, elements[0].data, + elements[0].size); + } + VCHIQ_STATS_INC(state, ctrl_tx_count); + } + + header->size = size; + header->msgid = msgid; + + if (vchiq_sync_log_level >= VCHIQ_LOG_TRACE) { + int svc_fourcc; + + svc_fourcc = service + ? service->base.fourcc + : VCHIQ_MAKE_FOURCC('?', '?', '?', '?'); + + vchiq_log_trace(vchiq_sync_log_level, + "Sent Sync Msg %s(%u) to %c%c%c%c s:%u d:%d len:%d", + msg_type_str(VCHIQ_MSG_TYPE(msgid)), + VCHIQ_MSG_TYPE(msgid), + VCHIQ_FOURCC_AS_4CHARS(svc_fourcc), + VCHIQ_MSG_SRCPORT(msgid), + VCHIQ_MSG_DSTPORT(msgid), + size); + } + + /* Make sure the new header is visible to the peer. */ + wmb(); + + remote_event_signal(&state->remote->sync_trigger); + + if (VCHIQ_MSG_TYPE(msgid) != VCHIQ_MSG_PAUSE) + lmutex_unlock(&state->sync_mutex); + + return VCHIQ_SUCCESS; +} + +static inline void +claim_slot(VCHIQ_SLOT_INFO_T *slot) +{ + slot->use_count++; +} + +static void +release_slot(VCHIQ_STATE_T *state, VCHIQ_SLOT_INFO_T *slot_info, + VCHIQ_HEADER_T *header, VCHIQ_SERVICE_T *service) +{ + int release_count; + + lmutex_lock(&state->recycle_mutex); + + if (header) { + int msgid = header->msgid; + if (((msgid & VCHIQ_MSGID_CLAIMED) == 0) || + (service && service->closing)) { + lmutex_unlock(&state->recycle_mutex); + return; + } + + /* Rewrite the message header to prevent a double + ** release */ + header->msgid = msgid & ~VCHIQ_MSGID_CLAIMED; + } + + release_count = slot_info->release_count; + slot_info->release_count = ++release_count; + + if (release_count == slot_info->use_count) { + int slot_queue_recycle; + /* Add to the freed queue */ + + /* A read barrier is necessary here to prevent speculative + ** fetches of remote->slot_queue_recycle from overtaking the + ** mutex. */ + rmb(); + + slot_queue_recycle = state->remote->slot_queue_recycle; + state->remote->slot_queue[slot_queue_recycle & + VCHIQ_SLOT_QUEUE_MASK] = + SLOT_INDEX_FROM_INFO(state, slot_info); + state->remote->slot_queue_recycle = slot_queue_recycle + 1; + vchiq_log_info(vchiq_core_log_level, + "%d: release_slot %d - recycle->%x", + state->id, SLOT_INDEX_FROM_INFO(state, slot_info), + state->remote->slot_queue_recycle); + + /* A write barrier is necessary, but remote_event_signal + ** contains one. */ + remote_event_signal(&state->remote->recycle); + } + + lmutex_unlock(&state->recycle_mutex); +} + +/* Called by the slot handler - don't hold the bulk mutex */ +static VCHIQ_STATUS_T +notify_bulks(VCHIQ_SERVICE_T *service, VCHIQ_BULK_QUEUE_T *queue, + int retry_poll) +{ + VCHIQ_STATUS_T status = VCHIQ_SUCCESS; + + vchiq_log_trace(vchiq_core_log_level, + "%d: nb:%d %cx - p=%x rn=%x r=%x", + service->state->id, service->localport, + (queue == &service->bulk_tx) ? 't' : 'r', + queue->process, queue->remote_notify, queue->remove); + + if (service->state->is_master) { + while (queue->remote_notify != queue->process) { + VCHIQ_BULK_T *bulk = + &queue->bulks[BULK_INDEX(queue->remote_notify)]; + int msgtype = (bulk->dir == VCHIQ_BULK_TRANSMIT) ? + VCHIQ_MSG_BULK_RX_DONE : VCHIQ_MSG_BULK_TX_DONE; + int msgid = VCHIQ_MAKE_MSG(msgtype, service->localport, + service->remoteport); + VCHIQ_ELEMENT_T element = { &bulk->actual, 4 }; + /* Only reply to non-dummy bulk requests */ + if (bulk->remote_data) { + status = queue_message(service->state, NULL, + msgid, &element, 1, 4, 0); + if (status != VCHIQ_SUCCESS) + break; + } + queue->remote_notify++; + } + } else { + queue->remote_notify = queue->process; + } + + if (status == VCHIQ_SUCCESS) { + while (queue->remove != queue->remote_notify) { + VCHIQ_BULK_T *bulk = + &queue->bulks[BULK_INDEX(queue->remove)]; + + /* Only generate callbacks for non-dummy bulk + ** requests, and non-terminated services */ + if (bulk->data && service->instance) { + if (bulk->actual != VCHIQ_BULK_ACTUAL_ABORTED) { + if (bulk->dir == VCHIQ_BULK_TRANSMIT) { + VCHIQ_SERVICE_STATS_INC(service, + bulk_tx_count); + VCHIQ_SERVICE_STATS_ADD(service, + bulk_tx_bytes, + bulk->actual); + } else { + VCHIQ_SERVICE_STATS_INC(service, + bulk_rx_count); + VCHIQ_SERVICE_STATS_ADD(service, + bulk_rx_bytes, + bulk->actual); + } + } else { + VCHIQ_SERVICE_STATS_INC(service, + bulk_aborted_count); + } + if (bulk->mode == VCHIQ_BULK_MODE_BLOCKING) { + struct bulk_waiter *waiter; + spin_lock(&bulk_waiter_spinlock); + waiter = bulk->userdata; + if (waiter) { + waiter->actual = bulk->actual; + up(&waiter->event); + } + spin_unlock(&bulk_waiter_spinlock); + } else if (bulk->mode == + VCHIQ_BULK_MODE_CALLBACK) { + VCHIQ_REASON_T reason = (bulk->dir == + VCHIQ_BULK_TRANSMIT) ? + ((bulk->actual == + VCHIQ_BULK_ACTUAL_ABORTED) ? + VCHIQ_BULK_TRANSMIT_ABORTED : + VCHIQ_BULK_TRANSMIT_DONE) : + ((bulk->actual == + VCHIQ_BULK_ACTUAL_ABORTED) ? + VCHIQ_BULK_RECEIVE_ABORTED : + VCHIQ_BULK_RECEIVE_DONE); + status = make_service_callback(service, + reason, NULL, bulk->userdata); + if (status == VCHIQ_RETRY) + break; + } + } + + queue->remove++; + up(&service->bulk_remove_event); + } + if (!retry_poll) + status = VCHIQ_SUCCESS; + } + + if (status == VCHIQ_RETRY) + request_poll(service->state, service, + (queue == &service->bulk_tx) ? + VCHIQ_POLL_TXNOTIFY : VCHIQ_POLL_RXNOTIFY); + + return status; +} + +/* Called by the slot handler thread */ +static void +poll_services(VCHIQ_STATE_T *state) +{ + int group, i; + + for (group = 0; group < BITSET_SIZE(state->unused_service); group++) { + uint32_t flags; + flags = atomic_xchg(&state->poll_services[group], 0); + for (i = 0; flags; i++) { + if (flags & (1 << i)) { + VCHIQ_SERVICE_T *service = + find_service_by_port(state, + (group<<5) + i); + uint32_t service_flags; + flags &= ~(1 << i); + if (!service) + continue; + service_flags = + atomic_xchg(&service->poll_flags, 0); + if (service_flags & + (1 << VCHIQ_POLL_REMOVE)) { + vchiq_log_info(vchiq_core_log_level, + "%d: ps - remove %d<->%d", + state->id, service->localport, + service->remoteport); + + /* Make it look like a client, because + it must be removed and not left in + the LISTENING state. */ + service->public_fourcc = + VCHIQ_FOURCC_INVALID; + + if (vchiq_close_service_internal( + service, 0/*!close_recvd*/) != + VCHIQ_SUCCESS) + request_poll(state, service, + VCHIQ_POLL_REMOVE); + } else if (service_flags & + (1 << VCHIQ_POLL_TERMINATE)) { + vchiq_log_info(vchiq_core_log_level, + "%d: ps - terminate %d<->%d", + state->id, service->localport, + service->remoteport); + if (vchiq_close_service_internal( + service, 0/*!close_recvd*/) != + VCHIQ_SUCCESS) + request_poll(state, service, + VCHIQ_POLL_TERMINATE); + } + if (service_flags & (1 << VCHIQ_POLL_TXNOTIFY)) + notify_bulks(service, + &service->bulk_tx, + 1/*retry_poll*/); + if (service_flags & (1 << VCHIQ_POLL_RXNOTIFY)) + notify_bulks(service, + &service->bulk_rx, + 1/*retry_poll*/); + unlock_service(service); + } + } + } +} + +/* Called by the slot handler or application threads, holding the bulk mutex. */ +static int +resolve_bulks(VCHIQ_SERVICE_T *service, VCHIQ_BULK_QUEUE_T *queue) +{ + VCHIQ_STATE_T *state = service->state; + int resolved = 0; + int rc; + + while ((queue->process != queue->local_insert) && + (queue->process != queue->remote_insert)) { + VCHIQ_BULK_T *bulk = &queue->bulks[BULK_INDEX(queue->process)]; + + vchiq_log_trace(vchiq_core_log_level, + "%d: rb:%d %cx - li=%x ri=%x p=%x", + state->id, service->localport, + (queue == &service->bulk_tx) ? 't' : 'r', + queue->local_insert, queue->remote_insert, + queue->process); + + WARN_ON(!((int)(queue->local_insert - queue->process) > 0)); + WARN_ON(!((int)(queue->remote_insert - queue->process) > 0)); + + rc = lmutex_lock_interruptible(&state->bulk_transfer_mutex); + if (rc != 0) + break; + + vchiq_transfer_bulk(bulk); + lmutex_unlock(&state->bulk_transfer_mutex); + + if (vchiq_core_msg_log_level >= VCHIQ_LOG_INFO) { + const char *header = (queue == &service->bulk_tx) ? + "Send Bulk to" : "Recv Bulk from"; + if (bulk->actual != VCHIQ_BULK_ACTUAL_ABORTED) + vchiq_log_info(vchiq_core_msg_log_level, + "%s %c%c%c%c d:%d len:%d %x<->%x", + header, + VCHIQ_FOURCC_AS_4CHARS( + service->base.fourcc), + service->remoteport, + bulk->size, + (unsigned int)bulk->data, + (unsigned int)bulk->remote_data); + else + vchiq_log_info(vchiq_core_msg_log_level, + "%s %c%c%c%c d:%d ABORTED - tx len:%d," + " rx len:%d %x<->%x", + header, + VCHIQ_FOURCC_AS_4CHARS( + service->base.fourcc), + service->remoteport, + bulk->size, + bulk->remote_size, + (unsigned int)bulk->data, + (unsigned int)bulk->remote_data); + } + + vchiq_complete_bulk(bulk); + queue->process++; + resolved++; + } + return resolved; +} + +/* Called with the bulk_mutex held */ +static void +abort_outstanding_bulks(VCHIQ_SERVICE_T *service, VCHIQ_BULK_QUEUE_T *queue) +{ + int is_tx = (queue == &service->bulk_tx); + vchiq_log_trace(vchiq_core_log_level, + "%d: aob:%d %cx - li=%x ri=%x p=%x", + service->state->id, service->localport, is_tx ? 't' : 'r', + queue->local_insert, queue->remote_insert, queue->process); + + WARN_ON(!((int)(queue->local_insert - queue->process) >= 0)); + WARN_ON(!((int)(queue->remote_insert - queue->process) >= 0)); + + while ((queue->process != queue->local_insert) || + (queue->process != queue->remote_insert)) { + VCHIQ_BULK_T *bulk = &queue->bulks[BULK_INDEX(queue->process)]; + + if (queue->process == queue->remote_insert) { + /* fabricate a matching dummy bulk */ + bulk->remote_data = NULL; + bulk->remote_size = 0; + queue->remote_insert++; + } + + if (queue->process != queue->local_insert) { + vchiq_complete_bulk(bulk); + + vchiq_log_info(vchiq_core_msg_log_level, + "%s %c%c%c%c d:%d ABORTED - tx len:%d, " + "rx len:%d", + is_tx ? "Send Bulk to" : "Recv Bulk from", + VCHIQ_FOURCC_AS_4CHARS(service->base.fourcc), + service->remoteport, + bulk->size, + bulk->remote_size); + } else { + /* fabricate a matching dummy bulk */ + bulk->data = NULL; + bulk->size = 0; + bulk->actual = VCHIQ_BULK_ACTUAL_ABORTED; + bulk->dir = is_tx ? VCHIQ_BULK_TRANSMIT : + VCHIQ_BULK_RECEIVE; + queue->local_insert++; + } + + queue->process++; + } +} + +/* Called from the slot handler thread */ +static void +pause_bulks(VCHIQ_STATE_T *state) +{ + if (unlikely(atomic_inc_return(&pause_bulks_count) != 1)) { + WARN_ON_ONCE(1); + atomic_set(&pause_bulks_count, 1); + return; + } + + /* Block bulk transfers from all services */ + lmutex_lock(&state->bulk_transfer_mutex); +} + +/* Called from the slot handler thread */ +static void +resume_bulks(VCHIQ_STATE_T *state) +{ + int i; + if (unlikely(atomic_dec_return(&pause_bulks_count) != 0)) { + WARN_ON_ONCE(1); + atomic_set(&pause_bulks_count, 0); + return; + } + + /* Allow bulk transfers from all services */ + lmutex_unlock(&state->bulk_transfer_mutex); + + if (state->deferred_bulks == 0) + return; + + /* Deal with any bulks which had to be deferred due to being in + * paused state. Don't try to match up to number of deferred bulks + * in case we've had something come and close the service in the + * interim - just process all bulk queues for all services */ + vchiq_log_info(vchiq_core_log_level, "%s: processing %d deferred bulks", + __func__, state->deferred_bulks); + + for (i = 0; i < state->unused_service; i++) { + VCHIQ_SERVICE_T *service = state->services[i]; + int resolved_rx = 0; + int resolved_tx = 0; + if (!service || (service->srvstate != VCHIQ_SRVSTATE_OPEN)) + continue; + + lmutex_lock(&service->bulk_mutex); + resolved_rx = resolve_bulks(service, &service->bulk_rx); + resolved_tx = resolve_bulks(service, &service->bulk_tx); + lmutex_unlock(&service->bulk_mutex); + if (resolved_rx) + notify_bulks(service, &service->bulk_rx, 1); + if (resolved_tx) + notify_bulks(service, &service->bulk_tx, 1); + } + state->deferred_bulks = 0; +} + +static int +parse_open(VCHIQ_STATE_T *state, VCHIQ_HEADER_T *header) +{ + VCHIQ_SERVICE_T *service = NULL; + int msgid, size; + unsigned int localport, remoteport; + + msgid = header->msgid; + size = header->size; + //int type = VCHIQ_MSG_TYPE(msgid); + localport = VCHIQ_MSG_DSTPORT(msgid); + remoteport = VCHIQ_MSG_SRCPORT(msgid); + if (size >= sizeof(struct vchiq_open_payload)) { + const struct vchiq_open_payload *payload = + (struct vchiq_open_payload *)header->data; + unsigned int fourcc; + + fourcc = payload->fourcc; + vchiq_log_info(vchiq_core_log_level, + "%d: prs OPEN@%x (%d->'%c%c%c%c')", + state->id, (unsigned int)header, + localport, + VCHIQ_FOURCC_AS_4CHARS(fourcc)); + + service = get_listening_service(state, fourcc); + + if (service) { + /* A matching service exists */ + short v = payload->version; + short version_min = payload->version_min; + if ((service->version < version_min) || + (v < service->version_min)) { + /* Version mismatch */ + vchiq_loud_error_header(); + vchiq_loud_error("%d: service %d (%c%c%c%c) " + "version mismatch - local (%d, min %d)" + " vs. remote (%d, min %d)", + state->id, service->localport, + VCHIQ_FOURCC_AS_4CHARS(fourcc), + service->version, service->version_min, + v, version_min); + vchiq_loud_error_footer(); + unlock_service(service); + goto fail_open; + } + service->peer_version = v; + + if (service->srvstate == VCHIQ_SRVSTATE_LISTENING) { + struct vchiq_openack_payload ack_payload = { + service->version + }; + VCHIQ_ELEMENT_T body = { + &ack_payload, + sizeof(ack_payload) + }; + + /* Acknowledge the OPEN */ + if (service->sync) { + if (queue_message_sync(state, NULL, + VCHIQ_MAKE_MSG( + VCHIQ_MSG_OPENACK, + service->localport, + remoteport), + &body, 1, sizeof(ack_payload), + 0) == VCHIQ_RETRY) + goto bail_not_ready; + } else { + if (queue_message(state, NULL, + VCHIQ_MAKE_MSG( + VCHIQ_MSG_OPENACK, + service->localport, + remoteport), + &body, 1, sizeof(ack_payload), + 0) == VCHIQ_RETRY) + goto bail_not_ready; + } + + /* The service is now open */ + vchiq_set_service_state(service, + service->sync ? VCHIQ_SRVSTATE_OPENSYNC + : VCHIQ_SRVSTATE_OPEN); + } + + service->remoteport = remoteport; + service->client_id = ((int *)header->data)[1]; + if (make_service_callback(service, VCHIQ_SERVICE_OPENED, + NULL, NULL) == VCHIQ_RETRY) { + /* Bail out if not ready */ + service->remoteport = VCHIQ_PORT_FREE; + goto bail_not_ready; + } + + /* Success - the message has been dealt with */ + unlock_service(service); + return 1; + } + } + +fail_open: + /* No available service, or an invalid request - send a CLOSE */ + if (queue_message(state, NULL, + VCHIQ_MAKE_MSG(VCHIQ_MSG_CLOSE, 0, VCHIQ_MSG_SRCPORT(msgid)), + NULL, 0, 0, 0) == VCHIQ_RETRY) + goto bail_not_ready; + + return 1; + +bail_not_ready: + if (service) + unlock_service(service); + + return 0; +} + +/* Called by the slot handler thread */ +static void +parse_rx_slots(VCHIQ_STATE_T *state) +{ + VCHIQ_SHARED_STATE_T *remote = state->remote; + VCHIQ_SERVICE_T *service = NULL; + int tx_pos; + DEBUG_INITIALISE(state->local) + + tx_pos = remote->tx_pos; + + while (state->rx_pos != tx_pos) { + VCHIQ_HEADER_T *header; + int msgid, size; + int type; + unsigned int localport, remoteport; + + DEBUG_TRACE(PARSE_LINE); + if (!state->rx_data) { + int rx_index; + WARN_ON(!((state->rx_pos & VCHIQ_SLOT_MASK) == 0)); + rx_index = remote->slot_queue[ + SLOT_QUEUE_INDEX_FROM_POS(state->rx_pos) & + VCHIQ_SLOT_QUEUE_MASK]; + state->rx_data = (char *)SLOT_DATA_FROM_INDEX(state, + rx_index); + state->rx_info = SLOT_INFO_FROM_INDEX(state, rx_index); + + /* Initialise use_count to one, and increment + ** release_count at the end of the slot to avoid + ** releasing the slot prematurely. */ + state->rx_info->use_count = 1; + state->rx_info->release_count = 0; + } + + header = (VCHIQ_HEADER_T *)(state->rx_data + + (state->rx_pos & VCHIQ_SLOT_MASK)); + DEBUG_VALUE(PARSE_HEADER, (int)header); + msgid = header->msgid; + DEBUG_VALUE(PARSE_MSGID, msgid); + size = header->size; + type = VCHIQ_MSG_TYPE(msgid); + localport = VCHIQ_MSG_DSTPORT(msgid); + remoteport = VCHIQ_MSG_SRCPORT(msgid); + + if (type != VCHIQ_MSG_DATA) + VCHIQ_STATS_INC(state, ctrl_rx_count); + + switch (type) { + case VCHIQ_MSG_OPENACK: + case VCHIQ_MSG_CLOSE: + case VCHIQ_MSG_DATA: + case VCHIQ_MSG_BULK_RX: + case VCHIQ_MSG_BULK_TX: + case VCHIQ_MSG_BULK_RX_DONE: + case VCHIQ_MSG_BULK_TX_DONE: + service = find_service_by_port(state, localport); + if ((!service || service->remoteport != remoteport) && + (localport == 0) && + (type == VCHIQ_MSG_CLOSE)) { + /* This could be a CLOSE from a client which + hadn't yet received the OPENACK - look for + the connected service */ + if (service) + unlock_service(service); + service = get_connected_service(state, + remoteport); + if (service) + vchiq_log_warning(vchiq_core_log_level, + "%d: prs %s@%x (%d->%d) - " + "found connected service %d", + state->id, msg_type_str(type), + (unsigned int)header, + remoteport, localport, + service->localport); + } + + if (!service) { + vchiq_log_error(vchiq_core_log_level, + "%d: prs %s@%x (%d->%d) - " + "invalid/closed service %d", + state->id, msg_type_str(type), + (unsigned int)header, + remoteport, localport, localport); + goto skip_message; + } + break; + default: + break; + } + + if (vchiq_core_msg_log_level >= VCHIQ_LOG_INFO) { + int svc_fourcc; + + svc_fourcc = service + ? service->base.fourcc + : VCHIQ_MAKE_FOURCC('?', '?', '?', '?'); + vchiq_log_info(vchiq_core_msg_log_level, + "Rcvd Msg %s(%u) from %c%c%c%c s:%d d:%d " + "len:%d", + msg_type_str(type), type, + VCHIQ_FOURCC_AS_4CHARS(svc_fourcc), + remoteport, localport, size); + if (size > 0) + vchiq_log_dump_mem("Rcvd", 0, header->data, + min(64, size)); + } + + if (((unsigned int)header & VCHIQ_SLOT_MASK) + calc_stride(size) + > VCHIQ_SLOT_SIZE) { + vchiq_log_error(vchiq_core_log_level, + "header %x (msgid %x) - size %x too big for " + "slot", + (unsigned int)header, (unsigned int)msgid, + (unsigned int)size); + WARN(1, "oversized for slot\n"); + } + + switch (type) { + case VCHIQ_MSG_OPEN: + WARN_ON(!(VCHIQ_MSG_DSTPORT(msgid) == 0)); + if (!parse_open(state, header)) + goto bail_not_ready; + break; + case VCHIQ_MSG_OPENACK: + if (size >= sizeof(struct vchiq_openack_payload)) { + const struct vchiq_openack_payload *payload = + (struct vchiq_openack_payload *) + header->data; + service->peer_version = payload->version; + } + vchiq_log_info(vchiq_core_log_level, + "%d: prs OPENACK@%x,%x (%d->%d) v:%d", + state->id, (unsigned int)header, size, + remoteport, localport, service->peer_version); + if (service->srvstate == + VCHIQ_SRVSTATE_OPENING) { + service->remoteport = remoteport; + vchiq_set_service_state(service, + VCHIQ_SRVSTATE_OPEN); + up(&service->remove_event); + } else + vchiq_log_error(vchiq_core_log_level, + "OPENACK received in state %s", + srvstate_names[service->srvstate]); + break; + case VCHIQ_MSG_CLOSE: + WARN_ON(size != 0); /* There should be no data */ + + vchiq_log_info(vchiq_core_log_level, + "%d: prs CLOSE@%x (%d->%d)", + state->id, (unsigned int)header, + remoteport, localport); + + mark_service_closing_internal(service, 1); + + if (vchiq_close_service_internal(service, + 1/*close_recvd*/) == VCHIQ_RETRY) + goto bail_not_ready; + + vchiq_log_info(vchiq_core_log_level, + "Close Service %c%c%c%c s:%u d:%d", + VCHIQ_FOURCC_AS_4CHARS(service->base.fourcc), + service->localport, + service->remoteport); + break; + case VCHIQ_MSG_DATA: + vchiq_log_trace(vchiq_core_log_level, + "%d: prs DATA@%x,%x (%d->%d)", + state->id, (unsigned int)header, size, + remoteport, localport); + + if ((service->remoteport == remoteport) + && (service->srvstate == + VCHIQ_SRVSTATE_OPEN)) { + header->msgid = msgid | VCHIQ_MSGID_CLAIMED; + claim_slot(state->rx_info); + DEBUG_TRACE(PARSE_LINE); + if (make_service_callback(service, + VCHIQ_MESSAGE_AVAILABLE, header, + NULL) == VCHIQ_RETRY) { + DEBUG_TRACE(PARSE_LINE); + goto bail_not_ready; + } + VCHIQ_SERVICE_STATS_INC(service, ctrl_rx_count); + VCHIQ_SERVICE_STATS_ADD(service, ctrl_rx_bytes, + size); + } else { + VCHIQ_STATS_INC(state, error_count); + } + break; + case VCHIQ_MSG_CONNECT: + vchiq_log_info(vchiq_core_log_level, + "%d: prs CONNECT@%x", + state->id, (unsigned int)header); + up(&state->connect); + break; + case VCHIQ_MSG_BULK_RX: + case VCHIQ_MSG_BULK_TX: { + VCHIQ_BULK_QUEUE_T *queue; + WARN_ON(!state->is_master); + queue = (type == VCHIQ_MSG_BULK_RX) ? + &service->bulk_tx : &service->bulk_rx; + if ((service->remoteport == remoteport) + && (service->srvstate == + VCHIQ_SRVSTATE_OPEN)) { + VCHIQ_BULK_T *bulk; + int resolved = 0; + + DEBUG_TRACE(PARSE_LINE); + if (lmutex_lock_interruptible( + &service->bulk_mutex) != 0) { + DEBUG_TRACE(PARSE_LINE); + goto bail_not_ready; + } + + WARN_ON(!(queue->remote_insert < queue->remove + + VCHIQ_NUM_SERVICE_BULKS)); + bulk = &queue->bulks[ + BULK_INDEX(queue->remote_insert)]; + bulk->remote_data = + (void *)((int *)header->data)[0]; + bulk->remote_size = ((int *)header->data)[1]; + wmb(); + + vchiq_log_info(vchiq_core_log_level, + "%d: prs %s@%x (%d->%d) %x@%x", + state->id, msg_type_str(type), + (unsigned int)header, + remoteport, localport, + bulk->remote_size, + (unsigned int)bulk->remote_data); + + queue->remote_insert++; + + if (atomic_read(&pause_bulks_count)) { + state->deferred_bulks++; + vchiq_log_info(vchiq_core_log_level, + "%s: deferring bulk (%d)", + __func__, + state->deferred_bulks); + if (state->conn_state != + VCHIQ_CONNSTATE_PAUSE_SENT) + vchiq_log_error( + vchiq_core_log_level, + "%s: bulks paused in " + "unexpected state %s", + __func__, + conn_state_names[ + state->conn_state]); + } else if (state->conn_state == + VCHIQ_CONNSTATE_CONNECTED) { + DEBUG_TRACE(PARSE_LINE); + resolved = resolve_bulks(service, + queue); + } + + lmutex_unlock(&service->bulk_mutex); + if (resolved) + notify_bulks(service, queue, + 1/*retry_poll*/); + } + } break; + case VCHIQ_MSG_BULK_RX_DONE: + case VCHIQ_MSG_BULK_TX_DONE: + WARN_ON(state->is_master); + if ((service->remoteport == remoteport) + && (service->srvstate != + VCHIQ_SRVSTATE_FREE)) { + VCHIQ_BULK_QUEUE_T *queue; + VCHIQ_BULK_T *bulk; + + queue = (type == VCHIQ_MSG_BULK_RX_DONE) ? + &service->bulk_rx : &service->bulk_tx; + + DEBUG_TRACE(PARSE_LINE); + if (lmutex_lock_interruptible( + &service->bulk_mutex) != 0) { + DEBUG_TRACE(PARSE_LINE); + goto bail_not_ready; + } + if ((int)(queue->remote_insert - + queue->local_insert) >= 0) { + vchiq_log_error(vchiq_core_log_level, + "%d: prs %s@%x (%d->%d) " + "unexpected (ri=%d,li=%d)", + state->id, msg_type_str(type), + (unsigned int)header, + remoteport, localport, + queue->remote_insert, + queue->local_insert); + lmutex_unlock(&service->bulk_mutex); + break; + } + + BUG_ON(queue->process == queue->local_insert); + BUG_ON(queue->process != queue->remote_insert); + + bulk = &queue->bulks[ + BULK_INDEX(queue->remote_insert)]; + bulk->actual = *(int *)header->data; + queue->remote_insert++; + + vchiq_log_info(vchiq_core_log_level, + "%d: prs %s@%x (%d->%d) %x@%x", + state->id, msg_type_str(type), + (unsigned int)header, + remoteport, localport, + bulk->actual, (unsigned int)bulk->data); + + vchiq_log_trace(vchiq_core_log_level, + "%d: prs:%d %cx li=%x ri=%x p=%x", + state->id, localport, + (type == VCHIQ_MSG_BULK_RX_DONE) ? + 'r' : 't', + queue->local_insert, + queue->remote_insert, queue->process); + + DEBUG_TRACE(PARSE_LINE); + WARN_ON(queue->process == queue->local_insert); + vchiq_complete_bulk(bulk); + queue->process++; + lmutex_unlock(&service->bulk_mutex); + DEBUG_TRACE(PARSE_LINE); + notify_bulks(service, queue, 1/*retry_poll*/); + DEBUG_TRACE(PARSE_LINE); + } + break; + case VCHIQ_MSG_PADDING: + vchiq_log_trace(vchiq_core_log_level, + "%d: prs PADDING@%x,%x", + state->id, (unsigned int)header, size); + break; + case VCHIQ_MSG_PAUSE: + /* If initiated, signal the application thread */ + vchiq_log_trace(vchiq_core_log_level, + "%d: prs PAUSE@%x,%x", + state->id, (unsigned int)header, size); + if (state->conn_state == VCHIQ_CONNSTATE_PAUSED) { + vchiq_log_error(vchiq_core_log_level, + "%d: PAUSE received in state PAUSED", + state->id); + break; + } + if (state->conn_state != VCHIQ_CONNSTATE_PAUSE_SENT) { + /* Send a PAUSE in response */ + if (queue_message(state, NULL, + VCHIQ_MAKE_MSG(VCHIQ_MSG_PAUSE, 0, 0), + NULL, 0, 0, 0) == VCHIQ_RETRY) + goto bail_not_ready; + if (state->is_master) + pause_bulks(state); + } + /* At this point slot_mutex is held */ + vchiq_set_conn_state(state, VCHIQ_CONNSTATE_PAUSED); + vchiq_platform_paused(state); + break; + case VCHIQ_MSG_RESUME: + vchiq_log_trace(vchiq_core_log_level, + "%d: prs RESUME@%x,%x", + state->id, (unsigned int)header, size); + /* Release the slot mutex */ + lmutex_unlock(&state->slot_mutex); + if (state->is_master) + resume_bulks(state); + vchiq_set_conn_state(state, VCHIQ_CONNSTATE_CONNECTED); + vchiq_platform_resumed(state); + break; + + case VCHIQ_MSG_REMOTE_USE: + vchiq_on_remote_use(state); + break; + case VCHIQ_MSG_REMOTE_RELEASE: + vchiq_on_remote_release(state); + break; + case VCHIQ_MSG_REMOTE_USE_ACTIVE: + vchiq_on_remote_use_active(state); + break; + + default: + vchiq_log_error(vchiq_core_log_level, + "%d: prs invalid msgid %x@%x,%x", + state->id, msgid, (unsigned int)header, size); + WARN(1, "invalid message\n"); + break; + } + +skip_message: + if (service) { + unlock_service(service); + service = NULL; + } + + state->rx_pos += calc_stride(size); + + DEBUG_TRACE(PARSE_LINE); + /* Perform some housekeeping when the end of the slot is + ** reached. */ + if ((state->rx_pos & VCHIQ_SLOT_MASK) == 0) { + /* Remove the extra reference count. */ + release_slot(state, state->rx_info, NULL, NULL); + state->rx_data = NULL; + } + } + +bail_not_ready: + if (service) + unlock_service(service); +} + +/* Called by the slot handler thread */ +int slot_handler_func(void *v); +int +slot_handler_func(void *v) +{ + VCHIQ_STATE_T *state = (VCHIQ_STATE_T *) v; + VCHIQ_SHARED_STATE_T *local = state->local; + DEBUG_INITIALISE(local) + + while (1) { + DEBUG_COUNT(SLOT_HANDLER_COUNT); + DEBUG_TRACE(SLOT_HANDLER_LINE); + remote_event_wait(&local->trigger); + + rmb(); + + DEBUG_TRACE(SLOT_HANDLER_LINE); + if (state->poll_needed) { + /* Check if we need to suspend - may change our + * conn_state */ + vchiq_platform_check_suspend(state); + + state->poll_needed = 0; + + /* Handle service polling and other rare conditions here + ** out of the mainline code */ + switch (state->conn_state) { + case VCHIQ_CONNSTATE_CONNECTED: + /* Poll the services as requested */ + poll_services(state); + break; + + case VCHIQ_CONNSTATE_PAUSING: + if (state->is_master) + pause_bulks(state); + if (queue_message(state, NULL, + VCHIQ_MAKE_MSG(VCHIQ_MSG_PAUSE, 0, 0), + NULL, 0, 0, 0) != VCHIQ_RETRY) { + vchiq_set_conn_state(state, + VCHIQ_CONNSTATE_PAUSE_SENT); + } else { + if (state->is_master) + resume_bulks(state); + /* Retry later */ + state->poll_needed = 1; + } + break; + + case VCHIQ_CONNSTATE_PAUSED: + vchiq_platform_resume(state); + break; + + case VCHIQ_CONNSTATE_RESUMING: + if (queue_message(state, NULL, + VCHIQ_MAKE_MSG(VCHIQ_MSG_RESUME, 0, 0), + NULL, 0, 0, 0) != VCHIQ_RETRY) { + if (state->is_master) + resume_bulks(state); + vchiq_set_conn_state(state, + VCHIQ_CONNSTATE_CONNECTED); + vchiq_platform_resumed(state); + } else { + /* This should really be impossible, + ** since the PAUSE should have flushed + ** through outstanding messages. */ + vchiq_log_error(vchiq_core_log_level, + "Failed to send RESUME " + "message"); + BUG(); + } + break; + + case VCHIQ_CONNSTATE_PAUSE_TIMEOUT: + case VCHIQ_CONNSTATE_RESUME_TIMEOUT: + vchiq_platform_handle_timeout(state); + break; + default: + break; + } + + + } + + DEBUG_TRACE(SLOT_HANDLER_LINE); + parse_rx_slots(state); + } + return 0; +} + + +/* Called by the recycle thread */ +int recycle_func(void *v); +int +recycle_func(void *v) +{ + VCHIQ_STATE_T *state = (VCHIQ_STATE_T *) v; + VCHIQ_SHARED_STATE_T *local = state->local; + + while (1) { + remote_event_wait(&local->recycle); + + process_free_queue(state); + } + return 0; +} + + +/* Called by the sync thread */ +int sync_func(void *v); +int +sync_func(void *v) +{ + VCHIQ_STATE_T *state = (VCHIQ_STATE_T *) v; + VCHIQ_SHARED_STATE_T *local = state->local; + VCHIQ_HEADER_T *header = (VCHIQ_HEADER_T *)SLOT_DATA_FROM_INDEX(state, + state->remote->slot_sync); + + while (1) { + VCHIQ_SERVICE_T *service; + int msgid, size; + int type; + unsigned int localport, remoteport; + + remote_event_wait(&local->sync_trigger); + + rmb(); + + msgid = header->msgid; + size = header->size; + type = VCHIQ_MSG_TYPE(msgid); + localport = VCHIQ_MSG_DSTPORT(msgid); + remoteport = VCHIQ_MSG_SRCPORT(msgid); + + service = find_service_by_port(state, localport); + + if (!service) { + vchiq_log_error(vchiq_sync_log_level, + "%d: sf %s@%x (%d->%d) - " + "invalid/closed service %d", + state->id, msg_type_str(type), + (unsigned int)header, + remoteport, localport, localport); + release_message_sync(state, header); + continue; + } + + if (vchiq_sync_log_level >= VCHIQ_LOG_TRACE) { + int svc_fourcc; + + svc_fourcc = service + ? service->base.fourcc + : VCHIQ_MAKE_FOURCC('?', '?', '?', '?'); + vchiq_log_trace(vchiq_sync_log_level, + "Rcvd Msg %s from %c%c%c%c s:%d d:%d len:%d", + msg_type_str(type), + VCHIQ_FOURCC_AS_4CHARS(svc_fourcc), + remoteport, localport, size); + if (size > 0) + vchiq_log_dump_mem("Rcvd", 0, header->data, + min(64, size)); + } + + switch (type) { + case VCHIQ_MSG_OPENACK: + if (size >= sizeof(struct vchiq_openack_payload)) { + const struct vchiq_openack_payload *payload = + (struct vchiq_openack_payload *) + header->data; + service->peer_version = payload->version; + } + vchiq_log_info(vchiq_sync_log_level, + "%d: sf OPENACK@%x,%x (%d->%d) v:%d", + state->id, (unsigned int)header, size, + remoteport, localport, service->peer_version); + if (service->srvstate == VCHIQ_SRVSTATE_OPENING) { + service->remoteport = remoteport; + vchiq_set_service_state(service, + VCHIQ_SRVSTATE_OPENSYNC); + up(&service->remove_event); + } + release_message_sync(state, header); + break; + + case VCHIQ_MSG_DATA: + vchiq_log_trace(vchiq_sync_log_level, + "%d: sf DATA@%x,%x (%d->%d)", + state->id, (unsigned int)header, size, + remoteport, localport); + + if ((service->remoteport == remoteport) && + (service->srvstate == + VCHIQ_SRVSTATE_OPENSYNC)) { + if (make_service_callback(service, + VCHIQ_MESSAGE_AVAILABLE, header, + NULL) == VCHIQ_RETRY) + vchiq_log_error(vchiq_sync_log_level, + "synchronous callback to " + "service %d returns " + "VCHIQ_RETRY", + localport); + } + break; + + default: + vchiq_log_error(vchiq_sync_log_level, + "%d: sf unexpected msgid %x@%x,%x", + state->id, msgid, (unsigned int)header, size); + release_message_sync(state, header); + break; + } + + unlock_service(service); + } + + return 0; +} + + +static void +init_bulk_queue(VCHIQ_BULK_QUEUE_T *queue) +{ + queue->local_insert = 0; + queue->remote_insert = 0; + queue->process = 0; + queue->remote_notify = 0; + queue->remove = 0; +} + + +inline const char * +get_conn_state_name(VCHIQ_CONNSTATE_T conn_state) +{ + return conn_state_names[conn_state]; +} + + +VCHIQ_SLOT_ZERO_T * +vchiq_init_slots(void *mem_base, int mem_size) +{ + int mem_align = (VCHIQ_SLOT_SIZE - (int)mem_base) & VCHIQ_SLOT_MASK; + VCHIQ_SLOT_ZERO_T *slot_zero = + (VCHIQ_SLOT_ZERO_T *)((char *)mem_base + mem_align); + int num_slots = (mem_size - mem_align)/VCHIQ_SLOT_SIZE; + int first_data_slot = VCHIQ_SLOT_ZERO_SLOTS; + + /* Ensure there is enough memory to run an absolutely minimum system */ + num_slots -= first_data_slot; + + if (num_slots < 4) { + vchiq_log_error(vchiq_core_log_level, + "vchiq_init_slots - insufficient memory %x bytes", + mem_size); + return NULL; + } + + memset(slot_zero, 0, sizeof(VCHIQ_SLOT_ZERO_T)); + + slot_zero->magic = VCHIQ_MAGIC; + slot_zero->version = VCHIQ_VERSION; + slot_zero->version_min = VCHIQ_VERSION_MIN; + slot_zero->slot_zero_size = sizeof(VCHIQ_SLOT_ZERO_T); + slot_zero->slot_size = VCHIQ_SLOT_SIZE; + slot_zero->max_slots = VCHIQ_MAX_SLOTS; + slot_zero->max_slots_per_side = VCHIQ_MAX_SLOTS_PER_SIDE; + + slot_zero->master.slot_sync = first_data_slot; + slot_zero->master.slot_first = first_data_slot + 1; + slot_zero->master.slot_last = first_data_slot + (num_slots/2) - 1; + slot_zero->slave.slot_sync = first_data_slot + (num_slots/2); + slot_zero->slave.slot_first = first_data_slot + (num_slots/2) + 1; + slot_zero->slave.slot_last = first_data_slot + num_slots - 1; + + return slot_zero; +} + +VCHIQ_STATUS_T +vchiq_init_state(VCHIQ_STATE_T *state, VCHIQ_SLOT_ZERO_T *slot_zero, + int is_master) +{ + VCHIQ_SHARED_STATE_T *local; + VCHIQ_SHARED_STATE_T *remote; + VCHIQ_STATUS_T status; + char threadname[10]; + static int id; + int i; + + /* Check the input configuration */ + + if (slot_zero->magic != VCHIQ_MAGIC) { + vchiq_loud_error_header(); + vchiq_loud_error("Invalid VCHIQ magic value found."); + vchiq_loud_error("slot_zero=%x: magic=%x (expected %x)", + (unsigned int)slot_zero, slot_zero->magic, VCHIQ_MAGIC); + vchiq_loud_error_footer(); + return VCHIQ_ERROR; + } + + vchiq_log_warning(vchiq_core_log_level, + "local ver %d (min %d), remote ver %d.", + VCHIQ_VERSION, VCHIQ_VERSION_MIN, + slot_zero->version); + + if (slot_zero->version < VCHIQ_VERSION_MIN) { + vchiq_loud_error_header(); + vchiq_loud_error("Incompatible VCHIQ versions found."); + vchiq_loud_error("slot_zero=%x: VideoCore version=%d " + "(minimum %d)", + (unsigned int)slot_zero, slot_zero->version, + VCHIQ_VERSION_MIN); + vchiq_loud_error("Restart with a newer VideoCore image."); + vchiq_loud_error_footer(); + return VCHIQ_ERROR; + } + + if (VCHIQ_VERSION < slot_zero->version_min) { + vchiq_loud_error_header(); + vchiq_loud_error("Incompatible VCHIQ versions found."); + vchiq_loud_error("slot_zero=%x: version=%d (VideoCore " + "minimum %d)", + (unsigned int)slot_zero, VCHIQ_VERSION, + slot_zero->version_min); + vchiq_loud_error("Restart with a newer kernel."); + vchiq_loud_error_footer(); + return VCHIQ_ERROR; + } + + if ((slot_zero->slot_zero_size != sizeof(VCHIQ_SLOT_ZERO_T)) || + (slot_zero->slot_size != VCHIQ_SLOT_SIZE) || + (slot_zero->max_slots != VCHIQ_MAX_SLOTS) || + (slot_zero->max_slots_per_side != VCHIQ_MAX_SLOTS_PER_SIDE)) { + vchiq_loud_error_header(); + if (slot_zero->slot_zero_size != sizeof(VCHIQ_SLOT_ZERO_T)) + vchiq_loud_error("slot_zero=%x: slot_zero_size=%x " + "(expected %zx)", + (unsigned int)slot_zero, + slot_zero->slot_zero_size, + sizeof(VCHIQ_SLOT_ZERO_T)); + if (slot_zero->slot_size != VCHIQ_SLOT_SIZE) + vchiq_loud_error("slot_zero=%x: slot_size=%d " + "(expected %d", + (unsigned int)slot_zero, slot_zero->slot_size, + VCHIQ_SLOT_SIZE); + if (slot_zero->max_slots != VCHIQ_MAX_SLOTS) + vchiq_loud_error("slot_zero=%x: max_slots=%d " + "(expected %d)", + (unsigned int)slot_zero, slot_zero->max_slots, + VCHIQ_MAX_SLOTS); + if (slot_zero->max_slots_per_side != VCHIQ_MAX_SLOTS_PER_SIDE) + vchiq_loud_error("slot_zero=%x: max_slots_per_side=%d " + "(expected %d)", + (unsigned int)slot_zero, + slot_zero->max_slots_per_side, + VCHIQ_MAX_SLOTS_PER_SIDE); + vchiq_loud_error_footer(); + return VCHIQ_ERROR; + } + + if (is_master) { + local = &slot_zero->master; + remote = &slot_zero->slave; + } else { + local = &slot_zero->slave; + remote = &slot_zero->master; + } + + if (local->initialised) { + vchiq_loud_error_header(); + if (remote->initialised) + vchiq_loud_error("local state has already been " + "initialised"); + else + vchiq_loud_error("master/slave mismatch - two %ss", + is_master ? "master" : "slave"); + vchiq_loud_error_footer(); + return VCHIQ_ERROR; + } + + memset(state, 0, sizeof(VCHIQ_STATE_T)); + + state->id = id++; + state->is_master = is_master; + + /* + initialize shared state pointers + */ + + state->local = local; + state->remote = remote; + state->slot_data = (VCHIQ_SLOT_T *)slot_zero; + + /* + initialize events and mutexes + */ + + _sema_init(&state->connect, 0); + lmutex_init(&state->mutex); + _sema_init(&state->trigger_event, 0); + _sema_init(&state->recycle_event, 0); + _sema_init(&state->sync_trigger_event, 0); + _sema_init(&state->sync_release_event, 0); + + lmutex_init(&state->slot_mutex); + lmutex_init(&state->recycle_mutex); + lmutex_init(&state->sync_mutex); + lmutex_init(&state->bulk_transfer_mutex); + + _sema_init(&state->slot_available_event, 0); + _sema_init(&state->slot_remove_event, 0); + _sema_init(&state->data_quota_event, 0); + + state->slot_queue_available = 0; + + for (i = 0; i < VCHIQ_MAX_SERVICES; i++) { + VCHIQ_SERVICE_QUOTA_T *service_quota = + &state->service_quotas[i]; + _sema_init(&service_quota->quota_event, 0); + } + + for (i = local->slot_first; i <= local->slot_last; i++) { + local->slot_queue[state->slot_queue_available++] = i; + up(&state->slot_available_event); + } + + state->default_slot_quota = state->slot_queue_available/2; + state->default_message_quota = + min((unsigned short)(state->default_slot_quota * 256), + (unsigned short)~0); + + state->previous_data_index = -1; + state->data_use_count = 0; + state->data_quota = state->slot_queue_available - 1; + + local->trigger.event = &state->trigger_event; + remote_event_create(&local->trigger); + local->tx_pos = 0; + + local->recycle.event = &state->recycle_event; + remote_event_create(&local->recycle); + local->slot_queue_recycle = state->slot_queue_available; + + local->sync_trigger.event = &state->sync_trigger_event; + remote_event_create(&local->sync_trigger); + + local->sync_release.event = &state->sync_release_event; + remote_event_create(&local->sync_release); + + /* At start-of-day, the slot is empty and available */ + ((VCHIQ_HEADER_T *)SLOT_DATA_FROM_INDEX(state, local->slot_sync))->msgid + = VCHIQ_MSGID_PADDING; + remote_event_signal_local(&local->sync_release); + + local->debug[DEBUG_ENTRIES] = DEBUG_MAX; + + status = vchiq_platform_init_state(state); + + /* + bring up slot handler thread + */ + snprintf(threadname, sizeof(threadname), "VCHIQ-%d", state->id); + state->slot_handler_thread = vchiq_thread_create(&slot_handler_func, + (void *)state, + threadname); + + if (state->slot_handler_thread == NULL) { + vchiq_loud_error_header(); + vchiq_loud_error("couldn't create thread %s", threadname); + vchiq_loud_error_footer(); + return VCHIQ_ERROR; + } + set_user_nice(state->slot_handler_thread, -19); + wake_up_process(state->slot_handler_thread); + + snprintf(threadname, sizeof(threadname), "VCHIQr-%d", state->id); + state->recycle_thread = vchiq_thread_create(&recycle_func, + (void *)state, + threadname); + if (state->recycle_thread == NULL) { + vchiq_loud_error_header(); + vchiq_loud_error("couldn't create thread %s", threadname); + vchiq_loud_error_footer(); + return VCHIQ_ERROR; + } + set_user_nice(state->recycle_thread, -19); + wake_up_process(state->recycle_thread); + + snprintf(threadname, sizeof(threadname), "VCHIQs-%d", state->id); + state->sync_thread = vchiq_thread_create(&sync_func, + (void *)state, + threadname); + if (state->sync_thread == NULL) { + vchiq_loud_error_header(); + vchiq_loud_error("couldn't create thread %s", threadname); + vchiq_loud_error_footer(); + return VCHIQ_ERROR; + } + set_user_nice(state->sync_thread, -20); + wake_up_process(state->sync_thread); + + BUG_ON(state->id >= VCHIQ_MAX_STATES); + vchiq_states[state->id] = state; + + /* Indicate readiness to the other side */ + local->initialised = 1; + + return status; +} + +/* Called from application thread when a client or server service is created. */ +VCHIQ_SERVICE_T * +vchiq_add_service_internal(VCHIQ_STATE_T *state, + const VCHIQ_SERVICE_PARAMS_T *params, int srvstate, + VCHIQ_INSTANCE_T instance, VCHIQ_USERDATA_TERM_T userdata_term) +{ + VCHIQ_SERVICE_T *service; + + service = kmalloc(sizeof(VCHIQ_SERVICE_T), GFP_KERNEL); + if (service) { + service->base.fourcc = params->fourcc; + service->base.callback = params->callback; + service->base.userdata = params->userdata; + service->handle = VCHIQ_SERVICE_HANDLE_INVALID; + service->ref_count = 1; + service->srvstate = VCHIQ_SRVSTATE_FREE; + service->userdata_term = userdata_term; + service->localport = VCHIQ_PORT_FREE; + service->remoteport = VCHIQ_PORT_FREE; + + service->public_fourcc = (srvstate == VCHIQ_SRVSTATE_OPENING) ? + VCHIQ_FOURCC_INVALID : params->fourcc; + service->client_id = 0; + service->auto_close = 1; + service->sync = 0; + service->closing = 0; + atomic_set(&service->poll_flags, 0); + service->version = params->version; + service->version_min = params->version_min; + service->state = state; + service->instance = instance; + service->service_use_count = 0; + init_bulk_queue(&service->bulk_tx); + init_bulk_queue(&service->bulk_rx); + _sema_init(&service->remove_event, 0); + _sema_init(&service->bulk_remove_event, 0); + lmutex_init(&service->bulk_mutex); + memset(&service->stats, 0, sizeof(service->stats)); + } else { + vchiq_log_error(vchiq_core_log_level, + "Out of memory"); + } + + if (service) { + VCHIQ_SERVICE_T **pservice = NULL; + int i; + + /* Although it is perfectly possible to use service_spinlock + ** to protect the creation of services, it is overkill as it + ** disables interrupts while the array is searched. + ** The only danger is of another thread trying to create a + ** service - service deletion is safe. + ** Therefore it is preferable to use state->mutex which, + ** although slower to claim, doesn't block interrupts while + ** it is held. + */ + + lmutex_lock(&state->mutex); + + /* Prepare to use a previously unused service */ + if (state->unused_service < VCHIQ_MAX_SERVICES) + pservice = &state->services[state->unused_service]; + + if (srvstate == VCHIQ_SRVSTATE_OPENING) { + for (i = 0; i < state->unused_service; i++) { + VCHIQ_SERVICE_T *srv = state->services[i]; + if (!srv) { + pservice = &state->services[i]; + break; + } + } + } else { + for (i = (state->unused_service - 1); i >= 0; i--) { + VCHIQ_SERVICE_T *srv = state->services[i]; + if (!srv) + pservice = &state->services[i]; + else if ((srv->public_fourcc == params->fourcc) + && ((srv->instance != instance) || + (srv->base.callback != + params->callback))) { + /* There is another server using this + ** fourcc which doesn't match. */ + pservice = NULL; + break; + } + } + } + + if (pservice) { + service->localport = (pservice - state->services); + if (!handle_seq) + handle_seq = VCHIQ_MAX_STATES * + VCHIQ_MAX_SERVICES; + service->handle = handle_seq | + (state->id * VCHIQ_MAX_SERVICES) | + service->localport; + handle_seq += VCHIQ_MAX_STATES * VCHIQ_MAX_SERVICES; + *pservice = service; + if (pservice == &state->services[state->unused_service]) + state->unused_service++; + } + + lmutex_unlock(&state->mutex); + + if (!pservice) { + _sema_destroy(&service->remove_event); + _sema_destroy(&service->bulk_remove_event); + lmutex_destroy(&service->bulk_mutex); + + kfree(service); + service = NULL; + } + } + + if (service) { + VCHIQ_SERVICE_QUOTA_T *service_quota = + &state->service_quotas[service->localport]; + service_quota->slot_quota = state->default_slot_quota; + service_quota->message_quota = state->default_message_quota; + if (service_quota->slot_use_count == 0) + service_quota->previous_tx_index = + SLOT_QUEUE_INDEX_FROM_POS(state->local_tx_pos) + - 1; + + /* Bring this service online */ + vchiq_set_service_state(service, srvstate); + + vchiq_log_info(vchiq_core_msg_log_level, + "%s Service %c%c%c%c SrcPort:%d", + (srvstate == VCHIQ_SRVSTATE_OPENING) + ? "Open" : "Add", + VCHIQ_FOURCC_AS_4CHARS(params->fourcc), + service->localport); + } + + /* Don't unlock the service - leave it with a ref_count of 1. */ + + return service; +} + +VCHIQ_STATUS_T +vchiq_open_service_internal(VCHIQ_SERVICE_T *service, int client_id) +{ + struct vchiq_open_payload payload = { + service->base.fourcc, + client_id, + service->version, + service->version_min + }; + VCHIQ_ELEMENT_T body = { &payload, sizeof(payload) }; + VCHIQ_STATUS_T status = VCHIQ_SUCCESS; + + service->client_id = client_id; + vchiq_use_service_internal(service); + status = queue_message(service->state, NULL, + VCHIQ_MAKE_MSG(VCHIQ_MSG_OPEN, service->localport, 0), + &body, 1, sizeof(payload), 1); + if (status == VCHIQ_SUCCESS) { + if (down_interruptible(&service->remove_event) != 0) { + status = VCHIQ_RETRY; + vchiq_release_service_internal(service); + } else if ((service->srvstate != VCHIQ_SRVSTATE_OPEN) && + (service->srvstate != VCHIQ_SRVSTATE_OPENSYNC)) { + if (service->srvstate != VCHIQ_SRVSTATE_CLOSEWAIT) + vchiq_log_error(vchiq_core_log_level, + "%d: osi - srvstate = %s (ref %d)", + service->state->id, + srvstate_names[service->srvstate], + service->ref_count); + status = VCHIQ_ERROR; + VCHIQ_SERVICE_STATS_INC(service, error_count); + vchiq_release_service_internal(service); + } + } + return status; +} + +static void +release_service_messages(VCHIQ_SERVICE_T *service) +{ + VCHIQ_STATE_T *state = service->state; + int slot_last = state->remote->slot_last; + int i; + + /* Release any claimed messages */ + for (i = state->remote->slot_first; i <= slot_last; i++) { + VCHIQ_SLOT_INFO_T *slot_info = + SLOT_INFO_FROM_INDEX(state, i); + if (slot_info->release_count != slot_info->use_count) { + char *data = + (char *)SLOT_DATA_FROM_INDEX(state, i); + unsigned int pos, end; + + end = VCHIQ_SLOT_SIZE; + if (data == state->rx_data) + /* This buffer is still being read from - stop + ** at the current read position */ + end = state->rx_pos & VCHIQ_SLOT_MASK; + + pos = 0; + + while (pos < end) { + VCHIQ_HEADER_T *header = + (VCHIQ_HEADER_T *)(data + pos); + int msgid = header->msgid; + int port = VCHIQ_MSG_DSTPORT(msgid); + if ((port == service->localport) && + (msgid & VCHIQ_MSGID_CLAIMED)) { + vchiq_log_info(vchiq_core_log_level, + " fsi - hdr %x", + (unsigned int)header); + release_slot(state, slot_info, header, + NULL); + } + pos += calc_stride(header->size); + if (pos > VCHIQ_SLOT_SIZE) { + vchiq_log_error(vchiq_core_log_level, + "fsi - pos %x: header %x, " + "msgid %x, header->msgid %x, " + "header->size %x", + pos, (unsigned int)header, + msgid, header->msgid, + header->size); + WARN(1, "invalid slot position\n"); + } + } + } + } +} + +static int +do_abort_bulks(VCHIQ_SERVICE_T *service) +{ + VCHIQ_STATUS_T status; + + /* Abort any outstanding bulk transfers */ + if (lmutex_lock_interruptible(&service->bulk_mutex) != 0) + return 0; + abort_outstanding_bulks(service, &service->bulk_tx); + abort_outstanding_bulks(service, &service->bulk_rx); + lmutex_unlock(&service->bulk_mutex); + + status = notify_bulks(service, &service->bulk_tx, 0/*!retry_poll*/); + if (status == VCHIQ_SUCCESS) + status = notify_bulks(service, &service->bulk_rx, + 0/*!retry_poll*/); + return (status == VCHIQ_SUCCESS); +} + +static VCHIQ_STATUS_T +close_service_complete(VCHIQ_SERVICE_T *service, int failstate) +{ + VCHIQ_STATUS_T status; + int is_server = (service->public_fourcc != VCHIQ_FOURCC_INVALID); + int newstate; + + switch (service->srvstate) { + case VCHIQ_SRVSTATE_OPEN: + case VCHIQ_SRVSTATE_CLOSESENT: + case VCHIQ_SRVSTATE_CLOSERECVD: + if (is_server) { + if (service->auto_close) { + service->client_id = 0; + service->remoteport = VCHIQ_PORT_FREE; + newstate = VCHIQ_SRVSTATE_LISTENING; + } else + newstate = VCHIQ_SRVSTATE_CLOSEWAIT; + } else + newstate = VCHIQ_SRVSTATE_CLOSED; + vchiq_set_service_state(service, newstate); + break; + case VCHIQ_SRVSTATE_LISTENING: + break; + default: + vchiq_log_error(vchiq_core_log_level, + "close_service_complete(%x) called in state %s", + service->handle, srvstate_names[service->srvstate]); + WARN(1, "close_service_complete in unexpected state\n"); + return VCHIQ_ERROR; + } + + status = make_service_callback(service, + VCHIQ_SERVICE_CLOSED, NULL, NULL); + + if (status != VCHIQ_RETRY) { + int uc = service->service_use_count; + int i; + /* Complete the close process */ + for (i = 0; i < uc; i++) + /* cater for cases where close is forced and the + ** client may not close all it's handles */ + vchiq_release_service_internal(service); + + service->client_id = 0; + service->remoteport = VCHIQ_PORT_FREE; + + if (service->srvstate == VCHIQ_SRVSTATE_CLOSED) + vchiq_free_service_internal(service); + else if (service->srvstate != VCHIQ_SRVSTATE_CLOSEWAIT) { + if (is_server) + service->closing = 0; + + up(&service->remove_event); + } + } else + vchiq_set_service_state(service, failstate); + + return status; +} + +/* Called by the slot handler */ +VCHIQ_STATUS_T +vchiq_close_service_internal(VCHIQ_SERVICE_T *service, int close_recvd) +{ + VCHIQ_STATE_T *state = service->state; + VCHIQ_STATUS_T status = VCHIQ_SUCCESS; + int is_server = (service->public_fourcc != VCHIQ_FOURCC_INVALID); + + vchiq_log_info(vchiq_core_log_level, "%d: csi:%d,%d (%s)", + service->state->id, service->localport, close_recvd, + srvstate_names[service->srvstate]); + + switch (service->srvstate) { + case VCHIQ_SRVSTATE_CLOSED: + case VCHIQ_SRVSTATE_HIDDEN: + case VCHIQ_SRVSTATE_LISTENING: + case VCHIQ_SRVSTATE_CLOSEWAIT: + if (close_recvd) + vchiq_log_error(vchiq_core_log_level, + "vchiq_close_service_internal(1) called " + "in state %s", + srvstate_names[service->srvstate]); + else if (is_server) { + if (service->srvstate == VCHIQ_SRVSTATE_LISTENING) { + status = VCHIQ_ERROR; + } else { + service->client_id = 0; + service->remoteport = VCHIQ_PORT_FREE; + if (service->srvstate == + VCHIQ_SRVSTATE_CLOSEWAIT) + vchiq_set_service_state(service, + VCHIQ_SRVSTATE_LISTENING); + } + up(&service->remove_event); + } else + vchiq_free_service_internal(service); + break; + case VCHIQ_SRVSTATE_OPENING: + if (close_recvd) { + /* The open was rejected - tell the user */ + vchiq_set_service_state(service, + VCHIQ_SRVSTATE_CLOSEWAIT); + up(&service->remove_event); + } else { + /* Shutdown mid-open - let the other side know */ + status = queue_message(state, service, + VCHIQ_MAKE_MSG + (VCHIQ_MSG_CLOSE, + service->localport, + VCHIQ_MSG_DSTPORT(service->remoteport)), + NULL, 0, 0, 0); + } + break; + + case VCHIQ_SRVSTATE_OPENSYNC: + lmutex_lock(&state->sync_mutex); + /* Drop through */ + + case VCHIQ_SRVSTATE_OPEN: + if (state->is_master || close_recvd) { + if (!do_abort_bulks(service)) + status = VCHIQ_RETRY; + } + + release_service_messages(service); + + if (status == VCHIQ_SUCCESS) + status = queue_message(state, service, + VCHIQ_MAKE_MSG + (VCHIQ_MSG_CLOSE, + service->localport, + VCHIQ_MSG_DSTPORT(service->remoteport)), + NULL, 0, 0, 0); + + if (status == VCHIQ_SUCCESS) { + if (!close_recvd) + break; + } else if (service->srvstate == VCHIQ_SRVSTATE_OPENSYNC) { + lmutex_unlock(&state->sync_mutex); + break; + } else + break; + + status = close_service_complete(service, + VCHIQ_SRVSTATE_CLOSERECVD); + break; + + case VCHIQ_SRVSTATE_CLOSESENT: + if (!close_recvd) + /* This happens when a process is killed mid-close */ + break; + + if (!state->is_master) { + if (!do_abort_bulks(service)) { + status = VCHIQ_RETRY; + break; + } + } + + if (status == VCHIQ_SUCCESS) + status = close_service_complete(service, + VCHIQ_SRVSTATE_CLOSERECVD); + break; + + case VCHIQ_SRVSTATE_CLOSERECVD: + if (!close_recvd && is_server) + /* Force into LISTENING mode */ + vchiq_set_service_state(service, + VCHIQ_SRVSTATE_LISTENING); + status = close_service_complete(service, + VCHIQ_SRVSTATE_CLOSERECVD); + break; + + default: + vchiq_log_error(vchiq_core_log_level, + "vchiq_close_service_internal(%d) called in state %s", + close_recvd, srvstate_names[service->srvstate]); + break; + } + + return status; +} + +/* Called from the application process upon process death */ +void +vchiq_terminate_service_internal(VCHIQ_SERVICE_T *service) +{ + VCHIQ_STATE_T *state = service->state; + + vchiq_log_info(vchiq_core_log_level, "%d: tsi - (%d<->%d)", + state->id, service->localport, service->remoteport); + + mark_service_closing(service); + + /* Mark the service for removal by the slot handler */ + request_poll(state, service, VCHIQ_POLL_REMOVE); +} + +/* Called from the slot handler */ +void +vchiq_free_service_internal(VCHIQ_SERVICE_T *service) +{ + VCHIQ_STATE_T *state = service->state; + + vchiq_log_info(vchiq_core_log_level, "%d: fsi - (%d)", + state->id, service->localport); + + switch (service->srvstate) { + case VCHIQ_SRVSTATE_OPENING: + case VCHIQ_SRVSTATE_CLOSED: + case VCHIQ_SRVSTATE_HIDDEN: + case VCHIQ_SRVSTATE_LISTENING: + case VCHIQ_SRVSTATE_CLOSEWAIT: + break; + default: + vchiq_log_error(vchiq_core_log_level, + "%d: fsi - (%d) in state %s", + state->id, service->localport, + srvstate_names[service->srvstate]); + return; + } + + vchiq_set_service_state(service, VCHIQ_SRVSTATE_FREE); + + up(&service->remove_event); + + /* Release the initial lock */ + unlock_service(service); +} + +VCHIQ_STATUS_T +vchiq_connect_internal(VCHIQ_STATE_T *state, VCHIQ_INSTANCE_T instance) +{ + VCHIQ_SERVICE_T *service; + int i; + + /* Find all services registered to this client and enable them. */ + i = 0; + while ((service = next_service_by_instance(state, instance, + &i)) != NULL) { + if (service->srvstate == VCHIQ_SRVSTATE_HIDDEN) + vchiq_set_service_state(service, + VCHIQ_SRVSTATE_LISTENING); + unlock_service(service); + } + + if (state->conn_state == VCHIQ_CONNSTATE_DISCONNECTED) { + if (queue_message(state, NULL, + VCHIQ_MAKE_MSG(VCHIQ_MSG_CONNECT, 0, 0), NULL, 0, + 0, 1) == VCHIQ_RETRY) + return VCHIQ_RETRY; + + vchiq_set_conn_state(state, VCHIQ_CONNSTATE_CONNECTING); + } + + if (state->conn_state == VCHIQ_CONNSTATE_CONNECTING) { + if (down_interruptible(&state->connect) != 0) + return VCHIQ_RETRY; + + vchiq_set_conn_state(state, VCHIQ_CONNSTATE_CONNECTED); + up(&state->connect); + } + + return VCHIQ_SUCCESS; +} + +VCHIQ_STATUS_T +vchiq_shutdown_internal(VCHIQ_STATE_T *state, VCHIQ_INSTANCE_T instance) +{ + VCHIQ_SERVICE_T *service; + int i; + + /* Find all services registered to this client and enable them. */ + i = 0; + while ((service = next_service_by_instance(state, instance, + &i)) != NULL) { + (void)vchiq_remove_service(service->handle); + unlock_service(service); + } + + return VCHIQ_SUCCESS; +} + +VCHIQ_STATUS_T +vchiq_pause_internal(VCHIQ_STATE_T *state) +{ + VCHIQ_STATUS_T status = VCHIQ_SUCCESS; + + switch (state->conn_state) { + case VCHIQ_CONNSTATE_CONNECTED: + /* Request a pause */ + vchiq_set_conn_state(state, VCHIQ_CONNSTATE_PAUSING); + request_poll(state, NULL, 0); + break; + default: + vchiq_log_error(vchiq_core_log_level, + "vchiq_pause_internal in state %s\n", + conn_state_names[state->conn_state]); + status = VCHIQ_ERROR; + VCHIQ_STATS_INC(state, error_count); + break; + } + + return status; +} + +VCHIQ_STATUS_T +vchiq_resume_internal(VCHIQ_STATE_T *state) +{ + VCHIQ_STATUS_T status = VCHIQ_SUCCESS; + + if (state->conn_state == VCHIQ_CONNSTATE_PAUSED) { + vchiq_set_conn_state(state, VCHIQ_CONNSTATE_RESUMING); + request_poll(state, NULL, 0); + } else { + status = VCHIQ_ERROR; + VCHIQ_STATS_INC(state, error_count); + } + + return status; +} + +VCHIQ_STATUS_T +vchiq_close_service(VCHIQ_SERVICE_HANDLE_T handle) +{ + /* Unregister the service */ + VCHIQ_SERVICE_T *service = find_service_by_handle(handle); + VCHIQ_STATUS_T status = VCHIQ_SUCCESS; + + if (!service) + return VCHIQ_ERROR; + + vchiq_log_info(vchiq_core_log_level, + "%d: close_service:%d", + service->state->id, service->localport); + + if ((service->srvstate == VCHIQ_SRVSTATE_FREE) || + (service->srvstate == VCHIQ_SRVSTATE_LISTENING) || + (service->srvstate == VCHIQ_SRVSTATE_HIDDEN)) { + unlock_service(service); + return VCHIQ_ERROR; + } + + mark_service_closing(service); + + if (current == service->state->slot_handler_thread) { + status = vchiq_close_service_internal(service, + 0/*!close_recvd*/); + BUG_ON(status == VCHIQ_RETRY); + } else { + /* Mark the service for termination by the slot handler */ + request_poll(service->state, service, VCHIQ_POLL_TERMINATE); + } + + while (1) { + if (down_interruptible(&service->remove_event) != 0) { + status = VCHIQ_RETRY; + break; + } + + if ((service->srvstate == VCHIQ_SRVSTATE_FREE) || + (service->srvstate == VCHIQ_SRVSTATE_LISTENING) || + (service->srvstate == VCHIQ_SRVSTATE_OPEN)) + break; + + vchiq_log_warning(vchiq_core_log_level, + "%d: close_service:%d - waiting in state %s", + service->state->id, service->localport, + srvstate_names[service->srvstate]); + } + + if ((status == VCHIQ_SUCCESS) && + (service->srvstate != VCHIQ_SRVSTATE_FREE) && + (service->srvstate != VCHIQ_SRVSTATE_LISTENING)) + status = VCHIQ_ERROR; + + unlock_service(service); + + return status; +} + +VCHIQ_STATUS_T +vchiq_remove_service(VCHIQ_SERVICE_HANDLE_T handle) +{ + /* Unregister the service */ + VCHIQ_SERVICE_T *service = find_service_by_handle(handle); + VCHIQ_STATUS_T status = VCHIQ_SUCCESS; + + if (!service) + return VCHIQ_ERROR; + + vchiq_log_info(vchiq_core_log_level, + "%d: remove_service:%d", + service->state->id, service->localport); + + if (service->srvstate == VCHIQ_SRVSTATE_FREE) { + unlock_service(service); + return VCHIQ_ERROR; + } + + mark_service_closing(service); + + if ((service->srvstate == VCHIQ_SRVSTATE_HIDDEN) || + (current == service->state->slot_handler_thread)) { + /* Make it look like a client, because it must be removed and + not left in the LISTENING state. */ + service->public_fourcc = VCHIQ_FOURCC_INVALID; + + status = vchiq_close_service_internal(service, + 0/*!close_recvd*/); + BUG_ON(status == VCHIQ_RETRY); + } else { + /* Mark the service for removal by the slot handler */ + request_poll(service->state, service, VCHIQ_POLL_REMOVE); + } + while (1) { + if (down_interruptible(&service->remove_event) != 0) { + status = VCHIQ_RETRY; + break; + } + + if ((service->srvstate == VCHIQ_SRVSTATE_FREE) || + (service->srvstate == VCHIQ_SRVSTATE_OPEN)) + break; + + vchiq_log_warning(vchiq_core_log_level, + "%d: remove_service:%d - waiting in state %s", + service->state->id, service->localport, + srvstate_names[service->srvstate]); + } + + if ((status == VCHIQ_SUCCESS) && + (service->srvstate != VCHIQ_SRVSTATE_FREE)) + status = VCHIQ_ERROR; + + unlock_service(service); + + return status; +} + + +/* This function may be called by kernel threads or user threads. + * User threads may receive VCHIQ_RETRY to indicate that a signal has been + * received and the call should be retried after being returned to user + * context. + * When called in blocking mode, the userdata field points to a bulk_waiter + * structure. + */ +VCHIQ_STATUS_T +vchiq_bulk_transfer(VCHIQ_SERVICE_HANDLE_T handle, + VCHI_MEM_HANDLE_T memhandle, void *offset, int size, void *userdata, + VCHIQ_BULK_MODE_T mode, VCHIQ_BULK_DIR_T dir) +{ + VCHIQ_SERVICE_T *service = find_service_by_handle(handle); + VCHIQ_BULK_QUEUE_T *queue; + VCHIQ_BULK_T *bulk; + VCHIQ_STATE_T *state; + struct bulk_waiter *bulk_waiter = NULL; + const char dir_char = (dir == VCHIQ_BULK_TRANSMIT) ? 't' : 'r'; + const int dir_msgtype = (dir == VCHIQ_BULK_TRANSMIT) ? + VCHIQ_MSG_BULK_TX : VCHIQ_MSG_BULK_RX; + VCHIQ_STATUS_T status = VCHIQ_ERROR; + + if (!service || + (service->srvstate != VCHIQ_SRVSTATE_OPEN) || + ((memhandle == VCHI_MEM_HANDLE_INVALID) && (offset == NULL)) || + (vchiq_check_service(service) != VCHIQ_SUCCESS)) + goto error_exit; + + switch (mode) { + case VCHIQ_BULK_MODE_NOCALLBACK: + case VCHIQ_BULK_MODE_CALLBACK: + break; + case VCHIQ_BULK_MODE_BLOCKING: + bulk_waiter = (struct bulk_waiter *)userdata; + _sema_init(&bulk_waiter->event, 0); + bulk_waiter->actual = 0; + bulk_waiter->bulk = NULL; + break; + case VCHIQ_BULK_MODE_WAITING: + bulk_waiter = (struct bulk_waiter *)userdata; + bulk = bulk_waiter->bulk; + goto waiting; + default: + goto error_exit; + } + + state = service->state; + + queue = (dir == VCHIQ_BULK_TRANSMIT) ? + &service->bulk_tx : &service->bulk_rx; + + if (lmutex_lock_interruptible(&service->bulk_mutex) != 0) { + status = VCHIQ_RETRY; + goto error_exit; + } + + if (queue->local_insert == queue->remove + VCHIQ_NUM_SERVICE_BULKS) { + VCHIQ_SERVICE_STATS_INC(service, bulk_stalls); + do { + lmutex_unlock(&service->bulk_mutex); + if (down_interruptible(&service->bulk_remove_event) + != 0) { + status = VCHIQ_RETRY; + goto error_exit; + } + if (lmutex_lock_interruptible(&service->bulk_mutex) + != 0) { + status = VCHIQ_RETRY; + goto error_exit; + } + } while (queue->local_insert == queue->remove + + VCHIQ_NUM_SERVICE_BULKS); + } + + bulk = &queue->bulks[BULK_INDEX(queue->local_insert)]; + + bulk->mode = mode; + bulk->dir = dir; + bulk->userdata = userdata; + bulk->size = size; + bulk->actual = VCHIQ_BULK_ACTUAL_ABORTED; + + if (vchiq_prepare_bulk_data(bulk, memhandle, offset, size, dir) != + VCHIQ_SUCCESS) + goto unlock_error_exit; + + wmb(); + + vchiq_log_info(vchiq_core_log_level, + "%d: bt (%d->%d) %cx %x@%x %x", + state->id, + service->localport, service->remoteport, dir_char, + size, (unsigned int)bulk->data, (unsigned int)userdata); + + if (state->is_master) { + queue->local_insert++; + if (resolve_bulks(service, queue)) + request_poll(state, service, + (dir == VCHIQ_BULK_TRANSMIT) ? + VCHIQ_POLL_TXNOTIFY : VCHIQ_POLL_RXNOTIFY); + } else { + int payload[2] = { (int)bulk->data, bulk->size }; + VCHIQ_ELEMENT_T element = { payload, sizeof(payload) }; + + status = queue_message(state, NULL, + VCHIQ_MAKE_MSG(dir_msgtype, + service->localport, service->remoteport), + &element, 1, sizeof(payload), 1); + if (status != VCHIQ_SUCCESS) { + vchiq_complete_bulk(bulk); + goto unlock_error_exit; + } + queue->local_insert++; + } + + lmutex_unlock(&service->bulk_mutex); + + vchiq_log_trace(vchiq_core_log_level, + "%d: bt:%d %cx li=%x ri=%x p=%x", + state->id, + service->localport, dir_char, + queue->local_insert, queue->remote_insert, queue->process); + +waiting: + unlock_service(service); + + status = VCHIQ_SUCCESS; + + if (bulk_waiter) { + bulk_waiter->bulk = bulk; + if (down_interruptible(&bulk_waiter->event) != 0) + status = VCHIQ_RETRY; + else if (bulk_waiter->actual == VCHIQ_BULK_ACTUAL_ABORTED) + status = VCHIQ_ERROR; + } + + return status; + +unlock_error_exit: + lmutex_unlock(&service->bulk_mutex); + +error_exit: + if (service) + unlock_service(service); + return status; +} + +VCHIQ_STATUS_T +vchiq_queue_message(VCHIQ_SERVICE_HANDLE_T handle, + const VCHIQ_ELEMENT_T *elements, unsigned int count) +{ + VCHIQ_SERVICE_T *service = find_service_by_handle(handle); + VCHIQ_STATUS_T status = VCHIQ_ERROR; + + unsigned int size = 0; + unsigned int i; + + if (!service || + (vchiq_check_service(service) != VCHIQ_SUCCESS)) + goto error_exit; + + for (i = 0; i < (unsigned int)count; i++) { + if (elements[i].size) { + if (elements[i].data == NULL) { + VCHIQ_SERVICE_STATS_INC(service, error_count); + goto error_exit; + } + size += elements[i].size; + } + } + + if (size > VCHIQ_MAX_MSG_SIZE) { + VCHIQ_SERVICE_STATS_INC(service, error_count); + goto error_exit; + } + + switch (service->srvstate) { + case VCHIQ_SRVSTATE_OPEN: + status = queue_message(service->state, service, + VCHIQ_MAKE_MSG(VCHIQ_MSG_DATA, + service->localport, + service->remoteport), + elements, count, size, 1); + break; + case VCHIQ_SRVSTATE_OPENSYNC: + status = queue_message_sync(service->state, service, + VCHIQ_MAKE_MSG(VCHIQ_MSG_DATA, + service->localport, + service->remoteport), + elements, count, size, 1); + break; + default: + status = VCHIQ_ERROR; + break; + } + +error_exit: + if (service) + unlock_service(service); + + return status; +} + +void +vchiq_release_message(VCHIQ_SERVICE_HANDLE_T handle, VCHIQ_HEADER_T *header) +{ + VCHIQ_SERVICE_T *service = find_service_by_handle(handle); + VCHIQ_SHARED_STATE_T *remote; + VCHIQ_STATE_T *state; + int slot_index; + + if (!service) + return; + + state = service->state; + remote = state->remote; + + slot_index = SLOT_INDEX_FROM_DATA(state, (void *)header); + + if ((slot_index >= remote->slot_first) && + (slot_index <= remote->slot_last)) { + int msgid = header->msgid; + if (msgid & VCHIQ_MSGID_CLAIMED) { + VCHIQ_SLOT_INFO_T *slot_info = + SLOT_INFO_FROM_INDEX(state, slot_index); + + release_slot(state, slot_info, header, service); + } + } else if (slot_index == remote->slot_sync) + release_message_sync(state, header); + + unlock_service(service); +} + +static void +release_message_sync(VCHIQ_STATE_T *state, VCHIQ_HEADER_T *header) +{ + header->msgid = VCHIQ_MSGID_PADDING; + wmb(); + remote_event_signal(&state->remote->sync_release); +} + +VCHIQ_STATUS_T +vchiq_get_peer_version(VCHIQ_SERVICE_HANDLE_T handle, short *peer_version) +{ + VCHIQ_STATUS_T status = VCHIQ_ERROR; + VCHIQ_SERVICE_T *service = find_service_by_handle(handle); + + if (!service || + (vchiq_check_service(service) != VCHIQ_SUCCESS) || + !peer_version) + goto exit; + *peer_version = service->peer_version; + status = VCHIQ_SUCCESS; + +exit: + if (service) + unlock_service(service); + return status; +} + +VCHIQ_STATUS_T +vchiq_get_config(VCHIQ_INSTANCE_T instance, + int config_size, VCHIQ_CONFIG_T *pconfig) +{ + VCHIQ_CONFIG_T config; + + (void)instance; + + config.max_msg_size = VCHIQ_MAX_MSG_SIZE; + config.bulk_threshold = VCHIQ_MAX_MSG_SIZE; + config.max_outstanding_bulks = VCHIQ_NUM_SERVICE_BULKS; + config.max_services = VCHIQ_MAX_SERVICES; + config.version = VCHIQ_VERSION; + config.version_min = VCHIQ_VERSION_MIN; + + if (config_size > sizeof(VCHIQ_CONFIG_T)) + return VCHIQ_ERROR; + + memcpy(pconfig, &config, + min(config_size, (int)(sizeof(VCHIQ_CONFIG_T)))); + + return VCHIQ_SUCCESS; +} + +VCHIQ_STATUS_T +vchiq_set_service_option(VCHIQ_SERVICE_HANDLE_T handle, + VCHIQ_SERVICE_OPTION_T option, int value) +{ + VCHIQ_SERVICE_T *service = find_service_by_handle(handle); + VCHIQ_STATUS_T status = VCHIQ_ERROR; + + if (service) { + switch (option) { + case VCHIQ_SERVICE_OPTION_AUTOCLOSE: + service->auto_close = value; + status = VCHIQ_SUCCESS; + break; + + case VCHIQ_SERVICE_OPTION_SLOT_QUOTA: { + VCHIQ_SERVICE_QUOTA_T *service_quota = + &service->state->service_quotas[ + service->localport]; + if (value == 0) + value = service->state->default_slot_quota; + if ((value >= service_quota->slot_use_count) && + (value < (unsigned short)~0)) { + service_quota->slot_quota = value; + if ((value >= service_quota->slot_use_count) && + (service_quota->message_quota >= + service_quota->message_use_count)) { + /* Signal the service that it may have + ** dropped below its quota */ + up(&service_quota->quota_event); + } + status = VCHIQ_SUCCESS; + } + } break; + + case VCHIQ_SERVICE_OPTION_MESSAGE_QUOTA: { + VCHIQ_SERVICE_QUOTA_T *service_quota = + &service->state->service_quotas[ + service->localport]; + if (value == 0) + value = service->state->default_message_quota; + if ((value >= service_quota->message_use_count) && + (value < (unsigned short)~0)) { + service_quota->message_quota = value; + if ((value >= + service_quota->message_use_count) && + (service_quota->slot_quota >= + service_quota->slot_use_count)) + /* Signal the service that it may have + ** dropped below its quota */ + up(&service_quota->quota_event); + status = VCHIQ_SUCCESS; + } + } break; + + case VCHIQ_SERVICE_OPTION_SYNCHRONOUS: + if ((service->srvstate == VCHIQ_SRVSTATE_HIDDEN) || + (service->srvstate == + VCHIQ_SRVSTATE_LISTENING)) { + service->sync = value; + status = VCHIQ_SUCCESS; + } + break; + + default: + break; + } + unlock_service(service); + } + + return status; +} + +static void +vchiq_dump_shared_state(void *dump_context, VCHIQ_STATE_T *state, + VCHIQ_SHARED_STATE_T *shared, const char *label) +{ + static const char *const debug_names[] = { + "", + "SLOT_HANDLER_COUNT", + "SLOT_HANDLER_LINE", + "PARSE_LINE", + "PARSE_HEADER", + "PARSE_MSGID", + "AWAIT_COMPLETION_LINE", + "DEQUEUE_MESSAGE_LINE", + "SERVICE_CALLBACK_LINE", + "MSG_QUEUE_FULL_COUNT", + "COMPLETION_QUEUE_FULL_COUNT" + }; + int i; + + char buf[80]; + int len; + len = snprintf(buf, sizeof(buf), + " %s: slots %d-%d tx_pos=%x recycle=%x", + label, shared->slot_first, shared->slot_last, + shared->tx_pos, shared->slot_queue_recycle); + vchiq_dump(dump_context, buf, len + 1); + + len = snprintf(buf, sizeof(buf), + " Slots claimed:"); + vchiq_dump(dump_context, buf, len + 1); + + for (i = shared->slot_first; i <= shared->slot_last; i++) { + VCHIQ_SLOT_INFO_T slot_info = *SLOT_INFO_FROM_INDEX(state, i); + if (slot_info.use_count != slot_info.release_count) { + len = snprintf(buf, sizeof(buf), + " %d: %d/%d", i, slot_info.use_count, + slot_info.release_count); + vchiq_dump(dump_context, buf, len + 1); + } + } + + for (i = 1; i < shared->debug[DEBUG_ENTRIES]; i++) { + len = snprintf(buf, sizeof(buf), " DEBUG: %s = %d(%x)", + debug_names[i], shared->debug[i], shared->debug[i]); + vchiq_dump(dump_context, buf, len + 1); + } +} + +void +vchiq_dump_state(void *dump_context, VCHIQ_STATE_T *state) +{ + char buf[80]; + int len; + int i; + + len = snprintf(buf, sizeof(buf), "State %d: %s", state->id, + conn_state_names[state->conn_state]); + vchiq_dump(dump_context, buf, len + 1); + + len = snprintf(buf, sizeof(buf), + " tx_pos=%x(@%x), rx_pos=%x(@%x)", + state->local->tx_pos, + (uint32_t)state->tx_data + + (state->local_tx_pos & VCHIQ_SLOT_MASK), + state->rx_pos, + (uint32_t)state->rx_data + + (state->rx_pos & VCHIQ_SLOT_MASK)); + vchiq_dump(dump_context, buf, len + 1); + + len = snprintf(buf, sizeof(buf), + " Version: %d (min %d)", + VCHIQ_VERSION, VCHIQ_VERSION_MIN); + vchiq_dump(dump_context, buf, len + 1); + + if (VCHIQ_ENABLE_STATS) { + len = snprintf(buf, sizeof(buf), + " Stats: ctrl_tx_count=%d, ctrl_rx_count=%d, " + "error_count=%d", + state->stats.ctrl_tx_count, state->stats.ctrl_rx_count, + state->stats.error_count); + vchiq_dump(dump_context, buf, len + 1); + } + + len = snprintf(buf, sizeof(buf), + " Slots: %d available (%d data), %d recyclable, %d stalls " + "(%d data)", + ((state->slot_queue_available * VCHIQ_SLOT_SIZE) - + state->local_tx_pos) / VCHIQ_SLOT_SIZE, + state->data_quota - state->data_use_count, + state->local->slot_queue_recycle - state->slot_queue_available, + state->stats.slot_stalls, state->stats.data_stalls); + vchiq_dump(dump_context, buf, len + 1); + + vchiq_dump_platform_state(dump_context); + + vchiq_dump_shared_state(dump_context, state, state->local, "Local"); + vchiq_dump_shared_state(dump_context, state, state->remote, "Remote"); + + vchiq_dump_platform_instances(dump_context); + + for (i = 0; i < state->unused_service; i++) { + VCHIQ_SERVICE_T *service = find_service_by_port(state, i); + + if (service) { + vchiq_dump_service_state(dump_context, service); + unlock_service(service); + } + } +} + +void +vchiq_dump_service_state(void *dump_context, VCHIQ_SERVICE_T *service) +{ + char buf[120]; + int len; + + len = snprintf(buf, sizeof(buf), "Service %d: %s (ref %u)", + service->localport, srvstate_names[service->srvstate], + service->ref_count - 1); /*Don't include the lock just taken*/ + + if (service->srvstate != VCHIQ_SRVSTATE_FREE) { + char remoteport[30]; + VCHIQ_SERVICE_QUOTA_T *service_quota = + &service->state->service_quotas[service->localport]; + int fourcc = service->base.fourcc; + int tx_pending, rx_pending; + if (service->remoteport != VCHIQ_PORT_FREE) { + int len2 = snprintf(remoteport, sizeof(remoteport), + "%d", service->remoteport); + if (service->public_fourcc != VCHIQ_FOURCC_INVALID) + snprintf(remoteport + len2, + sizeof(remoteport) - len2, + " (client %8x)", service->client_id); + } else + strcpy(remoteport, "n/a"); + + len += snprintf(buf + len, sizeof(buf) - len, + " '%c%c%c%c' remote %s (msg use %d/%d, slot use %d/%d)", + VCHIQ_FOURCC_AS_4CHARS(fourcc), + remoteport, + service_quota->message_use_count, + service_quota->message_quota, + service_quota->slot_use_count, + service_quota->slot_quota); + + vchiq_dump(dump_context, buf, len + 1); + + tx_pending = service->bulk_tx.local_insert - + service->bulk_tx.remote_insert; + + rx_pending = service->bulk_rx.local_insert - + service->bulk_rx.remote_insert; + + len = snprintf(buf, sizeof(buf), + " Bulk: tx_pending=%d (size %d)," + " rx_pending=%d (size %d)", + tx_pending, + tx_pending ? service->bulk_tx.bulks[ + BULK_INDEX(service->bulk_tx.remove)].size : 0, + rx_pending, + rx_pending ? service->bulk_rx.bulks[ + BULK_INDEX(service->bulk_rx.remove)].size : 0); + + if (VCHIQ_ENABLE_STATS) { + vchiq_dump(dump_context, buf, len + 1); + + len = snprintf(buf, sizeof(buf), + " Ctrl: tx_count=%d, tx_bytes=%llu, " + "rx_count=%d, rx_bytes=%llu", + service->stats.ctrl_tx_count, + service->stats.ctrl_tx_bytes, + service->stats.ctrl_rx_count, + service->stats.ctrl_rx_bytes); + vchiq_dump(dump_context, buf, len + 1); + + len = snprintf(buf, sizeof(buf), + " Bulk: tx_count=%d, tx_bytes=%llu, " + "rx_count=%d, rx_bytes=%llu", + service->stats.bulk_tx_count, + service->stats.bulk_tx_bytes, + service->stats.bulk_rx_count, + service->stats.bulk_rx_bytes); + vchiq_dump(dump_context, buf, len + 1); + + len = snprintf(buf, sizeof(buf), + " %d quota stalls, %d slot stalls, " + "%d bulk stalls, %d aborted, %d errors", + service->stats.quota_stalls, + service->stats.slot_stalls, + service->stats.bulk_stalls, + service->stats.bulk_aborted_count, + service->stats.error_count); + } + } + + vchiq_dump(dump_context, buf, len + 1); + + if (service->srvstate != VCHIQ_SRVSTATE_FREE) + vchiq_dump_platform_service_state(dump_context, service); +} + + +void +vchiq_loud_error_header(void) +{ + vchiq_log_error(vchiq_core_log_level, + "============================================================" + "================"); + vchiq_log_error(vchiq_core_log_level, + "============================================================" + "================"); + vchiq_log_error(vchiq_core_log_level, "====="); +} + +void +vchiq_loud_error_footer(void) +{ + vchiq_log_error(vchiq_core_log_level, "====="); + vchiq_log_error(vchiq_core_log_level, + "============================================================" + "================"); + vchiq_log_error(vchiq_core_log_level, + "============================================================" + "================"); +} + + +VCHIQ_STATUS_T vchiq_send_remote_use(VCHIQ_STATE_T *state) +{ + VCHIQ_STATUS_T status = VCHIQ_RETRY; + if (state->conn_state != VCHIQ_CONNSTATE_DISCONNECTED) + status = queue_message(state, NULL, + VCHIQ_MAKE_MSG(VCHIQ_MSG_REMOTE_USE, 0, 0), + NULL, 0, 0, 0); + return status; +} + +VCHIQ_STATUS_T vchiq_send_remote_release(VCHIQ_STATE_T *state) +{ + VCHIQ_STATUS_T status = VCHIQ_RETRY; + if (state->conn_state != VCHIQ_CONNSTATE_DISCONNECTED) + status = queue_message(state, NULL, + VCHIQ_MAKE_MSG(VCHIQ_MSG_REMOTE_RELEASE, 0, 0), + NULL, 0, 0, 0); + return status; +} + +VCHIQ_STATUS_T vchiq_send_remote_use_active(VCHIQ_STATE_T *state) +{ + VCHIQ_STATUS_T status = VCHIQ_RETRY; + if (state->conn_state != VCHIQ_CONNSTATE_DISCONNECTED) + status = queue_message(state, NULL, + VCHIQ_MAKE_MSG(VCHIQ_MSG_REMOTE_USE_ACTIVE, 0, 0), + NULL, 0, 0, 0); + return status; +} + +void vchiq_log_dump_mem(const char *label, uint32_t addr, const void *voidMem, + size_t numBytes) +{ + const uint8_t *mem = (const uint8_t *)voidMem; + size_t offset; + char lineBuf[100]; + char *s; + + while (numBytes > 0) { + s = lineBuf; + + for (offset = 0; offset < 16; offset++) { + if (offset < numBytes) + s += snprintf(s, 4, "%02x ", mem[offset]); + else + s += snprintf(s, 4, " "); + } + + for (offset = 0; offset < 16; offset++) { + if (offset < numBytes) { + uint8_t ch = mem[offset]; + + if ((ch < ' ') || (ch > '~')) + ch = '.'; + *s++ = (char)ch; + } + } + *s++ = '\0'; + + if ((label != NULL) && (*label != '\0')) + vchiq_log_trace(VCHIQ_LOG_TRACE, + "%s: %08x: %s", label, addr, lineBuf); + else + vchiq_log_trace(VCHIQ_LOG_TRACE, + "%08x: %s", addr, lineBuf); + + addr += 16; + mem += 16; + if (numBytes > 16) + numBytes -= 16; + else + numBytes = 0; + } +} Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_core.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_core.h =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_core.h (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_core.h (revision 278312) @@ -0,0 +1,710 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef VCHIQ_CORE_H +#define VCHIQ_CORE_H + +#include +#include + +#include "vchiq_cfg.h" + +#include "vchiq.h" + +/* Run time control of log level, based on KERN_XXX level. */ +#ifndef VCHIQ_LOG_DEFAULT +#define VCHIQ_LOG_DEFAULT 4 +#endif +#define VCHIQ_LOG_ERROR 3 +#define VCHIQ_LOG_WARNING 4 +#define VCHIQ_LOG_INFO 6 +#define VCHIQ_LOG_TRACE 7 + +#define VCHIQ_LOG_PREFIX "vchiq: " + +#ifndef vchiq_log_error +#define vchiq_log_error(cat, fmt, ...) \ + do { if (cat >= VCHIQ_LOG_ERROR) \ + printf(VCHIQ_LOG_PREFIX fmt "\n", ##__VA_ARGS__); } while (0) +#endif +#ifndef vchiq_log_warning +#define vchiq_log_warning(cat, fmt, ...) \ + do { if (cat >= VCHIQ_LOG_WARNING) \ + printf(VCHIQ_LOG_PREFIX fmt "\n", ##__VA_ARGS__); } while (0) +#endif +#ifndef vchiq_log_info +#define vchiq_log_info(cat, fmt, ...) \ + do { if (cat >= VCHIQ_LOG_INFO) \ + printf(VCHIQ_LOG_PREFIX fmt "\n", ##__VA_ARGS__); } while (0) +#endif +#ifndef vchiq_log_trace +#define vchiq_log_trace(cat, fmt, ...) \ + do { if (cat >= VCHIQ_LOG_TRACE) \ + printf(VCHIQ_LOG_PREFIX fmt "\n", ##__VA_ARGS__); } while (0) +#endif + +#define vchiq_loud_error(...) \ + vchiq_log_error(vchiq_core_log_level, "===== " __VA_ARGS__) + +#ifndef vchiq_static_assert +#define vchiq_static_assert(cond) __attribute__((unused)) \ + extern int vchiq_static_assert[(cond) ? 1 : -1] +#endif + +#define IS_POW2(x) (x && ((x & (x - 1)) == 0)) + +/* Ensure that the slot size and maximum number of slots are powers of 2 */ +vchiq_static_assert(IS_POW2(VCHIQ_SLOT_SIZE)); +vchiq_static_assert(IS_POW2(VCHIQ_MAX_SLOTS)); +vchiq_static_assert(IS_POW2(VCHIQ_MAX_SLOTS_PER_SIDE)); + +#define VCHIQ_SLOT_MASK (VCHIQ_SLOT_SIZE - 1) +#define VCHIQ_SLOT_QUEUE_MASK (VCHIQ_MAX_SLOTS_PER_SIDE - 1) +#define VCHIQ_SLOT_ZERO_SLOTS ((sizeof(VCHIQ_SLOT_ZERO_T) + \ + VCHIQ_SLOT_SIZE - 1) / VCHIQ_SLOT_SIZE) + +#define VCHIQ_MSG_PADDING 0 /* - */ +#define VCHIQ_MSG_CONNECT 1 /* - */ +#define VCHIQ_MSG_OPEN 2 /* + (srcport, -), fourcc, client_id */ +#define VCHIQ_MSG_OPENACK 3 /* + (srcport, dstport) */ +#define VCHIQ_MSG_CLOSE 4 /* + (srcport, dstport) */ +#define VCHIQ_MSG_DATA 5 /* + (srcport, dstport) */ +#define VCHIQ_MSG_BULK_RX 6 /* + (srcport, dstport), data, size */ +#define VCHIQ_MSG_BULK_TX 7 /* + (srcport, dstport), data, size */ +#define VCHIQ_MSG_BULK_RX_DONE 8 /* + (srcport, dstport), actual */ +#define VCHIQ_MSG_BULK_TX_DONE 9 /* + (srcport, dstport), actual */ +#define VCHIQ_MSG_PAUSE 10 /* - */ +#define VCHIQ_MSG_RESUME 11 /* - */ +#define VCHIQ_MSG_REMOTE_USE 12 /* - */ +#define VCHIQ_MSG_REMOTE_RELEASE 13 /* - */ +#define VCHIQ_MSG_REMOTE_USE_ACTIVE 14 /* - */ + +#define VCHIQ_PORT_MAX (VCHIQ_MAX_SERVICES - 1) +#define VCHIQ_PORT_FREE 0x1000 +#define VCHIQ_PORT_IS_VALID(port) (port < VCHIQ_PORT_FREE) +#define VCHIQ_MAKE_MSG(type, srcport, dstport) \ + ((type<<24) | (srcport<<12) | (dstport<<0)) +#define VCHIQ_MSG_TYPE(msgid) ((unsigned int)msgid >> 24) +#define VCHIQ_MSG_SRCPORT(msgid) \ + (unsigned short)(((unsigned int)msgid >> 12) & 0xfff) +#define VCHIQ_MSG_DSTPORT(msgid) \ + ((unsigned short)msgid & 0xfff) + +#define VCHIQ_FOURCC_AS_4CHARS(fourcc) \ + ((fourcc) >> 24) & 0xff, \ + ((fourcc) >> 16) & 0xff, \ + ((fourcc) >> 8) & 0xff, \ + (fourcc) & 0xff + +/* Ensure the fields are wide enough */ +vchiq_static_assert(VCHIQ_MSG_SRCPORT(VCHIQ_MAKE_MSG(0, 0, VCHIQ_PORT_MAX)) + == 0); +vchiq_static_assert(VCHIQ_MSG_TYPE(VCHIQ_MAKE_MSG(0, VCHIQ_PORT_MAX, 0)) == 0); +vchiq_static_assert((unsigned int)VCHIQ_PORT_MAX < + (unsigned int)VCHIQ_PORT_FREE); + +#define VCHIQ_MSGID_PADDING VCHIQ_MAKE_MSG(VCHIQ_MSG_PADDING, 0, 0) +#define VCHIQ_MSGID_CLAIMED 0x40000000 + +#define VCHIQ_FOURCC_INVALID 0x00000000 +#define VCHIQ_FOURCC_IS_LEGAL(fourcc) (fourcc != VCHIQ_FOURCC_INVALID) + +#define VCHIQ_BULK_ACTUAL_ABORTED -1 + +typedef uint32_t BITSET_T; + +vchiq_static_assert((sizeof(BITSET_T) * 8) == 32); + +#define BITSET_SIZE(b) ((b + 31) >> 5) +#define BITSET_WORD(b) (b >> 5) +#define BITSET_BIT(b) (1 << (b & 31)) +#define BITSET_ZERO(bs) memset(bs, 0, sizeof(bs)) +#define BITSET_IS_SET(bs, b) (bs[BITSET_WORD(b)] & BITSET_BIT(b)) +#define BITSET_SET(bs, b) (bs[BITSET_WORD(b)] |= BITSET_BIT(b)) +#define BITSET_CLR(bs, b) (bs[BITSET_WORD(b)] &= ~BITSET_BIT(b)) + +#if VCHIQ_ENABLE_STATS +#define VCHIQ_STATS_INC(state, stat) (state->stats. stat++) +#define VCHIQ_SERVICE_STATS_INC(service, stat) (service->stats. stat++) +#define VCHIQ_SERVICE_STATS_ADD(service, stat, addend) \ + (service->stats. stat += addend) +#else +#define VCHIQ_STATS_INC(state, stat) ((void)0) +#define VCHIQ_SERVICE_STATS_INC(service, stat) ((void)0) +#define VCHIQ_SERVICE_STATS_ADD(service, stat, addend) ((void)0) +#endif + +enum { + DEBUG_ENTRIES, +#if VCHIQ_ENABLE_DEBUG + DEBUG_SLOT_HANDLER_COUNT, + DEBUG_SLOT_HANDLER_LINE, + DEBUG_PARSE_LINE, + DEBUG_PARSE_HEADER, + DEBUG_PARSE_MSGID, + DEBUG_AWAIT_COMPLETION_LINE, + DEBUG_DEQUEUE_MESSAGE_LINE, + DEBUG_SERVICE_CALLBACK_LINE, + DEBUG_MSG_QUEUE_FULL_COUNT, + DEBUG_COMPLETION_QUEUE_FULL_COUNT, +#endif + DEBUG_MAX +}; + +#if VCHIQ_ENABLE_DEBUG + +#define DEBUG_INITIALISE(local) int *debug_ptr = (local)->debug; +#define DEBUG_TRACE(d) \ + do { debug_ptr[DEBUG_ ## d] = __LINE__; dsb(); } while (0) +#define DEBUG_VALUE(d, v) \ + do { debug_ptr[DEBUG_ ## d] = (v); dsb(); } while (0) +#define DEBUG_COUNT(d) \ + do { debug_ptr[DEBUG_ ## d]++; dsb(); } while (0) + +#else /* VCHIQ_ENABLE_DEBUG */ + +#define DEBUG_INITIALISE(local) +#define DEBUG_TRACE(d) +#define DEBUG_VALUE(d, v) +#define DEBUG_COUNT(d) + +#endif /* VCHIQ_ENABLE_DEBUG */ + +typedef enum { + VCHIQ_CONNSTATE_DISCONNECTED, + VCHIQ_CONNSTATE_CONNECTING, + VCHIQ_CONNSTATE_CONNECTED, + VCHIQ_CONNSTATE_PAUSING, + VCHIQ_CONNSTATE_PAUSE_SENT, + VCHIQ_CONNSTATE_PAUSED, + VCHIQ_CONNSTATE_RESUMING, + VCHIQ_CONNSTATE_PAUSE_TIMEOUT, + VCHIQ_CONNSTATE_RESUME_TIMEOUT +} VCHIQ_CONNSTATE_T; + +enum { + VCHIQ_SRVSTATE_FREE, + VCHIQ_SRVSTATE_HIDDEN, + VCHIQ_SRVSTATE_LISTENING, + VCHIQ_SRVSTATE_OPENING, + VCHIQ_SRVSTATE_OPEN, + VCHIQ_SRVSTATE_OPENSYNC, + VCHIQ_SRVSTATE_CLOSESENT, + VCHIQ_SRVSTATE_CLOSERECVD, + VCHIQ_SRVSTATE_CLOSEWAIT, + VCHIQ_SRVSTATE_CLOSED +}; + +enum { + VCHIQ_POLL_TERMINATE, + VCHIQ_POLL_REMOVE, + VCHIQ_POLL_TXNOTIFY, + VCHIQ_POLL_RXNOTIFY, + VCHIQ_POLL_COUNT +}; + +typedef enum { + VCHIQ_BULK_TRANSMIT, + VCHIQ_BULK_RECEIVE +} VCHIQ_BULK_DIR_T; + +typedef void (*VCHIQ_USERDATA_TERM_T)(void *userdata); + +typedef struct vchiq_bulk_struct { + short mode; + short dir; + void *userdata; + VCHI_MEM_HANDLE_T handle; + void *data; + int size; + void *remote_data; + int remote_size; + int actual; +} VCHIQ_BULK_T; + +typedef struct vchiq_bulk_queue_struct { + int local_insert; /* Where to insert the next local bulk */ + int remote_insert; /* Where to insert the next remote bulk (master) */ + int process; /* Bulk to transfer next */ + int remote_notify; /* Bulk to notify the remote client of next (mstr) */ + int remove; /* Bulk to notify the local client of, and remove, + ** next */ + VCHIQ_BULK_T bulks[VCHIQ_NUM_SERVICE_BULKS]; +} VCHIQ_BULK_QUEUE_T; + +typedef struct remote_event_struct { + int armed; + int fired; + struct semaphore *event; +} REMOTE_EVENT_T; + +typedef struct opaque_platform_state_t *VCHIQ_PLATFORM_STATE_T; + +typedef struct vchiq_state_struct VCHIQ_STATE_T; + +typedef struct vchiq_slot_struct { + char data[VCHIQ_SLOT_SIZE]; +} VCHIQ_SLOT_T; + +typedef struct vchiq_slot_info_struct { + /* Use two counters rather than one to avoid the need for a mutex. */ + short use_count; + short release_count; +} __packed VCHIQ_SLOT_INFO_T; /* XXXGONZO: check it */ + +typedef struct vchiq_service_struct { + VCHIQ_SERVICE_BASE_T base; + VCHIQ_SERVICE_HANDLE_T handle; + unsigned int ref_count; + int srvstate; + VCHIQ_USERDATA_TERM_T userdata_term; + unsigned int localport; + unsigned int remoteport; + int public_fourcc; + int client_id; + char auto_close; + char sync; + char closing; + atomic_t poll_flags; + short version; + short version_min; + short peer_version; + + VCHIQ_STATE_T *state; + VCHIQ_INSTANCE_T instance; + + int service_use_count; + + VCHIQ_BULK_QUEUE_T bulk_tx; + VCHIQ_BULK_QUEUE_T bulk_rx; + + struct semaphore remove_event; + struct semaphore bulk_remove_event; + struct mutex bulk_mutex; + + struct service_stats_struct { + int quota_stalls; + int slot_stalls; + int bulk_stalls; + int error_count; + int ctrl_tx_count; + int ctrl_rx_count; + int bulk_tx_count; + int bulk_rx_count; + int bulk_aborted_count; + uint64_t ctrl_tx_bytes; + uint64_t ctrl_rx_bytes; + uint64_t bulk_tx_bytes; + uint64_t bulk_rx_bytes; + } stats; +} VCHIQ_SERVICE_T; + +/* The quota information is outside VCHIQ_SERVICE_T so that it can be + statically allocated, since for accounting reasons a service's slot + usage is carried over between users of the same port number. + */ +typedef struct vchiq_service_quota_struct { + unsigned short slot_quota; + unsigned short slot_use_count; + unsigned short message_quota; + unsigned short message_use_count; + struct semaphore quota_event; + int previous_tx_index; +} VCHIQ_SERVICE_QUOTA_T; + +typedef struct vchiq_shared_state_struct { + + /* A non-zero value here indicates that the content is valid. */ + int initialised; + + /* The first and last (inclusive) slots allocated to the owner. */ + int slot_first; + int slot_last; + + /* The slot allocated to synchronous messages from the owner. */ + int slot_sync; + + /* Signalling this event indicates that owner's slot handler thread + ** should run. */ + REMOTE_EVENT_T trigger; + + /* Indicates the byte position within the stream where the next message + ** will be written. The least significant bits are an index into the + ** slot. The next bits are the index of the slot in slot_queue. */ + int tx_pos; + + /* This event should be signalled when a slot is recycled. */ + REMOTE_EVENT_T recycle; + + /* The slot_queue index where the next recycled slot will be written. */ + int slot_queue_recycle; + + /* This event should be signalled when a synchronous message is sent. */ + REMOTE_EVENT_T sync_trigger; + + /* This event should be signalled when a synchronous message has been + ** released. */ + REMOTE_EVENT_T sync_release; + + /* A circular buffer of slot indexes. */ + int slot_queue[VCHIQ_MAX_SLOTS_PER_SIDE]; + + /* Debugging state */ + int debug[DEBUG_MAX]; +} __packed VCHIQ_SHARED_STATE_T; + +typedef struct vchiq_slot_zero_struct { + int magic; + short version; + short version_min; + int slot_zero_size; + int slot_size; + int max_slots; + int max_slots_per_side; + int platform_data[2]; + VCHIQ_SHARED_STATE_T master; + VCHIQ_SHARED_STATE_T slave; + VCHIQ_SLOT_INFO_T slots[VCHIQ_MAX_SLOTS]; +} __packed VCHIQ_SLOT_ZERO_T; + +struct vchiq_state_struct { + int id; + int initialised; + VCHIQ_CONNSTATE_T conn_state; + int is_master; + + VCHIQ_SHARED_STATE_T *local; + VCHIQ_SHARED_STATE_T *remote; + VCHIQ_SLOT_T *slot_data; + + unsigned short default_slot_quota; + unsigned short default_message_quota; + + /* Event indicating connect message received */ + struct semaphore connect; + + /* Mutex protecting services */ + struct mutex mutex; + VCHIQ_INSTANCE_T *instance; + + /* Processes incoming messages */ + VCHIQ_THREAD_T slot_handler_thread; + + /* Processes recycled slots */ + VCHIQ_THREAD_T recycle_thread; + + /* Processes synchronous messages */ + VCHIQ_THREAD_T sync_thread; + + /* Local implementation of the trigger remote event */ + struct semaphore trigger_event; + + /* Local implementation of the recycle remote event */ + struct semaphore recycle_event; + + /* Local implementation of the sync trigger remote event */ + struct semaphore sync_trigger_event; + + /* Local implementation of the sync release remote event */ + struct semaphore sync_release_event; + + char *tx_data; + char *rx_data; + VCHIQ_SLOT_INFO_T *rx_info; + + struct mutex slot_mutex; + + struct mutex recycle_mutex; + + struct mutex sync_mutex; + + struct mutex bulk_transfer_mutex; + + /* Indicates the byte position within the stream from where the next + ** message will be read. The least significant bits are an index into + ** the slot.The next bits are the index of the slot in + ** remote->slot_queue. */ + int rx_pos; + + /* A cached copy of local->tx_pos. Only write to local->tx_pos, and read + from remote->tx_pos. */ + int local_tx_pos; + + /* The slot_queue index of the slot to become available next. */ + int slot_queue_available; + + /* A flag to indicate if any poll has been requested */ + int poll_needed; + + /* Ths index of the previous slot used for data messages. */ + int previous_data_index; + + /* The number of slots occupied by data messages. */ + unsigned short data_use_count; + + /* The maximum number of slots to be occupied by data messages. */ + unsigned short data_quota; + + /* An array of bit sets indicating which services must be polled. */ + atomic_t poll_services[BITSET_SIZE(VCHIQ_MAX_SERVICES)]; + + /* The number of the first unused service */ + int unused_service; + + /* Signalled when a free slot becomes available. */ + struct semaphore slot_available_event; + + struct semaphore slot_remove_event; + + /* Signalled when a free data slot becomes available. */ + struct semaphore data_quota_event; + + /* Incremented when there are bulk transfers which cannot be processed + * whilst paused and must be processed on resume */ + int deferred_bulks; + + struct state_stats_struct { + int slot_stalls; + int data_stalls; + int ctrl_tx_count; + int ctrl_rx_count; + int error_count; + } stats; + + VCHIQ_SERVICE_T * services[VCHIQ_MAX_SERVICES]; + VCHIQ_SERVICE_QUOTA_T service_quotas[VCHIQ_MAX_SERVICES]; + VCHIQ_SLOT_INFO_T slot_info[VCHIQ_MAX_SLOTS]; + + VCHIQ_PLATFORM_STATE_T platform_state; +}; + +struct bulk_waiter { + VCHIQ_BULK_T *bulk; + struct semaphore event; + int actual; +}; + +extern spinlock_t bulk_waiter_spinlock; + +extern int vchiq_core_log_level; +extern int vchiq_core_msg_log_level; +extern int vchiq_sync_log_level; + +extern VCHIQ_STATE_T *vchiq_states[VCHIQ_MAX_STATES]; + +extern const char * +get_conn_state_name(VCHIQ_CONNSTATE_T conn_state); + +extern VCHIQ_SLOT_ZERO_T * +vchiq_init_slots(void *mem_base, int mem_size); + +extern VCHIQ_STATUS_T +vchiq_init_state(VCHIQ_STATE_T *state, VCHIQ_SLOT_ZERO_T *slot_zero, + int is_master); + +extern VCHIQ_STATUS_T +vchiq_connect_internal(VCHIQ_STATE_T *state, VCHIQ_INSTANCE_T instance); + +extern VCHIQ_SERVICE_T * +vchiq_add_service_internal(VCHIQ_STATE_T *state, + const VCHIQ_SERVICE_PARAMS_T *params, int srvstate, + VCHIQ_INSTANCE_T instance, VCHIQ_USERDATA_TERM_T userdata_term); + +extern VCHIQ_STATUS_T +vchiq_open_service_internal(VCHIQ_SERVICE_T *service, int client_id); + +extern VCHIQ_STATUS_T +vchiq_close_service_internal(VCHIQ_SERVICE_T *service, int close_recvd); + +extern void +vchiq_terminate_service_internal(VCHIQ_SERVICE_T *service); + +extern void +vchiq_free_service_internal(VCHIQ_SERVICE_T *service); + +extern VCHIQ_STATUS_T +vchiq_shutdown_internal(VCHIQ_STATE_T *state, VCHIQ_INSTANCE_T instance); + +extern VCHIQ_STATUS_T +vchiq_pause_internal(VCHIQ_STATE_T *state); + +extern VCHIQ_STATUS_T +vchiq_resume_internal(VCHIQ_STATE_T *state); + +extern void +remote_event_pollall(VCHIQ_STATE_T *state); + +extern VCHIQ_STATUS_T +vchiq_bulk_transfer(VCHIQ_SERVICE_HANDLE_T handle, + VCHI_MEM_HANDLE_T memhandle, void *offset, int size, void *userdata, + VCHIQ_BULK_MODE_T mode, VCHIQ_BULK_DIR_T dir); + +extern void +vchiq_dump_state(void *dump_context, VCHIQ_STATE_T *state); + +extern void +vchiq_dump_service_state(void *dump_context, VCHIQ_SERVICE_T *service); + +extern void +vchiq_loud_error_header(void); + +extern void +vchiq_loud_error_footer(void); + +extern void +request_poll(VCHIQ_STATE_T *state, VCHIQ_SERVICE_T *service, int poll_type); + +static inline VCHIQ_SERVICE_T * +handle_to_service(VCHIQ_SERVICE_HANDLE_T handle) +{ + VCHIQ_STATE_T *state = vchiq_states[(handle / VCHIQ_MAX_SERVICES) & + (VCHIQ_MAX_STATES - 1)]; + if (!state) + return NULL; + + return state->services[handle & (VCHIQ_MAX_SERVICES - 1)]; +} + +extern VCHIQ_SERVICE_T * +find_service_by_handle(VCHIQ_SERVICE_HANDLE_T handle); + +extern VCHIQ_SERVICE_T * +find_service_by_port(VCHIQ_STATE_T *state, int localport); + +extern VCHIQ_SERVICE_T * +find_service_for_instance(VCHIQ_INSTANCE_T instance, + VCHIQ_SERVICE_HANDLE_T handle); + +extern VCHIQ_SERVICE_T * +next_service_by_instance(VCHIQ_STATE_T *state, VCHIQ_INSTANCE_T instance, + int *pidx); + +extern void +lock_service(VCHIQ_SERVICE_T *service); + +extern void +unlock_service(VCHIQ_SERVICE_T *service); + +/* The following functions are called from vchiq_core, and external +** implementations must be provided. */ + +extern VCHIQ_STATUS_T +vchiq_prepare_bulk_data(VCHIQ_BULK_T *bulk, + VCHI_MEM_HANDLE_T memhandle, void *offset, int size, int dir); + +extern void +vchiq_transfer_bulk(VCHIQ_BULK_T *bulk); + +extern void +vchiq_complete_bulk(VCHIQ_BULK_T *bulk); + +extern VCHIQ_STATUS_T +vchiq_copy_from_user(void *dst, const void *src, int size); + +extern void +remote_event_signal(REMOTE_EVENT_T *event); + +void +vchiq_platform_check_suspend(VCHIQ_STATE_T *state); + +extern void +vchiq_platform_paused(VCHIQ_STATE_T *state); + +extern VCHIQ_STATUS_T +vchiq_platform_resume(VCHIQ_STATE_T *state); + +extern void +vchiq_platform_resumed(VCHIQ_STATE_T *state); + +extern void +vchiq_dump(void *dump_context, const char *str, int len); + +extern void +vchiq_dump_platform_state(void *dump_context); + +extern void +vchiq_dump_platform_instances(void *dump_context); + +extern void +vchiq_dump_platform_service_state(void *dump_context, + VCHIQ_SERVICE_T *service); + +extern VCHIQ_STATUS_T +vchiq_use_service_internal(VCHIQ_SERVICE_T *service); + +extern VCHIQ_STATUS_T +vchiq_release_service_internal(VCHIQ_SERVICE_T *service); + +extern void +vchiq_on_remote_use(VCHIQ_STATE_T *state); + +extern void +vchiq_on_remote_release(VCHIQ_STATE_T *state); + +extern VCHIQ_STATUS_T +vchiq_platform_init_state(VCHIQ_STATE_T *state); + +extern VCHIQ_STATUS_T +vchiq_check_service(VCHIQ_SERVICE_T *service); + +extern void +vchiq_on_remote_use_active(VCHIQ_STATE_T *state); + +extern VCHIQ_STATUS_T +vchiq_send_remote_use(VCHIQ_STATE_T *state); + +extern VCHIQ_STATUS_T +vchiq_send_remote_release(VCHIQ_STATE_T *state); + +extern VCHIQ_STATUS_T +vchiq_send_remote_use_active(VCHIQ_STATE_T *state); + +extern void +vchiq_platform_conn_state_changed(VCHIQ_STATE_T *state, + VCHIQ_CONNSTATE_T oldstate, VCHIQ_CONNSTATE_T newstate); + +extern void +vchiq_platform_handle_timeout(VCHIQ_STATE_T *state); + +extern void +vchiq_set_conn_state(VCHIQ_STATE_T *state, VCHIQ_CONNSTATE_T newstate); + + +extern void +vchiq_log_dump_mem(const char *label, uint32_t addr, const void *voidMem, + size_t numBytes); + +extern void +vchiq_core_initialize(void); + +#endif Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_core.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_if.h =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_if.h (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_if.h (revision 278312) @@ -0,0 +1,188 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef VCHIQ_IF_H +#define VCHIQ_IF_H + +#include "interface/vchi/vchi_mh.h" + +#define VCHIQ_SERVICE_HANDLE_INVALID 0 + +#define VCHIQ_SLOT_SIZE 4096 +#define VCHIQ_MAX_MSG_SIZE (VCHIQ_SLOT_SIZE - sizeof(VCHIQ_HEADER_T)) +#define VCHIQ_CHANNEL_SIZE VCHIQ_MAX_MSG_SIZE /* For backwards compatibility */ + +#define VCHIQ_MAKE_FOURCC(x0, x1, x2, x3) \ + (((x0) << 24) | ((x1) << 16) | ((x2) << 8) | (x3)) +#define VCHIQ_GET_SERVICE_USERDATA(service) vchiq_get_service_userdata(service) +#define VCHIQ_GET_SERVICE_FOURCC(service) vchiq_get_service_fourcc(service) + +typedef enum { + VCHIQ_SERVICE_OPENED, /* service, -, - */ + VCHIQ_SERVICE_CLOSED, /* service, -, - */ + VCHIQ_MESSAGE_AVAILABLE, /* service, header, - */ + VCHIQ_BULK_TRANSMIT_DONE, /* service, -, bulk_userdata */ + VCHIQ_BULK_RECEIVE_DONE, /* service, -, bulk_userdata */ + VCHIQ_BULK_TRANSMIT_ABORTED, /* service, -, bulk_userdata */ + VCHIQ_BULK_RECEIVE_ABORTED /* service, -, bulk_userdata */ +} VCHIQ_REASON_T; + +typedef enum { + VCHIQ_ERROR = -1, + VCHIQ_SUCCESS = 0, + VCHIQ_RETRY = 1 +} VCHIQ_STATUS_T; + +typedef enum { + VCHIQ_BULK_MODE_CALLBACK, + VCHIQ_BULK_MODE_BLOCKING, + VCHIQ_BULK_MODE_NOCALLBACK, + VCHIQ_BULK_MODE_WAITING /* Reserved for internal use */ +} VCHIQ_BULK_MODE_T; + +typedef enum { + VCHIQ_SERVICE_OPTION_AUTOCLOSE, + VCHIQ_SERVICE_OPTION_SLOT_QUOTA, + VCHIQ_SERVICE_OPTION_MESSAGE_QUOTA, + VCHIQ_SERVICE_OPTION_SYNCHRONOUS +} VCHIQ_SERVICE_OPTION_T; + +typedef struct vchiq_header_struct { + /* The message identifier - opaque to applications. */ + int msgid; + + /* Size of message data. */ + unsigned int size; + + char data[0]; /* message */ +} VCHIQ_HEADER_T; + +typedef struct { + const void *data; + unsigned int size; +} VCHIQ_ELEMENT_T; + +typedef unsigned int VCHIQ_SERVICE_HANDLE_T; + +typedef VCHIQ_STATUS_T (*VCHIQ_CALLBACK_T)(VCHIQ_REASON_T, VCHIQ_HEADER_T *, + VCHIQ_SERVICE_HANDLE_T, void *); + +typedef struct vchiq_service_base_struct { + int fourcc; + VCHIQ_CALLBACK_T callback; + void *userdata; +} VCHIQ_SERVICE_BASE_T; + +typedef struct vchiq_service_params_struct { + int fourcc; + VCHIQ_CALLBACK_T callback; + void *userdata; + short version; /* Increment for non-trivial changes */ + short version_min; /* Update for incompatible changes */ +} VCHIQ_SERVICE_PARAMS_T; + +typedef struct vchiq_config_struct { + unsigned int max_msg_size; + unsigned int bulk_threshold; /* The message size above which it + is better to use a bulk transfer + (<= max_msg_size) */ + unsigned int max_outstanding_bulks; + unsigned int max_services; + short version; /* The version of VCHIQ */ + short version_min; /* The minimum compatible version of VCHIQ */ +} VCHIQ_CONFIG_T; + +typedef struct vchiq_instance_struct *VCHIQ_INSTANCE_T; +typedef void (*VCHIQ_REMOTE_USE_CALLBACK_T)(void *cb_arg); + +extern VCHIQ_STATUS_T vchiq_initialise(VCHIQ_INSTANCE_T *pinstance); +extern VCHIQ_STATUS_T vchiq_shutdown(VCHIQ_INSTANCE_T instance); +extern VCHIQ_STATUS_T vchiq_connect(VCHIQ_INSTANCE_T instance); +extern VCHIQ_STATUS_T vchiq_add_service(VCHIQ_INSTANCE_T instance, + const VCHIQ_SERVICE_PARAMS_T *params, + VCHIQ_SERVICE_HANDLE_T *pservice); +extern VCHIQ_STATUS_T vchiq_open_service(VCHIQ_INSTANCE_T instance, + const VCHIQ_SERVICE_PARAMS_T *params, + VCHIQ_SERVICE_HANDLE_T *pservice); +extern VCHIQ_STATUS_T vchiq_close_service(VCHIQ_SERVICE_HANDLE_T service); +extern VCHIQ_STATUS_T vchiq_remove_service(VCHIQ_SERVICE_HANDLE_T service); +extern VCHIQ_STATUS_T vchiq_use_service(VCHIQ_SERVICE_HANDLE_T service); +extern VCHIQ_STATUS_T vchiq_use_service_no_resume( + VCHIQ_SERVICE_HANDLE_T service); +extern VCHIQ_STATUS_T vchiq_release_service(VCHIQ_SERVICE_HANDLE_T service); + +extern VCHIQ_STATUS_T vchiq_queue_message(VCHIQ_SERVICE_HANDLE_T service, + const VCHIQ_ELEMENT_T *elements, unsigned int count); +extern void vchiq_release_message(VCHIQ_SERVICE_HANDLE_T service, + VCHIQ_HEADER_T *header); +extern VCHIQ_STATUS_T vchiq_queue_bulk_transmit(VCHIQ_SERVICE_HANDLE_T service, + void *data, unsigned int size, void *userdata); +extern VCHIQ_STATUS_T vchiq_queue_bulk_receive(VCHIQ_SERVICE_HANDLE_T service, + void *data, unsigned int size, void *userdata); +extern VCHIQ_STATUS_T vchiq_queue_bulk_transmit_handle( + VCHIQ_SERVICE_HANDLE_T service, VCHI_MEM_HANDLE_T handle, + const void *offset, unsigned int size, void *userdata); +extern VCHIQ_STATUS_T vchiq_queue_bulk_receive_handle( + VCHIQ_SERVICE_HANDLE_T service, VCHI_MEM_HANDLE_T handle, + void *offset, unsigned int size, void *userdata); +extern VCHIQ_STATUS_T vchiq_bulk_transmit(VCHIQ_SERVICE_HANDLE_T service, + void *data, unsigned int size, void *userdata, + VCHIQ_BULK_MODE_T mode); +extern VCHIQ_STATUS_T vchiq_bulk_receive(VCHIQ_SERVICE_HANDLE_T service, + void *data, unsigned int size, void *userdata, + VCHIQ_BULK_MODE_T mode); +extern VCHIQ_STATUS_T vchiq_bulk_transmit_handle(VCHIQ_SERVICE_HANDLE_T service, + VCHI_MEM_HANDLE_T handle, const void *offset, unsigned int size, + void *userdata, VCHIQ_BULK_MODE_T mode); +extern VCHIQ_STATUS_T vchiq_bulk_receive_handle(VCHIQ_SERVICE_HANDLE_T service, + VCHI_MEM_HANDLE_T handle, void *offset, unsigned int size, + void *userdata, VCHIQ_BULK_MODE_T mode); +extern int vchiq_get_client_id(VCHIQ_SERVICE_HANDLE_T service); +extern void *vchiq_get_service_userdata(VCHIQ_SERVICE_HANDLE_T service); +extern int vchiq_get_service_fourcc(VCHIQ_SERVICE_HANDLE_T service); +extern VCHIQ_STATUS_T vchiq_get_config(VCHIQ_INSTANCE_T instance, + int config_size, VCHIQ_CONFIG_T *pconfig); +extern VCHIQ_STATUS_T vchiq_set_service_option(VCHIQ_SERVICE_HANDLE_T service, + VCHIQ_SERVICE_OPTION_T option, int value); + +extern VCHIQ_STATUS_T vchiq_remote_use(VCHIQ_INSTANCE_T instance, + VCHIQ_REMOTE_USE_CALLBACK_T callback, void *cb_arg); +extern VCHIQ_STATUS_T vchiq_remote_release(VCHIQ_INSTANCE_T instance); + +extern VCHIQ_STATUS_T vchiq_dump_phys_mem(VCHIQ_SERVICE_HANDLE_T service, + void *ptr, size_t num_bytes); + +extern VCHIQ_STATUS_T vchiq_get_peer_version(VCHIQ_SERVICE_HANDLE_T handle, + short *peer_version); + +#endif /* VCHIQ_IF_H */ Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_if.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_ioctl.h =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_ioctl.h (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_ioctl.h (revision 278312) @@ -0,0 +1,128 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef VCHIQ_IOCTLS_H +#define VCHIQ_IOCTLS_H + +#include "vchiq_if.h" + +#define VCHIQ_IOC_MAGIC 0xc4 +#define VCHIQ_INVALID_HANDLE (~0) + +typedef struct { + VCHIQ_SERVICE_PARAMS_T params; + int is_open; + int is_vchi; + unsigned int handle; /* OUT */ +} VCHIQ_CREATE_SERVICE_T; + +typedef struct { + unsigned int handle; + unsigned int count; + const VCHIQ_ELEMENT_T *elements; +} VCHIQ_QUEUE_MESSAGE_T; + +typedef struct { + unsigned int handle; + void *data; + unsigned int size; + void *userdata; + VCHIQ_BULK_MODE_T mode; +} VCHIQ_QUEUE_BULK_TRANSFER_T; + +typedef struct { + VCHIQ_REASON_T reason; + VCHIQ_HEADER_T *header; + void *service_userdata; + void *bulk_userdata; +} VCHIQ_COMPLETION_DATA_T; + +typedef struct { + unsigned int count; + VCHIQ_COMPLETION_DATA_T *buf; + unsigned int msgbufsize; + unsigned int msgbufcount; /* IN/OUT */ + void **msgbufs; +} VCHIQ_AWAIT_COMPLETION_T; + +typedef struct { + unsigned int handle; + int blocking; + unsigned int bufsize; + void *buf; +} VCHIQ_DEQUEUE_MESSAGE_T; + +typedef struct { + unsigned int config_size; + VCHIQ_CONFIG_T *pconfig; +} VCHIQ_GET_CONFIG_T; + +typedef struct { + unsigned int handle; + VCHIQ_SERVICE_OPTION_T option; + int value; +} VCHIQ_SET_SERVICE_OPTION_T; + +typedef struct { + void *virt_addr; + size_t num_bytes; +} VCHIQ_DUMP_MEM_T; + +#define VCHIQ_IOC_CONNECT _IO(VCHIQ_IOC_MAGIC, 0) +#define VCHIQ_IOC_SHUTDOWN _IO(VCHIQ_IOC_MAGIC, 1) +#define VCHIQ_IOC_CREATE_SERVICE \ + _IOWR(VCHIQ_IOC_MAGIC, 2, VCHIQ_CREATE_SERVICE_T) +#define VCHIQ_IOC_REMOVE_SERVICE _IO(VCHIQ_IOC_MAGIC, 3) +#define VCHIQ_IOC_QUEUE_MESSAGE \ + _IOW(VCHIQ_IOC_MAGIC, 4, VCHIQ_QUEUE_MESSAGE_T) +#define VCHIQ_IOC_QUEUE_BULK_TRANSMIT \ + _IOWR(VCHIQ_IOC_MAGIC, 5, VCHIQ_QUEUE_BULK_TRANSFER_T) +#define VCHIQ_IOC_QUEUE_BULK_RECEIVE \ + _IOWR(VCHIQ_IOC_MAGIC, 6, VCHIQ_QUEUE_BULK_TRANSFER_T) +#define VCHIQ_IOC_AWAIT_COMPLETION \ + _IOWR(VCHIQ_IOC_MAGIC, 7, VCHIQ_AWAIT_COMPLETION_T) +#define VCHIQ_IOC_DEQUEUE_MESSAGE \ + _IOWR(VCHIQ_IOC_MAGIC, 8, VCHIQ_DEQUEUE_MESSAGE_T) +#define VCHIQ_IOC_GET_CLIENT_ID _IO(VCHIQ_IOC_MAGIC, 9) +#define VCHIQ_IOC_GET_CONFIG \ + _IOWR(VCHIQ_IOC_MAGIC, 10, VCHIQ_GET_CONFIG_T) +#define VCHIQ_IOC_CLOSE_SERVICE _IO(VCHIQ_IOC_MAGIC, 11) +#define VCHIQ_IOC_USE_SERVICE _IO(VCHIQ_IOC_MAGIC, 12) +#define VCHIQ_IOC_RELEASE_SERVICE _IO(VCHIQ_IOC_MAGIC, 13) +#define VCHIQ_IOC_SET_SERVICE_OPTION \ + _IOW(VCHIQ_IOC_MAGIC, 14, VCHIQ_SET_SERVICE_OPTION_T) +#define VCHIQ_IOC_DUMP_PHYS_MEM \ + _IOW(VCHIQ_IOC_MAGIC, 15, VCHIQ_DUMP_MEM_T) +#define VCHIQ_IOC_MAX 15 + +#endif Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_ioctl.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_kern_lib.c =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_kern_lib.c (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_kern_lib.c (revision 278312) @@ -0,0 +1,461 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* ---- Include Files ---------------------------------------------------- */ + +#include "vchiq_core.h" +#include "vchiq_arm.h" + +/* ---- Public Variables ------------------------------------------------- */ + +/* ---- Private Constants and Types -------------------------------------- */ + +struct bulk_waiter_node { + struct bulk_waiter bulk_waiter; + int pid; + struct list_head list; +}; + +struct vchiq_instance_struct { + VCHIQ_STATE_T *state; + + int connected; + + struct list_head bulk_waiter_list; + struct mutex bulk_waiter_list_mutex; +}; + +static VCHIQ_STATUS_T +vchiq_blocking_bulk_transfer(VCHIQ_SERVICE_HANDLE_T handle, void *data, + unsigned int size, VCHIQ_BULK_DIR_T dir); + +/**************************************************************************** +* +* vchiq_initialise +* +***************************************************************************/ +#define VCHIQ_INIT_RETRIES 10 +VCHIQ_STATUS_T vchiq_initialise(VCHIQ_INSTANCE_T *instanceOut) +{ + VCHIQ_STATUS_T status = VCHIQ_ERROR; + VCHIQ_STATE_T *state; + VCHIQ_INSTANCE_T instance = NULL; + int i; + + vchiq_log_trace(vchiq_core_log_level, "%s called", __func__); + + /* VideoCore may not be ready due to boot up timing. + It may never be ready if kernel and firmware are mismatched, so don't block forever. */ + for (i=0; i0) { + vchiq_log_warning(vchiq_core_log_level, + "%s: videocore initialized after %d retries\n", __func__, i); + } + + instance = kzalloc(sizeof(*instance), GFP_KERNEL); + if (!instance) { + vchiq_log_error(vchiq_core_log_level, + "%s: error allocating vchiq instance\n", __func__); + goto failed; + } + + instance->connected = 0; + instance->state = state; + lmutex_init(&instance->bulk_waiter_list_mutex); + INIT_LIST_HEAD(&instance->bulk_waiter_list); + + *instanceOut = instance; + + status = VCHIQ_SUCCESS; + +failed: + vchiq_log_trace(vchiq_core_log_level, + "%s(%p): returning %d", __func__, instance, status); + + return status; +} +EXPORT_SYMBOL(vchiq_initialise); + +/**************************************************************************** +* +* vchiq_shutdown +* +***************************************************************************/ + +VCHIQ_STATUS_T vchiq_shutdown(VCHIQ_INSTANCE_T instance) +{ + VCHIQ_STATUS_T status; + VCHIQ_STATE_T *state = instance->state; + + vchiq_log_trace(vchiq_core_log_level, + "%s(%p) called", __func__, instance); + + if (lmutex_lock_interruptible(&state->mutex) != 0) + return VCHIQ_RETRY; + + /* Remove all services */ + status = vchiq_shutdown_internal(state, instance); + + lmutex_unlock(&state->mutex); + + vchiq_log_trace(vchiq_core_log_level, + "%s(%p): returning %d", __func__, instance, status); + + if (status == VCHIQ_SUCCESS) { + struct list_head *pos, *next; + list_for_each_safe(pos, next, + &instance->bulk_waiter_list) { + struct bulk_waiter_node *waiter; + waiter = list_entry(pos, + struct bulk_waiter_node, + list); + list_del(pos); + vchiq_log_info(vchiq_arm_log_level, + "bulk_waiter - cleaned up %x " + "for pid %d", + (unsigned int)waiter, waiter->pid); + _sema_destroy(&waiter->bulk_waiter.event); + + kfree(waiter); + } + + lmutex_destroy(&instance->bulk_waiter_list_mutex); + + kfree(instance); + } + + return status; +} +EXPORT_SYMBOL(vchiq_shutdown); + +/**************************************************************************** +* +* vchiq_is_connected +* +***************************************************************************/ + +static int vchiq_is_connected(VCHIQ_INSTANCE_T instance) +{ + return instance->connected; +} + +/**************************************************************************** +* +* vchiq_connect +* +***************************************************************************/ + +VCHIQ_STATUS_T vchiq_connect(VCHIQ_INSTANCE_T instance) +{ + VCHIQ_STATUS_T status; + VCHIQ_STATE_T *state = instance->state; + + vchiq_log_trace(vchiq_core_log_level, + "%s(%p) called", __func__, instance); + + if (lmutex_lock_interruptible(&state->mutex) != 0) { + vchiq_log_trace(vchiq_core_log_level, + "%s: call to lmutex_lock failed", __func__); + status = VCHIQ_RETRY; + goto failed; + } + status = vchiq_connect_internal(state, instance); + + if (status == VCHIQ_SUCCESS) + instance->connected = 1; + + lmutex_unlock(&state->mutex); + +failed: + vchiq_log_trace(vchiq_core_log_level, + "%s(%p): returning %d", __func__, instance, status); + + return status; +} +EXPORT_SYMBOL(vchiq_connect); + +/**************************************************************************** +* +* vchiq_add_service +* +***************************************************************************/ + +VCHIQ_STATUS_T vchiq_add_service( + VCHIQ_INSTANCE_T instance, + const VCHIQ_SERVICE_PARAMS_T *params, + VCHIQ_SERVICE_HANDLE_T *phandle) +{ + VCHIQ_STATUS_T status; + VCHIQ_STATE_T *state = instance->state; + VCHIQ_SERVICE_T *service = NULL; + int srvstate; + + vchiq_log_trace(vchiq_core_log_level, + "%s(%p) called", __func__, instance); + + *phandle = VCHIQ_SERVICE_HANDLE_INVALID; + + srvstate = vchiq_is_connected(instance) + ? VCHIQ_SRVSTATE_LISTENING + : VCHIQ_SRVSTATE_HIDDEN; + + service = vchiq_add_service_internal( + state, + params, + srvstate, + instance, + NULL); + + if (service) { + *phandle = service->handle; + status = VCHIQ_SUCCESS; + } else + status = VCHIQ_ERROR; + + vchiq_log_trace(vchiq_core_log_level, + "%s(%p): returning %d", __func__, instance, status); + + return status; +} +EXPORT_SYMBOL(vchiq_add_service); + +/**************************************************************************** +* +* vchiq_open_service +* +***************************************************************************/ + +VCHIQ_STATUS_T vchiq_open_service( + VCHIQ_INSTANCE_T instance, + const VCHIQ_SERVICE_PARAMS_T *params, + VCHIQ_SERVICE_HANDLE_T *phandle) +{ + VCHIQ_STATUS_T status = VCHIQ_ERROR; + VCHIQ_STATE_T *state = instance->state; + VCHIQ_SERVICE_T *service = NULL; + + vchiq_log_trace(vchiq_core_log_level, + "%s(%p) called", __func__, instance); + + *phandle = VCHIQ_SERVICE_HANDLE_INVALID; + + if (!vchiq_is_connected(instance)) + goto failed; + + service = vchiq_add_service_internal(state, + params, + VCHIQ_SRVSTATE_OPENING, + instance, + NULL); + + if (service) { + *phandle = service->handle; + status = vchiq_open_service_internal(service, + (uintptr_t)current); + if (status != VCHIQ_SUCCESS) { + vchiq_remove_service(service->handle); + *phandle = VCHIQ_SERVICE_HANDLE_INVALID; + } + } + +failed: + vchiq_log_trace(vchiq_core_log_level, + "%s(%p): returning %d", __func__, instance, status); + + return status; +} +EXPORT_SYMBOL(vchiq_open_service); + +VCHIQ_STATUS_T +vchiq_queue_bulk_transmit(VCHIQ_SERVICE_HANDLE_T handle, + void *data, unsigned int size, void *userdata) +{ + return vchiq_bulk_transfer(handle, + VCHI_MEM_HANDLE_INVALID, data, size, userdata, + VCHIQ_BULK_MODE_CALLBACK, VCHIQ_BULK_TRANSMIT); +} +EXPORT_SYMBOL(vchiq_queue_bulk_transmit); + +VCHIQ_STATUS_T +vchiq_queue_bulk_receive(VCHIQ_SERVICE_HANDLE_T handle, void *data, + unsigned int size, void *userdata) +{ + return vchiq_bulk_transfer(handle, + VCHI_MEM_HANDLE_INVALID, data, size, userdata, + VCHIQ_BULK_MODE_CALLBACK, VCHIQ_BULK_RECEIVE); +} +EXPORT_SYMBOL(vchiq_queue_bulk_receive); + +VCHIQ_STATUS_T +vchiq_bulk_transmit(VCHIQ_SERVICE_HANDLE_T handle, void *data, + unsigned int size, void *userdata, VCHIQ_BULK_MODE_T mode) +{ + VCHIQ_STATUS_T status; + + switch (mode) { + case VCHIQ_BULK_MODE_NOCALLBACK: + case VCHIQ_BULK_MODE_CALLBACK: + status = vchiq_bulk_transfer(handle, + VCHI_MEM_HANDLE_INVALID, data, size, userdata, + mode, VCHIQ_BULK_TRANSMIT); + break; + case VCHIQ_BULK_MODE_BLOCKING: + status = vchiq_blocking_bulk_transfer(handle, + data, size, VCHIQ_BULK_TRANSMIT); + break; + default: + return VCHIQ_ERROR; + } + + return status; +} +EXPORT_SYMBOL(vchiq_bulk_transmit); + +VCHIQ_STATUS_T +vchiq_bulk_receive(VCHIQ_SERVICE_HANDLE_T handle, void *data, + unsigned int size, void *userdata, VCHIQ_BULK_MODE_T mode) +{ + VCHIQ_STATUS_T status; + + switch (mode) { + case VCHIQ_BULK_MODE_NOCALLBACK: + case VCHIQ_BULK_MODE_CALLBACK: + status = vchiq_bulk_transfer(handle, + VCHI_MEM_HANDLE_INVALID, data, size, userdata, + mode, VCHIQ_BULK_RECEIVE); + break; + case VCHIQ_BULK_MODE_BLOCKING: + status = vchiq_blocking_bulk_transfer(handle, + data, size, VCHIQ_BULK_RECEIVE); + break; + default: + return VCHIQ_ERROR; + } + + return status; +} +EXPORT_SYMBOL(vchiq_bulk_receive); + +static VCHIQ_STATUS_T +vchiq_blocking_bulk_transfer(VCHIQ_SERVICE_HANDLE_T handle, void *data, + unsigned int size, VCHIQ_BULK_DIR_T dir) +{ + VCHIQ_INSTANCE_T instance; + VCHIQ_SERVICE_T *service; + VCHIQ_STATUS_T status; + struct bulk_waiter_node *waiter = NULL; + struct list_head *pos; + + service = find_service_by_handle(handle); + if (!service) + return VCHIQ_ERROR; + + instance = service->instance; + + unlock_service(service); + + lmutex_lock(&instance->bulk_waiter_list_mutex); + list_for_each(pos, &instance->bulk_waiter_list) { + if (list_entry(pos, struct bulk_waiter_node, + list)->pid == current->p_pid) { + waiter = list_entry(pos, + struct bulk_waiter_node, + list); + list_del(pos); + break; + } + } + lmutex_unlock(&instance->bulk_waiter_list_mutex); + + if (waiter) { + VCHIQ_BULK_T *bulk = waiter->bulk_waiter.bulk; + if (bulk) { + /* This thread has an outstanding bulk transfer. */ + if ((bulk->data != data) || + (bulk->size != size)) { + /* This is not a retry of the previous one. + ** Cancel the signal when the transfer + ** completes. */ + spin_lock(&bulk_waiter_spinlock); + bulk->userdata = NULL; + spin_unlock(&bulk_waiter_spinlock); + } + } + } + + if (!waiter) { + waiter = kzalloc(sizeof(struct bulk_waiter_node), GFP_KERNEL); + if (!waiter) { + vchiq_log_error(vchiq_core_log_level, + "%s - out of memory", __func__); + return VCHIQ_ERROR; + } + } + + status = vchiq_bulk_transfer(handle, VCHI_MEM_HANDLE_INVALID, + data, size, &waiter->bulk_waiter, VCHIQ_BULK_MODE_BLOCKING, + dir); + if ((status != VCHIQ_RETRY) || fatal_signal_pending(current) || + !waiter->bulk_waiter.bulk) { + VCHIQ_BULK_T *bulk = waiter->bulk_waiter.bulk; + if (bulk) { + /* Cancel the signal when the transfer + ** completes. */ + spin_lock(&bulk_waiter_spinlock); + bulk->userdata = NULL; + spin_unlock(&bulk_waiter_spinlock); + } + _sema_destroy(&waiter->bulk_waiter.event); + + kfree(waiter); + } else { + waiter->pid = current->p_pid; + lmutex_lock(&instance->bulk_waiter_list_mutex); + list_add(&waiter->list, &instance->bulk_waiter_list); + lmutex_unlock(&instance->bulk_waiter_list_mutex); + vchiq_log_info(vchiq_arm_log_level, + "saved bulk_waiter %x for pid %d", + (unsigned int)waiter, current->p_pid); + } + + return status; +} Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_kern_lib.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_kmod.c =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_kmod.c (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_kmod.c (revision 278312) @@ -0,0 +1,217 @@ +/*- + * Copyright (c) 2012-2015 Oleksandr Tymoshenko + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include "vchiq_arm.h" +#include "vchiq_2835.h" + +#define VCHIQ_LOCK do { \ + mtx_lock(&bcm_vchiq_sc->lock); \ +} while(0) + +#define VCHIQ_UNLOCK do { \ + mtx_unlock(&bcm_vchiq_sc->lock); \ +} while(0) + +#ifdef DEBUG +#define dprintf(fmt, args...) printf(fmt, ##args) +#else +#define dprintf(fmt, args...) +#endif + +struct bcm_vchiq_softc { + struct mtx lock; + struct resource * mem_res; + struct resource * irq_res; + void* intr_hl; + bus_space_tag_t bst; + bus_space_handle_t bsh; +}; + +static struct bcm_vchiq_softc *bcm_vchiq_sc = NULL; + +#define vchiq_read_4(reg) \ + bus_space_read_4(bcm_vchiq_sc->bst, bcm_vchiq_sc->bsh, reg) +#define vchiq_write_4(reg, val) \ + bus_space_write_4(bcm_vchiq_sc->bst, bcm_vchiq_sc->bsh, reg, val) + +/* + * Extern functions */ +void vchiq_exit(void); +int vchiq_init(void); + +extern VCHIQ_STATE_T g_state; + +static void +bcm_vchiq_intr(void *arg) +{ + VCHIQ_STATE_T *state = &g_state; + unsigned int status; + + /* Read (and clear) the doorbell */ + status = vchiq_read_4(0x40); + + if (status & 0x4) { /* Was the doorbell rung? */ + remote_event_pollall(state); + } +} + +void +remote_event_signal(REMOTE_EVENT_T *event) +{ + event->fired = 1; + + /* The test on the next line also ensures the write on the previous line + has completed */ + if (event->armed) { + /* trigger vc interrupt */ + __asm __volatile ("mcr p15, 0, %0, c7, c10, 4" : : "r" (0) : "memory"); + vchiq_write_4(0x48, 0); + } +} + +static int +bcm_vchiq_probe(device_t dev) +{ + + if (ofw_bus_is_compatible(dev, "broadcom,bcm2835-vchiq")) { + device_set_desc(dev, "BCM2835 VCHIQ"); + return(BUS_PROBE_DEFAULT); + } + + return (ENXIO); +} + +static int +bcm_vchiq_attach(device_t dev) +{ + struct bcm_vchiq_softc *sc = device_get_softc(dev); + int rid = 0; + + if (bcm_vchiq_sc != NULL) + return (EINVAL); + + sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); + if (sc->mem_res == NULL) { + device_printf(dev, "could not allocate memory resource\n"); + return (ENXIO); + } + + sc->bst = rman_get_bustag(sc->mem_res); + sc->bsh = rman_get_bushandle(sc->mem_res); + + rid = 0; + sc->irq_res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, RF_ACTIVE); + if (sc->irq_res == NULL) { + device_printf(dev, "could not allocate interrupt resource\n"); + return (ENXIO); + } + + vchiq_core_initialize(); + + /* Setup and enable the timer */ + if (bus_setup_intr(dev, sc->irq_res, INTR_TYPE_MISC, + NULL, bcm_vchiq_intr, sc, + &sc->intr_hl) != 0) { + bus_release_resource(dev, SYS_RES_IRQ, rid, + sc->irq_res); + device_printf(dev, "Unable to setup the clock irq handler.\n"); + return (ENXIO); + } + + mtx_init(&sc->lock, "vchiq", MTX_DEF, 0); + bcm_vchiq_sc = sc; + + vchiq_init(); + + return (0); +} + +static int +bcm_vchiq_detach(device_t dev) +{ + struct bcm_vchiq_softc *sc = device_get_softc(dev); + + vchiq_exit(); + + if (sc->intr_hl) + bus_teardown_intr(dev, sc->irq_res, sc->intr_hl); + bus_release_resource(dev, SYS_RES_IRQ, 0, + sc->irq_res); + bus_release_resource(dev, SYS_RES_MEMORY, 0, + sc->mem_res); + + mtx_destroy(&sc->lock); + + return (0); +} + + +static device_method_t bcm_vchiq_methods[] = { + DEVMETHOD(device_probe, bcm_vchiq_probe), + DEVMETHOD(device_attach, bcm_vchiq_attach), + DEVMETHOD(device_detach, bcm_vchiq_detach), + + /* Bus interface */ + DEVMETHOD(bus_add_child, bus_generic_add_child), + + { 0, 0 } +}; + +static driver_t bcm_vchiq_driver = { + "vchiq", + bcm_vchiq_methods, + sizeof(struct bcm_vchiq_softc), +}; + +static devclass_t bcm_vchiq_devclass; + +DRIVER_MODULE(vchiq, simplebus, bcm_vchiq_driver, bcm_vchiq_devclass, 0, 0); +MODULE_VERSION(vchiq, 1); Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_kmod.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_memdrv.h =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_memdrv.h (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_memdrv.h (revision 278312) @@ -0,0 +1,71 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef VCHIQ_MEMDRV_H +#define VCHIQ_MEMDRV_H + +/* ---- Include Files ----------------------------------------------------- */ + +#include +#include "vchiq_if.h" + +/* ---- Constants and Types ---------------------------------------------- */ + +typedef struct { + void *armSharedMemVirt; + dma_addr_t armSharedMemPhys; + size_t armSharedMemSize; + + void *vcSharedMemVirt; + dma_addr_t vcSharedMemPhys; + size_t vcSharedMemSize; +} VCHIQ_SHARED_MEM_INFO_T; + +/* ---- Variable Externs ------------------------------------------------- */ + +/* ---- Function Prototypes ---------------------------------------------- */ + +void vchiq_get_shared_mem_info(VCHIQ_SHARED_MEM_INFO_T *info); + +VCHIQ_STATUS_T vchiq_memdrv_initialise(void); + +VCHIQ_STATUS_T vchiq_userdrv_create_instance( + const VCHIQ_PLATFORM_DATA_T * platform_data); + +VCHIQ_STATUS_T vchiq_userdrv_suspend( + const VCHIQ_PLATFORM_DATA_T * platform_data); + +VCHIQ_STATUS_T vchiq_userdrv_resume( + const VCHIQ_PLATFORM_DATA_T * platform_data); + +#endif Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_memdrv.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_pagelist.h =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_pagelist.h (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_pagelist.h (revision 278312) @@ -0,0 +1,59 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef VCHIQ_PAGELIST_H +#define VCHIQ_PAGELIST_H + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif +#undef CACHE_LINE_SIZE +#define CACHE_LINE_SIZE 32 +#define PAGELIST_WRITE 0 +#define PAGELIST_READ 1 +#define PAGELIST_READ_WITH_FRAGMENTS 2 + +typedef struct pagelist_struct { + unsigned long length; + unsigned short type; + unsigned short offset; + unsigned long addrs[1]; /* N.B. 12 LSBs hold the number of following + pages at consecutive addresses. */ +} PAGELIST_T; + +typedef struct fragments_struct { + char headbuf[CACHE_LINE_SIZE]; + char tailbuf[CACHE_LINE_SIZE]; +} FRAGMENTS_T; + +#endif /* VCHIQ_PAGELIST_H */ Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_pagelist.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_proc.c =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_proc.c (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_proc.c (revision 278312) @@ -0,0 +1,240 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + + +#include +#include "vchiq_core.h" +#include "vchiq_arm.h" + +struct vchiq_proc_info { + /* Global 'vc' proc entry used by all instances */ + struct proc_dir_entry *vc_cfg_dir; + + /* one entry per client process */ + struct proc_dir_entry *clients; + + /* log categories */ + struct proc_dir_entry *log_categories; +}; + +static struct vchiq_proc_info proc_info; + +struct proc_dir_entry *vchiq_proc_top(void) +{ + BUG_ON(proc_info.vc_cfg_dir == NULL); + return proc_info.vc_cfg_dir; +} + +/**************************************************************************** +* +* log category entries +* +***************************************************************************/ +#define PROC_WRITE_BUF_SIZE 256 + +#define VCHIQ_LOG_ERROR_STR "error" +#define VCHIQ_LOG_WARNING_STR "warning" +#define VCHIQ_LOG_INFO_STR "info" +#define VCHIQ_LOG_TRACE_STR "trace" + +static int log_cfg_read(char *buffer, + char **start, + off_t off, + int count, + int *eof, + void *data) +{ + int len = 0; + char *log_value = NULL; + + switch (*((int *)data)) { + case VCHIQ_LOG_ERROR: + log_value = VCHIQ_LOG_ERROR_STR; + break; + case VCHIQ_LOG_WARNING: + log_value = VCHIQ_LOG_WARNING_STR; + break; + case VCHIQ_LOG_INFO: + log_value = VCHIQ_LOG_INFO_STR; + break; + case VCHIQ_LOG_TRACE: + log_value = VCHIQ_LOG_TRACE_STR; + break; + default: + break; + } + + len += snprintf(buffer + len, count - len, + "%s\n", + log_value ? log_value : "(null)"); + + return len; +} + + +static int log_cfg_write(struct file *file, + const char __user *buffer, + unsigned long count, + void *data) +{ + int *log_module = data; + char kbuf[PROC_WRITE_BUF_SIZE + 1]; + + (void)file; + + memset(kbuf, 0, PROC_WRITE_BUF_SIZE + 1); + if (count >= PROC_WRITE_BUF_SIZE) + count = PROC_WRITE_BUF_SIZE; + + if (copy_from_user(kbuf, + buffer, + count) != 0) + return -EFAULT; + kbuf[count - 1] = 0; + + if (strncmp("error", kbuf, strlen("error")) == 0) + *log_module = VCHIQ_LOG_ERROR; + else if (strncmp("warning", kbuf, strlen("warning")) == 0) + *log_module = VCHIQ_LOG_WARNING; + else if (strncmp("info", kbuf, strlen("info")) == 0) + *log_module = VCHIQ_LOG_INFO; + else if (strncmp("trace", kbuf, strlen("trace")) == 0) + *log_module = VCHIQ_LOG_TRACE; + else + *log_module = VCHIQ_LOG_DEFAULT; + + return count; +} + +/* Log category proc entries */ +struct vchiq_proc_log_entry { + const char *name; + int *plevel; + struct proc_dir_entry *dir; +}; + +static struct vchiq_proc_log_entry vchiq_proc_log_entries[] = { + { "core", &vchiq_core_log_level }, + { "msg", &vchiq_core_msg_log_level }, + { "sync", &vchiq_sync_log_level }, + { "susp", &vchiq_susp_log_level }, + { "arm", &vchiq_arm_log_level }, +}; +static int n_log_entries = + sizeof(vchiq_proc_log_entries)/sizeof(vchiq_proc_log_entries[0]); + +/* create an entry under /proc/vc/log for each log category */ +static int vchiq_proc_create_log_entries(struct proc_dir_entry *top) +{ + struct proc_dir_entry *dir; + size_t i; + int ret = 0; + + dir = proc_mkdir("log", proc_info.vc_cfg_dir); + if (!dir) + return -ENOMEM; + proc_info.log_categories = dir; + + for (i = 0; i < n_log_entries; i++) { + dir = create_proc_entry(vchiq_proc_log_entries[i].name, + 0644, + proc_info.log_categories); + if (!dir) { + ret = -ENOMEM; + break; + } + + dir->read_proc = &log_cfg_read; + dir->write_proc = &log_cfg_write; + dir->data = (void *)vchiq_proc_log_entries[i].plevel; + + vchiq_proc_log_entries[i].dir = dir; + } + return ret; +} + + +int vchiq_proc_init(void) +{ + BUG_ON(proc_info.vc_cfg_dir != NULL); + + proc_info.vc_cfg_dir = proc_mkdir("vc", NULL); + if (proc_info.vc_cfg_dir == NULL) + goto fail; + + proc_info.clients = proc_mkdir("clients", + proc_info.vc_cfg_dir); + if (!proc_info.clients) + goto fail; + + if (vchiq_proc_create_log_entries(proc_info.vc_cfg_dir) != 0) + goto fail; + + return 0; + +fail: + vchiq_proc_deinit(); + vchiq_log_error(vchiq_arm_log_level, + "%s: failed to create proc directory", + __func__); + + return -ENOMEM; +} + +/* remove all the proc entries */ +void vchiq_proc_deinit(void) +{ + /* log category entries */ + if (proc_info.log_categories) { + size_t i; + for (i = 0; i < n_log_entries; i++) + if (vchiq_proc_log_entries[i].dir) + remove_proc_entry( + vchiq_proc_log_entries[i].name, + proc_info.log_categories); + + remove_proc_entry(proc_info.log_categories->name, + proc_info.vc_cfg_dir); + } + if (proc_info.clients) + remove_proc_entry(proc_info.clients->name, + proc_info.vc_cfg_dir); + if (proc_info.vc_cfg_dir) + remove_proc_entry(proc_info.vc_cfg_dir->name, NULL); +} + +struct proc_dir_entry *vchiq_clients_top(void) +{ + return proc_info.clients; +} + Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_proc.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_shim.c =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_shim.c (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_shim.c (revision 278312) @@ -0,0 +1,830 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#include "interface/vchi/vchi.h" +#include "vchiq.h" +#include "vchiq_core.h" + +#include "vchiq_util.h" + +#define vchiq_status_to_vchi(status) ((int32_t)status) + +typedef struct { + VCHIQ_SERVICE_HANDLE_T handle; + + VCHIU_QUEUE_T queue; + + VCHI_CALLBACK_T callback; + void *callback_param; +} SHIM_SERVICE_T; + +/* ---------------------------------------------------------------------- + * return pointer to the mphi message driver function table + * -------------------------------------------------------------------- */ +const VCHI_MESSAGE_DRIVER_T * +vchi_mphi_message_driver_func_table(void) +{ + return NULL; +} + +/* ---------------------------------------------------------------------- + * return a pointer to the 'single' connection driver fops + * -------------------------------------------------------------------- */ +const VCHI_CONNECTION_API_T * +single_get_func_table(void) +{ + return NULL; +} + +VCHI_CONNECTION_T *vchi_create_connection( + const VCHI_CONNECTION_API_T *function_table, + const VCHI_MESSAGE_DRIVER_T *low_level) +{ + (void)function_table; + (void)low_level; + return NULL; +} + +/*********************************************************** + * Name: vchi_msg_peek + * + * Arguments: const VCHI_SERVICE_HANDLE_T handle, + * void **data, + * uint32_t *msg_size, + + + * VCHI_FLAGS_T flags + * + * Description: Routine to return a pointer to the current message (to allow in + * place processing). The message can be removed using + * vchi_msg_remove when you're finished + * + * Returns: int32_t - success == 0 + * + ***********************************************************/ +int32_t vchi_msg_peek(VCHI_SERVICE_HANDLE_T handle, + void **data, + uint32_t *msg_size, + VCHI_FLAGS_T flags) +{ + SHIM_SERVICE_T *service = (SHIM_SERVICE_T *)handle; + VCHIQ_HEADER_T *header; + + WARN_ON((flags != VCHI_FLAGS_NONE) && + (flags != VCHI_FLAGS_BLOCK_UNTIL_OP_COMPLETE)); + + if (flags == VCHI_FLAGS_NONE) + if (vchiu_queue_is_empty(&service->queue)) + return -1; + + header = vchiu_queue_peek(&service->queue); + + *data = header->data; + *msg_size = header->size; + + return 0; +} +EXPORT_SYMBOL(vchi_msg_peek); + +/*********************************************************** + * Name: vchi_msg_remove + * + * Arguments: const VCHI_SERVICE_HANDLE_T handle, + * + * Description: Routine to remove a message (after it has been read with + * vchi_msg_peek) + * + * Returns: int32_t - success == 0 + * + ***********************************************************/ +int32_t vchi_msg_remove(VCHI_SERVICE_HANDLE_T handle) +{ + SHIM_SERVICE_T *service = (SHIM_SERVICE_T *)handle; + VCHIQ_HEADER_T *header; + + header = vchiu_queue_pop(&service->queue); + + vchiq_release_message(service->handle, header); + + return 0; +} +EXPORT_SYMBOL(vchi_msg_remove); + +/*********************************************************** + * Name: vchi_msg_queue + * + * Arguments: VCHI_SERVICE_HANDLE_T handle, + * const void *data, + * uint32_t data_size, + * VCHI_FLAGS_T flags, + * void *msg_handle, + * + * Description: Thin wrapper to queue a message onto a connection + * + * Returns: int32_t - success == 0 + * + ***********************************************************/ +int32_t vchi_msg_queue(VCHI_SERVICE_HANDLE_T handle, + const void *data, + uint32_t data_size, + VCHI_FLAGS_T flags, + void *msg_handle) +{ + SHIM_SERVICE_T *service = (SHIM_SERVICE_T *)handle; + VCHIQ_ELEMENT_T element = {data, data_size}; + VCHIQ_STATUS_T status; + + (void)msg_handle; + + WARN_ON(flags != VCHI_FLAGS_BLOCK_UNTIL_QUEUED); + + status = vchiq_queue_message(service->handle, &element, 1); + + /* vchiq_queue_message() may return VCHIQ_RETRY, so we need to + ** implement a retry mechanism since this function is supposed + ** to block until queued + */ + while (status == VCHIQ_RETRY) { + msleep(1); + status = vchiq_queue_message(service->handle, &element, 1); + } + + return vchiq_status_to_vchi(status); +} +EXPORT_SYMBOL(vchi_msg_queue); + +/*********************************************************** + * Name: vchi_bulk_queue_receive + * + * Arguments: VCHI_BULK_HANDLE_T handle, + * void *data_dst, + * const uint32_t data_size, + * VCHI_FLAGS_T flags + * void *bulk_handle + * + * Description: Routine to setup a rcv buffer + * + * Returns: int32_t - success == 0 + * + ***********************************************************/ +int32_t vchi_bulk_queue_receive(VCHI_SERVICE_HANDLE_T handle, + void *data_dst, + uint32_t data_size, + VCHI_FLAGS_T flags, + void *bulk_handle) +{ + SHIM_SERVICE_T *service = (SHIM_SERVICE_T *)handle; + VCHIQ_BULK_MODE_T mode; + VCHIQ_STATUS_T status; + + switch ((int)flags) { + case VCHI_FLAGS_CALLBACK_WHEN_OP_COMPLETE + | VCHI_FLAGS_BLOCK_UNTIL_QUEUED: + WARN_ON(!service->callback); + mode = VCHIQ_BULK_MODE_CALLBACK; + break; + case VCHI_FLAGS_BLOCK_UNTIL_OP_COMPLETE: + mode = VCHIQ_BULK_MODE_BLOCKING; + break; + case VCHI_FLAGS_BLOCK_UNTIL_QUEUED: + case VCHI_FLAGS_NONE: + mode = VCHIQ_BULK_MODE_NOCALLBACK; + break; + default: + WARN(1, "unsupported message\n"); + return vchiq_status_to_vchi(VCHIQ_ERROR); + } + + status = vchiq_bulk_receive(service->handle, data_dst, data_size, + bulk_handle, mode); + + /* vchiq_bulk_receive() may return VCHIQ_RETRY, so we need to + ** implement a retry mechanism since this function is supposed + ** to block until queued + */ + while (status == VCHIQ_RETRY) { + msleep(1); + status = vchiq_bulk_receive(service->handle, data_dst, + data_size, bulk_handle, mode); + } + + return vchiq_status_to_vchi(status); +} +EXPORT_SYMBOL(vchi_bulk_queue_receive); + +/*********************************************************** + * Name: vchi_bulk_queue_transmit + * + * Arguments: VCHI_BULK_HANDLE_T handle, + * void *data_src, + * uint32_t data_size, + * VCHI_FLAGS_T flags, + * void *bulk_handle + * + * Description: Routine to transmit some data + * + * Returns: int32_t - success == 0 + * + ***********************************************************/ +int32_t vchi_bulk_queue_transmit(VCHI_SERVICE_HANDLE_T handle, + void *data_src, + uint32_t data_size, + VCHI_FLAGS_T flags, + void *bulk_handle) +{ + SHIM_SERVICE_T *service = (SHIM_SERVICE_T *)handle; + VCHIQ_BULK_MODE_T mode; + VCHIQ_STATUS_T status; + + switch ((int)flags) { + case VCHI_FLAGS_CALLBACK_WHEN_OP_COMPLETE + | VCHI_FLAGS_BLOCK_UNTIL_QUEUED: + WARN_ON(!service->callback); + mode = VCHIQ_BULK_MODE_CALLBACK; + break; + case VCHI_FLAGS_BLOCK_UNTIL_DATA_READ: + case VCHI_FLAGS_BLOCK_UNTIL_OP_COMPLETE: + mode = VCHIQ_BULK_MODE_BLOCKING; + break; + case VCHI_FLAGS_BLOCK_UNTIL_QUEUED: + case VCHI_FLAGS_NONE: + mode = VCHIQ_BULK_MODE_NOCALLBACK; + break; + default: + WARN(1, "unsupported message\n"); + return vchiq_status_to_vchi(VCHIQ_ERROR); + } + + status = vchiq_bulk_transmit(service->handle, data_src, data_size, + bulk_handle, mode); + + /* vchiq_bulk_transmit() may return VCHIQ_RETRY, so we need to + ** implement a retry mechanism since this function is supposed + ** to block until queued + */ + while (status == VCHIQ_RETRY) { + msleep(1); + status = vchiq_bulk_transmit(service->handle, data_src, + data_size, bulk_handle, mode); + } + + return vchiq_status_to_vchi(status); +} +EXPORT_SYMBOL(vchi_bulk_queue_transmit); + +/*********************************************************** + * Name: vchi_msg_dequeue + * + * Arguments: VCHI_SERVICE_HANDLE_T handle, + * void *data, + * uint32_t max_data_size_to_read, + * uint32_t *actual_msg_size + * VCHI_FLAGS_T flags + * + * Description: Routine to dequeue a message into the supplied buffer + * + * Returns: int32_t - success == 0 + * + ***********************************************************/ +int32_t vchi_msg_dequeue(VCHI_SERVICE_HANDLE_T handle, + void *data, + uint32_t max_data_size_to_read, + uint32_t *actual_msg_size, + VCHI_FLAGS_T flags) +{ + SHIM_SERVICE_T *service = (SHIM_SERVICE_T *)handle; + VCHIQ_HEADER_T *header; + + WARN_ON((flags != VCHI_FLAGS_NONE) && + (flags != VCHI_FLAGS_BLOCK_UNTIL_OP_COMPLETE)); + + if (flags == VCHI_FLAGS_NONE) + if (vchiu_queue_is_empty(&service->queue)) + return -1; + + header = vchiu_queue_pop(&service->queue); + + memcpy(data, header->data, header->size < max_data_size_to_read ? + header->size : max_data_size_to_read); + + *actual_msg_size = header->size; + + vchiq_release_message(service->handle, header); + + return 0; +} +EXPORT_SYMBOL(vchi_msg_dequeue); + +/*********************************************************** + * Name: vchi_msg_queuev + * + * Arguments: VCHI_SERVICE_HANDLE_T handle, + * VCHI_MSG_VECTOR_T *vector, + * uint32_t count, + * VCHI_FLAGS_T flags, + * void *msg_handle + * + * Description: Thin wrapper to queue a message onto a connection + * + * Returns: int32_t - success == 0 + * + ***********************************************************/ + +vchiq_static_assert(sizeof(VCHI_MSG_VECTOR_T) == sizeof(VCHIQ_ELEMENT_T)); +vchiq_static_assert(offsetof(VCHI_MSG_VECTOR_T, vec_base) == + offsetof(VCHIQ_ELEMENT_T, data)); +vchiq_static_assert(offsetof(VCHI_MSG_VECTOR_T, vec_len) == + offsetof(VCHIQ_ELEMENT_T, size)); + +int32_t vchi_msg_queuev(VCHI_SERVICE_HANDLE_T handle, + VCHI_MSG_VECTOR_T *vector, + uint32_t count, + VCHI_FLAGS_T flags, + void *msg_handle) +{ + SHIM_SERVICE_T *service = (SHIM_SERVICE_T *)handle; + + (void)msg_handle; + + WARN_ON(flags != VCHI_FLAGS_BLOCK_UNTIL_QUEUED); + + return vchiq_status_to_vchi(vchiq_queue_message(service->handle, + (const VCHIQ_ELEMENT_T *)vector, count)); +} +EXPORT_SYMBOL(vchi_msg_queuev); + +/*********************************************************** + * Name: vchi_held_msg_release + * + * Arguments: VCHI_HELD_MSG_T *message + * + * Description: Routine to release a held message (after it has been read with + * vchi_msg_hold) + * + * Returns: int32_t - success == 0 + * + ***********************************************************/ +int32_t vchi_held_msg_release(VCHI_HELD_MSG_T *message) +{ + vchiq_release_message((VCHIQ_SERVICE_HANDLE_T)message->service, + (VCHIQ_HEADER_T *)message->message); + + return 0; +} + +/*********************************************************** + * Name: vchi_msg_hold + * + * Arguments: VCHI_SERVICE_HANDLE_T handle, + * void **data, + * uint32_t *msg_size, + * VCHI_FLAGS_T flags, + * VCHI_HELD_MSG_T *message_handle + * + * Description: Routine to return a pointer to the current message (to allow + * in place processing). The message is dequeued - don't forget + * to release the message using vchi_held_msg_release when you're + * finished. + * + * Returns: int32_t - success == 0 + * + ***********************************************************/ +int32_t vchi_msg_hold(VCHI_SERVICE_HANDLE_T handle, + void **data, + uint32_t *msg_size, + VCHI_FLAGS_T flags, + VCHI_HELD_MSG_T *message_handle) +{ + SHIM_SERVICE_T *service = (SHIM_SERVICE_T *)handle; + VCHIQ_HEADER_T *header; + + WARN_ON((flags != VCHI_FLAGS_NONE) && + (flags != VCHI_FLAGS_BLOCK_UNTIL_OP_COMPLETE)); + + if (flags == VCHI_FLAGS_NONE) + if (vchiu_queue_is_empty(&service->queue)) + return -1; + + header = vchiu_queue_pop(&service->queue); + + *data = header->data; + *msg_size = header->size; + + message_handle->service = + (struct opaque_vchi_service_t *)service->handle; + message_handle->message = header; + + return 0; +} + +/*********************************************************** + * Name: vchi_initialise + * + * Arguments: VCHI_INSTANCE_T *instance_handle + * VCHI_CONNECTION_T **connections + * const uint32_t num_connections + * + * Description: Initialises the hardware but does not transmit anything + * When run as a Host App this will be called twice hence the need + * to malloc the state information + * + * Returns: 0 if successful, failure otherwise + * + ***********************************************************/ + +int32_t vchi_initialise(VCHI_INSTANCE_T *instance_handle) +{ + VCHIQ_INSTANCE_T instance; + VCHIQ_STATUS_T status; + + status = vchiq_initialise(&instance); + + *instance_handle = (VCHI_INSTANCE_T)instance; + + return vchiq_status_to_vchi(status); +} +EXPORT_SYMBOL(vchi_initialise); + +/*********************************************************** + * Name: vchi_connect + * + * Arguments: VCHI_CONNECTION_T **connections + * const uint32_t num_connections + * VCHI_INSTANCE_T instance_handle) + * + * Description: Starts the command service on each connection, + * causing INIT messages to be pinged back and forth + * + * Returns: 0 if successful, failure otherwise + * + ***********************************************************/ +int32_t vchi_connect(VCHI_CONNECTION_T **connections, + const uint32_t num_connections, + VCHI_INSTANCE_T instance_handle) +{ + VCHIQ_INSTANCE_T instance = (VCHIQ_INSTANCE_T)instance_handle; + + (void)connections; + (void)num_connections; + + return vchiq_connect(instance); +} +EXPORT_SYMBOL(vchi_connect); + + +/*********************************************************** + * Name: vchi_disconnect + * + * Arguments: VCHI_INSTANCE_T instance_handle + * + * Description: Stops the command service on each connection, + * causing DE-INIT messages to be pinged back and forth + * + * Returns: 0 if successful, failure otherwise + * + ***********************************************************/ +int32_t vchi_disconnect(VCHI_INSTANCE_T instance_handle) +{ + VCHIQ_INSTANCE_T instance = (VCHIQ_INSTANCE_T)instance_handle; + return vchiq_status_to_vchi(vchiq_shutdown(instance)); +} +EXPORT_SYMBOL(vchi_disconnect); + + +/*********************************************************** + * Name: vchi_service_open + * Name: vchi_service_create + * + * Arguments: VCHI_INSTANCE_T *instance_handle + * SERVICE_CREATION_T *setup, + * VCHI_SERVICE_HANDLE_T *handle + * + * Description: Routine to open a service + * + * Returns: int32_t - success == 0 + * + ***********************************************************/ + +static VCHIQ_STATUS_T shim_callback(VCHIQ_REASON_T reason, + VCHIQ_HEADER_T *header, VCHIQ_SERVICE_HANDLE_T handle, void *bulk_user) +{ + SHIM_SERVICE_T *service = + (SHIM_SERVICE_T *)VCHIQ_GET_SERVICE_USERDATA(handle); + + if (!service->callback) + goto release; + + switch (reason) { + case VCHIQ_MESSAGE_AVAILABLE: + vchiu_queue_push(&service->queue, header); + + service->callback(service->callback_param, + VCHI_CALLBACK_MSG_AVAILABLE, NULL); + + goto done; + break; + + case VCHIQ_BULK_TRANSMIT_DONE: + service->callback(service->callback_param, + VCHI_CALLBACK_BULK_SENT, bulk_user); + break; + + case VCHIQ_BULK_RECEIVE_DONE: + service->callback(service->callback_param, + VCHI_CALLBACK_BULK_RECEIVED, bulk_user); + break; + + case VCHIQ_SERVICE_CLOSED: + service->callback(service->callback_param, + VCHI_CALLBACK_SERVICE_CLOSED, NULL); + break; + + case VCHIQ_SERVICE_OPENED: + /* No equivalent VCHI reason */ + break; + + case VCHIQ_BULK_TRANSMIT_ABORTED: + service->callback(service->callback_param, + VCHI_CALLBACK_BULK_TRANSMIT_ABORTED, + bulk_user); + break; + + case VCHIQ_BULK_RECEIVE_ABORTED: + service->callback(service->callback_param, + VCHI_CALLBACK_BULK_RECEIVE_ABORTED, + bulk_user); + break; + + default: + WARN(1, "not supported\n"); + break; + } + +release: + vchiq_release_message(service->handle, header); +done: + return VCHIQ_SUCCESS; +} + +static SHIM_SERVICE_T *service_alloc(VCHIQ_INSTANCE_T instance, + SERVICE_CREATION_T *setup) +{ + SHIM_SERVICE_T *service = kzalloc(sizeof(SHIM_SERVICE_T), GFP_KERNEL); + + (void)instance; + + if (service) { + if (vchiu_queue_init(&service->queue, 64)) { + service->callback = setup->callback; + service->callback_param = setup->callback_param; + } else { + kfree(service); + service = NULL; + } + } + + return service; +} + +static void service_free(SHIM_SERVICE_T *service) +{ + if (service) { + vchiu_queue_delete(&service->queue); + kfree(service); + } +} + +int32_t vchi_service_open(VCHI_INSTANCE_T instance_handle, + SERVICE_CREATION_T *setup, + VCHI_SERVICE_HANDLE_T *handle) +{ + VCHIQ_INSTANCE_T instance = (VCHIQ_INSTANCE_T)instance_handle; + SHIM_SERVICE_T *service = service_alloc(instance, setup); + + *handle = (VCHI_SERVICE_HANDLE_T)service; + + if (service) { + VCHIQ_SERVICE_PARAMS_T params; + VCHIQ_STATUS_T status; + + memset(¶ms, 0, sizeof(params)); + params.fourcc = setup->service_id; + params.callback = shim_callback; + params.userdata = service; + params.version = setup->version.version; + params.version_min = setup->version.version_min; + + status = vchiq_open_service(instance, ¶ms, + &service->handle); + if (status != VCHIQ_SUCCESS) { + service_free(service); + service = NULL; + *handle = NULL; + } + } + + return (service != NULL) ? 0 : -1; +} +EXPORT_SYMBOL(vchi_service_open); + +int32_t vchi_service_create(VCHI_INSTANCE_T instance_handle, + SERVICE_CREATION_T *setup, + VCHI_SERVICE_HANDLE_T *handle) +{ + VCHIQ_INSTANCE_T instance = (VCHIQ_INSTANCE_T)instance_handle; + SHIM_SERVICE_T *service = service_alloc(instance, setup); + + *handle = (VCHI_SERVICE_HANDLE_T)service; + + if (service) { + VCHIQ_SERVICE_PARAMS_T params; + VCHIQ_STATUS_T status; + + memset(¶ms, 0, sizeof(params)); + params.fourcc = setup->service_id; + params.callback = shim_callback; + params.userdata = service; + params.version = setup->version.version; + params.version_min = setup->version.version_min; + status = vchiq_add_service(instance, ¶ms, &service->handle); + + if (status != VCHIQ_SUCCESS) { + service_free(service); + service = NULL; + *handle = NULL; + } + } + + return (service != NULL) ? 0 : -1; +} +EXPORT_SYMBOL(vchi_service_create); + +int32_t vchi_service_close(const VCHI_SERVICE_HANDLE_T handle) +{ + int32_t ret = -1; + SHIM_SERVICE_T *service = (SHIM_SERVICE_T *)handle; + if (service) { + VCHIQ_STATUS_T status = vchiq_close_service(service->handle); + if (status == VCHIQ_SUCCESS) { + service_free(service); + service = NULL; + } + + ret = vchiq_status_to_vchi(status); + } + return ret; +} +EXPORT_SYMBOL(vchi_service_close); + +int32_t vchi_service_destroy(const VCHI_SERVICE_HANDLE_T handle) +{ + int32_t ret = -1; + SHIM_SERVICE_T *service = (SHIM_SERVICE_T *)handle; + if (service) { + VCHIQ_STATUS_T status = vchiq_remove_service(service->handle); + if (status == VCHIQ_SUCCESS) { + service_free(service); + service = NULL; + } + + ret = vchiq_status_to_vchi(status); + } + return ret; +} +EXPORT_SYMBOL(vchi_service_destroy); + +int32_t vchi_get_peer_version( const VCHI_SERVICE_HANDLE_T handle, short *peer_version ) +{ + int32_t ret = -1; + SHIM_SERVICE_T *service = (SHIM_SERVICE_T *)handle; + if(service) + { + VCHIQ_STATUS_T status = vchiq_get_peer_version(service->handle, peer_version); + ret = vchiq_status_to_vchi( status ); + } + return ret; +} +EXPORT_SYMBOL(vchi_get_peer_version); + +#ifdef notyet +/* ---------------------------------------------------------------------- + * read a uint32_t from buffer. + * network format is defined to be little endian + * -------------------------------------------------------------------- */ +uint32_t +vchi_readbuf_uint32(const void *_ptr) +{ + const unsigned char *ptr = _ptr; + return ptr[0] | (ptr[1] << 8) | (ptr[2] << 16) | (ptr[3] << 24); +} + +/* ---------------------------------------------------------------------- + * write a uint32_t to buffer. + * network format is defined to be little endian + * -------------------------------------------------------------------- */ +void +vchi_writebuf_uint32(void *_ptr, uint32_t value) +{ + unsigned char *ptr = _ptr; + ptr[0] = (unsigned char)((value >> 0) & 0xFF); + ptr[1] = (unsigned char)((value >> 8) & 0xFF); + ptr[2] = (unsigned char)((value >> 16) & 0xFF); + ptr[3] = (unsigned char)((value >> 24) & 0xFF); +} + +/* ---------------------------------------------------------------------- + * read a uint16_t from buffer. + * network format is defined to be little endian + * -------------------------------------------------------------------- */ +uint16_t +vchi_readbuf_uint16(const void *_ptr) +{ + const unsigned char *ptr = _ptr; + return ptr[0] | (ptr[1] << 8); +} + +/* ---------------------------------------------------------------------- + * write a uint16_t into the buffer. + * network format is defined to be little endian + * -------------------------------------------------------------------- */ +void +vchi_writebuf_uint16(void *_ptr, uint16_t value) +{ + unsigned char *ptr = _ptr; + ptr[0] = (value >> 0) & 0xFF; + ptr[1] = (value >> 8) & 0xFF; +} +#endif + +/*********************************************************** + * Name: vchi_service_use + * + * Arguments: const VCHI_SERVICE_HANDLE_T handle + * + * Description: Routine to increment refcount on a service + * + * Returns: void + * + ***********************************************************/ +int32_t vchi_service_use(const VCHI_SERVICE_HANDLE_T handle) +{ + int32_t ret = -1; + SHIM_SERVICE_T *service = (SHIM_SERVICE_T *)handle; + if (service) + ret = vchiq_status_to_vchi(vchiq_use_service(service->handle)); + return ret; +} +EXPORT_SYMBOL(vchi_service_use); + +/*********************************************************** + * Name: vchi_service_release + * + * Arguments: const VCHI_SERVICE_HANDLE_T handle + * + * Description: Routine to decrement refcount on a service + * + * Returns: void + * + ***********************************************************/ +int32_t vchi_service_release(const VCHI_SERVICE_HANDLE_T handle) +{ + int32_t ret = -1; + SHIM_SERVICE_T *service = (SHIM_SERVICE_T *)handle; + if (service) + ret = vchiq_status_to_vchi( + vchiq_release_service(service->handle)); + return ret; +} +EXPORT_SYMBOL(vchi_service_release); Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_shim.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_util.c =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_util.c (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_util.c (revision 278312) @@ -0,0 +1,151 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "vchiq_util.h" + +static inline int is_pow2(int i) +{ + return i && !(i & (i - 1)); +} + +int vchiu_queue_init(VCHIU_QUEUE_T *queue, int size) +{ + WARN_ON(!is_pow2(size)); + + queue->size = size; + queue->read = 0; + queue->write = 0; + + _sema_init(&queue->pop, 0); + _sema_init(&queue->push, 0); + + queue->storage = kzalloc(size * sizeof(VCHIQ_HEADER_T *), GFP_KERNEL); + if (queue->storage == NULL) { + vchiu_queue_delete(queue); + return 0; + } + return 1; +} + +void vchiu_queue_delete(VCHIU_QUEUE_T *queue) +{ + if (queue->storage != NULL) + kfree(queue->storage); +} + +int vchiu_queue_is_empty(VCHIU_QUEUE_T *queue) +{ + return queue->read == queue->write; +} + +int vchiu_queue_is_full(VCHIU_QUEUE_T *queue) +{ + return queue->write == queue->read + queue->size; +} + +void vchiu_queue_push(VCHIU_QUEUE_T *queue, VCHIQ_HEADER_T *header) +{ + while (queue->write == queue->read + queue->size) { + if (down_interruptible(&queue->pop) != 0) { + flush_signals(current); + } + } + + /* + * Write to queue->storage must be visible after read from + * queue->read + */ + smp_mb(); + + queue->storage[queue->write & (queue->size - 1)] = header; + + /* + * Write to queue->storage must be visible before write to + * queue->write + */ + smp_wmb(); + + queue->write++; + + up(&queue->push); +} + +VCHIQ_HEADER_T *vchiu_queue_peek(VCHIU_QUEUE_T *queue) +{ + while (queue->write == queue->read) { + if (down_interruptible(&queue->push) != 0) { + flush_signals(current); + } + } + + up(&queue->push); // We haven't removed anything from the queue. + + /* + * Read from queue->storage must be visible after read from + * queue->write + */ + smp_rmb(); + + return queue->storage[queue->read & (queue->size - 1)]; +} + +VCHIQ_HEADER_T *vchiu_queue_pop(VCHIU_QUEUE_T *queue) +{ + VCHIQ_HEADER_T *header; + + while (queue->write == queue->read) { + if (down_interruptible(&queue->push) != 0) { + flush_signals(current); + } + } + + /* + * Read from queue->storage must be visible after read from + * queue->write + */ + smp_rmb(); + + header = queue->storage[queue->read & (queue->size - 1)]; + + /* + * Read from queue->storage must be visible before write to + * queue->read + */ + smp_mb(); + + queue->read++; + + up(&queue->pop); + + return header; +} Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_util.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_util.h =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_util.h (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_util.h (revision 278312) @@ -0,0 +1,66 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef VCHIQ_UTIL_H +#define VCHIQ_UTIL_H + +#ifdef __FreeBSD__ +#include +#endif + +#include "vchiq_if.h" + +typedef struct { + int size; + int read; + int write; + + struct semaphore pop; + struct semaphore push; + + VCHIQ_HEADER_T **storage; +} VCHIU_QUEUE_T; + +extern int vchiu_queue_init(VCHIU_QUEUE_T *queue, int size); +extern void vchiu_queue_delete(VCHIU_QUEUE_T *queue); + +extern int vchiu_queue_is_empty(VCHIU_QUEUE_T *queue); +extern int vchiu_queue_is_full(VCHIU_QUEUE_T *queue); + +extern void vchiu_queue_push(VCHIU_QUEUE_T *queue, VCHIQ_HEADER_T *header); + +extern VCHIQ_HEADER_T *vchiu_queue_peek(VCHIU_QUEUE_T *queue); +extern VCHIQ_HEADER_T *vchiu_queue_pop(VCHIU_QUEUE_T *queue); + +#endif + Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_util.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_version.c =================================================================== --- projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_version.c (nonexistent) +++ projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_version.c (revision 278312) @@ -0,0 +1,59 @@ +/** + * Copyright (c) 2010-2012 Broadcom. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The names of the above-listed copyright holders may not be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2, as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include "vchiq_build_info.h" +#include + +VC_DEBUG_DECLARE_STRING_VAR( vchiq_build_hostname, "dc4-arm-01" ); +VC_DEBUG_DECLARE_STRING_VAR( vchiq_build_version, "9245b4c35b99b3870e1f7dc598c5692b3c66a6f0 (tainted)" ); +VC_DEBUG_DECLARE_STRING_VAR( vchiq_build_time, __TIME__ ); +VC_DEBUG_DECLARE_STRING_VAR( vchiq_build_date, __DATE__ ); + +const char *vchiq_get_build_hostname( void ) +{ + return vchiq_build_hostname; +} + +const char *vchiq_get_build_version( void ) +{ + return vchiq_build_version; +} + +const char *vchiq_get_build_date( void ) +{ + return vchiq_build_date; +} + +const char *vchiq_get_build_time( void ) +{ + return vchiq_build_time; +} Property changes on: projects/building-blocks/sys/contrib/vchiq/interface/vchiq_arm/vchiq_version.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/dev/cxgbe/if_cxl.c =================================================================== --- projects/building-blocks/sys/dev/cxgbe/if_cxl.c (nonexistent) +++ projects/building-blocks/sys/dev/cxgbe/if_cxl.c (revision 278312) @@ -0,0 +1,44 @@ +/*- + * Copyright (c) 2015 Chelsio Communications, Inc. + * All rights reserved. + * Written by: Navdeep Parhar + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +static int +mod_event(module_t mod, int cmd, void *arg) +{ + + return (0); +} +static moduledata_t if_cxl_mod = {"if_cxl", mod_event}; +DECLARE_MODULE(if_cxl, if_cxl_mod, SI_SUB_EXEC, SI_ORDER_ANY); +MODULE_VERSION(if_cxl, 1); +MODULE_DEPEND(if_cxl, cxl, 1, 1, 1); Property changes on: projects/building-blocks/sys/dev/cxgbe/if_cxl.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/dev/uart/uart_bus_pci.c =================================================================== --- projects/building-blocks/sys/dev/uart/uart_bus_pci.c (revision 278311) +++ projects/building-blocks/sys/dev/uart/uart_bus_pci.c (revision 278312) @@ -1,198 +1,199 @@ /*- * Copyright (c) 2006 Marcel Moolenaar * Copyright (c) 2001 M. Warner Losh * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #define DEFAULT_RCLK 1843200 static int uart_pci_probe(device_t dev); static device_method_t uart_pci_methods[] = { /* Device interface */ DEVMETHOD(device_probe, uart_pci_probe), DEVMETHOD(device_attach, uart_bus_attach), DEVMETHOD(device_detach, uart_bus_detach), DEVMETHOD(device_resume, uart_bus_resume), DEVMETHOD_END }; static driver_t uart_pci_driver = { uart_driver_name, uart_pci_methods, sizeof(struct uart_softc), }; struct pci_id { uint16_t vendor; uint16_t device; uint16_t subven; uint16_t subdev; const char *desc; int rid; int rclk; }; static const struct pci_id pci_ns8250_ids[] = { { 0x1028, 0x0008, 0xffff, 0, "Dell Remote Access Card III", 0x14, 128 * DEFAULT_RCLK }, { 0x1028, 0x0012, 0xffff, 0, "Dell RAC 4 Daughter Card Virtual UART", 0x14, 128 * DEFAULT_RCLK }, { 0x1033, 0x0074, 0x1033, 0x8014, "NEC RCV56ACF 56k Voice Modem", 0x10 }, { 0x1033, 0x007d, 0x1033, 0x8012, "NEC RS232C", 0x10 }, { 0x103c, 0x1048, 0x103c, 0x1227, "HP Diva Serial [GSP] UART - Powerbar SP2", 0x10 }, { 0x103c, 0x1048, 0x103c, 0x1301, "HP Diva RMP3", 0x14 }, { 0x103c, 0x1290, 0xffff, 0, "HP Auxiliary Diva Serial Port", 0x18 }, { 0x103c, 0x3301, 0xffff, 0, "HP iLO serial port", 0x10 }, { 0x11c1, 0x0480, 0xffff, 0, "Agere Systems Venus Modem (V90, 56KFlex)", 0x14 }, { 0x115d, 0x0103, 0xffff, 0, "Xircom Cardbus Ethernet + 56k Modem", 0x10 }, { 0x1282, 0x6585, 0xffff, 0, "Davicom 56PDV PCI Modem", 0x10 }, { 0x12b9, 0x1008, 0xffff, 0, "3Com 56K FaxModem Model 5610", 0x10 }, { 0x131f, 0x1000, 0xffff, 0, "Siig CyberSerial (1-port) 16550", 0x18 }, { 0x131f, 0x1001, 0xffff, 0, "Siig CyberSerial (1-port) 16650", 0x18 }, { 0x131f, 0x1002, 0xffff, 0, "Siig CyberSerial (1-port) 16850", 0x18 }, { 0x131f, 0x2000, 0xffff, 0, "Siig CyberSerial (1-port) 16550", 0x10 }, { 0x131f, 0x2001, 0xffff, 0, "Siig CyberSerial (1-port) 16650", 0x10 }, { 0x131f, 0x2002, 0xffff, 0, "Siig CyberSerial (1-port) 16850", 0x10 }, { 0x135c, 0x0190, 0xffff, 0, "Quatech SSCLP-100", 0x18 }, { 0x135c, 0x01c0, 0xffff, 0, "Quatech SSCLP-200/300", 0x18 }, { 0x135e, 0x7101, 0xffff, 0, "Sealevel Systems Single Port RS-232/422/485/530", 0x18 }, { 0x1407, 0x0110, 0xffff, 0, "Lava Computer mfg DSerial-PCI Port A", 0x10 }, { 0x1407, 0x0111, 0xffff, 0, "Lava Computer mfg DSerial-PCI Port B", 0x10 }, { 0x1407, 0x0510, 0xffff, 0, "Lava SP Serial 550 PCI", 0x10 }, { 0x1409, 0x7168, 0x1409, 0x4025, "Timedia Technology Serial Port", 0x10, 8 * DEFAULT_RCLK }, { 0x1409, 0x7168, 0x1409, 0x4027, "Timedia Technology Serial Port", 0x10, 8 * DEFAULT_RCLK }, { 0x1409, 0x7168, 0x1409, 0x4028, "Timedia Technology Serial Port", 0x10, 8 * DEFAULT_RCLK }, { 0x1409, 0x7168, 0x1409, 0x5025, "Timedia Technology Serial Port", 0x10, 8 * DEFAULT_RCLK }, { 0x1409, 0x7168, 0x1409, 0x5027, "Timedia Technology Serial Port", 0x10, 8 * DEFAULT_RCLK }, { 0x1415, 0x950b, 0xffff, 0, "Oxford Semiconductor OXCB950 Cardbus 16950 UART", 0x10, 16384000 }, { 0x1415, 0xc120, 0xffff, 0, "Oxford Semiconductor OXPCIe952 PCIe 16950 UART", 0x10 }, { 0x14e4, 0x4344, 0xffff, 0, "Sony Ericsson GC89 PC Card", 0x10}, { 0x151f, 0x0000, 0xffff, 0, "TOPIC Semiconductor TP560 56k modem", 0x10 }, { 0x1fd4, 0x1999, 0x1fd4, 0x0001, "Sunix SER5xxxx Serial Port", 0x10, 8 * DEFAULT_RCLK }, { 0x8086, 0x1c3d, 0xffff, 0, "Intel AMT - KT Controller", 0x10 }, { 0x8086, 0x1d3d, 0xffff, 0, "Intel C600/X79 Series Chipset KT Controller", 0x10 }, { 0x8086, 0x2a07, 0xffff, 0, "Intel AMT - PM965/GM965 KT Controller", 0x10 }, +{ 0x8086, 0x2a47, 0xffff, 0, "Mobile 4 Series Chipset KT Controller", 0x10 }, { 0x8086, 0x2e17, 0xffff, 0, "4 Series Chipset Serial KT Controller", 0x10 }, { 0x8086, 0x3b67, 0xffff, 0, "5 Series/3400 Series Chipset KT Controller", 0x10 }, { 0x8086, 0x8811, 0xffff, 0, "Intel EG20T Serial Port 0", 0x10 }, { 0x8086, 0x8812, 0xffff, 0, "Intel EG20T Serial Port 1", 0x10 }, { 0x8086, 0x8813, 0xffff, 0, "Intel EG20T Serial Port 2", 0x10 }, { 0x8086, 0x8814, 0xffff, 0, "Intel EG20T Serial Port 3", 0x10 }, { 0x8086, 0x8c3d, 0xffff, 0, "Intel Lynx Point KT Controller", 0x10 }, { 0x8086, 0x8cbd, 0xffff, 0, "Intel Wildcat Point KT Controller", 0x10 }, { 0x9710, 0x9820, 0x1000, 1, "NetMos NM9820 Serial Port", 0x10 }, { 0x9710, 0x9835, 0x1000, 1, "NetMos NM9835 Serial Port", 0x10 }, { 0x9710, 0x9865, 0xa000, 0x1000, "NetMos NM9865 Serial Port", 0x10 }, { 0x9710, 0x9900, 0xa000, 0x1000, "MosChip MCS9900 PCIe to Peripheral Controller", 0x10 }, { 0x9710, 0x9901, 0xa000, 0x1000, "MosChip MCS9901 PCIe to Peripheral Controller", 0x10 }, { 0x9710, 0x9904, 0xa000, 0x1000, "MosChip MCS9904 PCIe to Peripheral Controller", 0x10 }, { 0x9710, 0x9922, 0xa000, 0x1000, "MosChip MCS9922 PCIe to Peripheral Controller", 0x10 }, { 0xdeaf, 0x9051, 0xffff, 0, "Middle Digital PC Weasel Serial Port", 0x10 }, { 0xffff, 0, 0xffff, 0, NULL, 0, 0} }; const static struct pci_id * uart_pci_match(device_t dev, const struct pci_id *id) { uint16_t device, subdev, subven, vendor; vendor = pci_get_vendor(dev); device = pci_get_device(dev); while (id->vendor != 0xffff && (id->vendor != vendor || id->device != device)) id++; if (id->vendor == 0xffff) return (NULL); if (id->subven == 0xffff) return (id); subven = pci_get_subvendor(dev); subdev = pci_get_subdevice(dev); while (id->vendor == vendor && id->device == device && (id->subven != subven || id->subdev != subdev)) id++; return ((id->vendor == vendor && id->device == device) ? id : NULL); } static int uart_pci_probe(device_t dev) { struct uart_softc *sc; const struct pci_id *id; int result; sc = device_get_softc(dev); id = uart_pci_match(dev, pci_ns8250_ids); if (id != NULL) { sc->sc_class = &uart_ns8250_class; goto match; } /* Add checks for non-ns8250 IDs here. */ return (ENXIO); match: result = uart_bus_probe(dev, 0, id->rclk, id->rid, 0); /* Bail out on error. */ if (result > 0) return (result); /* Set/override the device description. */ if (id->desc) device_set_desc(dev, id->desc); return (result); } DRIVER_MODULE(uart, pci, uart_pci_driver, uart_devclass, NULL, NULL); Index: projects/building-blocks/sys/dev/vt/hw/vga/vt_vga.c =================================================================== --- projects/building-blocks/sys/dev/vt/hw/vga/vt_vga.c (revision 278311) +++ projects/building-blocks/sys/dev/vt/hw/vga/vt_vga.c (revision 278312) @@ -1,1298 +1,1299 @@ /*- * Copyright (c) 2005 Marcel Moolenaar * All rights reserved. * * Copyright (c) 2009 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Ed Schouten * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #if defined(__amd64__) || defined(__i386__) #include #include #include #include #endif /* __amd64__ || __i386__ */ struct vga_softc { bus_space_tag_t vga_fb_tag; bus_space_handle_t vga_fb_handle; bus_space_tag_t vga_reg_tag; bus_space_handle_t vga_reg_handle; int vga_wmode; term_color_t vga_curfg, vga_curbg; boolean_t vga_enabled; }; /* Convenience macros. */ #define MEM_READ1(sc, ofs) \ bus_space_read_1(sc->vga_fb_tag, sc->vga_fb_handle, ofs) #define MEM_WRITE1(sc, ofs, val) \ bus_space_write_1(sc->vga_fb_tag, sc->vga_fb_handle, ofs, val) #define REG_READ1(sc, reg) \ bus_space_read_1(sc->vga_reg_tag, sc->vga_reg_handle, reg) #define REG_WRITE1(sc, reg, val) \ bus_space_write_1(sc->vga_reg_tag, sc->vga_reg_handle, reg, val) #define VT_VGA_WIDTH 640 #define VT_VGA_HEIGHT 480 #define VT_VGA_MEMSIZE (VT_VGA_WIDTH * VT_VGA_HEIGHT / 8) /* * VGA is designed to handle 8 pixels at a time (8 pixels in one byte of * memory). */ #define VT_VGA_PIXELS_BLOCK 8 /* * We use an off-screen addresses to: * o store the background color; * o store pixels pattern. * Those addresses are then loaded in the latches once. */ #define VT_VGA_BGCOLOR_OFFSET VT_VGA_MEMSIZE static vd_probe_t vga_probe; static vd_init_t vga_init; static vd_blank_t vga_blank; static vd_bitblt_text_t vga_bitblt_text; static vd_bitblt_bmp_t vga_bitblt_bitmap; static vd_drawrect_t vga_drawrect; static vd_setpixel_t vga_setpixel; static vd_postswitch_t vga_postswitch; static const struct vt_driver vt_vga_driver = { .vd_name = "vga", .vd_probe = vga_probe, .vd_init = vga_init, .vd_blank = vga_blank, .vd_bitblt_text = vga_bitblt_text, .vd_bitblt_bmp = vga_bitblt_bitmap, .vd_drawrect = vga_drawrect, .vd_setpixel = vga_setpixel, .vd_postswitch = vga_postswitch, .vd_priority = VD_PRIORITY_GENERIC, }; /* * Driver supports both text mode and graphics mode. Make sure the * buffer is always big enough to support both. */ static struct vga_softc vga_conssoftc; VT_DRIVER_DECLARE(vt_vga, vt_vga_driver); static inline void vga_setwmode(struct vt_device *vd, int wmode) { struct vga_softc *sc = vd->vd_softc; if (sc->vga_wmode == wmode) return; REG_WRITE1(sc, VGA_GC_ADDRESS, VGA_GC_MODE); REG_WRITE1(sc, VGA_GC_DATA, wmode); sc->vga_wmode = wmode; switch (wmode) { case 3: /* Re-enable all plans. */ REG_WRITE1(sc, VGA_SEQ_ADDRESS, VGA_SEQ_MAP_MASK); REG_WRITE1(sc, VGA_SEQ_DATA, VGA_SEQ_MM_EM3 | VGA_SEQ_MM_EM2 | VGA_SEQ_MM_EM1 | VGA_SEQ_MM_EM0); break; } } static inline void vga_setfg(struct vt_device *vd, term_color_t color) { struct vga_softc *sc = vd->vd_softc; vga_setwmode(vd, 3); if (sc->vga_curfg == color) return; REG_WRITE1(sc, VGA_GC_ADDRESS, VGA_GC_SET_RESET); REG_WRITE1(sc, VGA_GC_DATA, color); sc->vga_curfg = color; } static inline void vga_setbg(struct vt_device *vd, term_color_t color) { struct vga_softc *sc = vd->vd_softc; vga_setwmode(vd, 3); if (sc->vga_curbg == color) return; REG_WRITE1(sc, VGA_GC_ADDRESS, VGA_GC_SET_RESET); REG_WRITE1(sc, VGA_GC_DATA, color); /* * Write 8 pixels using the background color to an off-screen * byte in the video memory. */ MEM_WRITE1(sc, VT_VGA_BGCOLOR_OFFSET, 0xff); /* * Read those 8 pixels back to load the background color in the * latches register. */ MEM_READ1(sc, VT_VGA_BGCOLOR_OFFSET); sc->vga_curbg = color; /* * The Set/Reset register doesn't contain the fg color anymore, * store an invalid color. */ sc->vga_curfg = 0xff; } /* * Binary searchable table for Unicode to CP437 conversion. */ struct unicp437 { uint16_t unicode_base; uint8_t cp437_base; uint8_t length; }; static const struct unicp437 cp437table[] = { { 0x0020, 0x20, 0x5e }, { 0x00a0, 0x20, 0x00 }, { 0x00a1, 0xad, 0x00 }, { 0x00a2, 0x9b, 0x00 }, { 0x00a3, 0x9c, 0x00 }, { 0x00a5, 0x9d, 0x00 }, { 0x00a7, 0x15, 0x00 }, { 0x00aa, 0xa6, 0x00 }, { 0x00ab, 0xae, 0x00 }, { 0x00ac, 0xaa, 0x00 }, { 0x00b0, 0xf8, 0x00 }, { 0x00b1, 0xf1, 0x00 }, { 0x00b2, 0xfd, 0x00 }, { 0x00b5, 0xe6, 0x00 }, { 0x00b6, 0x14, 0x00 }, { 0x00b7, 0xfa, 0x00 }, { 0x00ba, 0xa7, 0x00 }, { 0x00bb, 0xaf, 0x00 }, { 0x00bc, 0xac, 0x00 }, { 0x00bd, 0xab, 0x00 }, { 0x00bf, 0xa8, 0x00 }, { 0x00c4, 0x8e, 0x01 }, { 0x00c6, 0x92, 0x00 }, { 0x00c7, 0x80, 0x00 }, { 0x00c9, 0x90, 0x00 }, { 0x00d1, 0xa5, 0x00 }, { 0x00d6, 0x99, 0x00 }, { 0x00dc, 0x9a, 0x00 }, { 0x00df, 0xe1, 0x00 }, { 0x00e0, 0x85, 0x00 }, { 0x00e1, 0xa0, 0x00 }, { 0x00e2, 0x83, 0x00 }, { 0x00e4, 0x84, 0x00 }, { 0x00e5, 0x86, 0x00 }, { 0x00e6, 0x91, 0x00 }, { 0x00e7, 0x87, 0x00 }, { 0x00e8, 0x8a, 0x00 }, { 0x00e9, 0x82, 0x00 }, { 0x00ea, 0x88, 0x01 }, { 0x00ec, 0x8d, 0x00 }, { 0x00ed, 0xa1, 0x00 }, { 0x00ee, 0x8c, 0x00 }, { 0x00ef, 0x8b, 0x00 }, { 0x00f0, 0xeb, 0x00 }, { 0x00f1, 0xa4, 0x00 }, { 0x00f2, 0x95, 0x00 }, { 0x00f3, 0xa2, 0x00 }, { 0x00f4, 0x93, 0x00 }, { 0x00f6, 0x94, 0x00 }, { 0x00f7, 0xf6, 0x00 }, { 0x00f8, 0xed, 0x00 }, { 0x00f9, 0x97, 0x00 }, { 0x00fa, 0xa3, 0x00 }, { 0x00fb, 0x96, 0x00 }, { 0x00fc, 0x81, 0x00 }, { 0x00ff, 0x98, 0x00 }, { 0x0192, 0x9f, 0x00 }, { 0x0393, 0xe2, 0x00 }, { 0x0398, 0xe9, 0x00 }, { 0x03a3, 0xe4, 0x00 }, { 0x03a6, 0xe8, 0x00 }, { 0x03a9, 0xea, 0x00 }, { 0x03b1, 0xe0, 0x01 }, { 0x03b4, 0xeb, 0x00 }, { 0x03b5, 0xee, 0x00 }, { 0x03bc, 0xe6, 0x00 }, { 0x03c0, 0xe3, 0x00 }, { 0x03c3, 0xe5, 0x00 }, { 0x03c4, 0xe7, 0x00 }, { 0x03c6, 0xed, 0x00 }, { 0x03d5, 0xed, 0x00 }, { 0x2010, 0x2d, 0x00 }, { 0x2014, 0x2d, 0x00 }, { 0x2018, 0x60, 0x00 }, { 0x2019, 0x27, 0x00 }, { 0x201c, 0x22, 0x00 }, { 0x201d, 0x22, 0x00 }, { 0x2022, 0x07, 0x00 }, { 0x203c, 0x13, 0x00 }, { 0x207f, 0xfc, 0x00 }, { 0x20a7, 0x9e, 0x00 }, { 0x20ac, 0xee, 0x00 }, { 0x2126, 0xea, 0x00 }, { 0x2190, 0x1b, 0x00 }, { 0x2191, 0x18, 0x00 }, { 0x2192, 0x1a, 0x00 }, { 0x2193, 0x19, 0x00 }, { 0x2194, 0x1d, 0x00 }, { 0x2195, 0x12, 0x00 }, { 0x21a8, 0x17, 0x00 }, { 0x2202, 0xeb, 0x00 }, { 0x2208, 0xee, 0x00 }, { 0x2211, 0xe4, 0x00 }, { 0x2212, 0x2d, 0x00 }, { 0x2219, 0xf9, 0x00 }, { 0x221a, 0xfb, 0x00 }, { 0x221e, 0xec, 0x00 }, { 0x221f, 0x1c, 0x00 }, { 0x2229, 0xef, 0x00 }, { 0x2248, 0xf7, 0x00 }, { 0x2261, 0xf0, 0x00 }, { 0x2264, 0xf3, 0x00 }, { 0x2265, 0xf2, 0x00 }, { 0x2302, 0x7f, 0x00 }, { 0x2310, 0xa9, 0x00 }, { 0x2320, 0xf4, 0x00 }, { 0x2321, 0xf5, 0x00 }, { 0x2500, 0xc4, 0x00 }, { 0x2502, 0xb3, 0x00 }, { 0x250c, 0xda, 0x00 }, { 0x2510, 0xbf, 0x00 }, { 0x2514, 0xc0, 0x00 }, { 0x2518, 0xd9, 0x00 }, { 0x251c, 0xc3, 0x00 }, { 0x2524, 0xb4, 0x00 }, { 0x252c, 0xc2, 0x00 }, { 0x2534, 0xc1, 0x00 }, { 0x253c, 0xc5, 0x00 }, { 0x2550, 0xcd, 0x00 }, { 0x2551, 0xba, 0x00 }, { 0x2552, 0xd5, 0x00 }, { 0x2553, 0xd6, 0x00 }, { 0x2554, 0xc9, 0x00 }, { 0x2555, 0xb8, 0x00 }, { 0x2556, 0xb7, 0x00 }, { 0x2557, 0xbb, 0x00 }, { 0x2558, 0xd4, 0x00 }, { 0x2559, 0xd3, 0x00 }, { 0x255a, 0xc8, 0x00 }, { 0x255b, 0xbe, 0x00 }, { 0x255c, 0xbd, 0x00 }, { 0x255d, 0xbc, 0x00 }, { 0x255e, 0xc6, 0x01 }, { 0x2560, 0xcc, 0x00 }, { 0x2561, 0xb5, 0x00 }, { 0x2562, 0xb6, 0x00 }, { 0x2563, 0xb9, 0x00 }, { 0x2564, 0xd1, 0x01 }, { 0x2566, 0xcb, 0x00 }, { 0x2567, 0xcf, 0x00 }, { 0x2568, 0xd0, 0x00 }, { 0x2569, 0xca, 0x00 }, { 0x256a, 0xd8, 0x00 }, { 0x256b, 0xd7, 0x00 }, { 0x256c, 0xce, 0x00 }, { 0x2580, 0xdf, 0x00 }, { 0x2584, 0xdc, 0x00 }, { 0x2588, 0xdb, 0x00 }, { 0x258c, 0xdd, 0x00 }, { 0x2590, 0xde, 0x00 }, { 0x2591, 0xb0, 0x02 }, { 0x25a0, 0xfe, 0x00 }, { 0x25ac, 0x16, 0x00 }, { 0x25b2, 0x1e, 0x00 }, { 0x25ba, 0x10, 0x00 }, { 0x25bc, 0x1f, 0x00 }, { 0x25c4, 0x11, 0x00 }, { 0x25cb, 0x09, 0x00 }, { 0x25d8, 0x08, 0x00 }, { 0x25d9, 0x0a, 0x00 }, { 0x263a, 0x01, 0x01 }, { 0x263c, 0x0f, 0x00 }, { 0x2640, 0x0c, 0x00 }, { 0x2642, 0x0b, 0x00 }, { 0x2660, 0x06, 0x00 }, { 0x2663, 0x05, 0x00 }, { 0x2665, 0x03, 0x01 }, { 0x266a, 0x0d, 0x00 }, { 0x266c, 0x0e, 0x00 }, }; static uint8_t vga_get_cp437(term_char_t c) { int min, mid, max; min = 0; max = (sizeof(cp437table) / sizeof(struct unicp437)) - 1; if (c < cp437table[0].unicode_base || c > cp437table[max].unicode_base + cp437table[max].length) return '?'; while (max >= min) { mid = (min + max) / 2; if (c < cp437table[mid].unicode_base) max = mid - 1; else if (c > cp437table[mid].unicode_base + cp437table[mid].length) min = mid + 1; else return (c - cp437table[mid].unicode_base + cp437table[mid].cp437_base); } return '?'; } static void vga_blank(struct vt_device *vd, term_color_t color) { struct vga_softc *sc = vd->vd_softc; u_int ofs; vga_setfg(vd, color); for (ofs = 0; ofs < VT_VGA_MEMSIZE; ofs++) MEM_WRITE1(sc, ofs, 0xff); } static inline void vga_bitblt_put(struct vt_device *vd, u_long dst, term_color_t color, uint8_t v) { struct vga_softc *sc = vd->vd_softc; /* Skip empty writes, in order to avoid palette changes. */ if (v != 0x00) { vga_setfg(vd, color); /* * When this MEM_READ1() gets disabled, all sorts of * artifacts occur. This is because this read loads the * set of 8 pixels that are about to be changed. There * is one scenario where we can avoid the read, namely * if all pixels are about to be overwritten anyway. */ if (v != 0xff) { MEM_READ1(sc, dst); /* The bg color was trashed by the reads. */ sc->vga_curbg = 0xff; } MEM_WRITE1(sc, dst, v); } } static void vga_setpixel(struct vt_device *vd, int x, int y, term_color_t color) { if (vd->vd_flags & VDF_TEXTMODE) return; vga_bitblt_put(vd, (y * VT_VGA_WIDTH / 8) + (x / 8), color, 0x80 >> (x % 8)); } static void vga_drawrect(struct vt_device *vd, int x1, int y1, int x2, int y2, int fill, term_color_t color) { int x, y; if (vd->vd_flags & VDF_TEXTMODE) return; for (y = y1; y <= y2; y++) { if (fill || (y == y1) || (y == y2)) { for (x = x1; x <= x2; x++) vga_setpixel(vd, x, y, color); } else { vga_setpixel(vd, x1, y, color); vga_setpixel(vd, x2, y, color); } } } static void vga_compute_shifted_pattern(const uint8_t *src, unsigned int bytes, unsigned int src_x, unsigned int x_count, unsigned int dst_x, uint8_t *pattern, uint8_t *mask) { unsigned int n; n = src_x / 8; /* * This mask has bits set, where a pixel (ether 0 or 1) * comes from the source bitmap. */ if (mask != NULL) { *mask = (0xff >> (8 - x_count)) << (8 - x_count - dst_x); } if (n == (src_x + x_count - 1) / 8) { /* All the pixels we want are in the same byte. */ *pattern = src[n]; if (dst_x >= src_x) *pattern >>= (dst_x - src_x % 8); else *pattern <<= (src_x % 8 - dst_x); } else { /* The pixels we want are split into two bytes. */ if (dst_x >= src_x % 8) { *pattern = src[n] << (8 - dst_x - src_x % 8) | src[n + 1] >> (dst_x - src_x % 8); } else { *pattern = src[n] << (src_x % 8 - dst_x) | src[n + 1] >> (8 - src_x % 8 - dst_x); } } } static void vga_copy_bitmap_portion(uint8_t *pattern_2colors, uint8_t *pattern_ncolors, const uint8_t *src, const uint8_t *src_mask, unsigned int src_width, unsigned int src_x, unsigned int dst_x, unsigned int x_count, unsigned int src_y, unsigned int dst_y, unsigned int y_count, term_color_t fg, term_color_t bg, int overwrite) { unsigned int i, bytes; uint8_t pattern, relevant_bits, mask; bytes = (src_width + 7) / 8; for (i = 0; i < y_count; ++i) { vga_compute_shifted_pattern(src + (src_y + i) * bytes, bytes, src_x, x_count, dst_x, &pattern, &relevant_bits); if (src_mask == NULL) { /* * No src mask. Consider that all wanted bits * from the source are "authoritative". */ mask = relevant_bits; } else { /* * There's an src mask. We shift it the same way * we shifted the source pattern. */ vga_compute_shifted_pattern( src_mask + (src_y + i) * bytes, bytes, src_x, x_count, dst_x, &mask, NULL); /* Now, only keep the wanted bits among them. */ mask &= relevant_bits; } /* * Clear bits from the pattern which must be * transparent, according to the source mask. */ pattern &= mask; /* Set the bits in the 2-colors array. */ if (overwrite) pattern_2colors[dst_y + i] &= ~mask; pattern_2colors[dst_y + i] |= pattern; if (pattern_ncolors == NULL) continue; /* * Set the same bits in the n-colors array. This one * supports transparency, when a given bit is cleared in * all colors. */ if (overwrite) { /* * Ensure that the pixels used by this bitmap are * cleared in other colors. */ for (int j = 0; j < 16; ++j) pattern_ncolors[(dst_y + i) * 16 + j] &= ~mask; } pattern_ncolors[(dst_y + i) * 16 + fg] |= pattern; pattern_ncolors[(dst_y + i) * 16 + bg] |= (~pattern & mask); } } static void vga_bitblt_pixels_block_2colors(struct vt_device *vd, const uint8_t *masks, term_color_t fg, term_color_t bg, unsigned int x, unsigned int y, unsigned int height) { unsigned int i, offset; struct vga_softc *sc; /* * The great advantage of Write Mode 3 is that we just need * to load the foreground in the Set/Reset register, load the * background color in the latches register (this is done * through a write in offscreen memory followed by a read of * that data), then write the pattern to video memory. This * pattern indicates if the pixel should use the foreground * color (bit set) or the background color (bit cleared). */ vga_setbg(vd, bg); vga_setfg(vd, fg); sc = vd->vd_softc; offset = (VT_VGA_WIDTH * y + x) / 8; for (i = 0; i < height; ++i, offset += VT_VGA_WIDTH / 8) { MEM_WRITE1(sc, offset, masks[i]); } } static void vga_bitblt_pixels_block_ncolors(struct vt_device *vd, const uint8_t *masks, unsigned int x, unsigned int y, unsigned int height) { unsigned int i, j, plan, color, offset; struct vga_softc *sc; uint8_t mask, plans[height * 4]; sc = vd->vd_softc; memset(plans, 0, sizeof(plans)); /* * To write a group of pixels using 3 or more colors, we select * Write Mode 0 and write one byte to each plan separately. */ /* * We first compute each byte: each plan contains one bit of the * color code for each of the 8 pixels. * * For example, if the 8 pixels are like this: * GBBBBBBY * where: * G (gray) = 0b0111 * B (black) = 0b0000 * Y (yellow) = 0b0011 * * The corresponding for bytes are: * GBBBBBBY * Plan 0: 10000001 = 0x81 * Plan 1: 10000001 = 0x81 * Plan 2: 10000000 = 0x80 * Plan 3: 00000000 = 0x00 * | | | * | | +-> 0b0011 (Y) * | +-----> 0b0000 (B) * +--------> 0b0111 (G) */ for (i = 0; i < height; ++i) { for (color = 0; color < 16; ++color) { mask = masks[i * 16 + color]; if (mask == 0x00) continue; for (j = 0; j < 8; ++j) { if (!((mask >> (7 - j)) & 0x1)) continue; /* The pixel "j" uses color "color". */ for (plan = 0; plan < 4; ++plan) plans[i * 4 + plan] |= ((color >> plan) & 0x1) << (7 - j); } } } /* * The bytes are ready: we now switch to Write Mode 0 and write * all bytes, one plan at a time. */ vga_setwmode(vd, 0); REG_WRITE1(sc, VGA_SEQ_ADDRESS, VGA_SEQ_MAP_MASK); for (plan = 0; plan < 4; ++plan) { /* Select plan. */ REG_WRITE1(sc, VGA_SEQ_DATA, 1 << plan); /* Write all bytes for this plan, from Y to Y+height. */ for (i = 0; i < height; ++i) { offset = (VT_VGA_WIDTH * (y + i) + x) / 8; MEM_WRITE1(sc, offset, plans[i * 4 + plan]); } } } static void vga_bitblt_one_text_pixels_block(struct vt_device *vd, const struct vt_window *vw, unsigned int x, unsigned int y) { const struct vt_buf *vb; const struct vt_font *vf; unsigned int i, col, row, src_x, x_count; unsigned int used_colors_list[16], used_colors; uint8_t pattern_2colors[vw->vw_font->vf_height]; uint8_t pattern_ncolors[vw->vw_font->vf_height * 16]; term_char_t c; term_color_t fg, bg; const uint8_t *src; vb = &vw->vw_buf; vf = vw->vw_font; /* * The current pixels block. * * We fill it with portions of characters, because both "grids" * may not match. * * i is the index in this pixels block. */ i = x; used_colors = 0; memset(used_colors_list, 0, sizeof(used_colors_list)); memset(pattern_2colors, 0, sizeof(pattern_2colors)); memset(pattern_ncolors, 0, sizeof(pattern_ncolors)); if (i < vw->vw_draw_area.tr_begin.tp_col) { /* * i is in the margin used to center the text area on * the screen. */ i = vw->vw_draw_area.tr_begin.tp_col; } while (i < x + VT_VGA_PIXELS_BLOCK && i < vw->vw_draw_area.tr_end.tp_col) { /* * Find which character is drawn on this pixel in the * pixels block. * * While here, record what colors it uses. */ col = (i - vw->vw_draw_area.tr_begin.tp_col) / vf->vf_width; row = (y - vw->vw_draw_area.tr_begin.tp_row) / vf->vf_height; c = VTBUF_GET_FIELD(vb, row, col); src = vtfont_lookup(vf, c); vt_determine_colors(c, VTBUF_ISCURSOR(vb, row, col), &fg, &bg); if ((used_colors_list[fg] & 0x1) != 0x1) used_colors++; if ((used_colors_list[bg] & 0x2) != 0x2) used_colors++; used_colors_list[fg] |= 0x1; used_colors_list[bg] |= 0x2; /* * Compute the portion of the character we want to draw, * because the pixels block may start in the middle of a * character. * * The first pixel to draw in the character is * the current position - * the start position of the character * * The last pixel to draw is either * - the last pixel of the character, or * - the pixel of the character matching the end of * the pixels block * whichever comes first. This position is then * changed to be relative to the start position of the * character. */ src_x = i - (col * vf->vf_width + vw->vw_draw_area.tr_begin.tp_col); x_count = min(min( (col + 1) * vf->vf_width + vw->vw_draw_area.tr_begin.tp_col, x + VT_VGA_PIXELS_BLOCK), vw->vw_draw_area.tr_end.tp_col); x_count -= col * vf->vf_width + vw->vw_draw_area.tr_begin.tp_col; x_count -= src_x; /* Copy a portion of the character. */ vga_copy_bitmap_portion(pattern_2colors, pattern_ncolors, src, NULL, vf->vf_width, src_x, i % VT_VGA_PIXELS_BLOCK, x_count, 0, 0, vf->vf_height, fg, bg, 0); /* We move to the next portion. */ i += x_count; } #ifndef SC_NO_CUTPASTE /* * Copy the mouse pointer bitmap if it's over the current pixels * block. * * We use the saved cursor position (saved in vt_flush()), because * the current position could be different than the one used * to mark the area dirty. */ term_rect_t drawn_area; drawn_area.tr_begin.tp_col = x; drawn_area.tr_begin.tp_row = y; drawn_area.tr_end.tp_col = x + VT_VGA_PIXELS_BLOCK; drawn_area.tr_end.tp_row = y + vf->vf_height; if (vd->vd_mshown && vt_is_cursor_in_area(vd, &drawn_area)) { struct vt_mouse_cursor *cursor; unsigned int mx, my; unsigned int dst_x, src_y, dst_y, y_count; cursor = vd->vd_mcursor; mx = vd->vd_mx_drawn + vw->vw_draw_area.tr_begin.tp_col; my = vd->vd_my_drawn + vw->vw_draw_area.tr_begin.tp_row; /* Compute the portion of the cursor we want to copy. */ src_x = x > mx ? x - mx : 0; dst_x = mx > x ? mx - x : 0; x_count = min(min(min( cursor->width - src_x, x + VT_VGA_PIXELS_BLOCK - mx), vw->vw_draw_area.tr_end.tp_col - mx), VT_VGA_PIXELS_BLOCK); /* * The cursor isn't aligned on the Y-axis with * characters, so we need to compute the vertical * start/count. */ src_y = y > my ? y - my : 0; dst_y = my > y ? my - y : 0; y_count = min( min(cursor->height - src_y, y + vf->vf_height - my), vf->vf_height); /* Copy the cursor portion. */ vga_copy_bitmap_portion(pattern_2colors, pattern_ncolors, cursor->map, cursor->mask, cursor->width, src_x, dst_x, x_count, src_y, dst_y, y_count, vd->vd_mcursor_fg, vd->vd_mcursor_bg, 1); if ((used_colors_list[vd->vd_mcursor_fg] & 0x1) != 0x1) used_colors++; if ((used_colors_list[vd->vd_mcursor_bg] & 0x2) != 0x2) used_colors++; } #endif /* * The pixels block is completed, we can now draw it on the * screen. */ if (used_colors == 2) vga_bitblt_pixels_block_2colors(vd, pattern_2colors, fg, bg, x, y, vf->vf_height); else vga_bitblt_pixels_block_ncolors(vd, pattern_ncolors, x, y, vf->vf_height); } static void vga_bitblt_text_gfxmode(struct vt_device *vd, const struct vt_window *vw, const term_rect_t *area) { const struct vt_font *vf; unsigned int col, row; unsigned int x1, y1, x2, y2, x, y; vf = vw->vw_font; /* * Compute the top-left pixel position aligned with the video * adapter pixels block size. * * This is calculated from the top-left column of te dirty area: * * 1. Compute the top-left pixel of the character: * col * font width + x offset * * NOTE: x offset is used to center the text area on the * screen. It's expressed in pixels, not in characters * col/row! * * 2. Find the pixel further on the left marking the start of * an aligned pixels block (eg. chunk of 8 pixels): * character's x / blocksize * blocksize * * The division, being made on integers, achieves the * alignment. * * For the Y-axis, we need to compute the character's y * coordinate, but we don't need to align it. */ col = area->tr_begin.tp_col; row = area->tr_begin.tp_row; x1 = (int)((col * vf->vf_width + vw->vw_draw_area.tr_begin.tp_col) / VT_VGA_PIXELS_BLOCK) * VT_VGA_PIXELS_BLOCK; y1 = row * vf->vf_height + vw->vw_draw_area.tr_begin.tp_row; /* * Compute the bottom right pixel position, again, aligned with * the pixels block size. * * The same rules apply, we just add 1 to base the computation * on the "right border" of the dirty area. */ col = area->tr_end.tp_col; row = area->tr_end.tp_row; x2 = (int)((col * vf->vf_width + vw->vw_draw_area.tr_begin.tp_col + VT_VGA_PIXELS_BLOCK - 1) / VT_VGA_PIXELS_BLOCK) * VT_VGA_PIXELS_BLOCK; y2 = row * vf->vf_height + vw->vw_draw_area.tr_begin.tp_row; /* Clip the area to the screen size. */ x2 = min(x2, vw->vw_draw_area.tr_end.tp_col); y2 = min(y2, vw->vw_draw_area.tr_end.tp_row); /* * Now, we take care of N pixels line at a time (the first for * loop, N = font height), and for these lines, draw one pixels * block at a time (the second for loop), not a character at a * time. * * Therefore, on the X-axis, characters my be drawn partially if * they are not aligned on 8-pixels boundary. * * However, the operation is repeated for the full height of the * font before moving to the next character, because it allows * to keep the color settings and write mode, before perhaps * changing them with the next one. */ for (y = y1; y < y2; y += vf->vf_height) { for (x = x1; x < x2; x += VT_VGA_PIXELS_BLOCK) { vga_bitblt_one_text_pixels_block(vd, vw, x, y); } } } static void vga_bitblt_text_txtmode(struct vt_device *vd, const struct vt_window *vw, const term_rect_t *area) { struct vga_softc *sc; const struct vt_buf *vb; unsigned int col, row; term_char_t c; term_color_t fg, bg; uint8_t ch, attr; sc = vd->vd_softc; vb = &vw->vw_buf; for (row = area->tr_begin.tp_row; row < area->tr_end.tp_row; ++row) { for (col = area->tr_begin.tp_col; col < area->tr_end.tp_col; ++col) { /* * Get next character and its associated fg/bg * colors. */ c = VTBUF_GET_FIELD(vb, row, col); vt_determine_colors(c, VTBUF_ISCURSOR(vb, row, col), &fg, &bg); /* * Convert character to CP437, which is the * character set used by the VGA hardware by * default. */ ch = vga_get_cp437(TCHAR_CHARACTER(c)); /* Convert colors to VGA attributes. */ attr = bg << 4 | fg; MEM_WRITE1(sc, 0x18000 + (row * 80 + col) * 2 + 0, ch); MEM_WRITE1(sc, 0x18000 + (row * 80 + col) * 2 + 1, attr); } } } static void vga_bitblt_text(struct vt_device *vd, const struct vt_window *vw, const term_rect_t *area) { if (!(vd->vd_flags & VDF_TEXTMODE)) { vga_bitblt_text_gfxmode(vd, vw, area); } else { vga_bitblt_text_txtmode(vd, vw, area); } } static void vga_bitblt_bitmap(struct vt_device *vd, const struct vt_window *vw, const uint8_t *pattern, const uint8_t *mask, unsigned int width, unsigned int height, unsigned int x, unsigned int y, term_color_t fg, term_color_t bg) { unsigned int x1, y1, x2, y2, i, j, src_x, dst_x, x_count; uint8_t pattern_2colors; /* Align coordinates with the 8-pxels grid. */ x1 = x / VT_VGA_PIXELS_BLOCK * VT_VGA_PIXELS_BLOCK; y1 = y; x2 = (x + width + VT_VGA_PIXELS_BLOCK - 1) / VT_VGA_PIXELS_BLOCK * VT_VGA_PIXELS_BLOCK; y2 = y + height; x2 = min(x2, vd->vd_width - 1); y2 = min(y2, vd->vd_height - 1); for (j = y1; j < y2; ++j) { src_x = 0; dst_x = x - x1; x_count = VT_VGA_PIXELS_BLOCK - dst_x; for (i = x1; i < x2; i += VT_VGA_PIXELS_BLOCK) { pattern_2colors = 0; vga_copy_bitmap_portion( &pattern_2colors, NULL, pattern, mask, width, src_x, dst_x, x_count, j - y1, 0, 1, fg, bg, 0); vga_bitblt_pixels_block_2colors(vd, &pattern_2colors, fg, bg, i, j, 1); src_x += x_count; dst_x = (dst_x + x_count) % VT_VGA_PIXELS_BLOCK; x_count = min(width - src_x, VT_VGA_PIXELS_BLOCK); } } } static void vga_initialize_graphics(struct vt_device *vd) { struct vga_softc *sc = vd->vd_softc; /* Clock select. */ REG_WRITE1(sc, VGA_GEN_MISC_OUTPUT_W, VGA_GEN_MO_VSP | VGA_GEN_MO_HSP | VGA_GEN_MO_PB | VGA_GEN_MO_ER | VGA_GEN_MO_IOA); /* Set sequencer clocking and memory mode. */ REG_WRITE1(sc, VGA_SEQ_ADDRESS, VGA_SEQ_CLOCKING_MODE); REG_WRITE1(sc, VGA_SEQ_DATA, VGA_SEQ_CM_89); REG_WRITE1(sc, VGA_SEQ_ADDRESS, VGA_SEQ_MEMORY_MODE); REG_WRITE1(sc, VGA_SEQ_DATA, VGA_SEQ_MM_OE | VGA_SEQ_MM_EM); /* Set the graphics controller in graphics mode. */ REG_WRITE1(sc, VGA_GC_ADDRESS, VGA_GC_MISCELLANEOUS); REG_WRITE1(sc, VGA_GC_DATA, 0x04 + VGA_GC_MISC_GA); /* Program the CRT controller. */ REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_HORIZ_TOTAL); REG_WRITE1(sc, VGA_CRTC_DATA, 0x5f); /* 760 */ REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_HORIZ_DISP_END); REG_WRITE1(sc, VGA_CRTC_DATA, 0x4f); /* 640 - 8 */ REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_START_HORIZ_BLANK); REG_WRITE1(sc, VGA_CRTC_DATA, 0x50); /* 640 */ REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_END_HORIZ_BLANK); REG_WRITE1(sc, VGA_CRTC_DATA, VGA_CRTC_EHB_CR + 2); REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_START_HORIZ_RETRACE); REG_WRITE1(sc, VGA_CRTC_DATA, 0x54); /* 672 */ REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_END_HORIZ_RETRACE); REG_WRITE1(sc, VGA_CRTC_DATA, VGA_CRTC_EHR_EHB + 0); REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_VERT_TOTAL); REG_WRITE1(sc, VGA_CRTC_DATA, 0x0b); /* 523 */ REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_OVERFLOW); REG_WRITE1(sc, VGA_CRTC_DATA, VGA_CRTC_OF_VT9 | VGA_CRTC_OF_LC8 | VGA_CRTC_OF_VBS8 | VGA_CRTC_OF_VRS8 | VGA_CRTC_OF_VDE8); REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_MAX_SCAN_LINE); REG_WRITE1(sc, VGA_CRTC_DATA, VGA_CRTC_MSL_LC9); REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_VERT_RETRACE_START); REG_WRITE1(sc, VGA_CRTC_DATA, 0xea); /* 480 + 10 */ REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_VERT_RETRACE_END); REG_WRITE1(sc, VGA_CRTC_DATA, 0x0c); REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_VERT_DISPLAY_END); REG_WRITE1(sc, VGA_CRTC_DATA, 0xdf); /* 480 - 1*/ REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_OFFSET); REG_WRITE1(sc, VGA_CRTC_DATA, 0x28); REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_START_VERT_BLANK); REG_WRITE1(sc, VGA_CRTC_DATA, 0xe7); /* 480 + 7 */ REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_END_VERT_BLANK); REG_WRITE1(sc, VGA_CRTC_DATA, 0x04); REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_MODE_CONTROL); REG_WRITE1(sc, VGA_CRTC_DATA, VGA_CRTC_MC_WB | VGA_CRTC_MC_AW | VGA_CRTC_MC_SRS | VGA_CRTC_MC_CMS); REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_LINE_COMPARE); REG_WRITE1(sc, VGA_CRTC_DATA, 0xff); /* 480 + 31 */ REG_WRITE1(sc, VGA_GEN_FEATURE_CTRL_W, 0); REG_WRITE1(sc, VGA_SEQ_ADDRESS, VGA_SEQ_MAP_MASK); REG_WRITE1(sc, VGA_SEQ_DATA, VGA_SEQ_MM_EM3 | VGA_SEQ_MM_EM2 | VGA_SEQ_MM_EM1 | VGA_SEQ_MM_EM0); REG_WRITE1(sc, VGA_SEQ_ADDRESS, VGA_SEQ_CHAR_MAP_SELECT); REG_WRITE1(sc, VGA_SEQ_DATA, 0); REG_WRITE1(sc, VGA_GC_ADDRESS, VGA_GC_SET_RESET); REG_WRITE1(sc, VGA_GC_DATA, 0); REG_WRITE1(sc, VGA_GC_ADDRESS, VGA_GC_ENABLE_SET_RESET); REG_WRITE1(sc, VGA_GC_DATA, 0x0f); REG_WRITE1(sc, VGA_GC_ADDRESS, VGA_GC_COLOR_COMPARE); REG_WRITE1(sc, VGA_GC_DATA, 0); REG_WRITE1(sc, VGA_GC_ADDRESS, VGA_GC_DATA_ROTATE); REG_WRITE1(sc, VGA_GC_DATA, 0); REG_WRITE1(sc, VGA_GC_ADDRESS, VGA_GC_READ_MAP_SELECT); REG_WRITE1(sc, VGA_GC_DATA, 0); REG_WRITE1(sc, VGA_GC_ADDRESS, VGA_GC_MODE); REG_WRITE1(sc, VGA_GC_DATA, 0); REG_WRITE1(sc, VGA_GC_ADDRESS, VGA_GC_COLOR_DONT_CARE); REG_WRITE1(sc, VGA_GC_DATA, 0x0f); REG_WRITE1(sc, VGA_GC_ADDRESS, VGA_GC_BIT_MASK); REG_WRITE1(sc, VGA_GC_DATA, 0xff); } static void vga_initialize(struct vt_device *vd, int textmode) { struct vga_softc *sc = vd->vd_softc; uint8_t x; /* Make sure the VGA adapter is not in monochrome emulation mode. */ x = REG_READ1(sc, VGA_GEN_MISC_OUTPUT_R); REG_WRITE1(sc, VGA_GEN_MISC_OUTPUT_W, x | VGA_GEN_MO_IOA); /* Unprotect CRTC registers 0-7. */ REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_VERT_RETRACE_END); x = REG_READ1(sc, VGA_CRTC_DATA); REG_WRITE1(sc, VGA_CRTC_DATA, x & ~VGA_CRTC_VRE_PR); /* * Wait for the vertical retrace. * NOTE: this code reads the VGA_GEN_INPUT_STAT_1 register, which has * the side-effect of clearing the internal flip-flip of the attribute * controller's write register. This means that because this code is * here, we know for sure that the first write to the attribute * controller will be a write to the address register. Removing this * code therefore also removes that guarantee and appropriate measures * need to be taken. */ do { x = REG_READ1(sc, VGA_GEN_INPUT_STAT_1); x &= VGA_GEN_IS1_VR | VGA_GEN_IS1_DE; } while (x != (VGA_GEN_IS1_VR | VGA_GEN_IS1_DE)); /* Now, disable the sync. signals. */ REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_MODE_CONTROL); x = REG_READ1(sc, VGA_CRTC_DATA); REG_WRITE1(sc, VGA_CRTC_DATA, x & ~VGA_CRTC_MC_HR); /* Asynchronous sequencer reset. */ REG_WRITE1(sc, VGA_SEQ_ADDRESS, VGA_SEQ_RESET); REG_WRITE1(sc, VGA_SEQ_DATA, VGA_SEQ_RST_SR); if (!textmode) vga_initialize_graphics(vd); REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_PRESET_ROW_SCAN); REG_WRITE1(sc, VGA_CRTC_DATA, 0); REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_CURSOR_START); REG_WRITE1(sc, VGA_CRTC_DATA, VGA_CRTC_CS_COO); REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_CURSOR_END); REG_WRITE1(sc, VGA_CRTC_DATA, 0); REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_START_ADDR_HIGH); REG_WRITE1(sc, VGA_CRTC_DATA, 0); REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_START_ADDR_LOW); REG_WRITE1(sc, VGA_CRTC_DATA, 0); REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_CURSOR_LOC_HIGH); REG_WRITE1(sc, VGA_CRTC_DATA, 0); REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_CURSOR_LOC_LOW); REG_WRITE1(sc, VGA_CRTC_DATA, 0x59); REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_UNDERLINE_LOC); REG_WRITE1(sc, VGA_CRTC_DATA, VGA_CRTC_UL_UL); if (textmode) { /* Set the attribute controller to blink disable. */ REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_MODE_CONTROL); REG_WRITE1(sc, VGA_AC_WRITE, 0); } else { /* Set the attribute controller in graphics mode. */ REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_MODE_CONTROL); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_MC_GA); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_HORIZ_PIXEL_PANNING); REG_WRITE1(sc, VGA_AC_WRITE, 0); } REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PALETTE(0)); REG_WRITE1(sc, VGA_AC_WRITE, 0); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PALETTE(1)); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PAL_R); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PALETTE(2)); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PAL_G); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PALETTE(3)); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PAL_SG | VGA_AC_PAL_R); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PALETTE(4)); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PAL_B); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PALETTE(5)); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PAL_R | VGA_AC_PAL_B); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PALETTE(6)); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PAL_G | VGA_AC_PAL_B); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PALETTE(7)); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PAL_R | VGA_AC_PAL_G | VGA_AC_PAL_B); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PALETTE(8)); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PAL_SR | VGA_AC_PAL_SG | VGA_AC_PAL_SB); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PALETTE(9)); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PAL_SR | VGA_AC_PAL_SG | VGA_AC_PAL_SB | VGA_AC_PAL_R); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PALETTE(10)); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PAL_SR | VGA_AC_PAL_SG | VGA_AC_PAL_SB | VGA_AC_PAL_G); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PALETTE(11)); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PAL_SR | VGA_AC_PAL_SG | VGA_AC_PAL_SB | VGA_AC_PAL_R | VGA_AC_PAL_G); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PALETTE(12)); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PAL_SR | VGA_AC_PAL_SG | VGA_AC_PAL_SB | VGA_AC_PAL_B); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PALETTE(13)); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PAL_SR | VGA_AC_PAL_SG | VGA_AC_PAL_SB | VGA_AC_PAL_R | VGA_AC_PAL_B); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PALETTE(14)); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PAL_SR | VGA_AC_PAL_SG | VGA_AC_PAL_SB | VGA_AC_PAL_G | VGA_AC_PAL_B); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PALETTE(15)); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_PAL_SR | VGA_AC_PAL_SG | VGA_AC_PAL_SB | VGA_AC_PAL_R | VGA_AC_PAL_G | VGA_AC_PAL_B); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_OVERSCAN_COLOR); REG_WRITE1(sc, VGA_AC_WRITE, 0); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_COLOR_PLANE_ENABLE); REG_WRITE1(sc, VGA_AC_WRITE, 0x0f); REG_WRITE1(sc, VGA_AC_WRITE, VGA_AC_COLOR_SELECT); REG_WRITE1(sc, VGA_AC_WRITE, 0); if (!textmode) { u_int ofs; /* * Done. Clear the frame buffer. All bit planes are * enabled, so a single-paged loop should clear all * planes. */ for (ofs = 0; ofs < VT_VGA_MEMSIZE; ofs++) { MEM_WRITE1(sc, ofs, 0); } } /* Re-enable the sequencer. */ REG_WRITE1(sc, VGA_SEQ_ADDRESS, VGA_SEQ_RESET); REG_WRITE1(sc, VGA_SEQ_DATA, VGA_SEQ_RST_SR | VGA_SEQ_RST_NAR); /* Re-enable the sync signals. */ REG_WRITE1(sc, VGA_CRTC_ADDRESS, VGA_CRTC_MODE_CONTROL); x = REG_READ1(sc, VGA_CRTC_DATA); REG_WRITE1(sc, VGA_CRTC_DATA, x | VGA_CRTC_MC_HR); if (!textmode) { /* Switch to write mode 3, because we'll mainly do bitblt. */ REG_WRITE1(sc, VGA_GC_ADDRESS, VGA_GC_MODE); REG_WRITE1(sc, VGA_GC_DATA, 3); sc->vga_wmode = 3; /* * In Write Mode 3, Enable Set/Reset is ignored, but we * use Write Mode 0 to write a group of 8 pixels using * 3 or more colors. In this case, we want to disable * Set/Reset: set Enable Set/Reset to 0. */ REG_WRITE1(sc, VGA_GC_ADDRESS, VGA_GC_ENABLE_SET_RESET); REG_WRITE1(sc, VGA_GC_DATA, 0x00); /* * Clear the colors we think are loaded into Set/Reset or * the latches. */ sc->vga_curfg = sc->vga_curbg = 0xff; } } static int vga_probe(struct vt_device *vd) { return (CN_INTERNAL); } static int vga_init(struct vt_device *vd) { struct vga_softc *sc; int textmode; if (vd->vd_softc == NULL) vd->vd_softc = (void *)&vga_conssoftc; sc = vd->vd_softc; textmode = 0; #if defined(__amd64__) || defined(__i386__) sc->vga_fb_tag = X86_BUS_SPACE_MEM; sc->vga_fb_handle = KERNBASE + VGA_MEM_BASE; sc->vga_reg_tag = X86_BUS_SPACE_IO; sc->vga_reg_handle = VGA_REG_BASE; #else # error "Architecture not yet supported!" #endif TUNABLE_INT_FETCH("hw.vga.textmode", &textmode); if (textmode) { vd->vd_flags |= VDF_TEXTMODE; vd->vd_width = 80; vd->vd_height = 25; } else { vd->vd_width = VT_VGA_WIDTH; vd->vd_height = VT_VGA_HEIGHT; } vga_initialize(vd, textmode); sc->vga_enabled = true; return (CN_INTERNAL); } static void vga_postswitch(struct vt_device *vd) { /* Reinit VGA mode, to restore view after app which change mode. */ vga_initialize(vd, (vd->vd_flags & VDF_TEXTMODE)); /* Ask vt(9) to update chars on visible area. */ vd->vd_flags |= VDF_INVALID; } /* Dummy NewBus functions to reserve the resources used by the vt_vga driver */ static void vtvga_identify(driver_t *driver, device_t parent) { if (!vga_conssoftc.vga_enabled) return; if (BUS_ADD_CHILD(parent, 0, driver->name, 0) == NULL) panic("Unable to attach vt_vga console"); } static int vtvga_probe(device_t dev) { - device_set_desc(dev, "vt_vga driver"); + device_set_desc(dev, "VT VGA driver"); + return (BUS_PROBE_NOWILDCARD); } static int vtvga_attach(device_t dev) { struct resource *pseudo_phys_res; int res_id; res_id = 0; pseudo_phys_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &res_id, VGA_MEM_BASE, VGA_MEM_BASE + VGA_MEM_SIZE - 1, VGA_MEM_SIZE, RF_ACTIVE); if (pseudo_phys_res == NULL) panic("Unable to reserve vt_vga memory"); return (0); } /*-------------------- Private Device Attachment Data -----------------------*/ static device_method_t vtvga_methods[] = { /* Device interface */ DEVMETHOD(device_identify, vtvga_identify), DEVMETHOD(device_probe, vtvga_probe), DEVMETHOD(device_attach, vtvga_attach), DEVMETHOD_END }; DEFINE_CLASS_0(vtvga, vtvga_driver, vtvga_methods, 0); devclass_t vtvga_devclass; DRIVER_MODULE(vtvga, nexus, vtvga_driver, vtvga_devclass, NULL, NULL); Index: projects/building-blocks/sys/kern/subr_bus.c =================================================================== --- projects/building-blocks/sys/kern/subr_bus.c (revision 278311) +++ projects/building-blocks/sys/kern/subr_bus.c (revision 278312) @@ -1,5033 +1,5024 @@ /*- * Copyright (c) 1997,1998,2003 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_bus.h" #include "opt_random.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_NODE(_hw, OID_AUTO, bus, CTLFLAG_RW, NULL, NULL); SYSCTL_ROOT_NODE(OID_AUTO, dev, CTLFLAG_RW, NULL, NULL); /* * Used to attach drivers to devclasses. */ typedef struct driverlink *driverlink_t; struct driverlink { kobj_class_t driver; TAILQ_ENTRY(driverlink) link; /* list of drivers in devclass */ int pass; TAILQ_ENTRY(driverlink) passlink; }; /* * Forward declarations */ typedef TAILQ_HEAD(devclass_list, devclass) devclass_list_t; typedef TAILQ_HEAD(driver_list, driverlink) driver_list_t; typedef TAILQ_HEAD(device_list, device) device_list_t; struct devclass { TAILQ_ENTRY(devclass) link; devclass_t parent; /* parent in devclass hierarchy */ driver_list_t drivers; /* bus devclasses store drivers for bus */ char *name; device_t *devices; /* array of devices indexed by unit */ int maxunit; /* size of devices array */ int flags; #define DC_HAS_CHILDREN 1 struct sysctl_ctx_list sysctl_ctx; struct sysctl_oid *sysctl_tree; }; /** * @brief Implementation of device. */ struct device { /* * A device is a kernel object. The first field must be the * current ops table for the object. */ KOBJ_FIELDS; /* * Device hierarchy. */ TAILQ_ENTRY(device) link; /**< list of devices in parent */ TAILQ_ENTRY(device) devlink; /**< global device list membership */ device_t parent; /**< parent of this device */ device_list_t children; /**< list of child devices */ /* * Details of this device. */ driver_t *driver; /**< current driver */ devclass_t devclass; /**< current device class */ int unit; /**< current unit number */ char* nameunit; /**< name+unit e.g. foodev0 */ char* desc; /**< driver specific description */ int busy; /**< count of calls to device_busy() */ device_state_t state; /**< current device state */ uint32_t devflags; /**< api level flags for device_get_flags() */ u_int flags; /**< internal device flags */ -#define DF_ENABLED 0x01 /* device should be probed/attached */ -#define DF_FIXEDCLASS 0x02 /* devclass specified at create time */ -#define DF_WILDCARD 0x04 /* unit was originally wildcard */ -#define DF_DESCMALLOCED 0x08 /* description was malloced */ -#define DF_QUIET 0x10 /* don't print verbose attach message */ -#define DF_DONENOMATCH 0x20 /* don't execute DEVICE_NOMATCH again */ -#define DF_EXTERNALSOFTC 0x40 /* softc not allocated by us */ -#define DF_REBID 0x80 /* Can rebid after attach */ -#define DF_SUSPENDED 0x100 /* Device is suspended. */ u_int order; /**< order from device_add_child_ordered() */ void *ivars; /**< instance variables */ void *softc; /**< current driver's variables */ struct sysctl_ctx_list sysctl_ctx; /**< state for sysctl variables */ struct sysctl_oid *sysctl_tree; /**< state for sysctl variables */ }; static MALLOC_DEFINE(M_BUS, "bus", "Bus data structures"); static MALLOC_DEFINE(M_BUS_SC, "bus-sc", "Bus data structures, softc"); #ifdef BUS_DEBUG static int bus_debug = 1; SYSCTL_INT(_debug, OID_AUTO, bus_debug, CTLFLAG_RWTUN, &bus_debug, 0, "Bus debug level"); #define PDEBUG(a) if (bus_debug) {printf("%s:%d: ", __func__, __LINE__), printf a; printf("\n");} #define DEVICENAME(d) ((d)? device_get_name(d): "no device") #define DRIVERNAME(d) ((d)? d->name : "no driver") #define DEVCLANAME(d) ((d)? d->name : "no devclass") /** * Produce the indenting, indent*2 spaces plus a '.' ahead of that to * prevent syslog from deleting initial spaces */ #define indentprintf(p) do { int iJ; printf("."); for (iJ=0; iJparent ? dc->parent->name : ""; break; default: return (EINVAL); } return (SYSCTL_OUT(req, value, strlen(value))); } static void devclass_sysctl_init(devclass_t dc) { if (dc->sysctl_tree != NULL) return; sysctl_ctx_init(&dc->sysctl_ctx); dc->sysctl_tree = SYSCTL_ADD_NODE(&dc->sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dev), OID_AUTO, dc->name, CTLFLAG_RD, NULL, ""); SYSCTL_ADD_PROC(&dc->sysctl_ctx, SYSCTL_CHILDREN(dc->sysctl_tree), OID_AUTO, "%parent", CTLTYPE_STRING | CTLFLAG_RD, dc, DEVCLASS_SYSCTL_PARENT, devclass_sysctl_handler, "A", "parent class"); } enum { DEVICE_SYSCTL_DESC, DEVICE_SYSCTL_DRIVER, DEVICE_SYSCTL_LOCATION, DEVICE_SYSCTL_PNPINFO, DEVICE_SYSCTL_PARENT, }; static int device_sysctl_handler(SYSCTL_HANDLER_ARGS) { device_t dev = (device_t)arg1; const char *value; char *buf; int error; buf = NULL; switch (arg2) { case DEVICE_SYSCTL_DESC: value = dev->desc ? dev->desc : ""; break; case DEVICE_SYSCTL_DRIVER: value = dev->driver ? dev->driver->name : ""; break; case DEVICE_SYSCTL_LOCATION: value = buf = malloc(1024, M_BUS, M_WAITOK | M_ZERO); bus_child_location_str(dev, buf, 1024); break; case DEVICE_SYSCTL_PNPINFO: value = buf = malloc(1024, M_BUS, M_WAITOK | M_ZERO); bus_child_pnpinfo_str(dev, buf, 1024); break; case DEVICE_SYSCTL_PARENT: value = dev->parent ? dev->parent->nameunit : ""; break; default: return (EINVAL); } error = SYSCTL_OUT(req, value, strlen(value)); if (buf != NULL) free(buf, M_BUS); return (error); } static void device_sysctl_init(device_t dev) { devclass_t dc = dev->devclass; int domain; if (dev->sysctl_tree != NULL) return; devclass_sysctl_init(dc); sysctl_ctx_init(&dev->sysctl_ctx); dev->sysctl_tree = SYSCTL_ADD_NODE(&dev->sysctl_ctx, SYSCTL_CHILDREN(dc->sysctl_tree), OID_AUTO, dev->nameunit + strlen(dc->name), CTLFLAG_RD, NULL, ""); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%desc", CTLTYPE_STRING | CTLFLAG_RD, dev, DEVICE_SYSCTL_DESC, device_sysctl_handler, "A", "device description"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%driver", CTLTYPE_STRING | CTLFLAG_RD, dev, DEVICE_SYSCTL_DRIVER, device_sysctl_handler, "A", "device driver name"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%location", CTLTYPE_STRING | CTLFLAG_RD, dev, DEVICE_SYSCTL_LOCATION, device_sysctl_handler, "A", "device location relative to parent"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%pnpinfo", CTLTYPE_STRING | CTLFLAG_RD, dev, DEVICE_SYSCTL_PNPINFO, device_sysctl_handler, "A", "device identification"); SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%parent", CTLTYPE_STRING | CTLFLAG_RD, dev, DEVICE_SYSCTL_PARENT, device_sysctl_handler, "A", "parent device"); if (bus_get_domain(dev, &domain) == 0) SYSCTL_ADD_INT(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%domain", CTLFLAG_RD, NULL, domain, "NUMA domain"); } static void device_sysctl_update(device_t dev) { devclass_t dc = dev->devclass; if (dev->sysctl_tree == NULL) return; sysctl_rename_oid(dev->sysctl_tree, dev->nameunit + strlen(dc->name)); } static void device_sysctl_fini(device_t dev) { if (dev->sysctl_tree == NULL) return; sysctl_ctx_free(&dev->sysctl_ctx); dev->sysctl_tree = NULL; } /* * /dev/devctl implementation */ /* * This design allows only one reader for /dev/devctl. This is not desirable * in the long run, but will get a lot of hair out of this implementation. * Maybe we should make this device a clonable device. * * Also note: we specifically do not attach a device to the device_t tree * to avoid potential chicken and egg problems. One could argue that all * of this belongs to the root node. One could also further argue that the * sysctl interface that we have not might more properly be an ioctl * interface, but at this stage of the game, I'm not inclined to rock that * boat. * * I'm also not sure that the SIGIO support is done correctly or not, as * I copied it from a driver that had SIGIO support that likely hasn't been * tested since 3.4 or 2.2.8! */ /* Deprecated way to adjust queue length */ static int sysctl_devctl_disable(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_hw_bus, OID_AUTO, devctl_disable, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 0, sysctl_devctl_disable, "I", "devctl disable -- deprecated"); #define DEVCTL_DEFAULT_QUEUE_LEN 1000 static int sysctl_devctl_queue(SYSCTL_HANDLER_ARGS); static int devctl_queue_length = DEVCTL_DEFAULT_QUEUE_LEN; SYSCTL_PROC(_hw_bus, OID_AUTO, devctl_queue, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, NULL, 0, sysctl_devctl_queue, "I", "devctl queue length"); static d_open_t devopen; static d_close_t devclose; static d_read_t devread; static d_ioctl_t devioctl; static d_poll_t devpoll; static d_kqfilter_t devkqfilter; static struct cdevsw dev_cdevsw = { .d_version = D_VERSION, .d_open = devopen, .d_close = devclose, .d_read = devread, .d_ioctl = devioctl, .d_poll = devpoll, .d_kqfilter = devkqfilter, .d_name = "devctl", }; struct dev_event_info { char *dei_data; TAILQ_ENTRY(dev_event_info) dei_link; }; TAILQ_HEAD(devq, dev_event_info); static struct dev_softc { int inuse; int nonblock; int queued; int async; struct mtx mtx; struct cv cv; struct selinfo sel; struct devq devq; struct sigio *sigio; } devsoftc; static void filt_devctl_detach(struct knote *kn); static int filt_devctl_read(struct knote *kn, long hint); struct filterops devctl_rfiltops = { .f_isfd = 1, .f_detach = filt_devctl_detach, .f_event = filt_devctl_read, }; static struct cdev *devctl_dev; static void devinit(void) { devctl_dev = make_dev_credf(MAKEDEV_ETERNAL, &dev_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0600, "devctl"); mtx_init(&devsoftc.mtx, "dev mtx", "devd", MTX_DEF); cv_init(&devsoftc.cv, "dev cv"); TAILQ_INIT(&devsoftc.devq); knlist_init_mtx(&devsoftc.sel.si_note, &devsoftc.mtx); } static int devopen(struct cdev *dev, int oflags, int devtype, struct thread *td) { mtx_lock(&devsoftc.mtx); if (devsoftc.inuse) { mtx_unlock(&devsoftc.mtx); return (EBUSY); } /* move to init */ devsoftc.inuse = 1; mtx_unlock(&devsoftc.mtx); return (0); } static int devclose(struct cdev *dev, int fflag, int devtype, struct thread *td) { mtx_lock(&devsoftc.mtx); devsoftc.inuse = 0; devsoftc.nonblock = 0; devsoftc.async = 0; cv_broadcast(&devsoftc.cv); funsetown(&devsoftc.sigio); mtx_unlock(&devsoftc.mtx); return (0); } /* * The read channel for this device is used to report changes to * userland in realtime. We are required to free the data as well as * the n1 object because we allocate them separately. Also note that * we return one record at a time. If you try to read this device a * character at a time, you will lose the rest of the data. Listening * programs are expected to cope. */ static int devread(struct cdev *dev, struct uio *uio, int ioflag) { struct dev_event_info *n1; int rv; mtx_lock(&devsoftc.mtx); while (TAILQ_EMPTY(&devsoftc.devq)) { if (devsoftc.nonblock) { mtx_unlock(&devsoftc.mtx); return (EAGAIN); } rv = cv_wait_sig(&devsoftc.cv, &devsoftc.mtx); if (rv) { /* * Need to translate ERESTART to EINTR here? -- jake */ mtx_unlock(&devsoftc.mtx); return (rv); } } n1 = TAILQ_FIRST(&devsoftc.devq); TAILQ_REMOVE(&devsoftc.devq, n1, dei_link); devsoftc.queued--; mtx_unlock(&devsoftc.mtx); rv = uiomove(n1->dei_data, strlen(n1->dei_data), uio); free(n1->dei_data, M_BUS); free(n1, M_BUS); return (rv); } static int devioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td) { switch (cmd) { case FIONBIO: if (*(int*)data) devsoftc.nonblock = 1; else devsoftc.nonblock = 0; return (0); case FIOASYNC: if (*(int*)data) devsoftc.async = 1; else devsoftc.async = 0; return (0); case FIOSETOWN: return fsetown(*(int *)data, &devsoftc.sigio); case FIOGETOWN: *(int *)data = fgetown(&devsoftc.sigio); return (0); /* (un)Support for other fcntl() calls. */ case FIOCLEX: case FIONCLEX: case FIONREAD: default: break; } return (ENOTTY); } static int devpoll(struct cdev *dev, int events, struct thread *td) { int revents = 0; mtx_lock(&devsoftc.mtx); if (events & (POLLIN | POLLRDNORM)) { if (!TAILQ_EMPTY(&devsoftc.devq)) revents = events & (POLLIN | POLLRDNORM); else selrecord(td, &devsoftc.sel); } mtx_unlock(&devsoftc.mtx); return (revents); } static int devkqfilter(struct cdev *dev, struct knote *kn) { int error; if (kn->kn_filter == EVFILT_READ) { kn->kn_fop = &devctl_rfiltops; knlist_add(&devsoftc.sel.si_note, kn, 0); error = 0; } else error = EINVAL; return (error); } static void filt_devctl_detach(struct knote *kn) { knlist_remove(&devsoftc.sel.si_note, kn, 0); } static int filt_devctl_read(struct knote *kn, long hint) { kn->kn_data = devsoftc.queued; return (kn->kn_data != 0); } /** * @brief Return whether the userland process is running */ boolean_t devctl_process_running(void) { return (devsoftc.inuse == 1); } /** * @brief Queue data to be read from the devctl device * * Generic interface to queue data to the devctl device. It is * assumed that @p data is properly formatted. It is further assumed * that @p data is allocated using the M_BUS malloc type. */ void devctl_queue_data_f(char *data, int flags) { struct dev_event_info *n1 = NULL, *n2 = NULL; if (strlen(data) == 0) goto out; if (devctl_queue_length == 0) goto out; n1 = malloc(sizeof(*n1), M_BUS, flags); if (n1 == NULL) goto out; n1->dei_data = data; mtx_lock(&devsoftc.mtx); if (devctl_queue_length == 0) { mtx_unlock(&devsoftc.mtx); free(n1->dei_data, M_BUS); free(n1, M_BUS); return; } /* Leave at least one spot in the queue... */ while (devsoftc.queued > devctl_queue_length - 1) { n2 = TAILQ_FIRST(&devsoftc.devq); TAILQ_REMOVE(&devsoftc.devq, n2, dei_link); free(n2->dei_data, M_BUS); free(n2, M_BUS); devsoftc.queued--; } TAILQ_INSERT_TAIL(&devsoftc.devq, n1, dei_link); devsoftc.queued++; cv_broadcast(&devsoftc.cv); KNOTE_LOCKED(&devsoftc.sel.si_note, 0); mtx_unlock(&devsoftc.mtx); selwakeup(&devsoftc.sel); if (devsoftc.async && devsoftc.sigio != NULL) pgsigio(&devsoftc.sigio, SIGIO, 0); return; out: /* * We have to free data on all error paths since the caller * assumes it will be free'd when this item is dequeued. */ free(data, M_BUS); return; } void devctl_queue_data(char *data) { devctl_queue_data_f(data, M_NOWAIT); } /** * @brief Send a 'notification' to userland, using standard ways */ void devctl_notify_f(const char *system, const char *subsystem, const char *type, const char *data, int flags) { int len = 0; char *msg; if (system == NULL) return; /* BOGUS! Must specify system. */ if (subsystem == NULL) return; /* BOGUS! Must specify subsystem. */ if (type == NULL) return; /* BOGUS! Must specify type. */ len += strlen(" system=") + strlen(system); len += strlen(" subsystem=") + strlen(subsystem); len += strlen(" type=") + strlen(type); /* add in the data message plus newline. */ if (data != NULL) len += strlen(data); len += 3; /* '!', '\n', and NUL */ msg = malloc(len, M_BUS, flags); if (msg == NULL) return; /* Drop it on the floor */ if (data != NULL) snprintf(msg, len, "!system=%s subsystem=%s type=%s %s\n", system, subsystem, type, data); else snprintf(msg, len, "!system=%s subsystem=%s type=%s\n", system, subsystem, type); devctl_queue_data_f(msg, flags); } void devctl_notify(const char *system, const char *subsystem, const char *type, const char *data) { devctl_notify_f(system, subsystem, type, data, M_NOWAIT); } /* * Common routine that tries to make sending messages as easy as possible. * We allocate memory for the data, copy strings into that, but do not * free it unless there's an error. The dequeue part of the driver should * free the data. We don't send data when the device is disabled. We do * send data, even when we have no listeners, because we wish to avoid * races relating to startup and restart of listening applications. * * devaddq is designed to string together the type of event, with the * object of that event, plus the plug and play info and location info * for that event. This is likely most useful for devices, but less * useful for other consumers of this interface. Those should use * the devctl_queue_data() interface instead. */ static void devaddq(const char *type, const char *what, device_t dev) { char *data = NULL; char *loc = NULL; char *pnp = NULL; const char *parstr; if (!devctl_queue_length)/* Rare race, but lost races safely discard */ return; data = malloc(1024, M_BUS, M_NOWAIT); if (data == NULL) goto bad; /* get the bus specific location of this device */ loc = malloc(1024, M_BUS, M_NOWAIT); if (loc == NULL) goto bad; *loc = '\0'; bus_child_location_str(dev, loc, 1024); /* Get the bus specific pnp info of this device */ pnp = malloc(1024, M_BUS, M_NOWAIT); if (pnp == NULL) goto bad; *pnp = '\0'; bus_child_pnpinfo_str(dev, pnp, 1024); /* Get the parent of this device, or / if high enough in the tree. */ if (device_get_parent(dev) == NULL) parstr = "."; /* Or '/' ? */ else parstr = device_get_nameunit(device_get_parent(dev)); /* String it all together. */ snprintf(data, 1024, "%s%s at %s %s on %s\n", type, what, loc, pnp, parstr); free(loc, M_BUS); free(pnp, M_BUS); devctl_queue_data(data); return; bad: free(pnp, M_BUS); free(loc, M_BUS); free(data, M_BUS); return; } /* * A device was added to the tree. We are called just after it successfully * attaches (that is, probe and attach success for this device). No call * is made if a device is merely parented into the tree. See devnomatch * if probe fails. If attach fails, no notification is sent (but maybe * we should have a different message for this). */ static void devadded(device_t dev) { devaddq("+", device_get_nameunit(dev), dev); } /* * A device was removed from the tree. We are called just before this * happens. */ static void devremoved(device_t dev) { devaddq("-", device_get_nameunit(dev), dev); } /* * Called when there's no match for this device. This is only called * the first time that no match happens, so we don't keep getting this * message. Should that prove to be undesirable, we can change it. * This is called when all drivers that can attach to a given bus * decline to accept this device. Other errors may not be detected. */ static void devnomatch(device_t dev) { devaddq("?", "", dev); } static int sysctl_devctl_disable(SYSCTL_HANDLER_ARGS) { struct dev_event_info *n1; int dis, error; dis = (devctl_queue_length == 0); error = sysctl_handle_int(oidp, &dis, 0, req); if (error || !req->newptr) return (error); if (mtx_initialized(&devsoftc.mtx)) mtx_lock(&devsoftc.mtx); if (dis) { while (!TAILQ_EMPTY(&devsoftc.devq)) { n1 = TAILQ_FIRST(&devsoftc.devq); TAILQ_REMOVE(&devsoftc.devq, n1, dei_link); free(n1->dei_data, M_BUS); free(n1, M_BUS); } devsoftc.queued = 0; devctl_queue_length = 0; } else { devctl_queue_length = DEVCTL_DEFAULT_QUEUE_LEN; } if (mtx_initialized(&devsoftc.mtx)) mtx_unlock(&devsoftc.mtx); return (0); } static int sysctl_devctl_queue(SYSCTL_HANDLER_ARGS) { struct dev_event_info *n1; int q, error; q = devctl_queue_length; error = sysctl_handle_int(oidp, &q, 0, req); if (error || !req->newptr) return (error); if (q < 0) return (EINVAL); if (mtx_initialized(&devsoftc.mtx)) mtx_lock(&devsoftc.mtx); devctl_queue_length = q; while (devsoftc.queued > devctl_queue_length) { n1 = TAILQ_FIRST(&devsoftc.devq); TAILQ_REMOVE(&devsoftc.devq, n1, dei_link); free(n1->dei_data, M_BUS); free(n1, M_BUS); devsoftc.queued--; } if (mtx_initialized(&devsoftc.mtx)) mtx_unlock(&devsoftc.mtx); return (0); } /* End of /dev/devctl code */ static TAILQ_HEAD(,device) bus_data_devices; static int bus_data_generation = 1; static kobj_method_t null_methods[] = { KOBJMETHOD_END }; DEFINE_CLASS(null, null_methods, 0); /* * Bus pass implementation */ static driver_list_t passes = TAILQ_HEAD_INITIALIZER(passes); int bus_current_pass = BUS_PASS_ROOT; /** * @internal * @brief Register the pass level of a new driver attachment * * Register a new driver attachment's pass level. If no driver * attachment with the same pass level has been added, then @p new * will be added to the global passes list. * * @param new the new driver attachment */ static void driver_register_pass(struct driverlink *new) { struct driverlink *dl; /* We only consider pass numbers during boot. */ if (bus_current_pass == BUS_PASS_DEFAULT) return; /* * Walk the passes list. If we already know about this pass * then there is nothing to do. If we don't, then insert this * driver link into the list. */ TAILQ_FOREACH(dl, &passes, passlink) { if (dl->pass < new->pass) continue; if (dl->pass == new->pass) return; TAILQ_INSERT_BEFORE(dl, new, passlink); return; } TAILQ_INSERT_TAIL(&passes, new, passlink); } /** * @brief Raise the current bus pass * * Raise the current bus pass level to @p pass. Call the BUS_NEW_PASS() * method on the root bus to kick off a new device tree scan for each * new pass level that has at least one driver. */ void bus_set_pass(int pass) { struct driverlink *dl; if (bus_current_pass > pass) panic("Attempt to lower bus pass level"); TAILQ_FOREACH(dl, &passes, passlink) { /* Skip pass values below the current pass level. */ if (dl->pass <= bus_current_pass) continue; /* * Bail once we hit a driver with a pass level that is * too high. */ if (dl->pass > pass) break; /* * Raise the pass level to the next level and rescan * the tree. */ bus_current_pass = dl->pass; BUS_NEW_PASS(root_bus); } /* * If there isn't a driver registered for the requested pass, * then bus_current_pass might still be less than 'pass'. Set * it to 'pass' in that case. */ if (bus_current_pass < pass) bus_current_pass = pass; KASSERT(bus_current_pass == pass, ("Failed to update bus pass level")); } /* * Devclass implementation */ static devclass_list_t devclasses = TAILQ_HEAD_INITIALIZER(devclasses); /** * @internal * @brief Find or create a device class * * If a device class with the name @p classname exists, return it, * otherwise if @p create is non-zero create and return a new device * class. * * If @p parentname is non-NULL, the parent of the devclass is set to * the devclass of that name. * * @param classname the devclass name to find or create * @param parentname the parent devclass name or @c NULL * @param create non-zero to create a devclass */ static devclass_t devclass_find_internal(const char *classname, const char *parentname, int create) { devclass_t dc; PDEBUG(("looking for %s", classname)); if (!classname) return (NULL); TAILQ_FOREACH(dc, &devclasses, link) { if (!strcmp(dc->name, classname)) break; } if (create && !dc) { PDEBUG(("creating %s", classname)); dc = malloc(sizeof(struct devclass) + strlen(classname) + 1, M_BUS, M_NOWAIT | M_ZERO); if (!dc) return (NULL); dc->parent = NULL; dc->name = (char*) (dc + 1); strcpy(dc->name, classname); TAILQ_INIT(&dc->drivers); TAILQ_INSERT_TAIL(&devclasses, dc, link); bus_data_generation_update(); } /* * If a parent class is specified, then set that as our parent so * that this devclass will support drivers for the parent class as * well. If the parent class has the same name don't do this though * as it creates a cycle that can trigger an infinite loop in * device_probe_child() if a device exists for which there is no * suitable driver. */ if (parentname && dc && !dc->parent && strcmp(classname, parentname) != 0) { dc->parent = devclass_find_internal(parentname, NULL, TRUE); dc->parent->flags |= DC_HAS_CHILDREN; } return (dc); } /** * @brief Create a device class * * If a device class with the name @p classname exists, return it, * otherwise create and return a new device class. * * @param classname the devclass name to find or create */ devclass_t devclass_create(const char *classname) { return (devclass_find_internal(classname, NULL, TRUE)); } /** * @brief Find a device class * * If a device class with the name @p classname exists, return it, * otherwise return @c NULL. * * @param classname the devclass name to find */ devclass_t devclass_find(const char *classname) { return (devclass_find_internal(classname, NULL, FALSE)); } /** * @brief Register that a device driver has been added to a devclass * * Register that a device driver has been added to a devclass. This * is called by devclass_add_driver to accomplish the recursive * notification of all the children classes of dc, as well as dc. * Each layer will have BUS_DRIVER_ADDED() called for all instances of * the devclass. * * We do a full search here of the devclass list at each iteration * level to save storing children-lists in the devclass structure. If * we ever move beyond a few dozen devices doing this, we may need to * reevaluate... * * @param dc the devclass to edit * @param driver the driver that was just added */ static void devclass_driver_added(devclass_t dc, driver_t *driver) { devclass_t parent; int i; /* * Call BUS_DRIVER_ADDED for any existing busses in this class. */ for (i = 0; i < dc->maxunit; i++) if (dc->devices[i] && device_is_attached(dc->devices[i])) BUS_DRIVER_ADDED(dc->devices[i], driver); /* * Walk through the children classes. Since we only keep a * single parent pointer around, we walk the entire list of * devclasses looking for children. We set the * DC_HAS_CHILDREN flag when a child devclass is created on * the parent, so we only walk the list for those devclasses * that have children. */ if (!(dc->flags & DC_HAS_CHILDREN)) return; parent = dc; TAILQ_FOREACH(dc, &devclasses, link) { if (dc->parent == parent) devclass_driver_added(dc, driver); } } /** * @brief Add a device driver to a device class * * Add a device driver to a devclass. This is normally called * automatically by DRIVER_MODULE(). The BUS_DRIVER_ADDED() method of * all devices in the devclass will be called to allow them to attempt * to re-probe any unmatched children. * * @param dc the devclass to edit * @param driver the driver to register */ int devclass_add_driver(devclass_t dc, driver_t *driver, int pass, devclass_t *dcp) { driverlink_t dl; const char *parentname; PDEBUG(("%s", DRIVERNAME(driver))); /* Don't allow invalid pass values. */ if (pass <= BUS_PASS_ROOT) return (EINVAL); dl = malloc(sizeof *dl, M_BUS, M_NOWAIT|M_ZERO); if (!dl) return (ENOMEM); /* * Compile the driver's methods. Also increase the reference count * so that the class doesn't get freed when the last instance * goes. This means we can safely use static methods and avoids a * double-free in devclass_delete_driver. */ kobj_class_compile((kobj_class_t) driver); /* * If the driver has any base classes, make the * devclass inherit from the devclass of the driver's * first base class. This will allow the system to * search for drivers in both devclasses for children * of a device using this driver. */ if (driver->baseclasses) parentname = driver->baseclasses[0]->name; else parentname = NULL; *dcp = devclass_find_internal(driver->name, parentname, TRUE); dl->driver = driver; TAILQ_INSERT_TAIL(&dc->drivers, dl, link); driver->refs++; /* XXX: kobj_mtx */ dl->pass = pass; driver_register_pass(dl); devclass_driver_added(dc, driver); bus_data_generation_update(); return (0); } /** * @brief Register that a device driver has been deleted from a devclass * * Register that a device driver has been removed from a devclass. * This is called by devclass_delete_driver to accomplish the * recursive notification of all the children classes of busclass, as * well as busclass. Each layer will attempt to detach the driver * from any devices that are children of the bus's devclass. The function * will return an error if a device fails to detach. * * We do a full search here of the devclass list at each iteration * level to save storing children-lists in the devclass structure. If * we ever move beyond a few dozen devices doing this, we may need to * reevaluate... * * @param busclass the devclass of the parent bus * @param dc the devclass of the driver being deleted * @param driver the driver being deleted */ static int devclass_driver_deleted(devclass_t busclass, devclass_t dc, driver_t *driver) { devclass_t parent; device_t dev; int error, i; /* * Disassociate from any devices. We iterate through all the * devices in the devclass of the driver and detach any which are * using the driver and which have a parent in the devclass which * we are deleting from. * * Note that since a driver can be in multiple devclasses, we * should not detach devices which are not children of devices in * the affected devclass. */ for (i = 0; i < dc->maxunit; i++) { if (dc->devices[i]) { dev = dc->devices[i]; if (dev->driver == driver && dev->parent && dev->parent->devclass == busclass) { if ((error = device_detach(dev)) != 0) return (error); BUS_PROBE_NOMATCH(dev->parent, dev); devnomatch(dev); dev->flags |= DF_DONENOMATCH; } } } /* * Walk through the children classes. Since we only keep a * single parent pointer around, we walk the entire list of * devclasses looking for children. We set the * DC_HAS_CHILDREN flag when a child devclass is created on * the parent, so we only walk the list for those devclasses * that have children. */ if (!(busclass->flags & DC_HAS_CHILDREN)) return (0); parent = busclass; TAILQ_FOREACH(busclass, &devclasses, link) { if (busclass->parent == parent) { error = devclass_driver_deleted(busclass, dc, driver); if (error) return (error); } } return (0); } /** * @brief Delete a device driver from a device class * * Delete a device driver from a devclass. This is normally called * automatically by DRIVER_MODULE(). * * If the driver is currently attached to any devices, * devclass_delete_driver() will first attempt to detach from each * device. If one of the detach calls fails, the driver will not be * deleted. * * @param dc the devclass to edit * @param driver the driver to unregister */ int devclass_delete_driver(devclass_t busclass, driver_t *driver) { devclass_t dc = devclass_find(driver->name); driverlink_t dl; int error; PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass))); if (!dc) return (0); /* * Find the link structure in the bus' list of drivers. */ TAILQ_FOREACH(dl, &busclass->drivers, link) { if (dl->driver == driver) break; } if (!dl) { PDEBUG(("%s not found in %s list", driver->name, busclass->name)); return (ENOENT); } error = devclass_driver_deleted(busclass, dc, driver); if (error != 0) return (error); TAILQ_REMOVE(&busclass->drivers, dl, link); free(dl, M_BUS); /* XXX: kobj_mtx */ driver->refs--; if (driver->refs == 0) kobj_class_free((kobj_class_t) driver); bus_data_generation_update(); return (0); } /** * @brief Quiesces a set of device drivers from a device class * * Quiesce a device driver from a devclass. This is normally called * automatically by DRIVER_MODULE(). * * If the driver is currently attached to any devices, * devclass_quiesece_driver() will first attempt to quiesce each * device. * * @param dc the devclass to edit * @param driver the driver to unregister */ static int devclass_quiesce_driver(devclass_t busclass, driver_t *driver) { devclass_t dc = devclass_find(driver->name); driverlink_t dl; device_t dev; int i; int error; PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass))); if (!dc) return (0); /* * Find the link structure in the bus' list of drivers. */ TAILQ_FOREACH(dl, &busclass->drivers, link) { if (dl->driver == driver) break; } if (!dl) { PDEBUG(("%s not found in %s list", driver->name, busclass->name)); return (ENOENT); } /* * Quiesce all devices. We iterate through all the devices in * the devclass of the driver and quiesce any which are using * the driver and which have a parent in the devclass which we * are quiescing. * * Note that since a driver can be in multiple devclasses, we * should not quiesce devices which are not children of * devices in the affected devclass. */ for (i = 0; i < dc->maxunit; i++) { if (dc->devices[i]) { dev = dc->devices[i]; if (dev->driver == driver && dev->parent && dev->parent->devclass == busclass) { if ((error = device_quiesce(dev)) != 0) return (error); } } } return (0); } /** * @internal */ static driverlink_t devclass_find_driver_internal(devclass_t dc, const char *classname) { driverlink_t dl; PDEBUG(("%s in devclass %s", classname, DEVCLANAME(dc))); TAILQ_FOREACH(dl, &dc->drivers, link) { if (!strcmp(dl->driver->name, classname)) return (dl); } PDEBUG(("not found")); return (NULL); } /** * @brief Return the name of the devclass */ const char * devclass_get_name(devclass_t dc) { return (dc->name); } /** * @brief Find a device given a unit number * * @param dc the devclass to search * @param unit the unit number to search for * * @returns the device with the given unit number or @c * NULL if there is no such device */ device_t devclass_get_device(devclass_t dc, int unit) { if (dc == NULL || unit < 0 || unit >= dc->maxunit) return (NULL); return (dc->devices[unit]); } /** * @brief Find the softc field of a device given a unit number * * @param dc the devclass to search * @param unit the unit number to search for * * @returns the softc field of the device with the given * unit number or @c NULL if there is no such * device */ void * devclass_get_softc(devclass_t dc, int unit) { device_t dev; dev = devclass_get_device(dc, unit); if (!dev) return (NULL); return (device_get_softc(dev)); } /** * @brief Get a list of devices in the devclass * * An array containing a list of all the devices in the given devclass * is allocated and returned in @p *devlistp. The number of devices * in the array is returned in @p *devcountp. The caller should free * the array using @c free(p, M_TEMP), even if @p *devcountp is 0. * * @param dc the devclass to examine * @param devlistp points at location for array pointer return * value * @param devcountp points at location for array size return value * * @retval 0 success * @retval ENOMEM the array allocation failed */ int devclass_get_devices(devclass_t dc, device_t **devlistp, int *devcountp) { int count, i; device_t *list; count = devclass_get_count(dc); list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO); if (!list) return (ENOMEM); count = 0; for (i = 0; i < dc->maxunit; i++) { if (dc->devices[i]) { list[count] = dc->devices[i]; count++; } } *devlistp = list; *devcountp = count; return (0); } /** * @brief Get a list of drivers in the devclass * * An array containing a list of pointers to all the drivers in the * given devclass is allocated and returned in @p *listp. The number * of drivers in the array is returned in @p *countp. The caller should * free the array using @c free(p, M_TEMP). * * @param dc the devclass to examine * @param listp gives location for array pointer return value * @param countp gives location for number of array elements * return value * * @retval 0 success * @retval ENOMEM the array allocation failed */ int devclass_get_drivers(devclass_t dc, driver_t ***listp, int *countp) { driverlink_t dl; driver_t **list; int count; count = 0; TAILQ_FOREACH(dl, &dc->drivers, link) count++; list = malloc(count * sizeof(driver_t *), M_TEMP, M_NOWAIT); if (list == NULL) return (ENOMEM); count = 0; TAILQ_FOREACH(dl, &dc->drivers, link) { list[count] = dl->driver; count++; } *listp = list; *countp = count; return (0); } /** * @brief Get the number of devices in a devclass * * @param dc the devclass to examine */ int devclass_get_count(devclass_t dc) { int count, i; count = 0; for (i = 0; i < dc->maxunit; i++) if (dc->devices[i]) count++; return (count); } /** * @brief Get the maximum unit number used in a devclass * * Note that this is one greater than the highest currently-allocated * unit. If a null devclass_t is passed in, -1 is returned to indicate * that not even the devclass has been allocated yet. * * @param dc the devclass to examine */ int devclass_get_maxunit(devclass_t dc) { if (dc == NULL) return (-1); return (dc->maxunit); } /** * @brief Find a free unit number in a devclass * * This function searches for the first unused unit number greater * that or equal to @p unit. * * @param dc the devclass to examine * @param unit the first unit number to check */ int devclass_find_free_unit(devclass_t dc, int unit) { if (dc == NULL) return (unit); while (unit < dc->maxunit && dc->devices[unit] != NULL) unit++; return (unit); } /** * @brief Set the parent of a devclass * * The parent class is normally initialised automatically by * DRIVER_MODULE(). * * @param dc the devclass to edit * @param pdc the new parent devclass */ void devclass_set_parent(devclass_t dc, devclass_t pdc) { dc->parent = pdc; } /** * @brief Get the parent of a devclass * * @param dc the devclass to examine */ devclass_t devclass_get_parent(devclass_t dc) { return (dc->parent); } struct sysctl_ctx_list * devclass_get_sysctl_ctx(devclass_t dc) { return (&dc->sysctl_ctx); } struct sysctl_oid * devclass_get_sysctl_tree(devclass_t dc) { return (dc->sysctl_tree); } /** * @internal * @brief Allocate a unit number * * On entry, @p *unitp is the desired unit number (or @c -1 if any * will do). The allocated unit number is returned in @p *unitp. * @param dc the devclass to allocate from * @param unitp points at the location for the allocated unit * number * * @retval 0 success * @retval EEXIST the requested unit number is already allocated * @retval ENOMEM memory allocation failure */ static int devclass_alloc_unit(devclass_t dc, device_t dev, int *unitp) { const char *s; int unit = *unitp; PDEBUG(("unit %d in devclass %s", unit, DEVCLANAME(dc))); /* Ask the parent bus if it wants to wire this device. */ if (unit == -1) BUS_HINT_DEVICE_UNIT(device_get_parent(dev), dev, dc->name, &unit); /* If we were given a wired unit number, check for existing device */ /* XXX imp XXX */ if (unit != -1) { if (unit >= 0 && unit < dc->maxunit && dc->devices[unit] != NULL) { if (bootverbose) printf("%s: %s%d already exists; skipping it\n", dc->name, dc->name, *unitp); return (EEXIST); } } else { /* Unwired device, find the next available slot for it */ unit = 0; for (unit = 0;; unit++) { /* If there is an "at" hint for a unit then skip it. */ if (resource_string_value(dc->name, unit, "at", &s) == 0) continue; /* If this device slot is already in use, skip it. */ if (unit < dc->maxunit && dc->devices[unit] != NULL) continue; break; } } /* * We've selected a unit beyond the length of the table, so let's * extend the table to make room for all units up to and including * this one. */ if (unit >= dc->maxunit) { device_t *newlist, *oldlist; int newsize; oldlist = dc->devices; newsize = roundup((unit + 1), MINALLOCSIZE / sizeof(device_t)); newlist = malloc(sizeof(device_t) * newsize, M_BUS, M_NOWAIT); if (!newlist) return (ENOMEM); if (oldlist != NULL) bcopy(oldlist, newlist, sizeof(device_t) * dc->maxunit); bzero(newlist + dc->maxunit, sizeof(device_t) * (newsize - dc->maxunit)); dc->devices = newlist; dc->maxunit = newsize; if (oldlist != NULL) free(oldlist, M_BUS); } PDEBUG(("now: unit %d in devclass %s", unit, DEVCLANAME(dc))); *unitp = unit; return (0); } /** * @internal * @brief Add a device to a devclass * * A unit number is allocated for the device (using the device's * preferred unit number if any) and the device is registered in the * devclass. This allows the device to be looked up by its unit * number, e.g. by decoding a dev_t minor number. * * @param dc the devclass to add to * @param dev the device to add * * @retval 0 success * @retval EEXIST the requested unit number is already allocated * @retval ENOMEM memory allocation failure */ static int devclass_add_device(devclass_t dc, device_t dev) { int buflen, error; PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc))); buflen = snprintf(NULL, 0, "%s%d$", dc->name, INT_MAX); if (buflen < 0) return (ENOMEM); dev->nameunit = malloc(buflen, M_BUS, M_NOWAIT|M_ZERO); if (!dev->nameunit) return (ENOMEM); if ((error = devclass_alloc_unit(dc, dev, &dev->unit)) != 0) { free(dev->nameunit, M_BUS); dev->nameunit = NULL; return (error); } dc->devices[dev->unit] = dev; dev->devclass = dc; snprintf(dev->nameunit, buflen, "%s%d", dc->name, dev->unit); return (0); } /** * @internal * @brief Delete a device from a devclass * * The device is removed from the devclass's device list and its unit * number is freed. * @param dc the devclass to delete from * @param dev the device to delete * * @retval 0 success */ static int devclass_delete_device(devclass_t dc, device_t dev) { if (!dc || !dev) return (0); PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc))); if (dev->devclass != dc || dc->devices[dev->unit] != dev) panic("devclass_delete_device: inconsistent device class"); dc->devices[dev->unit] = NULL; if (dev->flags & DF_WILDCARD) dev->unit = -1; dev->devclass = NULL; free(dev->nameunit, M_BUS); dev->nameunit = NULL; return (0); } /** * @internal * @brief Make a new device and add it as a child of @p parent * * @param parent the parent of the new device * @param name the devclass name of the new device or @c NULL * to leave the devclass unspecified * @parem unit the unit number of the new device of @c -1 to * leave the unit number unspecified * * @returns the new device */ static device_t make_device(device_t parent, const char *name, int unit) { device_t dev; devclass_t dc; PDEBUG(("%s at %s as unit %d", name, DEVICENAME(parent), unit)); if (name) { dc = devclass_find_internal(name, NULL, TRUE); if (!dc) { printf("make_device: can't find device class %s\n", name); return (NULL); } } else { dc = NULL; } dev = malloc(sizeof(struct device), M_BUS, M_NOWAIT|M_ZERO); if (!dev) return (NULL); dev->parent = parent; TAILQ_INIT(&dev->children); kobj_init((kobj_t) dev, &null_class); dev->driver = NULL; dev->devclass = NULL; dev->unit = unit; dev->nameunit = NULL; dev->desc = NULL; dev->busy = 0; dev->devflags = 0; dev->flags = DF_ENABLED; dev->order = 0; if (unit == -1) dev->flags |= DF_WILDCARD; if (name) { dev->flags |= DF_FIXEDCLASS; if (devclass_add_device(dc, dev)) { kobj_delete((kobj_t) dev, M_BUS); return (NULL); } } dev->ivars = NULL; dev->softc = NULL; dev->state = DS_NOTPRESENT; TAILQ_INSERT_TAIL(&bus_data_devices, dev, devlink); bus_data_generation_update(); return (dev); } /** * @internal * @brief Print a description of a device. */ static int device_print_child(device_t dev, device_t child) { int retval = 0; if (device_is_alive(child)) retval += BUS_PRINT_CHILD(dev, child); else retval += device_printf(child, " not found\n"); return (retval); } /** * @brief Create a new device * * This creates a new device and adds it as a child of an existing * parent device. The new device will be added after the last existing * child with order zero. * * @param dev the device which will be the parent of the * new child device * @param name devclass name for new device or @c NULL if not * specified * @param unit unit number for new device or @c -1 if not * specified * * @returns the new device */ device_t device_add_child(device_t dev, const char *name, int unit) { return (device_add_child_ordered(dev, 0, name, unit)); } /** * @brief Create a new device * * This creates a new device and adds it as a child of an existing * parent device. The new device will be added after the last existing * child with the same order. * * @param dev the device which will be the parent of the * new child device * @param order a value which is used to partially sort the * children of @p dev - devices created using * lower values of @p order appear first in @p * dev's list of children * @param name devclass name for new device or @c NULL if not * specified * @param unit unit number for new device or @c -1 if not * specified * * @returns the new device */ device_t device_add_child_ordered(device_t dev, u_int order, const char *name, int unit) { device_t child; device_t place; PDEBUG(("%s at %s with order %u as unit %d", name, DEVICENAME(dev), order, unit)); KASSERT(name != NULL || unit == -1, ("child device with wildcard name and specific unit number")); child = make_device(dev, name, unit); if (child == NULL) return (child); child->order = order; TAILQ_FOREACH(place, &dev->children, link) { if (place->order > order) break; } if (place) { /* * The device 'place' is the first device whose order is * greater than the new child. */ TAILQ_INSERT_BEFORE(place, child, link); } else { /* * The new child's order is greater or equal to the order of * any existing device. Add the child to the tail of the list. */ TAILQ_INSERT_TAIL(&dev->children, child, link); } bus_data_generation_update(); return (child); } /** * @brief Delete a device * * This function deletes a device along with all of its children. If * the device currently has a driver attached to it, the device is * detached first using device_detach(). * * @param dev the parent device * @param child the device to delete * * @retval 0 success * @retval non-zero a unit error code describing the error */ int device_delete_child(device_t dev, device_t child) { int error; device_t grandchild; PDEBUG(("%s from %s", DEVICENAME(child), DEVICENAME(dev))); /* remove children first */ while ((grandchild = TAILQ_FIRST(&child->children)) != NULL) { error = device_delete_child(child, grandchild); if (error) return (error); } if ((error = device_detach(child)) != 0) return (error); if (child->devclass) devclass_delete_device(child->devclass, child); if (child->parent) BUS_CHILD_DELETED(dev, child); TAILQ_REMOVE(&dev->children, child, link); TAILQ_REMOVE(&bus_data_devices, child, devlink); kobj_delete((kobj_t) child, M_BUS); bus_data_generation_update(); return (0); } /** * @brief Delete all children devices of the given device, if any. * * This function deletes all children devices of the given device, if * any, using the device_delete_child() function for each device it * finds. If a child device cannot be deleted, this function will * return an error code. * * @param dev the parent device * * @retval 0 success * @retval non-zero a device would not detach */ int device_delete_children(device_t dev) { device_t child; int error; PDEBUG(("Deleting all children of %s", DEVICENAME(dev))); error = 0; while ((child = TAILQ_FIRST(&dev->children)) != NULL) { error = device_delete_child(dev, child); if (error) { PDEBUG(("Failed deleting %s", DEVICENAME(child))); break; } } return (error); } /** * @brief Find a device given a unit number * * This is similar to devclass_get_devices() but only searches for * devices which have @p dev as a parent. * * @param dev the parent device to search * @param unit the unit number to search for. If the unit is -1, * return the first child of @p dev which has name * @p classname (that is, the one with the lowest unit.) * * @returns the device with the given unit number or @c * NULL if there is no such device */ device_t device_find_child(device_t dev, const char *classname, int unit) { devclass_t dc; device_t child; dc = devclass_find(classname); if (!dc) return (NULL); if (unit != -1) { child = devclass_get_device(dc, unit); if (child && child->parent == dev) return (child); } else { for (unit = 0; unit < devclass_get_maxunit(dc); unit++) { child = devclass_get_device(dc, unit); if (child && child->parent == dev) return (child); } } return (NULL); } /** * @internal */ static driverlink_t first_matching_driver(devclass_t dc, device_t dev) { if (dev->devclass) return (devclass_find_driver_internal(dc, dev->devclass->name)); return (TAILQ_FIRST(&dc->drivers)); } /** * @internal */ static driverlink_t next_matching_driver(devclass_t dc, device_t dev, driverlink_t last) { if (dev->devclass) { driverlink_t dl; for (dl = TAILQ_NEXT(last, link); dl; dl = TAILQ_NEXT(dl, link)) if (!strcmp(dev->devclass->name, dl->driver->name)) return (dl); return (NULL); } return (TAILQ_NEXT(last, link)); } /** * @internal */ int device_probe_child(device_t dev, device_t child) { devclass_t dc; driverlink_t best = NULL; driverlink_t dl; int result, pri = 0; int hasclass = (child->devclass != NULL); GIANT_REQUIRED; dc = dev->devclass; if (!dc) panic("device_probe_child: parent device has no devclass"); /* * If the state is already probed, then return. However, don't * return if we can rebid this object. */ if (child->state == DS_ALIVE && (child->flags & DF_REBID) == 0) return (0); for (; dc; dc = dc->parent) { for (dl = first_matching_driver(dc, child); dl; dl = next_matching_driver(dc, child, dl)) { /* If this driver's pass is too high, then ignore it. */ if (dl->pass > bus_current_pass) continue; PDEBUG(("Trying %s", DRIVERNAME(dl->driver))); result = device_set_driver(child, dl->driver); if (result == ENOMEM) return (result); else if (result != 0) continue; if (!hasclass) { if (device_set_devclass(child, dl->driver->name) != 0) { char const * devname = device_get_name(child); if (devname == NULL) devname = "(unknown)"; printf("driver bug: Unable to set " "devclass (class: %s " "devname: %s)\n", dl->driver->name, devname); (void)device_set_driver(child, NULL); continue; } } /* Fetch any flags for the device before probing. */ resource_int_value(dl->driver->name, child->unit, "flags", &child->devflags); result = DEVICE_PROBE(child); /* Reset flags and devclass before the next probe. */ child->devflags = 0; if (!hasclass) (void)device_set_devclass(child, NULL); /* * If the driver returns SUCCESS, there can be * no higher match for this device. */ if (result == 0) { best = dl; pri = 0; break; } /* * The driver returned an error so it * certainly doesn't match. */ if (result > 0) { (void)device_set_driver(child, NULL); continue; } /* * A priority lower than SUCCESS, remember the * best matching driver. Initialise the value * of pri for the first match. */ if (best == NULL || result > pri) { /* * Probes that return BUS_PROBE_NOWILDCARD * or lower only match on devices whose * driver was explicitly specified. */ if (result <= BUS_PROBE_NOWILDCARD && !(child->flags & DF_FIXEDCLASS)) continue; best = dl; pri = result; continue; } } /* * If we have an unambiguous match in this devclass, * don't look in the parent. */ if (best && pri == 0) break; } /* * If we found a driver, change state and initialise the devclass. */ /* XXX What happens if we rebid and got no best? */ if (best) { /* * If this device was attached, and we were asked to * rescan, and it is a different driver, then we have * to detach the old driver and reattach this new one. * Note, we don't have to check for DF_REBID here * because if the state is > DS_ALIVE, we know it must * be. * * This assumes that all DF_REBID drivers can have * their probe routine called at any time and that * they are idempotent as well as completely benign in * normal operations. * * We also have to make sure that the detach * succeeded, otherwise we fail the operation (or * maybe it should just fail silently? I'm torn). */ if (child->state > DS_ALIVE && best->driver != child->driver) if ((result = device_detach(dev)) != 0) return (result); /* Set the winning driver, devclass, and flags. */ if (!child->devclass) { result = device_set_devclass(child, best->driver->name); if (result != 0) return (result); } result = device_set_driver(child, best->driver); if (result != 0) return (result); resource_int_value(best->driver->name, child->unit, "flags", &child->devflags); if (pri < 0) { /* * A bit bogus. Call the probe method again to make * sure that we have the right description. */ DEVICE_PROBE(child); #if 0 child->flags |= DF_REBID; #endif } else child->flags &= ~DF_REBID; child->state = DS_ALIVE; bus_data_generation_update(); return (0); } return (ENXIO); } /** * @brief Return the parent of a device */ device_t device_get_parent(device_t dev) { return (dev->parent); } /** * @brief Get a list of children of a device * * An array containing a list of all the children of the given device * is allocated and returned in @p *devlistp. The number of devices * in the array is returned in @p *devcountp. The caller should free * the array using @c free(p, M_TEMP). * * @param dev the device to examine * @param devlistp points at location for array pointer return * value * @param devcountp points at location for array size return value * * @retval 0 success * @retval ENOMEM the array allocation failed */ int device_get_children(device_t dev, device_t **devlistp, int *devcountp) { int count; device_t child; device_t *list; count = 0; TAILQ_FOREACH(child, &dev->children, link) { count++; } if (count == 0) { *devlistp = NULL; *devcountp = 0; return (0); } list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO); if (!list) return (ENOMEM); count = 0; TAILQ_FOREACH(child, &dev->children, link) { list[count] = child; count++; } *devlistp = list; *devcountp = count; return (0); } /** * @brief Return the current driver for the device or @c NULL if there * is no driver currently attached */ driver_t * device_get_driver(device_t dev) { return (dev->driver); } /** * @brief Return the current devclass for the device or @c NULL if * there is none. */ devclass_t device_get_devclass(device_t dev) { return (dev->devclass); } /** * @brief Return the name of the device's devclass or @c NULL if there * is none. */ const char * device_get_name(device_t dev) { if (dev != NULL && dev->devclass) return (devclass_get_name(dev->devclass)); return (NULL); } /** * @brief Return a string containing the device's devclass name * followed by an ascii representation of the device's unit number * (e.g. @c "foo2"). */ const char * device_get_nameunit(device_t dev) { return (dev->nameunit); } /** * @brief Return the device's unit number. */ int device_get_unit(device_t dev) { return (dev->unit); } /** * @brief Return the device's description string */ const char * device_get_desc(device_t dev) { return (dev->desc); } /** * @brief Return the device's flags */ uint32_t device_get_flags(device_t dev) { return (dev->devflags); } struct sysctl_ctx_list * device_get_sysctl_ctx(device_t dev) { return (&dev->sysctl_ctx); } struct sysctl_oid * device_get_sysctl_tree(device_t dev) { return (dev->sysctl_tree); } /** * @brief Print the name of the device followed by a colon and a space * * @returns the number of characters printed */ int device_print_prettyname(device_t dev) { const char *name = device_get_name(dev); if (name == NULL) return (printf("unknown: ")); return (printf("%s%d: ", name, device_get_unit(dev))); } /** * @brief Print the name of the device followed by a colon, a space * and the result of calling vprintf() with the value of @p fmt and * the following arguments. * * @returns the number of characters printed */ int device_printf(device_t dev, const char * fmt, ...) { va_list ap; int retval; retval = device_print_prettyname(dev); va_start(ap, fmt); retval += vprintf(fmt, ap); va_end(ap); return (retval); } /** * @internal */ static void device_set_desc_internal(device_t dev, const char* desc, int copy) { if (dev->desc && (dev->flags & DF_DESCMALLOCED)) { free(dev->desc, M_BUS); dev->flags &= ~DF_DESCMALLOCED; dev->desc = NULL; } if (copy && desc) { dev->desc = malloc(strlen(desc) + 1, M_BUS, M_NOWAIT); if (dev->desc) { strcpy(dev->desc, desc); dev->flags |= DF_DESCMALLOCED; } } else { /* Avoid a -Wcast-qual warning */ dev->desc = (char *)(uintptr_t) desc; } bus_data_generation_update(); } /** * @brief Set the device's description * * The value of @c desc should be a string constant that will not * change (at least until the description is changed in a subsequent * call to device_set_desc() or device_set_desc_copy()). */ void device_set_desc(device_t dev, const char* desc) { device_set_desc_internal(dev, desc, FALSE); } /** * @brief Set the device's description * * The string pointed to by @c desc is copied. Use this function if * the device description is generated, (e.g. with sprintf()). */ void device_set_desc_copy(device_t dev, const char* desc) { device_set_desc_internal(dev, desc, TRUE); } /** * @brief Set the device's flags */ void device_set_flags(device_t dev, uint32_t flags) { dev->devflags = flags; } /** * @brief Return the device's softc field * * The softc is allocated and zeroed when a driver is attached, based * on the size field of the driver. */ void * device_get_softc(device_t dev) { return (dev->softc); } /** * @brief Set the device's softc field * * Most drivers do not need to use this since the softc is allocated * automatically when the driver is attached. */ void device_set_softc(device_t dev, void *softc) { if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC)) free(dev->softc, M_BUS_SC); dev->softc = softc; if (dev->softc) dev->flags |= DF_EXTERNALSOFTC; else dev->flags &= ~DF_EXTERNALSOFTC; } /** * @brief Free claimed softc * * Most drivers do not need to use this since the softc is freed * automatically when the driver is detached. */ void device_free_softc(void *softc) { free(softc, M_BUS_SC); } /** * @brief Claim softc * * This function can be used to let the driver free the automatically * allocated softc using "device_free_softc()". This function is * useful when the driver is refcounting the softc and the softc * cannot be freed when the "device_detach" method is called. */ void device_claim_softc(device_t dev) { if (dev->softc) dev->flags |= DF_EXTERNALSOFTC; else dev->flags &= ~DF_EXTERNALSOFTC; } /** * @brief Get the device's ivars field * * The ivars field is used by the parent device to store per-device * state (e.g. the physical location of the device or a list of * resources). */ void * device_get_ivars(device_t dev) { KASSERT(dev != NULL, ("device_get_ivars(NULL, ...)")); return (dev->ivars); } /** * @brief Set the device's ivars field */ void device_set_ivars(device_t dev, void * ivars) { KASSERT(dev != NULL, ("device_set_ivars(NULL, ...)")); dev->ivars = ivars; } /** * @brief Return the device's state */ device_state_t device_get_state(device_t dev) { return (dev->state); } /** * @brief Set the DF_ENABLED flag for the device */ void device_enable(device_t dev) { dev->flags |= DF_ENABLED; } /** * @brief Clear the DF_ENABLED flag for the device */ void device_disable(device_t dev) { dev->flags &= ~DF_ENABLED; } /** * @brief Increment the busy counter for the device */ void device_busy(device_t dev) { if (dev->state < DS_ATTACHING) panic("device_busy: called for unattached device"); if (dev->busy == 0 && dev->parent) device_busy(dev->parent); dev->busy++; if (dev->state == DS_ATTACHED) dev->state = DS_BUSY; } /** * @brief Decrement the busy counter for the device */ void device_unbusy(device_t dev) { if (dev->busy != 0 && dev->state != DS_BUSY && dev->state != DS_ATTACHING) panic("device_unbusy: called for non-busy device %s", device_get_nameunit(dev)); dev->busy--; if (dev->busy == 0) { if (dev->parent) device_unbusy(dev->parent); if (dev->state == DS_BUSY) dev->state = DS_ATTACHED; } } /** * @brief Set the DF_QUIET flag for the device */ void device_quiet(device_t dev) { dev->flags |= DF_QUIET; } /** * @brief Clear the DF_QUIET flag for the device */ void device_verbose(device_t dev) { dev->flags &= ~DF_QUIET; } /** * @brief Return non-zero if the DF_QUIET flag is set on the device */ int device_is_quiet(device_t dev) { return ((dev->flags & DF_QUIET) != 0); } /** * @brief Return non-zero if the DF_ENABLED flag is set on the device */ int device_is_enabled(device_t dev) { return ((dev->flags & DF_ENABLED) != 0); } /** * @brief Return non-zero if the device was successfully probed */ int device_is_alive(device_t dev) { return (dev->state >= DS_ALIVE); } /** * @brief Return non-zero if the device currently has a driver * attached to it */ int device_is_attached(device_t dev) { return (dev->state >= DS_ATTACHED); } /** * @brief Set the devclass of a device * @see devclass_add_device(). */ int device_set_devclass(device_t dev, const char *classname) { devclass_t dc; int error; if (!classname) { if (dev->devclass) devclass_delete_device(dev->devclass, dev); return (0); } if (dev->devclass) { printf("device_set_devclass: device class already set\n"); return (EINVAL); } dc = devclass_find_internal(classname, NULL, TRUE); if (!dc) return (ENOMEM); error = devclass_add_device(dc, dev); bus_data_generation_update(); return (error); } /** * @brief Set the driver of a device * * @retval 0 success * @retval EBUSY the device already has a driver attached * @retval ENOMEM a memory allocation failure occurred */ int device_set_driver(device_t dev, driver_t *driver) { if (dev->state >= DS_ATTACHED) return (EBUSY); if (dev->driver == driver) return (0); if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC)) { free(dev->softc, M_BUS_SC); dev->softc = NULL; } device_set_desc(dev, NULL); kobj_delete((kobj_t) dev, NULL); dev->driver = driver; if (driver) { kobj_init((kobj_t) dev, (kobj_class_t) driver); if (!(dev->flags & DF_EXTERNALSOFTC) && driver->size > 0) { dev->softc = malloc(driver->size, M_BUS_SC, M_NOWAIT | M_ZERO); if (!dev->softc) { kobj_delete((kobj_t) dev, NULL); kobj_init((kobj_t) dev, &null_class); dev->driver = NULL; return (ENOMEM); } } } else { kobj_init((kobj_t) dev, &null_class); } bus_data_generation_update(); return (0); } /** * @brief Probe a device, and return this status. * * This function is the core of the device autoconfiguration * system. Its purpose is to select a suitable driver for a device and * then call that driver to initialise the hardware appropriately. The * driver is selected by calling the DEVICE_PROBE() method of a set of * candidate drivers and then choosing the driver which returned the * best value. This driver is then attached to the device using * device_attach(). * * The set of suitable drivers is taken from the list of drivers in * the parent device's devclass. If the device was originally created * with a specific class name (see device_add_child()), only drivers * with that name are probed, otherwise all drivers in the devclass * are probed. If no drivers return successful probe values in the * parent devclass, the search continues in the parent of that * devclass (see devclass_get_parent()) if any. * * @param dev the device to initialise * * @retval 0 success * @retval ENXIO no driver was found * @retval ENOMEM memory allocation failure * @retval non-zero some other unix error code * @retval -1 Device already attached */ int device_probe(device_t dev) { int error; GIANT_REQUIRED; if (dev->state >= DS_ALIVE && (dev->flags & DF_REBID) == 0) return (-1); if (!(dev->flags & DF_ENABLED)) { if (bootverbose && device_get_name(dev) != NULL) { device_print_prettyname(dev); printf("not probed (disabled)\n"); } return (-1); } if ((error = device_probe_child(dev->parent, dev)) != 0) { if (bus_current_pass == BUS_PASS_DEFAULT && !(dev->flags & DF_DONENOMATCH)) { BUS_PROBE_NOMATCH(dev->parent, dev); devnomatch(dev); dev->flags |= DF_DONENOMATCH; } return (error); } return (0); } /** * @brief Probe a device and attach a driver if possible * * calls device_probe() and attaches if that was successful. */ int device_probe_and_attach(device_t dev) { int error; GIANT_REQUIRED; error = device_probe(dev); if (error == -1) return (0); else if (error != 0) return (error); CURVNET_SET_QUIET(vnet0); error = device_attach(dev); CURVNET_RESTORE(); return error; } /** * @brief Attach a device driver to a device * * This function is a wrapper around the DEVICE_ATTACH() driver * method. In addition to calling DEVICE_ATTACH(), it initialises the * device's sysctl tree, optionally prints a description of the device * and queues a notification event for user-based device management * services. * * Normally this function is only called internally from * device_probe_and_attach(). * * @param dev the device to initialise * * @retval 0 success * @retval ENXIO no driver was found * @retval ENOMEM memory allocation failure * @retval non-zero some other unix error code */ int device_attach(device_t dev) { uint64_t attachtime; int error; if (resource_disabled(dev->driver->name, dev->unit)) { device_disable(dev); if (bootverbose) device_printf(dev, "disabled via hints entry\n"); return (ENXIO); } device_sysctl_init(dev); if (!device_is_quiet(dev)) device_print_child(dev->parent, dev); attachtime = get_cyclecount(); dev->state = DS_ATTACHING; if ((error = DEVICE_ATTACH(dev)) != 0) { printf("device_attach: %s%d attach returned %d\n", dev->driver->name, dev->unit, error); if (!(dev->flags & DF_FIXEDCLASS)) devclass_delete_device(dev->devclass, dev); (void)device_set_driver(dev, NULL); device_sysctl_fini(dev); KASSERT(dev->busy == 0, ("attach failed but busy")); dev->state = DS_NOTPRESENT; return (error); } attachtime = get_cyclecount() - attachtime; /* * 4 bits per device is a reasonable value for desktop and server * hardware with good get_cyclecount() implementations, but may * need to be adjusted on other platforms. */ #ifdef RANDOM_DEBUG printf("random: %s(): feeding %d bit(s) of entropy from %s%d\n", __func__, 4, dev->driver->name, dev->unit); #endif random_harvest(&attachtime, sizeof(attachtime), 4, RANDOM_ATTACH); device_sysctl_update(dev); if (dev->busy) dev->state = DS_BUSY; else dev->state = DS_ATTACHED; dev->flags &= ~DF_DONENOMATCH; devadded(dev); return (0); } /** * @brief Detach a driver from a device * * This function is a wrapper around the DEVICE_DETACH() driver * method. If the call to DEVICE_DETACH() succeeds, it calls * BUS_CHILD_DETACHED() for the parent of @p dev, queues a * notification event for user-based device management services and * cleans up the device's sysctl tree. * * @param dev the device to un-initialise * * @retval 0 success * @retval ENXIO no driver was found * @retval ENOMEM memory allocation failure * @retval non-zero some other unix error code */ int device_detach(device_t dev) { int error; GIANT_REQUIRED; PDEBUG(("%s", DEVICENAME(dev))); if (dev->state == DS_BUSY) return (EBUSY); if (dev->state != DS_ATTACHED) return (0); if ((error = DEVICE_DETACH(dev)) != 0) return (error); devremoved(dev); if (!device_is_quiet(dev)) device_printf(dev, "detached\n"); if (dev->parent) BUS_CHILD_DETACHED(dev->parent, dev); if (!(dev->flags & DF_FIXEDCLASS)) devclass_delete_device(dev->devclass, dev); dev->state = DS_NOTPRESENT; (void)device_set_driver(dev, NULL); device_sysctl_fini(dev); return (0); } /** * @brief Tells a driver to quiesce itself. * * This function is a wrapper around the DEVICE_QUIESCE() driver * method. If the call to DEVICE_QUIESCE() succeeds. * * @param dev the device to quiesce * * @retval 0 success * @retval ENXIO no driver was found * @retval ENOMEM memory allocation failure * @retval non-zero some other unix error code */ int device_quiesce(device_t dev) { PDEBUG(("%s", DEVICENAME(dev))); if (dev->state == DS_BUSY) return (EBUSY); if (dev->state != DS_ATTACHED) return (0); return (DEVICE_QUIESCE(dev)); } /** * @brief Notify a device of system shutdown * * This function calls the DEVICE_SHUTDOWN() driver method if the * device currently has an attached driver. * * @returns the value returned by DEVICE_SHUTDOWN() */ int device_shutdown(device_t dev) { if (dev->state < DS_ATTACHED) return (0); return (DEVICE_SHUTDOWN(dev)); } /** * @brief Set the unit number of a device * * This function can be used to override the unit number used for a * device (e.g. to wire a device to a pre-configured unit number). */ int device_set_unit(device_t dev, int unit) { devclass_t dc; int err; dc = device_get_devclass(dev); if (unit < dc->maxunit && dc->devices[unit]) return (EBUSY); err = devclass_delete_device(dc, dev); if (err) return (err); dev->unit = unit; err = devclass_add_device(dc, dev); if (err) return (err); bus_data_generation_update(); return (0); } /*======================================*/ /* * Some useful method implementations to make life easier for bus drivers. */ /** * @brief Initialise a resource list. * * @param rl the resource list to initialise */ void resource_list_init(struct resource_list *rl) { STAILQ_INIT(rl); } /** * @brief Reclaim memory used by a resource list. * * This function frees the memory for all resource entries on the list * (if any). * * @param rl the resource list to free */ void resource_list_free(struct resource_list *rl) { struct resource_list_entry *rle; while ((rle = STAILQ_FIRST(rl)) != NULL) { if (rle->res) panic("resource_list_free: resource entry is busy"); STAILQ_REMOVE_HEAD(rl, link); free(rle, M_BUS); } } /** * @brief Add a resource entry. * * This function adds a resource entry using the given @p type, @p * start, @p end and @p count values. A rid value is chosen by * searching sequentially for the first unused rid starting at zero. * * @param rl the resource list to edit * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param start the start address of the resource * @param end the end address of the resource * @param count XXX end-start+1 */ int resource_list_add_next(struct resource_list *rl, int type, u_long start, u_long end, u_long count) { int rid; rid = 0; while (resource_list_find(rl, type, rid) != NULL) rid++; resource_list_add(rl, type, rid, start, end, count); return (rid); } /** * @brief Add or modify a resource entry. * * If an existing entry exists with the same type and rid, it will be * modified using the given values of @p start, @p end and @p * count. If no entry exists, a new one will be created using the * given values. The resource list entry that matches is then returned. * * @param rl the resource list to edit * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param rid the resource identifier * @param start the start address of the resource * @param end the end address of the resource * @param count XXX end-start+1 */ struct resource_list_entry * resource_list_add(struct resource_list *rl, int type, int rid, u_long start, u_long end, u_long count) { struct resource_list_entry *rle; rle = resource_list_find(rl, type, rid); if (!rle) { rle = malloc(sizeof(struct resource_list_entry), M_BUS, M_NOWAIT); if (!rle) panic("resource_list_add: can't record entry"); STAILQ_INSERT_TAIL(rl, rle, link); rle->type = type; rle->rid = rid; rle->res = NULL; rle->flags = 0; } if (rle->res) panic("resource_list_add: resource entry is busy"); rle->start = start; rle->end = end; rle->count = count; return (rle); } /** * @brief Determine if a resource entry is busy. * * Returns true if a resource entry is busy meaning that it has an * associated resource that is not an unallocated "reserved" resource. * * @param rl the resource list to search * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param rid the resource identifier * * @returns Non-zero if the entry is busy, zero otherwise. */ int resource_list_busy(struct resource_list *rl, int type, int rid) { struct resource_list_entry *rle; rle = resource_list_find(rl, type, rid); if (rle == NULL || rle->res == NULL) return (0); if ((rle->flags & (RLE_RESERVED | RLE_ALLOCATED)) == RLE_RESERVED) { KASSERT(!(rman_get_flags(rle->res) & RF_ACTIVE), ("reserved resource is active")); return (0); } return (1); } /** * @brief Determine if a resource entry is reserved. * * Returns true if a resource entry is reserved meaning that it has an * associated "reserved" resource. The resource can either be * allocated or unallocated. * * @param rl the resource list to search * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param rid the resource identifier * * @returns Non-zero if the entry is reserved, zero otherwise. */ int resource_list_reserved(struct resource_list *rl, int type, int rid) { struct resource_list_entry *rle; rle = resource_list_find(rl, type, rid); if (rle != NULL && rle->flags & RLE_RESERVED) return (1); return (0); } /** * @brief Find a resource entry by type and rid. * * @param rl the resource list to search * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param rid the resource identifier * * @returns the resource entry pointer or NULL if there is no such * entry. */ struct resource_list_entry * resource_list_find(struct resource_list *rl, int type, int rid) { struct resource_list_entry *rle; STAILQ_FOREACH(rle, rl, link) { if (rle->type == type && rle->rid == rid) return (rle); } return (NULL); } /** * @brief Delete a resource entry. * * @param rl the resource list to edit * @param type the resource entry type (e.g. SYS_RES_MEMORY) * @param rid the resource identifier */ void resource_list_delete(struct resource_list *rl, int type, int rid) { struct resource_list_entry *rle = resource_list_find(rl, type, rid); if (rle) { if (rle->res != NULL) panic("resource_list_delete: resource has not been released"); STAILQ_REMOVE(rl, rle, resource_list_entry, link); free(rle, M_BUS); } } /** * @brief Allocate a reserved resource * * This can be used by busses to force the allocation of resources * that are always active in the system even if they are not allocated * by a driver (e.g. PCI BARs). This function is usually called when * adding a new child to the bus. The resource is allocated from the * parent bus when it is reserved. The resource list entry is marked * with RLE_RESERVED to note that it is a reserved resource. * * Subsequent attempts to allocate the resource with * resource_list_alloc() will succeed the first time and will set * RLE_ALLOCATED to note that it has been allocated. When a reserved * resource that has been allocated is released with * resource_list_release() the resource RLE_ALLOCATED is cleared, but * the actual resource remains allocated. The resource can be released to * the parent bus by calling resource_list_unreserve(). * * @param rl the resource list to allocate from * @param bus the parent device of @p child * @param child the device for which the resource is being reserved * @param type the type of resource to allocate * @param rid a pointer to the resource identifier * @param start hint at the start of the resource range - pass * @c 0UL for any start address * @param end hint at the end of the resource range - pass * @c ~0UL for any end address * @param count hint at the size of range required - pass @c 1 * for any size * @param flags any extra flags to control the resource * allocation - see @c RF_XXX flags in * for details * * @returns the resource which was allocated or @c NULL if no * resource could be allocated */ struct resource * resource_list_reserve(struct resource_list *rl, device_t bus, device_t child, int type, int *rid, u_long start, u_long end, u_long count, u_int flags) { struct resource_list_entry *rle = NULL; int passthrough = (device_get_parent(child) != bus); struct resource *r; if (passthrough) panic( "resource_list_reserve() should only be called for direct children"); if (flags & RF_ACTIVE) panic( "resource_list_reserve() should only reserve inactive resources"); r = resource_list_alloc(rl, bus, child, type, rid, start, end, count, flags); if (r != NULL) { rle = resource_list_find(rl, type, *rid); rle->flags |= RLE_RESERVED; } return (r); } /** * @brief Helper function for implementing BUS_ALLOC_RESOURCE() * * Implement BUS_ALLOC_RESOURCE() by looking up a resource from the list * and passing the allocation up to the parent of @p bus. This assumes * that the first entry of @c device_get_ivars(child) is a struct * resource_list. This also handles 'passthrough' allocations where a * child is a remote descendant of bus by passing the allocation up to * the parent of bus. * * Typically, a bus driver would store a list of child resources * somewhere in the child device's ivars (see device_get_ivars()) and * its implementation of BUS_ALLOC_RESOURCE() would find that list and * then call resource_list_alloc() to perform the allocation. * * @param rl the resource list to allocate from * @param bus the parent device of @p child * @param child the device which is requesting an allocation * @param type the type of resource to allocate * @param rid a pointer to the resource identifier * @param start hint at the start of the resource range - pass * @c 0UL for any start address * @param end hint at the end of the resource range - pass * @c ~0UL for any end address * @param count hint at the size of range required - pass @c 1 * for any size * @param flags any extra flags to control the resource * allocation - see @c RF_XXX flags in * for details * * @returns the resource which was allocated or @c NULL if no * resource could be allocated */ struct resource * resource_list_alloc(struct resource_list *rl, device_t bus, device_t child, int type, int *rid, u_long start, u_long end, u_long count, u_int flags) { struct resource_list_entry *rle = NULL; int passthrough = (device_get_parent(child) != bus); int isdefault = (start == 0UL && end == ~0UL); if (passthrough) { return (BUS_ALLOC_RESOURCE(device_get_parent(bus), child, type, rid, start, end, count, flags)); } rle = resource_list_find(rl, type, *rid); if (!rle) return (NULL); /* no resource of that type/rid */ if (rle->res) { if (rle->flags & RLE_RESERVED) { if (rle->flags & RLE_ALLOCATED) return (NULL); if ((flags & RF_ACTIVE) && bus_activate_resource(child, type, *rid, rle->res) != 0) return (NULL); rle->flags |= RLE_ALLOCATED; return (rle->res); } device_printf(bus, "resource entry %#x type %d for child %s is busy\n", *rid, type, device_get_nameunit(child)); return (NULL); } if (isdefault) { start = rle->start; count = ulmax(count, rle->count); end = ulmax(rle->end, start + count - 1); } rle->res = BUS_ALLOC_RESOURCE(device_get_parent(bus), child, type, rid, start, end, count, flags); /* * Record the new range. */ if (rle->res) { rle->start = rman_get_start(rle->res); rle->end = rman_get_end(rle->res); rle->count = count; } return (rle->res); } /** * @brief Helper function for implementing BUS_RELEASE_RESOURCE() * * Implement BUS_RELEASE_RESOURCE() using a resource list. Normally * used with resource_list_alloc(). * * @param rl the resource list which was allocated from * @param bus the parent device of @p child * @param child the device which is requesting a release * @param type the type of resource to release * @param rid the resource identifier * @param res the resource to release * * @retval 0 success * @retval non-zero a standard unix error code indicating what * error condition prevented the operation */ int resource_list_release(struct resource_list *rl, device_t bus, device_t child, int type, int rid, struct resource *res) { struct resource_list_entry *rle = NULL; int passthrough = (device_get_parent(child) != bus); int error; if (passthrough) { return (BUS_RELEASE_RESOURCE(device_get_parent(bus), child, type, rid, res)); } rle = resource_list_find(rl, type, rid); if (!rle) panic("resource_list_release: can't find resource"); if (!rle->res) panic("resource_list_release: resource entry is not busy"); if (rle->flags & RLE_RESERVED) { if (rle->flags & RLE_ALLOCATED) { if (rman_get_flags(res) & RF_ACTIVE) { error = bus_deactivate_resource(child, type, rid, res); if (error) return (error); } rle->flags &= ~RLE_ALLOCATED; return (0); } return (EINVAL); } error = BUS_RELEASE_RESOURCE(device_get_parent(bus), child, type, rid, res); if (error) return (error); rle->res = NULL; return (0); } /** * @brief Release all active resources of a given type * * Release all active resources of a specified type. This is intended * to be used to cleanup resources leaked by a driver after detach or * a failed attach. * * @param rl the resource list which was allocated from * @param bus the parent device of @p child * @param child the device whose active resources are being released * @param type the type of resources to release * * @retval 0 success * @retval EBUSY at least one resource was active */ int resource_list_release_active(struct resource_list *rl, device_t bus, device_t child, int type) { struct resource_list_entry *rle; int error, retval; retval = 0; STAILQ_FOREACH(rle, rl, link) { if (rle->type != type) continue; if (rle->res == NULL) continue; if ((rle->flags & (RLE_RESERVED | RLE_ALLOCATED)) == RLE_RESERVED) continue; retval = EBUSY; error = resource_list_release(rl, bus, child, type, rman_get_rid(rle->res), rle->res); if (error != 0) device_printf(bus, "Failed to release active resource: %d\n", error); } return (retval); } /** * @brief Fully release a reserved resource * * Fully releases a resource reserved via resource_list_reserve(). * * @param rl the resource list which was allocated from * @param bus the parent device of @p child * @param child the device whose reserved resource is being released * @param type the type of resource to release * @param rid the resource identifier * @param res the resource to release * * @retval 0 success * @retval non-zero a standard unix error code indicating what * error condition prevented the operation */ int resource_list_unreserve(struct resource_list *rl, device_t bus, device_t child, int type, int rid) { struct resource_list_entry *rle = NULL; int passthrough = (device_get_parent(child) != bus); if (passthrough) panic( "resource_list_unreserve() should only be called for direct children"); rle = resource_list_find(rl, type, rid); if (!rle) panic("resource_list_unreserve: can't find resource"); if (!(rle->flags & RLE_RESERVED)) return (EINVAL); if (rle->flags & RLE_ALLOCATED) return (EBUSY); rle->flags &= ~RLE_RESERVED; return (resource_list_release(rl, bus, child, type, rid, rle->res)); } /** * @brief Print a description of resources in a resource list * * Print all resources of a specified type, for use in BUS_PRINT_CHILD(). * The name is printed if at least one resource of the given type is available. * The format is used to print resource start and end. * * @param rl the resource list to print * @param name the name of @p type, e.g. @c "memory" * @param type type type of resource entry to print * @param format printf(9) format string to print resource * start and end values * * @returns the number of characters printed */ int resource_list_print_type(struct resource_list *rl, const char *name, int type, const char *format) { struct resource_list_entry *rle; int printed, retval; printed = 0; retval = 0; /* Yes, this is kinda cheating */ STAILQ_FOREACH(rle, rl, link) { if (rle->type == type) { if (printed == 0) retval += printf(" %s ", name); else retval += printf(","); printed++; retval += printf(format, rle->start); if (rle->count > 1) { retval += printf("-"); retval += printf(format, rle->start + rle->count - 1); } } } return (retval); } /** * @brief Releases all the resources in a list. * * @param rl The resource list to purge. * * @returns nothing */ void resource_list_purge(struct resource_list *rl) { struct resource_list_entry *rle; while ((rle = STAILQ_FIRST(rl)) != NULL) { if (rle->res) bus_release_resource(rman_get_device(rle->res), rle->type, rle->rid, rle->res); STAILQ_REMOVE_HEAD(rl, link); free(rle, M_BUS); } } device_t bus_generic_add_child(device_t dev, u_int order, const char *name, int unit) { return (device_add_child_ordered(dev, order, name, unit)); } /** * @brief Helper function for implementing DEVICE_PROBE() * * This function can be used to help implement the DEVICE_PROBE() for * a bus (i.e. a device which has other devices attached to it). It * calls the DEVICE_IDENTIFY() method of each driver in the device's * devclass. */ int bus_generic_probe(device_t dev) { devclass_t dc = dev->devclass; driverlink_t dl; TAILQ_FOREACH(dl, &dc->drivers, link) { /* * If this driver's pass is too high, then ignore it. * For most drivers in the default pass, this will * never be true. For early-pass drivers they will * only call the identify routines of eligible drivers * when this routine is called. Drivers for later * passes should have their identify routines called * on early-pass busses during BUS_NEW_PASS(). */ if (dl->pass > bus_current_pass) continue; DEVICE_IDENTIFY(dl->driver, dev); } return (0); } /** * @brief Helper function for implementing DEVICE_ATTACH() * * This function can be used to help implement the DEVICE_ATTACH() for * a bus. It calls device_probe_and_attach() for each of the device's * children. */ int bus_generic_attach(device_t dev) { device_t child; TAILQ_FOREACH(child, &dev->children, link) { device_probe_and_attach(child); } return (0); } /** * @brief Helper function for implementing DEVICE_DETACH() * * This function can be used to help implement the DEVICE_DETACH() for * a bus. It calls device_detach() for each of the device's * children. */ int bus_generic_detach(device_t dev) { device_t child; int error; if (dev->state != DS_ATTACHED) return (EBUSY); TAILQ_FOREACH(child, &dev->children, link) { if ((error = device_detach(child)) != 0) return (error); } return (0); } /** * @brief Helper function for implementing DEVICE_SHUTDOWN() * * This function can be used to help implement the DEVICE_SHUTDOWN() * for a bus. It calls device_shutdown() for each of the device's * children. */ int bus_generic_shutdown(device_t dev) { device_t child; TAILQ_FOREACH(child, &dev->children, link) { device_shutdown(child); } return (0); } /** * @brief Default function for suspending a child device. * * This function is to be used by a bus's DEVICE_SUSPEND_CHILD(). */ int bus_generic_suspend_child(device_t dev, device_t child) { int error; error = DEVICE_SUSPEND(child); if (error == 0) - dev->flags |= DF_SUSPENDED; + child->flags |= DF_SUSPENDED; return (error); } /** * @brief Default function for resuming a child device. * * This function is to be used by a bus's DEVICE_RESUME_CHILD(). */ int bus_generic_resume_child(device_t dev, device_t child) { DEVICE_RESUME(child); - dev->flags &= ~DF_SUSPENDED; + child->flags &= ~DF_SUSPENDED; return (0); } /** * @brief Helper function for implementing DEVICE_SUSPEND() * * This function can be used to help implement the DEVICE_SUSPEND() * for a bus. It calls DEVICE_SUSPEND() for each of the device's * children. If any call to DEVICE_SUSPEND() fails, the suspend * operation is aborted and any devices which were suspended are * resumed immediately by calling their DEVICE_RESUME() methods. */ int bus_generic_suspend(device_t dev) { int error; device_t child, child2; TAILQ_FOREACH(child, &dev->children, link) { error = BUS_SUSPEND_CHILD(dev, child); if (error) { for (child2 = TAILQ_FIRST(&dev->children); child2 && child2 != child; child2 = TAILQ_NEXT(child2, link)) BUS_RESUME_CHILD(dev, child2); return (error); } } return (0); } /** * @brief Helper function for implementing DEVICE_RESUME() * * This function can be used to help implement the DEVICE_RESUME() for * a bus. It calls DEVICE_RESUME() on each of the device's children. */ int bus_generic_resume(device_t dev) { device_t child; TAILQ_FOREACH(child, &dev->children, link) { BUS_RESUME_CHILD(dev, child); /* if resume fails, there's nothing we can usefully do... */ } return (0); } /** * @brief Helper function for implementing BUS_PRINT_CHILD(). * * This function prints the first part of the ascii representation of * @p child, including its name, unit and description (if any - see * device_set_desc()). * * @returns the number of characters printed */ int bus_print_child_header(device_t dev, device_t child) { int retval = 0; if (device_get_desc(child)) { retval += device_printf(child, "<%s>", device_get_desc(child)); } else { retval += printf("%s", device_get_nameunit(child)); } return (retval); } /** * @brief Helper function for implementing BUS_PRINT_CHILD(). * * This function prints the last part of the ascii representation of * @p child, which consists of the string @c " on " followed by the * name and unit of the @p dev. * * @returns the number of characters printed */ int bus_print_child_footer(device_t dev, device_t child) { return (printf(" on %s\n", device_get_nameunit(dev))); } /** * @brief Helper function for implementing BUS_PRINT_CHILD(). * * This function prints out the VM domain for the given device. * * @returns the number of characters printed */ int bus_print_child_domain(device_t dev, device_t child) { int domain; /* No domain? Don't print anything */ if (BUS_GET_DOMAIN(dev, child, &domain) != 0) return (0); return (printf(" numa-domain %d", domain)); } /** * @brief Helper function for implementing BUS_PRINT_CHILD(). * * This function simply calls bus_print_child_header() followed by * bus_print_child_footer(). * * @returns the number of characters printed */ int bus_generic_print_child(device_t dev, device_t child) { int retval = 0; retval += bus_print_child_header(dev, child); retval += bus_print_child_domain(dev, child); retval += bus_print_child_footer(dev, child); return (retval); } /** * @brief Stub function for implementing BUS_READ_IVAR(). * * @returns ENOENT */ int bus_generic_read_ivar(device_t dev, device_t child, int index, uintptr_t * result) { return (ENOENT); } /** * @brief Stub function for implementing BUS_WRITE_IVAR(). * * @returns ENOENT */ int bus_generic_write_ivar(device_t dev, device_t child, int index, uintptr_t value) { return (ENOENT); } /** * @brief Stub function for implementing BUS_GET_RESOURCE_LIST(). * * @returns NULL */ struct resource_list * bus_generic_get_resource_list(device_t dev, device_t child) { return (NULL); } /** * @brief Helper function for implementing BUS_DRIVER_ADDED(). * * This implementation of BUS_DRIVER_ADDED() simply calls the driver's * DEVICE_IDENTIFY() method to allow it to add new children to the bus * and then calls device_probe_and_attach() for each unattached child. */ void bus_generic_driver_added(device_t dev, driver_t *driver) { device_t child; DEVICE_IDENTIFY(driver, dev); TAILQ_FOREACH(child, &dev->children, link) { if (child->state == DS_NOTPRESENT || (child->flags & DF_REBID)) device_probe_and_attach(child); } } /** * @brief Helper function for implementing BUS_NEW_PASS(). * * This implementing of BUS_NEW_PASS() first calls the identify * routines for any drivers that probe at the current pass. Then it * walks the list of devices for this bus. If a device is already * attached, then it calls BUS_NEW_PASS() on that device. If the * device is not already attached, it attempts to attach a driver to * it. */ void bus_generic_new_pass(device_t dev) { driverlink_t dl; devclass_t dc; device_t child; dc = dev->devclass; TAILQ_FOREACH(dl, &dc->drivers, link) { if (dl->pass == bus_current_pass) DEVICE_IDENTIFY(dl->driver, dev); } TAILQ_FOREACH(child, &dev->children, link) { if (child->state >= DS_ATTACHED) BUS_NEW_PASS(child); else if (child->state == DS_NOTPRESENT) device_probe_and_attach(child); } } /** * @brief Helper function for implementing BUS_SETUP_INTR(). * * This simple implementation of BUS_SETUP_INTR() simply calls the * BUS_SETUP_INTR() method of the parent of @p dev. */ int bus_generic_setup_intr(device_t dev, device_t child, struct resource *irq, int flags, driver_filter_t *filter, driver_intr_t *intr, void *arg, void **cookiep) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_SETUP_INTR(dev->parent, child, irq, flags, filter, intr, arg, cookiep)); return (EINVAL); } /** * @brief Helper function for implementing BUS_TEARDOWN_INTR(). * * This simple implementation of BUS_TEARDOWN_INTR() simply calls the * BUS_TEARDOWN_INTR() method of the parent of @p dev. */ int bus_generic_teardown_intr(device_t dev, device_t child, struct resource *irq, void *cookie) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_TEARDOWN_INTR(dev->parent, child, irq, cookie)); return (EINVAL); } /** * @brief Helper function for implementing BUS_ADJUST_RESOURCE(). * * This simple implementation of BUS_ADJUST_RESOURCE() simply calls the * BUS_ADJUST_RESOURCE() method of the parent of @p dev. */ int bus_generic_adjust_resource(device_t dev, device_t child, int type, struct resource *r, u_long start, u_long end) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_ADJUST_RESOURCE(dev->parent, child, type, r, start, end)); return (EINVAL); } /** * @brief Helper function for implementing BUS_ALLOC_RESOURCE(). * * This simple implementation of BUS_ALLOC_RESOURCE() simply calls the * BUS_ALLOC_RESOURCE() method of the parent of @p dev. */ struct resource * bus_generic_alloc_resource(device_t dev, device_t child, int type, int *rid, u_long start, u_long end, u_long count, u_int flags) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_ALLOC_RESOURCE(dev->parent, child, type, rid, start, end, count, flags)); return (NULL); } /** * @brief Helper function for implementing BUS_RELEASE_RESOURCE(). * * This simple implementation of BUS_RELEASE_RESOURCE() simply calls the * BUS_RELEASE_RESOURCE() method of the parent of @p dev. */ int bus_generic_release_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_RELEASE_RESOURCE(dev->parent, child, type, rid, r)); return (EINVAL); } /** * @brief Helper function for implementing BUS_ACTIVATE_RESOURCE(). * * This simple implementation of BUS_ACTIVATE_RESOURCE() simply calls the * BUS_ACTIVATE_RESOURCE() method of the parent of @p dev. */ int bus_generic_activate_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_ACTIVATE_RESOURCE(dev->parent, child, type, rid, r)); return (EINVAL); } /** * @brief Helper function for implementing BUS_DEACTIVATE_RESOURCE(). * * This simple implementation of BUS_DEACTIVATE_RESOURCE() simply calls the * BUS_DEACTIVATE_RESOURCE() method of the parent of @p dev. */ int bus_generic_deactivate_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_DEACTIVATE_RESOURCE(dev->parent, child, type, rid, r)); return (EINVAL); } /** * @brief Helper function for implementing BUS_BIND_INTR(). * * This simple implementation of BUS_BIND_INTR() simply calls the * BUS_BIND_INTR() method of the parent of @p dev. */ int bus_generic_bind_intr(device_t dev, device_t child, struct resource *irq, int cpu) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_BIND_INTR(dev->parent, child, irq, cpu)); return (EINVAL); } /** * @brief Helper function for implementing BUS_CONFIG_INTR(). * * This simple implementation of BUS_CONFIG_INTR() simply calls the * BUS_CONFIG_INTR() method of the parent of @p dev. */ int bus_generic_config_intr(device_t dev, int irq, enum intr_trigger trig, enum intr_polarity pol) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_CONFIG_INTR(dev->parent, irq, trig, pol)); return (EINVAL); } /** * @brief Helper function for implementing BUS_DESCRIBE_INTR(). * * This simple implementation of BUS_DESCRIBE_INTR() simply calls the * BUS_DESCRIBE_INTR() method of the parent of @p dev. */ int bus_generic_describe_intr(device_t dev, device_t child, struct resource *irq, void *cookie, const char *descr) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent) return (BUS_DESCRIBE_INTR(dev->parent, child, irq, cookie, descr)); return (EINVAL); } /** * @brief Helper function for implementing BUS_GET_DMA_TAG(). * * This simple implementation of BUS_GET_DMA_TAG() simply calls the * BUS_GET_DMA_TAG() method of the parent of @p dev. */ bus_dma_tag_t bus_generic_get_dma_tag(device_t dev, device_t child) { /* Propagate up the bus hierarchy until someone handles it. */ if (dev->parent != NULL) return (BUS_GET_DMA_TAG(dev->parent, child)); return (NULL); } /** * @brief Helper function for implementing BUS_GET_RESOURCE(). * * This implementation of BUS_GET_RESOURCE() uses the * resource_list_find() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list to * search. */ int bus_generic_rl_get_resource(device_t dev, device_t child, int type, int rid, u_long *startp, u_long *countp) { struct resource_list * rl = NULL; struct resource_list_entry * rle = NULL; rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return (EINVAL); rle = resource_list_find(rl, type, rid); if (!rle) return (ENOENT); if (startp) *startp = rle->start; if (countp) *countp = rle->count; return (0); } /** * @brief Helper function for implementing BUS_SET_RESOURCE(). * * This implementation of BUS_SET_RESOURCE() uses the * resource_list_add() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list to * edit. */ int bus_generic_rl_set_resource(device_t dev, device_t child, int type, int rid, u_long start, u_long count) { struct resource_list * rl = NULL; rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return (EINVAL); resource_list_add(rl, type, rid, start, (start + count - 1), count); return (0); } /** * @brief Helper function for implementing BUS_DELETE_RESOURCE(). * * This implementation of BUS_DELETE_RESOURCE() uses the * resource_list_delete() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list to * edit. */ void bus_generic_rl_delete_resource(device_t dev, device_t child, int type, int rid) { struct resource_list * rl = NULL; rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return; resource_list_delete(rl, type, rid); return; } /** * @brief Helper function for implementing BUS_RELEASE_RESOURCE(). * * This implementation of BUS_RELEASE_RESOURCE() uses the * resource_list_release() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list. */ int bus_generic_rl_release_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { struct resource_list * rl = NULL; if (device_get_parent(child) != dev) return (BUS_RELEASE_RESOURCE(device_get_parent(dev), child, type, rid, r)); rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return (EINVAL); return (resource_list_release(rl, dev, child, type, rid, r)); } /** * @brief Helper function for implementing BUS_ALLOC_RESOURCE(). * * This implementation of BUS_ALLOC_RESOURCE() uses the * resource_list_alloc() function to do most of the work. It calls * BUS_GET_RESOURCE_LIST() to find a suitable resource list. */ struct resource * bus_generic_rl_alloc_resource(device_t dev, device_t child, int type, int *rid, u_long start, u_long end, u_long count, u_int flags) { struct resource_list * rl = NULL; if (device_get_parent(child) != dev) return (BUS_ALLOC_RESOURCE(device_get_parent(dev), child, type, rid, start, end, count, flags)); rl = BUS_GET_RESOURCE_LIST(dev, child); if (!rl) return (NULL); return (resource_list_alloc(rl, dev, child, type, rid, start, end, count, flags)); } /** * @brief Helper function for implementing BUS_CHILD_PRESENT(). * * This simple implementation of BUS_CHILD_PRESENT() simply calls the * BUS_CHILD_PRESENT() method of the parent of @p dev. */ int bus_generic_child_present(device_t dev, device_t child) { return (BUS_CHILD_PRESENT(device_get_parent(dev), dev)); } int bus_generic_get_domain(device_t dev, device_t child, int *domain) { if (dev->parent) return (BUS_GET_DOMAIN(dev->parent, dev, domain)); return (ENOENT); } /* * Some convenience functions to make it easier for drivers to use the * resource-management functions. All these really do is hide the * indirection through the parent's method table, making for slightly * less-wordy code. In the future, it might make sense for this code * to maintain some sort of a list of resources allocated by each device. */ int bus_alloc_resources(device_t dev, struct resource_spec *rs, struct resource **res) { int i; for (i = 0; rs[i].type != -1; i++) res[i] = NULL; for (i = 0; rs[i].type != -1; i++) { res[i] = bus_alloc_resource_any(dev, rs[i].type, &rs[i].rid, rs[i].flags); if (res[i] == NULL && !(rs[i].flags & RF_OPTIONAL)) { bus_release_resources(dev, rs, res); return (ENXIO); } } return (0); } void bus_release_resources(device_t dev, const struct resource_spec *rs, struct resource **res) { int i; for (i = 0; rs[i].type != -1; i++) if (res[i] != NULL) { bus_release_resource( dev, rs[i].type, rs[i].rid, res[i]); res[i] = NULL; } } /** * @brief Wrapper function for BUS_ALLOC_RESOURCE(). * * This function simply calls the BUS_ALLOC_RESOURCE() method of the * parent of @p dev. */ struct resource * bus_alloc_resource(device_t dev, int type, int *rid, u_long start, u_long end, u_long count, u_int flags) { if (dev->parent == NULL) return (NULL); return (BUS_ALLOC_RESOURCE(dev->parent, dev, type, rid, start, end, count, flags)); } /** * @brief Wrapper function for BUS_ADJUST_RESOURCE(). * * This function simply calls the BUS_ADJUST_RESOURCE() method of the * parent of @p dev. */ int bus_adjust_resource(device_t dev, int type, struct resource *r, u_long start, u_long end) { if (dev->parent == NULL) return (EINVAL); return (BUS_ADJUST_RESOURCE(dev->parent, dev, type, r, start, end)); } /** * @brief Wrapper function for BUS_ACTIVATE_RESOURCE(). * * This function simply calls the BUS_ACTIVATE_RESOURCE() method of the * parent of @p dev. */ int bus_activate_resource(device_t dev, int type, int rid, struct resource *r) { if (dev->parent == NULL) return (EINVAL); return (BUS_ACTIVATE_RESOURCE(dev->parent, dev, type, rid, r)); } /** * @brief Wrapper function for BUS_DEACTIVATE_RESOURCE(). * * This function simply calls the BUS_DEACTIVATE_RESOURCE() method of the * parent of @p dev. */ int bus_deactivate_resource(device_t dev, int type, int rid, struct resource *r) { if (dev->parent == NULL) return (EINVAL); return (BUS_DEACTIVATE_RESOURCE(dev->parent, dev, type, rid, r)); } /** * @brief Wrapper function for BUS_RELEASE_RESOURCE(). * * This function simply calls the BUS_RELEASE_RESOURCE() method of the * parent of @p dev. */ int bus_release_resource(device_t dev, int type, int rid, struct resource *r) { if (dev->parent == NULL) return (EINVAL); return (BUS_RELEASE_RESOURCE(dev->parent, dev, type, rid, r)); } /** * @brief Wrapper function for BUS_SETUP_INTR(). * * This function simply calls the BUS_SETUP_INTR() method of the * parent of @p dev. */ int bus_setup_intr(device_t dev, struct resource *r, int flags, driver_filter_t filter, driver_intr_t handler, void *arg, void **cookiep) { int error; if (dev->parent == NULL) return (EINVAL); error = BUS_SETUP_INTR(dev->parent, dev, r, flags, filter, handler, arg, cookiep); if (error != 0) return (error); if (handler != NULL && !(flags & INTR_MPSAFE)) device_printf(dev, "[GIANT-LOCKED]\n"); return (0); } /** * @brief Wrapper function for BUS_TEARDOWN_INTR(). * * This function simply calls the BUS_TEARDOWN_INTR() method of the * parent of @p dev. */ int bus_teardown_intr(device_t dev, struct resource *r, void *cookie) { if (dev->parent == NULL) return (EINVAL); return (BUS_TEARDOWN_INTR(dev->parent, dev, r, cookie)); } /** * @brief Wrapper function for BUS_BIND_INTR(). * * This function simply calls the BUS_BIND_INTR() method of the * parent of @p dev. */ int bus_bind_intr(device_t dev, struct resource *r, int cpu) { if (dev->parent == NULL) return (EINVAL); return (BUS_BIND_INTR(dev->parent, dev, r, cpu)); } /** * @brief Wrapper function for BUS_DESCRIBE_INTR(). * * This function first formats the requested description into a * temporary buffer and then calls the BUS_DESCRIBE_INTR() method of * the parent of @p dev. */ int bus_describe_intr(device_t dev, struct resource *irq, void *cookie, const char *fmt, ...) { va_list ap; char descr[MAXCOMLEN + 1]; if (dev->parent == NULL) return (EINVAL); va_start(ap, fmt); vsnprintf(descr, sizeof(descr), fmt, ap); va_end(ap); return (BUS_DESCRIBE_INTR(dev->parent, dev, irq, cookie, descr)); } /** * @brief Wrapper function for BUS_SET_RESOURCE(). * * This function simply calls the BUS_SET_RESOURCE() method of the * parent of @p dev. */ int bus_set_resource(device_t dev, int type, int rid, u_long start, u_long count) { return (BUS_SET_RESOURCE(device_get_parent(dev), dev, type, rid, start, count)); } /** * @brief Wrapper function for BUS_GET_RESOURCE(). * * This function simply calls the BUS_GET_RESOURCE() method of the * parent of @p dev. */ int bus_get_resource(device_t dev, int type, int rid, u_long *startp, u_long *countp) { return (BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid, startp, countp)); } /** * @brief Wrapper function for BUS_GET_RESOURCE(). * * This function simply calls the BUS_GET_RESOURCE() method of the * parent of @p dev and returns the start value. */ u_long bus_get_resource_start(device_t dev, int type, int rid) { u_long start, count; int error; error = BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid, &start, &count); if (error) return (0); return (start); } /** * @brief Wrapper function for BUS_GET_RESOURCE(). * * This function simply calls the BUS_GET_RESOURCE() method of the * parent of @p dev and returns the count value. */ u_long bus_get_resource_count(device_t dev, int type, int rid) { u_long start, count; int error; error = BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid, &start, &count); if (error) return (0); return (count); } /** * @brief Wrapper function for BUS_DELETE_RESOURCE(). * * This function simply calls the BUS_DELETE_RESOURCE() method of the * parent of @p dev. */ void bus_delete_resource(device_t dev, int type, int rid) { BUS_DELETE_RESOURCE(device_get_parent(dev), dev, type, rid); } /** * @brief Wrapper function for BUS_CHILD_PRESENT(). * * This function simply calls the BUS_CHILD_PRESENT() method of the * parent of @p dev. */ int bus_child_present(device_t child) { return (BUS_CHILD_PRESENT(device_get_parent(child), child)); } /** * @brief Wrapper function for BUS_CHILD_PNPINFO_STR(). * * This function simply calls the BUS_CHILD_PNPINFO_STR() method of the * parent of @p dev. */ int bus_child_pnpinfo_str(device_t child, char *buf, size_t buflen) { device_t parent; parent = device_get_parent(child); if (parent == NULL) { *buf = '\0'; return (0); } return (BUS_CHILD_PNPINFO_STR(parent, child, buf, buflen)); } /** * @brief Wrapper function for BUS_CHILD_LOCATION_STR(). * * This function simply calls the BUS_CHILD_LOCATION_STR() method of the * parent of @p dev. */ int bus_child_location_str(device_t child, char *buf, size_t buflen) { device_t parent; parent = device_get_parent(child); if (parent == NULL) { *buf = '\0'; return (0); } return (BUS_CHILD_LOCATION_STR(parent, child, buf, buflen)); } /** * @brief Wrapper function for BUS_GET_DMA_TAG(). * * This function simply calls the BUS_GET_DMA_TAG() method of the * parent of @p dev. */ bus_dma_tag_t bus_get_dma_tag(device_t dev) { device_t parent; parent = device_get_parent(dev); if (parent == NULL) return (NULL); return (BUS_GET_DMA_TAG(parent, dev)); } /** * @brief Wrapper function for BUS_GET_DOMAIN(). * * This function simply calls the BUS_GET_DOMAIN() method of the * parent of @p dev. */ int bus_get_domain(device_t dev, int *domain) { return (BUS_GET_DOMAIN(device_get_parent(dev), dev, domain)); } /* Resume all devices and then notify userland that we're up again. */ static int root_resume(device_t dev) { int error; error = bus_generic_resume(dev); if (error == 0) devctl_notify("kern", "power", "resume", NULL); return (error); } static int root_print_child(device_t dev, device_t child) { int retval = 0; retval += bus_print_child_header(dev, child); retval += printf("\n"); return (retval); } static int root_setup_intr(device_t dev, device_t child, struct resource *irq, int flags, driver_filter_t *filter, driver_intr_t *intr, void *arg, void **cookiep) { /* * If an interrupt mapping gets to here something bad has happened. */ panic("root_setup_intr"); } /* * If we get here, assume that the device is permanant and really is * present in the system. Removable bus drivers are expected to intercept * this call long before it gets here. We return -1 so that drivers that * really care can check vs -1 or some ERRNO returned higher in the food * chain. */ static int root_child_present(device_t dev, device_t child) { return (-1); } static kobj_method_t root_methods[] = { /* Device interface */ KOBJMETHOD(device_shutdown, bus_generic_shutdown), KOBJMETHOD(device_suspend, bus_generic_suspend), KOBJMETHOD(device_resume, root_resume), /* Bus interface */ KOBJMETHOD(bus_print_child, root_print_child), KOBJMETHOD(bus_read_ivar, bus_generic_read_ivar), KOBJMETHOD(bus_write_ivar, bus_generic_write_ivar), KOBJMETHOD(bus_setup_intr, root_setup_intr), KOBJMETHOD(bus_child_present, root_child_present), KOBJMETHOD_END }; static driver_t root_driver = { "root", root_methods, 1, /* no softc */ }; device_t root_bus; devclass_t root_devclass; static int root_bus_module_handler(module_t mod, int what, void* arg) { switch (what) { case MOD_LOAD: TAILQ_INIT(&bus_data_devices); kobj_class_compile((kobj_class_t) &root_driver); root_bus = make_device(NULL, "root", 0); root_bus->desc = "System root bus"; kobj_init((kobj_t) root_bus, (kobj_class_t) &root_driver); root_bus->driver = &root_driver; root_bus->state = DS_ATTACHED; root_devclass = devclass_find_internal("root", NULL, FALSE); devinit(); return (0); case MOD_SHUTDOWN: device_shutdown(root_bus); return (0); default: return (EOPNOTSUPP); } return (0); } static moduledata_t root_bus_mod = { "rootbus", root_bus_module_handler, NULL }; DECLARE_MODULE(rootbus, root_bus_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST); /** * @brief Automatically configure devices * * This function begins the autoconfiguration process by calling * device_probe_and_attach() for each child of the @c root0 device. */ void root_bus_configure(void) { PDEBUG((".")); /* Eventually this will be split up, but this is sufficient for now. */ bus_set_pass(BUS_PASS_DEFAULT); } /** * @brief Module handler for registering device drivers * * This module handler is used to automatically register device * drivers when modules are loaded. If @p what is MOD_LOAD, it calls * devclass_add_driver() for the driver described by the * driver_module_data structure pointed to by @p arg */ int driver_module_handler(module_t mod, int what, void *arg) { struct driver_module_data *dmd; devclass_t bus_devclass; kobj_class_t driver; int error, pass; dmd = (struct driver_module_data *)arg; bus_devclass = devclass_find_internal(dmd->dmd_busname, NULL, TRUE); error = 0; switch (what) { case MOD_LOAD: if (dmd->dmd_chainevh) error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg); pass = dmd->dmd_pass; driver = dmd->dmd_driver; PDEBUG(("Loading module: driver %s on bus %s (pass %d)", DRIVERNAME(driver), dmd->dmd_busname, pass)); error = devclass_add_driver(bus_devclass, driver, pass, dmd->dmd_devclass); break; case MOD_UNLOAD: PDEBUG(("Unloading module: driver %s from bus %s", DRIVERNAME(dmd->dmd_driver), dmd->dmd_busname)); error = devclass_delete_driver(bus_devclass, dmd->dmd_driver); if (!error && dmd->dmd_chainevh) error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg); break; case MOD_QUIESCE: PDEBUG(("Quiesce module: driver %s from bus %s", DRIVERNAME(dmd->dmd_driver), dmd->dmd_busname)); error = devclass_quiesce_driver(bus_devclass, dmd->dmd_driver); if (!error && dmd->dmd_chainevh) error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg); break; default: error = EOPNOTSUPP; break; } return (error); } /** * @brief Enumerate all hinted devices for this bus. * * Walks through the hints for this bus and calls the bus_hinted_child * routine for each one it fines. It searches first for the specific * bus that's being probed for hinted children (eg isa0), and then for * generic children (eg isa). * * @param dev bus device to enumerate */ void bus_enumerate_hinted_children(device_t bus) { int i; const char *dname, *busname; int dunit; /* * enumerate all devices on the specific bus */ busname = device_get_nameunit(bus); i = 0; while (resource_find_match(&i, &dname, &dunit, "at", busname) == 0) BUS_HINTED_CHILD(bus, dname, dunit); /* * and all the generic ones. */ busname = device_get_name(bus); i = 0; while (resource_find_match(&i, &dname, &dunit, "at", busname) == 0) BUS_HINTED_CHILD(bus, dname, dunit); } #ifdef BUS_DEBUG /* the _short versions avoid iteration by not calling anything that prints * more than oneliners. I love oneliners. */ static void print_device_short(device_t dev, int indent) { if (!dev) return; indentprintf(("device %d: <%s> %sparent,%schildren,%s%s%s%s%s,%sivars,%ssoftc,busy=%d\n", dev->unit, dev->desc, (dev->parent? "":"no "), (TAILQ_EMPTY(&dev->children)? "no ":""), (dev->flags&DF_ENABLED? "enabled,":"disabled,"), (dev->flags&DF_FIXEDCLASS? "fixed,":""), (dev->flags&DF_WILDCARD? "wildcard,":""), (dev->flags&DF_DESCMALLOCED? "descmalloced,":""), (dev->flags&DF_REBID? "rebiddable,":""), (dev->ivars? "":"no "), (dev->softc? "":"no "), dev->busy)); } static void print_device(device_t dev, int indent) { if (!dev) return; print_device_short(dev, indent); indentprintf(("Parent:\n")); print_device_short(dev->parent, indent+1); indentprintf(("Driver:\n")); print_driver_short(dev->driver, indent+1); indentprintf(("Devclass:\n")); print_devclass_short(dev->devclass, indent+1); } void print_device_tree_short(device_t dev, int indent) /* print the device and all its children (indented) */ { device_t child; if (!dev) return; print_device_short(dev, indent); TAILQ_FOREACH(child, &dev->children, link) { print_device_tree_short(child, indent+1); } } void print_device_tree(device_t dev, int indent) /* print the device and all its children (indented) */ { device_t child; if (!dev) return; print_device(dev, indent); TAILQ_FOREACH(child, &dev->children, link) { print_device_tree(child, indent+1); } } static void print_driver_short(driver_t *driver, int indent) { if (!driver) return; indentprintf(("driver %s: softc size = %zd\n", driver->name, driver->size)); } static void print_driver(driver_t *driver, int indent) { if (!driver) return; print_driver_short(driver, indent); } static void print_driver_list(driver_list_t drivers, int indent) { driverlink_t driver; TAILQ_FOREACH(driver, &drivers, link) { print_driver(driver->driver, indent); } } static void print_devclass_short(devclass_t dc, int indent) { if ( !dc ) return; indentprintf(("devclass %s: max units = %d\n", dc->name, dc->maxunit)); } static void print_devclass(devclass_t dc, int indent) { int i; if ( !dc ) return; print_devclass_short(dc, indent); indentprintf(("Drivers:\n")); print_driver_list(dc->drivers, indent+1); indentprintf(("Devices:\n")); for (i = 0; i < dc->maxunit; i++) if (dc->devices[i]) print_device(dc->devices[i], indent+1); } void print_devclass_list_short(void) { devclass_t dc; printf("Short listing of devclasses, drivers & devices:\n"); TAILQ_FOREACH(dc, &devclasses, link) { print_devclass_short(dc, 0); } } void print_devclass_list(void) { devclass_t dc; printf("Full listing of devclasses, drivers & devices:\n"); TAILQ_FOREACH(dc, &devclasses, link) { print_devclass(dc, 0); } } #endif /* * User-space access to the device tree. * * We implement a small set of nodes: * * hw.bus Single integer read method to obtain the * current generation count. * hw.bus.devices Reads the entire device tree in flat space. * hw.bus.rman Resource manager interface * * We might like to add the ability to scan devclasses and/or drivers to * determine what else is currently loaded/available. */ static int sysctl_bus(SYSCTL_HANDLER_ARGS) { struct u_businfo ubus; ubus.ub_version = BUS_USER_VERSION; ubus.ub_generation = bus_data_generation; return (SYSCTL_OUT(req, &ubus, sizeof(ubus))); } SYSCTL_NODE(_hw_bus, OID_AUTO, info, CTLFLAG_RW, sysctl_bus, "bus-related data"); static int sysctl_devices(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1; u_int namelen = arg2; int index; struct device *dev; struct u_device udev; /* XXX this is a bit big */ int error; if (namelen != 2) return (EINVAL); if (bus_data_generation_check(name[0])) return (EINVAL); index = name[1]; /* * Scan the list of devices, looking for the requested index. */ TAILQ_FOREACH(dev, &bus_data_devices, devlink) { if (index-- == 0) break; } if (dev == NULL) return (ENOENT); /* * Populate the return array. */ bzero(&udev, sizeof(udev)); udev.dv_handle = (uintptr_t)dev; udev.dv_parent = (uintptr_t)dev->parent; if (dev->nameunit != NULL) strlcpy(udev.dv_name, dev->nameunit, sizeof(udev.dv_name)); if (dev->desc != NULL) strlcpy(udev.dv_desc, dev->desc, sizeof(udev.dv_desc)); if (dev->driver != NULL && dev->driver->name != NULL) strlcpy(udev.dv_drivername, dev->driver->name, sizeof(udev.dv_drivername)); bus_child_pnpinfo_str(dev, udev.dv_pnpinfo, sizeof(udev.dv_pnpinfo)); bus_child_location_str(dev, udev.dv_location, sizeof(udev.dv_location)); udev.dv_devflags = dev->devflags; udev.dv_flags = dev->flags; udev.dv_state = dev->state; error = SYSCTL_OUT(req, &udev, sizeof(udev)); return (error); } SYSCTL_NODE(_hw_bus, OID_AUTO, devices, CTLFLAG_RD, sysctl_devices, "system device tree"); int bus_data_generation_check(int generation) { if (generation != bus_data_generation) return (1); /* XXX generate optimised lists here? */ return (0); } void bus_data_generation_update(void) { bus_data_generation++; } int bus_free_resource(device_t dev, int type, struct resource *r) { if (r == NULL) return (0); return (bus_release_resource(dev, type, rman_get_rid(r), r)); } Index: projects/building-blocks/sys/modules/cxgbe/Makefile =================================================================== --- projects/building-blocks/sys/modules/cxgbe/Makefile (revision 278311) +++ projects/building-blocks/sys/modules/cxgbe/Makefile (revision 278312) @@ -1,25 +1,26 @@ # # $FreeBSD$ # SYSDIR?=${.CURDIR}/../.. .include "${SYSDIR}/conf/kern.opts.mk" SUBDIR= if_cxgbe +SUBDIR+= if_cxl SUBDIR+= t4_firmware SUBDIR+= t5_firmware SUBDIR+= ${_tom} SUBDIR+= ${_iw_cxgbe} .if ${MACHINE_CPUARCH} == "amd64" _tom= tom .if ${MK_OFED} != "no" || defined(ALL_MODULES) _iw_cxgbe= iw_cxgbe .endif .endif .if ${MACHINE_CPUARCH} == "i386" _tom= tom .endif .include Index: projects/building-blocks/sys/modules/cxgbe/if_cxl/Makefile =================================================================== --- projects/building-blocks/sys/modules/cxgbe/if_cxl/Makefile (nonexistent) +++ projects/building-blocks/sys/modules/cxgbe/if_cxl/Makefile (revision 278312) @@ -0,0 +1,11 @@ +# +# $FreeBSD$ +# + +CXGBE= ${.CURDIR}/../../../dev/cxgbe +.PATH: ${CXGBE} + +KMOD= if_cxl +SRCS= if_cxl.c + +.include Property changes on: projects/building-blocks/sys/modules/cxgbe/if_cxl/Makefile ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/sys/netinet6/in6.c =================================================================== --- projects/building-blocks/sys/netinet6/in6.c (revision 278311) +++ projects/building-blocks/sys/netinet6/in6.c (revision 278312) @@ -1,2402 +1,2403 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6.c,v 1.259 2002/01/21 11:37:50 keiichi Exp $ */ /*- * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in.c 8.2 (Berkeley) 11/15/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include VNET_DECLARE(int, icmp6_nodeinfo_oldmcprefix); #define V_icmp6_nodeinfo_oldmcprefix VNET(icmp6_nodeinfo_oldmcprefix) /* * Definitions of some costant IP6 addresses. */ const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; const struct in6_addr in6addr_nodelocal_allnodes = IN6ADDR_NODELOCAL_ALLNODES_INIT; const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT; const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT; const struct in6_addr in6addr_linklocal_allv2routers = IN6ADDR_LINKLOCAL_ALLV2ROUTERS_INIT; const struct in6_addr in6mask0 = IN6MASK0; const struct in6_addr in6mask32 = IN6MASK32; const struct in6_addr in6mask64 = IN6MASK64; const struct in6_addr in6mask96 = IN6MASK96; const struct in6_addr in6mask128 = IN6MASK128; const struct sockaddr_in6 sa6_any = { sizeof(sa6_any), AF_INET6, 0, 0, IN6ADDR_ANY_INIT, 0 }; static int in6_notify_ifa(struct ifnet *, struct in6_ifaddr *, struct in6_aliasreq *, int); static void in6_unlink_ifa(struct in6_ifaddr *, struct ifnet *); static int in6_validate_ifra(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int); static struct in6_ifaddr *in6_alloc_ifa(struct ifnet *, struct in6_aliasreq *, int flags); static int in6_update_ifa_internal(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int, int); static int in6_broadcast_ifa(struct ifnet *, struct in6_aliasreq *, struct in6_ifaddr *, int); #define ifa2ia6(ifa) ((struct in6_ifaddr *)(ifa)) #define ia62ifa(ia6) (&((ia6)->ia_ifa)) void in6_newaddrmsg(struct in6_ifaddr *ia, int cmd) { struct sockaddr_dl gateway; struct sockaddr_in6 mask, addr; struct rtentry rt; /* * initialize for rtmsg generation */ bzero(&gateway, sizeof(gateway)); gateway.sdl_len = sizeof(gateway); gateway.sdl_family = AF_LINK; bzero(&rt, sizeof(rt)); rt.rt_gateway = (struct sockaddr *)&gateway; memcpy(&mask, &ia->ia_prefixmask, sizeof(ia->ia_prefixmask)); memcpy(&addr, &ia->ia_addr, sizeof(ia->ia_addr)); rt_mask(&rt) = (struct sockaddr *)&mask; rt_key(&rt) = (struct sockaddr *)&addr; rt.rt_flags = RTF_HOST | RTF_STATIC; if (cmd == RTM_ADD) rt.rt_flags |= RTF_UP; /* Announce arrival of local address to all FIBs. */ rt_newaddrmsg(cmd, &ia->ia_ifa, 0, &rt); } int in6_mask2len(struct in6_addr *mask, u_char *lim0) { int x = 0, y; u_char *lim = lim0, *p; /* ignore the scope_id part */ if (lim0 == NULL || lim0 - (u_char *)mask > sizeof(*mask)) lim = (u_char *)mask + sizeof(*mask); for (p = (u_char *)mask; p < lim; x++, p++) { if (*p != 0xff) break; } y = 0; if (p < lim) { for (y = 0; y < 8; y++) { if ((*p & (0x80 >> y)) == 0) break; } } /* * when the limit pointer is given, do a stricter check on the * remaining bits. */ if (p < lim) { if (y != 0 && (*p & (0x00ff >> y)) != 0) return (-1); for (p = p + 1; p < lim; p++) if (*p != 0) return (-1); } return x * 8 + y; } #ifdef COMPAT_FREEBSD32 struct in6_ndifreq32 { char ifname[IFNAMSIZ]; uint32_t ifindex; }; #define SIOCGDEFIFACE32_IN6 _IOWR('i', 86, struct in6_ndifreq32) #endif int in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td) { struct in6_ifreq *ifr = (struct in6_ifreq *)data; struct in6_ifaddr *ia = NULL; struct in6_aliasreq *ifra = (struct in6_aliasreq *)data; struct sockaddr_in6 *sa6; int carp_attached = 0; int error; u_long ocmd = cmd; /* * Compat to make pre-10.x ifconfig(8) operable. */ if (cmd == OSIOCAIFADDR_IN6) cmd = SIOCAIFADDR_IN6; switch (cmd) { case SIOCGETSGCNT_IN6: case SIOCGETMIFCNT_IN6: /* * XXX mrt_ioctl has a 3rd, unused, FIB argument in route.c. * We cannot see how that would be needed, so do not adjust the * KPI blindly; more likely should clean up the IPv4 variant. */ return (mrt6_ioctl ? mrt6_ioctl(cmd, data) : EOPNOTSUPP); } switch (cmd) { case SIOCAADDRCTL_POLICY: case SIOCDADDRCTL_POLICY: if (td != NULL) { error = priv_check(td, PRIV_NETINET_ADDRCTRL6); if (error) return (error); } return (in6_src_ioctl(cmd, data)); } if (ifp == NULL) return (EOPNOTSUPP); switch (cmd) { case SIOCSNDFLUSH_IN6: case SIOCSPFXFLUSH_IN6: case SIOCSRTRFLUSH_IN6: case SIOCSDEFIFACE_IN6: case SIOCSIFINFO_FLAGS: case SIOCSIFINFO_IN6: if (td != NULL) { error = priv_check(td, PRIV_NETINET_ND6); if (error) return (error); } /* FALLTHROUGH */ case OSIOCGIFINFO_IN6: case SIOCGIFINFO_IN6: case SIOCGDRLST_IN6: case SIOCGPRLST_IN6: case SIOCGNBRINFO_IN6: case SIOCGDEFIFACE_IN6: return (nd6_ioctl(cmd, data, ifp)); #ifdef COMPAT_FREEBSD32 case SIOCGDEFIFACE32_IN6: { struct in6_ndifreq ndif; struct in6_ndifreq32 *ndif32; error = nd6_ioctl(SIOCGDEFIFACE_IN6, (caddr_t)&ndif, ifp); if (error) return (error); ndif32 = (struct in6_ndifreq32 *)data; ndif32->ifindex = ndif.ifindex; return (0); } #endif } switch (cmd) { case SIOCSIFPREFIX_IN6: case SIOCDIFPREFIX_IN6: case SIOCAIFPREFIX_IN6: case SIOCCIFPREFIX_IN6: case SIOCSGIFPREFIX_IN6: case SIOCGIFPREFIX_IN6: log(LOG_NOTICE, "prefix ioctls are now invalidated. " "please use ifconfig.\n"); return (EOPNOTSUPP); } switch (cmd) { case SIOCSSCOPE6: if (td != NULL) { error = priv_check(td, PRIV_NETINET_SCOPE6); if (error) return (error); } /* FALLTHROUGH */ case SIOCGSCOPE6: case SIOCGSCOPE6DEF: return (scope6_ioctl(cmd, data, ifp)); } /* * Find address for this interface, if it exists. * * In netinet code, we have checked ifra_addr in SIOCSIF*ADDR operation * only, and used the first interface address as the target of other * operations (without checking ifra_addr). This was because netinet * code/API assumed at most 1 interface address per interface. * Since IPv6 allows a node to assign multiple addresses * on a single interface, we almost always look and check the * presence of ifra_addr, and reject invalid ones here. * It also decreases duplicated code among SIOC*_IN6 operations. */ switch (cmd) { case SIOCAIFADDR_IN6: case SIOCSIFPHYADDR_IN6: sa6 = &ifra->ifra_addr; break; case SIOCSIFADDR_IN6: case SIOCGIFADDR_IN6: case SIOCSIFDSTADDR_IN6: case SIOCSIFNETMASK_IN6: case SIOCGIFDSTADDR_IN6: case SIOCGIFNETMASK_IN6: case SIOCDIFADDR_IN6: case SIOCGIFPSRCADDR_IN6: case SIOCGIFPDSTADDR_IN6: case SIOCGIFAFLAG_IN6: case SIOCSNDFLUSH_IN6: case SIOCSPFXFLUSH_IN6: case SIOCSRTRFLUSH_IN6: case SIOCGIFALIFETIME_IN6: case SIOCSIFALIFETIME_IN6: case SIOCGIFSTAT_IN6: case SIOCGIFSTAT_ICMP6: sa6 = &ifr->ifr_addr; break; case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFDSTADDR: case SIOCSIFNETMASK: /* * Although we should pass any non-INET6 ioctl requests * down to driver, we filter some legacy INET requests. * Drivers trust SIOCSIFADDR et al to come from an already * privileged layer, and do not perform any credentials * checks or input validation. */ return (EINVAL); default: sa6 = NULL; break; } if (sa6 && sa6->sin6_family == AF_INET6) { if (sa6->sin6_scope_id != 0) error = sa6_embedscope(sa6, 0); else error = in6_setscope(&sa6->sin6_addr, ifp, NULL); if (error != 0) return (error); if (td != NULL && (error = prison_check_ip6(td->td_ucred, &sa6->sin6_addr)) != 0) return (error); ia = in6ifa_ifpwithaddr(ifp, &sa6->sin6_addr); } else ia = NULL; switch (cmd) { case SIOCSIFADDR_IN6: case SIOCSIFDSTADDR_IN6: case SIOCSIFNETMASK_IN6: /* * Since IPv6 allows a node to assign multiple addresses * on a single interface, SIOCSIFxxx ioctls are deprecated. */ /* we decided to obsolete this command (20000704) */ error = EINVAL; goto out; case SIOCDIFADDR_IN6: /* * for IPv4, we look for existing in_ifaddr here to allow * "ifconfig if0 delete" to remove the first IPv4 address on * the interface. For IPv6, as the spec allows multiple * interface address from the day one, we consider "remove the * first one" semantics to be not preferable. */ if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } /* FALLTHROUGH */ case SIOCAIFADDR_IN6: /* * We always require users to specify a valid IPv6 address for * the corresponding operation. */ if (ifra->ifra_addr.sin6_family != AF_INET6 || ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6)) { error = EAFNOSUPPORT; goto out; } if (td != NULL) { error = priv_check(td, (cmd == SIOCDIFADDR_IN6) ? PRIV_NET_DELIFADDR : PRIV_NET_ADDIFADDR); if (error) goto out; } /* FALLTHROUGH */ case SIOCGIFSTAT_IN6: case SIOCGIFSTAT_ICMP6: if (ifp->if_afdata[AF_INET6] == NULL) { error = EPFNOSUPPORT; goto out; } break; case SIOCGIFADDR_IN6: /* This interface is basically deprecated. use SIOCGIFCONF. */ /* FALLTHROUGH */ case SIOCGIFAFLAG_IN6: case SIOCGIFNETMASK_IN6: case SIOCGIFDSTADDR_IN6: case SIOCGIFALIFETIME_IN6: /* must think again about its semantics */ if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } break; case SIOCSIFALIFETIME_IN6: { struct in6_addrlifetime *lt; if (td != NULL) { error = priv_check(td, PRIV_NETINET_ALIFETIME6); if (error) goto out; } if (ia == NULL) { error = EADDRNOTAVAIL; goto out; } /* sanity for overflow - beware unsigned */ lt = &ifr->ifr_ifru.ifru_lifetime; if (lt->ia6t_vltime != ND6_INFINITE_LIFETIME && lt->ia6t_vltime + time_uptime < time_uptime) { error = EINVAL; goto out; } if (lt->ia6t_pltime != ND6_INFINITE_LIFETIME && lt->ia6t_pltime + time_uptime < time_uptime) { error = EINVAL; goto out; } break; } } switch (cmd) { case SIOCGIFADDR_IN6: ifr->ifr_addr = ia->ia_addr; if ((error = sa6_recoverscope(&ifr->ifr_addr)) != 0) goto out; break; case SIOCGIFDSTADDR_IN6: if ((ifp->if_flags & IFF_POINTOPOINT) == 0) { error = EINVAL; goto out; } /* * XXX: should we check if ifa_dstaddr is NULL and return * an error? */ ifr->ifr_dstaddr = ia->ia_dstaddr; if ((error = sa6_recoverscope(&ifr->ifr_dstaddr)) != 0) goto out; break; case SIOCGIFNETMASK_IN6: ifr->ifr_addr = ia->ia_prefixmask; break; case SIOCGIFAFLAG_IN6: ifr->ifr_ifru.ifru_flags6 = ia->ia6_flags; break; case SIOCGIFSTAT_IN6: COUNTER_ARRAY_COPY(((struct in6_ifextra *) ifp->if_afdata[AF_INET6])->in6_ifstat, &ifr->ifr_ifru.ifru_stat, sizeof(struct in6_ifstat) / sizeof(uint64_t)); break; case SIOCGIFSTAT_ICMP6: COUNTER_ARRAY_COPY(((struct in6_ifextra *) ifp->if_afdata[AF_INET6])->icmp6_ifstat, &ifr->ifr_ifru.ifru_icmp6stat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t)); break; case SIOCGIFALIFETIME_IN6: ifr->ifr_ifru.ifru_lifetime = ia->ia6_lifetime; if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { time_t maxexpire; struct in6_addrlifetime *retlt = &ifr->ifr_ifru.ifru_lifetime; /* * XXX: adjust expiration time assuming time_t is * signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (ia->ia6_lifetime.ia6t_vltime < maxexpire - ia->ia6_updatetime) { retlt->ia6t_expire = ia->ia6_updatetime + ia->ia6_lifetime.ia6t_vltime; } else retlt->ia6t_expire = maxexpire; } if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { time_t maxexpire; struct in6_addrlifetime *retlt = &ifr->ifr_ifru.ifru_lifetime; /* * XXX: adjust expiration time assuming time_t is * signed. */ maxexpire = (-1) & ~((time_t)1 << ((sizeof(maxexpire) * 8) - 1)); if (ia->ia6_lifetime.ia6t_pltime < maxexpire - ia->ia6_updatetime) { retlt->ia6t_preferred = ia->ia6_updatetime + ia->ia6_lifetime.ia6t_pltime; } else retlt->ia6t_preferred = maxexpire; } break; case SIOCSIFALIFETIME_IN6: ia->ia6_lifetime = ifr->ifr_ifru.ifru_lifetime; /* for sanity */ if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_expire = time_uptime + ia->ia6_lifetime.ia6t_vltime; } else ia->ia6_lifetime.ia6t_expire = 0; if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_preferred = time_uptime + ia->ia6_lifetime.ia6t_pltime; } else ia->ia6_lifetime.ia6t_preferred = 0; break; case SIOCAIFADDR_IN6: { struct nd_prefixctl pr0; struct nd_prefix *pr; /* * first, make or update the interface address structure, * and link it to the list. */ if ((error = in6_update_ifa(ifp, ifra, ia, 0)) != 0) goto out; if (ia != NULL) ifa_free(&ia->ia_ifa); if ((ia = in6ifa_ifpwithaddr(ifp, &ifra->ifra_addr.sin6_addr)) == NULL) { /* * this can happen when the user specify the 0 valid * lifetime. */ break; } if (cmd == ocmd && ifra->ifra_vhid > 0) { if (carp_attach_p != NULL) error = (*carp_attach_p)(&ia->ia_ifa, ifra->ifra_vhid); else error = EPROTONOSUPPORT; if (error) goto out; else carp_attached = 1; } /* * then, make the prefix on-link on the interface. * XXX: we'd rather create the prefix before the address, but * we need at least one address to install the corresponding * interface route, so we configure the address first. */ /* * convert mask to prefix length (prefixmask has already * been validated in in6_update_ifa(). */ bzero(&pr0, sizeof(pr0)); pr0.ndpr_ifp = ifp; pr0.ndpr_plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr, NULL); if (pr0.ndpr_plen == 128) { /* we don't need to install a host route. */ goto aifaddr_out; } pr0.ndpr_prefix = ifra->ifra_addr; /* apply the mask for safety. */ IN6_MASK_ADDR(&pr0.ndpr_prefix.sin6_addr, &ifra->ifra_prefixmask.sin6_addr); /* * XXX: since we don't have an API to set prefix (not address) * lifetimes, we just use the same lifetimes as addresses. * The (temporarily) installed lifetimes can be overridden by * later advertised RAs (when accept_rtadv is non 0), which is * an intended behavior. */ pr0.ndpr_raf_onlink = 1; /* should be configurable? */ pr0.ndpr_raf_auto = ((ifra->ifra_flags & IN6_IFF_AUTOCONF) != 0); pr0.ndpr_vltime = ifra->ifra_lifetime.ia6t_vltime; pr0.ndpr_pltime = ifra->ifra_lifetime.ia6t_pltime; /* add the prefix if not yet. */ if ((pr = nd6_prefix_lookup(&pr0)) == NULL) { /* * nd6_prelist_add will install the corresponding * interface route. */ if ((error = nd6_prelist_add(&pr0, NULL, &pr)) != 0) { if (carp_attached) (*carp_detach_p)(&ia->ia_ifa); goto out; } if (pr == NULL) { if (carp_attached) (*carp_detach_p)(&ia->ia_ifa); log(LOG_ERR, "nd6_prelist_add succeeded but " "no prefix\n"); error = EINVAL; goto out; } } /* relate the address to the prefix */ if (ia->ia6_ndpr == NULL) { ia->ia6_ndpr = pr; pr->ndpr_refcnt++; /* * If this is the first autoconf address from the * prefix, create a temporary address as well * (when required). */ if ((ia->ia6_flags & IN6_IFF_AUTOCONF) && V_ip6_use_tempaddr && pr->ndpr_refcnt == 1) { int e; if ((e = in6_tmpifadd(ia, 1, 0)) != 0) { log(LOG_NOTICE, "in6_control: failed " "to create a temporary address, " "errno=%d\n", e); } } } /* * this might affect the status of autoconfigured addresses, * that is, this address might make other addresses detached. */ pfxlist_onlink_check(); aifaddr_out: if (error != 0 || ia == NULL) break; /* * Try to clear the flag when a new IPv6 address is added * onto an IFDISABLED interface and it succeeds. */ if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) { struct in6_ndireq nd; memset(&nd, 0, sizeof(nd)); nd.ndi.flags = ND_IFINFO(ifp)->flags; nd.ndi.flags &= ~ND6_IFF_IFDISABLED; if (nd6_ioctl(SIOCSIFINFO_FLAGS, (caddr_t)&nd, ifp) < 0) log(LOG_NOTICE, "SIOCAIFADDR_IN6: " "SIOCSIFINFO_FLAGS for -ifdisabled " "failed."); /* * Ignore failure of clearing the flag intentionally. * The failure means address duplication was detected. */ } EVENTHANDLER_INVOKE(ifaddr_event, ifp); break; } case SIOCDIFADDR_IN6: { struct nd_prefix *pr; /* * If the address being deleted is the only one that owns * the corresponding prefix, expire the prefix as well. * XXX: theoretically, we don't have to worry about such * relationship, since we separate the address management * and the prefix management. We do this, however, to provide * as much backward compatibility as possible in terms of * the ioctl operation. * Note that in6_purgeaddr() will decrement ndpr_refcnt. */ pr = ia->ia6_ndpr; in6_purgeaddr(&ia->ia_ifa); if (pr && pr->ndpr_refcnt == 0) prelist_remove(pr); EVENTHANDLER_INVOKE(ifaddr_event, ifp); break; } default: if (ifp->if_ioctl == NULL) { error = EOPNOTSUPP; goto out; } error = (*ifp->if_ioctl)(ifp, cmd, data); goto out; } error = 0; out: if (ia != NULL) ifa_free(&ia->ia_ifa); return (error); } /* * Join necessary multicast groups. Factored out from in6_update_ifa(). * This entire work should only be done once, for the default FIB. */ static int in6_update_ifa_join_mc(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags, struct in6_multi **in6m_sol) { char ip6buf[INET6_ADDRSTRLEN]; struct in6_addr mltaddr; struct in6_multi_mship *imm; int delay, error; KASSERT(in6m_sol != NULL, ("%s: in6m_sol is NULL", __func__)); /* Join solicited multicast addr for new host id. */ bzero(&mltaddr, sizeof(struct in6_addr)); mltaddr.s6_addr32[0] = IPV6_ADDR_INT32_MLL; mltaddr.s6_addr32[2] = htonl(1); mltaddr.s6_addr32[3] = ifra->ifra_addr.sin6_addr.s6_addr32[3]; mltaddr.s6_addr8[12] = 0xff; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) { /* XXX: should not happen */ log(LOG_ERR, "%s: in6_setscope failed\n", __func__); goto cleanup; } delay = error = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * We need a random delay for DAD on the address being * configured. It also means delaying transmission of the * corresponding MLD report to avoid report collision. * [RFC 4861, Section 6.3.7] */ delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); } imm = in6_joingroup(ifp, &mltaddr, &error, delay); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); *in6m_sol = imm->i6mm_maddr; /* * Join link-local all-nodes address. */ mltaddr = in6addr_linklocal_allnodes; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) goto cleanup; /* XXX: should not fail */ imm = in6_joingroup(ifp, &mltaddr, &error, 0); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); /* * Join node information group address. */ delay = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * The spec does not say anything about delay for this group, * but the same logic should apply. */ delay = arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz); } if (in6_nigroup(ifp, NULL, -1, &mltaddr) == 0) { /* XXX jinmei */ imm = in6_joingroup(ifp, &mltaddr, &error, delay); if (imm == NULL) nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); /* XXX not very fatal, go on... */ else LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); } if (V_icmp6_nodeinfo_oldmcprefix && in6_nigroup_oldmcprefix(ifp, NULL, -1, &mltaddr) == 0) { imm = in6_joingroup(ifp, &mltaddr, &error, delay); if (imm == NULL) nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); /* XXX not very fatal, go on... */ else LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); } /* * Join interface-local all-nodes address. * (ff01::1%ifN, and ff01::%ifN/32) */ mltaddr = in6addr_nodelocal_allnodes; if ((error = in6_setscope(&mltaddr, ifp, NULL)) != 0) goto cleanup; /* XXX: should not fail */ imm = in6_joingroup(ifp, &mltaddr, &error, 0); if (imm == NULL) { nd6log((LOG_WARNING, "%s: in6_joingroup failed for %s on %s " "(errno=%d)\n", __func__, ip6_sprintf(ip6buf, &mltaddr), if_name(ifp), error)); goto cleanup; } LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain); cleanup: return (error); } /* * Update parameters of an IPv6 interface address. * If necessary, a new entry is created and linked into address chains. * This function is separated from in6_control(). */ int in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { int error, hostIsNew = 0; if ((error = in6_validate_ifra(ifp, ifra, ia, flags)) != 0) return (error); if (ia == NULL) { hostIsNew = 1; if ((ia = in6_alloc_ifa(ifp, ifra, flags)) == NULL) return (ENOBUFS); } error = in6_update_ifa_internal(ifp, ifra, ia, hostIsNew, flags); if (error != 0) { if (hostIsNew != 0) { in6_unlink_ifa(ia, ifp); ifa_free(&ia->ia_ifa); } return (error); } if (hostIsNew) error = in6_broadcast_ifa(ifp, ifra, ia, flags); return (error); } /* * Fill in basic IPv6 address request info. */ void in6_prepare_ifra(struct in6_aliasreq *ifra, const struct in6_addr *addr, const struct in6_addr *mask) { memset(ifra, 0, sizeof(struct in6_aliasreq)); ifra->ifra_addr.sin6_family = AF_INET6; ifra->ifra_addr.sin6_len = sizeof(struct sockaddr_in6); if (addr != NULL) ifra->ifra_addr.sin6_addr = *addr; ifra->ifra_prefixmask.sin6_family = AF_INET6; ifra->ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6); if (mask != NULL) ifra->ifra_prefixmask.sin6_addr = *mask; } static int in6_validate_ifra(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { int plen = -1; struct sockaddr_in6 dst6; struct in6_addrlifetime *lt; char ip6buf[INET6_ADDRSTRLEN]; /* Validate parameters */ if (ifp == NULL || ifra == NULL) /* this maybe redundant */ return (EINVAL); /* * The destination address for a p2p link must have a family * of AF_UNSPEC or AF_INET6. */ if ((ifp->if_flags & IFF_POINTOPOINT) != 0 && ifra->ifra_dstaddr.sin6_family != AF_INET6 && ifra->ifra_dstaddr.sin6_family != AF_UNSPEC) return (EAFNOSUPPORT); /* * Validate address */ if (ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6) || ifra->ifra_addr.sin6_family != AF_INET6) return (EINVAL); /* * validate ifra_prefixmask. don't check sin6_family, netmask * does not carry fields other than sin6_len. */ if (ifra->ifra_prefixmask.sin6_len > sizeof(struct sockaddr_in6)) return (EINVAL); /* * Because the IPv6 address architecture is classless, we require * users to specify a (non 0) prefix length (mask) for a new address. * We also require the prefix (when specified) mask is valid, and thus * reject a non-consecutive mask. */ if (ia == NULL && ifra->ifra_prefixmask.sin6_len == 0) return (EINVAL); if (ifra->ifra_prefixmask.sin6_len != 0) { plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr, (u_char *)&ifra->ifra_prefixmask + ifra->ifra_prefixmask.sin6_len); if (plen <= 0) return (EINVAL); } else { /* * In this case, ia must not be NULL. We just use its prefix * length. */ plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); } /* * If the destination address on a p2p interface is specified, * and the address is a scoped one, validate/set the scope * zone identifier. */ dst6 = ifra->ifra_dstaddr; if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) != 0 && (dst6.sin6_family == AF_INET6)) { struct in6_addr in6_tmp; u_int32_t zoneid; in6_tmp = dst6.sin6_addr; if (in6_setscope(&in6_tmp, ifp, &zoneid)) return (EINVAL); /* XXX: should be impossible */ if (dst6.sin6_scope_id != 0) { if (dst6.sin6_scope_id != zoneid) return (EINVAL); } else /* user omit to specify the ID. */ dst6.sin6_scope_id = zoneid; /* convert into the internal form */ if (sa6_embedscope(&dst6, 0)) return (EINVAL); /* XXX: should be impossible */ } /* Modify original ifra_dstaddr to reflect changes */ ifra->ifra_dstaddr = dst6; /* * The destination address can be specified only for a p2p or a * loopback interface. If specified, the corresponding prefix length * must be 128. */ if (ifra->ifra_dstaddr.sin6_family == AF_INET6) { if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) == 0) { /* XXX: noisy message */ nd6log((LOG_INFO, "in6_update_ifa: a destination can " "be specified for a p2p or a loopback IF only\n")); return (EINVAL); } if (plen != 128) { nd6log((LOG_INFO, "in6_update_ifa: prefixlen should " "be 128 when dstaddr is specified\n")); return (EINVAL); } } /* lifetime consistency check */ lt = &ifra->ifra_lifetime; if (lt->ia6t_pltime > lt->ia6t_vltime) return (EINVAL); if (lt->ia6t_vltime == 0) { /* * the following log might be noisy, but this is a typical * configuration mistake or a tool's bug. */ nd6log((LOG_INFO, "in6_update_ifa: valid lifetime is 0 for %s\n", ip6_sprintf(ip6buf, &ifra->ifra_addr.sin6_addr))); if (ia == NULL) return (0); /* there's nothing to do */ } /* Check prefix mask */ if (ia != NULL && ifra->ifra_prefixmask.sin6_len != 0) { /* * We prohibit changing the prefix length of an existing * address, because * + such an operation should be rare in IPv6, and * + the operation would confuse prefix management. */ if (ia->ia_prefixmask.sin6_len != 0 && in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL) != plen) { nd6log((LOG_INFO, "in6_validate_ifa: the prefix length " "of an existing %s address should not be changed\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); return (EINVAL); } } return (0); } /* * Allocate a new ifaddr and link it into chains. */ static struct in6_ifaddr * in6_alloc_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, int flags) { struct in6_ifaddr *ia; /* * When in6_alloc_ifa() is called in a process of a received * RA, it is called under an interrupt context. So, we should * call malloc with M_NOWAIT. */ ia = (struct in6_ifaddr *)ifa_alloc(sizeof(*ia), M_NOWAIT); if (ia == NULL) return (NULL); LIST_INIT(&ia->ia6_memberships); /* Initialize the address and masks, and put time stamp */ ia->ia_ifa.ifa_addr = (struct sockaddr *)&ia->ia_addr; ia->ia_addr.sin6_family = AF_INET6; ia->ia_addr.sin6_len = sizeof(ia->ia_addr); /* XXX: Can we assign ,sin6_addr and skip the rest? */ ia->ia_addr = ifra->ifra_addr; ia->ia6_createtime = time_uptime; if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) != 0) { /* * Some functions expect that ifa_dstaddr is not * NULL for p2p interfaces. */ ia->ia_ifa.ifa_dstaddr = (struct sockaddr *)&ia->ia_dstaddr; } else { ia->ia_ifa.ifa_dstaddr = NULL; } /* set prefix mask if any */ ia->ia_ifa.ifa_netmask = (struct sockaddr *)&ia->ia_prefixmask; if (ifra->ifra_prefixmask.sin6_len != 0) { ia->ia_prefixmask.sin6_family = AF_INET6; ia->ia_prefixmask.sin6_len = ifra->ifra_prefixmask.sin6_len; ia->ia_prefixmask.sin6_addr = ifra->ifra_prefixmask.sin6_addr; } ia->ia_ifp = ifp; ifa_ref(&ia->ia_ifa); /* if_addrhead */ IF_ADDR_WLOCK(ifp); TAILQ_INSERT_TAIL(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_ref(&ia->ia_ifa); /* in6_ifaddrhead */ IN6_IFADDR_WLOCK(); TAILQ_INSERT_TAIL(&V_in6_ifaddrhead, ia, ia_link); LIST_INSERT_HEAD(IN6ADDR_HASH(&ia->ia_addr.sin6_addr), ia, ia6_hash); IN6_IFADDR_WUNLOCK(); return (ia); } /* * Update/configure interface address parameters: * * 1) Update lifetime * 2) Update interface metric ad flags * 3) Notify other subsystems */ static int in6_update_ifa_internal(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int hostIsNew, int flags) { int error; /* update timestamp */ ia->ia6_updatetime = time_uptime; /* * Set lifetimes. We do not refer to ia6t_expire and ia6t_preferred * to see if the address is deprecated or invalidated, but initialize * these members for applications. */ ia->ia6_lifetime = ifra->ifra_lifetime; if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_expire = time_uptime + ia->ia6_lifetime.ia6t_vltime; } else ia->ia6_lifetime.ia6t_expire = 0; if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) { ia->ia6_lifetime.ia6t_preferred = time_uptime + ia->ia6_lifetime.ia6t_pltime; } else ia->ia6_lifetime.ia6t_preferred = 0; /* * backward compatibility - if IN6_IFF_DEPRECATED is set from the * userland, make it deprecated. */ if ((ifra->ifra_flags & IN6_IFF_DEPRECATED) != 0) { ia->ia6_lifetime.ia6t_pltime = 0; ia->ia6_lifetime.ia6t_preferred = time_uptime; } /* * configure address flags. */ ia->ia6_flags = ifra->ifra_flags; /* * Make the address tentative before joining multicast addresses, * so that corresponding MLD responses would not have a tentative * source address. */ ia->ia6_flags &= ~IN6_IFF_DUPLICATED; /* safety */ if (hostIsNew && in6if_do_dad(ifp)) ia->ia6_flags |= IN6_IFF_TENTATIVE; /* DAD should be performed after ND6_IFF_IFDISABLED is cleared. */ if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) ia->ia6_flags |= IN6_IFF_TENTATIVE; /* notify other subsystems */ error = in6_notify_ifa(ifp, ia, ifra, hostIsNew); return (error); } /* * Do link-level ifa job: * 1) Add lle entry for added address * 2) Notifies routing socket users about new address * 3) join appropriate multicast group * 4) start DAD if enabled */ static int in6_broadcast_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, struct in6_ifaddr *ia, int flags) { struct in6_multi *in6m_sol; int error = 0; /* Add local address to lltable, if necessary (ex. on p2p link). */ if ((error = nd6_add_ifa_lle(ia)) != 0) { in6_purgeaddr(&ia->ia_ifa); ifa_free(&ia->ia_ifa); return (error); } /* Join necessary multicast groups. */ in6m_sol = NULL; if ((ifp->if_flags & IFF_MULTICAST) != 0) { error = in6_update_ifa_join_mc(ifp, ifra, ia, flags, &in6m_sol); if (error != 0) { in6_purgeaddr(&ia->ia_ifa); ifa_free(&ia->ia_ifa); return (error); } } /* * Perform DAD, if needed. * XXX It may be of use, if we can administratively disable DAD. */ if (in6if_do_dad(ifp) && ((ifra->ifra_flags & IN6_IFF_NODAD) == 0) && (ia->ia6_flags & IN6_IFF_TENTATIVE)) { int delay, mindelay, maxdelay; delay = 0; if ((flags & IN6_IFAUPDATE_DADDELAY)) { /* * We need to impose a delay before sending an NS * for DAD. Check if we also needed a delay for the * corresponding MLD message. If we did, the delay * should be larger than the MLD delay (this could be * relaxed a bit, but this simple logic is at least * safe). * XXX: Break data hiding guidelines and look at * state for the solicited multicast group. */ mindelay = 0; if (in6m_sol != NULL && in6m_sol->in6m_state == MLD_REPORTING_MEMBER) { mindelay = in6m_sol->in6m_timer; } maxdelay = MAX_RTR_SOLICITATION_DELAY * hz; if (maxdelay - mindelay == 0) delay = 0; else { delay = (arc4random() % (maxdelay - mindelay)) + mindelay; } } nd6_dad_start((struct ifaddr *)ia, delay); } ifa_free(&ia->ia_ifa); return (error); } void in6_purgeaddr(struct ifaddr *ifa) { struct ifnet *ifp = ifa->ifa_ifp; struct in6_ifaddr *ia = (struct in6_ifaddr *) ifa; struct in6_multi_mship *imm; int plen, error; if (ifa->ifa_carp) (*carp_detach_p)(ifa); /* * Remove the loopback route to the interface address. * The check for the current setting of "nd6_useloopback" * is not needed. */ if (ia->ia_flags & IFA_RTSELF) { error = ifa_del_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error == 0) ia->ia_flags &= ~IFA_RTSELF; } /* stop DAD processing */ nd6_dad_stop(ifa); /* Remove local address entry from lltable. */ nd6_rem_ifa_lle(ia); /* Leave multicast groups. */ while ((imm = LIST_FIRST(&ia->ia6_memberships)) != NULL) { LIST_REMOVE(imm, i6mm_chain); in6_leavegroup(imm); } plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */ if ((ia->ia_flags & IFA_ROUTE) && plen == 128) { error = rtinit(&(ia->ia_ifa), RTM_DELETE, ia->ia_flags | (ia->ia_dstaddr.sin6_family == AF_INET6) ? RTF_HOST : 0); if (error != 0) log(LOG_INFO, "%s: err=%d, destination address delete " "failed\n", __func__, error); ia->ia_flags &= ~IFA_ROUTE; } in6_unlink_ifa(ia, ifp); } static void in6_unlink_ifa(struct in6_ifaddr *ia, struct ifnet *ifp) { + char ip6buf[INET6_ADDRSTRLEN]; IF_ADDR_WLOCK(ifp); TAILQ_REMOVE(&ifp->if_addrhead, &ia->ia_ifa, ifa_link); IF_ADDR_WUNLOCK(ifp); ifa_free(&ia->ia_ifa); /* if_addrhead */ /* * Defer the release of what might be the last reference to the * in6_ifaddr so that it can't be freed before the remainder of the * cleanup. */ IN6_IFADDR_WLOCK(); TAILQ_REMOVE(&V_in6_ifaddrhead, ia, ia_link); LIST_REMOVE(ia, ia6_hash); IN6_IFADDR_WUNLOCK(); /* * Release the reference to the base prefix. There should be a * positive reference. */ if (ia->ia6_ndpr == NULL) { nd6log((LOG_NOTICE, "in6_unlink_ifa: autoconf'ed address " - "%p has no prefix\n", ia)); + "%s has no prefix\n", ip6_sprintf(ip6buf, IA6_IN6(ia)))); } else { ia->ia6_ndpr->ndpr_refcnt--; ia->ia6_ndpr = NULL; } /* * Also, if the address being removed is autoconf'ed, call * pfxlist_onlink_check() since the release might affect the status of * other (detached) addresses. */ if ((ia->ia6_flags & IN6_IFF_AUTOCONF)) { pfxlist_onlink_check(); } ifa_free(&ia->ia_ifa); /* in6_ifaddrhead */ } /* * Notifies other other subsystems about address change/arrival: * 1) Notifies device handler on first IPv6 address assignment * 2) Handle routing table changes for P2P links and route * 3) Handle routing table changes for address host route */ static int in6_notify_ifa(struct ifnet *ifp, struct in6_ifaddr *ia, struct in6_aliasreq *ifra, int hostIsNew) { int error = 0, plen, ifacount = 0; struct ifaddr *ifa; struct sockaddr_in6 *pdst; char ip6buf[INET6_ADDRSTRLEN]; /* * Give the interface a chance to initialize * if this is its first address, */ if (hostIsNew != 0) { IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ifacount++; } IF_ADDR_RUNLOCK(ifp); } if (ifacount <= 1 && ifp->if_ioctl) { error = (*ifp->if_ioctl)(ifp, SIOCSIFADDR, (caddr_t)ia); if (error) return (error); } /* * If a new destination address is specified, scrub the old one and * install the new destination. Note that the interface must be * p2p or loopback. */ pdst = &ifra->ifra_dstaddr; if (pdst->sin6_family == AF_INET6 && !IN6_ARE_ADDR_EQUAL(&pdst->sin6_addr, &ia->ia_dstaddr.sin6_addr)) { if ((ia->ia_flags & IFA_ROUTE) != 0 && (rtinit(&(ia->ia_ifa), (int)RTM_DELETE, RTF_HOST) != 0)) { nd6log((LOG_ERR, "in6_update_ifa_internal: failed to " "remove a route to the old destination: %s\n", ip6_sprintf(ip6buf, &ia->ia_addr.sin6_addr))); /* proceed anyway... */ } else ia->ia_flags &= ~IFA_ROUTE; ia->ia_dstaddr = *pdst; } /* * If a new destination address is specified for a point-to-point * interface, install a route to the destination as an interface * direct route. * XXX: the logic below rejects assigning multiple addresses on a p2p * interface that share the same destination. */ plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL); /* XXX */ if (!(ia->ia_flags & IFA_ROUTE) && plen == 128 && ia->ia_dstaddr.sin6_family == AF_INET6) { int rtflags = RTF_UP | RTF_HOST; /* * Handle the case for ::1 . */ if (ifp->if_flags & IFF_LOOPBACK) ia->ia_flags |= IFA_RTSELF; error = rtinit(&ia->ia_ifa, RTM_ADD, ia->ia_flags | rtflags); if (error) return (error); ia->ia_flags |= IFA_ROUTE; } /* * add a loopback route to self if not exists */ if (!(ia->ia_flags & IFA_RTSELF) && V_nd6_useloopback) { error = ifa_add_loopback_route((struct ifaddr *)ia, (struct sockaddr *)&ia->ia_addr); if (error == 0) ia->ia_flags |= IFA_RTSELF; } return (error); } /* * Find an IPv6 interface link-local address specific to an interface. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifpforlinklocal(struct ifnet *ifp, int ignoreflags) { struct ifaddr *ifa; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa))) { if ((((struct in6_ifaddr *)ifa)->ia6_flags & ignoreflags) != 0) continue; ifa_ref(ifa); break; } } IF_ADDR_RUNLOCK(ifp); return ((struct in6_ifaddr *)ifa); } /* * find the internet address corresponding to a given address. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifwithaddr(const struct in6_addr *addr, uint32_t zoneid) { struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(); LIST_FOREACH(ia, IN6ADDR_HASH(addr), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), addr)) { if (zoneid != 0 && zoneid != ia->ia_addr.sin6_scope_id) continue; ifa_ref(&ia->ia_ifa); break; } } IN6_IFADDR_RUNLOCK(); return (ia); } /* * find the internet address corresponding to a given interface and address. * ifaddr is returned referenced. */ struct in6_ifaddr * in6ifa_ifpwithaddr(struct ifnet *ifp, struct in6_addr *addr) { struct ifaddr *ifa; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (IN6_ARE_ADDR_EQUAL(addr, IFA_IN6(ifa))) { ifa_ref(ifa); break; } } IF_ADDR_RUNLOCK(ifp); return ((struct in6_ifaddr *)ifa); } /* * Find a link-local scoped address on ifp and return it if any. */ struct in6_ifaddr * in6ifa_llaonifp(struct ifnet *ifp) { struct sockaddr_in6 *sin6; struct ifaddr *ifa; if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) return (NULL); if_addr_rlock(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; sin6 = (struct sockaddr_in6 *)ifa->ifa_addr; if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr) || IN6_IS_ADDR_MC_NODELOCAL(&sin6->sin6_addr)) break; } if_addr_runlock(ifp); return ((struct in6_ifaddr *)ifa); } /* * Convert IP6 address to printable (loggable) representation. Caller * has to make sure that ip6buf is at least INET6_ADDRSTRLEN long. */ static char digits[] = "0123456789abcdef"; char * ip6_sprintf(char *ip6buf, const struct in6_addr *addr) { int i, cnt = 0, maxcnt = 0, idx = 0, index = 0; char *cp; const u_int16_t *a = (const u_int16_t *)addr; const u_int8_t *d; int dcolon = 0, zero = 0; cp = ip6buf; for (i = 0; i < 8; i++) { if (*(a + i) == 0) { cnt++; if (cnt == 1) idx = i; } else if (maxcnt < cnt) { maxcnt = cnt; index = idx; cnt = 0; } } if (maxcnt < cnt) { maxcnt = cnt; index = idx; } for (i = 0; i < 8; i++) { if (dcolon == 1) { if (*a == 0) { if (i == 7) *cp++ = ':'; a++; continue; } else dcolon = 2; } if (*a == 0) { if (dcolon == 0 && *(a + 1) == 0 && i == index) { if (i == 0) *cp++ = ':'; *cp++ = ':'; dcolon = 1; } else { *cp++ = '0'; *cp++ = ':'; } a++; continue; } d = (const u_char *)a; /* Try to eliminate leading zeros in printout like in :0001. */ zero = 1; *cp = digits[*d >> 4]; if (*cp != '0') { zero = 0; cp++; } *cp = digits[*d++ & 0xf]; if (zero == 0 || (*cp != '0')) { zero = 0; cp++; } *cp = digits[*d >> 4]; if (zero == 0 || (*cp != '0')) { zero = 0; cp++; } *cp++ = digits[*d & 0xf]; *cp++ = ':'; a++; } *--cp = '\0'; return (ip6buf); } int in6_localaddr(struct in6_addr *in6) { struct in6_ifaddr *ia; if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_LINKLOCAL(in6)) return 1; IN6_IFADDR_RLOCK(); TAILQ_FOREACH(ia, &V_in6_ifaddrhead, ia_link) { if (IN6_ARE_MASKED_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr, &ia->ia_prefixmask.sin6_addr)) { IN6_IFADDR_RUNLOCK(); return 1; } } IN6_IFADDR_RUNLOCK(); return (0); } /* * Return 1 if an internet address is for the local host and configured * on one of its interfaces. */ int in6_localip(struct in6_addr *in6) { struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(); LIST_FOREACH(ia, IN6ADDR_HASH(in6), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr)) { IN6_IFADDR_RUNLOCK(); return (1); } } IN6_IFADDR_RUNLOCK(); return (0); } int in6_is_addr_deprecated(struct sockaddr_in6 *sa6) { struct in6_ifaddr *ia; IN6_IFADDR_RLOCK(); LIST_FOREACH(ia, IN6ADDR_HASH(&sa6->sin6_addr), ia6_hash) { if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), &sa6->sin6_addr)) { if (ia->ia6_flags & IN6_IFF_DEPRECATED) { IN6_IFADDR_RUNLOCK(); return (1); /* true */ } break; } } IN6_IFADDR_RUNLOCK(); return (0); /* false */ } /* * return length of part which dst and src are equal * hard coding... */ int in6_matchlen(struct in6_addr *src, struct in6_addr *dst) { int match = 0; u_char *s = (u_char *)src, *d = (u_char *)dst; u_char *lim = s + 16, r; while (s < lim) if ((r = (*d++ ^ *s++)) != 0) { while (r < 128) { match++; r <<= 1; } break; } else match += 8; return match; } /* XXX: to be scope conscious */ int in6_are_prefix_equal(struct in6_addr *p1, struct in6_addr *p2, int len) { int bytelen, bitlen; /* sanity check */ if (0 > len || len > 128) { log(LOG_ERR, "in6_are_prefix_equal: invalid prefix length(%d)\n", len); return (0); } bytelen = len / 8; bitlen = len % 8; if (bcmp(&p1->s6_addr, &p2->s6_addr, bytelen)) return (0); if (bitlen != 0 && p1->s6_addr[bytelen] >> (8 - bitlen) != p2->s6_addr[bytelen] >> (8 - bitlen)) return (0); return (1); } void in6_prefixlen2mask(struct in6_addr *maskp, int len) { u_char maskarray[8] = {0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; int bytelen, bitlen, i; /* sanity check */ if (0 > len || len > 128) { log(LOG_ERR, "in6_prefixlen2mask: invalid prefix length(%d)\n", len); return; } bzero(maskp, sizeof(*maskp)); bytelen = len / 8; bitlen = len % 8; for (i = 0; i < bytelen; i++) maskp->s6_addr[i] = 0xff; if (bitlen) maskp->s6_addr[bytelen] = maskarray[bitlen - 1]; } /* * return the best address out of the same scope. if no address was * found, return the first valid address from designated IF. */ struct in6_ifaddr * in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst) { int dst_scope = in6_addrscope(dst), blen = -1, tlen; struct ifaddr *ifa; struct in6_ifaddr *besta = 0; struct in6_ifaddr *dep[2]; /* last-resort: deprecated */ dep[0] = dep[1] = NULL; /* * We first look for addresses in the same scope. * If there is one, return it. * If two or more, return one which matches the dst longest. * If none, return one of global addresses assigned other ifs. */ IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST) continue; /* XXX: is there any case to allow anycast? */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) continue; /* don't use this interface */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { if (V_ip6_use_deprecated) dep[0] = (struct in6_ifaddr *)ifa; continue; } if (dst_scope == in6_addrscope(IFA_IN6(ifa))) { /* * call in6_matchlen() as few as possible */ if (besta) { if (blen == -1) blen = in6_matchlen(&besta->ia_addr.sin6_addr, dst); tlen = in6_matchlen(IFA_IN6(ifa), dst); if (tlen > blen) { blen = tlen; besta = (struct in6_ifaddr *)ifa; } } else besta = (struct in6_ifaddr *)ifa; } } if (besta) { ifa_ref(&besta->ia_ifa); IF_ADDR_RUNLOCK(ifp); return (besta); } TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_ANYCAST) continue; /* XXX: is there any case to allow anycast? */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_NOTREADY) continue; /* don't use this interface */ if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DETACHED) continue; if (((struct in6_ifaddr *)ifa)->ia6_flags & IN6_IFF_DEPRECATED) { if (V_ip6_use_deprecated) dep[1] = (struct in6_ifaddr *)ifa; continue; } if (ifa != NULL) ifa_ref(ifa); IF_ADDR_RUNLOCK(ifp); return (struct in6_ifaddr *)ifa; } /* use the last-resort values, that are, deprecated addresses */ if (dep[0]) { ifa_ref((struct ifaddr *)dep[0]); IF_ADDR_RUNLOCK(ifp); return dep[0]; } if (dep[1]) { ifa_ref((struct ifaddr *)dep[1]); IF_ADDR_RUNLOCK(ifp); return dep[1]; } IF_ADDR_RUNLOCK(ifp); return NULL; } /* * perform DAD when interface becomes IFF_UP. */ void in6_if_up(struct ifnet *ifp) { struct ifaddr *ifa; struct in6_ifaddr *ia; IF_ADDR_RLOCK(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; ia = (struct in6_ifaddr *)ifa; if (ia->ia6_flags & IN6_IFF_TENTATIVE) { /* * The TENTATIVE flag was likely set by hand * beforehand, implicitly indicating the need for DAD. * We may be able to skip the random delay in this * case, but we impose delays just in case. */ nd6_dad_start(ifa, arc4random() % (MAX_RTR_SOLICITATION_DELAY * hz)); } } IF_ADDR_RUNLOCK(ifp); /* * special cases, like 6to4, are handled in in6_ifattach */ in6_ifattach(ifp, NULL); } int in6if_do_dad(struct ifnet *ifp) { if ((ifp->if_flags & IFF_LOOPBACK) != 0) return (0); if (ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) return (0); /* * Our DAD routine requires the interface up and running. * However, some interfaces can be up before the RUNNING * status. Additionaly, users may try to assign addresses * before the interface becomes up (or running). * We simply skip DAD in such a case as a work around. * XXX: we should rather mark "tentative" on such addresses, * and do DAD after the interface becomes ready. */ if (!((ifp->if_flags & IFF_UP) && (ifp->if_drv_flags & IFF_DRV_RUNNING))) return (0); return (1); } /* * Calculate max IPv6 MTU through all the interfaces and store it * to in6_maxmtu. */ void in6_setmaxmtu(void) { unsigned long maxmtu = 0; struct ifnet *ifp; IFNET_RLOCK_NOSLEEP(); TAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* this function can be called during ifnet initialization */ if (!ifp->if_afdata[AF_INET6]) continue; if ((ifp->if_flags & IFF_LOOPBACK) == 0 && IN6_LINKMTU(ifp) > maxmtu) maxmtu = IN6_LINKMTU(ifp); } IFNET_RUNLOCK_NOSLEEP(); if (maxmtu) /* update only when maxmtu is positive */ V_in6_maxmtu = maxmtu; } /* * Provide the length of interface identifiers to be used for the link attached * to the given interface. The length should be defined in "IPv6 over * xxx-link" document. Note that address architecture might also define * the length for a particular set of address prefixes, regardless of the * link type. As clarified in rfc2462bis, those two definitions should be * consistent, and those really are as of August 2004. */ int in6_if2idlen(struct ifnet *ifp) { switch (ifp->if_type) { case IFT_ETHER: /* RFC2464 */ #ifdef IFT_PROPVIRTUAL case IFT_PROPVIRTUAL: /* XXX: no RFC. treat it as ether */ #endif #ifdef IFT_L2VLAN case IFT_L2VLAN: /* ditto */ #endif #ifdef IFT_IEEE80211 case IFT_IEEE80211: /* ditto */ #endif #ifdef IFT_MIP case IFT_MIP: /* ditto */ #endif case IFT_INFINIBAND: return (64); case IFT_FDDI: /* RFC2467 */ return (64); case IFT_ISO88025: /* RFC2470 (IPv6 over Token Ring) */ return (64); case IFT_PPP: /* RFC2472 */ return (64); case IFT_ARCNET: /* RFC2497 */ return (64); case IFT_FRELAY: /* RFC2590 */ return (64); case IFT_IEEE1394: /* RFC3146 */ return (64); case IFT_GIF: return (64); /* draft-ietf-v6ops-mech-v2-07 */ case IFT_LOOP: return (64); /* XXX: is this really correct? */ default: /* * Unknown link type: * It might be controversial to use the today's common constant * of 64 for these cases unconditionally. For full compliance, * we should return an error in this case. On the other hand, * if we simply miss the standard for the link type or a new * standard is defined for a new link type, the IFID length * is very likely to be the common constant. As a compromise, * we always use the constant, but make an explicit notice * indicating the "unknown" case. */ printf("in6_if2idlen: unknown link type (%d)\n", ifp->if_type); return (64); } } #include struct in6_llentry { struct llentry base; struct sockaddr_in6 l3_addr6; }; /* * Deletes an address from the address table. * This function is called by the timer functions * such as arptimer() and nd6_llinfo_timer(), and * the caller does the locking. */ static void in6_lltable_free(struct lltable *llt, struct llentry *lle) { LLE_WUNLOCK(lle); LLE_LOCK_DESTROY(lle); free(lle, M_LLTABLE); } static struct llentry * in6_lltable_new(const struct sockaddr *l3addr, u_int flags) { struct in6_llentry *lle; lle = malloc(sizeof(struct in6_llentry), M_LLTABLE, M_NOWAIT | M_ZERO); if (lle == NULL) /* NB: caller generates msg */ return NULL; lle->l3_addr6 = *(const struct sockaddr_in6 *)l3addr; lle->base.lle_refcnt = 1; lle->base.lle_free = in6_lltable_free; LLE_LOCK_INIT(&lle->base); callout_init_rw(&lle->base.ln_timer_ch, &lle->base.lle_lock, CALLOUT_RETURNUNLOCKED); return (&lle->base); } static void in6_lltable_prefix_free(struct lltable *llt, const struct sockaddr *prefix, const struct sockaddr *mask, u_int flags) { const struct sockaddr_in6 *pfx = (const struct sockaddr_in6 *)prefix; const struct sockaddr_in6 *msk = (const struct sockaddr_in6 *)mask; struct llentry *lle, *next; int i; /* * (flags & LLE_STATIC) means deleting all entries * including static ND6 entries. */ IF_AFDATA_WLOCK(llt->llt_ifp); for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) { LIST_FOREACH_SAFE(lle, &llt->lle_head[i], lle_next, next) { if (IN6_ARE_MASKED_ADDR_EQUAL( &satosin6(L3_ADDR(lle))->sin6_addr, &pfx->sin6_addr, &msk->sin6_addr) && ((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC))) { LLE_WLOCK(lle); if (callout_stop(&lle->la_timer)) LLE_REMREF(lle); llentry_free(lle); } } } IF_AFDATA_WUNLOCK(llt->llt_ifp); } static int in6_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr) { struct rtentry *rt; char ip6buf[INET6_ADDRSTRLEN]; KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); /* Our local addresses are always only installed on the default FIB. */ /* XXX rtalloc1 should take a const param */ rt = in6_rtalloc1(__DECONST(struct sockaddr *, l3addr), 0, 0, RT_DEFAULT_FIB); if (rt == NULL || (rt->rt_flags & RTF_GATEWAY) || rt->rt_ifp != ifp) { struct ifaddr *ifa; /* * Create an ND6 cache for an IPv6 neighbor * that is not covered by our own prefix. */ /* XXX ifaof_ifpforaddr should take a const param */ ifa = ifaof_ifpforaddr(__DECONST(struct sockaddr *, l3addr), ifp); if (ifa != NULL) { ifa_free(ifa); if (rt != NULL) RTFREE_LOCKED(rt); return 0; } log(LOG_INFO, "IPv6 address: \"%s\" is not on the network\n", ip6_sprintf(ip6buf, &((const struct sockaddr_in6 *)l3addr)->sin6_addr)); if (rt != NULL) RTFREE_LOCKED(rt); return EINVAL; } RTFREE_LOCKED(rt); return 0; } static struct llentry * in6_lltable_lookup(struct lltable *llt, u_int flags, const struct sockaddr *l3addr) { const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; struct llentries *lleh; u_int hashkey; IF_AFDATA_LOCK_ASSERT(ifp); KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); hashkey = sin6->sin6_addr.s6_addr32[3]; lleh = &llt->lle_head[LLATBL_HASH(hashkey, LLTBL_HASHMASK)]; LIST_FOREACH(lle, lleh, lle_next) { struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)L3_ADDR(lle); if (lle->la_flags & LLE_DELETED) continue; if (bcmp(&sa6->sin6_addr, &sin6->sin6_addr, sizeof(struct in6_addr)) == 0) break; } if (lle == NULL) { if (!(flags & LLE_CREATE)) return (NULL); IF_AFDATA_WLOCK_ASSERT(ifp); /* * A route that covers the given address must have * been installed 1st because we are doing a resolution, * verify this. */ if (!(flags & LLE_IFADDR) && in6_lltable_rtcheck(ifp, flags, l3addr) != 0) return NULL; lle = in6_lltable_new(l3addr, flags); if (lle == NULL) { log(LOG_INFO, "lla_lookup: new lle malloc failed\n"); return NULL; } lle->la_flags = flags & ~LLE_CREATE; if ((flags & (LLE_CREATE | LLE_IFADDR)) == (LLE_CREATE | LLE_IFADDR)) { bcopy(IF_LLADDR(ifp), &lle->ll_addr, ifp->if_addrlen); lle->la_flags |= (LLE_VALID | LLE_STATIC); } lle->lle_tbl = llt; lle->lle_head = lleh; lle->la_flags |= LLE_LINKED; LIST_INSERT_HEAD(lleh, lle, lle_next); } else if (flags & LLE_DELETE) { if (!(lle->la_flags & LLE_IFADDR) || (flags & LLE_IFADDR)) { LLE_WLOCK(lle); lle->la_flags |= LLE_DELETED; #ifdef DIAGNOSTIC log(LOG_INFO, "ifaddr cache = %p is deleted\n", lle); #endif if ((lle->la_flags & (LLE_STATIC | LLE_IFADDR)) == LLE_STATIC) llentry_free(lle); else LLE_WUNLOCK(lle); } lle = (void *)-1; } if (LLE_IS_VALID(lle)) { if (flags & LLE_EXCLUSIVE) LLE_WLOCK(lle); else LLE_RLOCK(lle); } return (lle); } static int in6_lltable_dump(struct lltable *llt, struct sysctl_req *wr) { struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; /* XXX stack use */ struct { struct rt_msghdr rtm; struct sockaddr_in6 sin6; /* * ndp.c assumes that sdl is word aligned */ #ifdef __LP64__ uint32_t pad; #endif struct sockaddr_dl sdl; } ndpc; int i, error; if (ifp->if_flags & IFF_LOOPBACK) return 0; LLTABLE_LOCK_ASSERT(); error = 0; for (i = 0; i < LLTBL_HASHTBL_SIZE; i++) { LIST_FOREACH(lle, &llt->lle_head[i], lle_next) { struct sockaddr_dl *sdl; /* skip deleted or invalid entries */ if ((lle->la_flags & (LLE_DELETED|LLE_VALID)) != LLE_VALID) continue; /* Skip if jailed and not a valid IP of the prison. */ if (prison_if(wr->td->td_ucred, L3_ADDR(lle)) != 0) continue; /* * produce a msg made of: * struct rt_msghdr; * struct sockaddr_in6 (IPv6) * struct sockaddr_dl; */ bzero(&ndpc, sizeof(ndpc)); ndpc.rtm.rtm_msglen = sizeof(ndpc); ndpc.rtm.rtm_version = RTM_VERSION; ndpc.rtm.rtm_type = RTM_GET; ndpc.rtm.rtm_flags = RTF_UP; ndpc.rtm.rtm_addrs = RTA_DST | RTA_GATEWAY; ndpc.sin6.sin6_family = AF_INET6; ndpc.sin6.sin6_len = sizeof(ndpc.sin6); bcopy(L3_ADDR(lle), &ndpc.sin6, L3_ADDR_LEN(lle)); if (V_deembed_scopeid) sa6_recoverscope(&ndpc.sin6); /* publish */ if (lle->la_flags & LLE_PUB) ndpc.rtm.rtm_flags |= RTF_ANNOUNCE; sdl = &ndpc.sdl; sdl->sdl_family = AF_LINK; sdl->sdl_len = sizeof(*sdl); sdl->sdl_alen = ifp->if_addrlen; sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; bcopy(&lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); ndpc.rtm.rtm_rmx.rmx_expire = lle->la_flags & LLE_STATIC ? 0 : lle->la_expire; ndpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); if (lle->la_flags & LLE_STATIC) ndpc.rtm.rtm_flags |= RTF_STATIC; ndpc.rtm.rtm_index = ifp->if_index; error = SYSCTL_OUT(wr, &ndpc, sizeof(ndpc)); if (error) break; } } return error; } void * in6_domifattach(struct ifnet *ifp) { struct in6_ifextra *ext; /* There are not IPv6-capable interfaces. */ switch (ifp->if_type) { case IFT_PFLOG: case IFT_PFSYNC: case IFT_USB: return (NULL); } ext = (struct in6_ifextra *)malloc(sizeof(*ext), M_IFADDR, M_WAITOK); bzero(ext, sizeof(*ext)); ext->in6_ifstat = malloc(sizeof(counter_u64_t) * sizeof(struct in6_ifstat) / sizeof(uint64_t), M_IFADDR, M_WAITOK); COUNTER_ARRAY_ALLOC(ext->in6_ifstat, sizeof(struct in6_ifstat) / sizeof(uint64_t), M_WAITOK); ext->icmp6_ifstat = malloc(sizeof(counter_u64_t) * sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_IFADDR, M_WAITOK); COUNTER_ARRAY_ALLOC(ext->icmp6_ifstat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t), M_WAITOK); ext->nd_ifinfo = nd6_ifattach(ifp); ext->scope6_id = scope6_ifattach(ifp); ext->lltable = lltable_init(ifp, AF_INET6); if (ext->lltable != NULL) { ext->lltable->llt_prefix_free = in6_lltable_prefix_free; ext->lltable->llt_lookup = in6_lltable_lookup; ext->lltable->llt_dump = in6_lltable_dump; } ext->mld_ifinfo = mld_domifattach(ifp); return ext; } int in6_domifmtu(struct ifnet *ifp) { return (IN6_LINKMTU(ifp)); } void in6_domifdetach(struct ifnet *ifp, void *aux) { struct in6_ifextra *ext = (struct in6_ifextra *)aux; mld_domifdetach(ifp); scope6_ifdetach(ext->scope6_id); nd6_ifdetach(ext->nd_ifinfo); lltable_free(ext->lltable); COUNTER_ARRAY_FREE(ext->in6_ifstat, sizeof(struct in6_ifstat) / sizeof(uint64_t)); free(ext->in6_ifstat, M_IFADDR); COUNTER_ARRAY_FREE(ext->icmp6_ifstat, sizeof(struct icmp6_ifstat) / sizeof(uint64_t)); free(ext->icmp6_ifstat, M_IFADDR); free(ext, M_IFADDR); } /* * Convert sockaddr_in6 to sockaddr_in. Original sockaddr_in6 must be * v4 mapped addr or v4 compat addr */ void in6_sin6_2_sin(struct sockaddr_in *sin, struct sockaddr_in6 *sin6) { bzero(sin, sizeof(*sin)); sin->sin_len = sizeof(struct sockaddr_in); sin->sin_family = AF_INET; sin->sin_port = sin6->sin6_port; sin->sin_addr.s_addr = sin6->sin6_addr.s6_addr32[3]; } /* Convert sockaddr_in to sockaddr_in6 in v4 mapped addr format. */ void in6_sin_2_v4mapsin6(struct sockaddr_in *sin, struct sockaddr_in6 *sin6) { bzero(sin6, sizeof(*sin6)); sin6->sin6_len = sizeof(struct sockaddr_in6); sin6->sin6_family = AF_INET6; sin6->sin6_port = sin->sin_port; sin6->sin6_addr.s6_addr32[0] = 0; sin6->sin6_addr.s6_addr32[1] = 0; sin6->sin6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP; sin6->sin6_addr.s6_addr32[3] = sin->sin_addr.s_addr; } /* Convert sockaddr_in6 into sockaddr_in. */ void in6_sin6_2_sin_in_sock(struct sockaddr *nam) { struct sockaddr_in *sin_p; struct sockaddr_in6 sin6; /* * Save original sockaddr_in6 addr and convert it * to sockaddr_in. */ sin6 = *(struct sockaddr_in6 *)nam; sin_p = (struct sockaddr_in *)nam; in6_sin6_2_sin(sin_p, &sin6); } /* Convert sockaddr_in into sockaddr_in6 in v4 mapped addr format. */ void in6_sin_2_v4mapsin6_in_sock(struct sockaddr **nam) { struct sockaddr_in *sin_p; struct sockaddr_in6 *sin6_p; sin6_p = malloc(sizeof *sin6_p, M_SONAME, M_WAITOK); sin_p = (struct sockaddr_in *)*nam; in6_sin_2_v4mapsin6(sin_p, sin6_p); free(*nam, M_SONAME); *nam = (struct sockaddr *)sin6_p; } Index: projects/building-blocks/sys/netpfil/ipfw/ip_fw_iface.c =================================================================== --- projects/building-blocks/sys/netpfil/ipfw/ip_fw_iface.c (revision 278311) +++ projects/building-blocks/sys/netpfil/ipfw/ip_fw_iface.c (revision 278312) @@ -1,537 +1,537 @@ /*- * Copyright (c) 2014 Yandex LLC. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include -__FBSDID("$FreeBSD: projects/ipfw/sys/netpfil/ipfw/ip_fw_iface.c 267384 2014-06-12 09:59:11Z melifaro $"); +__FBSDID("$FreeBSD$"); /* * Kernel interface tracking API. * */ #include "opt_ipfw.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* struct ipfw_rule_ref */ #include #include #define CHAIN_TO_II(ch) ((struct namedobj_instance *)ch->ifcfg) #define DEFAULT_IFACES 128 static void handle_ifdetach(struct ip_fw_chain *ch, struct ipfw_iface *iif, uint16_t ifindex); static void handle_ifattach(struct ip_fw_chain *ch, struct ipfw_iface *iif, uint16_t ifindex); static int list_ifaces(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd); static struct ipfw_sopt_handler scodes[] = { { IP_FW_XIFLIST, 0, HDIR_GET, list_ifaces }, }; /* * FreeBSD Kernel interface. */ static void ipfw_kifhandler(void *arg, struct ifnet *ifp); static int ipfw_kiflookup(char *name); static void iface_khandler_register(void); static void iface_khandler_deregister(void); static eventhandler_tag ipfw_ifdetach_event, ipfw_ifattach_event; static int num_vnets = 0; static struct mtx vnet_mtx; /* * Checks if kernel interface is contained in our tracked * interface list and calls attach/detach handler. */ static void ipfw_kifhandler(void *arg, struct ifnet *ifp) { struct ip_fw_chain *ch; struct ipfw_iface *iif; struct namedobj_instance *ii; uintptr_t htype; if (V_ipfw_vnet_ready == 0) return; ch = &V_layer3_chain; htype = (uintptr_t)arg; IPFW_UH_WLOCK(ch); ii = CHAIN_TO_II(ch); if (ii == NULL) { IPFW_UH_WUNLOCK(ch); return; } iif = (struct ipfw_iface*)ipfw_objhash_lookup_name(ii, 0, if_name(ifp)); if (iif != NULL) { if (htype == 1) handle_ifattach(ch, iif, ifp->if_index); else handle_ifdetach(ch, iif, ifp->if_index); } IPFW_UH_WUNLOCK(ch); } /* * Reference current VNET as iface tracking API user. * Registers interface tracking handlers for first VNET. */ static void iface_khandler_register() { int create; create = 0; mtx_lock(&vnet_mtx); if (num_vnets == 0) create = 1; num_vnets++; mtx_unlock(&vnet_mtx); if (create == 0) return; printf("IPFW: starting up interface tracker\n"); ipfw_ifdetach_event = EVENTHANDLER_REGISTER( ifnet_departure_event, ipfw_kifhandler, NULL, EVENTHANDLER_PRI_ANY); ipfw_ifattach_event = EVENTHANDLER_REGISTER( ifnet_arrival_event, ipfw_kifhandler, (void*)((uintptr_t)1), EVENTHANDLER_PRI_ANY); } /* * * Detach interface event handlers on last VNET instance * detach. */ static void iface_khandler_deregister() { int destroy; destroy = 0; mtx_lock(&vnet_mtx); if (num_vnets == 1) destroy = 1; num_vnets--; mtx_unlock(&vnet_mtx); if (destroy == 0) return; EVENTHANDLER_DEREGISTER(ifnet_arrival_event, ipfw_ifattach_event); EVENTHANDLER_DEREGISTER(ifnet_departure_event, ipfw_ifdetach_event); } /* * Retrieves ifindex for given @name. * * Returns ifindex or 0. */ static int ipfw_kiflookup(char *name) { struct ifnet *ifp; int ifindex; ifindex = 0; if ((ifp = ifunit_ref(name)) != NULL) { ifindex = ifp->if_index; if_rele(ifp); } return (ifindex); } /* * Global ipfw startup hook. * Since we perform lazy initialization, do nothing except * mutex init. */ int ipfw_iface_init() { mtx_init(&vnet_mtx, "IPFW ifhandler mtx", NULL, MTX_DEF); IPFW_ADD_SOPT_HANDLER(1, scodes); return (0); } /* * Global ipfw destroy hook. * Unregister khandlers iff init has been done. */ void ipfw_iface_destroy() { IPFW_DEL_SOPT_HANDLER(1, scodes); mtx_destroy(&vnet_mtx); } /* * Perform actual init on internal request. * Inits both namehash and global khandler. */ static void vnet_ipfw_iface_init(struct ip_fw_chain *ch) { struct namedobj_instance *ii; ii = ipfw_objhash_create(DEFAULT_IFACES); IPFW_UH_WLOCK(ch); if (ch->ifcfg == NULL) { ch->ifcfg = ii; ii = NULL; } IPFW_UH_WUNLOCK(ch); if (ii != NULL) { /* Already initialized. Free namehash. */ ipfw_objhash_destroy(ii); } else { /* We're the first ones. Init kernel hooks. */ iface_khandler_register(); } } static void destroy_iface(struct namedobj_instance *ii, struct named_object *no, void *arg) { /* Assume all consumers have been already detached */ free(no, M_IPFW); } /* * Per-VNET ipfw detach hook. * */ void vnet_ipfw_iface_destroy(struct ip_fw_chain *ch) { struct namedobj_instance *ii; IPFW_UH_WLOCK(ch); ii = CHAIN_TO_II(ch); ch->ifcfg = NULL; IPFW_UH_WUNLOCK(ch); if (ii != NULL) { ipfw_objhash_foreach(ii, destroy_iface, ch); ipfw_objhash_destroy(ii); iface_khandler_deregister(); } } /* * Notify the subsystem that we are interested in tracking * interface @name. This function has to be called without * holding any locks to permit allocating the necessary states * for proper interface tracking. * * Returns 0 on success. */ int ipfw_iface_ref(struct ip_fw_chain *ch, char *name, struct ipfw_ifc *ic) { struct namedobj_instance *ii; struct ipfw_iface *iif, *tmp; if (strlen(name) >= sizeof(iif->ifname)) return (EINVAL); IPFW_UH_WLOCK(ch); ii = CHAIN_TO_II(ch); if (ii == NULL) { /* * First request to subsystem. * Let's perform init. */ IPFW_UH_WUNLOCK(ch); vnet_ipfw_iface_init(ch); IPFW_UH_WLOCK(ch); ii = CHAIN_TO_II(ch); } iif = (struct ipfw_iface *)ipfw_objhash_lookup_name(ii, 0, name); if (iif != NULL) { iif->no.refcnt++; ic->iface = iif; IPFW_UH_WUNLOCK(ch); return (0); } IPFW_UH_WUNLOCK(ch); /* Not found. Let's create one */ iif = malloc(sizeof(struct ipfw_iface), M_IPFW, M_WAITOK | M_ZERO); TAILQ_INIT(&iif->consumers); iif->no.name = iif->ifname; strlcpy(iif->ifname, name, sizeof(iif->ifname)); /* * Ref & link to the list. * * We assume ifnet_arrival_event / ifnet_departure_event * are not holding any locks. */ iif->no.refcnt = 1; IPFW_UH_WLOCK(ch); tmp = (struct ipfw_iface *)ipfw_objhash_lookup_name(ii, 0, name); if (tmp != NULL) { /* Interface has been created since unlock. Ref and return */ tmp->no.refcnt++; ic->iface = tmp; IPFW_UH_WUNLOCK(ch); free(iif, M_IPFW); return (0); } iif->ifindex = ipfw_kiflookup(name); if (iif->ifindex != 0) iif->resolved = 1; ipfw_objhash_add(ii, &iif->no); ic->iface = iif; IPFW_UH_WUNLOCK(ch); return (0); } /* * Adds @ic to the list of iif interface consumers. * Must be called with holding both UH+WLOCK. * Callback may be immediately called (if interface exists). */ void ipfw_iface_add_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic) { struct ipfw_iface *iif; IPFW_UH_WLOCK_ASSERT(ch); IPFW_WLOCK_ASSERT(ch); iif = ic->iface; TAILQ_INSERT_TAIL(&iif->consumers, ic, next); if (iif->resolved != 0) ic->cb(ch, ic->cbdata, iif->ifindex); } /* * Unlinks interface tracker object @ic from interface. * Must be called while holding UH lock. */ void ipfw_iface_del_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic) { struct ipfw_iface *iif; IPFW_UH_WLOCK_ASSERT(ch); iif = ic->iface; TAILQ_REMOVE(&iif->consumers, ic, next); } /* * Unreference interface specified by @ic. - * Must be called without holding any locks. + * Must be called while holding UH lock. */ void ipfw_iface_unref(struct ip_fw_chain *ch, struct ipfw_ifc *ic) { struct ipfw_iface *iif; + IPFW_UH_WLOCK_ASSERT(ch); + iif = ic->iface; ic->iface = NULL; - IPFW_UH_WLOCK(ch); iif->no.refcnt--; /* TODO: check for references & delete */ - IPFW_UH_WUNLOCK(ch); } /* * Interface arrival handler. */ static void handle_ifattach(struct ip_fw_chain *ch, struct ipfw_iface *iif, uint16_t ifindex) { struct ipfw_ifc *ic; IPFW_UH_WLOCK_ASSERT(ch); iif->gencnt++; iif->resolved = 1; iif->ifindex = ifindex; IPFW_WLOCK(ch); TAILQ_FOREACH(ic, &iif->consumers, next) ic->cb(ch, ic->cbdata, iif->ifindex); IPFW_WUNLOCK(ch); } /* * Interface departure handler. */ static void handle_ifdetach(struct ip_fw_chain *ch, struct ipfw_iface *iif, uint16_t ifindex) { struct ipfw_ifc *ic; IPFW_UH_WLOCK_ASSERT(ch); IPFW_WLOCK(ch); TAILQ_FOREACH(ic, &iif->consumers, next) ic->cb(ch, ic->cbdata, 0); IPFW_WUNLOCK(ch); iif->gencnt++; iif->resolved = 0; iif->ifindex = 0; } struct dump_iface_args { struct ip_fw_chain *ch; struct sockopt_data *sd; }; static void export_iface_internal(struct namedobj_instance *ii, struct named_object *no, void *arg) { ipfw_iface_info *i; struct dump_iface_args *da; struct ipfw_iface *iif; da = (struct dump_iface_args *)arg; i = (ipfw_iface_info *)ipfw_get_sopt_space(da->sd, sizeof(*i)); KASSERT(i != 0, ("previously checked buffer is not enough")); iif = (struct ipfw_iface *)no; strlcpy(i->ifname, iif->ifname, sizeof(i->ifname)); if (iif->resolved) i->flags |= IPFW_IFFLAG_RESOLVED; i->ifindex = iif->ifindex; i->refcnt = iif->no.refcnt; i->gencnt = iif->gencnt; } /* * Lists all interface currently tracked by ipfw. * Data layout (v0)(current): * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size * Reply: [ ipfw_obj_lheader ipfw_iface_info x N ] * * Returns 0 on success */ static int list_ifaces(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { struct namedobj_instance *ii; struct _ipfw_obj_lheader *olh; struct dump_iface_args da; uint32_t count, size; olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh)); if (olh == NULL) return (EINVAL); if (sd->valsize < olh->size) return (EINVAL); IPFW_UH_RLOCK(ch); ii = CHAIN_TO_II(ch); if (ii != NULL) count = ipfw_objhash_count(ii); else count = 0; size = count * sizeof(ipfw_iface_info) + sizeof(ipfw_obj_lheader); /* Fill in header regadless of buffer size */ olh->count = count; olh->objsize = sizeof(ipfw_iface_info); if (size > olh->size) { olh->size = size; IPFW_UH_RUNLOCK(ch); return (ENOMEM); } olh->size = size; da.ch = ch; da.sd = sd; if (ii != NULL) ipfw_objhash_foreach(ii, export_iface_internal, &da); IPFW_UH_RUNLOCK(ch); return (0); } Property changes on: projects/building-blocks/sys/netpfil/ipfw/ip_fw_iface.c ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Index: projects/building-blocks/sys/netpfil/ipfw/ip_fw_nat.c =================================================================== --- projects/building-blocks/sys/netpfil/ipfw/ip_fw_nat.c (revision 278311) +++ projects/building-blocks/sys/netpfil/ipfw/ip_fw_nat.c (revision 278312) @@ -1,1230 +1,1230 @@ /*- * Copyright (c) 2008 Paolo Pisati * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX for in_cksum */ struct cfg_spool { LIST_ENTRY(cfg_spool) _next; /* chain of spool instances */ struct in_addr addr; uint16_t port; }; /* Nat redirect configuration. */ struct cfg_redir { LIST_ENTRY(cfg_redir) _next; /* chain of redir instances */ uint16_t mode; /* type of redirect mode */ uint16_t proto; /* protocol: tcp/udp */ struct in_addr laddr; /* local ip address */ struct in_addr paddr; /* public ip address */ struct in_addr raddr; /* remote ip address */ uint16_t lport; /* local port */ uint16_t pport; /* public port */ uint16_t rport; /* remote port */ uint16_t pport_cnt; /* number of public ports */ uint16_t rport_cnt; /* number of remote ports */ struct alias_link **alink; u_int16_t spool_cnt; /* num of entry in spool chain */ /* chain of spool instances */ LIST_HEAD(spool_chain, cfg_spool) spool_chain; }; /* Nat configuration data struct. */ struct cfg_nat { /* chain of nat instances */ LIST_ENTRY(cfg_nat) _next; int id; /* nat id */ struct in_addr ip; /* nat ip address */ struct libalias *lib; /* libalias instance */ int mode; /* aliasing mode */ int redir_cnt; /* number of entry in spool chain */ /* chain of redir instances */ LIST_HEAD(redir_chain, cfg_redir) redir_chain; char if_name[IF_NAMESIZE]; /* interface name */ }; static eventhandler_tag ifaddr_event_tag; static void ifaddr_change(void *arg __unused, struct ifnet *ifp) { struct cfg_nat *ptr; struct ifaddr *ifa; struct ip_fw_chain *chain; KASSERT(curvnet == ifp->if_vnet, ("curvnet(%p) differs from iface vnet(%p)", curvnet, ifp->if_vnet)); chain = &V_layer3_chain; IPFW_UH_WLOCK(chain); /* Check every nat entry... */ LIST_FOREACH(ptr, &chain->nat, _next) { /* ...using nic 'ifp->if_xname' as dynamic alias address. */ if (strncmp(ptr->if_name, ifp->if_xname, IF_NAMESIZE) != 0) continue; if_addr_rlock(ifp); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr == NULL) continue; if (ifa->ifa_addr->sa_family != AF_INET) continue; IPFW_WLOCK(chain); ptr->ip = ((struct sockaddr_in *) (ifa->ifa_addr))->sin_addr; LibAliasSetAddress(ptr->lib, ptr->ip); IPFW_WUNLOCK(chain); } if_addr_runlock(ifp); } IPFW_UH_WUNLOCK(chain); } /* * delete the pointers for nat entry ix, or all of them if ix < 0 */ static void flush_nat_ptrs(struct ip_fw_chain *chain, const int ix) { int i; ipfw_insn_nat *cmd; IPFW_WLOCK_ASSERT(chain); for (i = 0; i < chain->n_rules; i++) { cmd = (ipfw_insn_nat *)ACTION_PTR(chain->map[i]); /* XXX skip log and the like ? */ if (cmd->o.opcode == O_NAT && cmd->nat != NULL && (ix < 0 || cmd->nat->id == ix)) cmd->nat = NULL; } } static void del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head) { struct cfg_redir *r, *tmp_r; struct cfg_spool *s, *tmp_s; int i, num; LIST_FOREACH_SAFE(r, head, _next, tmp_r) { num = 1; /* Number of alias_link to delete. */ switch (r->mode) { case NAT44_REDIR_PORT: num = r->pport_cnt; /* FALLTHROUGH */ case NAT44_REDIR_ADDR: case NAT44_REDIR_PROTO: /* Delete all libalias redirect entry. */ for (i = 0; i < num; i++) LibAliasRedirectDelete(n->lib, r->alink[i]); /* Del spool cfg if any. */ LIST_FOREACH_SAFE(s, &r->spool_chain, _next, tmp_s) { LIST_REMOVE(s, _next); free(s, M_IPFW); } free(r->alink, M_IPFW); LIST_REMOVE(r, _next); free(r, M_IPFW); break; default: printf("unknown redirect mode: %u\n", r->mode); /* XXX - panic?!?!? */ break; } } } static int add_redir_spool_cfg(char *buf, struct cfg_nat *ptr) { struct cfg_redir *r; struct cfg_spool *s; struct nat44_cfg_redir *ser_r; struct nat44_cfg_spool *ser_s; int cnt, off, i; for (cnt = 0, off = 0; cnt < ptr->redir_cnt; cnt++) { ser_r = (struct nat44_cfg_redir *)&buf[off]; r = malloc(sizeof(*r), M_IPFW, M_WAITOK | M_ZERO); r->mode = ser_r->mode; r->laddr = ser_r->laddr; r->paddr = ser_r->paddr; r->raddr = ser_r->raddr; r->lport = ser_r->lport; r->pport = ser_r->pport; r->rport = ser_r->rport; r->pport_cnt = ser_r->pport_cnt; r->rport_cnt = ser_r->rport_cnt; r->proto = ser_r->proto; r->spool_cnt = ser_r->spool_cnt; //memcpy(r, ser_r, SOF_REDIR); LIST_INIT(&r->spool_chain); off += sizeof(struct nat44_cfg_redir); r->alink = malloc(sizeof(struct alias_link *) * r->pport_cnt, M_IPFW, M_WAITOK | M_ZERO); switch (r->mode) { case NAT44_REDIR_ADDR: r->alink[0] = LibAliasRedirectAddr(ptr->lib, r->laddr, r->paddr); break; case NAT44_REDIR_PORT: for (i = 0 ; i < r->pport_cnt; i++) { /* If remotePort is all ports, set it to 0. */ u_short remotePortCopy = r->rport + i; if (r->rport_cnt == 1 && r->rport == 0) remotePortCopy = 0; r->alink[i] = LibAliasRedirectPort(ptr->lib, r->laddr, htons(r->lport + i), r->raddr, htons(remotePortCopy), r->paddr, htons(r->pport + i), r->proto); if (r->alink[i] == NULL) { r->alink[0] = NULL; break; } } break; case NAT44_REDIR_PROTO: r->alink[0] = LibAliasRedirectProto(ptr->lib ,r->laddr, r->raddr, r->paddr, r->proto); break; default: printf("unknown redirect mode: %u\n", r->mode); break; } if (r->alink[0] == NULL) { printf("LibAliasRedirect* returned NULL\n"); return (EINVAL); } /* LSNAT handling. */ for (i = 0; i < r->spool_cnt; i++) { ser_s = (struct nat44_cfg_spool *)&buf[off]; s = malloc(sizeof(*s), M_IPFW, M_WAITOK | M_ZERO); s->addr = ser_s->addr; s->port = ser_s->port; LibAliasAddServer(ptr->lib, r->alink[0], s->addr, htons(s->port)); off += sizeof(struct nat44_cfg_spool); /* Hook spool entry. */ LIST_INSERT_HEAD(&r->spool_chain, s, _next); } /* And finally hook this redir entry. */ LIST_INSERT_HEAD(&ptr->redir_chain, r, _next); } return (0); } /* * ipfw_nat - perform mbuf header translation. * * Note V_layer3_chain has to be locked while calling ipfw_nat() in * 'global' operation mode (t == NULL). * */ static int ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m) { struct mbuf *mcl; struct ip *ip; /* XXX - libalias duct tape */ int ldt, retval, found; struct ip_fw_chain *chain; char *c; ldt = 0; retval = 0; mcl = m_megapullup(m, m->m_pkthdr.len); if (mcl == NULL) { args->m = NULL; return (IP_FW_DENY); } ip = mtod(mcl, struct ip *); /* * XXX - Libalias checksum offload 'duct tape': * * locally generated packets have only pseudo-header checksum * calculated and libalias will break it[1], so mark them for * later fix. Moreover there are cases when libalias modifies * tcp packet data[2], mark them for later fix too. * * [1] libalias was never meant to run in kernel, so it does * not have any knowledge about checksum offloading, and * expects a packet with a full internet checksum. * Unfortunately, packets generated locally will have just the * pseudo header calculated, and when libalias tries to adjust * the checksum it will actually compute a wrong value. * * [2] when libalias modifies tcp's data content, full TCP * checksum has to be recomputed: the problem is that * libalias does not have any idea about checksum offloading. * To work around this, we do not do checksumming in LibAlias, * but only mark the packets in th_x2 field. If we receive a * marked packet, we calculate correct checksum for it * aware of offloading. Why such a terrible hack instead of * recalculating checksum for each packet? * Because the previous checksum was not checked! * Recalculating checksums for EVERY packet will hide ALL * transmission errors. Yes, marked packets still suffer from * this problem. But, sigh, natd(8) has this problem, too. * * TODO: -make libalias mbuf aware (so * it can handle delayed checksum and tso) */ if (mcl->m_pkthdr.rcvif == NULL && mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) ldt = 1; c = mtod(mcl, char *); /* Check if this is 'global' instance */ if (t == NULL) { if (args->oif == NULL) { /* Wrong direction, skip processing */ args->m = mcl; return (IP_FW_NAT); } found = 0; chain = &V_layer3_chain; IPFW_RLOCK_ASSERT(chain); /* Check every nat entry... */ LIST_FOREACH(t, &chain->nat, _next) { if ((t->mode & PKT_ALIAS_SKIP_GLOBAL) != 0) continue; retval = LibAliasOutTry(t->lib, c, mcl->m_len + M_TRAILINGSPACE(mcl), 0); if (retval == PKT_ALIAS_OK) { /* Nat instance recognises state */ found = 1; break; } } if (found != 1) { /* No instance found, return ignore */ args->m = mcl; return (IP_FW_NAT); } } else { if (args->oif == NULL) retval = LibAliasIn(t->lib, c, mcl->m_len + M_TRAILINGSPACE(mcl)); else retval = LibAliasOut(t->lib, c, mcl->m_len + M_TRAILINGSPACE(mcl)); } /* * We drop packet when: * 1. libalias returns PKT_ALIAS_ERROR; * 2. For incoming packets: * a) for unresolved fragments; * b) libalias returns PKT_ALIAS_IGNORED and * PKT_ALIAS_DENY_INCOMING flag is set. */ if (retval == PKT_ALIAS_ERROR || (args->oif == NULL && (retval == PKT_ALIAS_UNRESOLVED_FRAGMENT || (retval == PKT_ALIAS_IGNORED && (t->mode & PKT_ALIAS_DENY_INCOMING) != 0)))) { /* XXX - should i add some logging? */ m_free(mcl); args->m = NULL; return (IP_FW_DENY); } if (retval == PKT_ALIAS_RESPOND) mcl->m_flags |= M_SKIP_FIREWALL; mcl->m_pkthdr.len = mcl->m_len = ntohs(ip->ip_len); /* * XXX - libalias checksum offload * 'duct tape' (see above) */ if ((ip->ip_off & htons(IP_OFFMASK)) == 0 && ip->ip_p == IPPROTO_TCP) { struct tcphdr *th; th = (struct tcphdr *)(ip + 1); if (th->th_x2) ldt = 1; } if (ldt) { struct tcphdr *th; struct udphdr *uh; uint16_t ip_len, cksum; ip_len = ntohs(ip->ip_len); cksum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(ip->ip_p + ip_len - (ip->ip_hl << 2))); switch (ip->ip_p) { case IPPROTO_TCP: th = (struct tcphdr *)(ip + 1); /* * Maybe it was set in * libalias... */ th->th_x2 = 0; th->th_sum = cksum; mcl->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); break; case IPPROTO_UDP: uh = (struct udphdr *)(ip + 1); uh->uh_sum = cksum; mcl->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); break; } /* No hw checksum offloading: do it ourselves */ if ((mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) == 0) { in_delayed_cksum(mcl); mcl->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; } } args->m = mcl; return (IP_FW_NAT); } static struct cfg_nat * lookup_nat(struct nat_list *l, int nat_id) { struct cfg_nat *res; LIST_FOREACH(res, l, _next) { if (res->id == nat_id) break; } return res; } static struct cfg_nat * lookup_nat_name(struct nat_list *l, char *name) { struct cfg_nat *res; int id; char *errptr; id = strtol(name, &errptr, 10); if (id == 0 || *errptr != '\0') return (NULL); LIST_FOREACH(res, l, _next) { if (res->id == id) break; } return (res); } /* IP_FW3 configuration routines */ static void nat44_config(struct ip_fw_chain *chain, struct nat44_cfg_nat *ucfg) { struct cfg_nat *ptr, *tcfg; int gencnt; /* * Find/create nat rule. */ IPFW_UH_WLOCK(chain); gencnt = chain->gencnt; ptr = lookup_nat_name(&chain->nat, ucfg->name); if (ptr == NULL) { IPFW_UH_WUNLOCK(chain); /* New rule: allocate and init new instance. */ ptr = malloc(sizeof(struct cfg_nat), M_IPFW, M_WAITOK | M_ZERO); ptr->lib = LibAliasInit(NULL); LIST_INIT(&ptr->redir_chain); } else { /* Entry already present: temporarily unhook it. */ IPFW_WLOCK(chain); LIST_REMOVE(ptr, _next); flush_nat_ptrs(chain, ptr->id); IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); } /* * Basic nat (re)configuration. */ ptr->id = strtol(ucfg->name, NULL, 10); /* * XXX - what if this rule doesn't nat any ip and just * redirect? * do we set aliasaddress to 0.0.0.0? */ ptr->ip = ucfg->ip; ptr->redir_cnt = ucfg->redir_cnt; ptr->mode = ucfg->mode; strlcpy(ptr->if_name, ucfg->if_name, sizeof(ptr->if_name)); LibAliasSetMode(ptr->lib, ptr->mode, ~0); LibAliasSetAddress(ptr->lib, ptr->ip); /* * Redir and LSNAT configuration. */ /* Delete old cfgs. */ del_redir_spool_cfg(ptr, &ptr->redir_chain); /* Add new entries. */ add_redir_spool_cfg((char *)(ucfg + 1), ptr); IPFW_UH_WLOCK(chain); /* Extra check to avoid race with another ipfw_nat_cfg() */ tcfg = NULL; if (gencnt != chain->gencnt) tcfg = lookup_nat_name(&chain->nat, ucfg->name); IPFW_WLOCK(chain); if (tcfg != NULL) LIST_REMOVE(tcfg, _next); LIST_INSERT_HEAD(&chain->nat, ptr, _next); IPFW_WUNLOCK(chain); chain->gencnt++; IPFW_UH_WUNLOCK(chain); if (tcfg != NULL) free(tcfg, M_IPFW); } /* * Creates/configure nat44 instance * Data layout (v0)(current): * Request: [ ipfw_obj_header nat44_cfg_nat .. ] * * Returns 0 on success */ static int nat44_cfg(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_header *oh; struct nat44_cfg_nat *ucfg; int id; size_t read; char *errptr; /* Check minimum header size */ if (sd->valsize < (sizeof(*oh) + sizeof(*ucfg))) return (EINVAL); oh = (ipfw_obj_header *)sd->kbuf; /* Basic length checks for TLVs */ if (oh->ntlv.head.length != sizeof(oh->ntlv)) return (EINVAL); ucfg = (struct nat44_cfg_nat *)(oh + 1); /* Check if name is properly terminated and looks like number */ if (strnlen(ucfg->name, sizeof(ucfg->name)) == sizeof(ucfg->name)) return (EINVAL); id = strtol(ucfg->name, &errptr, 10); if (id == 0 || *errptr != '\0') return (EINVAL); read = sizeof(*oh) + sizeof(*ucfg); /* Check number of redirs */ if (sd->valsize < read + ucfg->redir_cnt*sizeof(struct nat44_cfg_redir)) return (EINVAL); nat44_config(chain, ucfg); return (0); } /* * Destroys given nat instances. * Data layout (v0)(current): * Request: [ ipfw_obj_header ] * * Returns 0 on success */ static int nat44_destroy(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_header *oh; struct cfg_nat *ptr; ipfw_obj_ntlv *ntlv; /* Check minimum header size */ if (sd->valsize < sizeof(*oh)) return (EINVAL); oh = (ipfw_obj_header *)sd->kbuf; /* Basic length checks for TLVs */ if (oh->ntlv.head.length != sizeof(oh->ntlv)) return (EINVAL); ntlv = &oh->ntlv; /* Check if name is properly terminated */ if (strnlen(ntlv->name, sizeof(ntlv->name)) == sizeof(ntlv->name)) return (EINVAL); IPFW_UH_WLOCK(chain); ptr = lookup_nat_name(&chain->nat, ntlv->name); if (ptr == NULL) { IPFW_UH_WUNLOCK(chain); return (ESRCH); } IPFW_WLOCK(chain); LIST_REMOVE(ptr, _next); flush_nat_ptrs(chain, ptr->id); IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); del_redir_spool_cfg(ptr, &ptr->redir_chain); LibAliasUninit(ptr->lib); free(ptr, M_IPFW); return (0); } static void export_nat_cfg(struct cfg_nat *ptr, struct nat44_cfg_nat *ucfg) { snprintf(ucfg->name, sizeof(ucfg->name), "%d", ptr->id); ucfg->ip = ptr->ip; ucfg->redir_cnt = ptr->redir_cnt; ucfg->mode = ptr->mode; strlcpy(ucfg->if_name, ptr->if_name, sizeof(ucfg->if_name)); } /* * Gets config for given nat instance * Data layout (v0)(current): * Request: [ ipfw_obj_header nat44_cfg_nat .. ] * * Returns 0 on success */ static int nat44_get_cfg(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_header *oh; struct nat44_cfg_nat *ucfg; struct cfg_nat *ptr; struct cfg_redir *r; struct cfg_spool *s; struct nat44_cfg_redir *ser_r; struct nat44_cfg_spool *ser_s; size_t sz; sz = sizeof(*oh) + sizeof(*ucfg); /* Check minimum header size */ if (sd->valsize < sz) return (EINVAL); oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); /* Basic length checks for TLVs */ if (oh->ntlv.head.length != sizeof(oh->ntlv)) return (EINVAL); ucfg = (struct nat44_cfg_nat *)(oh + 1); /* Check if name is properly terminated */ if (strnlen(ucfg->name, sizeof(ucfg->name)) == sizeof(ucfg->name)) return (EINVAL); IPFW_UH_RLOCK(chain); ptr = lookup_nat_name(&chain->nat, ucfg->name); if (ptr == NULL) { IPFW_UH_RUNLOCK(chain); return (ESRCH); } export_nat_cfg(ptr, ucfg); /* Estimate memory amount */ - sz = sizeof(struct nat44_cfg_nat); + sz = sizeof(ipfw_obj_header) + sizeof(struct nat44_cfg_nat); LIST_FOREACH(r, &ptr->redir_chain, _next) { sz += sizeof(struct nat44_cfg_redir); LIST_FOREACH(s, &r->spool_chain, _next) sz += sizeof(struct nat44_cfg_spool); } ucfg->size = sz; - if (sd->valsize < sz + sizeof(*oh)) { + if (sd->valsize < sz) { /* * Submitted buffer size is not enough. * WE've already filled in @ucfg structure with * relevant info including size, so we * can return. Buffer will be flushed automatically. */ IPFW_UH_RUNLOCK(chain); return (ENOMEM); } /* Size OK, let's copy data */ LIST_FOREACH(r, &ptr->redir_chain, _next) { ser_r = (struct nat44_cfg_redir *)ipfw_get_sopt_space(sd, sizeof(*ser_r)); ser_r->mode = r->mode; ser_r->laddr = r->laddr; ser_r->paddr = r->paddr; ser_r->raddr = r->raddr; ser_r->lport = r->lport; ser_r->pport = r->pport; ser_r->rport = r->rport; ser_r->pport_cnt = r->pport_cnt; ser_r->rport_cnt = r->rport_cnt; ser_r->proto = r->proto; ser_r->spool_cnt = r->spool_cnt; LIST_FOREACH(s, &r->spool_chain, _next) { ser_s = (struct nat44_cfg_spool *)ipfw_get_sopt_space( sd, sizeof(*ser_s)); ser_s->addr = s->addr; ser_s->port = s->port; } } IPFW_UH_RUNLOCK(chain); return (0); } /* * Lists all nat44 instances currently available in kernel. * Data layout (v0)(current): * Request: [ ipfw_obj_lheader ] * Reply: [ ipfw_obj_lheader nat44_cfg_nat x N ] * * Returns 0 on success */ static int nat44_list_nat(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_lheader *olh; struct nat44_cfg_nat *ucfg; struct cfg_nat *ptr; int nat_count; /* Check minimum header size */ if (sd->valsize < sizeof(ipfw_obj_lheader)) return (EINVAL); olh = (ipfw_obj_lheader *)ipfw_get_sopt_header(sd, sizeof(*olh)); IPFW_UH_RLOCK(chain); nat_count = 0; LIST_FOREACH(ptr, &chain->nat, _next) nat_count++; olh->count = nat_count; olh->objsize = sizeof(struct nat44_cfg_nat); olh->size = sizeof(*olh) + olh->count * olh->objsize; if (sd->valsize < olh->size) { IPFW_UH_RUNLOCK(chain); return (ENOMEM); } LIST_FOREACH(ptr, &chain->nat, _next) { ucfg = (struct nat44_cfg_nat *)ipfw_get_sopt_space(sd, sizeof(*ucfg)); export_nat_cfg(ptr, ucfg); } IPFW_UH_RUNLOCK(chain); return (0); } /* * Gets log for given nat instance * Data layout (v0)(current): * Request: [ ipfw_obj_header nat44_cfg_nat ] * Reply: [ ipfw_obj_header nat44_cfg_nat LOGBUFFER ] * * Returns 0 on success */ static int nat44_get_log(struct ip_fw_chain *chain, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_header *oh; struct nat44_cfg_nat *ucfg; struct cfg_nat *ptr; void *pbuf; size_t sz; sz = sizeof(*oh) + sizeof(*ucfg); /* Check minimum header size */ if (sd->valsize < sz) return (EINVAL); oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); /* Basic length checks for TLVs */ if (oh->ntlv.head.length != sizeof(oh->ntlv)) return (EINVAL); ucfg = (struct nat44_cfg_nat *)(oh + 1); /* Check if name is properly terminated */ if (strnlen(ucfg->name, sizeof(ucfg->name)) == sizeof(ucfg->name)) return (EINVAL); IPFW_UH_RLOCK(chain); ptr = lookup_nat_name(&chain->nat, ucfg->name); if (ptr == NULL) { IPFW_UH_RUNLOCK(chain); return (ESRCH); } if (ptr->lib->logDesc == NULL) { IPFW_UH_RUNLOCK(chain); return (ENOENT); } export_nat_cfg(ptr, ucfg); /* Estimate memory amount */ ucfg->size = sizeof(struct nat44_cfg_nat) + LIBALIAS_BUF_SIZE; if (sd->valsize < sz + sizeof(*oh)) { /* * Submitted buffer size is not enough. * WE've already filled in @ucfg structure with * relevant info including size, so we * can return. Buffer will be flushed automatically. */ IPFW_UH_RUNLOCK(chain); return (ENOMEM); } pbuf = (void *)ipfw_get_sopt_space(sd, LIBALIAS_BUF_SIZE); memcpy(pbuf, ptr->lib->logDesc, LIBALIAS_BUF_SIZE); IPFW_UH_RUNLOCK(chain); return (0); } static struct ipfw_sopt_handler scodes[] = { { IP_FW_NAT44_XCONFIG, 0, HDIR_SET, nat44_cfg }, { IP_FW_NAT44_DESTROY, 0, HDIR_SET, nat44_destroy }, { IP_FW_NAT44_XGETCONFIG, 0, HDIR_GET, nat44_get_cfg }, { IP_FW_NAT44_LIST_NAT, 0, HDIR_GET, nat44_list_nat }, { IP_FW_NAT44_XGETLOG, 0, HDIR_GET, nat44_get_log }, }; /* * Legacy configuration routines */ struct cfg_spool_legacy { LIST_ENTRY(cfg_spool_legacy) _next; struct in_addr addr; u_short port; }; struct cfg_redir_legacy { LIST_ENTRY(cfg_redir) _next; u_int16_t mode; struct in_addr laddr; struct in_addr paddr; struct in_addr raddr; u_short lport; u_short pport; u_short rport; u_short pport_cnt; u_short rport_cnt; int proto; struct alias_link **alink; u_int16_t spool_cnt; LIST_HEAD(, cfg_spool_legacy) spool_chain; }; struct cfg_nat_legacy { LIST_ENTRY(cfg_nat_legacy) _next; int id; struct in_addr ip; char if_name[IF_NAMESIZE]; int mode; struct libalias *lib; int redir_cnt; LIST_HEAD(, cfg_redir_legacy) redir_chain; }; static int ipfw_nat_cfg(struct sockopt *sopt) { struct cfg_nat_legacy *cfg; struct nat44_cfg_nat *ucfg; struct cfg_redir_legacy *rdir; struct nat44_cfg_redir *urdir; char *buf; size_t len, len2; int error, i; len = sopt->sopt_valsize; len2 = len + 128; /* * Allocate 2x buffer to store converted structures. * new redir_cfg has shrinked, so we're sure that * new buffer size is enough. */ buf = malloc(roundup2(len, 8) + len2, M_TEMP, M_WAITOK | M_ZERO); error = sooptcopyin(sopt, buf, len, sizeof(struct cfg_nat_legacy)); if (error != 0) goto out; cfg = (struct cfg_nat_legacy *)buf; if (cfg->id < 0) { error = EINVAL; goto out; } ucfg = (struct nat44_cfg_nat *)&buf[roundup2(len, 8)]; snprintf(ucfg->name, sizeof(ucfg->name), "%d", cfg->id); strlcpy(ucfg->if_name, cfg->if_name, sizeof(ucfg->if_name)); ucfg->ip = cfg->ip; ucfg->mode = cfg->mode; ucfg->redir_cnt = cfg->redir_cnt; if (len < sizeof(*cfg) + cfg->redir_cnt * sizeof(*rdir)) { error = EINVAL; goto out; } urdir = (struct nat44_cfg_redir *)(ucfg + 1); rdir = (struct cfg_redir_legacy *)(cfg + 1); for (i = 0; i < cfg->redir_cnt; i++) { urdir->mode = rdir->mode; urdir->laddr = rdir->laddr; urdir->paddr = rdir->paddr; urdir->raddr = rdir->raddr; urdir->lport = rdir->lport; urdir->pport = rdir->pport; urdir->rport = rdir->rport; urdir->pport_cnt = rdir->pport_cnt; urdir->rport_cnt = rdir->rport_cnt; urdir->proto = rdir->proto; urdir->spool_cnt = rdir->spool_cnt; urdir++; rdir++; } nat44_config(&V_layer3_chain, ucfg); out: free(buf, M_TEMP); return (error); } static int ipfw_nat_del(struct sockopt *sopt) { struct cfg_nat *ptr; struct ip_fw_chain *chain = &V_layer3_chain; int i; sooptcopyin(sopt, &i, sizeof i, sizeof i); /* XXX validate i */ IPFW_UH_WLOCK(chain); ptr = lookup_nat(&chain->nat, i); if (ptr == NULL) { IPFW_UH_WUNLOCK(chain); return (EINVAL); } IPFW_WLOCK(chain); LIST_REMOVE(ptr, _next); flush_nat_ptrs(chain, i); IPFW_WUNLOCK(chain); IPFW_UH_WUNLOCK(chain); del_redir_spool_cfg(ptr, &ptr->redir_chain); LibAliasUninit(ptr->lib); free(ptr, M_IPFW); return (0); } static int ipfw_nat_get_cfg(struct sockopt *sopt) { struct ip_fw_chain *chain = &V_layer3_chain; struct cfg_nat *n; struct cfg_nat_legacy *ucfg; struct cfg_redir *r; struct cfg_spool *s; struct cfg_redir_legacy *ser_r; struct cfg_spool_legacy *ser_s; char *data; int gencnt, nat_cnt, len, error; nat_cnt = 0; len = sizeof(nat_cnt); IPFW_UH_RLOCK(chain); retry: gencnt = chain->gencnt; /* Estimate memory amount */ LIST_FOREACH(n, &chain->nat, _next) { nat_cnt++; len += sizeof(struct cfg_nat_legacy); LIST_FOREACH(r, &n->redir_chain, _next) { len += sizeof(struct cfg_redir_legacy); LIST_FOREACH(s, &r->spool_chain, _next) len += sizeof(struct cfg_spool_legacy); } } IPFW_UH_RUNLOCK(chain); data = malloc(len, M_TEMP, M_WAITOK | M_ZERO); bcopy(&nat_cnt, data, sizeof(nat_cnt)); nat_cnt = 0; len = sizeof(nat_cnt); IPFW_UH_RLOCK(chain); if (gencnt != chain->gencnt) { free(data, M_TEMP); goto retry; } /* Serialize all the data. */ LIST_FOREACH(n, &chain->nat, _next) { ucfg = (struct cfg_nat_legacy *)&data[len]; ucfg->id = n->id; ucfg->ip = n->ip; ucfg->redir_cnt = n->redir_cnt; ucfg->mode = n->mode; strlcpy(ucfg->if_name, n->if_name, sizeof(ucfg->if_name)); len += sizeof(struct cfg_nat_legacy); LIST_FOREACH(r, &n->redir_chain, _next) { ser_r = (struct cfg_redir_legacy *)&data[len]; ser_r->mode = r->mode; ser_r->laddr = r->laddr; ser_r->paddr = r->paddr; ser_r->raddr = r->raddr; ser_r->lport = r->lport; ser_r->pport = r->pport; ser_r->rport = r->rport; ser_r->pport_cnt = r->pport_cnt; ser_r->rport_cnt = r->rport_cnt; ser_r->proto = r->proto; ser_r->spool_cnt = r->spool_cnt; len += sizeof(struct cfg_redir_legacy); LIST_FOREACH(s, &r->spool_chain, _next) { ser_s = (struct cfg_spool_legacy *)&data[len]; ser_s->addr = s->addr; ser_s->port = s->port; len += sizeof(struct cfg_spool_legacy); } } } IPFW_UH_RUNLOCK(chain); error = sooptcopyout(sopt, data, len); free(data, M_TEMP); return (error); } static int ipfw_nat_get_log(struct sockopt *sopt) { uint8_t *data; struct cfg_nat *ptr; int i, size; struct ip_fw_chain *chain; IPFW_RLOCK_TRACKER; chain = &V_layer3_chain; IPFW_RLOCK(chain); /* one pass to count, one to copy the data */ i = 0; LIST_FOREACH(ptr, &chain->nat, _next) { if (ptr->lib->logDesc == NULL) continue; i++; } size = i * (LIBALIAS_BUF_SIZE + sizeof(int)); data = malloc(size, M_IPFW, M_NOWAIT | M_ZERO); if (data == NULL) { IPFW_RUNLOCK(chain); return (ENOSPC); } i = 0; LIST_FOREACH(ptr, &chain->nat, _next) { if (ptr->lib->logDesc == NULL) continue; bcopy(&ptr->id, &data[i], sizeof(int)); i += sizeof(int); bcopy(ptr->lib->logDesc, &data[i], LIBALIAS_BUF_SIZE); i += LIBALIAS_BUF_SIZE; } IPFW_RUNLOCK(chain); sooptcopyout(sopt, data, size); free(data, M_IPFW); return(0); } static int vnet_ipfw_nat_init(const void *arg __unused) { V_ipfw_nat_ready = 1; return (0); } static int vnet_ipfw_nat_uninit(const void *arg __unused) { struct cfg_nat *ptr, *ptr_temp; struct ip_fw_chain *chain; chain = &V_layer3_chain; IPFW_WLOCK(chain); LIST_FOREACH_SAFE(ptr, &chain->nat, _next, ptr_temp) { LIST_REMOVE(ptr, _next); del_redir_spool_cfg(ptr, &ptr->redir_chain); LibAliasUninit(ptr->lib); free(ptr, M_IPFW); } flush_nat_ptrs(chain, -1 /* flush all */); V_ipfw_nat_ready = 0; IPFW_WUNLOCK(chain); return (0); } static void ipfw_nat_init(void) { /* init ipfw hooks */ ipfw_nat_ptr = ipfw_nat; lookup_nat_ptr = lookup_nat; ipfw_nat_cfg_ptr = ipfw_nat_cfg; ipfw_nat_del_ptr = ipfw_nat_del; ipfw_nat_get_cfg_ptr = ipfw_nat_get_cfg; ipfw_nat_get_log_ptr = ipfw_nat_get_log; IPFW_ADD_SOPT_HANDLER(1, scodes); ifaddr_event_tag = EVENTHANDLER_REGISTER(ifaddr_event, ifaddr_change, NULL, EVENTHANDLER_PRI_ANY); } static void ipfw_nat_destroy(void) { EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_event_tag); /* deregister ipfw_nat */ IPFW_DEL_SOPT_HANDLER(1, scodes); ipfw_nat_ptr = NULL; lookup_nat_ptr = NULL; ipfw_nat_cfg_ptr = NULL; ipfw_nat_del_ptr = NULL; ipfw_nat_get_cfg_ptr = NULL; ipfw_nat_get_log_ptr = NULL; } static int ipfw_nat_modevent(module_t mod, int type, void *unused) { int err = 0; switch (type) { case MOD_LOAD: break; case MOD_UNLOAD: break; default: return EOPNOTSUPP; break; } return err; } static moduledata_t ipfw_nat_mod = { "ipfw_nat", ipfw_nat_modevent, 0 }; /* Define startup order. */ #define IPFW_NAT_SI_SUB_FIREWALL SI_SUB_PROTO_IFATTACHDOMAIN #define IPFW_NAT_MODEVENT_ORDER (SI_ORDER_ANY - 128) /* after ipfw */ #define IPFW_NAT_MODULE_ORDER (IPFW_NAT_MODEVENT_ORDER + 1) #define IPFW_NAT_VNET_ORDER (IPFW_NAT_MODEVENT_ORDER + 2) DECLARE_MODULE(ipfw_nat, ipfw_nat_mod, IPFW_NAT_SI_SUB_FIREWALL, SI_ORDER_ANY); MODULE_DEPEND(ipfw_nat, libalias, 1, 1, 1); MODULE_DEPEND(ipfw_nat, ipfw, 3, 3, 3); MODULE_VERSION(ipfw_nat, 1); SYSINIT(ipfw_nat_init, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_MODULE_ORDER, ipfw_nat_init, NULL); VNET_SYSINIT(vnet_ipfw_nat_init, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_VNET_ORDER, vnet_ipfw_nat_init, NULL); SYSUNINIT(ipfw_nat_destroy, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_MODULE_ORDER, ipfw_nat_destroy, NULL); VNET_SYSUNINIT(vnet_ipfw_nat_uninit, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_VNET_ORDER, vnet_ipfw_nat_uninit, NULL); /* end of file */ Index: projects/building-blocks/sys/netpfil/ipfw/ip_fw_private.h =================================================================== --- projects/building-blocks/sys/netpfil/ipfw/ip_fw_private.h (revision 278311) +++ projects/building-blocks/sys/netpfil/ipfw/ip_fw_private.h (revision 278312) @@ -1,623 +1,624 @@ /*- * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _IPFW2_PRIVATE_H #define _IPFW2_PRIVATE_H /* * Internal constants and data structures used by ipfw components * and not meant to be exported outside the kernel. */ #ifdef _KERNEL /* * For platforms that do not have SYSCTL support, we wrap the * SYSCTL_* into a function (one per file) to collect the values * into an array at module initialization. The wrapping macros, * SYSBEGIN() and SYSEND, are empty in the default case. */ #ifndef SYSBEGIN #define SYSBEGIN(x) #endif #ifndef SYSEND #define SYSEND #endif /* Return values from ipfw_chk() */ enum { IP_FW_PASS = 0, IP_FW_DENY, IP_FW_DIVERT, IP_FW_TEE, IP_FW_DUMMYNET, IP_FW_NETGRAPH, IP_FW_NGTEE, IP_FW_NAT, IP_FW_REASS, }; /* * Structure for collecting parameters to dummynet for ip6_output forwarding */ struct _ip6dn_args { struct ip6_pktopts *opt_or; int flags_or; struct ip6_moptions *im6o_or; struct ifnet *origifp_or; struct ifnet *ifp_or; struct sockaddr_in6 dst_or; u_long mtu_or; }; /* * Arguments for calling ipfw_chk() and dummynet_io(). We put them * all into a structure because this way it is easier and more * efficient to pass variables around and extend the interface. */ struct ip_fw_args { struct mbuf *m; /* the mbuf chain */ struct ifnet *oif; /* output interface */ struct sockaddr_in *next_hop; /* forward address */ struct sockaddr_in6 *next_hop6; /* ipv6 forward address */ /* * On return, it points to the matching rule. * On entry, rule.slot > 0 means the info is valid and * contains the starting rule for an ipfw search. * If chain_id == chain->id && slot >0 then jump to that slot. * Otherwise, we locate the first rule >= rulenum:rule_id */ struct ipfw_rule_ref rule; /* match/restart info */ struct ether_header *eh; /* for bridged packets */ struct ipfw_flow_id f_id; /* grabbed from IP header */ //uint32_t cookie; /* a cookie depending on rule action */ struct inpcb *inp; struct _ip6dn_args dummypar; /* dummynet->ip6_output */ struct sockaddr_in hopstore; /* store here if cannot use a pointer */ }; MALLOC_DECLARE(M_IPFW); /* * Hooks sometime need to know the direction of the packet * (divert, dummynet, netgraph, ...) * We use a generic definition here, with bit0-1 indicating the * direction, bit 2 indicating layer2 or 3, bit 3-4 indicating the * specific protocol * indicating the protocol (if necessary) */ enum { DIR_MASK = 0x3, DIR_OUT = 0, DIR_IN = 1, DIR_FWD = 2, DIR_DROP = 3, PROTO_LAYER2 = 0x4, /* set for layer 2 */ /* PROTO_DEFAULT = 0, */ PROTO_IPV4 = 0x08, PROTO_IPV6 = 0x10, PROTO_IFB = 0x0c, /* layer2 + ifbridge */ /* PROTO_OLDBDG = 0x14, unused, old bridge */ }; /* wrapper for freeing a packet, in case we need to do more work */ #ifndef FREE_PKT #if defined(__linux__) || defined(_WIN32) #define FREE_PKT(m) netisr_dispatch(-1, m) #else #define FREE_PKT(m) m_freem(m) #endif #endif /* !FREE_PKT */ /* * Function definitions. */ /* attach (arg = 1) or detach (arg = 0) hooks */ int ipfw_attach_hooks(int); #ifdef NOTYET void ipfw_nat_destroy(void); #endif /* In ip_fw_log.c */ struct ip; struct ip_fw_chain; void ipfw_log_bpf(int); void ipfw_log(struct ip_fw_chain *chain, struct ip_fw *f, u_int hlen, struct ip_fw_args *args, struct mbuf *m, struct ifnet *oif, u_short offset, uint32_t tablearg, struct ip *ip); VNET_DECLARE(u_int64_t, norule_counter); #define V_norule_counter VNET(norule_counter) VNET_DECLARE(int, verbose_limit); #define V_verbose_limit VNET(verbose_limit) /* In ip_fw_dynamic.c */ enum { /* result for matching dynamic rules */ MATCH_REVERSE = 0, MATCH_FORWARD, MATCH_NONE, MATCH_UNKNOWN, }; /* * The lock for dynamic rules is only used once outside the file, * and only to release the result of lookup_dyn_rule(). * Eventually we may implement it with a callback on the function. */ struct ip_fw_chain; struct sockopt_data; int ipfw_is_dyn_rule(struct ip_fw *rule); void ipfw_expire_dyn_rules(struct ip_fw_chain *, ipfw_range_tlv *); void ipfw_dyn_unlock(ipfw_dyn_rule *q); struct tcphdr; struct mbuf *ipfw_send_pkt(struct mbuf *, struct ipfw_flow_id *, u_int32_t, u_int32_t, int); int ipfw_install_state(struct ip_fw_chain *chain, struct ip_fw *rule, ipfw_insn_limit *cmd, struct ip_fw_args *args, uint32_t tablearg); ipfw_dyn_rule *ipfw_lookup_dyn_rule(struct ipfw_flow_id *pkt, int *match_direction, struct tcphdr *tcp); void ipfw_remove_dyn_children(struct ip_fw *rule); void ipfw_get_dynamic(struct ip_fw_chain *chain, char **bp, const char *ep); int ipfw_dump_states(struct ip_fw_chain *chain, struct sockopt_data *sd); void ipfw_dyn_init(struct ip_fw_chain *); /* per-vnet initialization */ void ipfw_dyn_uninit(int); /* per-vnet deinitialization */ int ipfw_dyn_len(void); int ipfw_dyn_get_count(void); /* common variables */ VNET_DECLARE(int, fw_one_pass); #define V_fw_one_pass VNET(fw_one_pass) VNET_DECLARE(int, fw_verbose); #define V_fw_verbose VNET(fw_verbose) VNET_DECLARE(struct ip_fw_chain, layer3_chain); #define V_layer3_chain VNET(layer3_chain) VNET_DECLARE(int, ipfw_vnet_ready); #define V_ipfw_vnet_ready VNET(ipfw_vnet_ready) VNET_DECLARE(u_int32_t, set_disable); #define V_set_disable VNET(set_disable) VNET_DECLARE(int, autoinc_step); #define V_autoinc_step VNET(autoinc_step) VNET_DECLARE(unsigned int, fw_tables_max); #define V_fw_tables_max VNET(fw_tables_max) VNET_DECLARE(unsigned int, fw_tables_sets); #define V_fw_tables_sets VNET(fw_tables_sets) struct tables_config; #ifdef _KERNEL /* * Here we have the structure representing an ipfw rule. * * It starts with a general area * followed by an array of one or more instructions, which the code * accesses as an array of 32-bit values. * * Given a rule pointer r: * * r->cmd is the start of the first instruction. * ACTION_PTR(r) is the start of the first action (things to do * once a rule matched). */ struct ip_fw { uint16_t act_ofs; /* offset of action in 32-bit units */ uint16_t cmd_len; /* # of 32-bit words in cmd */ uint16_t rulenum; /* rule number */ uint8_t set; /* rule set (0..31) */ uint8_t flags; /* currently unused */ counter_u64_t cntr; /* Pointer to rule counters */ uint32_t timestamp; /* tv_sec of last match */ uint32_t id; /* rule id */ uint32_t cached_id; /* used by jump_fast */ uint32_t cached_pos; /* used by jump_fast */ ipfw_insn cmd[1]; /* storage for commands */ }; #define IPFW_RULE_CNTR_SIZE (2 * sizeof(counter_u64_t)) #endif struct ip_fw_chain { struct ip_fw **map; /* array of rule ptrs to ease lookup */ uint32_t id; /* ruleset id */ int n_rules; /* number of static rules */ LIST_HEAD(nat_list, cfg_nat) nat; /* list of nat entries */ void *tablestate; /* runtime table info */ void *valuestate; /* runtime table value info */ int *idxmap; /* skipto array of rules */ #if defined( __linux__ ) || defined( _WIN32 ) spinlock_t rwmtx; #else struct rmlock rwmtx; #endif int static_len; /* total len of static rules (v0) */ uint32_t gencnt; /* NAT generation count */ struct ip_fw *default_rule; struct tables_config *tblcfg; /* tables module data */ void *ifcfg; /* interface module data */ int *idxmap_back; /* standby skipto array of rules */ #if defined( __linux__ ) || defined( _WIN32 ) spinlock_t uh_lock; #else struct rwlock uh_lock; /* lock for upper half */ #endif }; /* 64-byte structure representing multi-field table value */ struct table_value { uint32_t tag; /* O_TAG/O_TAGGED */ uint32_t pipe; /* O_PIPE/O_QUEUE */ uint16_t divert; /* O_DIVERT/O_TEE */ uint16_t skipto; /* skipto, CALLRET */ uint32_t netgraph; /* O_NETGRAPH/O_NGTEE */ uint32_t fib; /* O_SETFIB */ uint32_t nat; /* O_NAT */ uint32_t nh4; uint8_t dscp; uint8_t spare0[3]; /* -- 32 bytes -- */ struct in6_addr nh6; uint32_t limit; /* O_LIMIT */ uint32_t spare1; uint64_t refcnt; /* Number of references */ }; struct namedobj_instance; struct named_object { TAILQ_ENTRY(named_object) nn_next; /* namehash */ TAILQ_ENTRY(named_object) nv_next; /* valuehash */ char *name; /* object name */ uint8_t type; /* object type */ uint8_t compat; /* Object name is number */ uint16_t kidx; /* object kernel index */ uint16_t uidx; /* userland idx for compat records */ uint32_t set; /* set object belongs to */ uint32_t refcnt; /* number of references */ }; TAILQ_HEAD(namedobjects_head, named_object); struct sockopt; /* used by tcp_var.h */ struct sockopt_data { caddr_t kbuf; /* allocated buffer */ size_t ksize; /* given buffer size */ size_t koff; /* data already used */ size_t kavail; /* number of bytes available */ size_t ktotal; /* total bytes pushed */ struct sockopt *sopt; /* socket data */ caddr_t sopt_val; /* sopt user buffer */ size_t valsize; /* original data size */ }; struct ipfw_ifc; typedef void (ipfw_ifc_cb)(struct ip_fw_chain *ch, void *cbdata, uint16_t ifindex); struct ipfw_iface { struct named_object no; char ifname[64]; int resolved; uint16_t ifindex; uint16_t spare; uint64_t gencnt; TAILQ_HEAD(, ipfw_ifc) consumers; }; struct ipfw_ifc { TAILQ_ENTRY(ipfw_ifc) next; struct ipfw_iface *iface; ipfw_ifc_cb *cb; void *cbdata; }; /* Macro for working with various counters */ #define IPFW_INC_RULE_COUNTER(_cntr, _bytes) do { \ counter_u64_add((_cntr)->cntr, 1); \ counter_u64_add((_cntr)->cntr + 1, _bytes); \ if ((_cntr)->timestamp != time_uptime) \ (_cntr)->timestamp = time_uptime; \ } while (0) #define IPFW_INC_DYN_COUNTER(_cntr, _bytes) do { \ (_cntr)->pcnt++; \ (_cntr)->bcnt += _bytes; \ } while (0) #define IPFW_ZERO_RULE_COUNTER(_cntr) do { \ counter_u64_zero((_cntr)->cntr); \ counter_u64_zero((_cntr)->cntr + 1); \ (_cntr)->timestamp = 0; \ } while (0) #define IPFW_ZERO_DYN_COUNTER(_cntr) do { \ (_cntr)->pcnt = 0; \ (_cntr)->bcnt = 0; \ } while (0) #define TARG_VAL(ch, k, f) ((struct table_value *)((ch)->valuestate))[k].f #define IP_FW_ARG_TABLEARG(ch, a, f) \ (((a) == IP_FW_TARG) ? TARG_VAL(ch, tablearg, f) : (a)) /* * The lock is heavily used by ip_fw2.c (the main file) and ip_fw_nat.c * so the variable and the macros must be here. */ #if defined( __linux__ ) || defined( _WIN32 ) #define IPFW_LOCK_INIT(_chain) do { \ rw_init(&(_chain)->rwmtx, "IPFW static rules"); \ rw_init(&(_chain)->uh_lock, "IPFW UH lock"); \ } while (0) #define IPFW_LOCK_DESTROY(_chain) do { \ rw_destroy(&(_chain)->rwmtx); \ rw_destroy(&(_chain)->uh_lock); \ } while (0) #define IPFW_RLOCK_ASSERT(_chain) rw_assert(&(_chain)->rwmtx, RA_RLOCKED) #define IPFW_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->rwmtx, RA_WLOCKED) #define IPFW_RLOCK_TRACKER #define IPFW_RLOCK(p) rw_rlock(&(p)->rwmtx) #define IPFW_RUNLOCK(p) rw_runlock(&(p)->rwmtx) #define IPFW_WLOCK(p) rw_wlock(&(p)->rwmtx) #define IPFW_WUNLOCK(p) rw_wunlock(&(p)->rwmtx) #define IPFW_PF_RLOCK(p) IPFW_RLOCK(p) #define IPFW_PF_RUNLOCK(p) IPFW_RUNLOCK(p) #else /* FreeBSD */ #define IPFW_LOCK_INIT(_chain) do { \ rm_init(&(_chain)->rwmtx, "IPFW static rules"); \ rw_init(&(_chain)->uh_lock, "IPFW UH lock"); \ } while (0) #define IPFW_LOCK_DESTROY(_chain) do { \ rm_destroy(&(_chain)->rwmtx); \ rw_destroy(&(_chain)->uh_lock); \ } while (0) #define IPFW_RLOCK_ASSERT(_chain) rm_assert(&(_chain)->rwmtx, RA_RLOCKED) #define IPFW_WLOCK_ASSERT(_chain) rm_assert(&(_chain)->rwmtx, RA_WLOCKED) #define IPFW_RLOCK_TRACKER struct rm_priotracker _tracker #define IPFW_RLOCK(p) rm_rlock(&(p)->rwmtx, &_tracker) #define IPFW_RUNLOCK(p) rm_runlock(&(p)->rwmtx, &_tracker) #define IPFW_WLOCK(p) rm_wlock(&(p)->rwmtx) #define IPFW_WUNLOCK(p) rm_wunlock(&(p)->rwmtx) #define IPFW_PF_RLOCK(p) IPFW_RLOCK(p) #define IPFW_PF_RUNLOCK(p) IPFW_RUNLOCK(p) #endif #define IPFW_UH_RLOCK_ASSERT(_chain) rw_assert(&(_chain)->uh_lock, RA_RLOCKED) #define IPFW_UH_WLOCK_ASSERT(_chain) rw_assert(&(_chain)->uh_lock, RA_WLOCKED) +#define IPFW_UH_UNLOCK_ASSERT(_chain) rw_assert(&(_chain)->uh_lock, RA_UNLOCKED) #define IPFW_UH_RLOCK(p) rw_rlock(&(p)->uh_lock) #define IPFW_UH_RUNLOCK(p) rw_runlock(&(p)->uh_lock) #define IPFW_UH_WLOCK(p) rw_wlock(&(p)->uh_lock) #define IPFW_UH_WUNLOCK(p) rw_wunlock(&(p)->uh_lock) struct obj_idx { uint16_t uidx; /* internal index supplied by userland */ uint16_t kidx; /* kernel object index */ uint16_t off; /* tlv offset from rule end in 4-byte words */ uint8_t spare; uint8_t type; /* object type within its category */ }; struct rule_check_info { uint16_t flags; /* rule-specific check flags */ uint16_t table_opcodes; /* count of opcodes referencing table */ uint16_t urule_numoff; /* offset of rulenum in bytes */ uint8_t version; /* rule version */ uint8_t spare; ipfw_obj_ctlv *ctlv; /* name TLV containter */ struct ip_fw *krule; /* resulting rule pointer */ caddr_t urule; /* original rule pointer */ struct obj_idx obuf[8]; /* table references storage */ }; /* Legacy interface support */ /* * FreeBSD 8 export rule format */ struct ip_fw_rule0 { struct ip_fw *x_next; /* linked list of rules */ struct ip_fw *next_rule; /* ptr to next [skipto] rule */ /* 'next_rule' is used to pass up 'set_disable' status */ uint16_t act_ofs; /* offset of action in 32-bit units */ uint16_t cmd_len; /* # of 32-bit words in cmd */ uint16_t rulenum; /* rule number */ uint8_t set; /* rule set (0..31) */ uint8_t _pad; /* padding */ uint32_t id; /* rule id */ /* These fields are present in all rules. */ uint64_t pcnt; /* Packet counter */ uint64_t bcnt; /* Byte counter */ uint32_t timestamp; /* tv_sec of last match */ ipfw_insn cmd[1]; /* storage for commands */ }; struct ip_fw_bcounter0 { uint64_t pcnt; /* Packet counter */ uint64_t bcnt; /* Byte counter */ uint32_t timestamp; /* tv_sec of last match */ }; /* Kernel rule length */ /* * RULE _K_ SIZE _V_ -> * get kernel size from userland rool version _V_. * RULE _U_ SIZE _V_ -> * get user size version _V_ from kernel rule * RULESIZE _V_ -> * get user size rule length */ /* FreeBSD8 <> current kernel format */ #define RULEUSIZE0(r) (sizeof(struct ip_fw_rule0) + (r)->cmd_len * 4 - 4) #define RULEKSIZE0(r) roundup2((sizeof(struct ip_fw) + (r)->cmd_len*4 - 4), 8) /* FreeBSD11 <> current kernel format */ #define RULEUSIZE1(r) (roundup2(sizeof(struct ip_fw_rule) + \ (r)->cmd_len * 4 - 4, 8)) #define RULEKSIZE1(r) roundup2((sizeof(struct ip_fw) + (r)->cmd_len*4 - 4), 8) /* In ip_fw_iface.c */ int ipfw_iface_init(void); void ipfw_iface_destroy(void); void vnet_ipfw_iface_destroy(struct ip_fw_chain *ch); int ipfw_iface_ref(struct ip_fw_chain *ch, char *name, struct ipfw_ifc *ic); void ipfw_iface_unref(struct ip_fw_chain *ch, struct ipfw_ifc *ic); void ipfw_iface_add_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic); void ipfw_iface_del_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic); /* In ip_fw_sockopt.c */ void ipfw_init_skipto_cache(struct ip_fw_chain *chain); void ipfw_destroy_skipto_cache(struct ip_fw_chain *chain); int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id); int ipfw_ctl3(struct sockopt *sopt); int ipfw_chk(struct ip_fw_args *args); void ipfw_reap_add(struct ip_fw_chain *chain, struct ip_fw **head, struct ip_fw *rule); void ipfw_reap_rules(struct ip_fw *head); void ipfw_init_counters(void); void ipfw_destroy_counters(void); struct ip_fw *ipfw_alloc_rule(struct ip_fw_chain *chain, size_t rulesize); int ipfw_match_range(struct ip_fw *rule, ipfw_range_tlv *rt); typedef int (sopt_handler_f)(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd); struct ipfw_sopt_handler { uint16_t opcode; uint8_t version; uint8_t dir; sopt_handler_f *handler; uint64_t refcnt; }; #define HDIR_SET 0x01 /* Handler is used to set some data */ #define HDIR_GET 0x02 /* Handler is used to retrieve data */ #define HDIR_BOTH HDIR_GET|HDIR_SET void ipfw_init_sopt_handler(void); void ipfw_destroy_sopt_handler(void); void ipfw_add_sopt_handler(struct ipfw_sopt_handler *sh, size_t count); int ipfw_del_sopt_handler(struct ipfw_sopt_handler *sh, size_t count); caddr_t ipfw_get_sopt_space(struct sockopt_data *sd, size_t needed); caddr_t ipfw_get_sopt_header(struct sockopt_data *sd, size_t needed); #define IPFW_ADD_SOPT_HANDLER(f, c) do { \ if ((f) != 0) \ ipfw_add_sopt_handler(c, \ sizeof(c) / sizeof(c[0])); \ } while(0) #define IPFW_DEL_SOPT_HANDLER(l, c) do { \ if ((l) != 0) \ ipfw_del_sopt_handler(c, \ sizeof(c) / sizeof(c[0])); \ } while(0) typedef void (objhash_cb_t)(struct namedobj_instance *ni, struct named_object *, void *arg); typedef uint32_t (objhash_hash_f)(struct namedobj_instance *ni, void *key, uint32_t kopt); typedef int (objhash_cmp_f)(struct named_object *no, void *key, uint32_t kopt); struct namedobj_instance *ipfw_objhash_create(uint32_t items); void ipfw_objhash_destroy(struct namedobj_instance *); void ipfw_objhash_bitmap_alloc(uint32_t items, void **idx, int *pblocks); void ipfw_objhash_bitmap_merge(struct namedobj_instance *ni, void **idx, int *blocks); void ipfw_objhash_bitmap_swap(struct namedobj_instance *ni, void **idx, int *blocks); void ipfw_objhash_bitmap_free(void *idx, int blocks); void ipfw_objhash_set_hashf(struct namedobj_instance *ni, objhash_hash_f *f); struct named_object *ipfw_objhash_lookup_name(struct namedobj_instance *ni, uint32_t set, char *name); struct named_object *ipfw_objhash_lookup_kidx(struct namedobj_instance *ni, uint16_t idx); int ipfw_objhash_same_name(struct namedobj_instance *ni, struct named_object *a, struct named_object *b); void ipfw_objhash_add(struct namedobj_instance *ni, struct named_object *no); void ipfw_objhash_del(struct namedobj_instance *ni, struct named_object *no); uint32_t ipfw_objhash_count(struct namedobj_instance *ni); void ipfw_objhash_foreach(struct namedobj_instance *ni, objhash_cb_t *f, void *arg); int ipfw_objhash_free_idx(struct namedobj_instance *ni, uint16_t idx); int ipfw_objhash_alloc_idx(void *n, uint16_t *pidx); void ipfw_objhash_set_funcs(struct namedobj_instance *ni, objhash_hash_f *hash_f, objhash_cmp_f *cmp_f); /* In ip_fw_table.c */ struct table_info; typedef int (table_lookup_t)(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, uint32_t *val); int ipfw_lookup_table_extended(struct ip_fw_chain *ch, uint16_t tbl, uint16_t plen, void *paddr, uint32_t *val); int ipfw_init_tables(struct ip_fw_chain *ch, int first); int ipfw_resize_tables(struct ip_fw_chain *ch, unsigned int ntables); int ipfw_switch_tables_namespace(struct ip_fw_chain *ch, unsigned int nsets); void ipfw_destroy_tables(struct ip_fw_chain *ch, int last); /* In ip_fw_nat.c -- XXX to be moved to ip_var.h */ extern struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int); typedef int ipfw_nat_t(struct ip_fw_args *, struct cfg_nat *, struct mbuf *); typedef int ipfw_nat_cfg_t(struct sockopt *); VNET_DECLARE(int, ipfw_nat_ready); #define V_ipfw_nat_ready VNET(ipfw_nat_ready) #define IPFW_NAT_LOADED (V_ipfw_nat_ready) extern ipfw_nat_t *ipfw_nat_ptr; extern ipfw_nat_cfg_t *ipfw_nat_cfg_ptr; extern ipfw_nat_cfg_t *ipfw_nat_del_ptr; extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr; extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr; #endif /* _KERNEL */ #endif /* _IPFW2_PRIVATE_H */ Index: projects/building-blocks/sys/netpfil/ipfw/ip_fw_table.c =================================================================== --- projects/building-blocks/sys/netpfil/ipfw/ip_fw_table.c (revision 278311) +++ projects/building-blocks/sys/netpfil/ipfw/ip_fw_table.c (revision 278312) @@ -1,3673 +1,3686 @@ /*- * Copyright (c) 2004 Ruslan Ermilov and Vsevolod Lobko. * Copyright (c) 2014 Yandex LLC * Copyright (c) 2014 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Lookup table support for ipfw. * * This file contains handlers for all generic tables' operations: * add/del/flush entries, list/dump tables etc.. * * Table data modification is protected by both UH and runtime lock * while reading configuration/data is protected by UH lock. * * Lookup algorithms for all table types are located in ip_fw_table_algo.c */ #include "opt_ipfw.h" #include #include #include #include #include #include #include #include #include #include #include /* ip_fw.h requires IFNAMSIZ */ #include #include /* struct ipfw_rule_ref */ #include #include #include /* * Table has the following `type` concepts: * * `no.type` represents lookup key type (addr, ifp, uid, etc..) * vmask represents bitmask of table values which are present at the moment. * Special IPFW_VTYPE_LEGACY ( (uint32_t)-1 ) represents old * single-value-for-all approach. */ struct table_config { struct named_object no; uint8_t tflags; /* type flags */ uint8_t locked; /* 1 if locked from changes */ uint8_t linked; /* 1 if already linked */ uint8_t ochanged; /* used by set swapping */ uint8_t vshared; /* 1 if using shared value array */ uint8_t spare[3]; uint32_t count; /* Number of records */ uint32_t limit; /* Max number of records */ uint32_t vmask; /* bitmask with supported values */ uint32_t ocount; /* used by set swapping */ uint64_t gencnt; /* generation count */ char tablename[64]; /* table name */ struct table_algo *ta; /* Callbacks for given algo */ void *astate; /* algorithm state */ struct table_info ti_copy; /* data to put to table_info */ struct namedobj_instance *vi; }; static struct table_config *find_table(struct namedobj_instance *ni, struct tid_info *ti); static struct table_config *alloc_table_config(struct ip_fw_chain *ch, struct tid_info *ti, struct table_algo *ta, char *adata, uint8_t tflags); static void free_table_config(struct namedobj_instance *ni, struct table_config *tc); static int create_table_internal(struct ip_fw_chain *ch, struct tid_info *ti, char *aname, ipfw_xtable_info *i, uint16_t *pkidx, int ref); static void link_table(struct ip_fw_chain *ch, struct table_config *tc); static void unlink_table(struct ip_fw_chain *ch, struct table_config *tc); static int find_ref_table(struct ip_fw_chain *ch, struct tid_info *ti, struct tentry_info *tei, uint32_t count, int op, struct table_config **ptc); #define OP_ADD 1 #define OP_DEL 0 static int export_tables(struct ip_fw_chain *ch, ipfw_obj_lheader *olh, struct sockopt_data *sd); static void export_table_info(struct ip_fw_chain *ch, struct table_config *tc, ipfw_xtable_info *i); static int dump_table_tentry(void *e, void *arg); static int dump_table_xentry(void *e, void *arg); static int swap_tables(struct ip_fw_chain *ch, struct tid_info *a, struct tid_info *b); static int check_table_space(struct ip_fw_chain *ch, struct tableop_state *ts, struct table_config *tc, struct table_info *ti, uint32_t count); static int destroy_table(struct ip_fw_chain *ch, struct tid_info *ti); static struct table_algo *find_table_algo(struct tables_config *tableconf, struct tid_info *ti, char *name); static void objheader_to_ti(struct _ipfw_obj_header *oh, struct tid_info *ti); static void ntlv_to_ti(struct _ipfw_obj_ntlv *ntlv, struct tid_info *ti); static int classify_table_opcode(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype); #define CHAIN_TO_NI(chain) (CHAIN_TO_TCFG(chain)->namehash) #define KIDX_TO_TI(ch, k) (&(((struct table_info *)(ch)->tablestate)[k])) #define TA_BUF_SZ 128 /* On-stack buffer for add/delete state */ void rollback_toperation_state(struct ip_fw_chain *ch, void *object) { struct tables_config *tcfg; struct op_state *os; tcfg = CHAIN_TO_TCFG(ch); TAILQ_FOREACH(os, &tcfg->state_list, next) os->func(object, os); } void add_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts) { struct tables_config *tcfg; tcfg = CHAIN_TO_TCFG(ch); TAILQ_INSERT_HEAD(&tcfg->state_list, &ts->opstate, next); } void del_toperation_state(struct ip_fw_chain *ch, struct tableop_state *ts) { struct tables_config *tcfg; tcfg = CHAIN_TO_TCFG(ch); TAILQ_REMOVE(&tcfg->state_list, &ts->opstate, next); } void tc_ref(struct table_config *tc) { tc->no.refcnt++; } void tc_unref(struct table_config *tc) { tc->no.refcnt--; } static struct table_value * get_table_value(struct ip_fw_chain *ch, struct table_config *tc, uint32_t kidx) { struct table_value *pval; pval = (struct table_value *)ch->valuestate; return (&pval[kidx]); } /* * Checks if we're able to insert/update entry @tei into table * w.r.t @tc limits. * May alter @tei to indicate insertion error / insert * options. * * Returns 0 if operation can be performed/ */ static int check_table_limit(struct table_config *tc, struct tentry_info *tei) { if (tc->limit == 0 || tc->count < tc->limit) return (0); if ((tei->flags & TEI_FLAGS_UPDATE) == 0) { /* Notify userland on error cause */ tei->flags |= TEI_FLAGS_LIMIT; return (EFBIG); } /* * We have UPDATE flag set. * Permit updating record (if found), * but restrict adding new one since we've * already hit the limit. */ tei->flags |= TEI_FLAGS_DONTADD; return (0); } /* * Convert algorithm callback return code into * one of pre-defined states known by userland. */ static void store_tei_result(struct tentry_info *tei, int op, int error, uint32_t num) { int flag; flag = 0; switch (error) { case 0: if (op == OP_ADD && num != 0) flag = TEI_FLAGS_ADDED; if (op == OP_DEL) flag = TEI_FLAGS_DELETED; break; case ENOENT: flag = TEI_FLAGS_NOTFOUND; break; case EEXIST: flag = TEI_FLAGS_EXISTS; break; default: flag = TEI_FLAGS_ERROR; } tei->flags |= flag; } /* * Creates and references table with default parameters. * Saves table config, algo and allocated kidx info @ptc, @pta and * @pkidx if non-zero. * Used for table auto-creation to support old binaries. * * Returns 0 on success. */ static int create_table_compat(struct ip_fw_chain *ch, struct tid_info *ti, uint16_t *pkidx) { ipfw_xtable_info xi; int error; memset(&xi, 0, sizeof(xi)); /* Set default value mask for legacy clients */ xi.vmask = IPFW_VTYPE_LEGACY; error = create_table_internal(ch, ti, NULL, &xi, pkidx, 1); if (error != 0) return (error); return (0); } /* * Find and reference existing table optionally * creating new one. * * Saves found table config into @ptc. * Note function may drop/acquire UH_WLOCK. * Returns 0 if table was found/created and referenced * or non-zero return code. */ static int find_ref_table(struct ip_fw_chain *ch, struct tid_info *ti, struct tentry_info *tei, uint32_t count, int op, struct table_config **ptc) { struct namedobj_instance *ni; struct table_config *tc; uint16_t kidx; int error; IPFW_UH_WLOCK_ASSERT(ch); ni = CHAIN_TO_NI(ch); tc = NULL; if ((tc = find_table(ni, ti)) != NULL) { /* check table type */ if (tc->no.type != ti->type) return (EINVAL); if (tc->locked != 0) return (EACCES); /* Try to exit early on limit hit */ if (op == OP_ADD && count == 1 && check_table_limit(tc, tei) != 0) return (EFBIG); /* Reference and return */ tc->no.refcnt++; *ptc = tc; return (0); } if (op == OP_DEL) return (ESRCH); /* Compability mode: create new table for old clients */ if ((tei->flags & TEI_FLAGS_COMPAT) == 0) return (ESRCH); IPFW_UH_WUNLOCK(ch); error = create_table_compat(ch, ti, &kidx); IPFW_UH_WLOCK(ch); if (error != 0) return (error); tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, kidx); KASSERT(tc != NULL, ("create_table_compat returned bad idx %d", kidx)); /* OK, now we've got referenced table. */ *ptc = tc; return (0); } /* * Rolls back already @added to @tc entries using state array @ta_buf_m. * Assume the following layout: * 1) ADD state (ta_buf_m[0] ... t_buf_m[added - 1]) for handling update cases * 2) DEL state (ta_buf_m[count[ ... t_buf_m[count + added - 1]) * for storing deleted state */ static void rollback_added_entries(struct ip_fw_chain *ch, struct table_config *tc, struct table_info *tinfo, struct tentry_info *tei, caddr_t ta_buf_m, uint32_t count, uint32_t added) { struct table_algo *ta; struct tentry_info *ptei; caddr_t v, vv; size_t ta_buf_sz; int error, i; uint32_t num; IPFW_UH_WLOCK_ASSERT(ch); ta = tc->ta; ta_buf_sz = ta->ta_buf_size; v = ta_buf_m; vv = v + count * ta_buf_sz; for (i = 0; i < added; i++, v += ta_buf_sz, vv += ta_buf_sz) { ptei = &tei[i]; if ((ptei->flags & TEI_FLAGS_UPDATED) != 0) { /* * We have old value stored by previous * call in @ptei->value. Do add once again * to restore it. */ error = ta->add(tc->astate, tinfo, ptei, v, &num); KASSERT(error == 0, ("rollback UPDATE fail")); KASSERT(num == 0, ("rollback UPDATE fail2")); continue; } error = ta->prepare_del(ch, ptei, vv); KASSERT(error == 0, ("pre-rollback INSERT failed")); error = ta->del(tc->astate, tinfo, ptei, vv, &num); KASSERT(error == 0, ("rollback INSERT failed")); tc->count -= num; } } /* * Prepares add/del state for all @count entries in @tei. * Uses either stack buffer (@ta_buf) or allocates a new one. * Stores pointer to allocated buffer back to @ta_buf. * * Returns 0 on success. */ static int prepare_batch_buffer(struct ip_fw_chain *ch, struct table_algo *ta, struct tentry_info *tei, uint32_t count, int op, caddr_t *ta_buf) { caddr_t ta_buf_m, v; size_t ta_buf_sz, sz; struct tentry_info *ptei; int error, i; error = 0; ta_buf_sz = ta->ta_buf_size; if (count == 1) { /* Sigle add/delete, use on-stack buffer */ memset(*ta_buf, 0, TA_BUF_SZ); ta_buf_m = *ta_buf; } else { /* * Multiple adds/deletes, allocate larger buffer * * Note we need 2xcount buffer for add case: * we have hold both ADD state * and DELETE state (this may be needed * if we need to rollback all changes) */ sz = count * ta_buf_sz; ta_buf_m = malloc((op == OP_ADD) ? sz * 2 : sz, M_TEMP, M_WAITOK | M_ZERO); } v = ta_buf_m; for (i = 0; i < count; i++, v += ta_buf_sz) { ptei = &tei[i]; error = (op == OP_ADD) ? ta->prepare_add(ch, ptei, v) : ta->prepare_del(ch, ptei, v); /* * Some syntax error (incorrect mask, or address, or * anything). Return error regardless of atomicity * settings. */ if (error != 0) break; } *ta_buf = ta_buf_m; return (error); } /* * Flushes allocated state for each @count entries in @tei. * Frees @ta_buf_m if differs from stack buffer @ta_buf. */ static void flush_batch_buffer(struct ip_fw_chain *ch, struct table_algo *ta, struct tentry_info *tei, uint32_t count, int rollback, caddr_t ta_buf_m, caddr_t ta_buf) { caddr_t v; struct tentry_info *ptei; size_t ta_buf_sz; int i; ta_buf_sz = ta->ta_buf_size; /* Run cleaning callback anyway */ v = ta_buf_m; for (i = 0; i < count; i++, v += ta_buf_sz) { ptei = &tei[i]; ta->flush_entry(ch, ptei, v); if (ptei->ptv != NULL) { free(ptei->ptv, M_IPFW); ptei->ptv = NULL; } } /* Clean up "deleted" state in case of rollback */ if (rollback != 0) { v = ta_buf_m + count * ta_buf_sz; for (i = 0; i < count; i++, v += ta_buf_sz) ta->flush_entry(ch, &tei[i], v); } if (ta_buf_m != ta_buf) free(ta_buf_m, M_TEMP); } static void rollback_add_entry(void *object, struct op_state *_state) { struct ip_fw_chain *ch; struct tableop_state *ts; ts = (struct tableop_state *)_state; if (ts->tc != object && ts->ch != object) return; ch = ts->ch; IPFW_UH_WLOCK_ASSERT(ch); /* Call specifid unlockers */ rollback_table_values(ts); /* Indicate we've called */ ts->modified = 1; } /* * Adds/updates one or more entries in table @ti. * * Function may drop/reacquire UH wlock multiple times due to * items alloc, algorithm callbacks (check_space), value linkage * (new values, value storage realloc), etc.. * Other processes like other adds (which may involve storage resize), * table swaps (which changes table data and may change algo type), * table modify (which may change value mask) may be executed * simultaneously so we need to deal with it. * * The following approach was implemented: * we have per-chain linked list, protected with UH lock. * add_table_entry prepares special on-stack structure wthich is passed * to its descendants. Users add this structure to this list before unlock. * After performing needed operations and acquiring UH lock back, each user * checks if structure has changed. If true, it rolls local state back and * returns without error to the caller. * add_table_entry() on its own checks if structure has changed and restarts * its operation from the beginning (goto restart). * * Functions which are modifying fields of interest (currently * resize_shared_value_storage() and swap_tables() ) * traverses given list while holding UH lock immediately before * performing their operations calling function provided be list entry * ( currently rollback_add_entry ) which performs rollback for all necessary * state and sets appropriate values in structure indicating rollback * has happened. * * Algo interaction: * Function references @ti first to ensure table won't * disappear or change its type. * After that, prepare_add callback is called for each @tei entry. * Next, we try to add each entry under UH+WHLOCK * using add() callback. * Finally, we free all state by calling flush_entry callback * for each @tei. * * Returns 0 on success. */ int add_table_entry(struct ip_fw_chain *ch, struct tid_info *ti, struct tentry_info *tei, uint8_t flags, uint32_t count) { struct table_config *tc; struct table_algo *ta; uint16_t kidx; int error, first_error, i, rollback; uint32_t num, numadd; struct tentry_info *ptei; struct tableop_state ts; char ta_buf[TA_BUF_SZ]; caddr_t ta_buf_m, v; memset(&ts, 0, sizeof(ts)); ta = NULL; IPFW_UH_WLOCK(ch); /* * Find and reference existing table. */ restart: if (ts.modified != 0) { IPFW_UH_WUNLOCK(ch); flush_batch_buffer(ch, ta, tei, count, rollback, ta_buf_m, ta_buf); memset(&ts, 0, sizeof(ts)); ta = NULL; IPFW_UH_WLOCK(ch); } error = find_ref_table(ch, ti, tei, count, OP_ADD, &tc); if (error != 0) { IPFW_UH_WUNLOCK(ch); return (error); } ta = tc->ta; /* Fill in tablestate */ ts.ch = ch; ts.opstate.func = rollback_add_entry; ts.tc = tc; ts.vshared = tc->vshared; ts.vmask = tc->vmask; ts.ta = ta; ts.tei = tei; ts.count = count; rollback = 0; add_toperation_state(ch, &ts); IPFW_UH_WUNLOCK(ch); /* Allocate memory and prepare record(s) */ /* Pass stack buffer by default */ ta_buf_m = ta_buf; error = prepare_batch_buffer(ch, ta, tei, count, OP_ADD, &ta_buf_m); if (error != 0) goto cleanup; IPFW_UH_WLOCK(ch); /* Drop reference we've used in first search */ tc->no.refcnt--; /* * Check if table swap has happened. * (so table algo might be changed). * Restart operation to achieve consistent behavior. */ del_toperation_state(ch, &ts); if (ts.modified != 0) goto restart; /* * Link all values values to shared/per-table value array. * * May release/reacquire UH_WLOCK. */ error = ipfw_link_table_values(ch, &ts); if (error != 0) goto cleanup; if (ts.modified != 0) goto restart; /* * Ensure we are able to add all entries without additional * memory allocations. May release/reacquire UH_WLOCK. */ kidx = tc->no.kidx; error = check_table_space(ch, &ts, tc, KIDX_TO_TI(ch, kidx), count); if (error != 0) goto cleanup; if (ts.modified != 0) goto restart; /* We've got valid table in @tc. Let's try to add data */ kidx = tc->no.kidx; ta = tc->ta; numadd = 0; first_error = 0; IPFW_WLOCK(ch); v = ta_buf_m; for (i = 0; i < count; i++, v += ta->ta_buf_size) { ptei = &tei[i]; num = 0; /* check limit before adding */ if ((error = check_table_limit(tc, ptei)) == 0) { error = ta->add(tc->astate, KIDX_TO_TI(ch, kidx), ptei, v, &num); /* Set status flag to inform userland */ store_tei_result(ptei, OP_ADD, error, num); } if (error == 0) { /* Update number of records to ease limit checking */ tc->count += num; numadd += num; continue; } if (first_error == 0) first_error = error; /* * Some error have happened. Check our atomicity * settings: continue if atomicity is not required, * rollback changes otherwise. */ if ((flags & IPFW_CTF_ATOMIC) == 0) continue; rollback_added_entries(ch, tc, KIDX_TO_TI(ch, kidx), tei, ta_buf_m, count, i); rollback = 1; break; } IPFW_WUNLOCK(ch); ipfw_garbage_table_values(ch, tc, tei, count, rollback); /* Permit post-add algorithm grow/rehash. */ if (numadd != 0) check_table_space(ch, NULL, tc, KIDX_TO_TI(ch, kidx), 0); /* Return first error to user, if any */ error = first_error; cleanup: IPFW_UH_WUNLOCK(ch); flush_batch_buffer(ch, ta, tei, count, rollback, ta_buf_m, ta_buf); return (error); } /* * Deletes one or more entries in table @ti. * * Returns 0 on success. */ int del_table_entry(struct ip_fw_chain *ch, struct tid_info *ti, struct tentry_info *tei, uint8_t flags, uint32_t count) { struct table_config *tc; struct table_algo *ta; struct tentry_info *ptei; uint16_t kidx; int error, first_error, i; uint32_t num, numdel; char ta_buf[TA_BUF_SZ]; caddr_t ta_buf_m, v; /* * Find and reference existing table. */ IPFW_UH_WLOCK(ch); error = find_ref_table(ch, ti, tei, count, OP_DEL, &tc); if (error != 0) { IPFW_UH_WUNLOCK(ch); return (error); } ta = tc->ta; IPFW_UH_WUNLOCK(ch); /* Allocate memory and prepare record(s) */ /* Pass stack buffer by default */ ta_buf_m = ta_buf; error = prepare_batch_buffer(ch, ta, tei, count, OP_DEL, &ta_buf_m); if (error != 0) goto cleanup; IPFW_UH_WLOCK(ch); /* Drop reference we've used in first search */ tc->no.refcnt--; /* * Check if table algo is still the same. * (changed ta may be the result of table swap). */ if (ta != tc->ta) { IPFW_UH_WUNLOCK(ch); error = EINVAL; goto cleanup; } kidx = tc->no.kidx; numdel = 0; first_error = 0; IPFW_WLOCK(ch); v = ta_buf_m; for (i = 0; i < count; i++, v += ta->ta_buf_size) { ptei = &tei[i]; num = 0; error = ta->del(tc->astate, KIDX_TO_TI(ch, kidx), ptei, v, &num); /* Save state for userland */ store_tei_result(ptei, OP_DEL, error, num); if (error != 0 && first_error == 0) first_error = error; tc->count -= num; numdel += num; } IPFW_WUNLOCK(ch); /* Unlink non-used values */ ipfw_garbage_table_values(ch, tc, tei, count, 0); if (numdel != 0) { /* Run post-del hook to permit shrinking */ check_table_space(ch, NULL, tc, KIDX_TO_TI(ch, kidx), 0); } IPFW_UH_WUNLOCK(ch); /* Return first error to user, if any */ error = first_error; cleanup: flush_batch_buffer(ch, ta, tei, count, 0, ta_buf_m, ta_buf); return (error); } /* * Ensure that table @tc has enough space to add @count entries without * need for reallocation. * * Callbacks order: * 0) need_modify() (UH_WLOCK) - checks if @count items can be added w/o resize. * * 1) alloc_modify (no locks, M_WAITOK) - alloc new state based on @pflags. * 2) prepare_modifyt (UH_WLOCK) - copy old data into new storage * 3) modify (UH_WLOCK + WLOCK) - switch pointers * 4) flush_modify (UH_WLOCK) - free state, if needed * * Returns 0 on success. */ static int check_table_space(struct ip_fw_chain *ch, struct tableop_state *ts, struct table_config *tc, struct table_info *ti, uint32_t count) { struct table_algo *ta; uint64_t pflags; char ta_buf[TA_BUF_SZ]; int error; IPFW_UH_WLOCK_ASSERT(ch); error = 0; ta = tc->ta; if (ta->need_modify == NULL) return (0); /* Acquire reference not to loose @tc between locks/unlocks */ tc->no.refcnt++; /* * TODO: think about avoiding race between large add/large delete * operation on algorithm which implements shrinking along with * growing. */ while (true) { pflags = 0; if (ta->need_modify(tc->astate, ti, count, &pflags) == 0) { error = 0; break; } /* We have to shrink/grow table */ if (ts != NULL) add_toperation_state(ch, ts); IPFW_UH_WUNLOCK(ch); memset(&ta_buf, 0, sizeof(ta_buf)); error = ta->prepare_mod(ta_buf, &pflags); IPFW_UH_WLOCK(ch); if (ts != NULL) del_toperation_state(ch, ts); if (error != 0) break; if (ts != NULL && ts->modified != 0) { /* * Swap operation has happened * so we're currently operating on other * table data. Stop doing this. */ ta->flush_mod(ta_buf); break; } /* Check if we still need to alter table */ ti = KIDX_TO_TI(ch, tc->no.kidx); if (ta->need_modify(tc->astate, ti, count, &pflags) == 0) { IPFW_UH_WUNLOCK(ch); /* * Other thread has already performed resize. * Flush our state and return. */ ta->flush_mod(ta_buf); break; } error = ta->fill_mod(tc->astate, ti, ta_buf, &pflags); if (error == 0) { /* Do actual modification */ IPFW_WLOCK(ch); ta->modify(tc->astate, ti, ta_buf, pflags); IPFW_WUNLOCK(ch); } /* Anyway, flush data and retry */ ta->flush_mod(ta_buf); } tc->no.refcnt--; return (error); } /* * Adds or deletes record in table. * Data layout (v0): * Request: [ ip_fw3_opheader ipfw_table_xentry ] * * Returns 0 on success */ static int manage_table_ent_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_table_xentry *xent; struct tentry_info tei; struct tid_info ti; struct table_value v; int error, hdrlen, read; hdrlen = offsetof(ipfw_table_xentry, k); /* Check minimum header size */ if (sd->valsize < (sizeof(*op3) + hdrlen)) return (EINVAL); read = sizeof(ip_fw3_opheader); /* Check if xentry len field is valid */ xent = (ipfw_table_xentry *)(op3 + 1); if (xent->len < hdrlen || xent->len + read > sd->valsize) return (EINVAL); memset(&tei, 0, sizeof(tei)); tei.paddr = &xent->k; tei.masklen = xent->masklen; ipfw_import_table_value_legacy(xent->value, &v); tei.pvalue = &v; /* Old requests compability */ tei.flags = TEI_FLAGS_COMPAT; if (xent->type == IPFW_TABLE_ADDR) { if (xent->len - hdrlen == sizeof(in_addr_t)) tei.subtype = AF_INET; else tei.subtype = AF_INET6; } memset(&ti, 0, sizeof(ti)); ti.uidx = xent->tbl; ti.type = xent->type; error = (op3->opcode == IP_FW_TABLE_XADD) ? add_table_entry(ch, &ti, &tei, 0, 1) : del_table_entry(ch, &ti, &tei, 0, 1); return (error); } /* * Adds or deletes record in table. * Data layout (v1)(current): * Request: [ ipfw_obj_header * ipfw_obj_ctlv(IPFW_TLV_TBLENT_LIST) [ ipfw_obj_tentry x N ] * ] * * Returns 0 on success */ static int manage_table_ent_v1(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_tentry *tent, *ptent; ipfw_obj_ctlv *ctlv; ipfw_obj_header *oh; struct tentry_info *ptei, tei, *tei_buf; struct tid_info ti; int error, i, kidx, read; /* Check minimum header size */ if (sd->valsize < (sizeof(*oh) + sizeof(*ctlv))) return (EINVAL); /* Check if passed data is too long */ if (sd->valsize != sd->kavail) return (EINVAL); oh = (ipfw_obj_header *)sd->kbuf; /* Basic length checks for TLVs */ if (oh->ntlv.head.length != sizeof(oh->ntlv)) return (EINVAL); read = sizeof(*oh); ctlv = (ipfw_obj_ctlv *)(oh + 1); if (ctlv->head.length + read != sd->valsize) return (EINVAL); read += sizeof(*ctlv); tent = (ipfw_obj_tentry *)(ctlv + 1); if (ctlv->count * sizeof(*tent) + read != sd->valsize) return (EINVAL); if (ctlv->count == 0) return (0); /* * Mark entire buffer as "read". * This instructs sopt api write it back * after function return. */ ipfw_get_sopt_header(sd, sd->valsize); /* Perform basic checks for each entry */ ptent = tent; kidx = tent->idx; for (i = 0; i < ctlv->count; i++, ptent++) { if (ptent->head.length != sizeof(*ptent)) return (EINVAL); if (ptent->idx != kidx) return (ENOTSUP); } /* Convert data into kernel request objects */ objheader_to_ti(oh, &ti); ti.type = oh->ntlv.type; ti.uidx = kidx; /* Use on-stack buffer for single add/del */ if (ctlv->count == 1) { memset(&tei, 0, sizeof(tei)); tei_buf = &tei; } else tei_buf = malloc(ctlv->count * sizeof(tei), M_TEMP, M_WAITOK | M_ZERO); ptei = tei_buf; ptent = tent; for (i = 0; i < ctlv->count; i++, ptent++, ptei++) { ptei->paddr = &ptent->k; ptei->subtype = ptent->subtype; ptei->masklen = ptent->masklen; if (ptent->head.flags & IPFW_TF_UPDATE) ptei->flags |= TEI_FLAGS_UPDATE; ipfw_import_table_value_v1(&ptent->v.value); ptei->pvalue = (struct table_value *)&ptent->v.value; } error = (oh->opheader.opcode == IP_FW_TABLE_XADD) ? add_table_entry(ch, &ti, tei_buf, ctlv->flags, ctlv->count) : del_table_entry(ch, &ti, tei_buf, ctlv->flags, ctlv->count); /* Translate result back to userland */ ptei = tei_buf; ptent = tent; for (i = 0; i < ctlv->count; i++, ptent++, ptei++) { if (ptei->flags & TEI_FLAGS_ADDED) ptent->result = IPFW_TR_ADDED; else if (ptei->flags & TEI_FLAGS_DELETED) ptent->result = IPFW_TR_DELETED; else if (ptei->flags & TEI_FLAGS_UPDATED) ptent->result = IPFW_TR_UPDATED; else if (ptei->flags & TEI_FLAGS_LIMIT) ptent->result = IPFW_TR_LIMIT; else if (ptei->flags & TEI_FLAGS_ERROR) ptent->result = IPFW_TR_ERROR; else if (ptei->flags & TEI_FLAGS_NOTFOUND) ptent->result = IPFW_TR_NOTFOUND; else if (ptei->flags & TEI_FLAGS_EXISTS) ptent->result = IPFW_TR_EXISTS; ipfw_export_table_value_v1(ptei->pvalue, &ptent->v.value); } if (tei_buf != &tei) free(tei_buf, M_TEMP); return (error); } /* * Looks up an entry in given table. * Data layout (v0)(current): * Request: [ ipfw_obj_header ipfw_obj_tentry ] * Reply: [ ipfw_obj_header ipfw_obj_tentry ] * * Returns 0 on success */ static int find_table_entry(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_obj_tentry *tent; ipfw_obj_header *oh; struct tid_info ti; struct table_config *tc; struct table_algo *ta; struct table_info *kti; struct namedobj_instance *ni; int error; size_t sz; /* Check minimum header size */ sz = sizeof(*oh) + sizeof(*tent); if (sd->valsize != sz) return (EINVAL); oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); tent = (ipfw_obj_tentry *)(oh + 1); /* Basic length checks for TLVs */ if (oh->ntlv.head.length != sizeof(oh->ntlv)) return (EINVAL); objheader_to_ti(oh, &ti); ti.type = oh->ntlv.type; ti.uidx = tent->idx; IPFW_UH_RLOCK(ch); ni = CHAIN_TO_NI(ch); /* * Find existing table and check its type . */ ta = NULL; if ((tc = find_table(ni, &ti)) == NULL) { IPFW_UH_RUNLOCK(ch); return (ESRCH); } /* check table type */ if (tc->no.type != ti.type) { IPFW_UH_RUNLOCK(ch); return (EINVAL); } kti = KIDX_TO_TI(ch, tc->no.kidx); ta = tc->ta; if (ta->find_tentry == NULL) return (ENOTSUP); error = ta->find_tentry(tc->astate, kti, tent); IPFW_UH_RUNLOCK(ch); return (error); } /* * Flushes all entries or destroys given table. * Data layout (v0)(current): * Request: [ ipfw_obj_header ] * * Returns 0 on success */ static int flush_table_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { int error; struct _ipfw_obj_header *oh; struct tid_info ti; if (sd->valsize != sizeof(*oh)) return (EINVAL); oh = (struct _ipfw_obj_header *)op3; objheader_to_ti(oh, &ti); if (op3->opcode == IP_FW_TABLE_XDESTROY) error = destroy_table(ch, &ti); else if (op3->opcode == IP_FW_TABLE_XFLUSH) error = flush_table(ch, &ti); else return (ENOTSUP); return (error); } static void restart_flush(void *object, struct op_state *_state) { struct tableop_state *ts; ts = (struct tableop_state *)_state; if (ts->tc != object) return; /* Indicate we've called */ ts->modified = 1; } /* * Flushes given table. * * Function create new table instance with the same * parameters, swaps it with old one and * flushes state without holding runtime WLOCK. * * Returns 0 on success. */ int flush_table(struct ip_fw_chain *ch, struct tid_info *ti) { struct namedobj_instance *ni; struct table_config *tc; struct table_algo *ta; struct table_info ti_old, ti_new, *tablestate; void *astate_old, *astate_new; char algostate[64], *pstate; struct tableop_state ts; - int error; + int error, need_gc; uint16_t kidx; uint8_t tflags; /* * Stage 1: save table algoritm. * Reference found table to ensure it won't disappear. */ IPFW_UH_WLOCK(ch); ni = CHAIN_TO_NI(ch); if ((tc = find_table(ni, ti)) == NULL) { IPFW_UH_WUNLOCK(ch); return (ESRCH); } + need_gc = 0; + astate_new = NULL; + memset(&ti_new, 0, sizeof(ti_new)); restart: /* Set up swap handler */ memset(&ts, 0, sizeof(ts)); ts.opstate.func = restart_flush; ts.tc = tc; ta = tc->ta; /* Do not flush readonly tables */ if ((ta->flags & TA_FLAG_READONLY) != 0) { IPFW_UH_WUNLOCK(ch); return (EACCES); } /* Save startup algo parameters */ if (ta->print_config != NULL) { ta->print_config(tc->astate, KIDX_TO_TI(ch, tc->no.kidx), algostate, sizeof(algostate)); pstate = algostate; } else pstate = NULL; tflags = tc->tflags; tc->no.refcnt++; add_toperation_state(ch, &ts); IPFW_UH_WUNLOCK(ch); /* + * Stage 1.5: if this is not the first attempt, destroy previous state + */ + if (need_gc != 0) { + ta->destroy(astate_new, &ti_new); + need_gc = 0; + } + + /* * Stage 2: allocate new table instance using same algo. */ memset(&ti_new, 0, sizeof(struct table_info)); error = ta->init(ch, &astate_new, &ti_new, pstate, tflags); /* * Stage 3: swap old state pointers with newly-allocated ones. * Decrease refcount. */ IPFW_UH_WLOCK(ch); tc->no.refcnt--; del_toperation_state(ch, &ts); if (error != 0) { IPFW_UH_WUNLOCK(ch); return (error); } /* * Restart operation if table swap has happened: * even if algo may be the same, algo init parameters * may change. Restart operation instead of doing * complex checks. */ if (ts.modified != 0) { - ta->destroy(astate_new, &ti_new); + /* Delay destroying data since we're holding UH lock */ + need_gc = 1; goto restart; } ni = CHAIN_TO_NI(ch); kidx = tc->no.kidx; tablestate = (struct table_info *)ch->tablestate; IPFW_WLOCK(ch); ti_old = tablestate[kidx]; tablestate[kidx] = ti_new; IPFW_WUNLOCK(ch); astate_old = tc->astate; tc->astate = astate_new; tc->ti_copy = ti_new; tc->count = 0; /* Notify algo on real @ti address */ if (ta->change_ti != NULL) ta->change_ti(tc->astate, &tablestate[kidx]); /* * Stage 4: unref values. */ ipfw_unref_table_values(ch, tc, ta, astate_old, &ti_old); IPFW_UH_WUNLOCK(ch); /* * Stage 5: perform real flush/destroy. */ ta->destroy(astate_old, &ti_old); return (0); } /* * Swaps two tables. * Data layout (v0)(current): * Request: [ ipfw_obj_header ipfw_obj_ntlv ] * * Returns 0 on success */ static int swap_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { int error; struct _ipfw_obj_header *oh; struct tid_info ti_a, ti_b; if (sd->valsize != sizeof(*oh) + sizeof(ipfw_obj_ntlv)) return (EINVAL); oh = (struct _ipfw_obj_header *)op3; ntlv_to_ti(&oh->ntlv, &ti_a); ntlv_to_ti((ipfw_obj_ntlv *)(oh + 1), &ti_b); error = swap_tables(ch, &ti_a, &ti_b); return (error); } /* * Swaps two tables of the same type/valtype. * * Checks if tables are compatible and limits * permits swap, than actually perform swap. * * Each table consists of 2 different parts: * config: * @tc (with name, set, kidx) and rule bindings, which is "stable". * number of items * table algo * runtime: * runtime data @ti (ch->tablestate) * runtime cache in @tc * algo-specific data (@tc->astate) * * So we switch: * all runtime data * number of items * table algo * * After that we call @ti change handler for each table. * * Note that referencing @tc won't protect tc->ta from change. * XXX: Do we need to restrict swap between locked tables? * XXX: Do we need to exchange ftype? * * Returns 0 on success. */ static int swap_tables(struct ip_fw_chain *ch, struct tid_info *a, struct tid_info *b) { struct namedobj_instance *ni; struct table_config *tc_a, *tc_b; struct table_algo *ta; struct table_info ti, *tablestate; void *astate; uint32_t count; /* * Stage 1: find both tables and ensure they are of * the same type. */ IPFW_UH_WLOCK(ch); ni = CHAIN_TO_NI(ch); if ((tc_a = find_table(ni, a)) == NULL) { IPFW_UH_WUNLOCK(ch); return (ESRCH); } if ((tc_b = find_table(ni, b)) == NULL) { IPFW_UH_WUNLOCK(ch); return (ESRCH); } /* It is very easy to swap between the same table */ if (tc_a == tc_b) { IPFW_UH_WUNLOCK(ch); return (0); } /* Check type and value are the same */ if (tc_a->no.type != tc_b->no.type || tc_a->tflags != tc_b->tflags) { IPFW_UH_WUNLOCK(ch); return (EINVAL); } /* Check limits before swap */ if ((tc_a->limit != 0 && tc_b->count > tc_a->limit) || (tc_b->limit != 0 && tc_a->count > tc_b->limit)) { IPFW_UH_WUNLOCK(ch); return (EFBIG); } /* Check if one of the tables is readonly */ if (((tc_a->ta->flags | tc_b->ta->flags) & TA_FLAG_READONLY) != 0) { IPFW_UH_WUNLOCK(ch); return (EACCES); } /* Notify we're going to swap */ rollback_toperation_state(ch, tc_a); rollback_toperation_state(ch, tc_b); /* Everything is fine, prepare to swap */ tablestate = (struct table_info *)ch->tablestate; ti = tablestate[tc_a->no.kidx]; ta = tc_a->ta; astate = tc_a->astate; count = tc_a->count; IPFW_WLOCK(ch); /* a <- b */ tablestate[tc_a->no.kidx] = tablestate[tc_b->no.kidx]; tc_a->ta = tc_b->ta; tc_a->astate = tc_b->astate; tc_a->count = tc_b->count; /* b <- a */ tablestate[tc_b->no.kidx] = ti; tc_b->ta = ta; tc_b->astate = astate; tc_b->count = count; IPFW_WUNLOCK(ch); /* Ensure tc.ti copies are in sync */ tc_a->ti_copy = tablestate[tc_a->no.kidx]; tc_b->ti_copy = tablestate[tc_b->no.kidx]; /* Notify both tables on @ti change */ if (tc_a->ta->change_ti != NULL) tc_a->ta->change_ti(tc_a->astate, &tablestate[tc_a->no.kidx]); if (tc_b->ta->change_ti != NULL) tc_b->ta->change_ti(tc_b->astate, &tablestate[tc_b->no.kidx]); IPFW_UH_WUNLOCK(ch); return (0); } /* * Destroys table specified by @ti. * Data layout (v0)(current): * Request: [ ip_fw3_opheader ] * * Returns 0 on success */ static int destroy_table(struct ip_fw_chain *ch, struct tid_info *ti) { struct namedobj_instance *ni; struct table_config *tc; IPFW_UH_WLOCK(ch); ni = CHAIN_TO_NI(ch); if ((tc = find_table(ni, ti)) == NULL) { IPFW_UH_WUNLOCK(ch); return (ESRCH); } /* Do not permit destroying referenced tables */ if (tc->no.refcnt > 0) { IPFW_UH_WUNLOCK(ch); return (EBUSY); } IPFW_WLOCK(ch); unlink_table(ch, tc); IPFW_WUNLOCK(ch); /* Free obj index */ if (ipfw_objhash_free_idx(ni, tc->no.kidx) != 0) printf("Error unlinking kidx %d from table %s\n", tc->no.kidx, tc->tablename); /* Unref values used in tables while holding UH lock */ ipfw_unref_table_values(ch, tc, tc->ta, tc->astate, &tc->ti_copy); IPFW_UH_WUNLOCK(ch); free_table_config(ni, tc); return (0); } static uint32_t roundup2p(uint32_t v) { v--; v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16; v++; return (v); } /* * Grow tables index. * * Returns 0 on success. */ int ipfw_resize_tables(struct ip_fw_chain *ch, unsigned int ntables) { unsigned int ntables_old, tbl; struct namedobj_instance *ni; void *new_idx, *old_tablestate, *tablestate; struct table_info *ti; struct table_config *tc; int i, new_blocks; /* Check new value for validity */ if (ntables == 0) return (EINVAL); if (ntables > IPFW_TABLES_MAX) ntables = IPFW_TABLES_MAX; /* Alight to nearest power of 2 */ ntables = (unsigned int)roundup2p(ntables); /* Allocate new pointers */ tablestate = malloc(ntables * sizeof(struct table_info), M_IPFW, M_WAITOK | M_ZERO); ipfw_objhash_bitmap_alloc(ntables, (void *)&new_idx, &new_blocks); IPFW_UH_WLOCK(ch); tbl = (ntables >= V_fw_tables_max) ? V_fw_tables_max : ntables; ni = CHAIN_TO_NI(ch); /* Temporary restrict decreasing max_tables */ if (ntables < V_fw_tables_max) { /* * FIXME: Check if we really can shrink */ IPFW_UH_WUNLOCK(ch); return (EINVAL); } /* Copy table info/indices */ memcpy(tablestate, ch->tablestate, sizeof(struct table_info) * tbl); ipfw_objhash_bitmap_merge(ni, &new_idx, &new_blocks); IPFW_WLOCK(ch); /* Change pointers */ old_tablestate = ch->tablestate; ch->tablestate = tablestate; ipfw_objhash_bitmap_swap(ni, &new_idx, &new_blocks); ntables_old = V_fw_tables_max; V_fw_tables_max = ntables; IPFW_WUNLOCK(ch); /* Notify all consumers that their @ti pointer has changed */ ti = (struct table_info *)ch->tablestate; for (i = 0; i < tbl; i++, ti++) { if (ti->lookup == NULL) continue; tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, i); if (tc == NULL || tc->ta->change_ti == NULL) continue; tc->ta->change_ti(tc->astate, ti); } IPFW_UH_WUNLOCK(ch); /* Free old pointers */ free(old_tablestate, M_IPFW); ipfw_objhash_bitmap_free(new_idx, new_blocks); return (0); } /* * Switch between "set 0" and "rule's set" table binding, * Check all ruleset bindings and permits changing * IFF each binding has both rule AND table in default set (set 0). * * Returns 0 on success. */ int ipfw_switch_tables_namespace(struct ip_fw_chain *ch, unsigned int sets) { struct namedobj_instance *ni; struct named_object *no; struct ip_fw *rule; ipfw_insn *cmd; int cmdlen, i, l; uint16_t kidx; uint8_t type; IPFW_UH_WLOCK(ch); if (V_fw_tables_sets == sets) { IPFW_UH_WUNLOCK(ch); return (0); } ni = CHAIN_TO_NI(ch); /* * Scan all rules and examine tables opcodes. */ for (i = 0; i < ch->n_rules; i++) { rule = ch->map[i]; l = rule->cmd_len; cmd = rule->cmd; cmdlen = 0; for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); if (classify_table_opcode(cmd, &kidx, &type) != 0) continue; no = ipfw_objhash_lookup_kidx(ni, kidx); /* Check if both table object and rule has the set 0 */ if (no->set != 0 || rule->set != 0) { IPFW_UH_WUNLOCK(ch); return (EBUSY); } } } V_fw_tables_sets = sets; IPFW_UH_WUNLOCK(ch); return (0); } /* * Lookup an IP @addr in table @tbl. * Stores found value in @val. * * Returns 1 if @addr was found. */ int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, in_addr_t addr, uint32_t *val) { struct table_info *ti; ti = KIDX_TO_TI(ch, tbl); return (ti->lookup(ti, &addr, sizeof(in_addr_t), val)); } /* * Lookup an arbtrary key @paddr of legth @plen in table @tbl. * Stores found value in @val. * * Returns 1 if key was found. */ int ipfw_lookup_table_extended(struct ip_fw_chain *ch, uint16_t tbl, uint16_t plen, void *paddr, uint32_t *val) { struct table_info *ti; ti = KIDX_TO_TI(ch, tbl); return (ti->lookup(ti, paddr, plen, val)); } /* * Info/List/dump support for tables. * */ /* * High-level 'get' cmds sysctl handlers */ /* * Lists all tables currently available in kernel. * Data layout (v0)(current): * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size * Reply: [ ipfw_obj_lheader ipfw_xtable_info x N ] * * Returns 0 on success */ static int list_tables(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { struct _ipfw_obj_lheader *olh; int error; olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh)); if (olh == NULL) return (EINVAL); if (sd->valsize < olh->size) return (EINVAL); IPFW_UH_RLOCK(ch); error = export_tables(ch, olh, sd); IPFW_UH_RUNLOCK(ch); return (error); } /* * Store table info to buffer provided by @sd. * Data layout (v0)(current): * Request: [ ipfw_obj_header ipfw_xtable_info(empty)] * Reply: [ ipfw_obj_header ipfw_xtable_info ] * * Returns 0 on success. */ static int describe_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { struct _ipfw_obj_header *oh; struct table_config *tc; struct tid_info ti; size_t sz; sz = sizeof(*oh) + sizeof(ipfw_xtable_info); oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); if (oh == NULL) return (EINVAL); objheader_to_ti(oh, &ti); IPFW_UH_RLOCK(ch); if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) { IPFW_UH_RUNLOCK(ch); return (ESRCH); } export_table_info(ch, tc, (ipfw_xtable_info *)(oh + 1)); IPFW_UH_RUNLOCK(ch); return (0); } /* * Modifies existing table. * Data layout (v0)(current): * Request: [ ipfw_obj_header ipfw_xtable_info ] * * Returns 0 on success */ static int modify_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { struct _ipfw_obj_header *oh; ipfw_xtable_info *i; char *tname; struct tid_info ti; struct namedobj_instance *ni; struct table_config *tc; if (sd->valsize != sizeof(*oh) + sizeof(ipfw_xtable_info)) return (EINVAL); oh = (struct _ipfw_obj_header *)sd->kbuf; i = (ipfw_xtable_info *)(oh + 1); /* * Verify user-supplied strings. * Check for null-terminated/zero-length strings/ */ tname = oh->ntlv.name; if (ipfw_check_table_name(tname) != 0) return (EINVAL); objheader_to_ti(oh, &ti); ti.type = i->type; IPFW_UH_WLOCK(ch); ni = CHAIN_TO_NI(ch); if ((tc = find_table(ni, &ti)) == NULL) { IPFW_UH_WUNLOCK(ch); return (ESRCH); } /* Do not support any modifications for readonly tables */ if ((tc->ta->flags & TA_FLAG_READONLY) != 0) { IPFW_UH_WUNLOCK(ch); return (EACCES); } if ((i->mflags & IPFW_TMFLAGS_LIMIT) != 0) tc->limit = i->limit; if ((i->mflags & IPFW_TMFLAGS_LOCK) != 0) tc->locked = ((i->flags & IPFW_TGFLAGS_LOCKED) != 0); IPFW_UH_WUNLOCK(ch); return (0); } /* * Creates new table. * Data layout (v0)(current): * Request: [ ipfw_obj_header ipfw_xtable_info ] * * Returns 0 on success */ static int create_table(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { struct _ipfw_obj_header *oh; ipfw_xtable_info *i; char *tname, *aname; struct tid_info ti; struct namedobj_instance *ni; if (sd->valsize != sizeof(*oh) + sizeof(ipfw_xtable_info)) return (EINVAL); oh = (struct _ipfw_obj_header *)sd->kbuf; i = (ipfw_xtable_info *)(oh + 1); /* * Verify user-supplied strings. * Check for null-terminated/zero-length strings/ */ tname = oh->ntlv.name; aname = i->algoname; if (ipfw_check_table_name(tname) != 0 || strnlen(aname, sizeof(i->algoname)) == sizeof(i->algoname)) return (EINVAL); if (aname[0] == '\0') { /* Use default algorithm */ aname = NULL; } objheader_to_ti(oh, &ti); ti.type = i->type; ni = CHAIN_TO_NI(ch); IPFW_UH_RLOCK(ch); if (find_table(ni, &ti) != NULL) { IPFW_UH_RUNLOCK(ch); return (EEXIST); } IPFW_UH_RUNLOCK(ch); return (create_table_internal(ch, &ti, aname, i, NULL, 0)); } /* * Creates new table based on @ti and @aname. * * Relies on table name checking inside find_name_tlv() * Assume @aname to be checked and valid. * Stores allocated table kidx inside @pkidx (if non-NULL). * Reference created table if @compat is non-zero. * * Returns 0 on success. */ static int create_table_internal(struct ip_fw_chain *ch, struct tid_info *ti, char *aname, ipfw_xtable_info *i, uint16_t *pkidx, int compat) { struct namedobj_instance *ni; struct table_config *tc, *tc_new, *tmp; struct table_algo *ta; uint16_t kidx; ni = CHAIN_TO_NI(ch); ta = find_table_algo(CHAIN_TO_TCFG(ch), ti, aname); if (ta == NULL) return (ENOTSUP); tc = alloc_table_config(ch, ti, ta, aname, i->tflags); if (tc == NULL) return (ENOMEM); tc->vmask = i->vmask; tc->limit = i->limit; if (ta->flags & TA_FLAG_READONLY) tc->locked = 1; else tc->locked = (i->flags & IPFW_TGFLAGS_LOCKED) != 0; IPFW_UH_WLOCK(ch); /* Check if table has been already created */ tc_new = find_table(ni, ti); if (tc_new != NULL) { /* * Compat: do not fail if we're * requesting to create existing table * which has the same type */ if (compat == 0 || tc_new->no.type != tc->no.type) { IPFW_UH_WUNLOCK(ch); free_table_config(ni, tc); return (EEXIST); } /* Exchange tc and tc_new for proper refcounting & freeing */ tmp = tc; tc = tc_new; tc_new = tmp; } else { /* New table */ if (ipfw_objhash_alloc_idx(ni, &kidx) != 0) { IPFW_UH_WUNLOCK(ch); printf("Unable to allocate table index." " Consider increasing net.inet.ip.fw.tables_max"); free_table_config(ni, tc); return (EBUSY); } tc->no.kidx = kidx; IPFW_WLOCK(ch); link_table(ch, tc); IPFW_WUNLOCK(ch); } if (compat != 0) tc->no.refcnt++; if (pkidx != NULL) *pkidx = tc->no.kidx; IPFW_UH_WUNLOCK(ch); if (tc_new != NULL) free_table_config(ni, tc_new); return (0); } static void ntlv_to_ti(ipfw_obj_ntlv *ntlv, struct tid_info *ti) { memset(ti, 0, sizeof(struct tid_info)); ti->set = ntlv->set; ti->uidx = ntlv->idx; ti->tlvs = ntlv; ti->tlen = ntlv->head.length; } static void objheader_to_ti(struct _ipfw_obj_header *oh, struct tid_info *ti) { ntlv_to_ti(&oh->ntlv, ti); } /* * Exports basic table info as name TLV. * Used inside dump_static_rules() to provide info * about all tables referenced by current ruleset. * * Returns 0 on success. */ int ipfw_export_table_ntlv(struct ip_fw_chain *ch, uint16_t kidx, struct sockopt_data *sd) { struct namedobj_instance *ni; struct named_object *no; ipfw_obj_ntlv *ntlv; ni = CHAIN_TO_NI(ch); no = ipfw_objhash_lookup_kidx(ni, kidx); KASSERT(no != NULL, ("invalid table kidx passed")); ntlv = (ipfw_obj_ntlv *)ipfw_get_sopt_space(sd, sizeof(*ntlv)); if (ntlv == NULL) return (ENOMEM); ntlv->head.type = IPFW_TLV_TBL_NAME; ntlv->head.length = sizeof(*ntlv); ntlv->idx = no->kidx; strlcpy(ntlv->name, no->name, sizeof(ntlv->name)); return (0); } /* * Marks every table kidx used in @rule with bit in @bmask. * Used to generate bitmask of referenced tables for given ruleset. * * Returns number of newly-referenced tables. */ int ipfw_mark_table_kidx(struct ip_fw_chain *chain, struct ip_fw *rule, uint32_t *bmask) { int cmdlen, l, count; ipfw_insn *cmd; uint16_t kidx; uint8_t type; l = rule->cmd_len; cmd = rule->cmd; cmdlen = 0; count = 0; for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); if (classify_table_opcode(cmd, &kidx, &type) != 0) continue; if ((bmask[kidx / 32] & (1 << (kidx % 32))) == 0) count++; bmask[kidx / 32] |= 1 << (kidx % 32); } return (count); } struct dump_args { struct ip_fw_chain *ch; struct table_info *ti; struct table_config *tc; struct sockopt_data *sd; uint32_t cnt; uint16_t uidx; int error; uint32_t size; ipfw_table_entry *ent; ta_foreach_f *f; void *farg; ipfw_obj_tentry tent; }; static int count_ext_entries(void *e, void *arg) { struct dump_args *da; da = (struct dump_args *)arg; da->cnt++; return (0); } /* * Gets number of items from table either using * internal counter or calling algo callback for * externally-managed tables. * * Returns number of records. */ static uint32_t table_get_count(struct ip_fw_chain *ch, struct table_config *tc) { struct table_info *ti; struct table_algo *ta; struct dump_args da; ti = KIDX_TO_TI(ch, tc->no.kidx); ta = tc->ta; /* Use internal counter for self-managed tables */ if ((ta->flags & TA_FLAG_READONLY) == 0) return (tc->count); /* Use callback to quickly get number of items */ if ((ta->flags & TA_FLAG_EXTCOUNTER) != 0) return (ta->get_count(tc->astate, ti)); /* Count number of iterms ourselves */ memset(&da, 0, sizeof(da)); ta->foreach(tc->astate, ti, count_ext_entries, &da); return (da.cnt); } /* * Exports table @tc info into standard ipfw_xtable_info format. */ static void export_table_info(struct ip_fw_chain *ch, struct table_config *tc, ipfw_xtable_info *i) { struct table_info *ti; struct table_algo *ta; i->type = tc->no.type; i->tflags = tc->tflags; i->vmask = tc->vmask; i->set = tc->no.set; i->kidx = tc->no.kidx; i->refcnt = tc->no.refcnt; i->count = table_get_count(ch, tc); i->limit = tc->limit; i->flags |= (tc->locked != 0) ? IPFW_TGFLAGS_LOCKED : 0; i->size = tc->count * sizeof(ipfw_obj_tentry); i->size += sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info); strlcpy(i->tablename, tc->tablename, sizeof(i->tablename)); ti = KIDX_TO_TI(ch, tc->no.kidx); ta = tc->ta; if (ta->print_config != NULL) { /* Use algo function to print table config to string */ ta->print_config(tc->astate, ti, i->algoname, sizeof(i->algoname)); } else strlcpy(i->algoname, ta->name, sizeof(i->algoname)); /* Dump algo-specific data, if possible */ if (ta->dump_tinfo != NULL) { ta->dump_tinfo(tc->astate, ti, &i->ta_info); i->ta_info.flags |= IPFW_TATFLAGS_DATA; } } struct dump_table_args { struct ip_fw_chain *ch; struct sockopt_data *sd; }; static void export_table_internal(struct namedobj_instance *ni, struct named_object *no, void *arg) { ipfw_xtable_info *i; struct dump_table_args *dta; dta = (struct dump_table_args *)arg; i = (ipfw_xtable_info *)ipfw_get_sopt_space(dta->sd, sizeof(*i)); KASSERT(i != 0, ("previously checked buffer is not enough")); export_table_info(dta->ch, (struct table_config *)no, i); } /* * Export all tables as ipfw_xtable_info structures to * storage provided by @sd. * * If supplied buffer is too small, fills in required size * and returns ENOMEM. * Returns 0 on success. */ static int export_tables(struct ip_fw_chain *ch, ipfw_obj_lheader *olh, struct sockopt_data *sd) { uint32_t size; uint32_t count; struct dump_table_args dta; count = ipfw_objhash_count(CHAIN_TO_NI(ch)); size = count * sizeof(ipfw_xtable_info) + sizeof(ipfw_obj_lheader); /* Fill in header regadless of buffer size */ olh->count = count; olh->objsize = sizeof(ipfw_xtable_info); if (size > olh->size) { olh->size = size; return (ENOMEM); } olh->size = size; dta.ch = ch; dta.sd = sd; ipfw_objhash_foreach(CHAIN_TO_NI(ch), export_table_internal, &dta); return (0); } /* * Dumps all table data * Data layout (v1)(current): * Request: [ ipfw_obj_header ], size = ipfw_xtable_info.size * Reply: [ ipfw_obj_header ipfw_xtable_info ipfw_obj_tentry x N ] * * Returns 0 on success */ static int dump_table_v1(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { struct _ipfw_obj_header *oh; ipfw_xtable_info *i; struct tid_info ti; struct table_config *tc; struct table_algo *ta; struct dump_args da; uint32_t sz; sz = sizeof(ipfw_obj_header) + sizeof(ipfw_xtable_info); oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz); if (oh == NULL) return (EINVAL); i = (ipfw_xtable_info *)(oh + 1); objheader_to_ti(oh, &ti); IPFW_UH_RLOCK(ch); if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) { IPFW_UH_RUNLOCK(ch); return (ESRCH); } export_table_info(ch, tc, i); if (sd->valsize < i->size) { /* * Submitted buffer size is not enough. * WE've already filled in @i structure with * relevant table info including size, so we * can return. Buffer will be flushed automatically. */ IPFW_UH_RUNLOCK(ch); return (ENOMEM); } /* * Do the actual dump in eXtended format */ memset(&da, 0, sizeof(da)); da.ch = ch; da.ti = KIDX_TO_TI(ch, tc->no.kidx); da.tc = tc; da.sd = sd; ta = tc->ta; ta->foreach(tc->astate, da.ti, dump_table_tentry, &da); IPFW_UH_RUNLOCK(ch); return (da.error); } /* * Dumps all table data * Data layout (version 0)(legacy): * Request: [ ipfw_xtable ], size = IP_FW_TABLE_XGETSIZE() * Reply: [ ipfw_xtable ipfw_table_xentry x N ] * * Returns 0 on success */ static int dump_table_v0(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { ipfw_xtable *xtbl; struct tid_info ti; struct table_config *tc; struct table_algo *ta; struct dump_args da; size_t sz, count; xtbl = (ipfw_xtable *)ipfw_get_sopt_header(sd, sizeof(ipfw_xtable)); if (xtbl == NULL) return (EINVAL); memset(&ti, 0, sizeof(ti)); ti.uidx = xtbl->tbl; IPFW_UH_RLOCK(ch); if ((tc = find_table(CHAIN_TO_NI(ch), &ti)) == NULL) { IPFW_UH_RUNLOCK(ch); return (0); } count = table_get_count(ch, tc); sz = count * sizeof(ipfw_table_xentry) + sizeof(ipfw_xtable); xtbl->cnt = count; xtbl->size = sz; xtbl->type = tc->no.type; xtbl->tbl = ti.uidx; if (sd->valsize < sz) { /* * Submitted buffer size is not enough. * WE've already filled in @i structure with * relevant table info including size, so we * can return. Buffer will be flushed automatically. */ IPFW_UH_RUNLOCK(ch); return (ENOMEM); } /* Do the actual dump in eXtended format */ memset(&da, 0, sizeof(da)); da.ch = ch; da.ti = KIDX_TO_TI(ch, tc->no.kidx); da.tc = tc; da.sd = sd; ta = tc->ta; ta->foreach(tc->astate, da.ti, dump_table_xentry, &da); IPFW_UH_RUNLOCK(ch); return (0); } /* * Legacy function to retrieve number of items in table. */ static int get_table_size(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { uint32_t *tbl; struct tid_info ti; size_t sz; int error; sz = sizeof(*op3) + sizeof(uint32_t); op3 = (ip_fw3_opheader *)ipfw_get_sopt_header(sd, sz); if (op3 == NULL) return (EINVAL); tbl = (uint32_t *)(op3 + 1); memset(&ti, 0, sizeof(ti)); ti.uidx = *tbl; IPFW_UH_RLOCK(ch); error = ipfw_count_xtable(ch, &ti, tbl); IPFW_UH_RUNLOCK(ch); return (error); } /* * Legacy IP_FW_TABLE_GETSIZE handler */ int ipfw_count_table(struct ip_fw_chain *ch, struct tid_info *ti, uint32_t *cnt) { struct table_config *tc; if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL) return (ESRCH); *cnt = table_get_count(ch, tc); return (0); } /* * Legacy IP_FW_TABLE_XGETSIZE handler */ int ipfw_count_xtable(struct ip_fw_chain *ch, struct tid_info *ti, uint32_t *cnt) { struct table_config *tc; uint32_t count; if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL) { *cnt = 0; return (0); /* 'table all list' requires success */ } count = table_get_count(ch, tc); *cnt = count * sizeof(ipfw_table_xentry); if (count > 0) *cnt += sizeof(ipfw_xtable); return (0); } static int dump_table_entry(void *e, void *arg) { struct dump_args *da; struct table_config *tc; struct table_algo *ta; ipfw_table_entry *ent; struct table_value *pval; int error; da = (struct dump_args *)arg; tc = da->tc; ta = tc->ta; /* Out of memory, returning */ if (da->cnt == da->size) return (1); ent = da->ent++; ent->tbl = da->uidx; da->cnt++; error = ta->dump_tentry(tc->astate, da->ti, e, &da->tent); if (error != 0) return (error); ent->addr = da->tent.k.addr.s_addr; ent->masklen = da->tent.masklen; pval = get_table_value(da->ch, da->tc, da->tent.v.kidx); ent->value = ipfw_export_table_value_legacy(pval); return (0); } /* * Dumps table in pre-8.1 legacy format. */ int ipfw_dump_table_legacy(struct ip_fw_chain *ch, struct tid_info *ti, ipfw_table *tbl) { struct table_config *tc; struct table_algo *ta; struct dump_args da; tbl->cnt = 0; if ((tc = find_table(CHAIN_TO_NI(ch), ti)) == NULL) return (0); /* XXX: We should return ESRCH */ ta = tc->ta; /* This dump format supports IPv4 only */ if (tc->no.type != IPFW_TABLE_ADDR) return (0); memset(&da, 0, sizeof(da)); da.ch = ch; da.ti = KIDX_TO_TI(ch, tc->no.kidx); da.tc = tc; da.ent = &tbl->ent[0]; da.size = tbl->size; tbl->cnt = 0; ta->foreach(tc->astate, da.ti, dump_table_entry, &da); tbl->cnt = da.cnt; return (0); } /* * Dumps table entry in eXtended format (v1)(current). */ static int dump_table_tentry(void *e, void *arg) { struct dump_args *da; struct table_config *tc; struct table_algo *ta; struct table_value *pval; ipfw_obj_tentry *tent; int error; da = (struct dump_args *)arg; tc = da->tc; ta = tc->ta; tent = (ipfw_obj_tentry *)ipfw_get_sopt_space(da->sd, sizeof(*tent)); /* Out of memory, returning */ if (tent == NULL) { da->error = ENOMEM; return (1); } tent->head.length = sizeof(ipfw_obj_tentry); tent->idx = da->uidx; error = ta->dump_tentry(tc->astate, da->ti, e, tent); if (error != 0) return (error); pval = get_table_value(da->ch, da->tc, tent->v.kidx); ipfw_export_table_value_v1(pval, &tent->v.value); return (0); } /* * Dumps table entry in eXtended format (v0). */ static int dump_table_xentry(void *e, void *arg) { struct dump_args *da; struct table_config *tc; struct table_algo *ta; ipfw_table_xentry *xent; ipfw_obj_tentry *tent; struct table_value *pval; int error; da = (struct dump_args *)arg; tc = da->tc; ta = tc->ta; xent = (ipfw_table_xentry *)ipfw_get_sopt_space(da->sd, sizeof(*xent)); /* Out of memory, returning */ if (xent == NULL) return (1); xent->len = sizeof(ipfw_table_xentry); xent->tbl = da->uidx; memset(&da->tent, 0, sizeof(da->tent)); tent = &da->tent; error = ta->dump_tentry(tc->astate, da->ti, e, tent); if (error != 0) return (error); /* Convert current format to previous one */ xent->masklen = tent->masklen; pval = get_table_value(da->ch, da->tc, da->tent.v.kidx); xent->value = ipfw_export_table_value_legacy(pval); /* Apply some hacks */ if (tc->no.type == IPFW_TABLE_ADDR && tent->subtype == AF_INET) { xent->k.addr6.s6_addr32[3] = tent->k.addr.s_addr; xent->flags = IPFW_TCF_INET; } else memcpy(&xent->k, &tent->k, sizeof(xent->k)); return (0); } /* * Helper function to export table algo data * to tentry format before calling user function. * * Returns 0 on success. */ static int prepare_table_tentry(void *e, void *arg) { struct dump_args *da; struct table_config *tc; struct table_algo *ta; int error; da = (struct dump_args *)arg; tc = da->tc; ta = tc->ta; error = ta->dump_tentry(tc->astate, da->ti, e, &da->tent); if (error != 0) return (error); da->f(&da->tent, da->farg); return (0); } /* * Allow external consumers to read table entries in standard format. */ int ipfw_foreach_table_tentry(struct ip_fw_chain *ch, uint16_t kidx, ta_foreach_f *f, void *arg) { struct namedobj_instance *ni; struct table_config *tc; struct table_algo *ta; struct dump_args da; ni = CHAIN_TO_NI(ch); tc = (struct table_config *)ipfw_objhash_lookup_kidx(ni, kidx); if (tc == NULL) return (ESRCH); ta = tc->ta; memset(&da, 0, sizeof(da)); da.ch = ch; da.ti = KIDX_TO_TI(ch, tc->no.kidx); da.tc = tc; da.f = f; da.farg = arg; ta->foreach(tc->astate, da.ti, prepare_table_tentry, &da); return (0); } /* * Table algorithms */ /* * Finds algoritm by index, table type or supplied name. * * Returns pointer to algo or NULL. */ static struct table_algo * find_table_algo(struct tables_config *tcfg, struct tid_info *ti, char *name) { int i, l; struct table_algo *ta; if (ti->type > IPFW_TABLE_MAXTYPE) return (NULL); /* Search by index */ if (ti->atype != 0) { if (ti->atype > tcfg->algo_count) return (NULL); return (tcfg->algo[ti->atype]); } if (name == NULL) { /* Return default algorithm for given type if set */ return (tcfg->def_algo[ti->type]); } /* Search by name */ /* TODO: better search */ for (i = 1; i <= tcfg->algo_count; i++) { ta = tcfg->algo[i]; /* * One can supply additional algorithm * parameters so we compare only the first word * of supplied name: * 'addr:chash hsize=32' * '^^^^^^^^^' * */ l = strlen(ta->name); if (strncmp(name, ta->name, l) != 0) continue; if (name[l] != '\0' && name[l] != ' ') continue; /* Check if we're requesting proper table type */ if (ti->type != 0 && ti->type != ta->type) return (NULL); return (ta); } return (NULL); } /* * Register new table algo @ta. * Stores algo id inside @idx. * * Returns 0 on success. */ int ipfw_add_table_algo(struct ip_fw_chain *ch, struct table_algo *ta, size_t size, int *idx) { struct tables_config *tcfg; struct table_algo *ta_new; size_t sz; if (size > sizeof(struct table_algo)) return (EINVAL); /* Check for the required on-stack size for add/del */ sz = roundup2(ta->ta_buf_size, sizeof(void *)); if (sz > TA_BUF_SZ) return (EINVAL); KASSERT(ta->type <= IPFW_TABLE_MAXTYPE,("Increase IPFW_TABLE_MAXTYPE")); /* Copy algorithm data to stable storage. */ ta_new = malloc(sizeof(struct table_algo), M_IPFW, M_WAITOK | M_ZERO); memcpy(ta_new, ta, size); tcfg = CHAIN_TO_TCFG(ch); KASSERT(tcfg->algo_count < 255, ("Increase algo array size")); tcfg->algo[++tcfg->algo_count] = ta_new; ta_new->idx = tcfg->algo_count; /* Set algorithm as default one for given type */ if ((ta_new->flags & TA_FLAG_DEFAULT) != 0 && tcfg->def_algo[ta_new->type] == NULL) tcfg->def_algo[ta_new->type] = ta_new; *idx = ta_new->idx; return (0); } /* * Unregisters table algo using @idx as id. * XXX: It is NOT safe to call this function in any place * other than ipfw instance destroy handler. */ void ipfw_del_table_algo(struct ip_fw_chain *ch, int idx) { struct tables_config *tcfg; struct table_algo *ta; tcfg = CHAIN_TO_TCFG(ch); KASSERT(idx <= tcfg->algo_count, ("algo idx %d out of range 1..%d", idx, tcfg->algo_count)); ta = tcfg->algo[idx]; KASSERT(ta != NULL, ("algo idx %d is NULL", idx)); if (tcfg->def_algo[ta->type] == ta) tcfg->def_algo[ta->type] = NULL; free(ta, M_IPFW); } /* * Lists all table algorithms currently available. * Data layout (v0)(current): * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size * Reply: [ ipfw_obj_lheader ipfw_ta_info x N ] * * Returns 0 on success */ static int list_table_algo(struct ip_fw_chain *ch, ip_fw3_opheader *op3, struct sockopt_data *sd) { struct _ipfw_obj_lheader *olh; struct tables_config *tcfg; ipfw_ta_info *i; struct table_algo *ta; uint32_t count, n, size; olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh)); if (olh == NULL) return (EINVAL); if (sd->valsize < olh->size) return (EINVAL); IPFW_UH_RLOCK(ch); tcfg = CHAIN_TO_TCFG(ch); count = tcfg->algo_count; size = count * sizeof(ipfw_ta_info) + sizeof(ipfw_obj_lheader); /* Fill in header regadless of buffer size */ olh->count = count; olh->objsize = sizeof(ipfw_ta_info); if (size > olh->size) { olh->size = size; IPFW_UH_RUNLOCK(ch); return (ENOMEM); } olh->size = size; for (n = 1; n <= count; n++) { i = (ipfw_ta_info *)ipfw_get_sopt_space(sd, sizeof(*i)); KASSERT(i != 0, ("previously checked buffer is not enough")); ta = tcfg->algo[n]; strlcpy(i->algoname, ta->name, sizeof(i->algoname)); i->type = ta->type; i->refcnt = ta->refcnt; } IPFW_UH_RUNLOCK(ch); return (0); } /* * Tables rewriting code */ /* * Determine table number and lookup type for @cmd. * Fill @tbl and @type with appropriate values. * Returns 0 for relevant opcodes, 1 otherwise. */ static int classify_table_opcode(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype) { ipfw_insn_if *cmdif; int skip; uint16_t v; skip = 1; switch (cmd->opcode) { case O_IP_SRC_LOOKUP: case O_IP_DST_LOOKUP: /* Basic IPv4/IPv6 or u32 lookups */ *puidx = cmd->arg1; /* Assume ADDR by default */ *ptype = IPFW_TABLE_ADDR; skip = 0; if (F_LEN(cmd) > F_INSN_SIZE(ipfw_insn_u32)) { /* * generic lookup. The key must be * in 32bit big-endian format. */ v = ((ipfw_insn_u32 *)cmd)->d[1]; switch (v) { case 0: case 1: /* IPv4 src/dst */ break; case 2: case 3: /* src/dst port */ *ptype = IPFW_TABLE_NUMBER; break; case 4: /* uid/gid */ *ptype = IPFW_TABLE_NUMBER; break; case 5: /* jid */ *ptype = IPFW_TABLE_NUMBER; break; case 6: /* dscp */ *ptype = IPFW_TABLE_NUMBER; break; } } break; case O_XMIT: case O_RECV: case O_VIA: /* Interface table, possibly */ cmdif = (ipfw_insn_if *)cmd; if (cmdif->name[0] != '\1') break; *ptype = IPFW_TABLE_INTERFACE; *puidx = cmdif->p.kidx; skip = 0; break; case O_IP_FLOW_LOOKUP: *puidx = cmd->arg1; *ptype = IPFW_TABLE_FLOW; skip = 0; break; } return (skip); } /* * Sets new table value for given opcode. * Assume the same opcodes as classify_table_opcode() */ static void update_table_opcode(ipfw_insn *cmd, uint16_t idx) { ipfw_insn_if *cmdif; switch (cmd->opcode) { case O_IP_SRC_LOOKUP: case O_IP_DST_LOOKUP: /* Basic IPv4/IPv6 or u32 lookups */ cmd->arg1 = idx; break; case O_XMIT: case O_RECV: case O_VIA: /* Interface table, possibly */ cmdif = (ipfw_insn_if *)cmd; cmdif->p.kidx = idx; break; case O_IP_FLOW_LOOKUP: cmd->arg1 = idx; break; } } /* * Checks table name for validity. * Enforce basic length checks, the rest * should be done in userland. * * Returns 0 if name is considered valid. */ int ipfw_check_table_name(char *name) { int nsize; ipfw_obj_ntlv *ntlv = NULL; nsize = sizeof(ntlv->name); if (strnlen(name, nsize) == nsize) return (EINVAL); if (name[0] == '\0') return (EINVAL); /* * TODO: do some more complicated checks */ return (0); } /* * Find tablename TLV by @uid. * Check @tlvs for valid data inside. * * Returns pointer to found TLV or NULL. */ static ipfw_obj_ntlv * find_name_tlv(void *tlvs, int len, uint16_t uidx) { ipfw_obj_ntlv *ntlv; uintptr_t pa, pe; int l; pa = (uintptr_t)tlvs; pe = pa + len; l = 0; for (; pa < pe; pa += l) { ntlv = (ipfw_obj_ntlv *)pa; l = ntlv->head.length; if (l != sizeof(*ntlv)) return (NULL); if (ntlv->head.type != IPFW_TLV_TBL_NAME) continue; if (ntlv->idx != uidx) continue; if (ipfw_check_table_name(ntlv->name) != 0) return (NULL); return (ntlv); } return (NULL); } /* * Finds table config based on either legacy index * or name in ntlv. * Note @ti structure contains unchecked data from userland. * * Returns pointer to table_config or NULL. */ static struct table_config * find_table(struct namedobj_instance *ni, struct tid_info *ti) { char *name, bname[16]; struct named_object *no; ipfw_obj_ntlv *ntlv; uint32_t set; if (ti->tlvs != NULL) { ntlv = find_name_tlv(ti->tlvs, ti->tlen, ti->uidx); if (ntlv == NULL) return (NULL); name = ntlv->name; /* * Use set provided by @ti instead of @ntlv one. * This is needed due to different sets behavior * controlled by V_fw_tables_sets. */ set = ti->set; } else { snprintf(bname, sizeof(bname), "%d", ti->uidx); name = bname; set = 0; } no = ipfw_objhash_lookup_name(ni, set, name); return ((struct table_config *)no); } /* * Allocate new table config structure using * specified @algo and @aname. * * Returns pointer to config or NULL. */ static struct table_config * alloc_table_config(struct ip_fw_chain *ch, struct tid_info *ti, struct table_algo *ta, char *aname, uint8_t tflags) { char *name, bname[16]; struct table_config *tc; int error; ipfw_obj_ntlv *ntlv; uint32_t set; if (ti->tlvs != NULL) { ntlv = find_name_tlv(ti->tlvs, ti->tlen, ti->uidx); if (ntlv == NULL) return (NULL); name = ntlv->name; set = ntlv->set; } else { snprintf(bname, sizeof(bname), "%d", ti->uidx); name = bname; set = 0; } tc = malloc(sizeof(struct table_config), M_IPFW, M_WAITOK | M_ZERO); tc->no.name = tc->tablename; tc->no.type = ta->type; tc->no.set = set; tc->tflags = tflags; tc->ta = ta; strlcpy(tc->tablename, name, sizeof(tc->tablename)); /* Set "shared" value type by default */ tc->vshared = 1; if (ti->tlvs == NULL) { tc->no.compat = 1; tc->no.uidx = ti->uidx; } /* Preallocate data structures for new tables */ error = ta->init(ch, &tc->astate, &tc->ti_copy, aname, tflags); if (error != 0) { free(tc, M_IPFW); return (NULL); } return (tc); } /* * Destroys table state and config. */ static void free_table_config(struct namedobj_instance *ni, struct table_config *tc) { KASSERT(tc->linked == 0, ("free() on linked config")); + /* UH lock MUST NOT be held */ /* * We're using ta without any locking/referencing. * TODO: fix this if we're going to use unloadable algos. */ tc->ta->destroy(tc->astate, &tc->ti_copy); free(tc, M_IPFW); } /* * Links @tc to @chain table named instance. * Sets appropriate type/states in @chain table info. */ static void link_table(struct ip_fw_chain *ch, struct table_config *tc) { struct namedobj_instance *ni; struct table_info *ti; uint16_t kidx; IPFW_UH_WLOCK_ASSERT(ch); IPFW_WLOCK_ASSERT(ch); ni = CHAIN_TO_NI(ch); kidx = tc->no.kidx; ipfw_objhash_add(ni, &tc->no); ti = KIDX_TO_TI(ch, kidx); *ti = tc->ti_copy; /* Notify algo on real @ti address */ if (tc->ta->change_ti != NULL) tc->ta->change_ti(tc->astate, ti); tc->linked = 1; tc->ta->refcnt++; } /* * Unlinks @tc from @chain table named instance. * Zeroes states in @chain and stores them in @tc. */ static void unlink_table(struct ip_fw_chain *ch, struct table_config *tc) { struct namedobj_instance *ni; struct table_info *ti; uint16_t kidx; IPFW_UH_WLOCK_ASSERT(ch); IPFW_WLOCK_ASSERT(ch); ni = CHAIN_TO_NI(ch); kidx = tc->no.kidx; /* Clear state. @ti copy is already saved inside @tc */ ipfw_objhash_del(ni, &tc->no); ti = KIDX_TO_TI(ch, kidx); memset(ti, 0, sizeof(struct table_info)); tc->linked = 0; tc->ta->refcnt--; /* Notify algo on real @ti address */ if (tc->ta->change_ti != NULL) tc->ta->change_ti(tc->astate, NULL); } struct swap_table_args { int set; int new_set; int mv; }; /* * Change set for each matching table. * * Ensure we dispatch each table once by setting/checking ochange * fields. */ static void swap_table_set(struct namedobj_instance *ni, struct named_object *no, void *arg) { struct table_config *tc; struct swap_table_args *sta; tc = (struct table_config *)no; sta = (struct swap_table_args *)arg; if (no->set != sta->set && (no->set != sta->new_set || sta->mv != 0)) return; if (tc->ochanged != 0) return; tc->ochanged = 1; ipfw_objhash_del(ni, no); if (no->set == sta->set) no->set = sta->new_set; else no->set = sta->set; ipfw_objhash_add(ni, no); } /* * Cleans up ochange field for all tables. */ static void clean_table_set_data(struct namedobj_instance *ni, struct named_object *no, void *arg) { struct table_config *tc; struct swap_table_args *sta; tc = (struct table_config *)no; sta = (struct swap_table_args *)arg; tc->ochanged = 0; } /* * Swaps tables within two sets. */ void ipfw_swap_tables_sets(struct ip_fw_chain *ch, uint32_t set, uint32_t new_set, int mv) { struct swap_table_args sta; IPFW_UH_WLOCK_ASSERT(ch); sta.set = set; sta.new_set = new_set; sta.mv = mv; ipfw_objhash_foreach(CHAIN_TO_NI(ch), swap_table_set, &sta); ipfw_objhash_foreach(CHAIN_TO_NI(ch), clean_table_set_data, &sta); } /* * Move all tables which are reference by rules in @rr to set @new_set. * Makes sure that all relevant tables are referenced ONLLY by given rules. * * Retuns 0 on success, */ int ipfw_move_tables_sets(struct ip_fw_chain *ch, ipfw_range_tlv *rt, uint32_t new_set) { struct ip_fw *rule; struct table_config *tc; struct named_object *no; struct namedobj_instance *ni; int bad, i, l, cmdlen; uint16_t kidx; uint8_t type; ipfw_insn *cmd; IPFW_UH_WLOCK_ASSERT(ch); ni = CHAIN_TO_NI(ch); /* Stage 1: count number of references by given rules */ for (i = 0; i < ch->n_rules - 1; i++) { rule = ch->map[i]; if (ipfw_match_range(rule, rt) == 0) continue; l = rule->cmd_len; cmd = rule->cmd; cmdlen = 0; for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); if (classify_table_opcode(cmd, &kidx, &type) != 0) continue; no = ipfw_objhash_lookup_kidx(ni, kidx); KASSERT(no != NULL, ("objhash lookup failed on index %d", kidx)); tc = (struct table_config *)no; tc->ocount++; } } /* Stage 2: verify "ownership" */ bad = 0; for (i = 0; i < ch->n_rules - 1; i++) { rule = ch->map[i]; if (ipfw_match_range(rule, rt) == 0) continue; l = rule->cmd_len; cmd = rule->cmd; cmdlen = 0; for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); if (classify_table_opcode(cmd, &kidx, &type) != 0) continue; no = ipfw_objhash_lookup_kidx(ni, kidx); KASSERT(no != NULL, ("objhash lookup failed on index %d", kidx)); tc = (struct table_config *)no; if (tc->no.refcnt != tc->ocount) { /* * Number of references differ: * Other rule(s) are holding reference to given * table, so it is not possible to change its set. * * Note that refcnt may account * references to some going-to-be-added rules. * Since we don't know their numbers (and event * if they will be added) it is perfectly OK * to return error here. */ bad = 1; break; } } if (bad != 0) break; } /* Stage 3: change set or cleanup */ for (i = 0; i < ch->n_rules - 1; i++) { rule = ch->map[i]; if (ipfw_match_range(rule, rt) == 0) continue; l = rule->cmd_len; cmd = rule->cmd; cmdlen = 0; for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); if (classify_table_opcode(cmd, &kidx, &type) != 0) continue; no = ipfw_objhash_lookup_kidx(ni, kidx); KASSERT(no != NULL, ("objhash lookup failed on index %d", kidx)); tc = (struct table_config *)no; tc->ocount = 0; if (bad != 0) continue; /* Actually change set. */ ipfw_objhash_del(ni, no); no->set = new_set; ipfw_objhash_add(ni, no); } } return (bad); } /* * Finds and bumps refcount for tables referenced by given @rule. * Auto-creates non-existing tables. * Fills in @oib array with userland/kernel indexes. * First free oidx pointer is saved back in @oib. * * Returns 0 on success. */ static int find_ref_rule_tables(struct ip_fw_chain *ch, struct ip_fw *rule, struct rule_check_info *ci, struct obj_idx **oib, struct tid_info *ti) { struct table_config *tc; struct namedobj_instance *ni; struct named_object *no; int cmdlen, error, l, numnew; uint16_t kidx; ipfw_insn *cmd; struct obj_idx *pidx, *pidx_first, *p; pidx_first = *oib; pidx = pidx_first; l = rule->cmd_len; cmd = rule->cmd; cmdlen = 0; error = 0; numnew = 0; IPFW_UH_WLOCK(ch); ni = CHAIN_TO_NI(ch); /* Increase refcount on each existing referenced table. */ for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); if (classify_table_opcode(cmd, &ti->uidx, &ti->type) != 0) continue; pidx->uidx = ti->uidx; pidx->type = ti->type; if ((tc = find_table(ni, ti)) != NULL) { if (tc->no.type != ti->type) { /* Incompatible types */ error = EINVAL; break; } /* Reference found table and save kidx */ tc->no.refcnt++; pidx->kidx = tc->no.kidx; pidx++; continue; } /* * Compability stuff for old clients: * prepare to manually create non-existing tables. */ pidx++; numnew++; } if (error != 0) { /* Unref everything we have already done */ for (p = *oib; p < pidx; p++) { if (p->kidx == 0) continue; /* Find & unref by existing idx */ no = ipfw_objhash_lookup_kidx(ni, p->kidx); KASSERT(no != NULL, ("Ref'd table %d disappeared", p->kidx)); no->refcnt--; } } IPFW_UH_WUNLOCK(ch); if (numnew == 0) { *oib = pidx; return (error); } /* * Compatibility stuff: do actual creation for non-existing, * but referenced tables. */ for (p = pidx_first; p < pidx; p++) { if (p->kidx != 0) continue; ti->uidx = p->uidx; ti->type = p->type; ti->atype = 0; error = create_table_compat(ch, ti, &kidx); if (error == 0) { p->kidx = kidx; continue; } /* Error. We have to drop references */ IPFW_UH_WLOCK(ch); for (p = pidx_first; p < pidx; p++) { if (p->kidx == 0) continue; /* Find & unref by existing idx */ no = ipfw_objhash_lookup_kidx(ni, p->kidx); KASSERT(no != NULL, ("Ref'd table %d disappeared", p->kidx)); no->refcnt--; } IPFW_UH_WUNLOCK(ch); return (error); } *oib = pidx; return (error); } /* * Remove references from every table used in @rule. */ void ipfw_unref_rule_tables(struct ip_fw_chain *chain, struct ip_fw *rule) { int cmdlen, l; ipfw_insn *cmd; struct namedobj_instance *ni; struct named_object *no; uint16_t kidx; uint8_t type; IPFW_UH_WLOCK_ASSERT(chain); ni = CHAIN_TO_NI(chain); l = rule->cmd_len; cmd = rule->cmd; cmdlen = 0; for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); if (classify_table_opcode(cmd, &kidx, &type) != 0) continue; no = ipfw_objhash_lookup_kidx(ni, kidx); KASSERT(no != NULL, ("table id %d not found", kidx)); KASSERT(no->type == type, ("wrong type %d (%d) for table id %d", no->type, type, kidx)); KASSERT(no->refcnt > 0, ("refcount for table %d is %d", kidx, no->refcnt)); no->refcnt--; } } /* * Compatibility function for old ipfw(8) binaries. * Rewrites table kernel indices with userland ones. * Convert tables matching '/^\d+$/' to their atoi() value. * Use number 65535 for other tables. * * Returns 0 on success. */ int ipfw_rewrite_table_kidx(struct ip_fw_chain *chain, struct ip_fw_rule0 *rule) { int cmdlen, error, l; ipfw_insn *cmd; uint16_t kidx, uidx; uint8_t type; struct named_object *no; struct namedobj_instance *ni; ni = CHAIN_TO_NI(chain); error = 0; l = rule->cmd_len; cmd = rule->cmd; cmdlen = 0; for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); if (classify_table_opcode(cmd, &kidx, &type) != 0) continue; if ((no = ipfw_objhash_lookup_kidx(ni, kidx)) == NULL) return (1); uidx = no->uidx; if (no->compat == 0) { /* * We are called via legacy opcode. * Save error and show table as fake number * not to make ipfw(8) hang. */ uidx = 65535; error = 2; } update_table_opcode(cmd, uidx); } return (error); } /* * Checks is opcode is referencing table of appropriate type. * Adds reference count for found table if true. * Rewrites user-supplied opcode values with kernel ones. * * Returns 0 on success and appropriate error code otherwise. */ int ipfw_rewrite_table_uidx(struct ip_fw_chain *chain, struct rule_check_info *ci) { int cmdlen, error, l; ipfw_insn *cmd; uint16_t uidx; uint8_t type; struct namedobj_instance *ni; struct obj_idx *p, *pidx_first, *pidx_last; struct tid_info ti; ni = CHAIN_TO_NI(chain); /* * Prepare an array for storing opcode indices. * Use stack allocation by default. */ if (ci->table_opcodes <= (sizeof(ci->obuf)/sizeof(ci->obuf[0]))) { /* Stack */ pidx_first = ci->obuf; } else pidx_first = malloc(ci->table_opcodes * sizeof(struct obj_idx), M_IPFW, M_WAITOK | M_ZERO); pidx_last = pidx_first; error = 0; type = 0; memset(&ti, 0, sizeof(ti)); /* * Use default set for looking up tables (old way) or * use set rule is assigned to (new way). */ ti.set = (V_fw_tables_sets != 0) ? ci->krule->set : 0; if (ci->ctlv != NULL) { ti.tlvs = (void *)(ci->ctlv + 1); ti.tlen = ci->ctlv->head.length - sizeof(ipfw_obj_ctlv); } /* Reference all used tables */ error = find_ref_rule_tables(chain, ci->krule, ci, &pidx_last, &ti); if (error != 0) goto free; IPFW_UH_WLOCK(chain); /* Perform rule rewrite */ l = ci->krule->cmd_len; cmd = ci->krule->cmd; cmdlen = 0; p = pidx_first; for ( ; l > 0 ; l -= cmdlen, cmd += cmdlen) { cmdlen = F_LEN(cmd); if (classify_table_opcode(cmd, &uidx, &type) != 0) continue; update_table_opcode(cmd, p->kidx); p++; } IPFW_UH_WUNLOCK(chain); free: if (pidx_first != ci->obuf) free(pidx_first, M_IPFW); return (error); } static struct ipfw_sopt_handler scodes[] = { { IP_FW_TABLE_XCREATE, 0, HDIR_SET, create_table }, { IP_FW_TABLE_XDESTROY, 0, HDIR_SET, flush_table_v0 }, { IP_FW_TABLE_XFLUSH, 0, HDIR_SET, flush_table_v0 }, { IP_FW_TABLE_XMODIFY, 0, HDIR_BOTH, modify_table }, { IP_FW_TABLE_XINFO, 0, HDIR_GET, describe_table }, { IP_FW_TABLES_XLIST, 0, HDIR_GET, list_tables }, { IP_FW_TABLE_XLIST, 0, HDIR_GET, dump_table_v0 }, { IP_FW_TABLE_XLIST, 1, HDIR_GET, dump_table_v1 }, { IP_FW_TABLE_XADD, 0, HDIR_BOTH, manage_table_ent_v0 }, { IP_FW_TABLE_XADD, 1, HDIR_BOTH, manage_table_ent_v1 }, { IP_FW_TABLE_XDEL, 0, HDIR_BOTH, manage_table_ent_v0 }, { IP_FW_TABLE_XDEL, 1, HDIR_BOTH, manage_table_ent_v1 }, { IP_FW_TABLE_XFIND, 0, HDIR_GET, find_table_entry }, { IP_FW_TABLE_XSWAP, 0, HDIR_SET, swap_table }, { IP_FW_TABLES_ALIST, 0, HDIR_GET, list_table_algo }, { IP_FW_TABLE_XGETSIZE, 0, HDIR_GET, get_table_size }, }; static void destroy_table_locked(struct namedobj_instance *ni, struct named_object *no, void *arg) { unlink_table((struct ip_fw_chain *)arg, (struct table_config *)no); if (ipfw_objhash_free_idx(ni, no->kidx) != 0) printf("Error unlinking kidx %d from table %s\n", no->kidx, no->name); free_table_config(ni, (struct table_config *)no); } /* * Shuts tables module down. */ void ipfw_destroy_tables(struct ip_fw_chain *ch, int last) { IPFW_DEL_SOPT_HANDLER(last, scodes); /* Remove all tables from working set */ IPFW_UH_WLOCK(ch); IPFW_WLOCK(ch); ipfw_objhash_foreach(CHAIN_TO_NI(ch), destroy_table_locked, ch); IPFW_WUNLOCK(ch); IPFW_UH_WUNLOCK(ch); /* Free pointers itself */ free(ch->tablestate, M_IPFW); ipfw_table_value_destroy(ch, last); ipfw_table_algo_destroy(ch); ipfw_objhash_destroy(CHAIN_TO_NI(ch)); free(CHAIN_TO_TCFG(ch), M_IPFW); } /* * Starts tables module. */ int ipfw_init_tables(struct ip_fw_chain *ch, int first) { struct tables_config *tcfg; /* Allocate pointers */ ch->tablestate = malloc(V_fw_tables_max * sizeof(struct table_info), M_IPFW, M_WAITOK | M_ZERO); tcfg = malloc(sizeof(struct tables_config), M_IPFW, M_WAITOK | M_ZERO); tcfg->namehash = ipfw_objhash_create(V_fw_tables_max); ch->tblcfg = tcfg; ipfw_table_value_init(ch, first); ipfw_table_algo_init(ch); IPFW_ADD_SOPT_HANDLER(first, scodes); return (0); } Index: projects/building-blocks/sys/netpfil/ipfw/ip_fw_table_algo.c =================================================================== --- projects/building-blocks/sys/netpfil/ipfw/ip_fw_table_algo.c (revision 278311) +++ projects/building-blocks/sys/netpfil/ipfw/ip_fw_table_algo.c (revision 278312) @@ -1,4081 +1,4082 @@ /*- * Copyright (c) 2014 Yandex LLC * Copyright (c) 2014 Alexander V. Chernikov * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Lookup table algorithms. * */ #include "opt_ipfw.h" #include "opt_inet.h" #ifndef INET #error IPFIREWALL requires INET. #endif /* INET */ #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include /* ip_fw.h requires IFNAMSIZ */ #include #include #include #include /* struct ipfw_rule_ref */ #include #include #include /* * IPFW table lookup algorithms. * * What is needed to add another table algo? * * Algo init: * * struct table_algo has to be filled with: * name: "type:algoname" format, e.g. "addr:radix". Currently * there are the following types: "addr", "iface", "number" and "flow". * type: one of IPFW_TABLE_* types * flags: one or more TA_FLAGS_* * ta_buf_size: size of structure used to store add/del item state. * Needs to be less than TA_BUF_SZ. * callbacks: see below for description. * * ipfw_add_table_algo / ipfw_del_table_algo has to be called * * Callbacks description: * * -init: request to initialize new table instance. * typedef int (ta_init)(struct ip_fw_chain *ch, void **ta_state, * struct table_info *ti, char *data, uint8_t tflags); * MANDATORY, unlocked. (M_WAITOK). Returns 0 on success. * * Allocate all structures needed for normal operations. * * Caller may want to parse @data for some algo-specific * options provided by userland. * * Caller may want to save configuration state pointer to @ta_state * * Caller needs to save desired runtime structure pointer(s) * inside @ti fields. Note that it is not correct to save * @ti pointer at this moment. Use -change_ti hook for that. * * Caller has to fill in ti->lookup to appropriate function * pointer. * * * * -destroy: request to destroy table instance. * typedef void (ta_destroy)(void *ta_state, struct table_info *ti); - * MANDATORY, may be locked (UH+WLOCK). (M_NOWAIT). + * MANDATORY, unlocked. (M_WAITOK). * * Frees all table entries and all tables structures allocated by -init. * * * * -prepare_add: request to allocate state for adding new entry. * typedef int (ta_prepare_add)(struct ip_fw_chain *ch, struct tentry_info *tei, * void *ta_buf); * MANDATORY, unlocked. (M_WAITOK). Returns 0 on success. * * Allocates state and fills it in with all necessary data (EXCEPT value) * from @tei to minimize operations needed to be done under WLOCK. * "value" field has to be copied to new entry in @add callback. * Buffer ta_buf of size ta->ta_buf_sz may be used to store * allocated state. * * * * -prepare_del: request to set state for deleting existing entry. * typedef int (ta_prepare_del)(struct ip_fw_chain *ch, struct tentry_info *tei, * void *ta_buf); * MANDATORY, locked, UH. (M_NOWAIT). Returns 0 on success. * * Buffer ta_buf of size ta->ta_buf_sz may be used to store * allocated state. Caller should use on-stack ta_buf allocation * instead of doing malloc(). * * * * -add: request to insert new entry into runtime/config structures. * typedef int (ta_add)(void *ta_state, struct table_info *ti, * struct tentry_info *tei, void *ta_buf, uint32_t *pnum); * MANDATORY, UH+WLOCK. (M_NOWAIT). Returns 0 on success. * * Insert new entry using previously-allocated state in @ta_buf. * * @tei may have the following flags: * TEI_FLAGS_UPDATE: request to add or update entry. * TEI_FLAGS_DONTADD: request to update (but not add) entry. * * Caller is required to do the following: * copy real entry value from @tei * entry added: return 0, set 1 to @pnum * entry updated: return 0, store 0 to @pnum, store old value in @tei, * add TEI_FLAGS_UPDATED flag to @tei. * entry exists: return EEXIST * entry not found: return ENOENT * other error: return non-zero error code. * * * * -del: request to delete existing entry from runtime/config structures. * typedef int (ta_del)(void *ta_state, struct table_info *ti, * struct tentry_info *tei, void *ta_buf, uint32_t *pnum); * MANDATORY, UH+WLOCK. (M_NOWAIT). Returns 0 on success. * * Delete entry using previously set up in @ta_buf. * * Caller is required to do the following: * entry deleted: return 0, set 1 to @pnum, store old value in @tei. * entry not found: return ENOENT * other error: return non-zero error code. * * * * -flush_entry: flush entry state created by -prepare_add / -del / others * typedef void (ta_flush_entry)(struct ip_fw_chain *ch, * struct tentry_info *tei, void *ta_buf); * MANDATORY, may be locked. (M_NOWAIT). * * Delete state allocated by: * -prepare_add (-add returned EEXIST|UPDATED) * -prepare_del (if any) * -del * * Caller is required to handle empty @ta_buf correctly. * * * -find_tentry: finds entry specified by key @tei * typedef int ta_find_tentry(void *ta_state, struct table_info *ti, * ipfw_obj_tentry *tent); * OPTIONAL, locked (UH). (M_NOWAIT). Returns 0 on success. * * Finds entry specified by given key. * * Caller is requred to do the following: * entry found: returns 0, export entry to @tent * entry not found: returns ENOENT * * * -need_modify: checks if @ti has enough space to hold another @count items. * typedef int (ta_need_modify)(void *ta_state, struct table_info *ti, * uint32_t count, uint64_t *pflags); * OPTIONAL, locked (UH). (M_NOWAIT). Returns 0 if has. * * Checks if given table has enough space to add @count items without * resize. Caller may use @pflags to store desired modification data. * * * * -prepare_mod: allocate structures for table modification. * typedef int (ta_prepare_mod)(void *ta_buf, uint64_t *pflags); * OPTIONAL(need_modify), unlocked. (M_WAITOK). Returns 0 on success. * * Allocate all needed state for table modification. Caller * should use `struct mod_item` to store new state in @ta_buf. * Up to TA_BUF_SZ (128 bytes) can be stored in @ta_buf. * * * * -fill_mod: copy some data to new state/ * typedef int (ta_fill_mod)(void *ta_state, struct table_info *ti, * void *ta_buf, uint64_t *pflags); * OPTIONAL(need_modify), locked (UH). (M_NOWAIT). Returns 0 on success. * * Copy as much data as we can to minimize changes under WLOCK. * For example, array can be merged inside this callback. * * * * -modify: perform final modification. * typedef void (ta_modify)(void *ta_state, struct table_info *ti, * void *ta_buf, uint64_t pflags); * OPTIONAL(need_modify), locked (UH+WLOCK). (M_NOWAIT). * * Performs all changes necessary to switch to new structures. * * Caller should save old pointers to @ta_buf storage. * * * * -flush_mod: flush table modification state. * typedef void (ta_flush_mod)(void *ta_buf); * OPTIONAL(need_modify), unlocked. (M_WAITOK). * * Performs flush for the following: * - prepare_mod (modification was not necessary) * - modify (for the old state) * * * * -change_gi: monitor table info pointer changes * typedef void (ta_change_ti)(void *ta_state, struct table_info *ti); * OPTIONAL, locked (UH). (M_NOWAIT). * * Called on @ti pointer changed. Called immediately after -init * to set initial state. * * * * -foreach: calls @f for each table entry * typedef void ta_foreach(void *ta_state, struct table_info *ti, * ta_foreach_f *f, void *arg); * MANDATORY, locked(UH). (M_NOWAIT). * * Runs callback with specified argument for each table entry, * Typically used for dumping table entries. * * * * -dump_tentry: dump table entry in current @tentry format. * typedef int ta_dump_tentry(void *ta_state, struct table_info *ti, void *e, * ipfw_obj_tentry *tent); * MANDATORY, locked(UH). (M_NOWAIT). Returns 0 on success. * * Dumps entry @e to @tent. * * * -print_config: prints custom algoritm options into buffer. * typedef void (ta_print_config)(void *ta_state, struct table_info *ti, * char *buf, size_t bufsize); * OPTIONAL. locked(UH). (M_NOWAIT). * * Prints custom algorithm options in the format suitable to pass * back to -init callback. * * * * -dump_tinfo: dumps algo-specific info. * typedef void ta_dump_tinfo(void *ta_state, struct table_info *ti, * ipfw_ta_tinfo *tinfo); * OPTIONAL. locked(UH). (M_NOWAIT). * * Dumps options like items size/hash size, etc. */ MALLOC_DEFINE(M_IPFW_TBL, "ipfw_tbl", "IpFw tables"); /* * Utility structures/functions common to more than one algo */ struct mod_item { void *main_ptr; size_t size; void *main_ptr6; size_t size6; }; static int badd(const void *key, void *item, void *base, size_t nmemb, size_t size, int (*compar) (const void *, const void *)); static int bdel(const void *key, void *base, size_t nmemb, size_t size, int (*compar) (const void *, const void *)); /* * ADDR implementation using radix * */ /* * The radix code expects addr and mask to be array of bytes, * with the first byte being the length of the array. rn_inithead * is called with the offset in bits of the lookup key within the * array. If we use a sockaddr_in as the underlying type, * sin_len is conveniently located at offset 0, sin_addr is at * offset 4 and normally aligned. * But for portability, let's avoid assumption and make the code explicit */ #define KEY_LEN(v) *((uint8_t *)&(v)) /* * Do not require radix to compare more than actual IPv4/IPv6 address */ #define KEY_LEN_INET (offsetof(struct sockaddr_in, sin_addr) + sizeof(in_addr_t)) #define KEY_LEN_INET6 (offsetof(struct sa_in6, sin6_addr) + sizeof(struct in6_addr)) #define OFF_LEN_INET (8 * offsetof(struct sockaddr_in, sin_addr)) #define OFF_LEN_INET6 (8 * offsetof(struct sa_in6, sin6_addr)) struct radix_addr_entry { struct radix_node rn[2]; struct sockaddr_in addr; uint32_t value; uint8_t masklen; }; struct sa_in6 { uint8_t sin6_len; uint8_t sin6_family; uint8_t pad[2]; struct in6_addr sin6_addr; }; struct radix_addr_xentry { struct radix_node rn[2]; struct sa_in6 addr6; uint32_t value; uint8_t masklen; }; struct radix_cfg { struct radix_node_head *head4; struct radix_node_head *head6; size_t count4; size_t count6; }; struct ta_buf_radix { void *ent_ptr; struct sockaddr *addr_ptr; struct sockaddr *mask_ptr; union { struct { struct sockaddr_in sa; struct sockaddr_in ma; } a4; struct { struct sa_in6 sa; struct sa_in6 ma; } a6; } addr; }; static int ta_lookup_radix(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int ta_init_radix(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags); static int flush_radix_entry(struct radix_node *rn, void *arg); static void ta_destroy_radix(void *ta_state, struct table_info *ti); static void ta_dump_radix_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo); static int ta_dump_radix_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent); static int ta_find_radix_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent); static void ta_foreach_radix(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg); static void tei_to_sockaddr_ent(struct tentry_info *tei, struct sockaddr *sa, struct sockaddr *ma, int *set_mask); static int ta_prepare_add_radix(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_add_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static int ta_prepare_del_radix(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_del_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static void ta_flush_radix_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_need_modify_radix(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags); static int ta_lookup_radix(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct radix_node_head *rnh; if (keylen == sizeof(in_addr_t)) { struct radix_addr_entry *ent; struct sockaddr_in sa; KEY_LEN(sa) = KEY_LEN_INET; sa.sin_addr.s_addr = *((in_addr_t *)key); rnh = (struct radix_node_head *)ti->state; ent = (struct radix_addr_entry *)(rnh->rnh_matchaddr(&sa, rnh)); if (ent != NULL) { *val = ent->value; return (1); } } else { struct radix_addr_xentry *xent; struct sa_in6 sa6; KEY_LEN(sa6) = KEY_LEN_INET6; memcpy(&sa6.sin6_addr, key, sizeof(struct in6_addr)); rnh = (struct radix_node_head *)ti->xstate; xent = (struct radix_addr_xentry *)(rnh->rnh_matchaddr(&sa6, rnh)); if (xent != NULL) { *val = xent->value; return (1); } } return (0); } /* * New table */ static int ta_init_radix(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags) { struct radix_cfg *cfg; if (!rn_inithead(&ti->state, OFF_LEN_INET)) return (ENOMEM); if (!rn_inithead(&ti->xstate, OFF_LEN_INET6)) { rn_detachhead(&ti->state); return (ENOMEM); } cfg = malloc(sizeof(struct radix_cfg), M_IPFW, M_WAITOK | M_ZERO); *ta_state = cfg; ti->lookup = ta_lookup_radix; return (0); } static int flush_radix_entry(struct radix_node *rn, void *arg) { struct radix_node_head * const rnh = arg; struct radix_addr_entry *ent; ent = (struct radix_addr_entry *) rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh); if (ent != NULL) free(ent, M_IPFW_TBL); return (0); } static void ta_destroy_radix(void *ta_state, struct table_info *ti) { struct radix_cfg *cfg; struct radix_node_head *rnh; cfg = (struct radix_cfg *)ta_state; rnh = (struct radix_node_head *)(ti->state); rnh->rnh_walktree(rnh, flush_radix_entry, rnh); rn_detachhead(&ti->state); rnh = (struct radix_node_head *)(ti->xstate); rnh->rnh_walktree(rnh, flush_radix_entry, rnh); rn_detachhead(&ti->xstate); free(cfg, M_IPFW); } /* * Provide algo-specific table info */ static void ta_dump_radix_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) { struct radix_cfg *cfg; cfg = (struct radix_cfg *)ta_state; tinfo->flags = IPFW_TATFLAGS_AFDATA | IPFW_TATFLAGS_AFITEM; tinfo->taclass4 = IPFW_TACLASS_RADIX; tinfo->count4 = cfg->count4; tinfo->itemsize4 = sizeof(struct radix_addr_entry); tinfo->taclass6 = IPFW_TACLASS_RADIX; tinfo->count6 = cfg->count6; tinfo->itemsize6 = sizeof(struct radix_addr_xentry); } static int ta_dump_radix_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent) { struct radix_addr_entry *n; #ifdef INET6 struct radix_addr_xentry *xn; #endif n = (struct radix_addr_entry *)e; /* Guess IPv4/IPv6 radix by sockaddr family */ if (n->addr.sin_family == AF_INET) { tent->k.addr.s_addr = n->addr.sin_addr.s_addr; tent->masklen = n->masklen; tent->subtype = AF_INET; tent->v.kidx = n->value; #ifdef INET6 } else { xn = (struct radix_addr_xentry *)e; memcpy(&tent->k, &xn->addr6.sin6_addr, sizeof(struct in6_addr)); tent->masklen = xn->masklen; tent->subtype = AF_INET6; tent->v.kidx = xn->value; #endif } return (0); } static int ta_find_radix_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent) { struct radix_node_head *rnh; void *e; e = NULL; if (tent->subtype == AF_INET) { struct sockaddr_in sa; KEY_LEN(sa) = KEY_LEN_INET; sa.sin_addr.s_addr = tent->k.addr.s_addr; rnh = (struct radix_node_head *)ti->state; e = rnh->rnh_matchaddr(&sa, rnh); } else { struct sa_in6 sa6; KEY_LEN(sa6) = KEY_LEN_INET6; memcpy(&sa6.sin6_addr, &tent->k.addr6, sizeof(struct in6_addr)); rnh = (struct radix_node_head *)ti->xstate; e = rnh->rnh_matchaddr(&sa6, rnh); } if (e != NULL) { ta_dump_radix_tentry(ta_state, ti, e, tent); return (0); } return (ENOENT); } static void ta_foreach_radix(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg) { struct radix_node_head *rnh; rnh = (struct radix_node_head *)(ti->state); rnh->rnh_walktree(rnh, (walktree_f_t *)f, arg); rnh = (struct radix_node_head *)(ti->xstate); rnh->rnh_walktree(rnh, (walktree_f_t *)f, arg); } #ifdef INET6 static inline void ipv6_writemask(struct in6_addr *addr6, uint8_t mask); static inline void ipv6_writemask(struct in6_addr *addr6, uint8_t mask) { uint32_t *cp; for (cp = (uint32_t *)addr6; mask >= 32; mask -= 32) *cp++ = 0xFFFFFFFF; *cp = htonl(mask ? ~((1 << (32 - mask)) - 1) : 0); } #endif static void tei_to_sockaddr_ent(struct tentry_info *tei, struct sockaddr *sa, struct sockaddr *ma, int *set_mask) { int mlen; #ifdef INET struct sockaddr_in *addr, *mask; #endif #ifdef INET6 struct sa_in6 *addr6, *mask6; #endif in_addr_t a4; mlen = tei->masklen; if (tei->subtype == AF_INET) { #ifdef INET addr = (struct sockaddr_in *)sa; mask = (struct sockaddr_in *)ma; /* Set 'total' structure length */ KEY_LEN(*addr) = KEY_LEN_INET; KEY_LEN(*mask) = KEY_LEN_INET; addr->sin_family = AF_INET; mask->sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0); a4 = *((in_addr_t *)tei->paddr); addr->sin_addr.s_addr = a4 & mask->sin_addr.s_addr; if (mlen != 32) *set_mask = 1; else *set_mask = 0; #endif #ifdef INET6 } else if (tei->subtype == AF_INET6) { /* IPv6 case */ addr6 = (struct sa_in6 *)sa; mask6 = (struct sa_in6 *)ma; /* Set 'total' structure length */ KEY_LEN(*addr6) = KEY_LEN_INET6; KEY_LEN(*mask6) = KEY_LEN_INET6; addr6->sin6_family = AF_INET6; ipv6_writemask(&mask6->sin6_addr, mlen); memcpy(&addr6->sin6_addr, tei->paddr, sizeof(struct in6_addr)); APPLY_MASK(&addr6->sin6_addr, &mask6->sin6_addr); if (mlen != 128) *set_mask = 1; else *set_mask = 0; #endif } } static int ta_prepare_add_radix(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_radix *tb; struct radix_addr_entry *ent; #ifdef INET6 struct radix_addr_xentry *xent; #endif struct sockaddr *addr, *mask; int mlen, set_mask; tb = (struct ta_buf_radix *)ta_buf; mlen = tei->masklen; set_mask = 0; if (tei->subtype == AF_INET) { #ifdef INET if (mlen > 32) return (EINVAL); ent = malloc(sizeof(*ent), M_IPFW_TBL, M_WAITOK | M_ZERO); ent->masklen = mlen; addr = (struct sockaddr *)&ent->addr; mask = (struct sockaddr *)&tb->addr.a4.ma; tb->ent_ptr = ent; #endif #ifdef INET6 } else if (tei->subtype == AF_INET6) { /* IPv6 case */ if (mlen > 128) return (EINVAL); xent = malloc(sizeof(*xent), M_IPFW_TBL, M_WAITOK | M_ZERO); xent->masklen = mlen; addr = (struct sockaddr *)&xent->addr6; mask = (struct sockaddr *)&tb->addr.a6.ma; tb->ent_ptr = xent; #endif } else { /* Unknown CIDR type */ return (EINVAL); } tei_to_sockaddr_ent(tei, addr, mask, &set_mask); /* Set pointers */ tb->addr_ptr = addr; if (set_mask != 0) tb->mask_ptr = mask; return (0); } static int ta_add_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct radix_cfg *cfg; struct radix_node_head *rnh; struct radix_node *rn; struct ta_buf_radix *tb; uint32_t *old_value, value; cfg = (struct radix_cfg *)ta_state; tb = (struct ta_buf_radix *)ta_buf; /* Save current entry value from @tei */ if (tei->subtype == AF_INET) { rnh = ti->state; ((struct radix_addr_entry *)tb->ent_ptr)->value = tei->value; } else { rnh = ti->xstate; ((struct radix_addr_xentry *)tb->ent_ptr)->value = tei->value; } /* Search for an entry first */ rn = rnh->rnh_lookup(tb->addr_ptr, tb->mask_ptr, rnh); if (rn != NULL) { if ((tei->flags & TEI_FLAGS_UPDATE) == 0) return (EEXIST); /* Record already exists. Update value if we're asked to */ if (tei->subtype == AF_INET) old_value = &((struct radix_addr_entry *)rn)->value; else old_value = &((struct radix_addr_xentry *)rn)->value; value = *old_value; *old_value = tei->value; tei->value = value; /* Indicate that update has happened instead of addition */ tei->flags |= TEI_FLAGS_UPDATED; *pnum = 0; return (0); } if ((tei->flags & TEI_FLAGS_DONTADD) != 0) return (EFBIG); rn = rnh->rnh_addaddr(tb->addr_ptr, tb->mask_ptr, rnh, tb->ent_ptr); if (rn == NULL) { /* Unknown error */ return (EINVAL); } if (tei->subtype == AF_INET) cfg->count4++; else cfg->count6++; tb->ent_ptr = NULL; *pnum = 1; return (0); } static int ta_prepare_del_radix(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_radix *tb; struct sockaddr *addr, *mask; int mlen, set_mask; tb = (struct ta_buf_radix *)ta_buf; mlen = tei->masklen; set_mask = 0; if (tei->subtype == AF_INET) { if (mlen > 32) return (EINVAL); addr = (struct sockaddr *)&tb->addr.a4.sa; mask = (struct sockaddr *)&tb->addr.a4.ma; #ifdef INET6 } else if (tei->subtype == AF_INET6) { if (mlen > 128) return (EINVAL); addr = (struct sockaddr *)&tb->addr.a6.sa; mask = (struct sockaddr *)&tb->addr.a6.ma; #endif } else return (EINVAL); tei_to_sockaddr_ent(tei, addr, mask, &set_mask); tb->addr_ptr = addr; if (set_mask != 0) tb->mask_ptr = mask; return (0); } static int ta_del_radix(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct radix_cfg *cfg; struct radix_node_head *rnh; struct radix_node *rn; struct ta_buf_radix *tb; cfg = (struct radix_cfg *)ta_state; tb = (struct ta_buf_radix *)ta_buf; if (tei->subtype == AF_INET) rnh = ti->state; else rnh = ti->xstate; rn = rnh->rnh_deladdr(tb->addr_ptr, tb->mask_ptr, rnh); if (rn == NULL) return (ENOENT); /* Save entry value to @tei */ if (tei->subtype == AF_INET) tei->value = ((struct radix_addr_entry *)rn)->value; else tei->value = ((struct radix_addr_xentry *)rn)->value; tb->ent_ptr = rn; if (tei->subtype == AF_INET) cfg->count4--; else cfg->count6--; *pnum = 1; return (0); } static void ta_flush_radix_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_radix *tb; tb = (struct ta_buf_radix *)ta_buf; if (tb->ent_ptr != NULL) free(tb->ent_ptr, M_IPFW_TBL); } static int ta_need_modify_radix(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags) { /* * radix does not require additional memory allocations * other than nodes itself. Adding new masks to the tree do * but we don't have any API to call (and we don't known which * sizes do we need). */ return (0); } struct table_algo addr_radix = { .name = "addr:radix", .type = IPFW_TABLE_ADDR, .flags = TA_FLAG_DEFAULT, .ta_buf_size = sizeof(struct ta_buf_radix), .init = ta_init_radix, .destroy = ta_destroy_radix, .prepare_add = ta_prepare_add_radix, .prepare_del = ta_prepare_del_radix, .add = ta_add_radix, .del = ta_del_radix, .flush_entry = ta_flush_radix_entry, .foreach = ta_foreach_radix, .dump_tentry = ta_dump_radix_tentry, .find_tentry = ta_find_radix_tentry, .dump_tinfo = ta_dump_radix_tinfo, .need_modify = ta_need_modify_radix, }; /* * addr:hash cmds * * * ti->data: * [inv.mask4][inv.mask6][log2hsize4][log2hsize6] * [ 8][ 8[ 8][ 8] * * inv.mask4: 32 - mask * inv.mask6: * 1) _slow lookup: mask * 2) _aligned: (128 - mask) / 8 * 3) _64: 8 * * * pflags: * [v4=1/v6=0][hsize] * [ 32][ 32] */ struct chashentry; SLIST_HEAD(chashbhead, chashentry); struct chash_cfg { struct chashbhead *head4; struct chashbhead *head6; size_t size4; size_t size6; size_t items4; size_t items6; uint8_t mask4; uint8_t mask6; }; struct chashentry { SLIST_ENTRY(chashentry) next; uint32_t value; uint32_t type; union { uint32_t a4; /* Host format */ struct in6_addr a6; /* Network format */ } a; }; struct ta_buf_chash { void *ent_ptr; struct chashentry ent; }; #ifdef INET static __inline uint32_t hash_ip(uint32_t addr, int hsize); #endif #ifdef INET6 static __inline uint32_t hash_ip6(struct in6_addr *addr6, int hsize); static __inline uint16_t hash_ip64(struct in6_addr *addr6, int hsize); static __inline uint32_t hash_ip6_slow(struct in6_addr *addr6, void *key, int mask, int hsize); static __inline uint32_t hash_ip6_al(struct in6_addr *addr6, void *key, int mask, int hsize); #endif static int ta_lookup_chash_slow(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int ta_lookup_chash_aligned(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int ta_lookup_chash_64(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int chash_parse_opts(struct chash_cfg *cfg, char *data); static void ta_print_chash_config(void *ta_state, struct table_info *ti, char *buf, size_t bufsize); static int ta_log2(uint32_t v); static int ta_init_chash(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags); static void ta_destroy_chash(void *ta_state, struct table_info *ti); static void ta_dump_chash_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo); static int ta_dump_chash_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent); static uint32_t hash_ent(struct chashentry *ent, int af, int mlen, uint32_t size); static int tei_to_chash_ent(struct tentry_info *tei, struct chashentry *ent); static int ta_find_chash_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent); static void ta_foreach_chash(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg); static int ta_prepare_add_chash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_add_chash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static int ta_prepare_del_chash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_del_chash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static void ta_flush_chash_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_need_modify_chash(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags); static int ta_prepare_mod_chash(void *ta_buf, uint64_t *pflags); static int ta_fill_mod_chash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags); static void ta_modify_chash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags); static void ta_flush_mod_chash(void *ta_buf); #ifdef INET static __inline uint32_t hash_ip(uint32_t addr, int hsize) { return (addr % (hsize - 1)); } #endif #ifdef INET6 static __inline uint32_t hash_ip6(struct in6_addr *addr6, int hsize) { uint32_t i; i = addr6->s6_addr32[0] ^ addr6->s6_addr32[1] ^ addr6->s6_addr32[2] ^ addr6->s6_addr32[3]; return (i % (hsize - 1)); } static __inline uint16_t hash_ip64(struct in6_addr *addr6, int hsize) { uint32_t i; i = addr6->s6_addr32[0] ^ addr6->s6_addr32[1]; return (i % (hsize - 1)); } static __inline uint32_t hash_ip6_slow(struct in6_addr *addr6, void *key, int mask, int hsize) { struct in6_addr mask6; ipv6_writemask(&mask6, mask); memcpy(addr6, key, sizeof(struct in6_addr)); APPLY_MASK(addr6, &mask6); return (hash_ip6(addr6, hsize)); } static __inline uint32_t hash_ip6_al(struct in6_addr *addr6, void *key, int mask, int hsize) { uint64_t *paddr; paddr = (uint64_t *)addr6; *paddr = 0; *(paddr + 1) = 0; memcpy(addr6, key, mask); return (hash_ip6(addr6, hsize)); } #endif static int ta_lookup_chash_slow(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct chashbhead *head; struct chashentry *ent; uint16_t hash, hsize; uint8_t imask; if (keylen == sizeof(in_addr_t)) { #ifdef INET head = (struct chashbhead *)ti->state; imask = ti->data >> 24; hsize = 1 << ((ti->data & 0xFFFF) >> 8); uint32_t a; a = ntohl(*((in_addr_t *)key)); a = a >> imask; hash = hash_ip(a, hsize); SLIST_FOREACH(ent, &head[hash], next) { if (ent->a.a4 == a) { *val = ent->value; return (1); } } #endif } else { #ifdef INET6 /* IPv6: worst scenario: non-round mask */ struct in6_addr addr6; head = (struct chashbhead *)ti->xstate; imask = (ti->data & 0xFF0000) >> 16; hsize = 1 << (ti->data & 0xFF); hash = hash_ip6_slow(&addr6, key, imask, hsize); SLIST_FOREACH(ent, &head[hash], next) { if (memcmp(&ent->a.a6, &addr6, 16) == 0) { *val = ent->value; return (1); } } #endif } return (0); } static int ta_lookup_chash_aligned(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct chashbhead *head; struct chashentry *ent; uint16_t hash, hsize; uint8_t imask; if (keylen == sizeof(in_addr_t)) { #ifdef INET head = (struct chashbhead *)ti->state; imask = ti->data >> 24; hsize = 1 << ((ti->data & 0xFFFF) >> 8); uint32_t a; a = ntohl(*((in_addr_t *)key)); a = a >> imask; hash = hash_ip(a, hsize); SLIST_FOREACH(ent, &head[hash], next) { if (ent->a.a4 == a) { *val = ent->value; return (1); } } #endif } else { #ifdef INET6 /* IPv6: aligned to 8bit mask */ struct in6_addr addr6; uint64_t *paddr, *ptmp; head = (struct chashbhead *)ti->xstate; imask = (ti->data & 0xFF0000) >> 16; hsize = 1 << (ti->data & 0xFF); hash = hash_ip6_al(&addr6, key, imask, hsize); paddr = (uint64_t *)&addr6; SLIST_FOREACH(ent, &head[hash], next) { ptmp = (uint64_t *)&ent->a.a6; if (paddr[0] == ptmp[0] && paddr[1] == ptmp[1]) { *val = ent->value; return (1); } } #endif } return (0); } static int ta_lookup_chash_64(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct chashbhead *head; struct chashentry *ent; uint16_t hash, hsize; uint8_t imask; if (keylen == sizeof(in_addr_t)) { #ifdef INET head = (struct chashbhead *)ti->state; imask = ti->data >> 24; hsize = 1 << ((ti->data & 0xFFFF) >> 8); uint32_t a; a = ntohl(*((in_addr_t *)key)); a = a >> imask; hash = hash_ip(a, hsize); SLIST_FOREACH(ent, &head[hash], next) { if (ent->a.a4 == a) { *val = ent->value; return (1); } } #endif } else { #ifdef INET6 /* IPv6: /64 */ uint64_t a6, *paddr; head = (struct chashbhead *)ti->xstate; paddr = (uint64_t *)key; hsize = 1 << (ti->data & 0xFF); a6 = *paddr; hash = hash_ip64((struct in6_addr *)key, hsize); SLIST_FOREACH(ent, &head[hash], next) { paddr = (uint64_t *)&ent->a.a6; if (a6 == *paddr) { *val = ent->value; return (1); } } #endif } return (0); } static int chash_parse_opts(struct chash_cfg *cfg, char *data) { char *pdel, *pend, *s; int mask4, mask6; mask4 = cfg->mask4; mask6 = cfg->mask6; if (data == NULL) return (0); if ((pdel = strchr(data, ' ')) == NULL) return (0); while (*pdel == ' ') pdel++; if (strncmp(pdel, "masks=", 6) != 0) return (EINVAL); if ((s = strchr(pdel, ' ')) != NULL) *s++ = '\0'; pdel += 6; /* Need /XX[,/YY] */ if (*pdel++ != '/') return (EINVAL); mask4 = strtol(pdel, &pend, 10); if (*pend == ',') { /* ,/YY */ pdel = pend + 1; if (*pdel++ != '/') return (EINVAL); mask6 = strtol(pdel, &pend, 10); if (*pend != '\0') return (EINVAL); } else if (*pend != '\0') return (EINVAL); if (mask4 < 0 || mask4 > 32 || mask6 < 0 || mask6 > 128) return (EINVAL); cfg->mask4 = mask4; cfg->mask6 = mask6; return (0); } static void ta_print_chash_config(void *ta_state, struct table_info *ti, char *buf, size_t bufsize) { struct chash_cfg *cfg; cfg = (struct chash_cfg *)ta_state; if (cfg->mask4 != 32 || cfg->mask6 != 128) snprintf(buf, bufsize, "%s masks=/%d,/%d", "addr:hash", cfg->mask4, cfg->mask6); else snprintf(buf, bufsize, "%s", "addr:hash"); } static int ta_log2(uint32_t v) { uint32_t r; r = 0; while (v >>= 1) r++; return (r); } /* * New table. * We assume 'data' to be either NULL or the following format: * 'addr:hash [masks=/32[,/128]]' */ static int ta_init_chash(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags) { int error, i; uint32_t hsize; struct chash_cfg *cfg; cfg = malloc(sizeof(struct chash_cfg), M_IPFW, M_WAITOK | M_ZERO); cfg->mask4 = 32; cfg->mask6 = 128; if ((error = chash_parse_opts(cfg, data)) != 0) { free(cfg, M_IPFW); return (error); } cfg->size4 = 128; cfg->size6 = 128; cfg->head4 = malloc(sizeof(struct chashbhead) * cfg->size4, M_IPFW, M_WAITOK | M_ZERO); cfg->head6 = malloc(sizeof(struct chashbhead) * cfg->size6, M_IPFW, M_WAITOK | M_ZERO); for (i = 0; i < cfg->size4; i++) SLIST_INIT(&cfg->head4[i]); for (i = 0; i < cfg->size6; i++) SLIST_INIT(&cfg->head6[i]); *ta_state = cfg; ti->state = cfg->head4; ti->xstate = cfg->head6; /* Store data depending on v6 mask length */ hsize = ta_log2(cfg->size4) << 8 | ta_log2(cfg->size6); if (cfg->mask6 == 64) { ti->data = (32 - cfg->mask4) << 24 | (128 - cfg->mask6) << 16| hsize; ti->lookup = ta_lookup_chash_64; } else if ((cfg->mask6 % 8) == 0) { ti->data = (32 - cfg->mask4) << 24 | cfg->mask6 << 13 | hsize; ti->lookup = ta_lookup_chash_aligned; } else { /* don't do that! */ ti->data = (32 - cfg->mask4) << 24 | cfg->mask6 << 16 | hsize; ti->lookup = ta_lookup_chash_slow; } return (0); } static void ta_destroy_chash(void *ta_state, struct table_info *ti) { struct chash_cfg *cfg; struct chashentry *ent, *ent_next; int i; cfg = (struct chash_cfg *)ta_state; for (i = 0; i < cfg->size4; i++) SLIST_FOREACH_SAFE(ent, &cfg->head4[i], next, ent_next) free(ent, M_IPFW_TBL); for (i = 0; i < cfg->size6; i++) SLIST_FOREACH_SAFE(ent, &cfg->head6[i], next, ent_next) free(ent, M_IPFW_TBL); free(cfg->head4, M_IPFW); free(cfg->head6, M_IPFW); free(cfg, M_IPFW); } static void ta_dump_chash_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) { struct chash_cfg *cfg; cfg = (struct chash_cfg *)ta_state; tinfo->flags = IPFW_TATFLAGS_AFDATA | IPFW_TATFLAGS_AFITEM; tinfo->taclass4 = IPFW_TACLASS_HASH; tinfo->size4 = cfg->size4; tinfo->count4 = cfg->items4; tinfo->itemsize4 = sizeof(struct chashentry); tinfo->taclass6 = IPFW_TACLASS_HASH; tinfo->size6 = cfg->size6; tinfo->count6 = cfg->items6; tinfo->itemsize6 = sizeof(struct chashentry); } static int ta_dump_chash_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent) { struct chash_cfg *cfg; struct chashentry *ent; cfg = (struct chash_cfg *)ta_state; ent = (struct chashentry *)e; if (ent->type == AF_INET) { tent->k.addr.s_addr = htonl(ent->a.a4 << (32 - cfg->mask4)); tent->masklen = cfg->mask4; tent->subtype = AF_INET; tent->v.kidx = ent->value; #ifdef INET6 } else { memcpy(&tent->k, &ent->a.a6, sizeof(struct in6_addr)); tent->masklen = cfg->mask6; tent->subtype = AF_INET6; tent->v.kidx = ent->value; #endif } return (0); } static uint32_t hash_ent(struct chashentry *ent, int af, int mlen, uint32_t size) { uint32_t hash; hash = 0; if (af == AF_INET) { #ifdef INET hash = hash_ip(ent->a.a4, size); #endif } else { #ifdef INET6 if (mlen == 64) hash = hash_ip64(&ent->a.a6, size); else hash = hash_ip6(&ent->a.a6, size); #endif } return (hash); } static int tei_to_chash_ent(struct tentry_info *tei, struct chashentry *ent) { int mlen; #ifdef INET6 struct in6_addr mask6; #endif mlen = tei->masklen; if (tei->subtype == AF_INET) { #ifdef INET if (mlen > 32) return (EINVAL); ent->type = AF_INET; /* Calculate masked address */ ent->a.a4 = ntohl(*((in_addr_t *)tei->paddr)) >> (32 - mlen); #endif #ifdef INET6 } else if (tei->subtype == AF_INET6) { /* IPv6 case */ if (mlen > 128) return (EINVAL); ent->type = AF_INET6; ipv6_writemask(&mask6, mlen); memcpy(&ent->a.a6, tei->paddr, sizeof(struct in6_addr)); APPLY_MASK(&ent->a.a6, &mask6); #endif } else { /* Unknown CIDR type */ return (EINVAL); } return (0); } static int ta_find_chash_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent) { struct chash_cfg *cfg; struct chashbhead *head; struct chashentry ent, *tmp; struct tentry_info tei; int error; uint32_t hash; cfg = (struct chash_cfg *)ta_state; memset(&ent, 0, sizeof(ent)); memset(&tei, 0, sizeof(tei)); if (tent->subtype == AF_INET) { tei.paddr = &tent->k.addr; tei.masklen = cfg->mask4; tei.subtype = AF_INET; if ((error = tei_to_chash_ent(&tei, &ent)) != 0) return (error); head = cfg->head4; hash = hash_ent(&ent, AF_INET, cfg->mask4, cfg->size4); /* Check for existence */ SLIST_FOREACH(tmp, &head[hash], next) { if (tmp->a.a4 != ent.a.a4) continue; ta_dump_chash_tentry(ta_state, ti, tmp, tent); return (0); } } else { tei.paddr = &tent->k.addr6; tei.masklen = cfg->mask6; tei.subtype = AF_INET6; if ((error = tei_to_chash_ent(&tei, &ent)) != 0) return (error); head = cfg->head6; hash = hash_ent(&ent, AF_INET6, cfg->mask6, cfg->size6); /* Check for existence */ SLIST_FOREACH(tmp, &head[hash], next) { if (memcmp(&tmp->a.a6, &ent.a.a6, 16) != 0) continue; ta_dump_chash_tentry(ta_state, ti, tmp, tent); return (0); } } return (ENOENT); } static void ta_foreach_chash(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg) { struct chash_cfg *cfg; struct chashentry *ent, *ent_next; int i; cfg = (struct chash_cfg *)ta_state; for (i = 0; i < cfg->size4; i++) SLIST_FOREACH_SAFE(ent, &cfg->head4[i], next, ent_next) f(ent, arg); for (i = 0; i < cfg->size6; i++) SLIST_FOREACH_SAFE(ent, &cfg->head6[i], next, ent_next) f(ent, arg); } static int ta_prepare_add_chash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_chash *tb; struct chashentry *ent; int error; tb = (struct ta_buf_chash *)ta_buf; ent = malloc(sizeof(*ent), M_IPFW_TBL, M_WAITOK | M_ZERO); error = tei_to_chash_ent(tei, ent); if (error != 0) { free(ent, M_IPFW_TBL); return (error); } tb->ent_ptr = ent; return (0); } static int ta_add_chash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct chash_cfg *cfg; struct chashbhead *head; struct chashentry *ent, *tmp; struct ta_buf_chash *tb; int exists; uint32_t hash, value; cfg = (struct chash_cfg *)ta_state; tb = (struct ta_buf_chash *)ta_buf; ent = (struct chashentry *)tb->ent_ptr; hash = 0; exists = 0; /* Read current value from @tei */ ent->value = tei->value; /* Read cuurrent value */ if (tei->subtype == AF_INET) { if (tei->masklen != cfg->mask4) return (EINVAL); head = cfg->head4; hash = hash_ent(ent, AF_INET, cfg->mask4, cfg->size4); /* Check for existence */ SLIST_FOREACH(tmp, &head[hash], next) { if (tmp->a.a4 == ent->a.a4) { exists = 1; break; } } } else { if (tei->masklen != cfg->mask6) return (EINVAL); head = cfg->head6; hash = hash_ent(ent, AF_INET6, cfg->mask6, cfg->size6); /* Check for existence */ SLIST_FOREACH(tmp, &head[hash], next) { if (memcmp(&tmp->a.a6, &ent->a.a6, 16) == 0) { exists = 1; break; } } } if (exists == 1) { if ((tei->flags & TEI_FLAGS_UPDATE) == 0) return (EEXIST); /* Record already exists. Update value if we're asked to */ value = tmp->value; tmp->value = tei->value; tei->value = value; /* Indicate that update has happened instead of addition */ tei->flags |= TEI_FLAGS_UPDATED; *pnum = 0; } else { if ((tei->flags & TEI_FLAGS_DONTADD) != 0) return (EFBIG); SLIST_INSERT_HEAD(&head[hash], ent, next); tb->ent_ptr = NULL; *pnum = 1; /* Update counters */ if (tei->subtype == AF_INET) cfg->items4++; else cfg->items6++; } return (0); } static int ta_prepare_del_chash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_chash *tb; tb = (struct ta_buf_chash *)ta_buf; return (tei_to_chash_ent(tei, &tb->ent)); } static int ta_del_chash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct chash_cfg *cfg; struct chashbhead *head; struct chashentry *tmp, *tmp_next, *ent; struct ta_buf_chash *tb; uint32_t hash; cfg = (struct chash_cfg *)ta_state; tb = (struct ta_buf_chash *)ta_buf; ent = &tb->ent; if (tei->subtype == AF_INET) { if (tei->masklen != cfg->mask4) return (EINVAL); head = cfg->head4; hash = hash_ent(ent, AF_INET, cfg->mask4, cfg->size4); SLIST_FOREACH_SAFE(tmp, &head[hash], next, tmp_next) { if (tmp->a.a4 != ent->a.a4) continue; SLIST_REMOVE(&head[hash], tmp, chashentry, next); cfg->items4--; tb->ent_ptr = tmp; tei->value = tmp->value; *pnum = 1; return (0); } } else { if (tei->masklen != cfg->mask6) return (EINVAL); head = cfg->head6; hash = hash_ent(ent, AF_INET6, cfg->mask6, cfg->size6); SLIST_FOREACH_SAFE(tmp, &head[hash], next, tmp_next) { if (memcmp(&tmp->a.a6, &ent->a.a6, 16) != 0) continue; SLIST_REMOVE(&head[hash], tmp, chashentry, next); cfg->items6--; tb->ent_ptr = tmp; tei->value = tmp->value; *pnum = 1; return (0); } } return (ENOENT); } static void ta_flush_chash_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_chash *tb; tb = (struct ta_buf_chash *)ta_buf; if (tb->ent_ptr != NULL) free(tb->ent_ptr, M_IPFW_TBL); } /* * Hash growing callbacks. */ static int ta_need_modify_chash(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags) { struct chash_cfg *cfg; uint64_t data; /* * Since we don't know exact number of IPv4/IPv6 records in @count, * ignore non-zero @count value at all. Check current hash sizes * and return appropriate data. */ cfg = (struct chash_cfg *)ta_state; data = 0; if (cfg->items4 > cfg->size4 && cfg->size4 < 65536) data |= (cfg->size4 * 2) << 16; if (cfg->items6 > cfg->size6 && cfg->size6 < 65536) data |= cfg->size6 * 2; if (data != 0) { *pflags = data; return (1); } return (0); } /* * Allocate new, larger chash. */ static int ta_prepare_mod_chash(void *ta_buf, uint64_t *pflags) { struct mod_item *mi; struct chashbhead *head; int i; mi = (struct mod_item *)ta_buf; memset(mi, 0, sizeof(struct mod_item)); mi->size = (*pflags >> 16) & 0xFFFF; mi->size6 = *pflags & 0xFFFF; if (mi->size > 0) { head = malloc(sizeof(struct chashbhead) * mi->size, M_IPFW, M_WAITOK | M_ZERO); for (i = 0; i < mi->size; i++) SLIST_INIT(&head[i]); mi->main_ptr = head; } if (mi->size6 > 0) { head = malloc(sizeof(struct chashbhead) * mi->size6, M_IPFW, M_WAITOK | M_ZERO); for (i = 0; i < mi->size6; i++) SLIST_INIT(&head[i]); mi->main_ptr6 = head; } return (0); } /* * Copy data from old runtime array to new one. */ static int ta_fill_mod_chash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags) { /* In is not possible to do rehash if we're not holidng WLOCK. */ return (0); } /* * Switch old & new arrays. */ static void ta_modify_chash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags) { struct mod_item *mi; struct chash_cfg *cfg; struct chashbhead *old_head, *new_head; struct chashentry *ent, *ent_next; int af, i, mlen; uint32_t nhash; size_t old_size, new_size; mi = (struct mod_item *)ta_buf; cfg = (struct chash_cfg *)ta_state; /* Check which hash we need to grow and do we still need that */ if (mi->size > 0 && cfg->size4 < mi->size) { new_head = (struct chashbhead *)mi->main_ptr; new_size = mi->size; old_size = cfg->size4; old_head = ti->state; mlen = cfg->mask4; af = AF_INET; for (i = 0; i < old_size; i++) { SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) { nhash = hash_ent(ent, af, mlen, new_size); SLIST_INSERT_HEAD(&new_head[nhash], ent, next); } } ti->state = new_head; cfg->head4 = new_head; cfg->size4 = mi->size; mi->main_ptr = old_head; } if (mi->size6 > 0 && cfg->size6 < mi->size6) { new_head = (struct chashbhead *)mi->main_ptr6; new_size = mi->size6; old_size = cfg->size6; old_head = ti->xstate; mlen = cfg->mask6; af = AF_INET6; for (i = 0; i < old_size; i++) { SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) { nhash = hash_ent(ent, af, mlen, new_size); SLIST_INSERT_HEAD(&new_head[nhash], ent, next); } } ti->xstate = new_head; cfg->head6 = new_head; cfg->size6 = mi->size6; mi->main_ptr6 = old_head; } /* Update lower 32 bits with new values */ ti->data &= 0xFFFFFFFF00000000; ti->data |= ta_log2(cfg->size4) << 8 | ta_log2(cfg->size6); } /* * Free unneded array. */ static void ta_flush_mod_chash(void *ta_buf) { struct mod_item *mi; mi = (struct mod_item *)ta_buf; if (mi->main_ptr != NULL) free(mi->main_ptr, M_IPFW); if (mi->main_ptr6 != NULL) free(mi->main_ptr6, M_IPFW); } struct table_algo addr_hash = { .name = "addr:hash", .type = IPFW_TABLE_ADDR, .ta_buf_size = sizeof(struct ta_buf_chash), .init = ta_init_chash, .destroy = ta_destroy_chash, .prepare_add = ta_prepare_add_chash, .prepare_del = ta_prepare_del_chash, .add = ta_add_chash, .del = ta_del_chash, .flush_entry = ta_flush_chash_entry, .foreach = ta_foreach_chash, .dump_tentry = ta_dump_chash_tentry, .find_tentry = ta_find_chash_tentry, .print_config = ta_print_chash_config, .dump_tinfo = ta_dump_chash_tinfo, .need_modify = ta_need_modify_chash, .prepare_mod = ta_prepare_mod_chash, .fill_mod = ta_fill_mod_chash, .modify = ta_modify_chash, .flush_mod = ta_flush_mod_chash, }; /* * Iface table cmds. * * Implementation: * * Runtime part: * - sorted array of "struct ifidx" pointed by ti->state. * Array is allocated with rounding up to IFIDX_CHUNK. Only existing * interfaces are stored in array, however its allocated size is * sufficient to hold all table records if needed. * - current array size is stored in ti->data * * Table data: * - "struct iftable_cfg" is allocated to store table state (ta_state). * - All table records are stored inside namedobj instance. * */ struct ifidx { uint16_t kidx; uint16_t spare; uint32_t value; }; #define DEFAULT_IFIDX_SIZE 64 struct iftable_cfg; struct ifentry { struct named_object no; struct ipfw_ifc ic; struct iftable_cfg *icfg; uint32_t value; int linked; }; struct iftable_cfg { struct namedobj_instance *ii; struct ip_fw_chain *ch; struct table_info *ti; void *main_ptr; size_t size; /* Number of items allocated in array */ size_t count; /* Number of all items */ size_t used; /* Number of items _active_ now */ }; struct ta_buf_ifidx { struct ifentry *ife; uint32_t value; }; int compare_ifidx(const void *k, const void *v); static struct ifidx * ifidx_find(struct table_info *ti, void *key); static int ta_lookup_ifidx(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int ta_init_ifidx(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags); static void ta_change_ti_ifidx(void *ta_state, struct table_info *ti); static void destroy_ifidx_locked(struct namedobj_instance *ii, struct named_object *no, void *arg); static void ta_destroy_ifidx(void *ta_state, struct table_info *ti); static void ta_dump_ifidx_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo); static int ta_prepare_add_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_add_ifidx(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static int ta_prepare_del_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_del_ifidx(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static void ta_flush_ifidx_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static void if_notifier(struct ip_fw_chain *ch, void *cbdata, uint16_t ifindex); static int ta_need_modify_ifidx(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags); static int ta_prepare_mod_ifidx(void *ta_buf, uint64_t *pflags); static int ta_fill_mod_ifidx(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags); static void ta_modify_ifidx(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags); static void ta_flush_mod_ifidx(void *ta_buf); static int ta_dump_ifidx_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent); static int ta_find_ifidx_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent); static void foreach_ifidx(struct namedobj_instance *ii, struct named_object *no, void *arg); static void ta_foreach_ifidx(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg); int compare_ifidx(const void *k, const void *v) { const struct ifidx *ifidx; uint16_t key; key = *((const uint16_t *)k); ifidx = (const struct ifidx *)v; if (key < ifidx->kidx) return (-1); else if (key > ifidx->kidx) return (1); return (0); } /* * Adds item @item with key @key into ascending-sorted array @base. * Assumes @base has enough additional storage. * * Returns 1 on success, 0 on duplicate key. */ static int badd(const void *key, void *item, void *base, size_t nmemb, size_t size, int (*compar) (const void *, const void *)) { int min, max, mid, shift, res; caddr_t paddr; if (nmemb == 0) { memcpy(base, item, size); return (1); } /* Binary search */ min = 0; max = nmemb - 1; mid = 0; while (min <= max) { mid = (min + max) / 2; res = compar(key, (const void *)((caddr_t)base + mid * size)); if (res == 0) return (0); if (res > 0) min = mid + 1; else max = mid - 1; } /* Item not found. */ res = compar(key, (const void *)((caddr_t)base + mid * size)); if (res > 0) shift = mid + 1; else shift = mid; paddr = (caddr_t)base + shift * size; if (nmemb > shift) memmove(paddr + size, paddr, (nmemb - shift) * size); memcpy(paddr, item, size); return (1); } /* * Deletes item with key @key from ascending-sorted array @base. * * Returns 1 on success, 0 for non-existent key. */ static int bdel(const void *key, void *base, size_t nmemb, size_t size, int (*compar) (const void *, const void *)) { caddr_t item; size_t sz; item = (caddr_t)bsearch(key, base, nmemb, size, compar); if (item == NULL) return (0); sz = (caddr_t)base + nmemb * size - item; if (sz > 0) memmove(item, item + size, sz); return (1); } static struct ifidx * ifidx_find(struct table_info *ti, void *key) { struct ifidx *ifi; ifi = bsearch(key, ti->state, ti->data, sizeof(struct ifidx), compare_ifidx); return (ifi); } static int ta_lookup_ifidx(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct ifidx *ifi; ifi = ifidx_find(ti, key); if (ifi != NULL) { *val = ifi->value; return (1); } return (0); } static int ta_init_ifidx(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags) { struct iftable_cfg *icfg; icfg = malloc(sizeof(struct iftable_cfg), M_IPFW, M_WAITOK | M_ZERO); icfg->ii = ipfw_objhash_create(DEFAULT_IFIDX_SIZE); icfg->size = DEFAULT_IFIDX_SIZE; icfg->main_ptr = malloc(sizeof(struct ifidx) * icfg->size, M_IPFW, M_WAITOK | M_ZERO); icfg->ch = ch; *ta_state = icfg; ti->state = icfg->main_ptr; ti->lookup = ta_lookup_ifidx; return (0); } /* * Handle tableinfo @ti pointer change (on table array resize). */ static void ta_change_ti_ifidx(void *ta_state, struct table_info *ti) { struct iftable_cfg *icfg; icfg = (struct iftable_cfg *)ta_state; icfg->ti = ti; } static void destroy_ifidx_locked(struct namedobj_instance *ii, struct named_object *no, void *arg) { struct ifentry *ife; struct ip_fw_chain *ch; ch = (struct ip_fw_chain *)arg; ife = (struct ifentry *)no; ipfw_iface_del_notify(ch, &ife->ic); + ipfw_iface_unref(ch, &ife->ic); free(ife, M_IPFW_TBL); } /* * Destroys table @ti */ static void ta_destroy_ifidx(void *ta_state, struct table_info *ti) { struct iftable_cfg *icfg; struct ip_fw_chain *ch; icfg = (struct iftable_cfg *)ta_state; ch = icfg->ch; if (icfg->main_ptr != NULL) free(icfg->main_ptr, M_IPFW); + IPFW_UH_WLOCK(ch); ipfw_objhash_foreach(icfg->ii, destroy_ifidx_locked, ch); + IPFW_UH_WUNLOCK(ch); ipfw_objhash_destroy(icfg->ii); free(icfg, M_IPFW); } /* * Provide algo-specific table info */ static void ta_dump_ifidx_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) { struct iftable_cfg *cfg; cfg = (struct iftable_cfg *)ta_state; tinfo->taclass4 = IPFW_TACLASS_ARRAY; tinfo->size4 = cfg->size; tinfo->count4 = cfg->used; tinfo->itemsize4 = sizeof(struct ifidx); } /* * Prepare state to add to the table: * allocate ifentry and reference needed interface. */ static int ta_prepare_add_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_ifidx *tb; char *ifname; struct ifentry *ife; tb = (struct ta_buf_ifidx *)ta_buf; /* Check if string is terminated */ ifname = (char *)tei->paddr; if (strnlen(ifname, IF_NAMESIZE) == IF_NAMESIZE) return (EINVAL); ife = malloc(sizeof(struct ifentry), M_IPFW_TBL, M_WAITOK | M_ZERO); ife->ic.cb = if_notifier; ife->ic.cbdata = ife; if (ipfw_iface_ref(ch, ifname, &ife->ic) != 0) { free(ife, M_IPFW_TBL); return (EINVAL); } /* Use ipfw_iface 'ifname' field as stable storage */ ife->no.name = ife->ic.iface->ifname; tb->ife = ife; return (0); } static int ta_add_ifidx(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct iftable_cfg *icfg; struct ifentry *ife, *tmp; struct ta_buf_ifidx *tb; struct ipfw_iface *iif; struct ifidx *ifi; char *ifname; uint32_t value; tb = (struct ta_buf_ifidx *)ta_buf; ifname = (char *)tei->paddr; icfg = (struct iftable_cfg *)ta_state; ife = tb->ife; ife->icfg = icfg; ife->value = tei->value; tmp = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0, ifname); if (tmp != NULL) { if ((tei->flags & TEI_FLAGS_UPDATE) == 0) return (EEXIST); /* Exchange values in @tmp and @tei */ value = tmp->value; tmp->value = tei->value; tei->value = value; iif = tmp->ic.iface; if (iif->resolved != 0) { /* We have to update runtime value, too */ ifi = ifidx_find(ti, &iif->ifindex); ifi->value = ife->value; } /* Indicate that update has happened instead of addition */ tei->flags |= TEI_FLAGS_UPDATED; *pnum = 0; return (0); } if ((tei->flags & TEI_FLAGS_DONTADD) != 0) return (EFBIG); /* Link to internal list */ ipfw_objhash_add(icfg->ii, &ife->no); /* Link notifier (possible running its callback) */ ipfw_iface_add_notify(icfg->ch, &ife->ic); icfg->count++; tb->ife = NULL; *pnum = 1; return (0); } /* * Prepare to delete key from table. * Do basic interface name checks. */ static int ta_prepare_del_ifidx(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_ifidx *tb; char *ifname; tb = (struct ta_buf_ifidx *)ta_buf; /* Check if string is terminated */ ifname = (char *)tei->paddr; if (strnlen(ifname, IF_NAMESIZE) == IF_NAMESIZE) return (EINVAL); return (0); } /* * Remove key from both configuration list and * runtime array. Removed interface notification. */ static int ta_del_ifidx(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct iftable_cfg *icfg; struct ifentry *ife; struct ta_buf_ifidx *tb; char *ifname; uint16_t ifindex; int res; tb = (struct ta_buf_ifidx *)ta_buf; ifname = (char *)tei->paddr; icfg = (struct iftable_cfg *)ta_state; ife = tb->ife; ife = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0, ifname); if (ife == NULL) return (ENOENT); if (ife->linked != 0) { /* We have to remove item from runtime */ ifindex = ife->ic.iface->ifindex; res = bdel(&ifindex, icfg->main_ptr, icfg->used, sizeof(struct ifidx), compare_ifidx); KASSERT(res == 1, ("index %d does not exist", ifindex)); icfg->used--; ti->data = icfg->used; ife->linked = 0; } /* Unlink from local list */ ipfw_objhash_del(icfg->ii, &ife->no); - /* Unlink notifier */ + /* Unlink notifier and deref */ ipfw_iface_del_notify(icfg->ch, &ife->ic); + ipfw_iface_unref(icfg->ch, &ife->ic); icfg->count--; tei->value = ife->value; tb->ife = ife; *pnum = 1; return (0); } /* * Flush deleted entry. * Drops interface reference and frees entry. */ static void ta_flush_ifidx_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_ifidx *tb; tb = (struct ta_buf_ifidx *)ta_buf; - if (tb->ife != NULL) { - /* Unlink first */ - ipfw_iface_unref(ch, &tb->ife->ic); + if (tb->ife != NULL) free(tb->ife, M_IPFW_TBL); - } } /* * Handle interface announce/withdrawal for particular table. * Every real runtime array modification happens here. */ static void if_notifier(struct ip_fw_chain *ch, void *cbdata, uint16_t ifindex) { struct ifentry *ife; struct ifidx ifi; struct iftable_cfg *icfg; struct table_info *ti; int res; ife = (struct ifentry *)cbdata; icfg = ife->icfg; ti = icfg->ti; KASSERT(ti != NULL, ("ti=NULL, check change_ti handler")); if (ife->linked == 0 && ifindex != 0) { /* Interface announce */ ifi.kidx = ifindex; ifi.spare = 0; ifi.value = ife->value; res = badd(&ifindex, &ifi, icfg->main_ptr, icfg->used, sizeof(struct ifidx), compare_ifidx); KASSERT(res == 1, ("index %d already exists", ifindex)); icfg->used++; ti->data = icfg->used; ife->linked = 1; } else if (ife->linked != 0 && ifindex == 0) { /* Interface withdrawal */ ifindex = ife->ic.iface->ifindex; res = bdel(&ifindex, icfg->main_ptr, icfg->used, sizeof(struct ifidx), compare_ifidx); KASSERT(res == 1, ("index %d does not exist", ifindex)); icfg->used--; ti->data = icfg->used; ife->linked = 0; } } /* * Table growing callbacks. */ static int ta_need_modify_ifidx(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags) { struct iftable_cfg *cfg; uint32_t size; cfg = (struct iftable_cfg *)ta_state; size = cfg->size; while (size < cfg->count + count) size *= 2; if (size != cfg->size) { *pflags = size; return (1); } return (0); } /* * Allocate ned, larger runtime ifidx array. */ static int ta_prepare_mod_ifidx(void *ta_buf, uint64_t *pflags) { struct mod_item *mi; mi = (struct mod_item *)ta_buf; memset(mi, 0, sizeof(struct mod_item)); mi->size = *pflags; mi->main_ptr = malloc(sizeof(struct ifidx) * mi->size, M_IPFW, M_WAITOK | M_ZERO); return (0); } /* * Copy data from old runtime array to new one. */ static int ta_fill_mod_ifidx(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags) { struct mod_item *mi; struct iftable_cfg *icfg; mi = (struct mod_item *)ta_buf; icfg = (struct iftable_cfg *)ta_state; /* Check if we still need to grow array */ if (icfg->size >= mi->size) { *pflags = 0; return (0); } memcpy(mi->main_ptr, icfg->main_ptr, icfg->used * sizeof(struct ifidx)); return (0); } /* * Switch old & new arrays. */ static void ta_modify_ifidx(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags) { struct mod_item *mi; struct iftable_cfg *icfg; void *old_ptr; mi = (struct mod_item *)ta_buf; icfg = (struct iftable_cfg *)ta_state; old_ptr = icfg->main_ptr; icfg->main_ptr = mi->main_ptr; icfg->size = mi->size; ti->state = icfg->main_ptr; mi->main_ptr = old_ptr; } /* * Free unneded array. */ static void ta_flush_mod_ifidx(void *ta_buf) { struct mod_item *mi; mi = (struct mod_item *)ta_buf; if (mi->main_ptr != NULL) free(mi->main_ptr, M_IPFW); } static int ta_dump_ifidx_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent) { struct ifentry *ife; ife = (struct ifentry *)e; tent->masklen = 8 * IF_NAMESIZE; memcpy(&tent->k, ife->no.name, IF_NAMESIZE); tent->v.kidx = ife->value; return (0); } static int ta_find_ifidx_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent) { struct iftable_cfg *icfg; struct ifentry *ife; char *ifname; icfg = (struct iftable_cfg *)ta_state; ifname = tent->k.iface; if (strnlen(ifname, IF_NAMESIZE) == IF_NAMESIZE) return (EINVAL); ife = (struct ifentry *)ipfw_objhash_lookup_name(icfg->ii, 0, ifname); if (ife != NULL) { ta_dump_ifidx_tentry(ta_state, ti, ife, tent); return (0); } return (ENOENT); } struct wa_ifidx { ta_foreach_f *f; void *arg; }; static void foreach_ifidx(struct namedobj_instance *ii, struct named_object *no, void *arg) { struct ifentry *ife; struct wa_ifidx *wa; ife = (struct ifentry *)no; wa = (struct wa_ifidx *)arg; wa->f(ife, wa->arg); } static void ta_foreach_ifidx(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg) { struct iftable_cfg *icfg; struct wa_ifidx wa; icfg = (struct iftable_cfg *)ta_state; wa.f = f; wa.arg = arg; ipfw_objhash_foreach(icfg->ii, foreach_ifidx, &wa); } struct table_algo iface_idx = { .name = "iface:array", .type = IPFW_TABLE_INTERFACE, .flags = TA_FLAG_DEFAULT, .ta_buf_size = sizeof(struct ta_buf_ifidx), .init = ta_init_ifidx, .destroy = ta_destroy_ifidx, .prepare_add = ta_prepare_add_ifidx, .prepare_del = ta_prepare_del_ifidx, .add = ta_add_ifidx, .del = ta_del_ifidx, .flush_entry = ta_flush_ifidx_entry, .foreach = ta_foreach_ifidx, .dump_tentry = ta_dump_ifidx_tentry, .find_tentry = ta_find_ifidx_tentry, .dump_tinfo = ta_dump_ifidx_tinfo, .need_modify = ta_need_modify_ifidx, .prepare_mod = ta_prepare_mod_ifidx, .fill_mod = ta_fill_mod_ifidx, .modify = ta_modify_ifidx, .flush_mod = ta_flush_mod_ifidx, .change_ti = ta_change_ti_ifidx, }; /* * Number array cmds. * * Implementation: * * Runtime part: * - sorted array of "struct numarray" pointed by ti->state. * Array is allocated with rounding up to NUMARRAY_CHUNK. * - current array size is stored in ti->data * */ struct numarray { uint32_t number; uint32_t value; }; struct numarray_cfg { void *main_ptr; size_t size; /* Number of items allocated in array */ size_t used; /* Number of items _active_ now */ }; struct ta_buf_numarray { struct numarray na; }; int compare_numarray(const void *k, const void *v); static struct numarray *numarray_find(struct table_info *ti, void *key); static int ta_lookup_numarray(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int ta_init_numarray(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags); static void ta_destroy_numarray(void *ta_state, struct table_info *ti); static void ta_dump_numarray_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo); static int ta_prepare_add_numarray(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_add_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static int ta_del_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static void ta_flush_numarray_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_need_modify_numarray(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags); static int ta_prepare_mod_numarray(void *ta_buf, uint64_t *pflags); static int ta_fill_mod_numarray(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags); static void ta_modify_numarray(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags); static void ta_flush_mod_numarray(void *ta_buf); static int ta_dump_numarray_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent); static int ta_find_numarray_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent); static void ta_foreach_numarray(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg); int compare_numarray(const void *k, const void *v) { const struct numarray *na; uint32_t key; key = *((const uint32_t *)k); na = (const struct numarray *)v; if (key < na->number) return (-1); else if (key > na->number) return (1); return (0); } static struct numarray * numarray_find(struct table_info *ti, void *key) { struct numarray *ri; ri = bsearch(key, ti->state, ti->data, sizeof(struct numarray), compare_ifidx); return (ri); } static int ta_lookup_numarray(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct numarray *ri; ri = numarray_find(ti, key); if (ri != NULL) { *val = ri->value; return (1); } return (0); } static int ta_init_numarray(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags) { struct numarray_cfg *cfg; cfg = malloc(sizeof(*cfg), M_IPFW, M_WAITOK | M_ZERO); cfg->size = 16; cfg->main_ptr = malloc(sizeof(struct numarray) * cfg->size, M_IPFW, M_WAITOK | M_ZERO); *ta_state = cfg; ti->state = cfg->main_ptr; ti->lookup = ta_lookup_numarray; return (0); } /* * Destroys table @ti */ static void ta_destroy_numarray(void *ta_state, struct table_info *ti) { struct numarray_cfg *cfg; cfg = (struct numarray_cfg *)ta_state; if (cfg->main_ptr != NULL) free(cfg->main_ptr, M_IPFW); free(cfg, M_IPFW); } /* * Provide algo-specific table info */ static void ta_dump_numarray_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) { struct numarray_cfg *cfg; cfg = (struct numarray_cfg *)ta_state; tinfo->taclass4 = IPFW_TACLASS_ARRAY; tinfo->size4 = cfg->size; tinfo->count4 = cfg->used; tinfo->itemsize4 = sizeof(struct numarray); } /* * Prepare for addition/deletion to an array. */ static int ta_prepare_add_numarray(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_numarray *tb; tb = (struct ta_buf_numarray *)ta_buf; tb->na.number = *((uint32_t *)tei->paddr); return (0); } static int ta_add_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct numarray_cfg *cfg; struct ta_buf_numarray *tb; struct numarray *ri; int res; uint32_t value; tb = (struct ta_buf_numarray *)ta_buf; cfg = (struct numarray_cfg *)ta_state; /* Read current value from @tei */ tb->na.value = tei->value; ri = numarray_find(ti, &tb->na.number); if (ri != NULL) { if ((tei->flags & TEI_FLAGS_UPDATE) == 0) return (EEXIST); /* Exchange values between ri and @tei */ value = ri->value; ri->value = tei->value; tei->value = value; /* Indicate that update has happened instead of addition */ tei->flags |= TEI_FLAGS_UPDATED; *pnum = 0; return (0); } if ((tei->flags & TEI_FLAGS_DONTADD) != 0) return (EFBIG); res = badd(&tb->na.number, &tb->na, cfg->main_ptr, cfg->used, sizeof(struct numarray), compare_numarray); KASSERT(res == 1, ("number %d already exists", tb->na.number)); cfg->used++; ti->data = cfg->used; *pnum = 1; return (0); } /* * Remove key from both configuration list and * runtime array. Removed interface notification. */ static int ta_del_numarray(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct numarray_cfg *cfg; struct ta_buf_numarray *tb; struct numarray *ri; int res; tb = (struct ta_buf_numarray *)ta_buf; cfg = (struct numarray_cfg *)ta_state; ri = numarray_find(ti, &tb->na.number); if (ri == NULL) return (ENOENT); tei->value = ri->value; res = bdel(&tb->na.number, cfg->main_ptr, cfg->used, sizeof(struct numarray), compare_numarray); KASSERT(res == 1, ("number %u does not exist", tb->na.number)); cfg->used--; ti->data = cfg->used; *pnum = 1; return (0); } static void ta_flush_numarray_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { /* We don't have any state, do nothing */ } /* * Table growing callbacks. */ static int ta_need_modify_numarray(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags) { struct numarray_cfg *cfg; size_t size; cfg = (struct numarray_cfg *)ta_state; size = cfg->size; while (size < cfg->used + count) size *= 2; if (size != cfg->size) { *pflags = size; return (1); } return (0); } /* * Allocate new, larger runtime array. */ static int ta_prepare_mod_numarray(void *ta_buf, uint64_t *pflags) { struct mod_item *mi; mi = (struct mod_item *)ta_buf; memset(mi, 0, sizeof(struct mod_item)); mi->size = *pflags; mi->main_ptr = malloc(sizeof(struct numarray) * mi->size, M_IPFW, M_WAITOK | M_ZERO); return (0); } /* * Copy data from old runtime array to new one. */ static int ta_fill_mod_numarray(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags) { struct mod_item *mi; struct numarray_cfg *cfg; mi = (struct mod_item *)ta_buf; cfg = (struct numarray_cfg *)ta_state; /* Check if we still need to grow array */ if (cfg->size >= mi->size) { *pflags = 0; return (0); } memcpy(mi->main_ptr, cfg->main_ptr, cfg->used * sizeof(struct numarray)); return (0); } /* * Switch old & new arrays. */ static void ta_modify_numarray(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags) { struct mod_item *mi; struct numarray_cfg *cfg; void *old_ptr; mi = (struct mod_item *)ta_buf; cfg = (struct numarray_cfg *)ta_state; old_ptr = cfg->main_ptr; cfg->main_ptr = mi->main_ptr; cfg->size = mi->size; ti->state = cfg->main_ptr; mi->main_ptr = old_ptr; } /* * Free unneded array. */ static void ta_flush_mod_numarray(void *ta_buf) { struct mod_item *mi; mi = (struct mod_item *)ta_buf; if (mi->main_ptr != NULL) free(mi->main_ptr, M_IPFW); } static int ta_dump_numarray_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent) { struct numarray *na; na = (struct numarray *)e; tent->k.key = na->number; tent->v.kidx = na->value; return (0); } static int ta_find_numarray_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent) { struct numarray_cfg *cfg; struct numarray *ri; cfg = (struct numarray_cfg *)ta_state; ri = numarray_find(ti, &tent->k.key); if (ri != NULL) { ta_dump_numarray_tentry(ta_state, ti, ri, tent); return (0); } return (ENOENT); } static void ta_foreach_numarray(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg) { struct numarray_cfg *cfg; struct numarray *array; int i; cfg = (struct numarray_cfg *)ta_state; array = cfg->main_ptr; for (i = 0; i < cfg->used; i++) f(&array[i], arg); } struct table_algo number_array = { .name = "number:array", .type = IPFW_TABLE_NUMBER, .ta_buf_size = sizeof(struct ta_buf_numarray), .init = ta_init_numarray, .destroy = ta_destroy_numarray, .prepare_add = ta_prepare_add_numarray, .prepare_del = ta_prepare_add_numarray, .add = ta_add_numarray, .del = ta_del_numarray, .flush_entry = ta_flush_numarray_entry, .foreach = ta_foreach_numarray, .dump_tentry = ta_dump_numarray_tentry, .find_tentry = ta_find_numarray_tentry, .dump_tinfo = ta_dump_numarray_tinfo, .need_modify = ta_need_modify_numarray, .prepare_mod = ta_prepare_mod_numarray, .fill_mod = ta_fill_mod_numarray, .modify = ta_modify_numarray, .flush_mod = ta_flush_mod_numarray, }; /* * flow:hash cmds * * * ti->data: * [inv.mask4][inv.mask6][log2hsize4][log2hsize6] * [ 8][ 8[ 8][ 8] * * inv.mask4: 32 - mask * inv.mask6: * 1) _slow lookup: mask * 2) _aligned: (128 - mask) / 8 * 3) _64: 8 * * * pflags: * [hsize4][hsize6] * [ 16][ 16] */ struct fhashentry; SLIST_HEAD(fhashbhead, fhashentry); struct fhashentry { SLIST_ENTRY(fhashentry) next; uint8_t af; uint8_t proto; uint16_t spare0; uint16_t dport; uint16_t sport; uint32_t value; uint32_t spare1; }; struct fhashentry4 { struct fhashentry e; struct in_addr dip; struct in_addr sip; }; struct fhashentry6 { struct fhashentry e; struct in6_addr dip6; struct in6_addr sip6; }; struct fhash_cfg { struct fhashbhead *head; size_t size; size_t items; struct fhashentry4 fe4; struct fhashentry6 fe6; }; struct ta_buf_fhash { void *ent_ptr; struct fhashentry6 fe6; }; static __inline int cmp_flow_ent(struct fhashentry *a, struct fhashentry *b, size_t sz); static __inline uint32_t hash_flow4(struct fhashentry4 *f, int hsize); static __inline uint32_t hash_flow6(struct fhashentry6 *f, int hsize); static uint32_t hash_flow_ent(struct fhashentry *ent, uint32_t size); static int ta_lookup_fhash(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int ta_init_fhash(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags); static void ta_destroy_fhash(void *ta_state, struct table_info *ti); static void ta_dump_fhash_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo); static int ta_dump_fhash_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent); static int tei_to_fhash_ent(struct tentry_info *tei, struct fhashentry *ent); static int ta_find_fhash_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent); static void ta_foreach_fhash(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg); static int ta_prepare_add_fhash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_add_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static int ta_prepare_del_fhash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_del_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum); static void ta_flush_fhash_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf); static int ta_need_modify_fhash(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags); static int ta_prepare_mod_fhash(void *ta_buf, uint64_t *pflags); static int ta_fill_mod_fhash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags); static void ta_modify_fhash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags); static void ta_flush_mod_fhash(void *ta_buf); static __inline int cmp_flow_ent(struct fhashentry *a, struct fhashentry *b, size_t sz) { uint64_t *ka, *kb; ka = (uint64_t *)(&a->next + 1); kb = (uint64_t *)(&b->next + 1); if (*ka == *kb && (memcmp(a + 1, b + 1, sz) == 0)) return (1); return (0); } static __inline uint32_t hash_flow4(struct fhashentry4 *f, int hsize) { uint32_t i; i = (f->dip.s_addr) ^ (f->sip.s_addr) ^ (f->e.dport) ^ (f->e.sport); return (i % (hsize - 1)); } static __inline uint32_t hash_flow6(struct fhashentry6 *f, int hsize) { uint32_t i; i = (f->dip6.__u6_addr.__u6_addr32[2]) ^ (f->dip6.__u6_addr.__u6_addr32[3]) ^ (f->sip6.__u6_addr.__u6_addr32[2]) ^ (f->sip6.__u6_addr.__u6_addr32[3]) ^ (f->e.dport) ^ (f->e.sport); return (i % (hsize - 1)); } static uint32_t hash_flow_ent(struct fhashentry *ent, uint32_t size) { uint32_t hash; if (ent->af == AF_INET) { hash = hash_flow4((struct fhashentry4 *)ent, size); } else { hash = hash_flow6((struct fhashentry6 *)ent, size); } return (hash); } static int ta_lookup_fhash(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct fhashbhead *head; struct fhashentry *ent; struct fhashentry4 *m4; struct ipfw_flow_id *id; uint16_t hash, hsize; id = (struct ipfw_flow_id *)key; head = (struct fhashbhead *)ti->state; hsize = ti->data; m4 = (struct fhashentry4 *)ti->xstate; if (id->addr_type == 4) { struct fhashentry4 f; /* Copy hash mask */ f = *m4; f.dip.s_addr &= id->dst_ip; f.sip.s_addr &= id->src_ip; f.e.dport &= id->dst_port; f.e.sport &= id->src_port; f.e.proto &= id->proto; hash = hash_flow4(&f, hsize); SLIST_FOREACH(ent, &head[hash], next) { if (cmp_flow_ent(ent, &f.e, 2 * 4) != 0) { *val = ent->value; return (1); } } } else if (id->addr_type == 6) { struct fhashentry6 f; uint64_t *fp, *idp; /* Copy hash mask */ f = *((struct fhashentry6 *)(m4 + 1)); /* Handle lack of __u6_addr.__u6_addr64 */ fp = (uint64_t *)&f.dip6; idp = (uint64_t *)&id->dst_ip6; /* src IPv6 is stored after dst IPv6 */ *fp++ &= *idp++; *fp++ &= *idp++; *fp++ &= *idp++; *fp &= *idp; f.e.dport &= id->dst_port; f.e.sport &= id->src_port; f.e.proto &= id->proto; hash = hash_flow6(&f, hsize); SLIST_FOREACH(ent, &head[hash], next) { if (cmp_flow_ent(ent, &f.e, 2 * 16) != 0) { *val = ent->value; return (1); } } } return (0); } /* * New table. */ static int ta_init_fhash(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags) { int i; struct fhash_cfg *cfg; struct fhashentry4 *fe4; struct fhashentry6 *fe6; cfg = malloc(sizeof(struct fhash_cfg), M_IPFW, M_WAITOK | M_ZERO); cfg->size = 512; cfg->head = malloc(sizeof(struct fhashbhead) * cfg->size, M_IPFW, M_WAITOK | M_ZERO); for (i = 0; i < cfg->size; i++) SLIST_INIT(&cfg->head[i]); /* Fill in fe masks based on @tflags */ fe4 = &cfg->fe4; fe6 = &cfg->fe6; if (tflags & IPFW_TFFLAG_SRCIP) { memset(&fe4->sip, 0xFF, sizeof(fe4->sip)); memset(&fe6->sip6, 0xFF, sizeof(fe6->sip6)); } if (tflags & IPFW_TFFLAG_DSTIP) { memset(&fe4->dip, 0xFF, sizeof(fe4->dip)); memset(&fe6->dip6, 0xFF, sizeof(fe6->dip6)); } if (tflags & IPFW_TFFLAG_SRCPORT) { memset(&fe4->e.sport, 0xFF, sizeof(fe4->e.sport)); memset(&fe6->e.sport, 0xFF, sizeof(fe6->e.sport)); } if (tflags & IPFW_TFFLAG_DSTPORT) { memset(&fe4->e.dport, 0xFF, sizeof(fe4->e.dport)); memset(&fe6->e.dport, 0xFF, sizeof(fe6->e.dport)); } if (tflags & IPFW_TFFLAG_PROTO) { memset(&fe4->e.proto, 0xFF, sizeof(fe4->e.proto)); memset(&fe6->e.proto, 0xFF, sizeof(fe6->e.proto)); } fe4->e.af = AF_INET; fe6->e.af = AF_INET6; *ta_state = cfg; ti->state = cfg->head; ti->xstate = &cfg->fe4; ti->data = cfg->size; ti->lookup = ta_lookup_fhash; return (0); } static void ta_destroy_fhash(void *ta_state, struct table_info *ti) { struct fhash_cfg *cfg; struct fhashentry *ent, *ent_next; int i; cfg = (struct fhash_cfg *)ta_state; for (i = 0; i < cfg->size; i++) SLIST_FOREACH_SAFE(ent, &cfg->head[i], next, ent_next) free(ent, M_IPFW_TBL); free(cfg->head, M_IPFW); free(cfg, M_IPFW); } /* * Provide algo-specific table info */ static void ta_dump_fhash_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) { struct fhash_cfg *cfg; cfg = (struct fhash_cfg *)ta_state; tinfo->flags = IPFW_TATFLAGS_AFITEM; tinfo->taclass4 = IPFW_TACLASS_HASH; tinfo->size4 = cfg->size; tinfo->count4 = cfg->items; tinfo->itemsize4 = sizeof(struct fhashentry4); tinfo->itemsize6 = sizeof(struct fhashentry6); } static int ta_dump_fhash_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent) { struct fhash_cfg *cfg; struct fhashentry *ent; struct fhashentry4 *fe4; #ifdef INET6 struct fhashentry6 *fe6; #endif struct tflow_entry *tfe; cfg = (struct fhash_cfg *)ta_state; ent = (struct fhashentry *)e; tfe = &tent->k.flow; tfe->af = ent->af; tfe->proto = ent->proto; tfe->dport = htons(ent->dport); tfe->sport = htons(ent->sport); tent->v.kidx = ent->value; tent->subtype = ent->af; if (ent->af == AF_INET) { fe4 = (struct fhashentry4 *)ent; tfe->a.a4.sip.s_addr = htonl(fe4->sip.s_addr); tfe->a.a4.dip.s_addr = htonl(fe4->dip.s_addr); tent->masklen = 32; #ifdef INET6 } else { fe6 = (struct fhashentry6 *)ent; tfe->a.a6.sip6 = fe6->sip6; tfe->a.a6.dip6 = fe6->dip6; tent->masklen = 128; #endif } return (0); } static int tei_to_fhash_ent(struct tentry_info *tei, struct fhashentry *ent) { #ifdef INET struct fhashentry4 *fe4; #endif #ifdef INET6 struct fhashentry6 *fe6; #endif struct tflow_entry *tfe; tfe = (struct tflow_entry *)tei->paddr; ent->af = tei->subtype; ent->proto = tfe->proto; ent->dport = ntohs(tfe->dport); ent->sport = ntohs(tfe->sport); if (tei->subtype == AF_INET) { #ifdef INET fe4 = (struct fhashentry4 *)ent; fe4->sip.s_addr = ntohl(tfe->a.a4.sip.s_addr); fe4->dip.s_addr = ntohl(tfe->a.a4.dip.s_addr); #endif #ifdef INET6 } else if (tei->subtype == AF_INET6) { fe6 = (struct fhashentry6 *)ent; fe6->sip6 = tfe->a.a6.sip6; fe6->dip6 = tfe->a.a6.dip6; #endif } else { /* Unknown CIDR type */ return (EINVAL); } return (0); } static int ta_find_fhash_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent) { struct fhash_cfg *cfg; struct fhashbhead *head; struct fhashentry *ent, *tmp; struct fhashentry6 fe6; struct tentry_info tei; int error; uint32_t hash; size_t sz; cfg = (struct fhash_cfg *)ta_state; ent = &fe6.e; memset(&fe6, 0, sizeof(fe6)); memset(&tei, 0, sizeof(tei)); tei.paddr = &tent->k.flow; tei.subtype = tent->subtype; if ((error = tei_to_fhash_ent(&tei, ent)) != 0) return (error); head = cfg->head; hash = hash_flow_ent(ent, cfg->size); if (tei.subtype == AF_INET) sz = 2 * sizeof(struct in_addr); else sz = 2 * sizeof(struct in6_addr); /* Check for existence */ SLIST_FOREACH(tmp, &head[hash], next) { if (cmp_flow_ent(tmp, ent, sz) != 0) { ta_dump_fhash_tentry(ta_state, ti, tmp, tent); return (0); } } return (ENOENT); } static void ta_foreach_fhash(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg) { struct fhash_cfg *cfg; struct fhashentry *ent, *ent_next; int i; cfg = (struct fhash_cfg *)ta_state; for (i = 0; i < cfg->size; i++) SLIST_FOREACH_SAFE(ent, &cfg->head[i], next, ent_next) f(ent, arg); } static int ta_prepare_add_fhash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_fhash *tb; struct fhashentry *ent; size_t sz; int error; tb = (struct ta_buf_fhash *)ta_buf; if (tei->subtype == AF_INET) sz = sizeof(struct fhashentry4); else if (tei->subtype == AF_INET6) sz = sizeof(struct fhashentry6); else return (EINVAL); ent = malloc(sz, M_IPFW_TBL, M_WAITOK | M_ZERO); error = tei_to_fhash_ent(tei, ent); if (error != 0) { free(ent, M_IPFW_TBL); return (error); } tb->ent_ptr = ent; return (0); } static int ta_add_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct fhash_cfg *cfg; struct fhashbhead *head; struct fhashentry *ent, *tmp; struct ta_buf_fhash *tb; int exists; uint32_t hash, value; size_t sz; cfg = (struct fhash_cfg *)ta_state; tb = (struct ta_buf_fhash *)ta_buf; ent = (struct fhashentry *)tb->ent_ptr; exists = 0; /* Read current value from @tei */ ent->value = tei->value; head = cfg->head; hash = hash_flow_ent(ent, cfg->size); if (tei->subtype == AF_INET) sz = 2 * sizeof(struct in_addr); else sz = 2 * sizeof(struct in6_addr); /* Check for existence */ SLIST_FOREACH(tmp, &head[hash], next) { if (cmp_flow_ent(tmp, ent, sz) != 0) { exists = 1; break; } } if (exists == 1) { if ((tei->flags & TEI_FLAGS_UPDATE) == 0) return (EEXIST); /* Record already exists. Update value if we're asked to */ /* Exchange values between tmp and @tei */ value = tmp->value; tmp->value = tei->value; tei->value = value; /* Indicate that update has happened instead of addition */ tei->flags |= TEI_FLAGS_UPDATED; *pnum = 0; } else { if ((tei->flags & TEI_FLAGS_DONTADD) != 0) return (EFBIG); SLIST_INSERT_HEAD(&head[hash], ent, next); tb->ent_ptr = NULL; *pnum = 1; /* Update counters and check if we need to grow hash */ cfg->items++; } return (0); } static int ta_prepare_del_fhash(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_fhash *tb; tb = (struct ta_buf_fhash *)ta_buf; return (tei_to_fhash_ent(tei, &tb->fe6.e)); } static int ta_del_fhash(void *ta_state, struct table_info *ti, struct tentry_info *tei, void *ta_buf, uint32_t *pnum) { struct fhash_cfg *cfg; struct fhashbhead *head; struct fhashentry *ent, *tmp; struct ta_buf_fhash *tb; uint32_t hash; size_t sz; cfg = (struct fhash_cfg *)ta_state; tb = (struct ta_buf_fhash *)ta_buf; ent = &tb->fe6.e; head = cfg->head; hash = hash_flow_ent(ent, cfg->size); if (tei->subtype == AF_INET) sz = 2 * sizeof(struct in_addr); else sz = 2 * sizeof(struct in6_addr); /* Check for existence */ SLIST_FOREACH(tmp, &head[hash], next) { if (cmp_flow_ent(tmp, ent, sz) == 0) continue; SLIST_REMOVE(&head[hash], tmp, fhashentry, next); tei->value = tmp->value; *pnum = 1; cfg->items--; tb->ent_ptr = tmp; return (0); } return (ENOENT); } static void ta_flush_fhash_entry(struct ip_fw_chain *ch, struct tentry_info *tei, void *ta_buf) { struct ta_buf_fhash *tb; tb = (struct ta_buf_fhash *)ta_buf; if (tb->ent_ptr != NULL) free(tb->ent_ptr, M_IPFW_TBL); } /* * Hash growing callbacks. */ static int ta_need_modify_fhash(void *ta_state, struct table_info *ti, uint32_t count, uint64_t *pflags) { struct fhash_cfg *cfg; cfg = (struct fhash_cfg *)ta_state; if (cfg->items > cfg->size && cfg->size < 65536) { *pflags = cfg->size * 2; return (1); } return (0); } /* * Allocate new, larger fhash. */ static int ta_prepare_mod_fhash(void *ta_buf, uint64_t *pflags) { struct mod_item *mi; struct fhashbhead *head; int i; mi = (struct mod_item *)ta_buf; memset(mi, 0, sizeof(struct mod_item)); mi->size = *pflags; head = malloc(sizeof(struct fhashbhead) * mi->size, M_IPFW, M_WAITOK | M_ZERO); for (i = 0; i < mi->size; i++) SLIST_INIT(&head[i]); mi->main_ptr = head; return (0); } /* * Copy data from old runtime array to new one. */ static int ta_fill_mod_fhash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t *pflags) { /* In is not possible to do rehash if we're not holidng WLOCK. */ return (0); } /* * Switch old & new arrays. */ static void ta_modify_fhash(void *ta_state, struct table_info *ti, void *ta_buf, uint64_t pflags) { struct mod_item *mi; struct fhash_cfg *cfg; struct fhashbhead *old_head, *new_head; struct fhashentry *ent, *ent_next; int i; uint32_t nhash; size_t old_size; mi = (struct mod_item *)ta_buf; cfg = (struct fhash_cfg *)ta_state; old_size = cfg->size; old_head = ti->state; new_head = (struct fhashbhead *)mi->main_ptr; for (i = 0; i < old_size; i++) { SLIST_FOREACH_SAFE(ent, &old_head[i], next, ent_next) { nhash = hash_flow_ent(ent, mi->size); SLIST_INSERT_HEAD(&new_head[nhash], ent, next); } } ti->state = new_head; ti->data = mi->size; cfg->head = new_head; cfg->size = mi->size; mi->main_ptr = old_head; } /* * Free unneded array. */ static void ta_flush_mod_fhash(void *ta_buf) { struct mod_item *mi; mi = (struct mod_item *)ta_buf; if (mi->main_ptr != NULL) free(mi->main_ptr, M_IPFW); } struct table_algo flow_hash = { .name = "flow:hash", .type = IPFW_TABLE_FLOW, .flags = TA_FLAG_DEFAULT, .ta_buf_size = sizeof(struct ta_buf_fhash), .init = ta_init_fhash, .destroy = ta_destroy_fhash, .prepare_add = ta_prepare_add_fhash, .prepare_del = ta_prepare_del_fhash, .add = ta_add_fhash, .del = ta_del_fhash, .flush_entry = ta_flush_fhash_entry, .foreach = ta_foreach_fhash, .dump_tentry = ta_dump_fhash_tentry, .find_tentry = ta_find_fhash_tentry, .dump_tinfo = ta_dump_fhash_tinfo, .need_modify = ta_need_modify_fhash, .prepare_mod = ta_prepare_mod_fhash, .fill_mod = ta_fill_mod_fhash, .modify = ta_modify_fhash, .flush_mod = ta_flush_mod_fhash, }; /* * Kernel fibs bindings. * * Implementation: * * Runtime part: * - fully relies on route API * - fib number is stored in ti->data * */ static struct rtentry *lookup_kfib(void *key, int keylen, int fib); static int ta_lookup_kfib(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val); static int kfib_parse_opts(int *pfib, char *data); static void ta_print_kfib_config(void *ta_state, struct table_info *ti, char *buf, size_t bufsize); static int ta_init_kfib(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags); static void ta_destroy_kfib(void *ta_state, struct table_info *ti); static void ta_dump_kfib_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo); static int contigmask(uint8_t *p, int len); static int ta_dump_kfib_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent); static int ta_find_kfib_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent); static void ta_foreach_kfib(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg); static struct rtentry * lookup_kfib(void *key, int keylen, int fib) { struct sockaddr *s; if (keylen == 4) { struct sockaddr_in sin; bzero(&sin, sizeof(sin)); sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_addr.s_addr = *(in_addr_t *)key; s = (struct sockaddr *)&sin; } else { struct sockaddr_in6 sin6; bzero(&sin6, sizeof(sin6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_addr = *(struct in6_addr *)key; s = (struct sockaddr *)&sin6; } return (rtalloc1_fib(s, 0, 0, fib)); } static int ta_lookup_kfib(struct table_info *ti, void *key, uint32_t keylen, uint32_t *val) { struct rtentry *rte; if ((rte = lookup_kfib(key, keylen, ti->data)) == NULL) return (0); *val = 0; RTFREE_LOCKED(rte); return (1); } /* Parse 'fib=%d' */ static int kfib_parse_opts(int *pfib, char *data) { char *pdel, *pend, *s; int fibnum; if (data == NULL) return (0); if ((pdel = strchr(data, ' ')) == NULL) return (0); while (*pdel == ' ') pdel++; if (strncmp(pdel, "fib=", 4) != 0) return (EINVAL); if ((s = strchr(pdel, ' ')) != NULL) *s++ = '\0'; pdel += 4; /* Need \d+ */ fibnum = strtol(pdel, &pend, 10); if (*pend != '\0') return (EINVAL); *pfib = fibnum; return (0); } static void ta_print_kfib_config(void *ta_state, struct table_info *ti, char *buf, size_t bufsize) { if (ti->data != 0) snprintf(buf, bufsize, "%s fib=%lu", "addr:kfib", ti->data); else snprintf(buf, bufsize, "%s", "addr:kfib"); } static int ta_init_kfib(struct ip_fw_chain *ch, void **ta_state, struct table_info *ti, char *data, uint8_t tflags) { int error, fibnum; fibnum = 0; if ((error = kfib_parse_opts(&fibnum, data)) != 0) return (error); if (fibnum >= rt_numfibs) return (E2BIG); ti->data = fibnum; ti->lookup = ta_lookup_kfib; return (0); } /* * Destroys table @ti */ static void ta_destroy_kfib(void *ta_state, struct table_info *ti) { } /* * Provide algo-specific table info */ static void ta_dump_kfib_tinfo(void *ta_state, struct table_info *ti, ipfw_ta_tinfo *tinfo) { tinfo->flags = IPFW_TATFLAGS_AFDATA; tinfo->taclass4 = IPFW_TACLASS_RADIX; tinfo->count4 = 0; tinfo->itemsize4 = sizeof(struct rtentry); tinfo->taclass6 = IPFW_TACLASS_RADIX; tinfo->count6 = 0; tinfo->itemsize6 = sizeof(struct rtentry); } static int contigmask(uint8_t *p, int len) { int i, n; for (i = 0; i < len ; i++) if ( (p[i/8] & (1 << (7 - (i%8)))) == 0) /* first bit unset */ break; for (n= i + 1; n < len; n++) if ( (p[n/8] & (1 << (7 - (n % 8)))) != 0) return (-1); /* mask not contiguous */ return (i); } static int ta_dump_kfib_tentry(void *ta_state, struct table_info *ti, void *e, ipfw_obj_tentry *tent) { struct rtentry *rte; #ifdef INET struct sockaddr_in *addr, *mask; #endif #ifdef INET6 struct sockaddr_in6 *addr6, *mask6; #endif int len; rte = (struct rtentry *)e; addr = (struct sockaddr_in *)rt_key(rte); mask = (struct sockaddr_in *)rt_mask(rte); len = 0; /* Guess IPv4/IPv6 radix by sockaddr family */ #ifdef INET if (addr->sin_family == AF_INET) { tent->k.addr.s_addr = addr->sin_addr.s_addr; len = 32; if (mask != NULL) len = contigmask((uint8_t *)&mask->sin_addr, 32); if (len == -1) len = 0; tent->masklen = len; tent->subtype = AF_INET; tent->v.kidx = 0; /* Do we need to put GW here? */ } #endif #ifdef INET6 if (addr->sin_family == AF_INET6) { addr6 = (struct sockaddr_in6 *)addr; mask6 = (struct sockaddr_in6 *)mask; memcpy(&tent->k, &addr6->sin6_addr, sizeof(struct in6_addr)); len = 128; if (mask6 != NULL) len = contigmask((uint8_t *)&mask6->sin6_addr, 128); if (len == -1) len = 0; tent->masklen = len; tent->subtype = AF_INET6; tent->v.kidx = 0; } #endif return (0); } static int ta_find_kfib_tentry(void *ta_state, struct table_info *ti, ipfw_obj_tentry *tent) { struct rtentry *rte; void *key; int keylen; if (tent->subtype == AF_INET) { key = &tent->k.addr; keylen = sizeof(struct in_addr); } else { key = &tent->k.addr6; keylen = sizeof(struct in6_addr); } if ((rte = lookup_kfib(key, keylen, ti->data)) == NULL) return (0); if (rte != NULL) { ta_dump_kfib_tentry(ta_state, ti, rte, tent); RTFREE_LOCKED(rte); return (0); } return (ENOENT); } static void ta_foreach_kfib(void *ta_state, struct table_info *ti, ta_foreach_f *f, void *arg) { struct radix_node_head *rnh; int error; rnh = rt_tables_get_rnh(ti->data, AF_INET); if (rnh != NULL) { RADIX_NODE_HEAD_RLOCK(rnh); error = rnh->rnh_walktree(rnh, (walktree_f_t *)f, arg); RADIX_NODE_HEAD_RUNLOCK(rnh); } rnh = rt_tables_get_rnh(ti->data, AF_INET6); if (rnh != NULL) { RADIX_NODE_HEAD_RLOCK(rnh); error = rnh->rnh_walktree(rnh, (walktree_f_t *)f, arg); RADIX_NODE_HEAD_RUNLOCK(rnh); } } struct table_algo addr_kfib = { .name = "addr:kfib", .type = IPFW_TABLE_ADDR, .flags = TA_FLAG_READONLY, .ta_buf_size = 0, .init = ta_init_kfib, .destroy = ta_destroy_kfib, .foreach = ta_foreach_kfib, .dump_tentry = ta_dump_kfib_tentry, .find_tentry = ta_find_kfib_tentry, .dump_tinfo = ta_dump_kfib_tinfo, .print_config = ta_print_kfib_config, }; void ipfw_table_algo_init(struct ip_fw_chain *ch) { size_t sz; /* * Register all algorithms presented here. */ sz = sizeof(struct table_algo); ipfw_add_table_algo(ch, &addr_radix, sz, &addr_radix.idx); ipfw_add_table_algo(ch, &addr_hash, sz, &addr_hash.idx); ipfw_add_table_algo(ch, &iface_idx, sz, &iface_idx.idx); ipfw_add_table_algo(ch, &number_array, sz, &number_array.idx); ipfw_add_table_algo(ch, &flow_hash, sz, &flow_hash.idx); ipfw_add_table_algo(ch, &addr_kfib, sz, &addr_kfib.idx); } void ipfw_table_algo_destroy(struct ip_fw_chain *ch) { ipfw_del_table_algo(ch, addr_radix.idx); ipfw_del_table_algo(ch, addr_hash.idx); ipfw_del_table_algo(ch, iface_idx.idx); ipfw_del_table_algo(ch, number_array.idx); ipfw_del_table_algo(ch, flow_hash.idx); ipfw_del_table_algo(ch, addr_kfib.idx); } Index: projects/building-blocks/sys/sys/bus.h =================================================================== --- projects/building-blocks/sys/sys/bus.h (revision 278311) +++ projects/building-blocks/sys/sys/bus.h (revision 278312) @@ -1,803 +1,814 @@ /*- * Copyright (c) 1997,1998,2003 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_BUS_H_ #define _SYS_BUS_H_ #include #include /** * @defgroup NEWBUS newbus - a generic framework for managing devices * @{ */ /** * @brief Interface information structure. */ struct u_businfo { int ub_version; /**< @brief interface version */ #define BUS_USER_VERSION 1 int ub_generation; /**< @brief generation count */ }; /** * @brief State of the device. */ typedef enum device_state { DS_NOTPRESENT = 10, /**< @brief not probed or probe failed */ DS_ALIVE = 20, /**< @brief probe succeeded */ DS_ATTACHING = 25, /**< @brief currently attaching */ DS_ATTACHED = 30, /**< @brief attach method called */ DS_BUSY = 40 /**< @brief device is open */ } device_state_t; /** * @brief Device information exported to userspace. */ struct u_device { uintptr_t dv_handle; uintptr_t dv_parent; char dv_name[32]; /**< @brief Name of device in tree. */ char dv_desc[32]; /**< @brief Driver description */ char dv_drivername[32]; /**< @brief Driver name */ char dv_pnpinfo[128]; /**< @brief Plug and play info */ char dv_location[128]; /**< @brief Where is the device? */ uint32_t dv_devflags; /**< @brief API Flags for device */ - uint16_t dv_flags; /**< @brief flags for dev date */ + uint16_t dv_flags; /**< @brief flags for dev state */ device_state_t dv_state; /**< @brief State of attachment */ /* XXX more driver info? */ }; + +/* Flags exported via dv_flags. */ +#define DF_ENABLED 0x01 /* device should be probed/attached */ +#define DF_FIXEDCLASS 0x02 /* devclass specified at create time */ +#define DF_WILDCARD 0x04 /* unit was originally wildcard */ +#define DF_DESCMALLOCED 0x08 /* description was malloced */ +#define DF_QUIET 0x10 /* don't print verbose attach message */ +#define DF_DONENOMATCH 0x20 /* don't execute DEVICE_NOMATCH again */ +#define DF_EXTERNALSOFTC 0x40 /* softc not allocated by us */ +#define DF_REBID 0x80 /* Can rebid after attach */ +#define DF_SUSPENDED 0x100 /* Device is suspended. */ #ifdef _KERNEL #include #include /** * devctl hooks. Typically one should use the devctl_notify * hook to send the message. However, devctl_queue_data is also * included in case devctl_notify isn't sufficiently general. */ boolean_t devctl_process_running(void); void devctl_notify_f(const char *__system, const char *__subsystem, const char *__type, const char *__data, int __flags); void devctl_notify(const char *__system, const char *__subsystem, const char *__type, const char *__data); void devctl_queue_data_f(char *__data, int __flags); void devctl_queue_data(char *__data); /** * @brief A device driver (included mainly for compatibility with * FreeBSD 4.x). */ typedef struct kobj_class driver_t; /** * @brief A device class * * The devclass object has two main functions in the system. The first * is to manage the allocation of unit numbers for device instances * and the second is to hold the list of device drivers for a * particular bus type. Each devclass has a name and there cannot be * two devclasses with the same name. This ensures that unique unit * numbers are allocated to device instances. * * Drivers that support several different bus attachments (e.g. isa, * pci, pccard) should all use the same devclass to ensure that unit * numbers do not conflict. * * Each devclass may also have a parent devclass. This is used when * searching for device drivers to allow a form of inheritance. When * matching drivers with devices, first the driver list of the parent * device's devclass is searched. If no driver is found in that list, * the search continues in the parent devclass (if any). */ typedef struct devclass *devclass_t; /** * @brief A device method */ #define device_method_t kobj_method_t /** * @brief Driver interrupt filter return values * * If a driver provides an interrupt filter routine it must return an * integer consisting of oring together zero or more of the following * flags: * * FILTER_STRAY - this device did not trigger the interrupt * FILTER_HANDLED - the interrupt has been fully handled and can be EOId * FILTER_SCHEDULE_THREAD - the threaded interrupt handler should be * scheduled to execute * * If the driver does not provide a filter, then the interrupt code will * act is if the filter had returned FILTER_SCHEDULE_THREAD. Note that it * is illegal to specify any other flag with FILTER_STRAY and that it is * illegal to not specify either of FILTER_HANDLED or FILTER_SCHEDULE_THREAD * if FILTER_STRAY is not specified. */ #define FILTER_STRAY 0x01 #define FILTER_HANDLED 0x02 #define FILTER_SCHEDULE_THREAD 0x04 /** * @brief Driver interrupt service routines * * The filter routine is run in primary interrupt context and may not * block or use regular mutexes. It may only use spin mutexes for * synchronization. The filter may either completely handle the * interrupt or it may perform some of the work and defer more * expensive work to the regular interrupt handler. If a filter * routine is not registered by the driver, then the regular interrupt * handler is always used to handle interrupts from this device. * * The regular interrupt handler executes in its own thread context * and may use regular mutexes. However, it is prohibited from * sleeping on a sleep queue. */ typedef int driver_filter_t(void*); typedef void driver_intr_t(void*); /** * @brief Interrupt type bits. * * These flags are used both by newbus interrupt * registration (nexus.c) and also in struct intrec, which defines * interrupt properties. * * XXX We should probably revisit this and remove the vestiges of the * spls implicit in names like INTR_TYPE_TTY. In the meantime, don't * confuse things by renaming them (Grog, 18 July 2000). * * Buses which do interrupt remapping will want to change their type * to reflect what sort of devices are underneath. */ enum intr_type { INTR_TYPE_TTY = 1, INTR_TYPE_BIO = 2, INTR_TYPE_NET = 4, INTR_TYPE_CAM = 8, INTR_TYPE_MISC = 16, INTR_TYPE_CLK = 32, INTR_TYPE_AV = 64, INTR_EXCL = 256, /* exclusive interrupt */ INTR_MPSAFE = 512, /* this interrupt is SMP safe */ INTR_ENTROPY = 1024, /* this interrupt provides entropy */ INTR_MD1 = 4096, /* flag reserved for MD use */ INTR_MD2 = 8192, /* flag reserved for MD use */ INTR_MD3 = 16384, /* flag reserved for MD use */ INTR_MD4 = 32768 /* flag reserved for MD use */ }; enum intr_trigger { INTR_TRIGGER_CONFORM = 0, INTR_TRIGGER_EDGE = 1, INTR_TRIGGER_LEVEL = 2 }; enum intr_polarity { INTR_POLARITY_CONFORM = 0, INTR_POLARITY_HIGH = 1, INTR_POLARITY_LOW = 2 }; typedef int (*devop_t)(void); /** * @brief This structure is deprecated. * * Use the kobj(9) macro DEFINE_CLASS to * declare classes which implement device drivers. */ struct driver { KOBJ_CLASS_FIELDS; }; /* * Definitions for drivers which need to keep simple lists of resources * for their child devices. */ struct resource; /** * @brief An entry for a single resource in a resource list. */ struct resource_list_entry { STAILQ_ENTRY(resource_list_entry) link; int type; /**< @brief type argument to alloc_resource */ int rid; /**< @brief resource identifier */ int flags; /**< @brief resource flags */ struct resource *res; /**< @brief the real resource when allocated */ u_long start; /**< @brief start of resource range */ u_long end; /**< @brief end of resource range */ u_long count; /**< @brief count within range */ }; STAILQ_HEAD(resource_list, resource_list_entry); #define RLE_RESERVED 0x0001 /* Reserved by the parent bus. */ #define RLE_ALLOCATED 0x0002 /* Reserved resource is allocated. */ #define RLE_PREFETCH 0x0004 /* Resource is a prefetch range. */ void resource_list_init(struct resource_list *rl); void resource_list_free(struct resource_list *rl); struct resource_list_entry * resource_list_add(struct resource_list *rl, int type, int rid, u_long start, u_long end, u_long count); int resource_list_add_next(struct resource_list *rl, int type, u_long start, u_long end, u_long count); int resource_list_busy(struct resource_list *rl, int type, int rid); int resource_list_reserved(struct resource_list *rl, int type, int rid); struct resource_list_entry* resource_list_find(struct resource_list *rl, int type, int rid); void resource_list_delete(struct resource_list *rl, int type, int rid); struct resource * resource_list_alloc(struct resource_list *rl, device_t bus, device_t child, int type, int *rid, u_long start, u_long end, u_long count, u_int flags); int resource_list_release(struct resource_list *rl, device_t bus, device_t child, int type, int rid, struct resource *res); int resource_list_release_active(struct resource_list *rl, device_t bus, device_t child, int type); struct resource * resource_list_reserve(struct resource_list *rl, device_t bus, device_t child, int type, int *rid, u_long start, u_long end, u_long count, u_int flags); int resource_list_unreserve(struct resource_list *rl, device_t bus, device_t child, int type, int rid); void resource_list_purge(struct resource_list *rl); int resource_list_print_type(struct resource_list *rl, const char *name, int type, const char *format); /* * The root bus, to which all top-level busses are attached. */ extern device_t root_bus; extern devclass_t root_devclass; void root_bus_configure(void); /* * Useful functions for implementing busses. */ int bus_generic_activate_resource(device_t dev, device_t child, int type, int rid, struct resource *r); device_t bus_generic_add_child(device_t dev, u_int order, const char *name, int unit); int bus_generic_adjust_resource(device_t bus, device_t child, int type, struct resource *r, u_long start, u_long end); struct resource * bus_generic_alloc_resource(device_t bus, device_t child, int type, int *rid, u_long start, u_long end, u_long count, u_int flags); int bus_generic_attach(device_t dev); int bus_generic_bind_intr(device_t dev, device_t child, struct resource *irq, int cpu); int bus_generic_child_present(device_t dev, device_t child); int bus_generic_config_intr(device_t, int, enum intr_trigger, enum intr_polarity); int bus_generic_describe_intr(device_t dev, device_t child, struct resource *irq, void *cookie, const char *descr); int bus_generic_deactivate_resource(device_t dev, device_t child, int type, int rid, struct resource *r); int bus_generic_detach(device_t dev); void bus_generic_driver_added(device_t dev, driver_t *driver); bus_dma_tag_t bus_generic_get_dma_tag(device_t dev, device_t child); int bus_generic_get_domain(device_t dev, device_t child, int *domain); struct resource_list * bus_generic_get_resource_list (device_t, device_t); void bus_generic_new_pass(device_t dev); int bus_print_child_header(device_t dev, device_t child); int bus_print_child_domain(device_t dev, device_t child); int bus_print_child_footer(device_t dev, device_t child); int bus_generic_print_child(device_t dev, device_t child); int bus_generic_probe(device_t dev); int bus_generic_read_ivar(device_t dev, device_t child, int which, uintptr_t *result); int bus_generic_release_resource(device_t bus, device_t child, int type, int rid, struct resource *r); int bus_generic_resume(device_t dev); int bus_generic_resume_child(device_t dev, device_t child); int bus_generic_setup_intr(device_t dev, device_t child, struct resource *irq, int flags, driver_filter_t *filter, driver_intr_t *intr, void *arg, void **cookiep); struct resource * bus_generic_rl_alloc_resource (device_t, device_t, int, int *, u_long, u_long, u_long, u_int); void bus_generic_rl_delete_resource (device_t, device_t, int, int); int bus_generic_rl_get_resource (device_t, device_t, int, int, u_long *, u_long *); int bus_generic_rl_set_resource (device_t, device_t, int, int, u_long, u_long); int bus_generic_rl_release_resource (device_t, device_t, int, int, struct resource *); int bus_generic_shutdown(device_t dev); int bus_generic_suspend(device_t dev); int bus_generic_suspend_child(device_t dev, device_t child); int bus_generic_teardown_intr(device_t dev, device_t child, struct resource *irq, void *cookie); int bus_generic_write_ivar(device_t dev, device_t child, int which, uintptr_t value); /* * Wrapper functions for the BUS_*_RESOURCE methods to make client code * a little simpler. */ struct resource_spec { int type; int rid; int flags; }; int bus_alloc_resources(device_t dev, struct resource_spec *rs, struct resource **res); void bus_release_resources(device_t dev, const struct resource_spec *rs, struct resource **res); int bus_adjust_resource(device_t child, int type, struct resource *r, u_long start, u_long end); struct resource *bus_alloc_resource(device_t dev, int type, int *rid, u_long start, u_long end, u_long count, u_int flags); int bus_activate_resource(device_t dev, int type, int rid, struct resource *r); int bus_deactivate_resource(device_t dev, int type, int rid, struct resource *r); bus_dma_tag_t bus_get_dma_tag(device_t dev); int bus_get_domain(device_t dev, int *domain); int bus_release_resource(device_t dev, int type, int rid, struct resource *r); int bus_free_resource(device_t dev, int type, struct resource *r); int bus_setup_intr(device_t dev, struct resource *r, int flags, driver_filter_t filter, driver_intr_t handler, void *arg, void **cookiep); int bus_teardown_intr(device_t dev, struct resource *r, void *cookie); int bus_bind_intr(device_t dev, struct resource *r, int cpu); int bus_describe_intr(device_t dev, struct resource *irq, void *cookie, const char *fmt, ...); int bus_set_resource(device_t dev, int type, int rid, u_long start, u_long count); int bus_get_resource(device_t dev, int type, int rid, u_long *startp, u_long *countp); u_long bus_get_resource_start(device_t dev, int type, int rid); u_long bus_get_resource_count(device_t dev, int type, int rid); void bus_delete_resource(device_t dev, int type, int rid); int bus_child_present(device_t child); int bus_child_pnpinfo_str(device_t child, char *buf, size_t buflen); int bus_child_location_str(device_t child, char *buf, size_t buflen); void bus_enumerate_hinted_children(device_t bus); static __inline struct resource * bus_alloc_resource_any(device_t dev, int type, int *rid, u_int flags) { return (bus_alloc_resource(dev, type, rid, 0ul, ~0ul, 1, flags)); } /* * Access functions for device. */ device_t device_add_child(device_t dev, const char *name, int unit); device_t device_add_child_ordered(device_t dev, u_int order, const char *name, int unit); void device_busy(device_t dev); int device_delete_child(device_t dev, device_t child); int device_delete_children(device_t dev); int device_attach(device_t dev); int device_detach(device_t dev); void device_disable(device_t dev); void device_enable(device_t dev); device_t device_find_child(device_t dev, const char *classname, int unit); const char *device_get_desc(device_t dev); devclass_t device_get_devclass(device_t dev); driver_t *device_get_driver(device_t dev); u_int32_t device_get_flags(device_t dev); device_t device_get_parent(device_t dev); int device_get_children(device_t dev, device_t **listp, int *countp); void *device_get_ivars(device_t dev); void device_set_ivars(device_t dev, void *ivars); const char *device_get_name(device_t dev); const char *device_get_nameunit(device_t dev); void *device_get_softc(device_t dev); device_state_t device_get_state(device_t dev); int device_get_unit(device_t dev); struct sysctl_ctx_list *device_get_sysctl_ctx(device_t dev); struct sysctl_oid *device_get_sysctl_tree(device_t dev); int device_is_alive(device_t dev); /* did probe succeed? */ int device_is_attached(device_t dev); /* did attach succeed? */ int device_is_enabled(device_t dev); int device_is_quiet(device_t dev); int device_print_prettyname(device_t dev); int device_printf(device_t dev, const char *, ...) __printflike(2, 3); int device_probe(device_t dev); int device_probe_and_attach(device_t dev); int device_probe_child(device_t bus, device_t dev); int device_quiesce(device_t dev); void device_quiet(device_t dev); void device_set_desc(device_t dev, const char* desc); void device_set_desc_copy(device_t dev, const char* desc); int device_set_devclass(device_t dev, const char *classname); int device_set_driver(device_t dev, driver_t *driver); void device_set_flags(device_t dev, u_int32_t flags); void device_set_softc(device_t dev, void *softc); void device_free_softc(void *softc); void device_claim_softc(device_t dev); int device_set_unit(device_t dev, int unit); /* XXX DONT USE XXX */ int device_shutdown(device_t dev); void device_unbusy(device_t dev); void device_verbose(device_t dev); /* * Access functions for devclass. */ int devclass_add_driver(devclass_t dc, driver_t *driver, int pass, devclass_t *dcp); devclass_t devclass_create(const char *classname); int devclass_delete_driver(devclass_t busclass, driver_t *driver); devclass_t devclass_find(const char *classname); const char *devclass_get_name(devclass_t dc); device_t devclass_get_device(devclass_t dc, int unit); void *devclass_get_softc(devclass_t dc, int unit); int devclass_get_devices(devclass_t dc, device_t **listp, int *countp); int devclass_get_drivers(devclass_t dc, driver_t ***listp, int *countp); int devclass_get_count(devclass_t dc); int devclass_get_maxunit(devclass_t dc); int devclass_find_free_unit(devclass_t dc, int unit); void devclass_set_parent(devclass_t dc, devclass_t pdc); devclass_t devclass_get_parent(devclass_t dc); struct sysctl_ctx_list *devclass_get_sysctl_ctx(devclass_t dc); struct sysctl_oid *devclass_get_sysctl_tree(devclass_t dc); /* * Access functions for device resources. */ int resource_int_value(const char *name, int unit, const char *resname, int *result); int resource_long_value(const char *name, int unit, const char *resname, long *result); int resource_string_value(const char *name, int unit, const char *resname, const char **result); int resource_disabled(const char *name, int unit); int resource_find_match(int *anchor, const char **name, int *unit, const char *resname, const char *value); int resource_find_dev(int *anchor, const char *name, int *unit, const char *resname, const char *value); int resource_set_int(const char *name, int unit, const char *resname, int value); int resource_set_long(const char *name, int unit, const char *resname, long value); int resource_set_string(const char *name, int unit, const char *resname, const char *value); /* * Functions for maintaining and checking consistency of * bus information exported to userspace. */ int bus_data_generation_check(int generation); void bus_data_generation_update(void); /** * Some convenience defines for probe routines to return. These are just * suggested values, and there's nothing magical about them. * BUS_PROBE_SPECIFIC is for devices that cannot be reprobed, and that no * possible other driver may exist (typically legacy drivers who don't fallow * all the rules, or special needs drivers). BUS_PROBE_VENDOR is the * suggested value that vendor supplied drivers use. This is for source or * binary drivers that are not yet integrated into the FreeBSD tree. Its use * in the base OS is prohibited. BUS_PROBE_DEFAULT is the normal return value * for drivers to use. It is intended that nearly all of the drivers in the * tree should return this value. BUS_PROBE_LOW_PRIORITY are for drivers that * have special requirements like when there are two drivers that support * overlapping series of hardware devices. In this case the one that supports * the older part of the line would return this value, while the one that * supports the newer ones would return BUS_PROBE_DEFAULT. BUS_PROBE_GENERIC * is for drivers that wish to have a generic form and a specialized form, * like is done with the pci bus and the acpi pci bus. BUS_PROBE_HOOVER is * for those busses that implement a generic device place-holder for devices on * the bus that have no more specific driver for them (aka ugen). * BUS_PROBE_NOWILDCARD or lower means that the device isn't really bidding * for a device node, but accepts only devices that its parent has told it * use this driver. */ #define BUS_PROBE_SPECIFIC 0 /* Only I can use this device */ #define BUS_PROBE_VENDOR (-10) /* Vendor supplied driver */ #define BUS_PROBE_DEFAULT (-20) /* Base OS default driver */ #define BUS_PROBE_LOW_PRIORITY (-40) /* Older, less desirable drivers */ #define BUS_PROBE_GENERIC (-100) /* generic driver for dev */ #define BUS_PROBE_HOOVER (-500) /* Generic dev for all devs on bus */ #define BUS_PROBE_NOWILDCARD (-2000000000) /* No wildcard device matches */ /** * During boot, the device tree is scanned multiple times. Each scan, * or pass, drivers may be attached to devices. Each driver * attachment is assigned a pass number. Drivers may only probe and * attach to devices if their pass number is less than or equal to the * current system-wide pass number. The default pass is the last pass * and is used by most drivers. Drivers needed by the scheduler are * probed in earlier passes. */ #define BUS_PASS_ROOT 0 /* Used to attach root0. */ #define BUS_PASS_BUS 10 /* Busses and bridges. */ #define BUS_PASS_CPU 20 /* CPU devices. */ #define BUS_PASS_RESOURCE 30 /* Resource discovery. */ #define BUS_PASS_INTERRUPT 40 /* Interrupt controllers. */ #define BUS_PASS_TIMER 50 /* Timers and clocks. */ #define BUS_PASS_SCHEDULER 60 /* Start scheduler. */ #define BUS_PASS_DEFAULT __INT_MAX /* Everything else. */ #define BUS_PASS_ORDER_FIRST 0 #define BUS_PASS_ORDER_EARLY 2 #define BUS_PASS_ORDER_MIDDLE 5 #define BUS_PASS_ORDER_LATE 7 #define BUS_PASS_ORDER_LAST 9 extern int bus_current_pass; void bus_set_pass(int pass); /** * Shorthands for constructing method tables. */ #define DEVMETHOD KOBJMETHOD #define DEVMETHOD_END KOBJMETHOD_END /* * Some common device interfaces. */ #include "device_if.h" #include "bus_if.h" struct module; int driver_module_handler(struct module *, int, void *); /** * Module support for automatically adding drivers to busses. */ struct driver_module_data { int (*dmd_chainevh)(struct module *, int, void *); void *dmd_chainarg; const char *dmd_busname; kobj_class_t dmd_driver; devclass_t *dmd_devclass; int dmd_pass; }; #define EARLY_DRIVER_MODULE_ORDERED(name, busname, driver, devclass, \ evh, arg, order, pass) \ \ static struct driver_module_data name##_##busname##_driver_mod = { \ evh, arg, \ #busname, \ (kobj_class_t) &driver, \ &devclass, \ pass \ }; \ \ static moduledata_t name##_##busname##_mod = { \ #busname "/" #name, \ driver_module_handler, \ &name##_##busname##_driver_mod \ }; \ DECLARE_MODULE(name##_##busname, name##_##busname##_mod, \ SI_SUB_DRIVERS, order) #define EARLY_DRIVER_MODULE(name, busname, driver, devclass, evh, arg, pass) \ EARLY_DRIVER_MODULE_ORDERED(name, busname, driver, devclass, \ evh, arg, SI_ORDER_MIDDLE, pass) #define DRIVER_MODULE_ORDERED(name, busname, driver, devclass, evh, arg,\ order) \ EARLY_DRIVER_MODULE_ORDERED(name, busname, driver, devclass, \ evh, arg, order, BUS_PASS_DEFAULT) #define DRIVER_MODULE(name, busname, driver, devclass, evh, arg) \ EARLY_DRIVER_MODULE(name, busname, driver, devclass, evh, arg, \ BUS_PASS_DEFAULT) /** * Generic ivar accessor generation macros for bus drivers */ #define __BUS_ACCESSOR(varp, var, ivarp, ivar, type) \ \ static __inline type varp ## _get_ ## var(device_t dev) \ { \ uintptr_t v; \ BUS_READ_IVAR(device_get_parent(dev), dev, \ ivarp ## _IVAR_ ## ivar, &v); \ return ((type) v); \ } \ \ static __inline void varp ## _set_ ## var(device_t dev, type t) \ { \ uintptr_t v = (uintptr_t) t; \ BUS_WRITE_IVAR(device_get_parent(dev), dev, \ ivarp ## _IVAR_ ## ivar, v); \ } /** * Shorthand macros, taking resource argument * Generated with sys/tools/bus_macro.sh */ #define bus_barrier(r, o, l, f) \ bus_space_barrier((r)->r_bustag, (r)->r_bushandle, (o), (l), (f)) #define bus_read_1(r, o) \ bus_space_read_1((r)->r_bustag, (r)->r_bushandle, (o)) #define bus_read_multi_1(r, o, d, c) \ bus_space_read_multi_1((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_read_region_1(r, o, d, c) \ bus_space_read_region_1((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_set_multi_1(r, o, v, c) \ bus_space_set_multi_1((r)->r_bustag, (r)->r_bushandle, (o), (v), (c)) #define bus_set_region_1(r, o, v, c) \ bus_space_set_region_1((r)->r_bustag, (r)->r_bushandle, (o), (v), (c)) #define bus_write_1(r, o, v) \ bus_space_write_1((r)->r_bustag, (r)->r_bushandle, (o), (v)) #define bus_write_multi_1(r, o, d, c) \ bus_space_write_multi_1((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_write_region_1(r, o, d, c) \ bus_space_write_region_1((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_read_stream_1(r, o) \ bus_space_read_stream_1((r)->r_bustag, (r)->r_bushandle, (o)) #define bus_read_multi_stream_1(r, o, d, c) \ bus_space_read_multi_stream_1((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_read_region_stream_1(r, o, d, c) \ bus_space_read_region_stream_1((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_set_multi_stream_1(r, o, v, c) \ bus_space_set_multi_stream_1((r)->r_bustag, (r)->r_bushandle, (o), (v), (c)) #define bus_set_region_stream_1(r, o, v, c) \ bus_space_set_region_stream_1((r)->r_bustag, (r)->r_bushandle, (o), (v), (c)) #define bus_write_stream_1(r, o, v) \ bus_space_write_stream_1((r)->r_bustag, (r)->r_bushandle, (o), (v)) #define bus_write_multi_stream_1(r, o, d, c) \ bus_space_write_multi_stream_1((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_write_region_stream_1(r, o, d, c) \ bus_space_write_region_stream_1((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_read_2(r, o) \ bus_space_read_2((r)->r_bustag, (r)->r_bushandle, (o)) #define bus_read_multi_2(r, o, d, c) \ bus_space_read_multi_2((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_read_region_2(r, o, d, c) \ bus_space_read_region_2((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_set_multi_2(r, o, v, c) \ bus_space_set_multi_2((r)->r_bustag, (r)->r_bushandle, (o), (v), (c)) #define bus_set_region_2(r, o, v, c) \ bus_space_set_region_2((r)->r_bustag, (r)->r_bushandle, (o), (v), (c)) #define bus_write_2(r, o, v) \ bus_space_write_2((r)->r_bustag, (r)->r_bushandle, (o), (v)) #define bus_write_multi_2(r, o, d, c) \ bus_space_write_multi_2((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_write_region_2(r, o, d, c) \ bus_space_write_region_2((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_read_stream_2(r, o) \ bus_space_read_stream_2((r)->r_bustag, (r)->r_bushandle, (o)) #define bus_read_multi_stream_2(r, o, d, c) \ bus_space_read_multi_stream_2((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_read_region_stream_2(r, o, d, c) \ bus_space_read_region_stream_2((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_set_multi_stream_2(r, o, v, c) \ bus_space_set_multi_stream_2((r)->r_bustag, (r)->r_bushandle, (o), (v), (c)) #define bus_set_region_stream_2(r, o, v, c) \ bus_space_set_region_stream_2((r)->r_bustag, (r)->r_bushandle, (o), (v), (c)) #define bus_write_stream_2(r, o, v) \ bus_space_write_stream_2((r)->r_bustag, (r)->r_bushandle, (o), (v)) #define bus_write_multi_stream_2(r, o, d, c) \ bus_space_write_multi_stream_2((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_write_region_stream_2(r, o, d, c) \ bus_space_write_region_stream_2((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_read_4(r, o) \ bus_space_read_4((r)->r_bustag, (r)->r_bushandle, (o)) #define bus_read_multi_4(r, o, d, c) \ bus_space_read_multi_4((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_read_region_4(r, o, d, c) \ bus_space_read_region_4((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_set_multi_4(r, o, v, c) \ bus_space_set_multi_4((r)->r_bustag, (r)->r_bushandle, (o), (v), (c)) #define bus_set_region_4(r, o, v, c) \ bus_space_set_region_4((r)->r_bustag, (r)->r_bushandle, (o), (v), (c)) #define bus_write_4(r, o, v) \ bus_space_write_4((r)->r_bustag, (r)->r_bushandle, (o), (v)) #define bus_write_multi_4(r, o, d, c) \ bus_space_write_multi_4((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_write_region_4(r, o, d, c) \ bus_space_write_region_4((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_read_stream_4(r, o) \ bus_space_read_stream_4((r)->r_bustag, (r)->r_bushandle, (o)) #define bus_read_multi_stream_4(r, o, d, c) \ bus_space_read_multi_stream_4((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_read_region_stream_4(r, o, d, c) \ bus_space_read_region_stream_4((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_set_multi_stream_4(r, o, v, c) \ bus_space_set_multi_stream_4((r)->r_bustag, (r)->r_bushandle, (o), (v), (c)) #define bus_set_region_stream_4(r, o, v, c) \ bus_space_set_region_stream_4((r)->r_bustag, (r)->r_bushandle, (o), (v), (c)) #define bus_write_stream_4(r, o, v) \ bus_space_write_stream_4((r)->r_bustag, (r)->r_bushandle, (o), (v)) #define bus_write_multi_stream_4(r, o, d, c) \ bus_space_write_multi_stream_4((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_write_region_stream_4(r, o, d, c) \ bus_space_write_region_stream_4((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_read_8(r, o) \ bus_space_read_8((r)->r_bustag, (r)->r_bushandle, (o)) #define bus_read_multi_8(r, o, d, c) \ bus_space_read_multi_8((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_read_region_8(r, o, d, c) \ bus_space_read_region_8((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_set_multi_8(r, o, v, c) \ bus_space_set_multi_8((r)->r_bustag, (r)->r_bushandle, (o), (v), (c)) #define bus_set_region_8(r, o, v, c) \ bus_space_set_region_8((r)->r_bustag, (r)->r_bushandle, (o), (v), (c)) #define bus_write_8(r, o, v) \ bus_space_write_8((r)->r_bustag, (r)->r_bushandle, (o), (v)) #define bus_write_multi_8(r, o, d, c) \ bus_space_write_multi_8((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_write_region_8(r, o, d, c) \ bus_space_write_region_8((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_read_stream_8(r, o) \ bus_space_read_stream_8((r)->r_bustag, (r)->r_bushandle, (o)) #define bus_read_multi_stream_8(r, o, d, c) \ bus_space_read_multi_stream_8((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_read_region_stream_8(r, o, d, c) \ bus_space_read_region_stream_8((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_set_multi_stream_8(r, o, v, c) \ bus_space_set_multi_stream_8((r)->r_bustag, (r)->r_bushandle, (o), (v), (c)) #define bus_set_region_stream_8(r, o, v, c) \ bus_space_set_region_stream_8((r)->r_bustag, (r)->r_bushandle, (o), (v), (c)) #define bus_write_stream_8(r, o, v) \ bus_space_write_stream_8((r)->r_bustag, (r)->r_bushandle, (o), (v)) #define bus_write_multi_stream_8(r, o, d, c) \ bus_space_write_multi_stream_8((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #define bus_write_region_stream_8(r, o, d, c) \ bus_space_write_region_stream_8((r)->r_bustag, (r)->r_bushandle, (o), (d), (c)) #endif /* _KERNEL */ #endif /* !_SYS_BUS_H_ */ Index: projects/building-blocks/sys/sys/systm.h =================================================================== --- projects/building-blocks/sys/sys/systm.h (revision 278311) +++ projects/building-blocks/sys/sys/systm.h (revision 278312) @@ -1,457 +1,456 @@ /*- * Copyright (c) 1982, 1988, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)systm.h 8.7 (Berkeley) 3/29/95 * $FreeBSD$ */ #ifndef _SYS_SYSTM_H_ #define _SYS_SYSTM_H_ #include #include #include #include #include #include /* for people using printf mainly */ extern int cold; /* nonzero if we are doing a cold boot */ extern int rebooting; /* kern_reboot() has been called. */ extern const char *panicstr; /* panic message */ extern char version[]; /* system version */ extern char compiler_version[]; /* compiler version */ extern char copyright[]; /* system copyright */ extern int kstack_pages; /* number of kernel stack pages */ extern u_long pagesizes[]; /* supported page sizes */ extern long physmem; /* physical memory */ extern long realmem; /* 'real' memory */ extern char *rootdevnames[2]; /* names of possible root devices */ extern int boothowto; /* reboot flags, from console subsystem */ extern int bootverbose; /* nonzero to print verbose messages */ extern int maxusers; /* system tune hint */ extern int ngroups_max; /* max # of supplemental groups */ extern int vm_guest; /* Running as virtual machine guest? */ /* * Detected virtual machine guest types. The intention is to expand * and/or add to the VM_GUEST_VM type if specific VM functionality is * ever implemented (e.g. vendor-specific paravirtualization features). * Keep in sync with vm_guest_sysctl_names[]. */ enum VM_GUEST { VM_GUEST_NO = 0, VM_GUEST_VM, VM_GUEST_XEN, VM_GUEST_HV, VM_GUEST_VMWARE, VM_LAST }; #if defined(WITNESS) || defined(INVARIANTS) void kassert_panic(const char *fmt, ...) __printflike(1, 2); #endif #ifdef INVARIANTS /* The option is always available */ #define KASSERT(exp,msg) do { \ if (__predict_false(!(exp))) \ kassert_panic msg; \ } while (0) #define VNASSERT(exp, vp, msg) do { \ if (__predict_false(!(exp))) { \ vn_printf(vp, "VNASSERT failed\n"); \ kassert_panic msg; \ } \ } while (0) #else #define KASSERT(exp,msg) do { \ } while (0) #define VNASSERT(exp, vp, msg) do { \ } while (0) #endif #ifndef CTASSERT /* Allow lint to override */ #define CTASSERT(x) _Static_assert(x, "compile-time assertion failed") #endif /* * Assert that a pointer can be loaded from memory atomically. * * This assertion enforces stronger alignment than necessary. For example, * on some architectures, atomicity for unaligned loads will depend on * whether or not the load spans multiple cache lines. */ #define ASSERT_ATOMIC_LOAD_PTR(var, msg) \ KASSERT(sizeof(var) == sizeof(void *) && \ ((uintptr_t)&(var) & (sizeof(void *) - 1)) == 0, msg) /* * Assert that a thread is in critical(9) section. */ #define CRITICAL_ASSERT(td) \ KASSERT((td)->td_critnest >= 1, ("Not in critical section")); /* * If we have already panic'd and this is the thread that called * panic(), then don't block on any mutexes but silently succeed. * Otherwise, the kernel will deadlock since the scheduler isn't * going to run the thread that holds any lock we need. */ #define SCHEDULER_STOPPED() __predict_false(curthread->td_stopsched) /* * XXX the hints declarations are even more misplaced than most declarations * in this file, since they are needed in one file (per arch) and only used * in two files. * XXX most of these variables should be const. */ extern int osreldate; extern int envmode; extern int hintmode; /* 0 = off. 1 = config, 2 = fallback */ extern int dynamic_kenv; extern struct mtx kenv_lock; extern char *kern_envp; extern char static_env[]; extern char static_hints[]; /* by config for now */ extern char **kenvp; extern const void *zero_region; /* address space maps to a zeroed page */ extern int unmapped_buf_allowed; extern int iosize_max_clamp; extern int devfs_iosize_max_clamp; #define IOSIZE_MAX (iosize_max_clamp ? INT_MAX : SSIZE_MAX) #define DEVFS_IOSIZE_MAX (devfs_iosize_max_clamp ? INT_MAX : SSIZE_MAX) /* * General function declarations. */ struct inpcb; struct lock_object; struct malloc_type; struct mtx; struct proc; struct socket; struct thread; struct tty; struct ucred; struct uio; struct _jmp_buf; struct trapframe; struct eventtimer; int setjmp(struct _jmp_buf *) __returns_twice; void longjmp(struct _jmp_buf *, int) __dead2; int dumpstatus(vm_offset_t addr, off_t count); int nullop(void); int eopnotsupp(void); int ureadc(int, struct uio *); void hashdestroy(void *, struct malloc_type *, u_long); void *hashinit(int count, struct malloc_type *type, u_long *hashmask); void *hashinit_flags(int count, struct malloc_type *type, u_long *hashmask, int flags); #define HASH_NOWAIT 0x00000001 #define HASH_WAITOK 0x00000002 void *phashinit(int count, struct malloc_type *type, u_long *nentries); void g_waitidle(void); void panic(const char *, ...) __dead2 __printflike(1, 2); void cpu_boot(int); void cpu_flush_dcache(void *, size_t); void cpu_rootconf(void); void critical_enter(void); void critical_exit(void); void init_param1(void); void init_param2(long physpages); void init_static_kenv(char *, size_t); void tablefull(const char *); #ifdef EARLY_PRINTF typedef void early_putc_t(int ch); extern early_putc_t *early_putc; #endif int kvprintf(char const *, void (*)(int, void*), void *, int, __va_list) __printflike(1, 0); void log(int, const char *, ...) __printflike(2, 3); void log_console(struct uio *); int printf(const char *, ...) __printflike(1, 2); int snprintf(char *, size_t, const char *, ...) __printflike(3, 4); int sprintf(char *buf, const char *, ...) __printflike(2, 3); int uprintf(const char *, ...) __printflike(1, 2); int vprintf(const char *, __va_list) __printflike(1, 0); int vsnprintf(char *, size_t, const char *, __va_list) __printflike(3, 0); int vsnrprintf(char *, size_t, int, const char *, __va_list) __printflike(4, 0); int vsprintf(char *buf, const char *, __va_list) __printflike(2, 0); int ttyprintf(struct tty *, const char *, ...) __printflike(2, 3); int sscanf(const char *, char const *, ...) __nonnull(1) __nonnull(2); int vsscanf(const char *, char const *, __va_list) __nonnull(1) __nonnull(2); long strtol(const char *, char **, int) __nonnull(1); u_long strtoul(const char *, char **, int) __nonnull(1); quad_t strtoq(const char *, char **, int) __nonnull(1); u_quad_t strtouq(const char *, char **, int) __nonnull(1); void tprintf(struct proc *p, int pri, const char *, ...) __printflike(3, 4); void vtprintf(struct proc *, int, const char *, __va_list) __printflike(3, 0); void hexdump(const void *ptr, int length, const char *hdr, int flags); #define HD_COLUMN_MASK 0xff #define HD_DELIM_MASK 0xff00 #define HD_OMIT_COUNT (1 << 16) #define HD_OMIT_HEX (1 << 17) #define HD_OMIT_CHARS (1 << 18) #define ovbcopy(f, t, l) bcopy((f), (t), (l)) void bcopy(const void *from, void *to, size_t len) __nonnull(1) __nonnull(2); void bzero(void *buf, size_t len) __nonnull(1); void explicit_bzero(void *, size_t) __nonnull(1);; void *memcpy(void *to, const void *from, size_t len) __nonnull(1) __nonnull(2); void *memmove(void *dest, const void *src, size_t n) __nonnull(1) __nonnull(2); int copystr(const void * __restrict kfaddr, void * __restrict kdaddr, size_t len, size_t * __restrict lencopied) __nonnull(1) __nonnull(2); int copyinstr(const void * __restrict udaddr, void * __restrict kaddr, size_t len, size_t * __restrict lencopied) __nonnull(1) __nonnull(2); int copyin(const void * __restrict udaddr, void * __restrict kaddr, size_t len) __nonnull(1) __nonnull(2); int copyin_nofault(const void * __restrict udaddr, void * __restrict kaddr, size_t len) __nonnull(1) __nonnull(2); int copyout(const void * __restrict kaddr, void * __restrict udaddr, size_t len) __nonnull(1) __nonnull(2); int copyout_nofault(const void * __restrict kaddr, void * __restrict udaddr, size_t len) __nonnull(1) __nonnull(2); int fubyte(volatile const void *base); long fuword(volatile const void *base); int fuword16(volatile const void *base); int32_t fuword32(volatile const void *base); int64_t fuword64(volatile const void *base); int fueword(volatile const void *base, long *val); int fueword32(volatile const void *base, int32_t *val); int fueword64(volatile const void *base, int64_t *val); int subyte(volatile void *base, int byte); int suword(volatile void *base, long word); int suword16(volatile void *base, int word); int suword32(volatile void *base, int32_t word); int suword64(volatile void *base, int64_t word); uint32_t casuword32(volatile uint32_t *base, uint32_t oldval, uint32_t newval); u_long casuword(volatile u_long *p, u_long oldval, u_long newval); int casueword32(volatile uint32_t *base, uint32_t oldval, uint32_t *oldvalp, uint32_t newval); int casueword(volatile u_long *p, u_long oldval, u_long *oldvalp, u_long newval); void realitexpire(void *); int sysbeep(int hertz, int period); void hardclock(int usermode, uintfptr_t pc); void hardclock_cnt(int cnt, int usermode); void hardclock_cpu(int usermode); void hardclock_sync(int cpu); void softclock(void *); void statclock(int usermode); void statclock_cnt(int cnt, int usermode); void profclock(int usermode, uintfptr_t pc); void profclock_cnt(int cnt, int usermode, uintfptr_t pc); int hardclockintr(void); void startprofclock(struct proc *); void stopprofclock(struct proc *); void cpu_startprofclock(void); void cpu_stopprofclock(void); sbintime_t cpu_idleclock(void); void cpu_activeclock(void); void cpu_new_callout(int cpu, sbintime_t bt, sbintime_t bt_opt); void cpu_et_frequency(struct eventtimer *et, uint64_t newfreq); extern int cpu_deepest_sleep; extern int cpu_disable_c2_sleep; extern int cpu_disable_c3_sleep; int cr_cansee(struct ucred *u1, struct ucred *u2); int cr_canseesocket(struct ucred *cred, struct socket *so); int cr_canseeinpcb(struct ucred *cred, struct inpcb *inp); char *kern_getenv(const char *name); void freeenv(char *env); int getenv_int(const char *name, int *data); int getenv_uint(const char *name, unsigned int *data); int getenv_long(const char *name, long *data); int getenv_ulong(const char *name, unsigned long *data); int getenv_string(const char *name, char *data, int size); int getenv_quad(const char *name, quad_t *data); int kern_setenv(const char *name, const char *value); int kern_unsetenv(const char *name); int testenv(const char *name); typedef uint64_t (cpu_tick_f)(void); void set_cputicker(cpu_tick_f *func, uint64_t freq, unsigned var); extern cpu_tick_f *cpu_ticks; uint64_t cpu_tickrate(void); uint64_t cputick2usec(uint64_t tick); #ifdef APM_FIXUP_CALLTODO struct timeval; void adjust_timeout_calltodo(struct timeval *time_change); #endif /* APM_FIXUP_CALLTODO */ #include /* Initialize the world */ void consinit(void); void cpu_initclocks(void); void cpu_initclocks_bsp(void); void cpu_initclocks_ap(void); void usrinfoinit(void); /* Finalize the world */ void kern_reboot(int) __dead2; void shutdown_nice(int); /* Timeouts */ typedef void timeout_t(void *); /* timeout function type */ #define CALLOUT_HANDLE_INITIALIZER(handle) \ { NULL } void callout_handle_init(struct callout_handle *); struct callout_handle timeout(timeout_t *, void *, int); void untimeout(timeout_t *, void *, struct callout_handle); /* Stubs for obsolete functions that used to be for interrupt management */ static __inline intrmask_t splbio(void) { return 0; } static __inline intrmask_t splcam(void) { return 0; } static __inline intrmask_t splclock(void) { return 0; } static __inline intrmask_t splhigh(void) { return 0; } static __inline intrmask_t splimp(void) { return 0; } static __inline intrmask_t splnet(void) { return 0; } static __inline intrmask_t spltty(void) { return 0; } -static __inline intrmask_t splvm(void) { return 0; } static __inline void splx(intrmask_t ipl __unused) { return; } /* * Common `proc' functions are declared here so that proc.h can be included * less often. */ int _sleep(void *chan, struct lock_object *lock, int pri, const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags) __nonnull(1); #define msleep(chan, mtx, pri, wmesg, timo) \ _sleep((chan), &(mtx)->lock_object, (pri), (wmesg), \ tick_sbt * (timo), 0, C_HARDCLOCK) #define msleep_sbt(chan, mtx, pri, wmesg, bt, pr, flags) \ _sleep((chan), &(mtx)->lock_object, (pri), (wmesg), (bt), (pr), \ (flags)) int msleep_spin_sbt(void *chan, struct mtx *mtx, const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags) __nonnull(1); #define msleep_spin(chan, mtx, wmesg, timo) \ msleep_spin_sbt((chan), (mtx), (wmesg), tick_sbt * (timo), \ 0, C_HARDCLOCK) int pause_sbt(const char *wmesg, sbintime_t sbt, sbintime_t pr, int flags); #define pause(wmesg, timo) \ pause_sbt((wmesg), tick_sbt * (timo), 0, C_HARDCLOCK) #define tsleep(chan, pri, wmesg, timo) \ _sleep((chan), NULL, (pri), (wmesg), tick_sbt * (timo), \ 0, C_HARDCLOCK) #define tsleep_sbt(chan, pri, wmesg, bt, pr, flags) \ _sleep((chan), NULL, (pri), (wmesg), (bt), (pr), (flags)) void wakeup(void *chan) __nonnull(1); void wakeup_one(void *chan) __nonnull(1); /* * Common `struct cdev *' stuff are declared here to avoid #include poisoning */ struct cdev; dev_t dev2udev(struct cdev *x); const char *devtoname(struct cdev *cdev); int poll_no_poll(int events); /* XXX: Should be void nanodelay(u_int nsec); */ void DELAY(int usec); /* Root mount holdback API */ struct root_hold_token; struct root_hold_token *root_mount_hold(const char *identifier); void root_mount_rel(struct root_hold_token *h); void root_mount_wait(void); int root_mounted(void); /* * Unit number allocation API. (kern/subr_unit.c) */ struct unrhdr; struct unrhdr *new_unrhdr(int low, int high, struct mtx *mutex); void init_unrhdr(struct unrhdr *uh, int low, int high, struct mtx *mutex); void delete_unrhdr(struct unrhdr *uh); void clean_unrhdr(struct unrhdr *uh); void clean_unrhdrl(struct unrhdr *uh); int alloc_unr(struct unrhdr *uh); int alloc_unr_specific(struct unrhdr *uh, u_int item); int alloc_unrl(struct unrhdr *uh); void free_unr(struct unrhdr *uh, u_int item); /* * Population count algorithm using SWAR approach * - "SIMD Within A Register". */ static __inline uint32_t bitcount32(uint32_t x) { x = (x & 0x55555555) + ((x & 0xaaaaaaaa) >> 1); x = (x & 0x33333333) + ((x & 0xcccccccc) >> 2); x = (x + (x >> 4)) & 0x0f0f0f0f; x = (x + (x >> 8)); x = (x + (x >> 16)) & 0x000000ff; return (x); } static __inline uint16_t bitcount16(uint32_t x) { x = (x & 0x5555) + ((x & 0xaaaa) >> 1); x = (x & 0x3333) + ((x & 0xcccc) >> 2); x = (x + (x >> 4)) & 0x0f0f; x = (x + (x >> 8)) & 0x00ff; return (x); } void intr_prof_stack_use(struct thread *td, struct trapframe *frame); #endif /* !_SYS_SYSTM_H_ */ Index: projects/building-blocks/sys/ufs/ffs/ffs_softdep.c =================================================================== --- projects/building-blocks/sys/ufs/ffs/ffs_softdep.c (revision 278311) +++ projects/building-blocks/sys/ufs/ffs/ffs_softdep.c (revision 278312) @@ -1,14146 +1,14144 @@ /*- * Copyright 1998, 2000 Marshall Kirk McKusick. * Copyright 2009, 2010 Jeffrey W. Roberson * All rights reserved. * * The soft updates code is derived from the appendix of a University * of Michigan technical report (Gregory R. Ganger and Yale N. Patt, * "Soft Updates: A Solution to the Metadata Update Problem in File * Systems", CSE-TR-254-95, August 1995). * * Further information about soft updates can be obtained from: * * Marshall Kirk McKusick http://www.mckusick.com/softdep/ * 1614 Oxford Street mckusick@mckusick.com * Berkeley, CA 94709-1608 +1-510-843-9542 * USA * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00 */ #include __FBSDID("$FreeBSD$"); #include "opt_ffs.h" #include "opt_quota.h" #include "opt_ddb.h" /* * For now we want the safety net that the DEBUG flag provides. */ #ifndef DEBUG #define DEBUG #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define KTR_SUJ 0 /* Define to KTR_SPARE. */ #ifndef SOFTUPDATES int softdep_flushfiles(oldmnt, flags, td) struct mount *oldmnt; int flags; struct thread *td; { panic("softdep_flushfiles called"); } int softdep_mount(devvp, mp, fs, cred) struct vnode *devvp; struct mount *mp; struct fs *fs; struct ucred *cred; { return (0); } void softdep_initialize() { return; } void softdep_uninitialize() { return; } void softdep_unmount(mp) struct mount *mp; { panic("softdep_unmount called"); } void softdep_setup_sbupdate(ump, fs, bp) struct ufsmount *ump; struct fs *fs; struct buf *bp; { panic("softdep_setup_sbupdate called"); } void softdep_setup_inomapdep(bp, ip, newinum, mode) struct buf *bp; struct inode *ip; ino_t newinum; int mode; { panic("softdep_setup_inomapdep called"); } void softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) struct buf *bp; struct mount *mp; ufs2_daddr_t newblkno; int frags; int oldfrags; { panic("softdep_setup_blkmapdep called"); } void softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; ufs_lbn_t lbn; ufs2_daddr_t newblkno; ufs2_daddr_t oldblkno; long newsize; long oldsize; struct buf *bp; { panic("softdep_setup_allocdirect called"); } void softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; ufs_lbn_t lbn; ufs2_daddr_t newblkno; ufs2_daddr_t oldblkno; long newsize; long oldsize; struct buf *bp; { panic("softdep_setup_allocext called"); } void softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) struct inode *ip; ufs_lbn_t lbn; struct buf *bp; int ptrno; ufs2_daddr_t newblkno; ufs2_daddr_t oldblkno; struct buf *nbp; { panic("softdep_setup_allocindir_page called"); } void softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) struct buf *nbp; struct inode *ip; struct buf *bp; int ptrno; ufs2_daddr_t newblkno; { panic("softdep_setup_allocindir_meta called"); } void softdep_journal_freeblocks(ip, cred, length, flags) struct inode *ip; struct ucred *cred; off_t length; int flags; { panic("softdep_journal_freeblocks called"); } void softdep_journal_fsync(ip) struct inode *ip; { panic("softdep_journal_fsync called"); } void softdep_setup_freeblocks(ip, length, flags) struct inode *ip; off_t length; int flags; { panic("softdep_setup_freeblocks called"); } void softdep_freefile(pvp, ino, mode) struct vnode *pvp; ino_t ino; int mode; { panic("softdep_freefile called"); } int softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) struct buf *bp; struct inode *dp; off_t diroffset; ino_t newinum; struct buf *newdirbp; int isnewblk; { panic("softdep_setup_directory_add called"); } void softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) struct buf *bp; struct inode *dp; caddr_t base; caddr_t oldloc; caddr_t newloc; int entrysize; { panic("softdep_change_directoryentry_offset called"); } void softdep_setup_remove(bp, dp, ip, isrmdir) struct buf *bp; struct inode *dp; struct inode *ip; int isrmdir; { panic("softdep_setup_remove called"); } void softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) struct buf *bp; struct inode *dp; struct inode *ip; ino_t newinum; int isrmdir; { panic("softdep_setup_directory_change called"); } void softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) struct mount *mp; struct buf *bp; ufs2_daddr_t blkno; int frags; struct workhead *wkhd; { panic("%s called", __FUNCTION__); } void softdep_setup_inofree(mp, bp, ino, wkhd) struct mount *mp; struct buf *bp; ino_t ino; struct workhead *wkhd; { panic("%s called", __FUNCTION__); } void softdep_setup_unlink(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_setup_link(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_revert_link(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_setup_rmdir(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_revert_rmdir(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_setup_create(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_revert_create(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_setup_mkdir(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_revert_mkdir(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_setup_dotdot_link(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } int softdep_prealloc(vp, waitok) struct vnode *vp; int waitok; { panic("%s called", __FUNCTION__); } int softdep_journal_lookup(mp, vpp) struct mount *mp; struct vnode **vpp; { return (ENOENT); } void softdep_change_linkcnt(ip) struct inode *ip; { panic("softdep_change_linkcnt called"); } void softdep_load_inodeblock(ip) struct inode *ip; { panic("softdep_load_inodeblock called"); } void softdep_update_inodeblock(ip, bp, waitfor) struct inode *ip; struct buf *bp; int waitfor; { panic("softdep_update_inodeblock called"); } int softdep_fsync(vp) struct vnode *vp; /* the "in_core" copy of the inode */ { return (0); } void softdep_fsync_mountdev(vp) struct vnode *vp; { return; } int softdep_flushworklist(oldmnt, countp, td) struct mount *oldmnt; int *countp; struct thread *td; { *countp = 0; return (0); } int softdep_sync_metadata(struct vnode *vp) { panic("softdep_sync_metadata called"); } int softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor) { panic("softdep_sync_buf called"); } int softdep_slowdown(vp) struct vnode *vp; { panic("softdep_slowdown called"); } int softdep_request_cleanup(fs, vp, cred, resource) struct fs *fs; struct vnode *vp; struct ucred *cred; int resource; { return (0); } int softdep_check_suspend(struct mount *mp, struct vnode *devvp, int softdep_depcnt, int softdep_accdepcnt, int secondary_writes, int secondary_accwrites) { struct bufobj *bo; int error; (void) softdep_depcnt, (void) softdep_accdepcnt; bo = &devvp->v_bufobj; ASSERT_BO_WLOCKED(bo); MNT_ILOCK(mp); while (mp->mnt_secondary_writes != 0) { BO_UNLOCK(bo); msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), (PUSER - 1) | PDROP, "secwr", 0); BO_LOCK(bo); MNT_ILOCK(mp); } /* * Reasons for needing more work before suspend: * - Dirty buffers on devvp. * - Secondary writes occurred after start of vnode sync loop */ error = 0; if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0 || secondary_writes != 0 || mp->mnt_secondary_writes != 0 || secondary_accwrites != mp->mnt_secondary_accwrites) error = EAGAIN; BO_UNLOCK(bo); return (error); } void softdep_get_depcounts(struct mount *mp, int *softdepactivep, int *softdepactiveaccp) { (void) mp; *softdepactivep = 0; *softdepactiveaccp = 0; } void softdep_buf_append(bp, wkhd) struct buf *bp; struct workhead *wkhd; { panic("softdep_buf_appendwork called"); } void softdep_inode_append(ip, cred, wkhd) struct inode *ip; struct ucred *cred; struct workhead *wkhd; { panic("softdep_inode_appendwork called"); } void softdep_freework(wkhd) struct workhead *wkhd; { panic("softdep_freework called"); } #else FEATURE(softupdates, "FFS soft-updates support"); static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0, "soft updates stats"); static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0, "total dependencies allocated"); static SYSCTL_NODE(_debug_softdep, OID_AUTO, highuse, CTLFLAG_RW, 0, "high use dependencies allocated"); static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0, "current dependencies allocated"); static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW, 0, "current dependencies written"); unsigned long dep_current[D_LAST + 1]; unsigned long dep_highuse[D_LAST + 1]; unsigned long dep_total[D_LAST + 1]; unsigned long dep_write[D_LAST + 1]; #define SOFTDEP_TYPE(type, str, long) \ static MALLOC_DEFINE(M_ ## type, #str, long); \ SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD, \ &dep_total[D_ ## type], 0, ""); \ SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, \ &dep_current[D_ ## type], 0, ""); \ SYSCTL_ULONG(_debug_softdep_highuse, OID_AUTO, str, CTLFLAG_RD, \ &dep_highuse[D_ ## type], 0, ""); \ SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD, \ &dep_write[D_ ## type], 0, ""); SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies"); SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies"); SOFTDEP_TYPE(BMSAFEMAP, bmsafemap, "Block or frag allocated from cyl group map"); SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency"); SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode"); SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies"); SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block"); SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode"); SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode"); SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated"); SOFTDEP_TYPE(DIRADD, diradd, "New directory entry"); SOFTDEP_TYPE(MKDIR, mkdir, "New directory"); SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted"); SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block"); SOFTDEP_TYPE(FREEWORK, freework, "free an inode block"); SOFTDEP_TYPE(FREEDEP, freedep, "track a block free"); SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add"); SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove"); SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move"); SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block"); SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block"); SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag"); SOFTDEP_TYPE(JSEG, jseg, "Journal segment"); SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete"); SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency"); SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation"); SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete"); static MALLOC_DEFINE(M_SENTINEL, "sentinel", "Worklist sentinel"); static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes"); static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations"); static MALLOC_DEFINE(M_MOUNTDATA, "softdep", "Softdep per-mount data"); #define M_SOFTDEP_FLAGS (M_WAITOK) /* * translate from workitem type to memory type * MUST match the defines above, such that memtype[D_XXX] == M_XXX */ static struct malloc_type *memtype[] = { M_PAGEDEP, M_INODEDEP, M_BMSAFEMAP, M_NEWBLK, M_ALLOCDIRECT, M_INDIRDEP, M_ALLOCINDIR, M_FREEFRAG, M_FREEBLKS, M_FREEFILE, M_DIRADD, M_MKDIR, M_DIRREM, M_NEWDIRBLK, M_FREEWORK, M_FREEDEP, M_JADDREF, M_JREMREF, M_JMVREF, M_JNEWBLK, M_JFREEBLK, M_JFREEFRAG, M_JSEG, M_JSEGDEP, M_SBDEP, M_JTRUNC, M_JFSYNC, M_SENTINEL }; #define DtoM(type) (memtype[type]) /* * Names of malloc types. */ #define TYPENAME(type) \ ((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???") /* * End system adaptation definitions. */ #define DOTDOT_OFFSET offsetof(struct dirtemplate, dotdot_ino) #define DOT_OFFSET offsetof(struct dirtemplate, dot_ino) /* * Internal function prototypes. */ static void check_clear_deps(struct mount *); static void softdep_error(char *, int); static int softdep_process_worklist(struct mount *, int); static int softdep_waitidle(struct mount *, int); static void drain_output(struct vnode *); static struct buf *getdirtybuf(struct buf *, struct rwlock *, int); static void clear_remove(struct mount *); static void clear_inodedeps(struct mount *); static void unlinked_inodedep(struct mount *, struct inodedep *); static void clear_unlinked_inodedep(struct inodedep *); static struct inodedep *first_unlinked_inodedep(struct ufsmount *); static int flush_pagedep_deps(struct vnode *, struct mount *, struct diraddhd *); static int free_pagedep(struct pagedep *); static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t); static int flush_inodedep_deps(struct vnode *, struct mount *, ino_t); static int flush_deplist(struct allocdirectlst *, int, int *); static int sync_cgs(struct mount *, int); static int handle_written_filepage(struct pagedep *, struct buf *); static int handle_written_sbdep(struct sbdep *, struct buf *); static void initiate_write_sbdep(struct sbdep *); static void diradd_inode_written(struct diradd *, struct inodedep *); static int handle_written_indirdep(struct indirdep *, struct buf *, struct buf**); static int handle_written_inodeblock(struct inodedep *, struct buf *); static int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *, uint8_t *); static int handle_written_bmsafemap(struct bmsafemap *, struct buf *); static void handle_written_jaddref(struct jaddref *); static void handle_written_jremref(struct jremref *); static void handle_written_jseg(struct jseg *, struct buf *); static void handle_written_jnewblk(struct jnewblk *); static void handle_written_jblkdep(struct jblkdep *); static void handle_written_jfreefrag(struct jfreefrag *); static void complete_jseg(struct jseg *); static void complete_jsegs(struct jseg *); static void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *); static void jaddref_write(struct jaddref *, struct jseg *, uint8_t *); static void jremref_write(struct jremref *, struct jseg *, uint8_t *); static void jmvref_write(struct jmvref *, struct jseg *, uint8_t *); static void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *); static void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data); static void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *); static void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *); static void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *); static inline void inoref_write(struct inoref *, struct jseg *, struct jrefrec *); static void handle_allocdirect_partdone(struct allocdirect *, struct workhead *); static struct jnewblk *cancel_newblk(struct newblk *, struct worklist *, struct workhead *); static void indirdep_complete(struct indirdep *); static int indirblk_lookup(struct mount *, ufs2_daddr_t); static void indirblk_insert(struct freework *); static void indirblk_remove(struct freework *); static void handle_allocindir_partdone(struct allocindir *); static void initiate_write_filepage(struct pagedep *, struct buf *); static void initiate_write_indirdep(struct indirdep*, struct buf *); static void handle_written_mkdir(struct mkdir *, int); static int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *, uint8_t *); static void initiate_write_bmsafemap(struct bmsafemap *, struct buf *); static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *); static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *); static void handle_workitem_freefile(struct freefile *); static int handle_workitem_remove(struct dirrem *, int); static struct dirrem *newdirrem(struct buf *, struct inode *, struct inode *, int, struct dirrem **); static struct indirdep *indirdep_lookup(struct mount *, struct inode *, struct buf *); static void cancel_indirdep(struct indirdep *, struct buf *, struct freeblks *); static void free_indirdep(struct indirdep *); static void free_diradd(struct diradd *, struct workhead *); static void merge_diradd(struct inodedep *, struct diradd *); static void complete_diradd(struct diradd *); static struct diradd *diradd_lookup(struct pagedep *, int); static struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *, struct jremref *); static struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *, struct jremref *); static void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *, struct jremref *, struct jremref *); static void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *, struct jremref *); static void cancel_allocindir(struct allocindir *, struct buf *bp, struct freeblks *, int); static int setup_trunc_indir(struct freeblks *, struct inode *, ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t); static void complete_trunc_indir(struct freework *); static void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *, int); static void complete_mkdir(struct mkdir *); static void free_newdirblk(struct newdirblk *); static void free_jremref(struct jremref *); static void free_jaddref(struct jaddref *); static void free_jsegdep(struct jsegdep *); static void free_jsegs(struct jblocks *); static void rele_jseg(struct jseg *); static void free_jseg(struct jseg *, struct jblocks *); static void free_jnewblk(struct jnewblk *); static void free_jblkdep(struct jblkdep *); static void free_jfreefrag(struct jfreefrag *); static void free_freedep(struct freedep *); static void journal_jremref(struct dirrem *, struct jremref *, struct inodedep *); static void cancel_jnewblk(struct jnewblk *, struct workhead *); static int cancel_jaddref(struct jaddref *, struct inodedep *, struct workhead *); static void cancel_jfreefrag(struct jfreefrag *); static inline void setup_freedirect(struct freeblks *, struct inode *, int, int); static inline void setup_freeext(struct freeblks *, struct inode *, int, int); static inline void setup_freeindir(struct freeblks *, struct inode *, int, ufs_lbn_t, int); static inline struct freeblks *newfreeblks(struct mount *, struct inode *); static void freeblks_free(struct ufsmount *, struct freeblks *, int); static void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t); static ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t); static int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int); static void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t, int, int); static void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int); static int cancel_pagedep(struct pagedep *, struct freeblks *, int); static int deallocate_dependencies(struct buf *, struct freeblks *, int); static void newblk_freefrag(struct newblk*); static void free_newblk(struct newblk *); static void cancel_allocdirect(struct allocdirectlst *, struct allocdirect *, struct freeblks *); static int check_inode_unwritten(struct inodedep *); static int free_inodedep(struct inodedep *); static void freework_freeblock(struct freework *); static void freework_enqueue(struct freework *); static int handle_workitem_freeblocks(struct freeblks *, int); static int handle_complete_freeblocks(struct freeblks *, int); static void handle_workitem_indirblk(struct freework *); static void handle_written_freework(struct freework *); static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *); static struct worklist *jnewblk_merge(struct worklist *, struct worklist *, struct workhead *); static struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *, struct inodedep *, struct allocindir *, ufs_lbn_t); static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t, ufs2_daddr_t, ufs_lbn_t); static void handle_workitem_freefrag(struct freefrag *); static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long, ufs_lbn_t); static void allocdirect_merge(struct allocdirectlst *, struct allocdirect *, struct allocdirect *); static struct freefrag *allocindir_merge(struct allocindir *, struct allocindir *); static int bmsafemap_find(struct bmsafemap_hashhead *, int, struct bmsafemap **); static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *, int cg, struct bmsafemap *); static int newblk_find(struct newblk_hashhead *, ufs2_daddr_t, int, struct newblk **); static int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **); static int inodedep_find(struct inodedep_hashhead *, ino_t, struct inodedep **); static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **); static int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t, int, struct pagedep **); static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t, struct pagedep **); static void pause_timer(void *); static int request_cleanup(struct mount *, int); static int process_worklist_item(struct mount *, int, int); static void process_removes(struct vnode *); static void process_truncates(struct vnode *); static void jwork_move(struct workhead *, struct workhead *); static void jwork_insert(struct workhead *, struct jsegdep *); static void add_to_worklist(struct worklist *, int); static void wake_worklist(struct worklist *); static void wait_worklist(struct worklist *, char *); static void remove_from_worklist(struct worklist *); static void softdep_flush(void *); static void softdep_flushjournal(struct mount *); static int softdep_speedup(struct ufsmount *); static void worklist_speedup(struct mount *); static int journal_mount(struct mount *, struct fs *, struct ucred *); static void journal_unmount(struct ufsmount *); static int journal_space(struct ufsmount *, int); static void journal_suspend(struct ufsmount *); static int journal_unsuspend(struct ufsmount *ump); static void softdep_prelink(struct vnode *, struct vnode *); static void add_to_journal(struct worklist *); static void remove_from_journal(struct worklist *); static void softdep_process_journal(struct mount *, struct worklist *, int); static struct jremref *newjremref(struct dirrem *, struct inode *, struct inode *ip, off_t, nlink_t); static struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t, uint16_t); static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t, uint16_t); static inline struct jsegdep *inoref_jseg(struct inoref *); static struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t); static struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t, ufs2_daddr_t, int); static void adjust_newfreework(struct freeblks *, int); static struct jtrunc *newjtrunc(struct freeblks *, off_t, int); static void move_newblock_dep(struct jaddref *, struct inodedep *); static void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t); static struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *, ufs2_daddr_t, long, ufs_lbn_t); static struct freework *newfreework(struct ufsmount *, struct freeblks *, struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int); static int jwait(struct worklist *, int); static struct inodedep *inodedep_lookup_ip(struct inode *); static int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *); static struct freefile *handle_bufwait(struct inodedep *, struct workhead *); static void handle_jwork(struct workhead *); static struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *, struct mkdir **); static struct jblocks *jblocks_create(void); static ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *); static void jblocks_free(struct jblocks *, struct mount *, int); static void jblocks_destroy(struct jblocks *); static void jblocks_add(struct jblocks *, ufs2_daddr_t, int); /* * Exported softdep operations. */ static void softdep_disk_io_initiation(struct buf *); static void softdep_disk_write_complete(struct buf *); static void softdep_deallocate_dependencies(struct buf *); static int softdep_count_dependencies(struct buf *bp, int); /* * Global lock over all of soft updates. */ static struct mtx lk; MTX_SYSINIT(softdep_lock, &lk, "Global Softdep Lock", MTX_DEF); #define ACQUIRE_GBLLOCK(lk) mtx_lock(lk) #define FREE_GBLLOCK(lk) mtx_unlock(lk) #define GBLLOCK_OWNED(lk) mtx_assert((lk), MA_OWNED) /* * Per-filesystem soft-updates locking. */ #define LOCK_PTR(ump) (&(ump)->um_softdep->sd_fslock) #define TRY_ACQUIRE_LOCK(ump) rw_try_wlock(&(ump)->um_softdep->sd_fslock) #define ACQUIRE_LOCK(ump) rw_wlock(&(ump)->um_softdep->sd_fslock) #define FREE_LOCK(ump) rw_wunlock(&(ump)->um_softdep->sd_fslock) #define LOCK_OWNED(ump) rw_assert(&(ump)->um_softdep->sd_fslock, \ RA_WLOCKED) #define BUF_AREC(bp) lockallowrecurse(&(bp)->b_lock) #define BUF_NOREC(bp) lockdisablerecurse(&(bp)->b_lock) /* * Worklist queue management. * These routines require that the lock be held. */ #ifndef /* NOT */ DEBUG #define WORKLIST_INSERT(head, item) do { \ (item)->wk_state |= ONWORKLIST; \ LIST_INSERT_HEAD(head, item, wk_list); \ } while (0) #define WORKLIST_REMOVE(item) do { \ (item)->wk_state &= ~ONWORKLIST; \ LIST_REMOVE(item, wk_list); \ } while (0) #define WORKLIST_INSERT_UNLOCKED WORKLIST_INSERT #define WORKLIST_REMOVE_UNLOCKED WORKLIST_REMOVE #else /* DEBUG */ static void worklist_insert(struct workhead *, struct worklist *, int); static void worklist_remove(struct worklist *, int); #define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1) #define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0) #define WORKLIST_REMOVE(item) worklist_remove(item, 1) #define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0) static void worklist_insert(head, item, locked) struct workhead *head; struct worklist *item; int locked; { if (locked) LOCK_OWNED(VFSTOUFS(item->wk_mp)); if (item->wk_state & ONWORKLIST) panic("worklist_insert: %p %s(0x%X) already on list", item, TYPENAME(item->wk_type), item->wk_state); item->wk_state |= ONWORKLIST; LIST_INSERT_HEAD(head, item, wk_list); } static void worklist_remove(item, locked) struct worklist *item; int locked; { if (locked) LOCK_OWNED(VFSTOUFS(item->wk_mp)); if ((item->wk_state & ONWORKLIST) == 0) panic("worklist_remove: %p %s(0x%X) not on list", item, TYPENAME(item->wk_type), item->wk_state); item->wk_state &= ~ONWORKLIST; LIST_REMOVE(item, wk_list); } #endif /* DEBUG */ /* * Merge two jsegdeps keeping only the oldest one as newer references * can't be discarded until after older references. */ static inline struct jsegdep * jsegdep_merge(struct jsegdep *one, struct jsegdep *two) { struct jsegdep *swp; if (two == NULL) return (one); if (one->jd_seg->js_seq > two->jd_seg->js_seq) { swp = one; one = two; two = swp; } WORKLIST_REMOVE(&two->jd_list); free_jsegdep(two); return (one); } /* * If two freedeps are compatible free one to reduce list size. */ static inline struct freedep * freedep_merge(struct freedep *one, struct freedep *two) { if (two == NULL) return (one); if (one->fd_freework == two->fd_freework) { WORKLIST_REMOVE(&two->fd_list); free_freedep(two); } return (one); } /* * Move journal work from one list to another. Duplicate freedeps and * jsegdeps are coalesced to keep the lists as small as possible. */ static void jwork_move(dst, src) struct workhead *dst; struct workhead *src; { struct freedep *freedep; struct jsegdep *jsegdep; struct worklist *wkn; struct worklist *wk; KASSERT(dst != src, ("jwork_move: dst == src")); freedep = NULL; jsegdep = NULL; LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) { if (wk->wk_type == D_JSEGDEP) jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); else if (wk->wk_type == D_FREEDEP) freedep = freedep_merge(WK_FREEDEP(wk), freedep); } while ((wk = LIST_FIRST(src)) != NULL) { WORKLIST_REMOVE(wk); WORKLIST_INSERT(dst, wk); if (wk->wk_type == D_JSEGDEP) { jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); continue; } if (wk->wk_type == D_FREEDEP) freedep = freedep_merge(WK_FREEDEP(wk), freedep); } } static void jwork_insert(dst, jsegdep) struct workhead *dst; struct jsegdep *jsegdep; { struct jsegdep *jsegdepn; struct worklist *wk; LIST_FOREACH(wk, dst, wk_list) if (wk->wk_type == D_JSEGDEP) break; if (wk == NULL) { WORKLIST_INSERT(dst, &jsegdep->jd_list); return; } jsegdepn = WK_JSEGDEP(wk); if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) { WORKLIST_REMOVE(wk); free_jsegdep(jsegdepn); WORKLIST_INSERT(dst, &jsegdep->jd_list); } else free_jsegdep(jsegdep); } /* * Routines for tracking and managing workitems. */ static void workitem_free(struct worklist *, int); static void workitem_alloc(struct worklist *, int, struct mount *); static void workitem_reassign(struct worklist *, int); #define WORKITEM_FREE(item, type) \ workitem_free((struct worklist *)(item), (type)) #define WORKITEM_REASSIGN(item, type) \ workitem_reassign((struct worklist *)(item), (type)) static void workitem_free(item, type) struct worklist *item; int type; { struct ufsmount *ump; #ifdef DEBUG if (item->wk_state & ONWORKLIST) panic("workitem_free: %s(0x%X) still on list", TYPENAME(item->wk_type), item->wk_state); if (item->wk_type != type && type != D_NEWBLK) panic("workitem_free: type mismatch %s != %s", TYPENAME(item->wk_type), TYPENAME(type)); #endif if (item->wk_state & IOWAITING) wakeup(item); ump = VFSTOUFS(item->wk_mp); LOCK_OWNED(ump); KASSERT(ump->softdep_deps > 0, ("workitem_free: %s: softdep_deps going negative", ump->um_fs->fs_fsmnt)); if (--ump->softdep_deps == 0 && ump->softdep_req) wakeup(&ump->softdep_deps); KASSERT(dep_current[item->wk_type] > 0, ("workitem_free: %s: dep_current[%s] going negative", ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type))); KASSERT(ump->softdep_curdeps[item->wk_type] > 0, ("workitem_free: %s: softdep_curdeps[%s] going negative", ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type))); atomic_subtract_long(&dep_current[item->wk_type], 1); ump->softdep_curdeps[item->wk_type] -= 1; free(item, DtoM(type)); } static void workitem_alloc(item, type, mp) struct worklist *item; int type; struct mount *mp; { struct ufsmount *ump; item->wk_type = type; item->wk_mp = mp; item->wk_state = 0; ump = VFSTOUFS(mp); ACQUIRE_GBLLOCK(&lk); dep_current[type]++; if (dep_current[type] > dep_highuse[type]) dep_highuse[type] = dep_current[type]; dep_total[type]++; FREE_GBLLOCK(&lk); ACQUIRE_LOCK(ump); ump->softdep_curdeps[type] += 1; ump->softdep_deps++; ump->softdep_accdeps++; FREE_LOCK(ump); } static void workitem_reassign(item, newtype) struct worklist *item; int newtype; { struct ufsmount *ump; ump = VFSTOUFS(item->wk_mp); LOCK_OWNED(ump); KASSERT(ump->softdep_curdeps[item->wk_type] > 0, ("workitem_reassign: %s: softdep_curdeps[%s] going negative", VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type))); ump->softdep_curdeps[item->wk_type] -= 1; ump->softdep_curdeps[newtype] += 1; KASSERT(dep_current[item->wk_type] > 0, ("workitem_reassign: %s: dep_current[%s] going negative", VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type))); ACQUIRE_GBLLOCK(&lk); dep_current[newtype]++; dep_current[item->wk_type]--; if (dep_current[newtype] > dep_highuse[newtype]) dep_highuse[newtype] = dep_current[newtype]; dep_total[newtype]++; FREE_GBLLOCK(&lk); item->wk_type = newtype; } /* * Workitem queue management */ static int max_softdeps; /* maximum number of structs before slowdown */ static int tickdelay = 2; /* number of ticks to pause during slowdown */ static int proc_waiting; /* tracks whether we have a timeout posted */ static int *stat_countp; /* statistic to count in proc_waiting timeout */ static struct callout softdep_callout; static int req_clear_inodedeps; /* syncer process flush some inodedeps */ static int req_clear_remove; /* syncer process flush some freeblks */ static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */ /* * runtime statistics */ static int stat_flush_threads; /* number of softdep flushing threads */ static int stat_worklist_push; /* number of worklist cleanups */ static int stat_blk_limit_push; /* number of times block limit neared */ static int stat_ino_limit_push; /* number of times inode limit neared */ static int stat_blk_limit_hit; /* number of times block slowdown imposed */ static int stat_ino_limit_hit; /* number of times inode slowdown imposed */ static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */ static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */ static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */ static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */ static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */ static int stat_jaddref; /* bufs redirtied as ino bitmap can not write */ static int stat_jnewblk; /* bufs redirtied as blk bitmap can not write */ static int stat_journal_min; /* Times hit journal min threshold */ static int stat_journal_low; /* Times hit journal low threshold */ static int stat_journal_wait; /* Times blocked in jwait(). */ static int stat_jwait_filepage; /* Times blocked in jwait() for filepage. */ static int stat_jwait_freeblks; /* Times blocked in jwait() for freeblks. */ static int stat_jwait_inode; /* Times blocked in jwait() for inodes. */ static int stat_jwait_newblk; /* Times blocked in jwait() for newblks. */ static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */ static int stat_cleanup_blkrequests; /* Number of block cleanup requests */ static int stat_cleanup_inorequests; /* Number of inode cleanup requests */ static int stat_cleanup_retries; /* Number of cleanups that needed to flush */ static int stat_cleanup_failures; /* Number of cleanup requests that failed */ static int stat_emptyjblocks; /* Number of potentially empty journal blocks */ SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, flush_threads, CTLFLAG_RD, &stat_flush_threads, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW, &stat_worklist_push, 0,""); SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW, &stat_blk_limit_push, 0,""); SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW, &stat_ino_limit_push, 0,""); SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW, &stat_blk_limit_hit, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW, &stat_ino_limit_hit, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW, &stat_sync_limit_hit, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW, &stat_indir_blk_ptrs, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW, &stat_inode_bitmap, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW, &stat_direct_blk_ptrs, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW, &stat_dir_entry, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW, &stat_jaddref, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW, &stat_jnewblk, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW, &stat_journal_low, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW, &stat_journal_min, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW, &stat_journal_wait, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW, &stat_jwait_filepage, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW, &stat_jwait_freeblks, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW, &stat_jwait_inode, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW, &stat_jwait_newblk, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW, &stat_cleanup_blkrequests, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW, &stat_cleanup_inorequests, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW, &stat_cleanup_high_delay, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW, &stat_cleanup_retries, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW, &stat_cleanup_failures, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW, &softdep_flushcache, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, emptyjblocks, CTLFLAG_RD, &stat_emptyjblocks, 0, ""); SYSCTL_DECL(_vfs_ffs); /* Whether to recompute the summary at mount time */ static int compute_summary_at_mount = 0; SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW, &compute_summary_at_mount, 0, "Recompute summary at mount"); static int print_threads = 0; SYSCTL_INT(_debug_softdep, OID_AUTO, print_threads, CTLFLAG_RW, &print_threads, 0, "Notify flusher thread start/stop"); /* List of all filesystems mounted with soft updates */ static TAILQ_HEAD(, mount_softdeps) softdepmounts; /* * This function cleans the worklist for a filesystem. * Each filesystem running with soft dependencies gets its own * thread to run in this function. The thread is started up in * softdep_mount and shutdown in softdep_unmount. They show up * as part of the kernel "bufdaemon" process whose process * entry is available in bufdaemonproc. */ static int searchfailed; extern struct proc *bufdaemonproc; static void softdep_flush(addr) void *addr; { struct mount *mp; struct thread *td; struct ufsmount *ump; td = curthread; td->td_pflags |= TDP_NORUNNINGBUF; mp = (struct mount *)addr; ump = VFSTOUFS(mp); atomic_add_int(&stat_flush_threads, 1); ACQUIRE_LOCK(ump); ump->softdep_flags &= ~FLUSH_STARTING; wakeup(&ump->softdep_flushtd); FREE_LOCK(ump); if (print_threads) { if (stat_flush_threads == 1) printf("Running %s at pid %d\n", bufdaemonproc->p_comm, bufdaemonproc->p_pid); printf("Start thread %s\n", td->td_name); } for (;;) { while (softdep_process_worklist(mp, 0) > 0 || (MOUNTEDSUJ(mp) && VFSTOUFS(mp)->softdep_jblocks->jb_suspended)) kthread_suspend_check(); ACQUIRE_LOCK(ump); - while ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0) + if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0) msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, "sdflush", hz / 2); ump->softdep_flags &= ~FLUSH_CLEANUP; /* * Check to see if we are done and need to exit. */ if ((ump->softdep_flags & FLUSH_EXIT) == 0) { FREE_LOCK(ump); continue; } ump->softdep_flags &= ~FLUSH_EXIT; FREE_LOCK(ump); wakeup(&ump->softdep_flags); if (print_threads) printf("Stop thread %s: searchfailed %d, did cleanups %d\n", td->td_name, searchfailed, ump->um_softdep->sd_cleanups); atomic_subtract_int(&stat_flush_threads, 1); kthread_exit(); panic("kthread_exit failed\n"); } } static void worklist_speedup(mp) struct mount *mp; { struct ufsmount *ump; ump = VFSTOUFS(mp); LOCK_OWNED(ump); - if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0) { + if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0) ump->softdep_flags |= FLUSH_CLEANUP; - wakeup(&ump->softdep_flushtd); - } + wakeup(&ump->softdep_flushtd); } static int softdep_speedup(ump) struct ufsmount *ump; { struct ufsmount *altump; struct mount_softdeps *sdp; LOCK_OWNED(ump); worklist_speedup(ump->um_mountp); bd_speedup(); /* * If we have global shortages, then we need other * filesystems to help with the cleanup. Here we wakeup a * flusher thread for a filesystem that is over its fair * share of resources. */ if (req_clear_inodedeps || req_clear_remove) { ACQUIRE_GBLLOCK(&lk); TAILQ_FOREACH(sdp, &softdepmounts, sd_next) { if ((altump = sdp->sd_ump) == ump) continue; if (((req_clear_inodedeps && altump->softdep_curdeps[D_INODEDEP] > max_softdeps / stat_flush_threads) || (req_clear_remove && altump->softdep_curdeps[D_DIRREM] > (max_softdeps / 2) / stat_flush_threads)) && TRY_ACQUIRE_LOCK(altump)) break; } if (sdp == NULL) { searchfailed++; FREE_GBLLOCK(&lk); } else { /* * Move to the end of the list so we pick a * different one on out next try. */ TAILQ_REMOVE(&softdepmounts, sdp, sd_next); TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next); FREE_GBLLOCK(&lk); if ((altump->softdep_flags & - (FLUSH_CLEANUP | FLUSH_EXIT)) == 0) { + (FLUSH_CLEANUP | FLUSH_EXIT)) == 0) altump->softdep_flags |= FLUSH_CLEANUP; - altump->um_softdep->sd_cleanups++; - wakeup(&altump->softdep_flushtd); - } + altump->um_softdep->sd_cleanups++; + wakeup(&altump->softdep_flushtd); FREE_LOCK(altump); } } return (speedup_syncer()); } /* * Add an item to the end of the work queue. * This routine requires that the lock be held. * This is the only routine that adds items to the list. * The following routine is the only one that removes items * and does so in order from first to last. */ #define WK_HEAD 0x0001 /* Add to HEAD. */ #define WK_NODELAY 0x0002 /* Process immediately. */ static void add_to_worklist(wk, flags) struct worklist *wk; int flags; { struct ufsmount *ump; ump = VFSTOUFS(wk->wk_mp); LOCK_OWNED(ump); if (wk->wk_state & ONWORKLIST) panic("add_to_worklist: %s(0x%X) already on list", TYPENAME(wk->wk_type), wk->wk_state); wk->wk_state |= ONWORKLIST; if (ump->softdep_on_worklist == 0) { LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); ump->softdep_worklist_tail = wk; } else if (flags & WK_HEAD) { LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); } else { LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list); ump->softdep_worklist_tail = wk; } ump->softdep_on_worklist += 1; if (flags & WK_NODELAY) worklist_speedup(wk->wk_mp); } /* * Remove the item to be processed. If we are removing the last * item on the list, we need to recalculate the tail pointer. */ static void remove_from_worklist(wk) struct worklist *wk; { struct ufsmount *ump; ump = VFSTOUFS(wk->wk_mp); WORKLIST_REMOVE(wk); if (ump->softdep_worklist_tail == wk) ump->softdep_worklist_tail = (struct worklist *)wk->wk_list.le_prev; ump->softdep_on_worklist -= 1; } static void wake_worklist(wk) struct worklist *wk; { if (wk->wk_state & IOWAITING) { wk->wk_state &= ~IOWAITING; wakeup(wk); } } static void wait_worklist(wk, wmesg) struct worklist *wk; char *wmesg; { struct ufsmount *ump; ump = VFSTOUFS(wk->wk_mp); wk->wk_state |= IOWAITING; msleep(wk, LOCK_PTR(ump), PVM, wmesg, 0); } /* * Process that runs once per second to handle items in the background queue. * * Note that we ensure that everything is done in the order in which they * appear in the queue. The code below depends on this property to ensure * that blocks of a file are freed before the inode itself is freed. This * ordering ensures that no new triples will be generated * until all the old ones have been purged from the dependency lists. */ static int softdep_process_worklist(mp, full) struct mount *mp; int full; { int cnt, matchcnt; struct ufsmount *ump; long starttime; KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp")); if (MOUNTEDSOFTDEP(mp) == 0) return (0); matchcnt = 0; ump = VFSTOUFS(mp); ACQUIRE_LOCK(ump); starttime = time_second; softdep_process_journal(mp, NULL, full ? MNT_WAIT : 0); check_clear_deps(mp); while (ump->softdep_on_worklist > 0) { if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0) break; else matchcnt += cnt; check_clear_deps(mp); /* * We do not generally want to stop for buffer space, but if * we are really being a buffer hog, we will stop and wait. */ if (should_yield()) { FREE_LOCK(ump); kern_yield(PRI_USER); bwillwrite(); ACQUIRE_LOCK(ump); } /* * Never allow processing to run for more than one * second. This gives the syncer thread the opportunity * to pause if appropriate. */ if (!full && starttime != time_second) break; } if (full == 0) journal_unsuspend(ump); FREE_LOCK(ump); return (matchcnt); } /* * Process all removes associated with a vnode if we are running out of * journal space. Any other process which attempts to flush these will * be unable as we have the vnodes locked. */ static void process_removes(vp) struct vnode *vp; { struct inodedep *inodedep; struct dirrem *dirrem; struct ufsmount *ump; struct mount *mp; ino_t inum; mp = vp->v_mount; ump = VFSTOUFS(mp); LOCK_OWNED(ump); inum = VTOI(vp)->i_number; for (;;) { top: if (inodedep_lookup(mp, inum, 0, &inodedep) == 0) return; LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) { /* * If another thread is trying to lock this vnode * it will fail but we must wait for it to do so * before we can proceed. */ if (dirrem->dm_state & INPROGRESS) { wait_worklist(&dirrem->dm_list, "pwrwait"); goto top; } if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) == (COMPLETE | ONWORKLIST)) break; } if (dirrem == NULL) return; remove_from_worklist(&dirrem->dm_list); FREE_LOCK(ump); if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) panic("process_removes: suspended filesystem"); handle_workitem_remove(dirrem, 0); vn_finished_secondary_write(mp); ACQUIRE_LOCK(ump); } } /* * Process all truncations associated with a vnode if we are running out * of journal space. This is called when the vnode lock is already held * and no other process can clear the truncation. This function returns * a value greater than zero if it did any work. */ static void process_truncates(vp) struct vnode *vp; { struct inodedep *inodedep; struct freeblks *freeblks; struct ufsmount *ump; struct mount *mp; ino_t inum; int cgwait; mp = vp->v_mount; ump = VFSTOUFS(mp); LOCK_OWNED(ump); inum = VTOI(vp)->i_number; for (;;) { if (inodedep_lookup(mp, inum, 0, &inodedep) == 0) return; cgwait = 0; TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) { /* Journal entries not yet written. */ if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) { jwait(&LIST_FIRST( &freeblks->fb_jblkdephd)->jb_list, MNT_WAIT); break; } /* Another thread is executing this item. */ if (freeblks->fb_state & INPROGRESS) { wait_worklist(&freeblks->fb_list, "ptrwait"); break; } /* Freeblks is waiting on a inode write. */ if ((freeblks->fb_state & COMPLETE) == 0) { FREE_LOCK(ump); ffs_update(vp, 1); ACQUIRE_LOCK(ump); break; } if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) == (ALLCOMPLETE | ONWORKLIST)) { remove_from_worklist(&freeblks->fb_list); freeblks->fb_state |= INPROGRESS; FREE_LOCK(ump); if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) panic("process_truncates: " "suspended filesystem"); handle_workitem_freeblocks(freeblks, 0); vn_finished_secondary_write(mp); ACQUIRE_LOCK(ump); break; } if (freeblks->fb_cgwait) cgwait++; } if (cgwait) { FREE_LOCK(ump); sync_cgs(mp, MNT_WAIT); ffs_sync_snap(mp, MNT_WAIT); ACQUIRE_LOCK(ump); continue; } if (freeblks == NULL) break; } return; } /* * Process one item on the worklist. */ static int process_worklist_item(mp, target, flags) struct mount *mp; int target; int flags; { struct worklist sentinel; struct worklist *wk; struct ufsmount *ump; int matchcnt; int error; KASSERT(mp != NULL, ("process_worklist_item: NULL mp")); /* * If we are being called because of a process doing a * copy-on-write, then it is not safe to write as we may * recurse into the copy-on-write routine. */ if (curthread->td_pflags & TDP_COWINPROGRESS) return (-1); PHOLD(curproc); /* Don't let the stack go away. */ ump = VFSTOUFS(mp); LOCK_OWNED(ump); matchcnt = 0; sentinel.wk_mp = NULL; sentinel.wk_type = D_SENTINEL; LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list); for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL; wk = LIST_NEXT(&sentinel, wk_list)) { if (wk->wk_type == D_SENTINEL) { LIST_REMOVE(&sentinel, wk_list); LIST_INSERT_AFTER(wk, &sentinel, wk_list); continue; } if (wk->wk_state & INPROGRESS) panic("process_worklist_item: %p already in progress.", wk); wk->wk_state |= INPROGRESS; remove_from_worklist(wk); FREE_LOCK(ump); if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) panic("process_worklist_item: suspended filesystem"); switch (wk->wk_type) { case D_DIRREM: /* removal of a directory entry */ error = handle_workitem_remove(WK_DIRREM(wk), flags); break; case D_FREEBLKS: /* releasing blocks and/or fragments from a file */ error = handle_workitem_freeblocks(WK_FREEBLKS(wk), flags); break; case D_FREEFRAG: /* releasing a fragment when replaced as a file grows */ handle_workitem_freefrag(WK_FREEFRAG(wk)); error = 0; break; case D_FREEFILE: /* releasing an inode when its link count drops to 0 */ handle_workitem_freefile(WK_FREEFILE(wk)); error = 0; break; default: panic("%s_process_worklist: Unknown type %s", "softdep", TYPENAME(wk->wk_type)); /* NOTREACHED */ } vn_finished_secondary_write(mp); ACQUIRE_LOCK(ump); if (error == 0) { if (++matchcnt == target) break; continue; } /* * We have to retry the worklist item later. Wake up any * waiters who may be able to complete it immediately and * add the item back to the head so we don't try to execute * it again. */ wk->wk_state &= ~INPROGRESS; wake_worklist(wk); add_to_worklist(wk, WK_HEAD); } LIST_REMOVE(&sentinel, wk_list); /* Sentinal could've become the tail from remove_from_worklist. */ if (ump->softdep_worklist_tail == &sentinel) ump->softdep_worklist_tail = (struct worklist *)sentinel.wk_list.le_prev; PRELE(curproc); return (matchcnt); } /* * Move dependencies from one buffer to another. */ int softdep_move_dependencies(oldbp, newbp) struct buf *oldbp; struct buf *newbp; { struct worklist *wk, *wktail; struct ufsmount *ump; int dirty; if ((wk = LIST_FIRST(&oldbp->b_dep)) == NULL) return (0); KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0, ("softdep_move_dependencies called on non-softdep filesystem")); dirty = 0; wktail = NULL; ump = VFSTOUFS(wk->wk_mp); ACQUIRE_LOCK(ump); while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) { LIST_REMOVE(wk, wk_list); if (wk->wk_type == D_BMSAFEMAP && bmsafemap_backgroundwrite(WK_BMSAFEMAP(wk), newbp)) dirty = 1; if (wktail == 0) LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list); else LIST_INSERT_AFTER(wktail, wk, wk_list); wktail = wk; } FREE_LOCK(ump); return (dirty); } /* * Purge the work list of all items associated with a particular mount point. */ int softdep_flushworklist(oldmnt, countp, td) struct mount *oldmnt; int *countp; struct thread *td; { struct vnode *devvp; int count, error = 0; struct ufsmount *ump; /* * Alternately flush the block device associated with the mount * point and process any dependencies that the flushing * creates. We continue until no more worklist dependencies * are found. */ *countp = 0; ump = VFSTOUFS(oldmnt); devvp = ump->um_devvp; while ((count = softdep_process_worklist(oldmnt, 1)) > 0) { *countp += count; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(devvp, MNT_WAIT, td); VOP_UNLOCK(devvp, 0); if (error) break; } return (error); } static int softdep_waitidle(struct mount *mp, int flags __unused) { struct ufsmount *ump; int error; int i; ump = VFSTOUFS(mp); ACQUIRE_LOCK(ump); for (i = 0; i < 10 && ump->softdep_deps; i++) { ump->softdep_req = 1; KASSERT((flags & FORCECLOSE) == 0 || ump->softdep_on_worklist == 0, ("softdep_waitidle: work added after flush")); msleep(&ump->softdep_deps, LOCK_PTR(ump), PVM, "softdeps", 1); } ump->softdep_req = 0; FREE_LOCK(ump); error = 0; if (i == 10) { error = EBUSY; printf("softdep_waitidle: Failed to flush worklist for %p\n", mp); } return (error); } /* * Flush all vnodes and worklist items associated with a specified mount point. */ int softdep_flushfiles(oldmnt, flags, td) struct mount *oldmnt; int flags; struct thread *td; { #ifdef QUOTA struct ufsmount *ump; int i; #endif int error, early, depcount, loopcnt, retry_flush_count, retry; int morework; KASSERT(MOUNTEDSOFTDEP(oldmnt) != 0, ("softdep_flushfiles called on non-softdep filesystem")); loopcnt = 10; retry_flush_count = 3; retry_flush: error = 0; /* * Alternately flush the vnodes associated with the mount * point and process any dependencies that the flushing * creates. In theory, this loop can happen at most twice, * but we give it a few extra just to be sure. */ for (; loopcnt > 0; loopcnt--) { /* * Do another flush in case any vnodes were brought in * as part of the cleanup operations. */ early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) == 0 ? 0 : EARLYFLUSH; if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0) break; if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 || depcount == 0) break; } /* * If we are unmounting then it is an error to fail. If we * are simply trying to downgrade to read-only, then filesystem * activity can keep us busy forever, so we just fail with EBUSY. */ if (loopcnt == 0) { if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) panic("softdep_flushfiles: looping"); error = EBUSY; } if (!error) error = softdep_waitidle(oldmnt, flags); if (!error) { if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) { retry = 0; MNT_ILOCK(oldmnt); KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0, ("softdep_flushfiles: !MNTK_NOINSMNTQ")); morework = oldmnt->mnt_nvnodelistsize > 0; #ifdef QUOTA ump = VFSTOUFS(oldmnt); UFS_LOCK(ump); for (i = 0; i < MAXQUOTAS; i++) { if (ump->um_quotas[i] != NULLVP) morework = 1; } UFS_UNLOCK(ump); #endif if (morework) { if (--retry_flush_count > 0) { retry = 1; loopcnt = 3; } else error = EBUSY; } MNT_IUNLOCK(oldmnt); if (retry) goto retry_flush; } } return (error); } /* * Structure hashing. * * There are four types of structures that can be looked up: * 1) pagedep structures identified by mount point, inode number, * and logical block. * 2) inodedep structures identified by mount point and inode number. * 3) newblk structures identified by mount point and * physical block number. * 4) bmsafemap structures identified by mount point and * cylinder group number. * * The "pagedep" and "inodedep" dependency structures are hashed * separately from the file blocks and inodes to which they correspond. * This separation helps when the in-memory copy of an inode or * file block must be replaced. It also obviates the need to access * an inode or file page when simply updating (or de-allocating) * dependency structures. Lookup of newblk structures is needed to * find newly allocated blocks when trying to associate them with * their allocdirect or allocindir structure. * * The lookup routines optionally create and hash a new instance when * an existing entry is not found. The bmsafemap lookup routine always * allocates a new structure if an existing one is not found. */ #define DEPALLOC 0x0001 /* allocate structure if lookup fails */ #define NODELAY 0x0002 /* cannot do background work */ /* * Structures and routines associated with pagedep caching. */ #define PAGEDEP_HASH(ump, inum, lbn) \ (&(ump)->pagedep_hashtbl[((inum) + (lbn)) & (ump)->pagedep_hash_size]) static int pagedep_find(pagedephd, ino, lbn, pagedeppp) struct pagedep_hashhead *pagedephd; ino_t ino; ufs_lbn_t lbn; struct pagedep **pagedeppp; { struct pagedep *pagedep; LIST_FOREACH(pagedep, pagedephd, pd_hash) { if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn) { *pagedeppp = pagedep; return (1); } } *pagedeppp = NULL; return (0); } /* * Look up a pagedep. Return 1 if found, 0 otherwise. * If not found, allocate if DEPALLOC flag is passed. * Found or allocated entry is returned in pagedeppp. * This routine must be called with splbio interrupts blocked. */ static int pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp) struct mount *mp; struct buf *bp; ino_t ino; ufs_lbn_t lbn; int flags; struct pagedep **pagedeppp; { struct pagedep *pagedep; struct pagedep_hashhead *pagedephd; struct worklist *wk; struct ufsmount *ump; int ret; int i; ump = VFSTOUFS(mp); LOCK_OWNED(ump); if (bp) { LIST_FOREACH(wk, &bp->b_dep, wk_list) { if (wk->wk_type == D_PAGEDEP) { *pagedeppp = WK_PAGEDEP(wk); return (1); } } } pagedephd = PAGEDEP_HASH(ump, ino, lbn); ret = pagedep_find(pagedephd, ino, lbn, pagedeppp); if (ret) { if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp) WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list); return (1); } if ((flags & DEPALLOC) == 0) return (0); FREE_LOCK(ump); pagedep = malloc(sizeof(struct pagedep), M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp); ACQUIRE_LOCK(ump); ret = pagedep_find(pagedephd, ino, lbn, pagedeppp); if (*pagedeppp) { /* * This should never happen since we only create pagedeps * with the vnode lock held. Could be an assert. */ WORKITEM_FREE(pagedep, D_PAGEDEP); return (ret); } pagedep->pd_ino = ino; pagedep->pd_lbn = lbn; LIST_INIT(&pagedep->pd_dirremhd); LIST_INIT(&pagedep->pd_pendinghd); for (i = 0; i < DAHASHSZ; i++) LIST_INIT(&pagedep->pd_diraddhd[i]); LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash); WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); *pagedeppp = pagedep; return (0); } /* * Structures and routines associated with inodedep caching. */ #define INODEDEP_HASH(ump, inum) \ (&(ump)->inodedep_hashtbl[(inum) & (ump)->inodedep_hash_size]) static int inodedep_find(inodedephd, inum, inodedeppp) struct inodedep_hashhead *inodedephd; ino_t inum; struct inodedep **inodedeppp; { struct inodedep *inodedep; LIST_FOREACH(inodedep, inodedephd, id_hash) if (inum == inodedep->id_ino) break; if (inodedep) { *inodedeppp = inodedep; return (1); } *inodedeppp = NULL; return (0); } /* * Look up an inodedep. Return 1 if found, 0 if not found. * If not found, allocate if DEPALLOC flag is passed. * Found or allocated entry is returned in inodedeppp. * This routine must be called with splbio interrupts blocked. */ static int inodedep_lookup(mp, inum, flags, inodedeppp) struct mount *mp; ino_t inum; int flags; struct inodedep **inodedeppp; { struct inodedep *inodedep; struct inodedep_hashhead *inodedephd; struct ufsmount *ump; struct fs *fs; ump = VFSTOUFS(mp); LOCK_OWNED(ump); fs = ump->um_fs; inodedephd = INODEDEP_HASH(ump, inum); if (inodedep_find(inodedephd, inum, inodedeppp)) return (1); if ((flags & DEPALLOC) == 0) return (0); /* * If the system is over its limit and our filesystem is * responsible for more than our share of that usage and * we are not in a rush, request some inodedep cleanup. */ while (dep_current[D_INODEDEP] > max_softdeps && (flags & NODELAY) == 0 && ump->softdep_curdeps[D_INODEDEP] > max_softdeps / stat_flush_threads) request_cleanup(mp, FLUSH_INODES); FREE_LOCK(ump); inodedep = malloc(sizeof(struct inodedep), M_INODEDEP, M_SOFTDEP_FLAGS); workitem_alloc(&inodedep->id_list, D_INODEDEP, mp); ACQUIRE_LOCK(ump); if (inodedep_find(inodedephd, inum, inodedeppp)) { WORKITEM_FREE(inodedep, D_INODEDEP); return (1); } inodedep->id_fs = fs; inodedep->id_ino = inum; inodedep->id_state = ALLCOMPLETE; inodedep->id_nlinkdelta = 0; inodedep->id_savedino1 = NULL; inodedep->id_savedsize = -1; inodedep->id_savedextsize = -1; inodedep->id_savednlink = -1; inodedep->id_bmsafemap = NULL; inodedep->id_mkdiradd = NULL; LIST_INIT(&inodedep->id_dirremhd); LIST_INIT(&inodedep->id_pendinghd); LIST_INIT(&inodedep->id_inowait); LIST_INIT(&inodedep->id_bufwait); TAILQ_INIT(&inodedep->id_inoreflst); TAILQ_INIT(&inodedep->id_inoupdt); TAILQ_INIT(&inodedep->id_newinoupdt); TAILQ_INIT(&inodedep->id_extupdt); TAILQ_INIT(&inodedep->id_newextupdt); TAILQ_INIT(&inodedep->id_freeblklst); LIST_INSERT_HEAD(inodedephd, inodedep, id_hash); *inodedeppp = inodedep; return (0); } /* * Structures and routines associated with newblk caching. */ #define NEWBLK_HASH(ump, inum) \ (&(ump)->newblk_hashtbl[(inum) & (ump)->newblk_hash_size]) static int newblk_find(newblkhd, newblkno, flags, newblkpp) struct newblk_hashhead *newblkhd; ufs2_daddr_t newblkno; int flags; struct newblk **newblkpp; { struct newblk *newblk; LIST_FOREACH(newblk, newblkhd, nb_hash) { if (newblkno != newblk->nb_newblkno) continue; /* * If we're creating a new dependency don't match those that * have already been converted to allocdirects. This is for * a frag extend. */ if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK) continue; break; } if (newblk) { *newblkpp = newblk; return (1); } *newblkpp = NULL; return (0); } /* * Look up a newblk. Return 1 if found, 0 if not found. * If not found, allocate if DEPALLOC flag is passed. * Found or allocated entry is returned in newblkpp. */ static int newblk_lookup(mp, newblkno, flags, newblkpp) struct mount *mp; ufs2_daddr_t newblkno; int flags; struct newblk **newblkpp; { struct newblk *newblk; struct newblk_hashhead *newblkhd; struct ufsmount *ump; ump = VFSTOUFS(mp); LOCK_OWNED(ump); newblkhd = NEWBLK_HASH(ump, newblkno); if (newblk_find(newblkhd, newblkno, flags, newblkpp)) return (1); if ((flags & DEPALLOC) == 0) return (0); FREE_LOCK(ump); newblk = malloc(sizeof(union allblk), M_NEWBLK, M_SOFTDEP_FLAGS | M_ZERO); workitem_alloc(&newblk->nb_list, D_NEWBLK, mp); ACQUIRE_LOCK(ump); if (newblk_find(newblkhd, newblkno, flags, newblkpp)) { WORKITEM_FREE(newblk, D_NEWBLK); return (1); } newblk->nb_freefrag = NULL; LIST_INIT(&newblk->nb_indirdeps); LIST_INIT(&newblk->nb_newdirblk); LIST_INIT(&newblk->nb_jwork); newblk->nb_state = ATTACHED; newblk->nb_newblkno = newblkno; LIST_INSERT_HEAD(newblkhd, newblk, nb_hash); *newblkpp = newblk; return (0); } /* * Structures and routines associated with freed indirect block caching. */ #define INDIR_HASH(ump, blkno) \ (&(ump)->indir_hashtbl[(blkno) & (ump)->indir_hash_size]) /* * Lookup an indirect block in the indir hash table. The freework is * removed and potentially freed. The caller must do a blocking journal * write before writing to the blkno. */ static int indirblk_lookup(mp, blkno) struct mount *mp; ufs2_daddr_t blkno; { struct freework *freework; struct indir_hashhead *wkhd; struct ufsmount *ump; ump = VFSTOUFS(mp); wkhd = INDIR_HASH(ump, blkno); TAILQ_FOREACH(freework, wkhd, fw_next) { if (freework->fw_blkno != blkno) continue; indirblk_remove(freework); return (1); } return (0); } /* * Insert an indirect block represented by freework into the indirblk * hash table so that it may prevent the block from being re-used prior * to the journal being written. */ static void indirblk_insert(freework) struct freework *freework; { struct jblocks *jblocks; struct jseg *jseg; struct ufsmount *ump; ump = VFSTOUFS(freework->fw_list.wk_mp); jblocks = ump->softdep_jblocks; jseg = TAILQ_LAST(&jblocks->jb_segs, jseglst); if (jseg == NULL) return; LIST_INSERT_HEAD(&jseg->js_indirs, freework, fw_segs); TAILQ_INSERT_HEAD(INDIR_HASH(ump, freework->fw_blkno), freework, fw_next); freework->fw_state &= ~DEPCOMPLETE; } static void indirblk_remove(freework) struct freework *freework; { struct ufsmount *ump; ump = VFSTOUFS(freework->fw_list.wk_mp); LIST_REMOVE(freework, fw_segs); TAILQ_REMOVE(INDIR_HASH(ump, freework->fw_blkno), freework, fw_next); freework->fw_state |= DEPCOMPLETE; if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE) WORKITEM_FREE(freework, D_FREEWORK); } /* * Executed during filesystem system initialization before * mounting any filesystems. */ void softdep_initialize() { TAILQ_INIT(&softdepmounts); max_softdeps = desiredvnodes * 4; /* initialise bioops hack */ bioops.io_start = softdep_disk_io_initiation; bioops.io_complete = softdep_disk_write_complete; bioops.io_deallocate = softdep_deallocate_dependencies; bioops.io_countdeps = softdep_count_dependencies; /* Initialize the callout with an mtx. */ callout_init_mtx(&softdep_callout, &lk, 0); } /* * Executed after all filesystems have been unmounted during * filesystem module unload. */ void softdep_uninitialize() { /* clear bioops hack */ bioops.io_start = NULL; bioops.io_complete = NULL; bioops.io_deallocate = NULL; bioops.io_countdeps = NULL; callout_drain(&softdep_callout); } /* * Called at mount time to notify the dependency code that a * filesystem wishes to use it. */ int softdep_mount(devvp, mp, fs, cred) struct vnode *devvp; struct mount *mp; struct fs *fs; struct ucred *cred; { struct csum_total cstotal; struct mount_softdeps *sdp; struct ufsmount *ump; struct cg *cgp; struct buf *bp; int i, error, cyl; sdp = malloc(sizeof(struct mount_softdeps), M_MOUNTDATA, M_WAITOK | M_ZERO); MNT_ILOCK(mp); mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP; if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) { mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) | MNTK_SOFTDEP | MNTK_NOASYNC; } ump = VFSTOUFS(mp); ump->um_softdep = sdp; MNT_IUNLOCK(mp); rw_init(LOCK_PTR(ump), "Per-Filesystem Softdep Lock"); sdp->sd_ump = ump; LIST_INIT(&ump->softdep_workitem_pending); LIST_INIT(&ump->softdep_journal_pending); TAILQ_INIT(&ump->softdep_unlinked); LIST_INIT(&ump->softdep_dirtycg); ump->softdep_worklist_tail = NULL; ump->softdep_on_worklist = 0; ump->softdep_deps = 0; LIST_INIT(&ump->softdep_mkdirlisthd); ump->pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &ump->pagedep_hash_size); ump->pagedep_nextclean = 0; ump->inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &ump->inodedep_hash_size); ump->inodedep_nextclean = 0; ump->newblk_hashtbl = hashinit(max_softdeps / 2, M_NEWBLK, &ump->newblk_hash_size); ump->bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &ump->bmsafemap_hash_size); i = 1 << (ffs(desiredvnodes / 10) - 1); ump->indir_hashtbl = malloc(i * sizeof(struct indir_hashhead), M_FREEWORK, M_WAITOK); ump->indir_hash_size = i - 1; for (i = 0; i <= ump->indir_hash_size; i++) TAILQ_INIT(&ump->indir_hashtbl[i]); ACQUIRE_GBLLOCK(&lk); TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next); FREE_GBLLOCK(&lk); if ((fs->fs_flags & FS_SUJ) && (error = journal_mount(mp, fs, cred)) != 0) { printf("Failed to start journal: %d\n", error); softdep_unmount(mp); return (error); } /* * Start our flushing thread in the bufdaemon process. */ ACQUIRE_LOCK(ump); ump->softdep_flags |= FLUSH_STARTING; FREE_LOCK(ump); kproc_kthread_add(&softdep_flush, mp, &bufdaemonproc, &ump->softdep_flushtd, 0, 0, "softdepflush", "%s worker", mp->mnt_stat.f_mntonname); ACQUIRE_LOCK(ump); while ((ump->softdep_flags & FLUSH_STARTING) != 0) { msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, "sdstart", hz / 2); } FREE_LOCK(ump); /* * When doing soft updates, the counters in the * superblock may have gotten out of sync. Recomputation * can take a long time and can be deferred for background * fsck. However, the old behavior of scanning the cylinder * groups and recalculating them at mount time is available * by setting vfs.ffs.compute_summary_at_mount to one. */ if (compute_summary_at_mount == 0 || fs->fs_clean != 0) return (0); bzero(&cstotal, sizeof cstotal); for (cyl = 0; cyl < fs->fs_ncg; cyl++) { if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)), fs->fs_cgsize, cred, &bp)) != 0) { brelse(bp); softdep_unmount(mp); return (error); } cgp = (struct cg *)bp->b_data; cstotal.cs_nffree += cgp->cg_cs.cs_nffree; cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree; cstotal.cs_nifree += cgp->cg_cs.cs_nifree; cstotal.cs_ndir += cgp->cg_cs.cs_ndir; fs->fs_cs(fs, cyl) = cgp->cg_cs; brelse(bp); } #ifdef DEBUG if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal)) printf("%s: superblock summary recomputed\n", fs->fs_fsmnt); #endif bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal); return (0); } void softdep_unmount(mp) struct mount *mp; { struct ufsmount *ump; #ifdef INVARIANTS int i; #endif KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_unmount called on non-softdep filesystem")); ump = VFSTOUFS(mp); MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_SOFTDEP; if (MOUNTEDSUJ(mp) == 0) { MNT_IUNLOCK(mp); } else { mp->mnt_flag &= ~MNT_SUJ; MNT_IUNLOCK(mp); journal_unmount(ump); } /* * Shut down our flushing thread. Check for NULL is if * softdep_mount errors out before the thread has been created. */ if (ump->softdep_flushtd != NULL) { ACQUIRE_LOCK(ump); ump->softdep_flags |= FLUSH_EXIT; wakeup(&ump->softdep_flushtd); msleep(&ump->softdep_flags, LOCK_PTR(ump), PVM | PDROP, "sdwait", 0); KASSERT((ump->softdep_flags & FLUSH_EXIT) == 0, ("Thread shutdown failed")); } /* * Free up our resources. */ ACQUIRE_GBLLOCK(&lk); TAILQ_REMOVE(&softdepmounts, ump->um_softdep, sd_next); FREE_GBLLOCK(&lk); rw_destroy(LOCK_PTR(ump)); hashdestroy(ump->pagedep_hashtbl, M_PAGEDEP, ump->pagedep_hash_size); hashdestroy(ump->inodedep_hashtbl, M_INODEDEP, ump->inodedep_hash_size); hashdestroy(ump->newblk_hashtbl, M_NEWBLK, ump->newblk_hash_size); hashdestroy(ump->bmsafemap_hashtbl, M_BMSAFEMAP, ump->bmsafemap_hash_size); free(ump->indir_hashtbl, M_FREEWORK); #ifdef INVARIANTS for (i = 0; i <= D_LAST; i++) KASSERT(ump->softdep_curdeps[i] == 0, ("Unmount %s: Dep type %s != 0 (%ld)", ump->um_fs->fs_fsmnt, TYPENAME(i), ump->softdep_curdeps[i])); #endif free(ump->um_softdep, M_MOUNTDATA); } static struct jblocks * jblocks_create(void) { struct jblocks *jblocks; jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO); TAILQ_INIT(&jblocks->jb_segs); jblocks->jb_avail = 10; jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail, M_JBLOCKS, M_WAITOK | M_ZERO); return (jblocks); } static ufs2_daddr_t jblocks_alloc(jblocks, bytes, actual) struct jblocks *jblocks; int bytes; int *actual; { ufs2_daddr_t daddr; struct jextent *jext; int freecnt; int blocks; blocks = bytes / DEV_BSIZE; jext = &jblocks->jb_extent[jblocks->jb_head]; freecnt = jext->je_blocks - jblocks->jb_off; if (freecnt == 0) { jblocks->jb_off = 0; if (++jblocks->jb_head > jblocks->jb_used) jblocks->jb_head = 0; jext = &jblocks->jb_extent[jblocks->jb_head]; freecnt = jext->je_blocks; } if (freecnt > blocks) freecnt = blocks; *actual = freecnt * DEV_BSIZE; daddr = jext->je_daddr + jblocks->jb_off; jblocks->jb_off += freecnt; jblocks->jb_free -= freecnt; return (daddr); } static void jblocks_free(jblocks, mp, bytes) struct jblocks *jblocks; struct mount *mp; int bytes; { LOCK_OWNED(VFSTOUFS(mp)); jblocks->jb_free += bytes / DEV_BSIZE; if (jblocks->jb_suspended) worklist_speedup(mp); wakeup(jblocks); } static void jblocks_destroy(jblocks) struct jblocks *jblocks; { if (jblocks->jb_extent) free(jblocks->jb_extent, M_JBLOCKS); free(jblocks, M_JBLOCKS); } static void jblocks_add(jblocks, daddr, blocks) struct jblocks *jblocks; ufs2_daddr_t daddr; int blocks; { struct jextent *jext; jblocks->jb_blocks += blocks; jblocks->jb_free += blocks; jext = &jblocks->jb_extent[jblocks->jb_used]; /* Adding the first block. */ if (jext->je_daddr == 0) { jext->je_daddr = daddr; jext->je_blocks = blocks; return; } /* Extending the last extent. */ if (jext->je_daddr + jext->je_blocks == daddr) { jext->je_blocks += blocks; return; } /* Adding a new extent. */ if (++jblocks->jb_used == jblocks->jb_avail) { jblocks->jb_avail *= 2; jext = malloc(sizeof(struct jextent) * jblocks->jb_avail, M_JBLOCKS, M_WAITOK | M_ZERO); memcpy(jext, jblocks->jb_extent, sizeof(struct jextent) * jblocks->jb_used); free(jblocks->jb_extent, M_JBLOCKS); jblocks->jb_extent = jext; } jext = &jblocks->jb_extent[jblocks->jb_used]; jext->je_daddr = daddr; jext->je_blocks = blocks; return; } int softdep_journal_lookup(mp, vpp) struct mount *mp; struct vnode **vpp; { struct componentname cnp; struct vnode *dvp; ino_t sujournal; int error; error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp); if (error) return (error); bzero(&cnp, sizeof(cnp)); cnp.cn_nameiop = LOOKUP; cnp.cn_flags = ISLASTCN; cnp.cn_thread = curthread; cnp.cn_cred = curthread->td_ucred; cnp.cn_pnbuf = SUJ_FILE; cnp.cn_nameptr = SUJ_FILE; cnp.cn_namelen = strlen(SUJ_FILE); error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal); vput(dvp); if (error != 0) return (error); error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp); return (error); } /* * Open and verify the journal file. */ static int journal_mount(mp, fs, cred) struct mount *mp; struct fs *fs; struct ucred *cred; { struct jblocks *jblocks; struct ufsmount *ump; struct vnode *vp; struct inode *ip; ufs2_daddr_t blkno; int bcount; int error; int i; ump = VFSTOUFS(mp); ump->softdep_journal_tail = NULL; ump->softdep_on_journal = 0; ump->softdep_accdeps = 0; ump->softdep_req = 0; ump->softdep_jblocks = NULL; error = softdep_journal_lookup(mp, &vp); if (error != 0) { printf("Failed to find journal. Use tunefs to create one\n"); return (error); } ip = VTOI(vp); if (ip->i_size < SUJ_MIN) { error = ENOSPC; goto out; } bcount = lblkno(fs, ip->i_size); /* Only use whole blocks. */ jblocks = jblocks_create(); for (i = 0; i < bcount; i++) { error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL); if (error) break; jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag)); } if (error) { jblocks_destroy(jblocks); goto out; } jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. */ jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */ ump->softdep_jblocks = jblocks; out: if (error == 0) { MNT_ILOCK(mp); mp->mnt_flag |= MNT_SUJ; mp->mnt_flag &= ~MNT_SOFTDEP; MNT_IUNLOCK(mp); /* * Only validate the journal contents if the * filesystem is clean, otherwise we write the logs * but they'll never be used. If the filesystem was * still dirty when we mounted it the journal is * invalid and a new journal can only be valid if it * starts from a clean mount. */ if (fs->fs_clean) { DIP_SET(ip, i_modrev, fs->fs_mtime); ip->i_flags |= IN_MODIFIED; ffs_update(vp, 1); } } vput(vp); return (error); } static void journal_unmount(ump) struct ufsmount *ump; { if (ump->softdep_jblocks) jblocks_destroy(ump->softdep_jblocks); ump->softdep_jblocks = NULL; } /* * Called when a journal record is ready to be written. Space is allocated * and the journal entry is created when the journal is flushed to stable * store. */ static void add_to_journal(wk) struct worklist *wk; { struct ufsmount *ump; ump = VFSTOUFS(wk->wk_mp); LOCK_OWNED(ump); if (wk->wk_state & ONWORKLIST) panic("add_to_journal: %s(0x%X) already on list", TYPENAME(wk->wk_type), wk->wk_state); wk->wk_state |= ONWORKLIST | DEPCOMPLETE; if (LIST_EMPTY(&ump->softdep_journal_pending)) { ump->softdep_jblocks->jb_age = ticks; LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list); } else LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list); ump->softdep_journal_tail = wk; ump->softdep_on_journal += 1; } /* * Remove an arbitrary item for the journal worklist maintain the tail * pointer. This happens when a new operation obviates the need to * journal an old operation. */ static void remove_from_journal(wk) struct worklist *wk; { struct ufsmount *ump; ump = VFSTOUFS(wk->wk_mp); LOCK_OWNED(ump); #ifdef SUJ_DEBUG { struct worklist *wkn; LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list) if (wkn == wk) break; if (wkn == NULL) panic("remove_from_journal: %p is not in journal", wk); } #endif /* * We emulate a TAILQ to save space in most structures which do not * require TAILQ semantics. Here we must update the tail position * when removing the tail which is not the final entry. This works * only if the worklist linkage are at the beginning of the structure. */ if (ump->softdep_journal_tail == wk) ump->softdep_journal_tail = (struct worklist *)wk->wk_list.le_prev; WORKLIST_REMOVE(wk); ump->softdep_on_journal -= 1; } /* * Check for journal space as well as dependency limits so the prelink * code can throttle both journaled and non-journaled filesystems. * Threshold is 0 for low and 1 for min. */ static int journal_space(ump, thresh) struct ufsmount *ump; int thresh; { struct jblocks *jblocks; int limit, avail; jblocks = ump->softdep_jblocks; if (jblocks == NULL) return (1); /* * We use a tighter restriction here to prevent request_cleanup() * running in threads from running into locks we currently hold. * We have to be over the limit and our filesystem has to be * responsible for more than our share of that usage. */ limit = (max_softdeps / 10) * 9; if (dep_current[D_INODEDEP] > limit && ump->softdep_curdeps[D_INODEDEP] > limit / stat_flush_threads) return (0); if (thresh) thresh = jblocks->jb_min; else thresh = jblocks->jb_low; avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE; avail = jblocks->jb_free - avail; return (avail > thresh); } static void journal_suspend(ump) struct ufsmount *ump; { struct jblocks *jblocks; struct mount *mp; mp = UFSTOVFS(ump); jblocks = ump->softdep_jblocks; MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { stat_journal_min++; mp->mnt_kern_flag |= MNTK_SUSPEND; mp->mnt_susp_owner = ump->softdep_flushtd; } jblocks->jb_suspended = 1; MNT_IUNLOCK(mp); } static int journal_unsuspend(struct ufsmount *ump) { struct jblocks *jblocks; struct mount *mp; mp = UFSTOVFS(ump); jblocks = ump->softdep_jblocks; if (jblocks != NULL && jblocks->jb_suspended && journal_space(ump, jblocks->jb_min)) { jblocks->jb_suspended = 0; FREE_LOCK(ump); mp->mnt_susp_owner = curthread; vfs_write_resume(mp, 0); ACQUIRE_LOCK(ump); return (1); } return (0); } /* * Called before any allocation function to be certain that there is * sufficient space in the journal prior to creating any new records. * Since in the case of block allocation we may have multiple locked * buffers at the time of the actual allocation we can not block * when the journal records are created. Doing so would create a deadlock * if any of these buffers needed to be flushed to reclaim space. Instead * we require a sufficiently large amount of available space such that * each thread in the system could have passed this allocation check and * still have sufficient free space. With 20% of a minimum journal size * of 1MB we have 6553 records available. */ int softdep_prealloc(vp, waitok) struct vnode *vp; int waitok; { struct ufsmount *ump; KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0, ("softdep_prealloc called on non-softdep filesystem")); /* * Nothing to do if we are not running journaled soft updates. * If we currently hold the snapshot lock, we must avoid handling * other resources that could cause deadlock. */ if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp))) return (0); ump = VFSTOUFS(vp->v_mount); ACQUIRE_LOCK(ump); if (journal_space(ump, 0)) { FREE_LOCK(ump); return (0); } stat_journal_low++; FREE_LOCK(ump); if (waitok == MNT_NOWAIT) return (ENOSPC); /* * Attempt to sync this vnode once to flush any journal * work attached to it. */ if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0) ffs_syncvnode(vp, waitok, 0); ACQUIRE_LOCK(ump); process_removes(vp); process_truncates(vp); if (journal_space(ump, 0) == 0) { softdep_speedup(ump); if (journal_space(ump, 1) == 0) journal_suspend(ump); } FREE_LOCK(ump); return (0); } /* * Before adjusting a link count on a vnode verify that we have sufficient * journal space. If not, process operations that depend on the currently * locked pair of vnodes to try to flush space as the syncer, buf daemon, * and softdep flush threads can not acquire these locks to reclaim space. */ static void softdep_prelink(dvp, vp) struct vnode *dvp; struct vnode *vp; { struct ufsmount *ump; ump = VFSTOUFS(dvp->v_mount); LOCK_OWNED(ump); /* * Nothing to do if we have sufficient journal space. * If we currently hold the snapshot lock, we must avoid * handling other resources that could cause deadlock. */ if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp)))) return; stat_journal_low++; FREE_LOCK(ump); if (vp) ffs_syncvnode(vp, MNT_NOWAIT, 0); ffs_syncvnode(dvp, MNT_WAIT, 0); ACQUIRE_LOCK(ump); /* Process vp before dvp as it may create .. removes. */ if (vp) { process_removes(vp); process_truncates(vp); } process_removes(dvp); process_truncates(dvp); softdep_speedup(ump); process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT); if (journal_space(ump, 0) == 0) { softdep_speedup(ump); if (journal_space(ump, 1) == 0) journal_suspend(ump); } } static void jseg_write(ump, jseg, data) struct ufsmount *ump; struct jseg *jseg; uint8_t *data; { struct jsegrec *rec; rec = (struct jsegrec *)data; rec->jsr_seq = jseg->js_seq; rec->jsr_oldest = jseg->js_oldseq; rec->jsr_cnt = jseg->js_cnt; rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize; rec->jsr_crc = 0; rec->jsr_time = ump->um_fs->fs_mtime; } static inline void inoref_write(inoref, jseg, rec) struct inoref *inoref; struct jseg *jseg; struct jrefrec *rec; { inoref->if_jsegdep->jd_seg = jseg; rec->jr_ino = inoref->if_ino; rec->jr_parent = inoref->if_parent; rec->jr_nlink = inoref->if_nlink; rec->jr_mode = inoref->if_mode; rec->jr_diroff = inoref->if_diroff; } static void jaddref_write(jaddref, jseg, data) struct jaddref *jaddref; struct jseg *jseg; uint8_t *data; { struct jrefrec *rec; rec = (struct jrefrec *)data; rec->jr_op = JOP_ADDREF; inoref_write(&jaddref->ja_ref, jseg, rec); } static void jremref_write(jremref, jseg, data) struct jremref *jremref; struct jseg *jseg; uint8_t *data; { struct jrefrec *rec; rec = (struct jrefrec *)data; rec->jr_op = JOP_REMREF; inoref_write(&jremref->jr_ref, jseg, rec); } static void jmvref_write(jmvref, jseg, data) struct jmvref *jmvref; struct jseg *jseg; uint8_t *data; { struct jmvrec *rec; rec = (struct jmvrec *)data; rec->jm_op = JOP_MVREF; rec->jm_ino = jmvref->jm_ino; rec->jm_parent = jmvref->jm_parent; rec->jm_oldoff = jmvref->jm_oldoff; rec->jm_newoff = jmvref->jm_newoff; } static void jnewblk_write(jnewblk, jseg, data) struct jnewblk *jnewblk; struct jseg *jseg; uint8_t *data; { struct jblkrec *rec; jnewblk->jn_jsegdep->jd_seg = jseg; rec = (struct jblkrec *)data; rec->jb_op = JOP_NEWBLK; rec->jb_ino = jnewblk->jn_ino; rec->jb_blkno = jnewblk->jn_blkno; rec->jb_lbn = jnewblk->jn_lbn; rec->jb_frags = jnewblk->jn_frags; rec->jb_oldfrags = jnewblk->jn_oldfrags; } static void jfreeblk_write(jfreeblk, jseg, data) struct jfreeblk *jfreeblk; struct jseg *jseg; uint8_t *data; { struct jblkrec *rec; jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg; rec = (struct jblkrec *)data; rec->jb_op = JOP_FREEBLK; rec->jb_ino = jfreeblk->jf_ino; rec->jb_blkno = jfreeblk->jf_blkno; rec->jb_lbn = jfreeblk->jf_lbn; rec->jb_frags = jfreeblk->jf_frags; rec->jb_oldfrags = 0; } static void jfreefrag_write(jfreefrag, jseg, data) struct jfreefrag *jfreefrag; struct jseg *jseg; uint8_t *data; { struct jblkrec *rec; jfreefrag->fr_jsegdep->jd_seg = jseg; rec = (struct jblkrec *)data; rec->jb_op = JOP_FREEBLK; rec->jb_ino = jfreefrag->fr_ino; rec->jb_blkno = jfreefrag->fr_blkno; rec->jb_lbn = jfreefrag->fr_lbn; rec->jb_frags = jfreefrag->fr_frags; rec->jb_oldfrags = 0; } static void jtrunc_write(jtrunc, jseg, data) struct jtrunc *jtrunc; struct jseg *jseg; uint8_t *data; { struct jtrncrec *rec; jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg; rec = (struct jtrncrec *)data; rec->jt_op = JOP_TRUNC; rec->jt_ino = jtrunc->jt_ino; rec->jt_size = jtrunc->jt_size; rec->jt_extsize = jtrunc->jt_extsize; } static void jfsync_write(jfsync, jseg, data) struct jfsync *jfsync; struct jseg *jseg; uint8_t *data; { struct jtrncrec *rec; rec = (struct jtrncrec *)data; rec->jt_op = JOP_SYNC; rec->jt_ino = jfsync->jfs_ino; rec->jt_size = jfsync->jfs_size; rec->jt_extsize = jfsync->jfs_extsize; } static void softdep_flushjournal(mp) struct mount *mp; { struct jblocks *jblocks; struct ufsmount *ump; if (MOUNTEDSUJ(mp) == 0) return; ump = VFSTOUFS(mp); jblocks = ump->softdep_jblocks; ACQUIRE_LOCK(ump); while (ump->softdep_on_journal) { jblocks->jb_needseg = 1; softdep_process_journal(mp, NULL, MNT_WAIT); } FREE_LOCK(ump); } static void softdep_synchronize_completed(struct bio *); static void softdep_synchronize(struct bio *, struct ufsmount *, void *); static void softdep_synchronize_completed(bp) struct bio *bp; { struct jseg *oldest; struct jseg *jseg; struct ufsmount *ump; /* * caller1 marks the last segment written before we issued the * synchronize cache. */ jseg = bp->bio_caller1; if (jseg == NULL) { g_destroy_bio(bp); return; } ump = VFSTOUFS(jseg->js_list.wk_mp); ACQUIRE_LOCK(ump); oldest = NULL; /* * Mark all the journal entries waiting on the synchronize cache * as completed so they may continue on. */ while (jseg != NULL && (jseg->js_state & COMPLETE) == 0) { jseg->js_state |= COMPLETE; oldest = jseg; jseg = TAILQ_PREV(jseg, jseglst, js_next); } /* * Restart deferred journal entry processing from the oldest * completed jseg. */ if (oldest) complete_jsegs(oldest); FREE_LOCK(ump); g_destroy_bio(bp); } /* * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write ordering * barriers. The journal must be written prior to any blocks that depend * on it and the journal can not be released until the blocks have be * written. This code handles both barriers simultaneously. */ static void softdep_synchronize(bp, ump, caller1) struct bio *bp; struct ufsmount *ump; void *caller1; { bp->bio_cmd = BIO_FLUSH; bp->bio_flags |= BIO_ORDERED; bp->bio_data = NULL; bp->bio_offset = ump->um_cp->provider->mediasize; bp->bio_length = 0; bp->bio_done = softdep_synchronize_completed; bp->bio_caller1 = caller1; g_io_request(bp, (struct g_consumer *)ump->um_devvp->v_bufobj.bo_private); } /* * Flush some journal records to disk. */ static void softdep_process_journal(mp, needwk, flags) struct mount *mp; struct worklist *needwk; int flags; { struct jblocks *jblocks; struct ufsmount *ump; struct worklist *wk; struct jseg *jseg; struct buf *bp; struct bio *bio; uint8_t *data; struct fs *fs; int shouldflush; int segwritten; int jrecmin; /* Minimum records per block. */ int jrecmax; /* Maximum records per block. */ int size; int cnt; int off; int devbsize; if (MOUNTEDSUJ(mp) == 0) return; shouldflush = softdep_flushcache; bio = NULL; jseg = NULL; ump = VFSTOUFS(mp); LOCK_OWNED(ump); fs = ump->um_fs; jblocks = ump->softdep_jblocks; devbsize = ump->um_devvp->v_bufobj.bo_bsize; /* * We write anywhere between a disk block and fs block. The upper * bound is picked to prevent buffer cache fragmentation and limit * processing time per I/O. */ jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */ jrecmax = (fs->fs_bsize / devbsize) * jrecmin; segwritten = 0; for (;;) { cnt = ump->softdep_on_journal; /* * Criteria for writing a segment: * 1) We have a full block. * 2) We're called from jwait() and haven't found the * journal item yet. * 3) Always write if needseg is set. * 4) If we are called from process_worklist and have * not yet written anything we write a partial block * to enforce a 1 second maximum latency on journal * entries. */ if (cnt < (jrecmax - 1) && needwk == NULL && jblocks->jb_needseg == 0 && (segwritten || cnt == 0)) break; cnt++; /* * Verify some free journal space. softdep_prealloc() should * guarantee that we don't run out so this is indicative of * a problem with the flow control. Try to recover * gracefully in any event. */ while (jblocks->jb_free == 0) { if (flags != MNT_WAIT) break; printf("softdep: Out of journal space!\n"); softdep_speedup(ump); msleep(jblocks, LOCK_PTR(ump), PRIBIO, "jblocks", hz); } FREE_LOCK(ump); jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS); workitem_alloc(&jseg->js_list, D_JSEG, mp); LIST_INIT(&jseg->js_entries); LIST_INIT(&jseg->js_indirs); jseg->js_state = ATTACHED; if (shouldflush == 0) jseg->js_state |= COMPLETE; else if (bio == NULL) bio = g_alloc_bio(); jseg->js_jblocks = jblocks; bp = geteblk(fs->fs_bsize, 0); ACQUIRE_LOCK(ump); /* * If there was a race while we were allocating the block * and jseg the entry we care about was likely written. * We bail out in both the WAIT and NOWAIT case and assume * the caller will loop if the entry it cares about is * not written. */ cnt = ump->softdep_on_journal; if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) { bp->b_flags |= B_INVAL | B_NOCACHE; WORKITEM_FREE(jseg, D_JSEG); FREE_LOCK(ump); brelse(bp); ACQUIRE_LOCK(ump); break; } /* * Calculate the disk block size required for the available * records rounded to the min size. */ if (cnt == 0) size = devbsize; else if (cnt < jrecmax) size = howmany(cnt, jrecmin) * devbsize; else size = fs->fs_bsize; /* * Allocate a disk block for this journal data and account * for truncation of the requested size if enough contiguous * space was not available. */ bp->b_blkno = jblocks_alloc(jblocks, size, &size); bp->b_lblkno = bp->b_blkno; bp->b_offset = bp->b_blkno * DEV_BSIZE; bp->b_bcount = size; bp->b_flags &= ~B_INVAL; bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY; /* * Initialize our jseg with cnt records. Assign the next * sequence number to it and link it in-order. */ cnt = MIN(cnt, (size / devbsize) * jrecmin); jseg->js_buf = bp; jseg->js_cnt = cnt; jseg->js_refs = cnt + 1; /* Self ref. */ jseg->js_size = size; jseg->js_seq = jblocks->jb_nextseq++; if (jblocks->jb_oldestseg == NULL) jblocks->jb_oldestseg = jseg; jseg->js_oldseq = jblocks->jb_oldestseg->js_seq; TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next); if (jblocks->jb_writeseg == NULL) jblocks->jb_writeseg = jseg; /* * Start filling in records from the pending list. */ data = bp->b_data; off = 0; /* * Always put a header on the first block. * XXX As with below, there might not be a chance to get * into the loop. Ensure that something valid is written. */ jseg_write(ump, jseg, data); off += JREC_SIZE; data = bp->b_data + off; /* * XXX Something is wrong here. There's no work to do, * but we need to perform and I/O and allow it to complete * anyways. */ if (LIST_EMPTY(&ump->softdep_journal_pending)) stat_emptyjblocks++; while ((wk = LIST_FIRST(&ump->softdep_journal_pending)) != NULL) { if (cnt == 0) break; /* Place a segment header on every device block. */ if ((off % devbsize) == 0) { jseg_write(ump, jseg, data); off += JREC_SIZE; data = bp->b_data + off; } if (wk == needwk) needwk = NULL; remove_from_journal(wk); wk->wk_state |= INPROGRESS; WORKLIST_INSERT(&jseg->js_entries, wk); switch (wk->wk_type) { case D_JADDREF: jaddref_write(WK_JADDREF(wk), jseg, data); break; case D_JREMREF: jremref_write(WK_JREMREF(wk), jseg, data); break; case D_JMVREF: jmvref_write(WK_JMVREF(wk), jseg, data); break; case D_JNEWBLK: jnewblk_write(WK_JNEWBLK(wk), jseg, data); break; case D_JFREEBLK: jfreeblk_write(WK_JFREEBLK(wk), jseg, data); break; case D_JFREEFRAG: jfreefrag_write(WK_JFREEFRAG(wk), jseg, data); break; case D_JTRUNC: jtrunc_write(WK_JTRUNC(wk), jseg, data); break; case D_JFSYNC: jfsync_write(WK_JFSYNC(wk), jseg, data); break; default: panic("process_journal: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } off += JREC_SIZE; data = bp->b_data + off; cnt--; } /* Clear any remaining space so we don't leak kernel data */ if (size > off) bzero(data, size - off); /* * Write this one buffer and continue. */ segwritten = 1; jblocks->jb_needseg = 0; WORKLIST_INSERT(&bp->b_dep, &jseg->js_list); FREE_LOCK(ump); pbgetvp(ump->um_devvp, bp); /* * We only do the blocking wait once we find the journal * entry we're looking for. */ if (needwk == NULL && flags == MNT_WAIT) bwrite(bp); else bawrite(bp); ACQUIRE_LOCK(ump); } /* * If we wrote a segment issue a synchronize cache so the journal * is reflected on disk before the data is written. Since reclaiming * journal space also requires writing a journal record this * process also enforces a barrier before reclamation. */ if (segwritten && shouldflush) { softdep_synchronize(bio, ump, TAILQ_LAST(&jblocks->jb_segs, jseglst)); } else if (bio) g_destroy_bio(bio); /* * If we've suspended the filesystem because we ran out of journal * space either try to sync it here to make some progress or * unsuspend it if we already have. */ if (flags == 0 && jblocks->jb_suspended) { if (journal_unsuspend(ump)) return; FREE_LOCK(ump); VFS_SYNC(mp, MNT_NOWAIT); ffs_sbupdate(ump, MNT_WAIT, 0); ACQUIRE_LOCK(ump); } } /* * Complete a jseg, allowing all dependencies awaiting journal writes * to proceed. Each journal dependency also attaches a jsegdep to dependent * structures so that the journal segment can be freed to reclaim space. */ static void complete_jseg(jseg) struct jseg *jseg; { struct worklist *wk; struct jmvref *jmvref; int waiting; #ifdef INVARIANTS int i = 0; #endif while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) { WORKLIST_REMOVE(wk); waiting = wk->wk_state & IOWAITING; wk->wk_state &= ~(INPROGRESS | IOWAITING); wk->wk_state |= COMPLETE; KASSERT(i++ < jseg->js_cnt, ("handle_written_jseg: overflow %d >= %d", i - 1, jseg->js_cnt)); switch (wk->wk_type) { case D_JADDREF: handle_written_jaddref(WK_JADDREF(wk)); break; case D_JREMREF: handle_written_jremref(WK_JREMREF(wk)); break; case D_JMVREF: rele_jseg(jseg); /* No jsegdep. */ jmvref = WK_JMVREF(wk); LIST_REMOVE(jmvref, jm_deps); if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0) free_pagedep(jmvref->jm_pagedep); WORKITEM_FREE(jmvref, D_JMVREF); break; case D_JNEWBLK: handle_written_jnewblk(WK_JNEWBLK(wk)); break; case D_JFREEBLK: handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep); break; case D_JTRUNC: handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep); break; case D_JFSYNC: rele_jseg(jseg); /* No jsegdep. */ WORKITEM_FREE(wk, D_JFSYNC); break; case D_JFREEFRAG: handle_written_jfreefrag(WK_JFREEFRAG(wk)); break; default: panic("handle_written_jseg: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } if (waiting) wakeup(wk); } /* Release the self reference so the structure may be freed. */ rele_jseg(jseg); } /* * Determine which jsegs are ready for completion processing. Waits for * synchronize cache to complete as well as forcing in-order completion * of journal entries. */ static void complete_jsegs(jseg) struct jseg *jseg; { struct jblocks *jblocks; struct jseg *jsegn; jblocks = jseg->js_jblocks; /* * Don't allow out of order completions. If this isn't the first * block wait for it to write before we're done. */ if (jseg != jblocks->jb_writeseg) return; /* Iterate through available jsegs processing their entries. */ while (jseg && (jseg->js_state & ALLCOMPLETE) == ALLCOMPLETE) { jblocks->jb_oldestwrseq = jseg->js_oldseq; jsegn = TAILQ_NEXT(jseg, js_next); complete_jseg(jseg); jseg = jsegn; } jblocks->jb_writeseg = jseg; /* * Attempt to free jsegs now that oldestwrseq may have advanced. */ free_jsegs(jblocks); } /* * Mark a jseg as DEPCOMPLETE and throw away the buffer. Attempt to handle * the final completions. */ static void handle_written_jseg(jseg, bp) struct jseg *jseg; struct buf *bp; { if (jseg->js_refs == 0) panic("handle_written_jseg: No self-reference on %p", jseg); jseg->js_state |= DEPCOMPLETE; /* * We'll never need this buffer again, set flags so it will be * discarded. */ bp->b_flags |= B_INVAL | B_NOCACHE; pbrelvp(bp); complete_jsegs(jseg); } static inline struct jsegdep * inoref_jseg(inoref) struct inoref *inoref; { struct jsegdep *jsegdep; jsegdep = inoref->if_jsegdep; inoref->if_jsegdep = NULL; return (jsegdep); } /* * Called once a jremref has made it to stable store. The jremref is marked * complete and we attempt to free it. Any pagedeps writes sleeping waiting * for the jremref to complete will be awoken by free_jremref. */ static void handle_written_jremref(jremref) struct jremref *jremref; { struct inodedep *inodedep; struct jsegdep *jsegdep; struct dirrem *dirrem; /* Grab the jsegdep. */ jsegdep = inoref_jseg(&jremref->jr_ref); /* * Remove us from the inoref list. */ if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0, &inodedep) == 0) panic("handle_written_jremref: Lost inodedep"); TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); /* * Complete the dirrem. */ dirrem = jremref->jr_dirrem; jremref->jr_dirrem = NULL; LIST_REMOVE(jremref, jr_deps); jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT; jwork_insert(&dirrem->dm_jwork, jsegdep); if (LIST_EMPTY(&dirrem->dm_jremrefhd) && (dirrem->dm_state & COMPLETE) != 0) add_to_worklist(&dirrem->dm_list, 0); free_jremref(jremref); } /* * Called once a jaddref has made it to stable store. The dependency is * marked complete and any dependent structures are added to the inode * bufwait list to be completed as soon as it is written. If a bitmap write * depends on this entry we move the inode into the inodedephd of the * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap. */ static void handle_written_jaddref(jaddref) struct jaddref *jaddref; { struct jsegdep *jsegdep; struct inodedep *inodedep; struct diradd *diradd; struct mkdir *mkdir; /* Grab the jsegdep. */ jsegdep = inoref_jseg(&jaddref->ja_ref); mkdir = NULL; diradd = NULL; if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, 0, &inodedep) == 0) panic("handle_written_jaddref: Lost inodedep."); if (jaddref->ja_diradd == NULL) panic("handle_written_jaddref: No dependency"); if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) { diradd = jaddref->ja_diradd; WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list); } else if (jaddref->ja_state & MKDIR_PARENT) { mkdir = jaddref->ja_mkdir; WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list); } else if (jaddref->ja_state & MKDIR_BODY) mkdir = jaddref->ja_mkdir; else panic("handle_written_jaddref: Unknown dependency %p", jaddref->ja_diradd); jaddref->ja_diradd = NULL; /* also clears ja_mkdir */ /* * Remove us from the inode list. */ TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); /* * The mkdir may be waiting on the jaddref to clear before freeing. */ if (mkdir) { KASSERT(mkdir->md_list.wk_type == D_MKDIR, ("handle_written_jaddref: Incorrect type for mkdir %s", TYPENAME(mkdir->md_list.wk_type))); mkdir->md_jaddref = NULL; diradd = mkdir->md_diradd; mkdir->md_state |= DEPCOMPLETE; complete_mkdir(mkdir); } jwork_insert(&diradd->da_jwork, jsegdep); if (jaddref->ja_state & NEWBLOCK) { inodedep->id_state |= ONDEPLIST; LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd, inodedep, id_deps); } free_jaddref(jaddref); } /* * Called once a jnewblk journal is written. The allocdirect or allocindir * is placed in the bmsafemap to await notification of a written bitmap. If * the operation was canceled we add the segdep to the appropriate * dependency to free the journal space once the canceling operation * completes. */ static void handle_written_jnewblk(jnewblk) struct jnewblk *jnewblk; { struct bmsafemap *bmsafemap; struct freefrag *freefrag; struct freework *freework; struct jsegdep *jsegdep; struct newblk *newblk; /* Grab the jsegdep. */ jsegdep = jnewblk->jn_jsegdep; jnewblk->jn_jsegdep = NULL; if (jnewblk->jn_dep == NULL) panic("handle_written_jnewblk: No dependency for the segdep."); switch (jnewblk->jn_dep->wk_type) { case D_NEWBLK: case D_ALLOCDIRECT: case D_ALLOCINDIR: /* * Add the written block to the bmsafemap so it can * be notified when the bitmap is on disk. */ newblk = WK_NEWBLK(jnewblk->jn_dep); newblk->nb_jnewblk = NULL; if ((newblk->nb_state & GOINGAWAY) == 0) { bmsafemap = newblk->nb_bmsafemap; newblk->nb_state |= ONDEPLIST; LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); } jwork_insert(&newblk->nb_jwork, jsegdep); break; case D_FREEFRAG: /* * A newblock being removed by a freefrag when replaced by * frag extension. */ freefrag = WK_FREEFRAG(jnewblk->jn_dep); freefrag->ff_jdep = NULL; jwork_insert(&freefrag->ff_jwork, jsegdep); break; case D_FREEWORK: /* * A direct block was removed by truncate. */ freework = WK_FREEWORK(jnewblk->jn_dep); freework->fw_jnewblk = NULL; jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep); break; default: panic("handle_written_jnewblk: Unknown type %d.", jnewblk->jn_dep->wk_type); } jnewblk->jn_dep = NULL; free_jnewblk(jnewblk); } /* * Cancel a jfreefrag that won't be needed, probably due to colliding with * an in-flight allocation that has not yet been committed. Divorce us * from the freefrag and mark it DEPCOMPLETE so that it may be added * to the worklist. */ static void cancel_jfreefrag(jfreefrag) struct jfreefrag *jfreefrag; { struct freefrag *freefrag; if (jfreefrag->fr_jsegdep) { free_jsegdep(jfreefrag->fr_jsegdep); jfreefrag->fr_jsegdep = NULL; } freefrag = jfreefrag->fr_freefrag; jfreefrag->fr_freefrag = NULL; free_jfreefrag(jfreefrag); freefrag->ff_state |= DEPCOMPLETE; CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", freefrag->ff_blkno); } /* * Free a jfreefrag when the parent freefrag is rendered obsolete. */ static void free_jfreefrag(jfreefrag) struct jfreefrag *jfreefrag; { if (jfreefrag->fr_state & INPROGRESS) WORKLIST_REMOVE(&jfreefrag->fr_list); else if (jfreefrag->fr_state & ONWORKLIST) remove_from_journal(&jfreefrag->fr_list); if (jfreefrag->fr_freefrag != NULL) panic("free_jfreefrag: Still attached to a freefrag."); WORKITEM_FREE(jfreefrag, D_JFREEFRAG); } /* * Called when the journal write for a jfreefrag completes. The parent * freefrag is added to the worklist if this completes its dependencies. */ static void handle_written_jfreefrag(jfreefrag) struct jfreefrag *jfreefrag; { struct jsegdep *jsegdep; struct freefrag *freefrag; /* Grab the jsegdep. */ jsegdep = jfreefrag->fr_jsegdep; jfreefrag->fr_jsegdep = NULL; freefrag = jfreefrag->fr_freefrag; if (freefrag == NULL) panic("handle_written_jfreefrag: No freefrag."); freefrag->ff_state |= DEPCOMPLETE; freefrag->ff_jdep = NULL; jwork_insert(&freefrag->ff_jwork, jsegdep); if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) add_to_worklist(&freefrag->ff_list, 0); jfreefrag->fr_freefrag = NULL; free_jfreefrag(jfreefrag); } /* * Called when the journal write for a jfreeblk completes. The jfreeblk * is removed from the freeblks list of pending journal writes and the * jsegdep is moved to the freeblks jwork to be completed when all blocks * have been reclaimed. */ static void handle_written_jblkdep(jblkdep) struct jblkdep *jblkdep; { struct freeblks *freeblks; struct jsegdep *jsegdep; /* Grab the jsegdep. */ jsegdep = jblkdep->jb_jsegdep; jblkdep->jb_jsegdep = NULL; freeblks = jblkdep->jb_freeblks; LIST_REMOVE(jblkdep, jb_deps); jwork_insert(&freeblks->fb_jwork, jsegdep); /* * If the freeblks is all journaled, we can add it to the worklist. */ if (LIST_EMPTY(&freeblks->fb_jblkdephd) && (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) add_to_worklist(&freeblks->fb_list, WK_NODELAY); free_jblkdep(jblkdep); } static struct jsegdep * newjsegdep(struct worklist *wk) { struct jsegdep *jsegdep; jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS); workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp); jsegdep->jd_seg = NULL; return (jsegdep); } static struct jmvref * newjmvref(dp, ino, oldoff, newoff) struct inode *dp; ino_t ino; off_t oldoff; off_t newoff; { struct jmvref *jmvref; jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS); workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump)); jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE; jmvref->jm_parent = dp->i_number; jmvref->jm_ino = ino; jmvref->jm_oldoff = oldoff; jmvref->jm_newoff = newoff; return (jmvref); } /* * Allocate a new jremref that tracks the removal of ip from dp with the * directory entry offset of diroff. Mark the entry as ATTACHED and * DEPCOMPLETE as we have all the information required for the journal write * and the directory has already been removed from the buffer. The caller * is responsible for linking the jremref into the pagedep and adding it * to the journal to write. The MKDIR_PARENT flag is set if we're doing * a DOTDOT addition so handle_workitem_remove() can properly assign * the jsegdep when we're done. */ static struct jremref * newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip, off_t diroff, nlink_t nlink) { struct jremref *jremref; jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS); workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump)); jremref->jr_state = ATTACHED; newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff, nlink, ip->i_mode); jremref->jr_dirrem = dirrem; return (jremref); } static inline void newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff, nlink_t nlink, uint16_t mode) { inoref->if_jsegdep = newjsegdep(&inoref->if_list); inoref->if_diroff = diroff; inoref->if_ino = ino; inoref->if_parent = parent; inoref->if_nlink = nlink; inoref->if_mode = mode; } /* * Allocate a new jaddref to track the addition of ino to dp at diroff. The * directory offset may not be known until later. The caller is responsible * adding the entry to the journal when this information is available. nlink * should be the link count prior to the addition and mode is only required * to have the correct FMT. */ static struct jaddref * newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink, uint16_t mode) { struct jaddref *jaddref; jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS); workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump)); jaddref->ja_state = ATTACHED; jaddref->ja_mkdir = NULL; newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode); return (jaddref); } /* * Create a new free dependency for a freework. The caller is responsible * for adjusting the reference count when it has the lock held. The freedep * will track an outstanding bitmap write that will ultimately clear the * freework to continue. */ static struct freedep * newfreedep(struct freework *freework) { struct freedep *freedep; freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS); workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp); freedep->fd_freework = freework; return (freedep); } /* * Free a freedep structure once the buffer it is linked to is written. If * this is the last reference to the freework schedule it for completion. */ static void free_freedep(freedep) struct freedep *freedep; { struct freework *freework; freework = freedep->fd_freework; freework->fw_freeblks->fb_cgwait--; if (--freework->fw_ref == 0) freework_enqueue(freework); WORKITEM_FREE(freedep, D_FREEDEP); } /* * Allocate a new freework structure that may be a level in an indirect * when parent is not NULL or a top level block when it is. The top level * freework structures are allocated without the per-filesystem lock held * and before the freeblks is visible outside of softdep_setup_freeblocks(). */ static struct freework * newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal) struct ufsmount *ump; struct freeblks *freeblks; struct freework *parent; ufs_lbn_t lbn; ufs2_daddr_t nb; int frags; int off; int journal; { struct freework *freework; freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS); workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp); freework->fw_state = ATTACHED; freework->fw_jnewblk = NULL; freework->fw_freeblks = freeblks; freework->fw_parent = parent; freework->fw_lbn = lbn; freework->fw_blkno = nb; freework->fw_frags = frags; freework->fw_indir = NULL; freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 || lbn >= -NXADDR) ? 0 : NINDIR(ump->um_fs) + 1; freework->fw_start = freework->fw_off = off; if (journal) newjfreeblk(freeblks, lbn, nb, frags); if (parent == NULL) { ACQUIRE_LOCK(ump); WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list); freeblks->fb_ref++; FREE_LOCK(ump); } return (freework); } /* * Eliminate a jfreeblk for a block that does not need journaling. */ static void cancel_jfreeblk(freeblks, blkno) struct freeblks *freeblks; ufs2_daddr_t blkno; { struct jfreeblk *jfreeblk; struct jblkdep *jblkdep; LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) { if (jblkdep->jb_list.wk_type != D_JFREEBLK) continue; jfreeblk = WK_JFREEBLK(&jblkdep->jb_list); if (jfreeblk->jf_blkno == blkno) break; } if (jblkdep == NULL) return; CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno); free_jsegdep(jblkdep->jb_jsegdep); LIST_REMOVE(jblkdep, jb_deps); WORKITEM_FREE(jfreeblk, D_JFREEBLK); } /* * Allocate a new jfreeblk to journal top level block pointer when truncating * a file. The caller must add this to the worklist when the per-filesystem * lock is held. */ static struct jfreeblk * newjfreeblk(freeblks, lbn, blkno, frags) struct freeblks *freeblks; ufs_lbn_t lbn; ufs2_daddr_t blkno; int frags; { struct jfreeblk *jfreeblk; jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS); workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK, freeblks->fb_list.wk_mp); jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list); jfreeblk->jf_dep.jb_freeblks = freeblks; jfreeblk->jf_ino = freeblks->fb_inum; jfreeblk->jf_lbn = lbn; jfreeblk->jf_blkno = blkno; jfreeblk->jf_frags = frags; LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps); return (jfreeblk); } /* * The journal is only prepared to handle full-size block numbers, so we * have to adjust the record to reflect the change to a full-size block. * For example, suppose we have a block made up of fragments 8-15 and * want to free its last two fragments. We are given a request that says: * FREEBLK ino=5, blkno=14, lbn=0, frags=2, oldfrags=0 * where frags are the number of fragments to free and oldfrags are the * number of fragments to keep. To block align it, we have to change it to * have a valid full-size blkno, so it becomes: * FREEBLK ino=5, blkno=8, lbn=0, frags=2, oldfrags=6 */ static void adjust_newfreework(freeblks, frag_offset) struct freeblks *freeblks; int frag_offset; { struct jfreeblk *jfreeblk; KASSERT((LIST_FIRST(&freeblks->fb_jblkdephd) != NULL && LIST_FIRST(&freeblks->fb_jblkdephd)->jb_list.wk_type == D_JFREEBLK), ("adjust_newfreework: Missing freeblks dependency")); jfreeblk = WK_JFREEBLK(LIST_FIRST(&freeblks->fb_jblkdephd)); jfreeblk->jf_blkno -= frag_offset; jfreeblk->jf_frags += frag_offset; } /* * Allocate a new jtrunc to track a partial truncation. */ static struct jtrunc * newjtrunc(freeblks, size, extsize) struct freeblks *freeblks; off_t size; int extsize; { struct jtrunc *jtrunc; jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS); workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC, freeblks->fb_list.wk_mp); jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list); jtrunc->jt_dep.jb_freeblks = freeblks; jtrunc->jt_ino = freeblks->fb_inum; jtrunc->jt_size = size; jtrunc->jt_extsize = extsize; LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps); return (jtrunc); } /* * If we're canceling a new bitmap we have to search for another ref * to move into the bmsafemap dep. This might be better expressed * with another structure. */ static void move_newblock_dep(jaddref, inodedep) struct jaddref *jaddref; struct inodedep *inodedep; { struct inoref *inoref; struct jaddref *jaddrefn; jaddrefn = NULL; for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; inoref = TAILQ_NEXT(inoref, if_deps)) { if ((jaddref->ja_state & NEWBLOCK) && inoref->if_list.wk_type == D_JADDREF) { jaddrefn = (struct jaddref *)inoref; break; } } if (jaddrefn == NULL) return; jaddrefn->ja_state &= ~(ATTACHED | UNDONE); jaddrefn->ja_state |= jaddref->ja_state & (ATTACHED | UNDONE | NEWBLOCK); jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK); jaddref->ja_state |= ATTACHED; LIST_REMOVE(jaddref, ja_bmdeps); LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn, ja_bmdeps); } /* * Cancel a jaddref either before it has been written or while it is being * written. This happens when a link is removed before the add reaches * the disk. The jaddref dependency is kept linked into the bmsafemap * and inode to prevent the link count or bitmap from reaching the disk * until handle_workitem_remove() re-adjusts the counts and bitmaps as * required. * * Returns 1 if the canceled addref requires journaling of the remove and * 0 otherwise. */ static int cancel_jaddref(jaddref, inodedep, wkhd) struct jaddref *jaddref; struct inodedep *inodedep; struct workhead *wkhd; { struct inoref *inoref; struct jsegdep *jsegdep; int needsj; KASSERT((jaddref->ja_state & COMPLETE) == 0, ("cancel_jaddref: Canceling complete jaddref")); if (jaddref->ja_state & (INPROGRESS | COMPLETE)) needsj = 1; else needsj = 0; if (inodedep == NULL) if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, 0, &inodedep) == 0) panic("cancel_jaddref: Lost inodedep"); /* * We must adjust the nlink of any reference operation that follows * us so that it is consistent with the in-memory reference. This * ensures that inode nlink rollbacks always have the correct link. */ if (needsj == 0) { for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; inoref = TAILQ_NEXT(inoref, if_deps)) { if (inoref->if_state & GOINGAWAY) break; inoref->if_nlink--; } } jsegdep = inoref_jseg(&jaddref->ja_ref); if (jaddref->ja_state & NEWBLOCK) move_newblock_dep(jaddref, inodedep); wake_worklist(&jaddref->ja_list); jaddref->ja_mkdir = NULL; if (jaddref->ja_state & INPROGRESS) { jaddref->ja_state &= ~INPROGRESS; WORKLIST_REMOVE(&jaddref->ja_list); jwork_insert(wkhd, jsegdep); } else { free_jsegdep(jsegdep); if (jaddref->ja_state & DEPCOMPLETE) remove_from_journal(&jaddref->ja_list); } jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE); /* * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove * can arrange for them to be freed with the bitmap. Otherwise we * no longer need this addref attached to the inoreflst and it * will incorrectly adjust nlink if we leave it. */ if ((jaddref->ja_state & NEWBLOCK) == 0) { TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); jaddref->ja_state |= COMPLETE; free_jaddref(jaddref); return (needsj); } /* * Leave the head of the list for jsegdeps for fast merging. */ if (LIST_FIRST(wkhd) != NULL) { jaddref->ja_state |= ONWORKLIST; LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list); } else WORKLIST_INSERT(wkhd, &jaddref->ja_list); return (needsj); } /* * Attempt to free a jaddref structure when some work completes. This * should only succeed once the entry is written and all dependencies have * been notified. */ static void free_jaddref(jaddref) struct jaddref *jaddref; { if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE) return; if (jaddref->ja_ref.if_jsegdep) panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n", jaddref, jaddref->ja_state); if (jaddref->ja_state & NEWBLOCK) LIST_REMOVE(jaddref, ja_bmdeps); if (jaddref->ja_state & (INPROGRESS | ONWORKLIST)) panic("free_jaddref: Bad state %p(0x%X)", jaddref, jaddref->ja_state); if (jaddref->ja_mkdir != NULL) panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state); WORKITEM_FREE(jaddref, D_JADDREF); } /* * Free a jremref structure once it has been written or discarded. */ static void free_jremref(jremref) struct jremref *jremref; { if (jremref->jr_ref.if_jsegdep) free_jsegdep(jremref->jr_ref.if_jsegdep); if (jremref->jr_state & INPROGRESS) panic("free_jremref: IO still pending"); WORKITEM_FREE(jremref, D_JREMREF); } /* * Free a jnewblk structure. */ static void free_jnewblk(jnewblk) struct jnewblk *jnewblk; { if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE) return; LIST_REMOVE(jnewblk, jn_deps); if (jnewblk->jn_dep != NULL) panic("free_jnewblk: Dependency still attached."); WORKITEM_FREE(jnewblk, D_JNEWBLK); } /* * Cancel a jnewblk which has been been made redundant by frag extension. */ static void cancel_jnewblk(jnewblk, wkhd) struct jnewblk *jnewblk; struct workhead *wkhd; { struct jsegdep *jsegdep; CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno); jsegdep = jnewblk->jn_jsegdep; if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL) panic("cancel_jnewblk: Invalid state"); jnewblk->jn_jsegdep = NULL; jnewblk->jn_dep = NULL; jnewblk->jn_state |= GOINGAWAY; if (jnewblk->jn_state & INPROGRESS) { jnewblk->jn_state &= ~INPROGRESS; WORKLIST_REMOVE(&jnewblk->jn_list); jwork_insert(wkhd, jsegdep); } else { free_jsegdep(jsegdep); remove_from_journal(&jnewblk->jn_list); } wake_worklist(&jnewblk->jn_list); WORKLIST_INSERT(wkhd, &jnewblk->jn_list); } static void free_jblkdep(jblkdep) struct jblkdep *jblkdep; { if (jblkdep->jb_list.wk_type == D_JFREEBLK) WORKITEM_FREE(jblkdep, D_JFREEBLK); else if (jblkdep->jb_list.wk_type == D_JTRUNC) WORKITEM_FREE(jblkdep, D_JTRUNC); else panic("free_jblkdep: Unexpected type %s", TYPENAME(jblkdep->jb_list.wk_type)); } /* * Free a single jseg once it is no longer referenced in memory or on * disk. Reclaim journal blocks and dependencies waiting for the segment * to disappear. */ static void free_jseg(jseg, jblocks) struct jseg *jseg; struct jblocks *jblocks; { struct freework *freework; /* * Free freework structures that were lingering to indicate freed * indirect blocks that forced journal write ordering on reallocate. */ while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL) indirblk_remove(freework); if (jblocks->jb_oldestseg == jseg) jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next); TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next); jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size); KASSERT(LIST_EMPTY(&jseg->js_entries), ("free_jseg: Freed jseg has valid entries.")); WORKITEM_FREE(jseg, D_JSEG); } /* * Free all jsegs that meet the criteria for being reclaimed and update * oldestseg. */ static void free_jsegs(jblocks) struct jblocks *jblocks; { struct jseg *jseg; /* * Free only those jsegs which have none allocated before them to * preserve the journal space ordering. */ while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) { /* * Only reclaim space when nothing depends on this journal * set and another set has written that it is no longer * valid. */ if (jseg->js_refs != 0) { jblocks->jb_oldestseg = jseg; return; } if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE) break; if (jseg->js_seq > jblocks->jb_oldestwrseq) break; /* * We can free jsegs that didn't write entries when * oldestwrseq == js_seq. */ if (jseg->js_seq == jblocks->jb_oldestwrseq && jseg->js_cnt != 0) break; free_jseg(jseg, jblocks); } /* * If we exited the loop above we still must discover the * oldest valid segment. */ if (jseg) for (jseg = jblocks->jb_oldestseg; jseg != NULL; jseg = TAILQ_NEXT(jseg, js_next)) if (jseg->js_refs != 0) break; jblocks->jb_oldestseg = jseg; /* * The journal has no valid records but some jsegs may still be * waiting on oldestwrseq to advance. We force a small record * out to permit these lingering records to be reclaimed. */ if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs)) jblocks->jb_needseg = 1; } /* * Release one reference to a jseg and free it if the count reaches 0. This * should eventually reclaim journal space as well. */ static void rele_jseg(jseg) struct jseg *jseg; { KASSERT(jseg->js_refs > 0, ("free_jseg: Invalid refcnt %d", jseg->js_refs)); if (--jseg->js_refs != 0) return; free_jsegs(jseg->js_jblocks); } /* * Release a jsegdep and decrement the jseg count. */ static void free_jsegdep(jsegdep) struct jsegdep *jsegdep; { if (jsegdep->jd_seg) rele_jseg(jsegdep->jd_seg); WORKITEM_FREE(jsegdep, D_JSEGDEP); } /* * Wait for a journal item to make it to disk. Initiate journal processing * if required. */ static int jwait(wk, waitfor) struct worklist *wk; int waitfor; { LOCK_OWNED(VFSTOUFS(wk->wk_mp)); /* * Blocking journal waits cause slow synchronous behavior. Record * stats on the frequency of these blocking operations. */ if (waitfor == MNT_WAIT) { stat_journal_wait++; switch (wk->wk_type) { case D_JREMREF: case D_JMVREF: stat_jwait_filepage++; break; case D_JTRUNC: case D_JFREEBLK: stat_jwait_freeblks++; break; case D_JNEWBLK: stat_jwait_newblk++; break; case D_JADDREF: stat_jwait_inode++; break; default: break; } } /* * If IO has not started we process the journal. We can't mark the * worklist item as IOWAITING because we drop the lock while * processing the journal and the worklist entry may be freed after * this point. The caller may call back in and re-issue the request. */ if ((wk->wk_state & INPROGRESS) == 0) { softdep_process_journal(wk->wk_mp, wk, waitfor); if (waitfor != MNT_WAIT) return (EBUSY); return (0); } if (waitfor != MNT_WAIT) return (EBUSY); wait_worklist(wk, "jwait"); return (0); } /* * Lookup an inodedep based on an inode pointer and set the nlinkdelta as * appropriate. This is a convenience function to reduce duplicate code * for the setup and revert functions below. */ static struct inodedep * inodedep_lookup_ip(ip) struct inode *ip; { struct inodedep *inodedep; int dflags; KASSERT(ip->i_nlink >= ip->i_effnlink, ("inodedep_lookup_ip: bad delta")); dflags = DEPALLOC; if (IS_SNAPSHOT(ip)) dflags |= NODELAY; (void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, &inodedep); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked")); return (inodedep); } /* * Called prior to creating a new inode and linking it to a directory. The * jaddref structure must already be allocated by softdep_setup_inomapdep * and it is discovered here so we can initialize the mode and update * nlinkdelta. */ void softdep_setup_create(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *jaddref; struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0, ("softdep_setup_create called on non-softdep filesystem")); KASSERT(ip->i_nlink == 1, ("softdep_setup_create: Invalid link count.")); dvp = ITOV(dp); ACQUIRE_LOCK(dp->i_ump); inodedep = inodedep_lookup_ip(ip); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, ("softdep_setup_create: No addref structure present.")); } softdep_prelink(dvp, NULL); FREE_LOCK(dp->i_ump); } /* * Create a jaddref structure to track the addition of a DOTDOT link when * we are reparenting an inode as part of a rename. This jaddref will be * found by softdep_setup_directory_change. Adjusts nlinkdelta for * non-journaling softdep. */ void softdep_setup_dotdot_link(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *jaddref; struct vnode *dvp; struct vnode *vp; KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0, ("softdep_setup_dotdot_link called on non-softdep filesystem")); dvp = ITOV(dp); vp = ITOV(ip); jaddref = NULL; /* * We don't set MKDIR_PARENT as this is not tied to a mkdir and * is used as a normal link would be. */ if (DOINGSUJ(dvp)) jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, dp->i_effnlink - 1, dp->i_mode); ACQUIRE_LOCK(dp->i_ump); inodedep = inodedep_lookup_ip(dp); if (jaddref) TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); softdep_prelink(dvp, ITOV(ip)); FREE_LOCK(dp->i_ump); } /* * Create a jaddref structure to track a new link to an inode. The directory * offset is not known until softdep_setup_directory_add or * softdep_setup_directory_change. Adjusts nlinkdelta for non-journaling * softdep. */ void softdep_setup_link(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *jaddref; struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0, ("softdep_setup_link called on non-softdep filesystem")); dvp = ITOV(dp); jaddref = NULL; if (DOINGSUJ(dvp)) jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1, ip->i_mode); ACQUIRE_LOCK(dp->i_ump); inodedep = inodedep_lookup_ip(ip); if (jaddref) TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); softdep_prelink(dvp, ITOV(ip)); FREE_LOCK(dp->i_ump); } /* * Called to create the jaddref structures to track . and .. references as * well as lookup and further initialize the incomplete jaddref created * by softdep_setup_inomapdep when the inode was allocated. Adjusts * nlinkdelta for non-journaling softdep. */ void softdep_setup_mkdir(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *dotdotaddref; struct jaddref *dotaddref; struct jaddref *jaddref; struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0, ("softdep_setup_mkdir called on non-softdep filesystem")); dvp = ITOV(dp); dotaddref = dotdotaddref = NULL; if (DOINGSUJ(dvp)) { dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1, ip->i_mode); dotaddref->ja_state |= MKDIR_BODY; dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, dp->i_effnlink - 1, dp->i_mode); dotdotaddref->ja_state |= MKDIR_PARENT; } ACQUIRE_LOCK(dp->i_ump); inodedep = inodedep_lookup_ip(ip); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref != NULL, ("softdep_setup_mkdir: No addref structure present.")); KASSERT(jaddref->ja_parent == dp->i_number, ("softdep_setup_mkdir: bad parent %ju", (uintmax_t)jaddref->ja_parent)); TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref, if_deps); } inodedep = inodedep_lookup_ip(dp); if (DOINGSUJ(dvp)) TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &dotdotaddref->ja_ref, if_deps); softdep_prelink(ITOV(dp), NULL); FREE_LOCK(dp->i_ump); } /* * Called to track nlinkdelta of the inode and parent directories prior to * unlinking a directory. */ void softdep_setup_rmdir(dp, ip) struct inode *dp; struct inode *ip; { struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0, ("softdep_setup_rmdir called on non-softdep filesystem")); dvp = ITOV(dp); ACQUIRE_LOCK(dp->i_ump); (void) inodedep_lookup_ip(ip); (void) inodedep_lookup_ip(dp); softdep_prelink(dvp, ITOV(ip)); FREE_LOCK(dp->i_ump); } /* * Called to track nlinkdelta of the inode and parent directories prior to * unlink. */ void softdep_setup_unlink(dp, ip) struct inode *dp; struct inode *ip; { struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0, ("softdep_setup_unlink called on non-softdep filesystem")); dvp = ITOV(dp); ACQUIRE_LOCK(dp->i_ump); (void) inodedep_lookup_ip(ip); (void) inodedep_lookup_ip(dp); softdep_prelink(dvp, ITOV(ip)); FREE_LOCK(dp->i_ump); } /* * Called to release the journal structures created by a failed non-directory * creation. Adjusts nlinkdelta for non-journaling softdep. */ void softdep_revert_create(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *jaddref; struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0, ("softdep_revert_create called on non-softdep filesystem")); dvp = ITOV(dp); ACQUIRE_LOCK(dp->i_ump); inodedep = inodedep_lookup_ip(ip); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref->ja_parent == dp->i_number, ("softdep_revert_create: addref parent mismatch")); cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); } FREE_LOCK(dp->i_ump); } /* * Called to release the journal structures created by a failed link * addition. Adjusts nlinkdelta for non-journaling softdep. */ void softdep_revert_link(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *jaddref; struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0, ("softdep_revert_link called on non-softdep filesystem")); dvp = ITOV(dp); ACQUIRE_LOCK(dp->i_ump); inodedep = inodedep_lookup_ip(ip); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref->ja_parent == dp->i_number, ("softdep_revert_link: addref parent mismatch")); cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); } FREE_LOCK(dp->i_ump); } /* * Called to release the journal structures created by a failed mkdir * attempt. Adjusts nlinkdelta for non-journaling softdep. */ void softdep_revert_mkdir(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *jaddref; struct jaddref *dotaddref; struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0, ("softdep_revert_mkdir called on non-softdep filesystem")); dvp = ITOV(dp); ACQUIRE_LOCK(dp->i_ump); inodedep = inodedep_lookup_ip(dp); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref->ja_parent == ip->i_number, ("softdep_revert_mkdir: dotdot addref parent mismatch")); cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); } inodedep = inodedep_lookup_ip(ip); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref->ja_parent == dp->i_number, ("softdep_revert_mkdir: addref parent mismatch")); dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref, inoreflst, if_deps); cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); KASSERT(dotaddref->ja_parent == ip->i_number, ("softdep_revert_mkdir: dot addref parent mismatch")); cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait); } FREE_LOCK(dp->i_ump); } /* * Called to correct nlinkdelta after a failed rmdir. */ void softdep_revert_rmdir(dp, ip) struct inode *dp; struct inode *ip; { KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0, ("softdep_revert_rmdir called on non-softdep filesystem")); ACQUIRE_LOCK(dp->i_ump); (void) inodedep_lookup_ip(ip); (void) inodedep_lookup_ip(dp); FREE_LOCK(dp->i_ump); } /* * Protecting the freemaps (or bitmaps). * * To eliminate the need to execute fsck before mounting a filesystem * after a power failure, one must (conservatively) guarantee that the * on-disk copy of the bitmaps never indicate that a live inode or block is * free. So, when a block or inode is allocated, the bitmap should be * updated (on disk) before any new pointers. When a block or inode is * freed, the bitmap should not be updated until all pointers have been * reset. The latter dependency is handled by the delayed de-allocation * approach described below for block and inode de-allocation. The former * dependency is handled by calling the following procedure when a block or * inode is allocated. When an inode is allocated an "inodedep" is created * with its DEPCOMPLETE flag cleared until its bitmap is written to disk. * Each "inodedep" is also inserted into the hash indexing structure so * that any additional link additions can be made dependent on the inode * allocation. * * The ufs filesystem maintains a number of free block counts (e.g., per * cylinder group, per cylinder and per pair) * in addition to the bitmaps. These counts are used to improve efficiency * during allocation and therefore must be consistent with the bitmaps. * There is no convenient way to guarantee post-crash consistency of these * counts with simple update ordering, for two main reasons: (1) The counts * and bitmaps for a single cylinder group block are not in the same disk * sector. If a disk write is interrupted (e.g., by power failure), one may * be written and the other not. (2) Some of the counts are located in the * superblock rather than the cylinder group block. So, we focus our soft * updates implementation on protecting the bitmaps. When mounting a * filesystem, we recompute the auxiliary counts from the bitmaps. */ /* * Called just after updating the cylinder group block to allocate an inode. */ void softdep_setup_inomapdep(bp, ip, newinum, mode) struct buf *bp; /* buffer for cylgroup block with inode map */ struct inode *ip; /* inode related to allocation */ ino_t newinum; /* new inode number being allocated */ int mode; { struct inodedep *inodedep; struct bmsafemap *bmsafemap; struct jaddref *jaddref; struct mount *mp; struct fs *fs; mp = UFSTOVFS(ip->i_ump); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_inomapdep called on non-softdep filesystem")); fs = ip->i_ump->um_fs; jaddref = NULL; /* * Allocate the journal reference add structure so that the bitmap * can be dependent on it. */ if (MOUNTEDSUJ(mp)) { jaddref = newjaddref(ip, newinum, 0, 0, mode); jaddref->ja_state |= NEWBLOCK; } /* * Create a dependency for the newly allocated inode. * Panic if it already exists as something is seriously wrong. * Otherwise add it to the dependency list for the buffer holding * the cylinder group map from which it was allocated. * * We have to preallocate a bmsafemap entry in case it is needed * in bmsafemap_lookup since once we allocate the inodedep, we * have to finish initializing it before we can FREE_LOCK(). * By preallocating, we avoid FREE_LOCK() while doing a malloc * in bmsafemap_lookup. We cannot call bmsafemap_lookup before * creating the inodedep as it can be freed during the time * that we FREE_LOCK() while allocating the inodedep. We must * call workitem_alloc() before entering the locked section as * it also acquires the lock and we must avoid trying doing so * recursively. */ bmsafemap = malloc(sizeof(struct bmsafemap), M_BMSAFEMAP, M_SOFTDEP_FLAGS); workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); ACQUIRE_LOCK(ip->i_ump); if ((inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep))) panic("softdep_setup_inomapdep: dependency %p for new" "inode already exists", inodedep); bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap); if (jaddref) { LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps); TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); } else { inodedep->id_state |= ONDEPLIST; LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps); } inodedep->id_bmsafemap = bmsafemap; inodedep->id_state &= ~DEPCOMPLETE; FREE_LOCK(ip->i_ump); } /* * Called just after updating the cylinder group block to * allocate block or fragment. */ void softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) struct buf *bp; /* buffer for cylgroup block with block map */ struct mount *mp; /* filesystem doing allocation */ ufs2_daddr_t newblkno; /* number of newly allocated block */ int frags; /* Number of fragments. */ int oldfrags; /* Previous number of fragments for extend. */ { struct newblk *newblk; struct bmsafemap *bmsafemap; struct jnewblk *jnewblk; struct ufsmount *ump; struct fs *fs; KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_blkmapdep called on non-softdep filesystem")); ump = VFSTOUFS(mp); fs = ump->um_fs; jnewblk = NULL; /* * Create a dependency for the newly allocated block. * Add it to the dependency list for the buffer holding * the cylinder group map from which it was allocated. */ if (MOUNTEDSUJ(mp)) { jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS); workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp); jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list); jnewblk->jn_state = ATTACHED; jnewblk->jn_blkno = newblkno; jnewblk->jn_frags = frags; jnewblk->jn_oldfrags = oldfrags; #ifdef SUJ_DEBUG { struct cg *cgp; uint8_t *blksfree; long bno; int i; cgp = (struct cg *)bp->b_data; blksfree = cg_blksfree(cgp); bno = dtogd(fs, jnewblk->jn_blkno); for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) { if (isset(blksfree, bno + i)) panic("softdep_setup_blkmapdep: " "free fragment %d from %d-%d " "state 0x%X dep %p", i, jnewblk->jn_oldfrags, jnewblk->jn_frags, jnewblk->jn_state, jnewblk->jn_dep); } } #endif } CTR3(KTR_SUJ, "softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d", newblkno, frags, oldfrags); ACQUIRE_LOCK(ump); if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0) panic("softdep_setup_blkmapdep: found block"); newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, newblkno), NULL); if (jnewblk) { jnewblk->jn_dep = (struct worklist *)newblk; LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps); } else { newblk->nb_state |= ONDEPLIST; LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); } newblk->nb_bmsafemap = bmsafemap; newblk->nb_jnewblk = jnewblk; FREE_LOCK(ump); } #define BMSAFEMAP_HASH(ump, cg) \ (&(ump)->bmsafemap_hashtbl[(cg) & (ump)->bmsafemap_hash_size]) static int bmsafemap_find(bmsafemaphd, cg, bmsafemapp) struct bmsafemap_hashhead *bmsafemaphd; int cg; struct bmsafemap **bmsafemapp; { struct bmsafemap *bmsafemap; LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash) if (bmsafemap->sm_cg == cg) break; if (bmsafemap) { *bmsafemapp = bmsafemap; return (1); } *bmsafemapp = NULL; return (0); } /* * Find the bmsafemap associated with a cylinder group buffer. * If none exists, create one. The buffer must be locked when * this routine is called and this routine must be called with * the softdep lock held. To avoid giving up the lock while * allocating a new bmsafemap, a preallocated bmsafemap may be * provided. If it is provided but not needed, it is freed. */ static struct bmsafemap * bmsafemap_lookup(mp, bp, cg, newbmsafemap) struct mount *mp; struct buf *bp; int cg; struct bmsafemap *newbmsafemap; { struct bmsafemap_hashhead *bmsafemaphd; struct bmsafemap *bmsafemap, *collision; struct worklist *wk; struct ufsmount *ump; ump = VFSTOUFS(mp); LOCK_OWNED(ump); KASSERT(bp != NULL, ("bmsafemap_lookup: missing buffer")); LIST_FOREACH(wk, &bp->b_dep, wk_list) { if (wk->wk_type == D_BMSAFEMAP) { if (newbmsafemap) WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP); return (WK_BMSAFEMAP(wk)); } } bmsafemaphd = BMSAFEMAP_HASH(ump, cg); if (bmsafemap_find(bmsafemaphd, cg, &bmsafemap) == 1) { if (newbmsafemap) WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP); return (bmsafemap); } if (newbmsafemap) { bmsafemap = newbmsafemap; } else { FREE_LOCK(ump); bmsafemap = malloc(sizeof(struct bmsafemap), M_BMSAFEMAP, M_SOFTDEP_FLAGS); workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); ACQUIRE_LOCK(ump); } bmsafemap->sm_buf = bp; LIST_INIT(&bmsafemap->sm_inodedephd); LIST_INIT(&bmsafemap->sm_inodedepwr); LIST_INIT(&bmsafemap->sm_newblkhd); LIST_INIT(&bmsafemap->sm_newblkwr); LIST_INIT(&bmsafemap->sm_jaddrefhd); LIST_INIT(&bmsafemap->sm_jnewblkhd); LIST_INIT(&bmsafemap->sm_freehd); LIST_INIT(&bmsafemap->sm_freewr); if (bmsafemap_find(bmsafemaphd, cg, &collision) == 1) { WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); return (collision); } bmsafemap->sm_cg = cg; LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash); LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next); WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); return (bmsafemap); } /* * Direct block allocation dependencies. * * When a new block is allocated, the corresponding disk locations must be * initialized (with zeros or new data) before the on-disk inode points to * them. Also, the freemap from which the block was allocated must be * updated (on disk) before the inode's pointer. These two dependencies are * independent of each other and are needed for all file blocks and indirect * blocks that are pointed to directly by the inode. Just before the * "in-core" version of the inode is updated with a newly allocated block * number, a procedure (below) is called to setup allocation dependency * structures. These structures are removed when the corresponding * dependencies are satisfied or when the block allocation becomes obsolete * (i.e., the file is deleted, the block is de-allocated, or the block is a * fragment that gets upgraded). All of these cases are handled in * procedures described later. * * When a file extension causes a fragment to be upgraded, either to a larger * fragment or to a full block, the on-disk location may change (if the * previous fragment could not simply be extended). In this case, the old * fragment must be de-allocated, but not until after the inode's pointer has * been updated. In most cases, this is handled by later procedures, which * will construct a "freefrag" structure to be added to the workitem queue * when the inode update is complete (or obsolete). The main exception to * this is when an allocation occurs while a pending allocation dependency * (for the same block pointer) remains. This case is handled in the main * allocation dependency setup procedure by immediately freeing the * unreferenced fragments. */ void softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; /* inode to which block is being added */ ufs_lbn_t off; /* block pointer within inode */ ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */ long newsize; /* size of new block */ long oldsize; /* size of new block */ struct buf *bp; /* bp for allocated block */ { struct allocdirect *adp, *oldadp; struct allocdirectlst *adphead; struct freefrag *freefrag; struct inodedep *inodedep; struct pagedep *pagedep; struct jnewblk *jnewblk; struct newblk *newblk; struct mount *mp; ufs_lbn_t lbn; lbn = bp->b_lblkno; mp = UFSTOVFS(ip->i_ump); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_allocdirect called on non-softdep filesystem")); if (oldblkno && oldblkno != newblkno) freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); else freefrag = NULL; CTR6(KTR_SUJ, "softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd " "off %jd newsize %ld oldsize %d", ip->i_number, newblkno, oldblkno, off, newsize, oldsize); ACQUIRE_LOCK(ip->i_ump); if (off >= NDADDR) { if (lbn > 0) panic("softdep_setup_allocdirect: bad lbn %jd, off %jd", lbn, off); /* allocating an indirect block */ if (oldblkno != 0) panic("softdep_setup_allocdirect: non-zero indir"); } else { if (off != lbn) panic("softdep_setup_allocdirect: lbn %jd != off %jd", lbn, off); /* * Allocating a direct block. * * If we are allocating a directory block, then we must * allocate an associated pagedep to track additions and * deletions. */ if ((ip->i_mode & IFMT) == IFDIR) pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC, &pagedep); } if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) panic("softdep_setup_allocdirect: lost block"); KASSERT(newblk->nb_list.wk_type == D_NEWBLK, ("softdep_setup_allocdirect: newblk already initialized")); /* * Convert the newblk to an allocdirect. */ WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT); adp = (struct allocdirect *)newblk; newblk->nb_freefrag = freefrag; adp->ad_offset = off; adp->ad_oldblkno = oldblkno; adp->ad_newsize = newsize; adp->ad_oldsize = oldsize; /* * Finish initializing the journal. */ if ((jnewblk = newblk->nb_jnewblk) != NULL) { jnewblk->jn_ino = ip->i_number; jnewblk->jn_lbn = lbn; add_to_journal(&jnewblk->jn_list); } if (freefrag && freefrag->ff_jdep != NULL && freefrag->ff_jdep->wk_type == D_JFREEFRAG) add_to_journal(freefrag->ff_jdep); inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep); adp->ad_inodedep = inodedep; WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); /* * The list of allocdirects must be kept in sorted and ascending * order so that the rollback routines can quickly determine the * first uncommitted block (the size of the file stored on disk * ends at the end of the lowest committed fragment, or if there * are no fragments, at the end of the highest committed block). * Since files generally grow, the typical case is that the new * block is to be added at the end of the list. We speed this * special case by checking against the last allocdirect in the * list before laboriously traversing the list looking for the * insertion point. */ adphead = &inodedep->id_newinoupdt; oldadp = TAILQ_LAST(adphead, allocdirectlst); if (oldadp == NULL || oldadp->ad_offset <= off) { /* insert at end of list */ TAILQ_INSERT_TAIL(adphead, adp, ad_next); if (oldadp != NULL && oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(ip->i_ump); return; } TAILQ_FOREACH(oldadp, adphead, ad_next) { if (oldadp->ad_offset >= off) break; } if (oldadp == NULL) panic("softdep_setup_allocdirect: lost entry"); /* insert in middle of list */ TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); if (oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(ip->i_ump); } /* * Merge a newer and older journal record to be stored either in a * newblock or freefrag. This handles aggregating journal records for * fragment allocation into a second record as well as replacing a * journal free with an aborted journal allocation. A segment for the * oldest record will be placed on wkhd if it has been written. If not * the segment for the newer record will suffice. */ static struct worklist * jnewblk_merge(new, old, wkhd) struct worklist *new; struct worklist *old; struct workhead *wkhd; { struct jnewblk *njnewblk; struct jnewblk *jnewblk; /* Handle NULLs to simplify callers. */ if (new == NULL) return (old); if (old == NULL) return (new); /* Replace a jfreefrag with a jnewblk. */ if (new->wk_type == D_JFREEFRAG) { if (WK_JNEWBLK(old)->jn_blkno != WK_JFREEFRAG(new)->fr_blkno) panic("jnewblk_merge: blkno mismatch: %p, %p", old, new); cancel_jfreefrag(WK_JFREEFRAG(new)); return (old); } if (old->wk_type != D_JNEWBLK || new->wk_type != D_JNEWBLK) panic("jnewblk_merge: Bad type: old %d new %d\n", old->wk_type, new->wk_type); /* * Handle merging of two jnewblk records that describe * different sets of fragments in the same block. */ jnewblk = WK_JNEWBLK(old); njnewblk = WK_JNEWBLK(new); if (jnewblk->jn_blkno != njnewblk->jn_blkno) panic("jnewblk_merge: Merging disparate blocks."); /* * The record may be rolled back in the cg. */ if (jnewblk->jn_state & UNDONE) { jnewblk->jn_state &= ~UNDONE; njnewblk->jn_state |= UNDONE; njnewblk->jn_state &= ~ATTACHED; } /* * We modify the newer addref and free the older so that if neither * has been written the most up-to-date copy will be on disk. If * both have been written but rolled back we only temporarily need * one of them to fix the bits when the cg write completes. */ jnewblk->jn_state |= ATTACHED | COMPLETE; njnewblk->jn_oldfrags = jnewblk->jn_oldfrags; cancel_jnewblk(jnewblk, wkhd); WORKLIST_REMOVE(&jnewblk->jn_list); free_jnewblk(jnewblk); return (new); } /* * Replace an old allocdirect dependency with a newer one. * This routine must be called with splbio interrupts blocked. */ static void allocdirect_merge(adphead, newadp, oldadp) struct allocdirectlst *adphead; /* head of list holding allocdirects */ struct allocdirect *newadp; /* allocdirect being added */ struct allocdirect *oldadp; /* existing allocdirect being checked */ { struct worklist *wk; struct freefrag *freefrag; freefrag = NULL; LOCK_OWNED(VFSTOUFS(newadp->ad_list.wk_mp)); if (newadp->ad_oldblkno != oldadp->ad_newblkno || newadp->ad_oldsize != oldadp->ad_newsize || newadp->ad_offset >= NDADDR) panic("%s %jd != new %jd || old size %ld != new %ld", "allocdirect_merge: old blkno", (intmax_t)newadp->ad_oldblkno, (intmax_t)oldadp->ad_newblkno, newadp->ad_oldsize, oldadp->ad_newsize); newadp->ad_oldblkno = oldadp->ad_oldblkno; newadp->ad_oldsize = oldadp->ad_oldsize; /* * If the old dependency had a fragment to free or had never * previously had a block allocated, then the new dependency * can immediately post its freefrag and adopt the old freefrag. * This action is done by swapping the freefrag dependencies. * The new dependency gains the old one's freefrag, and the * old one gets the new one and then immediately puts it on * the worklist when it is freed by free_newblk. It is * not possible to do this swap when the old dependency had a * non-zero size but no previous fragment to free. This condition * arises when the new block is an extension of the old block. * Here, the first part of the fragment allocated to the new * dependency is part of the block currently claimed on disk by * the old dependency, so cannot legitimately be freed until the * conditions for the new dependency are fulfilled. */ freefrag = newadp->ad_freefrag; if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) { newadp->ad_freefrag = oldadp->ad_freefrag; oldadp->ad_freefrag = freefrag; } /* * If we are tracking a new directory-block allocation, * move it from the old allocdirect to the new allocdirect. */ if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) { WORKLIST_REMOVE(wk); if (!LIST_EMPTY(&oldadp->ad_newdirblk)) panic("allocdirect_merge: extra newdirblk"); WORKLIST_INSERT(&newadp->ad_newdirblk, wk); } TAILQ_REMOVE(adphead, oldadp, ad_next); /* * We need to move any journal dependencies over to the freefrag * that releases this block if it exists. Otherwise we are * extending an existing block and we'll wait until that is * complete to release the journal space and extend the * new journal to cover this old space as well. */ if (freefrag == NULL) { if (oldadp->ad_newblkno != newadp->ad_newblkno) panic("allocdirect_merge: %jd != %jd", oldadp->ad_newblkno, newadp->ad_newblkno); newadp->ad_block.nb_jnewblk = (struct jnewblk *) jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list, &oldadp->ad_block.nb_jnewblk->jn_list, &newadp->ad_block.nb_jwork); oldadp->ad_block.nb_jnewblk = NULL; cancel_newblk(&oldadp->ad_block, NULL, &newadp->ad_block.nb_jwork); } else { wk = (struct worklist *) cancel_newblk(&oldadp->ad_block, &freefrag->ff_list, &freefrag->ff_jwork); freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk, &freefrag->ff_jwork); } free_newblk(&oldadp->ad_block); } /* * Allocate a jfreefrag structure to journal a single block free. */ static struct jfreefrag * newjfreefrag(freefrag, ip, blkno, size, lbn) struct freefrag *freefrag; struct inode *ip; ufs2_daddr_t blkno; long size; ufs_lbn_t lbn; { struct jfreefrag *jfreefrag; struct fs *fs; fs = ip->i_fs; jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG, M_SOFTDEP_FLAGS); workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump)); jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list); jfreefrag->fr_state = ATTACHED | DEPCOMPLETE; jfreefrag->fr_ino = ip->i_number; jfreefrag->fr_lbn = lbn; jfreefrag->fr_blkno = blkno; jfreefrag->fr_frags = numfrags(fs, size); jfreefrag->fr_freefrag = freefrag; return (jfreefrag); } /* * Allocate a new freefrag structure. */ static struct freefrag * newfreefrag(ip, blkno, size, lbn) struct inode *ip; ufs2_daddr_t blkno; long size; ufs_lbn_t lbn; { struct freefrag *freefrag; struct fs *fs; CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd", ip->i_number, blkno, size, lbn); fs = ip->i_fs; if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) panic("newfreefrag: frag size"); freefrag = malloc(sizeof(struct freefrag), M_FREEFRAG, M_SOFTDEP_FLAGS); workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump)); freefrag->ff_state = ATTACHED; LIST_INIT(&freefrag->ff_jwork); freefrag->ff_inum = ip->i_number; freefrag->ff_vtype = ITOV(ip)->v_type; freefrag->ff_blkno = blkno; freefrag->ff_fragsize = size; if (MOUNTEDSUJ(UFSTOVFS(ip->i_ump))) { freefrag->ff_jdep = (struct worklist *) newjfreefrag(freefrag, ip, blkno, size, lbn); } else { freefrag->ff_state |= DEPCOMPLETE; freefrag->ff_jdep = NULL; } return (freefrag); } /* * This workitem de-allocates fragments that were replaced during * file block allocation. */ static void handle_workitem_freefrag(freefrag) struct freefrag *freefrag; { struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp); struct workhead wkhd; CTR3(KTR_SUJ, "handle_workitem_freefrag: ino %d blkno %jd size %ld", freefrag->ff_inum, freefrag->ff_blkno, freefrag->ff_fragsize); /* * It would be illegal to add new completion items to the * freefrag after it was schedule to be done so it must be * safe to modify the list head here. */ LIST_INIT(&wkhd); ACQUIRE_LOCK(ump); LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list); /* * If the journal has not been written we must cancel it here. */ if (freefrag->ff_jdep) { if (freefrag->ff_jdep->wk_type != D_JNEWBLK) panic("handle_workitem_freefrag: Unexpected type %d\n", freefrag->ff_jdep->wk_type); cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd); } FREE_LOCK(ump); ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno, freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd); ACQUIRE_LOCK(ump); WORKITEM_FREE(freefrag, D_FREEFRAG); FREE_LOCK(ump); } /* * Set up a dependency structure for an external attributes data block. * This routine follows much of the structure of softdep_setup_allocdirect. * See the description of softdep_setup_allocdirect above for details. */ void softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; ufs_lbn_t off; ufs2_daddr_t newblkno; ufs2_daddr_t oldblkno; long newsize; long oldsize; struct buf *bp; { struct allocdirect *adp, *oldadp; struct allocdirectlst *adphead; struct freefrag *freefrag; struct inodedep *inodedep; struct jnewblk *jnewblk; struct newblk *newblk; struct mount *mp; ufs_lbn_t lbn; mp = UFSTOVFS(ip->i_ump); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_allocext called on non-softdep filesystem")); KASSERT(off < NXADDR, ("softdep_setup_allocext: lbn %lld > NXADDR", (long long)off)); lbn = bp->b_lblkno; if (oldblkno && oldblkno != newblkno) freefrag = newfreefrag(ip, oldblkno, oldsize, lbn); else freefrag = NULL; ACQUIRE_LOCK(ip->i_ump); if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) panic("softdep_setup_allocext: lost block"); KASSERT(newblk->nb_list.wk_type == D_NEWBLK, ("softdep_setup_allocext: newblk already initialized")); /* * Convert the newblk to an allocdirect. */ WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT); adp = (struct allocdirect *)newblk; newblk->nb_freefrag = freefrag; adp->ad_offset = off; adp->ad_oldblkno = oldblkno; adp->ad_newsize = newsize; adp->ad_oldsize = oldsize; adp->ad_state |= EXTDATA; /* * Finish initializing the journal. */ if ((jnewblk = newblk->nb_jnewblk) != NULL) { jnewblk->jn_ino = ip->i_number; jnewblk->jn_lbn = lbn; add_to_journal(&jnewblk->jn_list); } if (freefrag && freefrag->ff_jdep != NULL && freefrag->ff_jdep->wk_type == D_JFREEFRAG) add_to_journal(freefrag->ff_jdep); inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep); adp->ad_inodedep = inodedep; WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); /* * The list of allocdirects must be kept in sorted and ascending * order so that the rollback routines can quickly determine the * first uncommitted block (the size of the file stored on disk * ends at the end of the lowest committed fragment, or if there * are no fragments, at the end of the highest committed block). * Since files generally grow, the typical case is that the new * block is to be added at the end of the list. We speed this * special case by checking against the last allocdirect in the * list before laboriously traversing the list looking for the * insertion point. */ adphead = &inodedep->id_newextupdt; oldadp = TAILQ_LAST(adphead, allocdirectlst); if (oldadp == NULL || oldadp->ad_offset <= off) { /* insert at end of list */ TAILQ_INSERT_TAIL(adphead, adp, ad_next); if (oldadp != NULL && oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(ip->i_ump); return; } TAILQ_FOREACH(oldadp, adphead, ad_next) { if (oldadp->ad_offset >= off) break; } if (oldadp == NULL) panic("softdep_setup_allocext: lost entry"); /* insert in middle of list */ TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); if (oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(ip->i_ump); } /* * Indirect block allocation dependencies. * * The same dependencies that exist for a direct block also exist when * a new block is allocated and pointed to by an entry in a block of * indirect pointers. The undo/redo states described above are also * used here. Because an indirect block contains many pointers that * may have dependencies, a second copy of the entire in-memory indirect * block is kept. The buffer cache copy is always completely up-to-date. * The second copy, which is used only as a source for disk writes, * contains only the safe pointers (i.e., those that have no remaining * update dependencies). The second copy is freed when all pointers * are safe. The cache is not allowed to replace indirect blocks with * pending update dependencies. If a buffer containing an indirect * block with dependencies is written, these routines will mark it * dirty again. It can only be successfully written once all the * dependencies are removed. The ffs_fsync routine in conjunction with * softdep_sync_metadata work together to get all the dependencies * removed so that a file can be successfully written to disk. Three * procedures are used when setting up indirect block pointer * dependencies. The division is necessary because of the organization * of the "balloc" routine and because of the distinction between file * pages and file metadata blocks. */ /* * Allocate a new allocindir structure. */ static struct allocindir * newallocindir(ip, ptrno, newblkno, oldblkno, lbn) struct inode *ip; /* inode for file being extended */ int ptrno; /* offset of pointer in indirect block */ ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ ufs_lbn_t lbn; { struct newblk *newblk; struct allocindir *aip; struct freefrag *freefrag; struct jnewblk *jnewblk; if (oldblkno) freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn); else freefrag = NULL; ACQUIRE_LOCK(ip->i_ump); if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0) panic("new_allocindir: lost block"); KASSERT(newblk->nb_list.wk_type == D_NEWBLK, ("newallocindir: newblk already initialized")); WORKITEM_REASSIGN(newblk, D_ALLOCINDIR); newblk->nb_freefrag = freefrag; aip = (struct allocindir *)newblk; aip->ai_offset = ptrno; aip->ai_oldblkno = oldblkno; aip->ai_lbn = lbn; if ((jnewblk = newblk->nb_jnewblk) != NULL) { jnewblk->jn_ino = ip->i_number; jnewblk->jn_lbn = lbn; add_to_journal(&jnewblk->jn_list); } if (freefrag && freefrag->ff_jdep != NULL && freefrag->ff_jdep->wk_type == D_JFREEFRAG) add_to_journal(freefrag->ff_jdep); return (aip); } /* * Called just before setting an indirect block pointer * to a newly allocated file page. */ void softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) struct inode *ip; /* inode for file being extended */ ufs_lbn_t lbn; /* allocated block number within file */ struct buf *bp; /* buffer with indirect blk referencing page */ int ptrno; /* offset of pointer in indirect block */ ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ struct buf *nbp; /* buffer holding allocated page */ { struct inodedep *inodedep; struct freefrag *freefrag; struct allocindir *aip; struct pagedep *pagedep; struct mount *mp; int dflags; mp = UFSTOVFS(ip->i_ump); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_allocindir_page called on non-softdep filesystem")); KASSERT(lbn == nbp->b_lblkno, ("softdep_setup_allocindir_page: lbn %jd != lblkno %jd", lbn, bp->b_lblkno)); CTR4(KTR_SUJ, "softdep_setup_allocindir_page: ino %d blkno %jd oldblkno %jd " "lbn %jd", ip->i_number, newblkno, oldblkno, lbn); ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page"); aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn); dflags = DEPALLOC; if (IS_SNAPSHOT(ip)) dflags |= NODELAY; (void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep); /* * If we are allocating a directory page, then we must * allocate an associated pagedep to track additions and * deletions. */ if ((ip->i_mode & IFMT) == IFDIR) pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep); WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn); FREE_LOCK(ip->i_ump); if (freefrag) handle_workitem_freefrag(freefrag); } /* * Called just before setting an indirect block pointer to a * newly allocated indirect block. */ void softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) struct buf *nbp; /* newly allocated indirect block */ struct inode *ip; /* inode for file being extended */ struct buf *bp; /* indirect block referencing allocated block */ int ptrno; /* offset of pointer in indirect block */ ufs2_daddr_t newblkno; /* disk block number being added */ { struct inodedep *inodedep; struct allocindir *aip; ufs_lbn_t lbn; int dflags; KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0, ("softdep_setup_allocindir_meta called on non-softdep filesystem")); CTR3(KTR_SUJ, "softdep_setup_allocindir_meta: ino %d blkno %jd ptrno %d", ip->i_number, newblkno, ptrno); lbn = nbp->b_lblkno; ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta"); aip = newallocindir(ip, ptrno, newblkno, 0, lbn); dflags = DEPALLOC; if (IS_SNAPSHOT(ip)) dflags |= NODELAY; inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, &inodedep); WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)) panic("softdep_setup_allocindir_meta: Block already existed"); FREE_LOCK(ip->i_ump); } static void indirdep_complete(indirdep) struct indirdep *indirdep; { struct allocindir *aip; LIST_REMOVE(indirdep, ir_next); indirdep->ir_state |= DEPCOMPLETE; while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) { LIST_REMOVE(aip, ai_next); free_newblk(&aip->ai_block); } /* * If this indirdep is not attached to a buf it was simply waiting * on completion to clear completehd. free_indirdep() asserts * that nothing is dangling. */ if ((indirdep->ir_state & ONWORKLIST) == 0) free_indirdep(indirdep); } static struct indirdep * indirdep_lookup(mp, ip, bp) struct mount *mp; struct inode *ip; struct buf *bp; { struct indirdep *indirdep, *newindirdep; struct newblk *newblk; struct ufsmount *ump; struct worklist *wk; struct fs *fs; ufs2_daddr_t blkno; ump = VFSTOUFS(mp); LOCK_OWNED(ump); indirdep = NULL; newindirdep = NULL; fs = ip->i_fs; for (;;) { LIST_FOREACH(wk, &bp->b_dep, wk_list) { if (wk->wk_type != D_INDIRDEP) continue; indirdep = WK_INDIRDEP(wk); break; } /* Found on the buffer worklist, no new structure to free. */ if (indirdep != NULL && newindirdep == NULL) return (indirdep); if (indirdep != NULL && newindirdep != NULL) panic("indirdep_lookup: simultaneous create"); /* None found on the buffer and a new structure is ready. */ if (indirdep == NULL && newindirdep != NULL) break; /* None found and no new structure available. */ FREE_LOCK(ump); newindirdep = malloc(sizeof(struct indirdep), M_INDIRDEP, M_SOFTDEP_FLAGS); workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp); newindirdep->ir_state = ATTACHED; if (ip->i_ump->um_fstype == UFS1) newindirdep->ir_state |= UFS1FMT; TAILQ_INIT(&newindirdep->ir_trunc); newindirdep->ir_saveddata = NULL; LIST_INIT(&newindirdep->ir_deplisthd); LIST_INIT(&newindirdep->ir_donehd); LIST_INIT(&newindirdep->ir_writehd); LIST_INIT(&newindirdep->ir_completehd); if (bp->b_blkno == bp->b_lblkno) { ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp, NULL, NULL); bp->b_blkno = blkno; } newindirdep->ir_freeblks = NULL; newindirdep->ir_savebp = getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0); newindirdep->ir_bp = bp; BUF_KERNPROC(newindirdep->ir_savebp); bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount); ACQUIRE_LOCK(ump); } indirdep = newindirdep; WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); /* * If the block is not yet allocated we don't set DEPCOMPLETE so * that we don't free dependencies until the pointers are valid. * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather * than using the hash. */ if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)) LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next); else indirdep->ir_state |= DEPCOMPLETE; return (indirdep); } /* * Called to finish the allocation of the "aip" allocated * by one of the two routines above. */ static struct freefrag * setup_allocindir_phase2(bp, ip, inodedep, aip, lbn) struct buf *bp; /* in-memory copy of the indirect block */ struct inode *ip; /* inode for file being extended */ struct inodedep *inodedep; /* Inodedep for ip */ struct allocindir *aip; /* allocindir allocated by the above routines */ ufs_lbn_t lbn; /* Logical block number for this block. */ { struct fs *fs; struct indirdep *indirdep; struct allocindir *oldaip; struct freefrag *freefrag; struct mount *mp; LOCK_OWNED(ip->i_ump); mp = UFSTOVFS(ip->i_ump); fs = ip->i_fs; if (bp->b_lblkno >= 0) panic("setup_allocindir_phase2: not indir blk"); KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs), ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset)); indirdep = indirdep_lookup(mp, ip, bp); KASSERT(indirdep->ir_savebp != NULL, ("setup_allocindir_phase2 NULL ir_savebp")); aip->ai_indirdep = indirdep; /* * Check for an unwritten dependency for this indirect offset. If * there is, merge the old dependency into the new one. This happens * as a result of reallocblk only. */ freefrag = NULL; if (aip->ai_oldblkno != 0) { LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) { if (oldaip->ai_offset == aip->ai_offset) { freefrag = allocindir_merge(aip, oldaip); goto done; } } LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) { if (oldaip->ai_offset == aip->ai_offset) { freefrag = allocindir_merge(aip, oldaip); goto done; } } } done: LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); return (freefrag); } /* * Merge two allocindirs which refer to the same block. Move newblock * dependencies and setup the freefrags appropriately. */ static struct freefrag * allocindir_merge(aip, oldaip) struct allocindir *aip; struct allocindir *oldaip; { struct freefrag *freefrag; struct worklist *wk; if (oldaip->ai_newblkno != aip->ai_oldblkno) panic("allocindir_merge: blkno"); aip->ai_oldblkno = oldaip->ai_oldblkno; freefrag = aip->ai_freefrag; aip->ai_freefrag = oldaip->ai_freefrag; oldaip->ai_freefrag = NULL; KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag")); /* * If we are tracking a new directory-block allocation, * move it from the old allocindir to the new allocindir. */ if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) { WORKLIST_REMOVE(wk); if (!LIST_EMPTY(&oldaip->ai_newdirblk)) panic("allocindir_merge: extra newdirblk"); WORKLIST_INSERT(&aip->ai_newdirblk, wk); } /* * We can skip journaling for this freefrag and just complete * any pending journal work for the allocindir that is being * removed after the freefrag completes. */ if (freefrag->ff_jdep) cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep)); LIST_REMOVE(oldaip, ai_next); freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block, &freefrag->ff_list, &freefrag->ff_jwork); free_newblk(&oldaip->ai_block); return (freefrag); } static inline void setup_freedirect(freeblks, ip, i, needj) struct freeblks *freeblks; struct inode *ip; int i; int needj; { ufs2_daddr_t blkno; int frags; blkno = DIP(ip, i_db[i]); if (blkno == 0) return; DIP_SET(ip, i_db[i], 0); frags = sblksize(ip->i_fs, ip->i_size, i); frags = numfrags(ip->i_fs, frags); newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags, 0, needj); } static inline void setup_freeext(freeblks, ip, i, needj) struct freeblks *freeblks; struct inode *ip; int i; int needj; { ufs2_daddr_t blkno; int frags; blkno = ip->i_din2->di_extb[i]; if (blkno == 0) return; ip->i_din2->di_extb[i] = 0; frags = sblksize(ip->i_fs, ip->i_din2->di_extsize, i); frags = numfrags(ip->i_fs, frags); newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj); } static inline void setup_freeindir(freeblks, ip, i, lbn, needj) struct freeblks *freeblks; struct inode *ip; int i; ufs_lbn_t lbn; int needj; { ufs2_daddr_t blkno; blkno = DIP(ip, i_ib[i]); if (blkno == 0) return; DIP_SET(ip, i_ib[i], 0); newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, ip->i_fs->fs_frag, 0, needj); } static inline struct freeblks * newfreeblks(mp, ip) struct mount *mp; struct inode *ip; { struct freeblks *freeblks; freeblks = malloc(sizeof(struct freeblks), M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp); LIST_INIT(&freeblks->fb_jblkdephd); LIST_INIT(&freeblks->fb_jwork); freeblks->fb_ref = 0; freeblks->fb_cgwait = 0; freeblks->fb_state = ATTACHED; freeblks->fb_uid = ip->i_uid; freeblks->fb_inum = ip->i_number; freeblks->fb_vtype = ITOV(ip)->v_type; freeblks->fb_modrev = DIP(ip, i_modrev); freeblks->fb_devvp = ip->i_devvp; freeblks->fb_chkcnt = 0; freeblks->fb_len = 0; return (freeblks); } static void trunc_indirdep(indirdep, freeblks, bp, off) struct indirdep *indirdep; struct freeblks *freeblks; struct buf *bp; int off; { struct allocindir *aip, *aipn; /* * The first set of allocindirs won't be in savedbp. */ LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn) if (aip->ai_offset > off) cancel_allocindir(aip, bp, freeblks, 1); LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn) if (aip->ai_offset > off) cancel_allocindir(aip, bp, freeblks, 1); /* * These will exist in savedbp. */ LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn) if (aip->ai_offset > off) cancel_allocindir(aip, NULL, freeblks, 0); LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn) if (aip->ai_offset > off) cancel_allocindir(aip, NULL, freeblks, 0); } /* * Follow the chain of indirects down to lastlbn creating a freework * structure for each. This will be used to start indir_trunc() at * the right offset and create the journal records for the parrtial * truncation. A second step will handle the truncated dependencies. */ static int setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno) struct freeblks *freeblks; struct inode *ip; ufs_lbn_t lbn; ufs_lbn_t lastlbn; ufs2_daddr_t blkno; { struct indirdep *indirdep; struct indirdep *indirn; struct freework *freework; struct newblk *newblk; struct mount *mp; struct buf *bp; uint8_t *start; uint8_t *end; ufs_lbn_t lbnadd; int level; int error; int off; freework = NULL; if (blkno == 0) return (0); mp = freeblks->fb_list.wk_mp; bp = getblk(ITOV(ip), lbn, mp->mnt_stat.f_iosize, 0, 0, 0); if ((bp->b_flags & B_CACHE) == 0) { bp->b_blkno = blkptrtodb(VFSTOUFS(mp), blkno); bp->b_iocmd = BIO_READ; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; vfs_busy_pages(bp, 0); bp->b_iooffset = dbtob(bp->b_blkno); bstrategy(bp); curthread->td_ru.ru_inblock++; error = bufwait(bp); if (error) { brelse(bp); return (error); } } level = lbn_level(lbn); lbnadd = lbn_offset(ip->i_fs, level); /* * Compute the offset of the last block we want to keep. Store * in the freework the first block we want to completely free. */ off = (lastlbn - -(lbn + level)) / lbnadd; if (off + 1 == NINDIR(ip->i_fs)) goto nowork; freework = newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, 0, off+1, 0); /* * Link the freework into the indirdep. This will prevent any new * allocations from proceeding until we are finished with the * truncate and the block is written. */ ACQUIRE_LOCK(ip->i_ump); indirdep = indirdep_lookup(mp, ip, bp); if (indirdep->ir_freeblks) panic("setup_trunc_indir: indirdep already truncated."); TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next); freework->fw_indir = indirdep; /* * Cancel any allocindirs that will not make it to disk. * We have to do this for all copies of the indirdep that * live on this newblk. */ if ((indirdep->ir_state & DEPCOMPLETE) == 0) { newblk_lookup(mp, dbtofsb(ip->i_fs, bp->b_blkno), 0, &newblk); LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next) trunc_indirdep(indirn, freeblks, bp, off); } else trunc_indirdep(indirdep, freeblks, bp, off); FREE_LOCK(ip->i_ump); /* * Creation is protected by the buf lock. The saveddata is only * needed if a full truncation follows a partial truncation but it * is difficult to allocate in that case so we fetch it anyway. */ if (indirdep->ir_saveddata == NULL) indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP, M_SOFTDEP_FLAGS); nowork: /* Fetch the blkno of the child and the zero start offset. */ if (ip->i_ump->um_fstype == UFS1) { blkno = ((ufs1_daddr_t *)bp->b_data)[off]; start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1]; } else { blkno = ((ufs2_daddr_t *)bp->b_data)[off]; start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1]; } if (freework) { /* Zero the truncated pointers. */ end = bp->b_data + bp->b_bcount; bzero(start, end - start); bdwrite(bp); } else bqrelse(bp); if (level == 0) return (0); lbn++; /* adjust level */ lbn -= (off * lbnadd); return setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno); } /* * Complete the partial truncation of an indirect block setup by * setup_trunc_indir(). This zeros the truncated pointers in the saved * copy and writes them to disk before the freeblks is allowed to complete. */ static void complete_trunc_indir(freework) struct freework *freework; { struct freework *fwn; struct indirdep *indirdep; struct ufsmount *ump; struct buf *bp; uintptr_t start; int count; ump = VFSTOUFS(freework->fw_list.wk_mp); LOCK_OWNED(ump); indirdep = freework->fw_indir; for (;;) { bp = indirdep->ir_bp; /* See if the block was discarded. */ if (bp == NULL) break; /* Inline part of getdirtybuf(). We dont want bremfree. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) break; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, LOCK_PTR(ump)) == 0) BUF_UNLOCK(bp); ACQUIRE_LOCK(ump); } freework->fw_state |= DEPCOMPLETE; TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next); /* * Zero the pointers in the saved copy. */ if (indirdep->ir_state & UFS1FMT) start = sizeof(ufs1_daddr_t); else start = sizeof(ufs2_daddr_t); start *= freework->fw_start; count = indirdep->ir_savebp->b_bcount - start; start += (uintptr_t)indirdep->ir_savebp->b_data; bzero((char *)start, count); /* * We need to start the next truncation in the list if it has not * been started yet. */ fwn = TAILQ_FIRST(&indirdep->ir_trunc); if (fwn != NULL) { if (fwn->fw_freeblks == indirdep->ir_freeblks) TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next); if ((fwn->fw_state & ONWORKLIST) == 0) freework_enqueue(fwn); } /* * If bp is NULL the block was fully truncated, restore * the saved block list otherwise free it if it is no * longer needed. */ if (TAILQ_EMPTY(&indirdep->ir_trunc)) { if (bp == NULL) bcopy(indirdep->ir_saveddata, indirdep->ir_savebp->b_data, indirdep->ir_savebp->b_bcount); free(indirdep->ir_saveddata, M_INDIRDEP); indirdep->ir_saveddata = NULL; } /* * When bp is NULL there is a full truncation pending. We * must wait for this full truncation to be journaled before * we can release this freework because the disk pointers will * never be written as zero. */ if (bp == NULL) { if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd)) handle_written_freework(freework); else WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd, &freework->fw_list); } else { /* Complete when the real copy is written. */ WORKLIST_INSERT(&bp->b_dep, &freework->fw_list); BUF_UNLOCK(bp); } } /* * Calculate the number of blocks we are going to release where datablocks * is the current total and length is the new file size. */ static ufs2_daddr_t blkcount(fs, datablocks, length) struct fs *fs; ufs2_daddr_t datablocks; off_t length; { off_t totblks, numblks; totblks = 0; numblks = howmany(length, fs->fs_bsize); if (numblks <= NDADDR) { totblks = howmany(length, fs->fs_fsize); goto out; } totblks = blkstofrags(fs, numblks); numblks -= NDADDR; /* * Count all single, then double, then triple indirects required. * Subtracting one indirects worth of blocks for each pass * acknowledges one of each pointed to by the inode. */ for (;;) { totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs))); numblks -= NINDIR(fs); if (numblks <= 0) break; numblks = howmany(numblks, NINDIR(fs)); } out: totblks = fsbtodb(fs, totblks); /* * Handle sparse files. We can't reclaim more blocks than the inode * references. We will correct it later in handle_complete_freeblks() * when we know the real count. */ if (totblks > datablocks) return (0); return (datablocks - totblks); } /* * Handle freeblocks for journaled softupdate filesystems. * * Contrary to normal softupdates, we must preserve the block pointers in * indirects until their subordinates are free. This is to avoid journaling * every block that is freed which may consume more space than the journal * itself. The recovery program will see the free block journals at the * base of the truncated area and traverse them to reclaim space. The * pointers in the inode may be cleared immediately after the journal * records are written because each direct and indirect pointer in the * inode is recorded in a journal. This permits full truncation to proceed * asynchronously. The write order is journal -> inode -> cgs -> indirects. * * The algorithm is as follows: * 1) Traverse the in-memory state and create journal entries to release * the relevant blocks and full indirect trees. * 2) Traverse the indirect block chain adding partial truncation freework * records to indirects in the path to lastlbn. The freework will * prevent new allocation dependencies from being satisfied in this * indirect until the truncation completes. * 3) Read and lock the inode block, performing an update with the new size * and pointers. This prevents truncated data from becoming valid on * disk through step 4. * 4) Reap unsatisfied dependencies that are beyond the truncated area, * eliminate journal work for those records that do not require it. * 5) Schedule the journal records to be written followed by the inode block. * 6) Allocate any necessary frags for the end of file. * 7) Zero any partially truncated blocks. * * From this truncation proceeds asynchronously using the freework and * indir_trunc machinery. The file will not be extended again into a * partially truncated indirect block until all work is completed but * the normal dependency mechanism ensures that it is rolled back/forward * as appropriate. Further truncation may occur without delay and is * serialized in indir_trunc(). */ void softdep_journal_freeblocks(ip, cred, length, flags) struct inode *ip; /* The inode whose length is to be reduced */ struct ucred *cred; off_t length; /* The new length for the file */ int flags; /* IO_EXT and/or IO_NORMAL */ { struct freeblks *freeblks, *fbn; struct worklist *wk, *wkn; struct inodedep *inodedep; struct jblkdep *jblkdep; struct allocdirect *adp, *adpn; struct ufsmount *ump; struct fs *fs; struct buf *bp; struct vnode *vp; struct mount *mp; ufs2_daddr_t extblocks, datablocks; ufs_lbn_t tmpval, lbn, lastlbn; int frags, lastoff, iboff, allocblock, needj, dflags, error, i; fs = ip->i_fs; ump = ip->i_ump; mp = UFSTOVFS(ump); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_journal_freeblocks called on non-softdep filesystem")); vp = ITOV(ip); needj = 1; iboff = -1; allocblock = 0; extblocks = 0; datablocks = 0; frags = 0; freeblks = newfreeblks(mp, ip); ACQUIRE_LOCK(ump); /* * If we're truncating a removed file that will never be written * we don't need to journal the block frees. The canceled journals * for the allocations will suffice. */ dflags = DEPALLOC; if (IS_SNAPSHOT(ip)) dflags |= NODELAY; inodedep_lookup(mp, ip->i_number, dflags, &inodedep); if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED && length == 0) needj = 0; CTR3(KTR_SUJ, "softdep_journal_freeblks: ip %d length %ld needj %d", ip->i_number, length, needj); FREE_LOCK(ump); /* * Calculate the lbn that we are truncating to. This results in -1 * if we're truncating the 0 bytes. So it is the last lbn we want * to keep, not the first lbn we want to truncate. */ lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1; lastoff = blkoff(fs, length); /* * Compute frags we are keeping in lastlbn. 0 means all. */ if (lastlbn >= 0 && lastlbn < NDADDR) { frags = fragroundup(fs, lastoff); /* adp offset of last valid allocdirect. */ iboff = lastlbn; } else if (lastlbn > 0) iboff = NDADDR; if (fs->fs_magic == FS_UFS2_MAGIC) extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); /* * Handle normal data blocks and indirects. This section saves * values used after the inode update to complete frag and indirect * truncation. */ if ((flags & IO_NORMAL) != 0) { /* * Handle truncation of whole direct and indirect blocks. */ for (i = iboff + 1; i < NDADDR; i++) setup_freedirect(freeblks, ip, i, needj); for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; i++, lbn += tmpval, tmpval *= NINDIR(fs)) { /* Release a whole indirect tree. */ if (lbn > lastlbn) { setup_freeindir(freeblks, ip, i, -lbn -i, needj); continue; } iboff = i + NDADDR; /* * Traverse partially truncated indirect tree. */ if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn) setup_trunc_indir(freeblks, ip, -lbn - i, lastlbn, DIP(ip, i_ib[i])); } /* * Handle partial truncation to a frag boundary. */ if (frags) { ufs2_daddr_t blkno; long oldfrags; oldfrags = blksize(fs, ip, lastlbn); blkno = DIP(ip, i_db[lastlbn]); if (blkno && oldfrags != frags) { oldfrags -= frags; oldfrags = numfrags(ip->i_fs, oldfrags); blkno += numfrags(ip->i_fs, frags); newfreework(ump, freeblks, NULL, lastlbn, blkno, oldfrags, 0, needj); if (needj) adjust_newfreework(freeblks, numfrags(ip->i_fs, frags)); } else if (blkno == 0) allocblock = 1; } /* * Add a journal record for partial truncate if we are * handling indirect blocks. Non-indirects need no extra * journaling. */ if (length != 0 && lastlbn >= NDADDR) { ip->i_flag |= IN_TRUNCATED; newjtrunc(freeblks, length, 0); } ip->i_size = length; DIP_SET(ip, i_size, ip->i_size); datablocks = DIP(ip, i_blocks) - extblocks; if (length != 0) datablocks = blkcount(ip->i_fs, datablocks, length); freeblks->fb_len = length; } if ((flags & IO_EXT) != 0) { for (i = 0; i < NXADDR; i++) setup_freeext(freeblks, ip, i, needj); ip->i_din2->di_extsize = 0; datablocks += extblocks; } #ifdef QUOTA /* Reference the quotas in case the block count is wrong in the end. */ quotaref(vp, freeblks->fb_quota); (void) chkdq(ip, -datablocks, NOCRED, 0); #endif freeblks->fb_chkcnt = -datablocks; UFS_LOCK(ump); fs->fs_pendingblocks += datablocks; UFS_UNLOCK(ump); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks); /* * Handle truncation of incomplete alloc direct dependencies. We * hold the inode block locked to prevent incomplete dependencies * from reaching the disk while we are eliminating those that * have been truncated. This is a partially inlined ffs_update(). */ ufs_itimes(vp); ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED); error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, cred, &bp); if (error) { brelse(bp); softdep_error("softdep_journal_freeblocks", error); return; } if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; softdep_update_inodeblock(ip, bp, 0); if (ump->um_fstype == UFS1) *((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1; else *((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2; ACQUIRE_LOCK(ump); (void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep); if ((inodedep->id_state & IOSTARTED) != 0) panic("softdep_setup_freeblocks: inode busy"); /* * Add the freeblks structure to the list of operations that * must await the zero'ed inode being written to disk. If we * still have a bitmap dependency (needj), then the inode * has never been written to disk, so we can process the * freeblks below once we have deleted the dependencies. */ if (needj) WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list); else freeblks->fb_state |= COMPLETE; if ((flags & IO_NORMAL) != 0) { TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) { if (adp->ad_offset > iboff) cancel_allocdirect(&inodedep->id_inoupdt, adp, freeblks); /* * Truncate the allocdirect. We could eliminate * or modify journal records as well. */ else if (adp->ad_offset == iboff && frags) adp->ad_newsize = frags; } } if ((flags & IO_EXT) != 0) while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0) cancel_allocdirect(&inodedep->id_extupdt, adp, freeblks); /* * Scan the bufwait list for newblock dependencies that will never * make it to disk. */ LIST_FOREACH_SAFE(wk, &inodedep->id_bufwait, wk_list, wkn) { if (wk->wk_type != D_ALLOCDIRECT) continue; adp = WK_ALLOCDIRECT(wk); if (((flags & IO_NORMAL) != 0 && (adp->ad_offset > iboff)) || ((flags & IO_EXT) != 0 && (adp->ad_state & EXTDATA))) { cancel_jfreeblk(freeblks, adp->ad_newblkno); cancel_newblk(WK_NEWBLK(wk), NULL, &freeblks->fb_jwork); WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk); } } /* * Add journal work. */ LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) add_to_journal(&jblkdep->jb_list); FREE_LOCK(ump); bdwrite(bp); /* * Truncate dependency structures beyond length. */ trunc_dependencies(ip, freeblks, lastlbn, frags, flags); /* * This is only set when we need to allocate a fragment because * none existed at the end of a frag-sized file. It handles only * allocating a new, zero filled block. */ if (allocblock) { ip->i_size = length - lastoff; DIP_SET(ip, i_size, ip->i_size); error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp); if (error != 0) { softdep_error("softdep_journal_freeblks", error); return; } ip->i_size = length; DIP_SET(ip, i_size, length); ip->i_flag |= IN_CHANGE | IN_UPDATE; allocbuf(bp, frags); ffs_update(vp, 0); bawrite(bp); } else if (lastoff != 0 && vp->v_type != VDIR) { int size; /* * Zero the end of a truncated frag or block. */ size = sblksize(fs, length, lastlbn); error = bread(vp, lastlbn, size, cred, &bp); if (error) { softdep_error("softdep_journal_freeblks", error); return; } bzero((char *)bp->b_data + lastoff, size - lastoff); bawrite(bp); } ACQUIRE_LOCK(ump); inodedep_lookup(mp, ip->i_number, dflags, &inodedep); TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next); freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST; /* * We zero earlier truncations so they don't erroneously * update i_blocks. */ if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0) TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next) fbn->fb_len = 0; if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd)) freeblks->fb_state |= INPROGRESS; else freeblks = NULL; FREE_LOCK(ump); if (freeblks) handle_workitem_freeblocks(freeblks, 0); trunc_pages(ip, length, extblocks, flags); } /* * Flush a JOP_SYNC to the journal. */ void softdep_journal_fsync(ip) struct inode *ip; { struct jfsync *jfsync; KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0, ("softdep_journal_fsync called on non-softdep filesystem")); if ((ip->i_flag & IN_TRUNCATED) == 0) return; ip->i_flag &= ~IN_TRUNCATED; jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO); workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ip->i_ump)); jfsync->jfs_size = ip->i_size; jfsync->jfs_ino = ip->i_number; ACQUIRE_LOCK(ip->i_ump); add_to_journal(&jfsync->jfs_list); jwait(&jfsync->jfs_list, MNT_WAIT); FREE_LOCK(ip->i_ump); } /* * Block de-allocation dependencies. * * When blocks are de-allocated, the on-disk pointers must be nullified before * the blocks are made available for use by other files. (The true * requirement is that old pointers must be nullified before new on-disk * pointers are set. We chose this slightly more stringent requirement to * reduce complexity.) Our implementation handles this dependency by updating * the inode (or indirect block) appropriately but delaying the actual block * de-allocation (i.e., freemap and free space count manipulation) until * after the updated versions reach stable storage. After the disk is * updated, the blocks can be safely de-allocated whenever it is convenient. * This implementation handles only the common case of reducing a file's * length to zero. Other cases are handled by the conventional synchronous * write approach. * * The ffs implementation with which we worked double-checks * the state of the block pointers and file size as it reduces * a file's length. Some of this code is replicated here in our * soft updates implementation. The freeblks->fb_chkcnt field is * used to transfer a part of this information to the procedure * that eventually de-allocates the blocks. * * This routine should be called from the routine that shortens * a file's length, before the inode's size or block pointers * are modified. It will save the block pointer information for * later release and zero the inode so that the calling routine * can release it. */ void softdep_setup_freeblocks(ip, length, flags) struct inode *ip; /* The inode whose length is to be reduced */ off_t length; /* The new length for the file */ int flags; /* IO_EXT and/or IO_NORMAL */ { struct ufs1_dinode *dp1; struct ufs2_dinode *dp2; struct freeblks *freeblks; struct inodedep *inodedep; struct allocdirect *adp; struct ufsmount *ump; struct buf *bp; struct fs *fs; ufs2_daddr_t extblocks, datablocks; struct mount *mp; int i, delay, error, dflags; ufs_lbn_t tmpval; ufs_lbn_t lbn; ump = ip->i_ump; mp = UFSTOVFS(ump); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_freeblocks called on non-softdep filesystem")); CTR2(KTR_SUJ, "softdep_setup_freeblks: ip %d length %ld", ip->i_number, length); KASSERT(length == 0, ("softdep_setup_freeblocks: non-zero length")); fs = ip->i_fs; freeblks = newfreeblks(mp, ip); extblocks = 0; datablocks = 0; if (fs->fs_magic == FS_UFS2_MAGIC) extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); if ((flags & IO_NORMAL) != 0) { for (i = 0; i < NDADDR; i++) setup_freedirect(freeblks, ip, i, 0); for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; i++, lbn += tmpval, tmpval *= NINDIR(fs)) setup_freeindir(freeblks, ip, i, -lbn -i, 0); ip->i_size = 0; DIP_SET(ip, i_size, 0); datablocks = DIP(ip, i_blocks) - extblocks; } if ((flags & IO_EXT) != 0) { for (i = 0; i < NXADDR; i++) setup_freeext(freeblks, ip, i, 0); ip->i_din2->di_extsize = 0; datablocks += extblocks; } #ifdef QUOTA /* Reference the quotas in case the block count is wrong in the end. */ quotaref(ITOV(ip), freeblks->fb_quota); (void) chkdq(ip, -datablocks, NOCRED, 0); #endif freeblks->fb_chkcnt = -datablocks; UFS_LOCK(ump); fs->fs_pendingblocks += datablocks; UFS_UNLOCK(ump); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks); /* * Push the zero'ed inode to to its disk buffer so that we are free * to delete its dependencies below. Once the dependencies are gone * the buffer can be safely released. */ if ((error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp)) != 0) { brelse(bp); softdep_error("softdep_setup_freeblocks", error); } if (ump->um_fstype == UFS1) { dp1 = ((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)); ip->i_din1->di_freelink = dp1->di_freelink; *dp1 = *ip->i_din1; } else { dp2 = ((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)); ip->i_din2->di_freelink = dp2->di_freelink; *dp2 = *ip->i_din2; } /* * Find and eliminate any inode dependencies. */ ACQUIRE_LOCK(ump); dflags = DEPALLOC; if (IS_SNAPSHOT(ip)) dflags |= NODELAY; (void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep); if ((inodedep->id_state & IOSTARTED) != 0) panic("softdep_setup_freeblocks: inode busy"); /* * Add the freeblks structure to the list of operations that * must await the zero'ed inode being written to disk. If we * still have a bitmap dependency (delay == 0), then the inode * has never been written to disk, so we can process the * freeblks below once we have deleted the dependencies. */ delay = (inodedep->id_state & DEPCOMPLETE); if (delay) WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list); else freeblks->fb_state |= COMPLETE; /* * Because the file length has been truncated to zero, any * pending block allocation dependency structures associated * with this inode are obsolete and can simply be de-allocated. * We must first merge the two dependency lists to get rid of * any duplicate freefrag structures, then purge the merged list. * If we still have a bitmap dependency, then the inode has never * been written to disk, so we can free any fragments without delay. */ if (flags & IO_NORMAL) { merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0) cancel_allocdirect(&inodedep->id_inoupdt, adp, freeblks); } if (flags & IO_EXT) { merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0) cancel_allocdirect(&inodedep->id_extupdt, adp, freeblks); } FREE_LOCK(ump); bdwrite(bp); trunc_dependencies(ip, freeblks, -1, 0, flags); ACQUIRE_LOCK(ump); if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) (void) free_inodedep(inodedep); freeblks->fb_state |= DEPCOMPLETE; /* * If the inode with zeroed block pointers is now on disk * we can start freeing blocks. */ if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) freeblks->fb_state |= INPROGRESS; else freeblks = NULL; FREE_LOCK(ump); if (freeblks) handle_workitem_freeblocks(freeblks, 0); trunc_pages(ip, length, extblocks, flags); } /* * Eliminate pages from the page cache that back parts of this inode and * adjust the vnode pager's idea of our size. This prevents stale data * from hanging around in the page cache. */ static void trunc_pages(ip, length, extblocks, flags) struct inode *ip; off_t length; ufs2_daddr_t extblocks; int flags; { struct vnode *vp; struct fs *fs; ufs_lbn_t lbn; off_t end, extend; vp = ITOV(ip); fs = ip->i_fs; extend = OFF_TO_IDX(lblktosize(fs, -extblocks)); if ((flags & IO_EXT) != 0) vn_pages_remove(vp, extend, 0); if ((flags & IO_NORMAL) == 0) return; BO_LOCK(&vp->v_bufobj); drain_output(vp); BO_UNLOCK(&vp->v_bufobj); /* * The vnode pager eliminates file pages we eliminate indirects * below. */ vnode_pager_setsize(vp, length); /* * Calculate the end based on the last indirect we want to keep. If * the block extends into indirects we can just use the negative of * its lbn. Doubles and triples exist at lower numbers so we must * be careful not to remove those, if they exist. double and triple * indirect lbns do not overlap with others so it is not important * to verify how many levels are required. */ lbn = lblkno(fs, length); if (lbn >= NDADDR) { /* Calculate the virtual lbn of the triple indirect. */ lbn = -lbn - (NIADDR - 1); end = OFF_TO_IDX(lblktosize(fs, lbn)); } else end = extend; vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end); } /* * See if the buf bp is in the range eliminated by truncation. */ static int trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags) struct buf *bp; int *blkoffp; ufs_lbn_t lastlbn; int lastoff; int flags; { ufs_lbn_t lbn; *blkoffp = 0; /* Only match ext/normal blocks as appropriate. */ if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) || ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0)) return (0); /* ALTDATA is always a full truncation. */ if ((bp->b_xflags & BX_ALTDATA) != 0) return (1); /* -1 is full truncation. */ if (lastlbn == -1) return (1); /* * If this is a partial truncate we only want those * blocks and indirect blocks that cover the range * we're after. */ lbn = bp->b_lblkno; if (lbn < 0) lbn = -(lbn + lbn_level(lbn)); if (lbn < lastlbn) return (0); /* Here we only truncate lblkno if it's partial. */ if (lbn == lastlbn) { if (lastoff == 0) return (0); *blkoffp = lastoff; } return (1); } /* * Eliminate any dependencies that exist in memory beyond lblkno:off */ static void trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags) struct inode *ip; struct freeblks *freeblks; ufs_lbn_t lastlbn; int lastoff; int flags; { struct bufobj *bo; struct vnode *vp; struct buf *bp; struct fs *fs; int blkoff; /* * We must wait for any I/O in progress to finish so that * all potential buffers on the dirty list will be visible. * Once they are all there, walk the list and get rid of * any dependencies. */ fs = ip->i_fs; vp = ITOV(ip); bo = &vp->v_bufobj; BO_LOCK(bo); drain_output(vp); TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) bp->b_vflags &= ~BV_SCANNED; restart: TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { if (bp->b_vflags & BV_SCANNED) continue; if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) { bp->b_vflags |= BV_SCANNED; continue; } KASSERT(bp->b_bufobj == bo, ("Wrong object in buffer")); if ((bp = getdirtybuf(bp, BO_LOCKPTR(bo), MNT_WAIT)) == NULL) goto restart; BO_UNLOCK(bo); if (deallocate_dependencies(bp, freeblks, blkoff)) bqrelse(bp); else brelse(bp); BO_LOCK(bo); goto restart; } /* * Now do the work of vtruncbuf while also matching indirect blocks. */ TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) bp->b_vflags &= ~BV_SCANNED; cleanrestart: TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) { if (bp->b_vflags & BV_SCANNED) continue; if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) { bp->b_vflags |= BV_SCANNED; continue; } if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { BO_LOCK(bo); goto cleanrestart; } bp->b_vflags |= BV_SCANNED; bremfree(bp); if (blkoff != 0) { allocbuf(bp, blkoff); bqrelse(bp); } else { bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF; brelse(bp); } BO_LOCK(bo); goto cleanrestart; } drain_output(vp); BO_UNLOCK(bo); } static int cancel_pagedep(pagedep, freeblks, blkoff) struct pagedep *pagedep; struct freeblks *freeblks; int blkoff; { struct jremref *jremref; struct jmvref *jmvref; struct dirrem *dirrem, *tmp; int i; /* * Copy any directory remove dependencies to the list * to be processed after the freeblks proceeds. If * directory entry never made it to disk they * can be dumped directly onto the work list. */ LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) { /* Skip this directory removal if it is intended to remain. */ if (dirrem->dm_offset < blkoff) continue; /* * If there are any dirrems we wait for the journal write * to complete and then restart the buf scan as the lock * has been dropped. */ while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) { jwait(&jremref->jr_list, MNT_WAIT); return (ERESTART); } LIST_REMOVE(dirrem, dm_next); dirrem->dm_dirinum = pagedep->pd_ino; WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list); } while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) { jwait(&jmvref->jm_list, MNT_WAIT); return (ERESTART); } /* * When we're partially truncating a pagedep we just want to flush * journal entries and return. There can not be any adds in the * truncated portion of the directory and newblk must remain if * part of the block remains. */ if (blkoff != 0) { struct diradd *dap; LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) if (dap->da_offset > blkoff) panic("cancel_pagedep: diradd %p off %d > %d", dap, dap->da_offset, blkoff); for (i = 0; i < DAHASHSZ; i++) LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) if (dap->da_offset > blkoff) panic("cancel_pagedep: diradd %p off %d > %d", dap, dap->da_offset, blkoff); return (0); } /* * There should be no directory add dependencies present * as the directory could not be truncated until all * children were removed. */ KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL, ("deallocate_dependencies: pendinghd != NULL")); for (i = 0; i < DAHASHSZ; i++) KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL, ("deallocate_dependencies: diraddhd != NULL")); if ((pagedep->pd_state & NEWBLOCK) != 0) free_newdirblk(pagedep->pd_newdirblk); if (free_pagedep(pagedep) == 0) panic("Failed to free pagedep %p", pagedep); return (0); } /* * Reclaim any dependency structures from a buffer that is about to * be reallocated to a new vnode. The buffer must be locked, thus, * no I/O completion operations can occur while we are manipulating * its associated dependencies. The mutex is held so that other I/O's * associated with related dependencies do not occur. */ static int deallocate_dependencies(bp, freeblks, off) struct buf *bp; struct freeblks *freeblks; int off; { struct indirdep *indirdep; struct pagedep *pagedep; struct allocdirect *adp; struct worklist *wk, *wkn; struct ufsmount *ump; if ((wk = LIST_FIRST(&bp->b_dep)) == NULL) goto done; ump = VFSTOUFS(wk->wk_mp); ACQUIRE_LOCK(ump); LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) { switch (wk->wk_type) { case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); if (bp->b_lblkno >= 0 || bp->b_blkno != indirdep->ir_savebp->b_lblkno) panic("deallocate_dependencies: not indir"); cancel_indirdep(indirdep, bp, freeblks); continue; case D_PAGEDEP: pagedep = WK_PAGEDEP(wk); if (cancel_pagedep(pagedep, freeblks, off)) { FREE_LOCK(ump); return (ERESTART); } continue; case D_ALLOCINDIR: /* * Simply remove the allocindir, we'll find it via * the indirdep where we can clear pointers if * needed. */ WORKLIST_REMOVE(wk); continue; case D_FREEWORK: /* * A truncation is waiting for the zero'd pointers * to be written. It can be freed when the freeblks * is journaled. */ WORKLIST_REMOVE(wk); wk->wk_state |= ONDEPLIST; WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk); break; case D_ALLOCDIRECT: adp = WK_ALLOCDIRECT(wk); if (off != 0) continue; /* FALLTHROUGH */ default: panic("deallocate_dependencies: Unexpected type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } FREE_LOCK(ump); done: /* * Don't throw away this buf, we were partially truncating and * some deps may always remain. */ if (off) { allocbuf(bp, off); bp->b_vflags |= BV_SCANNED; return (EBUSY); } bp->b_flags |= B_INVAL | B_NOCACHE; return (0); } /* * An allocdirect is being canceled due to a truncate. We must make sure * the journal entry is released in concert with the blkfree that releases * the storage. Completed journal entries must not be released until the * space is no longer pointed to by the inode or in the bitmap. */ static void cancel_allocdirect(adphead, adp, freeblks) struct allocdirectlst *adphead; struct allocdirect *adp; struct freeblks *freeblks; { struct freework *freework; struct newblk *newblk; struct worklist *wk; TAILQ_REMOVE(adphead, adp, ad_next); newblk = (struct newblk *)adp; freework = NULL; /* * Find the correct freework structure. */ LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) { if (wk->wk_type != D_FREEWORK) continue; freework = WK_FREEWORK(wk); if (freework->fw_blkno == newblk->nb_newblkno) break; } if (freework == NULL) panic("cancel_allocdirect: Freework not found"); /* * If a newblk exists at all we still have the journal entry that * initiated the allocation so we do not need to journal the free. */ cancel_jfreeblk(freeblks, freework->fw_blkno); /* * If the journal hasn't been written the jnewblk must be passed * to the call to ffs_blkfree that reclaims the space. We accomplish * this by linking the journal dependency into the freework to be * freed when freework_freeblock() is called. If the journal has * been written we can simply reclaim the journal space when the * freeblks work is complete. */ freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list, &freeblks->fb_jwork); WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list); } /* * Cancel a new block allocation. May be an indirect or direct block. We * remove it from various lists and return any journal record that needs to * be resolved by the caller. * * A special consideration is made for indirects which were never pointed * at on disk and will never be found once this block is released. */ static struct jnewblk * cancel_newblk(newblk, wk, wkhd) struct newblk *newblk; struct worklist *wk; struct workhead *wkhd; { struct jnewblk *jnewblk; CTR1(KTR_SUJ, "cancel_newblk: blkno %jd", newblk->nb_newblkno); newblk->nb_state |= GOINGAWAY; /* * Previously we traversed the completedhd on each indirdep * attached to this newblk to cancel them and gather journal * work. Since we need only the oldest journal segment and * the lowest point on the tree will always have the oldest * journal segment we are free to release the segments * of any subordinates and may leave the indirdep list to * indirdep_complete() when this newblk is freed. */ if (newblk->nb_state & ONDEPLIST) { newblk->nb_state &= ~ONDEPLIST; LIST_REMOVE(newblk, nb_deps); } if (newblk->nb_state & ONWORKLIST) WORKLIST_REMOVE(&newblk->nb_list); /* * If the journal entry hasn't been written we save a pointer to * the dependency that frees it until it is written or the * superseding operation completes. */ jnewblk = newblk->nb_jnewblk; if (jnewblk != NULL && wk != NULL) { newblk->nb_jnewblk = NULL; jnewblk->jn_dep = wk; } if (!LIST_EMPTY(&newblk->nb_jwork)) jwork_move(wkhd, &newblk->nb_jwork); /* * When truncating we must free the newdirblk early to remove * the pagedep from the hash before returning. */ if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) free_newdirblk(WK_NEWDIRBLK(wk)); if (!LIST_EMPTY(&newblk->nb_newdirblk)) panic("cancel_newblk: extra newdirblk"); return (jnewblk); } /* * Schedule the freefrag associated with a newblk to be released once * the pointers are written and the previous block is no longer needed. */ static void newblk_freefrag(newblk) struct newblk *newblk; { struct freefrag *freefrag; if (newblk->nb_freefrag == NULL) return; freefrag = newblk->nb_freefrag; newblk->nb_freefrag = NULL; freefrag->ff_state |= COMPLETE; if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) add_to_worklist(&freefrag->ff_list, 0); } /* * Free a newblk. Generate a new freefrag work request if appropriate. * This must be called after the inode pointer and any direct block pointers * are valid or fully removed via truncate or frag extension. */ static void free_newblk(newblk) struct newblk *newblk; { struct indirdep *indirdep; struct worklist *wk; KASSERT(newblk->nb_jnewblk == NULL, ("free_newblk: jnewblk %p still attached", newblk->nb_jnewblk)); KASSERT(newblk->nb_list.wk_type != D_NEWBLK, ("free_newblk: unclaimed newblk")); LOCK_OWNED(VFSTOUFS(newblk->nb_list.wk_mp)); newblk_freefrag(newblk); if (newblk->nb_state & ONDEPLIST) LIST_REMOVE(newblk, nb_deps); if (newblk->nb_state & ONWORKLIST) WORKLIST_REMOVE(&newblk->nb_list); LIST_REMOVE(newblk, nb_hash); if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) free_newdirblk(WK_NEWDIRBLK(wk)); if (!LIST_EMPTY(&newblk->nb_newdirblk)) panic("free_newblk: extra newdirblk"); while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) indirdep_complete(indirdep); handle_jwork(&newblk->nb_jwork); WORKITEM_FREE(newblk, D_NEWBLK); } /* * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep. * This routine must be called with splbio interrupts blocked. */ static void free_newdirblk(newdirblk) struct newdirblk *newdirblk; { struct pagedep *pagedep; struct diradd *dap; struct worklist *wk; LOCK_OWNED(VFSTOUFS(newdirblk->db_list.wk_mp)); WORKLIST_REMOVE(&newdirblk->db_list); /* * If the pagedep is still linked onto the directory buffer * dependency chain, then some of the entries on the * pd_pendinghd list may not be committed to disk yet. In * this case, we will simply clear the NEWBLOCK flag and * let the pd_pendinghd list be processed when the pagedep * is next written. If the pagedep is no longer on the buffer * dependency chain, then all the entries on the pd_pending * list are committed to disk and we can free them here. */ pagedep = newdirblk->db_pagedep; pagedep->pd_state &= ~NEWBLOCK; if ((pagedep->pd_state & ONWORKLIST) == 0) { while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) free_diradd(dap, NULL); /* * If no dependencies remain, the pagedep will be freed. */ free_pagedep(pagedep); } /* Should only ever be one item in the list. */ while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) { WORKLIST_REMOVE(wk); handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); } WORKITEM_FREE(newdirblk, D_NEWDIRBLK); } /* * Prepare an inode to be freed. The actual free operation is not * done until the zero'ed inode has been written to disk. */ void softdep_freefile(pvp, ino, mode) struct vnode *pvp; ino_t ino; int mode; { struct inode *ip = VTOI(pvp); struct inodedep *inodedep; struct freefile *freefile; struct freeblks *freeblks; struct ufsmount *ump; ump = ip->i_ump; KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_freefile called on non-softdep filesystem")); /* * This sets up the inode de-allocation dependency. */ freefile = malloc(sizeof(struct freefile), M_FREEFILE, M_SOFTDEP_FLAGS); workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount); freefile->fx_mode = mode; freefile->fx_oldinum = ino; freefile->fx_devvp = ip->i_devvp; LIST_INIT(&freefile->fx_jwork); UFS_LOCK(ump); ip->i_fs->fs_pendinginodes += 1; UFS_UNLOCK(ump); /* * If the inodedep does not exist, then the zero'ed inode has * been written to disk. If the allocated inode has never been * written to disk, then the on-disk inode is zero'ed. In either * case we can free the file immediately. If the journal was * canceled before being written the inode will never make it to * disk and we must send the canceled journal entrys to * ffs_freefile() to be cleared in conjunction with the bitmap. * Any blocks waiting on the inode to write can be safely freed * here as it will never been written. */ ACQUIRE_LOCK(ump); inodedep_lookup(pvp->v_mount, ino, 0, &inodedep); if (inodedep) { /* * Clear out freeblks that no longer need to reference * this inode. */ while ((freeblks = TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) { TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next); freeblks->fb_state &= ~ONDEPLIST; } /* * Remove this inode from the unlinked list. */ if (inodedep->id_state & UNLINKED) { /* * Save the journal work to be freed with the bitmap * before we clear UNLINKED. Otherwise it can be lost * if the inode block is written. */ handle_bufwait(inodedep, &freefile->fx_jwork); clear_unlinked_inodedep(inodedep); /* * Re-acquire inodedep as we've dropped the * per-filesystem lock in clear_unlinked_inodedep(). */ inodedep_lookup(pvp->v_mount, ino, 0, &inodedep); } } if (inodedep == NULL || check_inode_unwritten(inodedep)) { FREE_LOCK(ump); handle_workitem_freefile(freefile); return; } if ((inodedep->id_state & DEPCOMPLETE) == 0) inodedep->id_state |= GOINGAWAY; WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list); FREE_LOCK(ump); if (ip->i_number == ino) ip->i_flag |= IN_MODIFIED; } /* * Check to see if an inode has never been written to disk. If * so free the inodedep and return success, otherwise return failure. * This routine must be called with splbio interrupts blocked. * * If we still have a bitmap dependency, then the inode has never * been written to disk. Drop the dependency as it is no longer * necessary since the inode is being deallocated. We set the * ALLCOMPLETE flags since the bitmap now properly shows that the * inode is not allocated. Even if the inode is actively being * written, it has been rolled back to its zero'ed state, so we * are ensured that a zero inode is what is on the disk. For short * lived files, this change will usually result in removing all the * dependencies from the inode so that it can be freed immediately. */ static int check_inode_unwritten(inodedep) struct inodedep *inodedep; { LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp)); if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 || !LIST_EMPTY(&inodedep->id_dirremhd) || !LIST_EMPTY(&inodedep->id_pendinghd) || !LIST_EMPTY(&inodedep->id_bufwait) || !LIST_EMPTY(&inodedep->id_inowait) || !TAILQ_EMPTY(&inodedep->id_inoreflst) || !TAILQ_EMPTY(&inodedep->id_inoupdt) || !TAILQ_EMPTY(&inodedep->id_newinoupdt) || !TAILQ_EMPTY(&inodedep->id_extupdt) || !TAILQ_EMPTY(&inodedep->id_newextupdt) || !TAILQ_EMPTY(&inodedep->id_freeblklst) || inodedep->id_mkdiradd != NULL || inodedep->id_nlinkdelta != 0) return (0); /* * Another process might be in initiate_write_inodeblock_ufs[12] * trying to allocate memory without holding "Softdep Lock". */ if ((inodedep->id_state & IOSTARTED) != 0 && inodedep->id_savedino1 == NULL) return (0); if (inodedep->id_state & ONDEPLIST) LIST_REMOVE(inodedep, id_deps); inodedep->id_state &= ~ONDEPLIST; inodedep->id_state |= ALLCOMPLETE; inodedep->id_bmsafemap = NULL; if (inodedep->id_state & ONWORKLIST) WORKLIST_REMOVE(&inodedep->id_list); if (inodedep->id_savedino1 != NULL) { free(inodedep->id_savedino1, M_SAVEDINO); inodedep->id_savedino1 = NULL; } if (free_inodedep(inodedep) == 0) panic("check_inode_unwritten: busy inode"); return (1); } /* * Try to free an inodedep structure. Return 1 if it could be freed. */ static int free_inodedep(inodedep) struct inodedep *inodedep; { LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp)); if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 || (inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE || !LIST_EMPTY(&inodedep->id_dirremhd) || !LIST_EMPTY(&inodedep->id_pendinghd) || !LIST_EMPTY(&inodedep->id_bufwait) || !LIST_EMPTY(&inodedep->id_inowait) || !TAILQ_EMPTY(&inodedep->id_inoreflst) || !TAILQ_EMPTY(&inodedep->id_inoupdt) || !TAILQ_EMPTY(&inodedep->id_newinoupdt) || !TAILQ_EMPTY(&inodedep->id_extupdt) || !TAILQ_EMPTY(&inodedep->id_newextupdt) || !TAILQ_EMPTY(&inodedep->id_freeblklst) || inodedep->id_mkdiradd != NULL || inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL) return (0); if (inodedep->id_state & ONDEPLIST) LIST_REMOVE(inodedep, id_deps); LIST_REMOVE(inodedep, id_hash); WORKITEM_FREE(inodedep, D_INODEDEP); return (1); } /* * Free the block referenced by a freework structure. The parent freeblks * structure is released and completed when the final cg bitmap reaches * the disk. This routine may be freeing a jnewblk which never made it to * disk in which case we do not have to wait as the operation is undone * in memory immediately. */ static void freework_freeblock(freework) struct freework *freework; { struct freeblks *freeblks; struct jnewblk *jnewblk; struct ufsmount *ump; struct workhead wkhd; struct fs *fs; int bsize; int needj; ump = VFSTOUFS(freework->fw_list.wk_mp); LOCK_OWNED(ump); /* * Handle partial truncate separately. */ if (freework->fw_indir) { complete_trunc_indir(freework); return; } freeblks = freework->fw_freeblks; fs = ump->um_fs; needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0; bsize = lfragtosize(fs, freework->fw_frags); LIST_INIT(&wkhd); /* * DEPCOMPLETE is cleared in indirblk_insert() if the block lives * on the indirblk hashtable and prevents premature freeing. */ freework->fw_state |= DEPCOMPLETE; /* * SUJ needs to wait for the segment referencing freed indirect * blocks to expire so that we know the checker will not confuse * a re-allocated indirect block with its old contents. */ if (needj && freework->fw_lbn <= -NDADDR) indirblk_insert(freework); /* * If we are canceling an existing jnewblk pass it to the free * routine, otherwise pass the freeblk which will ultimately * release the freeblks. If we're not journaling, we can just * free the freeblks immediately. */ jnewblk = freework->fw_jnewblk; if (jnewblk != NULL) { cancel_jnewblk(jnewblk, &wkhd); needj = 0; } else if (needj) { freework->fw_state |= DELAYEDFREE; freeblks->fb_cgwait++; WORKLIST_INSERT(&wkhd, &freework->fw_list); } FREE_LOCK(ump); freeblks_free(ump, freeblks, btodb(bsize)); CTR4(KTR_SUJ, "freework_freeblock: ino %d blkno %jd lbn %jd size %ld", freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize); ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize, freeblks->fb_inum, freeblks->fb_vtype, &wkhd); ACQUIRE_LOCK(ump); /* * The jnewblk will be discarded and the bits in the map never * made it to disk. We can immediately free the freeblk. */ if (needj == 0) handle_written_freework(freework); } /* * We enqueue freework items that need processing back on the freeblks and * add the freeblks to the worklist. This makes it easier to find all work * required to flush a truncation in process_truncates(). */ static void freework_enqueue(freework) struct freework *freework; { struct freeblks *freeblks; freeblks = freework->fw_freeblks; if ((freework->fw_state & INPROGRESS) == 0) WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list); if ((freeblks->fb_state & (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd)) add_to_worklist(&freeblks->fb_list, WK_NODELAY); } /* * Start, continue, or finish the process of freeing an indirect block tree. * The free operation may be paused at any point with fw_off containing the * offset to restart from. This enables us to implement some flow control * for large truncates which may fan out and generate a huge number of * dependencies. */ static void handle_workitem_indirblk(freework) struct freework *freework; { struct freeblks *freeblks; struct ufsmount *ump; struct fs *fs; freeblks = freework->fw_freeblks; ump = VFSTOUFS(freeblks->fb_list.wk_mp); fs = ump->um_fs; if (freework->fw_state & DEPCOMPLETE) { handle_written_freework(freework); return; } if (freework->fw_off == NINDIR(fs)) { freework_freeblock(freework); return; } freework->fw_state |= INPROGRESS; FREE_LOCK(ump); indir_trunc(freework, fsbtodb(fs, freework->fw_blkno), freework->fw_lbn); ACQUIRE_LOCK(ump); } /* * Called when a freework structure attached to a cg buf is written. The * ref on either the parent or the freeblks structure is released and * the freeblks is added back to the worklist if there is more work to do. */ static void handle_written_freework(freework) struct freework *freework; { struct freeblks *freeblks; struct freework *parent; freeblks = freework->fw_freeblks; parent = freework->fw_parent; if (freework->fw_state & DELAYEDFREE) freeblks->fb_cgwait--; freework->fw_state |= COMPLETE; if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE) WORKITEM_FREE(freework, D_FREEWORK); if (parent) { if (--parent->fw_ref == 0) freework_enqueue(parent); return; } if (--freeblks->fb_ref != 0) return; if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) == ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd)) add_to_worklist(&freeblks->fb_list, WK_NODELAY); } /* * This workitem routine performs the block de-allocation. * The workitem is added to the pending list after the updated * inode block has been written to disk. As mentioned above, * checks regarding the number of blocks de-allocated (compared * to the number of blocks allocated for the file) are also * performed in this function. */ static int handle_workitem_freeblocks(freeblks, flags) struct freeblks *freeblks; int flags; { struct freework *freework; struct newblk *newblk; struct allocindir *aip; struct ufsmount *ump; struct worklist *wk; KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd), ("handle_workitem_freeblocks: Journal entries not written.")); ump = VFSTOUFS(freeblks->fb_list.wk_mp); ACQUIRE_LOCK(ump); while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) { WORKLIST_REMOVE(wk); switch (wk->wk_type) { case D_DIRREM: wk->wk_state |= COMPLETE; add_to_worklist(wk, 0); continue; case D_ALLOCDIRECT: free_newblk(WK_NEWBLK(wk)); continue; case D_ALLOCINDIR: aip = WK_ALLOCINDIR(wk); freework = NULL; if (aip->ai_state & DELAYEDFREE) { FREE_LOCK(ump); freework = newfreework(ump, freeblks, NULL, aip->ai_lbn, aip->ai_newblkno, ump->um_fs->fs_frag, 0, 0); ACQUIRE_LOCK(ump); } newblk = WK_NEWBLK(wk); if (newblk->nb_jnewblk) { freework->fw_jnewblk = newblk->nb_jnewblk; newblk->nb_jnewblk->jn_dep = &freework->fw_list; newblk->nb_jnewblk = NULL; } free_newblk(newblk); continue; case D_FREEWORK: freework = WK_FREEWORK(wk); if (freework->fw_lbn <= -NDADDR) handle_workitem_indirblk(freework); else freework_freeblock(freework); continue; default: panic("handle_workitem_freeblocks: Unknown type %s", TYPENAME(wk->wk_type)); } } if (freeblks->fb_ref != 0) { freeblks->fb_state &= ~INPROGRESS; wake_worklist(&freeblks->fb_list); freeblks = NULL; } FREE_LOCK(ump); if (freeblks) return handle_complete_freeblocks(freeblks, flags); return (0); } /* * Handle completion of block free via truncate. This allows fs_pending * to track the actual free block count more closely than if we only updated * it at the end. We must be careful to handle cases where the block count * on free was incorrect. */ static void freeblks_free(ump, freeblks, blocks) struct ufsmount *ump; struct freeblks *freeblks; int blocks; { struct fs *fs; ufs2_daddr_t remain; UFS_LOCK(ump); remain = -freeblks->fb_chkcnt; freeblks->fb_chkcnt += blocks; if (remain > 0) { if (remain < blocks) blocks = remain; fs = ump->um_fs; fs->fs_pendingblocks -= blocks; } UFS_UNLOCK(ump); } /* * Once all of the freework workitems are complete we can retire the * freeblocks dependency and any journal work awaiting completion. This * can not be called until all other dependencies are stable on disk. */ static int handle_complete_freeblocks(freeblks, flags) struct freeblks *freeblks; int flags; { struct inodedep *inodedep; struct inode *ip; struct vnode *vp; struct fs *fs; struct ufsmount *ump; ufs2_daddr_t spare; ump = VFSTOUFS(freeblks->fb_list.wk_mp); fs = ump->um_fs; flags = LK_EXCLUSIVE | flags; spare = freeblks->fb_chkcnt; /* * If we did not release the expected number of blocks we may have * to adjust the inode block count here. Only do so if it wasn't * a truncation to zero and the modrev still matches. */ if (spare && freeblks->fb_len != 0) { if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum, flags, &vp, FFSV_FORCEINSMQ) != 0) return (EBUSY); ip = VTOI(vp); if (DIP(ip, i_modrev) == freeblks->fb_modrev) { DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare); ip->i_flag |= IN_CHANGE; /* * We must wait so this happens before the * journal is reclaimed. */ ffs_update(vp, 1); } vput(vp); } if (spare < 0) { UFS_LOCK(ump); fs->fs_pendingblocks += spare; UFS_UNLOCK(ump); } #ifdef QUOTA /* Handle spare. */ if (spare) quotaadj(freeblks->fb_quota, ump, -spare); quotarele(freeblks->fb_quota); #endif ACQUIRE_LOCK(ump); if (freeblks->fb_state & ONDEPLIST) { inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum, 0, &inodedep); TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next); freeblks->fb_state &= ~ONDEPLIST; if (TAILQ_EMPTY(&inodedep->id_freeblklst)) free_inodedep(inodedep); } /* * All of the freeblock deps must be complete prior to this call * so it's now safe to complete earlier outstanding journal entries. */ handle_jwork(&freeblks->fb_jwork); WORKITEM_FREE(freeblks, D_FREEBLKS); FREE_LOCK(ump); return (0); } /* * Release blocks associated with the freeblks and stored in the indirect * block dbn. If level is greater than SINGLE, the block is an indirect block * and recursive calls to indirtrunc must be used to cleanse other indirect * blocks. * * This handles partial and complete truncation of blocks. Partial is noted * with goingaway == 0. In this case the freework is completed after the * zero'd indirects are written to disk. For full truncation the freework * is completed after the block is freed. */ static void indir_trunc(freework, dbn, lbn) struct freework *freework; ufs2_daddr_t dbn; ufs_lbn_t lbn; { struct freework *nfreework; struct workhead wkhd; struct freeblks *freeblks; struct buf *bp; struct fs *fs; struct indirdep *indirdep; struct ufsmount *ump; ufs1_daddr_t *bap1 = 0; ufs2_daddr_t nb, nnb, *bap2 = 0; ufs_lbn_t lbnadd, nlbn; int i, nblocks, ufs1fmt; int freedblocks; int goingaway; int freedeps; int needj; int level; int cnt; freeblks = freework->fw_freeblks; ump = VFSTOUFS(freeblks->fb_list.wk_mp); fs = ump->um_fs; /* * Get buffer of block pointers to be freed. There are three cases: * * 1) Partial truncate caches the indirdep pointer in the freework * which provides us a back copy to the save bp which holds the * pointers we want to clear. When this completes the zero * pointers are written to the real copy. * 2) The indirect is being completely truncated, cancel_indirdep() * eliminated the real copy and placed the indirdep on the saved * copy. The indirdep and buf are discarded when this completes. * 3) The indirect was not in memory, we read a copy off of the disk * using the devvp and drop and invalidate the buffer when we're * done. */ goingaway = 1; indirdep = NULL; if (freework->fw_indir != NULL) { goingaway = 0; indirdep = freework->fw_indir; bp = indirdep->ir_savebp; if (bp == NULL || bp->b_blkno != dbn) panic("indir_trunc: Bad saved buf %p blkno %jd", bp, (intmax_t)dbn); } else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) { /* * The lock prevents the buf dep list from changing and * indirects on devvp should only ever have one dependency. */ indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep)); if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0) panic("indir_trunc: Bad indirdep %p from buf %p", indirdep, bp); } else if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize, NOCRED, &bp) != 0) { brelse(bp); return; } ACQUIRE_LOCK(ump); /* Protects against a race with complete_trunc_indir(). */ freework->fw_state &= ~INPROGRESS; /* * If we have an indirdep we need to enforce the truncation order * and discard it when it is complete. */ if (indirdep) { if (freework != TAILQ_FIRST(&indirdep->ir_trunc) && !TAILQ_EMPTY(&indirdep->ir_trunc)) { /* * Add the complete truncate to the list on the * indirdep to enforce in-order processing. */ if (freework->fw_indir == NULL) TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next); FREE_LOCK(ump); return; } /* * If we're goingaway, free the indirdep. Otherwise it will * linger until the write completes. */ if (goingaway) free_indirdep(indirdep); } FREE_LOCK(ump); /* Initialize pointers depending on block size. */ if (ump->um_fstype == UFS1) { bap1 = (ufs1_daddr_t *)bp->b_data; nb = bap1[freework->fw_off]; ufs1fmt = 1; } else { bap2 = (ufs2_daddr_t *)bp->b_data; nb = bap2[freework->fw_off]; ufs1fmt = 0; } level = lbn_level(lbn); needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0; lbnadd = lbn_offset(fs, level); nblocks = btodb(fs->fs_bsize); nfreework = freework; freedeps = 0; cnt = 0; /* * Reclaim blocks. Traverses into nested indirect levels and * arranges for the current level to be freed when subordinates * are free when journaling. */ for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) { if (i != NINDIR(fs) - 1) { if (ufs1fmt) nnb = bap1[i+1]; else nnb = bap2[i+1]; } else nnb = 0; if (nb == 0) continue; cnt++; if (level != 0) { nlbn = (lbn + 1) - (i * lbnadd); if (needj != 0) { nfreework = newfreework(ump, freeblks, freework, nlbn, nb, fs->fs_frag, 0, 0); freedeps++; } indir_trunc(nfreework, fsbtodb(fs, nb), nlbn); } else { struct freedep *freedep; /* * Attempt to aggregate freedep dependencies for * all blocks being released to the same CG. */ LIST_INIT(&wkhd); if (needj != 0 && (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) { freedep = newfreedep(freework); WORKLIST_INSERT_UNLOCKED(&wkhd, &freedep->fd_list); freedeps++; } CTR3(KTR_SUJ, "indir_trunc: ino %d blkno %jd size %ld", freeblks->fb_inum, nb, fs->fs_bsize); ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, fs->fs_bsize, freeblks->fb_inum, freeblks->fb_vtype, &wkhd); } } if (goingaway) { bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); } freedblocks = 0; if (level == 0) freedblocks = (nblocks * cnt); if (needj == 0) freedblocks += nblocks; freeblks_free(ump, freeblks, freedblocks); /* * If we are journaling set up the ref counts and offset so this * indirect can be completed when its children are free. */ if (needj) { ACQUIRE_LOCK(ump); freework->fw_off = i; freework->fw_ref += freedeps; freework->fw_ref -= NINDIR(fs) + 1; if (level == 0) freeblks->fb_cgwait += freedeps; if (freework->fw_ref == 0) freework_freeblock(freework); FREE_LOCK(ump); return; } /* * If we're not journaling we can free the indirect now. */ dbn = dbtofsb(fs, dbn); CTR3(KTR_SUJ, "indir_trunc 2: ino %d blkno %jd size %ld", freeblks->fb_inum, dbn, fs->fs_bsize); ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize, freeblks->fb_inum, freeblks->fb_vtype, NULL); /* Non SUJ softdep does single-threaded truncations. */ if (freework->fw_blkno == dbn) { freework->fw_state |= ALLCOMPLETE; ACQUIRE_LOCK(ump); handle_written_freework(freework); FREE_LOCK(ump); } return; } /* * Cancel an allocindir when it is removed via truncation. When bp is not * NULL the indirect never appeared on disk and is scheduled to be freed * independently of the indir so we can more easily track journal work. */ static void cancel_allocindir(aip, bp, freeblks, trunc) struct allocindir *aip; struct buf *bp; struct freeblks *freeblks; int trunc; { struct indirdep *indirdep; struct freefrag *freefrag; struct newblk *newblk; newblk = (struct newblk *)aip; LIST_REMOVE(aip, ai_next); /* * We must eliminate the pointer in bp if it must be freed on its * own due to partial truncate or pending journal work. */ if (bp && (trunc || newblk->nb_jnewblk)) { /* * Clear the pointer and mark the aip to be freed * directly if it never existed on disk. */ aip->ai_state |= DELAYEDFREE; indirdep = aip->ai_indirdep; if (indirdep->ir_state & UFS1FMT) ((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0; else ((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0; } /* * When truncating the previous pointer will be freed via * savedbp. Eliminate the freefrag which would dup free. */ if (trunc && (freefrag = newblk->nb_freefrag) != NULL) { newblk->nb_freefrag = NULL; if (freefrag->ff_jdep) cancel_jfreefrag( WK_JFREEFRAG(freefrag->ff_jdep)); jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork); WORKITEM_FREE(freefrag, D_FREEFRAG); } /* * If the journal hasn't been written the jnewblk must be passed * to the call to ffs_blkfree that reclaims the space. We accomplish * this by leaving the journal dependency on the newblk to be freed * when a freework is created in handle_workitem_freeblocks(). */ cancel_newblk(newblk, NULL, &freeblks->fb_jwork); WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list); } /* * Create the mkdir dependencies for . and .. in a new directory. Link them * in to a newdirblk so any subsequent additions are tracked properly. The * caller is responsible for adding the mkdir1 dependency to the journal * and updating id_mkdiradd. This function returns with the per-filesystem * lock held. */ static struct mkdir * setup_newdir(dap, newinum, dinum, newdirbp, mkdirp) struct diradd *dap; ino_t newinum; ino_t dinum; struct buf *newdirbp; struct mkdir **mkdirp; { struct newblk *newblk; struct pagedep *pagedep; struct inodedep *inodedep; struct newdirblk *newdirblk = 0; struct mkdir *mkdir1, *mkdir2; struct worklist *wk; struct jaddref *jaddref; struct ufsmount *ump; struct mount *mp; mp = dap->da_list.wk_mp; ump = VFSTOUFS(mp); newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK, M_SOFTDEP_FLAGS); workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); LIST_INIT(&newdirblk->db_mkdir); mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); workitem_alloc(&mkdir1->md_list, D_MKDIR, mp); mkdir1->md_state = ATTACHED | MKDIR_BODY; mkdir1->md_diradd = dap; mkdir1->md_jaddref = NULL; mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); workitem_alloc(&mkdir2->md_list, D_MKDIR, mp); mkdir2->md_state = ATTACHED | MKDIR_PARENT; mkdir2->md_diradd = dap; mkdir2->md_jaddref = NULL; if (MOUNTEDSUJ(mp) == 0) { mkdir1->md_state |= DEPCOMPLETE; mkdir2->md_state |= DEPCOMPLETE; } /* * Dependency on "." and ".." being written to disk. */ mkdir1->md_buf = newdirbp; ACQUIRE_LOCK(VFSTOUFS(mp)); LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir1, md_mkdirs); /* * We must link the pagedep, allocdirect, and newdirblk for * the initial file page so the pointer to the new directory * is not written until the directory contents are live and * any subsequent additions are not marked live until the * block is reachable via the inode. */ if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0) panic("setup_newdir: lost pagedep"); LIST_FOREACH(wk, &newdirbp->b_dep, wk_list) if (wk->wk_type == D_ALLOCDIRECT) break; if (wk == NULL) panic("setup_newdir: lost allocdirect"); if (pagedep->pd_state & NEWBLOCK) panic("setup_newdir: NEWBLOCK already set"); newblk = WK_NEWBLK(wk); pagedep->pd_state |= NEWBLOCK; pagedep->pd_newdirblk = newdirblk; newdirblk->db_pagedep = pagedep; WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list); /* * Look up the inodedep for the parent directory so that we * can link mkdir2 into the pending dotdot jaddref or * the inode write if there is none. If the inode is * ALLCOMPLETE and no jaddref is present all dependencies have * been satisfied and mkdir2 can be freed. */ inodedep_lookup(mp, dinum, 0, &inodedep); if (MOUNTEDSUJ(mp)) { if (inodedep == NULL) panic("setup_newdir: Lost parent."); jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref != NULL && jaddref->ja_parent == newinum && (jaddref->ja_state & MKDIR_PARENT), ("setup_newdir: bad dotdot jaddref %p", jaddref)); LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs); mkdir2->md_jaddref = jaddref; jaddref->ja_mkdir = mkdir2; } else if (inodedep == NULL || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { dap->da_state &= ~MKDIR_PARENT; WORKITEM_FREE(mkdir2, D_MKDIR); mkdir2 = NULL; } else { LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs); WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list); } *mkdirp = mkdir2; return (mkdir1); } /* * Directory entry addition dependencies. * * When adding a new directory entry, the inode (with its incremented link * count) must be written to disk before the directory entry's pointer to it. * Also, if the inode is newly allocated, the corresponding freemap must be * updated (on disk) before the directory entry's pointer. These requirements * are met via undo/redo on the directory entry's pointer, which consists * simply of the inode number. * * As directory entries are added and deleted, the free space within a * directory block can become fragmented. The ufs filesystem will compact * a fragmented directory block to make space for a new entry. When this * occurs, the offsets of previously added entries change. Any "diradd" * dependency structures corresponding to these entries must be updated with * the new offsets. */ /* * This routine is called after the in-memory inode's link * count has been incremented, but before the directory entry's * pointer to the inode has been set. */ int softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for directory */ off_t diroffset; /* offset of new entry in directory */ ino_t newinum; /* inode referenced by new directory entry */ struct buf *newdirbp; /* non-NULL => contents of new mkdir */ int isnewblk; /* entry is in a newly allocated block */ { int offset; /* offset of new entry within directory block */ ufs_lbn_t lbn; /* block in directory containing new entry */ struct fs *fs; struct diradd *dap; struct newblk *newblk; struct pagedep *pagedep; struct inodedep *inodedep; struct newdirblk *newdirblk = 0; struct mkdir *mkdir1, *mkdir2; struct jaddref *jaddref; struct ufsmount *ump; struct mount *mp; int isindir; ump = dp->i_ump; mp = UFSTOVFS(ump); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_directory_add called on non-softdep filesystem")); /* * Whiteouts have no dependencies. */ if (newinum == WINO) { if (newdirbp != NULL) bdwrite(newdirbp); return (0); } jaddref = NULL; mkdir1 = mkdir2 = NULL; fs = dp->i_fs; lbn = lblkno(fs, diroffset); offset = blkoff(fs, diroffset); dap = malloc(sizeof(struct diradd), M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&dap->da_list, D_DIRADD, mp); dap->da_offset = offset; dap->da_newinum = newinum; dap->da_state = ATTACHED; LIST_INIT(&dap->da_jwork); isindir = bp->b_lblkno >= NDADDR; if (isnewblk && (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) { newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK, M_SOFTDEP_FLAGS); workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); LIST_INIT(&newdirblk->db_mkdir); } /* * If we're creating a new directory setup the dependencies and set * the dap state to wait for them. Otherwise it's COMPLETE and * we can move on. */ if (newdirbp == NULL) { dap->da_state |= DEPCOMPLETE; ACQUIRE_LOCK(ump); } else { dap->da_state |= MKDIR_BODY | MKDIR_PARENT; mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp, &mkdir2); } /* * Link into parent directory pagedep to await its being written. */ pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep); #ifdef DEBUG if (diradd_lookup(pagedep, offset) != NULL) panic("softdep_setup_directory_add: %p already at off %d\n", diradd_lookup(pagedep, offset), offset); #endif dap->da_pagedep = pagedep; LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep); /* * If we're journaling, link the diradd into the jaddref so it * may be completed after the journal entry is written. Otherwise, * link the diradd into its inodedep. If the inode is not yet * written place it on the bufwait list, otherwise do the post-inode * write processing to put it on the id_pendinghd list. */ if (MOUNTEDSUJ(mp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, ("softdep_setup_directory_add: bad jaddref %p", jaddref)); jaddref->ja_diroff = diroffset; jaddref->ja_diradd = dap; add_to_journal(&jaddref->ja_list); } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) diradd_inode_written(dap, inodedep); else WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); /* * Add the journal entries for . and .. links now that the primary * link is written. */ if (mkdir1 != NULL && MOUNTEDSUJ(mp)) { jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref, inoreflst, if_deps); KASSERT(jaddref != NULL && jaddref->ja_ino == jaddref->ja_parent && (jaddref->ja_state & MKDIR_BODY), ("softdep_setup_directory_add: bad dot jaddref %p", jaddref)); mkdir1->md_jaddref = jaddref; jaddref->ja_mkdir = mkdir1; /* * It is important that the dotdot journal entry * is added prior to the dot entry since dot writes * both the dot and dotdot links. These both must * be added after the primary link for the journal * to remain consistent. */ add_to_journal(&mkdir2->md_jaddref->ja_list); add_to_journal(&jaddref->ja_list); } /* * If we are adding a new directory remember this diradd so that if * we rename it we can keep the dot and dotdot dependencies. If * we are adding a new name for an inode that has a mkdiradd we * must be in rename and we have to move the dot and dotdot * dependencies to this new name. The old name is being orphaned * soon. */ if (mkdir1 != NULL) { if (inodedep->id_mkdiradd != NULL) panic("softdep_setup_directory_add: Existing mkdir"); inodedep->id_mkdiradd = dap; } else if (inodedep->id_mkdiradd) merge_diradd(inodedep, dap); if (newdirblk) { /* * There is nothing to do if we are already tracking * this block. */ if ((pagedep->pd_state & NEWBLOCK) != 0) { WORKITEM_FREE(newdirblk, D_NEWDIRBLK); FREE_LOCK(ump); return (0); } if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk) == 0) panic("softdep_setup_directory_add: lost entry"); WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); pagedep->pd_state |= NEWBLOCK; pagedep->pd_newdirblk = newdirblk; newdirblk->db_pagedep = pagedep; FREE_LOCK(ump); /* * If we extended into an indirect signal direnter to sync. */ if (isindir) return (1); return (0); } FREE_LOCK(ump); return (0); } /* * This procedure is called to change the offset of a directory * entry when compacting a directory block which must be owned * exclusively by the caller. Note that the actual entry movement * must be done in this procedure to ensure that no I/O completions * occur while the move is in progress. */ void softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) struct buf *bp; /* Buffer holding directory block. */ struct inode *dp; /* inode for directory */ caddr_t base; /* address of dp->i_offset */ caddr_t oldloc; /* address of old directory location */ caddr_t newloc; /* address of new directory location */ int entrysize; /* size of directory entry */ { int offset, oldoffset, newoffset; struct pagedep *pagedep; struct jmvref *jmvref; struct diradd *dap; struct direct *de; struct mount *mp; ufs_lbn_t lbn; int flags; mp = UFSTOVFS(dp->i_ump); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_change_directoryentry_offset called on " "non-softdep filesystem")); de = (struct direct *)oldloc; jmvref = NULL; flags = 0; /* * Moves are always journaled as it would be too complex to * determine if any affected adds or removes are present in the * journal. */ if (MOUNTEDSUJ(mp)) { flags = DEPALLOC; jmvref = newjmvref(dp, de->d_ino, dp->i_offset + (oldloc - base), dp->i_offset + (newloc - base)); } lbn = lblkno(dp->i_fs, dp->i_offset); offset = blkoff(dp->i_fs, dp->i_offset); oldoffset = offset + (oldloc - base); newoffset = offset + (newloc - base); ACQUIRE_LOCK(dp->i_ump); if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0) goto done; dap = diradd_lookup(pagedep, oldoffset); if (dap) { dap->da_offset = newoffset; newoffset = DIRADDHASH(newoffset); oldoffset = DIRADDHASH(oldoffset); if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE && newoffset != oldoffset) { LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset], dap, da_pdlist); } } done: if (jmvref) { jmvref->jm_pagedep = pagedep; LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps); add_to_journal(&jmvref->jm_list); } bcopy(oldloc, newloc, entrysize); FREE_LOCK(dp->i_ump); } /* * Move the mkdir dependencies and journal work from one diradd to another * when renaming a directory. The new name must depend on the mkdir deps * completing as the old name did. Directories can only have one valid link * at a time so one must be canonical. */ static void merge_diradd(inodedep, newdap) struct inodedep *inodedep; struct diradd *newdap; { struct diradd *olddap; struct mkdir *mkdir, *nextmd; struct ufsmount *ump; short state; olddap = inodedep->id_mkdiradd; inodedep->id_mkdiradd = newdap; if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { newdap->da_state &= ~DEPCOMPLETE; ump = VFSTOUFS(inodedep->id_list.wk_mp); for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir; mkdir = nextmd) { nextmd = LIST_NEXT(mkdir, md_mkdirs); if (mkdir->md_diradd != olddap) continue; mkdir->md_diradd = newdap; state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY); newdap->da_state |= state; olddap->da_state &= ~state; if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) break; } if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) panic("merge_diradd: unfound ref"); } /* * Any mkdir related journal items are not safe to be freed until * the new name is stable. */ jwork_move(&newdap->da_jwork, &olddap->da_jwork); olddap->da_state |= DEPCOMPLETE; complete_diradd(olddap); } /* * Move the diradd to the pending list when all diradd dependencies are * complete. */ static void complete_diradd(dap) struct diradd *dap; { struct pagedep *pagedep; if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { if (dap->da_state & DIRCHG) pagedep = dap->da_previous->dm_pagedep; else pagedep = dap->da_pagedep; LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); } } /* * Cancel a diradd when a dirrem overlaps with it. We must cancel the journal * add entries and conditonally journal the remove. */ static void cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref) struct diradd *dap; struct dirrem *dirrem; struct jremref *jremref; struct jremref *dotremref; struct jremref *dotdotremref; { struct inodedep *inodedep; struct jaddref *jaddref; struct inoref *inoref; struct ufsmount *ump; struct mkdir *mkdir; /* * If no remove references were allocated we're on a non-journaled * filesystem and can skip the cancel step. */ if (jremref == NULL) { free_diradd(dap, NULL); return; } /* * Cancel the primary name an free it if it does not require * journaling. */ if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum, 0, &inodedep) != 0) { /* Abort the addref that reference this diradd. */ TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { if (inoref->if_list.wk_type != D_JADDREF) continue; jaddref = (struct jaddref *)inoref; if (jaddref->ja_diradd != dap) continue; if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork) == 0) { free_jremref(jremref); jremref = NULL; } break; } } /* * Cancel subordinate names and free them if they do not require * journaling. */ if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { ump = VFSTOUFS(dap->da_list.wk_mp); LIST_FOREACH(mkdir, &ump->softdep_mkdirlisthd, md_mkdirs) { if (mkdir->md_diradd != dap) continue; if ((jaddref = mkdir->md_jaddref) == NULL) continue; mkdir->md_jaddref = NULL; if (mkdir->md_state & MKDIR_PARENT) { if (cancel_jaddref(jaddref, NULL, &dirrem->dm_jwork) == 0) { free_jremref(dotdotremref); dotdotremref = NULL; } } else { if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork) == 0) { free_jremref(dotremref); dotremref = NULL; } } } } if (jremref) journal_jremref(dirrem, jremref, inodedep); if (dotremref) journal_jremref(dirrem, dotremref, inodedep); if (dotdotremref) journal_jremref(dirrem, dotdotremref, NULL); jwork_move(&dirrem->dm_jwork, &dap->da_jwork); free_diradd(dap, &dirrem->dm_jwork); } /* * Free a diradd dependency structure. This routine must be called * with splbio interrupts blocked. */ static void free_diradd(dap, wkhd) struct diradd *dap; struct workhead *wkhd; { struct dirrem *dirrem; struct pagedep *pagedep; struct inodedep *inodedep; struct mkdir *mkdir, *nextmd; struct ufsmount *ump; ump = VFSTOUFS(dap->da_list.wk_mp); LOCK_OWNED(ump); LIST_REMOVE(dap, da_pdlist); if (dap->da_state & ONWORKLIST) WORKLIST_REMOVE(&dap->da_list); if ((dap->da_state & DIRCHG) == 0) { pagedep = dap->da_pagedep; } else { dirrem = dap->da_previous; pagedep = dirrem->dm_pagedep; dirrem->dm_dirinum = pagedep->pd_ino; dirrem->dm_state |= COMPLETE; if (LIST_EMPTY(&dirrem->dm_jremrefhd)) add_to_worklist(&dirrem->dm_list, 0); } if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum, 0, &inodedep) != 0) if (inodedep->id_mkdiradd == dap) inodedep->id_mkdiradd = NULL; if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir; mkdir = nextmd) { nextmd = LIST_NEXT(mkdir, md_mkdirs); if (mkdir->md_diradd != dap) continue; dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); LIST_REMOVE(mkdir, md_mkdirs); if (mkdir->md_state & ONWORKLIST) WORKLIST_REMOVE(&mkdir->md_list); if (mkdir->md_jaddref != NULL) panic("free_diradd: Unexpected jaddref"); WORKITEM_FREE(mkdir, D_MKDIR); if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) break; } if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) panic("free_diradd: unfound ref"); } if (inodedep) free_inodedep(inodedep); /* * Free any journal segments waiting for the directory write. */ handle_jwork(&dap->da_jwork); WORKITEM_FREE(dap, D_DIRADD); } /* * Directory entry removal dependencies. * * When removing a directory entry, the entry's inode pointer must be * zero'ed on disk before the corresponding inode's link count is decremented * (possibly freeing the inode for re-use). This dependency is handled by * updating the directory entry but delaying the inode count reduction until * after the directory block has been written to disk. After this point, the * inode count can be decremented whenever it is convenient. */ /* * This routine should be called immediately after removing * a directory entry. The inode's link count should not be * decremented by the calling procedure -- the soft updates * code will do this task when it is safe. */ void softdep_setup_remove(bp, dp, ip, isrmdir) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for the directory being modified */ struct inode *ip; /* inode for directory entry being removed */ int isrmdir; /* indicates if doing RMDIR */ { struct dirrem *dirrem, *prevdirrem; struct inodedep *inodedep; int direct; KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0, ("softdep_setup_remove called on non-softdep filesystem")); /* * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. We want * newdirrem() to setup the full directory remove which requires * isrmdir > 1. */ dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); /* * Add the dirrem to the inodedep's pending remove list for quick * discovery later. */ if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, &inodedep) == 0) panic("softdep_setup_remove: Lost inodedep."); KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked")); dirrem->dm_state |= ONDEPLIST; LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); /* * If the COMPLETE flag is clear, then there were no active * entries and we want to roll back to a zeroed entry until * the new inode is committed to disk. If the COMPLETE flag is * set then we have deleted an entry that never made it to * disk. If the entry we deleted resulted from a name change, * then the old name still resides on disk. We cannot delete * its inode (returned to us in prevdirrem) until the zeroed * directory entry gets to disk. The new inode has never been * referenced on the disk, so can be deleted immediately. */ if ((dirrem->dm_state & COMPLETE) == 0) { LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, dm_next); FREE_LOCK(ip->i_ump); } else { if (prevdirrem != NULL) LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, prevdirrem, dm_next); dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; direct = LIST_EMPTY(&dirrem->dm_jremrefhd); FREE_LOCK(ip->i_ump); if (direct) handle_workitem_remove(dirrem, 0); } } /* * Check for an entry matching 'offset' on both the pd_dirraddhd list and the * pd_pendinghd list of a pagedep. */ static struct diradd * diradd_lookup(pagedep, offset) struct pagedep *pagedep; int offset; { struct diradd *dap; LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist) if (dap->da_offset == offset) return (dap); LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) if (dap->da_offset == offset) return (dap); return (NULL); } /* * Search for a .. diradd dependency in a directory that is being removed. * If the directory was renamed to a new parent we have a diradd rather * than a mkdir for the .. entry. We need to cancel it now before * it is found in truncate(). */ static struct jremref * cancel_diradd_dotdot(ip, dirrem, jremref) struct inode *ip; struct dirrem *dirrem; struct jremref *jremref; { struct pagedep *pagedep; struct diradd *dap; struct worklist *wk; if (pagedep_lookup(UFSTOVFS(ip->i_ump), NULL, ip->i_number, 0, 0, &pagedep) == 0) return (jremref); dap = diradd_lookup(pagedep, DOTDOT_OFFSET); if (dap == NULL) return (jremref); cancel_diradd(dap, dirrem, jremref, NULL, NULL); /* * Mark any journal work as belonging to the parent so it is freed * with the .. reference. */ LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) wk->wk_state |= MKDIR_PARENT; return (NULL); } /* * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to * replace it with a dirrem/diradd pair as a result of re-parenting a * directory. This ensures that we don't simultaneously have a mkdir and * a diradd for the same .. entry. */ static struct jremref * cancel_mkdir_dotdot(ip, dirrem, jremref) struct inode *ip; struct dirrem *dirrem; struct jremref *jremref; { struct inodedep *inodedep; struct jaddref *jaddref; struct ufsmount *ump; struct mkdir *mkdir; struct diradd *dap; if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, &inodedep) == 0) return (jremref); dap = inodedep->id_mkdiradd; if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0) return (jremref); ump = VFSTOUFS(inodedep->id_list.wk_mp); for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir; mkdir = LIST_NEXT(mkdir, md_mkdirs)) if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT) break; if (mkdir == NULL) panic("cancel_mkdir_dotdot: Unable to find mkdir\n"); if ((jaddref = mkdir->md_jaddref) != NULL) { mkdir->md_jaddref = NULL; jaddref->ja_state &= ~MKDIR_PARENT; if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0, &inodedep) == 0) panic("cancel_mkdir_dotdot: Lost parent inodedep"); if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) { journal_jremref(dirrem, jremref, inodedep); jremref = NULL; } } if (mkdir->md_state & ONWORKLIST) WORKLIST_REMOVE(&mkdir->md_list); mkdir->md_state |= ALLCOMPLETE; complete_mkdir(mkdir); return (jremref); } static void journal_jremref(dirrem, jremref, inodedep) struct dirrem *dirrem; struct jremref *jremref; struct inodedep *inodedep; { if (inodedep == NULL) if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0, &inodedep) == 0) panic("journal_jremref: Lost inodedep"); LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps); TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); add_to_journal(&jremref->jr_list); } static void dirrem_journal(dirrem, jremref, dotremref, dotdotremref) struct dirrem *dirrem; struct jremref *jremref; struct jremref *dotremref; struct jremref *dotdotremref; { struct inodedep *inodedep; if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0, &inodedep) == 0) panic("dirrem_journal: Lost inodedep"); journal_jremref(dirrem, jremref, inodedep); if (dotremref) journal_jremref(dirrem, dotremref, inodedep); if (dotdotremref) journal_jremref(dirrem, dotdotremref, NULL); } /* * Allocate a new dirrem if appropriate and return it along with * its associated pagedep. Called without a lock, returns with lock. */ static struct dirrem * newdirrem(bp, dp, ip, isrmdir, prevdirremp) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for the directory being modified */ struct inode *ip; /* inode for directory entry being removed */ int isrmdir; /* indicates if doing RMDIR */ struct dirrem **prevdirremp; /* previously referenced inode, if any */ { int offset; ufs_lbn_t lbn; struct diradd *dap; struct dirrem *dirrem; struct pagedep *pagedep; struct jremref *jremref; struct jremref *dotremref; struct jremref *dotdotremref; struct vnode *dvp; /* * Whiteouts have no deletion dependencies. */ if (ip == NULL) panic("newdirrem: whiteout"); dvp = ITOV(dp); /* * If the system is over its limit and our filesystem is * responsible for more than our share of that usage and * we are not a snapshot, request some inodedep cleanup. * Limiting the number of dirrem structures will also limit * the number of freefile and freeblks structures. */ ACQUIRE_LOCK(ip->i_ump); while (!IS_SNAPSHOT(ip) && dep_current[D_DIRREM] > max_softdeps / 2 && ip->i_ump->softdep_curdeps[D_DIRREM] > (max_softdeps / 2) / stat_flush_threads) (void) request_cleanup(ITOV(dp)->v_mount, FLUSH_BLOCKS); FREE_LOCK(ip->i_ump); dirrem = malloc(sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount); LIST_INIT(&dirrem->dm_jremrefhd); LIST_INIT(&dirrem->dm_jwork); dirrem->dm_state = isrmdir ? RMDIR : 0; dirrem->dm_oldinum = ip->i_number; *prevdirremp = NULL; /* * Allocate remove reference structures to track journal write * dependencies. We will always have one for the link and * when doing directories we will always have one more for dot. * When renaming a directory we skip the dotdot link change so * this is not needed. */ jremref = dotremref = dotdotremref = NULL; if (DOINGSUJ(dvp)) { if (isrmdir) { jremref = newjremref(dirrem, dp, ip, dp->i_offset, ip->i_effnlink + 2); dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET, ip->i_effnlink + 1); dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET, dp->i_effnlink + 1); dotdotremref->jr_state |= MKDIR_PARENT; } else jremref = newjremref(dirrem, dp, ip, dp->i_offset, ip->i_effnlink + 1); } ACQUIRE_LOCK(ip->i_ump); lbn = lblkno(dp->i_fs, dp->i_offset); offset = blkoff(dp->i_fs, dp->i_offset); pagedep_lookup(UFSTOVFS(dp->i_ump), bp, dp->i_number, lbn, DEPALLOC, &pagedep); dirrem->dm_pagedep = pagedep; dirrem->dm_offset = offset; /* * If we're renaming a .. link to a new directory, cancel any * existing MKDIR_PARENT mkdir. If it has already been canceled * the jremref is preserved for any potential diradd in this * location. This can not coincide with a rmdir. */ if (dp->i_offset == DOTDOT_OFFSET) { if (isrmdir) panic("newdirrem: .. directory change during remove?"); jremref = cancel_mkdir_dotdot(dp, dirrem, jremref); } /* * If we're removing a directory search for the .. dependency now and * cancel it. Any pending journal work will be added to the dirrem * to be completed when the workitem remove completes. */ if (isrmdir) dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref); /* * Check for a diradd dependency for the same directory entry. * If present, then both dependencies become obsolete and can * be de-allocated. */ dap = diradd_lookup(pagedep, offset); if (dap == NULL) { /* * Link the jremref structures into the dirrem so they are * written prior to the pagedep. */ if (jremref) dirrem_journal(dirrem, jremref, dotremref, dotdotremref); return (dirrem); } /* * Must be ATTACHED at this point. */ if ((dap->da_state & ATTACHED) == 0) panic("newdirrem: not ATTACHED"); if (dap->da_newinum != ip->i_number) panic("newdirrem: inum %ju should be %ju", (uintmax_t)ip->i_number, (uintmax_t)dap->da_newinum); /* * If we are deleting a changed name that never made it to disk, * then return the dirrem describing the previous inode (which * represents the inode currently referenced from this entry on disk). */ if ((dap->da_state & DIRCHG) != 0) { *prevdirremp = dap->da_previous; dap->da_state &= ~DIRCHG; dap->da_pagedep = pagedep; } /* * We are deleting an entry that never made it to disk. * Mark it COMPLETE so we can delete its inode immediately. */ dirrem->dm_state |= COMPLETE; cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref); #ifdef SUJ_DEBUG if (isrmdir == 0) { struct worklist *wk; LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT)) panic("bad wk %p (0x%X)\n", wk, wk->wk_state); } #endif return (dirrem); } /* * Directory entry change dependencies. * * Changing an existing directory entry requires that an add operation * be completed first followed by a deletion. The semantics for the addition * are identical to the description of adding a new entry above except * that the rollback is to the old inode number rather than zero. Once * the addition dependency is completed, the removal is done as described * in the removal routine above. */ /* * This routine should be called immediately after changing * a directory entry. The inode's link count should not be * decremented by the calling procedure -- the soft updates * code will perform this task when it is safe. */ void softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for the directory being modified */ struct inode *ip; /* inode for directory entry being removed */ ino_t newinum; /* new inode number for changed entry */ int isrmdir; /* indicates if doing RMDIR */ { int offset; struct diradd *dap = NULL; struct dirrem *dirrem, *prevdirrem; struct pagedep *pagedep; struct inodedep *inodedep; struct jaddref *jaddref; struct mount *mp; offset = blkoff(dp->i_fs, dp->i_offset); mp = UFSTOVFS(dp->i_ump); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_directory_change called on non-softdep filesystem")); /* * Whiteouts do not need diradd dependencies. */ if (newinum != WINO) { dap = malloc(sizeof(struct diradd), M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&dap->da_list, D_DIRADD, mp); dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; dap->da_offset = offset; dap->da_newinum = newinum; LIST_INIT(&dap->da_jwork); } /* * Allocate a new dirrem and ACQUIRE_LOCK. */ dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); pagedep = dirrem->dm_pagedep; /* * The possible values for isrmdir: * 0 - non-directory file rename * 1 - directory rename within same directory * inum - directory rename to new directory of given inode number * When renaming to a new directory, we are both deleting and * creating a new directory entry, so the link count on the new * directory should not change. Thus we do not need the followup * dirrem which is usually done in handle_workitem_remove. We set * the DIRCHG flag to tell handle_workitem_remove to skip the * followup dirrem. */ if (isrmdir > 1) dirrem->dm_state |= DIRCHG; /* * Whiteouts have no additional dependencies, * so just put the dirrem on the correct list. */ if (newinum == WINO) { if ((dirrem->dm_state & COMPLETE) == 0) { LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem, dm_next); } else { dirrem->dm_dirinum = pagedep->pd_ino; if (LIST_EMPTY(&dirrem->dm_jremrefhd)) add_to_worklist(&dirrem->dm_list, 0); } FREE_LOCK(dp->i_ump); return; } /* * Add the dirrem to the inodedep's pending remove list for quick * discovery later. A valid nlinkdelta ensures that this lookup * will not fail. */ if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) panic("softdep_setup_directory_change: Lost inodedep."); dirrem->dm_state |= ONDEPLIST; LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); /* * If the COMPLETE flag is clear, then there were no active * entries and we want to roll back to the previous inode until * the new inode is committed to disk. If the COMPLETE flag is * set, then we have deleted an entry that never made it to disk. * If the entry we deleted resulted from a name change, then the old * inode reference still resides on disk. Any rollback that we do * needs to be to that old inode (returned to us in prevdirrem). If * the entry we deleted resulted from a create, then there is * no entry on the disk, so we want to roll back to zero rather * than the uncommitted inode. In either of the COMPLETE cases we * want to immediately free the unwritten and unreferenced inode. */ if ((dirrem->dm_state & COMPLETE) == 0) { dap->da_previous = dirrem; } else { if (prevdirrem != NULL) { dap->da_previous = prevdirrem; } else { dap->da_state &= ~DIRCHG; dap->da_pagedep = pagedep; } dirrem->dm_dirinum = pagedep->pd_ino; if (LIST_EMPTY(&dirrem->dm_jremrefhd)) add_to_worklist(&dirrem->dm_list, 0); } /* * Lookup the jaddref for this journal entry. We must finish * initializing it and make the diradd write dependent on it. * If we're not journaling, put it on the id_bufwait list if the * inode is not yet written. If it is written, do the post-inode * write processing to put it on the id_pendinghd list. */ inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep); if (MOUNTEDSUJ(mp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, ("softdep_setup_directory_change: bad jaddref %p", jaddref)); jaddref->ja_diroff = dp->i_offset; jaddref->ja_diradd = dap; LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); add_to_journal(&jaddref->ja_list); } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { dap->da_state |= COMPLETE; LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); } else { LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); } /* * If we're making a new name for a directory that has not been * committed when need to move the dot and dotdot references to * this new name. */ if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET) merge_diradd(inodedep, dap); FREE_LOCK(dp->i_ump); } /* * Called whenever the link count on an inode is changed. * It creates an inode dependency so that the new reference(s) * to the inode cannot be committed to disk until the updated * inode has been written. */ void softdep_change_linkcnt(ip) struct inode *ip; /* the inode with the increased link count */ { struct inodedep *inodedep; int dflags; KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0, ("softdep_change_linkcnt called on non-softdep filesystem")); ACQUIRE_LOCK(ip->i_ump); dflags = DEPALLOC; if (IS_SNAPSHOT(ip)) dflags |= NODELAY; inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, &inodedep); if (ip->i_nlink < ip->i_effnlink) panic("softdep_change_linkcnt: bad delta"); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; FREE_LOCK(ip->i_ump); } /* * Attach a sbdep dependency to the superblock buf so that we can keep * track of the head of the linked list of referenced but unlinked inodes. */ void softdep_setup_sbupdate(ump, fs, bp) struct ufsmount *ump; struct fs *fs; struct buf *bp; { struct sbdep *sbdep; struct worklist *wk; KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_setup_sbupdate called on non-softdep filesystem")); LIST_FOREACH(wk, &bp->b_dep, wk_list) if (wk->wk_type == D_SBDEP) break; if (wk != NULL) return; sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS); workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump)); sbdep->sb_fs = fs; sbdep->sb_ump = ump; ACQUIRE_LOCK(ump); WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list); FREE_LOCK(ump); } /* * Return the first unlinked inodedep which is ready to be the head of the * list. The inodedep and all those after it must have valid next pointers. */ static struct inodedep * first_unlinked_inodedep(ump) struct ufsmount *ump; { struct inodedep *inodedep; struct inodedep *idp; LOCK_OWNED(ump); for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst); inodedep; inodedep = idp) { if ((inodedep->id_state & UNLINKNEXT) == 0) return (NULL); idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0) break; if ((inodedep->id_state & UNLINKPREV) == 0) break; } return (inodedep); } /* * Set the sujfree unlinked head pointer prior to writing a superblock. */ static void initiate_write_sbdep(sbdep) struct sbdep *sbdep; { struct inodedep *inodedep; struct fs *bpfs; struct fs *fs; bpfs = sbdep->sb_fs; fs = sbdep->sb_ump->um_fs; inodedep = first_unlinked_inodedep(sbdep->sb_ump); if (inodedep) { fs->fs_sujfree = inodedep->id_ino; inodedep->id_state |= UNLINKPREV; } else fs->fs_sujfree = 0; bpfs->fs_sujfree = fs->fs_sujfree; } /* * After a superblock is written determine whether it must be written again * due to a changing unlinked list head. */ static int handle_written_sbdep(sbdep, bp) struct sbdep *sbdep; struct buf *bp; { struct inodedep *inodedep; struct mount *mp; struct fs *fs; LOCK_OWNED(sbdep->sb_ump); fs = sbdep->sb_fs; mp = UFSTOVFS(sbdep->sb_ump); /* * If the superblock doesn't match the in-memory list start over. */ inodedep = first_unlinked_inodedep(sbdep->sb_ump); if ((inodedep && fs->fs_sujfree != inodedep->id_ino) || (inodedep == NULL && fs->fs_sujfree != 0)) { bdirty(bp); return (1); } WORKITEM_FREE(sbdep, D_SBDEP); if (fs->fs_sujfree == 0) return (0); /* * Now that we have a record of this inode in stable store allow it * to be written to free up pending work. Inodes may see a lot of * write activity after they are unlinked which we must not hold up. */ for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) { if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS) panic("handle_written_sbdep: Bad inodedep %p (0x%X)", inodedep, inodedep->id_state); if (inodedep->id_state & UNLINKONLIST) break; inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST; } return (0); } /* * Mark an inodedep as unlinked and insert it into the in-memory unlinked list. */ static void unlinked_inodedep(mp, inodedep) struct mount *mp; struct inodedep *inodedep; { struct ufsmount *ump; ump = VFSTOUFS(mp); LOCK_OWNED(ump); if (MOUNTEDSUJ(mp) == 0) return; ump->um_fs->fs_fmod = 1; if (inodedep->id_state & UNLINKED) panic("unlinked_inodedep: %p already unlinked\n", inodedep); inodedep->id_state |= UNLINKED; TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked); } /* * Remove an inodedep from the unlinked inodedep list. This may require * disk writes if the inode has made it that far. */ static void clear_unlinked_inodedep(inodedep) struct inodedep *inodedep; { struct ufsmount *ump; struct inodedep *idp; struct inodedep *idn; struct fs *fs; struct buf *bp; ino_t ino; ino_t nino; ino_t pino; int error; ump = VFSTOUFS(inodedep->id_list.wk_mp); fs = ump->um_fs; ino = inodedep->id_ino; error = 0; for (;;) { LOCK_OWNED(ump); KASSERT((inodedep->id_state & UNLINKED) != 0, ("clear_unlinked_inodedep: inodedep %p not unlinked", inodedep)); /* * If nothing has yet been written simply remove us from * the in memory list and return. This is the most common * case where handle_workitem_remove() loses the final * reference. */ if ((inodedep->id_state & UNLINKLINKS) == 0) break; /* * If we have a NEXT pointer and no PREV pointer we can simply * clear NEXT's PREV and remove ourselves from the list. Be * careful not to clear PREV if the superblock points at * next as well. */ idn = TAILQ_NEXT(inodedep, id_unlinked); if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) { if (idn && fs->fs_sujfree != idn->id_ino) idn->id_state &= ~UNLINKPREV; break; } /* * Here we have an inodedep which is actually linked into * the list. We must remove it by forcing a write to the * link before us, whether it be the superblock or an inode. * Unfortunately the list may change while we're waiting * on the buf lock for either resource so we must loop until * we lock the right one. If both the superblock and an * inode point to this inode we must clear the inode first * followed by the superblock. */ idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); pino = 0; if (idp && (idp->id_state & UNLINKNEXT)) pino = idp->id_ino; FREE_LOCK(ump); if (pino == 0) { bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize, 0, 0, 0); } else { error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, pino)), (int)fs->fs_bsize, NOCRED, &bp); if (error) brelse(bp); } ACQUIRE_LOCK(ump); if (error) break; /* If the list has changed restart the loop. */ idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); nino = 0; if (idp && (idp->id_state & UNLINKNEXT)) nino = idp->id_ino; if (nino != pino || (inodedep->id_state & UNLINKPREV) != UNLINKPREV) { FREE_LOCK(ump); brelse(bp); ACQUIRE_LOCK(ump); continue; } nino = 0; idn = TAILQ_NEXT(inodedep, id_unlinked); if (idn) nino = idn->id_ino; /* * Remove us from the in memory list. After this we cannot * access the inodedep. */ KASSERT((inodedep->id_state & UNLINKED) != 0, ("clear_unlinked_inodedep: inodedep %p not unlinked", inodedep)); inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST); TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); FREE_LOCK(ump); /* * The predecessor's next pointer is manually updated here * so that the NEXT flag is never cleared for an element * that is in the list. */ if (pino == 0) { bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); ffs_oldfscompat_write((struct fs *)bp->b_data, ump); softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, bp); } else if (fs->fs_magic == FS_UFS1_MAGIC) ((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, pino))->di_freelink = nino; else ((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, pino))->di_freelink = nino; /* * If the bwrite fails we have no recourse to recover. The * filesystem is corrupted already. */ bwrite(bp); ACQUIRE_LOCK(ump); /* * If the superblock pointer still needs to be cleared force * a write here. */ if (fs->fs_sujfree == ino) { FREE_LOCK(ump); bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize, 0, 0, 0); bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); ffs_oldfscompat_write((struct fs *)bp->b_data, ump); softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, bp); bwrite(bp); ACQUIRE_LOCK(ump); } if (fs->fs_sujfree != ino) return; panic("clear_unlinked_inodedep: Failed to clear free head"); } if (inodedep->id_ino == fs->fs_sujfree) panic("clear_unlinked_inodedep: Freeing head of free list"); inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST); TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); return; } /* * This workitem decrements the inode's link count. * If the link count reaches zero, the file is removed. */ static int handle_workitem_remove(dirrem, flags) struct dirrem *dirrem; int flags; { struct inodedep *inodedep; struct workhead dotdotwk; struct worklist *wk; struct ufsmount *ump; struct mount *mp; struct vnode *vp; struct inode *ip; ino_t oldinum; if (dirrem->dm_state & ONWORKLIST) panic("handle_workitem_remove: dirrem %p still on worklist", dirrem); oldinum = dirrem->dm_oldinum; mp = dirrem->dm_list.wk_mp; ump = VFSTOUFS(mp); flags |= LK_EXCLUSIVE; if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ) != 0) return (EBUSY); ip = VTOI(vp); ACQUIRE_LOCK(ump); if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0) panic("handle_workitem_remove: lost inodedep"); if (dirrem->dm_state & ONDEPLIST) LIST_REMOVE(dirrem, dm_inonext); KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), ("handle_workitem_remove: Journal entries not written.")); /* * Move all dependencies waiting on the remove to complete * from the dirrem to the inode inowait list to be completed * after the inode has been updated and written to disk. Any * marked MKDIR_PARENT are saved to be completed when the .. ref * is removed. */ LIST_INIT(&dotdotwk); while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) { WORKLIST_REMOVE(wk); if (wk->wk_state & MKDIR_PARENT) { wk->wk_state &= ~MKDIR_PARENT; WORKLIST_INSERT(&dotdotwk, wk); continue; } WORKLIST_INSERT(&inodedep->id_inowait, wk); } LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list); /* * Normal file deletion. */ if ((dirrem->dm_state & RMDIR) == 0) { ip->i_nlink--; DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; if (ip->i_nlink < ip->i_effnlink) panic("handle_workitem_remove: bad file delta"); if (ip->i_nlink == 0) unlinked_inodedep(mp, inodedep); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; KASSERT(LIST_EMPTY(&dirrem->dm_jwork), ("handle_workitem_remove: worklist not empty. %s", TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type))); WORKITEM_FREE(dirrem, D_DIRREM); FREE_LOCK(ump); goto out; } /* * Directory deletion. Decrement reference count for both the * just deleted parent directory entry and the reference for ".". * Arrange to have the reference count on the parent decremented * to account for the loss of "..". */ ip->i_nlink -= 2; DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_flag |= IN_CHANGE; if (ip->i_nlink < ip->i_effnlink) panic("handle_workitem_remove: bad dir delta"); if (ip->i_nlink == 0) unlinked_inodedep(mp, inodedep); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; /* * Rename a directory to a new parent. Since, we are both deleting * and creating a new directory entry, the link count on the new * directory should not change. Thus we skip the followup dirrem. */ if (dirrem->dm_state & DIRCHG) { KASSERT(LIST_EMPTY(&dirrem->dm_jwork), ("handle_workitem_remove: DIRCHG and worklist not empty.")); WORKITEM_FREE(dirrem, D_DIRREM); FREE_LOCK(ump); goto out; } dirrem->dm_state = ONDEPLIST; dirrem->dm_oldinum = dirrem->dm_dirinum; /* * Place the dirrem on the parent's diremhd list. */ if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0) panic("handle_workitem_remove: lost dir inodedep"); LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); /* * If the allocated inode has never been written to disk, then * the on-disk inode is zero'ed and we can remove the file * immediately. When journaling if the inode has been marked * unlinked and not DEPCOMPLETE we know it can never be written. */ inodedep_lookup(mp, oldinum, 0, &inodedep); if (inodedep == NULL || (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED || check_inode_unwritten(inodedep)) { FREE_LOCK(ump); vput(vp); return handle_workitem_remove(dirrem, flags); } WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list); FREE_LOCK(ump); ip->i_flag |= IN_CHANGE; out: ffs_update(vp, 0); vput(vp); return (0); } /* * Inode de-allocation dependencies. * * When an inode's link count is reduced to zero, it can be de-allocated. We * found it convenient to postpone de-allocation until after the inode is * written to disk with its new link count (zero). At this point, all of the * on-disk inode's block pointers are nullified and, with careful dependency * list ordering, all dependencies related to the inode will be satisfied and * the corresponding dependency structures de-allocated. So, if/when the * inode is reused, there will be no mixing of old dependencies with new * ones. This artificial dependency is set up by the block de-allocation * procedure above (softdep_setup_freeblocks) and completed by the * following procedure. */ static void handle_workitem_freefile(freefile) struct freefile *freefile; { struct workhead wkhd; struct fs *fs; struct inodedep *idp; struct ufsmount *ump; int error; ump = VFSTOUFS(freefile->fx_list.wk_mp); fs = ump->um_fs; #ifdef DEBUG ACQUIRE_LOCK(ump); error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp); FREE_LOCK(ump); if (error) panic("handle_workitem_freefile: inodedep %p survived", idp); #endif UFS_LOCK(ump); fs->fs_pendinginodes -= 1; UFS_UNLOCK(ump); LIST_INIT(&wkhd); LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list); if ((error = ffs_freefile(ump, fs, freefile->fx_devvp, freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0) softdep_error("handle_workitem_freefile", error); ACQUIRE_LOCK(ump); WORKITEM_FREE(freefile, D_FREEFILE); FREE_LOCK(ump); } /* * Helper function which unlinks marker element from work list and returns * the next element on the list. */ static __inline struct worklist * markernext(struct worklist *marker) { struct worklist *next; next = LIST_NEXT(marker, wk_list); LIST_REMOVE(marker, wk_list); return next; } /* * Disk writes. * * The dependency structures constructed above are most actively used when file * system blocks are written to disk. No constraints are placed on when a * block can be written, but unsatisfied update dependencies are made safe by * modifying (or replacing) the source memory for the duration of the disk * write. When the disk write completes, the memory block is again brought * up-to-date. * * In-core inode structure reclamation. * * Because there are a finite number of "in-core" inode structures, they are * reused regularly. By transferring all inode-related dependencies to the * in-memory inode block and indexing them separately (via "inodedep"s), we * can allow "in-core" inode structures to be reused at any time and avoid * any increase in contention. * * Called just before entering the device driver to initiate a new disk I/O. * The buffer must be locked, thus, no I/O completion operations can occur * while we are manipulating its associated dependencies. */ static void softdep_disk_io_initiation(bp) struct buf *bp; /* structure describing disk write to occur */ { struct worklist *wk; struct worklist marker; struct inodedep *inodedep; struct freeblks *freeblks; struct jblkdep *jblkdep; struct newblk *newblk; struct ufsmount *ump; /* * We only care about write operations. There should never * be dependencies for reads. */ if (bp->b_iocmd != BIO_WRITE) panic("softdep_disk_io_initiation: not write"); if (bp->b_vflags & BV_BKGRDINPROG) panic("softdep_disk_io_initiation: Writing buffer with " "background write in progress: %p", bp); if ((wk = LIST_FIRST(&bp->b_dep)) == NULL) return; ump = VFSTOUFS(wk->wk_mp); marker.wk_type = D_LAST + 1; /* Not a normal workitem */ PHOLD(curproc); /* Don't swap out kernel stack */ ACQUIRE_LOCK(ump); /* * Do any necessary pre-I/O processing. */ for (wk = LIST_FIRST(&bp->b_dep); wk != NULL; wk = markernext(&marker)) { LIST_INSERT_AFTER(wk, &marker, wk_list); switch (wk->wk_type) { case D_PAGEDEP: initiate_write_filepage(WK_PAGEDEP(wk), bp); continue; case D_INODEDEP: inodedep = WK_INODEDEP(wk); if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) initiate_write_inodeblock_ufs1(inodedep, bp); else initiate_write_inodeblock_ufs2(inodedep, bp); continue; case D_INDIRDEP: initiate_write_indirdep(WK_INDIRDEP(wk), bp); continue; case D_BMSAFEMAP: initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp); continue; case D_JSEG: WK_JSEG(wk)->js_buf = NULL; continue; case D_FREEBLKS: freeblks = WK_FREEBLKS(wk); jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd); /* * We have to wait for the freeblks to be journaled * before we can write an inodeblock with updated * pointers. Be careful to arrange the marker so * we revisit the freeblks if it's not removed by * the first jwait(). */ if (jblkdep != NULL) { LIST_REMOVE(&marker, wk_list); LIST_INSERT_BEFORE(wk, &marker, wk_list); jwait(&jblkdep->jb_list, MNT_WAIT); } continue; case D_ALLOCDIRECT: case D_ALLOCINDIR: /* * We have to wait for the jnewblk to be journaled * before we can write to a block if the contents * may be confused with an earlier file's indirect * at recovery time. Handle the marker as described * above. */ newblk = WK_NEWBLK(wk); if (newblk->nb_jnewblk != NULL && indirblk_lookup(newblk->nb_list.wk_mp, newblk->nb_newblkno)) { LIST_REMOVE(&marker, wk_list); LIST_INSERT_BEFORE(wk, &marker, wk_list); jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); } continue; case D_SBDEP: initiate_write_sbdep(WK_SBDEP(wk)); continue; case D_MKDIR: case D_FREEWORK: case D_FREEDEP: case D_JSEGDEP: continue; default: panic("handle_disk_io_initiation: Unexpected type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } FREE_LOCK(ump); PRELE(curproc); /* Allow swapout of kernel stack */ } /* * Called from within the procedure above to deal with unsatisfied * allocation dependencies in a directory. The buffer must be locked, * thus, no I/O completion operations can occur while we are * manipulating its associated dependencies. */ static void initiate_write_filepage(pagedep, bp) struct pagedep *pagedep; struct buf *bp; { struct jremref *jremref; struct jmvref *jmvref; struct dirrem *dirrem; struct diradd *dap; struct direct *ep; int i; if (pagedep->pd_state & IOSTARTED) { /* * This can only happen if there is a driver that does not * understand chaining. Here biodone will reissue the call * to strategy for the incomplete buffers. */ printf("initiate_write_filepage: already started\n"); return; } pagedep->pd_state |= IOSTARTED; /* * Wait for all journal remove dependencies to hit the disk. * We can not allow any potentially conflicting directory adds * to be visible before removes and rollback is too difficult. * The per-filesystem lock may be dropped and re-acquired, however * we hold the buf locked so the dependency can not go away. */ LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) jwait(&jremref->jr_list, MNT_WAIT); while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) jwait(&jmvref->jm_list, MNT_WAIT); for (i = 0; i < DAHASHSZ; i++) { LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { ep = (struct direct *) ((char *)bp->b_data + dap->da_offset); if (ep->d_ino != dap->da_newinum) panic("%s: dir inum %ju != new %ju", "initiate_write_filepage", (uintmax_t)ep->d_ino, (uintmax_t)dap->da_newinum); if (dap->da_state & DIRCHG) ep->d_ino = dap->da_previous->dm_oldinum; else ep->d_ino = 0; dap->da_state &= ~ATTACHED; dap->da_state |= UNDONE; } } } /* * Version of initiate_write_inodeblock that handles UFS1 dinodes. * Note that any bug fixes made to this routine must be done in the * version found below. * * Called from within the procedure above to deal with unsatisfied * allocation dependencies in an inodeblock. The buffer must be * locked, thus, no I/O completion operations can occur while we * are manipulating its associated dependencies. */ static void initiate_write_inodeblock_ufs1(inodedep, bp) struct inodedep *inodedep; struct buf *bp; /* The inode block */ { struct allocdirect *adp, *lastadp; struct ufs1_dinode *dp; struct ufs1_dinode *sip; struct inoref *inoref; struct ufsmount *ump; struct fs *fs; ufs_lbn_t i; #ifdef INVARIANTS ufs_lbn_t prevlbn = 0; #endif int deplist; if (inodedep->id_state & IOSTARTED) panic("initiate_write_inodeblock_ufs1: already started"); inodedep->id_state |= IOSTARTED; fs = inodedep->id_fs; ump = VFSTOUFS(inodedep->id_list.wk_mp); LOCK_OWNED(ump); dp = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, inodedep->id_ino); /* * If we're on the unlinked list but have not yet written our * next pointer initialize it here. */ if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { struct inodedep *inon; inon = TAILQ_NEXT(inodedep, id_unlinked); dp->di_freelink = inon ? inon->id_ino : 0; } /* * If the bitmap is not yet written, then the allocated * inode cannot be written to disk. */ if ((inodedep->id_state & DEPCOMPLETE) == 0) { if (inodedep->id_savedino1 != NULL) panic("initiate_write_inodeblock_ufs1: I/O underway"); FREE_LOCK(ump); sip = malloc(sizeof(struct ufs1_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS); ACQUIRE_LOCK(ump); inodedep->id_savedino1 = sip; *inodedep->id_savedino1 = *dp; bzero((caddr_t)dp, sizeof(struct ufs1_dinode)); dp->di_gen = inodedep->id_savedino1->di_gen; dp->di_freelink = inodedep->id_savedino1->di_freelink; return; } /* * If no dependencies, then there is nothing to roll back. */ inodedep->id_savedsize = dp->di_size; inodedep->id_savedextsize = 0; inodedep->id_savednlink = dp->di_nlink; if (TAILQ_EMPTY(&inodedep->id_inoupdt) && TAILQ_EMPTY(&inodedep->id_inoreflst)) return; /* * Revert the link count to that of the first unwritten journal entry. */ inoref = TAILQ_FIRST(&inodedep->id_inoreflst); if (inoref) dp->di_nlink = inoref->if_nlink; /* * Set the dependencies to busy. */ for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS if (deplist != 0 && prevlbn >= adp->ad_offset) panic("softdep_write_inodeblock: lbn order"); prevlbn = adp->ad_offset; if (adp->ad_offset < NDADDR && dp->di_db[adp->ad_offset] != adp->ad_newblkno) panic("%s: direct pointer #%jd mismatch %d != %jd", "softdep_write_inodeblock", (intmax_t)adp->ad_offset, dp->di_db[adp->ad_offset], (intmax_t)adp->ad_newblkno); if (adp->ad_offset >= NDADDR && dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) panic("%s: indirect pointer #%jd mismatch %d != %jd", "softdep_write_inodeblock", (intmax_t)adp->ad_offset - NDADDR, dp->di_ib[adp->ad_offset - NDADDR], (intmax_t)adp->ad_newblkno); deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) panic("softdep_write_inodeblock: Unknown state 0x%x", adp->ad_state); #endif /* INVARIANTS */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; } /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode * might have fragments that were not the last block in the file * which would corrupt the filesystem. */ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { if (adp->ad_offset >= NDADDR) break; dp->di_db[adp->ad_offset] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; for (i = adp->ad_offset + 1; i < NDADDR; i++) { #ifdef INVARIANTS if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) panic("softdep_write_inodeblock: lost dep1"); #endif /* INVARIANTS */ dp->di_db[i] = 0; } for (i = 0; i < NIADDR; i++) { #ifdef INVARIANTS if (dp->di_ib[i] != 0 && (deplist & ((1 << NDADDR) << i)) == 0) panic("softdep_write_inodeblock: lost dep2"); #endif /* INVARIANTS */ dp->di_ib[i] = 0; } return; } /* * If we have zero'ed out the last allocated block of the file, * roll back the size to the last currently allocated block. * We know that this last allocated block is a full-sized as * we already checked for fragments in the loop above. */ if (lastadp != NULL && dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { for (i = lastadp->ad_offset; i >= 0; i--) if (dp->di_db[i] != 0) break; dp->di_size = (i + 1) * fs->fs_bsize; } /* * The only dependencies are for indirect blocks. * * The file size for indirect block additions is not guaranteed. * Such a guarantee would be non-trivial to achieve. The conventional * synchronous write implementation also does not make this guarantee. * Fsck should catch and fix discrepancies. Arguably, the file size * can be over-estimated without destroying integrity when the file * moves into the indirect blocks (i.e., is large). If we want to * postpone fsck, we are stuck with this argument. */ for (; adp; adp = TAILQ_NEXT(adp, ad_next)) dp->di_ib[adp->ad_offset - NDADDR] = 0; } /* * Version of initiate_write_inodeblock that handles UFS2 dinodes. * Note that any bug fixes made to this routine must be done in the * version found above. * * Called from within the procedure above to deal with unsatisfied * allocation dependencies in an inodeblock. The buffer must be * locked, thus, no I/O completion operations can occur while we * are manipulating its associated dependencies. */ static void initiate_write_inodeblock_ufs2(inodedep, bp) struct inodedep *inodedep; struct buf *bp; /* The inode block */ { struct allocdirect *adp, *lastadp; struct ufs2_dinode *dp; struct ufs2_dinode *sip; struct inoref *inoref; struct ufsmount *ump; struct fs *fs; ufs_lbn_t i; #ifdef INVARIANTS ufs_lbn_t prevlbn = 0; #endif int deplist; if (inodedep->id_state & IOSTARTED) panic("initiate_write_inodeblock_ufs2: already started"); inodedep->id_state |= IOSTARTED; fs = inodedep->id_fs; ump = VFSTOUFS(inodedep->id_list.wk_mp); LOCK_OWNED(ump); dp = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, inodedep->id_ino); /* * If we're on the unlinked list but have not yet written our * next pointer initialize it here. */ if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { struct inodedep *inon; inon = TAILQ_NEXT(inodedep, id_unlinked); dp->di_freelink = inon ? inon->id_ino : 0; } /* * If the bitmap is not yet written, then the allocated * inode cannot be written to disk. */ if ((inodedep->id_state & DEPCOMPLETE) == 0) { if (inodedep->id_savedino2 != NULL) panic("initiate_write_inodeblock_ufs2: I/O underway"); FREE_LOCK(ump); sip = malloc(sizeof(struct ufs2_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS); ACQUIRE_LOCK(ump); inodedep->id_savedino2 = sip; *inodedep->id_savedino2 = *dp; bzero((caddr_t)dp, sizeof(struct ufs2_dinode)); dp->di_gen = inodedep->id_savedino2->di_gen; dp->di_freelink = inodedep->id_savedino2->di_freelink; return; } /* * If no dependencies, then there is nothing to roll back. */ inodedep->id_savedsize = dp->di_size; inodedep->id_savedextsize = dp->di_extsize; inodedep->id_savednlink = dp->di_nlink; if (TAILQ_EMPTY(&inodedep->id_inoupdt) && TAILQ_EMPTY(&inodedep->id_extupdt) && TAILQ_EMPTY(&inodedep->id_inoreflst)) return; /* * Revert the link count to that of the first unwritten journal entry. */ inoref = TAILQ_FIRST(&inodedep->id_inoreflst); if (inoref) dp->di_nlink = inoref->if_nlink; /* * Set the ext data dependencies to busy. */ for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS if (deplist != 0 && prevlbn >= adp->ad_offset) panic("softdep_write_inodeblock: lbn order"); prevlbn = adp->ad_offset; if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno) panic("%s: direct pointer #%jd mismatch %jd != %jd", "softdep_write_inodeblock", (intmax_t)adp->ad_offset, (intmax_t)dp->di_extb[adp->ad_offset], (intmax_t)adp->ad_newblkno); deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) panic("softdep_write_inodeblock: Unknown state 0x%x", adp->ad_state); #endif /* INVARIANTS */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; } /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode * might have fragments that were not the last block in the ext * data which would corrupt the filesystem. */ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { dp->di_extb[adp->ad_offset] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; for (i = adp->ad_offset + 1; i < NXADDR; i++) { #ifdef INVARIANTS if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) panic("softdep_write_inodeblock: lost dep1"); #endif /* INVARIANTS */ dp->di_extb[i] = 0; } lastadp = NULL; break; } /* * If we have zero'ed out the last allocated block of the ext * data, roll back the size to the last currently allocated block. * We know that this last allocated block is a full-sized as * we already checked for fragments in the loop above. */ if (lastadp != NULL && dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) { for (i = lastadp->ad_offset; i >= 0; i--) if (dp->di_extb[i] != 0) break; dp->di_extsize = (i + 1) * fs->fs_bsize; } /* * Set the file data dependencies to busy. */ for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS if (deplist != 0 && prevlbn >= adp->ad_offset) panic("softdep_write_inodeblock: lbn order"); if ((adp->ad_state & ATTACHED) == 0) panic("inodedep %p and adp %p not attached", inodedep, adp); prevlbn = adp->ad_offset; if (adp->ad_offset < NDADDR && dp->di_db[adp->ad_offset] != adp->ad_newblkno) panic("%s: direct pointer #%jd mismatch %jd != %jd", "softdep_write_inodeblock", (intmax_t)adp->ad_offset, (intmax_t)dp->di_db[adp->ad_offset], (intmax_t)adp->ad_newblkno); if (adp->ad_offset >= NDADDR && dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno) panic("%s indirect pointer #%jd mismatch %jd != %jd", "softdep_write_inodeblock:", (intmax_t)adp->ad_offset - NDADDR, (intmax_t)dp->di_ib[adp->ad_offset - NDADDR], (intmax_t)adp->ad_newblkno); deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) panic("softdep_write_inodeblock: Unknown state 0x%x", adp->ad_state); #endif /* INVARIANTS */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; } /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode * might have fragments that were not the last block in the file * which would corrupt the filesystem. */ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { if (adp->ad_offset >= NDADDR) break; dp->di_db[adp->ad_offset] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; for (i = adp->ad_offset + 1; i < NDADDR; i++) { #ifdef INVARIANTS if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) panic("softdep_write_inodeblock: lost dep2"); #endif /* INVARIANTS */ dp->di_db[i] = 0; } for (i = 0; i < NIADDR; i++) { #ifdef INVARIANTS if (dp->di_ib[i] != 0 && (deplist & ((1 << NDADDR) << i)) == 0) panic("softdep_write_inodeblock: lost dep3"); #endif /* INVARIANTS */ dp->di_ib[i] = 0; } return; } /* * If we have zero'ed out the last allocated block of the file, * roll back the size to the last currently allocated block. * We know that this last allocated block is a full-sized as * we already checked for fragments in the loop above. */ if (lastadp != NULL && dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { for (i = lastadp->ad_offset; i >= 0; i--) if (dp->di_db[i] != 0) break; dp->di_size = (i + 1) * fs->fs_bsize; } /* * The only dependencies are for indirect blocks. * * The file size for indirect block additions is not guaranteed. * Such a guarantee would be non-trivial to achieve. The conventional * synchronous write implementation also does not make this guarantee. * Fsck should catch and fix discrepancies. Arguably, the file size * can be over-estimated without destroying integrity when the file * moves into the indirect blocks (i.e., is large). If we want to * postpone fsck, we are stuck with this argument. */ for (; adp; adp = TAILQ_NEXT(adp, ad_next)) dp->di_ib[adp->ad_offset - NDADDR] = 0; } /* * Cancel an indirdep as a result of truncation. Release all of the * children allocindirs and place their journal work on the appropriate * list. */ static void cancel_indirdep(indirdep, bp, freeblks) struct indirdep *indirdep; struct buf *bp; struct freeblks *freeblks; { struct allocindir *aip; /* * None of the indirect pointers will ever be visible, * so they can simply be tossed. GOINGAWAY ensures * that allocated pointers will be saved in the buffer * cache until they are freed. Note that they will * only be able to be found by their physical address * since the inode mapping the logical address will * be gone. The save buffer used for the safe copy * was allocated in setup_allocindir_phase2 using * the physical address so it could be used for this * purpose. Hence we swap the safe copy with the real * copy, allowing the safe copy to be freed and holding * on to the real copy for later use in indir_trunc. */ if (indirdep->ir_state & GOINGAWAY) panic("cancel_indirdep: already gone"); if ((indirdep->ir_state & DEPCOMPLETE) == 0) { indirdep->ir_state |= DEPCOMPLETE; LIST_REMOVE(indirdep, ir_next); } indirdep->ir_state |= GOINGAWAY; /* * Pass in bp for blocks still have journal writes * pending so we can cancel them on their own. */ while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0) cancel_allocindir(aip, bp, freeblks, 0); while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) cancel_allocindir(aip, NULL, freeblks, 0); while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) cancel_allocindir(aip, NULL, freeblks, 0); while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0) cancel_allocindir(aip, NULL, freeblks, 0); /* * If there are pending partial truncations we need to keep the * old block copy around until they complete. This is because * the current b_data is not a perfect superset of the available * blocks. */ if (TAILQ_EMPTY(&indirdep->ir_trunc)) bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount); else bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); WORKLIST_REMOVE(&indirdep->ir_list); WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list); indirdep->ir_bp = NULL; indirdep->ir_freeblks = freeblks; } /* * Free an indirdep once it no longer has new pointers to track. */ static void free_indirdep(indirdep) struct indirdep *indirdep; { KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc), ("free_indirdep: Indir trunc list not empty.")); KASSERT(LIST_EMPTY(&indirdep->ir_completehd), ("free_indirdep: Complete head not empty.")); KASSERT(LIST_EMPTY(&indirdep->ir_writehd), ("free_indirdep: write head not empty.")); KASSERT(LIST_EMPTY(&indirdep->ir_donehd), ("free_indirdep: done head not empty.")); KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd), ("free_indirdep: deplist head not empty.")); KASSERT((indirdep->ir_state & DEPCOMPLETE), ("free_indirdep: %p still on newblk list.", indirdep)); KASSERT(indirdep->ir_saveddata == NULL, ("free_indirdep: %p still has saved data.", indirdep)); if (indirdep->ir_state & ONWORKLIST) WORKLIST_REMOVE(&indirdep->ir_list); WORKITEM_FREE(indirdep, D_INDIRDEP); } /* * Called before a write to an indirdep. This routine is responsible for * rolling back pointers to a safe state which includes only those * allocindirs which have been completed. */ static void initiate_write_indirdep(indirdep, bp) struct indirdep *indirdep; struct buf *bp; { struct ufsmount *ump; indirdep->ir_state |= IOSTARTED; if (indirdep->ir_state & GOINGAWAY) panic("disk_io_initiation: indirdep gone"); /* * If there are no remaining dependencies, this will be writing * the real pointers. */ if (LIST_EMPTY(&indirdep->ir_deplisthd) && TAILQ_EMPTY(&indirdep->ir_trunc)) return; /* * Replace up-to-date version with safe version. */ if (indirdep->ir_saveddata == NULL) { ump = VFSTOUFS(indirdep->ir_list.wk_mp); LOCK_OWNED(ump); FREE_LOCK(ump); indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP, M_SOFTDEP_FLAGS); ACQUIRE_LOCK(ump); } indirdep->ir_state &= ~ATTACHED; indirdep->ir_state |= UNDONE; bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); bcopy(indirdep->ir_savebp->b_data, bp->b_data, bp->b_bcount); } /* * Called when an inode has been cleared in a cg bitmap. This finally * eliminates any canceled jaddrefs */ void softdep_setup_inofree(mp, bp, ino, wkhd) struct mount *mp; struct buf *bp; ino_t ino; struct workhead *wkhd; { struct worklist *wk, *wkn; struct inodedep *inodedep; struct ufsmount *ump; uint8_t *inosused; struct cg *cgp; struct fs *fs; KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_inofree called on non-softdep filesystem")); ump = VFSTOUFS(mp); ACQUIRE_LOCK(ump); fs = ump->um_fs; cgp = (struct cg *)bp->b_data; inosused = cg_inosused(cgp); if (isset(inosused, ino % fs->fs_ipg)) panic("softdep_setup_inofree: inode %ju not freed.", (uintmax_t)ino); if (inodedep_lookup(mp, ino, 0, &inodedep)) panic("softdep_setup_inofree: ino %ju has existing inodedep %p", (uintmax_t)ino, inodedep); if (wkhd) { LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) { if (wk->wk_type != D_JADDREF) continue; WORKLIST_REMOVE(wk); /* * We can free immediately even if the jaddref * isn't attached in a background write as now * the bitmaps are reconciled. */ wk->wk_state |= COMPLETE | ATTACHED; free_jaddref(WK_JADDREF(wk)); } jwork_move(&bp->b_dep, wkhd); } FREE_LOCK(ump); } /* * Called via ffs_blkfree() after a set of frags has been cleared from a cg * map. Any dependencies waiting for the write to clear are added to the * buf's list and any jnewblks that are being canceled are discarded * immediately. */ void softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) struct mount *mp; struct buf *bp; ufs2_daddr_t blkno; int frags; struct workhead *wkhd; { struct bmsafemap *bmsafemap; struct jnewblk *jnewblk; struct ufsmount *ump; struct worklist *wk; struct fs *fs; #ifdef SUJ_DEBUG uint8_t *blksfree; struct cg *cgp; ufs2_daddr_t jstart; ufs2_daddr_t jend; ufs2_daddr_t end; long bno; int i; #endif CTR3(KTR_SUJ, "softdep_setup_blkfree: blkno %jd frags %d wk head %p", blkno, frags, wkhd); ump = VFSTOUFS(mp); KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_setup_blkfree called on non-softdep filesystem")); ACQUIRE_LOCK(ump); /* Lookup the bmsafemap so we track when it is dirty. */ fs = ump->um_fs; bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL); /* * Detach any jnewblks which have been canceled. They must linger * until the bitmap is cleared again by ffs_blkfree() to prevent * an unjournaled allocation from hitting the disk. */ if (wkhd) { while ((wk = LIST_FIRST(wkhd)) != NULL) { CTR2(KTR_SUJ, "softdep_setup_blkfree: blkno %jd wk type %d", blkno, wk->wk_type); WORKLIST_REMOVE(wk); if (wk->wk_type != D_JNEWBLK) { WORKLIST_INSERT(&bmsafemap->sm_freehd, wk); continue; } jnewblk = WK_JNEWBLK(wk); KASSERT(jnewblk->jn_state & GOINGAWAY, ("softdep_setup_blkfree: jnewblk not canceled.")); #ifdef SUJ_DEBUG /* * Assert that this block is free in the bitmap * before we discard the jnewblk. */ cgp = (struct cg *)bp->b_data; blksfree = cg_blksfree(cgp); bno = dtogd(fs, jnewblk->jn_blkno); for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) { if (isset(blksfree, bno + i)) continue; panic("softdep_setup_blkfree: not free"); } #endif /* * Even if it's not attached we can free immediately * as the new bitmap is correct. */ wk->wk_state |= COMPLETE | ATTACHED; free_jnewblk(jnewblk); } } #ifdef SUJ_DEBUG /* * Assert that we are not freeing a block which has an outstanding * allocation dependency. */ fs = VFSTOUFS(mp)->um_fs; bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL); end = blkno + frags; LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { /* * Don't match against blocks that will be freed when the * background write is done. */ if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) == (COMPLETE | DEPCOMPLETE)) continue; jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags; jend = jnewblk->jn_blkno + jnewblk->jn_frags; if ((blkno >= jstart && blkno < jend) || (end > jstart && end <= jend)) { printf("state 0x%X %jd - %d %d dep %p\n", jnewblk->jn_state, jnewblk->jn_blkno, jnewblk->jn_oldfrags, jnewblk->jn_frags, jnewblk->jn_dep); panic("softdep_setup_blkfree: " "%jd-%jd(%d) overlaps with %jd-%jd", blkno, end, frags, jstart, jend); } } #endif FREE_LOCK(ump); } /* * Revert a block allocation when the journal record that describes it * is not yet written. */ static int jnewblk_rollback(jnewblk, fs, cgp, blksfree) struct jnewblk *jnewblk; struct fs *fs; struct cg *cgp; uint8_t *blksfree; { ufs1_daddr_t fragno; long cgbno, bbase; int frags, blk; int i; frags = 0; cgbno = dtogd(fs, jnewblk->jn_blkno); /* * We have to test which frags need to be rolled back. We may * be operating on a stale copy when doing background writes. */ for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) if (isclr(blksfree, cgbno + i)) frags++; if (frags == 0) return (0); /* * This is mostly ffs_blkfree() sans some validation and * superblock updates. */ if (frags == fs->fs_frag) { fragno = fragstoblks(fs, cgbno); ffs_setblock(fs, blksfree, fragno); ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; } else { cgbno += jnewblk->jn_oldfrags; bbase = cgbno - fragnum(fs, cgbno); /* Decrement the old frags. */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, -1); /* Deallocate the fragment */ for (i = 0; i < frags; i++) setbit(blksfree, cgbno + i); cgp->cg_cs.cs_nffree += frags; /* Add back in counts associated with the new frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, 1); /* If a complete block has been reassembled, account for it. */ fragno = fragstoblks(fs, bbase); if (ffs_isblock(fs, blksfree, fragno)) { cgp->cg_cs.cs_nffree -= fs->fs_frag; ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; } } stat_jnewblk++; jnewblk->jn_state &= ~ATTACHED; jnewblk->jn_state |= UNDONE; return (frags); } static void initiate_write_bmsafemap(bmsafemap, bp) struct bmsafemap *bmsafemap; struct buf *bp; /* The cg block. */ { struct jaddref *jaddref; struct jnewblk *jnewblk; uint8_t *inosused; uint8_t *blksfree; struct cg *cgp; struct fs *fs; ino_t ino; if (bmsafemap->sm_state & IOSTARTED) return; bmsafemap->sm_state |= IOSTARTED; /* * Clear any inode allocations which are pending journal writes. */ if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) { cgp = (struct cg *)bp->b_data; fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; inosused = cg_inosused(cgp); LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) { ino = jaddref->ja_ino % fs->fs_ipg; if (isset(inosused, ino)) { if ((jaddref->ja_mode & IFMT) == IFDIR) cgp->cg_cs.cs_ndir--; cgp->cg_cs.cs_nifree++; clrbit(inosused, ino); jaddref->ja_state &= ~ATTACHED; jaddref->ja_state |= UNDONE; stat_jaddref++; } else panic("initiate_write_bmsafemap: inode %ju " "marked free", (uintmax_t)jaddref->ja_ino); } } /* * Clear any block allocations which are pending journal writes. */ if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { cgp = (struct cg *)bp->b_data; fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; blksfree = cg_blksfree(cgp); LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { if (jnewblk_rollback(jnewblk, fs, cgp, blksfree)) continue; panic("initiate_write_bmsafemap: block %jd " "marked free", jnewblk->jn_blkno); } } /* * Move allocation lists to the written lists so they can be * cleared once the block write is complete. */ LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr, inodedep, id_deps); LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr, newblk, nb_deps); LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist, wk_list); } /* * This routine is called during the completion interrupt * service routine for a disk write (from the procedure called * by the device driver to inform the filesystem caches of * a request completion). It should be called early in this * procedure, before the block is made available to other * processes or other routines are called. * */ static void softdep_disk_write_complete(bp) struct buf *bp; /* describes the completed disk write */ { struct worklist *wk; struct worklist *owk; struct ufsmount *ump; struct workhead reattach; struct freeblks *freeblks; struct buf *sbp; /* * If an error occurred while doing the write, then the data * has not hit the disk and the dependencies cannot be unrolled. */ if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) return; if ((wk = LIST_FIRST(&bp->b_dep)) == NULL) return; ump = VFSTOUFS(wk->wk_mp); LIST_INIT(&reattach); /* * This lock must not be released anywhere in this code segment. */ sbp = NULL; owk = NULL; ACQUIRE_LOCK(ump); while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { WORKLIST_REMOVE(wk); atomic_add_long(&dep_write[wk->wk_type], 1); if (wk == owk) panic("duplicate worklist: %p\n", wk); owk = wk; switch (wk->wk_type) { case D_PAGEDEP: if (handle_written_filepage(WK_PAGEDEP(wk), bp)) WORKLIST_INSERT(&reattach, wk); continue; case D_INODEDEP: if (handle_written_inodeblock(WK_INODEDEP(wk), bp)) WORKLIST_INSERT(&reattach, wk); continue; case D_BMSAFEMAP: if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp)) WORKLIST_INSERT(&reattach, wk); continue; case D_MKDIR: handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); continue; case D_ALLOCDIRECT: wk->wk_state |= COMPLETE; handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL); continue; case D_ALLOCINDIR: wk->wk_state |= COMPLETE; handle_allocindir_partdone(WK_ALLOCINDIR(wk)); continue; case D_INDIRDEP: if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp)) WORKLIST_INSERT(&reattach, wk); continue; case D_FREEBLKS: wk->wk_state |= COMPLETE; freeblks = WK_FREEBLKS(wk); if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd)) add_to_worklist(wk, WK_NODELAY); continue; case D_FREEWORK: handle_written_freework(WK_FREEWORK(wk)); break; case D_JSEGDEP: free_jsegdep(WK_JSEGDEP(wk)); continue; case D_JSEG: handle_written_jseg(WK_JSEG(wk), bp); continue; case D_SBDEP: if (handle_written_sbdep(WK_SBDEP(wk), bp)) WORKLIST_INSERT(&reattach, wk); continue; case D_FREEDEP: free_freedep(WK_FREEDEP(wk)); continue; default: panic("handle_disk_write_complete: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } /* * Reattach any requests that must be redone. */ while ((wk = LIST_FIRST(&reattach)) != NULL) { WORKLIST_REMOVE(wk); WORKLIST_INSERT(&bp->b_dep, wk); } FREE_LOCK(ump); if (sbp) brelse(sbp); } /* * Called from within softdep_disk_write_complete above. Note that * this routine is always called from interrupt level with further * splbio interrupts blocked. */ static void handle_allocdirect_partdone(adp, wkhd) struct allocdirect *adp; /* the completed allocdirect */ struct workhead *wkhd; /* Work to do when inode is writtne. */ { struct allocdirectlst *listhead; struct allocdirect *listadp; struct inodedep *inodedep; long bsize; if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) return; /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode * might have fragments that were not the last block in the file * which would corrupt the filesystem. Thus, we cannot free any * allocdirects after one whose ad_oldblkno claims a fragment as * these blocks must be rolled back to zero before writing the inode. * We check the currently active set of allocdirects in id_inoupdt * or id_extupdt as appropriate. */ inodedep = adp->ad_inodedep; bsize = inodedep->id_fs->fs_bsize; if (adp->ad_state & EXTDATA) listhead = &inodedep->id_extupdt; else listhead = &inodedep->id_inoupdt; TAILQ_FOREACH(listadp, listhead, ad_next) { /* found our block */ if (listadp == adp) break; /* continue if ad_oldlbn is not a fragment */ if (listadp->ad_oldsize == 0 || listadp->ad_oldsize == bsize) continue; /* hit a fragment */ return; } /* * If we have reached the end of the current list without * finding the just finished dependency, then it must be * on the future dependency list. Future dependencies cannot * be freed until they are moved to the current list. */ if (listadp == NULL) { #ifdef DEBUG if (adp->ad_state & EXTDATA) listhead = &inodedep->id_newextupdt; else listhead = &inodedep->id_newinoupdt; TAILQ_FOREACH(listadp, listhead, ad_next) /* found our block */ if (listadp == adp) break; if (listadp == NULL) panic("handle_allocdirect_partdone: lost dep"); #endif /* DEBUG */ return; } /* * If we have found the just finished dependency, then queue * it along with anything that follows it that is complete. * Since the pointer has not yet been written in the inode * as the dependency prevents it, place the allocdirect on the * bufwait list where it will be freed once the pointer is * valid. */ if (wkhd == NULL) wkhd = &inodedep->id_bufwait; for (; adp; adp = listadp) { listadp = TAILQ_NEXT(adp, ad_next); if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) return; TAILQ_REMOVE(listhead, adp, ad_next); WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list); } } /* * Called from within softdep_disk_write_complete above. This routine * completes successfully written allocindirs. */ static void handle_allocindir_partdone(aip) struct allocindir *aip; /* the completed allocindir */ { struct indirdep *indirdep; if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE) return; indirdep = aip->ai_indirdep; LIST_REMOVE(aip, ai_next); /* * Don't set a pointer while the buffer is undergoing IO or while * we have active truncations. */ if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) { LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next); return; } if (indirdep->ir_state & UFS1FMT) ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = aip->ai_newblkno; else ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = aip->ai_newblkno; /* * Await the pointer write before freeing the allocindir. */ LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next); } /* * Release segments held on a jwork list. */ static void handle_jwork(wkhd) struct workhead *wkhd; { struct worklist *wk; while ((wk = LIST_FIRST(wkhd)) != NULL) { WORKLIST_REMOVE(wk); switch (wk->wk_type) { case D_JSEGDEP: free_jsegdep(WK_JSEGDEP(wk)); continue; case D_FREEDEP: free_freedep(WK_FREEDEP(wk)); continue; case D_FREEFRAG: rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep)); WORKITEM_FREE(wk, D_FREEFRAG); continue; case D_FREEWORK: handle_written_freework(WK_FREEWORK(wk)); continue; default: panic("handle_jwork: Unknown type %s\n", TYPENAME(wk->wk_type)); } } } /* * Handle the bufwait list on an inode when it is safe to release items * held there. This normally happens after an inode block is written but * may be delayed and handled later if there are pending journal items that * are not yet safe to be released. */ static struct freefile * handle_bufwait(inodedep, refhd) struct inodedep *inodedep; struct workhead *refhd; { struct jaddref *jaddref; struct freefile *freefile; struct worklist *wk; freefile = NULL; while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { WORKLIST_REMOVE(wk); switch (wk->wk_type) { case D_FREEFILE: /* * We defer adding freefile to the worklist * until all other additions have been made to * ensure that it will be done after all the * old blocks have been freed. */ if (freefile != NULL) panic("handle_bufwait: freefile"); freefile = WK_FREEFILE(wk); continue; case D_MKDIR: handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); continue; case D_DIRADD: diradd_inode_written(WK_DIRADD(wk), inodedep); continue; case D_FREEFRAG: wk->wk_state |= COMPLETE; if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE) add_to_worklist(wk, 0); continue; case D_DIRREM: wk->wk_state |= COMPLETE; add_to_worklist(wk, 0); continue; case D_ALLOCDIRECT: case D_ALLOCINDIR: free_newblk(WK_NEWBLK(wk)); continue; case D_JNEWBLK: wk->wk_state |= COMPLETE; free_jnewblk(WK_JNEWBLK(wk)); continue; /* * Save freed journal segments and add references on * the supplied list which will delay their release * until the cg bitmap is cleared on disk. */ case D_JSEGDEP: if (refhd == NULL) free_jsegdep(WK_JSEGDEP(wk)); else WORKLIST_INSERT(refhd, wk); continue; case D_JADDREF: jaddref = WK_JADDREF(wk); TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); /* * Transfer any jaddrefs to the list to be freed with * the bitmap if we're handling a removed file. */ if (refhd == NULL) { wk->wk_state |= COMPLETE; free_jaddref(jaddref); } else WORKLIST_INSERT(refhd, wk); continue; default: panic("handle_bufwait: Unknown type %p(%s)", wk, TYPENAME(wk->wk_type)); /* NOTREACHED */ } } return (freefile); } /* * Called from within softdep_disk_write_complete above to restore * in-memory inode block contents to their most up-to-date state. Note * that this routine is always called from interrupt level with further * splbio interrupts blocked. */ static int handle_written_inodeblock(inodedep, bp) struct inodedep *inodedep; struct buf *bp; /* buffer containing the inode block */ { struct freefile *freefile; struct allocdirect *adp, *nextadp; struct ufs1_dinode *dp1 = NULL; struct ufs2_dinode *dp2 = NULL; struct workhead wkhd; int hadchanges, fstype; ino_t freelink; LIST_INIT(&wkhd); hadchanges = 0; freefile = NULL; if ((inodedep->id_state & IOSTARTED) == 0) panic("handle_written_inodeblock: not started"); inodedep->id_state &= ~IOSTARTED; if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) { fstype = UFS1; dp1 = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); freelink = dp1->di_freelink; } else { fstype = UFS2; dp2 = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); freelink = dp2->di_freelink; } /* * Leave this inodeblock dirty until it's in the list. */ if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED) { struct inodedep *inon; inon = TAILQ_NEXT(inodedep, id_unlinked); if ((inon == NULL && freelink == 0) || (inon && inon->id_ino == freelink)) { if (inon) inon->id_state |= UNLINKPREV; inodedep->id_state |= UNLINKNEXT; } hadchanges = 1; } /* * If we had to rollback the inode allocation because of * bitmaps being incomplete, then simply restore it. * Keep the block dirty so that it will not be reclaimed until * all associated dependencies have been cleared and the * corresponding updates written to disk. */ if (inodedep->id_savedino1 != NULL) { hadchanges = 1; if (fstype == UFS1) *dp1 = *inodedep->id_savedino1; else *dp2 = *inodedep->id_savedino2; free(inodedep->id_savedino1, M_SAVEDINO); inodedep->id_savedino1 = NULL; if ((bp->b_flags & B_DELWRI) == 0) stat_inode_bitmap++; bdirty(bp); /* * If the inode is clear here and GOINGAWAY it will never * be written. Process the bufwait and clear any pending * work which may include the freefile. */ if (inodedep->id_state & GOINGAWAY) goto bufwait; return (1); } inodedep->id_state |= COMPLETE; /* * Roll forward anything that had to be rolled back before * the inode could be updated. */ for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) { nextadp = TAILQ_NEXT(adp, ad_next); if (adp->ad_state & ATTACHED) panic("handle_written_inodeblock: new entry"); if (fstype == UFS1) { if (adp->ad_offset < NDADDR) { if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno) panic("%s %s #%jd mismatch %d != %jd", "handle_written_inodeblock:", "direct pointer", (intmax_t)adp->ad_offset, dp1->di_db[adp->ad_offset], (intmax_t)adp->ad_oldblkno); dp1->di_db[adp->ad_offset] = adp->ad_newblkno; } else { if (dp1->di_ib[adp->ad_offset - NDADDR] != 0) panic("%s: %s #%jd allocated as %d", "handle_written_inodeblock", "indirect pointer", (intmax_t)adp->ad_offset - NDADDR, dp1->di_ib[adp->ad_offset - NDADDR]); dp1->di_ib[adp->ad_offset - NDADDR] = adp->ad_newblkno; } } else { if (adp->ad_offset < NDADDR) { if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno) panic("%s: %s #%jd %s %jd != %jd", "handle_written_inodeblock", "direct pointer", (intmax_t)adp->ad_offset, "mismatch", (intmax_t)dp2->di_db[adp->ad_offset], (intmax_t)adp->ad_oldblkno); dp2->di_db[adp->ad_offset] = adp->ad_newblkno; } else { if (dp2->di_ib[adp->ad_offset - NDADDR] != 0) panic("%s: %s #%jd allocated as %jd", "handle_written_inodeblock", "indirect pointer", (intmax_t)adp->ad_offset - NDADDR, (intmax_t) dp2->di_ib[adp->ad_offset - NDADDR]); dp2->di_ib[adp->ad_offset - NDADDR] = adp->ad_newblkno; } } adp->ad_state &= ~UNDONE; adp->ad_state |= ATTACHED; hadchanges = 1; } for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) { nextadp = TAILQ_NEXT(adp, ad_next); if (adp->ad_state & ATTACHED) panic("handle_written_inodeblock: new entry"); if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno) panic("%s: direct pointers #%jd %s %jd != %jd", "handle_written_inodeblock", (intmax_t)adp->ad_offset, "mismatch", (intmax_t)dp2->di_extb[adp->ad_offset], (intmax_t)adp->ad_oldblkno); dp2->di_extb[adp->ad_offset] = adp->ad_newblkno; adp->ad_state &= ~UNDONE; adp->ad_state |= ATTACHED; hadchanges = 1; } if (hadchanges && (bp->b_flags & B_DELWRI) == 0) stat_direct_blk_ptrs++; /* * Reset the file size to its most up-to-date value. */ if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1) panic("handle_written_inodeblock: bad size"); if (inodedep->id_savednlink > LINK_MAX) panic("handle_written_inodeblock: Invalid link count " "%d for inodedep %p", inodedep->id_savednlink, inodedep); if (fstype == UFS1) { if (dp1->di_nlink != inodedep->id_savednlink) { dp1->di_nlink = inodedep->id_savednlink; hadchanges = 1; } if (dp1->di_size != inodedep->id_savedsize) { dp1->di_size = inodedep->id_savedsize; hadchanges = 1; } } else { if (dp2->di_nlink != inodedep->id_savednlink) { dp2->di_nlink = inodedep->id_savednlink; hadchanges = 1; } if (dp2->di_size != inodedep->id_savedsize) { dp2->di_size = inodedep->id_savedsize; hadchanges = 1; } if (dp2->di_extsize != inodedep->id_savedextsize) { dp2->di_extsize = inodedep->id_savedextsize; hadchanges = 1; } } inodedep->id_savedsize = -1; inodedep->id_savedextsize = -1; inodedep->id_savednlink = -1; /* * If there were any rollbacks in the inode block, then it must be * marked dirty so that its will eventually get written back in * its correct form. */ if (hadchanges) bdirty(bp); bufwait: /* * Process any allocdirects that completed during the update. */ if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) handle_allocdirect_partdone(adp, &wkhd); if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) handle_allocdirect_partdone(adp, &wkhd); /* * Process deallocations that were held pending until the * inode had been written to disk. Freeing of the inode * is delayed until after all blocks have been freed to * avoid creation of new triples * before the old ones have been deleted. Completely * unlinked inodes are not processed until the unlinked * inode list is written or the last reference is removed. */ if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) { freefile = handle_bufwait(inodedep, NULL); if (freefile && !LIST_EMPTY(&wkhd)) { WORKLIST_INSERT(&wkhd, &freefile->fx_list); freefile = NULL; } } /* * Move rolled forward dependency completions to the bufwait list * now that those that were already written have been processed. */ if (!LIST_EMPTY(&wkhd) && hadchanges == 0) panic("handle_written_inodeblock: bufwait but no changes"); jwork_move(&inodedep->id_bufwait, &wkhd); if (freefile != NULL) { /* * If the inode is goingaway it was never written. Fake up * the state here so free_inodedep() can succeed. */ if (inodedep->id_state & GOINGAWAY) inodedep->id_state |= COMPLETE | DEPCOMPLETE; if (free_inodedep(inodedep) == 0) panic("handle_written_inodeblock: live inodedep %p", inodedep); add_to_worklist(&freefile->fx_list, 0); return (0); } /* * If no outstanding dependencies, free it. */ if (free_inodedep(inodedep) || (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 && TAILQ_FIRST(&inodedep->id_inoupdt) == 0 && TAILQ_FIRST(&inodedep->id_extupdt) == 0 && LIST_FIRST(&inodedep->id_bufwait) == 0)) return (0); return (hadchanges); } static int handle_written_indirdep(indirdep, bp, bpp) struct indirdep *indirdep; struct buf *bp; struct buf **bpp; { struct allocindir *aip; struct buf *sbp; int chgs; if (indirdep->ir_state & GOINGAWAY) panic("handle_written_indirdep: indirdep gone"); if ((indirdep->ir_state & IOSTARTED) == 0) panic("handle_written_indirdep: IO not started"); chgs = 0; /* * If there were rollbacks revert them here. */ if (indirdep->ir_saveddata) { bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); if (TAILQ_EMPTY(&indirdep->ir_trunc)) { free(indirdep->ir_saveddata, M_INDIRDEP); indirdep->ir_saveddata = NULL; } chgs = 1; } indirdep->ir_state &= ~(UNDONE | IOSTARTED); indirdep->ir_state |= ATTACHED; /* * Move allocindirs with written pointers to the completehd if * the indirdep's pointer is not yet written. Otherwise * free them here. */ while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) { LIST_REMOVE(aip, ai_next); if ((indirdep->ir_state & DEPCOMPLETE) == 0) { LIST_INSERT_HEAD(&indirdep->ir_completehd, aip, ai_next); newblk_freefrag(&aip->ai_block); continue; } free_newblk(&aip->ai_block); } /* * Move allocindirs that have finished dependency processing from * the done list to the write list after updating the pointers. */ if (TAILQ_EMPTY(&indirdep->ir_trunc)) { while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) { handle_allocindir_partdone(aip); if (aip == LIST_FIRST(&indirdep->ir_donehd)) panic("disk_write_complete: not gone"); chgs = 1; } } /* * Preserve the indirdep if there were any changes or if it is not * yet valid on disk. */ if (chgs) { stat_indir_blk_ptrs++; bdirty(bp); return (1); } /* * If there were no changes we can discard the savedbp and detach * ourselves from the buf. We are only carrying completed pointers * in this case. */ sbp = indirdep->ir_savebp; sbp->b_flags |= B_INVAL | B_NOCACHE; indirdep->ir_savebp = NULL; indirdep->ir_bp = NULL; if (*bpp != NULL) panic("handle_written_indirdep: bp already exists."); *bpp = sbp; /* * The indirdep may not be freed until its parent points at it. */ if (indirdep->ir_state & DEPCOMPLETE) free_indirdep(indirdep); return (0); } /* * Process a diradd entry after its dependent inode has been written. * This routine must be called with splbio interrupts blocked. */ static void diradd_inode_written(dap, inodedep) struct diradd *dap; struct inodedep *inodedep; { dap->da_state |= COMPLETE; complete_diradd(dap); WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); } /* * Returns true if the bmsafemap will have rollbacks when written. Must only * be called with the per-filesystem lock and the buf lock on the cg held. */ static int bmsafemap_backgroundwrite(bmsafemap, bp) struct bmsafemap *bmsafemap; struct buf *bp; { int dirty; LOCK_OWNED(VFSTOUFS(bmsafemap->sm_list.wk_mp)); dirty = !LIST_EMPTY(&bmsafemap->sm_jaddrefhd) | !LIST_EMPTY(&bmsafemap->sm_jnewblkhd); /* * If we're initiating a background write we need to process the * rollbacks as they exist now, not as they exist when IO starts. * No other consumers will look at the contents of the shadowed * buf so this is safe to do here. */ if (bp->b_xflags & BX_BKGRDMARKER) initiate_write_bmsafemap(bmsafemap, bp); return (dirty); } /* * Re-apply an allocation when a cg write is complete. */ static int jnewblk_rollforward(jnewblk, fs, cgp, blksfree) struct jnewblk *jnewblk; struct fs *fs; struct cg *cgp; uint8_t *blksfree; { ufs1_daddr_t fragno; ufs2_daddr_t blkno; long cgbno, bbase; int frags, blk; int i; frags = 0; cgbno = dtogd(fs, jnewblk->jn_blkno); for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) { if (isclr(blksfree, cgbno + i)) panic("jnewblk_rollforward: re-allocated fragment"); frags++; } if (frags == fs->fs_frag) { blkno = fragstoblks(fs, cgbno); ffs_clrblock(fs, blksfree, (long)blkno); ffs_clusteracct(fs, cgp, blkno, -1); cgp->cg_cs.cs_nbfree--; } else { bbase = cgbno - fragnum(fs, cgbno); cgbno += jnewblk->jn_oldfrags; /* If a complete block had been reassembled, account for it. */ fragno = fragstoblks(fs, bbase); if (ffs_isblock(fs, blksfree, fragno)) { cgp->cg_cs.cs_nffree += fs->fs_frag; ffs_clusteracct(fs, cgp, fragno, -1); cgp->cg_cs.cs_nbfree--; } /* Decrement the old frags. */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, -1); /* Allocate the fragment */ for (i = 0; i < frags; i++) clrbit(blksfree, cgbno + i); cgp->cg_cs.cs_nffree -= frags; /* Add back in counts associated with the new frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, 1); } return (frags); } /* * Complete a write to a bmsafemap structure. Roll forward any bitmap * changes if it's not a background write. Set all written dependencies * to DEPCOMPLETE and free the structure if possible. */ static int handle_written_bmsafemap(bmsafemap, bp) struct bmsafemap *bmsafemap; struct buf *bp; { struct newblk *newblk; struct inodedep *inodedep; struct jaddref *jaddref, *jatmp; struct jnewblk *jnewblk, *jntmp; struct ufsmount *ump; uint8_t *inosused; uint8_t *blksfree; struct cg *cgp; struct fs *fs; ino_t ino; int foreground; int chgs; if ((bmsafemap->sm_state & IOSTARTED) == 0) panic("initiate_write_bmsafemap: Not started\n"); ump = VFSTOUFS(bmsafemap->sm_list.wk_mp); chgs = 0; bmsafemap->sm_state &= ~IOSTARTED; foreground = (bp->b_xflags & BX_BKGRDMARKER) == 0; /* * Release journal work that was waiting on the write. */ handle_jwork(&bmsafemap->sm_freewr); /* * Restore unwritten inode allocation pending jaddref writes. */ if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) { cgp = (struct cg *)bp->b_data; fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; inosused = cg_inosused(cgp); LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps, jatmp) { if ((jaddref->ja_state & UNDONE) == 0) continue; ino = jaddref->ja_ino % fs->fs_ipg; if (isset(inosused, ino)) panic("handle_written_bmsafemap: " "re-allocated inode"); /* Do the roll-forward only if it's a real copy. */ if (foreground) { if ((jaddref->ja_mode & IFMT) == IFDIR) cgp->cg_cs.cs_ndir++; cgp->cg_cs.cs_nifree--; setbit(inosused, ino); chgs = 1; } jaddref->ja_state &= ~UNDONE; jaddref->ja_state |= ATTACHED; free_jaddref(jaddref); } } /* * Restore any block allocations which are pending journal writes. */ if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { cgp = (struct cg *)bp->b_data; fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; blksfree = cg_blksfree(cgp); LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps, jntmp) { if ((jnewblk->jn_state & UNDONE) == 0) continue; /* Do the roll-forward only if it's a real copy. */ if (foreground && jnewblk_rollforward(jnewblk, fs, cgp, blksfree)) chgs = 1; jnewblk->jn_state &= ~(UNDONE | NEWBLOCK); jnewblk->jn_state |= ATTACHED; free_jnewblk(jnewblk); } } while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) { newblk->nb_state |= DEPCOMPLETE; newblk->nb_state &= ~ONDEPLIST; newblk->nb_bmsafemap = NULL; LIST_REMOVE(newblk, nb_deps); if (newblk->nb_list.wk_type == D_ALLOCDIRECT) handle_allocdirect_partdone( WK_ALLOCDIRECT(&newblk->nb_list), NULL); else if (newblk->nb_list.wk_type == D_ALLOCINDIR) handle_allocindir_partdone( WK_ALLOCINDIR(&newblk->nb_list)); else if (newblk->nb_list.wk_type != D_NEWBLK) panic("handle_written_bmsafemap: Unexpected type: %s", TYPENAME(newblk->nb_list.wk_type)); } while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) { inodedep->id_state |= DEPCOMPLETE; inodedep->id_state &= ~ONDEPLIST; LIST_REMOVE(inodedep, id_deps); inodedep->id_bmsafemap = NULL; } LIST_REMOVE(bmsafemap, sm_next); if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) && LIST_EMPTY(&bmsafemap->sm_jnewblkhd) && LIST_EMPTY(&bmsafemap->sm_newblkhd) && LIST_EMPTY(&bmsafemap->sm_inodedephd) && LIST_EMPTY(&bmsafemap->sm_freehd)) { LIST_REMOVE(bmsafemap, sm_hash); WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); return (0); } LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next); if (foreground) bdirty(bp); return (1); } /* * Try to free a mkdir dependency. */ static void complete_mkdir(mkdir) struct mkdir *mkdir; { struct diradd *dap; if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE) return; LIST_REMOVE(mkdir, md_mkdirs); dap = mkdir->md_diradd; dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) { dap->da_state |= DEPCOMPLETE; complete_diradd(dap); } WORKITEM_FREE(mkdir, D_MKDIR); } /* * Handle the completion of a mkdir dependency. */ static void handle_written_mkdir(mkdir, type) struct mkdir *mkdir; int type; { if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type) panic("handle_written_mkdir: bad type"); mkdir->md_state |= COMPLETE; complete_mkdir(mkdir); } static int free_pagedep(pagedep) struct pagedep *pagedep; { int i; if (pagedep->pd_state & NEWBLOCK) return (0); if (!LIST_EMPTY(&pagedep->pd_dirremhd)) return (0); for (i = 0; i < DAHASHSZ; i++) if (!LIST_EMPTY(&pagedep->pd_diraddhd[i])) return (0); if (!LIST_EMPTY(&pagedep->pd_pendinghd)) return (0); if (!LIST_EMPTY(&pagedep->pd_jmvrefhd)) return (0); if (pagedep->pd_state & ONWORKLIST) WORKLIST_REMOVE(&pagedep->pd_list); LIST_REMOVE(pagedep, pd_hash); WORKITEM_FREE(pagedep, D_PAGEDEP); return (1); } /* * Called from within softdep_disk_write_complete above. * A write operation was just completed. Removed inodes can * now be freed and associated block pointers may be committed. * Note that this routine is always called from interrupt level * with further splbio interrupts blocked. */ static int handle_written_filepage(pagedep, bp) struct pagedep *pagedep; struct buf *bp; /* buffer containing the written page */ { struct dirrem *dirrem; struct diradd *dap, *nextdap; struct direct *ep; int i, chgs; if ((pagedep->pd_state & IOSTARTED) == 0) panic("handle_written_filepage: not started"); pagedep->pd_state &= ~IOSTARTED; /* * Process any directory removals that have been committed. */ while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) { LIST_REMOVE(dirrem, dm_next); dirrem->dm_state |= COMPLETE; dirrem->dm_dirinum = pagedep->pd_ino; KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), ("handle_written_filepage: Journal entries not written.")); add_to_worklist(&dirrem->dm_list, 0); } /* * Free any directory additions that have been committed. * If it is a newly allocated block, we have to wait until * the on-disk directory inode claims the new block. */ if ((pagedep->pd_state & NEWBLOCK) == 0) while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) free_diradd(dap, NULL); /* * Uncommitted directory entries must be restored. */ for (chgs = 0, i = 0; i < DAHASHSZ; i++) { for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap; dap = nextdap) { nextdap = LIST_NEXT(dap, da_pdlist); if (dap->da_state & ATTACHED) panic("handle_written_filepage: attached"); ep = (struct direct *) ((char *)bp->b_data + dap->da_offset); ep->d_ino = dap->da_newinum; dap->da_state &= ~UNDONE; dap->da_state |= ATTACHED; chgs = 1; /* * If the inode referenced by the directory has * been written out, then the dependency can be * moved to the pending list. */ if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); } } } /* * If there were any rollbacks in the directory, then it must be * marked dirty so that its will eventually get written back in * its correct form. */ if (chgs) { if ((bp->b_flags & B_DELWRI) == 0) stat_dir_entry++; bdirty(bp); return (1); } /* * If we are not waiting for a new directory block to be * claimed by its inode, then the pagedep will be freed. * Otherwise it will remain to track any new entries on * the page in case they are fsync'ed. */ free_pagedep(pagedep); return (0); } /* * Writing back in-core inode structures. * * The filesystem only accesses an inode's contents when it occupies an * "in-core" inode structure. These "in-core" structures are separate from * the page frames used to cache inode blocks. Only the latter are * transferred to/from the disk. So, when the updated contents of the * "in-core" inode structure are copied to the corresponding in-memory inode * block, the dependencies are also transferred. The following procedure is * called when copying a dirty "in-core" inode to a cached inode block. */ /* * Called when an inode is loaded from disk. If the effective link count * differed from the actual link count when it was last flushed, then we * need to ensure that the correct effective link count is put back. */ void softdep_load_inodeblock(ip) struct inode *ip; /* the "in_core" copy of the inode */ { struct inodedep *inodedep; KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0, ("softdep_load_inodeblock called on non-softdep filesystem")); /* * Check for alternate nlink count. */ ip->i_effnlink = ip->i_nlink; ACQUIRE_LOCK(ip->i_ump); if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0, &inodedep) == 0) { FREE_LOCK(ip->i_ump); return; } ip->i_effnlink -= inodedep->id_nlinkdelta; FREE_LOCK(ip->i_ump); } /* * This routine is called just before the "in-core" inode * information is to be copied to the in-memory inode block. * Recall that an inode block contains several inodes. If * the force flag is set, then the dependencies will be * cleared so that the update can always be made. Note that * the buffer is locked when this routine is called, so we * will never be in the middle of writing the inode block * to disk. */ void softdep_update_inodeblock(ip, bp, waitfor) struct inode *ip; /* the "in_core" copy of the inode */ struct buf *bp; /* the buffer containing the inode block */ int waitfor; /* nonzero => update must be allowed */ { struct inodedep *inodedep; struct inoref *inoref; struct ufsmount *ump; struct worklist *wk; struct mount *mp; struct buf *ibp; struct fs *fs; int error; ump = ip->i_ump; mp = UFSTOVFS(ump); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_update_inodeblock called on non-softdep filesystem")); fs = ip->i_fs; /* * Preserve the freelink that is on disk. clear_unlinked_inodedep() * does not have access to the in-core ip so must write directly into * the inode block buffer when setting freelink. */ if (fs->fs_magic == FS_UFS1_MAGIC) DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number))->di_freelink); else DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number))->di_freelink); /* * If the effective link count is not equal to the actual link * count, then we must track the difference in an inodedep while * the inode is (potentially) tossed out of the cache. Otherwise, * if there is no existing inodedep, then there are no dependencies * to track. */ ACQUIRE_LOCK(ump); again: if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { FREE_LOCK(ump); if (ip->i_effnlink != ip->i_nlink) panic("softdep_update_inodeblock: bad link count"); return; } if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) panic("softdep_update_inodeblock: bad delta"); /* * If we're flushing all dependencies we must also move any waiting * for journal writes onto the bufwait list prior to I/O. */ if (waitfor) { TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) == DEPCOMPLETE) { jwait(&inoref->if_list, MNT_WAIT); goto again; } } } /* * Changes have been initiated. Anything depending on these * changes cannot occur until this inode has been written. */ inodedep->id_state &= ~COMPLETE; if ((inodedep->id_state & ONWORKLIST) == 0) WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list); /* * Any new dependencies associated with the incore inode must * now be moved to the list associated with the buffer holding * the in-memory copy of the inode. Once merged process any * allocdirects that are completed by the merger. */ merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); if (!TAILQ_EMPTY(&inodedep->id_inoupdt)) handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt), NULL); merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); if (!TAILQ_EMPTY(&inodedep->id_extupdt)) handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt), NULL); /* * Now that the inode has been pushed into the buffer, the * operations dependent on the inode being written to disk * can be moved to the id_bufwait so that they will be * processed when the buffer I/O completes. */ while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) { WORKLIST_REMOVE(wk); WORKLIST_INSERT(&inodedep->id_bufwait, wk); } /* * Newly allocated inodes cannot be written until the bitmap * that allocates them have been written (indicated by * DEPCOMPLETE being set in id_state). If we are doing a * forced sync (e.g., an fsync on a file), we force the bitmap * to be written so that the update can be done. */ if (waitfor == 0) { FREE_LOCK(ump); return; } retry: if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) { FREE_LOCK(ump); return; } ibp = inodedep->id_bmsafemap->sm_buf; ibp = getdirtybuf(ibp, LOCK_PTR(ump), MNT_WAIT); if (ibp == NULL) { /* * If ibp came back as NULL, the dependency could have been * freed while we slept. Look it up again, and check to see * that it has completed. */ if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) goto retry; FREE_LOCK(ump); return; } FREE_LOCK(ump); if ((error = bwrite(ibp)) != 0) softdep_error("softdep_update_inodeblock: bwrite", error); } /* * Merge the a new inode dependency list (such as id_newinoupdt) into an * old inode dependency list (such as id_inoupdt). This routine must be * called with splbio interrupts blocked. */ static void merge_inode_lists(newlisthead, oldlisthead) struct allocdirectlst *newlisthead; struct allocdirectlst *oldlisthead; { struct allocdirect *listadp, *newadp; newadp = TAILQ_FIRST(newlisthead); for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) { if (listadp->ad_offset < newadp->ad_offset) { listadp = TAILQ_NEXT(listadp, ad_next); continue; } TAILQ_REMOVE(newlisthead, newadp, ad_next); TAILQ_INSERT_BEFORE(listadp, newadp, ad_next); if (listadp->ad_offset == newadp->ad_offset) { allocdirect_merge(oldlisthead, newadp, listadp); listadp = newadp; } newadp = TAILQ_FIRST(newlisthead); } while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) { TAILQ_REMOVE(newlisthead, newadp, ad_next); TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next); } } /* * If we are doing an fsync, then we must ensure that any directory * entries for the inode have been written after the inode gets to disk. */ int softdep_fsync(vp) struct vnode *vp; /* the "in_core" copy of the inode */ { struct inodedep *inodedep; struct pagedep *pagedep; struct inoref *inoref; struct ufsmount *ump; struct worklist *wk; struct diradd *dap; struct mount *mp; struct vnode *pvp; struct inode *ip; struct buf *bp; struct fs *fs; struct thread *td = curthread; int error, flushparent, pagedep_new_block; ino_t parentino; ufs_lbn_t lbn; ip = VTOI(vp); fs = ip->i_fs; ump = ip->i_ump; mp = vp->v_mount; if (MOUNTEDSOFTDEP(mp) == 0) return (0); ACQUIRE_LOCK(ump); restart: if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { FREE_LOCK(ump); return (0); } TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) == DEPCOMPLETE) { jwait(&inoref->if_list, MNT_WAIT); goto restart; } } if (!LIST_EMPTY(&inodedep->id_inowait) || !TAILQ_EMPTY(&inodedep->id_extupdt) || !TAILQ_EMPTY(&inodedep->id_newextupdt) || !TAILQ_EMPTY(&inodedep->id_inoupdt) || !TAILQ_EMPTY(&inodedep->id_newinoupdt)) panic("softdep_fsync: pending ops %p", inodedep); for (error = 0, flushparent = 0; ; ) { if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL) break; if (wk->wk_type != D_DIRADD) panic("softdep_fsync: Unexpected type %s", TYPENAME(wk->wk_type)); dap = WK_DIRADD(wk); /* * Flush our parent if this directory entry has a MKDIR_PARENT * dependency or is contained in a newly allocated block. */ if (dap->da_state & DIRCHG) pagedep = dap->da_previous->dm_pagedep; else pagedep = dap->da_pagedep; parentino = pagedep->pd_ino; lbn = pagedep->pd_lbn; if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) panic("softdep_fsync: dirty"); if ((dap->da_state & MKDIR_PARENT) || (pagedep->pd_state & NEWBLOCK)) flushparent = 1; else flushparent = 0; /* * If we are being fsync'ed as part of vgone'ing this vnode, * then we will not be able to release and recover the * vnode below, so we just have to give up on writing its * directory entry out. It will eventually be written, just * not now, but then the user was not asking to have it * written, so we are not breaking any promises. */ if (vp->v_iflag & VI_DOOMED) break; /* * We prevent deadlock by always fetching inodes from the * root, moving down the directory tree. Thus, when fetching * our parent directory, we first try to get the lock. If * that fails, we must unlock ourselves before requesting * the lock on our parent. See the comment in ufs_lookup * for details on possible races. */ FREE_LOCK(ump); if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp, FFSV_FORCEINSMQ)) { error = vfs_busy(mp, MBF_NOWAIT); if (error != 0) { vfs_ref(mp); VOP_UNLOCK(vp, 0); error = vfs_busy(mp, 0); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vfs_rel(mp); if (error != 0) return (ENOENT); if (vp->v_iflag & VI_DOOMED) { vfs_unbusy(mp); return (ENOENT); } } VOP_UNLOCK(vp, 0); error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE, &pvp, FFSV_FORCEINSMQ); vfs_unbusy(mp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_iflag & VI_DOOMED) { if (error == 0) vput(pvp); error = ENOENT; } if (error != 0) return (error); } /* * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps * that are contained in direct blocks will be resolved by * doing a ffs_update. Pagedeps contained in indirect blocks * may require a complete sync'ing of the directory. So, we * try the cheap and fast ffs_update first, and if that fails, * then we do the slower ffs_syncvnode of the directory. */ if (flushparent) { int locked; if ((error = ffs_update(pvp, 1)) != 0) { vput(pvp); return (error); } ACQUIRE_LOCK(ump); locked = 1; if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) { if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) { if (wk->wk_type != D_DIRADD) panic("softdep_fsync: Unexpected type %s", TYPENAME(wk->wk_type)); dap = WK_DIRADD(wk); if (dap->da_state & DIRCHG) pagedep = dap->da_previous->dm_pagedep; else pagedep = dap->da_pagedep; pagedep_new_block = pagedep->pd_state & NEWBLOCK; FREE_LOCK(ump); locked = 0; if (pagedep_new_block && (error = ffs_syncvnode(pvp, MNT_WAIT, 0))) { vput(pvp); return (error); } } } if (locked) FREE_LOCK(ump); } /* * Flush directory page containing the inode's name. */ error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred, &bp); if (error == 0) error = bwrite(bp); else brelse(bp); vput(pvp); if (error != 0) return (error); ACQUIRE_LOCK(ump); if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) break; } FREE_LOCK(ump); return (0); } /* * Flush all the dirty bitmaps associated with the block device * before flushing the rest of the dirty blocks so as to reduce * the number of dependencies that will have to be rolled back. * * XXX Unused? */ void softdep_fsync_mountdev(vp) struct vnode *vp; { struct buf *bp, *nbp; struct worklist *wk; struct bufobj *bo; if (!vn_isdisk(vp, NULL)) panic("softdep_fsync_mountdev: vnode not a disk"); bo = &vp->v_bufobj; restart: BO_LOCK(bo); TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { /* * If it is already scheduled, skip to the next buffer. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if ((bp->b_flags & B_DELWRI) == 0) panic("softdep_fsync_mountdev: not dirty"); /* * We are only interested in bitmaps with outstanding * dependencies. */ if ((wk = LIST_FIRST(&bp->b_dep)) == NULL || wk->wk_type != D_BMSAFEMAP || (bp->b_vflags & BV_BKGRDINPROG)) { BUF_UNLOCK(bp); continue; } BO_UNLOCK(bo); bremfree(bp); (void) bawrite(bp); goto restart; } drain_output(vp); BO_UNLOCK(bo); } /* * Sync all cylinder groups that were dirty at the time this function is * called. Newly dirtied cgs will be inserted before the sentinel. This * is used to flush freedep activity that may be holding up writes to a * indirect block. */ static int sync_cgs(mp, waitfor) struct mount *mp; int waitfor; { struct bmsafemap *bmsafemap; struct bmsafemap *sentinel; struct ufsmount *ump; struct buf *bp; int error; sentinel = malloc(sizeof(*sentinel), M_BMSAFEMAP, M_ZERO | M_WAITOK); sentinel->sm_cg = -1; ump = VFSTOUFS(mp); error = 0; ACQUIRE_LOCK(ump); LIST_INSERT_HEAD(&ump->softdep_dirtycg, sentinel, sm_next); for (bmsafemap = LIST_NEXT(sentinel, sm_next); bmsafemap != NULL; bmsafemap = LIST_NEXT(sentinel, sm_next)) { /* Skip sentinels and cgs with no work to release. */ if (bmsafemap->sm_cg == -1 || (LIST_EMPTY(&bmsafemap->sm_freehd) && LIST_EMPTY(&bmsafemap->sm_freewr))) { LIST_REMOVE(sentinel, sm_next); LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next); continue; } /* * If we don't get the lock and we're waiting try again, if * not move on to the next buf and try to sync it. */ bp = getdirtybuf(bmsafemap->sm_buf, LOCK_PTR(ump), waitfor); if (bp == NULL && waitfor == MNT_WAIT) continue; LIST_REMOVE(sentinel, sm_next); LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next); if (bp == NULL) continue; FREE_LOCK(ump); if (waitfor == MNT_NOWAIT) bawrite(bp); else error = bwrite(bp); ACQUIRE_LOCK(ump); if (error) break; } LIST_REMOVE(sentinel, sm_next); FREE_LOCK(ump); free(sentinel, M_BMSAFEMAP); return (error); } /* * This routine is called when we are trying to synchronously flush a * file. This routine must eliminate any filesystem metadata dependencies * so that the syncing routine can succeed. */ int softdep_sync_metadata(struct vnode *vp) { struct inode *ip; int error; ip = VTOI(vp); KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0, ("softdep_sync_metadata called on non-softdep filesystem")); /* * Ensure that any direct block dependencies have been cleared, * truncations are started, and inode references are journaled. */ ACQUIRE_LOCK(ip->i_ump); /* * Write all journal records to prevent rollbacks on devvp. */ if (vp->v_type == VCHR) softdep_flushjournal(vp->v_mount); error = flush_inodedep_deps(vp, vp->v_mount, ip->i_number); /* * Ensure that all truncates are written so we won't find deps on * indirect blocks. */ process_truncates(vp); FREE_LOCK(ip->i_ump); return (error); } /* * This routine is called when we are attempting to sync a buf with * dependencies. If waitfor is MNT_NOWAIT it attempts to schedule any * other IO it can but returns EBUSY if the buffer is not yet able to * be written. Dependencies which will not cause rollbacks will always * return 0. */ int softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor) { struct indirdep *indirdep; struct pagedep *pagedep; struct allocindir *aip; struct newblk *newblk; struct ufsmount *ump; struct buf *nbp; struct worklist *wk; int i, error; KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0, ("softdep_sync_buf called on non-softdep filesystem")); /* * For VCHR we just don't want to force flush any dependencies that * will cause rollbacks. */ if (vp->v_type == VCHR) { if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0)) return (EBUSY); return (0); } ump = VTOI(vp)->i_ump; ACQUIRE_LOCK(ump); /* * As we hold the buffer locked, none of its dependencies * will disappear. */ error = 0; top: LIST_FOREACH(wk, &bp->b_dep, wk_list) { switch (wk->wk_type) { case D_ALLOCDIRECT: case D_ALLOCINDIR: newblk = WK_NEWBLK(wk); if (newblk->nb_jnewblk != NULL) { if (waitfor == MNT_NOWAIT) { error = EBUSY; goto out_unlock; } jwait(&newblk->nb_jnewblk->jn_list, waitfor); goto top; } if (newblk->nb_state & DEPCOMPLETE || waitfor == MNT_NOWAIT) continue; nbp = newblk->nb_bmsafemap->sm_buf; nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor); if (nbp == NULL) goto top; FREE_LOCK(ump); if ((error = bwrite(nbp)) != 0) goto out; ACQUIRE_LOCK(ump); continue; case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); if (waitfor == MNT_NOWAIT) { if (!TAILQ_EMPTY(&indirdep->ir_trunc) || !LIST_EMPTY(&indirdep->ir_deplisthd)) { error = EBUSY; goto out_unlock; } } if (!TAILQ_EMPTY(&indirdep->ir_trunc)) panic("softdep_sync_buf: truncation pending."); restart: LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) { newblk = (struct newblk *)aip; if (newblk->nb_jnewblk != NULL) { jwait(&newblk->nb_jnewblk->jn_list, waitfor); goto restart; } if (newblk->nb_state & DEPCOMPLETE) continue; nbp = newblk->nb_bmsafemap->sm_buf; nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor); if (nbp == NULL) goto restart; FREE_LOCK(ump); if ((error = bwrite(nbp)) != 0) goto out; ACQUIRE_LOCK(ump); goto restart; } continue; case D_PAGEDEP: /* * Only flush directory entries in synchronous passes. */ if (waitfor != MNT_WAIT) { error = EBUSY; goto out_unlock; } /* * While syncing snapshots, we must allow recursive * lookups. */ BUF_AREC(bp); /* * We are trying to sync a directory that may * have dependencies on both its own metadata * and/or dependencies on the inodes of any * recently allocated files. We walk its diradd * lists pushing out the associated inode. */ pagedep = WK_PAGEDEP(wk); for (i = 0; i < DAHASHSZ; i++) { if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0) continue; if ((error = flush_pagedep_deps(vp, wk->wk_mp, &pagedep->pd_diraddhd[i]))) { BUF_NOREC(bp); goto out_unlock; } } BUF_NOREC(bp); continue; case D_FREEWORK: case D_FREEDEP: case D_JSEGDEP: case D_JNEWBLK: continue; default: panic("softdep_sync_buf: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } out_unlock: FREE_LOCK(ump); out: return (error); } /* * Flush the dependencies associated with an inodedep. * Called with splbio blocked. */ static int flush_inodedep_deps(vp, mp, ino) struct vnode *vp; struct mount *mp; ino_t ino; { struct inodedep *inodedep; struct inoref *inoref; struct ufsmount *ump; int error, waitfor; /* * This work is done in two passes. The first pass grabs most * of the buffers and begins asynchronously writing them. The * only way to wait for these asynchronous writes is to sleep * on the filesystem vnode which may stay busy for a long time * if the filesystem is active. So, instead, we make a second * pass over the dependencies blocking on each write. In the * usual case we will be blocking against a write that we * initiated, so when it is done the dependency will have been * resolved. Thus the second pass is expected to end quickly. * We give a brief window at the top of the loop to allow * any pending I/O to complete. */ ump = VFSTOUFS(mp); LOCK_OWNED(ump); for (error = 0, waitfor = MNT_NOWAIT; ; ) { if (error) return (error); FREE_LOCK(ump); ACQUIRE_LOCK(ump); restart: if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) return (0); TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) == DEPCOMPLETE) { jwait(&inoref->if_list, MNT_WAIT); goto restart; } } if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) || flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) || flush_deplist(&inodedep->id_extupdt, waitfor, &error) || flush_deplist(&inodedep->id_newextupdt, waitfor, &error)) continue; /* * If pass2, we are done, otherwise do pass 2. */ if (waitfor == MNT_WAIT) break; waitfor = MNT_WAIT; } /* * Try freeing inodedep in case all dependencies have been removed. */ if (inodedep_lookup(mp, ino, 0, &inodedep) != 0) (void) free_inodedep(inodedep); return (0); } /* * Flush an inode dependency list. * Called with splbio blocked. */ static int flush_deplist(listhead, waitfor, errorp) struct allocdirectlst *listhead; int waitfor; int *errorp; { struct allocdirect *adp; struct newblk *newblk; struct ufsmount *ump; struct buf *bp; if ((adp = TAILQ_FIRST(listhead)) == NULL) return (0); ump = VFSTOUFS(adp->ad_list.wk_mp); LOCK_OWNED(ump); TAILQ_FOREACH(adp, listhead, ad_next) { newblk = (struct newblk *)adp; if (newblk->nb_jnewblk != NULL) { jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); return (1); } if (newblk->nb_state & DEPCOMPLETE) continue; bp = newblk->nb_bmsafemap->sm_buf; bp = getdirtybuf(bp, LOCK_PTR(ump), waitfor); if (bp == NULL) { if (waitfor == MNT_NOWAIT) continue; return (1); } FREE_LOCK(ump); if (waitfor == MNT_NOWAIT) bawrite(bp); else *errorp = bwrite(bp); ACQUIRE_LOCK(ump); return (1); } return (0); } /* * Flush dependencies associated with an allocdirect block. */ static int flush_newblk_dep(vp, mp, lbn) struct vnode *vp; struct mount *mp; ufs_lbn_t lbn; { struct newblk *newblk; struct ufsmount *ump; struct bufobj *bo; struct inode *ip; struct buf *bp; ufs2_daddr_t blkno; int error; error = 0; bo = &vp->v_bufobj; ip = VTOI(vp); blkno = DIP(ip, i_db[lbn]); if (blkno == 0) panic("flush_newblk_dep: Missing block"); ump = VFSTOUFS(mp); ACQUIRE_LOCK(ump); /* * Loop until all dependencies related to this block are satisfied. * We must be careful to restart after each sleep in case a write * completes some part of this process for us. */ for (;;) { if (newblk_lookup(mp, blkno, 0, &newblk) == 0) { FREE_LOCK(ump); break; } if (newblk->nb_list.wk_type != D_ALLOCDIRECT) panic("flush_newblk_deps: Bad newblk %p", newblk); /* * Flush the journal. */ if (newblk->nb_jnewblk != NULL) { jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); continue; } /* * Write the bitmap dependency. */ if ((newblk->nb_state & DEPCOMPLETE) == 0) { bp = newblk->nb_bmsafemap->sm_buf; bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT); if (bp == NULL) continue; FREE_LOCK(ump); error = bwrite(bp); if (error) break; ACQUIRE_LOCK(ump); continue; } /* * Write the buffer. */ FREE_LOCK(ump); BO_LOCK(bo); bp = gbincore(bo, lbn); if (bp != NULL) { error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)); if (error == ENOLCK) { ACQUIRE_LOCK(ump); continue; /* Slept, retry */ } if (error != 0) break; /* Failed */ if (bp->b_flags & B_DELWRI) { bremfree(bp); error = bwrite(bp); if (error) break; } else BUF_UNLOCK(bp); } else BO_UNLOCK(bo); /* * We have to wait for the direct pointers to * point at the newdirblk before the dependency * will go away. */ error = ffs_update(vp, 1); if (error) break; ACQUIRE_LOCK(ump); } return (error); } /* * Eliminate a pagedep dependency by flushing out all its diradd dependencies. * Called with splbio blocked. */ static int flush_pagedep_deps(pvp, mp, diraddhdp) struct vnode *pvp; struct mount *mp; struct diraddhd *diraddhdp; { struct inodedep *inodedep; struct inoref *inoref; struct ufsmount *ump; struct diradd *dap; struct vnode *vp; int error = 0; struct buf *bp; ino_t inum; struct diraddhd unfinished; LIST_INIT(&unfinished); ump = VFSTOUFS(mp); LOCK_OWNED(ump); restart: while ((dap = LIST_FIRST(diraddhdp)) != NULL) { /* * Flush ourselves if this directory entry * has a MKDIR_PARENT dependency. */ if (dap->da_state & MKDIR_PARENT) { FREE_LOCK(ump); if ((error = ffs_update(pvp, 1)) != 0) break; ACQUIRE_LOCK(ump); /* * If that cleared dependencies, go on to next. */ if (dap != LIST_FIRST(diraddhdp)) continue; /* * All MKDIR_PARENT dependencies and all the * NEWBLOCK pagedeps that are contained in direct * blocks were resolved by doing above ffs_update. * Pagedeps contained in indirect blocks may * require a complete sync'ing of the directory. * We are in the midst of doing a complete sync, * so if they are not resolved in this pass we * defer them for now as they will be sync'ed by * our caller shortly. */ LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&unfinished, dap, da_pdlist); continue; } /* * A newly allocated directory must have its "." and * ".." entries written out before its name can be * committed in its parent. */ inum = dap->da_newinum; if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) panic("flush_pagedep_deps: lost inode1"); /* * Wait for any pending journal adds to complete so we don't * cause rollbacks while syncing. */ TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) == DEPCOMPLETE) { jwait(&inoref->if_list, MNT_WAIT); goto restart; } } if (dap->da_state & MKDIR_BODY) { FREE_LOCK(ump); if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ))) break; error = flush_newblk_dep(vp, mp, 0); /* * If we still have the dependency we might need to * update the vnode to sync the new link count to * disk. */ if (error == 0 && dap == LIST_FIRST(diraddhdp)) error = ffs_update(vp, 1); vput(vp); if (error != 0) break; ACQUIRE_LOCK(ump); /* * If that cleared dependencies, go on to next. */ if (dap != LIST_FIRST(diraddhdp)) continue; if (dap->da_state & MKDIR_BODY) { inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep); panic("flush_pagedep_deps: MKDIR_BODY " "inodedep %p dap %p vp %p", inodedep, dap, vp); } } /* * Flush the inode on which the directory entry depends. * Having accounted for MKDIR_PARENT and MKDIR_BODY above, * the only remaining dependency is that the updated inode * count must get pushed to disk. The inode has already * been pushed into its inode buffer (via VOP_UPDATE) at * the time of the reference count change. So we need only * locate that buffer, ensure that there will be no rollback * caused by a bitmap dependency, then write the inode buffer. */ retry: if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) panic("flush_pagedep_deps: lost inode"); /* * If the inode still has bitmap dependencies, * push them to disk. */ if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) { bp = inodedep->id_bmsafemap->sm_buf; bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT); if (bp == NULL) goto retry; FREE_LOCK(ump); if ((error = bwrite(bp)) != 0) break; ACQUIRE_LOCK(ump); if (dap != LIST_FIRST(diraddhdp)) continue; } /* * If the inode is still sitting in a buffer waiting * to be written or waiting for the link count to be * adjusted update it here to flush it to disk. */ if (dap == LIST_FIRST(diraddhdp)) { FREE_LOCK(ump); if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ))) break; error = ffs_update(vp, 1); vput(vp); if (error) break; ACQUIRE_LOCK(ump); } /* * If we have failed to get rid of all the dependencies * then something is seriously wrong. */ if (dap == LIST_FIRST(diraddhdp)) { inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep); panic("flush_pagedep_deps: failed to flush " "inodedep %p ino %ju dap %p", inodedep, (uintmax_t)inum, dap); } } if (error) ACQUIRE_LOCK(ump); while ((dap = LIST_FIRST(&unfinished)) != NULL) { LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist); } return (error); } /* * A large burst of file addition or deletion activity can drive the * memory load excessively high. First attempt to slow things down * using the techniques below. If that fails, this routine requests * the offending operations to fall back to running synchronously * until the memory load returns to a reasonable level. */ int softdep_slowdown(vp) struct vnode *vp; { struct ufsmount *ump; int jlow; int max_softdeps_hard; KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0, ("softdep_slowdown called on non-softdep filesystem")); ump = VFSTOUFS(vp->v_mount); ACQUIRE_LOCK(ump); jlow = 0; /* * Check for journal space if needed. */ if (DOINGSUJ(vp)) { if (journal_space(ump, 0) == 0) jlow = 1; } /* * If the system is under its limits and our filesystem is * not responsible for more than our share of the usage and * we are not low on journal space, then no need to slow down. */ max_softdeps_hard = max_softdeps * 11 / 10; if (dep_current[D_DIRREM] < max_softdeps_hard / 2 && dep_current[D_INODEDEP] < max_softdeps_hard && dep_current[D_INDIRDEP] < max_softdeps_hard / 1000 && dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0 && ump->softdep_curdeps[D_DIRREM] < (max_softdeps_hard / 2) / stat_flush_threads && ump->softdep_curdeps[D_INODEDEP] < max_softdeps_hard / stat_flush_threads && ump->softdep_curdeps[D_INDIRDEP] < (max_softdeps_hard / 1000) / stat_flush_threads && ump->softdep_curdeps[D_FREEBLKS] < max_softdeps_hard / stat_flush_threads) { FREE_LOCK(ump); return (0); } /* * If the journal is low or our filesystem is over its limit * then speedup the cleanup. */ if (ump->softdep_curdeps[D_INDIRDEP] < (max_softdeps_hard / 1000) / stat_flush_threads || jlow) softdep_speedup(ump); stat_sync_limit_hit += 1; FREE_LOCK(ump); /* * We only slow down the rate at which new dependencies are * generated if we are not using journaling. With journaling, * the cleanup should always be sufficient to keep things * under control. */ if (DOINGSUJ(vp)) return (0); return (1); } /* * Called by the allocation routines when they are about to fail * in the hope that we can free up the requested resource (inodes * or disk space). * * First check to see if the work list has anything on it. If it has, * clean up entries until we successfully free the requested resource. * Because this process holds inodes locked, we cannot handle any remove * requests that might block on a locked inode as that could lead to * deadlock. If the worklist yields none of the requested resource, * start syncing out vnodes to free up the needed space. */ int softdep_request_cleanup(fs, vp, cred, resource) struct fs *fs; struct vnode *vp; struct ucred *cred; int resource; { struct ufsmount *ump; struct mount *mp; struct vnode *lvp, *mvp; long starttime; ufs2_daddr_t needed; int error; /* * If we are being called because of a process doing a * copy-on-write, then it is not safe to process any * worklist items as we will recurse into the copyonwrite * routine. This will result in an incoherent snapshot. * If the vnode that we hold is a snapshot, we must avoid * handling other resources that could cause deadlock. */ if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp))) return (0); if (resource == FLUSH_BLOCKS_WAIT) stat_cleanup_blkrequests += 1; else stat_cleanup_inorequests += 1; mp = vp->v_mount; ump = VFSTOUFS(mp); mtx_assert(UFS_MTX(ump), MA_OWNED); UFS_UNLOCK(ump); error = ffs_update(vp, 1); if (error != 0 || MOUNTEDSOFTDEP(mp) == 0) { UFS_LOCK(ump); return (0); } /* * If we are in need of resources, start by cleaning up * any block removals associated with our inode. */ ACQUIRE_LOCK(ump); process_removes(vp); process_truncates(vp); FREE_LOCK(ump); /* * Now clean up at least as many resources as we will need. * * When requested to clean up inodes, the number that are needed * is set by the number of simultaneous writers (mnt_writeopcount) * plus a bit of slop (2) in case some more writers show up while * we are cleaning. * * When requested to free up space, the amount of space that * we need is enough blocks to allocate a full-sized segment * (fs_contigsumsize). The number of such segments that will * be needed is set by the number of simultaneous writers * (mnt_writeopcount) plus a bit of slop (2) in case some more * writers show up while we are cleaning. * * Additionally, if we are unpriviledged and allocating space, * we need to ensure that we clean up enough blocks to get the * needed number of blocks over the threshhold of the minimum * number of blocks required to be kept free by the filesystem * (fs_minfree). */ if (resource == FLUSH_INODES_WAIT) { needed = vp->v_mount->mnt_writeopcount + 2; } else if (resource == FLUSH_BLOCKS_WAIT) { needed = (vp->v_mount->mnt_writeopcount + 2) * fs->fs_contigsumsize; if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0)) needed += fragstoblks(fs, roundup((fs->fs_dsize * fs->fs_minfree / 100) - fs->fs_cstotal.cs_nffree, fs->fs_frag)); } else { UFS_LOCK(ump); printf("softdep_request_cleanup: Unknown resource type %d\n", resource); return (0); } starttime = time_second; retry: if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 && fs->fs_cstotal.cs_nbfree <= needed) || (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 && fs->fs_cstotal.cs_nifree <= needed)) { ACQUIRE_LOCK(ump); if (ump->softdep_on_worklist > 0 && process_worklist_item(UFSTOVFS(ump), ump->softdep_on_worklist, LK_NOWAIT) != 0) stat_worklist_push += 1; FREE_LOCK(ump); } /* * If we still need resources and there are no more worklist * entries to process to obtain them, we have to start flushing * the dirty vnodes to force the release of additional requests * to the worklist that we can then process to reap addition * resources. We walk the vnodes associated with the mount point * until we get the needed worklist requests that we can reap. */ if ((resource == FLUSH_BLOCKS_WAIT && fs->fs_cstotal.cs_nbfree <= needed) || (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 && fs->fs_cstotal.cs_nifree <= needed)) { MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) { if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) { VI_UNLOCK(lvp); continue; } if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT, curthread)) continue; if (lvp->v_vflag & VV_NOSYNC) { /* unlinked */ vput(lvp); continue; } (void) ffs_syncvnode(lvp, MNT_NOWAIT, 0); vput(lvp); } lvp = ump->um_devvp; if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) { VOP_FSYNC(lvp, MNT_NOWAIT, curthread); VOP_UNLOCK(lvp, 0); } if (ump->softdep_on_worklist > 0) { stat_cleanup_retries += 1; goto retry; } stat_cleanup_failures += 1; } if (time_second - starttime > stat_cleanup_high_delay) stat_cleanup_high_delay = time_second - starttime; UFS_LOCK(ump); return (1); } /* * If memory utilization has gotten too high, deliberately slow things * down and speed up the I/O processing. */ static int request_cleanup(mp, resource) struct mount *mp; int resource; { struct thread *td = curthread; struct ufsmount *ump; ump = VFSTOUFS(mp); LOCK_OWNED(ump); /* * We never hold up the filesystem syncer or buf daemon. */ if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF)) return (0); /* * First check to see if the work list has gotten backlogged. * If it has, co-opt this process to help clean up two entries. * Because this process may hold inodes locked, we cannot * handle any remove requests that might block on a locked * inode as that could lead to deadlock. We set TDP_SOFTDEP * to avoid recursively processing the worklist. */ if (ump->softdep_on_worklist > max_softdeps / 10) { td->td_pflags |= TDP_SOFTDEP; process_worklist_item(mp, 2, LK_NOWAIT); td->td_pflags &= ~TDP_SOFTDEP; stat_worklist_push += 2; return(1); } /* * Next, we attempt to speed up the syncer process. If that * is successful, then we allow the process to continue. */ if (softdep_speedup(ump) && resource != FLUSH_BLOCKS_WAIT && resource != FLUSH_INODES_WAIT) return(0); /* * If we are resource constrained on inode dependencies, try * flushing some dirty inodes. Otherwise, we are constrained * by file deletions, so try accelerating flushes of directories * with removal dependencies. We would like to do the cleanup * here, but we probably hold an inode locked at this point and * that might deadlock against one that we try to clean. So, * the best that we can do is request the syncer daemon to do * the cleanup for us. */ switch (resource) { case FLUSH_INODES: case FLUSH_INODES_WAIT: ACQUIRE_GBLLOCK(&lk); stat_ino_limit_push += 1; req_clear_inodedeps += 1; FREE_GBLLOCK(&lk); stat_countp = &stat_ino_limit_hit; break; case FLUSH_BLOCKS: case FLUSH_BLOCKS_WAIT: ACQUIRE_GBLLOCK(&lk); stat_blk_limit_push += 1; req_clear_remove += 1; FREE_GBLLOCK(&lk); stat_countp = &stat_blk_limit_hit; break; default: panic("request_cleanup: unknown type"); } /* * Hopefully the syncer daemon will catch up and awaken us. * We wait at most tickdelay before proceeding in any case. */ ACQUIRE_GBLLOCK(&lk); FREE_LOCK(ump); proc_waiting += 1; if (callout_pending(&softdep_callout) == FALSE) callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2, pause_timer, 0); msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0); proc_waiting -= 1; FREE_GBLLOCK(&lk); ACQUIRE_LOCK(ump); return (1); } /* * Awaken processes pausing in request_cleanup and clear proc_waiting * to indicate that there is no longer a timer running. Pause_timer * will be called with the global softdep mutex (&lk) locked. */ static void pause_timer(arg) void *arg; { GBLLOCK_OWNED(&lk); /* * The callout_ API has acquired mtx and will hold it around this * function call. */ *stat_countp += proc_waiting; wakeup(&proc_waiting); } /* * If requested, try removing inode or removal dependencies. */ static void check_clear_deps(mp) struct mount *mp; { /* * If we are suspended, it may be because of our using * too many inodedeps, so help clear them out. */ if (MOUNTEDSUJ(mp) && VFSTOUFS(mp)->softdep_jblocks->jb_suspended) clear_inodedeps(mp); /* * General requests for cleanup of backed up dependencies */ ACQUIRE_GBLLOCK(&lk); if (req_clear_inodedeps) { req_clear_inodedeps -= 1; FREE_GBLLOCK(&lk); clear_inodedeps(mp); ACQUIRE_GBLLOCK(&lk); wakeup(&proc_waiting); } if (req_clear_remove) { req_clear_remove -= 1; FREE_GBLLOCK(&lk); clear_remove(mp); ACQUIRE_GBLLOCK(&lk); wakeup(&proc_waiting); } FREE_GBLLOCK(&lk); } /* * Flush out a directory with at least one removal dependency in an effort to * reduce the number of dirrem, freefile, and freeblks dependency structures. */ static void clear_remove(mp) struct mount *mp; { struct pagedep_hashhead *pagedephd; struct pagedep *pagedep; struct ufsmount *ump; struct vnode *vp; struct bufobj *bo; int error, cnt; ino_t ino; ump = VFSTOUFS(mp); LOCK_OWNED(ump); for (cnt = 0; cnt <= ump->pagedep_hash_size; cnt++) { pagedephd = &ump->pagedep_hashtbl[ump->pagedep_nextclean++]; if (ump->pagedep_nextclean > ump->pagedep_hash_size) ump->pagedep_nextclean = 0; LIST_FOREACH(pagedep, pagedephd, pd_hash) { if (LIST_EMPTY(&pagedep->pd_dirremhd)) continue; ino = pagedep->pd_ino; if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) continue; FREE_LOCK(ump); /* * Let unmount clear deps */ error = vfs_busy(mp, MBF_NOWAIT); if (error != 0) goto finish_write; error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ); vfs_unbusy(mp); if (error != 0) { softdep_error("clear_remove: vget", error); goto finish_write; } if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0))) softdep_error("clear_remove: fsync", error); bo = &vp->v_bufobj; BO_LOCK(bo); drain_output(vp); BO_UNLOCK(bo); vput(vp); finish_write: vn_finished_write(mp); ACQUIRE_LOCK(ump); return; } } } /* * Clear out a block of dirty inodes in an effort to reduce * the number of inodedep dependency structures. */ static void clear_inodedeps(mp) struct mount *mp; { struct inodedep_hashhead *inodedephd; struct inodedep *inodedep; struct ufsmount *ump; struct vnode *vp; struct fs *fs; int error, cnt; ino_t firstino, lastino, ino; ump = VFSTOUFS(mp); fs = ump->um_fs; LOCK_OWNED(ump); /* * Pick a random inode dependency to be cleared. * We will then gather up all the inodes in its block * that have dependencies and flush them out. */ for (cnt = 0; cnt <= ump->inodedep_hash_size; cnt++) { inodedephd = &ump->inodedep_hashtbl[ump->inodedep_nextclean++]; if (ump->inodedep_nextclean > ump->inodedep_hash_size) ump->inodedep_nextclean = 0; if ((inodedep = LIST_FIRST(inodedephd)) != NULL) break; } if (inodedep == NULL) return; /* * Find the last inode in the block with dependencies. */ firstino = inodedep->id_ino & ~(INOPB(fs) - 1); for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--) if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0) break; /* * Asynchronously push all but the last inode with dependencies. * Synchronously push the last inode with dependencies to ensure * that the inode block gets written to free up the inodedeps. */ for (ino = firstino; ino <= lastino; ino++) { if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) continue; if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) continue; FREE_LOCK(ump); error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */ if (error != 0) { vn_finished_write(mp); ACQUIRE_LOCK(ump); return; } if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ)) != 0) { softdep_error("clear_inodedeps: vget", error); vfs_unbusy(mp); vn_finished_write(mp); ACQUIRE_LOCK(ump); return; } vfs_unbusy(mp); if (ino == lastino) { if ((error = ffs_syncvnode(vp, MNT_WAIT, 0))) softdep_error("clear_inodedeps: fsync1", error); } else { if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0))) softdep_error("clear_inodedeps: fsync2", error); BO_LOCK(&vp->v_bufobj); drain_output(vp); BO_UNLOCK(&vp->v_bufobj); } vput(vp); vn_finished_write(mp); ACQUIRE_LOCK(ump); } } void softdep_buf_append(bp, wkhd) struct buf *bp; struct workhead *wkhd; { struct worklist *wk; struct ufsmount *ump; if ((wk = LIST_FIRST(wkhd)) == NULL) return; KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0, ("softdep_buf_append called on non-softdep filesystem")); ump = VFSTOUFS(wk->wk_mp); ACQUIRE_LOCK(ump); while ((wk = LIST_FIRST(wkhd)) != NULL) { WORKLIST_REMOVE(wk); WORKLIST_INSERT(&bp->b_dep, wk); } FREE_LOCK(ump); } void softdep_inode_append(ip, cred, wkhd) struct inode *ip; struct ucred *cred; struct workhead *wkhd; { struct buf *bp; struct fs *fs; int error; KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0, ("softdep_inode_append called on non-softdep filesystem")); fs = ip->i_fs; error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, cred, &bp); if (error) { bqrelse(bp); softdep_freework(wkhd); return; } softdep_buf_append(bp, wkhd); bqrelse(bp); } void softdep_freework(wkhd) struct workhead *wkhd; { struct worklist *wk; struct ufsmount *ump; if ((wk = LIST_FIRST(wkhd)) == NULL) return; KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0, ("softdep_freework called on non-softdep filesystem")); ump = VFSTOUFS(wk->wk_mp); ACQUIRE_LOCK(ump); handle_jwork(wkhd); FREE_LOCK(ump); } /* * Function to determine if the buffer has outstanding dependencies * that will cause a roll-back if the buffer is written. If wantcount * is set, return number of dependencies, otherwise just yes or no. */ static int softdep_count_dependencies(bp, wantcount) struct buf *bp; int wantcount; { struct worklist *wk; struct ufsmount *ump; struct bmsafemap *bmsafemap; struct freework *freework; struct inodedep *inodedep; struct indirdep *indirdep; struct freeblks *freeblks; struct allocindir *aip; struct pagedep *pagedep; struct dirrem *dirrem; struct newblk *newblk; struct mkdir *mkdir; struct diradd *dap; int i, retval; retval = 0; if ((wk = LIST_FIRST(&bp->b_dep)) == NULL) return (0); ump = VFSTOUFS(wk->wk_mp); ACQUIRE_LOCK(ump); LIST_FOREACH(wk, &bp->b_dep, wk_list) { switch (wk->wk_type) { case D_INODEDEP: inodedep = WK_INODEDEP(wk); if ((inodedep->id_state & DEPCOMPLETE) == 0) { /* bitmap allocation dependency */ retval += 1; if (!wantcount) goto out; } if (TAILQ_FIRST(&inodedep->id_inoupdt)) { /* direct block pointer dependency */ retval += 1; if (!wantcount) goto out; } if (TAILQ_FIRST(&inodedep->id_extupdt)) { /* direct block pointer dependency */ retval += 1; if (!wantcount) goto out; } if (TAILQ_FIRST(&inodedep->id_inoreflst)) { /* Add reference dependency. */ retval += 1; if (!wantcount) goto out; } continue; case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) { /* indirect truncation dependency */ retval += 1; if (!wantcount) goto out; } LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) { /* indirect block pointer dependency */ retval += 1; if (!wantcount) goto out; } continue; case D_PAGEDEP: pagedep = WK_PAGEDEP(wk); LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) { if (LIST_FIRST(&dirrem->dm_jremrefhd)) { /* Journal remove ref dependency. */ retval += 1; if (!wantcount) goto out; } } for (i = 0; i < DAHASHSZ; i++) { LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { /* directory entry dependency */ retval += 1; if (!wantcount) goto out; } } continue; case D_BMSAFEMAP: bmsafemap = WK_BMSAFEMAP(wk); if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) { /* Add reference dependency. */ retval += 1; if (!wantcount) goto out; } if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) { /* Allocate block dependency. */ retval += 1; if (!wantcount) goto out; } continue; case D_FREEBLKS: freeblks = WK_FREEBLKS(wk); if (LIST_FIRST(&freeblks->fb_jblkdephd)) { /* Freeblk journal dependency. */ retval += 1; if (!wantcount) goto out; } continue; case D_ALLOCDIRECT: case D_ALLOCINDIR: newblk = WK_NEWBLK(wk); if (newblk->nb_jnewblk) { /* Journal allocate dependency. */ retval += 1; if (!wantcount) goto out; } continue; case D_MKDIR: mkdir = WK_MKDIR(wk); if (mkdir->md_jaddref) { /* Journal reference dependency. */ retval += 1; if (!wantcount) goto out; } continue; case D_FREEWORK: case D_FREEDEP: case D_JSEGDEP: case D_JSEG: case D_SBDEP: /* never a dependency on these blocks */ continue; default: panic("softdep_count_dependencies: Unexpected type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } out: FREE_LOCK(ump); return retval; } /* * Acquire exclusive access to a buffer. * Must be called with a locked mtx parameter. * Return acquired buffer or NULL on failure. */ static struct buf * getdirtybuf(bp, lock, waitfor) struct buf *bp; struct rwlock *lock; int waitfor; { int error; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) { if (waitfor != MNT_WAIT) return (NULL); error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, lock); /* * Even if we sucessfully acquire bp here, we have dropped * lock, which may violates our guarantee. */ if (error == 0) BUF_UNLOCK(bp); else if (error != ENOLCK) panic("getdirtybuf: inconsistent lock: %d", error); rw_wlock(lock); return (NULL); } if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { if (lock != BO_LOCKPTR(bp->b_bufobj) && waitfor == MNT_WAIT) { rw_wunlock(lock); BO_LOCK(bp->b_bufobj); BUF_UNLOCK(bp); if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { bp->b_vflags |= BV_BKGRDWAIT; msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj), PRIBIO | PDROP, "getbuf", 0); } else BO_UNLOCK(bp->b_bufobj); rw_wlock(lock); return (NULL); } BUF_UNLOCK(bp); if (waitfor != MNT_WAIT) return (NULL); /* * The lock argument must be bp->b_vp's mutex in * this case. */ #ifdef DEBUG_VFS_LOCKS if (bp->b_vp->v_type != VCHR) ASSERT_BO_WLOCKED(bp->b_bufobj); #endif bp->b_vflags |= BV_BKGRDWAIT; rw_sleep(&bp->b_xflags, lock, PRIBIO, "getbuf", 0); return (NULL); } if ((bp->b_flags & B_DELWRI) == 0) { BUF_UNLOCK(bp); return (NULL); } bremfree(bp); return (bp); } /* * Check if it is safe to suspend the file system now. On entry, * the vnode interlock for devvp should be held. Return 0 with * the mount interlock held if the file system can be suspended now, * otherwise return EAGAIN with the mount interlock held. */ int softdep_check_suspend(struct mount *mp, struct vnode *devvp, int softdep_depcnt, int softdep_accdepcnt, int secondary_writes, int secondary_accwrites) { struct bufobj *bo; struct ufsmount *ump; int error; bo = &devvp->v_bufobj; ASSERT_BO_WLOCKED(bo); /* * If we are not running with soft updates, then we need only * deal with secondary writes as we try to suspend. */ if (MOUNTEDSOFTDEP(mp) == 0) { MNT_ILOCK(mp); while (mp->mnt_secondary_writes != 0) { BO_UNLOCK(bo); msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), (PUSER - 1) | PDROP, "secwr", 0); BO_LOCK(bo); MNT_ILOCK(mp); } /* * Reasons for needing more work before suspend: * - Dirty buffers on devvp. * - Secondary writes occurred after start of vnode sync loop */ error = 0; if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0 || secondary_writes != 0 || mp->mnt_secondary_writes != 0 || secondary_accwrites != mp->mnt_secondary_accwrites) error = EAGAIN; BO_UNLOCK(bo); return (error); } /* * If we are running with soft updates, then we need to coordinate * with them as we try to suspend. */ ump = VFSTOUFS(mp); for (;;) { if (!TRY_ACQUIRE_LOCK(ump)) { BO_UNLOCK(bo); ACQUIRE_LOCK(ump); FREE_LOCK(ump); BO_LOCK(bo); continue; } MNT_ILOCK(mp); if (mp->mnt_secondary_writes != 0) { FREE_LOCK(ump); BO_UNLOCK(bo); msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), (PUSER - 1) | PDROP, "secwr", 0); BO_LOCK(bo); continue; } break; } /* * Reasons for needing more work before suspend: * - Dirty buffers on devvp. * - Softdep activity occurred after start of vnode sync loop * - Secondary writes occurred after start of vnode sync loop */ error = 0; if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0 || softdep_depcnt != 0 || ump->softdep_deps != 0 || softdep_accdepcnt != ump->softdep_accdeps || secondary_writes != 0 || mp->mnt_secondary_writes != 0 || secondary_accwrites != mp->mnt_secondary_accwrites) error = EAGAIN; FREE_LOCK(ump); BO_UNLOCK(bo); return (error); } /* * Get the number of dependency structures for the file system, both * the current number and the total number allocated. These will * later be used to detect that softdep processing has occurred. */ void softdep_get_depcounts(struct mount *mp, int *softdep_depsp, int *softdep_accdepsp) { struct ufsmount *ump; if (MOUNTEDSOFTDEP(mp) == 0) { *softdep_depsp = 0; *softdep_accdepsp = 0; return; } ump = VFSTOUFS(mp); ACQUIRE_LOCK(ump); *softdep_depsp = ump->softdep_deps; *softdep_accdepsp = ump->softdep_accdeps; FREE_LOCK(ump); } /* * Wait for pending output on a vnode to complete. * Must be called with vnode lock and interlock locked. * * XXX: Should just be a call to bufobj_wwait(). */ static void drain_output(vp) struct vnode *vp; { struct bufobj *bo; bo = &vp->v_bufobj; ASSERT_VOP_LOCKED(vp, "drain_output"); ASSERT_BO_WLOCKED(bo); while (bo->bo_numoutput) { bo->bo_flag |= BO_WWAIT; msleep((caddr_t)&bo->bo_numoutput, BO_LOCKPTR(bo), PRIBIO + 1, "drainvp", 0); } } /* * Called whenever a buffer that is being invalidated or reallocated * contains dependencies. This should only happen if an I/O error has * occurred. The routine is called with the buffer locked. */ static void softdep_deallocate_dependencies(bp) struct buf *bp; { if ((bp->b_ioflags & BIO_ERROR) == 0) panic("softdep_deallocate_dependencies: dangling deps"); if (bp->b_vp != NULL && bp->b_vp->v_mount != NULL) softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error); else printf("softdep_deallocate_dependencies: " "got error %d while accessing filesystem\n", bp->b_error); if (bp->b_error != ENXIO) panic("softdep_deallocate_dependencies: unrecovered I/O error"); } /* * Function to handle asynchronous write errors in the filesystem. */ static void softdep_error(func, error) char *func; int error; { /* XXX should do something better! */ printf("%s: got error %d while accessing filesystem\n", func, error); } #ifdef DDB static void inodedep_print(struct inodedep *inodedep, int verbose) { db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d" " saveino %p\n", inodedep, inodedep->id_fs, inodedep->id_state, (intmax_t)inodedep->id_ino, (intmax_t)fsbtodb(inodedep->id_fs, ino_to_fsba(inodedep->id_fs, inodedep->id_ino)), inodedep->id_nlinkdelta, inodedep->id_savednlink, inodedep->id_savedino1); if (verbose == 0) return; db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, " "mkdiradd %p\n", LIST_FIRST(&inodedep->id_pendinghd), LIST_FIRST(&inodedep->id_bufwait), LIST_FIRST(&inodedep->id_inowait), TAILQ_FIRST(&inodedep->id_inoreflst), inodedep->id_mkdiradd); db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n", TAILQ_FIRST(&inodedep->id_inoupdt), TAILQ_FIRST(&inodedep->id_newinoupdt), TAILQ_FIRST(&inodedep->id_extupdt), TAILQ_FIRST(&inodedep->id_newextupdt)); } DB_SHOW_COMMAND(inodedep, db_show_inodedep) { if (have_addr == 0) { db_printf("Address required\n"); return; } inodedep_print((struct inodedep*)addr, 1); } DB_SHOW_COMMAND(inodedeps, db_show_inodedeps) { struct inodedep_hashhead *inodedephd; struct inodedep *inodedep; struct ufsmount *ump; int cnt; if (have_addr == 0) { db_printf("Address required\n"); return; } ump = (struct ufsmount *)addr; for (cnt = 0; cnt < ump->inodedep_hash_size; cnt++) { inodedephd = &ump->inodedep_hashtbl[cnt]; LIST_FOREACH(inodedep, inodedephd, id_hash) { inodedep_print(inodedep, 0); } } } DB_SHOW_COMMAND(worklist, db_show_worklist) { struct worklist *wk; if (have_addr == 0) { db_printf("Address required\n"); return; } wk = (struct worklist *)addr; printf("worklist: %p type %s state 0x%X\n", wk, TYPENAME(wk->wk_type), wk->wk_state); } DB_SHOW_COMMAND(workhead, db_show_workhead) { struct workhead *wkhd; struct worklist *wk; int i; if (have_addr == 0) { db_printf("Address required\n"); return; } wkhd = (struct workhead *)addr; wk = LIST_FIRST(wkhd); for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list)) db_printf("worklist: %p type %s state 0x%X", wk, TYPENAME(wk->wk_type), wk->wk_state); if (i == 100) db_printf("workhead overflow"); printf("\n"); } DB_SHOW_COMMAND(mkdirs, db_show_mkdirs) { struct mkdirlist *mkdirlisthd; struct jaddref *jaddref; struct diradd *diradd; struct mkdir *mkdir; if (have_addr == 0) { db_printf("Address required\n"); return; } mkdirlisthd = (struct mkdirlist *)addr; LIST_FOREACH(mkdir, mkdirlisthd, md_mkdirs) { diradd = mkdir->md_diradd; db_printf("mkdir: %p state 0x%X dap %p state 0x%X", mkdir, mkdir->md_state, diradd, diradd->da_state); if ((jaddref = mkdir->md_jaddref) != NULL) db_printf(" jaddref %p jaddref state 0x%X", jaddref, jaddref->ja_state); db_printf("\n"); } } /* exported to ffs_vfsops.c */ extern void db_print_ffs(struct ufsmount *ump); void db_print_ffs(struct ufsmount *ump) { db_printf("mp %p %s devvp %p fs %p su_wl %d su_deps %d su_req %d\n", ump->um_mountp, ump->um_mountp->mnt_stat.f_mntonname, ump->um_devvp, ump->um_fs, ump->softdep_on_worklist, ump->softdep_deps, ump->softdep_req); } #endif /* DDB */ #endif /* SOFTUPDATES */ Index: projects/building-blocks/sys =================================================================== --- projects/building-blocks/sys (revision 278311) +++ projects/building-blocks/sys (revision 278312) Property changes on: projects/building-blocks/sys ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head/sys:r278256-278311 Index: projects/building-blocks/tools/tools/qrndtest/Makefile =================================================================== --- projects/building-blocks/tools/tools/qrndtest/Makefile (nonexistent) +++ projects/building-blocks/tools/tools/qrndtest/Makefile (revision 278312) @@ -0,0 +1,11 @@ +# +# $FreeBSD$ +# + +PROG = qrndtest + +SRCS = r.c + +MAN= + +.include Property changes on: projects/building-blocks/tools/tools/qrndtest/Makefile ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/tools/tools/qrndtest/README =================================================================== --- projects/building-blocks/tools/tools/qrndtest/README (nonexistent) +++ projects/building-blocks/tools/tools/qrndtest/README (revision 278312) @@ -0,0 +1,4 @@ +This is a really quick random testing that I wrote for adrian. + +If you want a real randomness tester, look at dieharder that has a much +larger battery of tests and will tell you if your PRNG is good or not. Property changes on: projects/building-blocks/tools/tools/qrndtest/README ___________________________________________________________________ Added: fbsd:nokeywords ## -0,0 +1 ## +yes \ No newline at end of property Index: projects/building-blocks/tools/tools/qrndtest/r.c =================================================================== --- projects/building-blocks/tools/tools/qrndtest/r.c (nonexistent) +++ projects/building-blocks/tools/tools/qrndtest/r.c (revision 278312) @@ -0,0 +1,81 @@ +/*- + * Copyright 2015 John-Mark Gurney. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + * + */ + +#include +#include +#include +#include + +typedef uint32_t (*rndfun_t)(); + +uint32_t +randomwrap() +{ + return random(); +} + +struct { + const char *name; + rndfun_t rndfun; +} rndfuns[] = { + { "random", randomwrap }, + { "arc4random", arc4random }, + { NULL, NULL }, +}; + +int +main(int argc, char *argv[]) +{ + uint64_t vals[4] = {}; + uint64_t avg; + unsigned int i; + rndfun_t f; + + if (argc == 1) + f = rndfuns[0].rndfun; + else { + for (i = 0; rndfuns[i].name != NULL; i++) { + if (strcasecmp(rndfuns[i].name, argv[1]) == 0) + break; + } + if (rndfuns[i].name == NULL) + return 1; + f = rndfuns[i].rndfun; + } + + for (;;) { + vals[f() % 4]++; + if (((i++) % (4*1024*1024)) == 0) { + avg = vals[0] + vals[1] + vals[2] + vals[3]; + avg /= 4; + printf("%d: %ld %ld %ld %ld\n", i, vals[0] - avg, vals[1] - avg, vals[2] - avg, vals[3] - avg); + } + } + return 0; +} Property changes on: projects/building-blocks/tools/tools/qrndtest/r.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/building-blocks/usr.sbin/devinfo/devinfo.c =================================================================== --- projects/building-blocks/usr.sbin/devinfo/devinfo.c (revision 278311) +++ projects/building-blocks/usr.sbin/devinfo/devinfo.c (revision 278312) @@ -1,233 +1,237 @@ /*- * Copyright (c) 2000, 2001 Michael Smith * Copyright (c) 2000 BSDi * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Print information about system device configuration. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include "devinfo.h" static int rflag; static int vflag; static void print_resource(struct devinfo_res *); static int print_device_matching_resource(struct devinfo_res *, void *); static int print_device_rman_resources(struct devinfo_rman *, void *); static int print_device(struct devinfo_dev *, void *); static int print_rman_resource(struct devinfo_res *, void *); static int print_rman(struct devinfo_rman *, void *); struct indent_arg { int indent; void *arg; }; /* * Print a resource. */ void print_resource(struct devinfo_res *res) { struct devinfo_rman *rman; int hexmode; rman = devinfo_handle_to_rman(res->dr_rman); hexmode = (rman->dm_size > 1000) || (rman->dm_size == 0); printf(hexmode ? "0x%lx" : "%lu", res->dr_start); if (res->dr_size > 1) printf(hexmode ? "-0x%lx" : "-%lu", res->dr_start + res->dr_size - 1); } /* * Print resource information if this resource matches the * given device. * * If the given indent is 0, return an indicator that a matching * resource exists. */ int print_device_matching_resource(struct devinfo_res *res, void *arg) { struct indent_arg *ia = (struct indent_arg *)arg; struct devinfo_dev *dev = (struct devinfo_dev *)ia->arg; int i; if (devinfo_handle_to_device(res->dr_device) == dev) { /* in 'detect' mode, found a match */ if (ia->indent == 0) return(1); for (i = 0; i < ia->indent; i++) printf(" "); print_resource(res); printf("\n"); } return(0); } /* * Print resource information for this device and resource manager. */ int print_device_rman_resources(struct devinfo_rman *rman, void *arg) { struct indent_arg *ia = (struct indent_arg *)arg; int indent, i; indent = ia->indent; /* check whether there are any resources matching this device */ ia->indent = 0; if (devinfo_foreach_rman_resource(rman, print_device_matching_resource, ia) != 0) { /* there are, print header */ for (i = 0; i < indent; i++) printf(" "); printf("%s:\n", rman->dm_desc); /* print resources */ ia->indent = indent + 4; devinfo_foreach_rman_resource(rman, print_device_matching_resource, ia); } ia->indent = indent; return(0); } /* * Print information about a device. */ int print_device(struct devinfo_dev *dev, void *arg) { struct indent_arg ia; int i, indent; if (vflag || (dev->dd_name[0] != 0 && dev->dd_state >= DS_ATTACHED)) { indent = (int)(intptr_t)arg; for (i = 0; i < indent; i++) printf(" "); printf("%s", dev->dd_name[0] ? dev->dd_name : "unknown"); if (vflag && *dev->dd_pnpinfo) printf(" pnpinfo %s", dev->dd_pnpinfo); if (vflag && *dev->dd_location) printf(" at %s", dev->dd_location); + if (!(dev->dd_flags & DF_ENABLED)) + printf(" (disabled)"); + else if (dev->dd_flags & DF_SUSPENDED) + printf(" (suspended)"); printf("\n"); if (rflag) { ia.indent = indent + 4; ia.arg = dev; devinfo_foreach_rman(print_device_rman_resources, (void *)&ia); } } return(devinfo_foreach_device_child(dev, print_device, (void *)((char *)arg + 2))); } /* * Print information about a resource under a resource manager. */ int print_rman_resource(struct devinfo_res *res, void *arg __unused) { struct devinfo_dev *dev; printf(" "); print_resource(res); dev = devinfo_handle_to_device(res->dr_device); if ((dev != NULL) && (dev->dd_name[0] != 0)) { printf(" (%s)", dev->dd_name); } else { printf(" ----"); } printf("\n"); return(0); } /* * Print information about a resource manager. */ int print_rman(struct devinfo_rman *rman, void *arg __unused) { printf("%s:\n", rman->dm_desc); devinfo_foreach_rman_resource(rman, print_rman_resource, 0); return(0); } int main(int argc, char *argv[]) { struct devinfo_dev *root; int c, uflag; uflag = 0; while ((c = getopt(argc, argv, "ruv")) != -1) { switch(c) { case 'r': rflag++; break; case 'u': uflag++; break; case 'v': vflag++; break; default: fprintf(stderr, "%s\n%s\n", "usage: devinfo [-rv]", " devinfo -u"); exit(1); } } if (devinfo_init()) err(1, "devinfo_init"); if ((root = devinfo_handle_to_device(DEVINFO_ROOT_DEVICE)) == NULL) errx(1, "can't find root device"); /* print resource usage? */ if (uflag) { devinfo_foreach_rman(print_rman, NULL); } else { /* print device hierarchy */ devinfo_foreach_device_child(root, print_device, (void *)0); } return(0); } Index: projects/building-blocks/usr.sbin/syslogd/syslogd.c =================================================================== --- projects/building-blocks/usr.sbin/syslogd/syslogd.c (revision 278311) +++ projects/building-blocks/usr.sbin/syslogd/syslogd.c (revision 278312) @@ -1,2759 +1,2757 @@ /* * Copyright (c) 1983, 1988, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef lint static const char copyright[] = "@(#) Copyright (c) 1983, 1988, 1993, 1994\n\ The Regents of the University of California. All rights reserved.\n"; #endif /* not lint */ #ifndef lint #if 0 static char sccsid[] = "@(#)syslogd.c 8.3 (Berkeley) 4/4/94"; #endif #endif /* not lint */ #include __FBSDID("$FreeBSD$"); /* * syslogd -- log system messages * * This program implements a system log. It takes a series of lines. * Each line may have a priority, signified as "" as * the first characters of the line. If this is * not present, a default priority is used. * * To kill syslogd, send a signal 15 (terminate). A signal 1 (hup) will * cause it to reread its configuration file. * * Defined Constants: * * MAXLINE -- the maximum line length that can be handled. * DEFUPRI -- the default priority for user messages * DEFSPRI -- the default priority for kernel messages * * Author: Eric Allman * extensive changes by Ralph Campbell * more extensive changes by Eric Allman (again) * Extension to log by program name as well as facility and priority * by Peter da Silva. * -u and -v by Harlan Stenn. * Priority comparison code by Harlan Stenn. */ #define MAXLINE 1024 /* maximum line length */ #define MAXSVLINE 120 /* maximum saved line length */ #define DEFUPRI (LOG_USER|LOG_NOTICE) #define DEFSPRI (LOG_KERN|LOG_CRIT) #define TIMERINTVL 30 /* interval for checking flush, mark */ #define TTYMSGTIME 1 /* timeout passed to ttymsg */ #define RCVBUF_MINSIZE (80 * 1024) /* minimum size of dgram rcv buffer */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "pathnames.h" #include "ttymsg.h" #define SYSLOG_NAMES #include const char *ConfFile = _PATH_LOGCONF; const char *PidFile = _PATH_LOGPID; const char ctty[] = _PATH_CONSOLE; #define dprintf if (Debug) printf #define MAXUNAMES 20 /* maximum number of user names */ /* * Unix sockets. * We have two default sockets, one with 666 permissions, * and one for privileged programs. */ struct funix { int s; const char *name; mode_t mode; STAILQ_ENTRY(funix) next; }; struct funix funix_secure = { -1, _PATH_LOG_PRIV, S_IRUSR | S_IWUSR, { NULL } }; struct funix funix_default = { -1, _PATH_LOG, DEFFILEMODE, { &funix_secure } }; STAILQ_HEAD(, funix) funixes = { &funix_default, &(funix_secure.next.stqe_next) }; /* * Flags to logmsg(). */ #define IGN_CONS 0x001 /* don't print on console */ #define SYNC_FILE 0x002 /* do fsync on file after printing */ #define ADDDATE 0x004 /* add a date to the message */ #define MARK 0x008 /* this message is a mark */ #define ISKERNEL 0x010 /* kernel generated message */ /* * This structure represents the files that will have log * copies printed. * We require f_file to be valid if f_type is F_FILE, F_CONSOLE, F_TTY * or if f_type if F_PIPE and f_pid > 0. */ struct filed { struct filed *f_next; /* next in linked list */ short f_type; /* entry type, see below */ short f_file; /* file descriptor */ time_t f_time; /* time this was last written */ char *f_host; /* host from which to recd. */ u_char f_pmask[LOG_NFACILITIES+1]; /* priority mask */ u_char f_pcmp[LOG_NFACILITIES+1]; /* compare priority */ #define PRI_LT 0x1 #define PRI_EQ 0x2 #define PRI_GT 0x4 char *f_program; /* program this applies to */ union { char f_uname[MAXUNAMES][MAXLOGNAME]; struct { char f_hname[MAXHOSTNAMELEN]; struct addrinfo *f_addr; } f_forw; /* forwarding address */ char f_fname[MAXPATHLEN]; struct { char f_pname[MAXPATHLEN]; pid_t f_pid; } f_pipe; } f_un; char f_prevline[MAXSVLINE]; /* last message logged */ char f_lasttime[16]; /* time of last occurrence */ char f_prevhost[MAXHOSTNAMELEN]; /* host from which recd. */ int f_prevpri; /* pri of f_prevline */ int f_prevlen; /* length of f_prevline */ int f_prevcount; /* repetition cnt of prevline */ u_int f_repeatcount; /* number of "repeated" msgs */ int f_flags; /* file-specific flags */ #define FFLAG_SYNC 0x01 #define FFLAG_NEEDSYNC 0x02 }; /* * Queue of about-to-be dead processes we should watch out for. */ TAILQ_HEAD(stailhead, deadq_entry) deadq_head; struct stailhead *deadq_headp; struct deadq_entry { pid_t dq_pid; int dq_timeout; TAILQ_ENTRY(deadq_entry) dq_entries; }; /* * The timeout to apply to processes waiting on the dead queue. Unit * of measure is `mark intervals', i.e. 20 minutes by default. * Processes on the dead queue will be terminated after that time. */ #define DQ_TIMO_INIT 2 typedef struct deadq_entry *dq_t; /* * Struct to hold records of network addresses that are allowed to log * to us. */ struct allowedpeer { int isnumeric; u_short port; union { struct { struct sockaddr_storage addr; struct sockaddr_storage mask; } numeric; char *name; } u; #define a_addr u.numeric.addr #define a_mask u.numeric.mask #define a_name u.name }; /* * Intervals at which we flush out "message repeated" messages, * in seconds after previous message is logged. After each flush, * we move to the next interval until we reach the largest. */ int repeatinterval[] = { 30, 120, 600 }; /* # of secs before flush */ #define MAXREPEAT ((sizeof(repeatinterval) / sizeof(repeatinterval[0])) - 1) #define REPEATTIME(f) ((f)->f_time + repeatinterval[(f)->f_repeatcount]) #define BACKOFF(f) { if (++(f)->f_repeatcount > MAXREPEAT) \ (f)->f_repeatcount = MAXREPEAT; \ } /* values for f_type */ #define F_UNUSED 0 /* unused entry */ #define F_FILE 1 /* regular file */ #define F_TTY 2 /* terminal */ #define F_CONSOLE 3 /* console terminal */ #define F_FORW 4 /* remote machine */ #define F_USERS 5 /* list of users */ #define F_WALL 6 /* everyone logged on */ #define F_PIPE 7 /* pipe to program */ const char *TypeNames[8] = { "UNUSED", "FILE", "TTY", "CONSOLE", "FORW", "USERS", "WALL", "PIPE" }; static struct filed *Files; /* Log files that we write to */ static struct filed consfile; /* Console */ static int Debug; /* debug flag */ static int resolve = 1; /* resolve hostname */ static char LocalHostName[MAXHOSTNAMELEN]; /* our hostname */ static const char *LocalDomain; /* our local domain name */ static int *finet; /* Internet datagram socket */ static int fklog = -1; /* /dev/klog */ static int Initialized; /* set when we have initialized ourselves */ static int MarkInterval = 20 * 60; /* interval between marks in seconds */ static int MarkSeq; /* mark sequence number */ static int NoBind; /* don't bind() as suggested by RFC 3164 */ static int SecureMode; /* when true, receive only unix domain socks */ #ifdef INET6 static int family = PF_UNSPEC; /* protocol family (IPv4, IPv6 or both) */ #else static int family = PF_INET; /* protocol family (IPv4 only) */ #endif static int mask_C1 = 1; /* mask characters from 0x80 - 0x9F */ static int send_to_all; /* send message to all IPv4/IPv6 addresses */ static int use_bootfile; /* log entire bootfile for every kern msg */ static int no_compress; /* don't compress messages (1=pipes, 2=all) */ static int logflags = O_WRONLY|O_APPEND; /* flags used to open log files */ static char bootfile[MAXLINE+1]; /* booted kernel file */ struct allowedpeer *AllowedPeers; /* List of allowed peers */ static int NumAllowed; /* Number of entries in AllowedPeers */ static int RemoteAddDate; /* Always set the date on remote messages */ static int UniquePriority; /* Only log specified priority? */ static int LogFacPri; /* Put facility and priority in log message: */ /* 0=no, 1=numeric, 2=names */ static int KeepKernFac; /* Keep remotely logged kernel facility */ static int needdofsync = 0; /* Are any file(s) waiting to be fsynced? */ static struct pidfh *pfh; volatile sig_atomic_t MarkSet, WantDie; static int allowaddr(char *); static void cfline(const char *, struct filed *, const char *, const char *); static const char *cvthname(struct sockaddr *); static void deadq_enter(pid_t, const char *); static int deadq_remove(pid_t); static int decode(const char *, const CODE *); static void die(int); static void dodie(int); static void dofsync(void); static void domark(int); static void fprintlog(struct filed *, int, const char *); static int *socksetup(int, char *); static void init(int); static void logerror(const char *); static void logmsg(int, const char *, const char *, int); static void log_deadchild(pid_t, int, const char *); static void markit(void); static int skip_message(const char *, const char *, int); static void printline(const char *, char *, int); static void printsys(char *); static int p_open(const char *, pid_t *); static void readklog(void); static void reapchild(int); static void usage(void); static int validate(struct sockaddr *, const char *); static void unmapped(struct sockaddr *); static void wallmsg(struct filed *, struct iovec *, const int iovlen); static int waitdaemon(int, int, int); static void timedout(int); static void increase_rcvbuf(int); int main(int argc, char *argv[]) { int ch, i, fdsrmax = 0, l; struct sockaddr_un sunx, fromunix; struct sockaddr_storage frominet; fd_set *fdsr = NULL; char line[MAXLINE + 1]; char *bindhostname; const char *hname; struct timeval tv, *tvp; struct sigaction sact; struct funix *fx, *fx1; sigset_t mask; pid_t ppid = 1, spid; socklen_t len; if (madvise(NULL, 0, MADV_PROTECT) != 0) dprintf("madvise() failed: %s\n", strerror(errno)); bindhostname = NULL; while ((ch = getopt(argc, argv, "468Aa:b:cCdf:kl:m:nNop:P:sS:Tuv")) != -1) switch (ch) { case '4': family = PF_INET; break; #ifdef INET6 case '6': family = PF_INET6; break; #endif case '8': mask_C1 = 0; break; case 'A': send_to_all++; break; case 'a': /* allow specific network addresses only */ if (allowaddr(optarg) == -1) usage(); break; case 'b': bindhostname = optarg; break; case 'c': no_compress++; break; case 'C': logflags |= O_CREAT; break; case 'd': /* debug */ Debug++; break; case 'f': /* configuration file */ ConfFile = optarg; break; case 'k': /* keep remote kern fac */ KeepKernFac = 1; break; case 'l': { long perml; mode_t mode; char *name, *ep; if (optarg[0] == '/') { mode = DEFFILEMODE; name = optarg; } else if ((name = strchr(optarg, ':')) != NULL) { *name++ = '\0'; if (name[0] != '/') errx(1, "socket name must be absolute " "path"); if (isdigit(*optarg)) { perml = strtol(optarg, &ep, 8); if (*ep || perml < 0 || perml & ~(S_IRWXU|S_IRWXG|S_IRWXO)) errx(1, "invalid mode %s, exiting", optarg); mode = (mode_t )perml; } else errx(1, "invalid mode %s, exiting", optarg); } else /* doesn't begin with '/', and no ':' */ errx(1, "can't parse path %s", optarg); if (strlen(name) >= sizeof(sunx.sun_path)) errx(1, "%s path too long, exiting", name); if ((fx = malloc(sizeof(struct funix))) == NULL) errx(1, "malloc failed"); fx->s = -1; fx->name = name; fx->mode = mode; STAILQ_INSERT_TAIL(&funixes, fx, next); break; } case 'm': /* mark interval */ MarkInterval = atoi(optarg) * 60; break; case 'N': NoBind = 1; SecureMode = 1; break; case 'n': resolve = 0; break; case 'o': use_bootfile = 1; break; case 'p': /* path */ if (strlen(optarg) >= sizeof(sunx.sun_path)) errx(1, "%s path too long, exiting", optarg); funix_default.name = optarg; break; case 'P': /* path for alt. PID */ PidFile = optarg; break; case 's': /* no network mode */ SecureMode++; break; case 'S': /* path for privileged originator */ if (strlen(optarg) >= sizeof(sunx.sun_path)) errx(1, "%s path too long, exiting", optarg); funix_secure.name = optarg; break; case 'T': RemoteAddDate = 1; break; case 'u': /* only log specified priority */ UniquePriority++; break; case 'v': /* log facility and priority */ LogFacPri++; break; default: usage(); } if ((argc -= optind) != 0) usage(); pfh = pidfile_open(PidFile, 0600, &spid); if (pfh == NULL) { if (errno == EEXIST) errx(1, "syslogd already running, pid: %d", spid); warn("cannot open pid file"); } if (!Debug) { ppid = waitdaemon(0, 0, 30); if (ppid < 0) { warn("could not become daemon"); pidfile_remove(pfh); exit(1); } } else { setlinebuf(stdout); } if (NumAllowed) endservent(); consfile.f_type = F_CONSOLE; (void)strlcpy(consfile.f_un.f_fname, ctty + sizeof _PATH_DEV - 1, sizeof(consfile.f_un.f_fname)); (void)strlcpy(bootfile, getbootfile(), sizeof(bootfile)); (void)signal(SIGTERM, dodie); (void)signal(SIGINT, Debug ? dodie : SIG_IGN); (void)signal(SIGQUIT, Debug ? dodie : SIG_IGN); /* * We don't want the SIGCHLD and SIGHUP handlers to interfere * with each other; they are likely candidates for being called * simultaneously (SIGHUP closes pipe descriptor, process dies, * SIGCHLD happens). */ sigemptyset(&mask); sigaddset(&mask, SIGHUP); sact.sa_handler = reapchild; sact.sa_mask = mask; sact.sa_flags = SA_RESTART; (void)sigaction(SIGCHLD, &sact, NULL); (void)signal(SIGALRM, domark); (void)signal(SIGPIPE, SIG_IGN); /* We'll catch EPIPE instead. */ (void)alarm(TIMERINTVL); TAILQ_INIT(&deadq_head); #ifndef SUN_LEN #define SUN_LEN(unp) (strlen((unp)->sun_path) + 2) #endif STAILQ_FOREACH_SAFE(fx, &funixes, next, fx1) { (void)unlink(fx->name); memset(&sunx, 0, sizeof(sunx)); sunx.sun_family = AF_LOCAL; (void)strlcpy(sunx.sun_path, fx->name, sizeof(sunx.sun_path)); fx->s = socket(PF_LOCAL, SOCK_DGRAM, 0); if (fx->s < 0 || bind(fx->s, (struct sockaddr *)&sunx, SUN_LEN(&sunx)) < 0 || chmod(fx->name, fx->mode) < 0) { (void)snprintf(line, sizeof line, "cannot create %s", fx->name); logerror(line); dprintf("cannot create %s (%d)\n", fx->name, errno); if (fx == &funix_default || fx == &funix_secure) die(0); else { STAILQ_REMOVE(&funixes, fx, funix, next); continue; } } increase_rcvbuf(fx->s); } if (SecureMode <= 1) finet = socksetup(family, bindhostname); if (finet) { if (SecureMode) { for (i = 0; i < *finet; i++) { if (shutdown(finet[i+1], SHUT_RD) < 0) { logerror("shutdown"); if (!Debug) die(0); } } } else { dprintf("listening on inet and/or inet6 socket\n"); } dprintf("sending on inet and/or inet6 socket\n"); } if ((fklog = open(_PATH_KLOG, O_RDONLY, 0)) >= 0) if (fcntl(fklog, F_SETFL, O_NONBLOCK) < 0) fklog = -1; if (fklog < 0) dprintf("can't open %s (%d)\n", _PATH_KLOG, errno); /* tuck my process id away */ pidfile_write(pfh); dprintf("off & running....\n"); init(0); /* prevent SIGHUP and SIGCHLD handlers from running in parallel */ sigemptyset(&mask); sigaddset(&mask, SIGCHLD); sact.sa_handler = init; sact.sa_mask = mask; sact.sa_flags = SA_RESTART; (void)sigaction(SIGHUP, &sact, NULL); tvp = &tv; tv.tv_sec = tv.tv_usec = 0; if (fklog != -1 && fklog > fdsrmax) fdsrmax = fklog; if (finet && !SecureMode) { for (i = 0; i < *finet; i++) { if (finet[i+1] != -1 && finet[i+1] > fdsrmax) fdsrmax = finet[i+1]; } } STAILQ_FOREACH(fx, &funixes, next) if (fx->s > fdsrmax) fdsrmax = fx->s; fdsr = (fd_set *)calloc(howmany(fdsrmax+1, NFDBITS), sizeof(fd_mask)); if (fdsr == NULL) errx(1, "calloc fd_set"); for (;;) { if (MarkSet) markit(); if (WantDie) die(WantDie); bzero(fdsr, howmany(fdsrmax+1, NFDBITS) * sizeof(fd_mask)); if (fklog != -1) FD_SET(fklog, fdsr); if (finet && !SecureMode) { for (i = 0; i < *finet; i++) { if (finet[i+1] != -1) FD_SET(finet[i+1], fdsr); } } STAILQ_FOREACH(fx, &funixes, next) FD_SET(fx->s, fdsr); i = select(fdsrmax+1, fdsr, NULL, NULL, needdofsync ? &tv : tvp); switch (i) { case 0: dofsync(); needdofsync = 0; if (tvp) { tvp = NULL; if (ppid != 1) kill(ppid, SIGALRM); } continue; case -1: if (errno != EINTR) logerror("select"); continue; } if (fklog != -1 && FD_ISSET(fklog, fdsr)) readklog(); if (finet && !SecureMode) { for (i = 0; i < *finet; i++) { if (FD_ISSET(finet[i+1], fdsr)) { len = sizeof(frominet); l = recvfrom(finet[i+1], line, MAXLINE, 0, (struct sockaddr *)&frominet, &len); if (l > 0) { line[l] = '\0'; hname = cvthname((struct sockaddr *)&frominet); unmapped((struct sockaddr *)&frominet); if (validate((struct sockaddr *)&frominet, hname)) printline(hname, line, RemoteAddDate ? ADDDATE : 0); } else if (l < 0 && errno != EINTR) logerror("recvfrom inet"); } } } STAILQ_FOREACH(fx, &funixes, next) { if (FD_ISSET(fx->s, fdsr)) { len = sizeof(fromunix); l = recvfrom(fx->s, line, MAXLINE, 0, (struct sockaddr *)&fromunix, &len); if (l > 0) { line[l] = '\0'; printline(LocalHostName, line, 0); } else if (l < 0 && errno != EINTR) logerror("recvfrom unix"); } } } if (fdsr) free(fdsr); } static void unmapped(struct sockaddr *sa) { struct sockaddr_in6 *sin6; struct sockaddr_in sin4; if (sa->sa_family != AF_INET6) return; if (sa->sa_len != sizeof(struct sockaddr_in6) || sizeof(sin4) > sa->sa_len) return; sin6 = (struct sockaddr_in6 *)sa; if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) return; memset(&sin4, 0, sizeof(sin4)); sin4.sin_family = AF_INET; sin4.sin_len = sizeof(struct sockaddr_in); memcpy(&sin4.sin_addr, &sin6->sin6_addr.s6_addr[12], sizeof(sin4.sin_addr)); sin4.sin_port = sin6->sin6_port; memcpy(sa, &sin4, sin4.sin_len); } static void usage(void) { fprintf(stderr, "%s\n%s\n%s\n%s\n", "usage: syslogd [-468ACcdknosTuv] [-a allowed_peer]", " [-b bind_address] [-f config_file]", " [-l [mode:]path] [-m mark_interval]", " [-P pid_file] [-p log_socket]"); exit(1); } /* * Take a raw input line, decode the message, and print the message * on the appropriate log files. */ static void printline(const char *hname, char *msg, int flags) { char *p, *q; long n; int c, pri; char line[MAXLINE + 1]; /* test for special codes */ p = msg; pri = DEFUPRI; if (*p == '<') { errno = 0; n = strtol(p + 1, &q, 10); if (*q == '>' && n >= 0 && n < INT_MAX && errno == 0) { p = q + 1; pri = n; } } if (pri &~ (LOG_FACMASK|LOG_PRIMASK)) pri = DEFUPRI; /* * Don't allow users to log kernel messages. * NOTE: since LOG_KERN == 0 this will also match * messages with no facility specified. */ if ((pri & LOG_FACMASK) == LOG_KERN && !KeepKernFac) pri = LOG_MAKEPRI(LOG_USER, LOG_PRI(pri)); q = line; while ((c = (unsigned char)*p++) != '\0' && q < &line[sizeof(line) - 4]) { if (mask_C1 && (c & 0x80) && c < 0xA0) { c &= 0x7F; *q++ = 'M'; *q++ = '-'; } if (isascii(c) && iscntrl(c)) { if (c == '\n') { *q++ = ' '; } else if (c == '\t') { *q++ = '\t'; } else { *q++ = '^'; *q++ = c ^ 0100; } } else { *q++ = c; } } *q = '\0'; logmsg(pri, line, hname, flags); } /* * Read /dev/klog while data are available, split into lines. */ static void readklog(void) { char *p, *q, line[MAXLINE + 1]; int len, i; len = 0; for (;;) { i = read(fklog, line + len, MAXLINE - 1 - len); if (i > 0) { line[i + len] = '\0'; } else { if (i < 0 && errno != EINTR && errno != EAGAIN) { logerror("klog"); fklog = -1; } break; } for (p = line; (q = strchr(p, '\n')) != NULL; p = q + 1) { *q = '\0'; printsys(p); } len = strlen(p); if (len >= MAXLINE - 1) { printsys(p); len = 0; } if (len > 0) memmove(line, p, len + 1); } if (len > 0) printsys(line); } /* * Take a raw input line from /dev/klog, format similar to syslog(). */ static void printsys(char *msg) { char *p, *q; long n; int flags, isprintf, pri; flags = ISKERNEL | SYNC_FILE | ADDDATE; /* fsync after write */ p = msg; pri = DEFSPRI; isprintf = 1; if (*p == '<') { errno = 0; n = strtol(p + 1, &q, 10); if (*q == '>' && n >= 0 && n < INT_MAX && errno == 0) { p = q + 1; pri = n; isprintf = 0; } } /* * Kernel printf's and LOG_CONSOLE messages have been displayed * on the console already. */ if (isprintf || (pri & LOG_FACMASK) == LOG_CONSOLE) flags |= IGN_CONS; if (pri &~ (LOG_FACMASK|LOG_PRIMASK)) pri = DEFSPRI; logmsg(pri, p, LocalHostName, flags); } static time_t now; /* * Match a program or host name against a specification. * Return a non-0 value if the message must be ignored * based on the specification. */ static int skip_message(const char *name, const char *spec, int checkcase) { const char *s; char prev, next; int exclude = 0; /* Behaviour on explicit match */ if (spec == NULL) return 0; switch (*spec) { case '-': exclude = 1; /*FALLTHROUGH*/ case '+': spec++; break; default: break; } if (checkcase) s = strstr (spec, name); else s = strcasestr (spec, name); if (s != NULL) { prev = (s == spec ? ',' : *(s - 1)); next = *(s + strlen (name)); if (prev == ',' && (next == '\0' || next == ',')) /* Explicit match: skip iff the spec is an exclusive one. */ return exclude; } /* No explicit match for this name: skip the message iff the spec is an inclusive one. */ return !exclude; } /* * Log a message to the appropriate log files, users, etc. based on * the priority. */ static void logmsg(int pri, const char *msg, const char *from, int flags) { struct filed *f; int i, fac, msglen, omask, prilev; const char *timestamp; char prog[NAME_MAX+1]; char buf[MAXLINE+1]; dprintf("logmsg: pri %o, flags %x, from %s, msg %s\n", pri, flags, from, msg); omask = sigblock(sigmask(SIGHUP)|sigmask(SIGALRM)); /* * Check to see if msg looks non-standard. */ msglen = strlen(msg); if (msglen < 16 || msg[3] != ' ' || msg[6] != ' ' || msg[9] != ':' || msg[12] != ':' || msg[15] != ' ') flags |= ADDDATE; (void)time(&now); if (flags & ADDDATE) { timestamp = ctime(&now) + 4; } else { timestamp = msg; msg += 16; msglen -= 16; } /* skip leading blanks */ while (isspace(*msg)) { msg++; msglen--; } /* extract facility and priority level */ if (flags & MARK) fac = LOG_NFACILITIES; else fac = LOG_FAC(pri); /* Check maximum facility number. */ if (fac > LOG_NFACILITIES) { (void)sigsetmask(omask); return; } prilev = LOG_PRI(pri); /* extract program name */ for (i = 0; i < NAME_MAX; i++) { if (!isprint(msg[i]) || msg[i] == ':' || msg[i] == '[' || msg[i] == '/' || isspace(msg[i])) break; prog[i] = msg[i]; } prog[i] = 0; /* add kernel prefix for kernel messages */ if (flags & ISKERNEL) { snprintf(buf, sizeof(buf), "%s: %s", use_bootfile ? bootfile : "kernel", msg); msg = buf; msglen = strlen(buf); } /* log the message to the particular outputs */ if (!Initialized) { f = &consfile; /* * Open in non-blocking mode to avoid hangs during open * and close(waiting for the port to drain). */ f->f_file = open(ctty, O_WRONLY | O_NONBLOCK, 0); if (f->f_file >= 0) { (void)strlcpy(f->f_lasttime, timestamp, sizeof(f->f_lasttime)); fprintlog(f, flags, msg); (void)close(f->f_file); } (void)sigsetmask(omask); return; } for (f = Files; f; f = f->f_next) { /* skip messages that are incorrect priority */ if (!(((f->f_pcmp[fac] & PRI_EQ) && (f->f_pmask[fac] == prilev)) ||((f->f_pcmp[fac] & PRI_LT) && (f->f_pmask[fac] < prilev)) ||((f->f_pcmp[fac] & PRI_GT) && (f->f_pmask[fac] > prilev)) ) || f->f_pmask[fac] == INTERNAL_NOPRI) continue; /* skip messages with the incorrect hostname */ if (skip_message(from, f->f_host, 0)) continue; /* skip messages with the incorrect program name */ if (skip_message(prog, f->f_program, 1)) continue; /* skip message to console if it has already been printed */ if (f->f_type == F_CONSOLE && (flags & IGN_CONS)) continue; /* don't output marks to recently written files */ if ((flags & MARK) && (now - f->f_time) < MarkInterval / 2) continue; /* * suppress duplicate lines to this file */ if (no_compress - (f->f_type != F_PIPE) < 1 && (flags & MARK) == 0 && msglen == f->f_prevlen && - f->f_prevline && !strcmp(msg, f->f_prevline) && + !strcmp(msg, f->f_prevline) && !strcasecmp(from, f->f_prevhost)) { (void)strlcpy(f->f_lasttime, timestamp, sizeof(f->f_lasttime)); f->f_prevcount++; dprintf("msg repeated %d times, %ld sec of %d\n", f->f_prevcount, (long)(now - f->f_time), repeatinterval[f->f_repeatcount]); /* * If domark would have logged this by now, * flush it now (so we don't hold isolated messages), * but back off so we'll flush less often * in the future. */ if (now > REPEATTIME(f)) { fprintlog(f, flags, (char *)NULL); BACKOFF(f); } } else { /* new line, save it */ if (f->f_prevcount) fprintlog(f, 0, (char *)NULL); f->f_repeatcount = 0; f->f_prevpri = pri; (void)strlcpy(f->f_lasttime, timestamp, sizeof(f->f_lasttime)); (void)strlcpy(f->f_prevhost, from, sizeof(f->f_prevhost)); if (msglen < MAXSVLINE) { f->f_prevlen = msglen; (void)strlcpy(f->f_prevline, msg, sizeof(f->f_prevline)); fprintlog(f, flags, (char *)NULL); } else { f->f_prevline[0] = 0; f->f_prevlen = 0; fprintlog(f, flags, msg); } } } (void)sigsetmask(omask); } static void dofsync(void) { struct filed *f; for (f = Files; f; f = f->f_next) { if ((f->f_type == F_FILE) && (f->f_flags & FFLAG_NEEDSYNC)) { f->f_flags &= ~FFLAG_NEEDSYNC; (void)fsync(f->f_file); } } } #define IOV_SIZE 7 static void fprintlog(struct filed *f, int flags, const char *msg) { struct iovec iov[IOV_SIZE]; struct iovec *v; struct addrinfo *r; int i, l, lsent = 0; char line[MAXLINE + 1], repbuf[80], greetings[200], *wmsg = NULL; char nul[] = "", space[] = " ", lf[] = "\n", crlf[] = "\r\n"; const char *msgret; v = iov; if (f->f_type == F_WALL) { v->iov_base = greetings; /* The time displayed is not synchornized with the other log * destinations (like messages). Following fragment was using * ctime(&now), which was updating the time every 30 sec. * With f_lasttime, time is synchronized correctly. */ v->iov_len = snprintf(greetings, sizeof greetings, "\r\n\7Message from syslogd@%s at %.24s ...\r\n", f->f_prevhost, f->f_lasttime); if (v->iov_len >= sizeof greetings) v->iov_len = sizeof greetings - 1; v++; v->iov_base = nul; v->iov_len = 0; v++; } else { v->iov_base = f->f_lasttime; v->iov_len = strlen(f->f_lasttime); v++; v->iov_base = space; v->iov_len = 1; v++; } if (LogFacPri) { static char fp_buf[30]; /* Hollow laugh */ int fac = f->f_prevpri & LOG_FACMASK; int pri = LOG_PRI(f->f_prevpri); const char *f_s = NULL; char f_n[5]; /* Hollow laugh */ const char *p_s = NULL; char p_n[5]; /* Hollow laugh */ if (LogFacPri > 1) { const CODE *c; for (c = facilitynames; c->c_name; c++) { if (c->c_val == fac) { f_s = c->c_name; break; } } for (c = prioritynames; c->c_name; c++) { if (c->c_val == pri) { p_s = c->c_name; break; } } } if (!f_s) { snprintf(f_n, sizeof f_n, "%d", LOG_FAC(fac)); f_s = f_n; } if (!p_s) { snprintf(p_n, sizeof p_n, "%d", pri); p_s = p_n; } snprintf(fp_buf, sizeof fp_buf, "<%s.%s> ", f_s, p_s); v->iov_base = fp_buf; v->iov_len = strlen(fp_buf); } else { v->iov_base = nul; v->iov_len = 0; } v++; v->iov_base = f->f_prevhost; v->iov_len = strlen(v->iov_base); v++; v->iov_base = space; v->iov_len = 1; v++; if (msg) { wmsg = strdup(msg); /* XXX iov_base needs a `const' sibling. */ if (wmsg == NULL) { logerror("strdup"); exit(1); } v->iov_base = wmsg; v->iov_len = strlen(msg); } else if (f->f_prevcount > 1) { v->iov_base = repbuf; v->iov_len = snprintf(repbuf, sizeof repbuf, "last message repeated %d times", f->f_prevcount); - } else if (f->f_prevline) { + } else { v->iov_base = f->f_prevline; v->iov_len = f->f_prevlen; - } else { - return; } v++; dprintf("Logging to %s", TypeNames[f->f_type]); f->f_time = now; switch (f->f_type) { int port; case F_UNUSED: dprintf("\n"); break; case F_FORW: port = (int)ntohs(((struct sockaddr_in *) (f->f_un.f_forw.f_addr->ai_addr))->sin_port); if (port != 514) { dprintf(" %s:%d\n", f->f_un.f_forw.f_hname, port); } else { dprintf(" %s\n", f->f_un.f_forw.f_hname); } /* check for local vs remote messages */ if (strcasecmp(f->f_prevhost, LocalHostName)) l = snprintf(line, sizeof line - 1, "<%d>%.15s Forwarded from %s: %s", f->f_prevpri, (char *)iov[0].iov_base, f->f_prevhost, (char *)iov[5].iov_base); else l = snprintf(line, sizeof line - 1, "<%d>%.15s %s", f->f_prevpri, (char *)iov[0].iov_base, (char *)iov[5].iov_base); if (l < 0) l = 0; else if (l > MAXLINE) l = MAXLINE; if (finet) { for (r = f->f_un.f_forw.f_addr; r; r = r->ai_next) { for (i = 0; i < *finet; i++) { #if 0 /* * should we check AF first, or just * trial and error? FWD */ if (r->ai_family == address_family_of(finet[i+1])) #endif lsent = sendto(finet[i+1], line, l, 0, r->ai_addr, r->ai_addrlen); if (lsent == l) break; } if (lsent == l && !send_to_all) break; } dprintf("lsent/l: %d/%d\n", lsent, l); if (lsent != l) { int e = errno; logerror("sendto"); errno = e; switch (errno) { case ENOBUFS: case ENETDOWN: case ENETUNREACH: case EHOSTUNREACH: case EHOSTDOWN: case EADDRNOTAVAIL: break; /* case EBADF: */ /* case EACCES: */ /* case ENOTSOCK: */ /* case EFAULT: */ /* case EMSGSIZE: */ /* case EAGAIN: */ /* case ENOBUFS: */ /* case ECONNREFUSED: */ default: dprintf("removing entry: errno=%d\n", e); f->f_type = F_UNUSED; break; } } } break; case F_FILE: dprintf(" %s\n", f->f_un.f_fname); v->iov_base = lf; v->iov_len = 1; if (writev(f->f_file, iov, IOV_SIZE) < 0) { /* * If writev(2) fails for potentially transient errors * like the filesystem being full, ignore it. * Otherwise remove this logfile from the list. */ if (errno != ENOSPC) { int e = errno; (void)close(f->f_file); f->f_type = F_UNUSED; errno = e; logerror(f->f_un.f_fname); } } else if ((flags & SYNC_FILE) && (f->f_flags & FFLAG_SYNC)) { f->f_flags |= FFLAG_NEEDSYNC; needdofsync = 1; } break; case F_PIPE: dprintf(" %s\n", f->f_un.f_pipe.f_pname); v->iov_base = lf; v->iov_len = 1; if (f->f_un.f_pipe.f_pid == 0) { if ((f->f_file = p_open(f->f_un.f_pipe.f_pname, &f->f_un.f_pipe.f_pid)) < 0) { f->f_type = F_UNUSED; logerror(f->f_un.f_pipe.f_pname); break; } } if (writev(f->f_file, iov, IOV_SIZE) < 0) { int e = errno; (void)close(f->f_file); if (f->f_un.f_pipe.f_pid > 0) deadq_enter(f->f_un.f_pipe.f_pid, f->f_un.f_pipe.f_pname); f->f_un.f_pipe.f_pid = 0; errno = e; logerror(f->f_un.f_pipe.f_pname); } break; case F_CONSOLE: if (flags & IGN_CONS) { dprintf(" (ignored)\n"); break; } /* FALLTHROUGH */ case F_TTY: dprintf(" %s%s\n", _PATH_DEV, f->f_un.f_fname); v->iov_base = crlf; v->iov_len = 2; errno = 0; /* ttymsg() only sometimes returns an errno */ if ((msgret = ttymsg(iov, IOV_SIZE, f->f_un.f_fname, 10))) { f->f_type = F_UNUSED; logerror(msgret); } break; case F_USERS: case F_WALL: dprintf("\n"); v->iov_base = crlf; v->iov_len = 2; wallmsg(f, iov, IOV_SIZE); break; } f->f_prevcount = 0; free(wmsg); } /* * WALLMSG -- Write a message to the world at large * * Write the specified message to either the entire * world, or a list of approved users. */ static void wallmsg(struct filed *f, struct iovec *iov, const int iovlen) { static int reenter; /* avoid calling ourselves */ struct utmpx *ut; int i; const char *p; if (reenter++) return; setutxent(); /* NOSTRICT */ while ((ut = getutxent()) != NULL) { if (ut->ut_type != USER_PROCESS) continue; if (f->f_type == F_WALL) { if ((p = ttymsg(iov, iovlen, ut->ut_line, TTYMSGTIME)) != NULL) { errno = 0; /* already in msg */ logerror(p); } continue; } /* should we send the message to this user? */ for (i = 0; i < MAXUNAMES; i++) { if (!f->f_un.f_uname[i][0]) break; if (!strcmp(f->f_un.f_uname[i], ut->ut_user)) { if ((p = ttymsg(iov, iovlen, ut->ut_line, TTYMSGTIME)) != NULL) { errno = 0; /* already in msg */ logerror(p); } break; } } } endutxent(); reenter = 0; } static void reapchild(int signo __unused) { int status; pid_t pid; struct filed *f; while ((pid = wait3(&status, WNOHANG, (struct rusage *)NULL)) > 0) { if (!Initialized) /* Don't tell while we are initting. */ continue; /* First, look if it's a process from the dead queue. */ if (deadq_remove(pid)) goto oncemore; /* Now, look in list of active processes. */ for (f = Files; f; f = f->f_next) if (f->f_type == F_PIPE && f->f_un.f_pipe.f_pid == pid) { (void)close(f->f_file); f->f_un.f_pipe.f_pid = 0; log_deadchild(pid, status, f->f_un.f_pipe.f_pname); break; } oncemore: continue; } } /* * Return a printable representation of a host address. */ static const char * cvthname(struct sockaddr *f) { int error, hl; sigset_t omask, nmask; static char hname[NI_MAXHOST], ip[NI_MAXHOST]; error = getnameinfo((struct sockaddr *)f, ((struct sockaddr *)f)->sa_len, ip, sizeof ip, NULL, 0, NI_NUMERICHOST); dprintf("cvthname(%s)\n", ip); if (error) { dprintf("Malformed from address %s\n", gai_strerror(error)); return ("???"); } if (!resolve) return (ip); sigemptyset(&nmask); sigaddset(&nmask, SIGHUP); sigprocmask(SIG_BLOCK, &nmask, &omask); error = getnameinfo((struct sockaddr *)f, ((struct sockaddr *)f)->sa_len, hname, sizeof hname, NULL, 0, NI_NAMEREQD); sigprocmask(SIG_SETMASK, &omask, NULL); if (error) { dprintf("Host name for your address (%s) unknown\n", ip); return (ip); } hl = strlen(hname); if (hl > 0 && hname[hl-1] == '.') hname[--hl] = '\0'; trimdomain(hname, hl); return (hname); } static void dodie(int signo) { WantDie = signo; } static void domark(int signo __unused) { MarkSet = 1; } /* * Print syslogd errors some place. */ static void logerror(const char *type) { char buf[512]; static int recursed = 0; /* If there's an error while trying to log an error, give up. */ if (recursed) return; recursed++; if (errno) (void)snprintf(buf, sizeof buf, "syslogd: %s: %s", type, strerror(errno)); else (void)snprintf(buf, sizeof buf, "syslogd: %s", type); errno = 0; dprintf("%s\n", buf); logmsg(LOG_SYSLOG|LOG_ERR, buf, LocalHostName, ADDDATE); recursed--; } static void die(int signo) { struct filed *f; struct funix *fx; int was_initialized; char buf[100]; was_initialized = Initialized; Initialized = 0; /* Don't log SIGCHLDs. */ for (f = Files; f != NULL; f = f->f_next) { /* flush any pending output */ if (f->f_prevcount) fprintlog(f, 0, (char *)NULL); if (f->f_type == F_PIPE && f->f_un.f_pipe.f_pid > 0) { (void)close(f->f_file); f->f_un.f_pipe.f_pid = 0; } } Initialized = was_initialized; if (signo) { dprintf("syslogd: exiting on signal %d\n", signo); (void)snprintf(buf, sizeof(buf), "exiting on signal %d", signo); errno = 0; logerror(buf); } STAILQ_FOREACH(fx, &funixes, next) (void)unlink(fx->name); pidfile_remove(pfh); exit(1); } /* * INIT -- Initialize syslogd from configuration table */ static void init(int signo) { int i; FILE *cf; struct filed *f, *next, **nextp; char *p; char cline[LINE_MAX]; char prog[LINE_MAX]; char host[MAXHOSTNAMELEN]; char oldLocalHostName[MAXHOSTNAMELEN]; char hostMsg[2*MAXHOSTNAMELEN+40]; char bootfileMsg[LINE_MAX]; dprintf("init\n"); /* * Load hostname (may have changed). */ if (signo != 0) (void)strlcpy(oldLocalHostName, LocalHostName, sizeof(oldLocalHostName)); if (gethostname(LocalHostName, sizeof(LocalHostName))) err(EX_OSERR, "gethostname() failed"); if ((p = strchr(LocalHostName, '.')) != NULL) { *p++ = '\0'; LocalDomain = p; } else { LocalDomain = ""; } /* * Close all open log files. */ Initialized = 0; for (f = Files; f != NULL; f = next) { /* flush any pending output */ if (f->f_prevcount) fprintlog(f, 0, (char *)NULL); switch (f->f_type) { case F_FILE: case F_FORW: case F_CONSOLE: case F_TTY: (void)close(f->f_file); break; case F_PIPE: if (f->f_un.f_pipe.f_pid > 0) { (void)close(f->f_file); deadq_enter(f->f_un.f_pipe.f_pid, f->f_un.f_pipe.f_pname); } f->f_un.f_pipe.f_pid = 0; break; } next = f->f_next; if (f->f_program) free(f->f_program); if (f->f_host) free(f->f_host); free((char *)f); } Files = NULL; nextp = &Files; /* open the configuration file */ if ((cf = fopen(ConfFile, "r")) == NULL) { dprintf("cannot open %s\n", ConfFile); *nextp = (struct filed *)calloc(1, sizeof(*f)); if (*nextp == NULL) { logerror("calloc"); exit(1); } cfline("*.ERR\t/dev/console", *nextp, "*", "*"); (*nextp)->f_next = (struct filed *)calloc(1, sizeof(*f)); if ((*nextp)->f_next == NULL) { logerror("calloc"); exit(1); } cfline("*.PANIC\t*", (*nextp)->f_next, "*", "*"); Initialized = 1; return; } /* * Foreach line in the conf table, open that file. */ f = NULL; (void)strlcpy(host, "*", sizeof(host)); (void)strlcpy(prog, "*", sizeof(prog)); while (fgets(cline, sizeof(cline), cf) != NULL) { /* * check for end-of-section, comments, strip off trailing * spaces and newline character. #!prog is treated specially: * following lines apply only to that program. */ for (p = cline; isspace(*p); ++p) continue; if (*p == 0) continue; if (*p == '#') { p++; if (*p != '!' && *p != '+' && *p != '-') continue; } if (*p == '+' || *p == '-') { host[0] = *p++; while (isspace(*p)) p++; if ((!*p) || (*p == '*')) { (void)strlcpy(host, "*", sizeof(host)); continue; } if (*p == '@') p = LocalHostName; for (i = 1; i < MAXHOSTNAMELEN - 1; i++) { if (!isalnum(*p) && *p != '.' && *p != '-' && *p != ',' && *p != ':' && *p != '%') break; host[i] = *p++; } host[i] = '\0'; continue; } if (*p == '!') { p++; while (isspace(*p)) p++; if ((!*p) || (*p == '*')) { (void)strlcpy(prog, "*", sizeof(prog)); continue; } for (i = 0; i < LINE_MAX - 1; i++) { if (!isprint(p[i]) || isspace(p[i])) break; prog[i] = p[i]; } prog[i] = 0; continue; } for (p = cline + 1; *p != '\0'; p++) { if (*p != '#') continue; if (*(p - 1) == '\\') { strcpy(p - 1, p); p--; continue; } *p = '\0'; break; } for (i = strlen(cline) - 1; i >= 0 && isspace(cline[i]); i--) cline[i] = '\0'; f = (struct filed *)calloc(1, sizeof(*f)); if (f == NULL) { logerror("calloc"); exit(1); } *nextp = f; nextp = &f->f_next; cfline(cline, f, prog, host); } /* close the configuration file */ (void)fclose(cf); Initialized = 1; if (Debug) { int port; for (f = Files; f; f = f->f_next) { for (i = 0; i <= LOG_NFACILITIES; i++) if (f->f_pmask[i] == INTERNAL_NOPRI) printf("X "); else printf("%d ", f->f_pmask[i]); printf("%s: ", TypeNames[f->f_type]); switch (f->f_type) { case F_FILE: printf("%s", f->f_un.f_fname); break; case F_CONSOLE: case F_TTY: printf("%s%s", _PATH_DEV, f->f_un.f_fname); break; case F_FORW: port = (int)ntohs(((struct sockaddr_in *) (f->f_un.f_forw.f_addr->ai_addr))->sin_port); if (port != 514) { printf("%s:%d", f->f_un.f_forw.f_hname, port); } else { printf("%s", f->f_un.f_forw.f_hname); } break; case F_PIPE: printf("%s", f->f_un.f_pipe.f_pname); break; case F_USERS: for (i = 0; i < MAXUNAMES && *f->f_un.f_uname[i]; i++) printf("%s, ", f->f_un.f_uname[i]); break; } if (f->f_program) printf(" (%s)", f->f_program); printf("\n"); } } logmsg(LOG_SYSLOG|LOG_INFO, "syslogd: restart", LocalHostName, ADDDATE); dprintf("syslogd: restarted\n"); /* * Log a change in hostname, but only on a restart. */ if (signo != 0 && strcmp(oldLocalHostName, LocalHostName) != 0) { (void)snprintf(hostMsg, sizeof(hostMsg), "syslogd: hostname changed, \"%s\" to \"%s\"", oldLocalHostName, LocalHostName); logmsg(LOG_SYSLOG|LOG_INFO, hostMsg, LocalHostName, ADDDATE); dprintf("%s\n", hostMsg); } /* * Log the kernel boot file if we aren't going to use it as * the prefix, and if this is *not* a restart. */ if (signo == 0 && !use_bootfile) { (void)snprintf(bootfileMsg, sizeof(bootfileMsg), "syslogd: kernel boot file is %s", bootfile); logmsg(LOG_KERN|LOG_INFO, bootfileMsg, LocalHostName, ADDDATE); dprintf("%s\n", bootfileMsg); } } /* * Crack a configuration file line */ static void cfline(const char *line, struct filed *f, const char *prog, const char *host) { struct addrinfo hints, *res; int error, i, pri, syncfile; const char *p, *q; char *bp; char buf[MAXLINE], ebuf[100]; dprintf("cfline(\"%s\", f, \"%s\", \"%s\")\n", line, prog, host); errno = 0; /* keep strerror() stuff out of logerror messages */ /* clear out file entry */ memset(f, 0, sizeof(*f)); for (i = 0; i <= LOG_NFACILITIES; i++) f->f_pmask[i] = INTERNAL_NOPRI; /* save hostname if any */ if (host && *host == '*') host = NULL; if (host) { int hl; f->f_host = strdup(host); if (f->f_host == NULL) { logerror("strdup"); exit(1); } hl = strlen(f->f_host); if (hl > 0 && f->f_host[hl-1] == '.') f->f_host[--hl] = '\0'; trimdomain(f->f_host, hl); } /* save program name if any */ if (prog && *prog == '*') prog = NULL; if (prog) { f->f_program = strdup(prog); if (f->f_program == NULL) { logerror("strdup"); exit(1); } } /* scan through the list of selectors */ for (p = line; *p && *p != '\t' && *p != ' ';) { int pri_done; int pri_cmp; int pri_invert; /* find the end of this facility name list */ for (q = p; *q && *q != '\t' && *q != ' ' && *q++ != '.'; ) continue; /* get the priority comparison */ pri_cmp = 0; pri_done = 0; pri_invert = 0; if (*q == '!') { pri_invert = 1; q++; } while (!pri_done) { switch (*q) { case '<': pri_cmp |= PRI_LT; q++; break; case '=': pri_cmp |= PRI_EQ; q++; break; case '>': pri_cmp |= PRI_GT; q++; break; default: pri_done++; break; } } /* collect priority name */ for (bp = buf; *q && !strchr("\t,; ", *q); ) *bp++ = *q++; *bp = '\0'; /* skip cruft */ while (strchr(",;", *q)) q++; /* decode priority name */ if (*buf == '*') { pri = LOG_PRIMASK; pri_cmp = PRI_LT | PRI_EQ | PRI_GT; } else { /* Ignore trailing spaces. */ for (i = strlen(buf) - 1; i >= 0 && buf[i] == ' '; i--) buf[i] = '\0'; pri = decode(buf, prioritynames); if (pri < 0) { errno = 0; (void)snprintf(ebuf, sizeof ebuf, "unknown priority name \"%s\"", buf); logerror(ebuf); return; } } if (!pri_cmp) pri_cmp = (UniquePriority) ? (PRI_EQ) : (PRI_EQ | PRI_GT) ; if (pri_invert) pri_cmp ^= PRI_LT | PRI_EQ | PRI_GT; /* scan facilities */ while (*p && !strchr("\t.; ", *p)) { for (bp = buf; *p && !strchr("\t,;. ", *p); ) *bp++ = *p++; *bp = '\0'; if (*buf == '*') { for (i = 0; i < LOG_NFACILITIES; i++) { f->f_pmask[i] = pri; f->f_pcmp[i] = pri_cmp; } } else { i = decode(buf, facilitynames); if (i < 0) { errno = 0; (void)snprintf(ebuf, sizeof ebuf, "unknown facility name \"%s\"", buf); logerror(ebuf); return; } f->f_pmask[i >> 3] = pri; f->f_pcmp[i >> 3] = pri_cmp; } while (*p == ',' || *p == ' ') p++; } p = q; } /* skip to action part */ while (*p == '\t' || *p == ' ') p++; if (*p == '-') { syncfile = 0; p++; } else syncfile = 1; switch (*p) { case '@': { char *tp; char endkey = ':'; /* * scan forward to see if there is a port defined. * so we can't use strlcpy.. */ i = sizeof(f->f_un.f_forw.f_hname); tp = f->f_un.f_forw.f_hname; p++; /* * an ipv6 address should start with a '[' in that case * we should scan for a ']' */ if (*p == '[') { p++; endkey = ']'; } while (*p && (*p != endkey) && (i-- > 0)) { *tp++ = *p++; } if (endkey == ']' && *p == endkey) p++; *tp = '\0'; } /* See if we copied a domain and have a port */ if (*p == ':') p++; else p = NULL; memset(&hints, 0, sizeof(hints)); hints.ai_family = family; hints.ai_socktype = SOCK_DGRAM; error = getaddrinfo(f->f_un.f_forw.f_hname, p ? p : "syslog", &hints, &res); if (error) { logerror(gai_strerror(error)); break; } f->f_un.f_forw.f_addr = res; f->f_type = F_FORW; break; case '/': if ((f->f_file = open(p, logflags, 0600)) < 0) { f->f_type = F_UNUSED; logerror(p); break; } if (syncfile) f->f_flags |= FFLAG_SYNC; if (isatty(f->f_file)) { if (strcmp(p, ctty) == 0) f->f_type = F_CONSOLE; else f->f_type = F_TTY; (void)strlcpy(f->f_un.f_fname, p + sizeof(_PATH_DEV) - 1, sizeof(f->f_un.f_fname)); } else { (void)strlcpy(f->f_un.f_fname, p, sizeof(f->f_un.f_fname)); f->f_type = F_FILE; } break; case '|': f->f_un.f_pipe.f_pid = 0; (void)strlcpy(f->f_un.f_pipe.f_pname, p + 1, sizeof(f->f_un.f_pipe.f_pname)); f->f_type = F_PIPE; break; case '*': f->f_type = F_WALL; break; default: for (i = 0; i < MAXUNAMES && *p; i++) { for (q = p; *q && *q != ','; ) q++; (void)strncpy(f->f_un.f_uname[i], p, MAXLOGNAME - 1); if ((q - p) >= MAXLOGNAME) f->f_un.f_uname[i][MAXLOGNAME - 1] = '\0'; else f->f_un.f_uname[i][q - p] = '\0'; while (*q == ',' || *q == ' ') q++; p = q; } f->f_type = F_USERS; break; } } /* * Decode a symbolic name to a numeric value */ static int decode(const char *name, const CODE *codetab) { const CODE *c; char *p, buf[40]; if (isdigit(*name)) return (atoi(name)); for (p = buf; *name && p < &buf[sizeof(buf) - 1]; p++, name++) { if (isupper(*name)) *p = tolower(*name); else *p = *name; } *p = '\0'; for (c = codetab; c->c_name; c++) if (!strcmp(buf, c->c_name)) return (c->c_val); return (-1); } static void markit(void) { struct filed *f; dq_t q, next; now = time((time_t *)NULL); MarkSeq += TIMERINTVL; if (MarkSeq >= MarkInterval) { logmsg(LOG_INFO, "-- MARK --", LocalHostName, ADDDATE|MARK); MarkSeq = 0; } for (f = Files; f; f = f->f_next) { if (f->f_prevcount && now >= REPEATTIME(f)) { dprintf("flush %s: repeated %d times, %d sec.\n", TypeNames[f->f_type], f->f_prevcount, repeatinterval[f->f_repeatcount]); fprintlog(f, 0, (char *)NULL); BACKOFF(f); } } /* Walk the dead queue, and see if we should signal somebody. */ for (q = TAILQ_FIRST(&deadq_head); q != NULL; q = next) { next = TAILQ_NEXT(q, dq_entries); switch (q->dq_timeout) { case 0: /* Already signalled once, try harder now. */ if (kill(q->dq_pid, SIGKILL) != 0) (void)deadq_remove(q->dq_pid); break; case 1: /* * Timed out on dead queue, send terminate * signal. Note that we leave the removal * from the dead queue to reapchild(), which * will also log the event (unless the process * didn't even really exist, in case we simply * drop it from the dead queue). */ if (kill(q->dq_pid, SIGTERM) != 0) (void)deadq_remove(q->dq_pid); /* FALLTHROUGH */ default: q->dq_timeout--; } } MarkSet = 0; (void)alarm(TIMERINTVL); } /* * fork off and become a daemon, but wait for the child to come online * before returing to the parent, or we get disk thrashing at boot etc. * Set a timer so we don't hang forever if it wedges. */ static int waitdaemon(int nochdir, int noclose, int maxwait) { int fd; int status; pid_t pid, childpid; switch (childpid = fork()) { case -1: return (-1); case 0: break; default: signal(SIGALRM, timedout); alarm(maxwait); while ((pid = wait3(&status, 0, NULL)) != -1) { if (WIFEXITED(status)) errx(1, "child pid %d exited with return code %d", pid, WEXITSTATUS(status)); if (WIFSIGNALED(status)) errx(1, "child pid %d exited on signal %d%s", pid, WTERMSIG(status), WCOREDUMP(status) ? " (core dumped)" : ""); if (pid == childpid) /* it's gone... */ break; } exit(0); } if (setsid() == -1) return (-1); if (!nochdir) (void)chdir("/"); if (!noclose && (fd = open(_PATH_DEVNULL, O_RDWR, 0)) != -1) { (void)dup2(fd, STDIN_FILENO); (void)dup2(fd, STDOUT_FILENO); (void)dup2(fd, STDERR_FILENO); if (fd > 2) (void)close (fd); } return (getppid()); } /* * We get a SIGALRM from the child when it's running and finished doing it's * fsync()'s or O_SYNC writes for all the boot messages. * * We also get a signal from the kernel if the timer expires, so check to * see what happened. */ static void timedout(int sig __unused) { int left; left = alarm(0); signal(SIGALRM, SIG_DFL); if (left == 0) errx(1, "timed out waiting for child"); else _exit(0); } /* * Add `s' to the list of allowable peer addresses to accept messages * from. * * `s' is a string in the form: * * [*]domainname[:{servicename|portnumber|*}] * * or * * netaddr/maskbits[:{servicename|portnumber|*}] * * Returns -1 on error, 0 if the argument was valid. */ static int allowaddr(char *s) { char *cp1, *cp2; struct allowedpeer ap; struct servent *se; int masklen = -1; struct addrinfo hints, *res; struct in_addr *addrp, *maskp; #ifdef INET6 int i; u_int32_t *addr6p, *mask6p; #endif char ip[NI_MAXHOST]; #ifdef INET6 if (*s != '[' || (cp1 = strchr(s + 1, ']')) == NULL) #endif cp1 = s; if ((cp1 = strrchr(cp1, ':'))) { /* service/port provided */ *cp1++ = '\0'; if (strlen(cp1) == 1 && *cp1 == '*') /* any port allowed */ ap.port = 0; else if ((se = getservbyname(cp1, "udp"))) { ap.port = ntohs(se->s_port); } else { ap.port = strtol(cp1, &cp2, 0); if (*cp2 != '\0') return (-1); /* port not numeric */ } } else { if ((se = getservbyname("syslog", "udp"))) ap.port = ntohs(se->s_port); else /* sanity, should not happen */ ap.port = 514; } if ((cp1 = strchr(s, '/')) != NULL && strspn(cp1 + 1, "0123456789") == strlen(cp1 + 1)) { *cp1 = '\0'; if ((masklen = atoi(cp1 + 1)) < 0) return (-1); } #ifdef INET6 if (*s == '[') { cp2 = s + strlen(s) - 1; if (*cp2 == ']') { ++s; *cp2 = '\0'; } else { cp2 = NULL; } } else { cp2 = NULL; } #endif memset(&hints, 0, sizeof(hints)); hints.ai_family = PF_UNSPEC; hints.ai_socktype = SOCK_DGRAM; hints.ai_flags = AI_PASSIVE | AI_NUMERICHOST; if (getaddrinfo(s, NULL, &hints, &res) == 0) { ap.isnumeric = 1; memcpy(&ap.a_addr, res->ai_addr, res->ai_addrlen); memset(&ap.a_mask, 0, sizeof(ap.a_mask)); ap.a_mask.ss_family = res->ai_family; if (res->ai_family == AF_INET) { ap.a_mask.ss_len = sizeof(struct sockaddr_in); maskp = &((struct sockaddr_in *)&ap.a_mask)->sin_addr; addrp = &((struct sockaddr_in *)&ap.a_addr)->sin_addr; if (masklen < 0) { /* use default netmask */ if (IN_CLASSA(ntohl(addrp->s_addr))) maskp->s_addr = htonl(IN_CLASSA_NET); else if (IN_CLASSB(ntohl(addrp->s_addr))) maskp->s_addr = htonl(IN_CLASSB_NET); else maskp->s_addr = htonl(IN_CLASSC_NET); } else if (masklen <= 32) { /* convert masklen to netmask */ if (masklen == 0) maskp->s_addr = 0; else maskp->s_addr = htonl(~((1 << (32 - masklen)) - 1)); } else { freeaddrinfo(res); return (-1); } /* Lose any host bits in the network number. */ addrp->s_addr &= maskp->s_addr; } #ifdef INET6 else if (res->ai_family == AF_INET6 && masklen <= 128) { ap.a_mask.ss_len = sizeof(struct sockaddr_in6); if (masklen < 0) masklen = 128; mask6p = (u_int32_t *)&((struct sockaddr_in6 *)&ap.a_mask)->sin6_addr; /* convert masklen to netmask */ while (masklen > 0) { if (masklen < 32) { *mask6p = htonl(~(0xffffffff >> masklen)); break; } *mask6p++ = 0xffffffff; masklen -= 32; } /* Lose any host bits in the network number. */ mask6p = (u_int32_t *)&((struct sockaddr_in6 *)&ap.a_mask)->sin6_addr; addr6p = (u_int32_t *)&((struct sockaddr_in6 *)&ap.a_addr)->sin6_addr; for (i = 0; i < 4; i++) addr6p[i] &= mask6p[i]; } #endif else { freeaddrinfo(res); return (-1); } freeaddrinfo(res); } else { /* arg `s' is domain name */ ap.isnumeric = 0; ap.a_name = s; if (cp1) *cp1 = '/'; #ifdef INET6 if (cp2) { *cp2 = ']'; --s; } #endif } if (Debug) { printf("allowaddr: rule %d: ", NumAllowed); if (ap.isnumeric) { printf("numeric, "); getnameinfo((struct sockaddr *)&ap.a_addr, ((struct sockaddr *)&ap.a_addr)->sa_len, ip, sizeof ip, NULL, 0, NI_NUMERICHOST); printf("addr = %s, ", ip); getnameinfo((struct sockaddr *)&ap.a_mask, ((struct sockaddr *)&ap.a_mask)->sa_len, ip, sizeof ip, NULL, 0, NI_NUMERICHOST); printf("mask = %s; ", ip); } else { printf("domainname = %s; ", ap.a_name); } printf("port = %d\n", ap.port); } if ((AllowedPeers = realloc(AllowedPeers, ++NumAllowed * sizeof(struct allowedpeer))) == NULL) { logerror("realloc"); exit(1); } memcpy(&AllowedPeers[NumAllowed - 1], &ap, sizeof(struct allowedpeer)); return (0); } /* * Validate that the remote peer has permission to log to us. */ static int validate(struct sockaddr *sa, const char *hname) { int i; size_t l1, l2; char *cp, name[NI_MAXHOST], ip[NI_MAXHOST], port[NI_MAXSERV]; struct allowedpeer *ap; struct sockaddr_in *sin4, *a4p = NULL, *m4p = NULL; #ifdef INET6 int j, reject; struct sockaddr_in6 *sin6, *a6p = NULL, *m6p = NULL; #endif struct addrinfo hints, *res; u_short sport; if (NumAllowed == 0) /* traditional behaviour, allow everything */ return (1); (void)strlcpy(name, hname, sizeof(name)); memset(&hints, 0, sizeof(hints)); hints.ai_family = PF_UNSPEC; hints.ai_socktype = SOCK_DGRAM; hints.ai_flags = AI_PASSIVE | AI_NUMERICHOST; if (getaddrinfo(name, NULL, &hints, &res) == 0) freeaddrinfo(res); else if (strchr(name, '.') == NULL) { strlcat(name, ".", sizeof name); strlcat(name, LocalDomain, sizeof name); } if (getnameinfo(sa, sa->sa_len, ip, sizeof ip, port, sizeof port, NI_NUMERICHOST | NI_NUMERICSERV) != 0) return (0); /* for safety, should not occur */ dprintf("validate: dgram from IP %s, port %s, name %s;\n", ip, port, name); sport = atoi(port); /* now, walk down the list */ for (i = 0, ap = AllowedPeers; i < NumAllowed; i++, ap++) { if (ap->port != 0 && ap->port != sport) { dprintf("rejected in rule %d due to port mismatch.\n", i); continue; } if (ap->isnumeric) { if (ap->a_addr.ss_family != sa->sa_family) { dprintf("rejected in rule %d due to address family mismatch.\n", i); continue; } if (ap->a_addr.ss_family == AF_INET) { sin4 = (struct sockaddr_in *)sa; a4p = (struct sockaddr_in *)&ap->a_addr; m4p = (struct sockaddr_in *)&ap->a_mask; if ((sin4->sin_addr.s_addr & m4p->sin_addr.s_addr) != a4p->sin_addr.s_addr) { dprintf("rejected in rule %d due to IP mismatch.\n", i); continue; } } #ifdef INET6 else if (ap->a_addr.ss_family == AF_INET6) { sin6 = (struct sockaddr_in6 *)sa; a6p = (struct sockaddr_in6 *)&ap->a_addr; m6p = (struct sockaddr_in6 *)&ap->a_mask; if (a6p->sin6_scope_id != 0 && sin6->sin6_scope_id != a6p->sin6_scope_id) { dprintf("rejected in rule %d due to scope mismatch.\n", i); continue; } reject = 0; for (j = 0; j < 16; j += 4) { if ((*(u_int32_t *)&sin6->sin6_addr.s6_addr[j] & *(u_int32_t *)&m6p->sin6_addr.s6_addr[j]) != *(u_int32_t *)&a6p->sin6_addr.s6_addr[j]) { ++reject; break; } } if (reject) { dprintf("rejected in rule %d due to IP mismatch.\n", i); continue; } } #endif else continue; } else { cp = ap->a_name; l1 = strlen(name); if (*cp == '*') { /* allow wildmatch */ cp++; l2 = strlen(cp); if (l2 > l1 || memcmp(cp, &name[l1 - l2], l2) != 0) { dprintf("rejected in rule %d due to name mismatch.\n", i); continue; } } else { /* exact match */ l2 = strlen(cp); if (l2 != l1 || memcmp(cp, name, l1) != 0) { dprintf("rejected in rule %d due to name mismatch.\n", i); continue; } } } dprintf("accepted in rule %d.\n", i); return (1); /* hooray! */ } return (0); } /* * Fairly similar to popen(3), but returns an open descriptor, as * opposed to a FILE *. */ static int p_open(const char *prog, pid_t *rpid) { int pfd[2], nulldesc; pid_t pid; sigset_t omask, mask; char *argv[4]; /* sh -c cmd NULL */ char errmsg[200]; if (pipe(pfd) == -1) return (-1); if ((nulldesc = open(_PATH_DEVNULL, O_RDWR)) == -1) /* we are royally screwed anyway */ return (-1); sigemptyset(&mask); sigaddset(&mask, SIGALRM); sigaddset(&mask, SIGHUP); sigprocmask(SIG_BLOCK, &mask, &omask); switch ((pid = fork())) { case -1: sigprocmask(SIG_SETMASK, &omask, 0); close(nulldesc); return (-1); case 0: argv[0] = strdup("sh"); argv[1] = strdup("-c"); argv[2] = strdup(prog); argv[3] = NULL; if (argv[0] == NULL || argv[1] == NULL || argv[2] == NULL) { logerror("strdup"); exit(1); } alarm(0); (void)setsid(); /* Avoid catching SIGHUPs. */ /* * Throw away pending signals, and reset signal * behaviour to standard values. */ signal(SIGALRM, SIG_IGN); signal(SIGHUP, SIG_IGN); sigprocmask(SIG_SETMASK, &omask, 0); signal(SIGPIPE, SIG_DFL); signal(SIGQUIT, SIG_DFL); signal(SIGALRM, SIG_DFL); signal(SIGHUP, SIG_DFL); dup2(pfd[0], STDIN_FILENO); dup2(nulldesc, STDOUT_FILENO); dup2(nulldesc, STDERR_FILENO); closefrom(3); (void)execvp(_PATH_BSHELL, argv); _exit(255); } sigprocmask(SIG_SETMASK, &omask, 0); close(nulldesc); close(pfd[0]); /* * Avoid blocking on a hung pipe. With O_NONBLOCK, we are * supposed to get an EWOULDBLOCK on writev(2), which is * caught by the logic above anyway, which will in turn close * the pipe, and fork a new logging subprocess if necessary. * The stale subprocess will be killed some time later unless * it terminated itself due to closing its input pipe (so we * get rid of really dead puppies). */ if (fcntl(pfd[1], F_SETFL, O_NONBLOCK) == -1) { /* This is bad. */ (void)snprintf(errmsg, sizeof errmsg, "Warning: cannot change pipe to PID %d to " "non-blocking behaviour.", (int)pid); logerror(errmsg); } *rpid = pid; return (pfd[1]); } static void deadq_enter(pid_t pid, const char *name) { dq_t p; int status; /* * Be paranoid, if we can't signal the process, don't enter it * into the dead queue (perhaps it's already dead). If possible, * we try to fetch and log the child's status. */ if (kill(pid, 0) != 0) { if (waitpid(pid, &status, WNOHANG) > 0) log_deadchild(pid, status, name); return; } p = malloc(sizeof(struct deadq_entry)); if (p == NULL) { logerror("malloc"); exit(1); } p->dq_pid = pid; p->dq_timeout = DQ_TIMO_INIT; TAILQ_INSERT_TAIL(&deadq_head, p, dq_entries); } static int deadq_remove(pid_t pid) { dq_t q; TAILQ_FOREACH(q, &deadq_head, dq_entries) { if (q->dq_pid == pid) { TAILQ_REMOVE(&deadq_head, q, dq_entries); free(q); return (1); } } return (0); } static void log_deadchild(pid_t pid, int status, const char *name) { int code; char buf[256]; const char *reason; errno = 0; /* Keep strerror() stuff out of logerror messages. */ if (WIFSIGNALED(status)) { reason = "due to signal"; code = WTERMSIG(status); } else { reason = "with status"; code = WEXITSTATUS(status); if (code == 0) return; } (void)snprintf(buf, sizeof buf, "Logging subprocess %d (%s) exited %s %d.", pid, name, reason, code); logerror(buf); } static int * socksetup(int af, char *bindhostname) { struct addrinfo hints, *res, *r; const char *bindservice; char *cp; int error, maxs, *s, *socks; /* * We have to handle this case for backwards compatibility: * If there are two (or more) colons but no '[' and ']', * assume this is an inet6 address without a service. */ bindservice = "syslog"; if (bindhostname != NULL) { #ifdef INET6 if (*bindhostname == '[' && (cp = strchr(bindhostname + 1, ']')) != NULL) { ++bindhostname; *cp = '\0'; if (cp[1] == ':' && cp[2] != '\0') bindservice = cp + 2; } else { #endif cp = strchr(bindhostname, ':'); if (cp != NULL && strchr(cp + 1, ':') == NULL) { *cp = '\0'; if (cp[1] != '\0') bindservice = cp + 1; if (cp == bindhostname) bindhostname = NULL; } #ifdef INET6 } #endif } memset(&hints, 0, sizeof(hints)); hints.ai_flags = AI_PASSIVE; hints.ai_family = af; hints.ai_socktype = SOCK_DGRAM; error = getaddrinfo(bindhostname, bindservice, &hints, &res); if (error) { logerror(gai_strerror(error)); errno = 0; die(0); } /* Count max number of sockets we may open */ for (maxs = 0, r = res; r; r = r->ai_next, maxs++); socks = malloc((maxs+1) * sizeof(int)); if (socks == NULL) { logerror("couldn't allocate memory for sockets"); die(0); } *socks = 0; /* num of sockets counter at start of array */ s = socks + 1; for (r = res; r; r = r->ai_next) { int on = 1; *s = socket(r->ai_family, r->ai_socktype, r->ai_protocol); if (*s < 0) { logerror("socket"); continue; } #ifdef INET6 if (r->ai_family == AF_INET6) { if (setsockopt(*s, IPPROTO_IPV6, IPV6_V6ONLY, (char *)&on, sizeof (on)) < 0) { logerror("setsockopt"); close(*s); continue; } } #endif if (setsockopt(*s, SOL_SOCKET, SO_REUSEADDR, (char *)&on, sizeof (on)) < 0) { logerror("setsockopt"); close(*s); continue; } /* * RFC 3164 recommends that client side message * should come from the privileged syslogd port. * * If the system administrator choose not to obey * this, we can skip the bind() step so that the * system will choose a port for us. */ if (!NoBind) { if (bind(*s, r->ai_addr, r->ai_addrlen) < 0) { logerror("bind"); close(*s); continue; } if (!SecureMode) increase_rcvbuf(*s); } (*socks)++; s++; } if (*socks == 0) { free(socks); if (Debug) return (NULL); else die(0); } if (res) freeaddrinfo(res); return (socks); } static void increase_rcvbuf(int fd) { socklen_t len, slen; slen = sizeof(len); if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &len, &slen) == 0) { if (len < RCVBUF_MINSIZE) { len = RCVBUF_MINSIZE; setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &len, sizeof(len)); } } } Index: projects/building-blocks =================================================================== --- projects/building-blocks (revision 278311) +++ projects/building-blocks (revision 278312) Property changes on: projects/building-blocks ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r278256-278311