Index: projects/clang1100-import/lib/libcasper/services/Makefile =================================================================== --- projects/clang1100-import/lib/libcasper/services/Makefile (revision 364278) +++ projects/clang1100-import/lib/libcasper/services/Makefile (revision 364279) @@ -1,16 +1,17 @@ # $FreeBSD$ .include SUBDIR= cap_dns SUBDIR+= cap_fileargs SUBDIR+= cap_grp +SUBDIR+= cap_net SUBDIR+= cap_pwd SUBDIR+= cap_sysctl SUBDIR+= cap_syslog SUBDIR.${MK_TESTS}+= tests SUBDIR_PARALLEL= .include Index: projects/clang1100-import/lib/libcasper/services/cap_dns/Makefile =================================================================== --- projects/clang1100-import/lib/libcasper/services/cap_dns/Makefile (revision 364278) +++ projects/clang1100-import/lib/libcasper/services/cap_dns/Makefile (revision 364279) @@ -1,38 +1,33 @@ # $FreeBSD$ SHLIBDIR?= /lib/casper .include PACKAGE= runtime SHLIB_MAJOR= 2 INCSDIR?= ${INCLUDEDIR}/casper .if ${MK_CASPER} != "no" SHLIB= cap_dns SRCS= cap_dns.c .endif INCS= cap_dns.h LIBADD= nv CFLAGS+=-I${.CURDIR} HAS_TESTS= SUBDIR.${MK_TESTS}+= tests MAN+= cap_dns.3 MLINKS+=cap_dns.3 libcap_dns.3 -MLINKS+=cap_dns.3 cap_gethostbyname.3 -MLINKS+=cap_dns.3 cap_gethostbyname2.3 -MLINKS+=cap_dns.3 cap_gethostbyaddr.3 -MLINKS+=cap_dns.3 cap_getaddrinfo.3 -MLINKS+=cap_dns.3 cap_getnameinfo.3 MLINKS+=cap_dns.3 cap_dns_type_limit.3 MLINKS+=cap_dns.3 cap_dns_family_limit.3 .include Index: projects/clang1100-import/lib/libcasper/services/cap_dns/cap_dns.3 =================================================================== --- projects/clang1100-import/lib/libcasper/services/cap_dns/cap_dns.3 (revision 364278) +++ projects/clang1100-import/lib/libcasper/services/cap_dns/cap_dns.3 (revision 364279) @@ -1,239 +1,242 @@ .\" Copyright (c) 2018 Mariusz Zaborski .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" .\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd May 5, 2020 +.Dd August 15, 2020 .Dt CAP_DNS 3 .Os .Sh NAME .Nm cap_getaddrinfo , .Nm cap_getnameinfo , .Nm cap_gethostbyname , .Nm cap_gethostbyname2 , .Nm cap_gethostbyaddr , .Nm cap_dns_type_limit , .Nm cap_dns_family_limit .Nd "library for getting network host entry in capability mode" .Sh LIBRARY .Lb libcap_dns .Sh SYNOPSIS .In sys/nv.h .In libcasper.h .In casper/cap_dns.h .Ft int .Fn cap_getaddrinfo "cap_channel_t *chan" "const char *hostname" "const char *servname" "const struct addrinfo *hints" "struct addrinfo **res" .Ft int .Fn cap_getnameinfo "cap_channel_t *chan" "const struct sockaddr *sa" "socklen_t salen" "char *host" "size_t hostlen" "char *serv" "size_t servlen" "int flags" .Ft "struct hostent *" .Fn cap_gethostbyname "const cap_channel_t *chan" "const char *name" .Ft "struct hostent *" .Fn cap_gethostbyname2 "const cap_channel_t *chan" "const char *name" "int af" .Ft "struct hostent *" .Fn cap_gethostbyaddr "const cap_channel_t *chan" "const void *addr" "socklen_t len" "int af" .Ft "int" .Fn cap_dns_type_limit "cap_channel_t *chan" "const char * const *types" "size_t ntypes" .Ft "int" .Fn cap_dns_family_limit "const cap_channel_t *chan" "const int *families" "size_t nfamilies" .Sh DESCRIPTION .Bf -symbolic +This service is obsolete and +.Xr cap_net 3 +should be used instead. The .Fn cap_getaddrinfo , and .Fn cap_getnameinfo , functions are preferred over the .Fn cap_gethostbyname , .Fn cap_gethostbyname2 , and .Fn cap_gethostbyaddr functions. .Ef .Pp The functions .Fn cap_gethostbyname , .Fn cap_gethostbyname2 , .Fn cep_gethostbyaddr and .Fn cap_getnameinfo are respectively equivalent to .Xr gethostbyname 3 , .Xr gethostbyname2 3 , .Xr gethostbyaddr 3 and .Xr getnameinfo 3 except that the connection to the .Nm system.dns service needs to be provided. .Pp The .Fn cap_dns_type_limit function limits the functions allowed in the service. The .Fa types variable can be set to .Dv ADDR2NAME or .Dv NAME2ADDR . See the .Sx LIMITS section for more details. The .Fa ntpyes variable contains the number of .Fa types provided. .Pp The .Fn cap_dns_family_limit functions allows to limit address families. For details see .Sx LIMITS . The .Fa nfamilies variable contains the number of .Fa families provided. .Sh LIMITS The preferred way of setting limits is to use the .Fn cap_dns_type_limit and .Fn cap_dns_family_limit functions, but the limits of service can be set also using .Xr cap_limit_set 3 . The .Xr nvlist 9 for that function can contain the following values and types: .Bl -ohang -offset indent .It type ( NV_TYPE_STRING ) The .Va type can have two values: .Dv ADDR2NAME or .Dv NAME2ADDR . The .Dv ADDR2NAME means that reverse DNS lookups are allowed with .Fn cap_getnameinfo and .Fn cap_gethostbyaddr functions. In case when .Va type is set to .Dv NAME2ADDR the name resolution is allowed with .Fn cap_getaddrinfo , .Fn cap_gethostbyname , and .Fn cap_gethostbyname2 functions. .It family ( NV_TYPE_NUMBER ) The .Va family limits service to one of the address families (e.g. .Dv AF_INET , AF_INET6 , etc.). .Sh EXAMPLES The following example first opens a capability to casper and then uses this capability to create the .Nm system.dns casper service and uses it to resolve an IP address. .Bd -literal cap_channel_t *capcas, *capdns; int familylimit, error; const char *ipstr = "127.0.0.1"; const char *typelimit = "ADDR2NAME"; char hname[NI_MAXHOST]; struct addrinfo hints, *res; /* Open capability to Casper. */ capcas = cap_init(); if (capcas == NULL) err(1, "Unable to contact Casper"); /* Cache NLA for gai_strerror. */ caph_cache_catpages(); /* Enter capability mode sandbox. */ if (caph_enter() < 0) err(1, "Unable to enter capability mode"); /* Use Casper capability to create capability to the system.dns service. */ capdns = cap_service_open(capcas, "system.dns"); if (capdns == NULL) err(1, "Unable to open system.dns service"); /* Close Casper capability, we don't need it anymore. */ cap_close(capcas); /* Limit system.dns to reserve IPv4 addresses */ familylimit = AF_INET; if (cap_dns_family_limit(capdns, &familylimit, 1) < 0) err(1, "Unable to limit access to the system.dns service"); /* Convert IP address in C-string to struct sockaddr. */ memset(&hints, 0, sizeof(hints)); hints.ai_family = familylimit; hints.ai_flags = AI_NUMERICHOST; error = cap_getaddrinfo(capdns, ipstr, NULL, &hints, &res); if (error != 0) errx(1, "cap_getaddrinfo(): %s: %s", ipstr, gai_strerror(error)); /* Limit system.dns to reverse DNS lookups. */ if (cap_dns_type_limit(capdns, &typelimit, 1) < 0) err(1, "Unable to limit access to the system.dns service"); /* Find hostname for the given IP address. */ error = cap_getnameinfo(capdns, res->ai_addr, res->ai_addrlen, hname, sizeof(hname), NULL, 0, 0); if (error != 0) errx(1, "cap_getnameinfo(): %s: %s", ipstr, gai_strerror(error)); printf("Name associated with %s is %s.\\n", ipstr, hname); .Ed .Sh SEE ALSO .Xr cap_enter 2 , .Xr caph_enter 3 , .Xr err 3 , .Xr gethostbyaddr 3 , .Xr gethostbyname 3 , .Xr gethostbyname2 3 , .Xr getnameinfo 3 , .Xr capsicum 4 , .Xr nv 9 .Sh HISTORY The .Nm cap_dns service first appeared in .Fx 10.3 . .Sh AUTHORS The .Nm cap_dns service was implemented by .An Pawel Jakub Dawidek Aq Mt pawel@dawidek.net under sponsorship from the FreeBSD Foundation. .Pp This manual page was written by .An Mariusz Zaborski Aq Mt oshogbo@FreeBSD.org . Index: projects/clang1100-import/lib/libcasper/services/cap_net/Makefile =================================================================== --- projects/clang1100-import/lib/libcasper/services/cap_net/Makefile (nonexistent) +++ projects/clang1100-import/lib/libcasper/services/cap_net/Makefile (revision 364279) @@ -0,0 +1,48 @@ +# $FreeBSD$ + +SHLIBDIR?= /lib/casper + +.include + +PACKAGE=libcasper + +SHLIB_MAJOR= 1 +INCSDIR?= ${INCLUDEDIR}/casper + +.if ${MK_CASPER} != "no" +SHLIB= cap_net + +SRCS= cap_net.c +.endif + +INCS= cap_net.h + +LIBADD= nv + +CFLAGS+=-I${.CURDIR} +CFLAGS+=-DWITH_CASPER + +HAS_TESTS= +SUBDIR.${MK_TESTS}+= tests + +MAN+= cap_net.3 + +MLINKS+=cap_net.3 libcap_net.3 +MLINKS+=cap_net.3 cap_bind.3 +MLINKS+=cap_net.3 cap_connect.3 +MLINKS+=cap_net.3 cap_net_free.3 +MLINKS+=cap_net.3 cap_net_limit.3 +MLINKS+=cap_net.3 cap_net_limit_addr2name.3 +MLINKS+=cap_net.3 cap_net_limit_addr2name_family.3 +MLINKS+=cap_net.3 cap_net_limit_bind.3 +MLINKS+=cap_net.3 cap_net_limit_connect.3 +MLINKS+=cap_net.3 cap_net_limit_init.3 +MLINKS+=cap_net.3 cap_net_limit_name2addr.3 +MLINKS+=cap_net.3 cap_net_limit_name2addr_family.3 +MLINKS+=cap_net.3 cap_getaddrinfo.3 +MLINKS+=cap_net.3 cap_gethostbyaddr.3 +MLINKS+=cap_net.3 cap_gethostbyname.3 +MLINKS+=cap_net.3 cap_gethostbyname2.3 +MLINKS+=cap_net.3 cap_getnameinfo.3 + +.include Property changes on: projects/clang1100-import/lib/libcasper/services/cap_net/Makefile ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/clang1100-import/lib/libcasper/services/cap_net/cap_net.3 =================================================================== --- projects/clang1100-import/lib/libcasper/services/cap_net/cap_net.3 (nonexistent) +++ projects/clang1100-import/lib/libcasper/services/cap_net/cap_net.3 (revision 364279) @@ -0,0 +1,287 @@ +.\" Copyright (c) 2020 Mariusz Zaborski +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd August 15, 2020 +.Dt CAP_NET 3 +.Os +.Sh NAME +.Nm cap_bind , +.Nm cap_connect , +.Nm cap_getaddrinfo , +.Nm cap_gethostbyaddr , +.Nm cap_gethostbyname , +.Nm cap_gethostbyname2 , +.Nm cap_getnameinfo , +.Nm cap_net_free , +.Nm cap_net_limit , +.Nm cap_net_limit_addr2name , +.Nm cap_net_limit_addr2name_family , +.Nm cap_net_limit_bind , +.Nm cap_net_limit_connect , +.Nm cap_net_limit_init , +.Nm cap_net_limit_name2addr , +.Nm cap_net_limit_name2addr_family , +.Nd "library for networking in capability mode" +.Sh LIBRARY +.Lb libcap_net +.Sh SYNOPSIS +.In sys/nv.h +.In libcasper.h +.In casper/cap_net.h +.Ft int +.Fn cap_bind "cap_channel_t *chan" "int s" "const struct sockaddr *addr" "socklen_t addrlen" +.Ft int +.Fn cap_connect "cap_channel_t *chan" "int s" "const struct sockaddr *name" "socklen_t namelen" +.Ft int +.Fn cap_getaddrinfo "cap_channel_t *chan" "const char *hostname" "const char *servname" "const struct addrinfo *hints" "struct addrinfo **res" +.Ft int +.Fn cap_getnameinfo "cap_channel_t *chan" "const struct sockaddr *sa" "socklen_t salen" "char *host" "size_t hostlen" "char *serv" "size_t servlen" "int flags" +.Ft "struct hostent *" +.Fn cap_gethostbyname "const cap_channel_t *chan" "const char *name" +.Ft "struct hostent *" +.Fn cap_gethostbyname2 "const cap_channel_t *chan" "const char *name" "int af" +.Ft "struct hostent *" +.Fn cap_gethostbyaddr "const cap_channel_t *chan" "const void *addr" "socklen_t len" "int af" +.Ft "cap_net_limit_t *" +.Fn cap_net_limit_init "cap_channel_t *chan" "uint64_t mode" +.Ft int +.Fn cap_net_limit "cap_net_limit_t *limit" +.Ft void +.Fn cap_net_free "cap_net_limit_t *limit" +.Ft "cap_net_limit_t *" +.Fn cap_net_limit_addr2name_family "cap_net_limit_t *limit" "int *family" "size_t size" +.Ft "cap_net_limit_t *" +.Fn cap_net_limit_addr2name "cap_net_limit_t *limit" "const struct sockaddr *sa" "socklen_t salen" +.Ft "cap_net_limit_t *" +.Fn cap_net_limit_name2addr_family "cap_net_limit_t *limit" "int *family" "size_t size" +.Ft "cap_net_limit_t *" +.Fn cap_net_limit_name2addr "cap_net_limit_t *limit" "const char *name" "const char *serv" +.Ft "cap_net_limit_t *" +.Fn cap_net_limit_connect "cap_net_limit_t *limit" "const struct sockaddr *sa" "socklen_t salen" +.Ft "cap_net_limit_t *" +.Fn cap_net_limit_bind "cap_net_limit_t *limit" "const struct sockaddr *sa" "socklen_t salen" +.Sh DESCRIPTION +.Pp +The functions +.Fn cap_bind, +.Fn cap_connect, +.Fn cap_gethostbyname , +.Fn cap_gethostbyname2 , +.Fn cap_gethostbyaddr +and +.Fn cap_getnameinfo +are respectively equivalent to +.Xr bind 2 , +.Xr connect 2 , +.Xr gethostbyname 3 , +.Xr gethostbyname2 3 , +.Xr gethostbyaddr 3 +and +.Xr getnameinfo 3 +except that the connection to the +.Nm system.net +service needs to be provided. +.Sh LIMITS +By default, the cap_net capability provides unrestricted access to the network +namespace. +Applications typically only require access to a small portion of the network +namespace: +.Fn cap_net_limit +interface can be used to restrict access to the network. +.Fn cap_net_limit_init +returns an opaque limit handle used to store a list of capabilities. +The +.Fv mode +restricts the functionality of the service. +Modes are encoded using the following flags: +.Pp +.Bd -literal -offset indent -compact +CAPNET_ADDR2NAME reverse DNS lookups are allowed with + cap_getnameinfo +CAPNET_NAME2ADDR name resolution is allowed with + cap_getaddrinfo +CAPNET_DEPRECATED_ADDR2NAME reverse DNS lookups are allowed with + cap_gethostbyaddr +CAPNET_DEPRECATED_NAME2ADDR name resolution is allowed with + cap_gethostbyname and cap_gethostbyname2 +CAPNET_BIND bind syscall is allowed +CAPNET_CONNECT connect syscall is allowed +CAPNET_CONNECTDNS connect syscall is allowed to the values + returned from privies call to + the cap_getaddrinfo or cap_gethostbyname +.Ed +.Pp +.Fn cap_net_limit_addr2name_family +limits the +.Fn cap_getnameinfo +and +.Fn cap_gethostbyaddr +to do reverse DNS lookups to specific family (AF_INET, AF_INET6, etc.) +.Pp +.Fn cap_net_limit_addr2name +limits the +.Fn cap_getnameinfo +and +.Fn cap_gethostbyaddr +to do reverse DNS lookups only on those specific structures. +.Pp +.Fn cap_net_limit_name2addr_family +limits the +.Fn cap_getaddrinfo , +.Fn cap_gethostbyname +and +.Fn cap_gethostbyname2 +to do the name resolution on specific family (AF_INET, AF_INET6, etc.) +.Pp +.Fn cap_net_limit_addr2name +restricts +.Fn cap_getaddrinfo , +.Fn cap_gethostbyname +and +.Fn cap_gethostbyname2 +to a set of domains. +.Pp +.Fn cap_net_limit_bind +limits +.Fn cap_bind +to bind only on those specific structures. +.Pp +.Fn cap_net_limit_connect +limits +.Fn cap_connect +to connect only on those specific structures. +If the CAPNET_CONNECTDNS is set the limits are extended to the values returned +by +.Fn cap_getaddrinfo , +.Fn cap_gethostbyname +and +.Fn cap_gethostbyname2 +In case of the +.Fn cap_getaddrinfo +the restriction is strict. +In case of the +.Fn cap_gethostbyname +and +.Fn cap_gethostbyname2 +any port will be accepted in the +.Fn cap_connect +function. +.Pp +.Fn cap_net_limit +applies a set of sysctl limits to the capability, denying access to sysctl +variables not belonging to the set. +.Pp +Once a set of limits is applied, subsequent calls to +.Fn cap_net_limit +will fail unless the new set is a subset of the current set. +.Pp +The +.Fn cap_net_limit +will consume the limits. +If the +.Fn cap_net_limit +was not called the rights may be freed using +.Fn cap_net_free . +Multiple calls to +.Fn cap_net_limit_addr2name_family , +.Fn cap_net_limit_addr2name , +.Fn cap_net_limit_name2addr_family , +.Fn cap_net_limit_name2addr , +.Fn cap_net_limit_connect , +and +.Fn cap_net_limit_bind +is supported, each call is extending preview capabilities. +.Sh EXAMPLES +The following example first opens a capability to casper and then uses this +capability to create the +.Nm system.net +casper service and uses it to resolve a host and connect to it. +.Bd -literal +cap_channel_t *capcas, *capnet; +cap_net_limit_t *limit; +int familylimit, error, s; +const char *host = "example.com"; +struct addrinfo hints, *res; + +/* Open capability to Casper. */ +capcas = cap_init(); +if (capcas == NULL) + err(1, "Unable to contact Casper"); + +/* Cache NLA for gai_strerror. */ +caph_cache_catpages(); + +/* Enter capability mode sandbox. */ +if (caph_enter_casper() < 0) + err(1, "Unable to enter capability mode"); + +/* Use Casper capability to create capability to the system.net service. */ +capnet = cap_service_open(capcas, "system.net"); +if (capnet == NULL) + err(1, "Unable to open system.net service"); + +/* Close Casper capability. */ +cap_close(capcas); + +/* Limit system.net to reserve IPv4 addresses, to host example.com . */ +limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR | CAPNET_CONNECTDNS); +if (limit == NULL) + err(1, "Unable to create limits."); +cap_net_limit_name2addr(limit, host, "80"); +familylimit = AF_INET; +cap_net_limit_name2addr_family(limit, &familylimit, 1); +if (cap_net_limit(limit) < 0) + err(1, "Unable to apply limits."); + +/* Find IP addresses for the given host. */ +memset(&hints, 0, sizeof(hints)); +hints.ai_family = AF_INET; +hints.ai_socktype = SOCK_STREAM; + +error = cap_getaddrinfo(capnet, host, "80", &hints, &res); +if (error != 0) + errx(1, "cap_getaddrinfo(): %s: %s", host, gai_strerror(error)); + +s = socket(res->ai_family, res->ai_socktype, res->ai_protocol); +if (s < 0) + err(1, "Unable to create socket"); + +if (cap_connect(capnet, s, res->ai_addr, res->ai_addrlen) < 0) + err(1, "Unable to connect to host"); +.Ed +.Sh SEE ALSO +.Xr bind 2 , +.Xr cap_enter 2 , +.Xr connect 2 , +.Xr caph_enter 3 , +.Xr err 3 , +.Xr gethostbyaddr 3 , +.Xr gethostbyname 3 , +.Xr gethostbyname2 3 , +.Xr getnameinfo 3 , +.Xr capsicum 4 , +.Xr nv 9 +.Sh AUTHORS +.An Mariusz Zaborski Aq Mt oshogbo@FreeBSD.org Property changes on: projects/clang1100-import/lib/libcasper/services/cap_net/cap_net.3 ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/clang1100-import/lib/libcasper/services/cap_net/cap_net.c =================================================================== --- projects/clang1100-import/lib/libcasper/services/cap_net/cap_net.c (nonexistent) +++ projects/clang1100-import/lib/libcasper/services/cap_net/cap_net.c (revision 364279) @@ -0,0 +1,1385 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Mariusz Zaborski + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "cap_net.h" + +#define CAPNET_MASK (CAPNET_ADDR2NAME | CAPNET_NAME2ADDR \ + CAPNET_DEPRECATED_ADDR2NAME | CAPNET_DEPRECATED_NAME2ADDR | \ + CAPNET_CONNECT | CAPNET_BIND | CAPNET_CONNECTDNS) + +/* + * Defines for the names of the limits. + * XXX: we should convert all string constats to this to avoid typos. + */ +#define LIMIT_NV_BIND "bind" +#define LIMIT_NV_CONNECT "connect" +#define LIMIT_NV_ADDR2NAME "addr2name" +#define LIMIT_NV_NAME2ADDR "name2addr" + +struct cap_net_limit { + cap_channel_t *cnl_chan; + uint64_t cnl_mode; + nvlist_t *cnl_addr2name; + nvlist_t *cnl_name2addr; + nvlist_t *cnl_connect; + nvlist_t *cnl_bind; +}; + +static struct hostent hent; + +static void +hostent_free(struct hostent *hp) +{ + unsigned int ii; + + free(hp->h_name); + hp->h_name = NULL; + if (hp->h_aliases != NULL) { + for (ii = 0; hp->h_aliases[ii] != NULL; ii++) + free(hp->h_aliases[ii]); + free(hp->h_aliases); + hp->h_aliases = NULL; + } + if (hp->h_addr_list != NULL) { + for (ii = 0; hp->h_addr_list[ii] != NULL; ii++) + free(hp->h_addr_list[ii]); + free(hp->h_addr_list); + hp->h_addr_list = NULL; + } +} + +static struct hostent * +hostent_unpack(const nvlist_t *nvl, struct hostent *hp) +{ + unsigned int ii, nitems; + char nvlname[64]; + int n; + + hostent_free(hp); + + hp->h_name = strdup(nvlist_get_string(nvl, "name")); + if (hp->h_name == NULL) + goto fail; + hp->h_addrtype = (int)nvlist_get_number(nvl, "addrtype"); + hp->h_length = (int)nvlist_get_number(nvl, "length"); + + nitems = (unsigned int)nvlist_get_number(nvl, "naliases"); + hp->h_aliases = calloc(sizeof(hp->h_aliases[0]), nitems + 1); + if (hp->h_aliases == NULL) + goto fail; + for (ii = 0; ii < nitems; ii++) { + n = snprintf(nvlname, sizeof(nvlname), "alias%u", ii); + assert(n > 0 && n < (int)sizeof(nvlname)); + hp->h_aliases[ii] = + strdup(nvlist_get_string(nvl, nvlname)); + if (hp->h_aliases[ii] == NULL) + goto fail; + } + hp->h_aliases[ii] = NULL; + + nitems = (unsigned int)nvlist_get_number(nvl, "naddrs"); + hp->h_addr_list = calloc(sizeof(hp->h_addr_list[0]), nitems + 1); + if (hp->h_addr_list == NULL) + goto fail; + for (ii = 0; ii < nitems; ii++) { + hp->h_addr_list[ii] = malloc(hp->h_length); + if (hp->h_addr_list[ii] == NULL) + goto fail; + n = snprintf(nvlname, sizeof(nvlname), "addr%u", ii); + assert(n > 0 && n < (int)sizeof(nvlname)); + bcopy(nvlist_get_binary(nvl, nvlname, NULL), + hp->h_addr_list[ii], hp->h_length); + } + hp->h_addr_list[ii] = NULL; + + return (hp); +fail: + hostent_free(hp); + h_errno = NO_RECOVERY; + return (NULL); +} + +static int +request_cb(cap_channel_t *chan, const char *name, int s, + const struct sockaddr *saddr, socklen_t len) +{ + nvlist_t *nvl; + int serrno; + + nvl = nvlist_create(0); + nvlist_add_string(nvl, "cmd", name); + nvlist_add_descriptor(nvl, "s", s); + nvlist_add_binary(nvl, "saddr", saddr, len); + + nvl = cap_xfer_nvlist(chan, nvl); + if (nvl == NULL) + return (-1); + + if (nvlist_get_number(nvl, "error") != 0) { + serrno = (int)nvlist_get_number(nvl, "error"); + nvlist_destroy(nvl); + errno = serrno; + return (-1); + } + + s = dup2(s, nvlist_get_descriptor(nvl, "s")); + nvlist_destroy(nvl); + + return (s == -1 ? -1 : 0); +} + +int +cap_bind(cap_channel_t *chan, int s, const struct sockaddr *addr, + socklen_t addrlen) +{ + + return (request_cb(chan, LIMIT_NV_BIND, s, addr, addrlen)); +} + +int +cap_connect(cap_channel_t *chan, int s, const struct sockaddr *name, + socklen_t namelen) +{ + + return (request_cb(chan, LIMIT_NV_CONNECT, s, name, namelen)); +} + + +struct hostent * +cap_gethostbyname(cap_channel_t *chan, const char *name) +{ + + return (cap_gethostbyname2(chan, name, AF_INET)); +} + +struct hostent * +cap_gethostbyname2(cap_channel_t *chan, const char *name, int af) +{ + struct hostent *hp; + nvlist_t *nvl; + + nvl = nvlist_create(0); + nvlist_add_string(nvl, "cmd", "gethostbyname"); + nvlist_add_number(nvl, "family", (uint64_t)af); + nvlist_add_string(nvl, "name", name); + nvl = cap_xfer_nvlist(chan, nvl); + if (nvl == NULL) { + h_errno = NO_RECOVERY; + return (NULL); + } + if (nvlist_get_number(nvl, "error") != 0) { + h_errno = (int)nvlist_get_number(nvl, "error"); + nvlist_destroy(nvl); + return (NULL); + } + + hp = hostent_unpack(nvl, &hent); + nvlist_destroy(nvl); + return (hp); +} + +struct hostent * +cap_gethostbyaddr(cap_channel_t *chan, const void *addr, socklen_t len, + int af) +{ + struct hostent *hp; + nvlist_t *nvl; + + nvl = nvlist_create(0); + nvlist_add_string(nvl, "cmd", "gethostbyaddr"); + nvlist_add_binary(nvl, "addr", addr, (size_t)len); + nvlist_add_number(nvl, "family", (uint64_t)af); + nvl = cap_xfer_nvlist(chan, nvl); + if (nvl == NULL) { + h_errno = NO_RECOVERY; + return (NULL); + } + if (nvlist_get_number(nvl, "error") != 0) { + h_errno = (int)nvlist_get_number(nvl, "error"); + nvlist_destroy(nvl); + return (NULL); + } + hp = hostent_unpack(nvl, &hent); + nvlist_destroy(nvl); + return (hp); +} + +static struct addrinfo * +addrinfo_unpack(const nvlist_t *nvl) +{ + struct addrinfo *ai; + const void *addr; + size_t addrlen; + const char *canonname; + + addr = nvlist_get_binary(nvl, "ai_addr", &addrlen); + ai = malloc(sizeof(*ai) + addrlen); + if (ai == NULL) + return (NULL); + ai->ai_flags = (int)nvlist_get_number(nvl, "ai_flags"); + ai->ai_family = (int)nvlist_get_number(nvl, "ai_family"); + ai->ai_socktype = (int)nvlist_get_number(nvl, "ai_socktype"); + ai->ai_protocol = (int)nvlist_get_number(nvl, "ai_protocol"); + ai->ai_addrlen = (socklen_t)addrlen; + canonname = dnvlist_get_string(nvl, "ai_canonname", NULL); + if (canonname != NULL) { + ai->ai_canonname = strdup(canonname); + if (ai->ai_canonname == NULL) { + free(ai); + return (NULL); + } + } else { + ai->ai_canonname = NULL; + } + ai->ai_addr = (void *)(ai + 1); + bcopy(addr, ai->ai_addr, addrlen); + ai->ai_next = NULL; + + return (ai); +} + +int +cap_getaddrinfo(cap_channel_t *chan, const char *hostname, const char *servname, + const struct addrinfo *hints, struct addrinfo **res) +{ + struct addrinfo *firstai, *prevai, *curai; + unsigned int ii; + const nvlist_t *nvlai; + char nvlname[64]; + nvlist_t *nvl; + int error, n; + + nvl = nvlist_create(0); + nvlist_add_string(nvl, "cmd", "getaddrinfo"); + if (hostname != NULL) + nvlist_add_string(nvl, "hostname", hostname); + if (servname != NULL) + nvlist_add_string(nvl, "servname", servname); + if (hints != NULL) { + nvlist_add_number(nvl, "hints.ai_flags", + (uint64_t)hints->ai_flags); + nvlist_add_number(nvl, "hints.ai_family", + (uint64_t)hints->ai_family); + nvlist_add_number(nvl, "hints.ai_socktype", + (uint64_t)hints->ai_socktype); + nvlist_add_number(nvl, "hints.ai_protocol", + (uint64_t)hints->ai_protocol); + } + nvl = cap_xfer_nvlist(chan, nvl); + if (nvl == NULL) + return (EAI_MEMORY); + if (nvlist_get_number(nvl, "error") != 0) { + error = (int)nvlist_get_number(nvl, "error"); + nvlist_destroy(nvl); + return (error); + } + + nvlai = NULL; + firstai = prevai = curai = NULL; + for (ii = 0; ; ii++) { + n = snprintf(nvlname, sizeof(nvlname), "res%u", ii); + assert(n > 0 && n < (int)sizeof(nvlname)); + if (!nvlist_exists_nvlist(nvl, nvlname)) + break; + nvlai = nvlist_get_nvlist(nvl, nvlname); + curai = addrinfo_unpack(nvlai); + if (curai == NULL) + return (EAI_MEMORY); + if (prevai != NULL) + prevai->ai_next = curai; + else + firstai = curai; + prevai = curai; + } + nvlist_destroy(nvl); + if (curai == NULL && nvlai != NULL) { + if (firstai == NULL) + freeaddrinfo(firstai); + return (EAI_MEMORY); + } + + *res = firstai; + return (0); +} + +int +cap_getnameinfo(cap_channel_t *chan, const struct sockaddr *sa, socklen_t salen, + char *host, size_t hostlen, char *serv, size_t servlen, int flags) +{ + nvlist_t *nvl; + int error; + + nvl = nvlist_create(0); + nvlist_add_string(nvl, "cmd", "getnameinfo"); + nvlist_add_number(nvl, "hostlen", (uint64_t)hostlen); + nvlist_add_number(nvl, "servlen", (uint64_t)servlen); + nvlist_add_binary(nvl, "sa", sa, (size_t)salen); + nvlist_add_number(nvl, "flags", (uint64_t)flags); + nvl = cap_xfer_nvlist(chan, nvl); + if (nvl == NULL) + return (EAI_MEMORY); + if (nvlist_get_number(nvl, "error") != 0) { + error = (int)nvlist_get_number(nvl, "error"); + nvlist_destroy(nvl); + return (error); + } + + if (host != NULL && nvlist_exists_string(nvl, "host")) + strlcpy(host, nvlist_get_string(nvl, "host"), hostlen + 1); + if (serv != NULL && nvlist_exists_string(nvl, "serv")) + strlcpy(serv, nvlist_get_string(nvl, "serv"), servlen + 1); + nvlist_destroy(nvl); + return (0); +} + +cap_net_limit_t * +cap_net_limit_init(cap_channel_t *chan, uint64_t mode) +{ + cap_net_limit_t *limit; + + limit = calloc(1, sizeof(*limit)); + if (limit != NULL) { + limit->cnl_mode = mode; + limit->cnl_chan = chan; + limit->cnl_addr2name = nvlist_create(0); + limit->cnl_name2addr = nvlist_create(0); + limit->cnl_connect = nvlist_create(0); + limit->cnl_bind = nvlist_create(0); + } + + return (limit); +} + +static void +pack_limit(nvlist_t *lnvl, const char *name, nvlist_t *limit) +{ + + if (!nvlist_empty(limit)) { + nvlist_move_nvlist(lnvl, name, limit); + } else { + nvlist_destroy(limit); + } +} + +int +cap_net_limit(cap_net_limit_t *limit) +{ + nvlist_t *lnvl; + cap_channel_t *chan; + + lnvl = nvlist_create(0); + nvlist_add_number(lnvl, "mode", limit->cnl_mode); + + pack_limit(lnvl, LIMIT_NV_ADDR2NAME, limit->cnl_addr2name); + pack_limit(lnvl, LIMIT_NV_NAME2ADDR, limit->cnl_name2addr); + pack_limit(lnvl, LIMIT_NV_CONNECT, limit->cnl_connect); + pack_limit(lnvl, LIMIT_NV_BIND, limit->cnl_bind); + + chan = limit->cnl_chan; + free(limit); + + return (cap_limit_set(chan, lnvl)); +} + +void +cap_net_free(cap_net_limit_t *limit) +{ + + if (limit == NULL) + return; + + nvlist_destroy(limit->cnl_addr2name); + nvlist_destroy(limit->cnl_name2addr); + nvlist_destroy(limit->cnl_connect); + nvlist_destroy(limit->cnl_bind); + + free(limit); +} + +static void +pack_family(nvlist_t *nvl, int *family, size_t size) +{ + size_t i; + + i = 0; + if (!nvlist_exists_number_array(nvl, "family")) { + uint64_t val; + + val = family[0]; + nvlist_add_number_array(nvl, "family", &val, 1); + i += 1; + } + + for (; i < size; i++) { + nvlist_append_number_array(nvl, "family", family[i]); + } +} + +static void +pack_sockaddr(nvlist_t *res, const struct sockaddr *sa, socklen_t salen) +{ + nvlist_t *nvl; + + if (!nvlist_exists_nvlist(res, "sockaddr")) { + nvl = nvlist_create(NV_FLAG_NO_UNIQUE); + } else { + nvl = nvlist_take_nvlist(res, "sockaddr"); + } + + nvlist_add_binary(nvl, "", sa, salen); + nvlist_move_nvlist(res, "sockaddr", nvl); +} + +cap_net_limit_t * +cap_net_limit_addr2name_family(cap_net_limit_t *limit, int *family, size_t size) +{ + + pack_family(limit->cnl_addr2name, family, size); + return (limit); +} + +cap_net_limit_t * +cap_net_limit_name2addr_family(cap_net_limit_t *limit, int *family, size_t size) +{ + + pack_family(limit->cnl_name2addr, family, size); + return (limit); +} + +cap_net_limit_t * +cap_net_limit_name2addr(cap_net_limit_t *limit, const char *host, + const char *serv) +{ + nvlist_t *nvl; + + if (!nvlist_exists_nvlist(limit->cnl_name2addr, "hosts")) { + nvl = nvlist_create(NV_FLAG_NO_UNIQUE); + } else { + nvl = nvlist_take_nvlist(limit->cnl_name2addr, "hosts"); + } + + nvlist_add_string(nvl, + host != NULL ? host : "", + serv != NULL ? serv : ""); + + nvlist_move_nvlist(limit->cnl_name2addr, "hosts", nvl); + return (limit); +} + +cap_net_limit_t * +cap_net_limit_addr2name(cap_net_limit_t *limit, const struct sockaddr *sa, + socklen_t salen) +{ + + pack_sockaddr(limit->cnl_addr2name, sa, salen); + return (limit); +} + + +cap_net_limit_t * +cap_net_limit_connect(cap_net_limit_t *limit, const struct sockaddr *sa, + socklen_t salen) +{ + + pack_sockaddr(limit->cnl_connect, sa, salen); + return (limit); +} + +cap_net_limit_t * +cap_net_limit_bind(cap_net_limit_t *limit, const struct sockaddr *sa, + socklen_t salen) +{ + + pack_sockaddr(limit->cnl_bind, sa, salen); + return (limit); +} + +/* + * Service functions. + */ + +static nvlist_t *capdnscache; + +static void +net_add_sockaddr_to_cache(struct sockaddr *sa, socklen_t salen, bool deprecated) +{ + void *cookie; + + if (capdnscache == NULL) { + capdnscache = nvlist_create(NV_FLAG_NO_UNIQUE); + } else { + /* Lets keep it clean. Look for dups. */ + cookie = NULL; + while (nvlist_next(capdnscache, NULL, &cookie) != NULL) { + const void *data; + size_t size; + + assert(cnvlist_type(cookie) == NV_TYPE_BINARY); + + data = cnvlist_get_binary(cookie, &size); + if (salen != size) + continue; + if (memcmp(data, sa, size) == 0) + return; + } + } + + nvlist_add_binary(capdnscache, deprecated ? "d" : "", sa, salen); +} + +static void +net_add_hostent_to_cache(const char *address, size_t asize, int family) +{ + + if (family != AF_INET && family != AF_INET6) + return; + + if (family == AF_INET6) { + struct sockaddr_in6 connaddr; + + memset(&connaddr, 0, sizeof(connaddr)); + connaddr.sin6_family = AF_INET6; + memcpy((char *)&connaddr.sin6_addr, address, asize); + connaddr.sin6_port = 0; + + net_add_sockaddr_to_cache((struct sockaddr *)&connaddr, + sizeof(connaddr), true); + } else { + struct sockaddr_in connaddr; + + memset(&connaddr, 0, sizeof(connaddr)); + connaddr.sin_family = AF_INET; + memcpy((char *)&connaddr.sin_addr.s_addr, address, asize); + connaddr.sin_port = 0; + + net_add_sockaddr_to_cache((struct sockaddr *)&connaddr, + sizeof(connaddr), true); + } +} + +static bool +net_allowed_mode(const nvlist_t *limits, uint64_t mode) +{ + + if (limits == NULL) + return (true); + + return ((nvlist_get_number(limits, "mode") & mode) == mode); +} + +static bool +net_allowed_family(const nvlist_t *limits, int family) +{ + const uint64_t *allowedfamily; + size_t i, allsize; + + if (limits == NULL) + return (true); + + /* If there are no familes at all, allow any mode. */ + if (!nvlist_exists_number_array(limits, "family")) + return (true); + + allowedfamily = nvlist_get_number_array(limits, "family", &allsize); + for (i = 0; i < allsize; i++) { + /* XXX: what with AF_UNSPEC? */ + if (allowedfamily[i] == (uint64_t)family) { + return (true); + } + } + + return (false); +} + +static bool +net_allowed_bsaddr_impl(const nvlist_t *salimits, const void *saddr, + size_t saddrsize) +{ + void *cookie; + const void *limit; + size_t limitsize; + + cookie = NULL; + while (nvlist_next(salimits, NULL, &cookie) != NULL) { + limit = cnvlist_get_binary(cookie, &limitsize); + + if (limitsize != saddrsize) { + continue; + } + if (memcmp(limit, saddr, limitsize) == 0) { + return (true); + } + + /* + * In case of deprecated version (gethostbyname) we have to + * ignore port, because there is no such info in the hostent. + * Suporting only AF_INET and AF_INET6. + */ + if (strcmp(cnvlist_name(cookie), "d") != 0 || + (saddrsize != sizeof(struct sockaddr_in) && + saddrsize != sizeof(struct sockaddr_in6))) { + continue; + } + if (saddrsize == sizeof(struct sockaddr_in)) { + const struct sockaddr_in *saddrptr; + struct sockaddr_in sockaddr; + + saddrptr = (const struct sockaddr_in *)saddr; + memcpy(&sockaddr, limit, sizeof(sockaddr)); + sockaddr.sin_port = saddrptr->sin_port; + + if (memcmp(&sockaddr, saddr, saddrsize) == 0) { + return (true); + } + } else if (saddrsize == sizeof(struct sockaddr_in6)) { + const struct sockaddr_in6 *saddrptr; + struct sockaddr_in6 sockaddr; + + saddrptr = (const struct sockaddr_in6 *)saddr; + memcpy(&sockaddr, limit, sizeof(sockaddr)); + sockaddr.sin6_port = saddrptr->sin6_port; + + if (memcmp(&sockaddr, saddr, saddrsize) == 0) { + return (true); + } + } + } + + return (false); +} + +static bool +net_allowed_bsaddr(const nvlist_t *limits, const void *saddr, size_t saddrsize) +{ + + if (limits == NULL) + return (true); + + if (!nvlist_exists_nvlist(limits, "sockaddr")) + return (true); + + return (net_allowed_bsaddr_impl(nvlist_get_nvlist(limits, "sockaddr"), + saddr, saddrsize)); +} + +static bool +net_allowed_hosts(const nvlist_t *limits, const char *name, const char *srvname) +{ + void *cookie; + const nvlist_t *hlimits; + const char *testname, *testsrvname; + + if (limits == NULL) { + return (true); + } + + /* If there are no hosts at all, allow any. */ + if (!nvlist_exists_nvlist(limits, "hosts")) { + return (true); + } + + cookie = NULL; + testname = (name == NULL ? "" : name); + testsrvname = (srvname == NULL ? "" : srvname); + hlimits = nvlist_get_nvlist(limits, "hosts"); + while (nvlist_next(hlimits, NULL, &cookie) != NULL) { + if (strcmp(cnvlist_name(cookie), "") != 0 && + strcmp(cnvlist_name(cookie), testname) != 0) { + continue; + } + + if (strcmp(cnvlist_get_string(cookie), "") != 0 && + strcmp(cnvlist_get_string(cookie), testsrvname) != 0) { + continue; + } + + return (true); + } + + return (false); +} + +static void +hostent_pack(const struct hostent *hp, nvlist_t *nvl, bool addtocache) +{ + unsigned int ii; + char nvlname[64]; + int n; + + nvlist_add_string(nvl, "name", hp->h_name); + nvlist_add_number(nvl, "addrtype", (uint64_t)hp->h_addrtype); + nvlist_add_number(nvl, "length", (uint64_t)hp->h_length); + + if (hp->h_aliases == NULL) { + nvlist_add_number(nvl, "naliases", 0); + } else { + for (ii = 0; hp->h_aliases[ii] != NULL; ii++) { + n = snprintf(nvlname, sizeof(nvlname), "alias%u", ii); + assert(n > 0 && n < (int)sizeof(nvlname)); + nvlist_add_string(nvl, nvlname, hp->h_aliases[ii]); + } + nvlist_add_number(nvl, "naliases", (uint64_t)ii); + } + + if (hp->h_addr_list == NULL) { + nvlist_add_number(nvl, "naddrs", 0); + } else { + for (ii = 0; hp->h_addr_list[ii] != NULL; ii++) { + n = snprintf(nvlname, sizeof(nvlname), "addr%u", ii); + assert(n > 0 && n < (int)sizeof(nvlname)); + nvlist_add_binary(nvl, nvlname, hp->h_addr_list[ii], + (size_t)hp->h_length); + if (addtocache) { + net_add_hostent_to_cache(hp->h_addr_list[ii], + hp->h_length, hp->h_addrtype); + } + } + nvlist_add_number(nvl, "naddrs", (uint64_t)ii); + } +} + +static int +net_gethostbyname(const nvlist_t *limits, const nvlist_t *nvlin, + nvlist_t *nvlout) +{ + struct hostent *hp; + int family; + const nvlist_t *funclimit; + const char *name; + bool dnscache; + + if (!net_allowed_mode(limits, CAPNET_DEPRECATED_NAME2ADDR)) + return (ENOTCAPABLE); + + dnscache = net_allowed_mode(limits, CAPNET_CONNECTDNS); + funclimit = dnvlist_get_nvlist(limits, LIMIT_NV_NAME2ADDR, NULL); + + family = (int)nvlist_get_number(nvlin, "family"); + if (!net_allowed_family(funclimit, family)) + return (ENOTCAPABLE); + + name = nvlist_get_string(nvlin, "name"); + if (!net_allowed_hosts(funclimit, name, "")) + return (ENOTCAPABLE); + + hp = gethostbyname2(name, family); + if (hp == NULL) + return (h_errno); + hostent_pack(hp, nvlout, dnscache); + return (0); +} + +static int +net_gethostbyaddr(const nvlist_t *limits, const nvlist_t *nvlin, + nvlist_t *nvlout) +{ + struct hostent *hp; + const void *addr; + size_t addrsize; + int family; + const nvlist_t *funclimit; + + if (!net_allowed_mode(limits, CAPNET_DEPRECATED_ADDR2NAME)) + return (ENOTCAPABLE); + + funclimit = dnvlist_get_nvlist(limits, LIMIT_NV_ADDR2NAME, NULL); + + family = (int)nvlist_get_number(nvlin, "family"); + if (!net_allowed_family(funclimit, family)) + return (ENOTCAPABLE); + + addr = nvlist_get_binary(nvlin, "addr", &addrsize); + if (!net_allowed_bsaddr(funclimit, addr, addrsize)) + return (ENOTCAPABLE); + + hp = gethostbyaddr(addr, (socklen_t)addrsize, family); + if (hp == NULL) + return (h_errno); + hostent_pack(hp, nvlout, false); + return (0); +} + +static int +net_getnameinfo(const nvlist_t *limits, const nvlist_t *nvlin, nvlist_t *nvlout) +{ + struct sockaddr_storage sast; + const void *sabin; + char *host, *serv; + size_t sabinsize, hostlen, servlen; + socklen_t salen; + int error, flags; + const nvlist_t *funclimit; + + if (!net_allowed_mode(limits, CAPNET_ADDR2NAME)) + return (ENOTCAPABLE); + funclimit = dnvlist_get_nvlist(limits, LIMIT_NV_ADDR2NAME, NULL); + + error = 0; + host = serv = NULL; + memset(&sast, 0, sizeof(sast)); + + hostlen = (size_t)nvlist_get_number(nvlin, "hostlen"); + servlen = (size_t)nvlist_get_number(nvlin, "servlen"); + + if (hostlen > 0) { + host = calloc(1, hostlen + 1); + if (host == NULL) { + error = EAI_MEMORY; + goto out; + } + } + if (servlen > 0) { + serv = calloc(1, servlen + 1); + if (serv == NULL) { + error = EAI_MEMORY; + goto out; + } + } + + sabin = nvlist_get_binary(nvlin, "sa", &sabinsize); + if (sabinsize > sizeof(sast)) { + error = EAI_FAIL; + goto out; + } + if (!net_allowed_bsaddr(funclimit, sabin, sabinsize)) + return (ENOTCAPABLE); + + memcpy(&sast, sabin, sabinsize); + salen = (socklen_t)sabinsize; + + if ((sast.ss_family != AF_INET || + salen != sizeof(struct sockaddr_in)) && + (sast.ss_family != AF_INET6 || + salen != sizeof(struct sockaddr_in6))) { + error = EAI_FAIL; + goto out; + } + + if (!net_allowed_family(funclimit, (int)sast.ss_family)) { + error = ENOTCAPABLE; + goto out; + } + + flags = (int)nvlist_get_number(nvlin, "flags"); + + error = getnameinfo((struct sockaddr *)&sast, salen, host, hostlen, + serv, servlen, flags); + if (error != 0) + goto out; + + if (host != NULL) + nvlist_move_string(nvlout, "host", host); + if (serv != NULL) + nvlist_move_string(nvlout, "serv", serv); +out: + if (error != 0) { + free(host); + free(serv); + } + return (error); +} + +static nvlist_t * +addrinfo_pack(const struct addrinfo *ai) +{ + nvlist_t *nvl; + + nvl = nvlist_create(0); + nvlist_add_number(nvl, "ai_flags", (uint64_t)ai->ai_flags); + nvlist_add_number(nvl, "ai_family", (uint64_t)ai->ai_family); + nvlist_add_number(nvl, "ai_socktype", (uint64_t)ai->ai_socktype); + nvlist_add_number(nvl, "ai_protocol", (uint64_t)ai->ai_protocol); + nvlist_add_binary(nvl, "ai_addr", ai->ai_addr, (size_t)ai->ai_addrlen); + if (ai->ai_canonname != NULL) + nvlist_add_string(nvl, "ai_canonname", ai->ai_canonname); + + return (nvl); +} + +static int +net_getaddrinfo(const nvlist_t *limits, const nvlist_t *nvlin, nvlist_t *nvlout) +{ + struct addrinfo hints, *hintsp, *res, *cur; + const char *hostname, *servname; + char nvlname[64]; + nvlist_t *elem; + unsigned int ii; + int error, family, n; + const nvlist_t *funclimit; + bool dnscache; + + if (!net_allowed_mode(limits, CAPNET_NAME2ADDR)) + return (ENOTCAPABLE); + dnscache = net_allowed_mode(limits, CAPNET_CONNECTDNS); + funclimit = dnvlist_get_nvlist(limits, LIMIT_NV_NAME2ADDR, NULL); + + hostname = dnvlist_get_string(nvlin, "hostname", NULL); + servname = dnvlist_get_string(nvlin, "servname", NULL); + if (nvlist_exists_number(nvlin, "hints.ai_flags")) { + hints.ai_flags = (int)nvlist_get_number(nvlin, + "hints.ai_flags"); + hints.ai_family = (int)nvlist_get_number(nvlin, + "hints.ai_family"); + hints.ai_socktype = (int)nvlist_get_number(nvlin, + "hints.ai_socktype"); + hints.ai_protocol = (int)nvlist_get_number(nvlin, + "hints.ai_protocol"); + hints.ai_addrlen = 0; + hints.ai_addr = NULL; + hints.ai_canonname = NULL; + hints.ai_next = NULL; + hintsp = &hints; + family = hints.ai_family; + } else { + hintsp = NULL; + family = AF_UNSPEC; + } + + if (!net_allowed_family(funclimit, family)) + return (ENOTCAPABLE); + if (!net_allowed_hosts(funclimit, hostname, servname)) + return (ENOTCAPABLE); + error = getaddrinfo(hostname, servname, hintsp, &res); + if (error != 0) { + goto out; + } + + for (cur = res, ii = 0; cur != NULL; cur = cur->ai_next, ii++) { + elem = addrinfo_pack(cur); + n = snprintf(nvlname, sizeof(nvlname), "res%u", ii); + assert(n > 0 && n < (int)sizeof(nvlname)); + nvlist_move_nvlist(nvlout, nvlname, elem); + if (dnscache) { + net_add_sockaddr_to_cache(cur->ai_addr, + cur->ai_addrlen, false); + } + } + + freeaddrinfo(res); + error = 0; +out: + return (error); +} + +static int +net_bind(const nvlist_t *limits, nvlist_t *nvlin, nvlist_t *nvlout) +{ + int socket, serrno; + const void *saddr; + size_t len; + const nvlist_t *funclimit; + + if (!net_allowed_mode(limits, CAPNET_BIND)) + return (ENOTCAPABLE); + funclimit = dnvlist_get_nvlist(limits, LIMIT_NV_BIND, NULL); + + saddr = nvlist_get_binary(nvlin, "saddr", &len); + + if (!net_allowed_bsaddr(funclimit, saddr, len)) + return (ENOTCAPABLE); + + socket = nvlist_take_descriptor(nvlin, "s"); + if (bind(socket, saddr, len) < 0) { + serrno = errno; + close(socket); + return (serrno); + } + + nvlist_move_descriptor(nvlout, "s", socket); + + return (0); +} + +static int +net_connect(const nvlist_t *limits, nvlist_t *nvlin, nvlist_t *nvlout) +{ + int socket, serrno; + const void *saddr; + const nvlist_t *funclimit; + size_t len; + bool conn, conndns; + + conn = net_allowed_mode(limits, CAPNET_CONNECT); + conndns = net_allowed_mode(limits, CAPNET_CONNECTDNS); + + if (!conn && !conndns) + return (ENOTCAPABLE); + + funclimit = dnvlist_get_nvlist(limits, LIMIT_NV_CONNECT, NULL); + + saddr = nvlist_get_binary(nvlin, "saddr", &len); + if (conn && !net_allowed_bsaddr(funclimit, saddr, len)) { + return (ENOTCAPABLE); + } else if (conndns && (capdnscache == NULL || + !net_allowed_bsaddr_impl(capdnscache, saddr, len))) { + return (ENOTCAPABLE); + } + socket = dup(nvlist_get_descriptor(nvlin, "s")); + if (connect(socket, saddr, len) < 0) { + serrno = errno; + close(socket); + return (serrno); + } + + nvlist_move_descriptor(nvlout, "s", socket); + + return (0); +} + +static bool +verify_only_sa_newlimts(const nvlist_t *oldfunclimits, + const nvlist_t *newfunclimit) +{ + void *cookie; + + cookie = NULL; + while (nvlist_next(newfunclimit, NULL, &cookie) != NULL) { + void *sacookie; + + if (strcmp(cnvlist_name(cookie), "sockaddr") != 0) + return (false); + + if (cnvlist_type(cookie) != NV_TYPE_NVLIST) + return (false); + + sacookie = NULL; + while (nvlist_next(cnvlist_get_nvlist(cookie), NULL, + &sacookie) != NULL) { + const void *sa; + size_t sasize; + + if (cnvlist_type(sacookie) != NV_TYPE_BINARY) + return (false); + + sa = cnvlist_get_binary(sacookie, &sasize); + if (!net_allowed_bsaddr(oldfunclimits, sa, sasize)) + return (false); + } + } + + return (true); +} + +static bool +verify_bind_newlimts(const nvlist_t *oldlimits, + const nvlist_t *newfunclimit) +{ + const nvlist_t *oldfunclimits; + + oldfunclimits = NULL; + if (oldlimits != NULL) { + oldfunclimits = dnvlist_get_nvlist(oldlimits, LIMIT_NV_BIND, + NULL); + } + + return (verify_only_sa_newlimts(oldfunclimits, newfunclimit)); +} + + +static bool +verify_connect_newlimits(const nvlist_t *oldlimits, + const nvlist_t *newfunclimit) +{ + const nvlist_t *oldfunclimits; + + oldfunclimits = NULL; + if (oldlimits != NULL) { + oldfunclimits = dnvlist_get_nvlist(oldlimits, LIMIT_NV_CONNECT, + NULL); + } + + return (verify_only_sa_newlimts(oldfunclimits, newfunclimit)); +} + +static bool +verify_addr2name_newlimits(const nvlist_t *oldlimits, + const nvlist_t *newfunclimit) +{ + void *cookie; + const nvlist_t *oldfunclimits; + + oldfunclimits = NULL; + if (oldlimits != NULL) { + oldfunclimits = dnvlist_get_nvlist(oldlimits, + LIMIT_NV_ADDR2NAME, NULL); + } + + cookie = NULL; + while (nvlist_next(newfunclimit, NULL, &cookie) != NULL) { + if (strcmp(cnvlist_name(cookie), "sockaddr") == 0) { + void *sacookie; + + if (cnvlist_type(cookie) != NV_TYPE_NVLIST) + return (false); + + sacookie = NULL; + while (nvlist_next(cnvlist_get_nvlist(cookie), NULL, + &sacookie) != NULL) { + const void *sa; + size_t sasize; + + if (cnvlist_type(sacookie) != NV_TYPE_BINARY) + return (false); + + sa = cnvlist_get_binary(sacookie, &sasize); + if (!net_allowed_bsaddr(oldfunclimits, sa, + sasize)) { + return (false); + } + } + } else if (strcmp(cnvlist_name(cookie), "family") == 0) { + size_t i, sfamilies; + const uint64_t *families; + + if (cnvlist_type(cookie) != NV_TYPE_NUMBER_ARRAY) + return (false); + + families = cnvlist_get_number_array(cookie, &sfamilies); + for (i = 0; i < sfamilies; i++) { + if (!net_allowed_family(oldfunclimits, + families[i])) { + return (false); + } + } + } else { + return (false); + } + } + + return (true); +} + +static bool +verify_name2addr_newlimits(const nvlist_t *oldlimits, + const nvlist_t *newfunclimit) +{ + void *cookie; + const nvlist_t *oldfunclimits; + + oldfunclimits = NULL; + if (oldlimits != NULL) { + oldfunclimits = dnvlist_get_nvlist(oldlimits, + LIMIT_NV_ADDR2NAME, NULL); + } + + cookie = NULL; + while (nvlist_next(newfunclimit, NULL, &cookie) != NULL) { + if (strcmp(cnvlist_name(cookie), "hosts") == 0) { + void *hostcookie; + + if (cnvlist_type(cookie) != NV_TYPE_NVLIST) + return (false); + + hostcookie = NULL; + while (nvlist_next(cnvlist_get_nvlist(cookie), NULL, + &hostcookie) != NULL) { + if (cnvlist_type(hostcookie) != NV_TYPE_STRING) + return (false); + + if (!net_allowed_hosts(oldfunclimits, + cnvlist_name(hostcookie), + cnvlist_get_string(hostcookie))) { + return (false); + } + } + } else if (strcmp(cnvlist_name(cookie), "family") == 0) { + size_t i, sfamilies; + const uint64_t *families; + + if (cnvlist_type(cookie) != NV_TYPE_NUMBER_ARRAY) + return (false); + + families = cnvlist_get_number_array(cookie, &sfamilies); + for (i = 0; i < sfamilies; i++) { + if (!net_allowed_family(oldfunclimits, + families[i])) { + return (false); + } + } + } else { + return (false); + } + } + + return (true); +} + +static int +net_limit(const nvlist_t *oldlimits, const nvlist_t *newlimits) +{ + const char *name; + void *cookie; + bool hasmode, hasconnect, hasbind, hasaddr2name, hasname2addr; + + /* + * Modes: + * ADDR2NAME: + * getnameinfo + * DEPRECATED_ADDR2NAME: + * gethostbyaddr + * + * NAME2ADDR: + * getaddrinfo + * DEPRECATED_NAME2ADDR: + * gethostbyname + * + * Limit scheme: + * mode : NV_TYPE_NUMBER + * connect : NV_TYPE_NVLIST + * sockaddr : NV_TYPE_NVLIST + * "" : NV_TYPE_BINARY + * ... : NV_TYPE_BINARY + * bind : NV_TYPE_NVLIST + * sockaddr : NV_TYPE_NVLIST + * "" : NV_TYPE_BINARY + * ... : NV_TYPE_BINARY + * addr2name : NV_TYPE_NVLIST + * family : NV_TYPE_NUMBER_ARRAY + * sockaddr : NV_TYPE_NVLIST + * "" : NV_TYPE_BINARY + * ... : NV_TYPE_BINARY + * name2addr : NV_TYPE_NVLIST + * family : NV_TYPE_NUMBER + * hosts : NV_TYPE_NVLIST + * host : servname : NV_TYPE_STRING + */ + + hasmode = false; + hasconnect = false; + hasbind = false; + hasaddr2name = false; + hasname2addr = false; + + cookie = NULL; + while ((name = nvlist_next(newlimits, NULL, &cookie)) != NULL) { + if (strcmp(name, "mode") == 0) { + if (cnvlist_type(cookie) != NV_TYPE_NUMBER) { + return (NO_RECOVERY); + } + if (!net_allowed_mode(oldlimits, + cnvlist_get_number(cookie))) { + return (ENOTCAPABLE); + } + hasmode = true; + continue; + } + + if (cnvlist_type(cookie) != NV_TYPE_NVLIST) { + return (NO_RECOVERY); + } + + if (strcmp(name, LIMIT_NV_BIND) == 0) { + hasbind = true; + if (!verify_bind_newlimts(oldlimits, + cnvlist_get_nvlist(cookie))) { + return (ENOTCAPABLE); + } + } else if (strcmp(name, LIMIT_NV_CONNECT) == 0) { + hasconnect = true; + if (!verify_connect_newlimits(oldlimits, + cnvlist_get_nvlist(cookie))) { + return (ENOTCAPABLE); + } + } else if (strcmp(name, LIMIT_NV_ADDR2NAME) == 0) { + hasaddr2name = true; + if (!verify_addr2name_newlimits(oldlimits, + cnvlist_get_nvlist(cookie))) { + return (ENOTCAPABLE); + } + } else if (strcmp(name, LIMIT_NV_NAME2ADDR) == 0) { + hasname2addr = true; + if (!verify_name2addr_newlimits(oldlimits, + cnvlist_get_nvlist(cookie))) { + return (ENOTCAPABLE); + } + } + } + + /* Mode is required. */ + if (!hasmode) + return (ENOTCAPABLE); + + /* + * If the new limit doesn't mention mode or family we have to + * check if the current limit does have those. Missing mode or + * family in the limit means that all modes or families are + * allowed. + */ + if (oldlimits == NULL) + return (0); + if (!hasconnect && nvlist_exists(oldlimits, LIMIT_NV_BIND)) + return (ENOTCAPABLE); + if (!hasconnect && nvlist_exists(oldlimits, LIMIT_NV_CONNECT)) + return (ENOTCAPABLE); + if (!hasaddr2name && nvlist_exists(oldlimits, LIMIT_NV_ADDR2NAME)) + return (ENOTCAPABLE); + if (!hasname2addr && nvlist_exists(oldlimits, LIMIT_NV_NAME2ADDR)) + return (ENOTCAPABLE); + return (0); +} + +static int +net_command(const char *cmd, const nvlist_t *limits, nvlist_t *nvlin, + nvlist_t *nvlout) +{ + + if (strcmp(cmd, "bind") == 0) + return (net_bind(limits, nvlin, nvlout)); + else if (strcmp(cmd, "connect") == 0) + return (net_connect(limits, nvlin, nvlout)); + else if (strcmp(cmd, "gethostbyname") == 0) + return (net_gethostbyname(limits, nvlin, nvlout)); + else if (strcmp(cmd, "gethostbyaddr") == 0) + return (net_gethostbyaddr(limits, nvlin, nvlout)); + else if (strcmp(cmd, "getnameinfo") == 0) + return (net_getnameinfo(limits, nvlin, nvlout)); + else if (strcmp(cmd, "getaddrinfo") == 0) + return (net_getaddrinfo(limits, nvlin, nvlout)); + + return (EINVAL); +} + +CREATE_SERVICE("system.net", net_limit, net_command, 0); Property changes on: projects/clang1100-import/lib/libcasper/services/cap_net/cap_net.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/clang1100-import/lib/libcasper/services/cap_net/cap_net.h =================================================================== --- projects/clang1100-import/lib/libcasper/services/cap_net/cap_net.h (nonexistent) +++ projects/clang1100-import/lib/libcasper/services/cap_net/cap_net.h (revision 364279) @@ -0,0 +1,165 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2020 Mariusz Zaborski + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _CAP_NETWORK_H_ +#define _CAP_NETWORK_H_ + +#ifdef HAVE_CASPER +#define WITH_CASPER +#endif + +#include +#include + +#include + +struct addrinfo; +struct hostent; + +struct cap_net_limit; +typedef struct cap_net_limit cap_net_limit_t; + +#define CAPNET_ADDR2NAME (0x01) +#define CAPNET_NAME2ADDR (0x02) +#define CAPNET_DEPRECATED_ADDR2NAME (0x04) +#define CAPNET_DEPRECATED_NAME2ADDR (0x08) +#define CAPNET_CONNECT (0x10) +#define CAPNET_BIND (0x20) +#define CAPNET_CONNECTDNS (0x40) + +#ifdef WITH_CASPER +/* Capability functions. */ +int cap_bind(cap_channel_t *chan, int s, const struct sockaddr *addr, + socklen_t addrlen); +int cap_connect(cap_channel_t *chan, int s, const struct sockaddr *name, + socklen_t namelen); + +int cap_getaddrinfo(cap_channel_t *chan, const char *hostname, + const char *servname, const struct addrinfo *hints, struct addrinfo **res); +int cap_getnameinfo(cap_channel_t *chan, const struct sockaddr *sa, + socklen_t salen, char *host, size_t hostlen, char *serv, size_t servlen, + int flags); + +/* Limit functions. */ +cap_net_limit_t *cap_net_limit_init(cap_channel_t *chan, uint64_t mode); +int cap_net_limit(cap_net_limit_t *limit); +void cap_net_free(cap_net_limit_t *limit); + +cap_net_limit_t *cap_net_limit_addr2name_family(cap_net_limit_t *limit, + int *family, size_t size); +cap_net_limit_t *cap_net_limit_addr2name(cap_net_limit_t *limit, + const struct sockaddr *sa, socklen_t salen); + +cap_net_limit_t *cap_net_limit_name2addr_family(cap_net_limit_t *limit, + int *family, size_t size); +cap_net_limit_t *cap_net_limit_name2addr(cap_net_limit_t *limit, + const char *name, const char *serv); + +cap_net_limit_t *cap_net_limit_connect(cap_net_limit_t *limit, + const struct sockaddr *sa, socklen_t salen); + +cap_net_limit_t *cap_net_limit_bind(cap_net_limit_t *limit, + const struct sockaddr *sa, socklen_t salen); + +/* Deprecated functions. */ +struct hostent *cap_gethostbyname(cap_channel_t *chan, const char *name); +struct hostent *cap_gethostbyname2(cap_channel_t *chan, const char *name, + int af); +struct hostent *cap_gethostbyaddr(cap_channel_t *chan, const void *addr, + socklen_t len, int af); +#else +/* Capability functions. */ +#define cap_bind(chan, s, addr, addrlen) \ + bind(s, addr, addrlen) +#define cap_connect(chan, s, name, namelen) \ + connect(s, name, namelen) +#define cap_getaddrinfo(chan, hostname, servname, hints, res) \ + getaddrinfo(hostname, servname, hints, res) +#define cap_getnameinfo(chan, sa, salen, host, hostlen, serv, servlen, flags) \ + getnameinfo(sa, salen, host, hostlen, serv, servlen, flags) + +/* Limit functions. */ +#define cap_net_limit_init(chan, mode) ((cap_net_limit_t *)malloc(8)) +#define cap_net_free(limit) free(limit) +static inline int +cap_net_limit(cap_net_limit_t *limit) +{ + free(limit); + return (0); +} + +static inline cap_net_limit_t * +cap_net_limit_addr2name_family(cap_net_limit_t *limit, + int *family __unused, size_t size __unused) +{ + return (limit); +} + +static inline cap_net_limit_t * +cap_net_limit_addr2name(cap_net_limit_t *limit, + const struct sockaddr *sa __unused, socklen_t salen __unused) +{ + return (limit); +} + +static inline cap_net_limit_t * +cap_net_limit_name2addr_family(cap_net_limit_t *limit, + int *family __unused, size_t size __unused) +{ + return (limit); +} + +static inline cap_net_limit_t * +cap_net_limit_name2addr(cap_net_limit_t *limit, + const char *name __unused, const char *serv __unused) +{ + return (limit); +} + +static inline cap_net_limit_t * +cap_net_limit_connect(cap_net_limit_t *limit, + const struct sockaddr *sa __unused, socklen_t salen __unused) +{ + return (limit); +} + +static inline cap_net_limit_t * +cap_net_limit_bind(cap_net_limit_t *limit, + const struct sockaddr *sa __unused, socklen_t salen __unused) +{ + return (limit); +} + +/* Deprecated functions. */ +#define cap_gethostbyname(chan, name) gethostbyname(name) +#define cap_gethostbyname2(chan, name, type) gethostbyname2(name, type) +#define cap_gethostbyaddr(chan, addr, len, type) gethostbyaddr(addr, len, type) +#endif + +#endif /* !_CAP_NETWORK_H_ */ Property changes on: projects/clang1100-import/lib/libcasper/services/cap_net/cap_net.h ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/clang1100-import/lib/libcasper/services/cap_net/tests/Makefile =================================================================== --- projects/clang1100-import/lib/libcasper/services/cap_net/tests/Makefile (nonexistent) +++ projects/clang1100-import/lib/libcasper/services/cap_net/tests/Makefile (revision 364279) @@ -0,0 +1,16 @@ +# $FreeBSD$ + +.include + +ATF_TESTS_C= net_test + +.if ${MK_CASPER} != "no" +LIBADD+= casper +LIBADD+= cap_net +CFLAGS+=-DWITH_CASPER +.endif +LIBADD+= nv + +WARNS?= 3 + +.include Property changes on: projects/clang1100-import/lib/libcasper/services/cap_net/tests/Makefile ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/clang1100-import/lib/libcasper/services/cap_net/tests/net_test.c =================================================================== --- projects/clang1100-import/lib/libcasper/services/cap_net/tests/net_test.c (nonexistent) +++ projects/clang1100-import/lib/libcasper/services/cap_net/tests/net_test.c (revision 364279) @@ -0,0 +1,1160 @@ +/*- + * Copyright (c) 2020 Mariusz Zaborski + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include + +#define TEST_DOMAIN_0 "example.com" +#define TEST_DOMAIN_1 "freebsd.org" +#define TEST_IPV4 "1.1.1.1" +#define TEST_IPV6 "2001:4860:4860::8888" +#define TEST_BIND_IPV4 "127.0.0.1" + +static cap_channel_t * +create_network_service(void) +{ + cap_channel_t *capcas, *capnet; + + capcas = cap_init(); + ATF_REQUIRE(capcas != NULL); + + capnet = cap_service_open(capcas, "system.net"); + ATF_REQUIRE(capnet != NULL); + + cap_close(capcas); + return (capnet); +} + +static int +test_getnameinfo_v4(cap_channel_t *chan, int family, const char *ip) +{ + struct sockaddr_in ipaddr; + char capfn[MAXHOSTNAMELEN]; + char origfn[MAXHOSTNAMELEN]; + int ret; + + memset(&ipaddr, 0, sizeof(ipaddr)); + ipaddr.sin_family = family; + inet_pton(family, ip, &ipaddr.sin_addr); + + ret = cap_getnameinfo(chan, (struct sockaddr *)&ipaddr, sizeof(ipaddr), + capfn, sizeof(capfn), NULL, 0, NI_NAMEREQD); + if (ret != 0) { + return (ret); + } + + ret = getnameinfo((struct sockaddr *)&ipaddr, sizeof(ipaddr), origfn, + sizeof(origfn), NULL, 0, NI_NAMEREQD); + ATF_REQUIRE(ret == 0); + ATF_REQUIRE(strcmp(origfn, capfn) == 0); + + return (0); +} + +static int +test_getnameinfo_v6(cap_channel_t *chan, const char *ip) +{ + struct sockaddr_in6 ipaddr; + char capfn[MAXHOSTNAMELEN]; + char origfn[MAXHOSTNAMELEN]; + int ret; + + memset(&ipaddr, 0, sizeof(ipaddr)); + ipaddr.sin6_family = AF_INET6; + inet_pton(AF_INET6, ip, &ipaddr.sin6_addr); + + ret = cap_getnameinfo(chan, (struct sockaddr *)&ipaddr, sizeof(ipaddr), + capfn, sizeof(capfn), NULL, 0, NI_NAMEREQD); + if (ret != 0) { + return (ret); + } + + ret = getnameinfo((struct sockaddr *)&ipaddr, sizeof(ipaddr), origfn, + sizeof(origfn), NULL, 0, NI_NAMEREQD); + ATF_REQUIRE(ret == 0); + ATF_REQUIRE(strcmp(origfn, capfn) == 0); + + return (0); +} + +static int +test_getnameinfo(cap_channel_t *chan, int family, const char *ip) +{ + + if (family == AF_INET6) { + return (test_getnameinfo_v6(chan, ip)); + } + + return (test_getnameinfo_v4(chan, family, ip)); +} + +static int +test_gethostbyaddr_v4(cap_channel_t *chan, int family, const char *ip) +{ + struct in_addr ipaddr; + struct hostent *caphp, *orighp; + + memset(&ipaddr, 0, sizeof(ipaddr)); + inet_pton(AF_INET, ip, &ipaddr); + + caphp = cap_gethostbyaddr(chan, &ipaddr, sizeof(ipaddr), family); + if (caphp == NULL) { + return (h_errno); + } + + orighp = gethostbyaddr(&ipaddr, sizeof(ipaddr), family); + ATF_REQUIRE(orighp != NULL); + ATF_REQUIRE(strcmp(caphp->h_name, caphp->h_name) == 0); + + return (0); +} + +static int +test_gethostbyaddr_v6(cap_channel_t *chan, const char *ip) +{ + struct in6_addr ipaddr; + struct hostent *caphp, *orighp; + + memset(&ipaddr, 0, sizeof(ipaddr)); + inet_pton(AF_INET6, ip, &ipaddr); + + caphp = cap_gethostbyaddr(chan, &ipaddr, sizeof(ipaddr), AF_INET6); + if (caphp == NULL) + return (h_errno); + + orighp = gethostbyaddr(&ipaddr, sizeof(ipaddr), AF_INET6); + ATF_REQUIRE(orighp != NULL); + ATF_REQUIRE(strcmp(caphp->h_name, caphp->h_name) == 0); + + return (0); +} + +static int +test_gethostbyaddr(cap_channel_t *chan, int family, const char *ip) +{ + + if (family == AF_INET6) { + return (test_gethostbyaddr_v6(chan, ip)); + } else { + return (test_gethostbyaddr_v4(chan, family, ip)); + } +} + +static int +test_getaddrinfo(cap_channel_t *chan, int family, const char *domain, + const char *servname) +{ + struct addrinfo hints, *capres, *origres, *res0, *res1; + bool found; + int ret; + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = family; + hints.ai_socktype = SOCK_STREAM; + + ret = cap_getaddrinfo(chan, domain, servname, &hints, &capres); + if (ret != 0) { + return (ret); + } + + ret = getaddrinfo(domain, servname, &hints, &origres); + ATF_REQUIRE(ret == 0); + + for (res0 = capres; res0 != NULL; res0 = res0->ai_next) { + found = false; + for (res1 = origres; res1 != NULL; res1 = res1->ai_next) { + if (res1->ai_addrlen == res0->ai_addrlen && + memcmp(res1->ai_addr, res0->ai_addr, + res0->ai_addrlen) == 0) { + found = true; + break; + } + } + ATF_REQUIRE(found); + } + + freeaddrinfo(capres); + freeaddrinfo(origres); + return (0); +} + +static int +test_gethostbyname(cap_channel_t *chan, int family, const char *domain) +{ + struct hostent *caphp, *orighp; + + caphp = cap_gethostbyname2(chan, domain, family); + if (caphp == NULL) { + return (h_errno); + } + + orighp = gethostbyname2(domain, family); + ATF_REQUIRE(orighp != NULL); + ATF_REQUIRE(strcmp(caphp->h_name, orighp->h_name) == 0); + + return (0); +} + +static int +test_bind(cap_channel_t *chan, const char *ip) +{ + struct sockaddr_in ipv4; + int capfd, ret, serrno; + + capfd = socket(AF_INET, SOCK_STREAM, 0); + ATF_REQUIRE(capfd > 0); + + memset(&ipv4, 0, sizeof(ipv4)); + ipv4.sin_family = AF_INET; + inet_pton(AF_INET, ip, &ipv4.sin_addr); + + ret = cap_bind(chan, capfd, (struct sockaddr *)&ipv4, sizeof(ipv4)); + serrno = errno; + close(capfd); + + return (ret < 0 ? serrno : 0); +} + +static int +test_connect(cap_channel_t *chan, const char *ip, unsigned short port) +{ + struct sockaddr_in ipv4; + int capfd, ret, serrno; + + capfd = socket(AF_INET, SOCK_STREAM, 0); + ATF_REQUIRE(capfd > 0); + + memset(&ipv4, 0, sizeof(ipv4)); + ipv4.sin_family = AF_INET; + ipv4.sin_port = htons(port); + inet_pton(AF_INET, ip, &ipv4.sin_addr); + + ret = cap_connect(chan, capfd, (struct sockaddr *)&ipv4, sizeof(ipv4)); + serrno = errno; + close(capfd); + + return (ret < 0 ? serrno : 0); +} + +static void +test_extend_mode(cap_channel_t *capnet, int current) +{ + cap_net_limit_t *limit; + const int rights[] = { + CAPNET_ADDR2NAME, + CAPNET_NAME2ADDR, + CAPNET_DEPRECATED_ADDR2NAME, + CAPNET_DEPRECATED_NAME2ADDR, + CAPNET_CONNECT, + CAPNET_BIND, + CAPNET_CONNECTDNS + }; + size_t i; + + for (i = 0; i < nitems(rights); i++) { + if (current == rights[i]) + continue; + + limit = cap_net_limit_init(capnet, current | rights[i]); + ATF_REQUIRE(limit != NULL); + ATF_REQUIRE(cap_net_limit(limit) != 0); + } +} + +ATF_TC_WITHOUT_HEAD(capnet__limits_addr2name_mode); +ATF_TC_BODY(capnet__limits_addr2name_mode, tc) +{ + cap_channel_t *capnet; + cap_net_limit_t *limit; + + capnet = create_network_service(); + + /* LIMIT */ + limit = cap_net_limit_init(capnet, CAPNET_ADDR2NAME); + ATF_REQUIRE(limit != NULL); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + /* ALLOWED */ + ATF_REQUIRE(test_getnameinfo(capnet, AF_INET, TEST_IPV4) == 0); + + /* DISALLOWED */ + ATF_REQUIRE(test_gethostbyname(capnet, AF_INET, TEST_DOMAIN_0) == + ENOTCAPABLE); + ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET, TEST_IPV4) == + ENOTCAPABLE); + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) == + ENOTCAPABLE); + ATF_REQUIRE(test_bind(capnet, TEST_BIND_IPV4) == ENOTCAPABLE); + ATF_REQUIRE(test_connect(capnet, TEST_IPV4, 80) == ENOTCAPABLE); + + test_extend_mode(capnet, CAPNET_ADDR2NAME); + + cap_close(capnet); +} + +ATF_TC_WITHOUT_HEAD(capnet__limits_addr2name_family); +ATF_TC_BODY(capnet__limits_addr2name_family, tc) +{ + cap_channel_t *capnet; + cap_net_limit_t *limit; + int family[] = { AF_INET6, AF_INET }; + + capnet = create_network_service(); + + /* Limit to AF_INET6 and AF_INET. */ + limit = cap_net_limit_init(capnet, CAPNET_ADDR2NAME); + ATF_REQUIRE(limit != NULL); + cap_net_limit_addr2name_family(limit, family, nitems(family)); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + ATF_REQUIRE(test_getnameinfo(capnet, AF_INET, TEST_IPV4) == 0); + ATF_REQUIRE(test_getnameinfo(capnet, AF_INET6, TEST_IPV6) == 0); + + /* Limit to AF_INET6 and AF_INET. */ + limit = cap_net_limit_init(capnet, CAPNET_ADDR2NAME); + ATF_REQUIRE(limit != NULL); + cap_net_limit_addr2name_family(limit, &family[0], 1); + cap_net_limit_addr2name_family(limit, &family[1], 1); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + ATF_REQUIRE(test_getnameinfo(capnet, AF_INET, TEST_IPV4) == 0); + ATF_REQUIRE(test_getnameinfo(capnet, AF_INET6, TEST_IPV6) == 0); + + /* Limit to AF_INET6. */ + limit = cap_net_limit_init(capnet, CAPNET_ADDR2NAME); + ATF_REQUIRE(limit != NULL); + cap_net_limit_addr2name_family(limit, family, 1); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + ATF_REQUIRE(test_getnameinfo(capnet, AF_INET, TEST_IPV4) == + ENOTCAPABLE); + ATF_REQUIRE(test_getnameinfo(capnet, AF_INET6, TEST_IPV6) == 0); + + /* Unable to set empty limits. Empty limits means full access. */ + limit = cap_net_limit_init(capnet, CAPNET_ADDR2NAME); + ATF_REQUIRE(cap_net_limit(limit) != 0); + + cap_close(capnet); +} + +ATF_TC_WITHOUT_HEAD(capnet__limits_addr2name); +ATF_TC_BODY(capnet__limits_addr2name, tc) +{ + cap_channel_t *capnet; + cap_net_limit_t *limit; + struct sockaddr_in ipaddrv4; + struct sockaddr_in6 ipaddrv6; + + capnet = create_network_service(); + + /* Limit to TEST_IPV4 and TEST_IPV6. */ + memset(&ipaddrv4, 0, sizeof(ipaddrv4)); + memset(&ipaddrv6, 0, sizeof(ipaddrv6)); + + ipaddrv4.sin_family = AF_INET; + inet_pton(AF_INET, TEST_IPV4, &ipaddrv4.sin_addr); + + ipaddrv6.sin6_family = AF_INET6; + inet_pton(AF_INET6, TEST_IPV6, &ipaddrv6.sin6_addr); + + limit = cap_net_limit_init(capnet, CAPNET_ADDR2NAME); + ATF_REQUIRE(limit != NULL); + + cap_net_limit_addr2name(limit, (struct sockaddr *)&ipaddrv4, + sizeof(ipaddrv4)); + cap_net_limit_addr2name(limit, (struct sockaddr *)&ipaddrv6, + sizeof(ipaddrv6)); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + ATF_REQUIRE(test_getnameinfo(capnet, AF_INET, TEST_IPV4) == 0); + ATF_REQUIRE(test_getnameinfo(capnet, AF_INET6, TEST_IPV6) == 0); + ATF_REQUIRE(test_getnameinfo(capnet, AF_INET, "127.0.0.1") == + ENOTCAPABLE); + + /* Limit to AF_INET. */ + limit = cap_net_limit_init(capnet, CAPNET_ADDR2NAME); + ATF_REQUIRE(limit != NULL); + cap_net_limit_addr2name(limit, (struct sockaddr *)&ipaddrv4, + sizeof(ipaddrv4)); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + ATF_REQUIRE(test_getnameinfo(capnet, AF_INET, TEST_IPV4) == 0); + ATF_REQUIRE(test_getnameinfo(capnet, AF_INET6, TEST_IPV6) == + ENOTCAPABLE); + ATF_REQUIRE(test_getnameinfo(capnet, AF_INET, "127.0.0.1") == + ENOTCAPABLE); + + /* Unable to set empty limits. Empty limits means full access. */ + limit = cap_net_limit_init(capnet, CAPNET_ADDR2NAME); + ATF_REQUIRE(cap_net_limit(limit) != 0); + + cap_close(capnet); +} + +ATF_TC_WITHOUT_HEAD(capnet__limits_deprecated_addr2name_mode); +ATF_TC_BODY(capnet__limits_deprecated_addr2name_mode, tc) +{ + cap_channel_t *capnet; + cap_net_limit_t *limit; + + capnet = create_network_service(); + + /* LIMIT */ + limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_ADDR2NAME); + ATF_REQUIRE(limit != NULL); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + /* ALLOWED */ + ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET, TEST_IPV4) == 0); + + /* DISALLOWED */ + ATF_REQUIRE(test_gethostbyname(capnet, AF_INET, TEST_DOMAIN_0) == + ENOTCAPABLE); + ATF_REQUIRE(test_getnameinfo(capnet, AF_INET, TEST_IPV4) == + ENOTCAPABLE); + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) == + ENOTCAPABLE); + ATF_REQUIRE(test_bind(capnet, TEST_BIND_IPV4) == ENOTCAPABLE); + ATF_REQUIRE(test_connect(capnet, TEST_IPV4, 80) == ENOTCAPABLE); + + cap_close(capnet); +} + +ATF_TC_WITHOUT_HEAD(capnet__limits_deprecated_addr2name_family); +ATF_TC_BODY(capnet__limits_deprecated_addr2name_family, tc) +{ + cap_channel_t *capnet; + cap_net_limit_t *limit; + int family[] = { AF_INET6, AF_INET }; + + capnet = create_network_service(); + + /* Limit to AF_INET6 and AF_INET. */ + limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_ADDR2NAME); + ATF_REQUIRE(limit != NULL); + cap_net_limit_addr2name_family(limit, family, nitems(family)); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET, TEST_IPV4) == 0); + ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET6, TEST_IPV6) == 0); + ATF_REQUIRE(test_gethostbyaddr(capnet, PF_LINK, TEST_IPV4) == + ENOTCAPABLE); + + /* Limit to AF_INET6 and AF_INET. */ + limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_ADDR2NAME); + ATF_REQUIRE(limit != NULL); + cap_net_limit_addr2name_family(limit, &family[0], 1); + cap_net_limit_addr2name_family(limit, &family[1], 1); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET, TEST_IPV4) == 0); + ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET6, TEST_IPV6) == 0); + ATF_REQUIRE(test_gethostbyaddr(capnet, PF_LINK, TEST_IPV4) == + ENOTCAPABLE); + + /* Limit to AF_INET6. */ + limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_ADDR2NAME); + ATF_REQUIRE(limit != NULL); + cap_net_limit_addr2name_family(limit, family, 1); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET, TEST_IPV4) == + ENOTCAPABLE); + ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET6, TEST_IPV6) == 0); + ATF_REQUIRE(test_gethostbyaddr(capnet, PF_LINK, TEST_IPV4) == + ENOTCAPABLE); + + /* Unable to set empty limits. Empty limits means full access. */ + limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_ADDR2NAME); + ATF_REQUIRE(cap_net_limit(limit) != 0); + + cap_close(capnet); +} + +ATF_TC_WITHOUT_HEAD(capnet__limits_deprecated_addr2name); +ATF_TC_BODY(capnet__limits_deprecated_addr2name, tc) +{ + cap_channel_t *capnet; + cap_net_limit_t *limit; + struct in_addr ipaddrv4; + struct in6_addr ipaddrv6; + + capnet = create_network_service(); + + /* Limit to TEST_IPV4 and TEST_IPV6. */ + memset(&ipaddrv4, 0, sizeof(ipaddrv4)); + memset(&ipaddrv6, 0, sizeof(ipaddrv6)); + + inet_pton(AF_INET, TEST_IPV4, &ipaddrv4); + inet_pton(AF_INET6, TEST_IPV6, &ipaddrv6); + + limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_ADDR2NAME); + ATF_REQUIRE(limit != NULL); + + cap_net_limit_addr2name(limit, (struct sockaddr *)&ipaddrv4, + sizeof(ipaddrv4)); + cap_net_limit_addr2name(limit, (struct sockaddr *)&ipaddrv6, + sizeof(ipaddrv6)); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET, TEST_IPV4) == 0); + ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET6, TEST_IPV6) == 0); + ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET, "127.0.0.1") == + ENOTCAPABLE); + + /* Limit to AF_INET. */ + limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_ADDR2NAME); + ATF_REQUIRE(limit != NULL); + cap_net_limit_addr2name(limit, (struct sockaddr *)&ipaddrv4, + sizeof(ipaddrv4)); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET, TEST_IPV4) == 0); + ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET6, TEST_IPV6) == + ENOTCAPABLE); + ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET, "127.0.0.1") == + ENOTCAPABLE); + + /* Unable to set empty limits. Empty limits means full access. */ + limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_ADDR2NAME); + ATF_REQUIRE(cap_net_limit(limit) != 0); + + cap_close(capnet); +} + + +ATF_TC_WITHOUT_HEAD(capnet__limits_name2addr_mode); +ATF_TC_BODY(capnet__limits_name2addr_mode, tc) +{ + cap_channel_t *capnet; + cap_net_limit_t *limit; + + capnet = create_network_service(); + + /* LIMIT */ + limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR); + ATF_REQUIRE(limit != NULL); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + /* ALLOWED */ + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) == + 0); + + /* DISALLOWED */ + ATF_REQUIRE( + test_gethostbyname(capnet, AF_INET, TEST_DOMAIN_0) == ENOTCAPABLE); + ATF_REQUIRE(test_getnameinfo(capnet, AF_INET, TEST_IPV4) == + ENOTCAPABLE); + ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET, TEST_IPV4) == + ENOTCAPABLE); + ATF_REQUIRE(test_bind(capnet, TEST_BIND_IPV4) == ENOTCAPABLE); + ATF_REQUIRE(test_connect(capnet, TEST_IPV4, 80) == ENOTCAPABLE); + + test_extend_mode(capnet, CAPNET_ADDR2NAME); + + cap_close(capnet); +} + +ATF_TC_WITHOUT_HEAD(capnet__limits_name2addr_hosts); +ATF_TC_BODY(capnet__limits_name2addr_hosts, tc) +{ + cap_channel_t *capnet; + cap_net_limit_t *limit; + + capnet = create_network_service(); + + /* Limit to TEST_DOMAIN_0 and localhost only. */ + limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR); + ATF_REQUIRE(limit != NULL); + cap_net_limit_name2addr(limit, TEST_DOMAIN_0, NULL); + cap_net_limit_name2addr(limit, "localhost", NULL); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) == + 0); + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, "localhost", NULL) == 0); + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_1, NULL) == + ENOTCAPABLE); + + /* Limit to TEST_DOMAIN_0 only. */ + limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR); + ATF_REQUIRE(limit != NULL); + cap_net_limit_name2addr(limit, TEST_DOMAIN_0, NULL); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, "localhost", NULL) == + ENOTCAPABLE); + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_1, NULL) == + ENOTCAPABLE); + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) == + 0); + + /* Unable to set empty limits. Empty limits means full access. */ + limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR); + ATF_REQUIRE(cap_net_limit(limit) != 0); + + cap_close(capnet); +} + +ATF_TC_WITHOUT_HEAD(capnet__limits_name2addr_hosts_servnames_strict); +ATF_TC_BODY(capnet__limits_name2addr_hosts_servnames_strict, tc) +{ + cap_channel_t *capnet; + cap_net_limit_t *limit; + + capnet = create_network_service(); + + /* + * Limit to TEST_DOMAIN_0 and HTTP service. + */ + limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR); + ATF_REQUIRE(limit != NULL); + cap_net_limit_name2addr(limit, TEST_DOMAIN_0, "http"); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, "http") == + 0); + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) == + ENOTCAPABLE); + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, "snmp") == + ENOTCAPABLE); + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_1, "http") == + ENOTCAPABLE); + + /* Unable to set empty limits. Empty limits means full access. */ + limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR); + ATF_REQUIRE(cap_net_limit(limit) != 0); + + cap_close(capnet); +} + +ATF_TC_WITHOUT_HEAD(capnet__limits_name2addr_hosts_servnames_mix); +ATF_TC_BODY(capnet__limits_name2addr_hosts_servnames_mix, tc) +{ + cap_channel_t *capnet; + cap_net_limit_t *limit; + + capnet = create_network_service(); + + /* + * Limit to TEST_DOMAIN_0 and any servnamex, and any domain with + * servname HTTP. + */ + limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR); + ATF_REQUIRE(limit != NULL); + cap_net_limit_name2addr(limit, TEST_DOMAIN_0, NULL); + cap_net_limit_name2addr(limit, NULL, "http"); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, "http") == + 0); + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) == + 0); + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_1, "http") == + 0); + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) == + 0); + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_1, "snmp") == + ENOTCAPABLE); + + /* Limit to HTTTP servname only. */ + limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR); + ATF_REQUIRE(limit != NULL); + cap_net_limit_name2addr(limit, NULL, "http"); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, "http") == + 0); + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) == + ENOTCAPABLE); + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_1, "http") == + 0); + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) == + ENOTCAPABLE); + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_1, "snmp") == + ENOTCAPABLE); + + /* Unable to set empty limits. Empty limits means full access. */ + limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR); + ATF_REQUIRE(cap_net_limit(limit) != 0); + + cap_close(capnet); +} + +ATF_TC_WITHOUT_HEAD(capnet__limits_name2addr_family); +ATF_TC_BODY(capnet__limits_name2addr_family, tc) +{ + cap_channel_t *capnet; + cap_net_limit_t *limit; + int family[] = { AF_INET6, AF_INET }; + + capnet = create_network_service(); + + /* Limit to AF_INET and AF_INET6. */ + limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR); + ATF_REQUIRE(limit != NULL); + cap_net_limit_name2addr(limit, TEST_DOMAIN_0, NULL); + cap_net_limit_name2addr_family(limit, family, nitems(family)); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) == + 0); + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET6, TEST_DOMAIN_0, NULL) == + 0); + ATF_REQUIRE(test_getaddrinfo(capnet, PF_LINK, TEST_DOMAIN_0, NULL) == + ENOTCAPABLE); + + /* Limit to AF_INET and AF_INET6. */ + limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR); + ATF_REQUIRE(limit != NULL); + cap_net_limit_name2addr(limit, TEST_DOMAIN_0, NULL); + cap_net_limit_name2addr_family(limit, &family[0], 1); + cap_net_limit_name2addr_family(limit, &family[1], 1); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) == + 0); + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET6, TEST_DOMAIN_0, NULL) == + 0); + ATF_REQUIRE(test_getaddrinfo(capnet, PF_LINK, TEST_DOMAIN_0, NULL) == + ENOTCAPABLE); + + /* Limit to AF_INET6 only. */ + limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR); + ATF_REQUIRE(limit != NULL); + cap_net_limit_name2addr(limit, TEST_DOMAIN_0, NULL); + cap_net_limit_name2addr_family(limit, family, 1); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) == + ENOTCAPABLE); + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET6, TEST_DOMAIN_0, NULL) == + 0); + ATF_REQUIRE(test_getaddrinfo(capnet, PF_LINK, TEST_DOMAIN_0, NULL) == + ENOTCAPABLE); + + /* Unable to set empty limits. Empty limits means full access. */ + limit = cap_net_limit_init(capnet, CAPNET_NAME2ADDR); + ATF_REQUIRE(cap_net_limit(limit) != 0); + + cap_close(capnet); +} + +ATF_TC_WITHOUT_HEAD(capnet__limits_deprecated_name2addr_mode); +ATF_TC_BODY(capnet__limits_deprecated_name2addr_mode, tc) +{ + cap_channel_t *capnet; + cap_net_limit_t *limit; + + capnet = create_network_service(); + + /* LIMIT */ + limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_NAME2ADDR); + ATF_REQUIRE(limit != NULL); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + /* ALLOWED */ + ATF_REQUIRE(test_gethostbyname(capnet, AF_INET, TEST_DOMAIN_0) == 0); + + /* DISALLOWED */ + ATF_REQUIRE(test_getnameinfo(capnet, AF_INET, TEST_IPV4) == + ENOTCAPABLE); + ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET, TEST_IPV4) == + ENOTCAPABLE); + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) == + ENOTCAPABLE); + ATF_REQUIRE(test_bind(capnet, TEST_BIND_IPV4) == ENOTCAPABLE); + ATF_REQUIRE(test_connect(capnet, TEST_IPV4, 80) == ENOTCAPABLE); + + test_extend_mode(capnet, CAPNET_ADDR2NAME); + + cap_close(capnet); +} + +ATF_TC_WITHOUT_HEAD(capnet__limits_deprecated_name2addr_hosts); +ATF_TC_BODY(capnet__limits_deprecated_name2addr_hosts, tc) +{ + cap_channel_t *capnet; + cap_net_limit_t *limit; + + capnet = create_network_service(); + + /* Limit to TEST_DOMAIN_0 and localhost only. */ + limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_NAME2ADDR); + ATF_REQUIRE(limit != NULL); + cap_net_limit_name2addr(limit, TEST_DOMAIN_0, NULL); + cap_net_limit_name2addr(limit, "localhost", NULL); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + ATF_REQUIRE(test_gethostbyname(capnet, AF_INET, TEST_DOMAIN_0) == 0); + ATF_REQUIRE(test_gethostbyname(capnet, AF_INET, "localhost") == 0); + ATF_REQUIRE( + test_gethostbyname(capnet, AF_INET, TEST_DOMAIN_1) == ENOTCAPABLE); + + /* Limit to TEST_DOMAIN_0 only. */ + limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_NAME2ADDR); + ATF_REQUIRE(limit != NULL); + cap_net_limit_name2addr(limit, TEST_DOMAIN_0, NULL); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + ATF_REQUIRE( + test_gethostbyname(capnet, AF_INET, "localhost") == ENOTCAPABLE); + ATF_REQUIRE( + test_gethostbyname(capnet, AF_INET, TEST_DOMAIN_1) == ENOTCAPABLE); + ATF_REQUIRE(test_gethostbyname(capnet, AF_INET, TEST_DOMAIN_0) == 0); + + /* Unable to set empty limits. Empty limits means full access. */ + limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_NAME2ADDR); + ATF_REQUIRE(cap_net_limit(limit) != 0); + + cap_close(capnet); +} + +ATF_TC_WITHOUT_HEAD(capnet__limits_deprecated_name2addr_family); +ATF_TC_BODY(capnet__limits_deprecated_name2addr_family, tc) +{ + cap_channel_t *capnet; + cap_net_limit_t *limit; + int family[] = { AF_INET6, AF_INET }; + + capnet = create_network_service(); + + /* Limit to AF_INET and AF_INET6. */ + limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_NAME2ADDR); + ATF_REQUIRE(limit != NULL); + cap_net_limit_name2addr(limit, TEST_DOMAIN_0, NULL); + cap_net_limit_name2addr_family(limit, family, nitems(family)); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + ATF_REQUIRE(test_gethostbyname(capnet, AF_INET, TEST_DOMAIN_0) == 0); + ATF_REQUIRE(test_gethostbyname(capnet, AF_INET6, TEST_DOMAIN_0) == 0); + ATF_REQUIRE( + test_gethostbyname(capnet, PF_LINK, TEST_DOMAIN_0) == ENOTCAPABLE); + + /* Limit to AF_INET and AF_INET6. */ + limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_NAME2ADDR); + ATF_REQUIRE(limit != NULL); + cap_net_limit_name2addr(limit, TEST_DOMAIN_0, NULL); + cap_net_limit_name2addr_family(limit, &family[0], 1); + cap_net_limit_name2addr_family(limit, &family[1], 1); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + ATF_REQUIRE(test_gethostbyname(capnet, AF_INET, TEST_DOMAIN_0) == 0); + ATF_REQUIRE(test_gethostbyname(capnet, AF_INET6, TEST_DOMAIN_0) == 0); + ATF_REQUIRE( + test_gethostbyname(capnet, PF_LINK, TEST_DOMAIN_0) == ENOTCAPABLE); + + /* Limit to AF_INET6 only. */ + limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_NAME2ADDR); + ATF_REQUIRE(limit != NULL); + cap_net_limit_name2addr(limit, TEST_DOMAIN_0, NULL); + cap_net_limit_name2addr_family(limit, family, 1); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + ATF_REQUIRE( + test_gethostbyname(capnet, AF_INET, TEST_DOMAIN_0) == ENOTCAPABLE); + ATF_REQUIRE(test_gethostbyname(capnet, AF_INET6, TEST_DOMAIN_0) == 0); + ATF_REQUIRE( + test_gethostbyname(capnet, PF_LINK, TEST_DOMAIN_0) == ENOTCAPABLE); + + /* Unable to set empty limits. Empty limits means full access. */ + limit = cap_net_limit_init(capnet, CAPNET_DEPRECATED_NAME2ADDR); + ATF_REQUIRE(cap_net_limit(limit) != 0); + + cap_close(capnet); +} + +ATF_TC_WITHOUT_HEAD(capnet__limits_bind_mode); +ATF_TC_BODY(capnet__limits_bind_mode, tc) +{ + cap_channel_t *capnet; + cap_net_limit_t *limit; + + capnet = create_network_service(); + + /* LIMIT */ + limit = cap_net_limit_init(capnet, CAPNET_BIND); + ATF_REQUIRE(limit != NULL); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + /* ALLOWED */ + ATF_REQUIRE(test_bind(capnet, TEST_BIND_IPV4) == 0); + + /* DISALLOWED */ + ATF_REQUIRE( + test_gethostbyname(capnet, AF_INET, TEST_DOMAIN_0) == ENOTCAPABLE); + ATF_REQUIRE(test_getnameinfo(capnet, AF_INET, TEST_IPV4) == + ENOTCAPABLE); + ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET, TEST_IPV4) == + ENOTCAPABLE); + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) == + ENOTCAPABLE); + ATF_REQUIRE(test_connect(capnet, TEST_IPV4, 80) == ENOTCAPABLE); + + test_extend_mode(capnet, CAPNET_ADDR2NAME); + + cap_close(capnet); +} + +ATF_TC_WITHOUT_HEAD(capnet__limits_bind); +ATF_TC_BODY(capnet__limits_bind, tc) +{ + cap_channel_t *capnet; + cap_net_limit_t *limit; + struct sockaddr_in ipv4; + + capnet = create_network_service(); + + limit = cap_net_limit_init(capnet, CAPNET_BIND); + ATF_REQUIRE(limit != NULL); + + memset(&ipv4, 0, sizeof(ipv4)); + ipv4.sin_family = AF_INET; + inet_pton(AF_INET, TEST_BIND_IPV4, &ipv4.sin_addr); + + cap_net_limit_bind(limit, (struct sockaddr *)&ipv4, sizeof(ipv4)); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + ATF_REQUIRE(test_bind(capnet, TEST_BIND_IPV4) == 0); + ATF_REQUIRE(test_bind(capnet, "127.0.0.2") == ENOTCAPABLE); + + cap_close(capnet); +} + +ATF_TC_WITHOUT_HEAD(capnet__limits_connect_mode); +ATF_TC_BODY(capnet__limits_connect_mode, tc) +{ + cap_channel_t *capnet; + cap_net_limit_t *limit; + + capnet = create_network_service(); + + /* LIMIT */ + limit = cap_net_limit_init(capnet, CAPNET_CONNECT); + ATF_REQUIRE(limit != NULL); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + /* ALLOWED */ + ATF_REQUIRE(test_connect(capnet, TEST_IPV4, 80) == 0); + + /* DISALLOWED */ + ATF_REQUIRE( + test_gethostbyname(capnet, AF_INET, TEST_DOMAIN_0) == ENOTCAPABLE); + ATF_REQUIRE(test_getnameinfo(capnet, AF_INET, TEST_IPV4) == + ENOTCAPABLE); + ATF_REQUIRE(test_gethostbyaddr(capnet, AF_INET, TEST_IPV4) == + ENOTCAPABLE); + ATF_REQUIRE(test_getaddrinfo(capnet, AF_INET, TEST_DOMAIN_0, NULL) == + ENOTCAPABLE); + ATF_REQUIRE(test_bind(capnet, TEST_BIND_IPV4) == ENOTCAPABLE); + + test_extend_mode(capnet, CAPNET_ADDR2NAME); + + cap_close(capnet); +} + +ATF_TC_WITHOUT_HEAD(capnet__limits_connect); +ATF_TC_BODY(capnet__limits_connect, tc) +{ + cap_channel_t *capnet; + cap_net_limit_t *limit; + struct sockaddr_in ipv4; + + capnet = create_network_service(); + + /* Limit only to TEST_IPV4 on port 80 and 443. */ + limit = cap_net_limit_init(capnet, CAPNET_CONNECT); + ATF_REQUIRE(limit != NULL); + memset(&ipv4, 0, sizeof(ipv4)); + ipv4.sin_family = AF_INET; + ipv4.sin_port = htons(80); + inet_pton(AF_INET, TEST_IPV4, &ipv4.sin_addr); + cap_net_limit_connect(limit, (struct sockaddr *)&ipv4, sizeof(ipv4)); + + ipv4.sin_port = htons(443); + cap_net_limit_connect(limit, (struct sockaddr *)&ipv4, sizeof(ipv4)); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + ATF_REQUIRE(test_connect(capnet, TEST_IPV4, 80) == 0); + ATF_REQUIRE(test_connect(capnet, "8.8.8.8", 80) == ENOTCAPABLE); + ATF_REQUIRE(test_connect(capnet, "8.8.8.8", 433) == ENOTCAPABLE); + ATF_REQUIRE(test_connect(capnet, TEST_IPV4, 443) == 0); + + /* Limit only to TEST_IPV4 on port 443. */ + limit = cap_net_limit_init(capnet, CAPNET_CONNECT); + cap_net_limit_connect(limit, (struct sockaddr *)&ipv4, sizeof(ipv4)); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + ATF_REQUIRE(test_connect(capnet, TEST_IPV4, 433) == ENOTCAPABLE); + ATF_REQUIRE(test_connect(capnet, "8.8.8.8", 80) == ENOTCAPABLE); + ATF_REQUIRE(test_connect(capnet, "8.8.8.8", 433) == ENOTCAPABLE); + ATF_REQUIRE(test_connect(capnet, TEST_IPV4, 443) == 0); + + /* Unable to set empty limits. Empty limits means full access. */ + limit = cap_net_limit_init(capnet, CAPNET_CONNECT); + ATF_REQUIRE(cap_net_limit(limit) != 0); + + cap_close(capnet); +} + +ATF_TC_WITHOUT_HEAD(capnet__limits_connecttodns); +ATF_TC_BODY(capnet__limits_connecttodns, tc) +{ + cap_channel_t *capnet; + cap_net_limit_t *limit; + struct addrinfo hints, *capres, *res; + int family[] = { AF_INET }; + + capnet = create_network_service(); + + limit = cap_net_limit_init(capnet, CAPNET_CONNECTDNS | + CAPNET_NAME2ADDR); + ATF_REQUIRE(limit != NULL); + cap_net_limit_name2addr(limit, TEST_IPV4, "80"); + cap_net_limit_name2addr_family(limit, family, 1); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = AF_INET; + hints.ai_socktype = SOCK_STREAM; + + ATF_REQUIRE(test_connect(capnet, "8.8.8.8", 433) == ENOTCAPABLE); + ATF_REQUIRE(cap_getaddrinfo(capnet, TEST_IPV4, "80", &hints, &capres) == + 0); + ATF_REQUIRE(test_connect(capnet, "8.8.8.8", 433) == ENOTCAPABLE); + + for (res = capres; res != NULL; res = res->ai_next) { + int s; + + ATF_REQUIRE(res->ai_family == AF_INET); + ATF_REQUIRE(res->ai_socktype == SOCK_STREAM); + + s = socket(res->ai_family, res->ai_socktype, res->ai_protocol); + ATF_REQUIRE(s >= 0); + + ATF_REQUIRE(cap_connect(capnet, s, res->ai_addr, + res->ai_addrlen) == 0); + close(s); + } + + freeaddrinfo(capres); + cap_close(capnet); +} + + +ATF_TC_WITHOUT_HEAD(capnet__limits_deprecated_connecttodns); +ATF_TC_BODY(capnet__limits_deprecated_connecttodns, tc) +{ + cap_channel_t *capnet; + cap_net_limit_t *limit; + struct hostent *caphp; + struct in_addr ipaddr; + struct sockaddr_in connaddr; + int family[] = { AF_INET }; + int i; + + capnet = create_network_service(); + + limit = cap_net_limit_init(capnet, CAPNET_CONNECTDNS | + CAPNET_DEPRECATED_NAME2ADDR); + ATF_REQUIRE(limit != NULL); + cap_net_limit_name2addr(limit, TEST_IPV4, NULL); + cap_net_limit_name2addr_family(limit, family, 1); + ATF_REQUIRE(cap_net_limit(limit) == 0); + + memset(&ipaddr, 0, sizeof(ipaddr)); + inet_pton(AF_INET, TEST_IPV4, &ipaddr); + + ATF_REQUIRE(test_connect(capnet, "8.8.8.8", 433) == ENOTCAPABLE); + caphp = cap_gethostbyname2(capnet, TEST_IPV4, AF_INET); + ATF_REQUIRE(caphp != NULL); + ATF_REQUIRE(caphp->h_addrtype == AF_INET); + ATF_REQUIRE(test_connect(capnet, "8.8.8.8", 433) == ENOTCAPABLE); + + for (i = 0; caphp->h_addr_list[i] != NULL; i++) { + int s; + + s = socket(AF_INET, SOCK_STREAM, 0); + ATF_REQUIRE(s >= 0); + + memset(&connaddr, 0, sizeof(connaddr)); + connaddr.sin_family = AF_INET; + memcpy((char *)&connaddr.sin_addr.s_addr, + (char *)caphp->h_addr_list[i], caphp->h_length); + connaddr.sin_port = htons(80); + + ATF_REQUIRE(cap_connect(capnet, s, (struct sockaddr *)&connaddr, + sizeof(connaddr)) == 0); + close(s); + } + + cap_close(capnet); +} + +ATF_TP_ADD_TCS(tp) +{ + + ATF_TP_ADD_TC(tp, capnet__limits_addr2name_mode); + ATF_TP_ADD_TC(tp, capnet__limits_addr2name_family); + ATF_TP_ADD_TC(tp, capnet__limits_addr2name); + + ATF_TP_ADD_TC(tp, capnet__limits_deprecated_addr2name_mode); + ATF_TP_ADD_TC(tp, capnet__limits_deprecated_addr2name_family); + ATF_TP_ADD_TC(tp, capnet__limits_deprecated_addr2name); + + ATF_TP_ADD_TC(tp, capnet__limits_name2addr_mode); + ATF_TP_ADD_TC(tp, capnet__limits_name2addr_hosts); + ATF_TP_ADD_TC(tp, capnet__limits_name2addr_hosts_servnames_strict); + ATF_TP_ADD_TC(tp, capnet__limits_name2addr_hosts_servnames_mix); + ATF_TP_ADD_TC(tp, capnet__limits_name2addr_family); + + ATF_TP_ADD_TC(tp, capnet__limits_deprecated_name2addr_mode); + ATF_TP_ADD_TC(tp, capnet__limits_deprecated_name2addr_hosts); + ATF_TP_ADD_TC(tp, capnet__limits_deprecated_name2addr_family); + + ATF_TP_ADD_TC(tp, capnet__limits_bind_mode); + ATF_TP_ADD_TC(tp, capnet__limits_bind); + + ATF_TP_ADD_TC(tp, capnet__limits_connect_mode); + ATF_TP_ADD_TC(tp, capnet__limits_connect); + + ATF_TP_ADD_TC(tp, capnet__limits_connecttodns); + ATF_TP_ADD_TC(tp, capnet__limits_deprecated_connecttodns); + + return (atf_no_error()); +} Property changes on: projects/clang1100-import/lib/libcasper/services/cap_net/tests/net_test.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: projects/clang1100-import/share/mk/src.libnames.mk =================================================================== --- projects/clang1100-import/share/mk/src.libnames.mk (revision 364278) +++ projects/clang1100-import/share/mk/src.libnames.mk (revision 364279) @@ -1,696 +1,698 @@ # $FreeBSD$ # # The include file define library names suitable # for INTERNALLIB and PRIVATELIB definition .if !target(____) .error src.libnames.mk cannot be included directly. .endif .if !target(____) ____: .include _PRIVATELIBS= \ atf_c \ atf_cxx \ auditd \ bsdstat \ devdctl \ event1 \ gmock \ gtest \ gmock_main \ gtest_main \ heimipcc \ heimipcs \ ldns \ sqlite3 \ ssh \ ucl \ unbound \ zstd _INTERNALLIBS= \ amu \ bsnmptools \ c_nossp_pic \ cron \ elftc \ fifolog \ ifconfig \ ipf \ kyua_cli \ kyua_drivers \ kyua_engine \ kyua_model \ kyua_store \ kyua_utils \ lpr \ lua \ lutok \ netbsd \ ntp \ ntpevent \ openbsd \ opts \ parse \ pe \ pmcstat \ sl \ sm \ smdb \ smutil \ telnet \ vers _LIBRARIES= \ ${_PRIVATELIBS} \ ${_INTERNALLIBS} \ ${LOCAL_LIBRARIES} \ 80211 \ alias \ archive \ asn1 \ avl \ be \ begemot \ bluetooth \ bsdxml \ bsm \ bsnmp \ bz2 \ c \ c_pic \ calendar \ cam \ casper \ cap_dns \ cap_fileargs \ cap_grp \ + cap_net \ cap_pwd \ cap_sysctl \ cap_syslog \ com_err \ compiler_rt \ crypt \ crypto \ ctf \ cuse \ cxxrt \ devctl \ devdctl \ devinfo \ devstat \ dialog \ dl \ dpv \ dtrace \ dwarf \ edit \ efivar \ elf \ execinfo \ fetch \ figpar \ geom \ gnuregex \ gpio \ gssapi \ gssapi_krb5 \ hdb \ heimbase \ heimntlm \ heimsqlite \ hx509 \ ipsec \ ipt \ jail \ kadm5clnt \ kadm5srv \ kafs5 \ kdc \ kiconv \ krb5 \ kvm \ l \ lzma \ m \ magic \ md \ memstat \ mp \ mt \ ncurses \ ncursesw \ netgraph \ ngatm \ nv \ nvpair \ opencsd \ opie \ pam \ panel \ panelw \ pcap \ pcsclite \ pjdlog \ pmc \ proc \ procstat \ pthread \ radius \ regex \ roken \ rpcsec_gss \ rpcsvc \ rt \ rtld_db \ sbuf \ sdp \ sm \ smb \ ssl \ ssp_nonshared \ stats \ stdthreads \ supcplusplus \ sysdecode \ tacplus \ termcap \ termcapw \ ufs \ ugidfw \ ulog \ umem \ usb \ usbhid \ util \ uutil \ vmmapi \ wind \ wrap \ xo \ y \ ypclnt \ z \ zfs_core \ zfs \ zpool \ .if ${MK_BLACKLIST} != "no" _LIBRARIES+= \ blacklist \ .endif .if ${MK_OFED} != "no" _LIBRARIES+= \ cxgb4 \ ibcm \ ibmad \ ibnetdisc \ ibumad \ ibverbs \ mlx4 \ mlx5 \ rdmacm \ osmcomp \ opensm \ osmvendor .endif .if ${MK_BEARSSL} == "yes" _LIBRARIES+= \ bearssl \ secureboot \ LIBBEARSSL?= ${LIBBEARSSLDIR}/libbearssl.a LIBSECUREBOOT?= ${LIBSECUREBOOTDIR}/libsecureboot.a .endif .if ${MK_VERIEXEC} == "yes" _LIBRARIES+= veriexec LIBVERIEXEC?= ${LIBVERIEXECDIR}/libveriexec.a .endif # Each library's LIBADD needs to be duplicated here for static linkage of # 2nd+ order consumers. Auto-generating this would be better. _DP_80211= sbuf bsdxml _DP_archive= z bz2 lzma bsdxml zstd _DP_zstd= pthread .if ${MK_BLACKLIST} != "no" _DP_blacklist+= pthread .endif _DP_crypto= pthread .if ${MK_OPENSSL} != "no" _DP_archive+= crypto .else _DP_archive+= md .endif _DP_sqlite3= pthread _DP_ssl= crypto _DP_ssh= crypto crypt z .if ${MK_LDNS} != "no" _DP_ssh+= ldns .endif _DP_edit= ncursesw .if ${MK_OPENSSL} != "no" _DP_bsnmp= crypto .endif _DP_geom= bsdxml sbuf _DP_cam= sbuf _DP_kvm= elf _DP_kyua_cli= kyua_drivers kyua_engine kyua_model kyua_store kyua_utils _DP_kyua_drivers= kyua_model kyua_engine kyua_store _DP_kyua_engine= lutok kyua_utils _DP_kyua_model= lutok _DP_kyua_utils= lutok _DP_kyua_store= kyua_model kyua_utils sqlite3 _DP_casper= nv _DP_cap_dns= nv _DP_cap_fileargs= nv _DP_cap_grp= nv _DP_cap_pwd= nv _DP_cap_sysctl= nv _DP_cap_syslog= nv .if ${MK_OFED} != "no" _DP_pcap= ibverbs mlx5 .endif _DP_pjdlog= util _DP_opie= md _DP_usb= pthread _DP_unbound= ssl crypto pthread _DP_rt= pthread .if ${MK_OPENSSL} == "no" _DP_radius= md .else _DP_radius= crypto .endif _DP_rtld_db= elf procstat _DP_procstat= kvm util elf .if ${MK_CXX} == "yes" .if ${MK_LIBCPLUSPLUS} != "no" _DP_proc= cxxrt .else _DP_proc= supcplusplus .endif .endif .if ${MK_CDDL} != "no" _DP_proc+= ctf .endif _DP_proc+= elf procstat rtld_db util _DP_mp= crypto _DP_memstat= kvm _DP_magic= z _DP_mt= sbuf bsdxml _DP_ldns= ssl crypto _DP_lua= m _DP_lutok= lua .if ${MK_OPENSSL} != "no" _DP_fetch= ssl crypto .else _DP_fetch= md .endif _DP_execinfo= elf _DP_dwarf= elf _DP_dpv= dialog figpar util ncursesw _DP_dialog= ncursesw m _DP_cuse= pthread _DP_atf_cxx= atf_c _DP_gtest= pthread regex _DP_gmock= gtest _DP_gmock_main= gmock _DP_gtest_main= gtest _DP_devstat= kvm _DP_pam= radius tacplus opie md util .if ${MK_KERBEROS} != "no" _DP_pam+= krb5 .endif .if ${MK_OPENSSH} != "no" _DP_pam+= ssh .endif .if ${MK_NIS} != "no" _DP_pam+= ypclnt .endif _DP_roken= crypt _DP_kadm5clnt= com_err krb5 roken _DP_kadm5srv= com_err hdb krb5 roken _DP_heimntlm= crypto com_err krb5 roken _DP_hx509= asn1 com_err crypto roken wind _DP_hdb= asn1 com_err krb5 roken sqlite3 _DP_asn1= com_err roken _DP_kdc= roken hdb hx509 krb5 heimntlm asn1 crypto _DP_wind= com_err roken _DP_heimbase= pthread _DP_heimipcc= heimbase roken pthread _DP_heimipcs= heimbase roken pthread _DP_kafs5= asn1 krb5 roken _DP_krb5+= asn1 com_err crypt crypto hx509 roken wind heimbase heimipcc _DP_gssapi_krb5+= gssapi krb5 crypto roken asn1 com_err _DP_lzma= md pthread _DP_ucl= m _DP_vmmapi= util _DP_opencsd= cxxrt _DP_ctf= z _DP_dtrace= ctf elf proc pthread rtld_db _DP_xo= util # The libc dependencies are not strictly needed but are defined to make the # assert happy. _DP_c= compiler_rt .if ${MK_SSP} != "no" && \ (${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH:Mpower*} != "") _DP_c+= ssp_nonshared .endif _DP_stats= sbuf pthread _DP_stdthreads= pthread _DP_tacplus= md _DP_panel= ncurses _DP_panelw= ncursesw _DP_rpcsec_gss= gssapi _DP_smb= kiconv _DP_ulog= md _DP_fifolog= z _DP_ipf= kvm _DP_zfs= md pthread umem util uutil m nvpair avl bsdxml geom nvpair z \ zfs_core _DP_zfs_core= nvpair _DP_zpool= md pthread z nvpair avl umem _DP_be= zfs nvpair # OFED support .if ${MK_OFED} != "no" _DP_cxgb4= ibverbs pthread _DP_ibcm= ibverbs _DP_ibmad= ibumad _DP_ibnetdisc= osmcomp ibmad ibumad _DP_ibumad= _DP_ibverbs= _DP_mlx4= ibverbs pthread _DP_mlx5= ibverbs pthread _DP_rdmacm= ibverbs _DP_osmcomp= pthread _DP_opensm= pthread _DP_osmvendor= ibumad pthread .endif # Define special cases LDADD_supcplusplus= -lsupc++ LIBATF_C= ${LIBDESTDIR}${LIBDIR_BASE}/libprivateatf-c.a LIBATF_CXX= ${LIBDESTDIR}${LIBDIR_BASE}/libprivateatf-c++.a LDADD_atf_c= -lprivateatf-c LDADD_atf_cxx= -lprivateatf-c++ LIBGMOCK= ${LIBDESTDIR}${LIBDIR_BASE}/libprivategmock.a LIBGMOCK_MAIN= ${LIBDESTDIR}${LIBDIR_BASE}/libprivategmock_main.a LIBGTEST= ${LIBDESTDIR}${LIBDIR_BASE}/libprivategtest.a LIBGTEST_MAIN= ${LIBDESTDIR}${LIBDIR_BASE}/libprivategtest_main.a LDADD_gmock= -lprivategmock LDADD_gtest= -lprivategtest LDADD_gmock_main= -lprivategmock_main LDADD_gtest_main= -lprivategtest_main .for _l in ${_PRIVATELIBS} LIB${_l:tu}?= ${LIBDESTDIR}${LIBDIR_BASE}/libprivate${_l}.a .endfor .if ${MK_PIE} != "no" PIE_SUFFIX= _pie .endif .for _l in ${_LIBRARIES} .if ${_INTERNALLIBS:M${_l}} || !defined(SYSROOT) LDADD_${_l}_L+= -L${LIB${_l:tu}DIR} .endif DPADD_${_l}?= ${LIB${_l:tu}} .if ${_PRIVATELIBS:M${_l}} LDADD_${_l}?= -lprivate${_l} .elif ${_INTERNALLIBS:M${_l}} LDADD_${_l}?= ${LDADD_${_l}_L} -l${_l:S/${PIE_SUFFIX}//}${PIE_SUFFIX} .else LDADD_${_l}?= ${LDADD_${_l}_L} -l${_l} .endif # Add in all dependencies for static linkage. .if defined(_DP_${_l}) && (${_INTERNALLIBS:M${_l}} || \ (defined(NO_SHARED) && ${NO_SHARED:tl} != "no")) .for _d in ${_DP_${_l}} DPADD_${_l}+= ${DPADD_${_d}} LDADD_${_l}+= ${LDADD_${_d}} .endfor .endif .endfor # These are special cases where the library is broken and anything that uses # it needs to add more dependencies. Broken usually means that it has a # cyclic dependency and cannot link its own dependencies. This is bad, please # fix the library instead. # Unless the library itself is broken then the proper place to define # dependencies is _DP_* above. # libatf-c++ exposes libatf-c abi hence we need to explicit link to atf_c for # atf_cxx DPADD_atf_cxx+= ${DPADD_atf_c} LDADD_atf_cxx+= ${LDADD_atf_c} DPADD_gmock+= ${DPADD_gtest} LDADD_gmock+= ${LDADD_gtest} DPADD_gmock_main+= ${DPADD_gmock} LDADD_gmock_main+= ${LDADD_gmock} DPADD_gtest_main+= ${DPADD_gtest} LDADD_gtest_main+= ${LDADD_gtest} # Detect LDADD/DPADD that should be LIBADD, before modifying LDADD here. _BADLDADD= .for _l in ${LDADD:M-l*:N-l*/*:C,^-l,,} .if ${_LIBRARIES:M${_l}} && !${_PRIVATELIBS:M${_l}} _BADLDADD+= ${_l} .endif .endfor .if !empty(_BADLDADD) .error ${.CURDIR}: These libraries should be LIBADD+=foo rather than DPADD/LDADD+=-lfoo: ${_BADLDADD} .endif .for _l in ${LIBADD} DPADD+= ${DPADD_${_l}} LDADD+= ${LDADD_${_l}} .endfor _LIB_OBJTOP?= ${OBJTOP} # INTERNALLIB definitions. LIBELFTCDIR= ${_LIB_OBJTOP}/lib/libelftc LIBELFTC?= ${LIBELFTCDIR}/libelftc${PIE_SUFFIX}.a LIBKYUA_CLIDIR= ${_LIB_OBJTOP}/lib/kyua/cli LIBKYUA_CLI?= ${LIBKYUA_CLIDIR}/libkyua_cli${PIE_SUFFIX}.a LIBKYUA_DRIVERSDIR= ${_LIB_OBJTOP}/lib/kyua/drivers LIBKYUA_DRIVERS?= ${LIBKYUA_DRIVERSDIR}/libkyua_drivers${PIE_SUFFIX}.a LIBKYUA_ENGINEDIR= ${_LIB_OBJTOP}/lib/kyua/engine LIBKYUA_ENGINE?= ${LIBKYUA_ENGINEDIR}/libkyua_engine${PIE_SUFFIX}.a LIBKYUA_MODELDIR= ${_LIB_OBJTOP}/lib/kyua/model LIBKYUA_MODEL?= ${LIBKYUA_MODELDIR}/libkyua_model${PIE_SUFFIX}.a LIBKYUA_STOREDIR= ${_LIB_OBJTOP}/lib/kyua/store LIBKYUA_STORE?= ${LIBKYUA_STOREDIR}/libkyua_store${PIE_SUFFIX}.a LIBKYUA_UTILSDIR= ${_LIB_OBJTOP}/lib/kyua/utils LIBKYUA_UTILS?= ${LIBKYUA_UTILSDIR}/libkyua_utils${PIE_SUFFIX}.a LIBLUADIR= ${_LIB_OBJTOP}/lib/liblua LIBLUA?= ${LIBLUADIR}/liblua${PIE_SUFFIX}.a LIBLUTOKDIR= ${_LIB_OBJTOP}/lib/liblutok LIBLUTOK?= ${LIBLUTOKDIR}/liblutok${PIE_SUFFIX}.a LIBPEDIR= ${_LIB_OBJTOP}/lib/libpe LIBPE?= ${LIBPEDIR}/libpe${PIE_SUFFIX}.a LIBOPENBSDDIR= ${_LIB_OBJTOP}/lib/libopenbsd LIBOPENBSD?= ${LIBOPENBSDDIR}/libopenbsd${PIE_SUFFIX}.a LIBSMDIR= ${_LIB_OBJTOP}/lib/libsm LIBSM?= ${LIBSMDIR}/libsm${PIE_SUFFIX}.a LIBSMDBDIR= ${_LIB_OBJTOP}/lib/libsmdb LIBSMDB?= ${LIBSMDBDIR}/libsmdb${PIE_SUFFIX}.a LIBSMUTILDIR= ${_LIB_OBJTOP}/lib/libsmutil LIBSMUTIL?= ${LIBSMUTILDIR}/libsmutil${PIE_SUFFIX}.a LIBNETBSDDIR?= ${_LIB_OBJTOP}/lib/libnetbsd LIBNETBSD?= ${LIBNETBSDDIR}/libnetbsd${PIE_SUFFIX}.a LIBVERSDIR?= ${_LIB_OBJTOP}/kerberos5/lib/libvers LIBVERS?= ${LIBVERSDIR}/libvers${PIE_SUFFIX}.a LIBSLDIR= ${_LIB_OBJTOP}/kerberos5/lib/libsl LIBSL?= ${LIBSLDIR}/libsl${PIE_SUFFIX}.a LIBIFCONFIGDIR= ${_LIB_OBJTOP}/lib/libifconfig LIBIFCONFIG?= ${LIBIFCONFIGDIR}/libifconfig${PIE_SUFFIX}.a LIBIPFDIR= ${_LIB_OBJTOP}/sbin/ipf/libipf LIBIPF?= ${LIBIPFDIR}/libipf${PIE_SUFFIX}.a LIBTELNETDIR= ${_LIB_OBJTOP}/lib/libtelnet LIBTELNET?= ${LIBTELNETDIR}/libtelnet${PIE_SUFFIX}.a LIBCRONDIR= ${_LIB_OBJTOP}/usr.sbin/cron/lib LIBCRON?= ${LIBCRONDIR}/libcron${PIE_SUFFIX}.a LIBNTPDIR= ${_LIB_OBJTOP}/usr.sbin/ntp/libntp LIBNTP?= ${LIBNTPDIR}/libntp${PIE_SUFFIX}.a LIBNTPEVENTDIR= ${_LIB_OBJTOP}/usr.sbin/ntp/libntpevent LIBNTPEVENT?= ${LIBNTPEVENTDIR}/libntpevent${PIE_SUFFIX}.a LIBOPTSDIR= ${_LIB_OBJTOP}/usr.sbin/ntp/libopts LIBOPTS?= ${LIBOPTSDIR}/libopts${PIE_SUFFIX}.a LIBPARSEDIR= ${_LIB_OBJTOP}/usr.sbin/ntp/libparse LIBPARSE?= ${LIBPARSEDIR}/libparse${PIE_SUFFIX}.a LIBLPRDIR= ${_LIB_OBJTOP}/usr.sbin/lpr/common_source LIBLPR?= ${LIBLPRDIR}/liblpr${PIE_SUFFIX}.a LIBFIFOLOGDIR= ${_LIB_OBJTOP}/usr.sbin/fifolog/lib LIBFIFOLOG?= ${LIBFIFOLOGDIR}/libfifolog${PIE_SUFFIX}.a LIBBSNMPTOOLSDIR= ${_LIB_OBJTOP}/usr.sbin/bsnmpd/tools/libbsnmptools LIBBSNMPTOOLS?= ${LIBBSNMPTOOLSDIR}/libbsnmptools${PIE_SUFFIX}.a LIBAMUDIR= ${_LIB_OBJTOP}/usr.sbin/amd/libamu LIBAMU?= ${LIBAMUDIR}/libamu${PIE_SUFFIX}.a LIBBE?= ${LIBBEDIR}/libbe${PIE_SUFFIX}.a LIBPMCSTATDIR= ${_LIB_OBJTOP}/lib/libpmcstat LIBPMCSTAT?= ${LIBPMCSTATDIR}/libpmcstat${PIE_SUFFIX}.a LIBC_NOSSP_PICDIR= ${_LIB_OBJTOP}/lib/libc LIBC_NOSSP_PIC?= ${LIBC_NOSSP_PICDIR}/libc_nossp_pic.a # Define a directory for each library. This is useful for adding -L in when # not using a --sysroot or for meta mode bootstrapping when there is no # Makefile.depend. These are sorted by directory. LIBAVLDIR= ${OBJTOP}/cddl/lib/libavl LIBCTFDIR= ${OBJTOP}/cddl/lib/libctf LIBDTRACEDIR= ${OBJTOP}/cddl/lib/libdtrace LIBNVPAIRDIR= ${OBJTOP}/cddl/lib/libnvpair LIBUMEMDIR= ${OBJTOP}/cddl/lib/libumem LIBUUTILDIR= ${OBJTOP}/cddl/lib/libuutil LIBZFSDIR= ${OBJTOP}/cddl/lib/libzfs LIBZFS_COREDIR= ${OBJTOP}/cddl/lib/libzfs_core LIBZPOOLDIR= ${OBJTOP}/cddl/lib/libzpool # OFED support LIBCXGB4DIR= ${OBJTOP}/lib/ofed/libcxgb4 LIBIBCMDIR= ${OBJTOP}/lib/ofed/libibcm LIBIBMADDIR= ${OBJTOP}/lib/ofed/libibmad LIBIBNETDISCDIR=${OBJTOP}/lib/ofed/libibnetdisc LIBIBUMADDIR= ${OBJTOP}/lib/ofed/libibumad LIBIBVERBSDIR= ${OBJTOP}/lib/ofed/libibverbs LIBMLX4DIR= ${OBJTOP}/lib/ofed/libmlx4 LIBMLX5DIR= ${OBJTOP}/lib/ofed/libmlx5 LIBRDMACMDIR= ${OBJTOP}/lib/ofed/librdmacm LIBOSMCOMPDIR= ${OBJTOP}/lib/ofed/complib LIBOPENSMDIR= ${OBJTOP}/lib/ofed/libopensm LIBOSMVENDORDIR=${OBJTOP}/lib/ofed/libvendor LIBDIALOGDIR= ${OBJTOP}/gnu/lib/libdialog LIBGNUREGEXDIR= ${OBJTOP}/gnu/lib/libregex LIBSSPDIR= ${OBJTOP}/lib/libssp LIBSSP_NONSHAREDDIR= ${OBJTOP}/lib/libssp_nonshared LIBASN1DIR= ${OBJTOP}/kerberos5/lib/libasn1 LIBGSSAPI_KRB5DIR= ${OBJTOP}/kerberos5/lib/libgssapi_krb5 LIBGSSAPI_NTLMDIR= ${OBJTOP}/kerberos5/lib/libgssapi_ntlm LIBGSSAPI_SPNEGODIR= ${OBJTOP}/kerberos5/lib/libgssapi_spnego LIBHDBDIR= ${OBJTOP}/kerberos5/lib/libhdb LIBHEIMBASEDIR= ${OBJTOP}/kerberos5/lib/libheimbase LIBHEIMIPCCDIR= ${OBJTOP}/kerberos5/lib/libheimipcc LIBHEIMIPCSDIR= ${OBJTOP}/kerberos5/lib/libheimipcs LIBHEIMNTLMDIR= ${OBJTOP}/kerberos5/lib/libheimntlm LIBHX509DIR= ${OBJTOP}/kerberos5/lib/libhx509 LIBKADM5CLNTDIR= ${OBJTOP}/kerberos5/lib/libkadm5clnt LIBKADM5SRVDIR= ${OBJTOP}/kerberos5/lib/libkadm5srv LIBKAFS5DIR= ${OBJTOP}/kerberos5/lib/libkafs5 LIBKDCDIR= ${OBJTOP}/kerberos5/lib/libkdc LIBKRB5DIR= ${OBJTOP}/kerberos5/lib/libkrb5 LIBROKENDIR= ${OBJTOP}/kerberos5/lib/libroken LIBWINDDIR= ${OBJTOP}/kerberos5/lib/libwind LIBATF_CDIR= ${OBJTOP}/lib/atf/libatf-c LIBATF_CXXDIR= ${OBJTOP}/lib/atf/libatf-c++ LIBGMOCKDIR= ${OBJTOP}/lib/googletest/gmock LIBGMOCK_MAINDIR= ${OBJTOP}/lib/googletest/gmock_main LIBGTESTDIR= ${OBJTOP}/lib/googletest/gtest LIBGTEST_MAINDIR= ${OBJTOP}/lib/googletest/gtest_main LIBALIASDIR= ${OBJTOP}/lib/libalias/libalias LIBBLACKLISTDIR= ${OBJTOP}/lib/libblacklist LIBBLOCKSRUNTIMEDIR= ${OBJTOP}/lib/libblocksruntime LIBBSNMPDIR= ${OBJTOP}/lib/libbsnmp/libbsnmp LIBCASPERDIR= ${OBJTOP}/lib/libcasper/libcasper LIBCAP_DNSDIR= ${OBJTOP}/lib/libcasper/services/cap_dns LIBCAP_GRPDIR= ${OBJTOP}/lib/libcasper/services/cap_grp +LIBCAP_NETDIR= ${OBJTOP}/lib/libcasper/services/cap_net LIBCAP_PWDDIR= ${OBJTOP}/lib/libcasper/services/cap_pwd LIBCAP_SYSCTLDIR= ${OBJTOP}/lib/libcasper/services/cap_sysctl LIBCAP_SYSLOGDIR= ${OBJTOP}/lib/libcasper/services/cap_syslog LIBBSDXMLDIR= ${OBJTOP}/lib/libexpat LIBKVMDIR= ${OBJTOP}/lib/libkvm LIBPTHREADDIR= ${OBJTOP}/lib/libthr LIBMDIR= ${OBJTOP}/lib/msun LIBFORMDIR= ${OBJTOP}/lib/ncurses/form LIBFORMLIBWDIR= ${OBJTOP}/lib/ncurses/formw LIBMENUDIR= ${OBJTOP}/lib/ncurses/menu LIBMENULIBWDIR= ${OBJTOP}/lib/ncurses/menuw LIBNCURSESDIR= ${OBJTOP}/lib/ncurses/ncurses LIBNCURSESWDIR= ${OBJTOP}/lib/ncurses/ncursesw LIBPANELDIR= ${OBJTOP}/lib/ncurses/panel LIBPANELWDIR= ${OBJTOP}/lib/ncurses/panelw LIBCRYPTODIR= ${OBJTOP}/secure/lib/libcrypto LIBSSHDIR= ${OBJTOP}/secure/lib/libssh LIBSSLDIR= ${OBJTOP}/secure/lib/libssl LIBTEKENDIR= ${OBJTOP}/sys/teken/libteken LIBEGACYDIR= ${OBJTOP}/tools/build LIBLNDIR= ${OBJTOP}/usr.bin/lex/lib LIBTERMCAPDIR= ${LIBNCURSESDIR} LIBTERMCAPWDIR= ${LIBNCURSESWDIR} # Default other library directories to lib/libNAME. .for lib in ${_LIBRARIES} LIB${lib:tu}DIR?= ${OBJTOP}/lib/lib${lib} .endfor # Validate that listed LIBADD are valid. .for _l in ${LIBADD} .if empty(_LIBRARIES:M${_l}) _BADLIBADD+= ${_l} .endif .endfor .if !empty(_BADLIBADD) .error ${.CURDIR}: Invalid LIBADD used which may need to be added to ${_this:T}: ${_BADLIBADD} .endif # Sanity check that libraries are defined here properly when building them. .if defined(LIB) && ${_LIBRARIES:M${LIB}} != "" .if !empty(LIBADD) && \ (!defined(_DP_${LIB}) || ${LIBADD:O:u} != ${_DP_${LIB}:O:u}) .error ${.CURDIR}: Missing or incorrect _DP_${LIB} entry in ${_this:T}. Should match LIBADD for ${LIB} ('${LIBADD}' vs '${_DP_${LIB}}') .endif # Note that OBJTOP is not yet defined here but for the purpose of the check # it is fine as it resolves to the SRC directory. .if !defined(LIB${LIB:tu}DIR) || !exists(${SRCTOP}/${LIB${LIB:tu}DIR:S,^${OBJTOP}/,,}) .error ${.CURDIR}: Missing or incorrect value for LIB${LIB:tu}DIR in ${_this:T}: ${LIB${LIB:tu}DIR:S,^${OBJTOP}/,,} .endif .if ${_INTERNALLIBS:M${LIB}} != "" && !defined(LIB${LIB:tu}) .error ${.CURDIR}: Missing value for LIB${LIB:tu} in ${_this:T}. Likely should be: LIB${LIB:tu}?= $${LIB${LIB:tu}DIR}/lib${LIB}.a .endif .endif .endif # !target(____) Index: projects/clang1100-import/sys/compat/linuxkpi/common/include/linux/fs.h =================================================================== --- projects/clang1100-import/sys/compat/linuxkpi/common/include/linux/fs.h (revision 364278) +++ projects/clang1100-import/sys/compat/linuxkpi/common/include/linux/fs.h (revision 364279) @@ -1,305 +1,305 @@ /*- * Copyright (c) 2010 Isilon Systems, Inc. * Copyright (c) 2010 iX Systems, Inc. * Copyright (c) 2010 Panasas, Inc. * Copyright (c) 2013-2018 Mellanox Technologies, Ltd. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _LINUX_FS_H_ #define _LINUX_FS_H_ #include #include #include #include #include #include #include #include #include #include #include #include struct module; struct kiocb; struct iovec; struct dentry; struct page; struct file_lock; struct pipe_inode_info; struct vm_area_struct; struct poll_table_struct; struct files_struct; struct pfs_node; struct linux_cdev; #define inode vnode #define i_cdev v_rdev #define i_private v_data #define S_IRUGO (S_IRUSR | S_IRGRP | S_IROTH) #define S_IWUGO (S_IWUSR | S_IWGRP | S_IWOTH) typedef struct files_struct *fl_owner_t; struct file_operations; struct linux_file_wait_queue { struct wait_queue wq; struct wait_queue_head *wqh; atomic_t state; #define LINUX_FWQ_STATE_INIT 0 #define LINUX_FWQ_STATE_NOT_READY 1 #define LINUX_FWQ_STATE_QUEUED 2 #define LINUX_FWQ_STATE_READY 3 #define LINUX_FWQ_STATE_MAX 4 }; struct linux_file { struct file *_file; const struct file_operations *f_op; void *private_data; int f_flags; int f_mode; /* Just starting mode. */ struct dentry *f_dentry; struct dentry f_dentry_store; struct selinfo f_selinfo; struct sigio *f_sigio; struct vnode *f_vnode; #define f_inode f_vnode volatile u_int f_count; /* anonymous shmem object */ vm_object_t f_shmem; /* kqfilter support */ int f_kqflags; #define LINUX_KQ_FLAG_HAS_READ (1 << 0) #define LINUX_KQ_FLAG_HAS_WRITE (1 << 1) #define LINUX_KQ_FLAG_NEED_READ (1 << 2) #define LINUX_KQ_FLAG_NEED_WRITE (1 << 3) /* protects f_selinfo.si_note */ spinlock_t f_kqlock; struct linux_file_wait_queue f_wait_queue; /* pointer to associated character device, if any */ struct linux_cdev *f_cdev; }; #define file linux_file #define fasync_struct sigio * #define fasync_helper(fd, filp, on, queue) \ ({ \ if ((on)) \ *(queue) = &(filp)->f_sigio; \ else \ *(queue) = NULL; \ 0; \ }) #define kill_fasync(queue, sig, pollstat) \ do { \ if (*(queue) != NULL) \ pgsigio(*(queue), (sig), 0); \ } while (0) typedef int (*filldir_t)(void *, const char *, int, off_t, u64, unsigned); struct file_operations { struct module *owner; ssize_t (*read)(struct linux_file *, char __user *, size_t, off_t *); ssize_t (*write)(struct linux_file *, const char __user *, size_t, off_t *); unsigned int (*poll) (struct linux_file *, struct poll_table_struct *); long (*unlocked_ioctl)(struct linux_file *, unsigned int, unsigned long); long (*compat_ioctl)(struct linux_file *, unsigned int, unsigned long); int (*mmap)(struct linux_file *, struct vm_area_struct *); int (*open)(struct inode *, struct file *); int (*release)(struct inode *, struct linux_file *); int (*fasync)(int, struct linux_file *, int); /* Although not supported in FreeBSD, to align with Linux code * we are adding llseek() only when it is mapped to no_llseek which returns * an illegal seek error */ off_t (*llseek)(struct linux_file *, off_t, int); #if 0 /* We do not support these methods. Don't permit them to compile. */ loff_t (*llseek)(struct file *, loff_t, int); ssize_t (*aio_read)(struct kiocb *, const struct iovec *, unsigned long, loff_t); ssize_t (*aio_write)(struct kiocb *, const struct iovec *, unsigned long, loff_t); int (*readdir)(struct file *, void *, filldir_t); int (*ioctl)(struct inode *, struct file *, unsigned int, unsigned long); int (*flush)(struct file *, fl_owner_t id); int (*fsync)(struct file *, struct dentry *, int datasync); int (*aio_fsync)(struct kiocb *, int datasync); int (*lock)(struct file *, int, struct file_lock *); ssize_t (*sendpage)(struct file *, struct page *, int, size_t, loff_t *, int); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); int (*flock)(struct file *, int, struct file_lock *); ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); int (*setlease)(struct file *, long, struct file_lock **); #endif }; #define fops_get(fops) (fops) #define replace_fops(f, fops) ((f)->f_op = (fops)) #define FMODE_READ FREAD #define FMODE_WRITE FWRITE #define FMODE_EXEC FEXEC int __register_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name, const struct file_operations *fops); int __register_chrdev_p(unsigned int major, unsigned int baseminor, unsigned int count, const char *name, const struct file_operations *fops, uid_t uid, gid_t gid, int mode); void __unregister_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name); static inline void unregister_chrdev(unsigned int major, const char *name) { __unregister_chrdev(major, 0, 256, name); } static inline int register_chrdev(unsigned int major, const char *name, const struct file_operations *fops) { return (__register_chrdev(major, 0, 256, name, fops)); } static inline int register_chrdev_p(unsigned int major, const char *name, const struct file_operations *fops, uid_t uid, gid_t gid, int mode) { return (__register_chrdev_p(major, 0, 256, name, fops, uid, gid, mode)); } static inline int register_chrdev_region(dev_t dev, unsigned range, const char *name) { return 0; } static inline void unregister_chrdev_region(dev_t dev, unsigned range) { return; } static inline int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, const char *name) { return 0; } /* No current support for seek op in FreeBSD */ static inline int nonseekable_open(struct inode *inode, struct file *filp) { return 0; } extern unsigned int linux_iminor(struct inode *); #define iminor(...) linux_iminor(__VA_ARGS__) static inline struct linux_file * get_file(struct linux_file *f) { refcount_acquire(f->_file == NULL ? &f->f_count : &f->_file->f_count); return (f); } static inline struct inode * igrab(struct inode *inode) { int error; - error = vget(inode, 0, curthread); + error = vget(inode, 0); if (error) return (NULL); return (inode); } static inline void iput(struct inode *inode) { vrele(inode); } static inline loff_t no_llseek(struct file *file, loff_t offset, int whence) { return (-ESPIPE); } static inline loff_t noop_llseek(struct linux_file *file, loff_t offset, int whence) { return (file->_file->f_offset); } static inline struct vnode * file_inode(const struct linux_file *file) { return (file->f_vnode); } static inline int call_mmap(struct linux_file *file, struct vm_area_struct *vma) { return (file->f_op->mmap(file, vma)); } #endif /* _LINUX_FS_H_ */ Index: projects/clang1100-import/sys/fs/autofs/autofs_vnops.c =================================================================== --- projects/clang1100-import/sys/fs/autofs/autofs_vnops.c (revision 364278) +++ projects/clang1100-import/sys/fs/autofs/autofs_vnops.c (revision 364279) @@ -1,715 +1,715 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2014 The FreeBSD Foundation * All rights reserved. * * This software was developed by Edward Tomasz Napierala under sponsorship * from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int autofs_trigger_vn(struct vnode *vp, const char *path, int pathlen, struct vnode **newvp); extern struct autofs_softc *autofs_softc; static int autofs_access(struct vop_access_args *ap) { /* * Nothing to do here; the only kind of access control * needed is in autofs_mkdir(). */ return (0); } static int autofs_getattr(struct vop_getattr_args *ap) { struct vnode *vp, *newvp; struct autofs_node *anp; struct mount *mp; struct vattr *vap; int error; vp = ap->a_vp; anp = vp->v_data; mp = vp->v_mount; vap = ap->a_vap; KASSERT(ap->a_vp->v_type == VDIR, ("!VDIR")); /* * The reason we must do this is that some tree-walking software, * namely fts(3), assumes that stat(".") results will not change * between chdir("subdir") and chdir(".."), and fails with ENOENT * otherwise. */ if (autofs_mount_on_stat && autofs_cached(anp, NULL, 0) == false && autofs_ignore_thread(curthread) == false) { error = autofs_trigger_vn(vp, "", 0, &newvp); if (error != 0) return (error); if (newvp != NULL) { error = VOP_GETATTR(newvp, ap->a_vap, ap->a_cred); vput(newvp); return (error); } } vap->va_type = VDIR; vap->va_mode = 0755; vap->va_nlink = 3; /* XXX */ vap->va_uid = 0; vap->va_gid = 0; vap->va_rdev = NODEV; vap->va_fsid = mp->mnt_stat.f_fsid.val[0]; vap->va_fileid = anp->an_fileno; vap->va_size = S_BLKSIZE; vap->va_blocksize = S_BLKSIZE; vap->va_mtime = anp->an_ctime; vap->va_atime = anp->an_ctime; vap->va_ctime = anp->an_ctime; vap->va_birthtime = anp->an_ctime; vap->va_gen = 0; vap->va_flags = 0; vap->va_rdev = 0; vap->va_bytes = S_BLKSIZE; vap->va_filerev = 0; vap->va_spare = 0; return (0); } /* * Unlock the vnode, request automountd(8) action, and then lock it back. * If anything got mounted on top of the vnode, return the new filesystem's * root vnode in 'newvp', locked. */ static int autofs_trigger_vn(struct vnode *vp, const char *path, int pathlen, struct vnode **newvp) { struct autofs_node *anp; int error, lock_flags; anp = vp->v_data; /* * Release the vnode lock, so that other operations, in partcular * mounting a filesystem on top of it, can proceed. Increase use * count, to prevent the vnode from being deallocated and to prevent * filesystem from being unmounted. */ lock_flags = VOP_ISLOCKED(vp); vref(vp); VOP_UNLOCK(vp); sx_xlock(&autofs_softc->sc_lock); /* * XXX: Workaround for mounting the same thing multiple times; revisit. */ if (vp->v_mountedhere != NULL) { error = 0; goto mounted; } error = autofs_trigger(anp, path, pathlen); mounted: sx_xunlock(&autofs_softc->sc_lock); vn_lock(vp, lock_flags | LK_RETRY); vunref(vp); if (VN_IS_DOOMED(vp)) { AUTOFS_DEBUG("VIRF_DOOMED"); return (ENOENT); } if (error != 0) return (error); if (vp->v_mountedhere == NULL) { *newvp = NULL; return (0); } else { /* * If the operation that succeeded was mount, then mark * the node as non-cached. Otherwise, if someone unmounts * the filesystem before the cache times out, we will fail * to trigger. */ anp->an_cached = false; } error = VFS_ROOT(vp->v_mountedhere, lock_flags, newvp); if (error != 0) { AUTOFS_WARN("VFS_ROOT() failed with error %d", error); return (error); } return (0); } static int autofs_vget_callback(struct mount *mp, void *arg, int flags, struct vnode **vpp) { return (autofs_node_vn(arg, mp, flags, vpp)); } static int autofs_lookup(struct vop_lookup_args *ap) { struct vnode *dvp, *newvp, **vpp; struct mount *mp; struct autofs_mount *amp; struct autofs_node *anp, *child; struct componentname *cnp; int error; dvp = ap->a_dvp; vpp = ap->a_vpp; mp = dvp->v_mount; amp = VFSTOAUTOFS(mp); anp = dvp->v_data; cnp = ap->a_cnp; if (cnp->cn_flags & ISDOTDOT) { KASSERT(anp->an_parent != NULL, ("NULL parent")); /* * Note that in this case, dvp is the child vnode, and we * are looking up the parent vnode - exactly reverse from * normal operation. Unlocking dvp requires some rather * tricky unlock/relock dance to prevent mp from being freed; * use vn_vget_ino_gen() which takes care of all that. */ error = vn_vget_ino_gen(dvp, autofs_vget_callback, anp->an_parent, cnp->cn_lkflags, vpp); if (error != 0) { AUTOFS_WARN("vn_vget_ino_gen() failed with error %d", error); return (error); } return (error); } if (cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.') { vref(dvp); *vpp = dvp; return (0); } if (autofs_cached(anp, cnp->cn_nameptr, cnp->cn_namelen) == false && autofs_ignore_thread(cnp->cn_thread) == false) { error = autofs_trigger_vn(dvp, cnp->cn_nameptr, cnp->cn_namelen, &newvp); if (error != 0) return (error); if (newvp != NULL) { /* * The target filesystem got automounted. * Let the lookup(9) go around with the same * path component. */ vput(newvp); return (ERELOOKUP); } } AUTOFS_SLOCK(amp); error = autofs_node_find(anp, cnp->cn_nameptr, cnp->cn_namelen, &child); if (error != 0) { if ((cnp->cn_flags & ISLASTCN) && cnp->cn_nameiop == CREATE) { AUTOFS_SUNLOCK(amp); return (EJUSTRETURN); } AUTOFS_SUNLOCK(amp); return (ENOENT); } /* * XXX: Dropping the node here is ok, because we never remove nodes. */ AUTOFS_SUNLOCK(amp); error = autofs_node_vn(child, mp, cnp->cn_lkflags, vpp); if (error != 0) { if ((cnp->cn_flags & ISLASTCN) && cnp->cn_nameiop == CREATE) return (EJUSTRETURN); return (error); } return (0); } static int autofs_mkdir(struct vop_mkdir_args *ap) { struct vnode *vp; struct autofs_node *anp; struct autofs_mount *amp; struct autofs_node *child; int error; vp = ap->a_dvp; anp = vp->v_data; amp = VFSTOAUTOFS(vp->v_mount); /* * Do not allow mkdir() if the calling thread is not * automountd(8) descendant. */ if (autofs_ignore_thread(curthread) == false) return (EPERM); AUTOFS_XLOCK(amp); error = autofs_node_new(anp, amp, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen, &child); if (error != 0) { AUTOFS_XUNLOCK(amp); return (error); } AUTOFS_XUNLOCK(amp); error = autofs_node_vn(child, vp->v_mount, LK_EXCLUSIVE, ap->a_vpp); return (error); } static int autofs_print(struct vop_print_args *ap) { struct vnode *vp; struct autofs_node *anp; vp = ap->a_vp; anp = vp->v_data; printf(" name \"%s\", fileno %d, cached %d, wildcards %d\n", anp->an_name, anp->an_fileno, anp->an_cached, anp->an_wildcards); return (0); } /* * Write out a single 'struct dirent', based on 'name' and 'fileno' arguments. */ static int autofs_readdir_one(struct uio *uio, const char *name, int fileno, size_t *reclenp) { struct dirent dirent; size_t namlen, reclen; int error; namlen = strlen(name); reclen = _GENERIC_DIRLEN(namlen); if (reclenp != NULL) *reclenp = reclen; if (uio == NULL) return (0); if (uio->uio_resid < reclen) return (EINVAL); dirent.d_fileno = fileno; dirent.d_reclen = reclen; dirent.d_type = DT_DIR; dirent.d_namlen = namlen; memcpy(dirent.d_name, name, namlen); dirent_terminate(&dirent); error = uiomove(&dirent, reclen, uio); return (error); } static size_t autofs_dirent_reclen(const char *name) { size_t reclen; (void)autofs_readdir_one(NULL, name, -1, &reclen); return (reclen); } static int autofs_readdir(struct vop_readdir_args *ap) { struct vnode *vp, *newvp; struct autofs_mount *amp; struct autofs_node *anp, *child; struct uio *uio; size_t reclen, reclens; ssize_t initial_resid; int error; vp = ap->a_vp; amp = VFSTOAUTOFS(vp->v_mount); anp = vp->v_data; uio = ap->a_uio; initial_resid = ap->a_uio->uio_resid; KASSERT(vp->v_type == VDIR, ("!VDIR")); if (autofs_cached(anp, NULL, 0) == false && autofs_ignore_thread(curthread) == false) { error = autofs_trigger_vn(vp, "", 0, &newvp); if (error != 0) return (error); if (newvp != NULL) { error = VOP_READDIR(newvp, ap->a_uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies); vput(newvp); return (error); } } if (uio->uio_offset < 0) return (EINVAL); if (ap->a_eofflag != NULL) *ap->a_eofflag = FALSE; /* * Write out the directory entry for ".". This is conditional * on the current offset into the directory; same applies to the * other two cases below. */ if (uio->uio_offset == 0) { error = autofs_readdir_one(uio, ".", anp->an_fileno, &reclen); if (error != 0) goto out; } reclens = autofs_dirent_reclen("."); /* * Write out the directory entry for "..". */ if (uio->uio_offset <= reclens) { if (uio->uio_offset != reclens) return (EINVAL); if (anp->an_parent == NULL) { error = autofs_readdir_one(uio, "..", anp->an_fileno, &reclen); } else { error = autofs_readdir_one(uio, "..", anp->an_parent->an_fileno, &reclen); } if (error != 0) goto out; } reclens += autofs_dirent_reclen(".."); /* * Write out the directory entries for subdirectories. */ AUTOFS_SLOCK(amp); RB_FOREACH(child, autofs_node_tree, &anp->an_children) { /* * Check the offset to skip entries returned by previous * calls to getdents(). */ if (uio->uio_offset > reclens) { reclens += autofs_dirent_reclen(child->an_name); continue; } /* * Prevent seeking into the middle of dirent. */ if (uio->uio_offset != reclens) { AUTOFS_SUNLOCK(amp); return (EINVAL); } error = autofs_readdir_one(uio, child->an_name, child->an_fileno, &reclen); reclens += reclen; if (error != 0) { AUTOFS_SUNLOCK(amp); goto out; } } AUTOFS_SUNLOCK(amp); if (ap->a_eofflag != NULL) *ap->a_eofflag = TRUE; return (0); out: /* * Return error if the initial buffer was too small to do anything. */ if (uio->uio_resid == initial_resid) return (error); /* * Don't return an error if we managed to copy out some entries. */ if (uio->uio_resid < reclen) return (0); return (error); } static int autofs_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp; struct autofs_node *anp; vp = ap->a_vp; anp = vp->v_data; /* * We do not free autofs_node here; instead we are * destroying them in autofs_node_delete(). */ sx_xlock(&anp->an_vnode_lock); anp->an_vnode = NULL; vp->v_data = NULL; sx_xunlock(&anp->an_vnode_lock); return (0); } struct vop_vector autofs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = autofs_access, .vop_lookup = autofs_lookup, .vop_create = VOP_EOPNOTSUPP, .vop_getattr = autofs_getattr, .vop_link = VOP_EOPNOTSUPP, .vop_mkdir = autofs_mkdir, .vop_mknod = VOP_EOPNOTSUPP, .vop_print = autofs_print, .vop_read = VOP_EOPNOTSUPP, .vop_readdir = autofs_readdir, .vop_remove = VOP_EOPNOTSUPP, .vop_rename = VOP_EOPNOTSUPP, .vop_rmdir = VOP_EOPNOTSUPP, .vop_setattr = VOP_EOPNOTSUPP, .vop_symlink = VOP_EOPNOTSUPP, .vop_write = VOP_EOPNOTSUPP, .vop_reclaim = autofs_reclaim, }; VFS_VOP_VECTOR_REGISTER(autofs_vnodeops); int autofs_node_new(struct autofs_node *parent, struct autofs_mount *amp, const char *name, int namelen, struct autofs_node **anpp) { struct autofs_node *anp; if (parent != NULL) { AUTOFS_ASSERT_XLOCKED(parent->an_mount); KASSERT(autofs_node_find(parent, name, namelen, NULL) == ENOENT, ("node \"%s\" already exists", name)); } anp = uma_zalloc(autofs_node_zone, M_WAITOK | M_ZERO); if (namelen >= 0) anp->an_name = strndup(name, namelen, M_AUTOFS); else anp->an_name = strdup(name, M_AUTOFS); anp->an_fileno = atomic_fetchadd_int(&->am_last_fileno, 1); callout_init(&anp->an_callout, 1); /* * The reason for SX_NOWITNESS here is that witness(4) * cannot tell vnodes apart, so the following perfectly * valid lock order... * * vnode lock A -> autofsvlk B -> vnode lock B * * ... gets reported as a LOR. */ sx_init_flags(&anp->an_vnode_lock, "autofsvlk", SX_NOWITNESS); getnanotime(&anp->an_ctime); anp->an_parent = parent; anp->an_mount = amp; if (parent != NULL) RB_INSERT(autofs_node_tree, &parent->an_children, anp); RB_INIT(&anp->an_children); *anpp = anp; return (0); } int autofs_node_find(struct autofs_node *parent, const char *name, int namelen, struct autofs_node **anpp) { struct autofs_node *anp, find; int error; AUTOFS_ASSERT_LOCKED(parent->an_mount); if (namelen >= 0) find.an_name = strndup(name, namelen, M_AUTOFS); else find.an_name = strdup(name, M_AUTOFS); anp = RB_FIND(autofs_node_tree, &parent->an_children, &find); if (anp != NULL) { error = 0; if (anpp != NULL) *anpp = anp; } else { error = ENOENT; } free(find.an_name, M_AUTOFS); return (error); } void autofs_node_delete(struct autofs_node *anp) { struct autofs_node *parent; AUTOFS_ASSERT_XLOCKED(anp->an_mount); KASSERT(RB_EMPTY(&anp->an_children), ("have children")); callout_drain(&anp->an_callout); parent = anp->an_parent; if (parent != NULL) RB_REMOVE(autofs_node_tree, &parent->an_children, anp); sx_destroy(&anp->an_vnode_lock); free(anp->an_name, M_AUTOFS); uma_zfree(autofs_node_zone, anp); } int autofs_node_vn(struct autofs_node *anp, struct mount *mp, int flags, struct vnode **vpp) { struct vnode *vp; int error; AUTOFS_ASSERT_UNLOCKED(anp->an_mount); sx_xlock(&anp->an_vnode_lock); vp = anp->an_vnode; if (vp != NULL) { - error = vget(vp, flags | LK_RETRY, curthread); + error = vget(vp, flags | LK_RETRY); if (error != 0) { AUTOFS_WARN("vget failed with error %d", error); sx_xunlock(&anp->an_vnode_lock); return (error); } if (VN_IS_DOOMED(vp)) { /* * We got forcibly unmounted. */ AUTOFS_DEBUG("doomed vnode"); sx_xunlock(&anp->an_vnode_lock); vput(vp); return (ENOENT); } *vpp = vp; sx_xunlock(&anp->an_vnode_lock); return (0); } error = getnewvnode("autofs", mp, &autofs_vnodeops, &vp); if (error != 0) { sx_xunlock(&anp->an_vnode_lock); return (error); } error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (error != 0) { sx_xunlock(&anp->an_vnode_lock); vdrop(vp); return (error); } vp->v_type = VDIR; if (anp->an_parent == NULL) vp->v_vflag |= VV_ROOT; vp->v_data = anp; VN_LOCK_ASHARE(vp); error = insmntque(vp, mp); if (error != 0) { AUTOFS_DEBUG("insmntque() failed with error %d", error); sx_xunlock(&anp->an_vnode_lock); return (error); } KASSERT(anp->an_vnode == NULL, ("lost race")); anp->an_vnode = vp; sx_xunlock(&anp->an_vnode_lock); *vpp = vp; return (0); } Index: projects/clang1100-import/sys/fs/ext2fs/ext2_vfsops.c =================================================================== --- projects/clang1100-import/sys/fs/ext2fs/ext2_vfsops.c (revision 364278) +++ projects/clang1100-import/sys/fs/ext2fs/ext2_vfsops.c (revision 364279) @@ -1,1445 +1,1445 @@ /*- * modified for EXT2FS support in Lites 1.1 * * Aug 1995, Godmar Back (gback@cs.utah.edu) * University of Utah, Department of Computer Science */ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_vfsops.c 8.8 (Berkeley) 4/18/94 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SDT_PROVIDER_DECLARE(ext2fs); /* * ext2fs trace probe: * arg0: verbosity. Higher numbers give more verbose messages * arg1: Textual message */ SDT_PROBE_DEFINE2(ext2fs, , vfsops, trace, "int", "char*"); SDT_PROBE_DEFINE2(ext2fs, , vfsops, ext2_cg_validate_error, "char*", "int"); SDT_PROBE_DEFINE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "char*"); static int ext2_flushfiles(struct mount *mp, int flags, struct thread *td); static int ext2_mountfs(struct vnode *, struct mount *); static int ext2_reload(struct mount *mp, struct thread *td); static int ext2_sbupdate(struct ext2mount *, int); static int ext2_cgupdate(struct ext2mount *, int); static vfs_unmount_t ext2_unmount; static vfs_root_t ext2_root; static vfs_statfs_t ext2_statfs; static vfs_sync_t ext2_sync; static vfs_vget_t ext2_vget; static vfs_fhtovp_t ext2_fhtovp; static vfs_mount_t ext2_mount; MALLOC_DEFINE(M_EXT2NODE, "ext2_node", "EXT2 vnode private part"); static MALLOC_DEFINE(M_EXT2MNT, "ext2_mount", "EXT2 mount structure"); static struct vfsops ext2fs_vfsops = { .vfs_fhtovp = ext2_fhtovp, .vfs_mount = ext2_mount, .vfs_root = ext2_root, /* root inode via vget */ .vfs_statfs = ext2_statfs, .vfs_sync = ext2_sync, .vfs_unmount = ext2_unmount, .vfs_vget = ext2_vget, }; VFS_SET(ext2fs_vfsops, ext2fs, 0); static int ext2_check_sb_compat(struct ext2fs *es, struct cdev *dev, int ronly); static int ext2_compute_sb_data(struct vnode * devvp, struct ext2fs * es, struct m_ext2fs * fs); static const char *ext2_opts[] = { "acls", "async", "noatime", "noclusterr", "noclusterw", "noexec", "export", "force", "from", "multilabel", "suiddir", "nosymfollow", "sync", "union", NULL }; /* * VFS Operations. * * mount system call */ static int ext2_mount(struct mount *mp) { struct vfsoptlist *opts; struct vnode *devvp; struct thread *td; struct ext2mount *ump = NULL; struct m_ext2fs *fs; struct nameidata nd, *ndp = &nd; accmode_t accmode; char *path, *fspec; int error, flags, len; td = curthread; opts = mp->mnt_optnew; if (vfs_filteropt(opts, ext2_opts)) return (EINVAL); vfs_getopt(opts, "fspath", (void **)&path, NULL); /* Double-check the length of path.. */ if (strlen(path) >= MAXMNTLEN) return (ENAMETOOLONG); fspec = NULL; error = vfs_getopt(opts, "from", (void **)&fspec, &len); if (!error && fspec[len - 1] != '\0') return (EINVAL); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOEXT2(mp); fs = ump->um_e2fs; error = 0; if (fs->e2fs_ronly == 0 && vfs_flagopt(opts, "ro", NULL, 0)) { error = VFS_SYNC(mp, MNT_WAIT); if (error) return (error); flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; error = ext2_flushfiles(mp, flags, td); if (error == 0 && fs->e2fs_wasvalid && ext2_cgupdate(ump, MNT_WAIT) == 0) { fs->e2fs->e2fs_state = htole16((le16toh(fs->e2fs->e2fs_state) | E2FS_ISCLEAN)); ext2_sbupdate(ump, MNT_WAIT); } fs->e2fs_ronly = 1; vfs_flagopt(opts, "ro", &mp->mnt_flag, MNT_RDONLY); g_topology_lock(); g_access(ump->um_cp, 0, -1, 0); g_topology_unlock(); } if (!error && (mp->mnt_flag & MNT_RELOAD)) error = ext2_reload(mp, td); if (error) return (error); devvp = ump->um_devvp; if (fs->e2fs_ronly && !vfs_flagopt(opts, "ro", NULL, 0)) { if (ext2_check_sb_compat(fs->e2fs, devvp->v_rdev, 0)) return (EPERM); /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { VOP_UNLOCK(devvp); return (error); } VOP_UNLOCK(devvp); g_topology_lock(); error = g_access(ump->um_cp, 0, 1, 0); g_topology_unlock(); if (error) return (error); if ((le16toh(fs->e2fs->e2fs_state) & E2FS_ISCLEAN) == 0 || (le16toh(fs->e2fs->e2fs_state) & E2FS_ERRORS)) { if (mp->mnt_flag & MNT_FORCE) { printf( "WARNING: %s was not properly dismounted\n", fs->e2fs_fsmnt); } else { printf( "WARNING: R/W mount of %s denied. Filesystem is not clean - run fsck\n", fs->e2fs_fsmnt); return (EPERM); } } fs->e2fs->e2fs_state = htole16(le16toh(fs->e2fs->e2fs_state) & ~E2FS_ISCLEAN); (void)ext2_cgupdate(ump, MNT_WAIT); fs->e2fs_ronly = 0; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_RDONLY; MNT_IUNLOCK(mp); } if (vfs_flagopt(opts, "export", NULL, 0)) { /* Process export requests in vfs_mount.c. */ return (error); } } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ if (fspec == NULL) return (EINVAL); NDINIT(ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td); if ((error = namei(ndp)) != 0) return (error); NDFREE(ndp, NDF_ONLY_PNBUF); devvp = ndp->ni_vp; if (!vn_isdisk(devvp, &error)) { vput(devvp); return (error); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. * * XXXRW: VOP_ACCESS() enough? */ accmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accmode |= VWRITE; error = VOP_ACCESS(devvp, accmode, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(devvp); return (error); } if ((mp->mnt_flag & MNT_UPDATE) == 0) { error = ext2_mountfs(devvp, mp); } else { if (devvp != ump->um_devvp) { vput(devvp); return (EINVAL); /* needs translation */ } else vput(devvp); } if (error) { vrele(devvp); return (error); } ump = VFSTOEXT2(mp); fs = ump->um_e2fs; /* * Note that this strncpy() is ok because of a check at the start * of ext2_mount(). */ strncpy(fs->e2fs_fsmnt, path, MAXMNTLEN); fs->e2fs_fsmnt[MAXMNTLEN - 1] = '\0'; vfs_mountedfrom(mp, fspec); return (0); } static int ext2_check_sb_compat(struct ext2fs *es, struct cdev *dev, int ronly) { uint32_t i, mask; if (le16toh(es->e2fs_magic) != E2FS_MAGIC) { printf("ext2fs: %s: wrong magic number %#x (expected %#x)\n", devtoname(dev), le16toh(es->e2fs_magic), E2FS_MAGIC); return (1); } if (le32toh(es->e2fs_rev) > E2FS_REV0) { mask = le32toh(es->e2fs_features_incompat) & ~(EXT2F_INCOMPAT_SUPP); if (mask) { printf("WARNING: mount of %s denied due to " "unsupported optional features:\n", devtoname(dev)); for (i = 0; i < sizeof(incompat)/sizeof(struct ext2_feature); i++) if (mask & incompat[i].mask) printf("%s ", incompat[i].name); printf("\n"); return (1); } mask = le32toh(es->e2fs_features_rocompat) & ~EXT2F_ROCOMPAT_SUPP; if (!ronly && mask) { printf("WARNING: R/W mount of %s denied due to " "unsupported optional features:\n", devtoname(dev)); for (i = 0; i < sizeof(ro_compat)/sizeof(struct ext2_feature); i++) if (mask & ro_compat[i].mask) printf("%s ", ro_compat[i].name); printf("\n"); return (1); } } return (0); } static e4fs_daddr_t ext2_cg_location(struct m_ext2fs *fs, int number) { int cg, descpb, logical_sb, has_super = 0; /* * Adjust logical superblock block number. * Godmar thinks: if the blocksize is greater than 1024, then * the superblock is logically part of block zero. */ logical_sb = fs->e2fs_bsize > SBSIZE ? 0 : 1; if (!EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_META_BG) || number < le32toh(fs->e2fs->e3fs_first_meta_bg)) return (logical_sb + number + 1); if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) descpb = fs->e2fs_bsize / sizeof(struct ext2_gd); else descpb = fs->e2fs_bsize / E2FS_REV0_GD_SIZE; cg = descpb * number; if (ext2_cg_has_sb(fs, cg)) has_super = 1; return (has_super + cg * (e4fs_daddr_t)EXT2_BLOCKS_PER_GROUP(fs) + le32toh(fs->e2fs->e2fs_first_dblock)); } static int ext2_cg_validate(struct m_ext2fs *fs) { uint64_t b_bitmap; uint64_t i_bitmap; uint64_t i_tables; uint64_t first_block, last_block, last_cg_block; struct ext2_gd *gd; unsigned int i, cg_count; first_block = le32toh(fs->e2fs->e2fs_first_dblock); last_cg_block = ext2_cg_number_gdb(fs, 0); cg_count = fs->e2fs_gcount; for (i = 0; i < fs->e2fs_gcount; i++) { gd = &fs->e2fs_gd[i]; if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_FLEX_BG) || i == fs->e2fs_gcount - 1) { last_block = fs->e2fs_bcount - 1; } else { last_block = first_block + (EXT2_BLOCKS_PER_GROUP(fs) - 1); } if ((cg_count == fs->e2fs_gcount) && !(le16toh(gd->ext4bgd_flags) & EXT2_BG_INODE_ZEROED)) cg_count = i; b_bitmap = e2fs_gd_get_b_bitmap(gd); if (b_bitmap == 0) { SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error, "block bitmap is zero", i); return (EINVAL); } if (b_bitmap <= last_cg_block) { SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error, "block bitmap overlaps gds", i); return (EINVAL); } if (b_bitmap < first_block || b_bitmap > last_block) { SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error, "block bitmap not in group", i); return (EINVAL); } i_bitmap = e2fs_gd_get_i_bitmap(gd); if (i_bitmap == 0) { SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error, "inode bitmap is zero", i); return (EINVAL); } if (i_bitmap <= last_cg_block) { SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error, "inode bitmap overlaps gds", i); return (EINVAL); } if (i_bitmap < first_block || i_bitmap > last_block) { SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error, "inode bitmap not in group blk", i); return (EINVAL); } i_tables = e2fs_gd_get_i_tables(gd); if (i_tables == 0) { SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error, "inode table is zero", i); return (EINVAL); } if (i_tables <= last_cg_block) { SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error, "inode talbes overlaps gds", i); return (EINVAL); } if (i_tables < first_block || i_tables + fs->e2fs_itpg - 1 > last_block) { SDT_PROBE2(ext2fs, , vfsops, ext2_cg_validate_error, "inode tables not in group blk", i); return (EINVAL); } if (!EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_FLEX_BG)) first_block += EXT2_BLOCKS_PER_GROUP(fs); } return (0); } /* * This computes the fields of the m_ext2fs structure from the * data in the ext2fs structure read in. */ static int ext2_compute_sb_data(struct vnode *devvp, struct ext2fs *es, struct m_ext2fs *fs) { struct buf *bp; uint32_t e2fs_descpb, e2fs_gdbcount_alloc; int i, j; int g_count = 0; int error; /* Check checksum features */ if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_GDT_CSUM) && EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "incorrect checksum features combination"); return (EINVAL); } /* Precompute checksum seed for all metadata */ ext2_sb_csum_set_seed(fs); /* Verify sb csum if possible */ if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) { error = ext2_sb_csum_verify(fs); if (error) { return (error); } } /* Check for block size = 1K|2K|4K */ if (le32toh(es->e2fs_log_bsize) > 2) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "bad block size"); return (EINVAL); } fs->e2fs_bshift = EXT2_MIN_BLOCK_LOG_SIZE + le32toh(es->e2fs_log_bsize); fs->e2fs_bsize = 1U << fs->e2fs_bshift; fs->e2fs_fsbtodb = le32toh(es->e2fs_log_bsize) + 1; fs->e2fs_qbmask = fs->e2fs_bsize - 1; /* Check for fragment size */ if (le32toh(es->e2fs_log_fsize) > (EXT2_MAX_FRAG_LOG_SIZE - EXT2_MIN_BLOCK_LOG_SIZE)) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "invalid log cluster size"); return (EINVAL); } fs->e2fs_fsize = EXT2_MIN_FRAG_SIZE << le32toh(es->e2fs_log_fsize); if (fs->e2fs_fsize != fs->e2fs_bsize) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "fragment size != block size"); return (EINVAL); } fs->e2fs_fpb = fs->e2fs_bsize / fs->e2fs_fsize; /* Check reserved gdt blocks for future filesystem expansion */ if (le16toh(es->e2fs_reserved_ngdb) > (fs->e2fs_bsize / 4)) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "number of reserved GDT blocks too large"); return (EINVAL); } if (le32toh(es->e2fs_rev) == E2FS_REV0) { fs->e2fs_isize = E2FS_REV0_INODE_SIZE; } else { fs->e2fs_isize = le16toh(es->e2fs_inode_size); /* * Check first ino. */ if (le32toh(es->e2fs_first_ino) < EXT2_FIRSTINO) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "invalid first ino"); return (EINVAL); } /* * Simple sanity check for superblock inode size value. */ if (EXT2_INODE_SIZE(fs) < E2FS_REV0_INODE_SIZE || EXT2_INODE_SIZE(fs) > fs->e2fs_bsize || (fs->e2fs_isize & (fs->e2fs_isize - 1)) != 0) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "invalid inode size"); return (EINVAL); } } /* Check group descriptors */ if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT) && le16toh(es->e3fs_desc_size) != E2FS_64BIT_GD_SIZE) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "unsupported 64bit descriptor size"); return (EINVAL); } fs->e2fs_bpg = le32toh(es->e2fs_bpg); fs->e2fs_fpg = le32toh(es->e2fs_fpg); if (fs->e2fs_bpg == 0 || fs->e2fs_fpg == 0) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "zero blocks/fragments per group"); return (EINVAL); } else if (fs->e2fs_bpg != fs->e2fs_fpg) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "blocks per group not equal fragments per group"); return (EINVAL); } if (fs->e2fs_bpg != fs->e2fs_bsize * 8) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "non-standard group size unsupported"); return (EINVAL); } fs->e2fs_ipb = fs->e2fs_bsize / EXT2_INODE_SIZE(fs); if (fs->e2fs_ipb == 0 || fs->e2fs_ipb > fs->e2fs_bsize / E2FS_REV0_INODE_SIZE) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "bad inodes per block size"); return (EINVAL); } fs->e2fs_ipg = le32toh(es->e2fs_ipg); if (fs->e2fs_ipg < fs->e2fs_ipb || fs->e2fs_ipg > fs->e2fs_bsize * 8) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "invalid inodes per group"); return (EINVAL); } fs->e2fs_itpg = fs->e2fs_ipg / fs->e2fs_ipb; fs->e2fs_bcount = le32toh(es->e2fs_bcount); fs->e2fs_rbcount = le32toh(es->e2fs_rbcount); fs->e2fs_fbcount = le32toh(es->e2fs_fbcount); if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) { fs->e2fs_bcount |= (uint64_t)(le32toh(es->e4fs_bcount_hi)) << 32; fs->e2fs_rbcount |= (uint64_t)(le32toh(es->e4fs_rbcount_hi)) << 32; fs->e2fs_fbcount |= (uint64_t)(le32toh(es->e4fs_fbcount_hi)) << 32; } if (fs->e2fs_rbcount > fs->e2fs_bcount || fs->e2fs_fbcount > fs->e2fs_bcount) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "invalid block count"); return (EINVAL); } fs->e2fs_ficount = le32toh(es->e2fs_ficount); if (fs->e2fs_ficount > le32toh(es->e2fs_icount)) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "invalid number of free inodes"); return (EINVAL); } if (le32toh(es->e2fs_first_dblock) >= fs->e2fs_bcount) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "first data block out of range"); return (EINVAL); } fs->e2fs_gcount = howmany(fs->e2fs_bcount - le32toh(es->e2fs_first_dblock), EXT2_BLOCKS_PER_GROUP(fs)); if (fs->e2fs_gcount > ((uint64_t)1 << 32) - EXT2_DESCS_PER_BLOCK(fs)) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "groups count too large"); return (EINVAL); } /* Check for extra isize in big inodes. */ if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_EXTRA_ISIZE) && EXT2_INODE_SIZE(fs) < sizeof(struct ext2fs_dinode)) { SDT_PROBE1(ext2fs, , vfsops, ext2_compute_sb_data_error, "no space for extra inode timestamps"); return (EINVAL); } /* s_resuid / s_resgid ? */ if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) { e2fs_descpb = fs->e2fs_bsize / E2FS_64BIT_GD_SIZE; e2fs_gdbcount_alloc = howmany(fs->e2fs_gcount, e2fs_descpb); } else { e2fs_descpb = fs->e2fs_bsize / E2FS_REV0_GD_SIZE; e2fs_gdbcount_alloc = howmany(fs->e2fs_gcount, fs->e2fs_bsize / sizeof(struct ext2_gd)); } fs->e2fs_gdbcount = howmany(fs->e2fs_gcount, e2fs_descpb); fs->e2fs_gd = malloc(e2fs_gdbcount_alloc * fs->e2fs_bsize, M_EXT2MNT, M_WAITOK | M_ZERO); fs->e2fs_contigdirs = malloc(fs->e2fs_gcount * sizeof(*fs->e2fs_contigdirs), M_EXT2MNT, M_WAITOK | M_ZERO); for (i = 0; i < fs->e2fs_gdbcount; i++) { error = bread(devvp, fsbtodb(fs, ext2_cg_location(fs, i)), fs->e2fs_bsize, NOCRED, &bp); if (error) { /* * fs->e2fs_gd and fs->e2fs_contigdirs * will be freed later by the caller, * because this function could be called from * MNT_UPDATE path. */ return (error); } if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) { memcpy(&fs->e2fs_gd[ i * fs->e2fs_bsize / sizeof(struct ext2_gd)], bp->b_data, fs->e2fs_bsize); } else { for (j = 0; j < e2fs_descpb && g_count < fs->e2fs_gcount; j++, g_count++) memcpy(&fs->e2fs_gd[g_count], bp->b_data + j * E2FS_REV0_GD_SIZE, E2FS_REV0_GD_SIZE); } brelse(bp); bp = NULL; } /* Validate cgs consistency */ error = ext2_cg_validate(fs); if (error) return (error); /* Verfy cgs csum */ if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_GDT_CSUM) || EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) { error = ext2_gd_csum_verify(fs, devvp->v_rdev); if (error) return (error); } /* Initialization for the ext2 Orlov allocator variant. */ fs->e2fs_total_dir = 0; for (i = 0; i < fs->e2fs_gcount; i++) fs->e2fs_total_dir += e2fs_gd_get_ndirs(&fs->e2fs_gd[i]); if (le32toh(es->e2fs_rev) == E2FS_REV0 || !EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_LARGEFILE)) fs->e2fs_maxfilesize = 0x7fffffff; else { fs->e2fs_maxfilesize = 0xffffffffffff; if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_HUGE_FILE)) fs->e2fs_maxfilesize = 0x7fffffffffffffff; } if (le32toh(es->e4fs_flags) & E2FS_UNSIGNED_HASH) { fs->e2fs_uhash = 3; } else if ((le32toh(es->e4fs_flags) & E2FS_SIGNED_HASH) == 0) { #ifdef __CHAR_UNSIGNED__ es->e4fs_flags = htole32(le32toh(es->e4fs_flags) | E2FS_UNSIGNED_HASH); fs->e2fs_uhash = 3; #else es->e4fs_flags = htole32(le32toh(es->e4fs_flags) | E2FS_SIGNED_HASH); #endif } if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) error = ext2_sb_csum_verify(fs); return (error); } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). The filesystem must * be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) invalidate all cluster summary information. * 4) invalidate all inactive vnodes. * 5) invalidate all cached file data. * 6) re-read inode data for all active vnodes. * XXX we are missing some steps, in particular # 3, this has to be reviewed. */ static int ext2_reload(struct mount *mp, struct thread *td) { struct vnode *vp, *mvp, *devvp; struct inode *ip; struct buf *bp; struct ext2fs *es; struct m_ext2fs *fs; struct csum *sump; int error, i; int32_t *lp; if ((mp->mnt_flag & MNT_RDONLY) == 0) return (EINVAL); /* * Step 1: invalidate all cached meta-data. */ devvp = VFSTOEXT2(mp)->um_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); if (vinvalbuf(devvp, 0, 0, 0) != 0) panic("ext2_reload: dirty1"); VOP_UNLOCK(devvp); /* * Step 2: re-read superblock from disk. * constants have been adjusted for ext2 */ if ((error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) != 0) return (error); es = (struct ext2fs *)bp->b_data; if (ext2_check_sb_compat(es, devvp->v_rdev, 0) != 0) { brelse(bp); return (EIO); /* XXX needs translation */ } fs = VFSTOEXT2(mp)->um_e2fs; bcopy(bp->b_data, fs->e2fs, sizeof(struct ext2fs)); if ((error = ext2_compute_sb_data(devvp, es, fs)) != 0) { brelse(bp); return (error); } #ifdef UNKLAR if (fs->fs_sbsize < SBSIZE) bp->b_flags |= B_INVAL; #endif brelse(bp); /* * Step 3: invalidate all cluster summary information. */ if (fs->e2fs_contigsumsize > 0) { lp = fs->e2fs_maxcluster; sump = fs->e2fs_clustersum; for (i = 0; i < fs->e2fs_gcount; i++, sump++) { *lp++ = fs->e2fs_contigsumsize; sump->cs_init = 0; bzero(sump->cs_sum, fs->e2fs_contigsumsize + 1); } } loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* * Step 4: invalidate all cached file data. */ - if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { + if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK)) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } if (vinvalbuf(vp, 0, 0, 0)) panic("ext2_reload: dirty2"); /* * Step 5: re-read inode data for all active vnodes. */ ip = VTOI(vp); error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->e2fs_bsize, NOCRED, &bp); if (error) { VOP_UNLOCK(vp); vrele(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } error = ext2_ei2i((struct ext2fs_dinode *)((char *)bp->b_data + EXT2_INODE_SIZE(fs) * ino_to_fsbo(fs, ip->i_number)), ip); brelse(bp); VOP_UNLOCK(vp); vrele(vp); if (error) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } } return (0); } /* * Common code for mount and mountroot. */ static int ext2_mountfs(struct vnode *devvp, struct mount *mp) { struct ext2mount *ump; struct buf *bp; struct m_ext2fs *fs; struct ext2fs *es; struct cdev *dev = devvp->v_rdev; struct g_consumer *cp; struct bufobj *bo; struct csum *sump; int error; int ronly; int i; u_long size; int32_t *lp; int32_t e2fs_maxcontig; ronly = vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0); /* XXX: use VOP_ACESS to check FS perms */ g_topology_lock(); error = g_vfs_open(devvp, &cp, "ext2fs", ronly ? 0 : 1); g_topology_unlock(); VOP_UNLOCK(devvp); if (error) return (error); /* XXX: should we check for some sectorsize or 512 instead? */ if (((SBSIZE % cp->provider->sectorsize) != 0) || (SBSIZE < cp->provider->sectorsize)) { g_topology_lock(); g_vfs_close(cp); g_topology_unlock(); return (EINVAL); } bo = &devvp->v_bufobj; bo->bo_private = cp; bo->bo_ops = g_vfs_bufops; if (devvp->v_rdev->si_iosize_max != 0) mp->mnt_iosize_max = devvp->v_rdev->si_iosize_max; if (mp->mnt_iosize_max > MAXPHYS) mp->mnt_iosize_max = MAXPHYS; bp = NULL; ump = NULL; if ((error = bread(devvp, SBLOCK, SBSIZE, NOCRED, &bp)) != 0) goto out; es = (struct ext2fs *)bp->b_data; if (ext2_check_sb_compat(es, dev, ronly) != 0) { error = EINVAL; /* XXX needs translation */ goto out; } if ((le16toh(es->e2fs_state) & E2FS_ISCLEAN) == 0 || (le16toh(es->e2fs_state) & E2FS_ERRORS)) { if (ronly || (mp->mnt_flag & MNT_FORCE)) { printf( "WARNING: Filesystem was not properly dismounted\n"); } else { printf( "WARNING: R/W mount denied. Filesystem is not clean - run fsck\n"); error = EPERM; goto out; } } ump = malloc(sizeof(*ump), M_EXT2MNT, M_WAITOK | M_ZERO); /* * I don't know whether this is the right strategy. Note that * we dynamically allocate both an m_ext2fs and an ext2fs * while Linux keeps the super block in a locked buffer. */ ump->um_e2fs = malloc(sizeof(struct m_ext2fs), M_EXT2MNT, M_WAITOK | M_ZERO); ump->um_e2fs->e2fs = malloc(sizeof(struct ext2fs), M_EXT2MNT, M_WAITOK); mtx_init(EXT2_MTX(ump), "EXT2FS", "EXT2FS Lock", MTX_DEF); bcopy(es, ump->um_e2fs->e2fs, (u_int)sizeof(struct ext2fs)); if ((error = ext2_compute_sb_data(devvp, ump->um_e2fs->e2fs, ump->um_e2fs))) goto out; /* * Calculate the maximum contiguous blocks and size of cluster summary * array. In FFS this is done by newfs; however, the superblock * in ext2fs doesn't have these variables, so we can calculate * them here. */ e2fs_maxcontig = MAX(1, MAXPHYS / ump->um_e2fs->e2fs_bsize); ump->um_e2fs->e2fs_contigsumsize = MIN(e2fs_maxcontig, EXT2_MAXCONTIG); if (ump->um_e2fs->e2fs_contigsumsize > 0) { size = ump->um_e2fs->e2fs_gcount * sizeof(int32_t); ump->um_e2fs->e2fs_maxcluster = malloc(size, M_EXT2MNT, M_WAITOK); size = ump->um_e2fs->e2fs_gcount * sizeof(struct csum); ump->um_e2fs->e2fs_clustersum = malloc(size, M_EXT2MNT, M_WAITOK); lp = ump->um_e2fs->e2fs_maxcluster; sump = ump->um_e2fs->e2fs_clustersum; for (i = 0; i < ump->um_e2fs->e2fs_gcount; i++, sump++) { *lp++ = ump->um_e2fs->e2fs_contigsumsize; sump->cs_init = 0; sump->cs_sum = malloc((ump->um_e2fs->e2fs_contigsumsize + 1) * sizeof(int32_t), M_EXT2MNT, M_WAITOK | M_ZERO); } } brelse(bp); bp = NULL; fs = ump->um_e2fs; fs->e2fs_ronly = ronly; /* ronly is set according to mnt_flags */ /* * If the fs is not mounted read-only, make sure the super block is * always written back on a sync(). */ fs->e2fs_wasvalid = le16toh(fs->e2fs->e2fs_state) & E2FS_ISCLEAN ? 1 : 0; if (ronly == 0) { fs->e2fs_fmod = 1; /* mark it modified and set fs invalid */ fs->e2fs->e2fs_state = htole16(le16toh(fs->e2fs->e2fs_state) & ~E2FS_ISCLEAN); } mp->mnt_data = ump; mp->mnt_stat.f_fsid.val[0] = dev2udev(dev); mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; mp->mnt_maxsymlinklen = EXT2_MAXSYMLINKLEN; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_bo = &devvp->v_bufobj; ump->um_cp = cp; /* * Setting those two parameters allowed us to use * ufs_bmap w/o changse! */ ump->um_nindir = EXT2_ADDR_PER_BLOCK(fs); ump->um_bptrtodb = le32toh(fs->e2fs->e2fs_log_bsize) + 1; ump->um_seqinc = EXT2_FRAGS_PER_BLOCK(fs); if (ronly == 0) ext2_sbupdate(ump, MNT_WAIT); /* * Initialize filesystem stat information in mount struct. */ MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED | MNTK_USES_BCACHE; MNT_IUNLOCK(mp); return (0); out: if (bp) brelse(bp); if (cp != NULL) { g_topology_lock(); g_vfs_close(cp); g_topology_unlock(); } if (ump) { mtx_destroy(EXT2_MTX(ump)); free(ump->um_e2fs->e2fs_gd, M_EXT2MNT); free(ump->um_e2fs->e2fs_contigdirs, M_EXT2MNT); free(ump->um_e2fs->e2fs, M_EXT2MNT); free(ump->um_e2fs, M_EXT2MNT); free(ump, M_EXT2MNT); mp->mnt_data = NULL; } return (error); } /* * Unmount system call. */ static int ext2_unmount(struct mount *mp, int mntflags) { struct ext2mount *ump; struct m_ext2fs *fs; struct csum *sump; int error, flags, i, ronly; flags = 0; if (mntflags & MNT_FORCE) { if (mp->mnt_flag & MNT_ROOTFS) return (EINVAL); flags |= FORCECLOSE; } if ((error = ext2_flushfiles(mp, flags, curthread)) != 0) return (error); ump = VFSTOEXT2(mp); fs = ump->um_e2fs; ronly = fs->e2fs_ronly; if (ronly == 0 && ext2_cgupdate(ump, MNT_WAIT) == 0) { if (fs->e2fs_wasvalid) fs->e2fs->e2fs_state = htole16(le16toh(fs->e2fs->e2fs_state) | E2FS_ISCLEAN); ext2_sbupdate(ump, MNT_WAIT); } g_topology_lock(); g_vfs_close(ump->um_cp); g_topology_unlock(); vrele(ump->um_devvp); sump = fs->e2fs_clustersum; for (i = 0; i < fs->e2fs_gcount; i++, sump++) free(sump->cs_sum, M_EXT2MNT); free(fs->e2fs_clustersum, M_EXT2MNT); free(fs->e2fs_maxcluster, M_EXT2MNT); free(fs->e2fs_gd, M_EXT2MNT); free(fs->e2fs_contigdirs, M_EXT2MNT); free(fs->e2fs, M_EXT2MNT); free(fs, M_EXT2MNT); free(ump, M_EXT2MNT); mp->mnt_data = NULL; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); return (error); } /* * Flush out all the files in a filesystem. */ static int ext2_flushfiles(struct mount *mp, int flags, struct thread *td) { int error; error = vflush(mp, 0, flags, td); return (error); } /* * Get filesystem statistics. */ int ext2_statfs(struct mount *mp, struct statfs *sbp) { struct ext2mount *ump; struct m_ext2fs *fs; uint32_t overhead, overhead_per_group, ngdb; int i, ngroups; ump = VFSTOEXT2(mp); fs = ump->um_e2fs; if (le16toh(fs->e2fs->e2fs_magic) != E2FS_MAGIC) panic("ext2_statfs"); /* * Compute the overhead (FS structures) */ overhead_per_group = 1 /* block bitmap */ + 1 /* inode bitmap */ + fs->e2fs_itpg; overhead = le32toh(fs->e2fs->e2fs_first_dblock) + fs->e2fs_gcount * overhead_per_group; if (le32toh(fs->e2fs->e2fs_rev) > E2FS_REV0 && le32toh(fs->e2fs->e2fs_features_rocompat) & EXT2F_ROCOMPAT_SPARSESUPER) { for (i = 0, ngroups = 0; i < fs->e2fs_gcount; i++) { if (ext2_cg_has_sb(fs, i)) ngroups++; } } else { ngroups = fs->e2fs_gcount; } ngdb = fs->e2fs_gdbcount; if (le32toh(fs->e2fs->e2fs_rev) > E2FS_REV0 && le32toh(fs->e2fs->e2fs_features_compat) & EXT2F_COMPAT_RESIZE) ngdb += le16toh(fs->e2fs->e2fs_reserved_ngdb); overhead += ngroups * (1 /* superblock */ + ngdb); sbp->f_bsize = EXT2_FRAG_SIZE(fs); sbp->f_iosize = EXT2_BLOCK_SIZE(fs); sbp->f_blocks = fs->e2fs_bcount - overhead; sbp->f_bfree = fs->e2fs_fbcount; sbp->f_bavail = sbp->f_bfree - fs->e2fs_rbcount; sbp->f_files = le32toh(fs->e2fs->e2fs_icount); sbp->f_ffree = fs->e2fs_ficount; return (0); } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. * * Note: we are always called with the filesystem marked `MPBUSY'. */ static int ext2_sync(struct mount *mp, int waitfor) { struct vnode *mvp, *vp; struct thread *td; struct inode *ip; struct ext2mount *ump = VFSTOEXT2(mp); struct m_ext2fs *fs; int error, allerror = 0; td = curthread; fs = ump->um_e2fs; if (fs->e2fs_fmod != 0 && fs->e2fs_ronly != 0) { /* XXX */ panic("ext2_sync: rofs mod fs=%s", fs->e2fs_fsmnt); } /* * Write back each (modified) inode. */ loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 && (vp->v_bufobj.bo_dirty.bv_cnt == 0 || waitfor == MNT_LAZY)) { VI_UNLOCK(vp); continue; } - error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, td); + error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK); if (error) { if (error == ENOENT) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } continue; } if ((error = VOP_FSYNC(vp, waitfor, td)) != 0) allerror = error; VOP_UNLOCK(vp); vrele(vp); } /* * Force stale filesystem control information to be flushed. */ if (waitfor != MNT_LAZY) { vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); if ((error = VOP_FSYNC(ump->um_devvp, waitfor, td)) != 0) allerror = error; VOP_UNLOCK(ump->um_devvp); } /* * Write back modified superblock. */ if (fs->e2fs_fmod != 0) { fs->e2fs_fmod = 0; fs->e2fs->e2fs_wtime = htole32(time_second); if ((error = ext2_cgupdate(ump, waitfor)) != 0) allerror = error; } return (allerror); } /* * Look up an EXT2FS dinode number to find its incore vnode, otherwise read it * in from disk. If it is in core, wait for the lock bit to clear, then * return the inode locked. Detection and handling of mount points must be * done by the calling routine. */ static int ext2_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp) { struct m_ext2fs *fs; struct inode *ip; struct ext2mount *ump; struct buf *bp; struct vnode *vp; struct thread *td; unsigned int i, used_blocks; int error; td = curthread; error = vfs_hash_get(mp, ino, flags, td, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); ump = VFSTOEXT2(mp); ip = malloc(sizeof(struct inode), M_EXT2NODE, M_WAITOK | M_ZERO); /* Allocate a new vnode/inode. */ if ((error = getnewvnode("ext2fs", mp, &ext2_vnodeops, &vp)) != 0) { *vpp = NULL; free(ip, M_EXT2NODE); return (error); } vp->v_data = ip; ip->i_vnode = vp; ip->i_e2fs = fs = ump->um_e2fs; ip->i_ump = ump; ip->i_number = ino; lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL); error = insmntque(vp, mp); if (error != 0) { free(ip, M_EXT2NODE); *vpp = NULL; return (error); } error = vfs_hash_insert(vp, ino, flags, td, vpp, NULL, NULL); if (error || *vpp != NULL) return (error); /* Read in the disk contents for the inode, copy into the inode. */ if ((error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ino)), (int)fs->e2fs_bsize, NOCRED, &bp)) != 0) { /* * The inode does not contain anything useful, so it would * be misleading to leave it on its hash chain. With mode * still zero, it will be unlinked and returned to the free * list by vput(). */ brelse(bp); vput(vp); *vpp = NULL; return (error); } /* convert ext2 inode to dinode */ error = ext2_ei2i((struct ext2fs_dinode *)((char *)bp->b_data + EXT2_INODE_SIZE(fs) * ino_to_fsbo(fs, ino)), ip); if (error) { brelse(bp); vput(vp); *vpp = NULL; return (error); } ip->i_block_group = ino_to_cg(fs, ino); ip->i_next_alloc_block = 0; ip->i_next_alloc_goal = 0; /* * Now we want to make sure that block pointers for unused * blocks are zeroed out - ext2_balloc depends on this * although for regular files and directories only * * If IN_E4EXTENTS is enabled, unused blocks are not zeroed * out because we could corrupt the extent tree. */ if (!(ip->i_flag & IN_E4EXTENTS) && (S_ISDIR(ip->i_mode) || S_ISREG(ip->i_mode))) { used_blocks = howmany(ip->i_size, fs->e2fs_bsize); for (i = used_blocks; i < EXT2_NDIR_BLOCKS; i++) ip->i_db[i] = 0; } #ifdef EXT2FS_PRINT_EXTENTS ext2_print_inode(ip); ext4_ext_print_extent_tree_status(ip); #endif bqrelse(bp); /* * Initialize the vnode from the inode, check for aliases. * Note that the underlying vnode may have changed. */ if ((error = ext2_vinit(mp, &ext2_fifoops, &vp)) != 0) { vput(vp); *vpp = NULL; return (error); } /* * Finish inode initialization. */ *vpp = vp; return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - call ext2_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return * those rights via. exflagsp and credanonp */ static int ext2_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp) { struct inode *ip; struct ufid *ufhp; struct vnode *nvp; struct m_ext2fs *fs; int error; ufhp = (struct ufid *)fhp; fs = VFSTOEXT2(mp)->um_e2fs; if (ufhp->ufid_ino < EXT2_ROOTINO || ufhp->ufid_ino > fs->e2fs_gcount * fs->e2fs_ipg) return (ESTALE); error = VFS_VGET(mp, ufhp->ufid_ino, LK_EXCLUSIVE, &nvp); if (error) { *vpp = NULLVP; return (error); } ip = VTOI(nvp); if (ip->i_mode == 0 || ip->i_gen != ufhp->ufid_gen || ip->i_nlink <= 0) { vput(nvp); *vpp = NULLVP; return (ESTALE); } *vpp = nvp; vnode_create_vobject(*vpp, 0, curthread); return (0); } /* * Write a superblock and associated information back to disk. */ static int ext2_sbupdate(struct ext2mount *mp, int waitfor) { struct m_ext2fs *fs = mp->um_e2fs; struct ext2fs *es = fs->e2fs; struct buf *bp; int error = 0; es->e2fs_bcount = htole32(fs->e2fs_bcount & 0xffffffff); es->e2fs_rbcount = htole32(fs->e2fs_rbcount & 0xffffffff); es->e2fs_fbcount = htole32(fs->e2fs_fbcount & 0xffffffff); if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) { es->e4fs_bcount_hi = htole32(fs->e2fs_bcount >> 32); es->e4fs_rbcount_hi = htole32(fs->e2fs_rbcount >> 32); es->e4fs_fbcount_hi = htole32(fs->e2fs_fbcount >> 32); } es->e2fs_ficount = htole32(fs->e2fs_ficount); if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) ext2_sb_csum_set(fs); bp = getblk(mp->um_devvp, SBLOCK, SBSIZE, 0, 0, 0); bcopy((caddr_t)es, bp->b_data, (u_int)sizeof(struct ext2fs)); if (waitfor == MNT_WAIT) error = bwrite(bp); else bawrite(bp); /* * The buffers for group descriptors, inode bitmaps and block bitmaps * are not busy at this point and are (hopefully) written by the * usual sync mechanism. No need to write them here. */ return (error); } int ext2_cgupdate(struct ext2mount *mp, int waitfor) { struct m_ext2fs *fs = mp->um_e2fs; struct buf *bp; int i, j, g_count = 0, error = 0, allerror = 0; allerror = ext2_sbupdate(mp, waitfor); /* Update gd csums */ if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_GDT_CSUM) || EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) ext2_gd_csum_set(fs); for (i = 0; i < fs->e2fs_gdbcount; i++) { bp = getblk(mp->um_devvp, fsbtodb(fs, ext2_cg_location(fs, i)), fs->e2fs_bsize, 0, 0, 0); if (EXT2_HAS_INCOMPAT_FEATURE(fs, EXT2F_INCOMPAT_64BIT)) { memcpy(bp->b_data, &fs->e2fs_gd[ i * fs->e2fs_bsize / sizeof(struct ext2_gd)], fs->e2fs_bsize); } else { for (j = 0; j < fs->e2fs_bsize / E2FS_REV0_GD_SIZE && g_count < fs->e2fs_gcount; j++, g_count++) memcpy(bp->b_data + j * E2FS_REV0_GD_SIZE, &fs->e2fs_gd[g_count], E2FS_REV0_GD_SIZE); } if (waitfor == MNT_WAIT) error = bwrite(bp); else bawrite(bp); } if (!allerror && error) allerror = error; return (allerror); } /* * Return the root of a filesystem. */ static int ext2_root(struct mount *mp, int flags, struct vnode **vpp) { struct vnode *nvp; int error; error = VFS_VGET(mp, EXT2_ROOTINO, LK_EXCLUSIVE, &nvp); if (error) return (error); *vpp = nvp; return (0); } Index: projects/clang1100-import/sys/fs/fdescfs/fdesc_vfsops.c =================================================================== --- projects/clang1100-import/sys/fs/fdescfs/fdesc_vfsops.c (revision 364278) +++ projects/clang1100-import/sys/fs/fdescfs/fdesc_vfsops.c (revision 364279) @@ -1,231 +1,231 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1992, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)fdesc_vfsops.c 8.4 (Berkeley) 1/21/94 * * $FreeBSD$ */ /* * /dev/fd Filesystem */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_FDESCMNT, "fdesc_mount", "FDESC mount structure"); static vfs_cmount_t fdesc_cmount; static vfs_mount_t fdesc_mount; static vfs_unmount_t fdesc_unmount; static vfs_statfs_t fdesc_statfs; static vfs_root_t fdesc_root; /* * Compatibility shim for old mount(2) system call. */ int fdesc_cmount(struct mntarg *ma, void *data, uint64_t flags) { return kernel_mount(ma, flags); } /* * Mount the per-process file descriptors (/dev/fd) */ static int fdesc_mount(struct mount *mp) { struct fdescmount *fmp; struct vnode *rvp; int error; /* * Update is a no-op */ if (mp->mnt_flag & (MNT_UPDATE | MNT_ROOTFS)) return (EOPNOTSUPP); fmp = malloc(sizeof(struct fdescmount), M_FDESCMNT, M_WAITOK); /* XXX */ /* * We need to initialize a few bits of our local mount point struct to * avoid confusion in allocvp. */ mp->mnt_data = fmp; fmp->flags = 0; if (vfs_getopt(mp->mnt_optnew, "linrdlnk", NULL, NULL) == 0) fmp->flags |= FMNT_LINRDLNKF; error = fdesc_allocvp(Froot, -1, FD_ROOT, mp, &rvp); if (error) { free(fmp, M_FDESCMNT); mp->mnt_data = NULL; return (error); } rvp->v_type = VDIR; rvp->v_vflag |= VV_ROOT; fmp->f_root = rvp; VOP_UNLOCK(rvp); /* XXX -- don't mark as local to work around fts() problems */ /*mp->mnt_flag |= MNT_LOCAL;*/ vfs_getnewfsid(mp); vfs_mountedfrom(mp, "fdescfs"); return (0); } static int fdesc_unmount(struct mount *mp, int mntflags) { struct fdescmount *fmp; int error, flags; flags = 0; fmp = mp->mnt_data; if (mntflags & MNT_FORCE) { /* The hash mutex protects the private mount flags. */ mtx_lock(&fdesc_hashmtx); fmp->flags |= FMNT_UNMOUNTF; mtx_unlock(&fdesc_hashmtx); flags |= FORCECLOSE; } /* * Clear out buffer cache. I don't think we * ever get anything cached at this level at the * moment, but who knows... * * There is 1 extra root vnode reference corresponding * to f_root. */ if ((error = vflush(mp, 1, flags, curthread)) != 0) return (error); /* * Finally, throw away the fdescmount structure. */ mp->mnt_data = NULL; free(fmp, M_FDESCMNT); return (0); } static int fdesc_root(struct mount *mp, int flags, struct vnode **vpp) { struct vnode *vp; /* * Return locked reference to root. */ vp = VFSTOFDESC(mp)->f_root; - vget(vp, LK_EXCLUSIVE | LK_RETRY, curthread); + vget(vp, LK_EXCLUSIVE | LK_RETRY); *vpp = vp; return (0); } static int fdesc_statfs(struct mount *mp, struct statfs *sbp) { struct thread *td; struct filedesc *fdp; int lim; int i; int last; int freefd; uint64_t limit; td = curthread; /* * Compute number of free file descriptors. * [ Strange results will ensue if the open file * limit is ever reduced below the current number * of open files... ] */ lim = lim_cur(td, RLIMIT_NOFILE); fdp = td->td_proc->p_fd; FILEDESC_SLOCK(fdp); limit = racct_get_limit(td->td_proc, RACCT_NOFILE); if (lim > limit) lim = limit; last = min(fdp->fd_nfiles, lim); freefd = 0; for (i = fdp->fd_freefile; i < last; i++) if (fdp->fd_ofiles[i].fde_file == NULL) freefd++; /* * Adjust for the fact that the fdesc array may not * have been fully allocated yet. */ if (fdp->fd_nfiles < lim) freefd += (lim - fdp->fd_nfiles); FILEDESC_SUNLOCK(fdp); sbp->f_flags = 0; sbp->f_bsize = DEV_BSIZE; sbp->f_iosize = DEV_BSIZE; sbp->f_blocks = 2; /* 1K to keep df happy */ sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_files = lim + 1; /* Allow for "." */ sbp->f_ffree = freefd; /* See comments above */ return (0); } static struct vfsops fdesc_vfsops = { .vfs_cmount = fdesc_cmount, .vfs_init = fdesc_init, .vfs_mount = fdesc_mount, .vfs_root = fdesc_root, .vfs_statfs = fdesc_statfs, .vfs_uninit = fdesc_uninit, .vfs_unmount = fdesc_unmount, }; VFS_SET(fdesc_vfsops, fdescfs, VFCF_SYNTHETIC | VFCF_JAIL); Index: projects/clang1100-import/sys/fs/fdescfs/fdesc_vnops.c =================================================================== --- projects/clang1100-import/sys/fs/fdescfs/fdesc_vnops.c (revision 364278) +++ projects/clang1100-import/sys/fs/fdescfs/fdesc_vnops.c (revision 364279) @@ -1,661 +1,661 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)fdesc_vnops.c 8.9 (Berkeley) 1/21/94 * * $FreeBSD$ */ /* * /dev/fd Filesystem */ #include #include #include #include #include #include #include /* boottime */ #include #include #include #include /* Must come after sys/malloc.h */ #include #include #include #include #include #include #include #include #define NFDCACHE 4 #define FD_NHASH(ix) \ (&fdhashtbl[(ix) & fdhash]) static LIST_HEAD(fdhashhead, fdescnode) *fdhashtbl; static u_long fdhash; struct mtx fdesc_hashmtx; static vop_getattr_t fdesc_getattr; static vop_lookup_t fdesc_lookup; static vop_open_t fdesc_open; static vop_pathconf_t fdesc_pathconf; static vop_readdir_t fdesc_readdir; static vop_readlink_t fdesc_readlink; static vop_reclaim_t fdesc_reclaim; static vop_setattr_t fdesc_setattr; static struct vop_vector fdesc_vnodeops = { .vop_default = &default_vnodeops, .vop_access = VOP_NULL, .vop_getattr = fdesc_getattr, .vop_lookup = fdesc_lookup, .vop_open = fdesc_open, .vop_pathconf = fdesc_pathconf, .vop_readdir = fdesc_readdir, .vop_readlink = fdesc_readlink, .vop_reclaim = fdesc_reclaim, .vop_setattr = fdesc_setattr, }; VFS_VOP_VECTOR_REGISTER(fdesc_vnodeops); static void fdesc_insmntque_dtr(struct vnode *, void *); static void fdesc_remove_entry(struct fdescnode *); /* * Initialise cache headers */ int fdesc_init(struct vfsconf *vfsp) { mtx_init(&fdesc_hashmtx, "fdescfs_hash", NULL, MTX_DEF); fdhashtbl = hashinit(NFDCACHE, M_CACHE, &fdhash); return (0); } /* * Uninit ready for unload. */ int fdesc_uninit(struct vfsconf *vfsp) { hashdestroy(fdhashtbl, M_CACHE, fdhash); mtx_destroy(&fdesc_hashmtx); return (0); } /* * If allocating vnode fails, call this. */ static void fdesc_insmntque_dtr(struct vnode *vp, void *arg) { vgone(vp); vput(vp); } /* * Remove an entry from the hash if it exists. */ static void fdesc_remove_entry(struct fdescnode *fd) { struct fdhashhead *fc; struct fdescnode *fd2; fc = FD_NHASH(fd->fd_ix); mtx_lock(&fdesc_hashmtx); LIST_FOREACH(fd2, fc, fd_hash) { if (fd == fd2) { LIST_REMOVE(fd, fd_hash); break; } } mtx_unlock(&fdesc_hashmtx); } int fdesc_allocvp(fdntype ftype, unsigned fd_fd, int ix, struct mount *mp, struct vnode **vpp) { struct fdescmount *fmp; struct fdhashhead *fc; struct fdescnode *fd, *fd2; struct vnode *vp, *vp2; struct thread *td; int error; td = curthread; fc = FD_NHASH(ix); loop: mtx_lock(&fdesc_hashmtx); /* * If a forced unmount is progressing, we need to drop it. The flags are * protected by the hashmtx. */ fmp = mp->mnt_data; if (fmp == NULL || fmp->flags & FMNT_UNMOUNTF) { mtx_unlock(&fdesc_hashmtx); return (-1); } LIST_FOREACH(fd, fc, fd_hash) { if (fd->fd_ix == ix && fd->fd_vnode->v_mount == mp) { /* Get reference to vnode in case it's being free'd */ vp = fd->fd_vnode; VI_LOCK(vp); mtx_unlock(&fdesc_hashmtx); - if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) + if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK)) goto loop; *vpp = vp; return (0); } } mtx_unlock(&fdesc_hashmtx); fd = malloc(sizeof(struct fdescnode), M_TEMP, M_WAITOK); error = getnewvnode("fdescfs", mp, &fdesc_vnodeops, &vp); if (error) { free(fd, M_TEMP); return (error); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_data = fd; fd->fd_vnode = vp; fd->fd_type = ftype; fd->fd_fd = fd_fd; fd->fd_ix = ix; if (ftype == Fdesc && fmp->flags & FMNT_LINRDLNKF) vp->v_vflag |= VV_READLINK; error = insmntque1(vp, mp, fdesc_insmntque_dtr, NULL); if (error != 0) { *vpp = NULLVP; return (error); } /* Make sure that someone didn't beat us when inserting the vnode. */ mtx_lock(&fdesc_hashmtx); /* * If a forced unmount is progressing, we need to drop it. The flags are * protected by the hashmtx. */ fmp = mp->mnt_data; if (fmp == NULL || fmp->flags & FMNT_UNMOUNTF) { mtx_unlock(&fdesc_hashmtx); vgone(vp); vput(vp); *vpp = NULLVP; return (-1); } LIST_FOREACH(fd2, fc, fd_hash) { if (fd2->fd_ix == ix && fd2->fd_vnode->v_mount == mp) { /* Get reference to vnode in case it's being free'd */ vp2 = fd2->fd_vnode; VI_LOCK(vp2); mtx_unlock(&fdesc_hashmtx); - error = vget(vp2, LK_EXCLUSIVE | LK_INTERLOCK, td); + error = vget(vp2, LK_EXCLUSIVE | LK_INTERLOCK); /* Someone beat us, dec use count and wait for reclaim */ vgone(vp); vput(vp); /* If we didn't get it, return no vnode. */ if (error) vp2 = NULLVP; *vpp = vp2; return (error); } } /* If we came here, we can insert it safely. */ LIST_INSERT_HEAD(fc, fd, fd_hash); mtx_unlock(&fdesc_hashmtx); *vpp = vp; return (0); } struct fdesc_get_ino_args { fdntype ftype; unsigned fd_fd; int ix; struct file *fp; struct thread *td; }; static int fdesc_get_ino_alloc(struct mount *mp, void *arg, int lkflags, struct vnode **rvp) { struct fdesc_get_ino_args *a; int error; a = arg; error = fdesc_allocvp(a->ftype, a->fd_fd, a->ix, mp, rvp); fdrop(a->fp, a->td); return (error); } /* * vp is the current namei directory * ndp is the name to locate in that directory... */ static int fdesc_lookup(struct vop_lookup_args *ap) { struct vnode **vpp = ap->a_vpp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; char *pname = cnp->cn_nameptr; struct thread *td = cnp->cn_thread; struct file *fp; struct fdesc_get_ino_args arg; int nlen = cnp->cn_namelen; u_int fd, fd1; int error; struct vnode *fvp; if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = EROFS; goto bad; } if (cnp->cn_namelen == 1 && *pname == '.') { *vpp = dvp; VREF(dvp); return (0); } if (VTOFDESC(dvp)->fd_type != Froot) { error = ENOTDIR; goto bad; } fd = 0; /* the only time a leading 0 is acceptable is if it's "0" */ if (*pname == '0' && nlen != 1) { error = ENOENT; goto bad; } while (nlen--) { if (*pname < '0' || *pname > '9') { error = ENOENT; goto bad; } fd1 = 10 * fd + *pname++ - '0'; if (fd1 < fd) { error = ENOENT; goto bad; } fd = fd1; } /* * No rights to check since 'fp' isn't actually used. */ if ((error = fget(td, fd, &cap_no_rights, &fp)) != 0) goto bad; /* Check if we're looking up ourselves. */ if (VTOFDESC(dvp)->fd_ix == FD_DESC + fd) { /* * In case we're holding the last reference to the file, the dvp * will be re-acquired. */ vhold(dvp); VOP_UNLOCK(dvp); fdrop(fp, td); /* Re-aquire the lock afterwards. */ vn_lock(dvp, LK_RETRY | LK_EXCLUSIVE); vdrop(dvp); fvp = dvp; if (VN_IS_DOOMED(dvp)) error = ENOENT; } else { /* * Unlock our root node (dvp) when doing this, since we might * deadlock since the vnode might be locked by another thread * and the root vnode lock will be obtained afterwards (in case * we're looking up the fd of the root vnode), which will be the * opposite lock order. Vhold the root vnode first so we don't * lose it. */ arg.ftype = Fdesc; arg.fd_fd = fd; arg.ix = FD_DESC + fd; arg.fp = fp; arg.td = td; error = vn_vget_ino_gen(dvp, fdesc_get_ino_alloc, &arg, LK_EXCLUSIVE, &fvp); } if (error) goto bad; *vpp = fvp; return (0); bad: *vpp = NULL; return (error); } static int fdesc_open(struct vop_open_args *ap) { struct vnode *vp = ap->a_vp; if (VTOFDESC(vp)->fd_type == Froot) return (0); /* * XXX Kludge: set td->td_proc->p_dupfd to contain the value of the file * descriptor being sought for duplication. The error return ensures * that the vnode for this device will be released by vn_open. Open * will detect this special error and take the actions in dupfdopen. * Other callers of vn_open or VOP_OPEN will simply report the * error. */ ap->a_td->td_dupfd = VTOFDESC(vp)->fd_fd; /* XXX */ return (ENODEV); } static int fdesc_pathconf(struct vop_pathconf_args *ap) { struct vnode *vp = ap->a_vp; int error; switch (ap->a_name) { case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_LINK_MAX: if (VTOFDESC(vp)->fd_type == Froot) *ap->a_retval = 2; else *ap->a_retval = 1; return (0); default: if (VTOFDESC(vp)->fd_type == Froot) return (vop_stdpathconf(ap)); vref(vp); VOP_UNLOCK(vp); error = kern_fpathconf(curthread, VTOFDESC(vp)->fd_fd, ap->a_name, ap->a_retval); vn_lock(vp, LK_SHARED | LK_RETRY); vunref(vp); return (error); } } static int fdesc_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; struct timeval boottime; getboottime(&boottime); vap->va_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH; vap->va_fileid = VTOFDESC(vp)->fd_ix; vap->va_uid = 0; vap->va_gid = 0; vap->va_blocksize = DEV_BSIZE; vap->va_atime.tv_sec = boottime.tv_sec; vap->va_atime.tv_nsec = 0; vap->va_mtime = vap->va_atime; vap->va_ctime = vap->va_mtime; vap->va_gen = 0; vap->va_flags = 0; vap->va_bytes = 0; vap->va_filerev = 0; switch (VTOFDESC(vp)->fd_type) { case Froot: vap->va_type = VDIR; vap->va_nlink = 2; vap->va_size = DEV_BSIZE; vap->va_rdev = NODEV; break; case Fdesc: vap->va_type = (vp->v_vflag & VV_READLINK) == 0 ? VCHR : VLNK; vap->va_nlink = 1; vap->va_size = 0; vap->va_rdev = makedev(0, vap->va_fileid); break; default: panic("fdesc_getattr"); break; } vp->v_type = vap->va_type; return (0); } static int fdesc_setattr(struct vop_setattr_args *ap) { struct vattr *vap = ap->a_vap; struct vnode *vp; struct mount *mp; struct file *fp; struct thread *td = curthread; cap_rights_t rights; unsigned fd; int error; /* * Can't mess with the root vnode */ if (VTOFDESC(ap->a_vp)->fd_type == Froot) return (EACCES); fd = VTOFDESC(ap->a_vp)->fd_fd; /* * Allow setattr where there is an underlying vnode. */ error = getvnode(td, fd, cap_rights_init(&rights, CAP_EXTATTR_SET), &fp); if (error) { /* * getvnode() returns EINVAL if the file descriptor is not * backed by a vnode. Silently drop all changes except * chflags(2) in this case. */ if (error == EINVAL) { if (vap->va_flags != VNOVAL) error = EOPNOTSUPP; else error = 0; } return (error); } vp = fp->f_vnode; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) == 0) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_SETATTR(vp, ap->a_vap, ap->a_cred); VOP_UNLOCK(vp); vn_finished_write(mp); } fdrop(fp, td); return (error); } #define UIO_MX _GENERIC_DIRLEN(10) /* number of symbols in INT_MAX printout */ static int fdesc_readdir(struct vop_readdir_args *ap) { struct fdescmount *fmp; struct uio *uio = ap->a_uio; struct filedesc *fdp; struct dirent d; struct dirent *dp = &d; int error, i, off, fcnt; if (VTOFDESC(ap->a_vp)->fd_type != Froot) panic("fdesc_readdir: not dir"); fmp = VFSTOFDESC(ap->a_vp->v_mount); if (ap->a_ncookies != NULL) *ap->a_ncookies = 0; off = (int)uio->uio_offset; if (off != uio->uio_offset || off < 0 || (u_int)off % UIO_MX != 0 || uio->uio_resid < UIO_MX) return (EINVAL); i = (u_int)off / UIO_MX; fdp = uio->uio_td->td_proc->p_fd; error = 0; fcnt = i - 2; /* The first two nodes are `.' and `..' */ FILEDESC_SLOCK(fdp); while (i < fdp->fd_nfiles + 2 && uio->uio_resid >= UIO_MX) { bzero((caddr_t)dp, UIO_MX); switch (i) { case 0: /* `.' */ case 1: /* `..' */ dp->d_fileno = i + FD_ROOT; dp->d_namlen = i + 1; dp->d_reclen = UIO_MX; bcopy("..", dp->d_name, dp->d_namlen); dp->d_type = DT_DIR; dirent_terminate(dp); break; default: if (fdp->fd_ofiles[fcnt].fde_file == NULL) break; dp->d_namlen = sprintf(dp->d_name, "%d", fcnt); dp->d_reclen = UIO_MX; dp->d_type = (fmp->flags & FMNT_LINRDLNKF) == 0 ? DT_CHR : DT_LNK; dp->d_fileno = i + FD_DESC; dirent_terminate(dp); break; } /* NOTE: d_off is the offset of the *next* entry. */ dp->d_off = UIO_MX * (i + 1); if (dp->d_namlen != 0) { /* * And ship to userland */ FILEDESC_SUNLOCK(fdp); error = uiomove(dp, UIO_MX, uio); if (error) goto done; FILEDESC_SLOCK(fdp); } i++; fcnt++; } FILEDESC_SUNLOCK(fdp); done: uio->uio_offset = i * UIO_MX; return (error); } static int fdesc_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp; struct fdescnode *fd; vp = ap->a_vp; fd = VTOFDESC(vp); fdesc_remove_entry(fd); free(vp->v_data, M_TEMP); vp->v_data = NULL; return (0); } static int fdesc_readlink(struct vop_readlink_args *va) { struct vnode *vp, *vn; struct thread *td; struct uio *uio; struct file *fp; char *freepath, *fullpath; size_t pathlen; int lockflags, fd_fd; int error; freepath = NULL; vn = va->a_vp; if (VTOFDESC(vn)->fd_type != Fdesc) panic("fdesc_readlink: not fdescfs link"); fd_fd = ((struct fdescnode *)vn->v_data)->fd_fd; lockflags = VOP_ISLOCKED(vn); VOP_UNLOCK(vn); td = curthread; error = fget_cap(td, fd_fd, &cap_no_rights, &fp, NULL); if (error != 0) goto out; switch (fp->f_type) { case DTYPE_VNODE: vp = fp->f_vnode; error = vn_fullpath(td, vp, &fullpath, &freepath); break; default: fullpath = "anon_inode:[unknown]"; break; } if (error == 0) { uio = va->a_uio; pathlen = strlen(fullpath); error = uiomove(fullpath, pathlen, uio); } if (freepath != NULL) free(freepath, M_TEMP); fdrop(fp, td); out: vn_lock(vn, lockflags | LK_RETRY); return (error); } Index: projects/clang1100-import/sys/fs/fuse/fuse_vfsops.c =================================================================== --- projects/clang1100-import/sys/fs/fuse/fuse_vfsops.c (revision 364278) +++ projects/clang1100-import/sys/fs/fuse/fuse_vfsops.c (revision 364279) @@ -1,695 +1,695 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Google Inc. and Amit Singh * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Copyright (c) 2019 The FreeBSD Foundation * * Portions of this software were developed by BFF Storage Systems, LLC under * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_node.h" #include "fuse_ipc.h" #include "fuse_internal.h" #include #include SDT_PROVIDER_DECLARE(fusefs); /* * Fuse trace probe: * arg0: verbosity. Higher numbers give more verbose messages * arg1: Textual message */ SDT_PROBE_DEFINE2(fusefs, , vfsops, trace, "int", "char*"); /* This will do for privilege types for now */ #ifndef PRIV_VFS_FUSE_ALLOWOTHER #define PRIV_VFS_FUSE_ALLOWOTHER PRIV_VFS_MOUNT_NONUSER #endif #ifndef PRIV_VFS_FUSE_MOUNT_NONUSER #define PRIV_VFS_FUSE_MOUNT_NONUSER PRIV_VFS_MOUNT_NONUSER #endif #ifndef PRIV_VFS_FUSE_SYNC_UNMOUNT #define PRIV_VFS_FUSE_SYNC_UNMOUNT PRIV_VFS_MOUNT_NONUSER #endif static vfs_fhtovp_t fuse_vfsop_fhtovp; static vfs_mount_t fuse_vfsop_mount; static vfs_unmount_t fuse_vfsop_unmount; static vfs_root_t fuse_vfsop_root; static vfs_statfs_t fuse_vfsop_statfs; static vfs_vget_t fuse_vfsop_vget; struct vfsops fuse_vfsops = { .vfs_fhtovp = fuse_vfsop_fhtovp, .vfs_mount = fuse_vfsop_mount, .vfs_unmount = fuse_vfsop_unmount, .vfs_root = fuse_vfsop_root, .vfs_statfs = fuse_vfsop_statfs, .vfs_vget = fuse_vfsop_vget, }; static int fuse_enforce_dev_perms = 0; SYSCTL_INT(_vfs_fusefs, OID_AUTO, enforce_dev_perms, CTLFLAG_RW, &fuse_enforce_dev_perms, 0, "enforce fuse device permissions for secondary mounts"); MALLOC_DEFINE(M_FUSEVFS, "fuse_filesystem", "buffer for fuse vfs layer"); static int fuse_getdevice(const char *fspec, struct thread *td, struct cdev **fdevp) { struct nameidata nd, *ndp = &nd; struct vnode *devvp; struct cdev *fdev; int err; /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ NDINIT(ndp, LOOKUP, FOLLOW, UIO_SYSSPACE, fspec, td); if ((err = namei(ndp)) != 0) return err; NDFREE(ndp, NDF_ONLY_PNBUF); devvp = ndp->ni_vp; if (devvp->v_type != VCHR) { vrele(devvp); return ENXIO; } fdev = devvp->v_rdev; dev_ref(fdev); if (fuse_enforce_dev_perms) { /* * Check if mounter can open the fuse device. * * This has significance only if we are doing a secondary mount * which doesn't involve actually opening fuse devices, but we * still want to enforce the permissions of the device (in * order to keep control over the circle of fuse users). * * (In case of primary mounts, we are either the superuser so * we can do anything anyway, or we can mount only if the * device is already opened by us, ie. we are permitted to open * the device.) */ #if 0 #ifdef MAC err = mac_check_vnode_open(td->td_ucred, devvp, VREAD | VWRITE); if (!err) #endif #endif /* 0 */ err = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td); if (err) { vrele(devvp); dev_rel(fdev); return err; } } /* * according to coda code, no extra lock is needed -- * although in sys/vnode.h this field is marked "v" */ vrele(devvp); if (!fdev->si_devsw || strcmp("fuse", fdev->si_devsw->d_name)) { dev_rel(fdev); return ENXIO; } *fdevp = fdev; return 0; } #define FUSE_FLAGOPT(fnam, fval) do { \ vfs_flagopt(opts, #fnam, &mntopts, fval); \ vfs_flagopt(opts, "__" #fnam, &__mntopts, fval); \ } while (0) SDT_PROBE_DEFINE1(fusefs, , vfsops, mntopts, "uint64_t"); SDT_PROBE_DEFINE4(fusefs, , vfsops, mount_err, "char*", "struct fuse_data*", "struct mount*", "int"); static int fuse_vfs_remount(struct mount *mp, struct thread *td, uint64_t mntopts, uint32_t max_read, int daemon_timeout) { int err = 0; struct fuse_data *data = fuse_get_mpdata(mp); /* Don't allow these options to be changed */ const static unsigned long long cant_update_opts = MNT_USER; /* Mount owner must be the user running the daemon */ FUSE_LOCK(); if ((mp->mnt_flag ^ data->mnt_flag) & cant_update_opts) { err = EOPNOTSUPP; SDT_PROBE4(fusefs, , vfsops, mount_err, "Can't change these mount options during remount", data, mp, err); goto out; } if (((data->dataflags ^ mntopts) & FSESS_MNTOPTS_MASK) || (data->max_read != max_read) || (data->daemon_timeout != daemon_timeout)) { // TODO: allow changing options where it makes sense err = EOPNOTSUPP; SDT_PROBE4(fusefs, , vfsops, mount_err, "Can't change fuse mount options during remount", data, mp, err); goto out; } if (fdata_get_dead(data)) { err = ENOTCONN; SDT_PROBE4(fusefs, , vfsops, mount_err, "device is dead during mount", data, mp, err); goto out; } /* Sanity + permission checks */ if (!data->daemoncred) panic("fuse daemon found, but identity unknown"); if (mntopts & FSESS_DAEMON_CAN_SPY) err = priv_check(td, PRIV_VFS_FUSE_ALLOWOTHER); if (err == 0 && td->td_ucred->cr_uid != data->daemoncred->cr_uid) /* are we allowed to do the first mount? */ err = priv_check(td, PRIV_VFS_FUSE_MOUNT_NONUSER); out: FUSE_UNLOCK(); return err; } static int fuse_vfsop_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp) { struct fuse_fid *ffhp = (struct fuse_fid *)fhp; struct fuse_vnode_data *fvdat; struct vnode *nvp; int error; if (!(fuse_get_mpdata(mp)->dataflags & FSESS_EXPORT_SUPPORT)) return EOPNOTSUPP; error = VFS_VGET(mp, ffhp->nid, LK_EXCLUSIVE, &nvp); if (error) { *vpp = NULLVP; return (error); } fvdat = VTOFUD(nvp); if (fvdat->generation != ffhp->gen ) { vput(nvp); *vpp = NULLVP; return (ESTALE); } *vpp = nvp; vnode_create_vobject(*vpp, 0, curthread); return (0); } static int fuse_vfsop_mount(struct mount *mp) { int err; uint64_t mntopts, __mntopts; uint32_t max_read; int daemon_timeout; int fd; struct cdev *fdev; struct fuse_data *data = NULL; struct thread *td; struct file *fp, *fptmp; char *fspec, *subtype; struct vfsoptlist *opts; subtype = NULL; max_read = ~0; err = 0; mntopts = 0; __mntopts = 0; td = curthread; /* Get the new options passed to mount */ opts = mp->mnt_optnew; if (!opts) return EINVAL; /* `fspath' contains the mount point (eg. /mnt/fuse/sshfs); REQUIRED */ if (!vfs_getopts(opts, "fspath", &err)) return err; /* * With the help of underscored options the mount program * can inform us from the flags it sets by default */ FUSE_FLAGOPT(allow_other, FSESS_DAEMON_CAN_SPY); FUSE_FLAGOPT(push_symlinks_in, FSESS_PUSH_SYMLINKS_IN); FUSE_FLAGOPT(default_permissions, FSESS_DEFAULT_PERMISSIONS); FUSE_FLAGOPT(intr, FSESS_INTR); (void)vfs_scanopt(opts, "max_read=", "%u", &max_read); if (vfs_scanopt(opts, "timeout=", "%u", &daemon_timeout) == 1) { if (daemon_timeout < FUSE_MIN_DAEMON_TIMEOUT) daemon_timeout = FUSE_MIN_DAEMON_TIMEOUT; else if (daemon_timeout > FUSE_MAX_DAEMON_TIMEOUT) daemon_timeout = FUSE_MAX_DAEMON_TIMEOUT; } else { daemon_timeout = FUSE_DEFAULT_DAEMON_TIMEOUT; } subtype = vfs_getopts(opts, "subtype=", &err); SDT_PROBE1(fusefs, , vfsops, mntopts, mntopts); if (mp->mnt_flag & MNT_UPDATE) { return fuse_vfs_remount(mp, td, mntopts, max_read, daemon_timeout); } /* `from' contains the device name (eg. /dev/fuse0); REQUIRED */ fspec = vfs_getopts(opts, "from", &err); if (!fspec) return err; /* `fd' contains the filedescriptor for this session; REQUIRED */ if (vfs_scanopt(opts, "fd", "%d", &fd) != 1) return EINVAL; err = fuse_getdevice(fspec, td, &fdev); if (err != 0) return err; err = fget(td, fd, &cap_read_rights, &fp); if (err != 0) { SDT_PROBE2(fusefs, , vfsops, trace, 1, "invalid or not opened device"); goto out; } fptmp = td->td_fpop; td->td_fpop = fp; err = devfs_get_cdevpriv((void **)&data); td->td_fpop = fptmp; fdrop(fp, td); FUSE_LOCK(); if (err != 0 || data == NULL) { err = ENXIO; SDT_PROBE4(fusefs, , vfsops, mount_err, "invalid or not opened device", data, mp, err); FUSE_UNLOCK(); goto out; } if (fdata_get_dead(data)) { err = ENOTCONN; SDT_PROBE4(fusefs, , vfsops, mount_err, "device is dead during mount", data, mp, err); FUSE_UNLOCK(); goto out; } /* Sanity + permission checks */ if (!data->daemoncred) panic("fuse daemon found, but identity unknown"); if (mntopts & FSESS_DAEMON_CAN_SPY) err = priv_check(td, PRIV_VFS_FUSE_ALLOWOTHER); if (err == 0 && td->td_ucred->cr_uid != data->daemoncred->cr_uid) /* are we allowed to do the first mount? */ err = priv_check(td, PRIV_VFS_FUSE_MOUNT_NONUSER); if (err) { FUSE_UNLOCK(); goto out; } data->ref++; data->mp = mp; data->dataflags |= mntopts; data->max_read = max_read; data->daemon_timeout = daemon_timeout; data->mnt_flag = mp->mnt_flag & MNT_UPDATEMASK; FUSE_UNLOCK(); vfs_getnewfsid(mp); MNT_ILOCK(mp); mp->mnt_data = data; /* * FUSE file systems can be either local or remote, but the kernel * can't tell the difference. */ mp->mnt_flag &= ~MNT_LOCAL; mp->mnt_kern_flag |= MNTK_USES_BCACHE; /* * Disable nullfs cacheing because it can consume too many resources in * the FUSE server. */ mp->mnt_kern_flag |= MNTK_NULL_NOCACHE; MNT_IUNLOCK(mp); /* We need this here as this slot is used by getnewvnode() */ mp->mnt_stat.f_iosize = maxbcachebuf; if (subtype) { strlcat(mp->mnt_stat.f_fstypename, ".", MFSNAMELEN); strlcat(mp->mnt_stat.f_fstypename, subtype, MFSNAMELEN); } memset(mp->mnt_stat.f_mntfromname, 0, MNAMELEN); strlcpy(mp->mnt_stat.f_mntfromname, fspec, MNAMELEN); mp->mnt_iosize_max = MAXPHYS; /* Now handshaking with daemon */ fuse_internal_send_init(data, td); out: if (err) { FUSE_LOCK(); if (data != NULL && data->mp == mp) { /* * Destroy device only if we acquired reference to * it */ SDT_PROBE4(fusefs, , vfsops, mount_err, "mount failed, destroy device", data, mp, err); data->mp = NULL; mp->mnt_data = NULL; fdata_trydestroy(data); } FUSE_UNLOCK(); dev_rel(fdev); } return err; } static int fuse_vfsop_unmount(struct mount *mp, int mntflags) { int err = 0; int flags = 0; struct cdev *fdev; struct fuse_data *data; struct fuse_dispatcher fdi; struct thread *td = curthread; if (mntflags & MNT_FORCE) { flags |= FORCECLOSE; } data = fuse_get_mpdata(mp); if (!data) { panic("no private data for mount point?"); } /* There is 1 extra root vnode reference (mp->mnt_data). */ FUSE_LOCK(); if (data->vroot != NULL) { struct vnode *vroot = data->vroot; data->vroot = NULL; FUSE_UNLOCK(); vrele(vroot); } else FUSE_UNLOCK(); err = vflush(mp, 0, flags, td); if (err) { return err; } if (fdata_get_dead(data)) { goto alreadydead; } if (fsess_isimpl(mp, FUSE_DESTROY)) { fdisp_init(&fdi, 0); fdisp_make(&fdi, FUSE_DESTROY, mp, 0, td, NULL); (void)fdisp_wait_answ(&fdi); fdisp_destroy(&fdi); } fdata_set_dead(data); alreadydead: FUSE_LOCK(); data->mp = NULL; fdev = data->fdev; fdata_trydestroy(data); FUSE_UNLOCK(); MNT_ILOCK(mp); mp->mnt_data = NULL; MNT_IUNLOCK(mp); dev_rel(fdev); return 0; } SDT_PROBE_DEFINE1(fusefs, , vfsops, invalidate_without_export, "struct mount*"); static int fuse_vfsop_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp) { struct fuse_data *data = fuse_get_mpdata(mp); uint64_t nodeid = ino; struct thread *td = curthread; struct fuse_dispatcher fdi; struct fuse_entry_out *feo; struct fuse_vnode_data *fvdat; const char dot[] = "."; off_t filesize; enum vtype vtyp; int error; if (!(data->dataflags & FSESS_EXPORT_SUPPORT)) { /* * Unreachable unless you do something stupid, like export a * nullfs mount of a fusefs file system. */ SDT_PROBE1(fusefs, , vfsops, invalidate_without_export, mp); return (EOPNOTSUPP); } error = fuse_internal_get_cached_vnode(mp, ino, flags, vpp); if (error || *vpp != NULL) return error; /* Do a LOOKUP, using nodeid as the parent and "." as filename */ fdisp_init(&fdi, sizeof(dot)); fdisp_make(&fdi, FUSE_LOOKUP, mp, nodeid, td, td->td_ucred); memcpy(fdi.indata, dot, sizeof(dot)); error = fdisp_wait_answ(&fdi); if (error) return error; feo = (struct fuse_entry_out *)fdi.answ; if (feo->nodeid == 0) { /* zero nodeid means ENOENT and cache it */ error = ENOENT; goto out; } vtyp = IFTOVT(feo->attr.mode); error = fuse_vnode_get(mp, feo, nodeid, NULL, vpp, NULL, vtyp); if (error) goto out; filesize = feo->attr.size; /* * In the case where we are looking up a FUSE node represented by an * existing cached vnode, and the true size reported by FUSE_LOOKUP * doesn't match the vnode's cached size, then any cached writes beyond * the file's current size are lost. * * We can get here: * * following attribute cache expiration, or * * due a bug in the daemon, or */ fvdat = VTOFUD(*vpp); if (vnode_isreg(*vpp) && filesize != fvdat->cached_attrs.va_size && fvdat->flag & FN_SIZECHANGE) { printf("%s: WB cache incoherent on %s!\n", __func__, vnode_mount(*vpp)->mnt_stat.f_mntonname); fvdat->flag &= ~FN_SIZECHANGE; } fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid, feo->attr_valid_nsec, NULL); fuse_validity_2_bintime(feo->entry_valid, feo->entry_valid_nsec, &fvdat->entry_cache_timeout); out: fdisp_destroy(&fdi); return error; } static int fuse_vfsop_root(struct mount *mp, int lkflags, struct vnode **vpp) { struct fuse_data *data = fuse_get_mpdata(mp); int err = 0; if (data->vroot != NULL) { - err = vget(data->vroot, lkflags, curthread); + err = vget(data->vroot, lkflags); if (err == 0) *vpp = data->vroot; } else { err = fuse_vnode_get(mp, NULL, FUSE_ROOT_ID, NULL, vpp, NULL, VDIR); if (err == 0) { FUSE_LOCK(); MPASS(data->vroot == NULL || data->vroot == *vpp); if (data->vroot == NULL) { SDT_PROBE2(fusefs, , vfsops, trace, 1, "new root vnode"); data->vroot = *vpp; FUSE_UNLOCK(); vref(*vpp); } else if (data->vroot != *vpp) { SDT_PROBE2(fusefs, , vfsops, trace, 1, "root vnode race"); FUSE_UNLOCK(); VOP_UNLOCK(*vpp); vrele(*vpp); vrecycle(*vpp); *vpp = data->vroot; } else FUSE_UNLOCK(); } } return err; } static int fuse_vfsop_statfs(struct mount *mp, struct statfs *sbp) { struct fuse_dispatcher fdi; int err = 0; struct fuse_statfs_out *fsfo; struct fuse_data *data; data = fuse_get_mpdata(mp); if (!(data->dataflags & FSESS_INITED)) goto fake; fdisp_init(&fdi, 0); fdisp_make(&fdi, FUSE_STATFS, mp, FUSE_ROOT_ID, NULL, NULL); err = fdisp_wait_answ(&fdi); if (err) { fdisp_destroy(&fdi); if (err == ENOTCONN) { /* * We want to seem a legitimate fs even if the daemon * is stiff dead... (so that, eg., we can still do path * based unmounting after the daemon dies). */ goto fake; } return err; } fsfo = fdi.answ; sbp->f_blocks = fsfo->st.blocks; sbp->f_bfree = fsfo->st.bfree; sbp->f_bavail = fsfo->st.bavail; sbp->f_files = fsfo->st.files; sbp->f_ffree = fsfo->st.ffree; /* cast from uint64_t to int64_t */ sbp->f_namemax = fsfo->st.namelen; sbp->f_bsize = fsfo->st.frsize; /* cast from uint32_t to uint64_t */ fdisp_destroy(&fdi); return 0; fake: sbp->f_blocks = 0; sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_files = 0; sbp->f_ffree = 0; sbp->f_namemax = 0; sbp->f_bsize = S_BLKSIZE; return 0; } Index: projects/clang1100-import/sys/fs/msdosfs/msdosfs_vfsops.c =================================================================== --- projects/clang1100-import/sys/fs/msdosfs/msdosfs_vfsops.c (revision 364278) +++ projects/clang1100-import/sys/fs/msdosfs/msdosfs_vfsops.c (revision 364279) @@ -1,985 +1,985 @@ /* $FreeBSD$ */ /* $NetBSD: msdosfs_vfsops.c,v 1.51 1997/11/17 15:36:58 ws Exp $ */ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank. * Copyright (C) 1994, 1995, 1997 TooLs GmbH. * All rights reserved. * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below). * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by TooLs GmbH. * 4. The name of TooLs GmbH may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Written by Paul Popelka (paulp@uts.amdahl.com) * * You can do anything you want with this software, just don't say you wrote * it, and don't remove this notice. * * This software is provided "as is". * * The author supplies this software to be publicly redistributed on the * understanding that the author is not responsible for the correct * functioning of this software in any circumstances and is not liable for * any damages caused by this software. * * October 1992 */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef MSDOSFS_DEBUG #include #endif static const char msdosfs_lock_msg[] = "fatlk"; /* Mount options that we support. */ static const char *msdosfs_opts[] = { "async", "noatime", "noclusterr", "noclusterw", "export", "force", "from", "sync", "cs_dos", "cs_local", "cs_win", "dirmask", "gid", "kiconv", "longname", "longnames", "mask", "shortname", "shortnames", "uid", "win95", "nowin95", NULL }; #if 1 /*def PC98*/ /* * XXX - The boot signature formatted by NEC PC-98 DOS looks like a * garbage or a random value :-{ * If you want to use that broken-signatured media, define the * following symbol even though PC/AT. * (ex. mount PC-98 DOS formatted FD on PC/AT) */ #define MSDOSFS_NOCHECKSIG #endif MALLOC_DEFINE(M_MSDOSFSMNT, "msdosfs_mount", "MSDOSFS mount structure"); static MALLOC_DEFINE(M_MSDOSFSFAT, "msdosfs_fat", "MSDOSFS file allocation table"); struct iconv_functions *msdosfs_iconv; static int update_mp(struct mount *mp, struct thread *td); static int mountmsdosfs(struct vnode *devvp, struct mount *mp); static vfs_fhtovp_t msdosfs_fhtovp; static vfs_mount_t msdosfs_mount; static vfs_root_t msdosfs_root; static vfs_statfs_t msdosfs_statfs; static vfs_sync_t msdosfs_sync; static vfs_unmount_t msdosfs_unmount; /* Maximum length of a character set name (arbitrary). */ #define MAXCSLEN 64 static int update_mp(struct mount *mp, struct thread *td) { struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); void *dos, *win, *local; int error, v; if (!vfs_getopt(mp->mnt_optnew, "kiconv", NULL, NULL)) { if (msdosfs_iconv != NULL) { error = vfs_getopt(mp->mnt_optnew, "cs_win", &win, NULL); if (!error) error = vfs_getopt(mp->mnt_optnew, "cs_local", &local, NULL); if (!error) error = vfs_getopt(mp->mnt_optnew, "cs_dos", &dos, NULL); if (!error) { msdosfs_iconv->open(win, local, &pmp->pm_u2w); msdosfs_iconv->open(local, win, &pmp->pm_w2u); msdosfs_iconv->open(dos, local, &pmp->pm_u2d); msdosfs_iconv->open(local, dos, &pmp->pm_d2u); } if (error != 0) return (error); } else { pmp->pm_w2u = NULL; pmp->pm_u2w = NULL; pmp->pm_d2u = NULL; pmp->pm_u2d = NULL; } } if (vfs_scanopt(mp->mnt_optnew, "gid", "%d", &v) == 1) pmp->pm_gid = v; if (vfs_scanopt(mp->mnt_optnew, "uid", "%d", &v) == 1) pmp->pm_uid = v; if (vfs_scanopt(mp->mnt_optnew, "mask", "%d", &v) == 1) pmp->pm_mask = v & ALLPERMS; if (vfs_scanopt(mp->mnt_optnew, "dirmask", "%d", &v) == 1) pmp->pm_dirmask = v & ALLPERMS; vfs_flagopt(mp->mnt_optnew, "shortname", &pmp->pm_flags, MSDOSFSMNT_SHORTNAME); vfs_flagopt(mp->mnt_optnew, "shortnames", &pmp->pm_flags, MSDOSFSMNT_SHORTNAME); vfs_flagopt(mp->mnt_optnew, "longname", &pmp->pm_flags, MSDOSFSMNT_LONGNAME); vfs_flagopt(mp->mnt_optnew, "longnames", &pmp->pm_flags, MSDOSFSMNT_LONGNAME); vfs_flagopt(mp->mnt_optnew, "kiconv", &pmp->pm_flags, MSDOSFSMNT_KICONV); if (vfs_getopt(mp->mnt_optnew, "nowin95", NULL, NULL) == 0) pmp->pm_flags |= MSDOSFSMNT_NOWIN95; else pmp->pm_flags &= ~MSDOSFSMNT_NOWIN95; if (pmp->pm_flags & MSDOSFSMNT_NOWIN95) pmp->pm_flags |= MSDOSFSMNT_SHORTNAME; else pmp->pm_flags |= MSDOSFSMNT_LONGNAME; return 0; } static int msdosfs_cmount(struct mntarg *ma, void *data, uint64_t flags) { struct msdosfs_args args; int error; if (data == NULL) return (EINVAL); error = copyin(data, &args, sizeof args); if (error) return (error); ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN); ma = mount_arg(ma, "export", &args.export, sizeof(args.export)); ma = mount_argf(ma, "uid", "%d", args.uid); ma = mount_argf(ma, "gid", "%d", args.gid); ma = mount_argf(ma, "mask", "%d", args.mask); ma = mount_argf(ma, "dirmask", "%d", args.dirmask); ma = mount_argb(ma, args.flags & MSDOSFSMNT_SHORTNAME, "noshortname"); ma = mount_argb(ma, args.flags & MSDOSFSMNT_LONGNAME, "nolongname"); ma = mount_argb(ma, !(args.flags & MSDOSFSMNT_NOWIN95), "nowin95"); ma = mount_argb(ma, args.flags & MSDOSFSMNT_KICONV, "nokiconv"); ma = mount_argsu(ma, "cs_win", args.cs_win, MAXCSLEN); ma = mount_argsu(ma, "cs_dos", args.cs_dos, MAXCSLEN); ma = mount_argsu(ma, "cs_local", args.cs_local, MAXCSLEN); error = kernel_mount(ma, flags); return (error); } /* * mp - path - addr in user space of mount point (ie /usr or whatever) * data - addr in user space of mount params including the name of the block * special file to treat as a filesystem. */ static int msdosfs_mount(struct mount *mp) { struct vnode *devvp; /* vnode for blk device to mount */ struct thread *td; /* msdosfs specific mount control block */ struct msdosfsmount *pmp = NULL; struct nameidata ndp; int error, flags; accmode_t accmode; char *from; td = curthread; if (vfs_filteropt(mp->mnt_optnew, msdosfs_opts)) return (EINVAL); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { pmp = VFSTOMSDOSFS(mp); if (!(pmp->pm_flags & MSDOSFSMNT_RONLY) && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { error = VFS_SYNC(mp, MNT_WAIT); if (error) return (error); flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; error = vflush(mp, 0, flags, td); if (error) return (error); /* * Now the volume is clean. Mark it so while the * device is still rw. */ error = markvoldirty(pmp, 0); if (error) { (void)markvoldirty(pmp, 1); return (error); } /* Downgrade the device from rw to ro. */ g_topology_lock(); error = g_access(pmp->pm_cp, 0, -1, 0); g_topology_unlock(); if (error) { (void)markvoldirty(pmp, 1); return (error); } /* * Backing out after an error was painful in the * above. Now we are committed to succeeding. */ pmp->pm_fmod = 0; pmp->pm_flags |= MSDOSFSMNT_RONLY; MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; MNT_IUNLOCK(mp); } else if ((pmp->pm_flags & MSDOSFSMNT_RONLY) && !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ devvp = pmp->pm_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_ACCESS(devvp, VREAD | VWRITE, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { VOP_UNLOCK(devvp); return (error); } VOP_UNLOCK(devvp); g_topology_lock(); error = g_access(pmp->pm_cp, 0, 1, 0); g_topology_unlock(); if (error) return (error); /* Now that the volume is modifiable, mark it dirty. */ error = markvoldirty_upgrade(pmp, true, true); if (error) { /* * If dirtying the superblock failed, drop GEOM * 'w' refs (we're still RO). */ g_topology_lock(); (void)g_access(pmp->pm_cp, 0, -1, 0); g_topology_unlock(); return (error); } pmp->pm_fmod = 1; pmp->pm_flags &= ~MSDOSFSMNT_RONLY; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_RDONLY; MNT_IUNLOCK(mp); } } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ if (vfs_getopt(mp->mnt_optnew, "from", (void **)&from, NULL)) return (EINVAL); NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, from, td); error = namei(&ndp); if (error) return (error); devvp = ndp.ni_vp; NDFREE(&ndp, NDF_ONLY_PNBUF); if (!vn_isdisk(devvp, &error)) { vput(devvp); return (error); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. */ accmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accmode |= VWRITE; error = VOP_ACCESS(devvp, accmode, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(devvp); return (error); } if ((mp->mnt_flag & MNT_UPDATE) == 0) { error = mountmsdosfs(devvp, mp); #ifdef MSDOSFS_DEBUG /* only needed for the printf below */ pmp = VFSTOMSDOSFS(mp); #endif } else { vput(devvp); if (devvp != pmp->pm_devvp) return (EINVAL); /* XXX needs translation */ } if (error) { vrele(devvp); return (error); } error = update_mp(mp, td); if (error) { if ((mp->mnt_flag & MNT_UPDATE) == 0) msdosfs_unmount(mp, MNT_FORCE); return error; } vfs_mountedfrom(mp, from); #ifdef MSDOSFS_DEBUG printf("msdosfs_mount(): mp %p, pmp %p, inusemap %p\n", mp, pmp, pmp->pm_inusemap); #endif return (0); } static int mountmsdosfs(struct vnode *devvp, struct mount *mp) { struct msdosfsmount *pmp; struct buf *bp; struct cdev *dev; union bootsector *bsp; struct byte_bpb33 *b33; struct byte_bpb50 *b50; struct byte_bpb710 *b710; uint8_t SecPerClust; u_long clusters; int ronly, error; struct g_consumer *cp; struct bufobj *bo; bp = NULL; /* This and pmp both used in error_exit. */ pmp = NULL; ronly = (mp->mnt_flag & MNT_RDONLY) != 0; dev = devvp->v_rdev; if (atomic_cmpset_acq_ptr((uintptr_t *)&dev->si_mountpt, 0, (uintptr_t)mp) == 0) { VOP_UNLOCK(devvp); return (EBUSY); } g_topology_lock(); error = g_vfs_open(devvp, &cp, "msdosfs", ronly ? 0 : 1); g_topology_unlock(); if (error != 0) { atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); VOP_UNLOCK(devvp); return (error); } dev_ref(dev); bo = &devvp->v_bufobj; VOP_UNLOCK(devvp); if (dev->si_iosize_max != 0) mp->mnt_iosize_max = dev->si_iosize_max; if (mp->mnt_iosize_max > MAXPHYS) mp->mnt_iosize_max = MAXPHYS; /* * Read the boot sector of the filesystem, and then check the * boot signature. If not a dos boot sector then error out. * * NOTE: 8192 is a magic size that works for ffs. */ error = bread(devvp, 0, 8192, NOCRED, &bp); if (error) goto error_exit; bp->b_flags |= B_AGE; bsp = (union bootsector *)bp->b_data; b33 = (struct byte_bpb33 *)bsp->bs33.bsBPB; b50 = (struct byte_bpb50 *)bsp->bs50.bsBPB; b710 = (struct byte_bpb710 *)bsp->bs710.bsBPB; #ifndef MSDOSFS_NOCHECKSIG if (bsp->bs50.bsBootSectSig0 != BOOTSIG0 || bsp->bs50.bsBootSectSig1 != BOOTSIG1) { error = EINVAL; goto error_exit; } #endif pmp = malloc(sizeof *pmp, M_MSDOSFSMNT, M_WAITOK | M_ZERO); pmp->pm_mountp = mp; pmp->pm_cp = cp; pmp->pm_bo = bo; lockinit(&pmp->pm_fatlock, 0, msdosfs_lock_msg, 0, 0); /* * Initialize ownerships and permissions, since nothing else will * initialize them iff we are mounting root. */ pmp->pm_uid = UID_ROOT; pmp->pm_gid = GID_WHEEL; pmp->pm_mask = pmp->pm_dirmask = S_IXUSR | S_IXGRP | S_IXOTH | S_IRUSR | S_IRGRP | S_IROTH | S_IWUSR; /* * Compute several useful quantities from the bpb in the * bootsector. Copy in the dos 5 variant of the bpb then fix up * the fields that are different between dos 5 and dos 3.3. */ SecPerClust = b50->bpbSecPerClust; pmp->pm_BytesPerSec = getushort(b50->bpbBytesPerSec); if (pmp->pm_BytesPerSec < DEV_BSIZE) { error = EINVAL; goto error_exit; } pmp->pm_ResSectors = getushort(b50->bpbResSectors); pmp->pm_FATs = b50->bpbFATs; pmp->pm_RootDirEnts = getushort(b50->bpbRootDirEnts); pmp->pm_Sectors = getushort(b50->bpbSectors); pmp->pm_FATsecs = getushort(b50->bpbFATsecs); pmp->pm_SecPerTrack = getushort(b50->bpbSecPerTrack); pmp->pm_Heads = getushort(b50->bpbHeads); pmp->pm_Media = b50->bpbMedia; /* calculate the ratio of sector size to DEV_BSIZE */ pmp->pm_BlkPerSec = pmp->pm_BytesPerSec / DEV_BSIZE; /* * We don't check pm_Heads nor pm_SecPerTrack, because * these may not be set for EFI file systems. We don't * use these anyway, so we're unaffected if they are * invalid. */ if (!pmp->pm_BytesPerSec || !SecPerClust) { error = EINVAL; goto error_exit; } if (pmp->pm_Sectors == 0) { pmp->pm_HiddenSects = getulong(b50->bpbHiddenSecs); pmp->pm_HugeSectors = getulong(b50->bpbHugeSectors); } else { pmp->pm_HiddenSects = getushort(b33->bpbHiddenSecs); pmp->pm_HugeSectors = pmp->pm_Sectors; } if (pmp->pm_RootDirEnts == 0) { if (pmp->pm_FATsecs || getushort(b710->bpbFSVers)) { error = EINVAL; #ifdef MSDOSFS_DEBUG printf("mountmsdosfs(): bad FAT32 filesystem\n"); #endif goto error_exit; } pmp->pm_fatmask = FAT32_MASK; pmp->pm_fatmult = 4; pmp->pm_fatdiv = 1; pmp->pm_FATsecs = getulong(b710->bpbBigFATsecs); if (getushort(b710->bpbExtFlags) & FATMIRROR) pmp->pm_curfat = getushort(b710->bpbExtFlags) & FATNUM; else pmp->pm_flags |= MSDOSFS_FATMIRROR; } else pmp->pm_flags |= MSDOSFS_FATMIRROR; /* * Check a few values (could do some more): * - logical sector size: power of 2, >= block size * - sectors per cluster: power of 2, >= 1 * - number of sectors: >= 1, <= size of partition * - number of FAT sectors: >= 1 */ if ( (SecPerClust == 0) || (SecPerClust & (SecPerClust - 1)) || (pmp->pm_BytesPerSec < DEV_BSIZE) || (pmp->pm_BytesPerSec & (pmp->pm_BytesPerSec - 1)) || (pmp->pm_HugeSectors == 0) || (pmp->pm_FATsecs == 0) || (SecPerClust * pmp->pm_BlkPerSec > MAXBSIZE / DEV_BSIZE) ) { error = EINVAL; goto error_exit; } pmp->pm_HugeSectors *= pmp->pm_BlkPerSec; pmp->pm_HiddenSects *= pmp->pm_BlkPerSec; /* XXX not used? */ pmp->pm_FATsecs *= pmp->pm_BlkPerSec; SecPerClust *= pmp->pm_BlkPerSec; pmp->pm_fatblk = pmp->pm_ResSectors * pmp->pm_BlkPerSec; if (FAT32(pmp)) { pmp->pm_rootdirblk = getulong(b710->bpbRootClust); pmp->pm_firstcluster = pmp->pm_fatblk + (pmp->pm_FATs * pmp->pm_FATsecs); pmp->pm_fsinfo = getushort(b710->bpbFSInfo) * pmp->pm_BlkPerSec; } else { pmp->pm_rootdirblk = pmp->pm_fatblk + (pmp->pm_FATs * pmp->pm_FATsecs); pmp->pm_rootdirsize = howmany(pmp->pm_RootDirEnts * sizeof(struct direntry), DEV_BSIZE); /* in blocks */ pmp->pm_firstcluster = pmp->pm_rootdirblk + pmp->pm_rootdirsize; } pmp->pm_maxcluster = (pmp->pm_HugeSectors - pmp->pm_firstcluster) / SecPerClust + 1; pmp->pm_fatsize = pmp->pm_FATsecs * DEV_BSIZE; /* XXX not used? */ if (pmp->pm_fatmask == 0) { if (pmp->pm_maxcluster <= ((CLUST_RSRVD - CLUST_FIRST) & FAT12_MASK)) { /* * This will usually be a floppy disk. This size makes * sure that one FAT entry will not be split across * multiple blocks. */ pmp->pm_fatmask = FAT12_MASK; pmp->pm_fatmult = 3; pmp->pm_fatdiv = 2; } else { pmp->pm_fatmask = FAT16_MASK; pmp->pm_fatmult = 2; pmp->pm_fatdiv = 1; } } clusters = (pmp->pm_fatsize / pmp->pm_fatmult) * pmp->pm_fatdiv; if (pmp->pm_maxcluster >= clusters) { #ifdef MSDOSFS_DEBUG printf("Warning: number of clusters (%ld) exceeds FAT " "capacity (%ld)\n", pmp->pm_maxcluster + 1, clusters); #endif pmp->pm_maxcluster = clusters - 1; } if (FAT12(pmp)) pmp->pm_fatblocksize = 3 * 512; else pmp->pm_fatblocksize = PAGE_SIZE; pmp->pm_fatblocksize = roundup(pmp->pm_fatblocksize, pmp->pm_BytesPerSec); pmp->pm_fatblocksec = pmp->pm_fatblocksize / DEV_BSIZE; pmp->pm_bnshift = ffs(DEV_BSIZE) - 1; /* * Compute mask and shift value for isolating cluster relative byte * offsets and cluster numbers from a file offset. */ pmp->pm_bpcluster = SecPerClust * DEV_BSIZE; pmp->pm_crbomask = pmp->pm_bpcluster - 1; pmp->pm_cnshift = ffs(pmp->pm_bpcluster) - 1; /* * Check for valid cluster size * must be a power of 2 */ if (pmp->pm_bpcluster ^ (1 << pmp->pm_cnshift)) { error = EINVAL; goto error_exit; } /* * Release the bootsector buffer. */ brelse(bp); bp = NULL; /* * Check the fsinfo sector if we have one. Silently fix up our * in-core copy of fp->fsinxtfree if it is unknown (0xffffffff) * or too large. Ignore fp->fsinfree for now, since we need to * read the entire FAT anyway to fill the inuse map. */ if (pmp->pm_fsinfo) { struct fsinfo *fp; if ((error = bread(devvp, pmp->pm_fsinfo, pmp->pm_BytesPerSec, NOCRED, &bp)) != 0) goto error_exit; fp = (struct fsinfo *)bp->b_data; if (!bcmp(fp->fsisig1, "RRaA", 4) && !bcmp(fp->fsisig2, "rrAa", 4) && !bcmp(fp->fsisig3, "\0\0\125\252", 4)) { pmp->pm_nxtfree = getulong(fp->fsinxtfree); if (pmp->pm_nxtfree > pmp->pm_maxcluster) pmp->pm_nxtfree = CLUST_FIRST; } else pmp->pm_fsinfo = 0; brelse(bp); bp = NULL; } /* * Finish initializing pmp->pm_nxtfree (just in case the first few * sectors aren't properly reserved in the FAT). This completes * the fixup for fp->fsinxtfree, and fixes up the zero-initialized * value if there is no fsinfo. We will use pmp->pm_nxtfree * internally even if there is no fsinfo. */ if (pmp->pm_nxtfree < CLUST_FIRST) pmp->pm_nxtfree = CLUST_FIRST; /* * Allocate memory for the bitmap of allocated clusters, and then * fill it in. */ pmp->pm_inusemap = malloc(howmany(pmp->pm_maxcluster + 1, N_INUSEBITS) * sizeof(*pmp->pm_inusemap), M_MSDOSFSFAT, M_WAITOK); /* * fillinusemap() needs pm_devvp. */ pmp->pm_devvp = devvp; pmp->pm_dev = dev; /* * Have the inuse map filled in. */ MSDOSFS_LOCK_MP(pmp); error = fillinusemap(pmp); MSDOSFS_UNLOCK_MP(pmp); if (error != 0) goto error_exit; /* * If they want FAT updates to be synchronous then let them suffer * the performance degradation in exchange for the on disk copy of * the FAT being correct just about all the time. I suppose this * would be a good thing to turn on if the kernel is still flakey. */ if (mp->mnt_flag & MNT_SYNCHRONOUS) pmp->pm_flags |= MSDOSFSMNT_WAITONFAT; /* * Finish up. */ if (ronly) pmp->pm_flags |= MSDOSFSMNT_RONLY; else { if ((error = markvoldirty(pmp, 1)) != 0) goto error_exit; pmp->pm_fmod = 1; } mp->mnt_data = pmp; mp->mnt_stat.f_fsid.val[0] = dev2udev(dev); mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; mp->mnt_kern_flag |= MNTK_USES_BCACHE | MNTK_NO_IOPF; MNT_IUNLOCK(mp); return (0); error_exit: if (bp) brelse(bp); if (cp != NULL) { g_topology_lock(); g_vfs_close(cp); g_topology_unlock(); } if (pmp) { lockdestroy(&pmp->pm_fatlock); free(pmp->pm_inusemap, M_MSDOSFSFAT); free(pmp, M_MSDOSFSMNT); mp->mnt_data = NULL; } atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); dev_rel(dev); return (error); } /* * Unmount the filesystem described by mp. */ static int msdosfs_unmount(struct mount *mp, int mntflags) { struct msdosfsmount *pmp; int error, flags; error = flags = 0; pmp = VFSTOMSDOSFS(mp); if ((pmp->pm_flags & MSDOSFSMNT_RONLY) == 0) error = msdosfs_sync(mp, MNT_WAIT); if ((mntflags & MNT_FORCE) != 0) flags |= FORCECLOSE; else if (error != 0) return (error); error = vflush(mp, 0, flags, curthread); if (error != 0 && error != ENXIO) return (error); if ((pmp->pm_flags & MSDOSFSMNT_RONLY) == 0) { error = markvoldirty(pmp, 0); if (error && error != ENXIO) { (void)markvoldirty(pmp, 1); return (error); } } if (pmp->pm_flags & MSDOSFSMNT_KICONV && msdosfs_iconv) { if (pmp->pm_w2u) msdosfs_iconv->close(pmp->pm_w2u); if (pmp->pm_u2w) msdosfs_iconv->close(pmp->pm_u2w); if (pmp->pm_d2u) msdosfs_iconv->close(pmp->pm_d2u); if (pmp->pm_u2d) msdosfs_iconv->close(pmp->pm_u2d); } #ifdef MSDOSFS_DEBUG { struct vnode *vp = pmp->pm_devvp; struct bufobj *bo; bo = &vp->v_bufobj; BO_LOCK(bo); VI_LOCK(vp); vn_printf(vp, "msdosfs_umount(): just before calling VOP_CLOSE()\n"); printf("freef %p, freeb %p, mount %p\n", TAILQ_NEXT(vp, v_vnodelist), vp->v_vnodelist.tqe_prev, vp->v_mount); printf("cleanblkhd %p, dirtyblkhd %p, numoutput %ld, type %d\n", TAILQ_FIRST(&vp->v_bufobj.bo_clean.bv_hd), TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd), vp->v_bufobj.bo_numoutput, vp->v_type); VI_UNLOCK(vp); BO_UNLOCK(bo); } #endif g_topology_lock(); g_vfs_close(pmp->pm_cp); g_topology_unlock(); atomic_store_rel_ptr((uintptr_t *)&pmp->pm_dev->si_mountpt, 0); vrele(pmp->pm_devvp); dev_rel(pmp->pm_dev); free(pmp->pm_inusemap, M_MSDOSFSFAT); lockdestroy(&pmp->pm_fatlock); free(pmp, M_MSDOSFSMNT); mp->mnt_data = NULL; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); return (error); } static int msdosfs_root(struct mount *mp, int flags, struct vnode **vpp) { struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); struct denode *ndep; int error; #ifdef MSDOSFS_DEBUG printf("msdosfs_root(); mp %p, pmp %p\n", mp, pmp); #endif error = deget(pmp, MSDOSFSROOT, MSDOSFSROOT_OFS, &ndep); if (error) return (error); *vpp = DETOV(ndep); return (0); } static int msdosfs_statfs(struct mount *mp, struct statfs *sbp) { struct msdosfsmount *pmp; pmp = VFSTOMSDOSFS(mp); sbp->f_bsize = pmp->pm_bpcluster; sbp->f_iosize = pmp->pm_bpcluster; sbp->f_blocks = pmp->pm_maxcluster + 1; sbp->f_bfree = pmp->pm_freeclustercount; sbp->f_bavail = pmp->pm_freeclustercount; sbp->f_files = pmp->pm_RootDirEnts; /* XXX */ sbp->f_ffree = 0; /* what to put in here? */ return (0); } /* * If we have an FSInfo block, update it. */ static int msdosfs_fsiflush(struct msdosfsmount *pmp, int waitfor) { struct fsinfo *fp; struct buf *bp; int error; MSDOSFS_LOCK_MP(pmp); if (pmp->pm_fsinfo == 0 || (pmp->pm_flags & MSDOSFS_FSIMOD) == 0) { error = 0; goto unlock; } error = bread(pmp->pm_devvp, pmp->pm_fsinfo, pmp->pm_BytesPerSec, NOCRED, &bp); if (error != 0) { goto unlock; } fp = (struct fsinfo *)bp->b_data; putulong(fp->fsinfree, pmp->pm_freeclustercount); putulong(fp->fsinxtfree, pmp->pm_nxtfree); pmp->pm_flags &= ~MSDOSFS_FSIMOD; if (waitfor == MNT_WAIT) error = bwrite(bp); else bawrite(bp); unlock: MSDOSFS_UNLOCK_MP(pmp); return (error); } static int msdosfs_sync(struct mount *mp, int waitfor) { struct vnode *vp, *nvp; struct thread *td; struct denode *dep; struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); int error, allerror = 0; td = curthread; /* * If we ever switch to not updating all of the FATs all the time, * this would be the place to update them from the first one. */ if (pmp->pm_fmod != 0) { if (pmp->pm_flags & MSDOSFSMNT_RONLY) panic("msdosfs_sync: rofs mod"); else { /* update FATs here */ } } /* * Write back each (modified) denode. */ loop: MNT_VNODE_FOREACH_ALL(vp, mp, nvp) { if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } dep = VTODE(vp); if ((dep->de_flag & (DE_ACCESS | DE_CREATE | DE_UPDATE | DE_MODIFIED)) == 0 && (vp->v_bufobj.bo_dirty.bv_cnt == 0 || waitfor == MNT_LAZY)) { VI_UNLOCK(vp); continue; } - error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, td); + error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK); if (error) { if (error == ENOENT) { MNT_VNODE_FOREACH_ALL_ABORT(mp, nvp); goto loop; } continue; } error = VOP_FSYNC(vp, waitfor, td); if (error) allerror = error; VOP_UNLOCK(vp); vrele(vp); } /* * Flush filesystem control info. */ if (waitfor != MNT_LAZY) { vn_lock(pmp->pm_devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(pmp->pm_devvp, waitfor, td); if (error) allerror = error; VOP_UNLOCK(pmp->pm_devvp); } error = msdosfs_fsiflush(pmp, waitfor); if (error != 0) allerror = error; return (allerror); } static int msdosfs_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp) { struct msdosfsmount *pmp = VFSTOMSDOSFS(mp); struct defid *defhp = (struct defid *) fhp; struct denode *dep; int error; error = deget(pmp, defhp->defid_dirclust, defhp->defid_dirofs, &dep); if (error) { *vpp = NULLVP; return (error); } *vpp = DETOV(dep); vnode_create_vobject(*vpp, dep->de_FileSize, curthread); return (0); } static struct vfsops msdosfs_vfsops = { .vfs_fhtovp = msdosfs_fhtovp, .vfs_mount = msdosfs_mount, .vfs_cmount = msdosfs_cmount, .vfs_root = msdosfs_root, .vfs_statfs = msdosfs_statfs, .vfs_sync = msdosfs_sync, .vfs_unmount = msdosfs_unmount, }; VFS_SET(msdosfs_vfsops, msdosfs, 0); MODULE_VERSION(msdosfs, 1); Index: projects/clang1100-import/sys/fs/nfsclient/nfs_clvfsops.c =================================================================== --- projects/clang1100-import/sys/fs/nfsclient/nfs_clvfsops.c (revision 364278) +++ projects/clang1100-import/sys/fs/nfsclient/nfs_clvfsops.c (revision 364279) @@ -1,2061 +1,2061 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Rick Macklem at The University of Guelph. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from nfs_vfsops.c 8.12 (Berkeley) 5/20/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_bootp.h" #include "opt_nfsroot.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(nfscl, "NFSv4 client"); extern int nfscl_ticks; extern struct timeval nfsboottime; extern int nfsrv_useacl; extern int nfscl_debuglevel; extern enum nfsiod_state ncl_iodwant[NFS_MAXASYNCDAEMON]; extern struct nfsmount *ncl_iodmount[NFS_MAXASYNCDAEMON]; extern struct mtx ncl_iod_mutex; NFSCLSTATEMUTEX; extern struct mtx nfsrv_dslock_mtx; MALLOC_DEFINE(M_NEWNFSREQ, "newnfsclient_req", "NFS request header"); MALLOC_DEFINE(M_NEWNFSMNT, "newnfsmnt", "NFS mount struct"); SYSCTL_DECL(_vfs_nfs); static int nfs_ip_paranoia = 1; SYSCTL_INT(_vfs_nfs, OID_AUTO, nfs_ip_paranoia, CTLFLAG_RW, &nfs_ip_paranoia, 0, ""); static int nfs_tprintf_initial_delay = NFS_TPRINTF_INITIAL_DELAY; SYSCTL_INT(_vfs_nfs, NFS_TPRINTF_INITIAL_DELAY, downdelayinitial, CTLFLAG_RW, &nfs_tprintf_initial_delay, 0, ""); /* how long between console messages "nfs server foo not responding" */ static int nfs_tprintf_delay = NFS_TPRINTF_DELAY; SYSCTL_INT(_vfs_nfs, NFS_TPRINTF_DELAY, downdelayinterval, CTLFLAG_RW, &nfs_tprintf_delay, 0, ""); #ifdef NFS_DEBUG int nfs_debug; SYSCTL_INT(_vfs_nfs, OID_AUTO, debug, CTLFLAG_RW, &nfs_debug, 0, "Toggle debug flag"); #endif static int nfs_mountroot(struct mount *); static void nfs_sec_name(char *, int *); static void nfs_decode_args(struct mount *mp, struct nfsmount *nmp, struct nfs_args *argp, const char *, struct ucred *, struct thread *); static int mountnfs(struct nfs_args *, struct mount *, struct sockaddr *, char *, u_char *, int, u_char *, int, u_char *, int, struct vnode **, struct ucred *, struct thread *, int, int, int); static void nfs_getnlminfo(struct vnode *, uint8_t *, size_t *, struct sockaddr_storage *, int *, off_t *, struct timeval *); static vfs_mount_t nfs_mount; static vfs_cmount_t nfs_cmount; static vfs_unmount_t nfs_unmount; static vfs_root_t nfs_root; static vfs_statfs_t nfs_statfs; static vfs_sync_t nfs_sync; static vfs_sysctl_t nfs_sysctl; static vfs_purge_t nfs_purge; /* * nfs vfs operations. */ static struct vfsops nfs_vfsops = { .vfs_init = ncl_init, .vfs_mount = nfs_mount, .vfs_cmount = nfs_cmount, .vfs_root = vfs_cache_root, .vfs_cachedroot = nfs_root, .vfs_statfs = nfs_statfs, .vfs_sync = nfs_sync, .vfs_uninit = ncl_uninit, .vfs_unmount = nfs_unmount, .vfs_sysctl = nfs_sysctl, .vfs_purge = nfs_purge, }; VFS_SET(nfs_vfsops, nfs, VFCF_NETWORK | VFCF_SBDRY); /* So that loader and kldload(2) can find us, wherever we are.. */ MODULE_VERSION(nfs, 1); MODULE_DEPEND(nfs, nfscommon, 1, 1, 1); MODULE_DEPEND(nfs, krpc, 1, 1, 1); MODULE_DEPEND(nfs, nfssvc, 1, 1, 1); /* * This structure is now defined in sys/nfs/nfs_diskless.c so that it * can be shared by both NFS clients. It is declared here so that it * will be defined for kernels built without NFS_ROOT, although it * isn't used in that case. */ #if !defined(NFS_ROOT) struct nfs_diskless nfs_diskless = { { { 0 } } }; struct nfsv3_diskless nfsv3_diskless = { { { 0 } } }; int nfs_diskless_valid = 0; #endif SYSCTL_INT(_vfs_nfs, OID_AUTO, diskless_valid, CTLFLAG_RD, &nfs_diskless_valid, 0, "Has the diskless struct been filled correctly"); SYSCTL_STRING(_vfs_nfs, OID_AUTO, diskless_rootpath, CTLFLAG_RD, nfsv3_diskless.root_hostnam, 0, "Path to nfs root"); SYSCTL_OPAQUE(_vfs_nfs, OID_AUTO, diskless_rootaddr, CTLFLAG_RD, &nfsv3_diskless.root_saddr, sizeof(nfsv3_diskless.root_saddr), "%Ssockaddr_in", "Diskless root nfs address"); void newnfsargs_ntoh(struct nfs_args *); static int nfs_mountdiskless(char *, struct sockaddr_in *, struct nfs_args *, struct thread *, struct vnode **, struct mount *); static void nfs_convert_diskless(void); static void nfs_convert_oargs(struct nfs_args *args, struct onfs_args *oargs); int newnfs_iosize(struct nfsmount *nmp) { int iosize, maxio; /* First, set the upper limit for iosize */ if (nmp->nm_flag & NFSMNT_NFSV4) { maxio = NFS_MAXBSIZE; } else if (nmp->nm_flag & NFSMNT_NFSV3) { if (nmp->nm_sotype == SOCK_DGRAM) maxio = NFS_MAXDGRAMDATA; else maxio = NFS_MAXBSIZE; } else { maxio = NFS_V2MAXDATA; } if (nmp->nm_rsize > maxio || nmp->nm_rsize == 0) nmp->nm_rsize = maxio; if (nmp->nm_rsize > NFS_MAXBSIZE) nmp->nm_rsize = NFS_MAXBSIZE; if (nmp->nm_readdirsize > maxio || nmp->nm_readdirsize == 0) nmp->nm_readdirsize = maxio; if (nmp->nm_readdirsize > nmp->nm_rsize) nmp->nm_readdirsize = nmp->nm_rsize; if (nmp->nm_wsize > maxio || nmp->nm_wsize == 0) nmp->nm_wsize = maxio; if (nmp->nm_wsize > NFS_MAXBSIZE) nmp->nm_wsize = NFS_MAXBSIZE; /* * Calculate the size used for io buffers. Use the larger * of the two sizes to minimise nfs requests but make sure * that it is at least one VM page to avoid wasting buffer * space. It must also be at least NFS_DIRBLKSIZ, since * that is the buffer size used for directories. */ iosize = imax(nmp->nm_rsize, nmp->nm_wsize); iosize = imax(iosize, PAGE_SIZE); iosize = imax(iosize, NFS_DIRBLKSIZ); nmp->nm_mountp->mnt_stat.f_iosize = iosize; return (iosize); } static void nfs_convert_oargs(struct nfs_args *args, struct onfs_args *oargs) { args->version = NFS_ARGSVERSION; args->addr = oargs->addr; args->addrlen = oargs->addrlen; args->sotype = oargs->sotype; args->proto = oargs->proto; args->fh = oargs->fh; args->fhsize = oargs->fhsize; args->flags = oargs->flags; args->wsize = oargs->wsize; args->rsize = oargs->rsize; args->readdirsize = oargs->readdirsize; args->timeo = oargs->timeo; args->retrans = oargs->retrans; args->readahead = oargs->readahead; args->hostname = oargs->hostname; } static void nfs_convert_diskless(void) { bcopy(&nfs_diskless.myif, &nfsv3_diskless.myif, sizeof(struct ifaliasreq)); bcopy(&nfs_diskless.mygateway, &nfsv3_diskless.mygateway, sizeof(struct sockaddr_in)); nfs_convert_oargs(&nfsv3_diskless.root_args,&nfs_diskless.root_args); if (nfsv3_diskless.root_args.flags & NFSMNT_NFSV3) { nfsv3_diskless.root_fhsize = NFSX_MYFH; bcopy(nfs_diskless.root_fh, nfsv3_diskless.root_fh, NFSX_MYFH); } else { nfsv3_diskless.root_fhsize = NFSX_V2FH; bcopy(nfs_diskless.root_fh, nfsv3_diskless.root_fh, NFSX_V2FH); } bcopy(&nfs_diskless.root_saddr,&nfsv3_diskless.root_saddr, sizeof(struct sockaddr_in)); bcopy(nfs_diskless.root_hostnam, nfsv3_diskless.root_hostnam, MNAMELEN); nfsv3_diskless.root_time = nfs_diskless.root_time; bcopy(nfs_diskless.my_hostnam, nfsv3_diskless.my_hostnam, MAXHOSTNAMELEN); nfs_diskless_valid = 3; } /* * nfs statfs call */ static int nfs_statfs(struct mount *mp, struct statfs *sbp) { struct vnode *vp; struct thread *td; struct nfsmount *nmp = VFSTONFS(mp); struct nfsvattr nfsva; struct nfsfsinfo fs; struct nfsstatfs sb; int error = 0, attrflag, gotfsinfo = 0, ret; struct nfsnode *np; td = curthread; error = vfs_busy(mp, MBF_NOWAIT); if (error) return (error); error = ncl_nget(mp, nmp->nm_fh, nmp->nm_fhsize, &np, LK_EXCLUSIVE); if (error) { vfs_unbusy(mp); return (error); } vp = NFSTOV(np); mtx_lock(&nmp->nm_mtx); if (NFSHASNFSV3(nmp) && !NFSHASGOTFSINFO(nmp)) { mtx_unlock(&nmp->nm_mtx); error = nfsrpc_fsinfo(vp, &fs, td->td_ucred, td, &nfsva, &attrflag, NULL); if (!error) gotfsinfo = 1; } else mtx_unlock(&nmp->nm_mtx); if (!error) error = nfsrpc_statfs(vp, &sb, &fs, td->td_ucred, td, &nfsva, &attrflag, NULL); if (error != 0) NFSCL_DEBUG(2, "statfs=%d\n", error); if (attrflag == 0) { ret = nfsrpc_getattrnovp(nmp, nmp->nm_fh, nmp->nm_fhsize, 1, td->td_ucred, td, &nfsva, NULL, NULL); if (ret) { /* * Just set default values to get things going. */ NFSBZERO((caddr_t)&nfsva, sizeof (struct nfsvattr)); nfsva.na_vattr.va_type = VDIR; nfsva.na_vattr.va_mode = 0777; nfsva.na_vattr.va_nlink = 100; nfsva.na_vattr.va_uid = (uid_t)0; nfsva.na_vattr.va_gid = (gid_t)0; nfsva.na_vattr.va_fileid = 2; nfsva.na_vattr.va_gen = 1; nfsva.na_vattr.va_blocksize = NFS_FABLKSIZE; nfsva.na_vattr.va_size = 512 * 1024; } } (void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1); if (!error) { mtx_lock(&nmp->nm_mtx); if (gotfsinfo || (nmp->nm_flag & NFSMNT_NFSV4)) nfscl_loadfsinfo(nmp, &fs); nfscl_loadsbinfo(nmp, &sb, sbp); sbp->f_iosize = newnfs_iosize(nmp); mtx_unlock(&nmp->nm_mtx); if (sbp != &mp->mnt_stat) { bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); } strncpy(&sbp->f_fstypename[0], mp->mnt_vfc->vfc_name, MFSNAMELEN); } else if (NFS_ISV4(vp)) { error = nfscl_maperr(td, error, (uid_t)0, (gid_t)0); } vput(vp); vfs_unbusy(mp); return (error); } /* * nfs version 3 fsinfo rpc call */ int ncl_fsinfo(struct nfsmount *nmp, struct vnode *vp, struct ucred *cred, struct thread *td) { struct nfsfsinfo fs; struct nfsvattr nfsva; int error, attrflag; error = nfsrpc_fsinfo(vp, &fs, cred, td, &nfsva, &attrflag, NULL); if (!error) { if (attrflag) (void) nfscl_loadattrcache(&vp, &nfsva, NULL, NULL, 0, 1); mtx_lock(&nmp->nm_mtx); nfscl_loadfsinfo(nmp, &fs); mtx_unlock(&nmp->nm_mtx); } return (error); } /* * Mount a remote root fs via. nfs. This depends on the info in the * nfs_diskless structure that has been filled in properly by some primary * bootstrap. * It goes something like this: * - do enough of "ifconfig" by calling ifioctl() so that the system * can talk to the server * - If nfs_diskless.mygateway is filled in, use that address as * a default gateway. * - build the rootfs mount point and call mountnfs() to do the rest. * * It is assumed to be safe to read, modify, and write the nfsv3_diskless * structure, as well as other global NFS client variables here, as * nfs_mountroot() will be called once in the boot before any other NFS * client activity occurs. */ static int nfs_mountroot(struct mount *mp) { struct thread *td = curthread; struct nfsv3_diskless *nd = &nfsv3_diskless; struct socket *so; struct vnode *vp; struct ifreq ir; int error; u_long l; char buf[128]; char *cp; #if defined(BOOTP_NFSROOT) && defined(BOOTP) bootpc_init(); /* use bootp to get nfs_diskless filled in */ #elif defined(NFS_ROOT) nfs_setup_diskless(); #endif if (nfs_diskless_valid == 0) return (-1); if (nfs_diskless_valid == 1) nfs_convert_diskless(); /* * Do enough of ifconfig(8) so that the critical net interface can * talk to the server. */ error = socreate(nd->myif.ifra_addr.sa_family, &so, nd->root_args.sotype, 0, td->td_ucred, td); if (error) panic("nfs_mountroot: socreate(%04x): %d", nd->myif.ifra_addr.sa_family, error); #if 0 /* XXX Bad idea */ /* * We might not have been told the right interface, so we pass * over the first ten interfaces of the same kind, until we get * one of them configured. */ for (i = strlen(nd->myif.ifra_name) - 1; nd->myif.ifra_name[i] >= '0' && nd->myif.ifra_name[i] <= '9'; nd->myif.ifra_name[i] ++) { error = ifioctl(so, SIOCAIFADDR, (caddr_t)&nd->myif, td); if(!error) break; } #endif error = ifioctl(so, SIOCAIFADDR, (caddr_t)&nd->myif, td); if (error) panic("nfs_mountroot: SIOCAIFADDR: %d", error); if ((cp = kern_getenv("boot.netif.mtu")) != NULL) { ir.ifr_mtu = strtol(cp, NULL, 10); bcopy(nd->myif.ifra_name, ir.ifr_name, IFNAMSIZ); freeenv(cp); error = ifioctl(so, SIOCSIFMTU, (caddr_t)&ir, td); if (error) printf("nfs_mountroot: SIOCSIFMTU: %d", error); } soclose(so); /* * If the gateway field is filled in, set it as the default route. * Note that pxeboot will set a default route of 0 if the route * is not set by the DHCP server. Check also for a value of 0 * to avoid panicking inappropriately in that situation. */ if (nd->mygateway.sin_len != 0 && nd->mygateway.sin_addr.s_addr != 0) { struct sockaddr_in mask, sin; struct epoch_tracker et; struct rt_addrinfo info; struct rib_cmd_info rc; bzero((caddr_t)&mask, sizeof(mask)); sin = mask; sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); /* XXX MRT use table 0 for this sort of thing */ NET_EPOCH_ENTER(et); CURVNET_SET(TD_TO_VNET(td)); bzero((caddr_t)&info, sizeof(info)); info.rti_flags = RTF_UP | RTF_GATEWAY; info.rti_info[RTAX_DST] = (struct sockaddr *)&sin; info.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&nd->mygateway; info.rti_info[RTAX_NETMASK] = (struct sockaddr *)&mask; error = rib_action(RT_DEFAULT_FIB, RTM_ADD, &info, &rc); CURVNET_RESTORE(); NET_EPOCH_EXIT(et); if (error) panic("nfs_mountroot: RTM_ADD: %d", error); } /* * Create the rootfs mount point. */ nd->root_args.fh = nd->root_fh; nd->root_args.fhsize = nd->root_fhsize; l = ntohl(nd->root_saddr.sin_addr.s_addr); snprintf(buf, sizeof(buf), "%ld.%ld.%ld.%ld:%s", (l >> 24) & 0xff, (l >> 16) & 0xff, (l >> 8) & 0xff, (l >> 0) & 0xff, nd->root_hostnam); printf("NFS ROOT: %s\n", buf); nd->root_args.hostname = buf; if ((error = nfs_mountdiskless(buf, &nd->root_saddr, &nd->root_args, td, &vp, mp)) != 0) { return (error); } /* * This is not really an nfs issue, but it is much easier to * set hostname here and then let the "/etc/rc.xxx" files * mount the right /var based upon its preset value. */ mtx_lock(&prison0.pr_mtx); strlcpy(prison0.pr_hostname, nd->my_hostnam, sizeof(prison0.pr_hostname)); mtx_unlock(&prison0.pr_mtx); inittodr(ntohl(nd->root_time)); return (0); } /* * Internal version of mount system call for diskless setup. */ static int nfs_mountdiskless(char *path, struct sockaddr_in *sin, struct nfs_args *args, struct thread *td, struct vnode **vpp, struct mount *mp) { struct sockaddr *nam; int dirlen, error; char *dirpath; /* * Find the directory path in "path", which also has the server's * name/ip address in it. */ dirpath = strchr(path, ':'); if (dirpath != NULL) dirlen = strlen(++dirpath); else dirlen = 0; nam = sodupsockaddr((struct sockaddr *)sin, M_WAITOK); if ((error = mountnfs(args, mp, nam, path, NULL, 0, dirpath, dirlen, NULL, 0, vpp, td->td_ucred, td, NFS_DEFAULT_NAMETIMEO, NFS_DEFAULT_NEGNAMETIMEO, 0)) != 0) { printf("nfs_mountroot: mount %s on /: %d\n", path, error); return (error); } return (0); } static void nfs_sec_name(char *sec, int *flagsp) { if (!strcmp(sec, "krb5")) *flagsp |= NFSMNT_KERB; else if (!strcmp(sec, "krb5i")) *flagsp |= (NFSMNT_KERB | NFSMNT_INTEGRITY); else if (!strcmp(sec, "krb5p")) *flagsp |= (NFSMNT_KERB | NFSMNT_PRIVACY); } static void nfs_decode_args(struct mount *mp, struct nfsmount *nmp, struct nfs_args *argp, const char *hostname, struct ucred *cred, struct thread *td) { int adjsock; char *p; /* * Set read-only flag if requested; otherwise, clear it if this is * an update. If this is not an update, then either the read-only * flag is already clear, or this is a root mount and it was set * intentionally at some previous point. */ if (vfs_getopt(mp->mnt_optnew, "ro", NULL, NULL) == 0) { MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; MNT_IUNLOCK(mp); } else if (mp->mnt_flag & MNT_UPDATE) { MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_RDONLY; MNT_IUNLOCK(mp); } /* * Silently clear NFSMNT_NOCONN if it's a TCP mount, it makes * no sense in that context. Also, set up appropriate retransmit * and soft timeout behavior. */ if (argp->sotype == SOCK_STREAM) { nmp->nm_flag &= ~NFSMNT_NOCONN; nmp->nm_timeo = NFS_MAXTIMEO; if ((argp->flags & NFSMNT_NFSV4) != 0) nmp->nm_retry = INT_MAX; else nmp->nm_retry = NFS_RETRANS_TCP; } /* Also clear RDIRPLUS if NFSv2, it crashes some servers */ if ((argp->flags & (NFSMNT_NFSV3 | NFSMNT_NFSV4)) == 0) { argp->flags &= ~NFSMNT_RDIRPLUS; nmp->nm_flag &= ~NFSMNT_RDIRPLUS; } /* Clear ONEOPENOWN for NFSv2, 3 and 4.0. */ if (nmp->nm_minorvers == 0) { argp->flags &= ~NFSMNT_ONEOPENOWN; nmp->nm_flag &= ~NFSMNT_ONEOPENOWN; } /* Re-bind if rsrvd port requested and wasn't on one */ adjsock = !(nmp->nm_flag & NFSMNT_RESVPORT) && (argp->flags & NFSMNT_RESVPORT); /* Also re-bind if we're switching to/from a connected UDP socket */ adjsock |= ((nmp->nm_flag & NFSMNT_NOCONN) != (argp->flags & NFSMNT_NOCONN)); /* Update flags atomically. Don't change the lock bits. */ nmp->nm_flag = argp->flags | nmp->nm_flag; if ((argp->flags & NFSMNT_TIMEO) && argp->timeo > 0) { nmp->nm_timeo = (argp->timeo * NFS_HZ + 5) / 10; if (nmp->nm_timeo < NFS_MINTIMEO) nmp->nm_timeo = NFS_MINTIMEO; else if (nmp->nm_timeo > NFS_MAXTIMEO) nmp->nm_timeo = NFS_MAXTIMEO; } if ((argp->flags & NFSMNT_RETRANS) && argp->retrans > 1) { nmp->nm_retry = argp->retrans; if (nmp->nm_retry > NFS_MAXREXMIT) nmp->nm_retry = NFS_MAXREXMIT; } if ((argp->flags & NFSMNT_WSIZE) && argp->wsize > 0) { nmp->nm_wsize = argp->wsize; /* * Clip at the power of 2 below the size. There is an * issue (not isolated) that causes intermittent page * faults if this is not done. */ if (nmp->nm_wsize > NFS_FABLKSIZE) nmp->nm_wsize = 1 << (fls(nmp->nm_wsize) - 1); else nmp->nm_wsize = NFS_FABLKSIZE; } if ((argp->flags & NFSMNT_RSIZE) && argp->rsize > 0) { nmp->nm_rsize = argp->rsize; /* * Clip at the power of 2 below the size. There is an * issue (not isolated) that causes intermittent page * faults if this is not done. */ if (nmp->nm_rsize > NFS_FABLKSIZE) nmp->nm_rsize = 1 << (fls(nmp->nm_rsize) - 1); else nmp->nm_rsize = NFS_FABLKSIZE; } if ((argp->flags & NFSMNT_READDIRSIZE) && argp->readdirsize > 0) { nmp->nm_readdirsize = argp->readdirsize; } if ((argp->flags & NFSMNT_ACREGMIN) && argp->acregmin >= 0) nmp->nm_acregmin = argp->acregmin; else nmp->nm_acregmin = NFS_MINATTRTIMO; if ((argp->flags & NFSMNT_ACREGMAX) && argp->acregmax >= 0) nmp->nm_acregmax = argp->acregmax; else nmp->nm_acregmax = NFS_MAXATTRTIMO; if ((argp->flags & NFSMNT_ACDIRMIN) && argp->acdirmin >= 0) nmp->nm_acdirmin = argp->acdirmin; else nmp->nm_acdirmin = NFS_MINDIRATTRTIMO; if ((argp->flags & NFSMNT_ACDIRMAX) && argp->acdirmax >= 0) nmp->nm_acdirmax = argp->acdirmax; else nmp->nm_acdirmax = NFS_MAXDIRATTRTIMO; if (nmp->nm_acdirmin > nmp->nm_acdirmax) nmp->nm_acdirmin = nmp->nm_acdirmax; if (nmp->nm_acregmin > nmp->nm_acregmax) nmp->nm_acregmin = nmp->nm_acregmax; if ((argp->flags & NFSMNT_READAHEAD) && argp->readahead >= 0) { if (argp->readahead <= NFS_MAXRAHEAD) nmp->nm_readahead = argp->readahead; else nmp->nm_readahead = NFS_MAXRAHEAD; } if ((argp->flags & NFSMNT_WCOMMITSIZE) && argp->wcommitsize >= 0) { if (argp->wcommitsize < nmp->nm_wsize) nmp->nm_wcommitsize = nmp->nm_wsize; else nmp->nm_wcommitsize = argp->wcommitsize; } adjsock |= ((nmp->nm_sotype != argp->sotype) || (nmp->nm_soproto != argp->proto)); if (nmp->nm_client != NULL && adjsock) { int haslock = 0, error = 0; if (nmp->nm_sotype == SOCK_STREAM) { error = newnfs_sndlock(&nmp->nm_sockreq.nr_lock); if (!error) haslock = 1; } if (!error) { newnfs_disconnect(&nmp->nm_sockreq); if (haslock) newnfs_sndunlock(&nmp->nm_sockreq.nr_lock); nmp->nm_sotype = argp->sotype; nmp->nm_soproto = argp->proto; if (nmp->nm_sotype == SOCK_DGRAM) while (newnfs_connect(nmp, &nmp->nm_sockreq, cred, td, 0, false)) { printf("newnfs_args: retrying connect\n"); (void) nfs_catnap(PSOCK, 0, "nfscon"); } } } else { nmp->nm_sotype = argp->sotype; nmp->nm_soproto = argp->proto; } if (hostname != NULL) { strlcpy(nmp->nm_hostname, hostname, sizeof(nmp->nm_hostname)); p = strchr(nmp->nm_hostname, ':'); if (p != NULL) *p = '\0'; } } static const char *nfs_opts[] = { "from", "nfs_args", "noac", "noatime", "noexec", "suiddir", "nosuid", "nosymfollow", "union", "noclusterr", "noclusterw", "multilabel", "acls", "force", "update", "async", "noconn", "nolockd", "conn", "lockd", "intr", "rdirplus", "readdirsize", "soft", "hard", "mntudp", "tcp", "udp", "wsize", "rsize", "retrans", "actimeo", "acregmin", "acregmax", "acdirmin", "acdirmax", "resvport", "readahead", "hostname", "timeo", "timeout", "addr", "fh", "nfsv3", "sec", "principal", "nfsv4", "gssname", "allgssname", "dirpath", "minorversion", "nametimeo", "negnametimeo", "nocto", "noncontigwr", "pnfs", "wcommitsize", "oneopenown", NULL }; /* * Parse the "from" mountarg, passed by the generic mount(8) program * or the mountroot code. This is used when rerooting into NFS. * * Note that the "hostname" is actually a "hostname:/share/path" string. */ static int nfs_mount_parse_from(struct vfsoptlist *opts, char **hostnamep, struct sockaddr_in **sinp, char *dirpath, size_t dirpathsize, int *dirlenp) { char *nam, *delimp, *hostp, *spec; int error, have_bracket = 0, offset, rv, speclen; struct sockaddr_in *sin; size_t len; error = vfs_getopt(opts, "from", (void **)&spec, &speclen); if (error != 0) return (error); nam = malloc(MNAMELEN + 1, M_TEMP, M_WAITOK); /* * This part comes from sbin/mount_nfs/mount_nfs.c:getnfsargs(). */ if (*spec == '[' && (delimp = strchr(spec + 1, ']')) != NULL && *(delimp + 1) == ':') { hostp = spec + 1; spec = delimp + 2; have_bracket = 1; } else if ((delimp = strrchr(spec, ':')) != NULL) { hostp = spec; spec = delimp + 1; } else if ((delimp = strrchr(spec, '@')) != NULL) { printf("%s: path@server syntax is deprecated, " "use server:path\n", __func__); hostp = delimp + 1; } else { printf("%s: no : nfs-name\n", __func__); free(nam, M_TEMP); return (EINVAL); } *delimp = '\0'; /* * If there has been a trailing slash at mounttime it seems * that some mountd implementations fail to remove the mount * entries from their mountlist while unmounting. */ for (speclen = strlen(spec); speclen > 1 && spec[speclen - 1] == '/'; speclen--) spec[speclen - 1] = '\0'; if (strlen(hostp) + strlen(spec) + 1 > MNAMELEN) { printf("%s: %s:%s: name too long", __func__, hostp, spec); free(nam, M_TEMP); return (EINVAL); } /* Make both '@' and ':' notations equal */ if (*hostp != '\0') { len = strlen(hostp); offset = 0; if (have_bracket) nam[offset++] = '['; memmove(nam + offset, hostp, len); if (have_bracket) nam[len + offset++] = ']'; nam[len + offset++] = ':'; memmove(nam + len + offset, spec, speclen); nam[len + speclen + offset] = '\0'; } else nam[0] = '\0'; /* * XXX: IPv6 */ sin = malloc(sizeof(*sin), M_SONAME, M_WAITOK); rv = inet_pton(AF_INET, hostp, &sin->sin_addr); if (rv != 1) { printf("%s: cannot parse '%s', inet_pton() returned %d\n", __func__, hostp, rv); free(nam, M_TEMP); free(sin, M_SONAME); return (EINVAL); } sin->sin_len = sizeof(*sin); sin->sin_family = AF_INET; /* * XXX: hardcoded port number. */ sin->sin_port = htons(2049); *hostnamep = strdup(nam, M_NEWNFSMNT); *sinp = sin; strlcpy(dirpath, spec, dirpathsize); *dirlenp = strlen(dirpath); free(nam, M_TEMP); return (0); } /* * VFS Operations. * * mount system call * It seems a bit dumb to copyinstr() the host and path here and then * bcopy() them in mountnfs(), but I wanted to detect errors before * doing the getsockaddr() call because getsockaddr() allocates an mbuf and * an error after that means that I have to release the mbuf. */ /* ARGSUSED */ static int nfs_mount(struct mount *mp) { struct nfs_args args = { .version = NFS_ARGSVERSION, .addr = NULL, .addrlen = sizeof (struct sockaddr_in), .sotype = SOCK_STREAM, .proto = 0, .fh = NULL, .fhsize = 0, .flags = NFSMNT_RESVPORT, .wsize = NFS_WSIZE, .rsize = NFS_RSIZE, .readdirsize = NFS_READDIRSIZE, .timeo = 10, .retrans = NFS_RETRANS, .readahead = NFS_DEFRAHEAD, .wcommitsize = 0, /* was: NQ_DEFLEASE */ .hostname = NULL, .acregmin = NFS_MINATTRTIMO, .acregmax = NFS_MAXATTRTIMO, .acdirmin = NFS_MINDIRATTRTIMO, .acdirmax = NFS_MAXDIRATTRTIMO, }; int error = 0, ret, len; struct sockaddr *nam = NULL; struct vnode *vp; struct thread *td; char *hst; u_char nfh[NFSX_FHMAX], krbname[100], dirpath[100], srvkrbname[100]; char *cp, *opt, *name, *secname; int nametimeo = NFS_DEFAULT_NAMETIMEO; int negnametimeo = NFS_DEFAULT_NEGNAMETIMEO; int minvers = 0; int dirlen, has_nfs_args_opt, has_nfs_from_opt, krbnamelen, srvkrbnamelen; size_t hstlen; has_nfs_args_opt = 0; has_nfs_from_opt = 0; hst = malloc(MNAMELEN, M_TEMP, M_WAITOK); if (vfs_filteropt(mp->mnt_optnew, nfs_opts)) { error = EINVAL; goto out; } td = curthread; if ((mp->mnt_flag & (MNT_ROOTFS | MNT_UPDATE)) == MNT_ROOTFS && nfs_diskless_valid != 0) { error = nfs_mountroot(mp); goto out; } nfscl_init(); /* * The old mount_nfs program passed the struct nfs_args * from userspace to kernel. The new mount_nfs program * passes string options via nmount() from userspace to kernel * and we populate the struct nfs_args in the kernel. */ if (vfs_getopt(mp->mnt_optnew, "nfs_args", NULL, NULL) == 0) { error = vfs_copyopt(mp->mnt_optnew, "nfs_args", &args, sizeof(args)); if (error != 0) goto out; if (args.version != NFS_ARGSVERSION) { error = EPROGMISMATCH; goto out; } has_nfs_args_opt = 1; } /* Handle the new style options. */ if (vfs_getopt(mp->mnt_optnew, "noac", NULL, NULL) == 0) { args.acdirmin = args.acdirmax = args.acregmin = args.acregmax = 0; args.flags |= NFSMNT_ACDIRMIN | NFSMNT_ACDIRMAX | NFSMNT_ACREGMIN | NFSMNT_ACREGMAX; } if (vfs_getopt(mp->mnt_optnew, "noconn", NULL, NULL) == 0) args.flags |= NFSMNT_NOCONN; if (vfs_getopt(mp->mnt_optnew, "conn", NULL, NULL) == 0) args.flags &= ~NFSMNT_NOCONN; if (vfs_getopt(mp->mnt_optnew, "nolockd", NULL, NULL) == 0) args.flags |= NFSMNT_NOLOCKD; if (vfs_getopt(mp->mnt_optnew, "lockd", NULL, NULL) == 0) args.flags &= ~NFSMNT_NOLOCKD; if (vfs_getopt(mp->mnt_optnew, "intr", NULL, NULL) == 0) args.flags |= NFSMNT_INT; if (vfs_getopt(mp->mnt_optnew, "rdirplus", NULL, NULL) == 0) args.flags |= NFSMNT_RDIRPLUS; if (vfs_getopt(mp->mnt_optnew, "resvport", NULL, NULL) == 0) args.flags |= NFSMNT_RESVPORT; if (vfs_getopt(mp->mnt_optnew, "noresvport", NULL, NULL) == 0) args.flags &= ~NFSMNT_RESVPORT; if (vfs_getopt(mp->mnt_optnew, "soft", NULL, NULL) == 0) args.flags |= NFSMNT_SOFT; if (vfs_getopt(mp->mnt_optnew, "hard", NULL, NULL) == 0) args.flags &= ~NFSMNT_SOFT; if (vfs_getopt(mp->mnt_optnew, "mntudp", NULL, NULL) == 0) args.sotype = SOCK_DGRAM; if (vfs_getopt(mp->mnt_optnew, "udp", NULL, NULL) == 0) args.sotype = SOCK_DGRAM; if (vfs_getopt(mp->mnt_optnew, "tcp", NULL, NULL) == 0) args.sotype = SOCK_STREAM; if (vfs_getopt(mp->mnt_optnew, "nfsv3", NULL, NULL) == 0) args.flags |= NFSMNT_NFSV3; if (vfs_getopt(mp->mnt_optnew, "nfsv4", NULL, NULL) == 0) { args.flags |= NFSMNT_NFSV4; args.sotype = SOCK_STREAM; } if (vfs_getopt(mp->mnt_optnew, "allgssname", NULL, NULL) == 0) args.flags |= NFSMNT_ALLGSSNAME; if (vfs_getopt(mp->mnt_optnew, "nocto", NULL, NULL) == 0) args.flags |= NFSMNT_NOCTO; if (vfs_getopt(mp->mnt_optnew, "noncontigwr", NULL, NULL) == 0) args.flags |= NFSMNT_NONCONTIGWR; if (vfs_getopt(mp->mnt_optnew, "pnfs", NULL, NULL) == 0) args.flags |= NFSMNT_PNFS; if (vfs_getopt(mp->mnt_optnew, "oneopenown", NULL, NULL) == 0) args.flags |= NFSMNT_ONEOPENOWN; if (vfs_getopt(mp->mnt_optnew, "readdirsize", (void **)&opt, NULL) == 0) { if (opt == NULL) { vfs_mount_error(mp, "illegal readdirsize"); error = EINVAL; goto out; } ret = sscanf(opt, "%d", &args.readdirsize); if (ret != 1 || args.readdirsize <= 0) { vfs_mount_error(mp, "illegal readdirsize: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_READDIRSIZE; } if (vfs_getopt(mp->mnt_optnew, "readahead", (void **)&opt, NULL) == 0) { if (opt == NULL) { vfs_mount_error(mp, "illegal readahead"); error = EINVAL; goto out; } ret = sscanf(opt, "%d", &args.readahead); if (ret != 1 || args.readahead <= 0) { vfs_mount_error(mp, "illegal readahead: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_READAHEAD; } if (vfs_getopt(mp->mnt_optnew, "wsize", (void **)&opt, NULL) == 0) { if (opt == NULL) { vfs_mount_error(mp, "illegal wsize"); error = EINVAL; goto out; } ret = sscanf(opt, "%d", &args.wsize); if (ret != 1 || args.wsize <= 0) { vfs_mount_error(mp, "illegal wsize: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_WSIZE; } if (vfs_getopt(mp->mnt_optnew, "rsize", (void **)&opt, NULL) == 0) { if (opt == NULL) { vfs_mount_error(mp, "illegal rsize"); error = EINVAL; goto out; } ret = sscanf(opt, "%d", &args.rsize); if (ret != 1 || args.rsize <= 0) { vfs_mount_error(mp, "illegal wsize: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_RSIZE; } if (vfs_getopt(mp->mnt_optnew, "retrans", (void **)&opt, NULL) == 0) { if (opt == NULL) { vfs_mount_error(mp, "illegal retrans"); error = EINVAL; goto out; } ret = sscanf(opt, "%d", &args.retrans); if (ret != 1 || args.retrans <= 0) { vfs_mount_error(mp, "illegal retrans: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_RETRANS; } if (vfs_getopt(mp->mnt_optnew, "actimeo", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &args.acregmin); if (ret != 1 || args.acregmin < 0) { vfs_mount_error(mp, "illegal actimeo: %s", opt); error = EINVAL; goto out; } args.acdirmin = args.acdirmax = args.acregmax = args.acregmin; args.flags |= NFSMNT_ACDIRMIN | NFSMNT_ACDIRMAX | NFSMNT_ACREGMIN | NFSMNT_ACREGMAX; } if (vfs_getopt(mp->mnt_optnew, "acregmin", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &args.acregmin); if (ret != 1 || args.acregmin < 0) { vfs_mount_error(mp, "illegal acregmin: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_ACREGMIN; } if (vfs_getopt(mp->mnt_optnew, "acregmax", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &args.acregmax); if (ret != 1 || args.acregmax < 0) { vfs_mount_error(mp, "illegal acregmax: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_ACREGMAX; } if (vfs_getopt(mp->mnt_optnew, "acdirmin", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &args.acdirmin); if (ret != 1 || args.acdirmin < 0) { vfs_mount_error(mp, "illegal acdirmin: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_ACDIRMIN; } if (vfs_getopt(mp->mnt_optnew, "acdirmax", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &args.acdirmax); if (ret != 1 || args.acdirmax < 0) { vfs_mount_error(mp, "illegal acdirmax: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_ACDIRMAX; } if (vfs_getopt(mp->mnt_optnew, "wcommitsize", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &args.wcommitsize); if (ret != 1 || args.wcommitsize < 0) { vfs_mount_error(mp, "illegal wcommitsize: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_WCOMMITSIZE; } if (vfs_getopt(mp->mnt_optnew, "timeo", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &args.timeo); if (ret != 1 || args.timeo <= 0) { vfs_mount_error(mp, "illegal timeo: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_TIMEO; } if (vfs_getopt(mp->mnt_optnew, "timeout", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &args.timeo); if (ret != 1 || args.timeo <= 0) { vfs_mount_error(mp, "illegal timeout: %s", opt); error = EINVAL; goto out; } args.flags |= NFSMNT_TIMEO; } if (vfs_getopt(mp->mnt_optnew, "nametimeo", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &nametimeo); if (ret != 1 || nametimeo < 0) { vfs_mount_error(mp, "illegal nametimeo: %s", opt); error = EINVAL; goto out; } } if (vfs_getopt(mp->mnt_optnew, "negnametimeo", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &negnametimeo); if (ret != 1 || negnametimeo < 0) { vfs_mount_error(mp, "illegal negnametimeo: %s", opt); error = EINVAL; goto out; } } if (vfs_getopt(mp->mnt_optnew, "minorversion", (void **)&opt, NULL) == 0) { ret = sscanf(opt, "%d", &minvers); if (ret != 1 || minvers < 0 || minvers > 2 || (args.flags & NFSMNT_NFSV4) == 0) { vfs_mount_error(mp, "illegal minorversion: %s", opt); error = EINVAL; goto out; } } if (vfs_getopt(mp->mnt_optnew, "sec", (void **) &secname, NULL) == 0) nfs_sec_name(secname, &args.flags); if (mp->mnt_flag & MNT_UPDATE) { struct nfsmount *nmp = VFSTONFS(mp); if (nmp == NULL) { error = EIO; goto out; } /* * If a change from TCP->UDP is done and there are thread(s) * that have I/O RPC(s) in progress with a transfer size * greater than NFS_MAXDGRAMDATA, those thread(s) will be * hung, retrying the RPC(s) forever. Usually these threads * will be seen doing an uninterruptible sleep on wait channel * "nfsreq". */ if (args.sotype == SOCK_DGRAM && nmp->nm_sotype == SOCK_STREAM) tprintf(td->td_proc, LOG_WARNING, "Warning: mount -u that changes TCP->UDP can result in hung threads\n"); /* * When doing an update, we can't change version, * security, switch lockd strategies, change cookie * translation or switch oneopenown. */ args.flags = (args.flags & ~(NFSMNT_NFSV3 | NFSMNT_NFSV4 | NFSMNT_KERB | NFSMNT_INTEGRITY | NFSMNT_PRIVACY | NFSMNT_ONEOPENOWN | NFSMNT_NOLOCKD /*|NFSMNT_XLATECOOKIE*/)) | (nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_NFSV4 | NFSMNT_KERB | NFSMNT_INTEGRITY | NFSMNT_PRIVACY | NFSMNT_ONEOPENOWN | NFSMNT_NOLOCKD /*|NFSMNT_XLATECOOKIE*/)); nfs_decode_args(mp, nmp, &args, NULL, td->td_ucred, td); goto out; } /* * Make the nfs_ip_paranoia sysctl serve as the default connection * or no-connection mode for those protocols that support * no-connection mode (the flag will be cleared later for protocols * that do not support no-connection mode). This will allow a client * to receive replies from a different IP then the request was * sent to. Note: default value for nfs_ip_paranoia is 1 (paranoid), * not 0. */ if (nfs_ip_paranoia == 0) args.flags |= NFSMNT_NOCONN; if (has_nfs_args_opt != 0) { /* * In the 'nfs_args' case, the pointers in the args * structure are in userland - we copy them in here. */ if (args.fhsize < 0 || args.fhsize > NFSX_V3FHMAX) { vfs_mount_error(mp, "Bad file handle"); error = EINVAL; goto out; } error = copyin((caddr_t)args.fh, (caddr_t)nfh, args.fhsize); if (error != 0) goto out; error = copyinstr(args.hostname, hst, MNAMELEN - 1, &hstlen); if (error != 0) goto out; bzero(&hst[hstlen], MNAMELEN - hstlen); args.hostname = hst; /* getsockaddr() call must be after above copyin() calls */ error = getsockaddr(&nam, args.addr, args.addrlen); if (error != 0) goto out; } else if (nfs_mount_parse_from(mp->mnt_optnew, &args.hostname, (struct sockaddr_in **)&nam, dirpath, sizeof(dirpath), &dirlen) == 0) { has_nfs_from_opt = 1; bcopy(args.hostname, hst, MNAMELEN); hst[MNAMELEN - 1] = '\0'; /* * This only works with NFSv4 for now. */ args.fhsize = 0; args.flags |= NFSMNT_NFSV4; args.sotype = SOCK_STREAM; } else { if (vfs_getopt(mp->mnt_optnew, "fh", (void **)&args.fh, &args.fhsize) == 0) { if (args.fhsize < 0 || args.fhsize > NFSX_FHMAX) { vfs_mount_error(mp, "Bad file handle"); error = EINVAL; goto out; } bcopy(args.fh, nfh, args.fhsize); } else { args.fhsize = 0; } (void) vfs_getopt(mp->mnt_optnew, "hostname", (void **)&args.hostname, &len); if (args.hostname == NULL) { vfs_mount_error(mp, "Invalid hostname"); error = EINVAL; goto out; } if (len >= MNAMELEN) { vfs_mount_error(mp, "Hostname too long"); error = EINVAL; goto out; } bcopy(args.hostname, hst, len); hst[len] = '\0'; } if (vfs_getopt(mp->mnt_optnew, "principal", (void **)&name, NULL) == 0) strlcpy(srvkrbname, name, sizeof (srvkrbname)); else { snprintf(srvkrbname, sizeof (srvkrbname), "nfs@%s", hst); cp = strchr(srvkrbname, ':'); if (cp != NULL) *cp = '\0'; } srvkrbnamelen = strlen(srvkrbname); if (vfs_getopt(mp->mnt_optnew, "gssname", (void **)&name, NULL) == 0) strlcpy(krbname, name, sizeof (krbname)); else krbname[0] = '\0'; krbnamelen = strlen(krbname); if (has_nfs_from_opt == 0) { if (vfs_getopt(mp->mnt_optnew, "dirpath", (void **)&name, NULL) == 0) strlcpy(dirpath, name, sizeof (dirpath)); else dirpath[0] = '\0'; dirlen = strlen(dirpath); } if (has_nfs_args_opt == 0 && has_nfs_from_opt == 0) { if (vfs_getopt(mp->mnt_optnew, "addr", (void **)&args.addr, &args.addrlen) == 0) { if (args.addrlen > SOCK_MAXADDRLEN) { error = ENAMETOOLONG; goto out; } nam = malloc(args.addrlen, M_SONAME, M_WAITOK); bcopy(args.addr, nam, args.addrlen); nam->sa_len = args.addrlen; } else { vfs_mount_error(mp, "No server address"); error = EINVAL; goto out; } } args.fh = nfh; error = mountnfs(&args, mp, nam, hst, krbname, krbnamelen, dirpath, dirlen, srvkrbname, srvkrbnamelen, &vp, td->td_ucred, td, nametimeo, negnametimeo, minvers); out: if (!error) { MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_NO_IOPF | MNTK_USES_BCACHE; if ((VFSTONFS(mp)->nm_flag & NFSMNT_NFSV4) != 0) mp->mnt_kern_flag |= MNTK_NULL_NOCACHE; MNT_IUNLOCK(mp); } free(hst, M_TEMP); return (error); } /* * VFS Operations. * * mount system call * It seems a bit dumb to copyinstr() the host and path here and then * bcopy() them in mountnfs(), but I wanted to detect errors before * doing the getsockaddr() call because getsockaddr() allocates an mbuf and * an error after that means that I have to release the mbuf. */ /* ARGSUSED */ static int nfs_cmount(struct mntarg *ma, void *data, uint64_t flags) { int error; struct nfs_args args; error = copyin(data, &args, sizeof (struct nfs_args)); if (error) return error; ma = mount_arg(ma, "nfs_args", &args, sizeof args); error = kernel_mount(ma, flags); return (error); } /* * Common code for mount and mountroot */ static int mountnfs(struct nfs_args *argp, struct mount *mp, struct sockaddr *nam, char *hst, u_char *krbname, int krbnamelen, u_char *dirpath, int dirlen, u_char *srvkrbname, int srvkrbnamelen, struct vnode **vpp, struct ucred *cred, struct thread *td, int nametimeo, int negnametimeo, int minvers) { struct nfsmount *nmp; struct nfsnode *np; int error, trycnt, ret; struct nfsvattr nfsva; struct nfsclclient *clp; struct nfsclds *dsp, *tdsp; uint32_t lease; static u_int64_t clval = 0; NFSCL_DEBUG(3, "in mnt\n"); clp = NULL; if (mp->mnt_flag & MNT_UPDATE) { nmp = VFSTONFS(mp); printf("%s: MNT_UPDATE is no longer handled here\n", __func__); free(nam, M_SONAME); return (0); } else { nmp = malloc(sizeof (struct nfsmount) + krbnamelen + dirlen + srvkrbnamelen + 2, M_NEWNFSMNT, M_WAITOK | M_ZERO); TAILQ_INIT(&nmp->nm_bufq); TAILQ_INIT(&nmp->nm_sess); if (clval == 0) clval = (u_int64_t)nfsboottime.tv_sec; nmp->nm_clval = clval++; nmp->nm_krbnamelen = krbnamelen; nmp->nm_dirpathlen = dirlen; nmp->nm_srvkrbnamelen = srvkrbnamelen; if (td->td_ucred->cr_uid != (uid_t)0) { /* * nm_uid is used to get KerberosV credentials for * the nfsv4 state handling operations if there is * no host based principal set. Use the uid of * this user if not root, since they are doing the * mount. I don't think setting this for root will * work, since root normally does not have user * credentials in a credentials cache. */ nmp->nm_uid = td->td_ucred->cr_uid; } else { /* * Just set to -1, so it won't be used. */ nmp->nm_uid = (uid_t)-1; } /* Copy and null terminate all the names */ if (nmp->nm_krbnamelen > 0) { bcopy(krbname, nmp->nm_krbname, nmp->nm_krbnamelen); nmp->nm_name[nmp->nm_krbnamelen] = '\0'; } if (nmp->nm_dirpathlen > 0) { bcopy(dirpath, NFSMNT_DIRPATH(nmp), nmp->nm_dirpathlen); nmp->nm_name[nmp->nm_krbnamelen + nmp->nm_dirpathlen + 1] = '\0'; } if (nmp->nm_srvkrbnamelen > 0) { bcopy(srvkrbname, NFSMNT_SRVKRBNAME(nmp), nmp->nm_srvkrbnamelen); nmp->nm_name[nmp->nm_krbnamelen + nmp->nm_dirpathlen + nmp->nm_srvkrbnamelen + 2] = '\0'; } nmp->nm_sockreq.nr_cred = crhold(cred); mtx_init(&nmp->nm_sockreq.nr_mtx, "nfssock", NULL, MTX_DEF); mp->mnt_data = nmp; nmp->nm_getinfo = nfs_getnlminfo; nmp->nm_vinvalbuf = ncl_vinvalbuf; } vfs_getnewfsid(mp); nmp->nm_mountp = mp; mtx_init(&nmp->nm_mtx, "NFSmount lock", NULL, MTX_DEF | MTX_DUPOK); /* * Since nfs_decode_args() might optionally set them, these * need to be set to defaults before the call, so that the * optional settings aren't overwritten. */ nmp->nm_nametimeo = nametimeo; nmp->nm_negnametimeo = negnametimeo; nmp->nm_timeo = NFS_TIMEO; nmp->nm_retry = NFS_RETRANS; nmp->nm_readahead = NFS_DEFRAHEAD; /* This is empirical approximation of sqrt(hibufspace) * 256. */ nmp->nm_wcommitsize = NFS_MAXBSIZE / 256; while ((long)nmp->nm_wcommitsize * nmp->nm_wcommitsize < hibufspace) nmp->nm_wcommitsize *= 2; nmp->nm_wcommitsize *= 256; if ((argp->flags & NFSMNT_NFSV4) != 0) nmp->nm_minorvers = minvers; else nmp->nm_minorvers = 0; nfs_decode_args(mp, nmp, argp, hst, cred, td); /* * V2 can only handle 32 bit filesizes. A 4GB-1 limit may be too * high, depending on whether we end up with negative offsets in * the client or server somewhere. 2GB-1 may be safer. * * For V3, ncl_fsinfo will adjust this as necessary. Assume maximum * that we can handle until we find out otherwise. */ if ((argp->flags & (NFSMNT_NFSV3 | NFSMNT_NFSV4)) == 0) nmp->nm_maxfilesize = 0xffffffffLL; else nmp->nm_maxfilesize = OFF_MAX; if ((argp->flags & (NFSMNT_NFSV3 | NFSMNT_NFSV4)) == 0) { nmp->nm_wsize = NFS_WSIZE; nmp->nm_rsize = NFS_RSIZE; nmp->nm_readdirsize = NFS_READDIRSIZE; } nmp->nm_numgrps = NFS_MAXGRPS; nmp->nm_tprintf_delay = nfs_tprintf_delay; if (nmp->nm_tprintf_delay < 0) nmp->nm_tprintf_delay = 0; nmp->nm_tprintf_initial_delay = nfs_tprintf_initial_delay; if (nmp->nm_tprintf_initial_delay < 0) nmp->nm_tprintf_initial_delay = 0; nmp->nm_fhsize = argp->fhsize; if (nmp->nm_fhsize > 0) bcopy((caddr_t)argp->fh, (caddr_t)nmp->nm_fh, argp->fhsize); bcopy(hst, mp->mnt_stat.f_mntfromname, MNAMELEN); nmp->nm_nam = nam; /* Set up the sockets and per-host congestion */ nmp->nm_sotype = argp->sotype; nmp->nm_soproto = argp->proto; nmp->nm_sockreq.nr_prog = NFS_PROG; if ((argp->flags & NFSMNT_NFSV4)) nmp->nm_sockreq.nr_vers = NFS_VER4; else if ((argp->flags & NFSMNT_NFSV3)) nmp->nm_sockreq.nr_vers = NFS_VER3; else nmp->nm_sockreq.nr_vers = NFS_VER2; if ((error = newnfs_connect(nmp, &nmp->nm_sockreq, cred, td, 0, false))) goto bad; /* For NFSv4.1, get the clientid now. */ if (nmp->nm_minorvers > 0) { NFSCL_DEBUG(3, "at getcl\n"); error = nfscl_getcl(mp, cred, td, 0, &clp); NFSCL_DEBUG(3, "aft getcl=%d\n", error); if (error != 0) goto bad; } if (nmp->nm_fhsize == 0 && (nmp->nm_flag & NFSMNT_NFSV4) && nmp->nm_dirpathlen > 0) { NFSCL_DEBUG(3, "in dirp\n"); /* * If the fhsize on the mount point == 0 for V4, the mount * path needs to be looked up. */ trycnt = 3; do { error = nfsrpc_getdirpath(nmp, NFSMNT_DIRPATH(nmp), cred, td); NFSCL_DEBUG(3, "aft dirp=%d\n", error); if (error) (void) nfs_catnap(PZERO, error, "nfsgetdirp"); } while (error && --trycnt > 0); if (error) goto bad; } /* * A reference count is needed on the nfsnode representing the * remote root. If this object is not persistent, then backward * traversals of the mount point (i.e. "..") will not work if * the nfsnode gets flushed out of the cache. Ufs does not have * this problem, because one can identify root inodes by their * number == UFS_ROOTINO (2). */ if (nmp->nm_fhsize > 0) { /* * Set f_iosize to NFS_DIRBLKSIZ so that bo_bsize gets set * non-zero for the root vnode. f_iosize will be set correctly * by nfs_statfs() before any I/O occurs. */ mp->mnt_stat.f_iosize = NFS_DIRBLKSIZ; error = ncl_nget(mp, nmp->nm_fh, nmp->nm_fhsize, &np, LK_EXCLUSIVE); if (error) goto bad; *vpp = NFSTOV(np); /* * Get file attributes and transfer parameters for the * mountpoint. This has the side effect of filling in * (*vpp)->v_type with the correct value. */ ret = nfsrpc_getattrnovp(nmp, nmp->nm_fh, nmp->nm_fhsize, 1, cred, td, &nfsva, NULL, &lease); if (ret) { /* * Just set default values to get things going. */ NFSBZERO((caddr_t)&nfsva, sizeof (struct nfsvattr)); nfsva.na_vattr.va_type = VDIR; nfsva.na_vattr.va_mode = 0777; nfsva.na_vattr.va_nlink = 100; nfsva.na_vattr.va_uid = (uid_t)0; nfsva.na_vattr.va_gid = (gid_t)0; nfsva.na_vattr.va_fileid = 2; nfsva.na_vattr.va_gen = 1; nfsva.na_vattr.va_blocksize = NFS_FABLKSIZE; nfsva.na_vattr.va_size = 512 * 1024; lease = 60; } (void) nfscl_loadattrcache(vpp, &nfsva, NULL, NULL, 0, 1); if (nmp->nm_minorvers > 0) { NFSCL_DEBUG(3, "lease=%d\n", (int)lease); NFSLOCKCLSTATE(); clp->nfsc_renew = NFSCL_RENEW(lease); clp->nfsc_expire = NFSD_MONOSEC + clp->nfsc_renew; clp->nfsc_clientidrev++; if (clp->nfsc_clientidrev == 0) clp->nfsc_clientidrev++; NFSUNLOCKCLSTATE(); /* * Mount will succeed, so the renew thread can be * started now. */ nfscl_start_renewthread(clp); nfscl_clientrelease(clp); } if (argp->flags & NFSMNT_NFSV3) ncl_fsinfo(nmp, *vpp, cred, td); /* Mark if the mount point supports NFSv4 ACLs. */ if ((argp->flags & NFSMNT_NFSV4) != 0 && nfsrv_useacl != 0 && ret == 0 && NFSISSET_ATTRBIT(&nfsva.na_suppattr, NFSATTRBIT_ACL)) { MNT_ILOCK(mp); mp->mnt_flag |= MNT_NFS4ACLS; MNT_IUNLOCK(mp); } /* * Lose the lock but keep the ref. */ NFSVOPUNLOCK(*vpp); vfs_cache_root_set(mp, *vpp); return (0); } error = EIO; bad: if (clp != NULL) nfscl_clientrelease(clp); newnfs_disconnect(&nmp->nm_sockreq); crfree(nmp->nm_sockreq.nr_cred); if (nmp->nm_sockreq.nr_auth != NULL) AUTH_DESTROY(nmp->nm_sockreq.nr_auth); mtx_destroy(&nmp->nm_sockreq.nr_mtx); mtx_destroy(&nmp->nm_mtx); if (nmp->nm_clp != NULL) { NFSLOCKCLSTATE(); LIST_REMOVE(nmp->nm_clp, nfsc_list); NFSUNLOCKCLSTATE(); free(nmp->nm_clp, M_NFSCLCLIENT); } TAILQ_FOREACH_SAFE(dsp, &nmp->nm_sess, nfsclds_list, tdsp) { if (dsp != TAILQ_FIRST(&nmp->nm_sess) && dsp->nfsclds_sockp != NULL) newnfs_disconnect(dsp->nfsclds_sockp); nfscl_freenfsclds(dsp); } free(nmp, M_NEWNFSMNT); free(nam, M_SONAME); return (error); } /* * unmount system call */ static int nfs_unmount(struct mount *mp, int mntflags) { struct thread *td; struct nfsmount *nmp; int error, flags = 0, i, trycnt = 0; struct nfsclds *dsp, *tdsp; td = curthread; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; nmp = VFSTONFS(mp); error = 0; /* * Goes something like this.. * - Call vflush() to clear out vnodes for this filesystem * - Close the socket * - Free up the data structures */ /* In the forced case, cancel any outstanding requests. */ if (mntflags & MNT_FORCE) { NFSDDSLOCK(); if (nfsv4_findmirror(nmp) != NULL) error = ENXIO; NFSDDSUNLOCK(); if (error) goto out; error = newnfs_nmcancelreqs(nmp); if (error) goto out; /* For a forced close, get rid of the renew thread now */ nfscl_umount(nmp, td); } /* We hold 1 extra ref on the root vnode; see comment in mountnfs(). */ do { error = vflush(mp, 1, flags, td); if ((mntflags & MNT_FORCE) && error != 0 && ++trycnt < 30) (void) nfs_catnap(PSOCK, error, "newndm"); } while ((mntflags & MNT_FORCE) && error != 0 && trycnt < 30); if (error) goto out; /* * We are now committed to the unmount. */ if ((mntflags & MNT_FORCE) == 0) nfscl_umount(nmp, td); else { mtx_lock(&nmp->nm_mtx); nmp->nm_privflag |= NFSMNTP_FORCEDISM; mtx_unlock(&nmp->nm_mtx); } /* Make sure no nfsiods are assigned to this mount. */ NFSLOCKIOD(); for (i = 0; i < NFS_MAXASYNCDAEMON; i++) if (ncl_iodmount[i] == nmp) { ncl_iodwant[i] = NFSIOD_AVAILABLE; ncl_iodmount[i] = NULL; } NFSUNLOCKIOD(); /* * We can now set mnt_data to NULL and wait for * nfssvc(NFSSVC_FORCEDISM) to complete. */ mtx_lock(&mountlist_mtx); mtx_lock(&nmp->nm_mtx); mp->mnt_data = NULL; mtx_unlock(&mountlist_mtx); while ((nmp->nm_privflag & NFSMNTP_CANCELRPCS) != 0) msleep(nmp, &nmp->nm_mtx, PVFS, "nfsfdism", 0); mtx_unlock(&nmp->nm_mtx); newnfs_disconnect(&nmp->nm_sockreq); crfree(nmp->nm_sockreq.nr_cred); free(nmp->nm_nam, M_SONAME); if (nmp->nm_sockreq.nr_auth != NULL) AUTH_DESTROY(nmp->nm_sockreq.nr_auth); mtx_destroy(&nmp->nm_sockreq.nr_mtx); mtx_destroy(&nmp->nm_mtx); TAILQ_FOREACH_SAFE(dsp, &nmp->nm_sess, nfsclds_list, tdsp) { if (dsp != TAILQ_FIRST(&nmp->nm_sess) && dsp->nfsclds_sockp != NULL) newnfs_disconnect(dsp->nfsclds_sockp); nfscl_freenfsclds(dsp); } free(nmp, M_NEWNFSMNT); out: return (error); } /* * Return root of a filesystem */ static int nfs_root(struct mount *mp, int flags, struct vnode **vpp) { struct vnode *vp; struct nfsmount *nmp; struct nfsnode *np; int error; nmp = VFSTONFS(mp); error = ncl_nget(mp, nmp->nm_fh, nmp->nm_fhsize, &np, flags); if (error) return error; vp = NFSTOV(np); /* * Get transfer parameters and attributes for root vnode once. */ mtx_lock(&nmp->nm_mtx); if (NFSHASNFSV3(nmp) && !NFSHASGOTFSINFO(nmp)) { mtx_unlock(&nmp->nm_mtx); ncl_fsinfo(nmp, vp, curthread->td_ucred, curthread); } else mtx_unlock(&nmp->nm_mtx); if (vp->v_type == VNON) vp->v_type = VDIR; vp->v_vflag |= VV_ROOT; *vpp = vp; return (0); } /* * Flush out the buffer cache */ /* ARGSUSED */ static int nfs_sync(struct mount *mp, int waitfor) { struct vnode *vp, *mvp; struct thread *td; int error, allerror = 0; td = curthread; MNT_ILOCK(mp); /* * If a forced dismount is in progress, return from here so that * the umount(2) syscall doesn't get stuck in VFS_SYNC() before * calling VFS_UNMOUNT(). */ if (NFSCL_FORCEDISM(mp)) { MNT_IUNLOCK(mp); return (EBADF); } MNT_IUNLOCK(mp); /* * Force stale buffer cache information to be flushed. */ loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* XXX Racy bv_cnt check. */ if (NFSVOPISLOCKED(vp) || vp->v_bufobj.bo_dirty.bv_cnt == 0 || waitfor == MNT_LAZY) { VI_UNLOCK(vp); continue; } - if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { + if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK)) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } error = VOP_FSYNC(vp, waitfor, td); if (error) allerror = error; NFSVOPUNLOCK(vp); vrele(vp); } return (allerror); } static int nfs_sysctl(struct mount *mp, fsctlop_t op, struct sysctl_req *req) { struct nfsmount *nmp = VFSTONFS(mp); struct vfsquery vq; int error; bzero(&vq, sizeof(vq)); switch (op) { #if 0 case VFS_CTL_NOLOCKS: val = (nmp->nm_flag & NFSMNT_NOLOCKS) ? 1 : 0; if (req->oldptr != NULL) { error = SYSCTL_OUT(req, &val, sizeof(val)); if (error) return (error); } if (req->newptr != NULL) { error = SYSCTL_IN(req, &val, sizeof(val)); if (error) return (error); if (val) nmp->nm_flag |= NFSMNT_NOLOCKS; else nmp->nm_flag &= ~NFSMNT_NOLOCKS; } break; #endif case VFS_CTL_QUERY: mtx_lock(&nmp->nm_mtx); if (nmp->nm_state & NFSSTA_TIMEO) vq.vq_flags |= VQ_NOTRESP; mtx_unlock(&nmp->nm_mtx); #if 0 if (!(nmp->nm_flag & NFSMNT_NOLOCKS) && (nmp->nm_state & NFSSTA_LOCKTIMEO)) vq.vq_flags |= VQ_NOTRESPLOCK; #endif error = SYSCTL_OUT(req, &vq, sizeof(vq)); break; case VFS_CTL_TIMEO: if (req->oldptr != NULL) { error = SYSCTL_OUT(req, &nmp->nm_tprintf_initial_delay, sizeof(nmp->nm_tprintf_initial_delay)); if (error) return (error); } if (req->newptr != NULL) { error = vfs_suser(mp, req->td); if (error) return (error); error = SYSCTL_IN(req, &nmp->nm_tprintf_initial_delay, sizeof(nmp->nm_tprintf_initial_delay)); if (error) return (error); if (nmp->nm_tprintf_initial_delay < 0) nmp->nm_tprintf_initial_delay = 0; } break; default: return (ENOTSUP); } return (0); } /* * Purge any RPCs in progress, so that they will all return errors. * This allows dounmount() to continue as far as VFS_UNMOUNT() for a * forced dismount. */ static void nfs_purge(struct mount *mp) { struct nfsmount *nmp = VFSTONFS(mp); newnfs_nmcancelreqs(nmp); } /* * Extract the information needed by the nlm from the nfs vnode. */ static void nfs_getnlminfo(struct vnode *vp, uint8_t *fhp, size_t *fhlenp, struct sockaddr_storage *sp, int *is_v3p, off_t *sizep, struct timeval *timeop) { struct nfsmount *nmp; struct nfsnode *np = VTONFS(vp); nmp = VFSTONFS(vp->v_mount); if (fhlenp != NULL) *fhlenp = (size_t)np->n_fhp->nfh_len; if (fhp != NULL) bcopy(np->n_fhp->nfh_fh, fhp, np->n_fhp->nfh_len); if (sp != NULL) bcopy(nmp->nm_nam, sp, min(nmp->nm_nam->sa_len, sizeof(*sp))); if (is_v3p != NULL) *is_v3p = NFS_ISV3(vp); if (sizep != NULL) *sizep = np->n_size; if (timeop != NULL) { timeop->tv_sec = nmp->nm_timeo / NFS_HZ; timeop->tv_usec = (nmp->nm_timeo % NFS_HZ) * (1000000 / NFS_HZ); } } /* * This function prints out an option name, based on the conditional * argument. */ static __inline void nfscl_printopt(struct nfsmount *nmp, int testval, char *opt, char **buf, size_t *blen) { int len; if (testval != 0 && *blen > strlen(opt)) { len = snprintf(*buf, *blen, "%s", opt); if (len != strlen(opt)) printf("EEK!!\n"); *buf += len; *blen -= len; } } /* * This function printf out an options integer value. */ static __inline void nfscl_printoptval(struct nfsmount *nmp, int optval, char *opt, char **buf, size_t *blen) { int len; if (*blen > strlen(opt) + 1) { /* Could result in truncated output string. */ len = snprintf(*buf, *blen, "%s=%d", opt, optval); if (len < *blen) { *buf += len; *blen -= len; } } } /* * Load the option flags and values into the buffer. */ void nfscl_retopts(struct nfsmount *nmp, char *buffer, size_t buflen) { char *buf; size_t blen; buf = buffer; blen = buflen; nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_NFSV4) != 0, "nfsv4", &buf, &blen); if ((nmp->nm_flag & NFSMNT_NFSV4) != 0) { nfscl_printoptval(nmp, nmp->nm_minorvers, ",minorversion", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_PNFS) != 0, ",pnfs", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_ONEOPENOWN) != 0 && nmp->nm_minorvers > 0, ",oneopenown", &buf, &blen); } nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_NFSV3) != 0, "nfsv3", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & (NFSMNT_NFSV3 | NFSMNT_NFSV4)) == 0, "nfsv2", &buf, &blen); nfscl_printopt(nmp, nmp->nm_sotype == SOCK_STREAM, ",tcp", &buf, &blen); nfscl_printopt(nmp, nmp->nm_sotype != SOCK_STREAM, ",udp", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_RESVPORT) != 0, ",resvport", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_NOCONN) != 0, ",noconn", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_SOFT) == 0, ",hard", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_SOFT) != 0, ",soft", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_INT) != 0, ",intr", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_NOCTO) == 0, ",cto", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_NOCTO) != 0, ",nocto", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_NONCONTIGWR) != 0, ",noncontigwr", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & (NFSMNT_NOLOCKD | NFSMNT_NFSV4)) == 0, ",lockd", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & (NFSMNT_NOLOCKD | NFSMNT_NFSV4)) == NFSMNT_NOLOCKD, ",nolockd", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_RDIRPLUS) != 0, ",rdirplus", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & NFSMNT_KERB) == 0, ",sec=sys", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & (NFSMNT_KERB | NFSMNT_INTEGRITY | NFSMNT_PRIVACY)) == NFSMNT_KERB, ",sec=krb5", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & (NFSMNT_KERB | NFSMNT_INTEGRITY | NFSMNT_PRIVACY)) == (NFSMNT_KERB | NFSMNT_INTEGRITY), ",sec=krb5i", &buf, &blen); nfscl_printopt(nmp, (nmp->nm_flag & (NFSMNT_KERB | NFSMNT_INTEGRITY | NFSMNT_PRIVACY)) == (NFSMNT_KERB | NFSMNT_PRIVACY), ",sec=krb5p", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_acdirmin, ",acdirmin", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_acdirmax, ",acdirmax", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_acregmin, ",acregmin", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_acregmax, ",acregmax", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_nametimeo, ",nametimeo", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_negnametimeo, ",negnametimeo", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_rsize, ",rsize", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_wsize, ",wsize", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_readdirsize, ",readdirsize", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_readahead, ",readahead", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_wcommitsize, ",wcommitsize", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_timeo, ",timeout", &buf, &blen); nfscl_printoptval(nmp, nmp->nm_retry, ",retrans", &buf, &blen); } Index: projects/clang1100-import/sys/fs/nullfs/null_vfsops.c =================================================================== --- projects/clang1100-import/sys/fs/nullfs/null_vfsops.c (revision 364278) +++ projects/clang1100-import/sys/fs/nullfs/null_vfsops.c (revision 364279) @@ -1,468 +1,468 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1992, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)null_vfsops.c 8.2 (Berkeley) 1/21/94 * * @(#)lofs_vfsops.c 1.2 (Berkeley) 6/18/92 * $FreeBSD$ */ /* * Null Layer * (See null_vnops.c for a description of what this does.) */ #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_NULLFSMNT, "nullfs_mount", "NULLFS mount structure"); static vfs_fhtovp_t nullfs_fhtovp; static vfs_mount_t nullfs_mount; static vfs_quotactl_t nullfs_quotactl; static vfs_root_t nullfs_root; static vfs_sync_t nullfs_sync; static vfs_statfs_t nullfs_statfs; static vfs_unmount_t nullfs_unmount; static vfs_vget_t nullfs_vget; static vfs_extattrctl_t nullfs_extattrctl; /* * Mount null layer */ static int nullfs_mount(struct mount *mp) { struct vnode *lowerrootvp; struct vnode *nullm_rootvp; struct null_mount *xmp; struct null_node *nn; struct nameidata nd, *ndp; char *target; int error, len; bool isvnunlocked; NULLFSDEBUG("nullfs_mount(mp = %p)\n", (void *)mp); if (mp->mnt_flag & MNT_ROOTFS) return (EOPNOTSUPP); /* * Update is a no-op */ if (mp->mnt_flag & MNT_UPDATE) { /* * Only support update mounts for NFS export. */ if (vfs_flagopt(mp->mnt_optnew, "export", NULL, 0)) return (0); else return (EOPNOTSUPP); } /* * Get argument */ error = vfs_getopt(mp->mnt_optnew, "target", (void **)&target, &len); if (error || target[len - 1] != '\0') return (EINVAL); /* * Unlock lower node to avoid possible deadlock. */ if (mp->mnt_vnodecovered->v_op == &null_vnodeops && VOP_ISLOCKED(mp->mnt_vnodecovered) == LK_EXCLUSIVE) { VOP_UNLOCK(mp->mnt_vnodecovered); isvnunlocked = true; } else { isvnunlocked = false; } /* * Find lower node */ ndp = &nd; NDINIT(ndp, LOOKUP, FOLLOW|LOCKLEAF, UIO_SYSSPACE, target, curthread); error = namei(ndp); /* * Re-lock vnode. * XXXKIB This is deadlock-prone as well. */ if (isvnunlocked) vn_lock(mp->mnt_vnodecovered, LK_EXCLUSIVE | LK_RETRY); if (error) return (error); NDFREE(ndp, NDF_ONLY_PNBUF); /* * Sanity check on lower vnode */ lowerrootvp = ndp->ni_vp; /* * Check multi null mount to avoid `lock against myself' panic. */ if (mp->mnt_vnodecovered->v_op == &null_vnodeops) { nn = VTONULL(mp->mnt_vnodecovered); if (nn == NULL || lowerrootvp == nn->null_lowervp) { NULLFSDEBUG("nullfs_mount: multi null mount?\n"); vput(lowerrootvp); return (EDEADLK); } } xmp = (struct null_mount *) malloc(sizeof(struct null_mount), M_NULLFSMNT, M_WAITOK | M_ZERO); /* * Save pointer to underlying FS and the reference to the * lower root vnode. */ xmp->nullm_vfs = lowerrootvp->v_mount; vref(lowerrootvp); xmp->nullm_lowerrootvp = lowerrootvp; mp->mnt_data = xmp; /* * Make sure the node alias worked. */ error = null_nodeget(mp, lowerrootvp, &nullm_rootvp); if (error != 0) { vrele(lowerrootvp); free(xmp, M_NULLFSMNT); return (error); } if (NULLVPTOLOWERVP(nullm_rootvp)->v_mount->mnt_flag & MNT_LOCAL) { MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); } xmp->nullm_flags |= NULLM_CACHE; if (vfs_getopt(mp->mnt_optnew, "nocache", NULL, NULL) == 0 || (xmp->nullm_vfs->mnt_kern_flag & MNTK_NULL_NOCACHE) != 0) xmp->nullm_flags &= ~NULLM_CACHE; MNT_ILOCK(mp); if ((xmp->nullm_flags & NULLM_CACHE) != 0) { mp->mnt_kern_flag |= lowerrootvp->v_mount->mnt_kern_flag & (MNTK_SHARED_WRITES | MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED); } mp->mnt_kern_flag |= MNTK_LOOKUP_EXCL_DOTDOT | MNTK_NOMSYNC; mp->mnt_kern_flag |= lowerrootvp->v_mount->mnt_kern_flag & (MNTK_USES_BCACHE | MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS); MNT_IUNLOCK(mp); vfs_getnewfsid(mp); if ((xmp->nullm_flags & NULLM_CACHE) != 0) { MNT_ILOCK(xmp->nullm_vfs); TAILQ_INSERT_TAIL(&xmp->nullm_vfs->mnt_uppers, mp, mnt_upper_link); MNT_IUNLOCK(xmp->nullm_vfs); } vfs_mountedfrom(mp, target); vput(nullm_rootvp); NULLFSDEBUG("nullfs_mount: lower %s, alias at %s\n", mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname); return (0); } /* * Free reference to null layer */ static int nullfs_unmount(mp, mntflags) struct mount *mp; int mntflags; { struct null_mount *mntdata; struct mount *ump; int error, flags; NULLFSDEBUG("nullfs_unmount: mp = %p\n", (void *)mp); if (mntflags & MNT_FORCE) flags = FORCECLOSE; else flags = 0; for (;;) { /* There is 1 extra root vnode reference (nullm_rootvp). */ error = vflush(mp, 0, flags, curthread); if (error) return (error); MNT_ILOCK(mp); if (mp->mnt_nvnodelistsize == 0) { MNT_IUNLOCK(mp); break; } MNT_IUNLOCK(mp); if ((mntflags & MNT_FORCE) == 0) return (EBUSY); } /* * Finally, throw away the null_mount structure */ mntdata = mp->mnt_data; ump = mntdata->nullm_vfs; if ((mntdata->nullm_flags & NULLM_CACHE) != 0) { MNT_ILOCK(ump); while ((ump->mnt_kern_flag & MNTK_VGONE_UPPER) != 0) { ump->mnt_kern_flag |= MNTK_VGONE_WAITER; msleep(&ump->mnt_uppers, &ump->mnt_mtx, 0, "vgnupw", 0); } TAILQ_REMOVE(&ump->mnt_uppers, mp, mnt_upper_link); MNT_IUNLOCK(ump); } vrele(mntdata->nullm_lowerrootvp); mp->mnt_data = NULL; free(mntdata, M_NULLFSMNT); return (0); } static int nullfs_root(mp, flags, vpp) struct mount *mp; int flags; struct vnode **vpp; { struct vnode *vp; struct null_mount *mntdata; int error; mntdata = MOUNTTONULLMOUNT(mp); NULLFSDEBUG("nullfs_root(mp = %p, vp = %p)\n", mp, mntdata->nullm_lowerrootvp); - error = vget(mntdata->nullm_lowerrootvp, flags, curthread); + error = vget(mntdata->nullm_lowerrootvp, flags); if (error == 0) { error = null_nodeget(mp, mntdata->nullm_lowerrootvp, &vp); if (error == 0) { *vpp = vp; } } return (error); } static int nullfs_quotactl(mp, cmd, uid, arg) struct mount *mp; int cmd; uid_t uid; void *arg; { return VFS_QUOTACTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd, uid, arg); } static int nullfs_statfs(mp, sbp) struct mount *mp; struct statfs *sbp; { int error; struct statfs *mstat; NULLFSDEBUG("nullfs_statfs(mp = %p, vp = %p->%p)\n", (void *)mp, (void *)MOUNTTONULLMOUNT(mp)->nullm_rootvp, (void *)NULLVPTOLOWERVP(MOUNTTONULLMOUNT(mp)->nullm_rootvp)); mstat = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK | M_ZERO); error = VFS_STATFS(MOUNTTONULLMOUNT(mp)->nullm_vfs, mstat); if (error) { free(mstat, M_STATFS); return (error); } /* now copy across the "interesting" information and fake the rest */ sbp->f_type = mstat->f_type; sbp->f_flags = (sbp->f_flags & (MNT_RDONLY | MNT_NOEXEC | MNT_NOSUID | MNT_UNION | MNT_NOSYMFOLLOW | MNT_AUTOMOUNTED)) | (mstat->f_flags & ~(MNT_ROOTFS | MNT_AUTOMOUNTED)); sbp->f_bsize = mstat->f_bsize; sbp->f_iosize = mstat->f_iosize; sbp->f_blocks = mstat->f_blocks; sbp->f_bfree = mstat->f_bfree; sbp->f_bavail = mstat->f_bavail; sbp->f_files = mstat->f_files; sbp->f_ffree = mstat->f_ffree; free(mstat, M_STATFS); return (0); } static int nullfs_sync(mp, waitfor) struct mount *mp; int waitfor; { /* * XXX - Assumes no data cached at null layer. */ return (0); } static int nullfs_vget(mp, ino, flags, vpp) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; { int error; KASSERT((flags & LK_TYPE_MASK) != 0, ("nullfs_vget: no lock requested")); error = VFS_VGET(MOUNTTONULLMOUNT(mp)->nullm_vfs, ino, flags, vpp); if (error != 0) return (error); return (null_nodeget(mp, *vpp, vpp)); } static int nullfs_fhtovp(mp, fidp, flags, vpp) struct mount *mp; struct fid *fidp; int flags; struct vnode **vpp; { int error; error = VFS_FHTOVP(MOUNTTONULLMOUNT(mp)->nullm_vfs, fidp, flags, vpp); if (error != 0) return (error); return (null_nodeget(mp, *vpp, vpp)); } static int nullfs_extattrctl(mp, cmd, filename_vp, namespace, attrname) struct mount *mp; int cmd; struct vnode *filename_vp; int namespace; const char *attrname; { return (VFS_EXTATTRCTL(MOUNTTONULLMOUNT(mp)->nullm_vfs, cmd, filename_vp, namespace, attrname)); } static void nullfs_reclaim_lowervp(struct mount *mp, struct vnode *lowervp) { struct vnode *vp; vp = null_hashget(mp, lowervp); if (vp == NULL) return; VTONULL(vp)->null_flags |= NULLV_NOUNLOCK; vgone(vp); vput(vp); } static void nullfs_unlink_lowervp(struct mount *mp, struct vnode *lowervp) { struct vnode *vp; struct null_node *xp; vp = null_hashget(mp, lowervp); if (vp == NULL) return; xp = VTONULL(vp); xp->null_flags |= NULLV_DROP | NULLV_NOUNLOCK; vhold(vp); vunref(vp); if (vp->v_usecount == 0) { /* * If vunref() dropped the last use reference on the * nullfs vnode, it must be reclaimed, and its lock * was split from the lower vnode lock. Need to do * extra unlock before allowing the final vdrop() to * free the vnode. */ KASSERT(VN_IS_DOOMED(vp), ("not reclaimed nullfs vnode %p", vp)); VOP_UNLOCK(vp); } else { /* * Otherwise, the nullfs vnode still shares the lock * with the lower vnode, and must not be unlocked. * Also clear the NULLV_NOUNLOCK, the flag is not * relevant for future reclamations. */ ASSERT_VOP_ELOCKED(vp, "unlink_lowervp"); KASSERT(!VN_IS_DOOMED(vp), ("reclaimed nullfs vnode %p", vp)); xp->null_flags &= ~NULLV_NOUNLOCK; } vdrop(vp); } static struct vfsops null_vfsops = { .vfs_extattrctl = nullfs_extattrctl, .vfs_fhtovp = nullfs_fhtovp, .vfs_init = nullfs_init, .vfs_mount = nullfs_mount, .vfs_quotactl = nullfs_quotactl, .vfs_root = nullfs_root, .vfs_statfs = nullfs_statfs, .vfs_sync = nullfs_sync, .vfs_uninit = nullfs_uninit, .vfs_unmount = nullfs_unmount, .vfs_vget = nullfs_vget, .vfs_reclaim_lowervp = nullfs_reclaim_lowervp, .vfs_unlink_lowervp = nullfs_unlink_lowervp, }; VFS_SET(null_vfsops, nullfs, VFCF_LOOPBACK | VFCF_JAIL); Index: projects/clang1100-import/sys/fs/pseudofs/pseudofs_vncache.c =================================================================== --- projects/clang1100-import/sys/fs/pseudofs/pseudofs_vncache.c (revision 364278) +++ projects/clang1100-import/sys/fs/pseudofs/pseudofs_vncache.c (revision 364279) @@ -1,361 +1,361 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2001 Dag-Erling Coïdan Smørgrav * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_pseudofs.h" #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_PFSVNCACHE, "pfs_vncache", "pseudofs vnode cache"); static struct mtx pfs_vncache_mutex; static eventhandler_tag pfs_exit_tag; static void pfs_exit(void *arg, struct proc *p); static void pfs_purge_all(void); static SYSCTL_NODE(_vfs_pfs, OID_AUTO, vncache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "pseudofs vnode cache"); static int pfs_vncache_entries; SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, entries, CTLFLAG_RD, &pfs_vncache_entries, 0, "number of entries in the vnode cache"); static int pfs_vncache_maxentries; SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, maxentries, CTLFLAG_RD, &pfs_vncache_maxentries, 0, "highest number of entries in the vnode cache"); static int pfs_vncache_hits; SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, hits, CTLFLAG_RD, &pfs_vncache_hits, 0, "number of cache hits since initialization"); static int pfs_vncache_misses; SYSCTL_INT(_vfs_pfs_vncache, OID_AUTO, misses, CTLFLAG_RD, &pfs_vncache_misses, 0, "number of cache misses since initialization"); extern struct vop_vector pfs_vnodeops; /* XXX -> .h file */ static SLIST_HEAD(pfs_vncache_head, pfs_vdata) *pfs_vncache_hashtbl; static u_long pfs_vncache_hash; #define PFS_VNCACHE_HASH(pid) (&pfs_vncache_hashtbl[(pid) & pfs_vncache_hash]) /* * Initialize vnode cache */ void pfs_vncache_load(void) { mtx_init(&pfs_vncache_mutex, "pfs_vncache", NULL, MTX_DEF); pfs_vncache_hashtbl = hashinit(maxproc / 4, M_PFSVNCACHE, &pfs_vncache_hash); pfs_exit_tag = EVENTHANDLER_REGISTER(process_exit, pfs_exit, NULL, EVENTHANDLER_PRI_ANY); } /* * Tear down vnode cache */ void pfs_vncache_unload(void) { EVENTHANDLER_DEREGISTER(process_exit, pfs_exit_tag); pfs_purge_all(); KASSERT(pfs_vncache_entries == 0, ("%d vncache entries remaining", pfs_vncache_entries)); mtx_destroy(&pfs_vncache_mutex); } /* * Allocate a vnode */ int pfs_vncache_alloc(struct mount *mp, struct vnode **vpp, struct pfs_node *pn, pid_t pid) { struct pfs_vncache_head *hash; struct pfs_vdata *pvd, *pvd2; struct vnode *vp; int error; /* * See if the vnode is in the cache. */ hash = PFS_VNCACHE_HASH(pid); if (SLIST_EMPTY(hash)) goto alloc; retry: mtx_lock(&pfs_vncache_mutex); SLIST_FOREACH(pvd, hash, pvd_hash) { if (pvd->pvd_pn == pn && pvd->pvd_pid == pid && pvd->pvd_vnode->v_mount == mp) { vp = pvd->pvd_vnode; VI_LOCK(vp); mtx_unlock(&pfs_vncache_mutex); - if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, curthread) == 0) { + if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0) { ++pfs_vncache_hits; *vpp = vp; /* * Some callers cache_enter(vp) later, so * we have to make sure it's not in the * VFS cache so it doesn't get entered * twice. A better solution would be to * make pfs_vncache_alloc() responsible * for entering the vnode in the VFS * cache. */ cache_purge(vp); return (0); } goto retry; } } mtx_unlock(&pfs_vncache_mutex); alloc: /* nope, get a new one */ pvd = malloc(sizeof *pvd, M_PFSVNCACHE, M_WAITOK); error = getnewvnode("pseudofs", mp, &pfs_vnodeops, vpp); if (error) { free(pvd, M_PFSVNCACHE); return (error); } pvd->pvd_pn = pn; pvd->pvd_pid = pid; (*vpp)->v_data = pvd; switch (pn->pn_type) { case pfstype_root: (*vpp)->v_vflag = VV_ROOT; #if 0 printf("root vnode allocated\n"); #endif /* fall through */ case pfstype_dir: case pfstype_this: case pfstype_parent: case pfstype_procdir: (*vpp)->v_type = VDIR; break; case pfstype_file: (*vpp)->v_type = VREG; break; case pfstype_symlink: (*vpp)->v_type = VLNK; break; case pfstype_none: KASSERT(0, ("pfs_vncache_alloc called for null node\n")); default: panic("%s has unexpected type: %d", pn->pn_name, pn->pn_type); } /* * Propagate flag through to vnode so users know it can change * if the process changes (i.e. execve) */ if ((pn->pn_flags & PFS_PROCDEP) != 0) (*vpp)->v_vflag |= VV_PROCDEP; pvd->pvd_vnode = *vpp; vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY); VN_LOCK_AREC(*vpp); error = insmntque(*vpp, mp); if (error != 0) { free(pvd, M_PFSVNCACHE); *vpp = NULLVP; return (error); } retry2: mtx_lock(&pfs_vncache_mutex); /* * Other thread may race with us, creating the entry we are * going to insert into the cache. Recheck after * pfs_vncache_mutex is reacquired. */ SLIST_FOREACH(pvd2, hash, pvd_hash) { if (pvd2->pvd_pn == pn && pvd2->pvd_pid == pid && pvd2->pvd_vnode->v_mount == mp) { vp = pvd2->pvd_vnode; VI_LOCK(vp); mtx_unlock(&pfs_vncache_mutex); - if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, curthread) == 0) { + if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK) == 0) { ++pfs_vncache_hits; vgone(*vpp); vput(*vpp); *vpp = vp; cache_purge(vp); return (0); } goto retry2; } } ++pfs_vncache_misses; if (++pfs_vncache_entries > pfs_vncache_maxentries) pfs_vncache_maxentries = pfs_vncache_entries; SLIST_INSERT_HEAD(hash, pvd, pvd_hash); mtx_unlock(&pfs_vncache_mutex); return (0); } /* * Free a vnode */ int pfs_vncache_free(struct vnode *vp) { struct pfs_vdata *pvd, *pvd2; mtx_lock(&pfs_vncache_mutex); pvd = (struct pfs_vdata *)vp->v_data; KASSERT(pvd != NULL, ("pfs_vncache_free(): no vnode data\n")); SLIST_FOREACH(pvd2, PFS_VNCACHE_HASH(pvd->pvd_pid), pvd_hash) { if (pvd2 != pvd) continue; SLIST_REMOVE(PFS_VNCACHE_HASH(pvd->pvd_pid), pvd, pfs_vdata, pvd_hash); --pfs_vncache_entries; break; } mtx_unlock(&pfs_vncache_mutex); free(pvd, M_PFSVNCACHE); vp->v_data = NULL; return (0); } /* * Purge the cache of dead entries * * The code is not very efficient and this perhaps can be addressed without * a complete rewrite. Previous iteration was walking a linked list from * scratch every time. This code only walks the relevant hash chain (if pid * is provided), but still resorts to scanning the entire cache at least twice * if a specific component is to be removed which is slower. This can be * augmented with resizing the hash. * * Explanation of the previous state: * * This is extremely inefficient due to the fact that vgone() not only * indirectly modifies the vnode cache, but may also sleep. We can * neither hold pfs_vncache_mutex across a vgone() call, nor make any * assumptions about the state of the cache after vgone() returns. In * consequence, we must start over after every vgone() call, and keep * trying until we manage to traverse the entire cache. * * The only way to improve this situation is to change the data structure * used to implement the cache. */ static void pfs_purge_one(struct vnode *vnp) { VOP_LOCK(vnp, LK_EXCLUSIVE); vgone(vnp); VOP_UNLOCK(vnp); vdrop(vnp); } void pfs_purge(struct pfs_node *pn) { struct pfs_vdata *pvd; struct vnode *vnp; u_long i, removed; mtx_lock(&pfs_vncache_mutex); restart: removed = 0; for (i = 0; i < pfs_vncache_hash; i++) { restart_chain: SLIST_FOREACH(pvd, &pfs_vncache_hashtbl[i], pvd_hash) { if (pn != NULL && pvd->pvd_pn != pn) continue; vnp = pvd->pvd_vnode; vhold(vnp); mtx_unlock(&pfs_vncache_mutex); pfs_purge_one(vnp); removed++; mtx_lock(&pfs_vncache_mutex); goto restart_chain; } } if (removed > 0) goto restart; mtx_unlock(&pfs_vncache_mutex); } static void pfs_purge_all(void) { pfs_purge(NULL); } /* * Free all vnodes associated with a defunct process */ static void pfs_exit(void *arg, struct proc *p) { struct pfs_vncache_head *hash; struct pfs_vdata *pvd; struct vnode *vnp; int pid; pid = p->p_pid; hash = PFS_VNCACHE_HASH(pid); if (SLIST_EMPTY(hash)) return; restart: mtx_lock(&pfs_vncache_mutex); SLIST_FOREACH(pvd, hash, pvd_hash) { if (pvd->pvd_pid != pid) continue; vnp = pvd->pvd_vnode; vhold(vnp); mtx_unlock(&pfs_vncache_mutex); pfs_purge_one(vnp); goto restart; } mtx_unlock(&pfs_vncache_mutex); } Index: projects/clang1100-import/sys/fs/smbfs/smbfs_node.c =================================================================== --- projects/clang1100-import/sys/fs/smbfs/smbfs_node.c (revision 364278) +++ projects/clang1100-import/sys/fs/smbfs/smbfs_node.c (revision 364279) @@ -1,408 +1,408 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2000-2001 Boris Popov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /*#include #include */ #include #include #include extern struct vop_vector smbfs_vnodeops; /* XXX -> .h file */ static MALLOC_DEFINE(M_SMBNODE, "smbufs_node", "SMBFS vnode private part"); static MALLOC_DEFINE(M_SMBNODENAME, "smbufs_nname", "SMBFS node name"); u_int32_t __inline smbfs_hash(const u_char *name, int nmlen) { return (fnv_32_buf(name, nmlen, FNV1_32_INIT)); } static char * smbfs_name_alloc(const u_char *name, int nmlen) { u_char *cp; nmlen++; cp = malloc(nmlen, M_SMBNODENAME, M_WAITOK); bcopy(name, cp, nmlen - 1); cp[nmlen - 1] = 0; return cp; } static void smbfs_name_free(u_char *name) { free(name, M_SMBNODENAME); } static int __inline smbfs_vnode_cmp(struct vnode *vp, void *_sc) { struct smbnode *np; struct smbcmp *sc; np = (struct smbnode *) vp->v_data; sc = (struct smbcmp *) _sc; if (np->n_parent != sc->n_parent || np->n_nmlen != sc->n_nmlen || bcmp(sc->n_name, np->n_name, sc->n_nmlen) != 0) return 1; return 0; } static int smbfs_node_alloc(struct mount *mp, struct vnode *dvp, const char *dirnm, int dirlen, const char *name, int nmlen, char sep, struct smbfattr *fap, struct vnode **vpp) { struct vattr vattr; struct thread *td = curthread; /* XXX */ struct smbmount *smp = VFSTOSMBFS(mp); struct smbnode *np, *dnp; struct vnode *vp, *vp2; struct smbcmp sc; char *p, *rpath; int error, rplen; sc.n_parent = dvp; sc.n_nmlen = nmlen; sc.n_name = name; if (smp->sm_root != NULL && dvp == NULL) { SMBERROR("do not allocate root vnode twice!\n"); return EINVAL; } if (nmlen == 2 && bcmp(name, "..", 2) == 0) { if (dvp == NULL) return EINVAL; vp = VTOSMB(VTOSMB(dvp)->n_parent)->n_vnode; - error = vget(vp, LK_EXCLUSIVE, td); + error = vget(vp, LK_EXCLUSIVE); if (error == 0) *vpp = vp; return error; } else if (nmlen == 1 && name[0] == '.') { SMBERROR("do not call me with dot!\n"); return EINVAL; } dnp = dvp ? VTOSMB(dvp) : NULL; if (dnp == NULL && dvp != NULL) { vn_printf(dvp, "smbfs_node_alloc: dead parent vnode "); return EINVAL; } error = vfs_hash_get(mp, smbfs_hash(name, nmlen), LK_EXCLUSIVE, td, vpp, smbfs_vnode_cmp, &sc); if (error) return (error); if (*vpp) { np = VTOSMB(*vpp); /* Force cached attributes to be refreshed if stale. */ (void)VOP_GETATTR(*vpp, &vattr, td->td_ucred); /* * If the file type on the server is inconsistent with * what it was when we created the vnode, kill the * bogus vnode now and fall through to the code below * to create a new one with the right type. */ if (((*vpp)->v_type == VDIR && (np->n_dosattr & SMB_FA_DIR) == 0) || ((*vpp)->v_type == VREG && (np->n_dosattr & SMB_FA_DIR) != 0)) { vgone(*vpp); vput(*vpp); } else { SMBVDEBUG("vnode taken from the hashtable\n"); return (0); } } /* * If we don't have node attributes, then it is an explicit lookup * for an existing vnode. */ if (fap == NULL) return ENOENT; error = getnewvnode("smbfs", mp, &smbfs_vnodeops, vpp); if (error) return (error); vp = *vpp; np = malloc(sizeof *np, M_SMBNODE, M_WAITOK | M_ZERO); rplen = dirlen; if (sep != '\0') rplen++; rplen += nmlen; rpath = malloc(rplen + 1, M_SMBNODENAME, M_WAITOK); p = rpath; bcopy(dirnm, p, dirlen); p += dirlen; if (sep != '\0') *p++ = sep; if (name != NULL) { bcopy(name, p, nmlen); p += nmlen; } *p = '\0'; MPASS(p == rpath + rplen); lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL); /* Vnode initialization */ vp->v_type = fap->fa_attr & SMB_FA_DIR ? VDIR : VREG; vp->v_data = np; np->n_vnode = vp; np->n_mount = VFSTOSMBFS(mp); np->n_rpath = rpath; np->n_rplen = rplen; np->n_nmlen = nmlen; np->n_name = smbfs_name_alloc(name, nmlen); np->n_ino = fap->fa_ino; if (dvp) { ASSERT_VOP_LOCKED(dvp, "smbfs_node_alloc"); np->n_parent = dvp; np->n_parentino = VTOSMB(dvp)->n_ino; if (/*vp->v_type == VDIR &&*/ (dvp->v_vflag & VV_ROOT) == 0) { vref(dvp); np->n_flag |= NREFPARENT; } } else if (vp->v_type == VREG) SMBERROR("new vnode '%s' born without parent ?\n", np->n_name); error = insmntque(vp, mp); if (error) { free(np, M_SMBNODE); return (error); } error = vfs_hash_insert(vp, smbfs_hash(name, nmlen), LK_EXCLUSIVE, td, &vp2, smbfs_vnode_cmp, &sc); if (error) return (error); if (vp2 != NULL) *vpp = vp2; return (0); } int smbfs_nget(struct mount *mp, struct vnode *dvp, const char *name, int nmlen, struct smbfattr *fap, struct vnode **vpp) { struct smbnode *dnp, *np; struct vnode *vp; int error, sep; dnp = (dvp) ? VTOSMB(dvp) : NULL; sep = 0; if (dnp != NULL) { sep = SMBFS_DNP_SEP(dnp); error = smbfs_node_alloc(mp, dvp, dnp->n_rpath, dnp->n_rplen, name, nmlen, sep, fap, &vp); } else error = smbfs_node_alloc(mp, NULL, "\\", 1, name, nmlen, sep, fap, &vp); if (error) return error; MPASS(vp != NULL); np = VTOSMB(vp); if (fap) smbfs_attr_cacheenter(vp, fap); *vpp = vp; return 0; } /* * Free smbnode, and give vnode back to system */ int smbfs_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; struct thread *a_p; } */ *ap; { struct vnode *vp = ap->a_vp; struct vnode *dvp; struct smbnode *np = VTOSMB(vp); struct smbmount *smp = VTOSMBFS(vp); SMBVDEBUG("%s,%d\n", np->n_name, vrefcnt(vp)); KASSERT((np->n_flag & NOPEN) == 0, ("file not closed before reclaim")); dvp = (np->n_parent && (np->n_flag & NREFPARENT)) ? np->n_parent : NULL; /* * Remove the vnode from its hash chain. */ vfs_hash_remove(vp); if (np->n_name) smbfs_name_free(np->n_name); if (np->n_rpath) free(np->n_rpath, M_SMBNODENAME); free(np, M_SMBNODE); vp->v_data = NULL; if (dvp != NULL) { vrele(dvp); /* * Indicate that we released something; see comment * in smbfs_unmount(). */ smp->sm_didrele = 1; } return 0; } int smbfs_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { struct thread *td = ap->a_td; struct ucred *cred = td->td_ucred; struct vnode *vp = ap->a_vp; struct smbnode *np = VTOSMB(vp); struct smb_cred *scred; struct vattr va; SMBVDEBUG("%s: %d\n", VTOSMB(vp)->n_name, vrefcnt(vp)); if ((np->n_flag & NOPEN) != 0) { scred = smbfs_malloc_scred(); smb_makescred(scred, td, cred); smbfs_vinvalbuf(vp, td); if (vp->v_type == VREG) { VOP_GETATTR(vp, &va, cred); smbfs_smb_close(np->n_mount->sm_share, np->n_fid, &np->n_mtime, scred); } else if (vp->v_type == VDIR) { if (np->n_dirseq != NULL) { smbfs_findclose(np->n_dirseq, scred); np->n_dirseq = NULL; } } np->n_flag &= ~NOPEN; smbfs_attr_cacheremove(vp); smbfs_free_scred(scred); } if (np->n_flag & NGONE) vrecycle(vp); return (0); } /* * routines to maintain vnode attributes cache * smbfs_attr_cacheenter: unpack np.i to vattr structure */ void smbfs_attr_cacheenter(struct vnode *vp, struct smbfattr *fap) { struct smbnode *np = VTOSMB(vp); if (vp->v_type == VREG) { if (np->n_size != fap->fa_size) { np->n_size = fap->fa_size; vnode_pager_setsize(vp, np->n_size); } } else if (vp->v_type == VDIR) { np->n_size = 16384; /* should be a better way ... */ } else return; np->n_mtime = fap->fa_mtime; np->n_dosattr = fap->fa_attr; np->n_attrage = time_second; return; } int smbfs_attr_cachelookup(struct vnode *vp, struct vattr *va) { struct smbnode *np = VTOSMB(vp); struct smbmount *smp = VTOSMBFS(vp); int diff; diff = time_second - np->n_attrage; if (diff > 2) /* XXX should be configurable */ return ENOENT; va->va_type = vp->v_type; /* vnode type (for create) */ va->va_flags = 0; /* flags defined for file */ if (vp->v_type == VREG) { va->va_mode = smp->sm_file_mode; /* files access mode and type */ if (np->n_dosattr & SMB_FA_RDONLY) { va->va_mode &= ~(S_IWUSR|S_IWGRP|S_IWOTH); va->va_flags |= UF_READONLY; } } else if (vp->v_type == VDIR) { va->va_mode = smp->sm_dir_mode; /* files access mode and type */ } else return EINVAL; va->va_size = np->n_size; va->va_nlink = 1; /* number of references to file */ va->va_uid = smp->sm_uid; /* owner user id */ va->va_gid = smp->sm_gid; /* owner group id */ va->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; va->va_fileid = np->n_ino; /* file id */ if (va->va_fileid == 0) va->va_fileid = 2; va->va_blocksize = SSTOVC(smp->sm_share)->vc_txmax; va->va_mtime = np->n_mtime; va->va_atime = va->va_ctime = va->va_mtime; /* time file changed */ va->va_gen = VNOVAL; /* generation number of file */ if (np->n_dosattr & SMB_FA_HIDDEN) va->va_flags |= UF_HIDDEN; if (np->n_dosattr & SMB_FA_SYSTEM) va->va_flags |= UF_SYSTEM; /* * We don't set the archive bit for directories. */ if ((vp->v_type != VDIR) && (np->n_dosattr & SMB_FA_ARCHIVE)) va->va_flags |= UF_ARCHIVE; va->va_rdev = NODEV; /* device the special file represents */ va->va_bytes = va->va_size; /* bytes of disk space held by file */ va->va_filerev = 0; /* file modification number */ va->va_vaflags = 0; /* operations flags */ return 0; } Index: projects/clang1100-import/sys/fs/smbfs/smbfs_vfsops.c =================================================================== --- projects/clang1100-import/sys/fs/smbfs/smbfs_vfsops.c (revision 364278) +++ projects/clang1100-import/sys/fs/smbfs/smbfs_vfsops.c (revision 364279) @@ -1,411 +1,411 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2000-2001 Boris Popov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int smbfs_debuglevel = 0; static int smbfs_version = SMBFS_VERSION; SYSCTL_NODE(_vfs, OID_AUTO, smbfs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "SMB/CIFS filesystem"); SYSCTL_INT(_vfs_smbfs, OID_AUTO, version, CTLFLAG_RD, &smbfs_version, 0, ""); SYSCTL_INT(_vfs_smbfs, OID_AUTO, debuglevel, CTLFLAG_RW, &smbfs_debuglevel, 0, ""); static vfs_init_t smbfs_init; static vfs_uninit_t smbfs_uninit; static vfs_cmount_t smbfs_cmount; static vfs_mount_t smbfs_mount; static vfs_root_t smbfs_root; static vfs_quotactl_t smbfs_quotactl; static vfs_statfs_t smbfs_statfs; static vfs_unmount_t smbfs_unmount; static struct vfsops smbfs_vfsops = { .vfs_init = smbfs_init, .vfs_cmount = smbfs_cmount, .vfs_mount = smbfs_mount, .vfs_quotactl = smbfs_quotactl, .vfs_root = smbfs_root, .vfs_statfs = smbfs_statfs, .vfs_sync = vfs_stdsync, .vfs_uninit = smbfs_uninit, .vfs_unmount = smbfs_unmount, }; VFS_SET(smbfs_vfsops, smbfs, VFCF_NETWORK); MODULE_DEPEND(smbfs, netsmb, NSMB_VERSION, NSMB_VERSION, NSMB_VERSION); MODULE_DEPEND(smbfs, libiconv, 1, 1, 2); MODULE_DEPEND(smbfs, libmchain, 1, 1, 1); uma_zone_t smbfs_pbuf_zone; static int smbfs_cmount(struct mntarg *ma, void * data, uint64_t flags) { struct smbfs_args args; int error; error = copyin(data, &args, sizeof(struct smbfs_args)); if (error) return error; if (args.version != SMBFS_VERSION) { printf("mount version mismatch: kernel=%d, mount=%d\n", SMBFS_VERSION, args.version); return EINVAL; } ma = mount_argf(ma, "dev", "%d", args.dev); ma = mount_argb(ma, args.flags & SMBFS_MOUNT_SOFT, "nosoft"); ma = mount_argb(ma, args.flags & SMBFS_MOUNT_INTR, "nointr"); ma = mount_argb(ma, args.flags & SMBFS_MOUNT_STRONG, "nostrong"); ma = mount_argb(ma, args.flags & SMBFS_MOUNT_HAVE_NLS, "nohave_nls"); ma = mount_argb(ma, !(args.flags & SMBFS_MOUNT_NO_LONG), "nolong"); ma = mount_arg(ma, "rootpath", args.root_path, -1); ma = mount_argf(ma, "uid", "%d", args.uid); ma = mount_argf(ma, "gid", "%d", args.gid); ma = mount_argf(ma, "file_mode", "%d", args.file_mode); ma = mount_argf(ma, "dir_mode", "%d", args.dir_mode); ma = mount_argf(ma, "caseopt", "%d", args.caseopt); error = kernel_mount(ma, flags); return (error); } static const char *smbfs_opts[] = { "fd", "soft", "intr", "strong", "have_nls", "long", "mountpoint", "rootpath", "uid", "gid", "file_mode", "dir_mode", "caseopt", "errmsg", NULL }; static int smbfs_mount(struct mount *mp) { struct smbmount *smp = NULL; struct smb_vc *vcp; struct smb_share *ssp = NULL; struct vnode *vp; struct thread *td; struct smb_dev *dev; struct smb_cred *scred; int error, v; char *pc, *pe; dev = NULL; td = curthread; if (mp->mnt_flag & (MNT_UPDATE | MNT_ROOTFS)) return EOPNOTSUPP; if (vfs_filteropt(mp->mnt_optnew, smbfs_opts)) { vfs_mount_error(mp, "%s", "Invalid option"); return (EINVAL); } scred = smbfs_malloc_scred(); smb_makescred(scred, td, td->td_ucred); /* Ask userspace of `fd`, the file descriptor of this session */ if (1 != vfs_scanopt(mp->mnt_optnew, "fd", "%d", &v)) { vfs_mount_error(mp, "No fd option"); smbfs_free_scred(scred); return (EINVAL); } error = smb_dev2share(v, SMBM_EXEC, scred, &ssp, &dev); smp = malloc(sizeof(*smp), M_SMBFSDATA, M_WAITOK | M_ZERO); if (error) { printf("invalid device handle %d (%d)\n", v, error); vfs_mount_error(mp, "invalid device handle %d %d\n", v, error); smbfs_free_scred(scred); free(smp, M_SMBFSDATA); return error; } vcp = SSTOVC(ssp); smb_share_unlock(ssp); mp->mnt_stat.f_iosize = SSTOVC(ssp)->vc_txmax; mp->mnt_data = smp; smp->sm_share = ssp; smp->sm_root = NULL; smp->sm_dev = dev; if (1 != vfs_scanopt(mp->mnt_optnew, "caseopt", "%d", &smp->sm_caseopt)) { vfs_mount_error(mp, "Invalid caseopt"); error = EINVAL; goto bad; } if (1 != vfs_scanopt(mp->mnt_optnew, "uid", "%d", &v)) { vfs_mount_error(mp, "Invalid uid"); error = EINVAL; goto bad; } smp->sm_uid = v; if (1 != vfs_scanopt(mp->mnt_optnew, "gid", "%d", &v)) { vfs_mount_error(mp, "Invalid gid"); error = EINVAL; goto bad; } smp->sm_gid = v; if (1 != vfs_scanopt(mp->mnt_optnew, "file_mode", "%d", &v)) { vfs_mount_error(mp, "Invalid file_mode"); error = EINVAL; goto bad; } smp->sm_file_mode = (v & (S_IRWXU|S_IRWXG|S_IRWXO)) | S_IFREG; if (1 != vfs_scanopt(mp->mnt_optnew, "dir_mode", "%d", &v)) { vfs_mount_error(mp, "Invalid dir_mode"); error = EINVAL; goto bad; } smp->sm_dir_mode = (v & (S_IRWXU|S_IRWXG|S_IRWXO)) | S_IFDIR; vfs_flagopt(mp->mnt_optnew, "nolong", &smp->sm_flags, SMBFS_MOUNT_NO_LONG); pc = mp->mnt_stat.f_mntfromname; pe = pc + sizeof(mp->mnt_stat.f_mntfromname); bzero(pc, MNAMELEN); *pc++ = '/'; *pc++ = '/'; pc = strchr(strncpy(pc, vcp->vc_username, pe - pc - 2), 0); if (pc < pe-1) { *(pc++) = '@'; pc = strchr(strncpy(pc, vcp->vc_srvname, pe - pc - 2), 0); if (pc < pe - 1) { *(pc++) = '/'; strncpy(pc, ssp->ss_name, pe - pc - 2); } } vfs_getnewfsid(mp); error = smbfs_root(mp, LK_EXCLUSIVE, &vp); if (error) { vfs_mount_error(mp, "smbfs_root error: %d", error); goto bad; } VOP_UNLOCK(vp); SMBVDEBUG("root.v_usecount = %d\n", vrefcnt(vp)); #ifdef DIAGNOSTIC SMBERROR("mp=%p\n", mp); #endif smbfs_free_scred(scred); return error; bad: if (ssp) smb_share_put(ssp, scred); smbfs_free_scred(scred); SMB_LOCK(); if (error && smp->sm_dev == dev) { smp->sm_dev = NULL; sdp_trydestroy(dev); } SMB_UNLOCK(); free(smp, M_SMBFSDATA); return error; } /* Unmount the filesystem described by mp. */ static int smbfs_unmount(struct mount *mp, int mntflags) { struct thread *td; struct smbmount *smp = VFSTOSMBFS(mp); struct smb_cred *scred; struct smb_dev *dev; int error, flags; SMBVDEBUG("smbfs_unmount: flags=%04x\n", mntflags); td = curthread; flags = 0; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; /* * Keep trying to flush the vnode list for the mount while * some are still busy and we are making progress towards * making them not busy. This is needed because smbfs vnodes * reference their parent directory but may appear after their * parent in the list; one pass over the vnode list is not * sufficient in this case. */ do { smp->sm_didrele = 0; /* There is 1 extra root vnode reference from smbfs_mount(). */ error = vflush(mp, 1, flags, td); } while (error == EBUSY && smp->sm_didrele != 0); if (error) return error; scred = smbfs_malloc_scred(); smb_makescred(scred, td, td->td_ucred); error = smb_share_lock(smp->sm_share); if (error) goto out; smb_share_put(smp->sm_share, scred); SMB_LOCK(); dev = smp->sm_dev; if (!dev) panic("No private data for mount point"); sdp_trydestroy(dev); mp->mnt_data = NULL; SMB_UNLOCK(); free(smp, M_SMBFSDATA); MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); out: smbfs_free_scred(scred); return error; } /* * Return locked root vnode of a filesystem */ static int smbfs_root(struct mount *mp, int flags, struct vnode **vpp) { struct smbmount *smp = VFSTOSMBFS(mp); struct vnode *vp; struct smbnode *np; struct smbfattr fattr; struct thread *td; struct ucred *cred; struct smb_cred *scred; int error; td = curthread; cred = td->td_ucred; if (smp->sm_root) { *vpp = SMBTOV(smp->sm_root); - return vget(*vpp, LK_EXCLUSIVE | LK_RETRY, td); + return vget(*vpp, LK_EXCLUSIVE | LK_RETRY); } scred = smbfs_malloc_scred(); smb_makescred(scred, td, cred); error = smbfs_smb_lookup(NULL, NULL, 0, &fattr, scred); if (error) goto out; error = smbfs_nget(mp, NULL, NULL, 0, &fattr, &vp); if (error) goto out; ASSERT_VOP_LOCKED(vp, "smbfs_root"); vp->v_vflag |= VV_ROOT; np = VTOSMB(vp); smp->sm_root = np; *vpp = vp; out: smbfs_free_scred(scred); return error; } /* * Do operations associated with quotas, not supported */ /* ARGSUSED */ static int smbfs_quotactl(mp, cmd, uid, arg) struct mount *mp; int cmd; uid_t uid; void *arg; { SMBVDEBUG("return EOPNOTSUPP\n"); return EOPNOTSUPP; } /*ARGSUSED*/ int smbfs_init(struct vfsconf *vfsp) { smbfs_pbuf_zone = pbuf_zsecond_create("smbpbuf", nswbuf / 2); SMBVDEBUG("done.\n"); return 0; } /*ARGSUSED*/ int smbfs_uninit(struct vfsconf *vfsp) { uma_zdestroy(smbfs_pbuf_zone); SMBVDEBUG("done.\n"); return 0; } /* * smbfs_statfs call */ int smbfs_statfs(struct mount *mp, struct statfs *sbp) { struct thread *td = curthread; struct smbmount *smp = VFSTOSMBFS(mp); struct smbnode *np = smp->sm_root; struct smb_share *ssp = smp->sm_share; struct smb_cred *scred; int error; if (np == NULL) { vfs_mount_error(mp, "np == NULL"); return EINVAL; } sbp->f_iosize = SSTOVC(ssp)->vc_txmax; /* optimal transfer block size */ scred = smbfs_malloc_scred(); smb_makescred(scred, td, td->td_ucred); error = smbfs_smb_statfs(ssp, sbp, scred); smbfs_free_scred(scred); return (error); } Index: projects/clang1100-import/sys/fs/tmpfs/tmpfs_subr.c =================================================================== --- projects/clang1100-import/sys/fs/tmpfs/tmpfs_subr.c (revision 364278) +++ projects/clang1100-import/sys/fs/tmpfs/tmpfs_subr.c (revision 364279) @@ -1,1939 +1,1938 @@ /* $NetBSD: tmpfs_subr.c,v 1.35 2007/07/09 21:10:50 ad Exp $ */ /*- * SPDX-License-Identifier: BSD-2-Clause-NetBSD * * Copyright (c) 2005 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Julio M. Merino Vidal, developed as part of Google's Summer of Code * 2005 program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Efficient memory file system supporting functions. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_NODE(_vfs, OID_AUTO, tmpfs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "tmpfs file system"); static long tmpfs_pages_reserved = TMPFS_PAGES_MINRESERVED; static uma_zone_t tmpfs_dirent_pool; static uma_zone_t tmpfs_node_pool; VFS_SMR_DECLARE; static int tmpfs_node_ctor(void *mem, int size, void *arg, int flags) { struct tmpfs_node *node; node = mem; node->tn_gen++; node->tn_size = 0; node->tn_status = 0; node->tn_flags = 0; node->tn_links = 0; node->tn_vnode = NULL; node->tn_vpstate = 0; return (0); } static void tmpfs_node_dtor(void *mem, int size, void *arg) { struct tmpfs_node *node; node = mem; node->tn_type = VNON; } static int tmpfs_node_init(void *mem, int size, int flags) { struct tmpfs_node *node; node = mem; node->tn_id = 0; mtx_init(&node->tn_interlock, "tmpfsni", NULL, MTX_DEF); node->tn_gen = arc4random(); return (0); } static void tmpfs_node_fini(void *mem, int size) { struct tmpfs_node *node; node = mem; mtx_destroy(&node->tn_interlock); } void tmpfs_subr_init(void) { tmpfs_dirent_pool = uma_zcreate("TMPFS dirent", sizeof(struct tmpfs_dirent), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); tmpfs_node_pool = uma_zcreate("TMPFS node", sizeof(struct tmpfs_node), tmpfs_node_ctor, tmpfs_node_dtor, tmpfs_node_init, tmpfs_node_fini, UMA_ALIGN_PTR, 0); VFS_SMR_ZONE_SET(tmpfs_node_pool); } void tmpfs_subr_uninit(void) { uma_zdestroy(tmpfs_node_pool); uma_zdestroy(tmpfs_dirent_pool); } static int sysctl_mem_reserved(SYSCTL_HANDLER_ARGS) { int error; long pages, bytes; pages = *(long *)arg1; bytes = pages * PAGE_SIZE; error = sysctl_handle_long(oidp, &bytes, 0, req); if (error || !req->newptr) return (error); pages = bytes / PAGE_SIZE; if (pages < TMPFS_PAGES_MINRESERVED) return (EINVAL); *(long *)arg1 = pages; return (0); } SYSCTL_PROC(_vfs_tmpfs, OID_AUTO, memory_reserved, CTLTYPE_LONG|CTLFLAG_MPSAFE|CTLFLAG_RW, &tmpfs_pages_reserved, 0, sysctl_mem_reserved, "L", "Amount of available memory and swap below which tmpfs growth stops"); static __inline int tmpfs_dirtree_cmp(struct tmpfs_dirent *a, struct tmpfs_dirent *b); RB_PROTOTYPE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp); size_t tmpfs_mem_avail(void) { vm_ooffset_t avail; avail = swap_pager_avail + vm_free_count() - tmpfs_pages_reserved; if (__predict_false(avail < 0)) avail = 0; return (avail); } size_t tmpfs_pages_used(struct tmpfs_mount *tmp) { const size_t node_size = sizeof(struct tmpfs_node) + sizeof(struct tmpfs_dirent); size_t meta_pages; meta_pages = howmany((uintmax_t)tmp->tm_nodes_inuse * node_size, PAGE_SIZE); return (meta_pages + tmp->tm_pages_used); } static size_t tmpfs_pages_check_avail(struct tmpfs_mount *tmp, size_t req_pages) { if (tmpfs_mem_avail() < req_pages) return (0); if (tmp->tm_pages_max != ULONG_MAX && tmp->tm_pages_max < req_pages + tmpfs_pages_used(tmp)) return (0); return (1); } void tmpfs_ref_node(struct tmpfs_node *node) { TMPFS_NODE_LOCK(node); tmpfs_ref_node_locked(node); TMPFS_NODE_UNLOCK(node); } void tmpfs_ref_node_locked(struct tmpfs_node *node) { TMPFS_NODE_ASSERT_LOCKED(node); KASSERT(node->tn_refcount > 0, ("node %p zero refcount", node)); KASSERT(node->tn_refcount < UINT_MAX, ("node %p refcount %u", node, node->tn_refcount)); node->tn_refcount++; } /* * Allocates a new node of type 'type' inside the 'tmp' mount point, with * its owner set to 'uid', its group to 'gid' and its mode set to 'mode', * using the credentials of the process 'p'. * * If the node type is set to 'VDIR', then the parent parameter must point * to the parent directory of the node being created. It may only be NULL * while allocating the root node. * * If the node type is set to 'VBLK' or 'VCHR', then the rdev parameter * specifies the device the node represents. * * If the node type is set to 'VLNK', then the parameter target specifies * the file name of the target file for the symbolic link that is being * created. * * Note that new nodes are retrieved from the available list if it has * items or, if it is empty, from the node pool as long as there is enough * space to create them. * * Returns zero on success or an appropriate error code on failure. */ int tmpfs_alloc_node(struct mount *mp, struct tmpfs_mount *tmp, enum vtype type, uid_t uid, gid_t gid, mode_t mode, struct tmpfs_node *parent, const char *target, dev_t rdev, struct tmpfs_node **node) { struct tmpfs_node *nnode; vm_object_t obj; /* If the root directory of the 'tmp' file system is not yet * allocated, this must be the request to do it. */ MPASS(IMPLIES(tmp->tm_root == NULL, parent == NULL && type == VDIR)); MPASS(IFF(type == VLNK, target != NULL)); MPASS(IFF(type == VBLK || type == VCHR, rdev != VNOVAL)); if (tmp->tm_nodes_inuse >= tmp->tm_nodes_max) return (ENOSPC); if (tmpfs_pages_check_avail(tmp, 1) == 0) return (ENOSPC); if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) { /* * When a new tmpfs node is created for fully * constructed mount point, there must be a parent * node, which vnode is locked exclusively. As * consequence, if the unmount is executing in * parallel, vflush() cannot reclaim the parent vnode. * Due to this, the check for MNTK_UNMOUNT flag is not * racy: if we did not see MNTK_UNMOUNT flag, then tmp * cannot be destroyed until node construction is * finished and the parent vnode unlocked. * * Tmpfs does not need to instantiate new nodes during * unmount. */ return (EBUSY); } if ((mp->mnt_kern_flag & MNT_RDONLY) != 0) return (EROFS); nnode = uma_zalloc_smr(tmpfs_node_pool, M_WAITOK); /* Generic initialization. */ nnode->tn_type = type; vfs_timestamp(&nnode->tn_atime); nnode->tn_birthtime = nnode->tn_ctime = nnode->tn_mtime = nnode->tn_atime; nnode->tn_uid = uid; nnode->tn_gid = gid; nnode->tn_mode = mode; nnode->tn_id = alloc_unr64(&tmp->tm_ino_unr); nnode->tn_refcount = 1; /* Type-specific initialization. */ switch (nnode->tn_type) { case VBLK: case VCHR: nnode->tn_rdev = rdev; break; case VDIR: RB_INIT(&nnode->tn_dir.tn_dirhead); LIST_INIT(&nnode->tn_dir.tn_dupindex); MPASS(parent != nnode); MPASS(IMPLIES(parent == NULL, tmp->tm_root == NULL)); nnode->tn_dir.tn_parent = (parent == NULL) ? nnode : parent; nnode->tn_dir.tn_readdir_lastn = 0; nnode->tn_dir.tn_readdir_lastp = NULL; nnode->tn_links++; TMPFS_NODE_LOCK(nnode->tn_dir.tn_parent); nnode->tn_dir.tn_parent->tn_links++; TMPFS_NODE_UNLOCK(nnode->tn_dir.tn_parent); break; case VFIFO: /* FALLTHROUGH */ case VSOCK: break; case VLNK: MPASS(strlen(target) < MAXPATHLEN); nnode->tn_size = strlen(target); nnode->tn_link = malloc(nnode->tn_size, M_TMPFSNAME, M_WAITOK); memcpy(nnode->tn_link, target, nnode->tn_size); break; case VREG: obj = nnode->tn_reg.tn_aobj = vm_pager_allocate(OBJT_SWAP, NULL, 0, VM_PROT_DEFAULT, 0, NULL /* XXXKIB - tmpfs needs swap reservation */); VM_OBJECT_WLOCK(obj); /* OBJ_TMPFS is set together with the setting of vp->v_object */ vm_object_set_flag(obj, OBJ_TMPFS_NODE); VM_OBJECT_WUNLOCK(obj); break; default: panic("tmpfs_alloc_node: type %p %d", nnode, (int)nnode->tn_type); } TMPFS_LOCK(tmp); LIST_INSERT_HEAD(&tmp->tm_nodes_used, nnode, tn_entries); nnode->tn_attached = true; tmp->tm_nodes_inuse++; tmp->tm_refcount++; TMPFS_UNLOCK(tmp); *node = nnode; return (0); } /* * Destroys the node pointed to by node from the file system 'tmp'. * If the node references a directory, no entries are allowed. */ void tmpfs_free_node(struct tmpfs_mount *tmp, struct tmpfs_node *node) { TMPFS_LOCK(tmp); TMPFS_NODE_LOCK(node); if (!tmpfs_free_node_locked(tmp, node, false)) { TMPFS_NODE_UNLOCK(node); TMPFS_UNLOCK(tmp); } } bool tmpfs_free_node_locked(struct tmpfs_mount *tmp, struct tmpfs_node *node, bool detach) { vm_object_t uobj; TMPFS_MP_ASSERT_LOCKED(tmp); TMPFS_NODE_ASSERT_LOCKED(node); KASSERT(node->tn_refcount > 0, ("node %p refcount zero", node)); node->tn_refcount--; if (node->tn_attached && (detach || node->tn_refcount == 0)) { MPASS(tmp->tm_nodes_inuse > 0); tmp->tm_nodes_inuse--; LIST_REMOVE(node, tn_entries); node->tn_attached = false; } if (node->tn_refcount > 0) return (false); #ifdef INVARIANTS MPASS(node->tn_vnode == NULL); MPASS((node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0); #endif TMPFS_NODE_UNLOCK(node); TMPFS_UNLOCK(tmp); switch (node->tn_type) { case VBLK: /* FALLTHROUGH */ case VCHR: /* FALLTHROUGH */ case VDIR: /* FALLTHROUGH */ case VFIFO: /* FALLTHROUGH */ case VSOCK: break; case VLNK: free(node->tn_link, M_TMPFSNAME); break; case VREG: uobj = node->tn_reg.tn_aobj; if (uobj != NULL) { if (uobj->size != 0) atomic_subtract_long(&tmp->tm_pages_used, uobj->size); KASSERT((uobj->flags & OBJ_TMPFS) == 0, ("leaked OBJ_TMPFS node %p vm_obj %p", node, uobj)); vm_object_deallocate(uobj); } break; default: panic("tmpfs_free_node: type %p %d", node, (int)node->tn_type); } uma_zfree_smr(tmpfs_node_pool, node); TMPFS_LOCK(tmp); tmpfs_free_tmp(tmp); return (true); } static __inline uint32_t tmpfs_dirent_hash(const char *name, u_int len) { uint32_t hash; hash = fnv_32_buf(name, len, FNV1_32_INIT + len) & TMPFS_DIRCOOKIE_MASK; #ifdef TMPFS_DEBUG_DIRCOOKIE_DUP hash &= 0xf; #endif if (hash < TMPFS_DIRCOOKIE_MIN) hash += TMPFS_DIRCOOKIE_MIN; return (hash); } static __inline off_t tmpfs_dirent_cookie(struct tmpfs_dirent *de) { if (de == NULL) return (TMPFS_DIRCOOKIE_EOF); MPASS(de->td_cookie >= TMPFS_DIRCOOKIE_MIN); return (de->td_cookie); } static __inline boolean_t tmpfs_dirent_dup(struct tmpfs_dirent *de) { return ((de->td_cookie & TMPFS_DIRCOOKIE_DUP) != 0); } static __inline boolean_t tmpfs_dirent_duphead(struct tmpfs_dirent *de) { return ((de->td_cookie & TMPFS_DIRCOOKIE_DUPHEAD) != 0); } void tmpfs_dirent_init(struct tmpfs_dirent *de, const char *name, u_int namelen) { de->td_hash = de->td_cookie = tmpfs_dirent_hash(name, namelen); memcpy(de->ud.td_name, name, namelen); de->td_namelen = namelen; } /* * Allocates a new directory entry for the node node with a name of name. * The new directory entry is returned in *de. * * The link count of node is increased by one to reflect the new object * referencing it. * * Returns zero on success or an appropriate error code on failure. */ int tmpfs_alloc_dirent(struct tmpfs_mount *tmp, struct tmpfs_node *node, const char *name, u_int len, struct tmpfs_dirent **de) { struct tmpfs_dirent *nde; nde = uma_zalloc(tmpfs_dirent_pool, M_WAITOK); nde->td_node = node; if (name != NULL) { nde->ud.td_name = malloc(len, M_TMPFSNAME, M_WAITOK); tmpfs_dirent_init(nde, name, len); } else nde->td_namelen = 0; if (node != NULL) node->tn_links++; *de = nde; return 0; } /* * Frees a directory entry. It is the caller's responsibility to destroy * the node referenced by it if needed. * * The link count of node is decreased by one to reflect the removal of an * object that referenced it. This only happens if 'node_exists' is true; * otherwise the function will not access the node referred to by the * directory entry, as it may already have been released from the outside. */ void tmpfs_free_dirent(struct tmpfs_mount *tmp, struct tmpfs_dirent *de) { struct tmpfs_node *node; node = de->td_node; if (node != NULL) { MPASS(node->tn_links > 0); node->tn_links--; } if (!tmpfs_dirent_duphead(de) && de->ud.td_name != NULL) free(de->ud.td_name, M_TMPFSNAME); uma_zfree(tmpfs_dirent_pool, de); } void tmpfs_destroy_vobject(struct vnode *vp, vm_object_t obj) { ASSERT_VOP_ELOCKED(vp, "tmpfs_destroy_vobject"); if (vp->v_type != VREG || obj == NULL) return; VM_OBJECT_WLOCK(obj); VI_LOCK(vp); vm_object_clear_flag(obj, OBJ_TMPFS); obj->un_pager.swp.swp_tmpfs = NULL; if (vp->v_writecount < 0) vp->v_writecount = 0; VI_UNLOCK(vp); VM_OBJECT_WUNLOCK(obj); } /* * Need to clear v_object for insmntque failure. */ static void tmpfs_insmntque_dtr(struct vnode *vp, void *dtr_arg) { tmpfs_destroy_vobject(vp, vp->v_object); vp->v_object = NULL; vp->v_data = NULL; vp->v_op = &dead_vnodeops; vgone(vp); vput(vp); } /* * Allocates a new vnode for the node node or returns a new reference to * an existing one if the node had already a vnode referencing it. The * resulting locked vnode is returned in *vpp. * * Returns zero on success or an appropriate error code on failure. */ int tmpfs_alloc_vp(struct mount *mp, struct tmpfs_node *node, int lkflag, struct vnode **vpp) { struct vnode *vp; + enum vgetstate vs; struct tmpfs_mount *tm; vm_object_t object; int error; error = 0; tm = VFS_TO_TMPFS(mp); TMPFS_NODE_LOCK(node); tmpfs_ref_node_locked(node); loop: TMPFS_NODE_ASSERT_LOCKED(node); if ((vp = node->tn_vnode) != NULL) { MPASS((node->tn_vpstate & TMPFS_VNODE_DOOMED) == 0); - VI_LOCK(vp); if ((node->tn_type == VDIR && node->tn_dir.tn_parent == NULL) || (VN_IS_DOOMED(vp) && (lkflag & LK_NOWAIT) != 0)) { - VI_UNLOCK(vp); TMPFS_NODE_UNLOCK(node); error = ENOENT; vp = NULL; goto out; } if (VN_IS_DOOMED(vp)) { - VI_UNLOCK(vp); node->tn_vpstate |= TMPFS_VNODE_WRECLAIM; while ((node->tn_vpstate & TMPFS_VNODE_WRECLAIM) != 0) { msleep(&node->tn_vnode, TMPFS_NODE_MTX(node), 0, "tmpfsE", 0); } goto loop; } + vs = vget_prep(vp); TMPFS_NODE_UNLOCK(node); - error = vget(vp, lkflag | LK_INTERLOCK, curthread); + error = vget_finish(vp, lkflag, vs); if (error == ENOENT) { TMPFS_NODE_LOCK(node); goto loop; } if (error != 0) { vp = NULL; goto out; } /* * Make sure the vnode is still there after * getting the interlock to avoid racing a free. */ if (node->tn_vnode == NULL || node->tn_vnode != vp) { vput(vp); TMPFS_NODE_LOCK(node); goto loop; } goto out; } if ((node->tn_vpstate & TMPFS_VNODE_DOOMED) || (node->tn_type == VDIR && node->tn_dir.tn_parent == NULL)) { TMPFS_NODE_UNLOCK(node); error = ENOENT; vp = NULL; goto out; } /* * otherwise lock the vp list while we call getnewvnode * since that can block. */ if (node->tn_vpstate & TMPFS_VNODE_ALLOCATING) { node->tn_vpstate |= TMPFS_VNODE_WANT; error = msleep((caddr_t) &node->tn_vpstate, TMPFS_NODE_MTX(node), 0, "tmpfs_alloc_vp", 0); if (error != 0) goto out; goto loop; } else node->tn_vpstate |= TMPFS_VNODE_ALLOCATING; TMPFS_NODE_UNLOCK(node); /* Get a new vnode and associate it with our node. */ error = getnewvnode("tmpfs", mp, VFS_TO_TMPFS(mp)->tm_nonc ? &tmpfs_vnodeop_nonc_entries : &tmpfs_vnodeop_entries, &vp); if (error != 0) goto unlock; MPASS(vp != NULL); /* lkflag is ignored, the lock is exclusive */ (void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_data = node; vp->v_type = node->tn_type; /* Type-specific initialization. */ switch (node->tn_type) { case VBLK: /* FALLTHROUGH */ case VCHR: /* FALLTHROUGH */ case VLNK: /* FALLTHROUGH */ case VSOCK: break; case VFIFO: vp->v_op = &tmpfs_fifoop_entries; break; case VREG: object = node->tn_reg.tn_aobj; VM_OBJECT_WLOCK(object); VI_LOCK(vp); KASSERT(vp->v_object == NULL, ("Not NULL v_object in tmpfs")); vp->v_object = object; object->un_pager.swp.swp_tmpfs = vp; vm_object_set_flag(object, OBJ_TMPFS); VI_UNLOCK(vp); VM_OBJECT_WUNLOCK(object); break; case VDIR: MPASS(node->tn_dir.tn_parent != NULL); if (node->tn_dir.tn_parent == node) vp->v_vflag |= VV_ROOT; break; default: panic("tmpfs_alloc_vp: type %p %d", node, (int)node->tn_type); } if (vp->v_type != VFIFO) VN_LOCK_ASHARE(vp); error = insmntque1(vp, mp, tmpfs_insmntque_dtr, NULL); if (error != 0) vp = NULL; unlock: TMPFS_NODE_LOCK(node); MPASS(node->tn_vpstate & TMPFS_VNODE_ALLOCATING); node->tn_vpstate &= ~TMPFS_VNODE_ALLOCATING; node->tn_vnode = vp; if (node->tn_vpstate & TMPFS_VNODE_WANT) { node->tn_vpstate &= ~TMPFS_VNODE_WANT; TMPFS_NODE_UNLOCK(node); wakeup((caddr_t) &node->tn_vpstate); } else TMPFS_NODE_UNLOCK(node); out: if (error == 0) { *vpp = vp; #ifdef INVARIANTS MPASS(*vpp != NULL && VOP_ISLOCKED(*vpp)); TMPFS_NODE_LOCK(node); MPASS(*vpp == node->tn_vnode); TMPFS_NODE_UNLOCK(node); #endif } tmpfs_free_node(tm, node); return (error); } /* * Destroys the association between the vnode vp and the node it * references. */ void tmpfs_free_vp(struct vnode *vp) { struct tmpfs_node *node; node = VP_TO_TMPFS_NODE(vp); TMPFS_NODE_ASSERT_LOCKED(node); node->tn_vnode = NULL; if ((node->tn_vpstate & TMPFS_VNODE_WRECLAIM) != 0) wakeup(&node->tn_vnode); node->tn_vpstate &= ~TMPFS_VNODE_WRECLAIM; vp->v_data = NULL; } /* * Allocates a new file of type 'type' and adds it to the parent directory * 'dvp'; this addition is done using the component name given in 'cnp'. * The ownership of the new file is automatically assigned based on the * credentials of the caller (through 'cnp'), the group is set based on * the parent directory and the mode is determined from the 'vap' argument. * If successful, *vpp holds a vnode to the newly created file and zero * is returned. Otherwise *vpp is NULL and the function returns an * appropriate error code. */ int tmpfs_alloc_file(struct vnode *dvp, struct vnode **vpp, struct vattr *vap, struct componentname *cnp, const char *target) { int error; struct tmpfs_dirent *de; struct tmpfs_mount *tmp; struct tmpfs_node *dnode; struct tmpfs_node *node; struct tmpfs_node *parent; ASSERT_VOP_ELOCKED(dvp, "tmpfs_alloc_file"); MPASS(cnp->cn_flags & HASBUF); tmp = VFS_TO_TMPFS(dvp->v_mount); dnode = VP_TO_TMPFS_DIR(dvp); *vpp = NULL; /* If the entry we are creating is a directory, we cannot overflow * the number of links of its parent, because it will get a new * link. */ if (vap->va_type == VDIR) { /* Ensure that we do not overflow the maximum number of links * imposed by the system. */ MPASS(dnode->tn_links <= TMPFS_LINK_MAX); if (dnode->tn_links == TMPFS_LINK_MAX) { return (EMLINK); } parent = dnode; MPASS(parent != NULL); } else parent = NULL; /* Allocate a node that represents the new file. */ error = tmpfs_alloc_node(dvp->v_mount, tmp, vap->va_type, cnp->cn_cred->cr_uid, dnode->tn_gid, vap->va_mode, parent, target, vap->va_rdev, &node); if (error != 0) return (error); /* Allocate a directory entry that points to the new file. */ error = tmpfs_alloc_dirent(tmp, node, cnp->cn_nameptr, cnp->cn_namelen, &de); if (error != 0) { tmpfs_free_node(tmp, node); return (error); } /* Allocate a vnode for the new file. */ error = tmpfs_alloc_vp(dvp->v_mount, node, LK_EXCLUSIVE, vpp); if (error != 0) { tmpfs_free_dirent(tmp, de); tmpfs_free_node(tmp, node); return (error); } /* Now that all required items are allocated, we can proceed to * insert the new node into the directory, an operation that * cannot fail. */ if (cnp->cn_flags & ISWHITEOUT) tmpfs_dir_whiteout_remove(dvp, cnp); tmpfs_dir_attach(dvp, de); return (0); } struct tmpfs_dirent * tmpfs_dir_first(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc) { struct tmpfs_dirent *de; de = RB_MIN(tmpfs_dir, &dnode->tn_dir.tn_dirhead); dc->tdc_tree = de; if (de != NULL && tmpfs_dirent_duphead(de)) de = LIST_FIRST(&de->ud.td_duphead); dc->tdc_current = de; return (dc->tdc_current); } struct tmpfs_dirent * tmpfs_dir_next(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc) { struct tmpfs_dirent *de; MPASS(dc->tdc_tree != NULL); if (tmpfs_dirent_dup(dc->tdc_current)) { dc->tdc_current = LIST_NEXT(dc->tdc_current, uh.td_dup.entries); if (dc->tdc_current != NULL) return (dc->tdc_current); } dc->tdc_tree = dc->tdc_current = RB_NEXT(tmpfs_dir, &dnode->tn_dir.tn_dirhead, dc->tdc_tree); if ((de = dc->tdc_current) != NULL && tmpfs_dirent_duphead(de)) { dc->tdc_current = LIST_FIRST(&de->ud.td_duphead); MPASS(dc->tdc_current != NULL); } return (dc->tdc_current); } /* Lookup directory entry in RB-Tree. Function may return duphead entry. */ static struct tmpfs_dirent * tmpfs_dir_xlookup_hash(struct tmpfs_node *dnode, uint32_t hash) { struct tmpfs_dirent *de, dekey; dekey.td_hash = hash; de = RB_FIND(tmpfs_dir, &dnode->tn_dir.tn_dirhead, &dekey); return (de); } /* Lookup directory entry by cookie, initialize directory cursor accordingly. */ static struct tmpfs_dirent * tmpfs_dir_lookup_cookie(struct tmpfs_node *node, off_t cookie, struct tmpfs_dir_cursor *dc) { struct tmpfs_dir *dirhead = &node->tn_dir.tn_dirhead; struct tmpfs_dirent *de, dekey; MPASS(cookie >= TMPFS_DIRCOOKIE_MIN); if (cookie == node->tn_dir.tn_readdir_lastn && (de = node->tn_dir.tn_readdir_lastp) != NULL) { /* Protect against possible race, tn_readdir_last[pn] * may be updated with only shared vnode lock held. */ if (cookie == tmpfs_dirent_cookie(de)) goto out; } if ((cookie & TMPFS_DIRCOOKIE_DUP) != 0) { LIST_FOREACH(de, &node->tn_dir.tn_dupindex, uh.td_dup.index_entries) { MPASS(tmpfs_dirent_dup(de)); if (de->td_cookie == cookie) goto out; /* dupindex list is sorted. */ if (de->td_cookie < cookie) { de = NULL; goto out; } } MPASS(de == NULL); goto out; } if ((cookie & TMPFS_DIRCOOKIE_MASK) != cookie) { de = NULL; } else { dekey.td_hash = cookie; /* Recover if direntry for cookie was removed */ de = RB_NFIND(tmpfs_dir, dirhead, &dekey); } dc->tdc_tree = de; dc->tdc_current = de; if (de != NULL && tmpfs_dirent_duphead(de)) { dc->tdc_current = LIST_FIRST(&de->ud.td_duphead); MPASS(dc->tdc_current != NULL); } return (dc->tdc_current); out: dc->tdc_tree = de; dc->tdc_current = de; if (de != NULL && tmpfs_dirent_dup(de)) dc->tdc_tree = tmpfs_dir_xlookup_hash(node, de->td_hash); return (dc->tdc_current); } /* * Looks for a directory entry in the directory represented by node. * 'cnp' describes the name of the entry to look for. Note that the . * and .. components are not allowed as they do not physically exist * within directories. * * Returns a pointer to the entry when found, otherwise NULL. */ struct tmpfs_dirent * tmpfs_dir_lookup(struct tmpfs_node *node, struct tmpfs_node *f, struct componentname *cnp) { struct tmpfs_dir_duphead *duphead; struct tmpfs_dirent *de; uint32_t hash; MPASS(IMPLIES(cnp->cn_namelen == 1, cnp->cn_nameptr[0] != '.')); MPASS(IMPLIES(cnp->cn_namelen == 2, !(cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.'))); TMPFS_VALIDATE_DIR(node); hash = tmpfs_dirent_hash(cnp->cn_nameptr, cnp->cn_namelen); de = tmpfs_dir_xlookup_hash(node, hash); if (de != NULL && tmpfs_dirent_duphead(de)) { duphead = &de->ud.td_duphead; LIST_FOREACH(de, duphead, uh.td_dup.entries) { if (TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr, cnp->cn_namelen)) break; } } else if (de != NULL) { if (!TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr, cnp->cn_namelen)) de = NULL; } if (de != NULL && f != NULL && de->td_node != f) de = NULL; return (de); } /* * Attach duplicate-cookie directory entry nde to dnode and insert to dupindex * list, allocate new cookie value. */ static void tmpfs_dir_attach_dup(struct tmpfs_node *dnode, struct tmpfs_dir_duphead *duphead, struct tmpfs_dirent *nde) { struct tmpfs_dir_duphead *dupindex; struct tmpfs_dirent *de, *pde; dupindex = &dnode->tn_dir.tn_dupindex; de = LIST_FIRST(dupindex); if (de == NULL || de->td_cookie < TMPFS_DIRCOOKIE_DUP_MAX) { if (de == NULL) nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN; else nde->td_cookie = de->td_cookie + 1; MPASS(tmpfs_dirent_dup(nde)); LIST_INSERT_HEAD(dupindex, nde, uh.td_dup.index_entries); LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); return; } /* * Cookie numbers are near exhaustion. Scan dupindex list for unused * numbers. dupindex list is sorted in descending order. Keep it so * after inserting nde. */ while (1) { pde = de; de = LIST_NEXT(de, uh.td_dup.index_entries); if (de == NULL && pde->td_cookie != TMPFS_DIRCOOKIE_DUP_MIN) { /* * Last element of the index doesn't have minimal cookie * value, use it. */ nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN; LIST_INSERT_AFTER(pde, nde, uh.td_dup.index_entries); LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); return; } else if (de == NULL) { /* * We are so lucky have 2^30 hash duplicates in single * directory :) Return largest possible cookie value. * It should be fine except possible issues with * VOP_READDIR restart. */ nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MAX; LIST_INSERT_HEAD(dupindex, nde, uh.td_dup.index_entries); LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); return; } if (de->td_cookie + 1 == pde->td_cookie || de->td_cookie >= TMPFS_DIRCOOKIE_DUP_MAX) continue; /* No hole or invalid cookie. */ nde->td_cookie = de->td_cookie + 1; MPASS(tmpfs_dirent_dup(nde)); MPASS(pde->td_cookie > nde->td_cookie); MPASS(nde->td_cookie > de->td_cookie); LIST_INSERT_BEFORE(de, nde, uh.td_dup.index_entries); LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries); return; } } /* * Attaches the directory entry de to the directory represented by vp. * Note that this does not change the link count of the node pointed by * the directory entry, as this is done by tmpfs_alloc_dirent. */ void tmpfs_dir_attach(struct vnode *vp, struct tmpfs_dirent *de) { struct tmpfs_node *dnode; struct tmpfs_dirent *xde, *nde; ASSERT_VOP_ELOCKED(vp, __func__); MPASS(de->td_namelen > 0); MPASS(de->td_hash >= TMPFS_DIRCOOKIE_MIN); MPASS(de->td_cookie == de->td_hash); dnode = VP_TO_TMPFS_DIR(vp); dnode->tn_dir.tn_readdir_lastn = 0; dnode->tn_dir.tn_readdir_lastp = NULL; MPASS(!tmpfs_dirent_dup(de)); xde = RB_INSERT(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de); if (xde != NULL && tmpfs_dirent_duphead(xde)) tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de); else if (xde != NULL) { /* * Allocate new duphead. Swap xde with duphead to avoid * adding/removing elements with the same hash. */ MPASS(!tmpfs_dirent_dup(xde)); tmpfs_alloc_dirent(VFS_TO_TMPFS(vp->v_mount), NULL, NULL, 0, &nde); /* *nde = *xde; XXX gcc 4.2.1 may generate invalid code. */ memcpy(nde, xde, sizeof(*xde)); xde->td_cookie |= TMPFS_DIRCOOKIE_DUPHEAD; LIST_INIT(&xde->ud.td_duphead); xde->td_namelen = 0; xde->td_node = NULL; tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, nde); tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de); } dnode->tn_size += sizeof(struct tmpfs_dirent); dnode->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED | \ TMPFS_NODE_MODIFIED; tmpfs_update(vp); } /* * Detaches the directory entry de from the directory represented by vp. * Note that this does not change the link count of the node pointed by * the directory entry, as this is done by tmpfs_free_dirent. */ void tmpfs_dir_detach(struct vnode *vp, struct tmpfs_dirent *de) { struct tmpfs_mount *tmp; struct tmpfs_dir *head; struct tmpfs_node *dnode; struct tmpfs_dirent *xde; ASSERT_VOP_ELOCKED(vp, __func__); dnode = VP_TO_TMPFS_DIR(vp); head = &dnode->tn_dir.tn_dirhead; dnode->tn_dir.tn_readdir_lastn = 0; dnode->tn_dir.tn_readdir_lastp = NULL; if (tmpfs_dirent_dup(de)) { /* Remove duphead if de was last entry. */ if (LIST_NEXT(de, uh.td_dup.entries) == NULL) { xde = tmpfs_dir_xlookup_hash(dnode, de->td_hash); MPASS(tmpfs_dirent_duphead(xde)); } else xde = NULL; LIST_REMOVE(de, uh.td_dup.entries); LIST_REMOVE(de, uh.td_dup.index_entries); if (xde != NULL) { if (LIST_EMPTY(&xde->ud.td_duphead)) { RB_REMOVE(tmpfs_dir, head, xde); tmp = VFS_TO_TMPFS(vp->v_mount); MPASS(xde->td_node == NULL); tmpfs_free_dirent(tmp, xde); } } de->td_cookie = de->td_hash; } else RB_REMOVE(tmpfs_dir, head, de); dnode->tn_size -= sizeof(struct tmpfs_dirent); dnode->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED | \ TMPFS_NODE_MODIFIED; tmpfs_update(vp); } void tmpfs_dir_destroy(struct tmpfs_mount *tmp, struct tmpfs_node *dnode) { struct tmpfs_dirent *de, *dde, *nde; RB_FOREACH_SAFE(de, tmpfs_dir, &dnode->tn_dir.tn_dirhead, nde) { RB_REMOVE(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de); /* Node may already be destroyed. */ de->td_node = NULL; if (tmpfs_dirent_duphead(de)) { while ((dde = LIST_FIRST(&de->ud.td_duphead)) != NULL) { LIST_REMOVE(dde, uh.td_dup.entries); dde->td_node = NULL; tmpfs_free_dirent(tmp, dde); } } tmpfs_free_dirent(tmp, de); } } /* * Helper function for tmpfs_readdir. Creates a '.' entry for the given * directory and returns it in the uio space. The function returns 0 * on success, -1 if there was not enough space in the uio structure to * hold the directory entry or an appropriate error code if another * error happens. */ static int tmpfs_dir_getdotdent(struct tmpfs_mount *tm, struct tmpfs_node *node, struct uio *uio) { int error; struct dirent dent; TMPFS_VALIDATE_DIR(node); MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOT); dent.d_fileno = node->tn_id; dent.d_type = DT_DIR; dent.d_namlen = 1; dent.d_name[0] = '.'; dent.d_reclen = GENERIC_DIRSIZ(&dent); dirent_terminate(&dent); if (dent.d_reclen > uio->uio_resid) error = EJUSTRETURN; else error = uiomove(&dent, dent.d_reclen, uio); tmpfs_set_status(tm, node, TMPFS_NODE_ACCESSED); return (error); } /* * Helper function for tmpfs_readdir. Creates a '..' entry for the given * directory and returns it in the uio space. The function returns 0 * on success, -1 if there was not enough space in the uio structure to * hold the directory entry or an appropriate error code if another * error happens. */ static int tmpfs_dir_getdotdotdent(struct tmpfs_mount *tm, struct tmpfs_node *node, struct uio *uio) { struct tmpfs_node *parent; struct dirent dent; int error; TMPFS_VALIDATE_DIR(node); MPASS(uio->uio_offset == TMPFS_DIRCOOKIE_DOTDOT); /* * Return ENOENT if the current node is already removed. */ TMPFS_ASSERT_LOCKED(node); parent = node->tn_dir.tn_parent; if (parent == NULL) return (ENOENT); TMPFS_NODE_LOCK(parent); dent.d_fileno = parent->tn_id; TMPFS_NODE_UNLOCK(parent); dent.d_type = DT_DIR; dent.d_namlen = 2; dent.d_name[0] = '.'; dent.d_name[1] = '.'; dent.d_reclen = GENERIC_DIRSIZ(&dent); dirent_terminate(&dent); if (dent.d_reclen > uio->uio_resid) error = EJUSTRETURN; else error = uiomove(&dent, dent.d_reclen, uio); tmpfs_set_status(tm, node, TMPFS_NODE_ACCESSED); return (error); } /* * Helper function for tmpfs_readdir. Returns as much directory entries * as can fit in the uio space. The read starts at uio->uio_offset. * The function returns 0 on success, -1 if there was not enough space * in the uio structure to hold the directory entry or an appropriate * error code if another error happens. */ int tmpfs_dir_getdents(struct tmpfs_mount *tm, struct tmpfs_node *node, struct uio *uio, int maxcookies, u_long *cookies, int *ncookies) { struct tmpfs_dir_cursor dc; struct tmpfs_dirent *de; off_t off; int error; TMPFS_VALIDATE_DIR(node); off = 0; /* * Lookup the node from the current offset. The starting offset of * 0 will lookup both '.' and '..', and then the first real entry, * or EOF if there are none. Then find all entries for the dir that * fit into the buffer. Once no more entries are found (de == NULL), * the offset is set to TMPFS_DIRCOOKIE_EOF, which will cause the next * call to return 0. */ switch (uio->uio_offset) { case TMPFS_DIRCOOKIE_DOT: error = tmpfs_dir_getdotdent(tm, node, uio); if (error != 0) return (error); uio->uio_offset = TMPFS_DIRCOOKIE_DOTDOT; if (cookies != NULL) cookies[(*ncookies)++] = off = uio->uio_offset; /* FALLTHROUGH */ case TMPFS_DIRCOOKIE_DOTDOT: error = tmpfs_dir_getdotdotdent(tm, node, uio); if (error != 0) return (error); de = tmpfs_dir_first(node, &dc); uio->uio_offset = tmpfs_dirent_cookie(de); if (cookies != NULL) cookies[(*ncookies)++] = off = uio->uio_offset; /* EOF. */ if (de == NULL) return (0); break; case TMPFS_DIRCOOKIE_EOF: return (0); default: de = tmpfs_dir_lookup_cookie(node, uio->uio_offset, &dc); if (de == NULL) return (EINVAL); if (cookies != NULL) off = tmpfs_dirent_cookie(de); } /* Read as much entries as possible; i.e., until we reach the end of * the directory or we exhaust uio space. */ do { struct dirent d; /* Create a dirent structure representing the current * tmpfs_node and fill it. */ if (de->td_node == NULL) { d.d_fileno = 1; d.d_type = DT_WHT; } else { d.d_fileno = de->td_node->tn_id; switch (de->td_node->tn_type) { case VBLK: d.d_type = DT_BLK; break; case VCHR: d.d_type = DT_CHR; break; case VDIR: d.d_type = DT_DIR; break; case VFIFO: d.d_type = DT_FIFO; break; case VLNK: d.d_type = DT_LNK; break; case VREG: d.d_type = DT_REG; break; case VSOCK: d.d_type = DT_SOCK; break; default: panic("tmpfs_dir_getdents: type %p %d", de->td_node, (int)de->td_node->tn_type); } } d.d_namlen = de->td_namelen; MPASS(de->td_namelen < sizeof(d.d_name)); (void)memcpy(d.d_name, de->ud.td_name, de->td_namelen); d.d_reclen = GENERIC_DIRSIZ(&d); dirent_terminate(&d); /* Stop reading if the directory entry we are treating is * bigger than the amount of data that can be returned. */ if (d.d_reclen > uio->uio_resid) { error = EJUSTRETURN; break; } /* Copy the new dirent structure into the output buffer and * advance pointers. */ error = uiomove(&d, d.d_reclen, uio); if (error == 0) { de = tmpfs_dir_next(node, &dc); if (cookies != NULL) { off = tmpfs_dirent_cookie(de); MPASS(*ncookies < maxcookies); cookies[(*ncookies)++] = off; } } } while (error == 0 && uio->uio_resid > 0 && de != NULL); /* Skip setting off when using cookies as it is already done above. */ if (cookies == NULL) off = tmpfs_dirent_cookie(de); /* Update the offset and cache. */ uio->uio_offset = off; node->tn_dir.tn_readdir_lastn = off; node->tn_dir.tn_readdir_lastp = de; tmpfs_set_status(tm, node, TMPFS_NODE_ACCESSED); return error; } int tmpfs_dir_whiteout_add(struct vnode *dvp, struct componentname *cnp) { struct tmpfs_dirent *de; int error; error = tmpfs_alloc_dirent(VFS_TO_TMPFS(dvp->v_mount), NULL, cnp->cn_nameptr, cnp->cn_namelen, &de); if (error != 0) return (error); tmpfs_dir_attach(dvp, de); return (0); } void tmpfs_dir_whiteout_remove(struct vnode *dvp, struct componentname *cnp) { struct tmpfs_dirent *de; de = tmpfs_dir_lookup(VP_TO_TMPFS_DIR(dvp), NULL, cnp); MPASS(de != NULL && de->td_node == NULL); tmpfs_dir_detach(dvp, de); tmpfs_free_dirent(VFS_TO_TMPFS(dvp->v_mount), de); } /* * Resizes the aobj associated with the regular file pointed to by 'vp' to the * size 'newsize'. 'vp' must point to a vnode that represents a regular file. * 'newsize' must be positive. * * Returns zero on success or an appropriate error code on failure. */ int tmpfs_reg_resize(struct vnode *vp, off_t newsize, boolean_t ignerr) { struct tmpfs_mount *tmp; struct tmpfs_node *node; vm_object_t uobj; vm_page_t m; vm_pindex_t idx, newpages, oldpages; off_t oldsize; int base, rv; MPASS(vp->v_type == VREG); MPASS(newsize >= 0); node = VP_TO_TMPFS_NODE(vp); uobj = node->tn_reg.tn_aobj; tmp = VFS_TO_TMPFS(vp->v_mount); /* * Convert the old and new sizes to the number of pages needed to * store them. It may happen that we do not need to do anything * because the last allocated page can accommodate the change on * its own. */ oldsize = node->tn_size; oldpages = OFF_TO_IDX(oldsize + PAGE_MASK); MPASS(oldpages == uobj->size); newpages = OFF_TO_IDX(newsize + PAGE_MASK); if (__predict_true(newpages == oldpages && newsize >= oldsize)) { node->tn_size = newsize; return (0); } if (newpages > oldpages && tmpfs_pages_check_avail(tmp, newpages - oldpages) == 0) return (ENOSPC); VM_OBJECT_WLOCK(uobj); if (newsize < oldsize) { /* * Zero the truncated part of the last page. */ base = newsize & PAGE_MASK; if (base != 0) { idx = OFF_TO_IDX(newsize); retry: m = vm_page_grab(uobj, idx, VM_ALLOC_NOCREAT); if (m != NULL) { MPASS(vm_page_all_valid(m)); } else if (vm_pager_has_page(uobj, idx, NULL, NULL)) { m = vm_page_alloc(uobj, idx, VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL); if (m == NULL) goto retry; vm_object_pip_add(uobj, 1); VM_OBJECT_WUNLOCK(uobj); rv = vm_pager_get_pages(uobj, &m, 1, NULL, NULL); VM_OBJECT_WLOCK(uobj); vm_object_pip_wakeup(uobj); if (rv == VM_PAGER_OK) { /* * Since the page was not resident, * and therefore not recently * accessed, immediately enqueue it * for asynchronous laundering. The * current operation is not regarded * as an access. */ vm_page_launder(m); } else { vm_page_free(m); if (ignerr) m = NULL; else { VM_OBJECT_WUNLOCK(uobj); return (EIO); } } } if (m != NULL) { pmap_zero_page_area(m, base, PAGE_SIZE - base); vm_page_set_dirty(m); vm_page_xunbusy(m); } } /* * Release any swap space and free any whole pages. */ if (newpages < oldpages) vm_object_page_remove(uobj, newpages, 0, 0); } uobj->size = newpages; VM_OBJECT_WUNLOCK(uobj); atomic_add_long(&tmp->tm_pages_used, newpages - oldpages); node->tn_size = newsize; return (0); } void tmpfs_check_mtime(struct vnode *vp) { struct tmpfs_node *node; struct vm_object *obj; ASSERT_VOP_ELOCKED(vp, "check_mtime"); if (vp->v_type != VREG) return; obj = vp->v_object; KASSERT((obj->flags & (OBJ_TMPFS_NODE | OBJ_TMPFS)) == (OBJ_TMPFS_NODE | OBJ_TMPFS), ("non-tmpfs obj")); /* unlocked read */ if (obj->generation != obj->cleangeneration) { VM_OBJECT_WLOCK(obj); if (obj->generation != obj->cleangeneration) { obj->cleangeneration = obj->generation; node = VP_TO_TMPFS_NODE(vp); node->tn_status |= TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED; } VM_OBJECT_WUNLOCK(obj); } } /* * Change flags of the given vnode. * Caller should execute tmpfs_update on vp after a successful execution. * The vnode must be locked on entry and remain locked on exit. */ int tmpfs_chflags(struct vnode *vp, u_long flags, struct ucred *cred, struct thread *p) { int error; struct tmpfs_node *node; ASSERT_VOP_ELOCKED(vp, "chflags"); node = VP_TO_TMPFS_NODE(vp); if ((flags & ~(SF_APPEND | SF_ARCHIVED | SF_IMMUTABLE | SF_NOUNLINK | UF_APPEND | UF_ARCHIVE | UF_HIDDEN | UF_IMMUTABLE | UF_NODUMP | UF_NOUNLINK | UF_OFFLINE | UF_OPAQUE | UF_READONLY | UF_REPARSE | UF_SPARSE | UF_SYSTEM)) != 0) return (EOPNOTSUPP); /* Disallow this operation if the file system is mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; /* * Callers may only modify the file flags on objects they * have VADMIN rights for. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, p))) return (error); /* * Unprivileged processes are not permitted to unset system * flags, or modify flags if any system flags are set. */ if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS)) { if (node->tn_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) { error = securelevel_gt(cred, 0); if (error) return (error); } } else { if (node->tn_flags & (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) || ((flags ^ node->tn_flags) & SF_SETTABLE)) return (EPERM); } node->tn_flags = flags; node->tn_status |= TMPFS_NODE_CHANGED; ASSERT_VOP_ELOCKED(vp, "chflags2"); return (0); } /* * Change access mode on the given vnode. * Caller should execute tmpfs_update on vp after a successful execution. * The vnode must be locked on entry and remain locked on exit. */ int tmpfs_chmod(struct vnode *vp, mode_t mode, struct ucred *cred, struct thread *p) { int error; struct tmpfs_node *node; mode_t newmode; ASSERT_VOP_ELOCKED(vp, "chmod"); ASSERT_VOP_IN_SEQC(vp); node = VP_TO_TMPFS_NODE(vp); /* Disallow this operation if the file system is mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; /* Immutable or append-only files cannot be modified, either. */ if (node->tn_flags & (IMMUTABLE | APPEND)) return EPERM; /* * To modify the permissions on a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, p))) return (error); /* * Privileged processes may set the sticky bit on non-directories, * as well as set the setgid bit on a file with a group that the * process is not a member of. */ if (vp->v_type != VDIR && (mode & S_ISTXT)) { if (priv_check_cred(cred, PRIV_VFS_STICKYFILE)) return (EFTYPE); } if (!groupmember(node->tn_gid, cred) && (mode & S_ISGID)) { error = priv_check_cred(cred, PRIV_VFS_SETGID); if (error) return (error); } newmode = node->tn_mode & ~ALLPERMS; newmode |= mode & ALLPERMS; atomic_store_short(&node->tn_mode, newmode); node->tn_status |= TMPFS_NODE_CHANGED; ASSERT_VOP_ELOCKED(vp, "chmod2"); return (0); } /* * Change ownership of the given vnode. At least one of uid or gid must * be different than VNOVAL. If one is set to that value, the attribute * is unchanged. * Caller should execute tmpfs_update on vp after a successful execution. * The vnode must be locked on entry and remain locked on exit. */ int tmpfs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred, struct thread *p) { int error; struct tmpfs_node *node; uid_t ouid; gid_t ogid; mode_t newmode; ASSERT_VOP_ELOCKED(vp, "chown"); ASSERT_VOP_IN_SEQC(vp); node = VP_TO_TMPFS_NODE(vp); /* Assign default values if they are unknown. */ MPASS(uid != VNOVAL || gid != VNOVAL); if (uid == VNOVAL) uid = node->tn_uid; if (gid == VNOVAL) gid = node->tn_gid; MPASS(uid != VNOVAL && gid != VNOVAL); /* Disallow this operation if the file system is mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; /* Immutable or append-only files cannot be modified, either. */ if (node->tn_flags & (IMMUTABLE | APPEND)) return EPERM; /* * To modify the ownership of a file, must possess VADMIN for that * file. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, p))) return (error); /* * To change the owner of a file, or change the group of a file to a * group of which we are not a member, the caller must have * privilege. */ if ((uid != node->tn_uid || (gid != node->tn_gid && !groupmember(gid, cred))) && (error = priv_check_cred(cred, PRIV_VFS_CHOWN))) return (error); ogid = node->tn_gid; ouid = node->tn_uid; node->tn_uid = uid; node->tn_gid = gid; node->tn_status |= TMPFS_NODE_CHANGED; if ((node->tn_mode & (S_ISUID | S_ISGID)) && (ouid != uid || ogid != gid)) { if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID)) { newmode = node->tn_mode & ~(S_ISUID | S_ISGID); atomic_store_short(&node->tn_mode, newmode); } } ASSERT_VOP_ELOCKED(vp, "chown2"); return (0); } /* * Change size of the given vnode. * Caller should execute tmpfs_update on vp after a successful execution. * The vnode must be locked on entry and remain locked on exit. */ int tmpfs_chsize(struct vnode *vp, u_quad_t size, struct ucred *cred, struct thread *p) { int error; struct tmpfs_node *node; ASSERT_VOP_ELOCKED(vp, "chsize"); node = VP_TO_TMPFS_NODE(vp); /* Decide whether this is a valid operation based on the file type. */ error = 0; switch (vp->v_type) { case VDIR: return EISDIR; case VREG: if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; break; case VBLK: /* FALLTHROUGH */ case VCHR: /* FALLTHROUGH */ case VFIFO: /* Allow modifications of special files even if in the file * system is mounted read-only (we are not modifying the * files themselves, but the objects they represent). */ return 0; default: /* Anything else is unsupported. */ return EOPNOTSUPP; } /* Immutable or append-only files cannot be modified, either. */ if (node->tn_flags & (IMMUTABLE | APPEND)) return EPERM; error = tmpfs_truncate(vp, size); /* tmpfs_truncate will raise the NOTE_EXTEND and NOTE_ATTRIB kevents * for us, as will update tn_status; no need to do that here. */ ASSERT_VOP_ELOCKED(vp, "chsize2"); return (error); } /* * Change access and modification times of the given vnode. * Caller should execute tmpfs_update on vp after a successful execution. * The vnode must be locked on entry and remain locked on exit. */ int tmpfs_chtimes(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct thread *l) { int error; struct tmpfs_node *node; ASSERT_VOP_ELOCKED(vp, "chtimes"); node = VP_TO_TMPFS_NODE(vp); /* Disallow this operation if the file system is mounted read-only. */ if (vp->v_mount->mnt_flag & MNT_RDONLY) return EROFS; /* Immutable or append-only files cannot be modified, either. */ if (node->tn_flags & (IMMUTABLE | APPEND)) return EPERM; error = vn_utimes_perm(vp, vap, cred, l); if (error != 0) return (error); if (vap->va_atime.tv_sec != VNOVAL) node->tn_status |= TMPFS_NODE_ACCESSED; if (vap->va_mtime.tv_sec != VNOVAL) node->tn_status |= TMPFS_NODE_MODIFIED; if (vap->va_birthtime.tv_sec != VNOVAL) node->tn_status |= TMPFS_NODE_MODIFIED; tmpfs_itimes(vp, &vap->va_atime, &vap->va_mtime); if (vap->va_birthtime.tv_sec != VNOVAL) node->tn_birthtime = vap->va_birthtime; ASSERT_VOP_ELOCKED(vp, "chtimes2"); return (0); } void tmpfs_set_status(struct tmpfs_mount *tm, struct tmpfs_node *node, int status) { if ((node->tn_status & status) == status || tm->tm_ronly) return; TMPFS_NODE_LOCK(node); node->tn_status |= status; TMPFS_NODE_UNLOCK(node); } /* Sync timestamps */ void tmpfs_itimes(struct vnode *vp, const struct timespec *acc, const struct timespec *mod) { struct tmpfs_node *node; struct timespec now; ASSERT_VOP_LOCKED(vp, "tmpfs_itimes"); node = VP_TO_TMPFS_NODE(vp); if ((node->tn_status & (TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED)) == 0) return; vfs_timestamp(&now); TMPFS_NODE_LOCK(node); if (node->tn_status & TMPFS_NODE_ACCESSED) { if (acc == NULL) acc = &now; node->tn_atime = *acc; } if (node->tn_status & TMPFS_NODE_MODIFIED) { if (mod == NULL) mod = &now; node->tn_mtime = *mod; } if (node->tn_status & TMPFS_NODE_CHANGED) node->tn_ctime = now; node->tn_status &= ~(TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED); TMPFS_NODE_UNLOCK(node); /* XXX: FIX? The entropy here is desirable, but the harvesting may be expensive */ random_harvest_queue(node, sizeof(*node), RANDOM_FS_ATIME); } int tmpfs_truncate(struct vnode *vp, off_t length) { int error; struct tmpfs_node *node; node = VP_TO_TMPFS_NODE(vp); if (length < 0) { error = EINVAL; goto out; } if (node->tn_size == length) { error = 0; goto out; } if (length > VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize) return (EFBIG); error = tmpfs_reg_resize(vp, length, FALSE); if (error == 0) node->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; out: tmpfs_update(vp); return (error); } static __inline int tmpfs_dirtree_cmp(struct tmpfs_dirent *a, struct tmpfs_dirent *b) { if (a->td_hash > b->td_hash) return (1); else if (a->td_hash < b->td_hash) return (-1); return (0); } RB_GENERATE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp); Index: projects/clang1100-import/sys/fs/tmpfs/tmpfs_vfsops.c =================================================================== --- projects/clang1100-import/sys/fs/tmpfs/tmpfs_vfsops.c (revision 364278) +++ projects/clang1100-import/sys/fs/tmpfs/tmpfs_vfsops.c (revision 364279) @@ -1,690 +1,689 @@ /* $NetBSD: tmpfs_vfsops.c,v 1.10 2005/12/11 12:24:29 christos Exp $ */ /*- * SPDX-License-Identifier: BSD-2-Clause-NetBSD * * Copyright (c) 2005 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Julio M. Merino Vidal, developed as part of Google's Summer of Code * 2005 program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Efficient memory file system. * * tmpfs is a file system that uses FreeBSD's virtual memory * sub-system to store file data and metadata in an efficient way. * This means that it does not follow the structure of an on-disk file * system because it simply does not need to. Instead, it uses * memory-specific data structures and algorithms to automatically * allocate and release resources. */ #include "opt_tmpfs.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Default permission for root node */ #define TMPFS_DEFAULT_ROOT_MODE (S_IRWXU|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH) MALLOC_DEFINE(M_TMPFSMNT, "tmpfs mount", "tmpfs mount structures"); MALLOC_DEFINE(M_TMPFSNAME, "tmpfs name", "tmpfs file names"); static int tmpfs_mount(struct mount *); static int tmpfs_unmount(struct mount *, int); static int tmpfs_root(struct mount *, int flags, struct vnode **); static int tmpfs_fhtovp(struct mount *, struct fid *, int, struct vnode **); static int tmpfs_statfs(struct mount *, struct statfs *); static const char *tmpfs_opts[] = { "from", "size", "maxfilesize", "inodes", "uid", "gid", "mode", "export", "union", "nonc", "nomtime", NULL }; static const char *tmpfs_updateopts[] = { "from", "export", "nomtime", "size", NULL }; /* * Handle updates of time from writes to mmaped regions, if allowed. * Use MNT_VNODE_FOREACH_ALL instead of MNT_VNODE_FOREACH_LAZY, since * unmap of the tmpfs-backed vnode does not call vinactive(), due to * vm object type is OBJT_SWAP. If lazy, only handle delayed update * of mtime due to the writes to mapped files. */ static void tmpfs_update_mtime(struct mount *mp, bool lazy) { struct vnode *vp, *mvp; struct vm_object *obj; if (VFS_TO_TMPFS(mp)->tm_nomtime) return; MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { if (vp->v_type != VREG) { VI_UNLOCK(vp); continue; } obj = vp->v_object; KASSERT((obj->flags & (OBJ_TMPFS_NODE | OBJ_TMPFS)) == (OBJ_TMPFS_NODE | OBJ_TMPFS), ("non-tmpfs obj")); /* * In lazy case, do unlocked read, avoid taking vnode * lock if not needed. Lost update will be handled on * the next call. * For non-lazy case, we must flush all pending * metadata changes now. */ if (!lazy || obj->generation != obj->cleangeneration) { - if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, - curthread) != 0) + if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) continue; tmpfs_check_mtime(vp); if (!lazy) tmpfs_update(vp); vput(vp); } else { VI_UNLOCK(vp); continue; } } } struct tmpfs_check_rw_maps_arg { bool found; }; static bool tmpfs_check_rw_maps_cb(struct mount *mp __unused, vm_map_t map __unused, vm_map_entry_t entry __unused, void *arg) { struct tmpfs_check_rw_maps_arg *a; a = arg; a->found = true; return (true); } /* * Revoke write permissions from all mappings of regular files * belonging to the specified tmpfs mount. */ static bool tmpfs_revoke_rw_maps_cb(struct mount *mp __unused, vm_map_t map, vm_map_entry_t entry, void *arg __unused) { /* * XXXKIB: might be invalidate the mapping * instead ? The process is not going to be * happy in any case. */ entry->max_protection &= ~VM_PROT_WRITE; if ((entry->protection & VM_PROT_WRITE) != 0) { entry->protection &= ~VM_PROT_WRITE; pmap_protect(map->pmap, entry->start, entry->end, entry->protection); } return (false); } static void tmpfs_all_rw_maps(struct mount *mp, bool (*cb)(struct mount *mp, vm_map_t, vm_map_entry_t, void *), void *cb_arg) { struct proc *p; struct vmspace *vm; vm_map_t map; vm_map_entry_t entry; vm_object_t object; struct vnode *vp; int gen; bool terminate; terminate = false; sx_slock(&allproc_lock); again: gen = allproc_gen; FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC | P_SYSTEM | P_WEXIT)) != 0) { PROC_UNLOCK(p); continue; } vm = vmspace_acquire_ref(p); _PHOLD_LITE(p); PROC_UNLOCK(p); if (vm == NULL) { PRELE(p); continue; } sx_sunlock(&allproc_lock); map = &vm->vm_map; vm_map_lock(map); if (map->busy) vm_map_wait_busy(map); VM_MAP_ENTRY_FOREACH(entry, map) { if ((entry->eflags & (MAP_ENTRY_GUARD | MAP_ENTRY_IS_SUB_MAP | MAP_ENTRY_COW)) != 0 || (entry->max_protection & VM_PROT_WRITE) == 0) continue; object = entry->object.vm_object; if (object == NULL || object->type != OBJT_SWAP || (object->flags & OBJ_TMPFS_NODE) == 0) continue; /* * No need to dig into shadow chain, mapping * of the object not at top is readonly. */ VM_OBJECT_RLOCK(object); if (object->type == OBJT_DEAD) { VM_OBJECT_RUNLOCK(object); continue; } MPASS(object->ref_count > 1); if ((object->flags & (OBJ_TMPFS_NODE | OBJ_TMPFS)) != (OBJ_TMPFS_NODE | OBJ_TMPFS)) { VM_OBJECT_RUNLOCK(object); continue; } vp = object->un_pager.swp.swp_tmpfs; if (vp->v_mount != mp) { VM_OBJECT_RUNLOCK(object); continue; } terminate = cb(mp, map, entry, cb_arg); VM_OBJECT_RUNLOCK(object); if (terminate) break; } vm_map_unlock(map); vmspace_free(vm); sx_slock(&allproc_lock); PRELE(p); if (terminate) break; } if (!terminate && gen != allproc_gen) goto again; sx_sunlock(&allproc_lock); } static bool tmpfs_check_rw_maps(struct mount *mp) { struct tmpfs_check_rw_maps_arg ca; ca.found = false; tmpfs_all_rw_maps(mp, tmpfs_check_rw_maps_cb, &ca); return (ca.found); } static int tmpfs_rw_to_ro(struct mount *mp) { int error, flags; bool forced; forced = (mp->mnt_flag & MNT_FORCE) != 0; flags = WRITECLOSE | (forced ? FORCECLOSE : 0); if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); error = vfs_write_suspend_umnt(mp); if (error != 0) return (error); if (!forced && tmpfs_check_rw_maps(mp)) { error = EBUSY; goto out; } VFS_TO_TMPFS(mp)->tm_ronly = 1; MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; MNT_IUNLOCK(mp); for (;;) { tmpfs_all_rw_maps(mp, tmpfs_revoke_rw_maps_cb, NULL); tmpfs_update_mtime(mp, false); error = vflush(mp, 0, flags, curthread); if (error != 0) { VFS_TO_TMPFS(mp)->tm_ronly = 0; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_RDONLY; MNT_IUNLOCK(mp); goto out; } if (!tmpfs_check_rw_maps(mp)) break; } out: vfs_write_resume(mp, 0); return (error); } static int tmpfs_mount(struct mount *mp) { const size_t nodes_per_page = howmany(PAGE_SIZE, sizeof(struct tmpfs_dirent) + sizeof(struct tmpfs_node)); struct tmpfs_mount *tmp; struct tmpfs_node *root; int error; bool nomtime, nonc; /* Size counters. */ u_quad_t pages; off_t nodes_max, size_max, maxfilesize; /* Root node attributes. */ uid_t root_uid; gid_t root_gid; mode_t root_mode; struct vattr va; if (vfs_filteropt(mp->mnt_optnew, tmpfs_opts)) return (EINVAL); if (mp->mnt_flag & MNT_UPDATE) { /* Only support update mounts for certain options. */ if (vfs_filteropt(mp->mnt_optnew, tmpfs_updateopts) != 0) return (EOPNOTSUPP); tmp = VFS_TO_TMPFS(mp); if (vfs_getopt_size(mp->mnt_optnew, "size", &size_max) == 0) { /* * On-the-fly resizing is not supported (yet). We still * need to have "size" listed as "supported", otherwise * trying to update fs that is listed in fstab with size * parameter, say trying to change rw to ro or vice * versa, would cause vfs_filteropt() to bail. */ if (size_max != tmp->tm_size_max) return (EOPNOTSUPP); } if (vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) && !tmp->tm_ronly) { /* RW -> RO */ return (tmpfs_rw_to_ro(mp)); } else if (!vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) && tmp->tm_ronly) { /* RO -> RW */ tmp->tm_ronly = 0; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_RDONLY; MNT_IUNLOCK(mp); } tmp->tm_nomtime = vfs_getopt(mp->mnt_optnew, "nomtime", NULL, 0) == 0; MNT_ILOCK(mp); if ((mp->mnt_flag & MNT_UNION) == 0) { mp->mnt_kern_flag |= MNTK_FPLOOKUP; } else { mp->mnt_kern_flag &= ~MNTK_FPLOOKUP; } MNT_IUNLOCK(mp); return (0); } vn_lock(mp->mnt_vnodecovered, LK_SHARED | LK_RETRY); error = VOP_GETATTR(mp->mnt_vnodecovered, &va, mp->mnt_cred); VOP_UNLOCK(mp->mnt_vnodecovered); if (error) return (error); if (mp->mnt_cred->cr_ruid != 0 || vfs_scanopt(mp->mnt_optnew, "gid", "%d", &root_gid) != 1) root_gid = va.va_gid; if (mp->mnt_cred->cr_ruid != 0 || vfs_scanopt(mp->mnt_optnew, "uid", "%d", &root_uid) != 1) root_uid = va.va_uid; if (mp->mnt_cred->cr_ruid != 0 || vfs_scanopt(mp->mnt_optnew, "mode", "%ho", &root_mode) != 1) root_mode = va.va_mode; if (vfs_getopt_size(mp->mnt_optnew, "inodes", &nodes_max) != 0) nodes_max = 0; if (vfs_getopt_size(mp->mnt_optnew, "size", &size_max) != 0) size_max = 0; if (vfs_getopt_size(mp->mnt_optnew, "maxfilesize", &maxfilesize) != 0) maxfilesize = 0; nonc = vfs_getopt(mp->mnt_optnew, "nonc", NULL, NULL) == 0; nomtime = vfs_getopt(mp->mnt_optnew, "nomtime", NULL, NULL) == 0; /* Do not allow mounts if we do not have enough memory to preserve * the minimum reserved pages. */ if (tmpfs_mem_avail() < TMPFS_PAGES_MINRESERVED) return (ENOSPC); /* Get the maximum number of memory pages this file system is * allowed to use, based on the maximum size the user passed in * the mount structure. A value of zero is treated as if the * maximum available space was requested. */ if (size_max == 0 || size_max > OFF_MAX - PAGE_SIZE || (SIZE_MAX < OFF_MAX && size_max / PAGE_SIZE >= SIZE_MAX)) pages = SIZE_MAX; else { size_max = roundup(size_max, PAGE_SIZE); pages = howmany(size_max, PAGE_SIZE); } MPASS(pages > 0); if (nodes_max <= 3) { if (pages < INT_MAX / nodes_per_page) nodes_max = pages * nodes_per_page; else nodes_max = INT_MAX; } if (nodes_max > INT_MAX) nodes_max = INT_MAX; MPASS(nodes_max >= 3); /* Allocate the tmpfs mount structure and fill it. */ tmp = (struct tmpfs_mount *)malloc(sizeof(struct tmpfs_mount), M_TMPFSMNT, M_WAITOK | M_ZERO); mtx_init(&tmp->tm_allnode_lock, "tmpfs allnode lock", NULL, MTX_DEF); tmp->tm_nodes_max = nodes_max; tmp->tm_nodes_inuse = 0; tmp->tm_refcount = 1; tmp->tm_maxfilesize = maxfilesize > 0 ? maxfilesize : OFF_MAX; LIST_INIT(&tmp->tm_nodes_used); tmp->tm_size_max = size_max; tmp->tm_pages_max = pages; tmp->tm_pages_used = 0; new_unrhdr64(&tmp->tm_ino_unr, 2); tmp->tm_ronly = (mp->mnt_flag & MNT_RDONLY) != 0; tmp->tm_nonc = nonc; tmp->tm_nomtime = nomtime; /* Allocate the root node. */ error = tmpfs_alloc_node(mp, tmp, VDIR, root_uid, root_gid, root_mode & ALLPERMS, NULL, NULL, VNOVAL, &root); if (error != 0 || root == NULL) { free(tmp, M_TMPFSMNT); return (error); } KASSERT(root->tn_id == 2, ("tmpfs root with invalid ino: %ju", (uintmax_t)root->tn_id)); tmp->tm_root = root; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED | MNTK_TEXT_REFS | MNTK_NOMSYNC; if (!nonc && (mp->mnt_flag & MNT_UNION) == 0) mp->mnt_kern_flag |= MNTK_FPLOOKUP; MNT_IUNLOCK(mp); mp->mnt_data = tmp; mp->mnt_stat.f_namemax = MAXNAMLEN; vfs_getnewfsid(mp); vfs_mountedfrom(mp, "tmpfs"); return 0; } /* ARGSUSED2 */ static int tmpfs_unmount(struct mount *mp, int mntflags) { struct tmpfs_mount *tmp; struct tmpfs_node *node; int error, flags; flags = (mntflags & MNT_FORCE) != 0 ? FORCECLOSE : 0; tmp = VFS_TO_TMPFS(mp); /* Stop writers */ error = vfs_write_suspend_umnt(mp); if (error != 0) return (error); /* * At this point, nodes cannot be destroyed by any other * thread because write suspension is started. */ for (;;) { error = vflush(mp, 0, flags, curthread); if (error != 0) { vfs_write_resume(mp, VR_START_WRITE); return (error); } MNT_ILOCK(mp); if (mp->mnt_nvnodelistsize == 0) { MNT_IUNLOCK(mp); break; } MNT_IUNLOCK(mp); if ((mntflags & MNT_FORCE) == 0) { vfs_write_resume(mp, VR_START_WRITE); return (EBUSY); } } TMPFS_LOCK(tmp); while ((node = LIST_FIRST(&tmp->tm_nodes_used)) != NULL) { TMPFS_NODE_LOCK(node); if (node->tn_type == VDIR) tmpfs_dir_destroy(tmp, node); if (tmpfs_free_node_locked(tmp, node, true)) TMPFS_LOCK(tmp); else TMPFS_NODE_UNLOCK(node); } mp->mnt_data = NULL; tmpfs_free_tmp(tmp); vfs_write_resume(mp, VR_START_WRITE); MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); return (0); } void tmpfs_free_tmp(struct tmpfs_mount *tmp) { MPASS(tmp->tm_refcount > 0); tmp->tm_refcount--; if (tmp->tm_refcount > 0) { TMPFS_UNLOCK(tmp); return; } TMPFS_UNLOCK(tmp); mtx_destroy(&tmp->tm_allnode_lock); MPASS(tmp->tm_pages_used == 0); MPASS(tmp->tm_nodes_inuse == 0); free(tmp, M_TMPFSMNT); } static int tmpfs_root(struct mount *mp, int flags, struct vnode **vpp) { int error; error = tmpfs_alloc_vp(mp, VFS_TO_TMPFS(mp)->tm_root, flags, vpp); if (error == 0) (*vpp)->v_vflag |= VV_ROOT; return (error); } static int tmpfs_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp) { struct tmpfs_fid_data tfd; struct tmpfs_mount *tmp; struct tmpfs_node *node; int error; if (fhp->fid_len != sizeof(tfd)) return (EINVAL); /* * Copy from fid_data onto the stack to avoid unaligned pointer use. * See the comment in sys/mount.h on struct fid for details. */ memcpy(&tfd, fhp->fid_data, fhp->fid_len); tmp = VFS_TO_TMPFS(mp); if (tfd.tfd_id >= tmp->tm_nodes_max) return (EINVAL); TMPFS_LOCK(tmp); LIST_FOREACH(node, &tmp->tm_nodes_used, tn_entries) { if (node->tn_id == tfd.tfd_id && node->tn_gen == tfd.tfd_gen) { tmpfs_ref_node(node); break; } } TMPFS_UNLOCK(tmp); if (node != NULL) { error = tmpfs_alloc_vp(mp, node, LK_EXCLUSIVE, vpp); tmpfs_free_node(tmp, node); } else error = EINVAL; return (error); } /* ARGSUSED2 */ static int tmpfs_statfs(struct mount *mp, struct statfs *sbp) { struct tmpfs_mount *tmp; size_t used; tmp = VFS_TO_TMPFS(mp); sbp->f_iosize = PAGE_SIZE; sbp->f_bsize = PAGE_SIZE; used = tmpfs_pages_used(tmp); if (tmp->tm_pages_max != ULONG_MAX) sbp->f_blocks = tmp->tm_pages_max; else sbp->f_blocks = used + tmpfs_mem_avail(); if (sbp->f_blocks <= used) sbp->f_bavail = 0; else sbp->f_bavail = sbp->f_blocks - used; sbp->f_bfree = sbp->f_bavail; used = tmp->tm_nodes_inuse; sbp->f_files = tmp->tm_nodes_max; if (sbp->f_files <= used) sbp->f_ffree = 0; else sbp->f_ffree = sbp->f_files - used; /* sbp->f_owner = tmp->tn_uid; */ return 0; } static int tmpfs_sync(struct mount *mp, int waitfor) { if (waitfor == MNT_SUSPEND) { MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_SUSPEND2 | MNTK_SUSPENDED; MNT_IUNLOCK(mp); } else if (waitfor == MNT_LAZY) { tmpfs_update_mtime(mp, true); } return (0); } static int tmpfs_init(struct vfsconf *conf) { tmpfs_subr_init(); return (0); } static int tmpfs_uninit(struct vfsconf *conf) { tmpfs_subr_uninit(); return (0); } /* * tmpfs vfs operations. */ struct vfsops tmpfs_vfsops = { .vfs_mount = tmpfs_mount, .vfs_unmount = tmpfs_unmount, .vfs_root = vfs_cache_root, .vfs_cachedroot = tmpfs_root, .vfs_statfs = tmpfs_statfs, .vfs_fhtovp = tmpfs_fhtovp, .vfs_sync = tmpfs_sync, .vfs_init = tmpfs_init, .vfs_uninit = tmpfs_uninit, }; VFS_SET(tmpfs_vfsops, tmpfs, VFCF_JAIL); Index: projects/clang1100-import/sys/kern/uipc_mqueue.c =================================================================== --- projects/clang1100-import/sys/kern/uipc_mqueue.c (revision 364278) +++ projects/clang1100-import/sys/kern/uipc_mqueue.c (revision 364279) @@ -1,2945 +1,2945 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005 David Xu * Copyright (c) 2016-2017 Robert N. M. Watson * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * POSIX message queue implementation. * * 1) A mqueue filesystem can be mounted, each message queue appears * in mounted directory, user can change queue's permission and * ownership, or remove a queue. Manually creating a file in the * directory causes a message queue to be created in the kernel with * default message queue attributes applied and same name used, this * method is not advocated since mq_open syscall allows user to specify * different attributes. Also the file system can be mounted multiple * times at different mount points but shows same contents. * * 2) Standard POSIX message queue API. The syscalls do not use vfs layer, * but directly operate on internal data structure, this allows user to * use the IPC facility without having to mount mqueue file system. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(p1003_1b_mqueue, "POSIX P1003.1B message queues support"); /* * Limits and constants */ #define MQFS_NAMELEN NAME_MAX #define MQFS_DELEN (8 + MQFS_NAMELEN) /* node types */ typedef enum { mqfstype_none = 0, mqfstype_root, mqfstype_dir, mqfstype_this, mqfstype_parent, mqfstype_file, mqfstype_symlink, } mqfs_type_t; struct mqfs_node; /* * mqfs_info: describes a mqfs instance */ struct mqfs_info { struct sx mi_lock; struct mqfs_node *mi_root; struct unrhdr *mi_unrhdr; }; struct mqfs_vdata { LIST_ENTRY(mqfs_vdata) mv_link; struct mqfs_node *mv_node; struct vnode *mv_vnode; struct task mv_task; }; /* * mqfs_node: describes a node (file or directory) within a mqfs */ struct mqfs_node { char mn_name[MQFS_NAMELEN+1]; struct mqfs_info *mn_info; struct mqfs_node *mn_parent; LIST_HEAD(,mqfs_node) mn_children; LIST_ENTRY(mqfs_node) mn_sibling; LIST_HEAD(,mqfs_vdata) mn_vnodes; const void *mn_pr_root; int mn_refcount; mqfs_type_t mn_type; int mn_deleted; uint32_t mn_fileno; void *mn_data; struct timespec mn_birth; struct timespec mn_ctime; struct timespec mn_atime; struct timespec mn_mtime; uid_t mn_uid; gid_t mn_gid; int mn_mode; }; #define VTON(vp) (((struct mqfs_vdata *)((vp)->v_data))->mv_node) #define VTOMQ(vp) ((struct mqueue *)(VTON(vp)->mn_data)) #define VFSTOMQFS(m) ((struct mqfs_info *)((m)->mnt_data)) #define FPTOMQ(fp) ((struct mqueue *)(((struct mqfs_node *) \ (fp)->f_data)->mn_data)) TAILQ_HEAD(msgq, mqueue_msg); struct mqueue; struct mqueue_notifier { LIST_ENTRY(mqueue_notifier) nt_link; struct sigevent nt_sigev; ksiginfo_t nt_ksi; struct proc *nt_proc; }; struct mqueue { struct mtx mq_mutex; int mq_flags; long mq_maxmsg; long mq_msgsize; long mq_curmsgs; long mq_totalbytes; struct msgq mq_msgq; int mq_receivers; int mq_senders; struct selinfo mq_rsel; struct selinfo mq_wsel; struct mqueue_notifier *mq_notifier; }; #define MQ_RSEL 0x01 #define MQ_WSEL 0x02 struct mqueue_msg { TAILQ_ENTRY(mqueue_msg) msg_link; unsigned int msg_prio; unsigned int msg_size; /* following real data... */ }; static SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "POSIX real time message queue"); static int default_maxmsg = 10; static int default_msgsize = 1024; static int maxmsg = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW, &maxmsg, 0, "Default maximum messages in queue"); static int maxmsgsize = 16384; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW, &maxmsgsize, 0, "Default maximum message size"); static int maxmq = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW, &maxmq, 0, "maximum message queues"); static int curmq = 0; SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW, &curmq, 0, "current message queue number"); static int unloadable = 0; static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data"); static eventhandler_tag exit_tag; /* Only one instance per-system */ static struct mqfs_info mqfs_data; static uma_zone_t mqnode_zone; static uma_zone_t mqueue_zone; static uma_zone_t mvdata_zone; static uma_zone_t mqnoti_zone; static struct vop_vector mqfs_vnodeops; static struct fileops mqueueops; static unsigned mqfs_osd_jail_slot; /* * Directory structure construction and manipulation */ #ifdef notyet static struct mqfs_node *mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static struct mqfs_node *mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); #endif static struct mqfs_node *mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static int mqfs_destroy(struct mqfs_node *mn); static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn); static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn); static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn); static int mqfs_prison_remove(void *obj, void *data); /* * Message queue construction and maniplation */ static struct mqueue *mqueue_alloc(const struct mq_attr *attr); static void mqueue_free(struct mqueue *mq); static int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout); static int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout); static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo); static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo); static void mqueue_send_notification(struct mqueue *mq); static void mqueue_fdclose(struct thread *td, int fd, struct file *fp); static void mq_proc_exit(void *arg, struct proc *p); /* * kqueue filters */ static void filt_mqdetach(struct knote *kn); static int filt_mqread(struct knote *kn, long hint); static int filt_mqwrite(struct knote *kn, long hint); struct filterops mq_rfiltops = { .f_isfd = 1, .f_detach = filt_mqdetach, .f_event = filt_mqread, }; struct filterops mq_wfiltops = { .f_isfd = 1, .f_detach = filt_mqdetach, .f_event = filt_mqwrite, }; /* * Initialize fileno bitmap */ static void mqfs_fileno_init(struct mqfs_info *mi) { struct unrhdr *up; up = new_unrhdr(1, INT_MAX, NULL); mi->mi_unrhdr = up; } /* * Tear down fileno bitmap */ static void mqfs_fileno_uninit(struct mqfs_info *mi) { struct unrhdr *up; up = mi->mi_unrhdr; mi->mi_unrhdr = NULL; delete_unrhdr(up); } /* * Allocate a file number */ static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn) { /* make sure our parent has a file number */ if (mn->mn_parent && !mn->mn_parent->mn_fileno) mqfs_fileno_alloc(mi, mn->mn_parent); switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: mn->mn_fileno = alloc_unr(mi->mi_unrhdr); break; case mqfstype_this: KASSERT(mn->mn_parent != NULL, ("mqfstype_this node has no parent")); mn->mn_fileno = mn->mn_parent->mn_fileno; break; case mqfstype_parent: KASSERT(mn->mn_parent != NULL, ("mqfstype_parent node has no parent")); if (mn->mn_parent == mi->mi_root) { mn->mn_fileno = mn->mn_parent->mn_fileno; break; } KASSERT(mn->mn_parent->mn_parent != NULL, ("mqfstype_parent node has no grandparent")); mn->mn_fileno = mn->mn_parent->mn_parent->mn_fileno; break; default: KASSERT(0, ("mqfs_fileno_alloc() called for unknown type node: %d", mn->mn_type)); break; } } /* * Release a file number */ static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn) { switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: free_unr(mi->mi_unrhdr, mn->mn_fileno); break; case mqfstype_this: case mqfstype_parent: /* ignore these, as they don't "own" their file number */ break; default: KASSERT(0, ("mqfs_fileno_free() called for unknown type node: %d", mn->mn_type)); break; } } static __inline struct mqfs_node * mqnode_alloc(void) { return uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO); } static __inline void mqnode_free(struct mqfs_node *node) { uma_zfree(mqnode_zone, node); } static __inline void mqnode_addref(struct mqfs_node *node) { atomic_add_int(&node->mn_refcount, 1); } static __inline void mqnode_release(struct mqfs_node *node) { struct mqfs_info *mqfs; int old, exp; mqfs = node->mn_info; old = atomic_fetchadd_int(&node->mn_refcount, -1); if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) exp = 3; /* include . and .. */ else exp = 1; if (old == exp) { int locked = sx_xlocked(&mqfs->mi_lock); if (!locked) sx_xlock(&mqfs->mi_lock); mqfs_destroy(node); if (!locked) sx_xunlock(&mqfs->mi_lock); } } /* * Add a node to a directory */ static int mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node) { KASSERT(parent != NULL, ("%s(): parent is NULL", __func__)); KASSERT(parent->mn_info != NULL, ("%s(): parent has no mn_info", __func__)); KASSERT(parent->mn_type == mqfstype_dir || parent->mn_type == mqfstype_root, ("%s(): parent is not a directory", __func__)); node->mn_info = parent->mn_info; node->mn_parent = parent; LIST_INIT(&node->mn_children); LIST_INIT(&node->mn_vnodes); LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling); mqnode_addref(parent); return (0); } static struct mqfs_node * mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode, int nodetype) { struct mqfs_node *node; node = mqnode_alloc(); strncpy(node->mn_name, name, namelen); node->mn_pr_root = cred->cr_prison->pr_root; node->mn_type = nodetype; node->mn_refcount = 1; vfs_timestamp(&node->mn_birth); node->mn_ctime = node->mn_atime = node->mn_mtime = node->mn_birth; node->mn_uid = cred->cr_uid; node->mn_gid = cred->cr_gid; node->mn_mode = mode; return (node); } /* * Create a file */ static struct mqfs_node * mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_file); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } /* * Add . and .. to a directory */ static int mqfs_fixup_dir(struct mqfs_node *parent) { struct mqfs_node *dir; dir = mqnode_alloc(); dir->mn_name[0] = '.'; dir->mn_type = mqfstype_this; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } dir = mqnode_alloc(); dir->mn_name[0] = dir->mn_name[1] = '.'; dir->mn_type = mqfstype_parent; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } return (0); } #ifdef notyet /* * Create a directory */ static struct mqfs_node * mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } if (mqfs_fixup_dir(node) != 0) { mqfs_destroy(node); return (NULL); } return (node); } /* * Create a symlink */ static struct mqfs_node * mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } #endif /* * Destroy a node or a tree of nodes */ static int mqfs_destroy(struct mqfs_node *node) { struct mqfs_node *parent; KASSERT(node != NULL, ("%s(): node is NULL", __func__)); KASSERT(node->mn_info != NULL, ("%s(): node has no mn_info", __func__)); /* destroy children */ if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) while (! LIST_EMPTY(&node->mn_children)) mqfs_destroy(LIST_FIRST(&node->mn_children)); /* unlink from parent */ if ((parent = node->mn_parent) != NULL) { KASSERT(parent->mn_info == node->mn_info, ("%s(): parent has different mn_info", __func__)); LIST_REMOVE(node, mn_sibling); } if (node->mn_fileno != 0) mqfs_fileno_free(node->mn_info, node); if (node->mn_data != NULL) mqueue_free(node->mn_data); mqnode_free(node); return (0); } /* * Mount a mqfs instance */ static int mqfs_mount(struct mount *mp) { struct statfs *sbp; if (mp->mnt_flag & MNT_UPDATE) return (EOPNOTSUPP); mp->mnt_data = &mqfs_data; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); vfs_getnewfsid(mp); sbp = &mp->mnt_stat; vfs_mountedfrom(mp, "mqueue"); sbp->f_bsize = PAGE_SIZE; sbp->f_iosize = PAGE_SIZE; sbp->f_blocks = 1; sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_files = 1; sbp->f_ffree = 0; return (0); } /* * Unmount a mqfs instance */ static int mqfs_unmount(struct mount *mp, int mntflags) { int error; error = vflush(mp, 0, (mntflags & MNT_FORCE) ? FORCECLOSE : 0, curthread); return (error); } /* * Return a root vnode */ static int mqfs_root(struct mount *mp, int flags, struct vnode **vpp) { struct mqfs_info *mqfs; int ret; mqfs = VFSTOMQFS(mp); ret = mqfs_allocv(mp, vpp, mqfs->mi_root); return (ret); } /* * Return filesystem stats */ static int mqfs_statfs(struct mount *mp, struct statfs *sbp) { /* XXX update statistics */ return (0); } /* * Initialize a mqfs instance */ static int mqfs_init(struct vfsconf *vfc) { struct mqfs_node *root; struct mqfs_info *mi; osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_REMOVE] = mqfs_prison_remove, }; mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mvdata_zone = uma_zcreate("mvdata", sizeof(struct mqfs_vdata), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mi = &mqfs_data; sx_init(&mi->mi_lock, "mqfs lock"); /* set up the root diretory */ root = mqfs_create_node("/", 1, curthread->td_ucred, 01777, mqfstype_root); root->mn_info = mi; LIST_INIT(&root->mn_children); LIST_INIT(&root->mn_vnodes); mi->mi_root = root; mqfs_fileno_init(mi); mqfs_fileno_alloc(mi, root); mqfs_fixup_dir(root); exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL, EVENTHANDLER_PRI_ANY); mq_fdclose = mqueue_fdclose; p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING); mqfs_osd_jail_slot = osd_jail_register(NULL, methods); return (0); } /* * Destroy a mqfs instance */ static int mqfs_uninit(struct vfsconf *vfc) { struct mqfs_info *mi; if (!unloadable) return (EOPNOTSUPP); osd_jail_deregister(mqfs_osd_jail_slot); EVENTHANDLER_DEREGISTER(process_exit, exit_tag); mi = &mqfs_data; mqfs_destroy(mi->mi_root); mi->mi_root = NULL; mqfs_fileno_uninit(mi); sx_destroy(&mi->mi_lock); uma_zdestroy(mqnode_zone); uma_zdestroy(mqueue_zone); uma_zdestroy(mvdata_zone); uma_zdestroy(mqnoti_zone); return (0); } /* * task routine */ static void do_recycle(void *context, int pending __unused) { struct vnode *vp = (struct vnode *)context; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vrecycle(vp); VOP_UNLOCK(vp); vdrop(vp); } /* * Allocate a vnode */ static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn) { struct mqfs_vdata *vd; struct mqfs_info *mqfs; struct vnode *newvpp; int error; mqfs = pn->mn_info; *vpp = NULL; sx_xlock(&mqfs->mi_lock); LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { if (vd->mv_vnode->v_mount == mp) { vhold(vd->mv_vnode); break; } } if (vd != NULL) { found: *vpp = vd->mv_vnode; sx_xunlock(&mqfs->mi_lock); - error = vget(*vpp, LK_RETRY | LK_EXCLUSIVE, curthread); + error = vget(*vpp, LK_RETRY | LK_EXCLUSIVE); vdrop(*vpp); return (error); } sx_xunlock(&mqfs->mi_lock); error = getnewvnode("mqueue", mp, &mqfs_vnodeops, &newvpp); if (error) return (error); vn_lock(newvpp, LK_EXCLUSIVE | LK_RETRY); error = insmntque(newvpp, mp); if (error != 0) return (error); sx_xlock(&mqfs->mi_lock); /* * Check if it has already been allocated * while we were blocked. */ LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { if (vd->mv_vnode->v_mount == mp) { vhold(vd->mv_vnode); sx_xunlock(&mqfs->mi_lock); vgone(newvpp); vput(newvpp); goto found; } } *vpp = newvpp; vd = uma_zalloc(mvdata_zone, M_WAITOK); (*vpp)->v_data = vd; vd->mv_vnode = *vpp; vd->mv_node = pn; TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp); LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link); mqnode_addref(pn); switch (pn->mn_type) { case mqfstype_root: (*vpp)->v_vflag = VV_ROOT; /* fall through */ case mqfstype_dir: case mqfstype_this: case mqfstype_parent: (*vpp)->v_type = VDIR; break; case mqfstype_file: (*vpp)->v_type = VREG; break; case mqfstype_symlink: (*vpp)->v_type = VLNK; break; case mqfstype_none: KASSERT(0, ("mqfs_allocf called for null node\n")); default: panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type); } sx_xunlock(&mqfs->mi_lock); return (0); } /* * Search a directory entry */ static struct mqfs_node * mqfs_search(struct mqfs_node *pd, const char *name, int len, struct ucred *cred) { struct mqfs_node *pn; const void *pr_root; sx_assert(&pd->mn_info->mi_lock, SX_LOCKED); pr_root = cred->cr_prison->pr_root; LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { /* Only match names within the same prison root directory */ if ((pn->mn_pr_root == NULL || pn->mn_pr_root == pr_root) && strncmp(pn->mn_name, name, len) == 0 && pn->mn_name[len] == '\0') return (pn); } return (NULL); } /* * Look up a file or directory. */ static int mqfs_lookupx(struct vop_cachedlookup_args *ap) { struct componentname *cnp; struct vnode *dvp, **vpp; struct mqfs_node *pd; struct mqfs_node *pn; struct mqfs_info *mqfs; int nameiop, flags, error, namelen; char *pname; struct thread *td; cnp = ap->a_cnp; vpp = ap->a_vpp; dvp = ap->a_dvp; pname = cnp->cn_nameptr; namelen = cnp->cn_namelen; td = cnp->cn_thread; flags = cnp->cn_flags; nameiop = cnp->cn_nameiop; pd = VTON(dvp); pn = NULL; mqfs = pd->mn_info; *vpp = NULLVP; if (dvp->v_type != VDIR) return (ENOTDIR); error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, cnp->cn_thread); if (error) return (error); /* shortcut: check if the name is too long */ if (cnp->cn_namelen >= MQFS_NAMELEN) return (ENOENT); /* self */ if (namelen == 1 && pname[0] == '.') { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); pn = pd; *vpp = dvp; VREF(dvp); return (0); } /* parent */ if (cnp->cn_flags & ISDOTDOT) { if (dvp->v_vflag & VV_ROOT) return (EIO); if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); VOP_UNLOCK(dvp); KASSERT(pd->mn_parent, ("non-root directory has no parent")); pn = pd->mn_parent; error = mqfs_allocv(dvp->v_mount, vpp, pn); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); return (error); } /* named node */ sx_xlock(&mqfs->mi_lock); pn = mqfs_search(pd, pname, namelen, cnp->cn_cred); if (pn != NULL) mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); /* found */ if (pn != NULL) { /* DELETE */ if (nameiop == DELETE && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) { mqnode_release(pn); return (error); } if (*vpp == dvp) { VREF(dvp); *vpp = dvp; mqnode_release(pn); return (0); } } /* allocate vnode */ error = mqfs_allocv(dvp->v_mount, vpp, pn); mqnode_release(pn); if (error == 0 && cnp->cn_flags & MAKEENTRY) cache_enter(dvp, *vpp, cnp); return (error); } /* not found */ /* will create a new entry in the directory ? */ if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } return (ENOENT); } #if 0 struct vop_lookup_args { struct vop_generic_args a_gen; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; #endif /* * vnode lookup operation */ static int mqfs_lookup(struct vop_cachedlookup_args *ap) { int rc; rc = mqfs_lookupx(ap); return (rc); } #if 0 struct vop_create_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; #endif /* * vnode creation operation */ static int mqfs_create(struct vop_create_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd; struct mqfs_node *pn; struct mqueue *mq; int error; pd = VTON(ap->a_dvp); if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); mq = mqueue_alloc(NULL); if (mq == NULL) return (EAGAIN); sx_xlock(&mqfs->mi_lock); if ((cnp->cn_flags & HASBUF) == 0) panic("%s: no name", __func__); pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, ap->a_vap->va_mode); if (pn == NULL) { sx_xunlock(&mqfs->mi_lock); error = ENOSPC; } else { mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); mqnode_release(pn); if (error) mqfs_destroy(pn); else pn->mn_data = mq; } if (error) mqueue_free(mq); return (error); } /* * Remove an entry */ static int do_unlink(struct mqfs_node *pn, struct ucred *ucred) { struct mqfs_node *parent; struct mqfs_vdata *vd; int error = 0; sx_assert(&pn->mn_info->mi_lock, SX_LOCKED); if (ucred->cr_uid != pn->mn_uid && (error = priv_check_cred(ucred, PRIV_MQ_ADMIN)) != 0) error = EACCES; else if (!pn->mn_deleted) { parent = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { cache_purge(vd->mv_vnode); vhold(vd->mv_vnode); taskqueue_enqueue(taskqueue_thread, &vd->mv_task); } mqnode_release(pn); mqnode_release(parent); } else error = ENOENT; return (error); } #if 0 struct vop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * vnode removal operation */ static int mqfs_remove(struct vop_remove_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn; int error; if (ap->a_vp->v_type == VDIR) return (EPERM); pn = VTON(ap->a_vp); sx_xlock(&mqfs->mi_lock); error = do_unlink(pn, ap->a_cnp->cn_cred); sx_xunlock(&mqfs->mi_lock); return (error); } #if 0 struct vop_inactive_args { struct vnode *a_vp; struct thread *a_td; }; #endif static int mqfs_inactive(struct vop_inactive_args *ap) { struct mqfs_node *pn = VTON(ap->a_vp); if (pn->mn_deleted) vrecycle(ap->a_vp); return (0); } #if 0 struct vop_reclaim_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct thread *a_td; }; #endif static int mqfs_reclaim(struct vop_reclaim_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_vp->v_mount); struct vnode *vp = ap->a_vp; struct mqfs_node *pn; struct mqfs_vdata *vd; vd = vp->v_data; pn = vd->mv_node; sx_xlock(&mqfs->mi_lock); vp->v_data = NULL; LIST_REMOVE(vd, mv_link); uma_zfree(mvdata_zone, vd); mqnode_release(pn); sx_xunlock(&mqfs->mi_lock); return (0); } #if 0 struct vop_open_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; struct file *a_fp; }; #endif static int mqfs_open(struct vop_open_args *ap) { return (0); } #if 0 struct vop_close_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; #endif static int mqfs_close(struct vop_close_args *ap) { return (0); } #if 0 struct vop_access_args { struct vop_generic_args a_gen; struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; }; #endif /* * Verify permissions */ static int mqfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct vattr vattr; int error; error = VOP_GETATTR(vp, &vattr, ap->a_cred); if (error) return (error); error = vaccess(vp->v_type, vattr.va_mode, vattr.va_uid, vattr.va_gid, ap->a_accmode, ap->a_cred); return (error); } #if 0 struct vop_getattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif /* * Get file attributes */ static int mqfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct mqfs_node *pn = VTON(vp); struct vattr *vap = ap->a_vap; int error = 0; vap->va_type = vp->v_type; vap->va_mode = pn->mn_mode; vap->va_nlink = 1; vap->va_uid = pn->mn_uid; vap->va_gid = pn->mn_gid; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; vap->va_fileid = pn->mn_fileno; vap->va_size = 0; vap->va_blocksize = PAGE_SIZE; vap->va_bytes = vap->va_size = 0; vap->va_atime = pn->mn_atime; vap->va_mtime = pn->mn_mtime; vap->va_ctime = pn->mn_ctime; vap->va_birthtime = pn->mn_birth; vap->va_gen = 0; vap->va_flags = 0; vap->va_rdev = NODEV; vap->va_bytes = 0; vap->va_filerev = 0; return (error); } #if 0 struct vop_setattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif /* * Set attributes */ static int mqfs_setattr(struct vop_setattr_args *ap) { struct mqfs_node *pn; struct vattr *vap; struct vnode *vp; struct thread *td; int c, error; uid_t uid; gid_t gid; td = curthread; vap = ap->a_vap; vp = ap->a_vp; if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_flags != VNOVAL && vap->va_flags != 0) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } pn = VTON(vp); error = c = 0; if (vap->va_uid == (uid_t)VNOVAL) uid = pn->mn_uid; else uid = vap->va_uid; if (vap->va_gid == (gid_t)VNOVAL) gid = pn->mn_gid; else gid = vap->va_gid; if (uid != pn->mn_uid || gid != pn->mn_gid) { /* * To modify the ownership of a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td))) return (error); /* * XXXRW: Why is there a privilege check here: shouldn't the * check in VOP_ACCESS() be enough? Also, are the group bits * below definitely right? */ if (((ap->a_cred->cr_uid != pn->mn_uid) || uid != pn->mn_uid || (gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) && (error = priv_check(td, PRIV_MQ_ADMIN)) != 0) return (error); pn->mn_uid = uid; pn->mn_gid = gid; c = 1; } if (vap->va_mode != (mode_t)VNOVAL) { if ((ap->a_cred->cr_uid != pn->mn_uid) && (error = priv_check(td, PRIV_MQ_ADMIN))) return (error); pn->mn_mode = vap->va_mode; c = 1; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { /* See the comment in ufs_vnops::ufs_setattr(). */ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, td)))) return (error); if (vap->va_atime.tv_sec != VNOVAL) { pn->mn_atime = vap->va_atime; } if (vap->va_mtime.tv_sec != VNOVAL) { pn->mn_mtime = vap->va_mtime; } c = 1; } if (c) { vfs_timestamp(&pn->mn_ctime); } return (0); } #if 0 struct vop_read_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; #endif /* * Read from a file */ static int mqfs_read(struct vop_read_args *ap) { char buf[80]; struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct mqueue *mq; int len, error; if (vp->v_type != VREG) return (EINVAL); mq = VTOMQ(vp); snprintf(buf, sizeof(buf), "QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n", mq->mq_totalbytes, mq->mq_maxmsg, mq->mq_curmsgs, mq->mq_msgsize); buf[sizeof(buf)-1] = '\0'; len = strlen(buf); error = uiomove_frombuf(buf, len, uio); return (error); } #if 0 struct vop_readdir_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; }; #endif /* * Return directory entries. */ static int mqfs_readdir(struct vop_readdir_args *ap) { struct vnode *vp; struct mqfs_info *mi; struct mqfs_node *pd; struct mqfs_node *pn; struct dirent entry; struct uio *uio; const void *pr_root; int *tmp_ncookies = NULL; off_t offset; int error, i; vp = ap->a_vp; mi = VFSTOMQFS(vp->v_mount); pd = VTON(vp); uio = ap->a_uio; if (vp->v_type != VDIR) return (ENOTDIR); if (uio->uio_offset < 0) return (EINVAL); if (ap->a_ncookies != NULL) { tmp_ncookies = ap->a_ncookies; *ap->a_ncookies = 0; ap->a_ncookies = NULL; } error = 0; offset = 0; pr_root = ap->a_cred->cr_prison->pr_root; sx_xlock(&mi->mi_lock); LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { entry.d_reclen = sizeof(entry); /* * Only show names within the same prison root directory * (or not associated with a prison, e.g. "." and ".."). */ if (pn->mn_pr_root != NULL && pn->mn_pr_root != pr_root) continue; if (!pn->mn_fileno) mqfs_fileno_alloc(mi, pn); entry.d_fileno = pn->mn_fileno; for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i) entry.d_name[i] = pn->mn_name[i]; entry.d_namlen = i; switch (pn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_this: case mqfstype_parent: entry.d_type = DT_DIR; break; case mqfstype_file: entry.d_type = DT_REG; break; case mqfstype_symlink: entry.d_type = DT_LNK; break; default: panic("%s has unexpected node type: %d", pn->mn_name, pn->mn_type); } dirent_terminate(&entry); if (entry.d_reclen > uio->uio_resid) break; if (offset >= uio->uio_offset) { error = vfs_read_dirent(ap, &entry, offset); if (error) break; } offset += entry.d_reclen; } sx_xunlock(&mi->mi_lock); uio->uio_offset = offset; if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (error); } #ifdef notyet #if 0 struct vop_mkdir_args { struct vnode *a_dvp; struvt vnode **a_vpp; struvt componentname *a_cnp; struct vattr *a_vap; }; #endif /* * Create a directory. */ static int mqfs_mkdir(struct vop_mkdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd = VTON(ap->a_dvp); struct mqfs_node *pn; int error; if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); if ((cnp->cn_flags & HASBUF) == 0) panic("%s: no name", __func__); pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen, ap->a_vap->cn_cred, ap->a_vap->va_mode); if (pn != NULL) mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); if (pn == NULL) { error = ENOSPC; } else { error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); mqnode_release(pn); } return (error); } #if 0 struct vop_rmdir_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * Remove a directory. */ static int mqfs_rmdir(struct vop_rmdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn = VTON(ap->a_vp); struct mqfs_node *pt; if (pn->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); if (pn->mn_deleted) { sx_xunlock(&mqfs->mi_lock); return (ENOENT); } pt = LIST_FIRST(&pn->mn_children); pt = LIST_NEXT(pt, mn_sibling); pt = LIST_NEXT(pt, mn_sibling); if (pt != NULL) { sx_xunlock(&mqfs->mi_lock); return (ENOTEMPTY); } pt = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); mqnode_release(pn); mqnode_release(pt); sx_xunlock(&mqfs->mi_lock); cache_purge(ap->a_vp); return (0); } #endif /* notyet */ /* * See if this prison root is obsolete, and clean up associated queues if it is. */ static int mqfs_prison_remove(void *obj, void *data __unused) { const struct prison *pr = obj; const struct prison *tpr; struct mqfs_node *pn, *tpn; int found; found = 0; TAILQ_FOREACH(tpr, &allprison, pr_list) { if (tpr->pr_root == pr->pr_root && tpr != pr && tpr->pr_ref > 0) found = 1; } if (!found) { /* * No jails are rooted in this directory anymore, * so no queues should be either. */ sx_xlock(&mqfs_data.mi_lock); LIST_FOREACH_SAFE(pn, &mqfs_data.mi_root->mn_children, mn_sibling, tpn) { if (pn->mn_pr_root == pr->pr_root) (void)do_unlink(pn, curthread->td_ucred); } sx_xunlock(&mqfs_data.mi_lock); } return (0); } /* * Allocate a message queue */ static struct mqueue * mqueue_alloc(const struct mq_attr *attr) { struct mqueue *mq; if (curmq >= maxmq) return (NULL); mq = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO); TAILQ_INIT(&mq->mq_msgq); if (attr != NULL) { mq->mq_maxmsg = attr->mq_maxmsg; mq->mq_msgsize = attr->mq_msgsize; } else { mq->mq_maxmsg = default_maxmsg; mq->mq_msgsize = default_msgsize; } mtx_init(&mq->mq_mutex, "mqueue lock", NULL, MTX_DEF); knlist_init_mtx(&mq->mq_rsel.si_note, &mq->mq_mutex); knlist_init_mtx(&mq->mq_wsel.si_note, &mq->mq_mutex); atomic_add_int(&curmq, 1); return (mq); } /* * Destroy a message queue */ static void mqueue_free(struct mqueue *mq) { struct mqueue_msg *msg; while ((msg = TAILQ_FIRST(&mq->mq_msgq)) != NULL) { TAILQ_REMOVE(&mq->mq_msgq, msg, msg_link); free(msg, M_MQUEUEDATA); } mtx_destroy(&mq->mq_mutex); seldrain(&mq->mq_rsel); seldrain(&mq->mq_wsel); knlist_destroy(&mq->mq_rsel.si_note); knlist_destroy(&mq->mq_wsel.si_note); uma_zfree(mqueue_zone, mq); atomic_add_int(&curmq, -1); } /* * Load a message from user space */ static struct mqueue_msg * mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio) { struct mqueue_msg *msg; size_t len; int error; len = sizeof(struct mqueue_msg) + msg_size; msg = malloc(len, M_MQUEUEDATA, M_WAITOK); error = copyin(msg_ptr, ((char *)msg) + sizeof(struct mqueue_msg), msg_size); if (error) { free(msg, M_MQUEUEDATA); msg = NULL; } else { msg->msg_size = msg_size; msg->msg_prio = msg_prio; } return (msg); } /* * Save a message to user space */ static int mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio) { int error; error = copyout(((char *)msg) + sizeof(*msg), msg_ptr, msg->msg_size); if (error == 0 && msg_prio != NULL) error = copyout(&msg->msg_prio, msg_prio, sizeof(int)); return (error); } /* * Free a message's memory */ static __inline void mqueue_freemsg(struct mqueue_msg *msg) { free(msg, M_MQUEUEDATA); } /* * Send a message. if waitok is false, thread will not be * blocked if there is no data in queue, otherwise, absolute * time will be checked. */ int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ts, ts2; struct timeval tv; int error; if (msg_prio >= MQ_PRIO_MAX) return (EINVAL); if (msg_len > mq->mq_msgsize) return (EMSGSIZE); msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio); if (msg == NULL) return (EFAULT); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_send(mq, msg, -1); if (error) goto bad; return (0); } /* we allow a null timeout (wait forever) */ if (abs_timeout == NULL) { error = _mqueue_send(mq, msg, 0); if (error) goto bad; return (0); } /* send it before checking time */ error = _mqueue_send(mq, msg, -1); if (error == 0) return (0); if (error != EAGAIN) goto bad; if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) { error = EINVAL; goto bad; } for (;;) { getnanotime(&ts); timespecsub(abs_timeout, &ts, &ts2); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; break; } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_send(mq, msg, tvtohz(&tv)); if (error != ETIMEDOUT) break; } if (error == 0) return (0); bad: mqueue_freemsg(msg); return (error); } /* * Common routine to send a message */ static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo) { struct mqueue_msg *msg2; int error = 0; mtx_lock(&mq->mq_mutex); while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_senders++; error = msleep(&mq->mq_senders, &mq->mq_mutex, PCATCH, "mqsend", timo); mq->mq_senders--; if (error == EAGAIN) error = ETIMEDOUT; } if (mq->mq_curmsgs >= mq->mq_maxmsg) { mtx_unlock(&mq->mq_mutex); return (error); } error = 0; if (TAILQ_EMPTY(&mq->mq_msgq)) { TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link); } else { if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) { TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link); } else { TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) { if (msg2->msg_prio < msg->msg_prio) break; } TAILQ_INSERT_BEFORE(msg2, msg, msg_link); } } mq->mq_curmsgs++; mq->mq_totalbytes += msg->msg_size; if (mq->mq_receivers) wakeup_one(&mq->mq_receivers); else if (mq->mq_notifier != NULL) mqueue_send_notification(mq); if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } KNOTE_LOCKED(&mq->mq_rsel.si_note, 0); mtx_unlock(&mq->mq_mutex); return (0); } /* * Send realtime a signal to process which registered itself * successfully by mq_notify. */ static void mqueue_send_notification(struct mqueue *mq) { struct mqueue_notifier *nt; struct thread *td; struct proc *p; int error; mtx_assert(&mq->mq_mutex, MA_OWNED); nt = mq->mq_notifier; if (nt->nt_sigev.sigev_notify != SIGEV_NONE) { p = nt->nt_proc; error = sigev_findtd(p, &nt->nt_sigev, &td); if (error) { mq->mq_notifier = NULL; return; } if (!KSI_ONQ(&nt->nt_ksi)) { ksiginfo_set_sigev(&nt->nt_ksi, &nt->nt_sigev); tdsendsignal(p, td, nt->nt_ksi.ksi_signo, &nt->nt_ksi); } PROC_UNLOCK(p); } mq->mq_notifier = NULL; } /* * Get a message. if waitok is false, thread will not be * blocked if there is no data in queue, otherwise, absolute * time will be checked. */ int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ts, ts2; struct timeval tv; int error; if (msg_len < mq->mq_msgsize) return (EMSGSIZE); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_recv(mq, &msg, -1); if (error) return (error); goto received; } /* we allow a null timeout (wait forever). */ if (abs_timeout == NULL) { error = _mqueue_recv(mq, &msg, 0); if (error) return (error); goto received; } /* try to get a message before checking time */ error = _mqueue_recv(mq, &msg, -1); if (error == 0) goto received; if (error != EAGAIN) return (error); if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) { error = EINVAL; return (error); } for (;;) { getnanotime(&ts); timespecsub(abs_timeout, &ts, &ts2); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; return (error); } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_recv(mq, &msg, tvtohz(&tv)); if (error == 0) break; if (error != ETIMEDOUT) return (error); } received: error = mqueue_savemsg(msg, msg_ptr, msg_prio); if (error == 0) { curthread->td_retval[0] = msg->msg_size; curthread->td_retval[1] = 0; } mqueue_freemsg(msg); return (error); } /* * Common routine to receive a message */ static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo) { int error = 0; mtx_lock(&mq->mq_mutex); while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_receivers++; error = msleep(&mq->mq_receivers, &mq->mq_mutex, PCATCH, "mqrecv", timo); mq->mq_receivers--; if (error == EAGAIN) error = ETIMEDOUT; } if (*msg != NULL) { error = 0; TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link); mq->mq_curmsgs--; mq->mq_totalbytes -= (*msg)->msg_size; if (mq->mq_senders) wakeup_one(&mq->mq_senders); if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } KNOTE_LOCKED(&mq->mq_wsel.si_note, 0); } if (mq->mq_notifier != NULL && mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) { mqueue_send_notification(mq); } mtx_unlock(&mq->mq_mutex); return (error); } static __inline struct mqueue_notifier * notifier_alloc(void) { return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO)); } static __inline void notifier_free(struct mqueue_notifier *p) { uma_zfree(mqnoti_zone, p); } static struct mqueue_notifier * notifier_search(struct proc *p, int fd) { struct mqueue_notifier *nt; LIST_FOREACH(nt, &p->p_mqnotifier, nt_link) { if (nt->nt_ksi.ksi_mqd == fd) break; } return (nt); } static __inline void notifier_insert(struct proc *p, struct mqueue_notifier *nt) { LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link); } static __inline void notifier_delete(struct proc *p, struct mqueue_notifier *nt) { LIST_REMOVE(nt, nt_link); notifier_free(nt); } static void notifier_remove(struct proc *p, struct mqueue *mq, int fd) { struct mqueue_notifier *nt; mtx_assert(&mq->mq_mutex, MA_OWNED); PROC_LOCK(p); nt = notifier_search(p, fd); if (nt != NULL) { if (mq->mq_notifier == nt) mq->mq_notifier = NULL; sigqueue_take(&nt->nt_ksi); notifier_delete(p, nt); } PROC_UNLOCK(p); } static int kern_kmq_open(struct thread *td, const char *upath, int flags, mode_t mode, const struct mq_attr *attr) { char path[MQFS_NAMELEN + 1]; struct mqfs_node *pn; struct filedesc *fdp; struct file *fp; struct mqueue *mq; int fd, error, len, cmode; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); fdp = td->td_proc->p_fd; cmode = (((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT); mq = NULL; if ((flags & O_CREAT) != 0 && attr != NULL) { if (attr->mq_maxmsg <= 0 || attr->mq_maxmsg > maxmsg) return (EINVAL); if (attr->mq_msgsize <= 0 || attr->mq_msgsize > maxmsgsize) return (EINVAL); } error = copyinstr(upath, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); /* * The first character of name must be a slash (/) character * and the remaining characters of name cannot include any slash * characters. */ len = strlen(path); if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL) return (EINVAL); /* * "." and ".." are magic directories, populated on the fly, and cannot * be opened as queues. */ if (strcmp(path, "/.") == 0 || strcmp(path, "/..") == 0) return (EINVAL); AUDIT_ARG_UPATH1_CANON(path); error = falloc(td, &fp, &fd, O_CLOEXEC); if (error) return (error); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred); if (pn == NULL) { if (!(flags & O_CREAT)) { error = ENOENT; } else { mq = mqueue_alloc(attr); if (mq == NULL) { error = ENFILE; } else { pn = mqfs_create_file(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred, cmode); if (pn == NULL) { error = ENOSPC; mqueue_free(mq); } } } if (error == 0) { pn->mn_data = mq; } } else { if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) { error = EEXIST; } else { accmode_t accmode = 0; if (flags & FREAD) accmode |= VREAD; if (flags & FWRITE) accmode |= VWRITE; error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, accmode, td->td_ucred); } } if (error) { sx_xunlock(&mqfs_data.mi_lock); fdclose(td, fp, fd); fdrop(fp, td); return (error); } mqnode_addref(pn); sx_xunlock(&mqfs_data.mi_lock); finit(fp, flags & (FREAD | FWRITE | O_NONBLOCK), DTYPE_MQUEUE, pn, &mqueueops); td->td_retval[0] = fd; fdrop(fp, td); return (0); } /* * Syscall to open a message queue. */ int sys_kmq_open(struct thread *td, struct kmq_open_args *uap) { struct mq_attr attr; int flags, error; if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC) return (EINVAL); flags = FFLAGS(uap->flags); if ((flags & O_CREAT) != 0 && uap->attr != NULL) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error) return (error); } return (kern_kmq_open(td, uap->path, flags, uap->mode, uap->attr != NULL ? &attr : NULL)); } /* * Syscall to unlink a message queue. */ int sys_kmq_unlink(struct thread *td, struct kmq_unlink_args *uap) { char path[MQFS_NAMELEN+1]; struct mqfs_node *pn; int error, len; error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); len = strlen(path); if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL) return (EINVAL); if (strcmp(path, "/.") == 0 || strcmp(path, "/..") == 0) return (EINVAL); AUDIT_ARG_UPATH1_CANON(path); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred); if (pn != NULL) error = do_unlink(pn, td->td_ucred); else error = ENOENT; sx_xunlock(&mqfs_data.mi_lock); return (error); } typedef int (*_fgetf)(struct thread *, int, cap_rights_t *, struct file **); /* * Get message queue by giving file slot */ static int _getmq(struct thread *td, int fd, cap_rights_t *rightsp, _fgetf func, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { struct mqfs_node *pn; int error; error = func(td, fd, rightsp, fpp); if (error) return (error); if (&mqueueops != (*fpp)->f_ops) { fdrop(*fpp, td); return (EBADF); } pn = (*fpp)->f_data; if (ppn) *ppn = pn; if (pmq) *pmq = pn->mn_data; return (0); } static __inline int getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, &cap_event_rights, fget, fpp, ppn, pmq); } static __inline int getmq_read(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, &cap_read_rights, fget_read, fpp, ppn, pmq); } static __inline int getmq_write(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, &cap_write_rights, fget_write, fpp, ppn, pmq); } static int kern_kmq_setattr(struct thread *td, int mqd, const struct mq_attr *attr, struct mq_attr *oattr) { struct mqueue *mq; struct file *fp; u_int oflag, flag; int error; AUDIT_ARG_FD(mqd); if (attr != NULL && (attr->mq_flags & ~O_NONBLOCK) != 0) return (EINVAL); error = getmq(td, mqd, &fp, NULL, &mq); if (error) return (error); oattr->mq_maxmsg = mq->mq_maxmsg; oattr->mq_msgsize = mq->mq_msgsize; oattr->mq_curmsgs = mq->mq_curmsgs; if (attr != NULL) { do { oflag = flag = fp->f_flag; flag &= ~O_NONBLOCK; flag |= (attr->mq_flags & O_NONBLOCK); } while (atomic_cmpset_int(&fp->f_flag, oflag, flag) == 0); } else oflag = fp->f_flag; oattr->mq_flags = (O_NONBLOCK & oflag); fdrop(fp, td); return (error); } int sys_kmq_setattr(struct thread *td, struct kmq_setattr_args *uap) { struct mq_attr attr, oattr; int error; if (uap->attr != NULL) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error != 0) return (error); } error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL, &oattr); if (error == 0 && uap->oattr != NULL) { bzero(oattr.__reserved, sizeof(oattr.__reserved)); error = copyout(&oattr, uap->oattr, sizeof(oattr)); } return (error); } int sys_kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap) { struct mqueue *mq; struct file *fp; struct timespec *abs_timeout, ets; int error; int waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_read(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets, sizeof(ets)); if (error != 0) goto out; abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); out: fdrop(fp, td); return (error); } int sys_kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap) { struct mqueue *mq; struct file *fp; struct timespec *abs_timeout, ets; int error, waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_write(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets, sizeof(ets)); if (error != 0) goto out; abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_send(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); out: fdrop(fp, td); return (error); } static int kern_kmq_notify(struct thread *td, int mqd, struct sigevent *sigev) { struct filedesc *fdp; struct proc *p; struct mqueue *mq; struct file *fp, *fp2; struct mqueue_notifier *nt, *newnt = NULL; int error; AUDIT_ARG_FD(mqd); if (sigev != NULL) { if (sigev->sigev_notify != SIGEV_SIGNAL && sigev->sigev_notify != SIGEV_THREAD_ID && sigev->sigev_notify != SIGEV_NONE) return (EINVAL); if ((sigev->sigev_notify == SIGEV_SIGNAL || sigev->sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(sigev->sigev_signo)) return (EINVAL); } p = td->td_proc; fdp = td->td_proc->p_fd; error = getmq(td, mqd, &fp, NULL, &mq); if (error) return (error); again: FILEDESC_SLOCK(fdp); fp2 = fget_locked(fdp, mqd); if (fp2 == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; goto out; } #ifdef CAPABILITIES error = cap_check(cap_rights(fdp, mqd), &cap_event_rights); if (error) { FILEDESC_SUNLOCK(fdp); goto out; } #endif if (fp2 != fp) { FILEDESC_SUNLOCK(fdp); error = EBADF; goto out; } mtx_lock(&mq->mq_mutex); FILEDESC_SUNLOCK(fdp); if (sigev != NULL) { if (mq->mq_notifier != NULL) { error = EBUSY; } else { PROC_LOCK(p); nt = notifier_search(p, mqd); if (nt == NULL) { if (newnt == NULL) { PROC_UNLOCK(p); mtx_unlock(&mq->mq_mutex); newnt = notifier_alloc(); goto again; } } if (nt != NULL) { sigqueue_take(&nt->nt_ksi); if (newnt != NULL) { notifier_free(newnt); newnt = NULL; } } else { nt = newnt; newnt = NULL; ksiginfo_init(&nt->nt_ksi); nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT; nt->nt_ksi.ksi_code = SI_MESGQ; nt->nt_proc = p; nt->nt_ksi.ksi_mqd = mqd; notifier_insert(p, nt); } nt->nt_sigev = *sigev; mq->mq_notifier = nt; PROC_UNLOCK(p); /* * if there is no receivers and message queue * is not empty, we should send notification * as soon as possible. */ if (mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) mqueue_send_notification(mq); } } else { notifier_remove(p, mq, mqd); } mtx_unlock(&mq->mq_mutex); out: fdrop(fp, td); if (newnt != NULL) notifier_free(newnt); return (error); } int sys_kmq_notify(struct thread *td, struct kmq_notify_args *uap) { struct sigevent ev, *evp; int error; if (uap->sigev == NULL) { evp = NULL; } else { error = copyin(uap->sigev, &ev, sizeof(ev)); if (error != 0) return (error); evp = &ev; } return (kern_kmq_notify(td, uap->mqd, evp)); } static void mqueue_fdclose(struct thread *td, int fd, struct file *fp) { struct mqueue *mq; #ifdef INVARIANTS struct filedesc *fdp; fdp = td->td_proc->p_fd; FILEDESC_LOCK_ASSERT(fdp); #endif if (fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(td->td_proc, mq, fd); /* have to wakeup thread in same process */ if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } mtx_unlock(&mq->mq_mutex); } } static void mq_proc_exit(void *arg __unused, struct proc *p) { struct filedesc *fdp; struct file *fp; struct mqueue *mq; int i; fdp = p->p_fd; FILEDESC_SLOCK(fdp); for (i = 0; i < fdp->fd_nfiles; ++i) { fp = fget_locked(fdp, i); if (fp != NULL && fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(p, FPTOMQ(fp), i); mtx_unlock(&mq->mq_mutex); } } FILEDESC_SUNLOCK(fdp); KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left")); } static int mqf_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct mqueue *mq = FPTOMQ(fp); int revents = 0; mtx_lock(&mq->mq_mutex); if (events & (POLLIN | POLLRDNORM)) { if (mq->mq_curmsgs) { revents |= events & (POLLIN | POLLRDNORM); } else { mq->mq_flags |= MQ_RSEL; selrecord(td, &mq->mq_rsel); } } if (events & POLLOUT) { if (mq->mq_curmsgs < mq->mq_maxmsg) revents |= POLLOUT; else { mq->mq_flags |= MQ_WSEL; selrecord(td, &mq->mq_wsel); } } mtx_unlock(&mq->mq_mutex); return (revents); } static int mqf_close(struct file *fp, struct thread *td) { struct mqfs_node *pn; fp->f_ops = &badfileops; pn = fp->f_data; fp->f_data = NULL; sx_xlock(&mqfs_data.mi_lock); mqnode_release(pn); sx_xunlock(&mqfs_data.mi_lock); return (0); } static int mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn = fp->f_data; bzero(st, sizeof *st); sx_xlock(&mqfs_data.mi_lock); st->st_atim = pn->mn_atime; st->st_mtim = pn->mn_mtime; st->st_ctim = pn->mn_ctime; st->st_birthtim = pn->mn_birth; st->st_uid = pn->mn_uid; st->st_gid = pn->mn_gid; st->st_mode = S_IFIFO | pn->mn_mode; sx_xunlock(&mqfs_data.mi_lock); return (0); } static int mqf_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn; int error; error = 0; pn = fp->f_data; sx_xlock(&mqfs_data.mi_lock); error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, VADMIN, active_cred); if (error != 0) goto out; pn->mn_mode = mode & ACCESSPERMS; out: sx_xunlock(&mqfs_data.mi_lock); return (error); } static int mqf_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn; int error; error = 0; pn = fp->f_data; sx_xlock(&mqfs_data.mi_lock); if (uid == (uid_t)-1) uid = pn->mn_uid; if (gid == (gid_t)-1) gid = pn->mn_gid; if (((uid != pn->mn_uid && uid != active_cred->cr_uid) || (gid != pn->mn_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN))) goto out; pn->mn_uid = uid; pn->mn_gid = gid; out: sx_xunlock(&mqfs_data.mi_lock); return (error); } static int mqf_kqfilter(struct file *fp, struct knote *kn) { struct mqueue *mq = FPTOMQ(fp); int error = 0; if (kn->kn_filter == EVFILT_READ) { kn->kn_fop = &mq_rfiltops; knlist_add(&mq->mq_rsel.si_note, kn, 0); } else if (kn->kn_filter == EVFILT_WRITE) { kn->kn_fop = &mq_wfiltops; knlist_add(&mq->mq_wsel.si_note, kn, 0); } else error = EINVAL; return (error); } static void filt_mqdetach(struct knote *kn) { struct mqueue *mq = FPTOMQ(kn->kn_fp); if (kn->kn_filter == EVFILT_READ) knlist_remove(&mq->mq_rsel.si_note, kn, 0); else if (kn->kn_filter == EVFILT_WRITE) knlist_remove(&mq->mq_wsel.si_note, kn, 0); else panic("filt_mqdetach"); } static int filt_mqread(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs != 0); } static int filt_mqwrite(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs < mq->mq_maxmsg); } static int mqf_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { kif->kf_type = KF_TYPE_MQUEUE; return (0); } static struct fileops mqueueops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = mqf_poll, .fo_kqfilter = mqf_kqfilter, .fo_stat = mqf_stat, .fo_close = mqf_close, .fo_chmod = mqf_chmod, .fo_chown = mqf_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = mqf_fill_kinfo, .fo_flags = DFLAG_PASSABLE, }; static struct vop_vector mqfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = mqfs_access, .vop_cachedlookup = mqfs_lookup, .vop_lookup = vfs_cache_lookup, .vop_reclaim = mqfs_reclaim, .vop_create = mqfs_create, .vop_remove = mqfs_remove, .vop_inactive = mqfs_inactive, .vop_open = mqfs_open, .vop_close = mqfs_close, .vop_getattr = mqfs_getattr, .vop_setattr = mqfs_setattr, .vop_read = mqfs_read, .vop_write = VOP_EOPNOTSUPP, .vop_readdir = mqfs_readdir, .vop_mkdir = VOP_EOPNOTSUPP, .vop_rmdir = VOP_EOPNOTSUPP }; VFS_VOP_VECTOR_REGISTER(mqfs_vnodeops); static struct vfsops mqfs_vfsops = { .vfs_init = mqfs_init, .vfs_uninit = mqfs_uninit, .vfs_mount = mqfs_mount, .vfs_unmount = mqfs_unmount, .vfs_root = mqfs_root, .vfs_statfs = mqfs_statfs, }; static struct vfsconf mqueuefs_vfsconf = { .vfc_version = VFS_VERSION, .vfc_name = "mqueuefs", .vfc_vfsops = &mqfs_vfsops, .vfc_typenum = -1, .vfc_flags = VFCF_SYNTHETIC }; static struct syscall_helper_data mq_syscalls[] = { SYSCALL_INIT_HELPER(kmq_open), SYSCALL_INIT_HELPER_F(kmq_setattr, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_timedsend, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_timedreceive, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_notify, SYF_CAPENABLED), SYSCALL_INIT_HELPER(kmq_unlink), SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include static void mq_attr_from32(const struct mq_attr32 *from, struct mq_attr *to) { to->mq_flags = from->mq_flags; to->mq_maxmsg = from->mq_maxmsg; to->mq_msgsize = from->mq_msgsize; to->mq_curmsgs = from->mq_curmsgs; } static void mq_attr_to32(const struct mq_attr *from, struct mq_attr32 *to) { to->mq_flags = from->mq_flags; to->mq_maxmsg = from->mq_maxmsg; to->mq_msgsize = from->mq_msgsize; to->mq_curmsgs = from->mq_curmsgs; } int freebsd32_kmq_open(struct thread *td, struct freebsd32_kmq_open_args *uap) { struct mq_attr attr; struct mq_attr32 attr32; int flags, error; if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC) return (EINVAL); flags = FFLAGS(uap->flags); if ((flags & O_CREAT) != 0 && uap->attr != NULL) { error = copyin(uap->attr, &attr32, sizeof(attr32)); if (error) return (error); mq_attr_from32(&attr32, &attr); } return (kern_kmq_open(td, uap->path, flags, uap->mode, uap->attr != NULL ? &attr : NULL)); } int freebsd32_kmq_setattr(struct thread *td, struct freebsd32_kmq_setattr_args *uap) { struct mq_attr attr, oattr; struct mq_attr32 attr32, oattr32; int error; if (uap->attr != NULL) { error = copyin(uap->attr, &attr32, sizeof(attr32)); if (error != 0) return (error); mq_attr_from32(&attr32, &attr); } error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL, &oattr); if (error == 0 && uap->oattr != NULL) { mq_attr_to32(&oattr, &oattr32); bzero(oattr32.__reserved, sizeof(oattr32.__reserved)); error = copyout(&oattr32, uap->oattr, sizeof(oattr32)); } return (error); } int freebsd32_kmq_timedsend(struct thread *td, struct freebsd32_kmq_timedsend_args *uap) { struct mqueue *mq; struct file *fp; struct timespec32 ets32; struct timespec *abs_timeout, ets; int error; int waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_write(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets32, sizeof(ets32)); if (error != 0) goto out; CP(ets32, ets, tv_sec); CP(ets32, ets, tv_nsec); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_send(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); out: fdrop(fp, td); return (error); } int freebsd32_kmq_timedreceive(struct thread *td, struct freebsd32_kmq_timedreceive_args *uap) { struct mqueue *mq; struct file *fp; struct timespec32 ets32; struct timespec *abs_timeout, ets; int error, waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_read(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets32, sizeof(ets32)); if (error != 0) goto out; CP(ets32, ets, tv_sec); CP(ets32, ets, tv_nsec); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); out: fdrop(fp, td); return (error); } int freebsd32_kmq_notify(struct thread *td, struct freebsd32_kmq_notify_args *uap) { struct sigevent ev, *evp; struct sigevent32 ev32; int error; if (uap->sigev == NULL) { evp = NULL; } else { error = copyin(uap->sigev, &ev32, sizeof(ev32)); if (error != 0) return (error); error = convert_sigevent32(&ev32, &ev); if (error != 0) return (error); evp = &ev; } return (kern_kmq_notify(td, uap->mqd, evp)); } static struct syscall_helper_data mq32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_kmq_open), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_setattr, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedsend, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedreceive, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_notify, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_COMPAT(kmq_unlink), SYSCALL_INIT_LAST }; #endif static int mqinit(void) { int error; error = syscall_helper_register(mq_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(mq32_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #endif return (0); } static int mqunload(void) { #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(mq32_syscalls); #endif syscall_helper_unregister(mq_syscalls); return (0); } static int mq_modload(struct module *module, int cmd, void *arg) { int error = 0; error = vfs_modevent(module, cmd, arg); if (error != 0) return (error); switch (cmd) { case MOD_LOAD: error = mqinit(); if (error != 0) mqunload(); break; case MOD_UNLOAD: error = mqunload(); break; default: break; } return (error); } static moduledata_t mqueuefs_mod = { "mqueuefs", mq_modload, &mqueuefs_vfsconf }; DECLARE_MODULE(mqueuefs, mqueuefs_mod, SI_SUB_VFS, SI_ORDER_MIDDLE); MODULE_VERSION(mqueuefs, 1); Index: projects/clang1100-import/sys/kern/vfs_cache.c =================================================================== --- projects/clang1100-import/sys/kern/vfs_cache.c (revision 364278) +++ projects/clang1100-import/sys/kern/vfs_cache.c (revision 364279) @@ -1,4181 +1,4179 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Poul-Henning Kamp of the FreeBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #ifdef DDB #include #endif #include SDT_PROVIDER_DECLARE(vfs); SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *", "char *"); SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *"); SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *"); SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int", "struct vnode *", "char *"); SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative, "struct vnode *", "char *"); SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *", "char *"); SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *"); SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *"); SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *", "struct vnode *"); SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *", "char *"); SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *", "char *"); SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool"); SDT_PROBE_DECLARE(vfs, namei, lookup, entry); SDT_PROBE_DECLARE(vfs, namei, lookup, return); /* * This structure describes the elements in the cache of recent * names looked up by namei. */ struct negstate { u_char neg_flag; }; _Static_assert(sizeof(struct negstate) <= sizeof(struct vnode *), "the state must fit in a union with a pointer without growing it"); struct namecache { LIST_ENTRY(namecache) nc_src; /* source vnode list */ TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ CK_SLIST_ENTRY(namecache) nc_hash;/* hash chain */ struct vnode *nc_dvp; /* vnode of parent of name */ union { struct vnode *nu_vp; /* vnode the name refers to */ struct negstate nu_neg;/* negative entry state */ } n_un; u_char nc_flag; /* flag bits */ u_char nc_nlen; /* length of name */ char nc_name[0]; /* segment name + nul */ }; /* * struct namecache_ts repeats struct namecache layout up to the * nc_nlen member. * struct namecache_ts is used in place of struct namecache when time(s) need * to be stored. The nc_dotdottime field is used when a cache entry is mapping * both a non-dotdot directory name plus dotdot for the directory's * parent. * * See below for alignment requirement. */ struct namecache_ts { struct timespec nc_time; /* timespec provided by fs */ struct timespec nc_dotdottime; /* dotdot timespec provided by fs */ int nc_ticks; /* ticks value when entry was added */ struct namecache nc_nc; }; /* * At least mips n32 performs 64-bit accesses to timespec as found * in namecache_ts and requires them to be aligned. Since others * may be in the same spot suffer a little bit and enforce the * alignment for everyone. Note this is a nop for 64-bit platforms. */ #define CACHE_ZONE_ALIGNMENT UMA_ALIGNOF(time_t) #define CACHE_PATH_CUTOFF 39 #define CACHE_ZONE_SMALL_SIZE (sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1) #define CACHE_ZONE_SMALL_TS_SIZE (sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1) #define CACHE_ZONE_LARGE_SIZE (sizeof(struct namecache) + NAME_MAX + 1) #define CACHE_ZONE_LARGE_TS_SIZE (sizeof(struct namecache_ts) + NAME_MAX + 1) _Static_assert((CACHE_ZONE_SMALL_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); _Static_assert((CACHE_ZONE_SMALL_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); _Static_assert((CACHE_ZONE_LARGE_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); _Static_assert((CACHE_ZONE_LARGE_TS_SIZE % (CACHE_ZONE_ALIGNMENT + 1)) == 0, "bad zone size"); #define nc_vp n_un.nu_vp #define nc_neg n_un.nu_neg /* * Flags in namecache.nc_flag */ #define NCF_WHITE 0x01 #define NCF_ISDOTDOT 0x02 #define NCF_TS 0x04 #define NCF_DTS 0x08 #define NCF_DVDROP 0x10 #define NCF_NEGATIVE 0x20 #define NCF_INVALID 0x40 #define NCF_WIP 0x80 /* * Flags in negstate.neg_flag */ #define NEG_HOT 0x01 /* * Mark an entry as invalid. * * This is called before it starts getting deconstructed. */ static void cache_ncp_invalidate(struct namecache *ncp) { KASSERT((ncp->nc_flag & NCF_INVALID) == 0, ("%s: entry %p already invalid", __func__, ncp)); atomic_store_char(&ncp->nc_flag, ncp->nc_flag | NCF_INVALID); atomic_thread_fence_rel(); } /* * Check whether the entry can be safely used. * * All places which elide locks are supposed to call this after they are * done with reading from an entry. */ static bool cache_ncp_canuse(struct namecache *ncp) { atomic_thread_fence_acq(); return ((atomic_load_char(&ncp->nc_flag) & (NCF_INVALID | NCF_WIP)) == 0); } /* * Name caching works as follows: * * Names found by directory scans are retained in a cache * for future reference. It is managed LRU, so frequently * used names will hang around. Cache is indexed by hash value * obtained from (dvp, name) where dvp refers to the directory * containing name. * * If it is a "negative" entry, (i.e. for a name that is known NOT to * exist) the vnode pointer will be NULL. * * Upon reaching the last segment of a path, if the reference * is for DELETE, or NOCACHE is set (rewrite), and the * name is located in the cache, it will be dropped. * * These locks are used (in the order in which they can be taken): * NAME TYPE ROLE * vnodelock mtx vnode lists and v_cache_dd field protection * bucketlock rwlock for access to given set of hash buckets * neglist mtx negative entry LRU management * * Additionally, ncneg_shrink_lock mtx is used to have at most one thread * shrinking the LRU list. * * It is legal to take multiple vnodelock and bucketlock locks. The locking * order is lower address first. Both are recursive. * * "." lookups are lockless. * * ".." and vnode -> name lookups require vnodelock. * * name -> vnode lookup requires the relevant bucketlock to be held for reading. * * Insertions and removals of entries require involved vnodes and bucketlocks * to be write-locked to prevent other threads from seeing the entry. * * Some lookups result in removal of the found entry (e.g. getting rid of a * negative entry with the intent to create a positive one), which poses a * problem when multiple threads reach the state. Similarly, two different * threads can purge two different vnodes and try to remove the same name. * * If the already held vnode lock is lower than the second required lock, we * can just take the other lock. However, in the opposite case, this could * deadlock. As such, this is resolved by trylocking and if that fails unlocking * the first node, locking everything in order and revalidating the state. */ VFS_SMR_DECLARE; /* * Structures associated with name caching. */ #define NCHHASH(hash) \ (&nchashtbl[(hash) & nchash]) static __read_mostly CK_SLIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ static u_long __read_mostly nchash; /* size of hash table */ SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "Size of namecache hash table"); static u_long __read_mostly ncnegfactor = 5; /* ratio of negative entries */ SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, "Ratio of negative namecache entries"); static u_long __exclusive_cache_line numneg; /* number of negative entries allocated */ static u_long __exclusive_cache_line numcache;/* number of cache entries allocated */ u_int ncsizefactor = 2; SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0, "Size factor for namecache"); static u_int __read_mostly ncpurgeminvnodes; SYSCTL_UINT(_vfs, OID_AUTO, ncpurgeminvnodes, CTLFLAG_RW, &ncpurgeminvnodes, 0, "Number of vnodes below which purgevfs ignores the request"); static u_int __read_mostly ncsize; /* the size as computed on creation or resizing */ struct nchstats nchstats; /* cache effectiveness statistics */ static struct mtx __exclusive_cache_line ncneg_shrink_lock; struct neglist { struct mtx nl_lock; TAILQ_HEAD(, namecache) nl_list; } __aligned(CACHE_LINE_SIZE); static struct neglist __read_mostly *neglists; static struct neglist ncneg_hot; static u_long numhotneg; #define ncneghash 3 #define numneglists (ncneghash + 1) static inline struct neglist * NCP2NEGLIST(struct namecache *ncp) { return (&neglists[(((uintptr_t)(ncp) >> 8) & ncneghash)]); } static inline struct negstate * NCP2NEGSTATE(struct namecache *ncp) { MPASS(ncp->nc_flag & NCF_NEGATIVE); return (&ncp->nc_neg); } #define numbucketlocks (ncbuckethash + 1) static u_int __read_mostly ncbuckethash; static struct rwlock_padalign __read_mostly *bucketlocks; #define HASH2BUCKETLOCK(hash) \ ((struct rwlock *)(&bucketlocks[((hash) & ncbuckethash)])) #define numvnodelocks (ncvnodehash + 1) static u_int __read_mostly ncvnodehash; static struct mtx __read_mostly *vnodelocks; static inline struct mtx * VP2VNODELOCK(struct vnode *vp) { return (&vnodelocks[(((uintptr_t)(vp) >> 8) & ncvnodehash)]); } /* * UMA zones for the VFS cache. * * The small cache is used for entries with short names, which are the * most common. The large cache is used for entries which are too big to * fit in the small cache. */ static uma_zone_t __read_mostly cache_zone_small; static uma_zone_t __read_mostly cache_zone_small_ts; static uma_zone_t __read_mostly cache_zone_large; static uma_zone_t __read_mostly cache_zone_large_ts; static struct namecache * cache_alloc(int len, int ts) { struct namecache_ts *ncp_ts; struct namecache *ncp; if (__predict_false(ts)) { if (len <= CACHE_PATH_CUTOFF) ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK); else ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK); ncp = &ncp_ts->nc_nc; } else { if (len <= CACHE_PATH_CUTOFF) ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK); else ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK); } return (ncp); } static void cache_free(struct namecache *ncp) { struct namecache_ts *ncp_ts; if (ncp == NULL) return; if ((ncp->nc_flag & NCF_DVDROP) != 0) vdrop(ncp->nc_dvp); if (__predict_false(ncp->nc_flag & NCF_TS)) { ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) uma_zfree_smr(cache_zone_small_ts, ncp_ts); else uma_zfree_smr(cache_zone_large_ts, ncp_ts); } else { if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) uma_zfree_smr(cache_zone_small, ncp); else uma_zfree_smr(cache_zone_large, ncp); } } static void cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp) { struct namecache_ts *ncp_ts; KASSERT((ncp->nc_flag & NCF_TS) != 0 || (tsp == NULL && ticksp == NULL), ("No NCF_TS")); if (tsp == NULL && ticksp == NULL) return; ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); if (tsp != NULL) *tsp = ncp_ts->nc_time; if (ticksp != NULL) *ticksp = ncp_ts->nc_ticks; } #ifdef DEBUG_CACHE static int __read_mostly doingcache = 1; /* 1 => enable the cache */ SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "VFS namecache enabled"); #endif /* Export size information to userland */ SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, sizeof(struct namecache), "sizeof(struct namecache)"); /* * The new name cache statistics */ static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Name cache statistics"); #define STATNODE_ULONG(name, descr) \ SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr); #define STATNODE_COUNTER(name, descr) \ static COUNTER_U64_DEFINE_EARLY(name); \ SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, \ descr); STATNODE_ULONG(numneg, "Number of negative cache entries"); STATNODE_ULONG(numcache, "Number of cache entries"); STATNODE_COUNTER(numcachehv, "Number of namecache entries with vnodes held"); STATNODE_COUNTER(numdrops, "Number of dropped entries due to reaching the limit"); STATNODE_COUNTER(dothits, "Number of '.' hits"); STATNODE_COUNTER(dotdothits, "Number of '..' hits"); STATNODE_COUNTER(nummiss, "Number of cache misses"); STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache"); STATNODE_COUNTER(numposzaps, "Number of cache hits (positive) we do not want to cache"); STATNODE_COUNTER(numposhits, "Number of cache hits (positive)"); STATNODE_COUNTER(numnegzaps, "Number of cache hits (negative) we do not want to cache"); STATNODE_COUNTER(numneghits, "Number of cache hits (negative)"); /* These count for vn_getcwd(), too. */ STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls"); STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)"); STATNODE_COUNTER(numfullpathfail2, "Number of fullpath search errors (VOP_VPTOCNP failures)"); STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)"); STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls"); STATNODE_COUNTER(zap_and_exit_bucket_relock_success, "Number of successful removals after relocking"); static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail, "Number of times zap_and_exit failed to lock"); static long zap_and_exit_bucket_fail2; STATNODE_ULONG(zap_and_exit_bucket_fail2, "Number of times zap_and_exit failed to lock"); static long cache_lock_vnodes_cel_3_failures; STATNODE_ULONG(cache_lock_vnodes_cel_3_failures, "Number of times 3-way vnode locking failed"); STATNODE_ULONG(numhotneg, "Number of hot negative entries"); STATNODE_COUNTER(numneg_evicted, "Number of negative entries evicted when adding a new entry"); STATNODE_COUNTER(shrinking_skipped, "Number of times shrinking was already in progress"); static void cache_zap_locked(struct namecache *ncp); static int vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf, char **freebuf, size_t *buflen); static int vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *buflen); static int vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend); static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries"); static int cache_yield; SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0, "Number of times cache called yield"); static void __noinline cache_maybe_yield(void) { if (should_yield()) { cache_yield++; kern_yield(PRI_USER); } } static inline void cache_assert_vlp_locked(struct mtx *vlp) { if (vlp != NULL) mtx_assert(vlp, MA_OWNED); } static inline void cache_assert_vnode_locked(struct vnode *vp) { struct mtx *vlp; vlp = VP2VNODELOCK(vp); cache_assert_vlp_locked(vlp); } /* * TODO: With the value stored we can do better than computing the hash based * on the address and the choice of FNV should also be revisisted. */ static void cache_prehash(struct vnode *vp) { vp->v_nchash = fnv_32_buf(&vp, sizeof(vp), FNV1_32_INIT); } static uint32_t cache_get_hash(char *name, u_char len, struct vnode *dvp) { return (fnv_32_buf(name, len, dvp->v_nchash)); } static inline struct nchashhead * NCP2BUCKET(struct namecache *ncp) { uint32_t hash; hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); return (NCHHASH(hash)); } static inline struct rwlock * NCP2BUCKETLOCK(struct namecache *ncp) { uint32_t hash; hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); return (HASH2BUCKETLOCK(hash)); } #ifdef INVARIANTS static void cache_assert_bucket_locked(struct namecache *ncp, int mode) { struct rwlock *blp; blp = NCP2BUCKETLOCK(ncp); rw_assert(blp, mode); } #else #define cache_assert_bucket_locked(x, y) do { } while (0) #endif #define cache_sort_vnodes(x, y) _cache_sort_vnodes((void **)(x), (void **)(y)) static void _cache_sort_vnodes(void **p1, void **p2) { void *tmp; MPASS(*p1 != NULL || *p2 != NULL); if (*p1 > *p2) { tmp = *p2; *p2 = *p1; *p1 = tmp; } } static void cache_lock_all_buckets(void) { u_int i; for (i = 0; i < numbucketlocks; i++) rw_wlock(&bucketlocks[i]); } static void cache_unlock_all_buckets(void) { u_int i; for (i = 0; i < numbucketlocks; i++) rw_wunlock(&bucketlocks[i]); } static void cache_lock_all_vnodes(void) { u_int i; for (i = 0; i < numvnodelocks; i++) mtx_lock(&vnodelocks[i]); } static void cache_unlock_all_vnodes(void) { u_int i; for (i = 0; i < numvnodelocks; i++) mtx_unlock(&vnodelocks[i]); } static int cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2) { cache_sort_vnodes(&vlp1, &vlp2); if (vlp1 != NULL) { if (!mtx_trylock(vlp1)) return (EAGAIN); } if (!mtx_trylock(vlp2)) { if (vlp1 != NULL) mtx_unlock(vlp1); return (EAGAIN); } return (0); } static void cache_lock_vnodes(struct mtx *vlp1, struct mtx *vlp2) { MPASS(vlp1 != NULL || vlp2 != NULL); MPASS(vlp1 <= vlp2); if (vlp1 != NULL) mtx_lock(vlp1); if (vlp2 != NULL) mtx_lock(vlp2); } static void cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2) { MPASS(vlp1 != NULL || vlp2 != NULL); if (vlp1 != NULL) mtx_unlock(vlp1); if (vlp2 != NULL) mtx_unlock(vlp2); } static int sysctl_nchstats(SYSCTL_HANDLER_ARGS) { struct nchstats snap; if (req->oldptr == NULL) return (SYSCTL_OUT(req, 0, sizeof(snap))); snap = nchstats; snap.ncs_goodhits = counter_u64_fetch(numposhits); snap.ncs_neghits = counter_u64_fetch(numneghits); snap.ncs_badhits = counter_u64_fetch(numposzaps) + counter_u64_fetch(numnegzaps); snap.ncs_miss = counter_u64_fetch(nummisszap) + counter_u64_fetch(nummiss); return (SYSCTL_OUT(req, &snap, sizeof(snap))); } SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU", "VFS cache effectiveness statistics"); #ifdef DIAGNOSTIC /* * Grab an atomic snapshot of the name cache hash chain lengths */ static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "hash table stats"); static int sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS) { struct nchashhead *ncpp; struct namecache *ncp; int i, error, n_nchash, *cntbuf; retry: n_nchash = nchash + 1; /* nchash is max index, not count */ if (req->oldptr == NULL) return SYSCTL_OUT(req, 0, n_nchash * sizeof(int)); cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK); cache_lock_all_buckets(); if (n_nchash != nchash + 1) { cache_unlock_all_buckets(); free(cntbuf, M_TEMP); goto retry; } /* Scan hash tables counting entries */ for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) CK_SLIST_FOREACH(ncp, ncpp, nc_hash) cntbuf[i]++; cache_unlock_all_buckets(); for (error = 0, i = 0; i < n_nchash; i++) if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0) break; free(cntbuf, M_TEMP); return (error); } SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD| CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths"); static int sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) { int error; struct nchashhead *ncpp; struct namecache *ncp; int n_nchash; int count, maxlength, used, pct; if (!req->oldptr) return SYSCTL_OUT(req, 0, 4 * sizeof(int)); cache_lock_all_buckets(); n_nchash = nchash + 1; /* nchash is max index, not count */ used = 0; maxlength = 0; /* Scan hash tables for applicable entries */ for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { count = 0; CK_SLIST_FOREACH(ncp, ncpp, nc_hash) { count++; } if (count) used++; if (maxlength < count) maxlength = count; } n_nchash = nchash + 1; cache_unlock_all_buckets(); pct = (used * 100) / (n_nchash / 100); error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash)); if (error) return (error); error = SYSCTL_OUT(req, &used, sizeof(used)); if (error) return (error); error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength)); if (error) return (error); error = SYSCTL_OUT(req, &pct, sizeof(pct)); if (error) return (error); return (0); } SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD| CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I", "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)"); #endif /* * Negative entries management * * A variation of LRU scheme is used. New entries are hashed into one of * numneglists cold lists. Entries get promoted to the hot list on first hit. * * The shrinker will demote hot list head and evict from the cold list in a * round-robin manner. */ static void cache_negative_init(struct namecache *ncp) { struct negstate *negstate; ncp->nc_flag |= NCF_NEGATIVE; negstate = NCP2NEGSTATE(ncp); negstate->neg_flag = 0; } static void cache_negative_hit(struct namecache *ncp) { struct neglist *neglist; struct negstate *negstate; negstate = NCP2NEGSTATE(ncp); if ((negstate->neg_flag & NEG_HOT) != 0) return; neglist = NCP2NEGLIST(ncp); mtx_lock(&ncneg_hot.nl_lock); mtx_lock(&neglist->nl_lock); if ((negstate->neg_flag & NEG_HOT) == 0) { numhotneg++; TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); negstate->neg_flag |= NEG_HOT; } mtx_unlock(&neglist->nl_lock); mtx_unlock(&ncneg_hot.nl_lock); } static void cache_negative_insert(struct namecache *ncp) { struct neglist *neglist; MPASS(ncp->nc_flag & NCF_NEGATIVE); cache_assert_bucket_locked(ncp, RA_WLOCKED); neglist = NCP2NEGLIST(ncp); mtx_lock(&neglist->nl_lock); TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); mtx_unlock(&neglist->nl_lock); atomic_add_rel_long(&numneg, 1); } static void cache_negative_remove(struct namecache *ncp) { struct neglist *neglist; struct negstate *negstate; bool hot_locked = false; bool list_locked = false; cache_assert_bucket_locked(ncp, RA_WLOCKED); neglist = NCP2NEGLIST(ncp); negstate = NCP2NEGSTATE(ncp); if ((negstate->neg_flag & NEG_HOT) != 0) { hot_locked = true; mtx_lock(&ncneg_hot.nl_lock); if ((negstate->neg_flag & NEG_HOT) == 0) { list_locked = true; mtx_lock(&neglist->nl_lock); } } else { list_locked = true; mtx_lock(&neglist->nl_lock); /* * We may be racing against promotion in lockless lookup. */ if ((negstate->neg_flag & NEG_HOT) != 0) { mtx_unlock(&neglist->nl_lock); hot_locked = true; mtx_lock(&ncneg_hot.nl_lock); mtx_lock(&neglist->nl_lock); } } if ((negstate->neg_flag & NEG_HOT) != 0) { mtx_assert(&ncneg_hot.nl_lock, MA_OWNED); TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); numhotneg--; } else { mtx_assert(&neglist->nl_lock, MA_OWNED); TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); } if (list_locked) mtx_unlock(&neglist->nl_lock); if (hot_locked) mtx_unlock(&ncneg_hot.nl_lock); atomic_subtract_rel_long(&numneg, 1); } static void cache_negative_shrink_select(struct namecache **ncpp, struct neglist **neglistpp) { struct neglist *neglist; struct namecache *ncp; static u_int cycle; u_int i; *ncpp = ncp = NULL; for (i = 0; i < numneglists; i++) { neglist = &neglists[(cycle + i) % numneglists]; if (TAILQ_FIRST(&neglist->nl_list) == NULL) continue; mtx_lock(&neglist->nl_lock); ncp = TAILQ_FIRST(&neglist->nl_list); if (ncp != NULL) break; mtx_unlock(&neglist->nl_lock); } *neglistpp = neglist; *ncpp = ncp; cycle++; } static void cache_negative_zap_one(void) { struct namecache *ncp, *ncp2; struct neglist *neglist; struct negstate *negstate; struct mtx *dvlp; struct rwlock *blp; if (mtx_owner(&ncneg_shrink_lock) != NULL || !mtx_trylock(&ncneg_shrink_lock)) { counter_u64_add(shrinking_skipped, 1); return; } mtx_lock(&ncneg_hot.nl_lock); ncp = TAILQ_FIRST(&ncneg_hot.nl_list); if (ncp != NULL) { neglist = NCP2NEGLIST(ncp); negstate = NCP2NEGSTATE(ncp); mtx_lock(&neglist->nl_lock); MPASS((negstate->neg_flag & NEG_HOT) != 0); TAILQ_REMOVE(&ncneg_hot.nl_list, ncp, nc_dst); TAILQ_INSERT_TAIL(&neglist->nl_list, ncp, nc_dst); negstate->neg_flag &= ~NEG_HOT; numhotneg--; mtx_unlock(&neglist->nl_lock); } mtx_unlock(&ncneg_hot.nl_lock); cache_negative_shrink_select(&ncp, &neglist); mtx_unlock(&ncneg_shrink_lock); if (ncp == NULL) return; MPASS(ncp->nc_flag & NCF_NEGATIVE); dvlp = VP2VNODELOCK(ncp->nc_dvp); blp = NCP2BUCKETLOCK(ncp); mtx_unlock(&neglist->nl_lock); mtx_lock(dvlp); rw_wlock(blp); /* * Enter SMR to safely check the negative list. * Even if the found pointer matches, the entry may now be reallocated * and used by a different vnode. */ vfs_smr_enter(); ncp2 = TAILQ_FIRST(&neglist->nl_list); if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) || blp != NCP2BUCKETLOCK(ncp2)) { vfs_smr_exit(); ncp = NULL; } else { vfs_smr_exit(); SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp, ncp->nc_name); cache_zap_locked(ncp); counter_u64_add(numneg_evicted, 1); } rw_wunlock(blp); mtx_unlock(dvlp); cache_free(ncp); } /* * cache_zap_locked(): * * Removes a namecache entry from cache, whether it contains an actual * pointer to a vnode or if it is just a negative cache entry. */ static void cache_zap_locked(struct namecache *ncp) { struct nchashhead *ncpp; if (!(ncp->nc_flag & NCF_NEGATIVE)) cache_assert_vnode_locked(ncp->nc_vp); cache_assert_vnode_locked(ncp->nc_dvp); cache_assert_bucket_locked(ncp, RA_WLOCKED); CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp); cache_ncp_invalidate(ncp); ncpp = NCP2BUCKET(ncp); CK_SLIST_REMOVE(ncpp, ncp, namecache, nc_hash); if (!(ncp->nc_flag & NCF_NEGATIVE)) { SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, ncp->nc_name, ncp->nc_vp); TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); if (ncp == ncp->nc_vp->v_cache_dd) { vn_seqc_write_begin_unheld(ncp->nc_vp); ncp->nc_vp->v_cache_dd = NULL; vn_seqc_write_end(ncp->nc_vp); } } else { SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp, ncp->nc_name); cache_negative_remove(ncp); } if (ncp->nc_flag & NCF_ISDOTDOT) { if (ncp == ncp->nc_dvp->v_cache_dd) { vn_seqc_write_begin_unheld(ncp->nc_dvp); ncp->nc_dvp->v_cache_dd = NULL; vn_seqc_write_end(ncp->nc_dvp); } } else { LIST_REMOVE(ncp, nc_src); if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) { ncp->nc_flag |= NCF_DVDROP; counter_u64_add(numcachehv, -1); } } atomic_subtract_rel_long(&numcache, 1); } static void cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp) { struct rwlock *blp; MPASS(ncp->nc_dvp == vp); MPASS(ncp->nc_flag & NCF_NEGATIVE); cache_assert_vnode_locked(vp); blp = NCP2BUCKETLOCK(ncp); rw_wlock(blp); cache_zap_locked(ncp); rw_wunlock(blp); } static bool cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp, struct mtx **vlpp) { struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; struct rwlock *blp; MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); cache_assert_vnode_locked(vp); if (ncp->nc_flag & NCF_NEGATIVE) { if (*vlpp != NULL) { mtx_unlock(*vlpp); *vlpp = NULL; } cache_zap_negative_locked_vnode_kl(ncp, vp); return (true); } pvlp = VP2VNODELOCK(vp); blp = NCP2BUCKETLOCK(ncp); vlp1 = VP2VNODELOCK(ncp->nc_dvp); vlp2 = VP2VNODELOCK(ncp->nc_vp); if (*vlpp == vlp1 || *vlpp == vlp2) { to_unlock = *vlpp; *vlpp = NULL; } else { if (*vlpp != NULL) { mtx_unlock(*vlpp); *vlpp = NULL; } cache_sort_vnodes(&vlp1, &vlp2); if (vlp1 == pvlp) { mtx_lock(vlp2); to_unlock = vlp2; } else { if (!mtx_trylock(vlp1)) goto out_relock; to_unlock = vlp1; } } rw_wlock(blp); cache_zap_locked(ncp); rw_wunlock(blp); if (to_unlock != NULL) mtx_unlock(to_unlock); return (true); out_relock: mtx_unlock(vlp2); mtx_lock(vlp1); mtx_lock(vlp2); MPASS(*vlpp == NULL); *vlpp = vlp1; return (false); } static int __noinline cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp) { struct mtx *pvlp, *vlp1, *vlp2, *to_unlock; struct rwlock *blp; int error = 0; MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp); cache_assert_vnode_locked(vp); pvlp = VP2VNODELOCK(vp); if (ncp->nc_flag & NCF_NEGATIVE) { cache_zap_negative_locked_vnode_kl(ncp, vp); goto out; } blp = NCP2BUCKETLOCK(ncp); vlp1 = VP2VNODELOCK(ncp->nc_dvp); vlp2 = VP2VNODELOCK(ncp->nc_vp); cache_sort_vnodes(&vlp1, &vlp2); if (vlp1 == pvlp) { mtx_lock(vlp2); to_unlock = vlp2; } else { if (!mtx_trylock(vlp1)) { error = EAGAIN; goto out; } to_unlock = vlp1; } rw_wlock(blp); cache_zap_locked(ncp); rw_wunlock(blp); mtx_unlock(to_unlock); out: mtx_unlock(pvlp); return (error); } /* * If trylocking failed we can get here. We know enough to take all needed locks * in the right order and re-lookup the entry. */ static int cache_zap_unlocked_bucket(struct namecache *ncp, struct componentname *cnp, struct vnode *dvp, struct mtx *dvlp, struct mtx *vlp, uint32_t hash, struct rwlock *blp) { struct namecache *rncp; cache_assert_bucket_locked(ncp, RA_UNLOCKED); cache_sort_vnodes(&dvlp, &vlp); cache_lock_vnodes(dvlp, vlp); rw_wlock(blp); CK_SLIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { if (rncp == ncp && rncp->nc_dvp == dvp && rncp->nc_nlen == cnp->cn_namelen && !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen)) break; } if (rncp != NULL) { cache_zap_locked(rncp); rw_wunlock(blp); cache_unlock_vnodes(dvlp, vlp); counter_u64_add(zap_and_exit_bucket_relock_success, 1); return (0); } rw_wunlock(blp); cache_unlock_vnodes(dvlp, vlp); return (EAGAIN); } static int __noinline cache_zap_wlocked_bucket(struct namecache *ncp, struct componentname *cnp, uint32_t hash, struct rwlock *blp) { struct mtx *dvlp, *vlp; struct vnode *dvp; cache_assert_bucket_locked(ncp, RA_WLOCKED); dvlp = VP2VNODELOCK(ncp->nc_dvp); vlp = NULL; if (!(ncp->nc_flag & NCF_NEGATIVE)) vlp = VP2VNODELOCK(ncp->nc_vp); if (cache_trylock_vnodes(dvlp, vlp) == 0) { cache_zap_locked(ncp); rw_wunlock(blp); cache_unlock_vnodes(dvlp, vlp); return (0); } dvp = ncp->nc_dvp; rw_wunlock(blp); return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); } static int __noinline cache_zap_rlocked_bucket(struct namecache *ncp, struct componentname *cnp, uint32_t hash, struct rwlock *blp) { struct mtx *dvlp, *vlp; struct vnode *dvp; cache_assert_bucket_locked(ncp, RA_RLOCKED); dvlp = VP2VNODELOCK(ncp->nc_dvp); vlp = NULL; if (!(ncp->nc_flag & NCF_NEGATIVE)) vlp = VP2VNODELOCK(ncp->nc_vp); if (cache_trylock_vnodes(dvlp, vlp) == 0) { rw_runlock(blp); rw_wlock(blp); cache_zap_locked(ncp); rw_wunlock(blp); cache_unlock_vnodes(dvlp, vlp); return (0); } dvp = ncp->nc_dvp; rw_runlock(blp); return (cache_zap_unlocked_bucket(ncp, cnp, dvp, dvlp, vlp, hash, blp)); } static int cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp, struct mtx **vlpp1, struct mtx **vlpp2) { struct mtx *dvlp, *vlp; cache_assert_bucket_locked(ncp, RA_WLOCKED); dvlp = VP2VNODELOCK(ncp->nc_dvp); vlp = NULL; if (!(ncp->nc_flag & NCF_NEGATIVE)) vlp = VP2VNODELOCK(ncp->nc_vp); cache_sort_vnodes(&dvlp, &vlp); if (*vlpp1 == dvlp && *vlpp2 == vlp) { cache_zap_locked(ncp); cache_unlock_vnodes(dvlp, vlp); *vlpp1 = NULL; *vlpp2 = NULL; return (0); } if (*vlpp1 != NULL) mtx_unlock(*vlpp1); if (*vlpp2 != NULL) mtx_unlock(*vlpp2); *vlpp1 = NULL; *vlpp2 = NULL; if (cache_trylock_vnodes(dvlp, vlp) == 0) { cache_zap_locked(ncp); cache_unlock_vnodes(dvlp, vlp); return (0); } rw_wunlock(blp); *vlpp1 = dvlp; *vlpp2 = vlp; if (*vlpp1 != NULL) mtx_lock(*vlpp1); mtx_lock(*vlpp2); rw_wlock(blp); return (EAGAIN); } static void cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp) { if (blp != NULL) { rw_runlock(blp); } else { mtx_unlock(vlp); } } static int __noinline cache_lookup_dot(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp) { int ltype; *vpp = dvp; CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .", dvp, cnp->cn_nameptr); counter_u64_add(dothits, 1); SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp); if (tsp != NULL) timespecclear(tsp); if (ticksp != NULL) *ticksp = ticks; vrefact(*vpp); /* * When we lookup "." we still can be asked to lock it * differently... */ ltype = cnp->cn_lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(*vpp)) { if (ltype == LK_EXCLUSIVE) { vn_lock(*vpp, LK_UPGRADE | LK_RETRY); if (VN_IS_DOOMED((*vpp))) { /* forced unmount */ vrele(*vpp); *vpp = NULL; return (ENOENT); } } else vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY); } return (-1); } static __noinline int cache_lookup_nomakeentry(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp) { struct namecache *ncp; struct rwlock *blp; struct mtx *dvlp, *dvlp2; uint32_t hash; int error; if (cnp->cn_namelen == 2 && cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { counter_u64_add(dotdothits, 1); dvlp = VP2VNODELOCK(dvp); dvlp2 = NULL; mtx_lock(dvlp); retry_dotdot: ncp = dvp->v_cache_dd; if (ncp == NULL) { SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL); mtx_unlock(dvlp); if (dvlp2 != NULL) mtx_unlock(dvlp2); return (0); } if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { if (ncp->nc_dvp != dvp) panic("dvp %p v_cache_dd %p\n", dvp, ncp); if (!cache_zap_locked_vnode_kl2(ncp, dvp, &dvlp2)) goto retry_dotdot; MPASS(dvp->v_cache_dd == NULL); mtx_unlock(dvlp); if (dvlp2 != NULL) mtx_unlock(dvlp2); cache_free(ncp); } else { vn_seqc_write_begin(dvp); dvp->v_cache_dd = NULL; vn_seqc_write_end(dvp); mtx_unlock(dvlp); if (dvlp2 != NULL) mtx_unlock(dvlp2); } return (0); } hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); blp = HASH2BUCKETLOCK(hash); retry: if (CK_SLIST_EMPTY(NCHHASH(hash))) goto out_no_entry; rw_wlock(blp); CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) break; } /* We failed to find an entry */ if (ncp == NULL) { rw_wunlock(blp); goto out_no_entry; } error = cache_zap_wlocked_bucket(ncp, cnp, hash, blp); if (__predict_false(error != 0)) { zap_and_exit_bucket_fail++; cache_maybe_yield(); goto retry; } counter_u64_add(numposzaps, 1); cache_free(ncp); return (0); out_no_entry: SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL); counter_u64_add(nummisszap, 1); return (0); } /** * Lookup a name in the name cache * * # Arguments * * - dvp: Parent directory in which to search. * - vpp: Return argument. Will contain desired vnode on cache hit. * - cnp: Parameters of the name search. The most interesting bits of * the cn_flags field have the following meanings: * - MAKEENTRY: If clear, free an entry from the cache rather than look * it up. * - ISDOTDOT: Must be set if and only if cn_nameptr == ".." * - tsp: Return storage for cache timestamp. On a successful (positive * or negative) lookup, tsp will be filled with any timespec that * was stored when this cache entry was created. However, it will * be clear for "." entries. * - ticks: Return storage for alternate cache timestamp. On a successful * (positive or negative) lookup, it will contain the ticks value * that was current when the cache entry was created, unless cnp * was ".". * * # Returns * * - -1: A positive cache hit. vpp will contain the desired vnode. * - ENOENT: A negative cache hit, or dvp was recycled out from under us due * to a forced unmount. vpp will not be modified. If the entry * is a whiteout, then the ISWHITEOUT flag will be set in * cnp->cn_flags. * - 0: A cache miss. vpp will not be modified. * * # Locking * * On a cache hit, vpp will be returned locked and ref'd. If we're looking up * .., dvp is unlocked. If we're looking up . an extra ref is taken, but the * lock is not recursively acquired. */ int cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp) { struct namecache_ts *ncp_ts; struct namecache *ncp; struct negstate *negstate; struct rwlock *blp; struct mtx *dvlp; uint32_t hash; enum vgetstate vs; int error, ltype; bool try_smr, doing_smr, whiteout; #ifdef DEBUG_CACHE if (__predict_false(!doingcache)) { cnp->cn_flags &= ~MAKEENTRY; return (0); } #endif if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) return (cache_lookup_dot(dvp, vpp, cnp, tsp, ticksp)); if ((cnp->cn_flags & MAKEENTRY) == 0) return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp)); try_smr = true; if (cnp->cn_nameiop == CREATE) try_smr = false; retry: doing_smr = false; blp = NULL; dvlp = NULL; error = 0; if (cnp->cn_namelen == 2 && cnp->cn_nameptr[0] == '.' && cnp->cn_nameptr[1] == '.') { counter_u64_add(dotdothits, 1); dvlp = VP2VNODELOCK(dvp); mtx_lock(dvlp); ncp = dvp->v_cache_dd; if (ncp == NULL) { SDT_PROBE3(vfs, namecache, lookup, miss, dvp, "..", NULL); mtx_unlock(dvlp); return (0); } if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) { if (ncp->nc_flag & NCF_NEGATIVE) *vpp = NULL; else *vpp = ncp->nc_vp; } else *vpp = ncp->nc_dvp; /* Return failure if negative entry was found. */ if (*vpp == NULL) goto negative_success; CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..", dvp, cnp->cn_nameptr, *vpp); SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..", *vpp); cache_out_ts(ncp, tsp, ticksp); if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) == NCF_DTS && tsp != NULL) { ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); *tsp = ncp_ts->nc_dotdottime; } goto success; } hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); retry_hashed: if (try_smr) { vfs_smr_enter(); doing_smr = true; try_smr = false; } else { blp = HASH2BUCKETLOCK(hash); rw_rlock(blp); } CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) break; } /* We failed to find an entry */ if (__predict_false(ncp == NULL)) { if (doing_smr) vfs_smr_exit(); else rw_runlock(blp); SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL); counter_u64_add(nummiss, 1); return (0); } if (ncp->nc_flag & NCF_NEGATIVE) goto negative_success; /* We found a "positive" match, return the vnode */ counter_u64_add(numposhits, 1); *vpp = ncp->nc_vp; CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p", dvp, cnp->cn_nameptr, *vpp, ncp); SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, *vpp); cache_out_ts(ncp, tsp, ticksp); success: /* * On success we return a locked and ref'd vnode as per the lookup * protocol. */ MPASS(dvp != *vpp); ltype = 0; /* silence gcc warning */ if (cnp->cn_flags & ISDOTDOT) { ltype = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp); } if (doing_smr) { if (!cache_ncp_canuse(ncp)) { vfs_smr_exit(); *vpp = NULL; goto retry; } vs = vget_prep_smr(*vpp); vfs_smr_exit(); if (__predict_false(vs == VGET_NONE)) { *vpp = NULL; goto retry; } } else { vs = vget_prep(*vpp); cache_lookup_unlock(blp, dvlp); } error = vget_finish(*vpp, cnp->cn_lkflags, vs); if (cnp->cn_flags & ISDOTDOT) { vn_lock(dvp, ltype | LK_RETRY); if (VN_IS_DOOMED(dvp)) { if (error == 0) vput(*vpp); *vpp = NULL; return (ENOENT); } } if (error) { *vpp = NULL; goto retry; } if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) { ASSERT_VOP_ELOCKED(*vpp, "cache_lookup"); } return (-1); negative_success: /* We found a negative match, and want to create it, so purge */ if (cnp->cn_nameiop == CREATE) { MPASS(!doing_smr); counter_u64_add(numnegzaps, 1); goto zap_and_exit; } SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); cache_out_ts(ncp, tsp, ticksp); counter_u64_add(numneghits, 1); whiteout = (ncp->nc_flag & NCF_WHITE); if (doing_smr) { /* * We need to take locks to promote an entry. */ negstate = NCP2NEGSTATE(ncp); if ((negstate->neg_flag & NEG_HOT) == 0 || !cache_ncp_canuse(ncp)) { vfs_smr_exit(); doing_smr = false; goto retry_hashed; } vfs_smr_exit(); } else { cache_negative_hit(ncp); cache_lookup_unlock(blp, dvlp); } if (whiteout) cnp->cn_flags |= ISWHITEOUT; return (ENOENT); zap_and_exit: MPASS(!doing_smr); if (blp != NULL) error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp); else error = cache_zap_locked_vnode(ncp, dvp); if (__predict_false(error != 0)) { zap_and_exit_bucket_fail2++; cache_maybe_yield(); goto retry; } cache_free(ncp); return (0); } struct celockstate { struct mtx *vlp[3]; struct rwlock *blp[2]; }; CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3)); CTASSERT((nitems(((struct celockstate *)0)->blp) == 2)); static inline void cache_celockstate_init(struct celockstate *cel) { bzero(cel, sizeof(*cel)); } static void cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp, struct vnode *dvp) { struct mtx *vlp1, *vlp2; MPASS(cel->vlp[0] == NULL); MPASS(cel->vlp[1] == NULL); MPASS(cel->vlp[2] == NULL); MPASS(vp != NULL || dvp != NULL); vlp1 = VP2VNODELOCK(vp); vlp2 = VP2VNODELOCK(dvp); cache_sort_vnodes(&vlp1, &vlp2); if (vlp1 != NULL) { mtx_lock(vlp1); cel->vlp[0] = vlp1; } mtx_lock(vlp2); cel->vlp[1] = vlp2; } static void cache_unlock_vnodes_cel(struct celockstate *cel) { MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL); if (cel->vlp[0] != NULL) mtx_unlock(cel->vlp[0]); if (cel->vlp[1] != NULL) mtx_unlock(cel->vlp[1]); if (cel->vlp[2] != NULL) mtx_unlock(cel->vlp[2]); } static bool cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp) { struct mtx *vlp; bool ret; cache_assert_vlp_locked(cel->vlp[0]); cache_assert_vlp_locked(cel->vlp[1]); MPASS(cel->vlp[2] == NULL); MPASS(vp != NULL); vlp = VP2VNODELOCK(vp); ret = true; if (vlp >= cel->vlp[1]) { mtx_lock(vlp); } else { if (mtx_trylock(vlp)) goto out; cache_lock_vnodes_cel_3_failures++; cache_unlock_vnodes_cel(cel); if (vlp < cel->vlp[0]) { mtx_lock(vlp); mtx_lock(cel->vlp[0]); mtx_lock(cel->vlp[1]); } else { if (cel->vlp[0] != NULL) mtx_lock(cel->vlp[0]); mtx_lock(vlp); mtx_lock(cel->vlp[1]); } ret = false; } out: cel->vlp[2] = vlp; return (ret); } static void cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1, struct rwlock *blp2) { MPASS(cel->blp[0] == NULL); MPASS(cel->blp[1] == NULL); cache_sort_vnodes(&blp1, &blp2); if (blp1 != NULL) { rw_wlock(blp1); cel->blp[0] = blp1; } rw_wlock(blp2); cel->blp[1] = blp2; } static void cache_unlock_buckets_cel(struct celockstate *cel) { if (cel->blp[0] != NULL) rw_wunlock(cel->blp[0]); rw_wunlock(cel->blp[1]); } /* * Lock part of the cache affected by the insertion. * * This means vnodelocks for dvp, vp and the relevant bucketlock. * However, insertion can result in removal of an old entry. In this * case we have an additional vnode and bucketlock pair to lock. If the * entry is negative, ncelock is locked instead of the vnode. * * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while * preserving the locking order (smaller address first). */ static void cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, uint32_t hash) { struct namecache *ncp; struct rwlock *blps[2]; blps[0] = HASH2BUCKETLOCK(hash); for (;;) { blps[1] = NULL; cache_lock_vnodes_cel(cel, dvp, vp); if (vp == NULL || vp->v_type != VDIR) break; ncp = vp->v_cache_dd; if (ncp == NULL) break; if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) break; MPASS(ncp->nc_dvp == vp); blps[1] = NCP2BUCKETLOCK(ncp); if (ncp->nc_flag & NCF_NEGATIVE) break; if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) break; /* * All vnodes got re-locked. Re-validate the state and if * nothing changed we are done. Otherwise restart. */ if (ncp == vp->v_cache_dd && (ncp->nc_flag & NCF_ISDOTDOT) != 0 && blps[1] == NCP2BUCKETLOCK(ncp) && VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) break; cache_unlock_vnodes_cel(cel); cel->vlp[0] = NULL; cel->vlp[1] = NULL; cel->vlp[2] = NULL; } cache_lock_buckets_cel(cel, blps[0], blps[1]); } static void cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp, uint32_t hash) { struct namecache *ncp; struct rwlock *blps[2]; blps[0] = HASH2BUCKETLOCK(hash); for (;;) { blps[1] = NULL; cache_lock_vnodes_cel(cel, dvp, vp); ncp = dvp->v_cache_dd; if (ncp == NULL) break; if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) break; MPASS(ncp->nc_dvp == dvp); blps[1] = NCP2BUCKETLOCK(ncp); if (ncp->nc_flag & NCF_NEGATIVE) break; if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp)) break; if (ncp == dvp->v_cache_dd && (ncp->nc_flag & NCF_ISDOTDOT) != 0 && blps[1] == NCP2BUCKETLOCK(ncp) && VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2]) break; cache_unlock_vnodes_cel(cel); cel->vlp[0] = NULL; cel->vlp[1] = NULL; cel->vlp[2] = NULL; } cache_lock_buckets_cel(cel, blps[0], blps[1]); } static void cache_enter_unlock(struct celockstate *cel) { cache_unlock_buckets_cel(cel); cache_unlock_vnodes_cel(cel); } static void __noinline cache_enter_dotdot_prep(struct vnode *dvp, struct vnode *vp, struct componentname *cnp) { struct celockstate cel; struct namecache *ncp; uint32_t hash; int len; if (dvp->v_cache_dd == NULL) return; len = cnp->cn_namelen; cache_celockstate_init(&cel); hash = cache_get_hash(cnp->cn_nameptr, len, dvp); cache_enter_lock_dd(&cel, dvp, vp, hash); vn_seqc_write_begin(dvp); ncp = dvp->v_cache_dd; if (ncp != NULL && (ncp->nc_flag & NCF_ISDOTDOT)) { KASSERT(ncp->nc_dvp == dvp, ("wrong isdotdot parent")); cache_zap_locked(ncp); } else { ncp = NULL; } dvp->v_cache_dd = NULL; vn_seqc_write_end(dvp); cache_enter_unlock(&cel); cache_free(ncp); } /* * Add an entry to the cache. */ void cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, struct timespec *tsp, struct timespec *dtsp) { struct celockstate cel; struct namecache *ncp, *n2, *ndd; struct namecache_ts *ncp_ts, *n2_ts; struct nchashhead *ncpp; uint32_t hash; int flag; int len; u_long lnumcache; CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr); VNASSERT(vp == NULL || !VN_IS_DOOMED(vp), vp, ("cache_enter: Adding a doomed vnode")); VNASSERT(dvp == NULL || !VN_IS_DOOMED(dvp), dvp, ("cache_enter: Doomed vnode used as src")); #ifdef DEBUG_CACHE if (__predict_false(!doingcache)) return; #endif flag = 0; if (__predict_false(cnp->cn_nameptr[0] == '.')) { if (cnp->cn_namelen == 1) return; if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { cache_enter_dotdot_prep(dvp, vp, cnp); flag = NCF_ISDOTDOT; } } /* * Avoid blowout in namecache entries. */ lnumcache = atomic_fetchadd_long(&numcache, 1) + 1; if (__predict_false(lnumcache >= ncsize)) { atomic_add_long(&numcache, -1); counter_u64_add(numdrops, 1); return; } cache_celockstate_init(&cel); ndd = NULL; ncp_ts = NULL; /* * Calculate the hash key and setup as much of the new * namecache entry as possible before acquiring the lock. */ ncp = cache_alloc(cnp->cn_namelen, tsp != NULL); ncp->nc_flag = flag | NCF_WIP; ncp->nc_vp = vp; if (vp == NULL) cache_negative_init(ncp); ncp->nc_dvp = dvp; if (tsp != NULL) { ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); ncp_ts->nc_time = *tsp; ncp_ts->nc_ticks = ticks; ncp_ts->nc_nc.nc_flag |= NCF_TS; if (dtsp != NULL) { ncp_ts->nc_dotdottime = *dtsp; ncp_ts->nc_nc.nc_flag |= NCF_DTS; } } len = ncp->nc_nlen = cnp->cn_namelen; hash = cache_get_hash(cnp->cn_nameptr, len, dvp); memcpy(ncp->nc_name, cnp->cn_nameptr, len); ncp->nc_name[len] = '\0'; cache_enter_lock(&cel, dvp, vp, hash); /* * See if this vnode or negative entry is already in the cache * with this name. This can happen with concurrent lookups of * the same path name. */ ncpp = NCHHASH(hash); CK_SLIST_FOREACH(n2, ncpp, nc_hash) { if (n2->nc_dvp == dvp && n2->nc_nlen == cnp->cn_namelen && !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { if (tsp != NULL) { KASSERT((n2->nc_flag & NCF_TS) != 0, ("no NCF_TS")); n2_ts = __containerof(n2, struct namecache_ts, nc_nc); n2_ts->nc_time = ncp_ts->nc_time; n2_ts->nc_ticks = ncp_ts->nc_ticks; if (dtsp != NULL) { n2_ts->nc_dotdottime = ncp_ts->nc_dotdottime; n2_ts->nc_nc.nc_flag |= NCF_DTS; } } goto out_unlock_free; } } if (flag == NCF_ISDOTDOT) { /* * See if we are trying to add .. entry, but some other lookup * has populated v_cache_dd pointer already. */ if (dvp->v_cache_dd != NULL) goto out_unlock_free; KASSERT(vp == NULL || vp->v_type == VDIR, ("wrong vnode type %p", vp)); vn_seqc_write_begin(dvp); dvp->v_cache_dd = ncp; vn_seqc_write_end(dvp); } if (vp != NULL) { if (vp->v_type == VDIR) { if (flag != NCF_ISDOTDOT) { /* * For this case, the cache entry maps both the * directory name in it and the name ".." for the * directory's parent. */ vn_seqc_write_begin(vp); if ((ndd = vp->v_cache_dd) != NULL) { if ((ndd->nc_flag & NCF_ISDOTDOT) != 0) cache_zap_locked(ndd); else ndd = NULL; } vp->v_cache_dd = ncp; vn_seqc_write_end(vp); } } else { if (vp->v_cache_dd != NULL) { vn_seqc_write_begin(vp); vp->v_cache_dd = NULL; vn_seqc_write_end(vp); } } } if (flag != NCF_ISDOTDOT) { if (LIST_EMPTY(&dvp->v_cache_src)) { vhold(dvp); counter_u64_add(numcachehv, 1); } LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); } /* * If the entry is "negative", we place it into the * "negative" cache queue, otherwise, we place it into the * destination vnode's cache entries queue. */ if (vp != NULL) { TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); SDT_PROBE3(vfs, namecache, enter, done, dvp, ncp->nc_name, vp); } else { if (cnp->cn_flags & ISWHITEOUT) ncp->nc_flag |= NCF_WHITE; cache_negative_insert(ncp); SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, ncp->nc_name); } /* * Insert the new namecache entry into the appropriate chain * within the cache entries table. */ CK_SLIST_INSERT_HEAD(ncpp, ncp, nc_hash); atomic_thread_fence_rel(); /* * Mark the entry as fully constructed. * It is immutable past this point until its removal. */ atomic_store_char(&ncp->nc_flag, ncp->nc_flag & ~NCF_WIP); cache_enter_unlock(&cel); if (numneg * ncnegfactor > lnumcache) cache_negative_zap_one(); cache_free(ndd); return; out_unlock_free: cache_enter_unlock(&cel); atomic_add_long(&numcache, -1); cache_free(ncp); return; } static u_int cache_roundup_2(u_int val) { u_int res; for (res = 1; res <= val; res <<= 1) continue; return (res); } static struct nchashhead * nchinittbl(u_long elements, u_long *hashmask) { struct nchashhead *hashtbl; u_long hashsize, i; hashsize = cache_roundup_2(elements) / 2; hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), M_VFSCACHE, M_WAITOK); for (i = 0; i < hashsize; i++) CK_SLIST_INIT(&hashtbl[i]); *hashmask = hashsize - 1; return (hashtbl); } static void ncfreetbl(struct nchashhead *hashtbl) { free(hashtbl, M_VFSCACHE); } /* * Name cache initialization, from vfs_init() when we are booting */ static void nchinit(void *dummy __unused) { u_int i; cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL_SIZE, NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); cache_zone_small_ts = uma_zcreate("STS VFS Cache", CACHE_ZONE_SMALL_TS_SIZE, NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE_SIZE, NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); cache_zone_large_ts = uma_zcreate("LTS VFS Cache", CACHE_ZONE_LARGE_TS_SIZE, NULL, NULL, NULL, NULL, CACHE_ZONE_ALIGNMENT, UMA_ZONE_ZINIT); VFS_SMR_ZONE_SET(cache_zone_small); VFS_SMR_ZONE_SET(cache_zone_small_ts); VFS_SMR_ZONE_SET(cache_zone_large); VFS_SMR_ZONE_SET(cache_zone_large_ts); ncsize = desiredvnodes * ncsizefactor; nchashtbl = nchinittbl(desiredvnodes * 2, &nchash); ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; if (ncbuckethash < 7) /* arbitrarily chosen to avoid having one lock */ ncbuckethash = 7; if (ncbuckethash > nchash) ncbuckethash = nchash; bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE, M_WAITOK | M_ZERO); for (i = 0; i < numbucketlocks; i++) rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE); ncvnodehash = ncbuckethash; vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE, M_WAITOK | M_ZERO); for (i = 0; i < numvnodelocks; i++) mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE); ncpurgeminvnodes = numbucketlocks * 2; neglists = malloc(sizeof(*neglists) * numneglists, M_VFSCACHE, M_WAITOK | M_ZERO); for (i = 0; i < numneglists; i++) { mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF); TAILQ_INIT(&neglists[i].nl_list); } mtx_init(&ncneg_hot.nl_lock, "ncneglh", NULL, MTX_DEF); TAILQ_INIT(&ncneg_hot.nl_list); mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF); } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL); void cache_vnode_init(struct vnode *vp) { LIST_INIT(&vp->v_cache_src); TAILQ_INIT(&vp->v_cache_dst); vp->v_cache_dd = NULL; cache_prehash(vp); } void cache_changesize(u_long newmaxvnodes) { struct nchashhead *new_nchashtbl, *old_nchashtbl; u_long new_nchash, old_nchash; struct namecache *ncp; uint32_t hash; u_long newncsize; int i; newncsize = newmaxvnodes * ncsizefactor; newmaxvnodes = cache_roundup_2(newmaxvnodes * 2); if (newmaxvnodes < numbucketlocks) newmaxvnodes = numbucketlocks; new_nchashtbl = nchinittbl(newmaxvnodes, &new_nchash); /* If same hash table size, nothing to do */ if (nchash == new_nchash) { ncfreetbl(new_nchashtbl); return; } /* * Move everything from the old hash table to the new table. * None of the namecache entries in the table can be removed * because to do so, they have to be removed from the hash table. */ cache_lock_all_vnodes(); cache_lock_all_buckets(); old_nchashtbl = nchashtbl; old_nchash = nchash; nchashtbl = new_nchashtbl; nchash = new_nchash; for (i = 0; i <= old_nchash; i++) { while ((ncp = CK_SLIST_FIRST(&old_nchashtbl[i])) != NULL) { hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); CK_SLIST_REMOVE(&old_nchashtbl[i], ncp, namecache, nc_hash); CK_SLIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); } } ncsize = newncsize; cache_unlock_all_buckets(); cache_unlock_all_vnodes(); ncfreetbl(old_nchashtbl); } /* * Invalidate all entries from and to a particular vnode. */ static void cache_purge_impl(struct vnode *vp) { TAILQ_HEAD(, namecache) ncps; struct namecache *ncp, *nnp; struct mtx *vlp, *vlp2; TAILQ_INIT(&ncps); vlp = VP2VNODELOCK(vp); vlp2 = NULL; mtx_assert(vlp, MA_OWNED); retry: while (!LIST_EMPTY(&vp->v_cache_src)) { ncp = LIST_FIRST(&vp->v_cache_src); if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) goto retry; TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); } while (!TAILQ_EMPTY(&vp->v_cache_dst)) { ncp = TAILQ_FIRST(&vp->v_cache_dst); if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) goto retry; TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); } ncp = vp->v_cache_dd; if (ncp != NULL) { KASSERT(ncp->nc_flag & NCF_ISDOTDOT, ("lost dotdot link")); if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2)) goto retry; TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); } KASSERT(vp->v_cache_dd == NULL, ("incomplete purge")); mtx_unlock(vlp); if (vlp2 != NULL) mtx_unlock(vlp2); TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { cache_free(ncp); } } void cache_purge(struct vnode *vp) { struct mtx *vlp; SDT_PROBE1(vfs, namecache, purge, done, vp); if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && vp->v_cache_dd == NULL) return; vlp = VP2VNODELOCK(vp); mtx_lock(vlp); cache_purge_impl(vp); } /* * Only to be used by vgone. */ void cache_purge_vgone(struct vnode *vp) { struct mtx *vlp; VNPASS(VN_IS_DOOMED(vp), vp); vlp = VP2VNODELOCK(vp); if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && vp->v_cache_dd == NULL)) { mtx_lock(vlp); cache_purge_impl(vp); mtx_assert(vlp, MA_NOTOWNED); return; } /* * All the NULL pointer state we found above may be transient. * Serialize against a possible thread doing cache_purge. */ mtx_wait_unlocked(vlp); if (!(LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) && vp->v_cache_dd == NULL)) { mtx_lock(vlp); cache_purge_impl(vp); mtx_assert(vlp, MA_NOTOWNED); return; } return; } /* * Invalidate all negative entries for a particular directory vnode. */ void cache_purge_negative(struct vnode *vp) { TAILQ_HEAD(, namecache) ncps; struct namecache *ncp, *nnp; struct mtx *vlp; CTR1(KTR_VFS, "cache_purge_negative(%p)", vp); SDT_PROBE1(vfs, namecache, purge_negative, done, vp); if (LIST_EMPTY(&vp->v_cache_src)) return; TAILQ_INIT(&ncps); vlp = VP2VNODELOCK(vp); mtx_lock(vlp); LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) { if (!(ncp->nc_flag & NCF_NEGATIVE)) continue; cache_zap_negative_locked_vnode_kl(ncp, vp); TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst); } mtx_unlock(vlp); TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { cache_free(ncp); } } /* * Flush all entries referencing a particular filesystem. */ void cache_purgevfs(struct mount *mp, bool force) { TAILQ_HEAD(, namecache) ncps; struct mtx *vlp1, *vlp2; struct rwlock *blp; struct nchashhead *bucket; struct namecache *ncp, *nnp; u_long i, j, n_nchash; int error; /* Scan hash tables for applicable entries */ SDT_PROBE1(vfs, namecache, purgevfs, done, mp); if (!force && mp->mnt_nvnodelistsize <= ncpurgeminvnodes) return; TAILQ_INIT(&ncps); n_nchash = nchash + 1; vlp1 = vlp2 = NULL; for (i = 0; i < numbucketlocks; i++) { blp = (struct rwlock *)&bucketlocks[i]; rw_wlock(blp); for (j = i; j < n_nchash; j += numbucketlocks) { retry: bucket = &nchashtbl[j]; CK_SLIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) { cache_assert_bucket_locked(ncp, RA_WLOCKED); if (ncp->nc_dvp->v_mount != mp) continue; error = cache_zap_wlocked_bucket_kl(ncp, blp, &vlp1, &vlp2); if (error != 0) goto retry; TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst); } } rw_wunlock(blp); if (vlp1 == NULL && vlp2 == NULL) cache_maybe_yield(); } if (vlp1 != NULL) mtx_unlock(vlp1); if (vlp2 != NULL) mtx_unlock(vlp2); TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) { cache_free(ncp); } } /* * Perform canonical checks and cache lookup and pass on to filesystem * through the vop_cachedlookup only if needed. */ int vfs_cache_lookup(struct vop_lookup_args *ap) { struct vnode *dvp; int error; struct vnode **vpp = ap->a_vpp; struct componentname *cnp = ap->a_cnp; int flags = cnp->cn_flags; *vpp = NULL; dvp = ap->a_dvp; if (dvp->v_type != VDIR) return (ENOTDIR); if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) return (EROFS); error = vn_dir_check_exec(dvp, cnp); if (error != 0) return (error); error = cache_lookup(dvp, vpp, cnp, NULL, NULL); if (error == 0) return (VOP_CACHEDLOOKUP(dvp, vpp, cnp)); if (error == -1) return (0); return (error); } /* Implementation of the getcwd syscall. */ int sys___getcwd(struct thread *td, struct __getcwd_args *uap) { char *buf, *retbuf; size_t buflen; int error; buflen = uap->buflen; if (__predict_false(buflen < 2)) return (EINVAL); if (buflen > MAXPATHLEN) buflen = MAXPATHLEN; - buf = malloc(buflen, M_TEMP, M_WAITOK); + buf = uma_zalloc(namei_zone, M_WAITOK); error = vn_getcwd(td, buf, &retbuf, &buflen); if (error == 0) error = copyout(retbuf, uap->buf, buflen); - free(buf, M_TEMP); + uma_zfree(namei_zone, buf); return (error); } int vn_getcwd(struct thread *td, char *buf, char **retbuf, size_t *buflen) { struct pwd *pwd; int error; pwd = pwd_hold(td); error = vn_fullpath_any(td, pwd->pwd_cdir, pwd->pwd_rdir, buf, retbuf, buflen); pwd_drop(pwd); #ifdef KTRACE if (KTRPOINT(curthread, KTR_NAMEI) && error == 0) ktrnamei(*retbuf); #endif return (error); } static int kern___realpathat(struct thread *td, int fd, const char *path, char *buf, size_t size, int flags, enum uio_seg pathseg) { struct nameidata nd; char *retbuf, *freebuf; int error; if (flags != 0) return (EINVAL); NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | SAVENAME | WANTPARENT | AUDITVNODE1, pathseg, path, fd, &cap_fstat_rights, td); if ((error = namei(&nd)) != 0) return (error); error = vn_fullpath_hardlink(td, &nd, &retbuf, &freebuf, &size); if (error == 0) { error = copyout(retbuf, buf, size); free(freebuf, M_TEMP); } NDFREE(&nd, 0); return (error); } int sys___realpathat(struct thread *td, struct __realpathat_args *uap) { return (kern___realpathat(td, uap->fd, uap->path, uap->buf, uap->size, uap->flags, UIO_USERSPACE)); } /* * Retrieve the full filesystem path that correspond to a vnode from the name * cache (if available) */ int vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) { struct pwd *pwd; char *buf; size_t buflen; int error; if (__predict_false(vn == NULL)) return (EINVAL); buflen = MAXPATHLEN; buf = malloc(buflen, M_TEMP, M_WAITOK); pwd = pwd_hold(td); error = vn_fullpath_any(td, vn, pwd->pwd_rdir, buf, retbuf, &buflen); pwd_drop(pwd); if (!error) *freebuf = buf; else free(buf, M_TEMP); return (error); } /* * This function is similar to vn_fullpath, but it attempts to lookup the * pathname relative to the global root mount point. This is required for the * auditing sub-system, as audited pathnames must be absolute, relative to the * global root mount point. */ int vn_fullpath_global(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf) { char *buf; size_t buflen; int error; if (__predict_false(vn == NULL)) return (EINVAL); buflen = MAXPATHLEN; buf = malloc(buflen, M_TEMP, M_WAITOK); error = vn_fullpath_any(td, vn, rootvnode, buf, retbuf, &buflen); if (!error) *freebuf = buf; else free(buf, M_TEMP); return (error); } int vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen) { struct vnode *dvp; struct namecache *ncp; struct mtx *vlp; int error; vlp = VP2VNODELOCK(*vp); mtx_lock(vlp); TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) { if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) break; } if (ncp != NULL) { if (*buflen < ncp->nc_nlen) { mtx_unlock(vlp); vrele(*vp); counter_u64_add(numfullpathfail4, 1); error = ENOMEM; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } *buflen -= ncp->nc_nlen; memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen); SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp, ncp->nc_name, vp); dvp = *vp; *vp = ncp->nc_dvp; vref(*vp); mtx_unlock(vlp); vrele(dvp); return (0); } SDT_PROBE1(vfs, namecache, fullpath, miss, vp); mtx_unlock(vlp); vn_lock(*vp, LK_SHARED | LK_RETRY); error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen); vput(*vp); if (error) { counter_u64_add(numfullpathfail2, 1); SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } *vp = dvp; if (VN_IS_DOOMED(dvp)) { /* forced unmount */ vrele(dvp); error = ENOENT; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); return (error); } /* * *vp has its use count incremented still. */ return (0); } /* * Resolve a directory to a pathname. * * The name of the directory can always be found in the namecache or fetched * from the filesystem. There is also guaranteed to be only one parent, meaning * we can just follow vnodes up until we find the root. * * The vnode must be referenced. */ static int vn_fullpath_dir(struct thread *td, struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *len, bool slash_prefixed, size_t addend) { #ifdef KDTRACE_HOOKS struct vnode *startvp = vp; #endif struct vnode *vp1; size_t buflen; int error; VNPASS(vp->v_type == VDIR || VN_IS_DOOMED(vp), vp); VNPASS(vp->v_usecount > 0, vp); buflen = *len; if (!slash_prefixed) { MPASS(*len >= 2); buflen--; buf[buflen] = '\0'; } error = 0; SDT_PROBE1(vfs, namecache, fullpath, entry, vp); counter_u64_add(numfullpathcalls, 1); while (vp != rdir && vp != rootvnode) { /* * The vp vnode must be already fully constructed, * since it is either found in namecache or obtained * from VOP_VPTOCNP(). We may test for VV_ROOT safely * without obtaining the vnode lock. */ if ((vp->v_vflag & VV_ROOT) != 0) { vn_lock(vp, LK_RETRY | LK_SHARED); /* * With the vnode locked, check for races with * unmount, forced or not. Note that we * already verified that vp is not equal to * the root vnode, which means that * mnt_vnodecovered can be NULL only for the * case of unmount. */ if (VN_IS_DOOMED(vp) || (vp1 = vp->v_mount->mnt_vnodecovered) == NULL || vp1->v_mountedhere != vp->v_mount) { vput(vp); error = ENOENT; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); break; } vref(vp1); vput(vp); vp = vp1; continue; } if (vp->v_type != VDIR) { vrele(vp); counter_u64_add(numfullpathfail1, 1); error = ENOTDIR; SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL); break; } error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen); if (error) break; if (buflen == 0) { vrele(vp); error = ENOMEM; SDT_PROBE3(vfs, namecache, fullpath, return, error, startvp, NULL); break; } buf[--buflen] = '/'; slash_prefixed = true; } if (error) return (error); if (!slash_prefixed) { if (buflen == 0) { vrele(vp); counter_u64_add(numfullpathfail4, 1); SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM, startvp, NULL); return (ENOMEM); } buf[--buflen] = '/'; } counter_u64_add(numfullpathfound, 1); vrele(vp); *retbuf = buf + buflen; SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, *retbuf); *len -= buflen; *len += addend; return (0); } /* * Resolve an arbitrary vnode to a pathname. * * Note 2 caveats: * - hardlinks are not tracked, thus if the vnode is not a directory this can * resolve to a different path than the one used to find it * - namecache is not mandatory, meaning names are not guaranteed to be added * (in which case resolving fails) */ static int vn_fullpath_any(struct thread *td, struct vnode *vp, struct vnode *rdir, char *buf, char **retbuf, size_t *buflen) { size_t orig_buflen; bool slash_prefixed; int error; if (*buflen < 2) return (EINVAL); orig_buflen = *buflen; vref(vp); slash_prefixed = false; if (vp->v_type != VDIR) { *buflen -= 1; buf[*buflen] = '\0'; error = vn_vptocnp(&vp, td->td_ucred, buf, buflen); if (error) return (error); if (*buflen == 0) { vrele(vp); return (ENOMEM); } *buflen -= 1; buf[*buflen] = '/'; slash_prefixed = true; } return (vn_fullpath_dir(td, vp, rdir, buf, retbuf, buflen, slash_prefixed, orig_buflen - *buflen)); } /* * Resolve an arbitrary vnode to a pathname (taking care of hardlinks). * * Since the namecache does not track handlings, the caller is expected to first * look up the target vnode with SAVENAME | WANTPARENT flags passed to namei. * * Then we have 2 cases: * - if the found vnode is a directory, the path can be constructed just by * fullowing names up the chain * - otherwise we populate the buffer with the saved name and start resolving * from the parent */ static int vn_fullpath_hardlink(struct thread *td, struct nameidata *ndp, char **retbuf, char **freebuf, size_t *buflen) { char *buf, *tmpbuf; struct pwd *pwd; struct componentname *cnp; struct vnode *vp; size_t addend; int error; bool slash_prefixed; if (*buflen < 2) return (EINVAL); if (*buflen > MAXPATHLEN) *buflen = MAXPATHLEN; slash_prefixed = false; buf = malloc(*buflen, M_TEMP, M_WAITOK); pwd = pwd_hold(td); addend = 0; vp = ndp->ni_vp; if (vp->v_type != VDIR) { cnp = &ndp->ni_cnd; addend = cnp->cn_namelen + 2; if (*buflen < addend) { error = ENOMEM; goto out_bad; } *buflen -= addend; tmpbuf = buf + *buflen; tmpbuf[0] = '/'; memcpy(&tmpbuf[1], cnp->cn_nameptr, cnp->cn_namelen); tmpbuf[addend - 1] = '\0'; slash_prefixed = true; vp = ndp->ni_dvp; } vref(vp); error = vn_fullpath_dir(td, vp, pwd->pwd_rdir, buf, retbuf, buflen, slash_prefixed, addend); if (error != 0) goto out_bad; pwd_drop(pwd); *freebuf = buf; return (0); out_bad: pwd_drop(pwd); free(buf, M_TEMP); return (error); } struct vnode * vn_dir_dd_ino(struct vnode *vp) { struct namecache *ncp; struct vnode *ddvp; struct mtx *vlp; enum vgetstate vs; ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino"); vlp = VP2VNODELOCK(vp); mtx_lock(vlp); TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) { if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) continue; ddvp = ncp->nc_dvp; vs = vget_prep(ddvp); mtx_unlock(vlp); if (vget_finish(ddvp, LK_SHARED | LK_NOWAIT, vs)) return (NULL); return (ddvp); } mtx_unlock(vlp); return (NULL); } int vn_commname(struct vnode *vp, char *buf, u_int buflen) { struct namecache *ncp; struct mtx *vlp; int l; vlp = VP2VNODELOCK(vp); mtx_lock(vlp); TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) if ((ncp->nc_flag & NCF_ISDOTDOT) == 0) break; if (ncp == NULL) { mtx_unlock(vlp); return (ENOENT); } l = min(ncp->nc_nlen, buflen - 1); memcpy(buf, ncp->nc_name, l); mtx_unlock(vlp); buf[l] = '\0'; return (0); } /* * This function updates path string to vnode's full global path * and checks the size of the new path string against the pathlen argument. * * Requires a locked, referenced vnode. * Vnode is re-locked on success or ENODEV, otherwise unlocked. * * If vp is a directory, the call to vn_fullpath_global() always succeeds * because it falls back to the ".." lookup if the namecache lookup fails. */ int vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, u_int pathlen) { struct nameidata nd; struct vnode *vp1; char *rpath, *fbuf; int error; ASSERT_VOP_ELOCKED(vp, __func__); /* Construct global filesystem path from vp. */ VOP_UNLOCK(vp); error = vn_fullpath_global(td, vp, &rpath, &fbuf); if (error != 0) { vrele(vp); return (error); } if (strlen(rpath) >= pathlen) { vrele(vp); error = ENAMETOOLONG; goto out; } /* * Re-lookup the vnode by path to detect a possible rename. * As a side effect, the vnode is relocked. * If vnode was renamed, return ENOENT. */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, path, td); error = namei(&nd); if (error != 0) { vrele(vp); goto out; } NDFREE(&nd, NDF_ONLY_PNBUF); vp1 = nd.ni_vp; vrele(vp); if (vp1 == vp) strcpy(path, rpath); else { vput(vp1); error = ENOENT; } out: free(fbuf, M_TEMP); return (error); } #ifdef DDB static void db_print_vpath(struct vnode *vp) { while (vp != NULL) { db_printf("%p: ", vp); if (vp == rootvnode) { db_printf("/"); vp = NULL; } else { if (vp->v_vflag & VV_ROOT) { db_printf(""); vp = vp->v_mount->mnt_vnodecovered; } else { struct namecache *ncp; char *ncn; int i; ncp = TAILQ_FIRST(&vp->v_cache_dst); if (ncp != NULL) { ncn = ncp->nc_name; for (i = 0; i < ncp->nc_nlen; i++) db_printf("%c", *ncn++); vp = ncp->nc_dvp; } else { vp = NULL; } } } db_printf("\n"); } return; } DB_SHOW_COMMAND(vpath, db_show_vpath) { struct vnode *vp; if (!have_addr) { db_printf("usage: show vpath \n"); return; } vp = (struct vnode *)addr; db_print_vpath(vp); } #endif - -extern uma_zone_t namei_zone; static bool __read_frequently cache_fast_lookup = true; SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_lookup, CTLFLAG_RW, &cache_fast_lookup, 0, ""); #define CACHE_FPL_FAILED -2020 static void cache_fpl_cleanup_cnp(struct componentname *cnp) { uma_zfree(namei_zone, cnp->cn_pnbuf); #ifdef DIAGNOSTIC cnp->cn_pnbuf = NULL; cnp->cn_nameptr = NULL; #endif } static void cache_fpl_handle_root(struct nameidata *ndp, struct vnode **dpp) { struct componentname *cnp; cnp = &ndp->ni_cnd; while (*(cnp->cn_nameptr) == '/') { cnp->cn_nameptr++; ndp->ni_pathlen--; } *dpp = ndp->ni_rootdir; } /* * Components of nameidata (or objects it can point to) which may * need restoring in case fast path lookup fails. */ struct nameidata_saved { long cn_namelen; char *cn_nameptr; size_t ni_pathlen; int cn_flags; }; struct cache_fpl { struct nameidata *ndp; struct componentname *cnp; struct pwd *pwd; struct vnode *dvp; struct vnode *tvp; seqc_t dvp_seqc; seqc_t tvp_seqc; struct nameidata_saved snd; int line; enum cache_fpl_status status:8; bool in_smr; }; static void cache_fpl_checkpoint(struct cache_fpl *fpl, struct nameidata_saved *snd) { snd->cn_flags = fpl->ndp->ni_cnd.cn_flags; snd->cn_namelen = fpl->ndp->ni_cnd.cn_namelen; snd->cn_nameptr = fpl->ndp->ni_cnd.cn_nameptr; snd->ni_pathlen = fpl->ndp->ni_pathlen; } static void cache_fpl_restore(struct cache_fpl *fpl, struct nameidata_saved *snd) { fpl->ndp->ni_cnd.cn_flags = snd->cn_flags; fpl->ndp->ni_cnd.cn_namelen = snd->cn_namelen; fpl->ndp->ni_cnd.cn_nameptr = snd->cn_nameptr; fpl->ndp->ni_pathlen = snd->ni_pathlen; } #ifdef INVARIANTS #define cache_fpl_smr_assert_entered(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ MPASS(_fpl->in_smr == true); \ VFS_SMR_ASSERT_ENTERED(); \ }) #define cache_fpl_smr_assert_not_entered(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ MPASS(_fpl->in_smr == false); \ VFS_SMR_ASSERT_NOT_ENTERED(); \ }) #else #define cache_fpl_smr_assert_entered(fpl) do { } while (0) #define cache_fpl_smr_assert_not_entered(fpl) do { } while (0) #endif #define cache_fpl_smr_enter_initial(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ vfs_smr_enter(); \ _fpl->in_smr = true; \ }) #define cache_fpl_smr_enter(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ MPASS(_fpl->in_smr == false); \ vfs_smr_enter(); \ _fpl->in_smr = true; \ }) #define cache_fpl_smr_exit(fpl) ({ \ struct cache_fpl *_fpl = (fpl); \ MPASS(_fpl->in_smr == true); \ vfs_smr_exit(); \ _fpl->in_smr = false; \ }) static int cache_fpl_aborted_impl(struct cache_fpl *fpl, int line) { if (fpl->status != CACHE_FPL_STATUS_UNSET) { KASSERT(fpl->status == CACHE_FPL_STATUS_PARTIAL, ("%s: converting to abort from %d at %d, set at %d\n", __func__, fpl->status, line, fpl->line)); } fpl->status = CACHE_FPL_STATUS_ABORTED; fpl->line = line; return (CACHE_FPL_FAILED); } #define cache_fpl_aborted(x) cache_fpl_aborted_impl((x), __LINE__) static int cache_fpl_partial_impl(struct cache_fpl *fpl, int line) { KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, ("%s: setting to partial at %d, but already set to %d at %d\n", __func__, line, fpl->status, fpl->line)); cache_fpl_smr_assert_entered(fpl); fpl->status = CACHE_FPL_STATUS_PARTIAL; fpl->line = line; return (CACHE_FPL_FAILED); } #define cache_fpl_partial(x) cache_fpl_partial_impl((x), __LINE__) static int cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line) { KASSERT(fpl->status == CACHE_FPL_STATUS_UNSET, ("%s: setting to handled at %d, but already set to %d at %d\n", __func__, line, fpl->status, fpl->line)); cache_fpl_smr_assert_not_entered(fpl); MPASS(error != CACHE_FPL_FAILED); fpl->status = CACHE_FPL_STATUS_HANDLED; fpl->line = line; return (error); } #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__) #define CACHE_FPL_SUPPORTED_CN_FLAGS \ (LOCKLEAF | LOCKPARENT | WANTPARENT | NOCACHE | FOLLOW | LOCKSHARED | SAVENAME | \ SAVESTART | WILLBEDIR | ISOPEN | NOMACCHECK | AUDITVNODE1 | AUDITVNODE2) #define CACHE_FPL_INTERNAL_CN_FLAGS \ (ISDOTDOT | MAKEENTRY | ISLASTCN) _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, "supported and internal flags overlap"); static bool cache_fpl_islastcn(struct nameidata *ndp) { return (*ndp->ni_next == 0); } static bool cache_fpl_isdotdot(struct componentname *cnp) { if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') return (true); return (false); } static bool cache_can_fplookup(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; struct thread *td; ndp = fpl->ndp; cnp = fpl->cnp; td = cnp->cn_thread; if (!cache_fast_lookup) { cache_fpl_aborted(fpl); return (false); } #ifdef MAC if (mac_vnode_check_lookup_enabled()) { cache_fpl_aborted(fpl); return (false); } #endif if ((cnp->cn_flags & ~CACHE_FPL_SUPPORTED_CN_FLAGS) != 0) { cache_fpl_aborted(fpl); return (false); } if (ndp->ni_dirfd != AT_FDCWD) { cache_fpl_aborted(fpl); return (false); } if (IN_CAPABILITY_MODE(td)) { cache_fpl_aborted(fpl); return (false); } if (AUDITING_TD(td)) { cache_fpl_aborted(fpl); return (false); } if (ndp->ni_startdir != NULL) { cache_fpl_aborted(fpl); return (false); } return (true); } static bool cache_fplookup_vnode_supported(struct vnode *vp) { return (vp->v_type != VLNK); } /* * Move a negative entry to the hot list. * * We have to take locks, but they may be contended and in the worst * case we may need to go off CPU. We don't want to spin within the * smr section and we can't block with it. Instead we are going to * look up the entry again. */ static int __noinline cache_fplookup_negative_promote(struct cache_fpl *fpl, struct namecache *oncp, uint32_t hash) { struct componentname *cnp; struct namecache *ncp; struct neglist *neglist; struct negstate *negstate; struct vnode *dvp; u_char nc_flag; cnp = fpl->cnp; dvp = fpl->dvp; if (!vhold_smr(dvp)) return (cache_fpl_aborted(fpl)); neglist = NCP2NEGLIST(oncp); cache_fpl_smr_exit(fpl); mtx_lock(&ncneg_hot.nl_lock); mtx_lock(&neglist->nl_lock); /* * For hash iteration. */ cache_fpl_smr_enter(fpl); /* * Avoid all surprises by only succeeding if we got the same entry and * bailing completely otherwise. * * In particular at this point there can be a new ncp which matches the * search but hashes to a different neglist. */ CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp == oncp) break; } /* * No match to begin with. */ if (__predict_false(ncp == NULL)) { goto out_abort; } /* * The newly found entry may be something different... */ if (!(ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))) { goto out_abort; } /* * ... and not even negative. */ nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_NEGATIVE) == 0) { goto out_abort; } if (__predict_false(!cache_ncp_canuse(ncp))) { goto out_abort; } negstate = NCP2NEGSTATE(ncp); if ((negstate->neg_flag & NEG_HOT) == 0) { numhotneg++; TAILQ_REMOVE(&neglist->nl_list, ncp, nc_dst); TAILQ_INSERT_TAIL(&ncneg_hot.nl_list, ncp, nc_dst); negstate->neg_flag |= NEG_HOT; } SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); counter_u64_add(numneghits, 1); cache_fpl_smr_exit(fpl); mtx_unlock(&neglist->nl_lock); mtx_unlock(&ncneg_hot.nl_lock); vdrop(dvp); return (cache_fpl_handled(fpl, ENOENT)); out_abort: cache_fpl_smr_exit(fpl); mtx_unlock(&neglist->nl_lock); mtx_unlock(&ncneg_hot.nl_lock); vdrop(dvp); return (cache_fpl_aborted(fpl)); } /* * The target vnode is not supported, prepare for the slow path to take over. */ static int __noinline cache_fplookup_partial_setup(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; enum vgetstate dvs; struct vnode *dvp; struct pwd *pwd; seqc_t dvp_seqc; ndp = fpl->ndp; cnp = fpl->cnp; dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; dvs = vget_prep_smr(dvp); if (__predict_false(dvs == VGET_NONE)) { cache_fpl_smr_exit(fpl); return (cache_fpl_aborted(fpl)); } cache_fpl_smr_exit(fpl); vget_finish_ref(dvp, dvs); if (!vn_seqc_consistent(dvp, dvp_seqc)) { vrele(dvp); return (cache_fpl_aborted(fpl)); } pwd = pwd_hold(curthread); if (fpl->pwd != pwd) { vrele(dvp); pwd_drop(pwd); return (cache_fpl_aborted(fpl)); } cache_fpl_restore(fpl, &fpl->snd); ndp->ni_startdir = dvp; cnp->cn_flags |= MAKEENTRY; if (cache_fpl_islastcn(ndp)) cnp->cn_flags |= ISLASTCN; if (cache_fpl_isdotdot(cnp)) cnp->cn_flags |= ISDOTDOT; return (0); } static int cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) { struct componentname *cnp; struct vnode *tvp; seqc_t tvp_seqc; int error, lkflags; cnp = fpl->cnp; tvp = fpl->tvp; tvp_seqc = fpl->tvp_seqc; if ((cnp->cn_flags & LOCKLEAF) != 0) { lkflags = LK_SHARED; if ((cnp->cn_flags & LOCKSHARED) == 0) lkflags = LK_EXCLUSIVE; error = vget_finish(tvp, lkflags, tvs); if (__predict_false(error != 0)) { return (cache_fpl_aborted(fpl)); } } else { vget_finish_ref(tvp, tvs); } if (!vn_seqc_consistent(tvp, tvp_seqc)) { if ((cnp->cn_flags & LOCKLEAF) != 0) vput(tvp); else vrele(tvp); return (cache_fpl_aborted(fpl)); } return (cache_fpl_handled(fpl, 0)); } /* * They want to possibly modify the state of the namecache. * * Don't try to match the API contract, just leave. * TODO: this leaves scalability on the table */ static int cache_fplookup_final_modifying(struct cache_fpl *fpl) { struct componentname *cnp; cnp = fpl->cnp; MPASS(cnp->cn_nameiop != LOOKUP); return (cache_fpl_partial(fpl)); } static int __noinline cache_fplookup_final_withparent(struct cache_fpl *fpl) { struct componentname *cnp; enum vgetstate dvs, tvs; struct vnode *dvp, *tvp; seqc_t dvp_seqc, tvp_seqc; int error; cnp = fpl->cnp; dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; tvp = fpl->tvp; tvp_seqc = fpl->tvp_seqc; MPASS((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0); /* * This is less efficient than it can be for simplicity. */ dvs = vget_prep_smr(dvp); if (__predict_false(dvs == VGET_NONE)) { return (cache_fpl_aborted(fpl)); } tvs = vget_prep_smr(tvp); if (__predict_false(tvs == VGET_NONE)) { cache_fpl_smr_exit(fpl); vget_abort(dvp, dvs); return (cache_fpl_aborted(fpl)); } cache_fpl_smr_exit(fpl); if ((cnp->cn_flags & LOCKPARENT) != 0) { error = vget_finish(dvp, LK_EXCLUSIVE, dvs); if (__predict_false(error != 0)) { vget_abort(tvp, tvs); return (cache_fpl_aborted(fpl)); } } else { vget_finish_ref(dvp, dvs); } if (!vn_seqc_consistent(dvp, dvp_seqc)) { vget_abort(tvp, tvs); if ((cnp->cn_flags & LOCKPARENT) != 0) vput(dvp); else vrele(dvp); return (cache_fpl_aborted(fpl)); } error = cache_fplookup_final_child(fpl, tvs); if (__predict_false(error != 0)) { MPASS(fpl->status == CACHE_FPL_STATUS_ABORTED); if ((cnp->cn_flags & LOCKPARENT) != 0) vput(dvp); else vrele(dvp); return (error); } MPASS(fpl->status == CACHE_FPL_STATUS_HANDLED); return (0); } static int cache_fplookup_final(struct cache_fpl *fpl) { struct componentname *cnp; enum vgetstate tvs; struct vnode *dvp, *tvp; seqc_t dvp_seqc, tvp_seqc; cnp = fpl->cnp; dvp = fpl->dvp; dvp_seqc = fpl->dvp_seqc; tvp = fpl->tvp; tvp_seqc = fpl->tvp_seqc; VNPASS(cache_fplookup_vnode_supported(dvp), dvp); if (cnp->cn_nameiop != LOOKUP) { return (cache_fplookup_final_modifying(fpl)); } if ((cnp->cn_flags & (LOCKPARENT|WANTPARENT)) != 0) return (cache_fplookup_final_withparent(fpl)); tvs = vget_prep_smr(tvp); if (__predict_false(tvs == VGET_NONE)) { return (cache_fpl_partial(fpl)); } if (!vn_seqc_consistent(dvp, dvp_seqc)) { cache_fpl_smr_exit(fpl); vget_abort(tvp, tvs); return (cache_fpl_aborted(fpl)); } cache_fpl_smr_exit(fpl); return (cache_fplookup_final_child(fpl, tvs)); } static int __noinline cache_fplookup_dot(struct cache_fpl *fpl) { struct vnode *dvp; dvp = fpl->dvp; fpl->tvp = dvp; fpl->tvp_seqc = vn_seqc_read_any(dvp); if (seqc_in_modify(fpl->tvp_seqc)) { return (cache_fpl_aborted(fpl)); } counter_u64_add(dothits, 1); SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", dvp); return (0); } static int __noinline cache_fplookup_dotdot(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; struct namecache *ncp; struct vnode *dvp; struct prison *pr; u_char nc_flag; ndp = fpl->ndp; cnp = fpl->cnp; dvp = fpl->dvp; /* * XXX this is racy the same way regular lookup is */ for (pr = cnp->cn_cred->cr_prison; pr != NULL; pr = pr->pr_parent) if (dvp == pr->pr_root) break; if (dvp == ndp->ni_rootdir || dvp == ndp->ni_topdir || dvp == rootvnode || pr != NULL) { fpl->tvp = dvp; fpl->tvp_seqc = vn_seqc_read_any(dvp); if (seqc_in_modify(fpl->tvp_seqc)) { return (cache_fpl_aborted(fpl)); } return (0); } if ((dvp->v_vflag & VV_ROOT) != 0) { /* * TODO * The opposite of climb mount is needed here. */ return (cache_fpl_aborted(fpl)); } ncp = atomic_load_ptr(&dvp->v_cache_dd); if (ncp == NULL) { return (cache_fpl_aborted(fpl)); } nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_ISDOTDOT) != 0) { if ((nc_flag & NCF_NEGATIVE) != 0) return (cache_fpl_aborted(fpl)); fpl->tvp = ncp->nc_vp; } else { fpl->tvp = ncp->nc_dvp; } if (__predict_false(!cache_ncp_canuse(ncp))) { return (cache_fpl_aborted(fpl)); } fpl->tvp_seqc = vn_seqc_read_any(fpl->tvp); if (seqc_in_modify(fpl->tvp_seqc)) { return (cache_fpl_partial(fpl)); } counter_u64_add(dotdothits, 1); return (0); } static int cache_fplookup_next(struct cache_fpl *fpl) { struct componentname *cnp; struct namecache *ncp; struct negstate *negstate; struct vnode *dvp, *tvp; u_char nc_flag; uint32_t hash; bool neg_hot; cnp = fpl->cnp; dvp = fpl->dvp; if (__predict_false(cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.')) { return (cache_fplookup_dot(fpl)); } hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); CK_SLIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) break; } /* * If there is no entry we have to punt to the slow path to perform * actual lookup. Should there be nothing with this name a negative * entry will be created. */ if (__predict_false(ncp == NULL)) { return (cache_fpl_partial(fpl)); } tvp = atomic_load_ptr(&ncp->nc_vp); nc_flag = atomic_load_char(&ncp->nc_flag); if ((nc_flag & NCF_NEGATIVE) != 0) { /* * If they want to create an entry we need to replace this one. */ if (__predict_false(fpl->cnp->cn_nameiop != LOOKUP)) { return (cache_fpl_partial(fpl)); } negstate = NCP2NEGSTATE(ncp); neg_hot = ((negstate->neg_flag & NEG_HOT) != 0); if (__predict_false(!cache_ncp_canuse(ncp))) { return (cache_fpl_partial(fpl)); } if (__predict_false((nc_flag & NCF_WHITE) != 0)) { return (cache_fpl_partial(fpl)); } if (!neg_hot) { return (cache_fplookup_negative_promote(fpl, ncp, hash)); } SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); counter_u64_add(numneghits, 1); cache_fpl_smr_exit(fpl); return (cache_fpl_handled(fpl, ENOENT)); } if (__predict_false(!cache_ncp_canuse(ncp))) { return (cache_fpl_partial(fpl)); } fpl->tvp = tvp; fpl->tvp_seqc = vn_seqc_read_any(tvp); if (seqc_in_modify(fpl->tvp_seqc)) { return (cache_fpl_partial(fpl)); } if (!cache_fplookup_vnode_supported(tvp)) { return (cache_fpl_partial(fpl)); } counter_u64_add(numposhits, 1); SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ncp->nc_name, tvp); return (0); } static bool cache_fplookup_mp_supported(struct mount *mp) { if (mp == NULL) return (false); if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) == 0) return (false); return (true); } /* * Walk up the mount stack (if any). * * Correctness is provided in the following ways: * - all vnodes are protected from freeing with SMR * - struct mount objects are type stable making them always safe to access * - stability of the particular mount is provided by busying it * - relationship between the vnode which is mounted on and the mount is * verified with the vnode sequence counter after busying * - association between root vnode of the mount and the mount is protected * by busy * * From that point on we can read the sequence counter of the root vnode * and get the next mount on the stack (if any) using the same protection. * * By the end of successful walk we are guaranteed the reached state was * indeed present at least at some point which matches the regular lookup. */ static int __noinline cache_fplookup_climb_mount(struct cache_fpl *fpl) { struct mount *mp, *prev_mp; struct vnode *vp; seqc_t vp_seqc; vp = fpl->tvp; vp_seqc = fpl->tvp_seqc; VNPASS(vp->v_type == VDIR || vp->v_type == VBAD, vp); mp = atomic_load_ptr(&vp->v_mountedhere); if (mp == NULL) return (0); prev_mp = NULL; for (;;) { if (!vfs_op_thread_enter_crit(mp)) { if (prev_mp != NULL) vfs_op_thread_exit_crit(prev_mp); return (cache_fpl_partial(fpl)); } if (prev_mp != NULL) vfs_op_thread_exit_crit(prev_mp); if (!vn_seqc_consistent(vp, vp_seqc)) { vfs_op_thread_exit_crit(mp); return (cache_fpl_partial(fpl)); } if (!cache_fplookup_mp_supported(mp)) { vfs_op_thread_exit_crit(mp); return (cache_fpl_partial(fpl)); } vp = atomic_load_ptr(&mp->mnt_rootvnode); if (vp == NULL || VN_IS_DOOMED(vp)) { vfs_op_thread_exit_crit(mp); return (cache_fpl_partial(fpl)); } vp_seqc = vn_seqc_read_any(vp); if (seqc_in_modify(vp_seqc)) { vfs_op_thread_exit_crit(mp); return (cache_fpl_partial(fpl)); } prev_mp = mp; mp = atomic_load_ptr(&vp->v_mountedhere); if (mp == NULL) break; } vfs_op_thread_exit_crit(prev_mp); fpl->tvp = vp; fpl->tvp_seqc = vp_seqc; return (0); } static bool cache_fplookup_need_climb_mount(struct cache_fpl *fpl) { struct mount *mp; struct vnode *vp; vp = fpl->tvp; /* * Hack: while this is a union, the pointer tends to be NULL so save on * a branch. */ mp = atomic_load_ptr(&vp->v_mountedhere); if (mp == NULL) return (false); if (vp->v_type == VDIR) return (true); return (false); } /* * Parse the path. * * The code is mostly copy-pasted from regular lookup, see lookup(). * The structure is maintained along with comments for easier maintenance. * Deduplicating the code will become feasible after fast path lookup * becomes more feature-complete. */ static int cache_fplookup_parse(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; char *cp; ndp = fpl->ndp; cnp = fpl->cnp; /* * Search a new directory. * * The last component of the filename is left accessible via * cnp->cn_nameptr for callers that need the name. Callers needing * the name set the SAVENAME flag. When done, they assume * responsibility for freeing the pathname buffer. */ for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) continue; cnp->cn_namelen = cp - cnp->cn_nameptr; if (__predict_false(cnp->cn_namelen > NAME_MAX)) { cache_fpl_smr_exit(fpl); return (cache_fpl_handled(fpl, ENAMETOOLONG)); } ndp->ni_pathlen -= cnp->cn_namelen; KASSERT(ndp->ni_pathlen <= PATH_MAX, ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen)); ndp->ni_next = cp; /* * Replace multiple slashes by a single slash and trailing slashes * by a null. This must be done before VOP_LOOKUP() because some * fs's don't know about trailing slashes. Remember if there were * trailing slashes to handle symlinks, existing non-directories * and non-existing files that won't be directories specially later. */ while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { cp++; ndp->ni_pathlen--; if (*cp == '\0') { /* * TODO * Regular lookup performs the following: * *ndp->ni_next = '\0'; * cnp->cn_flags |= TRAILINGSLASH; * * Which is problematic since it modifies data read * from userspace. Then if fast path lookup was to * abort we would have to either restore it or convey * the flag. Since this is a corner case just ignore * it for simplicity. */ return (cache_fpl_partial(fpl)); } } ndp->ni_next = cp; /* * Check for degenerate name (e.g. / or "") * which is a way of talking about a directory, * e.g. like "/." or ".". * * TODO * Another corner case handled by the regular lookup */ if (__predict_false(cnp->cn_nameptr[0] == '\0')) { return (cache_fpl_partial(fpl)); } return (0); } static void cache_fplookup_parse_advance(struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; ndp = fpl->ndp; cnp = fpl->cnp; cnp->cn_nameptr = ndp->ni_next; while (*cnp->cn_nameptr == '/') { cnp->cn_nameptr++; ndp->ni_pathlen--; } } static int __noinline cache_fplookup_failed_vexec(struct cache_fpl *fpl, int error) { switch (error) { case EAGAIN: /* * Can happen when racing against vgone. * */ case EOPNOTSUPP: cache_fpl_partial(fpl); break; default: /* * See the API contract for VOP_FPLOOKUP_VEXEC. */ if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { error = cache_fpl_aborted(fpl); } else { cache_fpl_smr_exit(fpl); cache_fpl_handled(fpl, error); } break; } return (error); } static int cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) { struct nameidata *ndp; struct componentname *cnp; struct mount *mp; int error; error = CACHE_FPL_FAILED; ndp = fpl->ndp; cnp = fpl->cnp; cache_fpl_checkpoint(fpl, &fpl->snd); fpl->dvp = dvp; fpl->dvp_seqc = vn_seqc_read_any(fpl->dvp); if (seqc_in_modify(fpl->dvp_seqc)) { cache_fpl_aborted(fpl); goto out; } mp = atomic_load_ptr(&fpl->dvp->v_mount); if (!cache_fplookup_mp_supported(mp)) { cache_fpl_aborted(fpl); goto out; } VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); for (;;) { error = cache_fplookup_parse(fpl); if (__predict_false(error != 0)) { break; } VNPASS(cache_fplookup_vnode_supported(fpl->dvp), fpl->dvp); error = VOP_FPLOOKUP_VEXEC(fpl->dvp, cnp->cn_cred); if (__predict_false(error != 0)) { error = cache_fplookup_failed_vexec(fpl, error); break; } if (__predict_false(cache_fpl_isdotdot(cnp))) { error = cache_fplookup_dotdot(fpl); if (__predict_false(error != 0)) { break; } } else { error = cache_fplookup_next(fpl); if (__predict_false(error != 0)) { break; } VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); if (cache_fplookup_need_climb_mount(fpl)) { error = cache_fplookup_climb_mount(fpl); if (__predict_false(error != 0)) { break; } } } VNPASS(!seqc_in_modify(fpl->tvp_seqc), fpl->tvp); if (cache_fpl_islastcn(ndp)) { error = cache_fplookup_final(fpl); break; } if (!vn_seqc_consistent(fpl->dvp, fpl->dvp_seqc)) { error = cache_fpl_aborted(fpl); break; } fpl->dvp = fpl->tvp; fpl->dvp_seqc = fpl->tvp_seqc; cache_fplookup_parse_advance(fpl); cache_fpl_checkpoint(fpl, &fpl->snd); } out: switch (fpl->status) { case CACHE_FPL_STATUS_UNSET: __assert_unreachable(); break; case CACHE_FPL_STATUS_PARTIAL: cache_fpl_smr_assert_entered(fpl); return (cache_fplookup_partial_setup(fpl)); case CACHE_FPL_STATUS_ABORTED: if (fpl->in_smr) cache_fpl_smr_exit(fpl); return (CACHE_FPL_FAILED); case CACHE_FPL_STATUS_HANDLED: MPASS(error != CACHE_FPL_FAILED); cache_fpl_smr_assert_not_entered(fpl); if (__predict_false(error != 0)) { ndp->ni_dvp = NULL; ndp->ni_vp = NULL; cache_fpl_cleanup_cnp(cnp); return (error); } ndp->ni_dvp = fpl->dvp; ndp->ni_vp = fpl->tvp; if (cnp->cn_flags & SAVENAME) cnp->cn_flags |= HASBUF; else cache_fpl_cleanup_cnp(cnp); return (error); } } /* * Fast path lookup protected with SMR and sequence counters. * * Note: all VOP_FPLOOKUP_VEXEC routines have a comment referencing this one. * * Filesystems can opt in by setting the MNTK_FPLOOKUP flag and meeting criteria * outlined below. * * Traditional vnode lookup conceptually looks like this: * * vn_lock(current); * for (;;) { * next = find(); * vn_lock(next); * vn_unlock(current); * current = next; * if (last) * break; * } * return (current); * * Each jump to the next vnode is safe memory-wise and atomic with respect to * any modifications thanks to holding respective locks. * * The same guarantee can be provided with a combination of safe memory * reclamation and sequence counters instead. If all operations which affect * the relationship between the current vnode and the one we are looking for * also modify the counter, we can verify whether all the conditions held as * we made the jump. This includes things like permissions, mount points etc. * Counter modification is provided by enclosing relevant places in * vn_seqc_write_begin()/end() calls. * * Thus this translates to: * * vfs_smr_enter(); * dvp_seqc = seqc_read_any(dvp); * if (seqc_in_modify(dvp_seqc)) // someone is altering the vnode * abort(); * for (;;) { * tvp = find(); * tvp_seqc = seqc_read_any(tvp); * if (seqc_in_modify(tvp_seqc)) // someone is altering the target vnode * abort(); * if (!seqc_consistent(dvp, dvp_seqc) // someone is altering the vnode * abort(); * dvp = tvp; // we know nothing of importance has changed * dvp_seqc = tvp_seqc; // store the counter for the tvp iteration * if (last) * break; * } * vget(); // secure the vnode * if (!seqc_consistent(tvp, tvp_seqc) // final check * abort(); * // at this point we know nothing has changed for any parent<->child pair * // as they were crossed during the lookup, meaning we matched the guarantee * // of the locked variant * return (tvp); * * The API contract for VOP_FPLOOKUP_VEXEC routines is as follows: * - they are called while within vfs_smr protection which they must never exit * - EAGAIN can be returned to denote checking could not be performed, it is * always valid to return it * - if the sequence counter has not changed the result must be valid * - if the sequence counter has changed both false positives and false negatives * are permitted (since the result will be rejected later) * - for simple cases of unix permission checks vaccess_vexec_smr can be used * * Caveats to watch out for: * - vnodes are passed unlocked and unreferenced with nothing stopping * VOP_RECLAIM, in turn meaning that ->v_data can become NULL. It is advised * to use atomic_load_ptr to fetch it. * - the aforementioned object can also get freed, meaning absent other means it * should be protected with vfs_smr * - either safely checking permissions as they are modified or guaranteeing * their stability is left to the routine */ int cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status, struct pwd **pwdp) { struct cache_fpl fpl; struct pwd *pwd; struct vnode *dvp; struct componentname *cnp; struct nameidata_saved orig; int error; MPASS(ndp->ni_lcf == 0); fpl.status = CACHE_FPL_STATUS_UNSET; fpl.ndp = ndp; fpl.cnp = &ndp->ni_cnd; MPASS(curthread == fpl.cnp->cn_thread); if ((fpl.cnp->cn_flags & SAVESTART) != 0) MPASS(fpl.cnp->cn_nameiop != LOOKUP); if (!cache_can_fplookup(&fpl)) { SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); *status = fpl.status; return (EOPNOTSUPP); } cache_fpl_checkpoint(&fpl, &orig); cache_fpl_smr_enter_initial(&fpl); pwd = pwd_get_smr(); fpl.pwd = pwd; ndp->ni_rootdir = pwd->pwd_rdir; ndp->ni_topdir = pwd->pwd_jdir; cnp = fpl.cnp; cnp->cn_nameptr = cnp->cn_pnbuf; if (cnp->cn_pnbuf[0] == '/') { cache_fpl_handle_root(ndp, &dvp); } else { MPASS(ndp->ni_dirfd == AT_FDCWD); dvp = pwd->pwd_cdir; } SDT_PROBE4(vfs, namei, lookup, entry, dvp, cnp->cn_pnbuf, cnp->cn_flags, true); error = cache_fplookup_impl(dvp, &fpl); cache_fpl_smr_assert_not_entered(&fpl); SDT_PROBE3(vfs, fplookup, lookup, done, ndp, fpl.line, fpl.status); *status = fpl.status; switch (fpl.status) { case CACHE_FPL_STATUS_UNSET: __assert_unreachable(); break; case CACHE_FPL_STATUS_HANDLED: SDT_PROBE3(vfs, namei, lookup, return, error, (error == 0 ? ndp->ni_vp : NULL), true); break; case CACHE_FPL_STATUS_PARTIAL: *pwdp = fpl.pwd; /* * Status restored by cache_fplookup_partial_setup. */ break; case CACHE_FPL_STATUS_ABORTED: cache_fpl_restore(&fpl, &orig); break; } return (error); } Index: projects/clang1100-import/sys/kern/vfs_default.c =================================================================== --- projects/clang1100-import/sys/kern/vfs_default.c (revision 364278) +++ projects/clang1100-import/sys/kern/vfs_default.c (revision 364279) @@ -1,1576 +1,1576 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed * to Berkeley by John Heidemann of the UCLA Ficus project. * * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int vop_nolookup(struct vop_lookup_args *); static int vop_norename(struct vop_rename_args *); static int vop_nostrategy(struct vop_strategy_args *); static int get_next_dirent(struct vnode *vp, struct dirent **dpp, char *dirbuf, int dirbuflen, off_t *off, char **cpos, int *len, int *eofflag, struct thread *td); static int dirent_exists(struct vnode *vp, const char *dirname, struct thread *td); #define DIRENT_MINSIZE (sizeof(struct dirent) - (MAXNAMLEN+1) + 4) static int vop_stdis_text(struct vop_is_text_args *ap); static int vop_stdunset_text(struct vop_unset_text_args *ap); static int vop_stdadd_writecount(struct vop_add_writecount_args *ap); static int vop_stdcopy_file_range(struct vop_copy_file_range_args *ap); static int vop_stdfdatasync(struct vop_fdatasync_args *ap); static int vop_stdgetpages_async(struct vop_getpages_async_args *ap); static int vop_stdstat(struct vop_stat_args *ap); /* * This vnode table stores what we want to do if the filesystem doesn't * implement a particular VOP. * * If there is no specific entry here, we will return EOPNOTSUPP. * * Note that every filesystem has to implement either vop_access * or vop_accessx; failing to do so will result in immediate crash * due to stack overflow, as vop_stdaccess() calls vop_stdaccessx(), * which calls vop_stdaccess() etc. */ struct vop_vector default_vnodeops = { .vop_default = NULL, .vop_bypass = VOP_EOPNOTSUPP, .vop_access = vop_stdaccess, .vop_accessx = vop_stdaccessx, .vop_advise = vop_stdadvise, .vop_advlock = vop_stdadvlock, .vop_advlockasync = vop_stdadvlockasync, .vop_advlockpurge = vop_stdadvlockpurge, .vop_allocate = vop_stdallocate, .vop_bmap = vop_stdbmap, .vop_close = VOP_NULL, .vop_fsync = VOP_NULL, .vop_stat = vop_stdstat, .vop_fdatasync = vop_stdfdatasync, .vop_getpages = vop_stdgetpages, .vop_getpages_async = vop_stdgetpages_async, .vop_getwritemount = vop_stdgetwritemount, .vop_inactive = VOP_NULL, .vop_need_inactive = vop_stdneed_inactive, .vop_ioctl = vop_stdioctl, .vop_kqfilter = vop_stdkqfilter, .vop_islocked = vop_stdislocked, .vop_lock1 = vop_stdlock, .vop_lookup = vop_nolookup, .vop_open = VOP_NULL, .vop_pathconf = VOP_EINVAL, .vop_poll = vop_nopoll, .vop_putpages = vop_stdputpages, .vop_readlink = VOP_EINVAL, .vop_rename = vop_norename, .vop_revoke = VOP_PANIC, .vop_strategy = vop_nostrategy, .vop_unlock = vop_stdunlock, .vop_vptocnp = vop_stdvptocnp, .vop_vptofh = vop_stdvptofh, .vop_unp_bind = vop_stdunp_bind, .vop_unp_connect = vop_stdunp_connect, .vop_unp_detach = vop_stdunp_detach, .vop_is_text = vop_stdis_text, .vop_set_text = vop_stdset_text, .vop_unset_text = vop_stdunset_text, .vop_add_writecount = vop_stdadd_writecount, .vop_copy_file_range = vop_stdcopy_file_range, }; VFS_VOP_VECTOR_REGISTER(default_vnodeops); /* * Series of placeholder functions for various error returns for * VOPs. */ int vop_eopnotsupp(struct vop_generic_args *ap) { /* printf("vop_notsupp[%s]\n", ap->a_desc->vdesc_name); */ return (EOPNOTSUPP); } int vop_ebadf(struct vop_generic_args *ap) { return (EBADF); } int vop_enotty(struct vop_generic_args *ap) { return (ENOTTY); } int vop_einval(struct vop_generic_args *ap) { return (EINVAL); } int vop_enoent(struct vop_generic_args *ap) { return (ENOENT); } int vop_null(struct vop_generic_args *ap) { return (0); } /* * Helper function to panic on some bad VOPs in some filesystems. */ int vop_panic(struct vop_generic_args *ap) { panic("filesystem goof: vop_panic[%s]", ap->a_desc->vdesc_name); } /* * vop_std and vop_no are default functions for use by * filesystems that need the "default reasonable" implementation for a * particular operation. * * The documentation for the operations they implement exists (if it exists) * in the VOP_(9) manpage (all uppercase). */ /* * Default vop for filesystems that do not support name lookup */ static int vop_nolookup(ap) struct vop_lookup_args /* { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; } */ *ap; { *ap->a_vpp = NULL; return (ENOTDIR); } /* * vop_norename: * * Handle unlock and reference counting for arguments of vop_rename * for filesystems that do not implement rename operation. */ static int vop_norename(struct vop_rename_args *ap) { vop_rename_fail(ap); return (EOPNOTSUPP); } /* * vop_nostrategy: * * Strategy routine for VFS devices that have none. * * BIO_ERROR and B_INVAL must be cleared prior to calling any strategy * routine. Typically this is done for a BIO_READ strategy call. * Typically B_INVAL is assumed to already be clear prior to a write * and should not be cleared manually unless you just made the buffer * invalid. BIO_ERROR should be cleared either way. */ static int vop_nostrategy (struct vop_strategy_args *ap) { printf("No strategy for buffer at %p\n", ap->a_bp); vn_printf(ap->a_vp, "vnode "); ap->a_bp->b_ioflags |= BIO_ERROR; ap->a_bp->b_error = EOPNOTSUPP; bufdone(ap->a_bp); return (EOPNOTSUPP); } static int get_next_dirent(struct vnode *vp, struct dirent **dpp, char *dirbuf, int dirbuflen, off_t *off, char **cpos, int *len, int *eofflag, struct thread *td) { int error, reclen; struct uio uio; struct iovec iov; struct dirent *dp; KASSERT(VOP_ISLOCKED(vp), ("vp %p is not locked", vp)); KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp)); if (*len == 0) { iov.iov_base = dirbuf; iov.iov_len = dirbuflen; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = *off; uio.uio_resid = dirbuflen; uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = td; *eofflag = 0; #ifdef MAC error = mac_vnode_check_readdir(td->td_ucred, vp); if (error == 0) #endif error = VOP_READDIR(vp, &uio, td->td_ucred, eofflag, NULL, NULL); if (error) return (error); *off = uio.uio_offset; *cpos = dirbuf; *len = (dirbuflen - uio.uio_resid); if (*len == 0) return (ENOENT); } dp = (struct dirent *)(*cpos); reclen = dp->d_reclen; *dpp = dp; /* check for malformed directory.. */ if (reclen < DIRENT_MINSIZE) return (EINVAL); *cpos += reclen; *len -= reclen; return (0); } /* * Check if a named file exists in a given directory vnode. */ static int dirent_exists(struct vnode *vp, const char *dirname, struct thread *td) { char *dirbuf, *cpos; int error, eofflag, dirbuflen, len, found; off_t off; struct dirent *dp; struct vattr va; KASSERT(VOP_ISLOCKED(vp), ("vp %p is not locked", vp)); KASSERT(vp->v_type == VDIR, ("vp %p is not a directory", vp)); found = 0; error = VOP_GETATTR(vp, &va, td->td_ucred); if (error) return (found); dirbuflen = DEV_BSIZE; if (dirbuflen < va.va_blocksize) dirbuflen = va.va_blocksize; dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK); off = 0; len = 0; do { error = get_next_dirent(vp, &dp, dirbuf, dirbuflen, &off, &cpos, &len, &eofflag, td); if (error) goto out; if (dp->d_type != DT_WHT && dp->d_fileno != 0 && strcmp(dp->d_name, dirname) == 0) { found = 1; goto out; } } while (len > 0 || !eofflag); out: free(dirbuf, M_TEMP); return (found); } int vop_stdaccess(struct vop_access_args *ap) { KASSERT((ap->a_accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, ("invalid bit in accmode")); return (VOP_ACCESSX(ap->a_vp, ap->a_accmode, ap->a_cred, ap->a_td)); } int vop_stdaccessx(struct vop_accessx_args *ap) { int error; accmode_t accmode = ap->a_accmode; error = vfs_unixify_accmode(&accmode); if (error != 0) return (error); if (accmode == 0) return (0); return (VOP_ACCESS(ap->a_vp, accmode, ap->a_cred, ap->a_td)); } /* * Advisory record locking support */ int vop_stdadvlock(struct vop_advlock_args *ap) { struct vnode *vp; struct vattr vattr; int error; vp = ap->a_vp; if (ap->a_fl->l_whence == SEEK_END) { /* * The NFSv4 server must avoid doing a vn_lock() here, since it * can deadlock the nfsd threads, due to a LOR. Fortunately * the NFSv4 server always uses SEEK_SET and this code is * only required for the SEEK_END case. */ vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &vattr, curthread->td_ucred); VOP_UNLOCK(vp); if (error) return (error); } else vattr.va_size = 0; return (lf_advlock(ap, &(vp->v_lockf), vattr.va_size)); } int vop_stdadvlockasync(struct vop_advlockasync_args *ap) { struct vnode *vp; struct vattr vattr; int error; vp = ap->a_vp; if (ap->a_fl->l_whence == SEEK_END) { /* The size argument is only needed for SEEK_END. */ vn_lock(vp, LK_SHARED | LK_RETRY); error = VOP_GETATTR(vp, &vattr, curthread->td_ucred); VOP_UNLOCK(vp); if (error) return (error); } else vattr.va_size = 0; return (lf_advlockasync(ap, &(vp->v_lockf), vattr.va_size)); } int vop_stdadvlockpurge(struct vop_advlockpurge_args *ap) { struct vnode *vp; vp = ap->a_vp; lf_purgelocks(vp, &vp->v_lockf); return (0); } /* * vop_stdpathconf: * * Standard implementation of POSIX pathconf, to get information about limits * for a filesystem. * Override per filesystem for the case where the filesystem has smaller * limits. */ int vop_stdpathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_ASYNC_IO: *ap->a_retval = _POSIX_ASYNCHRONOUS_IO; return (0); case _PC_PATH_MAX: *ap->a_retval = PATH_MAX; return (0); case _PC_ACL_EXTENDED: case _PC_ACL_NFS4: case _PC_CAP_PRESENT: case _PC_INF_PRESENT: case _PC_MAC_PRESENT: *ap->a_retval = 0; return (0); default: return (EINVAL); } /* NOTREACHED */ } /* * Standard lock, unlock and islocked functions. */ int vop_stdlock(ap) struct vop_lock1_args /* { struct vnode *a_vp; int a_flags; char *file; int line; } */ *ap; { struct vnode *vp = ap->a_vp; struct mtx *ilk; ilk = VI_MTX(vp); return (lockmgr_lock_flags(vp->v_vnlock, ap->a_flags, &ilk->lock_object, ap->a_file, ap->a_line)); } /* See above. */ int vop_stdunlock(ap) struct vop_unlock_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; return (lockmgr_unlock(vp->v_vnlock)); } /* See above. */ int vop_stdislocked(ap) struct vop_islocked_args /* { struct vnode *a_vp; } */ *ap; { return (lockstatus(ap->a_vp->v_vnlock)); } /* * Variants of the above set. * * Differences are: * - shared locking disablement is not supported * - v_vnlock pointer is not honored */ int vop_lock(ap) struct vop_lock1_args /* { struct vnode *a_vp; int a_flags; char *file; int line; } */ *ap; { struct vnode *vp = ap->a_vp; int flags = ap->a_flags; struct mtx *ilk; MPASS(vp->v_vnlock == &vp->v_lock); if (__predict_false((flags & ~(LK_TYPE_MASK | LK_NODDLKTREAT | LK_RETRY)) != 0)) goto other; switch (flags & LK_TYPE_MASK) { case LK_SHARED: return (lockmgr_slock(&vp->v_lock, flags, ap->a_file, ap->a_line)); case LK_EXCLUSIVE: return (lockmgr_xlock(&vp->v_lock, flags, ap->a_file, ap->a_line)); } other: ilk = VI_MTX(vp); return (lockmgr_lock_flags(&vp->v_lock, flags, &ilk->lock_object, ap->a_file, ap->a_line)); } int vop_unlock(ap) struct vop_unlock_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; MPASS(vp->v_vnlock == &vp->v_lock); return (lockmgr_unlock(&vp->v_lock)); } int vop_islocked(ap) struct vop_islocked_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; MPASS(vp->v_vnlock == &vp->v_lock); return (lockstatus(&vp->v_lock)); } /* * Return true for select/poll. */ int vop_nopoll(ap) struct vop_poll_args /* { struct vnode *a_vp; int a_events; struct ucred *a_cred; struct thread *a_td; } */ *ap; { if (ap->a_events & ~POLLSTANDARD) return (POLLNVAL); return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); } /* * Implement poll for local filesystems that support it. */ int vop_stdpoll(ap) struct vop_poll_args /* { struct vnode *a_vp; int a_events; struct ucred *a_cred; struct thread *a_td; } */ *ap; { if (ap->a_events & ~POLLSTANDARD) return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events)); return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); } /* * Return our mount point, as we will take charge of the writes. */ int vop_stdgetwritemount(ap) struct vop_getwritemount_args /* { struct vnode *a_vp; struct mount **a_mpp; } */ *ap; { struct mount *mp; struct vnode *vp; /* * Note that having a reference does not prevent forced unmount from * setting ->v_mount to NULL after the lock gets released. This is of * no consequence for typical consumers (most notably vn_start_write) * since in this case the vnode is VIRF_DOOMED. Unmount might have * progressed far enough that its completion is only delayed by the * reference obtained here. The consumer only needs to concern itself * with releasing it. */ vp = ap->a_vp; mp = vp->v_mount; if (mp == NULL) { *(ap->a_mpp) = NULL; return (0); } if (vfs_op_thread_enter(mp)) { if (mp == vp->v_mount) { vfs_mp_count_add_pcpu(mp, ref, 1); vfs_op_thread_exit(mp); } else { vfs_op_thread_exit(mp); mp = NULL; } } else { MNT_ILOCK(mp); if (mp == vp->v_mount) { MNT_REF(mp); MNT_IUNLOCK(mp); } else { MNT_IUNLOCK(mp); mp = NULL; } } *(ap->a_mpp) = mp; return (0); } /* * If the file system doesn't implement VOP_BMAP, then return sensible defaults: * - Return the vnode's bufobj instead of any underlying device's bufobj * - Calculate the physical block number as if there were equal size * consecutive blocks, but * - Report no contiguous runs of blocks. */ int vop_stdbmap(ap) struct vop_bmap_args /* { struct vnode *a_vp; daddr_t a_bn; struct bufobj **a_bop; daddr_t *a_bnp; int *a_runp; int *a_runb; } */ *ap; { if (ap->a_bop != NULL) *ap->a_bop = &ap->a_vp->v_bufobj; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn * btodb(ap->a_vp->v_mount->mnt_stat.f_iosize); if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; return (0); } int vop_stdfsync(ap) struct vop_fsync_args /* { struct vnode *a_vp; int a_waitfor; struct thread *a_td; } */ *ap; { return (vn_fsync_buf(ap->a_vp, ap->a_waitfor)); } static int vop_stdfdatasync(struct vop_fdatasync_args *ap) { return (VOP_FSYNC(ap->a_vp, MNT_WAIT, ap->a_td)); } int vop_stdfdatasync_buf(struct vop_fdatasync_args *ap) { return (vn_fsync_buf(ap->a_vp, MNT_WAIT)); } /* XXX Needs good comment and more info in the manpage (VOP_GETPAGES(9)). */ int vop_stdgetpages(ap) struct vop_getpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int *a_rbehind; int *a_rahead; } */ *ap; { return vnode_pager_generic_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead, NULL, NULL); } static int vop_stdgetpages_async(struct vop_getpages_async_args *ap) { int error; error = VOP_GETPAGES(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead); if (ap->a_iodone != NULL) ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error); return (error); } int vop_stdkqfilter(struct vop_kqfilter_args *ap) { return vfs_kqfilter(ap); } /* XXX Needs good comment and more info in the manpage (VOP_PUTPAGES(9)). */ int vop_stdputpages(ap) struct vop_putpages_args /* { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_sync; int *a_rtvals; } */ *ap; { return vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, ap->a_rtvals); } int vop_stdvptofh(struct vop_vptofh_args *ap) { return (EOPNOTSUPP); } int vop_stdvptocnp(struct vop_vptocnp_args *ap) { struct vnode *vp = ap->a_vp; struct vnode **dvp = ap->a_vpp; struct ucred *cred = ap->a_cred; char *buf = ap->a_buf; size_t *buflen = ap->a_buflen; char *dirbuf, *cpos; int i, error, eofflag, dirbuflen, flags, locked, len, covered; off_t off; ino_t fileno; struct vattr va; struct nameidata nd; struct thread *td; struct dirent *dp; struct vnode *mvp; i = *buflen; error = 0; covered = 0; td = curthread; if (vp->v_type != VDIR) return (ENOENT); error = VOP_GETATTR(vp, &va, cred); if (error) return (error); VREF(vp); locked = VOP_ISLOCKED(vp); VOP_UNLOCK(vp); NDINIT_ATVP(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF, UIO_SYSSPACE, "..", vp, td); flags = FREAD; error = vn_open_cred(&nd, &flags, 0, VN_OPEN_NOAUDIT, cred, NULL); if (error) { vn_lock(vp, locked | LK_RETRY); return (error); } NDFREE(&nd, NDF_ONLY_PNBUF); mvp = *dvp = nd.ni_vp; if (vp->v_mount != (*dvp)->v_mount && ((*dvp)->v_vflag & VV_ROOT) && ((*dvp)->v_mount->mnt_flag & MNT_UNION)) { *dvp = (*dvp)->v_mount->mnt_vnodecovered; VREF(mvp); VOP_UNLOCK(mvp); vn_close(mvp, FREAD, cred, td); VREF(*dvp); vn_lock(*dvp, LK_SHARED | LK_RETRY); covered = 1; } fileno = va.va_fileid; dirbuflen = DEV_BSIZE; if (dirbuflen < va.va_blocksize) dirbuflen = va.va_blocksize; dirbuf = (char *)malloc(dirbuflen, M_TEMP, M_WAITOK); if ((*dvp)->v_type != VDIR) { error = ENOENT; goto out; } off = 0; len = 0; do { /* call VOP_READDIR of parent */ error = get_next_dirent(*dvp, &dp, dirbuf, dirbuflen, &off, &cpos, &len, &eofflag, td); if (error) goto out; if ((dp->d_type != DT_WHT) && (dp->d_fileno == fileno)) { if (covered) { VOP_UNLOCK(*dvp); vn_lock(mvp, LK_SHARED | LK_RETRY); if (dirent_exists(mvp, dp->d_name, td)) { error = ENOENT; VOP_UNLOCK(mvp); vn_lock(*dvp, LK_SHARED | LK_RETRY); goto out; } VOP_UNLOCK(mvp); vn_lock(*dvp, LK_SHARED | LK_RETRY); } i -= dp->d_namlen; if (i < 0) { error = ENOMEM; goto out; } if (dp->d_namlen == 1 && dp->d_name[0] == '.') { error = ENOENT; } else { bcopy(dp->d_name, buf + i, dp->d_namlen); error = 0; } goto out; } } while (len > 0 || !eofflag); error = ENOENT; out: free(dirbuf, M_TEMP); if (!error) { *buflen = i; vref(*dvp); } if (covered) { vput(*dvp); vrele(mvp); } else { VOP_UNLOCK(mvp); vn_close(mvp, FREAD, cred, td); } vn_lock(vp, locked | LK_RETRY); return (error); } int vop_stdallocate(struct vop_allocate_args *ap) { #ifdef __notyet__ struct statfs *sfs; off_t maxfilesize = 0; #endif struct iovec aiov; struct vattr vattr, *vap; struct uio auio; off_t fsize, len, cur, offset; uint8_t *buf; struct thread *td; struct vnode *vp; size_t iosize; int error; buf = NULL; error = 0; td = curthread; vap = &vattr; vp = ap->a_vp; len = *ap->a_len; offset = *ap->a_offset; error = VOP_GETATTR(vp, vap, td->td_ucred); if (error != 0) goto out; fsize = vap->va_size; iosize = vap->va_blocksize; if (iosize == 0) iosize = BLKDEV_IOSIZE; if (iosize > MAXPHYS) iosize = MAXPHYS; buf = malloc(iosize, M_TEMP, M_WAITOK); #ifdef __notyet__ /* * Check if the filesystem sets f_maxfilesize; if not use * VOP_SETATTR to perform the check. */ sfs = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK); error = VFS_STATFS(vp->v_mount, sfs, td); if (error == 0) maxfilesize = sfs->f_maxfilesize; free(sfs, M_STATFS); if (error != 0) goto out; if (maxfilesize) { if (offset > maxfilesize || len > maxfilesize || offset + len > maxfilesize) { error = EFBIG; goto out; } } else #endif if (offset + len > vap->va_size) { /* * Test offset + len against the filesystem's maxfilesize. */ VATTR_NULL(vap); vap->va_size = offset + len; error = VOP_SETATTR(vp, vap, td->td_ucred); if (error != 0) goto out; VATTR_NULL(vap); vap->va_size = fsize; error = VOP_SETATTR(vp, vap, td->td_ucred); if (error != 0) goto out; } for (;;) { /* * Read and write back anything below the nominal file * size. There's currently no way outside the filesystem * to know whether this area is sparse or not. */ cur = iosize; if ((offset % iosize) != 0) cur -= (offset % iosize); if (cur > len) cur = len; if (offset < fsize) { aiov.iov_base = buf; aiov.iov_len = cur; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = offset; auio.uio_resid = cur; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; error = VOP_READ(vp, &auio, 0, td->td_ucred); if (error != 0) break; if (auio.uio_resid > 0) { bzero(buf + cur - auio.uio_resid, auio.uio_resid); } } else { bzero(buf, cur); } aiov.iov_base = buf; aiov.iov_len = cur; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = offset; auio.uio_resid = cur; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; error = VOP_WRITE(vp, &auio, 0, td->td_ucred); if (error != 0) break; len -= cur; offset += cur; if (len == 0) break; if (should_yield()) break; } out: *ap->a_len = len; *ap->a_offset = offset; free(buf, M_TEMP); return (error); } int vop_stdadvise(struct vop_advise_args *ap) { struct vnode *vp; struct bufobj *bo; daddr_t startn, endn; off_t bstart, bend, start, end; int bsize, error; vp = ap->a_vp; switch (ap->a_advice) { case POSIX_FADV_WILLNEED: /* * Do nothing for now. Filesystems should provide a * custom method which starts an asynchronous read of * the requested region. */ error = 0; break; case POSIX_FADV_DONTNEED: error = 0; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (VN_IS_DOOMED(vp)) { VOP_UNLOCK(vp); break; } /* * Round to block boundaries (and later possibly further to * page boundaries). Applications cannot reasonably be aware * of the boundaries, and the rounding must be to expand at * both extremities to cover enough. It still doesn't cover * read-ahead. For partial blocks, this gives unnecessary * discarding of buffers but is efficient enough since the * pages usually remain in VMIO for some time. */ bsize = vp->v_bufobj.bo_bsize; bstart = rounddown(ap->a_start, bsize); bend = roundup(ap->a_end, bsize); /* * Deactivate pages in the specified range from the backing VM * object. Pages that are resident in the buffer cache will * remain wired until their corresponding buffers are released * below. */ if (vp->v_object != NULL) { start = trunc_page(bstart); end = round_page(bend); VM_OBJECT_RLOCK(vp->v_object); vm_object_page_noreuse(vp->v_object, OFF_TO_IDX(start), OFF_TO_IDX(end)); VM_OBJECT_RUNLOCK(vp->v_object); } bo = &vp->v_bufobj; BO_RLOCK(bo); startn = bstart / bsize; endn = bend / bsize; error = bnoreuselist(&bo->bo_clean, bo, startn, endn); if (error == 0) error = bnoreuselist(&bo->bo_dirty, bo, startn, endn); BO_RUNLOCK(bo); VOP_UNLOCK(vp); break; default: error = EINVAL; break; } return (error); } int vop_stdunp_bind(struct vop_unp_bind_args *ap) { ap->a_vp->v_unpcb = ap->a_unpcb; return (0); } int vop_stdunp_connect(struct vop_unp_connect_args *ap) { *ap->a_unpcb = ap->a_vp->v_unpcb; return (0); } int vop_stdunp_detach(struct vop_unp_detach_args *ap) { ap->a_vp->v_unpcb = NULL; return (0); } static int vop_stdis_text(struct vop_is_text_args *ap) { return (ap->a_vp->v_writecount < 0); } int vop_stdset_text(struct vop_set_text_args *ap) { struct vnode *vp; struct mount *mp; int error; vp = ap->a_vp; VI_LOCK(vp); if (vp->v_writecount > 0) { error = ETXTBSY; } else { /* * If requested by fs, keep a use reference to the * vnode until the last text reference is released. */ mp = vp->v_mount; if (mp != NULL && (mp->mnt_kern_flag & MNTK_TEXT_REFS) != 0 && vp->v_writecount == 0) { vp->v_iflag |= VI_TEXT_REF; vrefl(vp); } vp->v_writecount--; error = 0; } VI_UNLOCK(vp); return (error); } static int vop_stdunset_text(struct vop_unset_text_args *ap) { struct vnode *vp; int error; bool last; vp = ap->a_vp; last = false; VI_LOCK(vp); if (vp->v_writecount < 0) { if ((vp->v_iflag & VI_TEXT_REF) != 0 && vp->v_writecount == -1) { last = true; vp->v_iflag &= ~VI_TEXT_REF; } vp->v_writecount++; error = 0; } else { error = EINVAL; } VI_UNLOCK(vp); if (last) vunref(vp); return (error); } static int vop_stdadd_writecount(struct vop_add_writecount_args *ap) { struct vnode *vp; struct mount *mp; int error; vp = ap->a_vp; VI_LOCK_FLAGS(vp, MTX_DUPOK); if (vp->v_writecount < 0) { error = ETXTBSY; } else { VNASSERT(vp->v_writecount + ap->a_inc >= 0, vp, ("neg writecount increment %d", ap->a_inc)); if (vp->v_writecount == 0) { mp = vp->v_mount; if (mp != NULL && (mp->mnt_kern_flag & MNTK_NOMSYNC) == 0) vlazy(vp); } vp->v_writecount += ap->a_inc; error = 0; } VI_UNLOCK(vp); return (error); } int vop_stdneed_inactive(struct vop_need_inactive_args *ap) { return (1); } int vop_stdioctl(struct vop_ioctl_args *ap) { struct vnode *vp; struct vattr va; off_t *offp; int error; switch (ap->a_command) { case FIOSEEKDATA: case FIOSEEKHOLE: vp = ap->a_vp; error = vn_lock(vp, LK_SHARED); if (error != 0) return (EBADF); if (vp->v_type == VREG) error = VOP_GETATTR(vp, &va, ap->a_cred); else error = ENOTTY; if (error == 0) { offp = ap->a_data; if (*offp < 0 || *offp >= va.va_size) error = ENXIO; else if (ap->a_command == FIOSEEKHOLE) *offp = va.va_size; } VOP_UNLOCK(vp); break; default: error = ENOTTY; break; } return (error); } /* * vfs default ops * used to fill the vfs function table to get reasonable default return values. */ int vfs_stdroot (mp, flags, vpp) struct mount *mp; int flags; struct vnode **vpp; { return (EOPNOTSUPP); } int vfs_stdstatfs (mp, sbp) struct mount *mp; struct statfs *sbp; { return (EOPNOTSUPP); } int vfs_stdquotactl (mp, cmds, uid, arg) struct mount *mp; int cmds; uid_t uid; void *arg; { return (EOPNOTSUPP); } int vfs_stdsync(mp, waitfor) struct mount *mp; int waitfor; { struct vnode *vp, *mvp; struct thread *td; int error, lockreq, allerror = 0; td = curthread; lockreq = LK_EXCLUSIVE | LK_INTERLOCK; if (waitfor != MNT_WAIT) lockreq |= LK_NOWAIT; /* * Force stale buffer cache information to be flushed. */ loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { if (vp->v_bufobj.bo_dirty.bv_cnt == 0) { VI_UNLOCK(vp); continue; } - if ((error = vget(vp, lockreq, td)) != 0) { + if ((error = vget(vp, lockreq)) != 0) { if (error == ENOENT) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } continue; } error = VOP_FSYNC(vp, waitfor, td); if (error) allerror = error; vput(vp); } return (allerror); } int vfs_stdnosync (mp, waitfor) struct mount *mp; int waitfor; { return (0); } static int vop_stdcopy_file_range(struct vop_copy_file_range_args *ap) { int error; error = vn_generic_copy_file_range(ap->a_invp, ap->a_inoffp, ap->a_outvp, ap->a_outoffp, ap->a_lenp, ap->a_flags, ap->a_incred, ap->a_outcred, ap->a_fsizetd); return (error); } int vfs_stdvget (mp, ino, flags, vpp) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; { return (EOPNOTSUPP); } int vfs_stdfhtovp (mp, fhp, flags, vpp) struct mount *mp; struct fid *fhp; int flags; struct vnode **vpp; { return (EOPNOTSUPP); } int vfs_stdinit (vfsp) struct vfsconf *vfsp; { return (0); } int vfs_stduninit (vfsp) struct vfsconf *vfsp; { return(0); } int vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname) struct mount *mp; int cmd; struct vnode *filename_vp; int attrnamespace; const char *attrname; { if (filename_vp != NULL) VOP_UNLOCK(filename_vp); return (EOPNOTSUPP); } int vfs_stdsysctl(mp, op, req) struct mount *mp; fsctlop_t op; struct sysctl_req *req; { return (EOPNOTSUPP); } static vop_bypass_t * bp_by_off(struct vop_vector *vop, struct vop_generic_args *a) { return (*(vop_bypass_t **)((char *)vop + a->a_desc->vdesc_vop_offset)); } int vop_sigdefer(struct vop_vector *vop, struct vop_generic_args *a) { vop_bypass_t *bp; int prev_stops, rc; bp = bp_by_off(vop, a); MPASS(bp != NULL); prev_stops = sigdeferstop(SIGDEFERSTOP_SILENT); rc = bp(a); sigallowstop(prev_stops); return (rc); } static int vop_stdstat(struct vop_stat_args *a) { struct vattr vattr; struct vattr *vap; struct vnode *vp; struct stat *sb; int error; u_short mode; vp = a->a_vp; sb = a->a_sb; error = vop_stat_helper_pre(a); if (error != 0) return (error); vap = &vattr; /* * Initialize defaults for new and unusual fields, so that file * systems which don't support these fields don't need to know * about them. */ vap->va_birthtime.tv_sec = -1; vap->va_birthtime.tv_nsec = 0; vap->va_fsid = VNOVAL; vap->va_rdev = NODEV; error = VOP_GETATTR(vp, vap, a->a_active_cred); if (error) goto out; /* * Zero the spare stat fields */ bzero(sb, sizeof *sb); /* * Copy from vattr table */ if (vap->va_fsid != VNOVAL) sb->st_dev = vap->va_fsid; else sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0]; sb->st_ino = vap->va_fileid; mode = vap->va_mode; switch (vap->va_type) { case VREG: mode |= S_IFREG; break; case VDIR: mode |= S_IFDIR; break; case VBLK: mode |= S_IFBLK; break; case VCHR: mode |= S_IFCHR; break; case VLNK: mode |= S_IFLNK; break; case VSOCK: mode |= S_IFSOCK; break; case VFIFO: mode |= S_IFIFO; break; default: error = EBADF; goto out; } sb->st_mode = mode; sb->st_nlink = vap->va_nlink; sb->st_uid = vap->va_uid; sb->st_gid = vap->va_gid; sb->st_rdev = vap->va_rdev; if (vap->va_size > OFF_MAX) { error = EOVERFLOW; goto out; } sb->st_size = vap->va_size; sb->st_atim.tv_sec = vap->va_atime.tv_sec; sb->st_atim.tv_nsec = vap->va_atime.tv_nsec; sb->st_mtim.tv_sec = vap->va_mtime.tv_sec; sb->st_mtim.tv_nsec = vap->va_mtime.tv_nsec; sb->st_ctim.tv_sec = vap->va_ctime.tv_sec; sb->st_ctim.tv_nsec = vap->va_ctime.tv_nsec; sb->st_birthtim.tv_sec = vap->va_birthtime.tv_sec; sb->st_birthtim.tv_nsec = vap->va_birthtime.tv_nsec; /* * According to www.opengroup.org, the meaning of st_blksize is * "a filesystem-specific preferred I/O block size for this * object. In some filesystem types, this may vary from file * to file" * Use minimum/default of PAGE_SIZE (e.g. for VCHR). */ sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize); sb->st_flags = vap->va_flags; sb->st_blocks = vap->va_bytes / S_BLKSIZE; sb->st_gen = vap->va_gen; out: return (vop_stat_helper_post(a, error)); } Index: projects/clang1100-import/sys/kern/vfs_lookup.c =================================================================== --- projects/clang1100-import/sys/kern/vfs_lookup.c (revision 364278) +++ projects/clang1100-import/sys/kern/vfs_lookup.c (revision 364279) @@ -1,1548 +1,1547 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_lookup.c 8.4 (Berkeley) 2/16/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #ifdef INVARIANTS #include #endif #include #include #include #define NAMEI_DIAGNOSTIC 1 #undef NAMEI_DIAGNOSTIC SDT_PROVIDER_DEFINE(vfs); SDT_PROBE_DEFINE4(vfs, namei, lookup, entry, "struct vnode *", "char *", "unsigned long", "bool"); SDT_PROBE_DEFINE3(vfs, namei, lookup, return, "int", "struct vnode *", "bool"); /* Allocation zone for namei. */ uma_zone_t namei_zone; /* Placeholder vnode for mp traversal. */ static struct vnode *vp_crossmp; static int crossmp_vop_islocked(struct vop_islocked_args *ap) { return (LK_SHARED); } static int crossmp_vop_lock1(struct vop_lock1_args *ap) { struct vnode *vp; struct lock *lk __unused; const char *file __unused; int flags, line __unused; vp = ap->a_vp; lk = vp->v_vnlock; flags = ap->a_flags; file = ap->a_file; line = ap->a_line; if ((flags & LK_SHARED) == 0) panic("invalid lock request for crossmp"); WITNESS_CHECKORDER(&lk->lock_object, LOP_NEWORDER, file, line, flags & LK_INTERLOCK ? &VI_MTX(vp)->lock_object : NULL); WITNESS_LOCK(&lk->lock_object, 0, file, line); if ((flags & LK_INTERLOCK) != 0) VI_UNLOCK(vp); LOCK_LOG_LOCK("SLOCK", &lk->lock_object, 0, 0, ap->a_file, line); return (0); } static int crossmp_vop_unlock(struct vop_unlock_args *ap) { struct vnode *vp; struct lock *lk __unused; vp = ap->a_vp; lk = vp->v_vnlock; WITNESS_UNLOCK(&lk->lock_object, 0, LOCK_FILE, LOCK_LINE); LOCK_LOG_LOCK("SUNLOCK", &lk->lock_object, 0, 0, LOCK_FILE, LOCK_LINE); return (0); } static struct vop_vector crossmp_vnodeops = { .vop_default = &default_vnodeops, .vop_islocked = crossmp_vop_islocked, .vop_lock1 = crossmp_vop_lock1, .vop_unlock = crossmp_vop_unlock, }; /* * VFS_VOP_VECTOR_REGISTER(crossmp_vnodeops) is not used here since the vnode * gets allocated early. See nameiinit for the direct call below. */ struct nameicap_tracker { struct vnode *dp; TAILQ_ENTRY(nameicap_tracker) nm_link; }; /* Zone for cap mode tracker elements used for dotdot capability checks. */ static uma_zone_t nt_zone; static void nameiinit(void *dummy __unused) { namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); nt_zone = uma_zcreate("rentr", sizeof(struct nameicap_tracker), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); vfs_vector_op_register(&crossmp_vnodeops); getnewvnode("crossmp", NULL, &crossmp_vnodeops, &vp_crossmp); } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL); static int lookup_cap_dotdot = 1; SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot, CTLFLAG_RWTUN, &lookup_cap_dotdot, 0, "enables \"..\" components in path lookup in capability mode"); static int lookup_cap_dotdot_nonlocal = 1; SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot_nonlocal, CTLFLAG_RWTUN, &lookup_cap_dotdot_nonlocal, 0, "enables \"..\" components in path lookup in capability mode " "on non-local mount"); static void nameicap_tracker_add(struct nameidata *ndp, struct vnode *dp) { struct nameicap_tracker *nt; if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0 || dp->v_type != VDIR) return; if ((ndp->ni_lcf & (NI_LCF_BENEATH_ABS | NI_LCF_BENEATH_LATCHED)) == NI_LCF_BENEATH_ABS) { MPASS((ndp->ni_lcf & NI_LCF_LATCH) != 0); if (dp != ndp->ni_beneath_latch) return; ndp->ni_lcf |= NI_LCF_BENEATH_LATCHED; } nt = uma_zalloc(nt_zone, M_WAITOK); vhold(dp); nt->dp = dp; TAILQ_INSERT_TAIL(&ndp->ni_cap_tracker, nt, nm_link); } static void nameicap_cleanup(struct nameidata *ndp, bool clean_latch) { struct nameicap_tracker *nt, *nt1; KASSERT(TAILQ_EMPTY(&ndp->ni_cap_tracker) || (ndp->ni_lcf & NI_LCF_CAP_DOTDOT) != 0, ("not strictrelative")); TAILQ_FOREACH_SAFE(nt, &ndp->ni_cap_tracker, nm_link, nt1) { TAILQ_REMOVE(&ndp->ni_cap_tracker, nt, nm_link); vdrop(nt->dp); uma_zfree(nt_zone, nt); } if (clean_latch && (ndp->ni_lcf & NI_LCF_LATCH) != 0) { ndp->ni_lcf &= ~NI_LCF_LATCH; vrele(ndp->ni_beneath_latch); } } /* * For dotdot lookups in capability mode, only allow the component * lookup to succeed if the resulting directory was already traversed * during the operation. Also fail dotdot lookups for non-local * filesystems, where external agents might assist local lookups to * escape the compartment. */ static int nameicap_check_dotdot(struct nameidata *ndp, struct vnode *dp) { struct nameicap_tracker *nt; struct mount *mp; if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0 || dp == NULL || dp->v_type != VDIR) return (0); mp = dp->v_mount; if (lookup_cap_dotdot_nonlocal == 0 && mp != NULL && (mp->mnt_flag & MNT_LOCAL) == 0) return (ENOTCAPABLE); TAILQ_FOREACH_REVERSE(nt, &ndp->ni_cap_tracker, nameicap_tracker_head, nm_link) { if (dp == nt->dp) return (0); } if ((ndp->ni_lcf & NI_LCF_BENEATH_ABS) != 0) { ndp->ni_lcf &= ~NI_LCF_BENEATH_LATCHED; nameicap_cleanup(ndp, false); return (0); } return (ENOTCAPABLE); } static void namei_cleanup_cnp(struct componentname *cnp) { uma_zfree(namei_zone, cnp->cn_pnbuf); #ifdef DIAGNOSTIC cnp->cn_pnbuf = NULL; cnp->cn_nameptr = NULL; #endif } static int namei_handle_root(struct nameidata *ndp, struct vnode **dpp) { struct componentname *cnp; cnp = &ndp->ni_cnd; if ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) != 0) { #ifdef KTRACE if (KTRPOINT(curthread, KTR_CAPFAIL)) ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); #endif return (ENOTCAPABLE); } if ((cnp->cn_flags & BENEATH) != 0) { ndp->ni_lcf |= NI_LCF_BENEATH_ABS; ndp->ni_lcf &= ~NI_LCF_BENEATH_LATCHED; nameicap_cleanup(ndp, false); } while (*(cnp->cn_nameptr) == '/') { cnp->cn_nameptr++; ndp->ni_pathlen--; } *dpp = ndp->ni_rootdir; vrefact(*dpp); return (0); } static int namei_setup(struct nameidata *ndp, struct vnode **dpp, struct pwd **pwdp) { struct componentname *cnp; struct file *dfp; struct thread *td; struct pwd *pwd; cap_rights_t rights; struct filecaps dirfd_caps; int error; bool startdir_used; cnp = &ndp->ni_cnd; td = cnp->cn_thread; startdir_used = false; *pwdp = NULL; *dpp = NULL; #ifdef CAPABILITY_MODE /* * In capability mode, lookups must be restricted to happen in * the subtree with the root specified by the file descriptor: * - The root must be real file descriptor, not the pseudo-descriptor * AT_FDCWD. * - The passed path must be relative and not absolute. * - If lookup_cap_dotdot is disabled, path must not contain the * '..' components. * - If lookup_cap_dotdot is enabled, we verify that all '..' * components lookups result in the directories which were * previously walked by us, which prevents an escape from * the relative root. */ if (IN_CAPABILITY_MODE(td) && (cnp->cn_flags & NOCAPCHECK) == 0) { ndp->ni_lcf |= NI_LCF_STRICTRELATIVE; if (ndp->ni_dirfd == AT_FDCWD) { #ifdef KTRACE if (KTRPOINT(td, KTR_CAPFAIL)) ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); #endif return (ECAPMODE); } } #endif error = 0; /* * Get starting point for the translation. */ pwd = pwd_hold(td); /* * The reference on ni_rootdir is acquired in the block below to avoid * back-to-back atomics for absolute lookups. */ ndp->ni_rootdir = pwd->pwd_rdir; ndp->ni_topdir = pwd->pwd_jdir; if (cnp->cn_pnbuf[0] == '/') { ndp->ni_resflags |= NIRES_ABS; error = namei_handle_root(ndp, dpp); } else { if (ndp->ni_startdir != NULL) { *dpp = ndp->ni_startdir; startdir_used = true; } else if (ndp->ni_dirfd == AT_FDCWD) { *dpp = pwd->pwd_cdir; vrefact(*dpp); } else { rights = *ndp->ni_rightsneeded; cap_rights_set_one(&rights, CAP_LOOKUP); if (cnp->cn_flags & AUDITVNODE1) AUDIT_ARG_ATFD1(ndp->ni_dirfd); if (cnp->cn_flags & AUDITVNODE2) AUDIT_ARG_ATFD2(ndp->ni_dirfd); /* * Effectively inlined fgetvp_rights, because we need to * inspect the file as well as grabbing the vnode. */ error = fget_cap(td, ndp->ni_dirfd, &rights, &dfp, &ndp->ni_filecaps); if (error != 0) { /* * Preserve the error; it should either be EBADF * or capability-related, both of which can be * safely returned to the caller. */ } else { if (dfp->f_ops == &badfileops) { error = EBADF; } else if (dfp->f_vnode == NULL) { error = ENOTDIR; } else { *dpp = dfp->f_vnode; vrefact(*dpp); if ((dfp->f_flag & FSEARCH) != 0) cnp->cn_flags |= NOEXECCHECK; } fdrop(dfp, td); } #ifdef CAPABILITIES /* * If file descriptor doesn't have all rights, * all lookups relative to it must also be * strictly relative. */ CAP_ALL(&rights); if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights, &rights) || ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL || ndp->ni_filecaps.fc_nioctls != -1) { ndp->ni_lcf |= NI_LCF_STRICTRELATIVE; } #endif } if (error == 0 && (*dpp)->v_type != VDIR) error = ENOTDIR; } if (error == 0 && (cnp->cn_flags & BENEATH) != 0) { if (ndp->ni_dirfd == AT_FDCWD) { ndp->ni_beneath_latch = pwd->pwd_cdir; vrefact(ndp->ni_beneath_latch); } else { rights = *ndp->ni_rightsneeded; cap_rights_set_one(&rights, CAP_LOOKUP); error = fgetvp_rights(td, ndp->ni_dirfd, &rights, &dirfd_caps, &ndp->ni_beneath_latch); if (error == 0 && (*dpp)->v_type != VDIR) { vrele(ndp->ni_beneath_latch); error = ENOTDIR; } } if (error == 0) ndp->ni_lcf |= NI_LCF_LATCH; } /* * If we are auditing the kernel pathname, save the user pathname. */ if (cnp->cn_flags & AUDITVNODE1) AUDIT_ARG_UPATH1_VP(td, ndp->ni_rootdir, *dpp, cnp->cn_pnbuf); if (cnp->cn_flags & AUDITVNODE2) AUDIT_ARG_UPATH2_VP(td, ndp->ni_rootdir, *dpp, cnp->cn_pnbuf); if (ndp->ni_startdir != NULL && !startdir_used) vrele(ndp->ni_startdir); if (error != 0) { if (*dpp != NULL) vrele(*dpp); pwd_drop(pwd); return (error); } MPASS((ndp->ni_lcf & (NI_LCF_BENEATH_ABS | NI_LCF_LATCH)) != NI_LCF_BENEATH_ABS); if (((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) != 0 && lookup_cap_dotdot != 0) || ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) == 0 && (cnp->cn_flags & BENEATH) != 0)) ndp->ni_lcf |= NI_LCF_CAP_DOTDOT; SDT_PROBE4(vfs, namei, lookup, entry, *dpp, cnp->cn_pnbuf, cnp->cn_flags, false); *pwdp = pwd; return (0); } /* * Convert a pathname into a pointer to a locked vnode. * * The FOLLOW flag is set when symbolic links are to be followed * when they occur at the end of the name translation process. * Symbolic links are always followed for all other pathname * components other than the last. * * The segflg defines whether the name is to be copied from user * space or kernel space. * * Overall outline of namei: * * copy in name * get starting directory * while (!done && !error) { * call lookup to search path. * if symbolic link, massage name in buffer and continue * } */ int namei(struct nameidata *ndp) { char *cp; /* pointer into pathname argument */ struct vnode *dp; /* the directory we are searching */ struct iovec aiov; /* uio for reading symbolic links */ struct componentname *cnp; struct thread *td; struct pwd *pwd; struct uio auio; int error, linklen; enum cache_fpl_status status; cnp = &ndp->ni_cnd; td = cnp->cn_thread; ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred; KASSERT(cnp->cn_cred && td->td_proc, ("namei: bad cred/proc")); KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0, ("namei: nameiop contaminated with flags")); KASSERT((cnp->cn_flags & OPMASK) == 0, ("namei: flags contaminated with nameiops")); KASSERT((cnp->cn_flags & NAMEI_INTERNAL_FLAGS) == 0, ("namei: unexpected flags: %" PRIx64 "\n", cnp->cn_flags & NAMEI_INTERNAL_FLAGS)); if (cnp->cn_flags & NOCACHE) KASSERT(cnp->cn_nameiop != LOOKUP, ("%s: NOCACHE passed with LOOKUP", __func__)); MPASS(ndp->ni_startdir == NULL || ndp->ni_startdir->v_type == VDIR || ndp->ni_startdir->v_type == VBAD); ndp->ni_lcf = 0; ndp->ni_vp = NULL; /* * Get a buffer for the name to be translated, and copy the * name into the buffer. */ - if ((cnp->cn_flags & HASBUF) == 0) - cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK); + cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK); if (ndp->ni_segflg == UIO_SYSSPACE) error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN, &ndp->ni_pathlen); else error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN, &ndp->ni_pathlen); if (__predict_false(error != 0)) { namei_cleanup_cnp(cnp); return (error); } /* * Don't allow empty pathnames. */ if (__predict_false(*cnp->cn_pnbuf == '\0')) { namei_cleanup_cnp(cnp); return (ENOENT); } #ifdef KTRACE if (KTRPOINT(td, KTR_NAMEI)) { KASSERT(cnp->cn_thread == curthread, ("namei not using curthread")); ktrnamei(cnp->cn_pnbuf); } #endif cnp->cn_nameptr = cnp->cn_pnbuf; /* * First try looking up the target without locking any vnodes. * * We may need to start from scratch or pick up where it left off. */ error = cache_fplookup(ndp, &status, &pwd); switch (status) { case CACHE_FPL_STATUS_UNSET: __assert_unreachable(); break; case CACHE_FPL_STATUS_HANDLED: return (error); case CACHE_FPL_STATUS_PARTIAL: TAILQ_INIT(&ndp->ni_cap_tracker); dp = ndp->ni_startdir; break; case CACHE_FPL_STATUS_ABORTED: TAILQ_INIT(&ndp->ni_cap_tracker); error = namei_setup(ndp, &dp, &pwd); if (error != 0) { namei_cleanup_cnp(cnp); return (error); } break; } ndp->ni_loopcnt = 0; /* * Locked lookup. */ for (;;) { ndp->ni_startdir = dp; error = lookup(ndp); if (error != 0) goto out; /* * If not a symbolic link, we're done. */ if ((cnp->cn_flags & ISSYMLINK) == 0) { if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) { namei_cleanup_cnp(cnp); } else cnp->cn_flags |= HASBUF; if ((ndp->ni_lcf & (NI_LCF_BENEATH_ABS | NI_LCF_BENEATH_LATCHED)) == NI_LCF_BENEATH_ABS) { NDFREE(ndp, 0); error = ENOTCAPABLE; } nameicap_cleanup(ndp, true); SDT_PROBE3(vfs, namei, lookup, return, error, (error == 0 ? ndp->ni_vp : NULL), false); pwd_drop(pwd); return (error); } if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { error = ELOOP; break; } #ifdef MAC if ((cnp->cn_flags & NOMACCHECK) == 0) { error = mac_vnode_check_readlink(td->td_ucred, ndp->ni_vp); if (error != 0) break; } #endif if (ndp->ni_pathlen > 1) cp = uma_zalloc(namei_zone, M_WAITOK); else cp = cnp->cn_pnbuf; aiov.iov_base = cp; aiov.iov_len = MAXPATHLEN; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_resid = MAXPATHLEN; error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred); if (error != 0) { if (ndp->ni_pathlen > 1) uma_zfree(namei_zone, cp); break; } linklen = MAXPATHLEN - auio.uio_resid; if (linklen == 0) { if (ndp->ni_pathlen > 1) uma_zfree(namei_zone, cp); error = ENOENT; break; } if (linklen + ndp->ni_pathlen > MAXPATHLEN) { if (ndp->ni_pathlen > 1) uma_zfree(namei_zone, cp); error = ENAMETOOLONG; break; } if (ndp->ni_pathlen > 1) { bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); uma_zfree(namei_zone, cnp->cn_pnbuf); cnp->cn_pnbuf = cp; } else cnp->cn_pnbuf[linklen] = '\0'; ndp->ni_pathlen += linklen; vput(ndp->ni_vp); dp = ndp->ni_dvp; /* * Check if root directory should replace current directory. */ cnp->cn_nameptr = cnp->cn_pnbuf; if (*(cnp->cn_nameptr) == '/') { vrele(dp); error = namei_handle_root(ndp, &dp); if (error != 0) goto out; } } vput(ndp->ni_vp); ndp->ni_vp = NULL; vrele(ndp->ni_dvp); out: MPASS(error != 0); namei_cleanup_cnp(cnp); nameicap_cleanup(ndp, true); SDT_PROBE3(vfs, namei, lookup, return, error, NULL, false); pwd_drop(pwd); return (error); } static int compute_cn_lkflags(struct mount *mp, int lkflags, int cnflags) { if (mp == NULL || ((lkflags & LK_SHARED) && (!(mp->mnt_kern_flag & MNTK_LOOKUP_SHARED) || ((cnflags & ISDOTDOT) && (mp->mnt_kern_flag & MNTK_LOOKUP_EXCL_DOTDOT))))) { lkflags &= ~LK_SHARED; lkflags |= LK_EXCLUSIVE; } lkflags |= LK_NODDLKTREAT; return (lkflags); } static __inline int needs_exclusive_leaf(struct mount *mp, int flags) { /* * Intermediate nodes can use shared locks, we only need to * force an exclusive lock for leaf nodes. */ if ((flags & (ISLASTCN | LOCKLEAF)) != (ISLASTCN | LOCKLEAF)) return (0); /* Always use exclusive locks if LOCKSHARED isn't set. */ if (!(flags & LOCKSHARED)) return (1); /* * For lookups during open(), if the mount point supports * extended shared operations, then use a shared lock for the * leaf node, otherwise use an exclusive lock. */ if ((flags & ISOPEN) != 0) return (!MNT_EXTENDED_SHARED(mp)); /* * Lookup requests outside of open() that specify LOCKSHARED * only need a shared lock on the leaf vnode. */ return (0); } /* * Search a pathname. * This is a very central and rather complicated routine. * * The pathname is pointed to by ni_ptr and is of length ni_pathlen. * The starting directory is taken from ni_startdir. The pathname is * descended until done, or a symbolic link is encountered. The variable * ni_more is clear if the path is completed; it is set to one if a * symbolic link needing interpretation is encountered. * * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on * whether the name is to be looked up, created, renamed, or deleted. * When CREATE, RENAME, or DELETE is specified, information usable in * creating, renaming, or deleting a directory entry may be calculated. * If flag has LOCKPARENT or'ed into it, the parent directory is returned * locked. If flag has WANTPARENT or'ed into it, the parent directory is * returned unlocked. Otherwise the parent directory is not returned. If * the target of the pathname exists and LOCKLEAF is or'ed into the flag * the target is returned locked, otherwise it is returned unlocked. * When creating or renaming and LOCKPARENT is specified, the target may not * be ".". When deleting and LOCKPARENT is specified, the target may be ".". * * Overall outline of lookup: * * dirloop: * identify next component of name at ndp->ni_ptr * handle degenerate case where name is null string * if .. and crossing mount points and on mounted filesys, find parent * call VOP_LOOKUP routine for next component name * directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set * component vnode returned in ni_vp (if it exists), locked. * if result vnode is mounted on and crossing mount points, * find mounted on vnode * if more components of name, do next level at dirloop * return the answer in ni_vp, locked if LOCKLEAF set * if LOCKPARENT set, return locked parent in ni_dvp * if WANTPARENT set, return unlocked parent in ni_dvp */ int lookup(struct nameidata *ndp) { char *cp; /* pointer into pathname argument */ char *prev_ni_next; /* saved ndp->ni_next */ struct vnode *dp = NULL; /* the directory we are searching */ struct vnode *tdp; /* saved dp */ struct mount *mp; /* mount table entry */ struct prison *pr; size_t prev_ni_pathlen; /* saved ndp->ni_pathlen */ int docache; /* == 0 do not cache last component */ int wantparent; /* 1 => wantparent or lockparent flag */ int rdonly; /* lookup read-only flag bit */ int error = 0; int dpunlocked = 0; /* dp has already been unlocked */ int relookup = 0; /* do not consume the path component */ struct componentname *cnp = &ndp->ni_cnd; int lkflags_save; int ni_dvp_unlocked; /* * Setup: break out flag bits into variables. */ ni_dvp_unlocked = 0; wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT); KASSERT(cnp->cn_nameiop == LOOKUP || wantparent, ("CREATE, DELETE, RENAME require LOCKPARENT or WANTPARENT.")); docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; if (cnp->cn_nameiop == DELETE || (wantparent && cnp->cn_nameiop != CREATE && cnp->cn_nameiop != LOOKUP)) docache = 0; rdonly = cnp->cn_flags & RDONLY; cnp->cn_flags &= ~ISSYMLINK; ndp->ni_dvp = NULL; /* * We use shared locks until we hit the parent of the last cn then * we adjust based on the requesting flags. */ cnp->cn_lkflags = LK_SHARED; dp = ndp->ni_startdir; ndp->ni_startdir = NULLVP; vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY, cnp->cn_flags)); dirloop: /* * Search a new directory. * * The last component of the filename is left accessible via * cnp->cn_nameptr for callers that need the name. Callers needing * the name set the SAVENAME flag. When done, they assume * responsibility for freeing the pathname buffer. */ for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) continue; cnp->cn_namelen = cp - cnp->cn_nameptr; if (cnp->cn_namelen > NAME_MAX) { error = ENAMETOOLONG; goto bad; } #ifdef NAMEI_DIAGNOSTIC { char c = *cp; *cp = '\0'; printf("{%s}: ", cnp->cn_nameptr); *cp = c; } #endif prev_ni_pathlen = ndp->ni_pathlen; ndp->ni_pathlen -= cnp->cn_namelen; KASSERT(ndp->ni_pathlen <= PATH_MAX, ("%s: ni_pathlen underflow to %zd\n", __func__, ndp->ni_pathlen)); prev_ni_next = ndp->ni_next; ndp->ni_next = cp; /* * Replace multiple slashes by a single slash and trailing slashes * by a null. This must be done before VOP_LOOKUP() because some * fs's don't know about trailing slashes. Remember if there were * trailing slashes to handle symlinks, existing non-directories * and non-existing files that won't be directories specially later. */ while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { cp++; ndp->ni_pathlen--; if (*cp == '\0') { *ndp->ni_next = '\0'; cnp->cn_flags |= TRAILINGSLASH; } } ndp->ni_next = cp; cnp->cn_flags |= MAKEENTRY; if (*cp == '\0' && docache == 0) cnp->cn_flags &= ~MAKEENTRY; if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') cnp->cn_flags |= ISDOTDOT; else cnp->cn_flags &= ~ISDOTDOT; if (*ndp->ni_next == 0) cnp->cn_flags |= ISLASTCN; else cnp->cn_flags &= ~ISLASTCN; if ((cnp->cn_flags & ISLASTCN) != 0 && cnp->cn_namelen == 1 && cnp->cn_nameptr[0] == '.' && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = EINVAL; goto bad; } nameicap_tracker_add(ndp, dp); /* * Check for degenerate name (e.g. / or "") * which is a way of talking about a directory, * e.g. like "/." or ".". */ if (cnp->cn_nameptr[0] == '\0') { if (dp->v_type != VDIR) { error = ENOTDIR; goto bad; } if (cnp->cn_nameiop != LOOKUP) { error = EISDIR; goto bad; } if (wantparent) { ndp->ni_dvp = dp; VREF(dp); } ndp->ni_vp = dp; if (cnp->cn_flags & AUDITVNODE1) AUDIT_ARG_VNODE1(dp); else if (cnp->cn_flags & AUDITVNODE2) AUDIT_ARG_VNODE2(dp); if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF))) VOP_UNLOCK(dp); /* XXX This should probably move to the top of function. */ if (cnp->cn_flags & SAVESTART) panic("lookup: SAVESTART"); goto success; } /* * Handle "..": five special cases. * 0. If doing a capability lookup and lookup_cap_dotdot is * disabled, return ENOTCAPABLE. * 1. Return an error if this is the last component of * the name and the operation is DELETE or RENAME. * 2. If at root directory (e.g. after chroot) * or at absolute root directory * then ignore it so can't get out. * 3. If this vnode is the root of a mounted * filesystem, then replace it with the * vnode which was mounted on so we take the * .. in the other filesystem. * 4. If the vnode is the top directory of * the jail or chroot, don't let them out. * 5. If doing a capability lookup and lookup_cap_dotdot is * enabled, return ENOTCAPABLE if the lookup would escape * from the initial file descriptor directory. Checks are * done by ensuring that namei() already traversed the * result of dotdot lookup. */ if (cnp->cn_flags & ISDOTDOT) { if ((ndp->ni_lcf & (NI_LCF_STRICTRELATIVE | NI_LCF_CAP_DOTDOT)) == NI_LCF_STRICTRELATIVE) { #ifdef KTRACE if (KTRPOINT(curthread, KTR_CAPFAIL)) ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); #endif error = ENOTCAPABLE; goto bad; } if ((cnp->cn_flags & ISLASTCN) != 0 && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = EINVAL; goto bad; } for (;;) { for (pr = cnp->cn_cred->cr_prison; pr != NULL; pr = pr->pr_parent) if (dp == pr->pr_root) break; if (dp == ndp->ni_rootdir || dp == ndp->ni_topdir || dp == rootvnode || pr != NULL || ((dp->v_vflag & VV_ROOT) != 0 && (cnp->cn_flags & NOCROSSMOUNT) != 0)) { ndp->ni_dvp = dp; ndp->ni_vp = dp; VREF(dp); goto nextname; } if ((dp->v_vflag & VV_ROOT) == 0) break; if (VN_IS_DOOMED(dp)) { /* forced unmount */ error = ENOENT; goto bad; } tdp = dp; dp = dp->v_mount->mnt_vnodecovered; VREF(dp); vput(tdp); vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY, ISDOTDOT)); error = nameicap_check_dotdot(ndp, dp); if (error != 0) { #ifdef KTRACE if (KTRPOINT(curthread, KTR_CAPFAIL)) ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); #endif goto bad; } } } /* * We now have a segment name to search for, and a directory to search. */ unionlookup: #ifdef MAC error = mac_vnode_check_lookup(cnp->cn_thread->td_ucred, dp, cnp); if (error) goto bad; #endif ndp->ni_dvp = dp; ndp->ni_vp = NULL; ASSERT_VOP_LOCKED(dp, "lookup"); /* * If we have a shared lock we may need to upgrade the lock for the * last operation. */ if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN) && dp != vp_crossmp && VOP_ISLOCKED(dp) == LK_SHARED) vn_lock(dp, LK_UPGRADE|LK_RETRY); if (VN_IS_DOOMED(dp)) { error = ENOENT; goto bad; } /* * If we're looking up the last component and we need an exclusive * lock, adjust our lkflags. */ if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags)) cnp->cn_lkflags = LK_EXCLUSIVE; #ifdef NAMEI_DIAGNOSTIC vn_printf(dp, "lookup in "); #endif lkflags_save = cnp->cn_lkflags; cnp->cn_lkflags = compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags, cnp->cn_flags); error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp); cnp->cn_lkflags = lkflags_save; if (error != 0) { KASSERT(ndp->ni_vp == NULL, ("leaf should be empty")); #ifdef NAMEI_DIAGNOSTIC printf("not found\n"); #endif if ((error == ENOENT) && (dp->v_vflag & VV_ROOT) && (dp->v_mount != NULL) && (dp->v_mount->mnt_flag & MNT_UNION)) { tdp = dp; dp = dp->v_mount->mnt_vnodecovered; VREF(dp); vput(tdp); vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY, cnp->cn_flags)); nameicap_tracker_add(ndp, dp); goto unionlookup; } if (error == ERELOOKUP) { vref(dp); ndp->ni_vp = dp; error = 0; relookup = 1; goto good; } if (error != EJUSTRETURN) goto bad; /* * At this point, we know we're at the end of the * pathname. If creating / renaming, we can consider * allowing the file or directory to be created / renamed, * provided we're not on a read-only filesystem. */ if (rdonly) { error = EROFS; goto bad; } /* trailing slash only allowed for directories */ if ((cnp->cn_flags & TRAILINGSLASH) && !(cnp->cn_flags & WILLBEDIR)) { error = ENOENT; goto bad; } if ((cnp->cn_flags & LOCKPARENT) == 0) VOP_UNLOCK(dp); /* * We return with ni_vp NULL to indicate that the entry * doesn't currently exist, leaving a pointer to the * (possibly locked) directory vnode in ndp->ni_dvp. */ if (cnp->cn_flags & SAVESTART) { ndp->ni_startdir = ndp->ni_dvp; VREF(ndp->ni_startdir); } goto success; } good: #ifdef NAMEI_DIAGNOSTIC printf("found\n"); #endif dp = ndp->ni_vp; /* * Check to see if the vnode has been mounted on; * if so find the root of the mounted filesystem. */ while (dp->v_type == VDIR && (mp = dp->v_mountedhere) && (cnp->cn_flags & NOCROSSMOUNT) == 0) { if (vfs_busy(mp, 0)) continue; vput(dp); if (dp != ndp->ni_dvp) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); vrefact(vp_crossmp); ndp->ni_dvp = vp_crossmp; error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags, cnp->cn_flags), &tdp); vfs_unbusy(mp); if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT)) panic("vp_crossmp exclusively locked or reclaimed"); if (error) { dpunlocked = 1; goto bad2; } ndp->ni_vp = dp = tdp; } /* * Check for symbolic link */ if ((dp->v_type == VLNK) && ((cnp->cn_flags & FOLLOW) || (cnp->cn_flags & TRAILINGSLASH) || *ndp->ni_next == '/')) { cnp->cn_flags |= ISSYMLINK; if (VN_IS_DOOMED(dp)) { /* * We can't know whether the directory was mounted with * NOSYMFOLLOW, so we can't follow safely. */ error = ENOENT; goto bad2; } if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) { error = EACCES; goto bad2; } /* * Symlink code always expects an unlocked dvp. */ if (ndp->ni_dvp != ndp->ni_vp) { VOP_UNLOCK(ndp->ni_dvp); ni_dvp_unlocked = 1; } goto success; } nextname: /* * Not a symbolic link that we will follow. Continue with the * next component if there is any; otherwise, we're done. */ KASSERT((cnp->cn_flags & ISLASTCN) || *ndp->ni_next == '/', ("lookup: invalid path state.")); if (relookup) { relookup = 0; ndp->ni_pathlen = prev_ni_pathlen; ndp->ni_next = prev_ni_next; if (ndp->ni_dvp != dp) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); goto dirloop; } if (cnp->cn_flags & ISDOTDOT) { error = nameicap_check_dotdot(ndp, ndp->ni_vp); if (error != 0) { #ifdef KTRACE if (KTRPOINT(curthread, KTR_CAPFAIL)) ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL); #endif goto bad2; } } if (*ndp->ni_next == '/') { cnp->cn_nameptr = ndp->ni_next; while (*cnp->cn_nameptr == '/') { cnp->cn_nameptr++; ndp->ni_pathlen--; } if (ndp->ni_dvp != dp) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); goto dirloop; } /* * If we're processing a path with a trailing slash, * check that the end result is a directory. */ if ((cnp->cn_flags & TRAILINGSLASH) && dp->v_type != VDIR) { error = ENOTDIR; goto bad2; } /* * Disallow directory write attempts on read-only filesystems. */ if (rdonly && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = EROFS; goto bad2; } if (cnp->cn_flags & SAVESTART) { ndp->ni_startdir = ndp->ni_dvp; VREF(ndp->ni_startdir); } if (!wantparent) { ni_dvp_unlocked = 2; if (ndp->ni_dvp != dp) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); } else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp) { VOP_UNLOCK(ndp->ni_dvp); ni_dvp_unlocked = 1; } if (cnp->cn_flags & AUDITVNODE1) AUDIT_ARG_VNODE1(dp); else if (cnp->cn_flags & AUDITVNODE2) AUDIT_ARG_VNODE2(dp); if ((cnp->cn_flags & LOCKLEAF) == 0) VOP_UNLOCK(dp); success: /* * Because of shared lookup we may have the vnode shared locked, but * the caller may want it to be exclusively locked. */ if (needs_exclusive_leaf(dp->v_mount, cnp->cn_flags) && VOP_ISLOCKED(dp) != LK_EXCLUSIVE) { vn_lock(dp, LK_UPGRADE | LK_RETRY); if (VN_IS_DOOMED(dp)) { error = ENOENT; goto bad2; } } return (0); bad2: if (ni_dvp_unlocked != 2) { if (dp != ndp->ni_dvp && !ni_dvp_unlocked) vput(ndp->ni_dvp); else vrele(ndp->ni_dvp); } bad: if (!dpunlocked) vput(dp); ndp->ni_vp = NULL; return (error); } /* * relookup - lookup a path name component * Used by lookup to re-acquire things. */ int relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp) { struct vnode *dp = NULL; /* the directory we are searching */ int wantparent; /* 1 => wantparent or lockparent flag */ int rdonly; /* lookup read-only flag bit */ int error = 0; KASSERT(cnp->cn_flags & ISLASTCN, ("relookup: Not given last component.")); /* * Setup: break out flag bits into variables. */ wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT); KASSERT(wantparent, ("relookup: parent not wanted.")); rdonly = cnp->cn_flags & RDONLY; cnp->cn_flags &= ~ISSYMLINK; dp = dvp; cnp->cn_lkflags = LK_EXCLUSIVE; vn_lock(dp, LK_EXCLUSIVE | LK_RETRY); /* * Search a new directory. * * The last component of the filename is left accessible via * cnp->cn_nameptr for callers that need the name. Callers needing * the name set the SAVENAME flag. When done, they assume * responsibility for freeing the pathname buffer. */ #ifdef NAMEI_DIAGNOSTIC printf("{%s}: ", cnp->cn_nameptr); #endif /* * Check for "" which represents the root directory after slash * removal. */ if (cnp->cn_nameptr[0] == '\0') { /* * Support only LOOKUP for "/" because lookup() * can't succeed for CREATE, DELETE and RENAME. */ KASSERT(cnp->cn_nameiop == LOOKUP, ("nameiop must be LOOKUP")); KASSERT(dp->v_type == VDIR, ("dp is not a directory")); if (!(cnp->cn_flags & LOCKLEAF)) VOP_UNLOCK(dp); *vpp = dp; /* XXX This should probably move to the top of function. */ if (cnp->cn_flags & SAVESTART) panic("lookup: SAVESTART"); return (0); } if (cnp->cn_flags & ISDOTDOT) panic ("relookup: lookup on dot-dot"); /* * We now have a segment name to search for, and a directory to search. */ #ifdef NAMEI_DIAGNOSTIC vn_printf(dp, "search in "); #endif if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) { KASSERT(*vpp == NULL, ("leaf should be empty")); if (error != EJUSTRETURN) goto bad; /* * If creating and at end of pathname, then can consider * allowing file to be created. */ if (rdonly) { error = EROFS; goto bad; } /* ASSERT(dvp == ndp->ni_startdir) */ if (cnp->cn_flags & SAVESTART) VREF(dvp); if ((cnp->cn_flags & LOCKPARENT) == 0) VOP_UNLOCK(dp); /* * We return with ni_vp NULL to indicate that the entry * doesn't currently exist, leaving a pointer to the * (possibly locked) directory vnode in ndp->ni_dvp. */ return (0); } dp = *vpp; /* * Disallow directory write attempts on read-only filesystems. */ if (rdonly && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { if (dvp == dp) vrele(dvp); else vput(dvp); error = EROFS; goto bad; } /* * Set the parent lock/ref state to the requested state. */ if ((cnp->cn_flags & LOCKPARENT) == 0 && dvp != dp) { if (wantparent) VOP_UNLOCK(dvp); else vput(dvp); } else if (!wantparent) vrele(dvp); /* * Check for symbolic link */ KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW), ("relookup: symlink found.\n")); /* ASSERT(dvp == ndp->ni_startdir) */ if (cnp->cn_flags & SAVESTART) VREF(dvp); if ((cnp->cn_flags & LOCKLEAF) == 0) VOP_UNLOCK(dp); return (0); bad: vput(dp); *vpp = NULL; return (error); } /* * Free data allocated by namei(); see namei(9) for details. */ void NDFREE_PNBUF(struct nameidata *ndp) { if ((ndp->ni_cnd.cn_flags & HASBUF) != 0) { uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); ndp->ni_cnd.cn_flags &= ~HASBUF; } } void (NDFREE)(struct nameidata *ndp, const u_int flags) { int unlock_dvp; int unlock_vp; unlock_dvp = 0; unlock_vp = 0; if (!(flags & NDF_NO_FREE_PNBUF)) { NDFREE_PNBUF(ndp); } if (!(flags & NDF_NO_VP_UNLOCK) && (ndp->ni_cnd.cn_flags & LOCKLEAF) && ndp->ni_vp) unlock_vp = 1; if (!(flags & NDF_NO_DVP_UNLOCK) && (ndp->ni_cnd.cn_flags & LOCKPARENT) && ndp->ni_dvp != ndp->ni_vp) unlock_dvp = 1; if (!(flags & NDF_NO_VP_RELE) && ndp->ni_vp) { if (unlock_vp) { vput(ndp->ni_vp); unlock_vp = 0; } else vrele(ndp->ni_vp); ndp->ni_vp = NULL; } if (unlock_vp) VOP_UNLOCK(ndp->ni_vp); if (!(flags & NDF_NO_DVP_RELE) && (ndp->ni_cnd.cn_flags & (LOCKPARENT|WANTPARENT))) { if (unlock_dvp) { vput(ndp->ni_dvp); unlock_dvp = 0; } else vrele(ndp->ni_dvp); ndp->ni_dvp = NULL; } if (unlock_dvp) VOP_UNLOCK(ndp->ni_dvp); if (!(flags & NDF_NO_STARTDIR_RELE) && (ndp->ni_cnd.cn_flags & SAVESTART)) { vrele(ndp->ni_startdir); ndp->ni_startdir = NULL; } } /* * Determine if there is a suitable alternate filename under the specified * prefix for the specified path. If the create flag is set, then the * alternate prefix will be used so long as the parent directory exists. * This is used by the various compatibility ABIs so that Linux binaries prefer * files under /compat/linux for example. The chosen path (whether under * the prefix or under /) is returned in a kernel malloc'd buffer pointed * to by pathbuf. The caller is responsible for free'ing the buffer from * the M_TEMP bucket if one is returned. */ int kern_alternate_path(struct thread *td, const char *prefix, const char *path, enum uio_seg pathseg, char **pathbuf, int create, int dirfd) { struct nameidata nd, ndroot; char *ptr, *buf, *cp; size_t len, sz; int error; buf = (char *) malloc(MAXPATHLEN, M_TEMP, M_WAITOK); *pathbuf = buf; /* Copy the prefix into the new pathname as a starting point. */ len = strlcpy(buf, prefix, MAXPATHLEN); if (len >= MAXPATHLEN) { *pathbuf = NULL; free(buf, M_TEMP); return (EINVAL); } sz = MAXPATHLEN - len; ptr = buf + len; /* Append the filename to the prefix. */ if (pathseg == UIO_SYSSPACE) error = copystr(path, ptr, sz, &len); else error = copyinstr(path, ptr, sz, &len); if (error) { *pathbuf = NULL; free(buf, M_TEMP); return (error); } /* Only use a prefix with absolute pathnames. */ if (*ptr != '/') { error = EINVAL; goto keeporig; } if (dirfd != AT_FDCWD) { /* * We want the original because the "prefix" is * included in the already opened dirfd. */ bcopy(ptr, buf, len); return (0); } /* * We know that there is a / somewhere in this pathname. * Search backwards for it, to find the file's parent dir * to see if it exists in the alternate tree. If it does, * and we want to create a file (cflag is set). We don't * need to worry about the root comparison in this case. */ if (create) { for (cp = &ptr[len] - 1; *cp != '/'; cp--); *cp = '\0'; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, buf, td); error = namei(&nd); *cp = '/'; if (error != 0) goto keeporig; } else { NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, buf, td); error = namei(&nd); if (error != 0) goto keeporig; /* * We now compare the vnode of the prefix to the one * vnode asked. If they resolve to be the same, then we * ignore the match so that the real root gets used. * This avoids the problem of traversing "../.." to find the * root directory and never finding it, because "/" resolves * to the emulation root directory. This is expensive :-( */ NDINIT(&ndroot, LOOKUP, FOLLOW, UIO_SYSSPACE, prefix, td); /* We shouldn't ever get an error from this namei(). */ error = namei(&ndroot); if (error == 0) { if (nd.ni_vp == ndroot.ni_vp) error = ENOENT; NDFREE(&ndroot, NDF_ONLY_PNBUF); vrele(ndroot.ni_vp); } } NDFREE(&nd, NDF_ONLY_PNBUF); vrele(nd.ni_vp); keeporig: /* If there was an error, use the original path name. */ if (error) bcopy(ptr, buf, len); return (error); } Index: projects/clang1100-import/sys/kern/vfs_subr.c =================================================================== --- projects/clang1100-import/sys/kern/vfs_subr.c (revision 364278) +++ projects/clang1100-import/sys/kern/vfs_subr.c (revision 364279) @@ -1,6681 +1,6679 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 */ /* * External virtual filesystem routines */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_watchdog.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif static void delmntque(struct vnode *vp); static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo); static void syncer_shutdown(void *arg, int howto); static int vtryrecycle(struct vnode *vp); static void v_init_counters(struct vnode *); static void vgonel(struct vnode *); static void vfs_knllock(void *arg); static void vfs_knlunlock(void *arg); static void vfs_knl_assert_locked(void *arg); static void vfs_knl_assert_unlocked(void *arg); static void destroy_vpollinfo(struct vpollinfo *vi); static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, daddr_t startlbn, daddr_t endlbn); static void vnlru_recalc(void); /* * These fences are intended for cases where some synchronization is * needed between access of v_iflags and lockless vnode refcount (v_holdcnt * and v_usecount) updates. Access to v_iflags is generally synchronized * by the interlock, but we have some internal assertions that check vnode * flags without acquiring the lock. Thus, these fences are INVARIANTS-only * for now. */ #ifdef INVARIANTS #define VNODE_REFCOUNT_FENCE_ACQ() atomic_thread_fence_acq() #define VNODE_REFCOUNT_FENCE_REL() atomic_thread_fence_rel() #else #define VNODE_REFCOUNT_FENCE_ACQ() #define VNODE_REFCOUNT_FENCE_REL() #endif /* * Number of vnodes in existence. Increased whenever getnewvnode() * allocates a new vnode, decreased in vdropl() for VIRF_DOOMED vnode. */ static u_long __exclusive_cache_line numvnodes; SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, "Number of vnodes in existence"); static counter_u64_t vnodes_created; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created, "Number of vnodes created by getnewvnode"); /* * Conversion tables for conversion from vnode types to inode formats * and back. */ enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON }; int vttoif_tab[10] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT }; /* * List of allocates vnodes in the system. */ static TAILQ_HEAD(freelst, vnode) vnode_list; static struct vnode *vnode_list_free_marker; static struct vnode *vnode_list_reclaim_marker; /* * "Free" vnode target. Free vnodes are rarely completely free, but are * just ones that are cheap to recycle. Usually they are for files which * have been stat'd but not read; these usually have inode and namecache * data attached to them. This target is the preferred minimum size of a * sub-cache consisting mostly of such files. The system balances the size * of this sub-cache with its complement to try to prevent either from * thrashing while the other is relatively inactive. The targets express * a preference for the best balance. * * "Above" this target there are 2 further targets (watermarks) related * to recyling of free vnodes. In the best-operating case, the cache is * exactly full, the free list has size between vlowat and vhiwat above the * free target, and recycling from it and normal use maintains this state. * Sometimes the free list is below vlowat or even empty, but this state * is even better for immediate use provided the cache is not full. * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free * ones) to reach one of these states. The watermarks are currently hard- * coded as 4% and 9% of the available space higher. These and the default * of 25% for wantfreevnodes are too large if the memory size is large. * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim * whenever vnlru_proc() becomes active. */ static long wantfreevnodes; static long __exclusive_cache_line freevnodes; SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "Number of \"free\" vnodes"); static long freevnodes_old; static counter_u64_t recycles_count; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, "Number of vnodes recycled to meet vnode cache targets"); static counter_u64_t recycles_free_count; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, recycles_free, CTLFLAG_RD, &recycles_free_count, "Number of free vnodes recycled to meet vnode cache targets"); static counter_u64_t deferred_inact; SYSCTL_COUNTER_U64(_vfs, OID_AUTO, deferred_inact, CTLFLAG_RD, &deferred_inact, "Number of times inactive processing was deferred"); /* To keep more than one thread at a time from running vfs_getnewfsid */ static struct mtx mntid_mtx; /* * Lock for any access to the following: * vnode_list * numvnodes * freevnodes */ static struct mtx __exclusive_cache_line vnode_list_mtx; /* Publicly exported FS */ struct nfs_public nfs_pub; static uma_zone_t buf_trie_zone; static smr_t buf_trie_smr; /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ static uma_zone_t vnode_zone; static uma_zone_t vnodepoll_zone; __read_frequently smr_t vfs_smr; /* * The workitem queue. * * It is useful to delay writes of file data and filesystem metadata * for tens of seconds so that quickly created and deleted files need * not waste disk bandwidth being created and removed. To realize this, * we append vnodes to a "workitem" queue. When running with a soft * updates implementation, most pending metadata dependencies should * not wait for more than a few seconds. Thus, mounted on block devices * are delayed only about a half the time that file data is delayed. * Similarly, directory updates are more critical, so are only delayed * about a third the time that file data is delayed. Thus, there are * SYNCER_MAXDELAY queues that are processed round-robin at a rate of * one each second (driven off the filesystem syncer process). The * syncer_delayno variable indicates the next queue that is to be processed. * Items that need to be processed soon are placed in this queue: * * syncer_workitem_pending[syncer_delayno] * * A delay of fifteen seconds is done by placing the request fifteen * entries later in the queue: * * syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask] * */ static int syncer_delayno; static long syncer_mask; LIST_HEAD(synclist, bufobj); static struct synclist *syncer_workitem_pending; /* * The sync_mtx protects: * bo->bo_synclist * sync_vnode_count * syncer_delayno * syncer_state * syncer_workitem_pending * syncer_worklist_len * rushjob */ static struct mtx sync_mtx; static struct cv sync_wakeup; #define SYNCER_MAXDELAY 32 static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ static int syncdelay = 30; /* max time to delay syncing data */ static int filedelay = 30; /* time to delay syncing files */ SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "Time to delay syncing files (in seconds)"); static int dirdelay = 29; /* time to delay syncing directories */ SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "Time to delay syncing directories (in seconds)"); static int metadelay = 28; /* time to delay syncing metadata */ SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "Time to delay syncing metadata (in seconds)"); static int rushjob; /* number of slots to run ASAP */ static int stat_rush_requests; /* number of times I/O speeded up */ SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "Number of times I/O speeded up (rush requests)"); #define VDBATCH_SIZE 8 struct vdbatch { u_int index; long freevnodes; struct mtx lock; struct vnode *tab[VDBATCH_SIZE]; }; DPCPU_DEFINE_STATIC(struct vdbatch, vd); static void vdbatch_dequeue(struct vnode *vp); /* * When shutting down the syncer, run it at four times normal speed. */ #define SYNCER_SHUTDOWN_SPEEDUP 4 static int sync_vnode_count; static int syncer_worklist_len; static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } syncer_state; /* Target for maximum number of vnodes. */ u_long desiredvnodes; static u_long gapvnodes; /* gap between wanted and desired */ static u_long vhiwat; /* enough extras after expansion */ static u_long vlowat; /* minimal extras before expansion */ static u_long vstir; /* nonzero to stir non-free vnodes */ static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ static u_long vnlru_read_freevnodes(void); /* * Note that no attempt is made to sanitize these parameters. */ static int sysctl_maxvnodes(SYSCTL_HANDLER_ARGS) { u_long val; int error; val = desiredvnodes; error = sysctl_handle_long(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val == desiredvnodes) return (0); mtx_lock(&vnode_list_mtx); desiredvnodes = val; wantfreevnodes = desiredvnodes / 4; vnlru_recalc(); mtx_unlock(&vnode_list_mtx); /* * XXX There is no protection against multiple threads changing * desiredvnodes at the same time. Locking above only helps vnlru and * getnewvnode. */ vfs_hash_changesize(desiredvnodes); cache_changesize(desiredvnodes); return (0); } SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_maxvnodes, "LU", "Target for maximum number of vnodes"); static int sysctl_wantfreevnodes(SYSCTL_HANDLER_ARGS) { u_long val; int error; val = wantfreevnodes; error = sysctl_handle_long(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val == wantfreevnodes) return (0); mtx_lock(&vnode_list_mtx); wantfreevnodes = val; vnlru_recalc(); mtx_unlock(&vnode_list_mtx); return (0); } SYSCTL_PROC(_vfs, OID_AUTO, wantfreevnodes, CTLTYPE_ULONG | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0, sysctl_wantfreevnodes, "LU", "Target for minimum number of \"free\" vnodes"); SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)"); static int vnlru_nowhere; SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); static int sysctl_try_reclaim_vnode(SYSCTL_HANDLER_ARGS) { struct vnode *vp; struct nameidata nd; char *buf; unsigned long ndflags; int error; if (req->newptr == NULL) return (EINVAL); if (req->newlen >= PATH_MAX) return (E2BIG); buf = malloc(PATH_MAX, M_TEMP, M_WAITOK); error = SYSCTL_IN(req, buf, req->newlen); if (error != 0) goto out; buf[req->newlen] = '\0'; ndflags = LOCKLEAF | NOFOLLOW | AUDITVNODE1 | SAVENAME; NDINIT(&nd, LOOKUP, ndflags, UIO_SYSSPACE, buf, curthread); if ((error = namei(&nd)) != 0) goto out; vp = nd.ni_vp; if (VN_IS_DOOMED(vp)) { /* * This vnode is being recycled. Return != 0 to let the caller * know that the sysctl had no effect. Return EAGAIN because a * subsequent call will likely succeed (since namei will create * a new vnode if necessary) */ error = EAGAIN; goto putvnode; } counter_u64_add(recycles_count, 1); vgone(vp); putvnode: NDFREE(&nd, 0); out: free(buf, M_TEMP); return (error); } static int sysctl_ftry_reclaim_vnode(SYSCTL_HANDLER_ARGS) { struct thread *td = curthread; struct vnode *vp; struct file *fp; int error; int fd; if (req->newptr == NULL) return (EBADF); error = sysctl_handle_int(oidp, &fd, 0, req); if (error != 0) return (error); error = getvnode(curthread, fd, &cap_fcntl_rights, &fp); if (error != 0) return (error); vp = fp->f_vnode; error = vn_lock(vp, LK_EXCLUSIVE); if (error != 0) goto drop; counter_u64_add(recycles_count, 1); vgone(vp); VOP_UNLOCK(vp); drop: fdrop(fp, td); return (error); } SYSCTL_PROC(_debug, OID_AUTO, try_reclaim_vnode, CTLTYPE_STRING | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_try_reclaim_vnode, "A", "Try to reclaim a vnode by its pathname"); SYSCTL_PROC(_debug, OID_AUTO, ftry_reclaim_vnode, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_ftry_reclaim_vnode, "I", "Try to reclaim a vnode by its file descriptor"); /* Shift count for (uintptr_t)vp to initialize vp->v_hash. */ static int vnsz2log; /* * Support for the bufobj clean & dirty pctrie. */ static void * buf_trie_alloc(struct pctrie *ptree) { return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT)); } static void buf_trie_free(struct pctrie *ptree, void *node) { uma_zfree_smr(buf_trie_zone, node); } PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free, buf_trie_smr); /* * Initialize the vnode management data structures. * * Reevaluate the following cap on the number of vnodes after the physical * memory size exceeds 512GB. In the limit, as the physical memory size * grows, the ratio of the memory size in KB to vnodes approaches 64:1. */ #ifndef MAXVNODES_MAX #define MAXVNODES_MAX (512UL * 1024 * 1024 / 64) /* 8M */ #endif static MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker"); static struct vnode * vn_alloc_marker(struct mount *mp) { struct vnode *vp; vp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO); vp->v_type = VMARKER; vp->v_mount = mp; return (vp); } static void vn_free_marker(struct vnode *vp) { MPASS(vp->v_type == VMARKER); free(vp, M_VNODE_MARKER); } /* * Initialize a vnode as it first enters the zone. */ static int vnode_init(void *mem, int size, int flags) { struct vnode *vp; vp = mem; bzero(vp, size); /* * Setup locks. */ vp->v_vnlock = &vp->v_lock; mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); /* * By default, don't allow shared locks unless filesystems opt-in. */ lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE); /* * Initialize bufobj. */ bufobj_init(&vp->v_bufobj, vp); /* * Initialize namecache. */ cache_vnode_init(vp); /* * Initialize rangelocks. */ rangelock_init(&vp->v_rl); vp->v_dbatchcpu = NOCPU; mtx_lock(&vnode_list_mtx); TAILQ_INSERT_BEFORE(vnode_list_free_marker, vp, v_vnodelist); mtx_unlock(&vnode_list_mtx); return (0); } /* * Free a vnode when it is cleared from the zone. */ static void vnode_fini(void *mem, int size) { struct vnode *vp; struct bufobj *bo; vp = mem; vdbatch_dequeue(vp); mtx_lock(&vnode_list_mtx); TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); mtx_unlock(&vnode_list_mtx); rangelock_destroy(&vp->v_rl); lockdestroy(vp->v_vnlock); mtx_destroy(&vp->v_interlock); bo = &vp->v_bufobj; rw_destroy(BO_LOCKPTR(bo)); } /* * Provide the size of NFS nclnode and NFS fh for calculation of the * vnode memory consumption. The size is specified directly to * eliminate dependency on NFS-private header. * * Other filesystems may use bigger or smaller (like UFS and ZFS) * private inode data, but the NFS-based estimation is ample enough. * Still, we care about differences in the size between 64- and 32-bit * platforms. * * Namecache structure size is heuristically * sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1. */ #ifdef _LP64 #define NFS_NCLNODE_SZ (528 + 64) #define NC_SZ 148 #else #define NFS_NCLNODE_SZ (360 + 32) #define NC_SZ 92 #endif static void vntblinit(void *dummy __unused) { struct vdbatch *vd; int cpu, physvnodes, virtvnodes; u_int i; /* * Desiredvnodes is a function of the physical memory size and the * kernel's heap size. Generally speaking, it scales with the * physical memory size. The ratio of desiredvnodes to the physical * memory size is 1:16 until desiredvnodes exceeds 98,304. * Thereafter, the * marginal ratio of desiredvnodes to the physical memory size is * 1:64. However, desiredvnodes is limited by the kernel's heap * size. The memory required by desiredvnodes vnodes and vm objects * must not exceed 1/10th of the kernel's heap size. */ physvnodes = maxproc + pgtok(vm_cnt.v_page_count) / 64 + 3 * min(98304 * 16, pgtok(vm_cnt.v_page_count)) / 64; virtvnodes = vm_kmem_size / (10 * (sizeof(struct vm_object) + sizeof(struct vnode) + NC_SZ * ncsizefactor + NFS_NCLNODE_SZ)); desiredvnodes = min(physvnodes, virtvnodes); if (desiredvnodes > MAXVNODES_MAX) { if (bootverbose) printf("Reducing kern.maxvnodes %lu -> %lu\n", desiredvnodes, MAXVNODES_MAX); desiredvnodes = MAXVNODES_MAX; } wantfreevnodes = desiredvnodes / 4; mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF); TAILQ_INIT(&vnode_list); mtx_init(&vnode_list_mtx, "vnode_list", NULL, MTX_DEF); /* * The lock is taken to appease WITNESS. */ mtx_lock(&vnode_list_mtx); vnlru_recalc(); mtx_unlock(&vnode_list_mtx); vnode_list_free_marker = vn_alloc_marker(NULL); TAILQ_INSERT_HEAD(&vnode_list, vnode_list_free_marker, v_vnodelist); vnode_list_reclaim_marker = vn_alloc_marker(NULL); TAILQ_INSERT_HEAD(&vnode_list, vnode_list_reclaim_marker, v_vnodelist); vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, vnode_init, vnode_fini, UMA_ALIGN_PTR, 0); uma_zone_set_smr(vnode_zone, vfs_smr); vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* * Preallocate enough nodes to support one-per buf so that * we can not fail an insert. reassignbuf() callers can not * tolerate the insertion failure. */ buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(), NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_SMR); buf_trie_smr = uma_zone_get_smr(buf_trie_zone); uma_prealloc(buf_trie_zone, nbuf); vnodes_created = counter_u64_alloc(M_WAITOK); recycles_count = counter_u64_alloc(M_WAITOK); recycles_free_count = counter_u64_alloc(M_WAITOK); deferred_inact = counter_u64_alloc(M_WAITOK); /* * Initialize the filesystem syncer. */ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, &syncer_mask); syncer_maxdelay = syncer_mask + 1; mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF); cv_init(&sync_wakeup, "syncer"); for (i = 1; i <= sizeof(struct vnode); i <<= 1) vnsz2log++; vnsz2log--; CPU_FOREACH(cpu) { vd = DPCPU_ID_PTR((cpu), vd); bzero(vd, sizeof(*vd)); mtx_init(&vd->lock, "vdbatch", NULL, MTX_DEF); } } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL); /* * Mark a mount point as busy. Used to synchronize access and to delay * unmounting. Eventually, mountlist_mtx is not released on failure. * * vfs_busy() is a custom lock, it can block the caller. * vfs_busy() only sleeps if the unmount is active on the mount point. * For a mountpoint mp, vfs_busy-enforced lock is before lock of any * vnode belonging to mp. * * Lookup uses vfs_busy() to traverse mount points. * root fs var fs * / vnode lock A / vnode lock (/var) D * /var vnode lock B /log vnode lock(/var/log) E * vfs_busy lock C vfs_busy lock F * * Within each file system, the lock order is C->A->B and F->D->E. * * When traversing across mounts, the system follows that lock order: * * C->A->B * | * +->F->D->E * * The lookup() process for namei("/var") illustrates the process: * VOP_LOOKUP() obtains B while A is held * vfs_busy() obtains a shared lock on F while A and B are held * vput() releases lock on B * vput() releases lock on A * VFS_ROOT() obtains lock on D while shared lock on F is held * vfs_unbusy() releases shared lock on F * vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A. * Attempt to lock A (instead of vp_crossmp) while D is held would * violate the global order, causing deadlocks. * * dounmount() locks B while F is drained. */ int vfs_busy(struct mount *mp, int flags) { MPASS((flags & ~MBF_MASK) == 0); CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags); if (vfs_op_thread_enter(mp)) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); MPASS((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0); MPASS((mp->mnt_kern_flag & MNTK_REFEXPIRE) == 0); vfs_mp_count_add_pcpu(mp, ref, 1); vfs_mp_count_add_pcpu(mp, lockref, 1); vfs_op_thread_exit(mp); if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); return (0); } MNT_ILOCK(mp); vfs_assert_mount_counters(mp); MNT_REF(mp); /* * If mount point is currently being unmounted, sleep until the * mount point fate is decided. If thread doing the unmounting fails, * it will clear MNTK_UNMOUNT flag before waking us up, indicating * that this mount point has survived the unmount attempt and vfs_busy * should retry. Otherwise the unmounter thread will set MNTK_REFEXPIRE * flag in addition to MNTK_UNMOUNT, indicating that mount point is * about to be really destroyed. vfs_busy needs to release its * reference on the mount point in this case and return with ENOENT, * telling the caller that mount mount it tried to busy is no longer * valid. */ while (mp->mnt_kern_flag & MNTK_UNMOUNT) { if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) { MNT_REL(mp); MNT_IUNLOCK(mp); CTR1(KTR_VFS, "%s: failed busying before sleeping", __func__); return (ENOENT); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_kern_flag |= MNTK_MWAIT; msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0); if (flags & MBF_MNTLSTLOCK) mtx_lock(&mountlist_mtx); MNT_ILOCK(mp); } if (flags & MBF_MNTLSTLOCK) mtx_unlock(&mountlist_mtx); mp->mnt_lockref++; MNT_IUNLOCK(mp); return (0); } /* * Free a busy filesystem. */ void vfs_unbusy(struct mount *mp) { int c; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if (vfs_op_thread_enter(mp)) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); vfs_mp_count_sub_pcpu(mp, lockref, 1); vfs_mp_count_sub_pcpu(mp, ref, 1); vfs_op_thread_exit(mp); return; } MNT_ILOCK(mp); vfs_assert_mount_counters(mp); MNT_REL(mp); c = --mp->mnt_lockref; if (mp->mnt_vfs_ops == 0) { MPASS((mp->mnt_kern_flag & MNTK_DRAINING) == 0); MNT_IUNLOCK(mp); return; } if (c < 0) vfs_dump_mount_counters(mp); if (c == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) { MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT); CTR1(KTR_VFS, "%s: waking up waiters", __func__); mp->mnt_kern_flag &= ~MNTK_DRAINING; wakeup(&mp->mnt_lockref); } MNT_IUNLOCK(mp); } /* * Lookup a mount point by filesystem identifier. */ struct mount * vfs_getvfs(fsid_t *fsid) { struct mount *mp; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { vfs_ref(mp); mtx_unlock(&mountlist_mtx); return (mp); } } mtx_unlock(&mountlist_mtx); CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); return ((struct mount *) 0); } /* * Lookup a mount point by filesystem identifier, busying it before * returning. * * To avoid congestion on mountlist_mtx, implement simple direct-mapped * cache for popular filesystem identifiers. The cache is lockess, using * the fact that struct mount's are never freed. In worst case we may * get pointer to unmounted or even different filesystem, so we have to * check what we got, and go slow way if so. */ struct mount * vfs_busyfs(fsid_t *fsid) { #define FSID_CACHE_SIZE 256 typedef struct mount * volatile vmp_t; static vmp_t cache[FSID_CACHE_SIZE]; struct mount *mp; int error; uint32_t hash; CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid); hash = fsid->val[0] ^ fsid->val[1]; hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1); mp = cache[hash]; if (mp == NULL || fsidcmp(&mp->mnt_stat.f_fsid, fsid) != 0) goto slow; if (vfs_busy(mp, 0) != 0) { cache[hash] = NULL; goto slow; } if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) return (mp); else vfs_unbusy(mp); slow: mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (fsidcmp(&mp->mnt_stat.f_fsid, fsid) == 0) { error = vfs_busy(mp, MBF_MNTLSTLOCK); if (error) { cache[hash] = NULL; mtx_unlock(&mountlist_mtx); return (NULL); } cache[hash] = mp; return (mp); } } CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid); mtx_unlock(&mountlist_mtx); return ((struct mount *) 0); } /* * Check if a user can access privileged mount options. */ int vfs_suser(struct mount *mp, struct thread *td) { int error; if (jailed(td->td_ucred)) { /* * If the jail of the calling thread lacks permission for * this type of file system, deny immediately. */ if (!prison_allow(td->td_ucred, mp->mnt_vfc->vfc_prison_flag)) return (EPERM); /* * If the file system was mounted outside the jail of the * calling thread, deny immediately. */ if (prison_check(td->td_ucred, mp->mnt_cred) != 0) return (EPERM); } /* * If file system supports delegated administration, we don't check * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified * by the file system itself. * If this is not the user that did original mount, we check for * the PRIV_VFS_MOUNT_OWNER privilege. */ if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) && mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) { if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0) return (error); } return (0); } /* * Get a new unique fsid. Try to make its val[0] unique, since this value * will be used to create fake device numbers for stat(). Also try (but * not so hard) make its val[0] unique mod 2^16, since some emulators only * support 16-bit device numbers. We end up with unique val[0]'s for the * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls. * * Keep in mind that several mounts may be running in parallel. Starting * the search one past where the previous search terminated is both a * micro-optimization and a defense against returning the same fsid to * different mounts. */ void vfs_getnewfsid(struct mount *mp) { static uint16_t mntid_base; struct mount *nmp; fsid_t tfsid; int mtype; CTR2(KTR_VFS, "%s: mp %p", __func__, mp); mtx_lock(&mntid_mtx); mtype = mp->mnt_vfc->vfc_typenum; tfsid.val[1] = mtype; mtype = (mtype & 0xFF) << 24; for (;;) { tfsid.val[0] = makedev(255, mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF)); mntid_base++; if ((nmp = vfs_getvfs(&tfsid)) == NULL) break; vfs_rel(nmp); } mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; mp->mnt_stat.f_fsid.val[1] = tfsid.val[1]; mtx_unlock(&mntid_mtx); } /* * Knob to control the precision of file timestamps: * * 0 = seconds only; nanoseconds zeroed. * 1 = seconds and nanoseconds, accurate within 1/HZ. * 2 = seconds and nanoseconds, truncated to microseconds. * >=3 = seconds and nanoseconds, maximum precision. */ enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC }; static int timestamp_precision = TSP_USEC; SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW, ×tamp_precision, 0, "File timestamp precision (0: seconds, " "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, " "3+: sec + ns (max. precision))"); /* * Get a current timestamp. */ void vfs_timestamp(struct timespec *tsp) { struct timeval tv; switch (timestamp_precision) { case TSP_SEC: tsp->tv_sec = time_second; tsp->tv_nsec = 0; break; case TSP_HZ: getnanotime(tsp); break; case TSP_USEC: microtime(&tv); TIMEVAL_TO_TIMESPEC(&tv, tsp); break; case TSP_NSEC: default: nanotime(tsp); break; } } /* * Set vnode attributes to VNOVAL */ void vattr_null(struct vattr *vap) { vap->va_type = VNON; vap->va_size = VNOVAL; vap->va_bytes = VNOVAL; vap->va_mode = VNOVAL; vap->va_nlink = VNOVAL; vap->va_uid = VNOVAL; vap->va_gid = VNOVAL; vap->va_fsid = VNOVAL; vap->va_fileid = VNOVAL; vap->va_blocksize = VNOVAL; vap->va_rdev = VNOVAL; vap->va_atime.tv_sec = VNOVAL; vap->va_atime.tv_nsec = VNOVAL; vap->va_mtime.tv_sec = VNOVAL; vap->va_mtime.tv_nsec = VNOVAL; vap->va_ctime.tv_sec = VNOVAL; vap->va_ctime.tv_nsec = VNOVAL; vap->va_birthtime.tv_sec = VNOVAL; vap->va_birthtime.tv_nsec = VNOVAL; vap->va_flags = VNOVAL; vap->va_gen = VNOVAL; vap->va_vaflags = 0; } /* * Try to reduce the total number of vnodes. * * This routine (and its user) are buggy in at least the following ways: * - all parameters were picked years ago when RAM sizes were significantly * smaller * - it can pick vnodes based on pages used by the vm object, but filesystems * like ZFS don't use it making the pick broken * - since ZFS has its own aging policy it gets partially combated by this one * - a dedicated method should be provided for filesystems to let them decide * whether the vnode should be recycled * * This routine is called when we have too many vnodes. It attempts * to free vnodes and will potentially free vnodes that still * have VM backing store (VM backing store is typically the cause * of a vnode blowout so we want to do this). Therefore, this operation * is not considered cheap. * * A number of conditions may prevent a vnode from being reclaimed. * the buffer cache may have references on the vnode, a directory * vnode may still have references due to the namei cache representing * underlying files, or the vnode may be in active use. It is not * desirable to reuse such vnodes. These conditions may cause the * number of vnodes to reach some minimum value regardless of what * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. * * @param reclaim_nc_src Only reclaim directories with outgoing namecache * entries if this argument is strue * @param trigger Only reclaim vnodes with fewer than this many resident * pages. * @param target How many vnodes to reclaim. * @return The number of vnodes that were reclaimed. */ static int vlrureclaim(bool reclaim_nc_src, int trigger, u_long target) { struct vnode *vp, *mvp; struct mount *mp; struct vm_object *object; u_long done; bool retried; mtx_assert(&vnode_list_mtx, MA_OWNED); retried = false; done = 0; mvp = vnode_list_reclaim_marker; restart: vp = mvp; while (done < target) { vp = TAILQ_NEXT(vp, v_vnodelist); if (__predict_false(vp == NULL)) break; if (__predict_false(vp->v_type == VMARKER)) continue; /* * If it's been deconstructed already, it's still * referenced, or it exceeds the trigger, skip it. * Also skip free vnodes. We are trying to make space * to expand the free list, not reduce it. */ if (vp->v_usecount > 0 || vp->v_holdcnt == 0 || (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src))) goto next_iter; if (vp->v_type == VBAD || vp->v_type == VNON) goto next_iter; if (!VI_TRYLOCK(vp)) goto next_iter; if (vp->v_usecount > 0 || vp->v_holdcnt == 0 || (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || VN_IS_DOOMED(vp) || vp->v_type == VNON) { VI_UNLOCK(vp); goto next_iter; } object = atomic_load_ptr(&vp->v_object); if (object == NULL || object->resident_page_count > trigger) { VI_UNLOCK(vp); goto next_iter; } vholdl(vp); VI_UNLOCK(vp); TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop(vp); goto next_iter_unlocked; } if (VOP_LOCK(vp, LK_EXCLUSIVE|LK_NOWAIT) != 0) { vdrop(vp); vn_finished_write(mp); goto next_iter_unlocked; } VI_LOCK(vp); if (vp->v_usecount > 0 || (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VOP_UNLOCK(vp); vdropl(vp); vn_finished_write(mp); goto next_iter_unlocked; } counter_u64_add(recycles_count, 1); vgonel(vp); VOP_UNLOCK(vp); vdropl(vp); vn_finished_write(mp); done++; next_iter_unlocked: if (should_yield()) kern_yield(PRI_USER); mtx_lock(&vnode_list_mtx); goto restart; next_iter: MPASS(vp->v_type != VMARKER); if (!should_yield()) continue; TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); mtx_unlock(&vnode_list_mtx); kern_yield(PRI_USER); mtx_lock(&vnode_list_mtx); goto restart; } if (done == 0 && !retried) { TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_HEAD(&vnode_list, mvp, v_vnodelist); retried = true; goto restart; } return (done); } static int max_vnlru_free = 10000; /* limit on vnode free requests per call */ SYSCTL_INT(_debug, OID_AUTO, max_vnlru_free, CTLFLAG_RW, &max_vnlru_free, 0, "limit on vnode free requests per call to the vnlru_free routine"); /* * Attempt to reduce the free list by the requested amount. */ static int vnlru_free_locked(int count, struct vfsops *mnt_op) { struct vnode *vp, *mvp; struct mount *mp; int ocount; mtx_assert(&vnode_list_mtx, MA_OWNED); if (count > max_vnlru_free) count = max_vnlru_free; ocount = count; mvp = vnode_list_free_marker; restart: vp = mvp; while (count > 0) { vp = TAILQ_NEXT(vp, v_vnodelist); if (__predict_false(vp == NULL)) { TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_TAIL(&vnode_list, mvp, v_vnodelist); break; } if (__predict_false(vp->v_type == VMARKER)) continue; /* * Don't recycle if our vnode is from different type * of mount point. Note that mp is type-safe, the * check does not reach unmapped address even if * vnode is reclaimed. * Don't recycle if we can't get the interlock without * blocking. */ if (vp->v_holdcnt > 0 || (mnt_op != NULL && (mp = vp->v_mount) != NULL && mp->mnt_op != mnt_op) || !VI_TRYLOCK(vp)) { continue; } TAILQ_REMOVE(&vnode_list, mvp, v_vnodelist); TAILQ_INSERT_AFTER(&vnode_list, vp, mvp, v_vnodelist); if (__predict_false(vp->v_type == VBAD || vp->v_type == VNON)) { VI_UNLOCK(vp); continue; } vholdl(vp); count--; mtx_unlock(&vnode_list_mtx); VI_UNLOCK(vp); vtryrecycle(vp); vdrop(vp); mtx_lock(&vnode_list_mtx); goto restart; } return (ocount - count); } void vnlru_free(int count, struct vfsops *mnt_op) { mtx_lock(&vnode_list_mtx); vnlru_free_locked(count, mnt_op); mtx_unlock(&vnode_list_mtx); } static void vnlru_recalc(void) { mtx_assert(&vnode_list_mtx, MA_OWNED); gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */ vlowat = vhiwat / 2; } /* * Attempt to recycle vnodes in a context that is always safe to block. * Calling vlrurecycle() from the bowels of filesystem code has some * interesting deadlock problems. */ static struct proc *vnlruproc; static int vnlruproc_sig; /* * The main freevnodes counter is only updated when threads requeue their vnode * batches. CPUs are conditionally walked to compute a more accurate total. * * Limit how much of a slop are we willing to tolerate. Note: the actual value * at any given moment can still exceed slop, but it should not be by significant * margin in practice. */ #define VNLRU_FREEVNODES_SLOP 128 static u_long vnlru_read_freevnodes(void) { struct vdbatch *vd; long slop; int cpu; mtx_assert(&vnode_list_mtx, MA_OWNED); if (freevnodes > freevnodes_old) slop = freevnodes - freevnodes_old; else slop = freevnodes_old - freevnodes; if (slop < VNLRU_FREEVNODES_SLOP) return (freevnodes >= 0 ? freevnodes : 0); freevnodes_old = freevnodes; CPU_FOREACH(cpu) { vd = DPCPU_ID_PTR((cpu), vd); freevnodes_old += vd->freevnodes; } return (freevnodes_old >= 0 ? freevnodes_old : 0); } static bool vnlru_under(u_long rnumvnodes, u_long limit) { u_long rfreevnodes, space; if (__predict_false(rnumvnodes > desiredvnodes)) return (true); space = desiredvnodes - rnumvnodes; if (space < limit) { rfreevnodes = vnlru_read_freevnodes(); if (rfreevnodes > wantfreevnodes) space += rfreevnodes - wantfreevnodes; } return (space < limit); } static bool vnlru_under_unlocked(u_long rnumvnodes, u_long limit) { long rfreevnodes, space; if (__predict_false(rnumvnodes > desiredvnodes)) return (true); space = desiredvnodes - rnumvnodes; if (space < limit) { rfreevnodes = atomic_load_long(&freevnodes); if (rfreevnodes > wantfreevnodes) space += rfreevnodes - wantfreevnodes; } return (space < limit); } static void vnlru_kick(void) { mtx_assert(&vnode_list_mtx, MA_OWNED); if (vnlruproc_sig == 0) { vnlruproc_sig = 1; wakeup(vnlruproc); } } static void vnlru_proc(void) { u_long rnumvnodes, rfreevnodes, target; unsigned long onumvnodes; int done, force, trigger, usevnodes; bool reclaim_nc_src, want_reread; EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, SHUTDOWN_PRI_FIRST); force = 0; want_reread = false; for (;;) { kproc_suspend_check(vnlruproc); mtx_lock(&vnode_list_mtx); rnumvnodes = atomic_load_long(&numvnodes); if (want_reread) { force = vnlru_under(numvnodes, vhiwat) ? 1 : 0; want_reread = false; } /* * If numvnodes is too large (due to desiredvnodes being * adjusted using its sysctl, or emergency growth), first * try to reduce it by discarding from the free list. */ if (rnumvnodes > desiredvnodes) { vnlru_free_locked(rnumvnodes - desiredvnodes, NULL); rnumvnodes = atomic_load_long(&numvnodes); } /* * Sleep if the vnode cache is in a good state. This is * when it is not over-full and has space for about a 4% * or 9% expansion (by growing its size or inexcessively * reducing its free list). Otherwise, try to reclaim * space for a 10% expansion. */ if (vstir && force == 0) { force = 1; vstir = 0; } if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) { vnlruproc_sig = 0; wakeup(&vnlruproc_sig); msleep(vnlruproc, &vnode_list_mtx, PVFS|PDROP, "vlruwt", hz); continue; } rfreevnodes = vnlru_read_freevnodes(); onumvnodes = rnumvnodes; /* * Calculate parameters for recycling. These are the same * throughout the loop to give some semblance of fairness. * The trigger point is to avoid recycling vnodes with lots * of resident pages. We aren't trying to free memory; we * are trying to recycle or at least free vnodes. */ if (rnumvnodes <= desiredvnodes) usevnodes = rnumvnodes - rfreevnodes; else usevnodes = rnumvnodes; if (usevnodes <= 0) usevnodes = 1; /* * The trigger value is is chosen to give a conservatively * large value to ensure that it alone doesn't prevent * making progress. The value can easily be so large that * it is effectively infinite in some congested and * misconfigured cases, and this is necessary. Normally * it is about 8 to 100 (pages), which is quite large. */ trigger = vm_cnt.v_page_count * 2 / usevnodes; if (force < 2) trigger = vsmalltrigger; reclaim_nc_src = force >= 3; target = rnumvnodes * (int64_t)gapvnodes / imax(desiredvnodes, 1); target = target / 10 + 1; done = vlrureclaim(reclaim_nc_src, trigger, target); mtx_unlock(&vnode_list_mtx); if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes) uma_reclaim(UMA_RECLAIM_DRAIN); if (done == 0) { if (force == 0 || force == 1) { force = 2; continue; } if (force == 2) { force = 3; continue; } want_reread = true; force = 0; vnlru_nowhere++; tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); } else { want_reread = true; kern_yield(PRI_USER); } } } static struct kproc_desc vnlru_kp = { "vnlru", vnlru_proc, &vnlruproc }; SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &vnlru_kp); /* * Routines having to do with the management of the vnode table. */ /* * Try to recycle a freed vnode. We abort if anyone picks up a reference * before we actually vgone(). This function must be called with the vnode * held to prevent the vnode from being returned to the free list midway * through vgone(). */ static int vtryrecycle(struct vnode *vp) { struct mount *vnmp; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VNASSERT(vp->v_holdcnt, vp, ("vtryrecycle: Recycling vp %p without a reference.", vp)); /* * This vnode may found and locked via some other list, if so we * can't recycle it yet. */ if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { CTR2(KTR_VFS, "%s: impossible to recycle, vp %p lock is already held", __func__, vp); return (EWOULDBLOCK); } /* * Don't recycle if its filesystem is being suspended. */ if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) { VOP_UNLOCK(vp); CTR2(KTR_VFS, "%s: impossible to recycle, cannot start the write for %p", __func__, vp); return (EBUSY); } /* * If we got this far, we need to acquire the interlock and see if * anyone picked up this vnode from another list. If not, we will * mark it with DOOMED via vgonel() so that anyone who does find it * will skip over it. */ VI_LOCK(vp); if (vp->v_usecount) { VOP_UNLOCK(vp); VI_UNLOCK(vp); vn_finished_write(vnmp); CTR2(KTR_VFS, "%s: impossible to recycle, %p is already referenced", __func__, vp); return (EBUSY); } if (!VN_IS_DOOMED(vp)) { counter_u64_add(recycles_free_count, 1); vgonel(vp); } VOP_UNLOCK(vp); VI_UNLOCK(vp); vn_finished_write(vnmp); return (0); } /* * Allocate a new vnode. * * The operation never returns an error. Returning an error was disabled * in r145385 (dated 2005) with the following comment: * * XXX Not all VFS_VGET/ffs_vget callers check returns. * * Given the age of this commit (almost 15 years at the time of writing this * comment) restoring the ability to fail requires a significant audit of * all codepaths. * * The routine can try to free a vnode or stall for up to 1 second waiting for * vnlru to clear things up, but ultimately always performs a M_WAITOK allocation. */ static u_long vn_alloc_cyclecount; static struct vnode * __noinline vn_alloc_hard(struct mount *mp) { u_long rnumvnodes, rfreevnodes; mtx_lock(&vnode_list_mtx); rnumvnodes = atomic_load_long(&numvnodes); if (rnumvnodes + 1 < desiredvnodes) { vn_alloc_cyclecount = 0; goto alloc; } rfreevnodes = vnlru_read_freevnodes(); if (vn_alloc_cyclecount++ >= rfreevnodes) { vn_alloc_cyclecount = 0; vstir = 1; } /* * Grow the vnode cache if it will not be above its target max * after growing. Otherwise, if the free list is nonempty, try * to reclaim 1 item from it before growing the cache (possibly * above its target max if the reclamation failed or is delayed). * Otherwise, wait for some space. In all cases, schedule * vnlru_proc() if we are getting short of space. The watermarks * should be chosen so that we never wait or even reclaim from * the free list to below its target minimum. */ if (vnlru_free_locked(1, NULL) > 0) goto alloc; if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { /* * Wait for space for a new vnode. */ vnlru_kick(); msleep(&vnlruproc_sig, &vnode_list_mtx, PVFS, "vlruwk", hz); if (atomic_load_long(&numvnodes) + 1 > desiredvnodes && vnlru_read_freevnodes() > 1) vnlru_free_locked(1, NULL); } alloc: rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; if (vnlru_under(rnumvnodes, vlowat)) vnlru_kick(); mtx_unlock(&vnode_list_mtx); return (uma_zalloc_smr(vnode_zone, M_WAITOK)); } static struct vnode * vn_alloc(struct mount *mp) { u_long rnumvnodes; if (__predict_false(vn_alloc_cyclecount != 0)) return (vn_alloc_hard(mp)); rnumvnodes = atomic_fetchadd_long(&numvnodes, 1) + 1; if (__predict_false(vnlru_under_unlocked(rnumvnodes, vlowat))) { atomic_subtract_long(&numvnodes, 1); return (vn_alloc_hard(mp)); } return (uma_zalloc_smr(vnode_zone, M_WAITOK)); } static void vn_free(struct vnode *vp) { atomic_subtract_long(&numvnodes, 1); uma_zfree_smr(vnode_zone, vp); } /* * Return the next vnode from the free list. */ int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp) { struct vnode *vp; struct thread *td; struct lock_object *lo; CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); KASSERT(vops->registered, ("%s: not registered vector op %p\n", __func__, vops)); td = curthread; if (td->td_vp_reserved != NULL) { vp = td->td_vp_reserved; td->td_vp_reserved = NULL; } else { vp = vn_alloc(mp); } counter_u64_add(vnodes_created, 1); /* * Locks are given the generic name "vnode" when created. * Follow the historic practice of using the filesystem * name when they allocated, e.g., "zfs", "ufs", "nfs, etc. * * Locks live in a witness group keyed on their name. Thus, * when a lock is renamed, it must also move from the witness * group of its old name to the witness group of its new name. * * The change only needs to be made when the vnode moves * from one filesystem type to another. We ensure that each * filesystem use a single static name pointer for its tag so * that we can compare pointers rather than doing a strcmp(). */ lo = &vp->v_vnlock->lock_object; #ifdef WITNESS if (lo->lo_name != tag) { #endif lo->lo_name = tag; #ifdef WITNESS WITNESS_DESTROY(lo); WITNESS_INIT(lo, tag); } #endif /* * By default, don't allow shared locks unless filesystems opt-in. */ vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE; /* * Finalize various vnode identity bits. */ KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); vp->v_type = VNON; vp->v_op = vops; v_init_counters(vp); vp->v_bufobj.bo_ops = &buf_ops_bio; #ifdef DIAGNOSTIC if (mp == NULL && vops != &dead_vnodeops) printf("NULL mp in getnewvnode(9), tag %s\n", tag); #endif #ifdef MAC mac_vnode_init(vp); if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) mac_vnode_associate_singlelabel(mp, vp); #endif if (mp != NULL) { vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) vp->v_vflag |= VV_NOKNOTE; } /* * For the filesystems which do not use vfs_hash_insert(), * still initialize v_hash to have vfs_hash_index() useful. * E.g., nullfs uses vfs_hash_index() on the lower vnode for * its own hashing. */ vp->v_hash = (uintptr_t)vp >> vnsz2log; *vpp = vp; return (0); } void getnewvnode_reserve(void) { struct thread *td; td = curthread; MPASS(td->td_vp_reserved == NULL); td->td_vp_reserved = vn_alloc(NULL); } void getnewvnode_drop_reserve(void) { struct thread *td; td = curthread; if (td->td_vp_reserved != NULL) { vn_free(td->td_vp_reserved); td->td_vp_reserved = NULL; } } static void freevnode(struct vnode *vp) { struct bufobj *bo; /* * The vnode has been marked for destruction, so free it. * * The vnode will be returned to the zone where it will * normally remain until it is needed for another vnode. We * need to cleanup (or verify that the cleanup has already * been done) any residual data left from its current use * so as not to contaminate the freshly allocated vnode. */ CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); /* * Paired with vgone. */ vn_seqc_write_end_locked(vp); VNPASS(vp->v_seqc_users == 0, vp); bo = &vp->v_bufobj; VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't")); VNPASS(vp->v_holdcnt == VHOLD_NO_SMR, vp); VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count")); VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count")); VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's")); VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp, ("clean blk trie not empty")); VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0")); VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp, ("dirty blk trie not empty")); VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for ..")); VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp, ("Dangling rangelock waiters")); VI_UNLOCK(vp); #ifdef MAC mac_vnode_destroy(vp); #endif if (vp->v_pollinfo != NULL) { destroy_vpollinfo(vp->v_pollinfo); vp->v_pollinfo = NULL; } #ifdef INVARIANTS /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */ vp->v_op = NULL; #endif vp->v_mountedhere = NULL; vp->v_unpcb = NULL; vp->v_rdev = NULL; vp->v_fifoinfo = NULL; vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; vp->v_irflag = 0; vp->v_iflag = 0; vp->v_vflag = 0; bo->bo_flag = 0; vn_free(vp); } /* * Delete from old mount point vnode list, if on one. */ static void delmntque(struct vnode *vp) { struct mount *mp; VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); mp = vp->v_mount; if (mp == NULL) return; MNT_ILOCK(mp); VI_LOCK(vp); vp->v_mount = NULL; VI_UNLOCK(vp); VNASSERT(mp->mnt_nvnodelistsize > 0, vp, ("bad mount point vnode list size")); TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); mp->mnt_nvnodelistsize--; MNT_REL(mp); MNT_IUNLOCK(mp); } static void insmntque_stddtr(struct vnode *vp, void *dtr_arg) { vp->v_data = NULL; vp->v_op = &dead_vnodeops; vgone(vp); vput(vp); } /* * Insert into list of vnodes for the new mount point, if available. */ int insmntque1(struct vnode *vp, struct mount *mp, void (*dtr)(struct vnode *, void *), void *dtr_arg) { KASSERT(vp->v_mount == NULL, ("insmntque: vnode already on per mount vnode list")); VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)")); ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp"); /* * We acquire the vnode interlock early to ensure that the * vnode cannot be recycled by another process releasing a * holdcnt on it before we get it on both the vnode list * and the active vnode list. The mount mutex protects only * manipulation of the vnode list and the vnode freelist * mutex protects only manipulation of the active vnode list. * Hence the need to hold the vnode interlock throughout. */ MNT_ILOCK(mp); VI_LOCK(vp); if (((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 && ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || mp->mnt_nvnodelistsize == 0)) && (vp->v_vflag & VV_FORCEINSMQ) == 0) { VI_UNLOCK(vp); MNT_IUNLOCK(mp); if (dtr != NULL) dtr(vp, dtr_arg); return (EBUSY); } vp->v_mount = mp; MNT_REF(mp); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); VNASSERT(mp->mnt_nvnodelistsize >= 0, vp, ("neg mount point vnode list size")); mp->mnt_nvnodelistsize++; VI_UNLOCK(vp); MNT_IUNLOCK(mp); return (0); } int insmntque(struct vnode *vp, struct mount *mp) { return (insmntque1(vp, mp, insmntque_stddtr, NULL)); } /* * Flush out and invalidate all buffers associated with a bufobj * Called with the underlying object locked. */ int bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) { int error; BO_LOCK(bo); if (flags & V_SAVE) { error = bufobj_wwait(bo, slpflag, slptimeo); if (error) { BO_UNLOCK(bo); return (error); } if (bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); if ((error = BO_SYNC(bo, MNT_WAIT)) != 0) return (error); /* * XXX We could save a lock/unlock if this was only * enabled under INVARIANTS */ BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) panic("vinvalbuf: dirty bufs"); } } /* * If you alter this loop please notice that interlock is dropped and * reacquired in flushbuflist. Special care is needed to ensure that * no race conditions occur from this. */ do { error = flushbuflist(&bo->bo_clean, flags, bo, slpflag, slptimeo); if (error == 0 && !(flags & V_CLEANONLY)) error = flushbuflist(&bo->bo_dirty, flags, bo, slpflag, slptimeo); if (error != 0 && error != EAGAIN) { BO_UNLOCK(bo); return (error); } } while (error != 0); /* * Wait for I/O to complete. XXX needs cleaning up. The vnode can * have write I/O in-progress but if there is a VM object then the * VM object can also have read-I/O in-progress. */ do { bufobj_wwait(bo, 0, 0); if ((flags & V_VMIO) == 0 && bo->bo_object != NULL) { BO_UNLOCK(bo); vm_object_pip_wait_unlocked(bo->bo_object, "bovlbx"); BO_LOCK(bo); } } while (bo->bo_numoutput > 0); BO_UNLOCK(bo); /* * Destroy the copy in the VM cache, too. */ if (bo->bo_object != NULL && (flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0) { VM_OBJECT_WLOCK(bo->bo_object); vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ? OBJPR_CLEANONLY : 0); VM_OBJECT_WUNLOCK(bo->bo_object); } #ifdef INVARIANTS BO_LOCK(bo); if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO | V_ALLOWCLEAN)) == 0 && (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0)) panic("vinvalbuf: flush failed"); if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY | V_VMIO)) == 0 && bo->bo_dirty.bv_cnt > 0) panic("vinvalbuf: flush dirty failed"); BO_UNLOCK(bo); #endif return (0); } /* * Flush out and invalidate all buffers associated with a vnode. * Called with the underlying object locked. */ int vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo) { CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags); ASSERT_VOP_LOCKED(vp, "vinvalbuf"); if (vp->v_object != NULL && vp->v_object->handle != vp) return (0); return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo)); } /* * Flush out buffers on the specified list. * */ static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag, int slptimeo) { struct buf *bp, *nbp; int retval, error; daddr_t lblkno; b_xflags_t xflags; ASSERT_BO_WLOCKED(bo); retval = 0; TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) { /* * If we are flushing both V_NORMAL and V_ALT buffers then * do not skip any buffers. If we are flushing only V_NORMAL * buffers then skip buffers marked as BX_ALTDATA. If we are * flushing only V_ALT buffers then skip buffers not marked * as BX_ALTDATA. */ if (((flags & (V_NORMAL | V_ALT)) != (V_NORMAL | V_ALT)) && (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA) != 0) || ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0))) { continue; } if (nbp != NULL) { lblkno = nbp->b_lblkno; xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN); } retval = EAGAIN; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), "flushbuf", slpflag, slptimeo); if (error) { BO_LOCK(bo); return (error != ENOLCK ? error : EAGAIN); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); /* * XXX Since there are no node locks for NFS, I * believe there is a slight chance that a delayed * write will occur while sleeping just above, so * check for it. */ if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && (flags & V_SAVE)) { bremfree(bp); bp->b_flags |= B_ASYNC; bwrite(bp); BO_LOCK(bo); return (EAGAIN); /* XXX: why not loop ? */ } bremfree(bp); bp->b_flags |= (B_INVAL | B_RELBUF); bp->b_flags &= ~B_ASYNC; brelse(bp); BO_LOCK(bo); if (nbp == NULL) break; nbp = gbincore(bo, lblkno); if (nbp == NULL || (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags) break; /* nbp invalid */ } return (retval); } int bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn) { struct buf *bp; int error; daddr_t lblkno; ASSERT_BO_LOCKED(bo); for (lblkno = startn;;) { again: bp = BUF_PCTRIE_LOOKUP_GE(&bufv->bv_root, lblkno); if (bp == NULL || bp->b_lblkno >= endn || bp->b_lblkno < startn) break; error = BUF_TIMELOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo), "brlsfl", 0, 0); if (error != 0) { BO_RLOCK(bo); if (error == ENOLCK) goto again; return (error); } KASSERT(bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); lblkno = bp->b_lblkno + 1; if ((bp->b_flags & B_MANAGED) == 0) bremfree(bp); bp->b_flags |= B_RELBUF; /* * In the VMIO case, use the B_NOREUSE flag to hint that the * pages backing each buffer in the range are unlikely to be * reused. Dirty buffers will have the hint applied once * they've been written. */ if ((bp->b_flags & B_VMIO) != 0) bp->b_flags |= B_NOREUSE; brelse(bp); BO_RLOCK(bo); } return (0); } /* * Truncate a file's buffer and pages to a specified length. This * is in lieu of the old vinvalbuf mechanism, which performed unneeded * sync activity. */ int vtruncbuf(struct vnode *vp, off_t length, int blksize) { struct buf *bp, *nbp; struct bufobj *bo; daddr_t startlbn; CTR4(KTR_VFS, "%s: vp %p with block %d:%ju", __func__, vp, blksize, (uintmax_t)length); /* * Round up to the *next* lbn. */ startlbn = howmany(length, blksize); ASSERT_VOP_LOCKED(vp, "vtruncbuf"); bo = &vp->v_bufobj; restart_unlocked: BO_LOCK(bo); while (v_inval_buf_range_locked(vp, bo, startlbn, INT64_MAX) == EAGAIN) ; if (length > 0) { restartsync: TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno > 0) continue; /* * Since we hold the vnode lock this should only * fail if we're racing with the buf daemon. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) goto restart_unlocked; VNASSERT((bp->b_flags & B_DELWRI), vp, ("buf(%p) on dirty queue without DELWRI", bp)); bremfree(bp); bawrite(bp); BO_LOCK(bo); goto restartsync; } } bufobj_wwait(bo, 0, 0); BO_UNLOCK(bo); vnode_pager_setsize(vp, length); return (0); } /* * Invalidate the cached pages of a file's buffer within the range of block * numbers [startlbn, endlbn). */ void v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, int blksize) { struct bufobj *bo; off_t start, end; ASSERT_VOP_LOCKED(vp, "v_inval_buf_range"); start = blksize * startlbn; end = blksize * endlbn; bo = &vp->v_bufobj; BO_LOCK(bo); MPASS(blksize == bo->bo_bsize); while (v_inval_buf_range_locked(vp, bo, startlbn, endlbn) == EAGAIN) ; BO_UNLOCK(bo); vn_pages_remove(vp, OFF_TO_IDX(start), OFF_TO_IDX(end + PAGE_SIZE - 1)); } static int v_inval_buf_range_locked(struct vnode *vp, struct bufobj *bo, daddr_t startlbn, daddr_t endlbn) { struct buf *bp, *nbp; bool anyfreed; ASSERT_VOP_LOCKED(vp, "v_inval_buf_range_locked"); ASSERT_BO_LOCKED(bo); do { anyfreed = false; TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { BO_LOCK(bo); return (EAGAIN); } bremfree(bp); bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = true; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNCLEAN) == 0) || nbp->b_vp != vp || (nbp->b_flags & B_DELWRI) != 0)) return (EAGAIN); } TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { if (bp->b_lblkno < startlbn || bp->b_lblkno >= endlbn) continue; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { BO_LOCK(bo); return (EAGAIN); } bremfree(bp); bp->b_flags |= B_INVAL | B_RELBUF; bp->b_flags &= ~B_ASYNC; brelse(bp); anyfreed = true; BO_LOCK(bo); if (nbp != NULL && (((nbp->b_xflags & BX_VNDIRTY) == 0) || (nbp->b_vp != vp) || (nbp->b_flags & B_DELWRI) == 0)) return (EAGAIN); } } while (anyfreed); return (0); } static void buf_vlist_remove(struct buf *bp) { struct bufv *bv; b_xflags_t flags; flags = bp->b_xflags; KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp)); ASSERT_BO_WLOCKED(bp->b_bufobj); KASSERT((flags & (BX_VNDIRTY | BX_VNCLEAN)) != 0 && (flags & (BX_VNDIRTY | BX_VNCLEAN)) != (BX_VNDIRTY | BX_VNCLEAN), ("%s: buffer %p has invalid queue state", __func__, bp)); if ((flags & BX_VNDIRTY) != 0) bv = &bp->b_bufobj->bo_dirty; else bv = &bp->b_bufobj->bo_clean; BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno); TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs); bv->bv_cnt--; bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN); } /* * Add the buffer to the sorted clean or dirty block list. * * NOTE: xflags is passed as a constant, optimizing this inline function! */ static void buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags) { struct bufv *bv; struct buf *n; int error; ASSERT_BO_WLOCKED(bo); KASSERT((bo->bo_flag & BO_NOBUFS) == 0, ("buf_vlist_add: bo %p does not allow bufs", bo)); KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0, ("dead bo %p", bo)); KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags)); bp->b_xflags |= xflags; if (xflags & BX_VNDIRTY) bv = &bo->bo_dirty; else bv = &bo->bo_clean; /* * Keep the list ordered. Optimize empty list insertion. Assume * we tend to grow at the tail so lookup_le should usually be cheaper * than _ge. */ if (bv->bv_cnt == 0 || bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno) TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs); else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL) TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs); else TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs); error = BUF_PCTRIE_INSERT(&bv->bv_root, bp); if (error) panic("buf_vlist_add: Preallocated nodes insufficient."); bv->bv_cnt++; } /* * Look up a buffer using the buffer tries. */ struct buf * gbincore(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_LOCKED(bo); bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno); if (bp != NULL) return (bp); return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno)); } /* * Look up a buf using the buffer tries, without the bufobj lock. This relies * on SMR for safe lookup, and bufs being in a no-free zone to provide type * stability of the result. Like other lockless lookups, the found buf may * already be invalid by the time this function returns. */ struct buf * gbincore_unlocked(struct bufobj *bo, daddr_t lblkno) { struct buf *bp; ASSERT_BO_UNLOCKED(bo); bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno); if (bp != NULL) return (bp); return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno)); } /* * Associate a buffer with a vnode. */ void bgetvp(struct vnode *vp, struct buf *bp) { struct bufobj *bo; bo = &vp->v_bufobj; ASSERT_BO_WLOCKED(bo); VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free")); CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags); VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp, ("bgetvp: bp already attached! %p", bp)); vhold(vp); bp->b_vp = vp; bp->b_bufobj = bo; /* * Insert onto list for new vnode. */ buf_vlist_add(bp, bo, BX_VNCLEAN); } /* * Disassociate a buffer from a vnode. */ void brelvp(struct buf *bp) { struct bufobj *bo; struct vnode *vp; CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); /* * Delete from old vnode list, if on one. */ vp = bp->b_vp; /* XXX */ bo = bp->b_bufobj; BO_LOCK(bo); buf_vlist_remove(bp); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { bo->bo_flag &= ~BO_ONWORKLST; mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); } bp->b_vp = NULL; bp->b_bufobj = NULL; BO_UNLOCK(bo); vdrop(vp); } /* * Add an item to the syncer work queue. */ static void vn_syncer_add_to_worklist(struct bufobj *bo, int delay) { int slot; ASSERT_BO_WLOCKED(bo); mtx_lock(&sync_mtx); if (bo->bo_flag & BO_ONWORKLST) LIST_REMOVE(bo, bo_synclist); else { bo->bo_flag |= BO_ONWORKLST; syncer_worklist_len++; } if (delay > syncer_maxdelay - 2) delay = syncer_maxdelay - 2; slot = (syncer_delayno + delay) & syncer_mask; LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist); mtx_unlock(&sync_mtx); } static int sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS) { int error, len; mtx_lock(&sync_mtx); len = syncer_worklist_len - sync_vnode_count; mtx_unlock(&sync_mtx); error = SYSCTL_OUT(req, &len, sizeof(len)); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_MPSAFE| CTLFLAG_RD, NULL, 0, sysctl_vfs_worklist_len, "I", "Syncer thread worklist length"); static struct proc *updateproc; static void sched_sync(void); static struct kproc_desc up_kp = { "syncer", sched_sync, &updateproc }; SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp); static int sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td) { struct vnode *vp; struct mount *mp; *bo = LIST_FIRST(slp); if (*bo == NULL) return (0); vp = bo2vnode(*bo); if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0) return (1); /* * We use vhold in case the vnode does not * successfully sync. vhold prevents the vnode from * going away when we unlock the sync_mtx so that * we can acquire the vnode interlock. */ vholdl(vp); mtx_unlock(&sync_mtx); VI_UNLOCK(vp); if (vn_start_write(vp, &mp, V_NOWAIT) != 0) { vdrop(vp); mtx_lock(&sync_mtx); return (*bo == LIST_FIRST(slp)); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); (void) VOP_FSYNC(vp, MNT_LAZY, td); VOP_UNLOCK(vp); vn_finished_write(mp); BO_LOCK(*bo); if (((*bo)->bo_flag & BO_ONWORKLST) != 0) { /* * Put us back on the worklist. The worklist * routine will remove us from our current * position and then add us back in at a later * position. */ vn_syncer_add_to_worklist(*bo, syncdelay); } BO_UNLOCK(*bo); vdrop(vp); mtx_lock(&sync_mtx); return (0); } static int first_printf = 1; /* * System filesystem synchronizer daemon. */ static void sched_sync(void) { struct synclist *next, *slp; struct bufobj *bo; long starttime; struct thread *td = curthread; int last_work_seen; int net_worklist_len; int syncer_final_iter; int error; last_work_seen = 0; syncer_final_iter = 0; syncer_state = SYNCER_RUNNING; starttime = time_uptime; td->td_pflags |= TDP_NORUNNINGBUF; EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc, SHUTDOWN_PRI_LAST); mtx_lock(&sync_mtx); for (;;) { if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter == 0) { mtx_unlock(&sync_mtx); kproc_suspend_check(td->td_proc); mtx_lock(&sync_mtx); } net_worklist_len = syncer_worklist_len - sync_vnode_count; if (syncer_state != SYNCER_RUNNING && starttime != time_uptime) { if (first_printf) { printf("\nSyncing disks, vnodes remaining... "); first_printf = 0; } printf("%d ", net_worklist_len); } starttime = time_uptime; /* * Push files whose dirty time has expired. Be careful * of interrupt race on slp queue. * * Skip over empty worklist slots when shutting down. */ do { slp = &syncer_workitem_pending[syncer_delayno]; syncer_delayno += 1; if (syncer_delayno == syncer_maxdelay) syncer_delayno = 0; next = &syncer_workitem_pending[syncer_delayno]; /* * If the worklist has wrapped since the * it was emptied of all but syncer vnodes, * switch to the FINAL_DELAY state and run * for one more second. */ if (syncer_state == SYNCER_SHUTTING_DOWN && net_worklist_len == 0 && last_work_seen == syncer_delayno) { syncer_state = SYNCER_FINAL_DELAY; syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP; } } while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) && syncer_worklist_len > 0); /* * Keep track of the last time there was anything * on the worklist other than syncer vnodes. * Return to the SHUTTING_DOWN state if any * new work appears. */ if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING) last_work_seen = syncer_delayno; if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY) syncer_state = SYNCER_SHUTTING_DOWN; while (!LIST_EMPTY(slp)) { error = sync_vnode(slp, &bo, td); if (error == 1) { LIST_REMOVE(bo, bo_synclist); LIST_INSERT_HEAD(next, bo, bo_synclist); continue; } if (first_printf == 0) { /* * Drop the sync mutex, because some watchdog * drivers need to sleep while patting */ mtx_unlock(&sync_mtx); wdog_kern_pat(WD_LASTVAL); mtx_lock(&sync_mtx); } } if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0) syncer_final_iter--; /* * The variable rushjob allows the kernel to speed up the * processing of the filesystem syncer process. A rushjob * value of N tells the filesystem syncer to process the next * N seconds worth of work on its queue ASAP. Currently rushjob * is used by the soft update code to speed up the filesystem * syncer process when the incore state is getting so far * ahead of the disk that the kernel memory pool is being * threatened with exhaustion. */ if (rushjob > 0) { rushjob -= 1; continue; } /* * Just sleep for a short period of time between * iterations when shutting down to allow some I/O * to happen. * * If it has taken us less than a second to process the * current work, then wait. Otherwise start right over * again. We can still lose time if any single round * takes more than two seconds, but it does not really * matter as we are just trying to generally pace the * filesystem activity. */ if (syncer_state != SYNCER_RUNNING || time_uptime == starttime) { thread_lock(td); sched_prio(td, PPAUSE); thread_unlock(td); } if (syncer_state != SYNCER_RUNNING) cv_timedwait(&sync_wakeup, &sync_mtx, hz / SYNCER_SHUTDOWN_SPEEDUP); else if (time_uptime == starttime) cv_timedwait(&sync_wakeup, &sync_mtx, hz); } } /* * Request the syncer daemon to speed up its work. * We never push it to speed up more than half of its * normal turn time, otherwise it could take over the cpu. */ int speedup_syncer(void) { int ret = 0; mtx_lock(&sync_mtx); if (rushjob < syncdelay / 2) { rushjob += 1; stat_rush_requests += 1; ret = 1; } mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); return (ret); } /* * Tell the syncer to speed up its work and run though its work * list several times, then tell it to shut down. */ static void syncer_shutdown(void *arg, int howto) { if (howto & RB_NOSYNC) return; mtx_lock(&sync_mtx); syncer_state = SYNCER_SHUTTING_DOWN; rushjob = 0; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_shutdown(arg, howto); } void syncer_suspend(void) { syncer_shutdown(updateproc, 0); } void syncer_resume(void) { mtx_lock(&sync_mtx); first_printf = 1; syncer_state = SYNCER_RUNNING; mtx_unlock(&sync_mtx); cv_broadcast(&sync_wakeup); kproc_resume(updateproc); } /* * Move the buffer between the clean and dirty lists of its vnode. */ void reassignbuf(struct buf *bp) { struct vnode *vp; struct bufobj *bo; int delay; #ifdef INVARIANTS struct bufv *bv; #endif vp = bp->b_vp; bo = bp->b_bufobj; KASSERT((bp->b_flags & B_PAGING) == 0, ("%s: cannot reassign paging buffer %p", __func__, bp)); CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); BO_LOCK(bo); buf_vlist_remove(bp); /* * If dirty, put on list of dirty buffers; otherwise insert onto list * of clean buffers. */ if (bp->b_flags & B_DELWRI) { if ((bo->bo_flag & BO_ONWORKLST) == 0) { switch (vp->v_type) { case VDIR: delay = dirdelay; break; case VCHR: delay = metadelay; break; default: delay = filedelay; } vn_syncer_add_to_worklist(bo, delay); } buf_vlist_add(bp, bo, BX_VNDIRTY); } else { buf_vlist_add(bp, bo, BX_VNCLEAN); if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) { mtx_lock(&sync_mtx); LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; mtx_unlock(&sync_mtx); bo->bo_flag &= ~BO_ONWORKLST; } } #ifdef INVARIANTS bv = &bo->bo_clean; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bv = &bo->bo_dirty; bp = TAILQ_FIRST(&bv->bv_hd); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); bp = TAILQ_LAST(&bv->bv_hd, buflists); KASSERT(bp == NULL || bp->b_bufobj == bo, ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo)); #endif BO_UNLOCK(bo); } static void v_init_counters(struct vnode *vp) { VNASSERT(vp->v_type == VNON && vp->v_data == NULL && vp->v_iflag == 0, vp, ("%s called for an initialized vnode", __FUNCTION__)); ASSERT_VI_UNLOCKED(vp, __FUNCTION__); refcount_init(&vp->v_holdcnt, 1); refcount_init(&vp->v_usecount, 1); } /* * Grab a particular vnode from the free list, increment its * reference count and lock it. VIRF_DOOMED is set if the vnode * is being destroyed. Only callers who specify LK_RETRY will * see doomed vnodes. If inactive processing was delayed in * vput try to do it here. * * usecount is manipulated using atomics without holding any locks. * * holdcnt can be manipulated using atomics without holding any locks, * except when transitioning 1<->0, in which case the interlock is held. * * Consumers which don't guarantee liveness of the vnode can use SMR to * try to get a reference. Note this operation can fail since the vnode * may be awaiting getting freed by the time they get to it. */ enum vgetstate vget_prep_smr(struct vnode *vp) { enum vgetstate vs; VFS_SMR_ASSERT_ENTERED(); if (refcount_acquire_if_not_zero(&vp->v_usecount)) { vs = VGET_USECOUNT; } else { if (vhold_smr(vp)) vs = VGET_HOLDCNT; else vs = VGET_NONE; } return (vs); } enum vgetstate vget_prep(struct vnode *vp) { enum vgetstate vs; if (refcount_acquire_if_not_zero(&vp->v_usecount)) { vs = VGET_USECOUNT; } else { vhold(vp); vs = VGET_HOLDCNT; } return (vs); } void vget_abort(struct vnode *vp, enum vgetstate vs) { switch (vs) { case VGET_USECOUNT: vrele(vp); break; case VGET_HOLDCNT: vdrop(vp); break; default: __assert_unreachable(); } } int -vget(struct vnode *vp, int flags, struct thread *td) +vget(struct vnode *vp, int flags) { enum vgetstate vs; - MPASS(td == curthread); - vs = vget_prep(vp); return (vget_finish(vp, flags, vs)); } int vget_finish(struct vnode *vp, int flags, enum vgetstate vs) { int error; if ((flags & LK_INTERLOCK) != 0) ASSERT_VI_LOCKED(vp, __func__); else ASSERT_VI_UNLOCKED(vp, __func__); VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); error = vn_lock(vp, flags); if (__predict_false(error != 0)) { vget_abort(vp, vs); CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__, vp); return (error); } vget_finish_ref(vp, vs); return (0); } void vget_finish_ref(struct vnode *vp, enum vgetstate vs) { int old; VNPASS(vs == VGET_HOLDCNT || vs == VGET_USECOUNT, vp); VNPASS(vp->v_holdcnt > 0, vp); VNPASS(vs == VGET_HOLDCNT || vp->v_usecount > 0, vp); if (vs == VGET_USECOUNT) return; /* * We hold the vnode. If the usecount is 0 it will be utilized to keep * the vnode around. Otherwise someone else lended their hold count and * we have to drop ours. */ old = atomic_fetchadd_int(&vp->v_usecount, 1); VNASSERT(old >= 0, vp, ("%s: wrong use count %d", __func__, old)); if (old != 0) { #ifdef INVARIANTS old = atomic_fetchadd_int(&vp->v_holdcnt, -1); VNASSERT(old > 1, vp, ("%s: wrong hold count %d", __func__, old)); #else refcount_release(&vp->v_holdcnt); #endif } } void vref(struct vnode *vp) { enum vgetstate vs; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vs = vget_prep(vp); vget_finish_ref(vp, vs); } void vrefl(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vref(vp); } void vrefact(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); #ifdef INVARIANTS int old = atomic_fetchadd_int(&vp->v_usecount, 1); VNASSERT(old > 0, vp, ("%s: wrong use count %d", __func__, old)); #else refcount_acquire(&vp->v_usecount); #endif } void vlazy(struct vnode *vp) { struct mount *mp; VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode not held", __func__)); if ((vp->v_mflag & VMP_LAZYLIST) != 0) return; /* * We may get here for inactive routines after the vnode got doomed. */ if (VN_IS_DOOMED(vp)) return; mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); if ((vp->v_mflag & VMP_LAZYLIST) == 0) { vp->v_mflag |= VMP_LAZYLIST; TAILQ_INSERT_TAIL(&mp->mnt_lazyvnodelist, vp, v_lazylist); mp->mnt_lazyvnodelistsize++; } mtx_unlock(&mp->mnt_listmtx); } /* * This routine is only meant to be called from vgonel prior to dooming * the vnode. */ static void vunlazy_gone(struct vnode *vp) { struct mount *mp; ASSERT_VOP_ELOCKED(vp, __func__); ASSERT_VI_LOCKED(vp, __func__); VNPASS(!VN_IS_DOOMED(vp), vp); if (vp->v_mflag & VMP_LAZYLIST) { mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); vp->v_mflag &= ~VMP_LAZYLIST; TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); mp->mnt_lazyvnodelistsize--; mtx_unlock(&mp->mnt_listmtx); } } static void vdefer_inactive(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNASSERT(vp->v_holdcnt > 0, vp, ("%s: vnode without hold count", __func__)); if (VN_IS_DOOMED(vp)) { vdropl(vp); return; } if (vp->v_iflag & VI_DEFINACT) { VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); vdropl(vp); return; } if (vp->v_usecount > 0) { vp->v_iflag &= ~VI_OWEINACT; vdropl(vp); return; } vlazy(vp); vp->v_iflag |= VI_DEFINACT; VI_UNLOCK(vp); counter_u64_add(deferred_inact, 1); } static void vdefer_inactive_unlocked(struct vnode *vp) { VI_LOCK(vp); if ((vp->v_iflag & VI_OWEINACT) == 0) { vdropl(vp); return; } vdefer_inactive(vp); } enum vput_op { VRELE, VPUT, VUNREF }; /* * Handle ->v_usecount transitioning to 0. * * By releasing the last usecount we take ownership of the hold count which * provides liveness of the vnode, meaning we have to vdrop. * * For all vnodes we may need to perform inactive processing. It requires an * exclusive lock on the vnode, while it is legal to call here with only a * shared lock (or no locks). If locking the vnode in an expected manner fails, * inactive processing gets deferred to the syncer. * * XXX Some filesystems pass in an exclusively locked vnode and strongly depend * on the lock being held all the way until VOP_INACTIVE. This in particular * happens with UFS which adds half-constructed vnodes to the hash, where they * can be found by other code. */ static void vput_final(struct vnode *vp, enum vput_op func) { int error; bool want_unlock; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); VNPASS(vp->v_holdcnt > 0, vp); VI_LOCK(vp); /* * By the time we got here someone else might have transitioned * the count back to > 0. */ if (vp->v_usecount > 0) goto out; /* * If the vnode is doomed vgone already performed inactive processing * (if needed). */ if (VN_IS_DOOMED(vp)) goto out; if (__predict_true(VOP_NEED_INACTIVE(vp) == 0)) goto out; if (vp->v_iflag & VI_DOINGINACT) goto out; /* * Locking operations here will drop the interlock and possibly the * vnode lock, opening a window where the vnode can get doomed all the * while ->v_usecount is 0. Set VI_OWEINACT to let vgone know to * perform inactive. */ vp->v_iflag |= VI_OWEINACT; want_unlock = false; error = 0; switch (func) { case VRELE: switch (VOP_ISLOCKED(vp)) { case LK_EXCLUSIVE: break; case LK_EXCLOTHER: case 0: want_unlock = true; error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); VI_LOCK(vp); break; default: /* * The lock has at least one sharer, but we have no way * to conclude whether this is us. Play it safe and * defer processing. */ error = EAGAIN; break; } break; case VPUT: want_unlock = true; if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK | LK_NOWAIT); VI_LOCK(vp); } break; case VUNREF: if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) { error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK); VI_LOCK(vp); } break; } if (error == 0) { vinactive(vp); if (want_unlock) VOP_UNLOCK(vp); vdropl(vp); } else { vdefer_inactive(vp); } return; out: if (func == VPUT) VOP_UNLOCK(vp); vdropl(vp); } /* * Decrement ->v_usecount for a vnode. * * Releasing the last use count requires additional processing, see vput_final * above for details. * * Comment above each variant denotes lock state on entry and exit. */ /* * in: any * out: same as passed in */ void vrele(struct vnode *vp) { ASSERT_VI_UNLOCKED(vp, __func__); if (!refcount_release(&vp->v_usecount)) return; vput_final(vp, VRELE); } /* * in: locked * out: unlocked */ void vput(struct vnode *vp) { ASSERT_VOP_LOCKED(vp, __func__); ASSERT_VI_UNLOCKED(vp, __func__); if (!refcount_release(&vp->v_usecount)) { VOP_UNLOCK(vp); return; } vput_final(vp, VPUT); } /* * in: locked * out: locked */ void vunref(struct vnode *vp) { ASSERT_VOP_LOCKED(vp, __func__); ASSERT_VI_UNLOCKED(vp, __func__); if (!refcount_release(&vp->v_usecount)) return; vput_final(vp, VUNREF); } void vhold(struct vnode *vp) { struct vdbatch *vd; int old; CTR2(KTR_VFS, "%s: vp %p", __func__, vp); old = atomic_fetchadd_int(&vp->v_holdcnt, 1); VNASSERT(old >= 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, ("%s: wrong hold count %d", __func__, old)); if (old != 0) return; critical_enter(); vd = DPCPU_PTR(vd); vd->freevnodes--; critical_exit(); } void vholdl(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vhold(vp); } void vholdnz(struct vnode *vp) { CTR2(KTR_VFS, "%s: vp %p", __func__, vp); #ifdef INVARIANTS int old = atomic_fetchadd_int(&vp->v_holdcnt, 1); VNASSERT(old > 0 && (old & VHOLD_ALL_FLAGS) == 0, vp, ("%s: wrong hold count %d", __func__, old)); #else atomic_add_int(&vp->v_holdcnt, 1); #endif } /* * Grab a hold count unless the vnode is freed. * * Only use this routine if vfs smr is the only protection you have against * freeing the vnode. * * The code loops trying to add a hold count as long as the VHOLD_NO_SMR flag * is not set. After the flag is set the vnode becomes immutable to anyone but * the thread which managed to set the flag. * * It may be tempting to replace the loop with: * count = atomic_fetchadd_int(&vp->v_holdcnt, 1); * if (count & VHOLD_NO_SMR) { * backpedal and error out; * } * * However, while this is more performant, it hinders debugging by eliminating * the previously mentioned invariant. */ bool vhold_smr(struct vnode *vp) { int count; VFS_SMR_ASSERT_ENTERED(); count = atomic_load_int(&vp->v_holdcnt); for (;;) { if (count & VHOLD_NO_SMR) { VNASSERT((count & ~VHOLD_NO_SMR) == 0, vp, ("non-zero hold count with flags %d\n", count)); return (false); } VNASSERT(count >= 0, vp, ("invalid hold count %d\n", count)); if (atomic_fcmpset_int(&vp->v_holdcnt, &count, count + 1)) return (true); } } static void __noinline vdbatch_process(struct vdbatch *vd) { struct vnode *vp; int i; mtx_assert(&vd->lock, MA_OWNED); MPASS(curthread->td_pinned > 0); MPASS(vd->index == VDBATCH_SIZE); mtx_lock(&vnode_list_mtx); critical_enter(); freevnodes += vd->freevnodes; for (i = 0; i < VDBATCH_SIZE; i++) { vp = vd->tab[i]; TAILQ_REMOVE(&vnode_list, vp, v_vnodelist); TAILQ_INSERT_TAIL(&vnode_list, vp, v_vnodelist); MPASS(vp->v_dbatchcpu != NOCPU); vp->v_dbatchcpu = NOCPU; } mtx_unlock(&vnode_list_mtx); vd->freevnodes = 0; bzero(vd->tab, sizeof(vd->tab)); vd->index = 0; critical_exit(); } static void vdbatch_enqueue(struct vnode *vp) { struct vdbatch *vd; ASSERT_VI_LOCKED(vp, __func__); VNASSERT(!VN_IS_DOOMED(vp), vp, ("%s: deferring requeue of a doomed vnode", __func__)); critical_enter(); vd = DPCPU_PTR(vd); vd->freevnodes++; if (vp->v_dbatchcpu != NOCPU) { VI_UNLOCK(vp); critical_exit(); return; } sched_pin(); critical_exit(); mtx_lock(&vd->lock); MPASS(vd->index < VDBATCH_SIZE); MPASS(vd->tab[vd->index] == NULL); /* * A hack: we depend on being pinned so that we know what to put in * ->v_dbatchcpu. */ vp->v_dbatchcpu = curcpu; vd->tab[vd->index] = vp; vd->index++; VI_UNLOCK(vp); if (vd->index == VDBATCH_SIZE) vdbatch_process(vd); mtx_unlock(&vd->lock); sched_unpin(); } /* * This routine must only be called for vnodes which are about to be * deallocated. Supporting dequeue for arbitrary vndoes would require * validating that the locked batch matches. */ static void vdbatch_dequeue(struct vnode *vp) { struct vdbatch *vd; int i; short cpu; VNASSERT(vp->v_type == VBAD || vp->v_type == VNON, vp, ("%s: called for a used vnode\n", __func__)); cpu = vp->v_dbatchcpu; if (cpu == NOCPU) return; vd = DPCPU_ID_PTR(cpu, vd); mtx_lock(&vd->lock); for (i = 0; i < vd->index; i++) { if (vd->tab[i] != vp) continue; vp->v_dbatchcpu = NOCPU; vd->index--; vd->tab[i] = vd->tab[vd->index]; vd->tab[vd->index] = NULL; break; } mtx_unlock(&vd->lock); /* * Either we dequeued the vnode above or the target CPU beat us to it. */ MPASS(vp->v_dbatchcpu == NOCPU); } /* * Drop the hold count of the vnode. If this is the last reference to * the vnode we place it on the free list unless it has been vgone'd * (marked VIRF_DOOMED) in which case we will free it. * * Because the vnode vm object keeps a hold reference on the vnode if * there is at least one resident non-cached page, the vnode cannot * leave the active list without the page cleanup done. */ static void vdrop_deactivate(struct vnode *vp) { struct mount *mp; ASSERT_VI_LOCKED(vp, __func__); /* * Mark a vnode as free: remove it from its active list * and put it up for recycling on the freelist. */ VNASSERT(!VN_IS_DOOMED(vp), vp, ("vdrop: returning doomed vnode")); VNASSERT(vp->v_op != NULL, vp, ("vdrop: vnode already reclaimed.")); VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, ("vnode with VI_OWEINACT set")); VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp, ("vnode with VI_DEFINACT set")); if (vp->v_mflag & VMP_LAZYLIST) { mp = vp->v_mount; mtx_lock(&mp->mnt_listmtx); VNASSERT(vp->v_mflag & VMP_LAZYLIST, vp, ("lost VMP_LAZYLIST")); /* * Don't remove the vnode from the lazy list if another thread * has increased the hold count. It may have re-enqueued the * vnode to the lazy list and is now responsible for its * removal. */ if (vp->v_holdcnt == 0) { vp->v_mflag &= ~VMP_LAZYLIST; TAILQ_REMOVE(&mp->mnt_lazyvnodelist, vp, v_lazylist); mp->mnt_lazyvnodelistsize--; } mtx_unlock(&mp->mnt_listmtx); } vdbatch_enqueue(vp); } void vdrop(struct vnode *vp) { ASSERT_VI_UNLOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (refcount_release_if_not_last(&vp->v_holdcnt)) return; VI_LOCK(vp); vdropl(vp); } void vdropl(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if (!refcount_release(&vp->v_holdcnt)) { VI_UNLOCK(vp); return; } if (!VN_IS_DOOMED(vp)) { vdrop_deactivate(vp); /* * Also unlocks the interlock. We can't assert on it as we * released our hold and by now the vnode might have been * freed. */ return; } /* * Set the VHOLD_NO_SMR flag. * * We may be racing against vhold_smr. If they win we can just pretend * we never got this far, they will vdrop later. */ if (!atomic_cmpset_int(&vp->v_holdcnt, 0, VHOLD_NO_SMR)) { VI_UNLOCK(vp); /* * We lost the aforementioned race. Any subsequent access is * invalid as they might have managed to vdropl on their own. */ return; } freevnode(vp); } /* * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT * flags. DOINGINACT prevents us from recursing in calls to vinactive. */ static void vinactivef(struct vnode *vp) { struct vm_object *obj; ASSERT_VOP_ELOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp, ("vinactive: recursed on VI_DOINGINACT")); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); vp->v_iflag |= VI_DOINGINACT; vp->v_iflag &= ~VI_OWEINACT; VI_UNLOCK(vp); /* * Before moving off the active list, we must be sure that any * modified pages are converted into the vnode's dirty * buffers, since these will no longer be checked once the * vnode is on the inactive list. * * The write-out of the dirty pages is asynchronous. At the * point that VOP_INACTIVE() is called, there could still be * pending I/O and dirty pages in the object. */ if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && vm_object_mightbedirty(obj)) { VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, 0); VM_OBJECT_WUNLOCK(obj); } VOP_INACTIVE(vp, curthread); VI_LOCK(vp); VNASSERT(vp->v_iflag & VI_DOINGINACT, vp, ("vinactive: lost VI_DOINGINACT")); vp->v_iflag &= ~VI_DOINGINACT; } void vinactive(struct vnode *vp) { ASSERT_VOP_ELOCKED(vp, "vinactive"); ASSERT_VI_LOCKED(vp, "vinactive"); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); if ((vp->v_iflag & VI_OWEINACT) == 0) return; if (vp->v_iflag & VI_DOINGINACT) return; if (vp->v_usecount > 0) { vp->v_iflag &= ~VI_OWEINACT; return; } vinactivef(vp); } /* * Remove any vnodes in the vnode table belonging to mount point mp. * * If FORCECLOSE is not specified, there should not be any active ones, * return error if any are found (nb: this is a user error, not a * system error). If FORCECLOSE is specified, detach any active vnodes * that are found. * * If WRITECLOSE is set, only flush out regular file vnodes open for * writing. * * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped. * * `rootrefs' specifies the base reference count for the root vnode * of this filesystem. The root vnode is considered busy if its * v_usecount exceeds this value. On a successful return, vflush(, td) * will call vrele() on the root vnode exactly rootrefs times. * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must * be zero. */ #ifdef DIAGNOSTIC static int busyprt = 0; /* print out busy vnodes */ SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes"); #endif int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td) { struct vnode *vp, *mvp, *rootvp = NULL; struct vattr vattr; int busy = 0, error; CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp, rootrefs, flags); if (rootrefs > 0) { KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0, ("vflush: bad args")); /* * Get the filesystem root vnode. We can vput() it * immediately, since with rootrefs > 0, it won't go away. */ if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) { CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d", __func__, error); return (error); } vput(rootvp); } loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { vholdl(vp); error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE); if (error) { vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } /* * Skip over a vnodes marked VV_SYSTEM. */ if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) { VOP_UNLOCK(vp); vdrop(vp); continue; } /* * If WRITECLOSE is set, flush out unlinked but still open * files (even if open only for reading) and regular file * vnodes open for writing. */ if (flags & WRITECLOSE) { if (vp->v_object != NULL) { VM_OBJECT_WLOCK(vp->v_object); vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } error = VOP_FSYNC(vp, MNT_WAIT, td); if (error != 0) { VOP_UNLOCK(vp); vdrop(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } error = VOP_GETATTR(vp, &vattr, td->td_ucred); VI_LOCK(vp); if ((vp->v_type == VNON || (error == 0 && vattr.va_nlink > 0)) && (vp->v_writecount <= 0 || vp->v_type != VREG)) { VOP_UNLOCK(vp); vdropl(vp); continue; } } else VI_LOCK(vp); /* * With v_usecount == 0, all we need to do is clear out the * vnode data structures and we are done. * * If FORCECLOSE is set, forcibly close the vnode. */ if (vp->v_usecount == 0 || (flags & FORCECLOSE)) { vgonel(vp); } else { busy++; #ifdef DIAGNOSTIC if (busyprt) vn_printf(vp, "vflush: busy vnode "); #endif } VOP_UNLOCK(vp); vdropl(vp); } if (rootrefs > 0 && (flags & FORCECLOSE) == 0) { /* * If just the root vnode is busy, and if its refcount * is equal to `rootrefs', then go ahead and kill it. */ VI_LOCK(rootvp); KASSERT(busy > 0, ("vflush: not busy")); VNASSERT(rootvp->v_usecount >= rootrefs, rootvp, ("vflush: usecount %d < rootrefs %d", rootvp->v_usecount, rootrefs)); if (busy == 1 && rootvp->v_usecount == rootrefs) { VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK); vgone(rootvp); VOP_UNLOCK(rootvp); busy = 0; } else VI_UNLOCK(rootvp); } if (busy) { CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__, busy); return (EBUSY); } for (; rootrefs > 0; rootrefs--) vrele(rootvp); return (0); } /* * Recycle an unused vnode to the front of the free list. */ int vrecycle(struct vnode *vp) { int recycled; VI_LOCK(vp); recycled = vrecyclel(vp); VI_UNLOCK(vp); return (recycled); } /* * vrecycle, with the vp interlock held. */ int vrecyclel(struct vnode *vp) { int recycled; ASSERT_VOP_ELOCKED(vp, __func__); ASSERT_VI_LOCKED(vp, __func__); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); recycled = 0; if (vp->v_usecount == 0) { recycled = 1; vgonel(vp); } return (recycled); } /* * Eliminate all activity associated with a vnode * in preparation for reuse. */ void vgone(struct vnode *vp) { VI_LOCK(vp); vgonel(vp); VI_UNLOCK(vp); } static void notify_lowervp_vfs_dummy(struct mount *mp __unused, struct vnode *lowervp __unused) { } /* * Notify upper mounts about reclaimed or unlinked vnode. */ void vfs_notify_upper(struct vnode *vp, int event) { static struct vfsops vgonel_vfsops = { .vfs_reclaim_lowervp = notify_lowervp_vfs_dummy, .vfs_unlink_lowervp = notify_lowervp_vfs_dummy, }; struct mount *mp, *ump, *mmp; mp = vp->v_mount; if (mp == NULL) return; if (TAILQ_EMPTY(&mp->mnt_uppers)) return; mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO); mmp->mnt_op = &vgonel_vfsops; mmp->mnt_kern_flag |= MNTK_MARKER; MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_VGONE_UPPER; for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) { if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) { ump = TAILQ_NEXT(ump, mnt_upper_link); continue; } TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link); MNT_IUNLOCK(mp); switch (event) { case VFS_NOTIFY_UPPER_RECLAIM: VFS_RECLAIM_LOWERVP(ump, vp); break; case VFS_NOTIFY_UPPER_UNLINK: VFS_UNLINK_LOWERVP(ump, vp); break; default: KASSERT(0, ("invalid event %d", event)); break; } MNT_ILOCK(mp); ump = TAILQ_NEXT(mmp, mnt_upper_link); TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link); } free(mmp, M_TEMP); mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER; if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) { mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER; wakeup(&mp->mnt_uppers); } MNT_IUNLOCK(mp); } /* * vgone, with the vp interlock held. */ static void vgonel(struct vnode *vp) { struct thread *td; struct mount *mp; vm_object_t object; bool active, oweinact; ASSERT_VOP_ELOCKED(vp, "vgonel"); ASSERT_VI_LOCKED(vp, "vgonel"); VNASSERT(vp->v_holdcnt, vp, ("vgonel: vp %p has no reference.", vp)); CTR2(KTR_VFS, "%s: vp %p", __func__, vp); td = curthread; /* * Don't vgonel if we're already doomed. */ if (vp->v_irflag & VIRF_DOOMED) return; /* * Paired with freevnode. */ vn_seqc_write_begin_locked(vp); vunlazy_gone(vp); vp->v_irflag |= VIRF_DOOMED; /* * Check to see if the vnode is in use. If so, we have to call * VOP_CLOSE() and VOP_INACTIVE(). */ active = vp->v_usecount > 0; oweinact = (vp->v_iflag & VI_OWEINACT) != 0; /* * If we need to do inactive VI_OWEINACT will be set. */ if (vp->v_iflag & VI_DEFINACT) { VNASSERT(vp->v_holdcnt > 1, vp, ("lost hold count")); vp->v_iflag &= ~VI_DEFINACT; vdropl(vp); } else { VNASSERT(vp->v_holdcnt > 0, vp, ("vnode without hold count")); VI_UNLOCK(vp); } vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM); /* * If purging an active vnode, it must be closed and * deactivated before being reclaimed. */ if (active) VOP_CLOSE(vp, FNONBLOCK, NOCRED, td); if (oweinact || active) { VI_LOCK(vp); vinactivef(vp); VI_UNLOCK(vp); } if (vp->v_type == VSOCK) vfs_unp_reclaim(vp); /* * Clean out any buffers associated with the vnode. * If the flush fails, just toss the buffers. */ mp = NULL; if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd)) (void) vn_start_secondary_write(vp, &mp, V_WAIT); if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) { while (vinvalbuf(vp, 0, 0, 0) != 0) ; } BO_LOCK(&vp->v_bufobj); KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) && vp->v_bufobj.bo_dirty.bv_cnt == 0 && TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) && vp->v_bufobj.bo_clean.bv_cnt == 0, ("vp %p bufobj not invalidated", vp)); /* * For VMIO bufobj, BO_DEAD is set later, or in * vm_object_terminate() after the object's page queue is * flushed. */ object = vp->v_bufobj.bo_object; if (object == NULL) vp->v_bufobj.bo_flag |= BO_DEAD; BO_UNLOCK(&vp->v_bufobj); /* * Handle the VM part. Tmpfs handles v_object on its own (the * OBJT_VNODE check). Nullfs or other bypassing filesystems * should not touch the object borrowed from the lower vnode * (the handle check). */ if (object != NULL && object->type == OBJT_VNODE && object->handle == vp) vnode_destroy_vobject(vp); /* * Reclaim the vnode. */ if (VOP_RECLAIM(vp, td)) panic("vgone: cannot reclaim"); if (mp != NULL) vn_finished_secondary_write(mp); VNASSERT(vp->v_object == NULL, vp, ("vop_reclaim left v_object vp=%p", vp)); /* * Clear the advisory locks and wake up waiting threads. */ (void)VOP_ADVLOCKPURGE(vp); vp->v_lockf = NULL; /* * Delete from old mount point vnode list. */ delmntque(vp); cache_purge_vgone(vp); /* * Done with purge, reset to the standard lock and invalidate * the vnode. */ VI_LOCK(vp); vp->v_vnlock = &vp->v_lock; vp->v_op = &dead_vnodeops; vp->v_type = VBAD; } /* * Print out a description of a vnode. */ static const char * const typename[] = {"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD", "VMARKER"}; _Static_assert((VHOLD_ALL_FLAGS & ~VHOLD_NO_SMR) == 0, "new hold count flag not added to vn_printf"); void vn_printf(struct vnode *vp, const char *fmt, ...) { va_list ap; char buf[256], buf2[16]; u_long flags; u_int holdcnt; va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("%p: ", (void *)vp); printf("type %s\n", typename[vp->v_type]); holdcnt = atomic_load_int(&vp->v_holdcnt); printf(" usecount %d, writecount %d, refcount %d seqc users %d", vp->v_usecount, vp->v_writecount, holdcnt & ~VHOLD_ALL_FLAGS, vp->v_seqc_users); switch (vp->v_type) { case VDIR: printf(" mountedhere %p\n", vp->v_mountedhere); break; case VCHR: printf(" rdev %p\n", vp->v_rdev); break; case VSOCK: printf(" socket %p\n", vp->v_unpcb); break; case VFIFO: printf(" fifoinfo %p\n", vp->v_fifoinfo); break; default: printf("\n"); break; } buf[0] = '\0'; buf[1] = '\0'; if (holdcnt & VHOLD_NO_SMR) strlcat(buf, "|VHOLD_NO_SMR", sizeof(buf)); printf(" hold count flags (%s)\n", buf + 1); buf[0] = '\0'; buf[1] = '\0'; if (vp->v_irflag & VIRF_DOOMED) strlcat(buf, "|VIRF_DOOMED", sizeof(buf)); flags = vp->v_irflag & ~(VIRF_DOOMED); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VIRF(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_vflag & VV_ROOT) strlcat(buf, "|VV_ROOT", sizeof(buf)); if (vp->v_vflag & VV_ISTTY) strlcat(buf, "|VV_ISTTY", sizeof(buf)); if (vp->v_vflag & VV_NOSYNC) strlcat(buf, "|VV_NOSYNC", sizeof(buf)); if (vp->v_vflag & VV_ETERNALDEV) strlcat(buf, "|VV_ETERNALDEV", sizeof(buf)); if (vp->v_vflag & VV_CACHEDLABEL) strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf)); if (vp->v_vflag & VV_VMSIZEVNLOCK) strlcat(buf, "|VV_VMSIZEVNLOCK", sizeof(buf)); if (vp->v_vflag & VV_COPYONWRITE) strlcat(buf, "|VV_COPYONWRITE", sizeof(buf)); if (vp->v_vflag & VV_SYSTEM) strlcat(buf, "|VV_SYSTEM", sizeof(buf)); if (vp->v_vflag & VV_PROCDEP) strlcat(buf, "|VV_PROCDEP", sizeof(buf)); if (vp->v_vflag & VV_NOKNOTE) strlcat(buf, "|VV_NOKNOTE", sizeof(buf)); if (vp->v_vflag & VV_DELETED) strlcat(buf, "|VV_DELETED", sizeof(buf)); if (vp->v_vflag & VV_MD) strlcat(buf, "|VV_MD", sizeof(buf)); if (vp->v_vflag & VV_FORCEINSMQ) strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf)); if (vp->v_vflag & VV_READLINK) strlcat(buf, "|VV_READLINK", sizeof(buf)); flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV | VV_CACHEDLABEL | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP | VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_iflag & VI_TEXT_REF) strlcat(buf, "|VI_TEXT_REF", sizeof(buf)); if (vp->v_iflag & VI_MOUNT) strlcat(buf, "|VI_MOUNT", sizeof(buf)); if (vp->v_iflag & VI_DOINGINACT) strlcat(buf, "|VI_DOINGINACT", sizeof(buf)); if (vp->v_iflag & VI_OWEINACT) strlcat(buf, "|VI_OWEINACT", sizeof(buf)); if (vp->v_iflag & VI_DEFINACT) strlcat(buf, "|VI_DEFINACT", sizeof(buf)); flags = vp->v_iflag & ~(VI_TEXT_REF | VI_MOUNT | VI_DOINGINACT | VI_OWEINACT | VI_DEFINACT); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } if (vp->v_mflag & VMP_LAZYLIST) strlcat(buf, "|VMP_LAZYLIST", sizeof(buf)); flags = vp->v_mflag & ~(VMP_LAZYLIST); if (flags != 0) { snprintf(buf2, sizeof(buf2), "|VMP(0x%lx)", flags); strlcat(buf, buf2, sizeof(buf)); } printf(" flags (%s)\n", buf + 1); if (mtx_owned(VI_MTX(vp))) printf(" VI_LOCKed"); if (vp->v_object != NULL) printf(" v_object %p ref %d pages %d " "cleanbuf %d dirtybuf %d\n", vp->v_object, vp->v_object->ref_count, vp->v_object->resident_page_count, vp->v_bufobj.bo_clean.bv_cnt, vp->v_bufobj.bo_dirty.bv_cnt); printf(" "); lockmgr_printinfo(vp->v_vnlock); if (vp->v_data != NULL) VOP_PRINT(vp); } #ifdef DDB /* * List all of the locked vnodes in the system. * Called when debugging the kernel. */ DB_SHOW_COMMAND(lockedvnods, lockedvnodes) { struct mount *mp; struct vnode *vp; /* * Note: because this is DDB, we can't obey the locking semantics * for these structures, which means we could catch an inconsistent * state and dereference a nasty pointer. Not much to be done * about that. */ db_printf("Locked vnodes\n"); TAILQ_FOREACH(mp, &mountlist, mnt_list) { TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && VOP_ISLOCKED(vp)) vn_printf(vp, "vnode "); } } } /* * Show details about the given vnode. */ DB_SHOW_COMMAND(vnode, db_show_vnode) { struct vnode *vp; if (!have_addr) return; vp = (struct vnode *)addr; vn_printf(vp, "vnode "); } /* * Show details about the given mount point. */ DB_SHOW_COMMAND(mount, db_show_mount) { struct mount *mp; struct vfsopt *opt; struct statfs *sp; struct vnode *vp; char buf[512]; uint64_t mflags; u_int flags; if (!have_addr) { /* No address given, print short info about all mount points. */ TAILQ_FOREACH(mp, &mountlist, mnt_list) { db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); if (db_pager_quit) break; } db_printf("\nMore info: show mount \n"); return; } mp = (struct mount *)addr; db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename); buf[0] = '\0'; mflags = mp->mnt_flag; #define MNT_FLAG(flag) do { \ if (mflags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 4, sizeof(buf)); \ mflags &= ~(flag); \ } \ } while (0) MNT_FLAG(MNT_RDONLY); MNT_FLAG(MNT_SYNCHRONOUS); MNT_FLAG(MNT_NOEXEC); MNT_FLAG(MNT_NOSUID); MNT_FLAG(MNT_NFS4ACLS); MNT_FLAG(MNT_UNION); MNT_FLAG(MNT_ASYNC); MNT_FLAG(MNT_SUIDDIR); MNT_FLAG(MNT_SOFTDEP); MNT_FLAG(MNT_NOSYMFOLLOW); MNT_FLAG(MNT_GJOURNAL); MNT_FLAG(MNT_MULTILABEL); MNT_FLAG(MNT_ACLS); MNT_FLAG(MNT_NOATIME); MNT_FLAG(MNT_NOCLUSTERR); MNT_FLAG(MNT_NOCLUSTERW); MNT_FLAG(MNT_SUJ); MNT_FLAG(MNT_EXRDONLY); MNT_FLAG(MNT_EXPORTED); MNT_FLAG(MNT_DEFEXPORTED); MNT_FLAG(MNT_EXPORTANON); MNT_FLAG(MNT_EXKERB); MNT_FLAG(MNT_EXPUBLIC); MNT_FLAG(MNT_LOCAL); MNT_FLAG(MNT_QUOTA); MNT_FLAG(MNT_ROOTFS); MNT_FLAG(MNT_USER); MNT_FLAG(MNT_IGNORE); MNT_FLAG(MNT_UPDATE); MNT_FLAG(MNT_DELEXPORT); MNT_FLAG(MNT_RELOAD); MNT_FLAG(MNT_FORCE); MNT_FLAG(MNT_SNAPSHOT); MNT_FLAG(MNT_BYFSID); #undef MNT_FLAG if (mflags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%016jx", mflags); } db_printf(" mnt_flag = %s\n", buf); buf[0] = '\0'; flags = mp->mnt_kern_flag; #define MNT_KERN_FLAG(flag) do { \ if (flags & (flag)) { \ if (buf[0] != '\0') \ strlcat(buf, ", ", sizeof(buf)); \ strlcat(buf, (#flag) + 5, sizeof(buf)); \ flags &= ~(flag); \ } \ } while (0) MNT_KERN_FLAG(MNTK_UNMOUNTF); MNT_KERN_FLAG(MNTK_ASYNC); MNT_KERN_FLAG(MNTK_SOFTDEP); MNT_KERN_FLAG(MNTK_DRAINING); MNT_KERN_FLAG(MNTK_REFEXPIRE); MNT_KERN_FLAG(MNTK_EXTENDED_SHARED); MNT_KERN_FLAG(MNTK_SHARED_WRITES); MNT_KERN_FLAG(MNTK_NO_IOPF); MNT_KERN_FLAG(MNTK_VGONE_UPPER); MNT_KERN_FLAG(MNTK_VGONE_WAITER); MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT); MNT_KERN_FLAG(MNTK_MARKER); MNT_KERN_FLAG(MNTK_USES_BCACHE); MNT_KERN_FLAG(MNTK_FPLOOKUP); MNT_KERN_FLAG(MNTK_NOASYNC); MNT_KERN_FLAG(MNTK_UNMOUNT); MNT_KERN_FLAG(MNTK_MWAIT); MNT_KERN_FLAG(MNTK_SUSPEND); MNT_KERN_FLAG(MNTK_SUSPEND2); MNT_KERN_FLAG(MNTK_SUSPENDED); MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); MNT_KERN_FLAG(MNTK_NOKNOTE); #undef MNT_KERN_FLAG if (flags != 0) { if (buf[0] != '\0') strlcat(buf, ", ", sizeof(buf)); snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "0x%08x", flags); } db_printf(" mnt_kern_flag = %s\n", buf); db_printf(" mnt_opt = "); opt = TAILQ_FIRST(mp->mnt_opt); if (opt != NULL) { db_printf("%s", opt->name); opt = TAILQ_NEXT(opt, link); while (opt != NULL) { db_printf(", %s", opt->name); opt = TAILQ_NEXT(opt, link); } } db_printf("\n"); sp = &mp->mnt_stat; db_printf(" mnt_stat = { version=%u type=%u flags=0x%016jx " "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju " "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju " "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n", (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags, (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize, (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree, (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files, (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites, (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads, (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax, (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]); db_printf(" mnt_cred = { uid=%u ruid=%u", (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid); if (jailed(mp->mnt_cred)) db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id); db_printf(" }\n"); db_printf(" mnt_ref = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_REF), mp->mnt_ref); db_printf(" mnt_gen = %d\n", mp->mnt_gen); db_printf(" mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize); db_printf(" mnt_lazyvnodelistsize = %d\n", mp->mnt_lazyvnodelistsize); db_printf(" mnt_writeopcount = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_WRITEOPCOUNT), mp->mnt_writeopcount); db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen); db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max); db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed); db_printf(" mnt_lockref = %d (with %d in the struct)\n", vfs_mount_fetch_counter(mp, MNT_COUNT_LOCKREF), mp->mnt_lockref); db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes); db_printf(" mnt_secondary_accwrites = %d\n", mp->mnt_secondary_accwrites); db_printf(" mnt_gjprovider = %s\n", mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL"); db_printf(" mnt_vfs_ops = %d\n", mp->mnt_vfs_ops); db_printf("\n\nList of active vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && vp->v_holdcnt > 0) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } db_printf("\n\nList of inactive vnodes\n"); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (vp->v_type != VMARKER && vp->v_holdcnt == 0) { vn_printf(vp, "vnode "); if (db_pager_quit) break; } } } #endif /* DDB */ /* * Fill in a struct xvfsconf based on a struct vfsconf. */ static int vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf xvfsp; bzero(&xvfsp, sizeof(xvfsp)); strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; /* * These are unused in userland, we keep them * to not break binary compatibility. */ xvfsp.vfc_vfsops = NULL; xvfsp.vfc_next = NULL; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #ifdef COMPAT_FREEBSD32 struct xvfsconf32 { uint32_t vfc_vfsops; char vfc_name[MFSNAMELEN]; int32_t vfc_typenum; int32_t vfc_refcount; int32_t vfc_flags; uint32_t vfc_next; }; static int vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp) { struct xvfsconf32 xvfsp; bzero(&xvfsp, sizeof(xvfsp)); strcpy(xvfsp.vfc_name, vfsp->vfc_name); xvfsp.vfc_typenum = vfsp->vfc_typenum; xvfsp.vfc_refcount = vfsp->vfc_refcount; xvfsp.vfc_flags = vfsp->vfc_flags; return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp))); } #endif /* * Top level filesystem related information gathering. */ static int sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS) { struct vfsconf *vfsp; int error; error = 0; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) error = vfsconf2x32(req, vfsp); else #endif error = vfsconf2x(req, vfsp); if (error) break; } vfsconf_sunlock(); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist, "S,xvfsconf", "List of all configured filesystems"); #ifndef BURN_BRIDGES static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS); static int vfs_sysctl(SYSCTL_HANDLER_ARGS) { int *name = (int *)arg1 - 1; /* XXX */ u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; log(LOG_WARNING, "userland calling deprecated sysctl, " "please rebuild world\n"); #if 1 || defined(COMPAT_PRELITE2) /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ if (namelen == 1) return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); #endif switch (name[1]) { case VFS_MAXTYPENUM: if (namelen != 2) return (ENOTDIR); return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: if (namelen != 3) return (ENOTDIR); /* overloaded */ vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { if (vfsp->vfc_typenum == name[2]) break; } vfsconf_sunlock(); if (vfsp == NULL) return (EOPNOTSUPP); #ifdef COMPAT_FREEBSD32 if (req->flags & SCTL_MASK32) return (vfsconf2x32(req, vfsp)); else #endif return (vfsconf2x(req, vfsp)); } return (EOPNOTSUPP); } static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_MPSAFE, vfs_sysctl, "Generic filesystem"); #if 1 || defined(COMPAT_PRELITE2) static int sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS) { int error; struct vfsconf *vfsp; struct ovfsconf ovfs; vfsconf_slock(); TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) { bzero(&ovfs, sizeof(ovfs)); ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ strcpy(ovfs.vfc_name, vfsp->vfc_name); ovfs.vfc_index = vfsp->vfc_typenum; ovfs.vfc_refcount = vfsp->vfc_refcount; ovfs.vfc_flags = vfsp->vfc_flags; error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); if (error != 0) { vfsconf_sunlock(); return (error); } } vfsconf_sunlock(); return (0); } #endif /* 1 || COMPAT_PRELITE2 */ #endif /* !BURN_BRIDGES */ #define KINFO_VNODESLOP 10 #ifdef notyet /* * Dump vnode list (via sysctl). */ /* ARGSUSED */ static int sysctl_vnode(SYSCTL_HANDLER_ARGS) { struct xvnode *xvn; struct mount *mp; struct vnode *vp; int error, len, n; /* * Stale numvnodes access is not fatal here. */ req->lock = 0; len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn; if (!req->oldptr) /* Make an estimate */ return (SYSCTL_OUT(req, 0, len)); error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK); n = 0; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) continue; MNT_ILOCK(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { if (n == len) break; vref(vp); xvn[n].xv_size = sizeof *xvn; xvn[n].xv_vnode = vp; xvn[n].xv_id = 0; /* XXX compat */ #define XV_COPY(field) xvn[n].xv_##field = vp->v_##field XV_COPY(usecount); XV_COPY(writecount); XV_COPY(holdcnt); XV_COPY(mount); XV_COPY(numoutput); XV_COPY(type); #undef XV_COPY xvn[n].xv_flag = vp->v_vflag; switch (vp->v_type) { case VREG: case VDIR: case VLNK: break; case VBLK: case VCHR: if (vp->v_rdev == NULL) { vrele(vp); continue; } xvn[n].xv_dev = dev2udev(vp->v_rdev); break; case VSOCK: xvn[n].xv_socket = vp->v_socket; break; case VFIFO: xvn[n].xv_fifo = vp->v_fifoinfo; break; case VNON: case VBAD: default: /* shouldn't happen? */ vrele(vp); continue; } vrele(vp); ++n; } MNT_IUNLOCK(mp); mtx_lock(&mountlist_mtx); vfs_unbusy(mp); if (n == len) break; } mtx_unlock(&mountlist_mtx); error = SYSCTL_OUT(req, xvn, n * sizeof *xvn); free(xvn, M_TEMP); return (error); } SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode", ""); #endif static void unmount_or_warn(struct mount *mp) { int error; error = dounmount(mp, MNT_FORCE, curthread); if (error != 0) { printf("unmount of %s failed (", mp->mnt_stat.f_mntonname); if (error == EBUSY) printf("BUSY)\n"); else printf("%d)\n", error); } } /* * Unmount all filesystems. The list is traversed in reverse order * of mounting to avoid dependencies. */ void vfs_unmountall(void) { struct mount *mp, *tmp; CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__); /* * Since this only runs when rebooting, it is not interlocked. */ TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) { vfs_ref(mp); /* * Forcibly unmounting "/dev" before "/" would prevent clean * unmount of the latter. */ if (mp == rootdevmp) continue; unmount_or_warn(mp); } if (rootdevmp != NULL) unmount_or_warn(rootdevmp); } static void vfs_deferred_inactive(struct vnode *vp, int lkflags) { ASSERT_VI_LOCKED(vp, __func__); VNASSERT((vp->v_iflag & VI_DEFINACT) == 0, vp, ("VI_DEFINACT still set")); if ((vp->v_iflag & VI_OWEINACT) == 0) { vdropl(vp); return; } if (vn_lock(vp, lkflags) == 0) { VI_LOCK(vp); vinactive(vp); VOP_UNLOCK(vp); vdropl(vp); return; } vdefer_inactive_unlocked(vp); } static int vfs_periodic_inactive_filter(struct vnode *vp, void *arg) { return (vp->v_iflag & VI_DEFINACT); } static void __noinline vfs_periodic_inactive(struct mount *mp, int flags) { struct vnode *vp, *mvp; int lkflags; lkflags = LK_EXCLUSIVE | LK_INTERLOCK; if (flags != MNT_WAIT) lkflags |= LK_NOWAIT; MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_inactive_filter, NULL) { if ((vp->v_iflag & VI_DEFINACT) == 0) { VI_UNLOCK(vp); continue; } vp->v_iflag &= ~VI_DEFINACT; vfs_deferred_inactive(vp, lkflags); } } static inline bool vfs_want_msync(struct vnode *vp) { struct vm_object *obj; /* * This test may be performed without any locks held. * We rely on vm_object's type stability. */ if (vp->v_vflag & VV_NOSYNC) return (false); obj = vp->v_object; return (obj != NULL && vm_object_mightbedirty(obj)); } static int vfs_periodic_msync_inactive_filter(struct vnode *vp, void *arg __unused) { if (vp->v_vflag & VV_NOSYNC) return (false); if (vp->v_iflag & VI_DEFINACT) return (true); return (vfs_want_msync(vp)); } static void __noinline vfs_periodic_msync_inactive(struct mount *mp, int flags) { struct vnode *vp, *mvp; struct vm_object *obj; struct thread *td; int lkflags, objflags; bool seen_defer; td = curthread; lkflags = LK_EXCLUSIVE | LK_INTERLOCK; if (flags != MNT_WAIT) { lkflags |= LK_NOWAIT; objflags = OBJPC_NOSYNC; } else { objflags = OBJPC_SYNC; } MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, vfs_periodic_msync_inactive_filter, NULL) { seen_defer = false; if (vp->v_iflag & VI_DEFINACT) { vp->v_iflag &= ~VI_DEFINACT; seen_defer = true; } if (!vfs_want_msync(vp)) { if (seen_defer) vfs_deferred_inactive(vp, lkflags); else VI_UNLOCK(vp); continue; } - if (vget(vp, lkflags, td) == 0) { + if (vget(vp, lkflags) == 0) { obj = vp->v_object; if (obj != NULL && (vp->v_vflag & VV_NOSYNC) == 0) { VM_OBJECT_WLOCK(obj); vm_object_page_clean(obj, 0, 0, objflags); VM_OBJECT_WUNLOCK(obj); } vput(vp); if (seen_defer) vdrop(vp); } else { if (seen_defer) vdefer_inactive_unlocked(vp); } } } void vfs_periodic(struct mount *mp, int flags) { CTR2(KTR_VFS, "%s: mp %p", __func__, mp); if ((mp->mnt_kern_flag & MNTK_NOMSYNC) != 0) vfs_periodic_inactive(mp, flags); else vfs_periodic_msync_inactive(mp, flags); } static void destroy_vpollinfo_free(struct vpollinfo *vi) { knlist_destroy(&vi->vpi_selinfo.si_note); mtx_destroy(&vi->vpi_lock); uma_zfree(vnodepoll_zone, vi); } static void destroy_vpollinfo(struct vpollinfo *vi) { knlist_clear(&vi->vpi_selinfo.si_note, 1); seldrain(&vi->vpi_selinfo); destroy_vpollinfo_free(vi); } /* * Initialize per-vnode helper structure to hold poll-related state. */ void v_addpollinfo(struct vnode *vp) { struct vpollinfo *vi; if (vp->v_pollinfo != NULL) return; vi = uma_zalloc(vnodepoll_zone, M_WAITOK | M_ZERO); mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF); knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock, vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked); VI_LOCK(vp); if (vp->v_pollinfo != NULL) { VI_UNLOCK(vp); destroy_vpollinfo_free(vi); return; } vp->v_pollinfo = vi; VI_UNLOCK(vp); } /* * Record a process's interest in events which might happen to * a vnode. Because poll uses the historic select-style interface * internally, this routine serves as both the ``check for any * pending events'' and the ``record my interest in future events'' * functions. (These are done together, while the lock is held, * to avoid race conditions.) */ int vn_pollrecord(struct vnode *vp, struct thread *td, int events) { v_addpollinfo(vp); mtx_lock(&vp->v_pollinfo->vpi_lock); if (vp->v_pollinfo->vpi_revents & events) { /* * This leaves events we are not interested * in available for the other process which * which presumably had requested them * (otherwise they would never have been * recorded). */ events &= vp->v_pollinfo->vpi_revents; vp->v_pollinfo->vpi_revents &= ~events; mtx_unlock(&vp->v_pollinfo->vpi_lock); return (events); } vp->v_pollinfo->vpi_events |= events; selrecord(td, &vp->v_pollinfo->vpi_selinfo); mtx_unlock(&vp->v_pollinfo->vpi_lock); return (0); } /* * Routine to create and manage a filesystem syncer vnode. */ #define sync_close ((int (*)(struct vop_close_args *))nullop) static int sync_fsync(struct vop_fsync_args *); static int sync_inactive(struct vop_inactive_args *); static int sync_reclaim(struct vop_reclaim_args *); static struct vop_vector sync_vnodeops = { .vop_bypass = VOP_EOPNOTSUPP, .vop_close = sync_close, /* close */ .vop_fsync = sync_fsync, /* fsync */ .vop_inactive = sync_inactive, /* inactive */ .vop_need_inactive = vop_stdneed_inactive, /* need_inactive */ .vop_reclaim = sync_reclaim, /* reclaim */ .vop_lock1 = vop_stdlock, /* lock */ .vop_unlock = vop_stdunlock, /* unlock */ .vop_islocked = vop_stdislocked, /* islocked */ }; VFS_VOP_VECTOR_REGISTER(sync_vnodeops); /* * Create a new filesystem syncer vnode for the specified mount point. */ void vfs_allocate_syncvnode(struct mount *mp) { struct vnode *vp; struct bufobj *bo; static long start, incr, next; int error; /* Allocate a new vnode */ error = getnewvnode("syncer", mp, &sync_vnodeops, &vp); if (error != 0) panic("vfs_allocate_syncvnode: getnewvnode() failed"); vp->v_type = VNON; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_vflag |= VV_FORCEINSMQ; error = insmntque(vp, mp); if (error != 0) panic("vfs_allocate_syncvnode: insmntque() failed"); vp->v_vflag &= ~VV_FORCEINSMQ; VOP_UNLOCK(vp); /* * Place the vnode onto the syncer worklist. We attempt to * scatter them about on the list so that they will go off * at evenly distributed times even if all the filesystems * are mounted at once. */ next += incr; if (next == 0 || next > syncer_maxdelay) { start /= 2; incr /= 2; if (start == 0) { start = syncer_maxdelay / 2; incr = syncer_maxdelay; } next = start; } bo = &vp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0); /* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */ mtx_lock(&sync_mtx); sync_vnode_count++; if (mp->mnt_syncer == NULL) { mp->mnt_syncer = vp; vp = NULL; } mtx_unlock(&sync_mtx); BO_UNLOCK(bo); if (vp != NULL) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vgone(vp); vput(vp); } } void vfs_deallocate_syncvnode(struct mount *mp) { struct vnode *vp; mtx_lock(&sync_mtx); vp = mp->mnt_syncer; if (vp != NULL) mp->mnt_syncer = NULL; mtx_unlock(&sync_mtx); if (vp != NULL) vrele(vp); } /* * Do a lazy sync of the filesystem. */ static int sync_fsync(struct vop_fsync_args *ap) { struct vnode *syncvp = ap->a_vp; struct mount *mp = syncvp->v_mount; int error, save; struct bufobj *bo; /* * We only need to do something if this is a lazy evaluation. */ if (ap->a_waitfor != MNT_LAZY) return (0); /* * Move ourselves to the back of the sync list. */ bo = &syncvp->v_bufobj; BO_LOCK(bo); vn_syncer_add_to_worklist(bo, syncdelay); BO_UNLOCK(bo); /* * Walk the list of vnodes pushing all that are dirty and * not already on the sync list. */ if (vfs_busy(mp, MBF_NOWAIT) != 0) return (0); if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) { vfs_unbusy(mp); return (0); } save = curthread_pflags_set(TDP_SYNCIO); /* * The filesystem at hand may be idle with free vnodes stored in the * batch. Return them instead of letting them stay there indefinitely. */ vfs_periodic(mp, MNT_NOWAIT); error = VFS_SYNC(mp, MNT_LAZY); curthread_pflags_restore(save); vn_finished_write(mp); vfs_unbusy(mp); return (error); } /* * The syncer vnode is no referenced. */ static int sync_inactive(struct vop_inactive_args *ap) { vgone(ap->a_vp); return (0); } /* * The syncer vnode is no longer needed and is being decommissioned. * * Modifications to the worklist must be protected by sync_mtx. */ static int sync_reclaim(struct vop_reclaim_args *ap) { struct vnode *vp = ap->a_vp; struct bufobj *bo; bo = &vp->v_bufobj; BO_LOCK(bo); mtx_lock(&sync_mtx); if (vp->v_mount->mnt_syncer == vp) vp->v_mount->mnt_syncer = NULL; if (bo->bo_flag & BO_ONWORKLST) { LIST_REMOVE(bo, bo_synclist); syncer_worklist_len--; sync_vnode_count--; bo->bo_flag &= ~BO_ONWORKLST; } mtx_unlock(&sync_mtx); BO_UNLOCK(bo); return (0); } int vn_need_pageq_flush(struct vnode *vp) { struct vm_object *obj; int need; MPASS(mtx_owned(VI_MTX(vp))); need = 0; if ((obj = vp->v_object) != NULL && (vp->v_vflag & VV_NOSYNC) == 0 && vm_object_mightbedirty(obj)) need = 1; return (need); } /* * Check if vnode represents a disk device */ int vn_isdisk(struct vnode *vp, int *errp) { int error; if (vp->v_type != VCHR) { error = ENOTBLK; goto out; } error = 0; dev_lock(); if (vp->v_rdev == NULL) error = ENXIO; else if (vp->v_rdev->si_devsw == NULL) error = ENXIO; else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK)) error = ENOTBLK; dev_unlock(); out: if (errp != NULL) *errp = error; return (error == 0); } /* * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see * the comment above cache_fplookup for details. * * We never deny as priv_check_cred calls are not yet supported, see vaccess. */ int vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred) { VFS_SMR_ASSERT_ENTERED(); /* Check the owner. */ if (cred->cr_uid == file_uid) { if (file_mode & S_IXUSR) return (0); return (EAGAIN); } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) return (0); return (EAGAIN); } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) return (0); return (EAGAIN); } /* * Common filesystem object access control check routine. Accepts a * vnode's type, "mode", uid and gid, requested access mode, and credentials. * Returns 0 on success, or an errno on failure. */ int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, accmode_t accmode, struct ucred *cred) { accmode_t dac_granted; accmode_t priv_granted; KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0, ("invalid bit in accmode")); KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE), ("VAPPEND without VWRITE")); /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. */ dac_granted = 0; /* Check the owner. */ if (cred->cr_uid == file_uid) { dac_granted |= VADMIN; if (file_mode & S_IXUSR) dac_granted |= VEXEC; if (file_mode & S_IRUSR) dac_granted |= VREAD; if (file_mode & S_IWUSR) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check the groups (first match) */ if (groupmember(file_gid, cred)) { if (file_mode & S_IXGRP) dac_granted |= VEXEC; if (file_mode & S_IRGRP) dac_granted |= VREAD; if (file_mode & S_IWGRP) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); goto privcheck; } /* Otherwise, check everyone else. */ if (file_mode & S_IXOTH) dac_granted |= VEXEC; if (file_mode & S_IROTH) dac_granted |= VREAD; if (file_mode & S_IWOTH) dac_granted |= (VWRITE | VAPPEND); if ((accmode & dac_granted) == accmode) return (0); privcheck: /* * Build a privilege mask to determine if the set of privileges * satisfies the requirements when combined with the granted mask * from above. For each privilege, if the privilege is required, * bitwise or the request type onto the priv_granted mask. */ priv_granted = 0; if (type == VDIR) { /* * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC * requests, instead of PRIV_VFS_EXEC. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && !priv_check_cred(cred, PRIV_VFS_LOOKUP)) priv_granted |= VEXEC; } else { /* * Ensure that at least one execute bit is on. Otherwise, * a privileged user will always succeed, and we don't want * this to happen unless the file really is executable. */ if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) && (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 && !priv_check_cred(cred, PRIV_VFS_EXEC)) priv_granted |= VEXEC; } if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) && !priv_check_cred(cred, PRIV_VFS_READ)) priv_granted |= VREAD; if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) && !priv_check_cred(cred, PRIV_VFS_WRITE)) priv_granted |= (VWRITE | VAPPEND); if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) && !priv_check_cred(cred, PRIV_VFS_ADMIN)) priv_granted |= VADMIN; if ((accmode & (priv_granted | dac_granted)) == accmode) { return (0); } return ((accmode & VADMIN) ? EPERM : EACCES); } /* * Credential check based on process requesting service, and per-attribute * permissions. */ int extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred, struct thread *td, accmode_t accmode) { /* * Kernel-invoked always succeeds. */ if (cred == NOCRED) return (0); /* * Do not allow privileged processes in jail to directly manipulate * system attributes. */ switch (attrnamespace) { case EXTATTR_NAMESPACE_SYSTEM: /* Potentially should be: return (EPERM); */ return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM)); case EXTATTR_NAMESPACE_USER: return (VOP_ACCESS(vp, accmode, cred, td)); default: return (EPERM); } } #ifdef DEBUG_VFS_LOCKS /* * This only exists to suppress warnings from unlocked specfs accesses. It is * no longer ok to have an unlocked VFS. */ #define IGNORE_LOCK(vp) (KERNEL_PANICKED() || (vp) == NULL || \ (vp)->v_type == VCHR || (vp)->v_type == VBAD) int vfs_badlock_ddb = 1; /* Drop into debugger on violation. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0, "Drop into debugger on lock violation"); int vfs_badlock_mutex = 1; /* Check for interlock across VOPs. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex, 0, "Check for interlock across VOPs"); int vfs_badlock_print = 1; /* Print lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print, 0, "Print lock violations"); int vfs_badlock_vnode = 1; /* Print vnode details on lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_vnode, CTLFLAG_RW, &vfs_badlock_vnode, 0, "Print vnode details on lock violations"); #ifdef KDB int vfs_badlock_backtrace = 1; /* Print backtrace at lock violations. */ SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW, &vfs_badlock_backtrace, 0, "Print backtrace at lock violations"); #endif static void vfs_badlock(const char *msg, const char *str, struct vnode *vp) { #ifdef KDB if (vfs_badlock_backtrace) kdb_backtrace(); #endif if (vfs_badlock_vnode) vn_printf(vp, "vnode "); if (vfs_badlock_print) printf("%s: %p %s\n", str, (void *)vp, msg); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } void assert_vi_locked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is not locked but should be", str, vp); } void assert_vi_unlocked(struct vnode *vp, const char *str) { if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp))) vfs_badlock("interlock is locked but should not be", str, vp); } void assert_vop_locked(struct vnode *vp, const char *str) { int locked; if (!IGNORE_LOCK(vp)) { locked = VOP_ISLOCKED(vp); if (locked == 0 || locked == LK_EXCLOTHER) vfs_badlock("is not locked but should be", str, vp); } } void assert_vop_unlocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) vfs_badlock("is locked but should not be", str, vp); } void assert_vop_elocked(struct vnode *vp, const char *str) { if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE) vfs_badlock("is not exclusive locked but should be", str, vp); } #endif /* DEBUG_VFS_LOCKS */ void vop_rename_fail(struct vop_rename_args *ap) { if (ap->a_tvp != NULL) vput(ap->a_tvp); if (ap->a_tdvp == ap->a_tvp) vrele(ap->a_tdvp); else vput(ap->a_tdvp); vrele(ap->a_fdvp); vrele(ap->a_fvp); } void vop_rename_pre(void *ap) { struct vop_rename_args *a = ap; #ifdef DEBUG_VFS_LOCKS if (a->a_tvp) ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME"); ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME"); /* Check the source (from). */ if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock && (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock)) ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked"); if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock) ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked"); /* Check the target. */ if (a->a_tvp) ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked"); ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked"); #endif /* * It may be tempting to add vn_seqc_write_begin/end calls here and * in vop_rename_post but that's not going to work out since some * filesystems relookup vnodes mid-rename. This is probably a bug. * * For now filesystems are expected to do the relevant calls after they * decide what vnodes to operate on. */ if (a->a_tdvp != a->a_fdvp) vhold(a->a_fdvp); if (a->a_tvp != a->a_fvp) vhold(a->a_fvp); vhold(a->a_tdvp); if (a->a_tvp) vhold(a->a_tvp); } #ifdef DEBUG_VFS_LOCKS void vop_fplookup_vexec_debugpre(void *ap __unused) { VFS_SMR_ASSERT_ENTERED(); } void vop_fplookup_vexec_debugpost(void *ap __unused, int rc __unused) { VFS_SMR_ASSERT_ENTERED(); } void vop_strategy_debugpre(void *ap) { struct vop_strategy_args *a; struct buf *bp; a = ap; bp = a->a_bp; /* * Cluster ops lock their component buffers but not the IO container. */ if ((bp->b_flags & B_CLUSTER) != 0) return; if (!KERNEL_PANICKED() && !BUF_ISLOCKED(bp)) { if (vfs_badlock_print) printf( "VOP_STRATEGY: bp is not locked but should be\n"); if (vfs_badlock_ddb) kdb_enter(KDB_WHY_VFSLOCK, "lock violation"); } } void vop_lock_debugpre(void *ap) { struct vop_lock1_args *a = ap; if ((a->a_flags & LK_INTERLOCK) == 0) ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); else ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK"); } void vop_lock_debugpost(void *ap, int rc) { struct vop_lock1_args *a = ap; ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK"); if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0) ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK"); } void vop_unlock_debugpre(void *ap) { struct vop_unlock_args *a = ap; ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK"); } void vop_need_inactive_debugpre(void *ap) { struct vop_need_inactive_args *a = ap; ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); } void vop_need_inactive_debugpost(void *ap, int rc) { struct vop_need_inactive_args *a = ap; ASSERT_VI_LOCKED(a->a_vp, "VOP_NEED_INACTIVE"); } #endif void vop_create_pre(void *ap) { struct vop_create_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_create_post(void *ap, int rc) { struct vop_create_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_whiteout_pre(void *ap) { struct vop_whiteout_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_whiteout_post(void *ap, int rc) { struct vop_whiteout_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); } void vop_deleteextattr_pre(void *ap) { struct vop_deleteextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_deleteextattr_post(void *ap, int rc) { struct vop_deleteextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB); } void vop_link_pre(void *ap) { struct vop_link_args *a; struct vnode *vp, *tdvp; a = ap; vp = a->a_vp; tdvp = a->a_tdvp; vn_seqc_write_begin(vp); vn_seqc_write_begin(tdvp); } void vop_link_post(void *ap, int rc) { struct vop_link_args *a; struct vnode *vp, *tdvp; a = ap; vp = a->a_vp; tdvp = a->a_tdvp; vn_seqc_write_end(vp); vn_seqc_write_end(tdvp); if (!rc) { VFS_KNOTE_LOCKED(vp, NOTE_LINK); VFS_KNOTE_LOCKED(tdvp, NOTE_WRITE); } } void vop_mkdir_pre(void *ap) { struct vop_mkdir_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_mkdir_post(void *ap, int rc) { struct vop_mkdir_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); } void vop_mknod_pre(void *ap) { struct vop_mknod_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_mknod_post(void *ap, int rc) { struct vop_mknod_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_reclaim_post(void *ap, int rc) { struct vop_reclaim_args *a; struct vnode *vp; a = ap; vp = a->a_vp; ASSERT_VOP_IN_SEQC(vp); if (!rc) VFS_KNOTE_LOCKED(vp, NOTE_REVOKE); } void vop_remove_pre(void *ap) { struct vop_remove_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_begin(dvp); vn_seqc_write_begin(vp); } void vop_remove_post(void *ap, int rc) { struct vop_remove_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_end(dvp); vn_seqc_write_end(vp); if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); VFS_KNOTE_LOCKED(vp, NOTE_DELETE); } } void vop_rename_post(void *ap, int rc) { struct vop_rename_args *a = ap; long hint; if (!rc) { hint = NOTE_WRITE; if (a->a_fdvp == a->a_tdvp) { if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR) hint |= NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); } else { hint |= NOTE_EXTEND; if (a->a_fvp->v_type == VDIR) hint |= NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_fdvp, hint); if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL && a->a_tvp->v_type == VDIR) hint &= ~NOTE_LINK; VFS_KNOTE_UNLOCKED(a->a_tdvp, hint); } VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME); if (a->a_tvp) VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE); } if (a->a_tdvp != a->a_fdvp) vdrop(a->a_fdvp); if (a->a_tvp != a->a_fvp) vdrop(a->a_fvp); vdrop(a->a_tdvp); if (a->a_tvp) vdrop(a->a_tvp); } void vop_rmdir_pre(void *ap) { struct vop_rmdir_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_begin(dvp); vn_seqc_write_begin(vp); } void vop_rmdir_post(void *ap, int rc) { struct vop_rmdir_args *a; struct vnode *dvp, *vp; a = ap; dvp = a->a_dvp; vp = a->a_vp; vn_seqc_write_end(dvp); vn_seqc_write_end(vp); if (!rc) { VFS_KNOTE_LOCKED(dvp, NOTE_WRITE | NOTE_LINK); VFS_KNOTE_LOCKED(vp, NOTE_DELETE); } } void vop_setattr_pre(void *ap) { struct vop_setattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_setattr_post(void *ap, int rc) { struct vop_setattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); if (!rc) VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); } void vop_setacl_pre(void *ap) { struct vop_setacl_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_setacl_post(void *ap, int rc __unused) { struct vop_setacl_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); } void vop_setextattr_pre(void *ap) { struct vop_setextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_begin(vp); } void vop_setextattr_post(void *ap, int rc) { struct vop_setextattr_args *a; struct vnode *vp; a = ap; vp = a->a_vp; vn_seqc_write_end(vp); if (!rc) VFS_KNOTE_LOCKED(vp, NOTE_ATTRIB); } void vop_symlink_pre(void *ap) { struct vop_symlink_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_begin(dvp); } void vop_symlink_post(void *ap, int rc) { struct vop_symlink_args *a; struct vnode *dvp; a = ap; dvp = a->a_dvp; vn_seqc_write_end(dvp); if (!rc) VFS_KNOTE_LOCKED(dvp, NOTE_WRITE); } void vop_open_post(void *ap, int rc) { struct vop_open_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN); } void vop_close_post(void *ap, int rc) { struct vop_close_args *a = ap; if (!rc && (a->a_cred != NOCRED || /* filter out revokes */ !VN_IS_DOOMED(a->a_vp))) { VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ? NOTE_CLOSE_WRITE : NOTE_CLOSE); } } void vop_read_post(void *ap, int rc) { struct vop_read_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); } void vop_readdir_post(void *ap, int rc) { struct vop_readdir_args *a = ap; if (!rc) VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ); } static struct knlist fs_knlist; static void vfs_event_init(void *arg) { knlist_init_mtx(&fs_knlist, NULL); } /* XXX - correct order? */ SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL); void vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused) { KNOTE_UNLOCKED(&fs_knlist, event); } static int filt_fsattach(struct knote *kn); static void filt_fsdetach(struct knote *kn); static int filt_fsevent(struct knote *kn, long hint); struct filterops fs_filtops = { .f_isfd = 0, .f_attach = filt_fsattach, .f_detach = filt_fsdetach, .f_event = filt_fsevent }; static int filt_fsattach(struct knote *kn) { kn->kn_flags |= EV_CLEAR; knlist_add(&fs_knlist, kn, 0); return (0); } static void filt_fsdetach(struct knote *kn) { knlist_remove(&fs_knlist, kn, 0); } static int filt_fsevent(struct knote *kn, long hint) { kn->kn_fflags |= hint; return (kn->kn_fflags != 0); } static int sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS) { struct vfsidctl vc; int error; struct mount *mp; error = SYSCTL_IN(req, &vc, sizeof(vc)); if (error) return (error); if (vc.vc_vers != VFS_CTL_VERS1) return (EINVAL); mp = vfs_getvfs(&vc.vc_fsid); if (mp == NULL) return (ENOENT); /* ensure that a specific sysctl goes to the right filesystem. */ if (strcmp(vc.vc_fstypename, "*") != 0 && strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) { vfs_rel(mp); return (EINVAL); } VCTLTOREQ(&vc, req); error = VFS_SYSCTL(mp, vc.vc_op, req); vfs_rel(mp); return (error); } SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_MPSAFE | CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid"); /* * Function to initialize a va_filerev field sensibly. * XXX: Wouldn't a random number make a lot more sense ?? */ u_quad_t init_va_filerev(void) { struct bintime bt; getbinuptime(&bt); return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL)); } static int filt_vfsread(struct knote *kn, long hint); static int filt_vfswrite(struct knote *kn, long hint); static int filt_vfsvnode(struct knote *kn, long hint); static void filt_vfsdetach(struct knote *kn); static struct filterops vfsread_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsread }; static struct filterops vfswrite_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfswrite }; static struct filterops vfsvnode_filtops = { .f_isfd = 1, .f_detach = filt_vfsdetach, .f_event = filt_vfsvnode }; static void vfs_knllock(void *arg) { struct vnode *vp = arg; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); } static void vfs_knlunlock(void *arg) { struct vnode *vp = arg; VOP_UNLOCK(vp); } static void vfs_knl_assert_locked(void *arg) { #ifdef DEBUG_VFS_LOCKS struct vnode *vp = arg; ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked"); #endif } static void vfs_knl_assert_unlocked(void *arg) { #ifdef DEBUG_VFS_LOCKS struct vnode *vp = arg; ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked"); #endif } int vfs_kqfilter(struct vop_kqfilter_args *ap) { struct vnode *vp = ap->a_vp; struct knote *kn = ap->a_kn; struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &vfsread_filtops; break; case EVFILT_WRITE: kn->kn_fop = &vfswrite_filtops; break; case EVFILT_VNODE: kn->kn_fop = &vfsvnode_filtops; break; default: return (EINVAL); } kn->kn_hook = (caddr_t)vp; v_addpollinfo(vp); if (vp->v_pollinfo == NULL) return (ENOMEM); knl = &vp->v_pollinfo->vpi_selinfo.si_note; vhold(vp); knlist_add(knl, kn, 0); return (0); } /* * Detach knote from vnode */ static void filt_vfsdetach(struct knote *kn) { struct vnode *vp = (struct vnode *)kn->kn_hook; KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo")); knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0); vdrop(vp); } /*ARGSUSED*/ static int filt_vfsread(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; struct vattr va; int res; /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { VI_LOCK(vp); kn->kn_flags |= (EV_EOF | EV_ONESHOT); VI_UNLOCK(vp); return (1); } if (VOP_GETATTR(vp, &va, curthread->td_ucred)) return (0); VI_LOCK(vp); kn->kn_data = va.va_size - kn->kn_fp->f_offset; res = (kn->kn_sfflags & NOTE_FILE_POLL) != 0 || kn->kn_data != 0; VI_UNLOCK(vp); return (res); } /*ARGSUSED*/ static int filt_vfswrite(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; VI_LOCK(vp); /* * filesystem is gone, so set the EOF flag and schedule * the knote for deletion. */ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) kn->kn_flags |= (EV_EOF | EV_ONESHOT); kn->kn_data = 0; VI_UNLOCK(vp); return (1); } static int filt_vfsvnode(struct knote *kn, long hint) { struct vnode *vp = (struct vnode *)kn->kn_hook; int res; VI_LOCK(vp); if (kn->kn_sfflags & hint) kn->kn_fflags |= hint; if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) { kn->kn_flags |= EV_EOF; VI_UNLOCK(vp); return (1); } res = (kn->kn_fflags != 0); VI_UNLOCK(vp); return (res); } /* * Returns whether the directory is empty or not. * If it is empty, the return value is 0; otherwise * the return value is an error value (which may * be ENOTEMPTY). */ int vfs_emptydir(struct vnode *vp) { struct uio uio; struct iovec iov; struct dirent *dirent, *dp, *endp; int error, eof; error = 0; eof = 0; ASSERT_VOP_LOCKED(vp, "vfs_emptydir"); dirent = malloc(sizeof(struct dirent), M_TEMP, M_WAITOK); iov.iov_base = dirent; iov.iov_len = sizeof(struct dirent); uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = sizeof(struct dirent); uio.uio_segflg = UIO_SYSSPACE; uio.uio_rw = UIO_READ; uio.uio_td = curthread; while (eof == 0 && error == 0) { error = VOP_READDIR(vp, &uio, curthread->td_ucred, &eof, NULL, NULL); if (error != 0) break; endp = (void *)((uint8_t *)dirent + sizeof(struct dirent) - uio.uio_resid); for (dp = dirent; dp < endp; dp = (void *)((uint8_t *)dp + GENERIC_DIRSIZ(dp))) { if (dp->d_type == DT_WHT) continue; if (dp->d_namlen == 0) continue; if (dp->d_type != DT_DIR && dp->d_type != DT_UNKNOWN) { error = ENOTEMPTY; break; } if (dp->d_namlen > 2) { error = ENOTEMPTY; break; } if (dp->d_namlen == 1 && dp->d_name[0] != '.') { error = ENOTEMPTY; break; } if (dp->d_namlen == 2 && dp->d_name[1] != '.') { error = ENOTEMPTY; break; } uio.uio_resid = sizeof(struct dirent); } } free(dirent, M_TEMP); return (error); } int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off) { int error; if (dp->d_reclen > ap->a_uio->uio_resid) return (ENAMETOOLONG); error = uiomove(dp, dp->d_reclen, ap->a_uio); if (error) { if (ap->a_ncookies != NULL) { if (ap->a_cookies != NULL) free(ap->a_cookies, M_TEMP); ap->a_cookies = NULL; *ap->a_ncookies = 0; } return (error); } if (ap->a_ncookies == NULL) return (0); KASSERT(ap->a_cookies, ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!")); *ap->a_cookies = realloc(*ap->a_cookies, (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO); (*ap->a_cookies)[*ap->a_ncookies] = off; *ap->a_ncookies += 1; return (0); } /* * The purpose of this routine is to remove granularity from accmode_t, * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE, * VADMIN and VAPPEND. * * If it returns 0, the caller is supposed to continue with the usual * access checks using 'accmode' as modified by this routine. If it * returns nonzero value, the caller is supposed to return that value * as errno. * * Note that after this routine runs, accmode may be zero. */ int vfs_unixify_accmode(accmode_t *accmode) { /* * There is no way to specify explicit "deny" rule using * file mode or POSIX.1e ACLs. */ if (*accmode & VEXPLICIT_DENY) { *accmode = 0; return (0); } /* * None of these can be translated into usual access bits. * Also, the common case for NFSv4 ACLs is to not contain * either of these bits. Caller should check for VWRITE * on the containing directory instead. */ if (*accmode & (VDELETE_CHILD | VDELETE)) return (EPERM); if (*accmode & VADMIN_PERMS) { *accmode &= ~VADMIN_PERMS; *accmode |= VADMIN; } /* * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL * or VSYNCHRONIZE using file mode or POSIX.1e ACL. */ *accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE); return (0); } /* * Clear out a doomed vnode (if any) and replace it with a new one as long * as the fs is not being unmounted. Return the root vnode to the caller. */ static int __noinline vfs_cache_root_fallback(struct mount *mp, int flags, struct vnode **vpp) { struct vnode *vp; int error; restart: if (mp->mnt_rootvnode != NULL) { MNT_ILOCK(mp); vp = mp->mnt_rootvnode; if (vp != NULL) { if (!VN_IS_DOOMED(vp)) { vrefact(vp); MNT_IUNLOCK(mp); error = vn_lock(vp, flags); if (error == 0) { *vpp = vp; return (0); } vrele(vp); goto restart; } /* * Clear the old one. */ mp->mnt_rootvnode = NULL; } MNT_IUNLOCK(mp); if (vp != NULL) { vfs_op_barrier_wait(mp); vrele(vp); } } error = VFS_CACHEDROOT(mp, flags, vpp); if (error != 0) return (error); if (mp->mnt_vfs_ops == 0) { MNT_ILOCK(mp); if (mp->mnt_vfs_ops != 0) { MNT_IUNLOCK(mp); return (0); } if (mp->mnt_rootvnode == NULL) { vrefact(*vpp); mp->mnt_rootvnode = *vpp; } else { if (mp->mnt_rootvnode != *vpp) { if (!VN_IS_DOOMED(mp->mnt_rootvnode)) { panic("%s: mismatch between vnode returned " " by VFS_CACHEDROOT and the one cached " " (%p != %p)", __func__, *vpp, mp->mnt_rootvnode); } } } MNT_IUNLOCK(mp); } return (0); } int vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp) { struct vnode *vp; int error; if (!vfs_op_thread_enter(mp)) return (vfs_cache_root_fallback(mp, flags, vpp)); vp = atomic_load_ptr(&mp->mnt_rootvnode); if (vp == NULL || VN_IS_DOOMED(vp)) { vfs_op_thread_exit(mp); return (vfs_cache_root_fallback(mp, flags, vpp)); } vrefact(vp); vfs_op_thread_exit(mp); error = vn_lock(vp, flags); if (error != 0) { vrele(vp); return (vfs_cache_root_fallback(mp, flags, vpp)); } *vpp = vp; return (0); } struct vnode * vfs_cache_root_clear(struct mount *mp) { struct vnode *vp; /* * ops > 0 guarantees there is nobody who can see this vnode */ MPASS(mp->mnt_vfs_ops > 0); vp = mp->mnt_rootvnode; if (vp != NULL) vn_seqc_write_begin(vp); mp->mnt_rootvnode = NULL; return (vp); } void vfs_cache_root_set(struct mount *mp, struct vnode *vp) { MPASS(mp->mnt_vfs_ops > 0); vrefact(vp); mp->mnt_rootvnode = vp; } /* * These are helper functions for filesystems to traverse all * their vnodes. See MNT_VNODE_FOREACH_ALL() in sys/mount.h. * * This interface replaces MNT_VNODE_FOREACH. */ struct vnode * __mnt_vnode_next_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; if (should_yield()) kern_yield(PRI_USER); MNT_ILOCK(mp); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL; vp = TAILQ_NEXT(vp, v_nmntvnodes)) { /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) continue; VI_LOCK(vp); if (VN_IS_DOOMED(vp)) { VI_UNLOCK(vp); continue; } break; } if (vp == NULL) { __mnt_vnode_markerfree_all(mvp, mp); /* MNT_IUNLOCK(mp); -- done in above function */ mtx_assert(MNT_MTX(mp), MA_NOTOWNED); return (NULL); } TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); MNT_IUNLOCK(mp); return (vp); } struct vnode * __mnt_vnode_first_all(struct vnode **mvp, struct mount *mp) { struct vnode *vp; *mvp = vn_alloc_marker(mp); MNT_ILOCK(mp); MNT_REF(mp); TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) { /* Allow a racy peek at VIRF_DOOMED to save a lock acquisition. */ if (vp->v_type == VMARKER || VN_IS_DOOMED(vp)) continue; VI_LOCK(vp); if (VN_IS_DOOMED(vp)) { VI_UNLOCK(vp); continue; } break; } if (vp == NULL) { MNT_REL(mp); MNT_IUNLOCK(mp); vn_free_marker(*mvp); *mvp = NULL; return (NULL); } TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes); MNT_IUNLOCK(mp); return (vp); } void __mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) { MNT_IUNLOCK(mp); return; } mtx_assert(MNT_MTX(mp), MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes); MNT_REL(mp); MNT_IUNLOCK(mp); vn_free_marker(*mvp); *mvp = NULL; } /* * These are helper functions for filesystems to traverse their * lazy vnodes. See MNT_VNODE_FOREACH_LAZY() in sys/mount.h */ static void mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) { KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); MNT_ILOCK(mp); MNT_REL(mp); MNT_IUNLOCK(mp); vn_free_marker(*mvp); *mvp = NULL; } /* * Relock the mp mount vnode list lock with the vp vnode interlock in the * conventional lock order during mnt_vnode_next_lazy iteration. * * On entry, the mount vnode list lock is held and the vnode interlock is not. * The list lock is dropped and reacquired. On success, both locks are held. * On failure, the mount vnode list lock is held but the vnode interlock is * not, and the procedure may have yielded. */ static bool mnt_vnode_next_lazy_relock(struct vnode *mvp, struct mount *mp, struct vnode *vp) { VNASSERT(mvp->v_mount == mp && mvp->v_type == VMARKER && TAILQ_NEXT(mvp, v_lazylist) != NULL, mvp, ("%s: bad marker", __func__)); VNASSERT(vp->v_mount == mp && vp->v_type != VMARKER, vp, ("%s: inappropriate vnode", __func__)); ASSERT_VI_UNLOCKED(vp, __func__); mtx_assert(&mp->mnt_listmtx, MA_OWNED); TAILQ_REMOVE(&mp->mnt_lazyvnodelist, mvp, v_lazylist); TAILQ_INSERT_BEFORE(vp, mvp, v_lazylist); /* * Note we may be racing against vdrop which transitioned the hold * count to 0 and now waits for the ->mnt_listmtx lock. This is fine, * if we are the only user after we get the interlock we will just * vdrop. */ vhold(vp); mtx_unlock(&mp->mnt_listmtx); VI_LOCK(vp); if (VN_IS_DOOMED(vp)) { VNPASS((vp->v_mflag & VMP_LAZYLIST) == 0, vp); goto out_lost; } VNPASS(vp->v_mflag & VMP_LAZYLIST, vp); /* * There is nothing to do if we are the last user. */ if (!refcount_release_if_not_last(&vp->v_holdcnt)) goto out_lost; mtx_lock(&mp->mnt_listmtx); return (true); out_lost: vdropl(vp); maybe_yield(); mtx_lock(&mp->mnt_listmtx); return (false); } static struct vnode * mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg) { struct vnode *vp; mtx_assert(&mp->mnt_listmtx, MA_OWNED); KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch")); restart: vp = TAILQ_NEXT(*mvp, v_lazylist); while (vp != NULL) { if (vp->v_type == VMARKER) { vp = TAILQ_NEXT(vp, v_lazylist); continue; } /* * See if we want to process the vnode. Note we may encounter a * long string of vnodes we don't care about and hog the list * as a result. Check for it and requeue the marker. */ VNPASS(!VN_IS_DOOMED(vp), vp); if (!cb(vp, cbarg)) { if (!should_yield()) { vp = TAILQ_NEXT(vp, v_lazylist); continue; } TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); mtx_unlock(&mp->mnt_listmtx); kern_yield(PRI_USER); mtx_lock(&mp->mnt_listmtx); goto restart; } /* * Try-lock because this is the wrong lock order. */ if (!VI_TRYLOCK(vp) && !mnt_vnode_next_lazy_relock(*mvp, mp, vp)) goto restart; KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp)); KASSERT(vp->v_mount == mp || vp->v_mount == NULL, ("alien vnode on the lazy list %p %p", vp, mp)); VNPASS(vp->v_mount == mp, vp); VNPASS(!VN_IS_DOOMED(vp), vp); break; } TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); /* Check if we are done */ if (vp == NULL) { mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_lazy(mvp, mp); return (NULL); } TAILQ_INSERT_AFTER(&mp->mnt_lazyvnodelist, vp, *mvp, v_lazylist); mtx_unlock(&mp->mnt_listmtx); ASSERT_VI_LOCKED(vp, "lazy iter"); return (vp); } struct vnode * __mnt_vnode_next_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg) { if (should_yield()) kern_yield(PRI_USER); mtx_lock(&mp->mnt_listmtx); return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); } struct vnode * __mnt_vnode_first_lazy(struct vnode **mvp, struct mount *mp, mnt_lazy_cb_t *cb, void *cbarg) { struct vnode *vp; if (TAILQ_EMPTY(&mp->mnt_lazyvnodelist)) return (NULL); *mvp = vn_alloc_marker(mp); MNT_ILOCK(mp); MNT_REF(mp); MNT_IUNLOCK(mp); mtx_lock(&mp->mnt_listmtx); vp = TAILQ_FIRST(&mp->mnt_lazyvnodelist); if (vp == NULL) { mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_lazy(mvp, mp); return (NULL); } TAILQ_INSERT_BEFORE(vp, *mvp, v_lazylist); return (mnt_vnode_next_lazy(mvp, mp, cb, cbarg)); } void __mnt_vnode_markerfree_lazy(struct vnode **mvp, struct mount *mp) { if (*mvp == NULL) return; mtx_lock(&mp->mnt_listmtx); TAILQ_REMOVE(&mp->mnt_lazyvnodelist, *mvp, v_lazylist); mtx_unlock(&mp->mnt_listmtx); mnt_vnode_markerfree_lazy(mvp, mp); } int vn_dir_check_exec(struct vnode *vp, struct componentname *cnp) { if ((cnp->cn_flags & NOEXECCHECK) != 0) { cnp->cn_flags &= ~NOEXECCHECK; return (0); } return (VOP_ACCESS(vp, VEXEC, cnp->cn_cred, cnp->cn_thread)); } /* * Do not use this variant unless you have means other than the hold count * to prevent the vnode from getting freed. */ void vn_seqc_write_begin_unheld_locked(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_seqc_users >= 0, vp); vp->v_seqc_users++; if (vp->v_seqc_users == 1) seqc_sleepable_write_begin(&vp->v_seqc); } void vn_seqc_write_begin_locked(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_holdcnt > 0, vp); vn_seqc_write_begin_unheld_locked(vp); } void vn_seqc_write_begin(struct vnode *vp) { VI_LOCK(vp); vn_seqc_write_begin_locked(vp); VI_UNLOCK(vp); } void vn_seqc_write_begin_unheld(struct vnode *vp) { VI_LOCK(vp); vn_seqc_write_begin_unheld_locked(vp); VI_UNLOCK(vp); } void vn_seqc_write_end_locked(struct vnode *vp) { ASSERT_VI_LOCKED(vp, __func__); VNPASS(vp->v_seqc_users > 0, vp); vp->v_seqc_users--; if (vp->v_seqc_users == 0) seqc_sleepable_write_end(&vp->v_seqc); } void vn_seqc_write_end(struct vnode *vp) { VI_LOCK(vp); vn_seqc_write_end_locked(vp); VI_UNLOCK(vp); } Index: projects/clang1100-import/sys/modules/nvd/Makefile =================================================================== --- projects/clang1100-import/sys/modules/nvd/Makefile (revision 364278) +++ projects/clang1100-import/sys/modules/nvd/Makefile (revision 364279) @@ -1,8 +1,8 @@ # $FreeBSD$ .PATH: ${SRCTOP}/sys/dev/nvd KMOD= nvd -SRCS= nvd.c opt_geom.h device_if.h bus_if.h +SRCS= nvd.c opt_geom.h device_if.h bus_if.h pci_if.h .include Index: projects/clang1100-import/sys/modules/usb/cp2112/Makefile =================================================================== --- projects/clang1100-import/sys/modules/usb/cp2112/Makefile (revision 364278) +++ projects/clang1100-import/sys/modules/usb/cp2112/Makefile (revision 364279) @@ -1,37 +1,37 @@ # # $FreeBSD$ # # Copyright (c) Andriy Gapon # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. # S= ${SRCTOP}/sys .PATH: $S/dev/usb/misc KMOD= cp2112 SRCS= cp2112.c -SRCS+= opt_bus.h opt_usb.h +SRCS+= opt_bus.h opt_platform.h opt_usb.h SRCS+= device_if.h bus_if.h gpio_if.h iicbus_if.h usb_if.h usbdevs.h .include Index: projects/clang1100-import/sys/netinet/sctp_input.c =================================================================== --- projects/clang1100-import/sys/netinet/sctp_input.c (revision 364278) +++ projects/clang1100-import/sys/netinet/sctp_input.c (revision 364279) @@ -1,5809 +1,5808 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #endif #include static void sctp_stop_all_cookie_timers(struct sctp_tcb *stcb) { struct sctp_nets *net; /* * This now not only stops all cookie timers it also stops any INIT * timers as well. This will make sure that the timers are stopped * in all collision cases. */ SCTP_TCB_LOCK_ASSERT(stcb); TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (net->rxt_timer.type == SCTP_TIMER_TYPE_COOKIE) { sctp_timer_stop(SCTP_TIMER_TYPE_COOKIE, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_1); } else if (net->rxt_timer.type == SCTP_TIMER_TYPE_INIT) { sctp_timer_stop(SCTP_TIMER_TYPE_INIT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_2); } } } /* INIT handler */ static void sctp_handle_init(struct mbuf *m, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_init_chunk *cp, struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets *net, int *abort_no_unlock, uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id, uint16_t port) { struct sctp_init *init; struct mbuf *op_err; SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_init: handling INIT tcb:%p\n", (void *)stcb); if (stcb == NULL) { SCTP_INP_RLOCK(inp); } /* validate length */ if (ntohs(cp->ch.chunk_length) < sizeof(struct sctp_init_chunk)) { op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); if (stcb) *abort_no_unlock = 1; goto outnow; } /* validate parameters */ init = &cp->init; if (init->initiate_tag == 0) { /* protocol error... send abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); if (stcb) *abort_no_unlock = 1; goto outnow; } if (ntohl(init->a_rwnd) < SCTP_MIN_RWND) { /* invalid parameter... send abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); if (stcb) *abort_no_unlock = 1; goto outnow; } if (init->num_inbound_streams == 0) { /* protocol error... send abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); if (stcb) *abort_no_unlock = 1; goto outnow; } if (init->num_outbound_streams == 0) { /* protocol error... send abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); if (stcb) *abort_no_unlock = 1; goto outnow; } if (sctp_validate_init_auth_params(m, offset + sizeof(*cp), offset + ntohs(cp->ch.chunk_length))) { /* auth parameter(s) error... send abort */ op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), "Problem with AUTH parameters"); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); if (stcb) *abort_no_unlock = 1; goto outnow; } /* We are only accepting if we have a listening socket. */ if ((stcb == NULL) && ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || (!SCTP_IS_LISTENING(inp)))) { /* * FIX ME ?? What about TCP model and we have a * match/restart case? Actually no fix is needed. the lookup * will always find the existing assoc so stcb would not be * NULL. It may be questionable to do this since we COULD * just send back the INIT-ACK and hope that the app did * accept()'s by the time the COOKIE was sent. But there is * a price to pay for COOKIE generation and I don't want to * pay it on the chance that the app will actually do some * accepts(). The App just looses and should NOT be in this * state :-) */ if (SCTP_BASE_SYSCTL(sctp_blackhole) == 0) { op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), "No listener"); sctp_send_abort(m, iphlen, src, dst, sh, 0, op_err, mflowtype, mflowid, inp->fibnum, vrf_id, port); } goto outnow; } if ((stcb != NULL) && (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_ACK_SENT)) { SCTPDBG(SCTP_DEBUG_INPUT3, "sctp_handle_init: sending SHUTDOWN-ACK\n"); sctp_send_shutdown_ack(stcb, NULL); sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CONTROL_PROC, SCTP_SO_NOT_LOCKED); } else { SCTPDBG(SCTP_DEBUG_INPUT3, "sctp_handle_init: sending INIT-ACK\n"); sctp_send_initiate_ack(inp, stcb, net, m, iphlen, offset, src, dst, sh, cp, mflowtype, mflowid, vrf_id, port); } outnow: if (stcb == NULL) { SCTP_INP_RUNLOCK(inp); } } /* * process peer "INIT/INIT-ACK" chunk returns value < 0 on error */ int sctp_is_there_unsent_data(struct sctp_tcb *stcb, int so_locked) { int unsent_data; unsigned int i; struct sctp_stream_queue_pending *sp; struct sctp_association *asoc; /* * This function returns if any stream has true unsent data on it. * Note that as it looks through it will clean up any places that * have old data that has been sent but left at top of stream queue. */ asoc = &stcb->asoc; unsent_data = 0; SCTP_TCB_SEND_LOCK(stcb); if (!stcb->asoc.ss_functions.sctp_ss_is_empty(stcb, asoc)) { /* Check to see if some data queued */ for (i = 0; i < stcb->asoc.streamoutcnt; i++) { /* sa_ignore FREED_MEMORY */ sp = TAILQ_FIRST(&stcb->asoc.strmout[i].outqueue); if (sp == NULL) { continue; } if ((sp->msg_is_complete) && (sp->length == 0) && (sp->sender_all_done)) { /* * We are doing differed cleanup. Last time * through when we took all the data the * sender_all_done was not set. */ if (sp->put_last_out == 0) { SCTP_PRINTF("Gak, put out entire msg with NO end!-1\n"); SCTP_PRINTF("sender_done:%d len:%d msg_comp:%d put_last_out:%d\n", sp->sender_all_done, sp->length, sp->msg_is_complete, sp->put_last_out); } atomic_subtract_int(&stcb->asoc.stream_queue_cnt, 1); TAILQ_REMOVE(&stcb->asoc.strmout[i].outqueue, sp, next); stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, &asoc->strmout[i], sp, 1); if (sp->net) { sctp_free_remote_addr(sp->net); sp->net = NULL; } if (sp->data) { sctp_m_freem(sp->data); sp->data = NULL; } sctp_free_a_strmoq(stcb, sp, so_locked); if (!TAILQ_EMPTY(&stcb->asoc.strmout[i].outqueue)) { unsent_data++; } } else { unsent_data++; } if (unsent_data > 0) { break; } } } SCTP_TCB_SEND_UNLOCK(stcb); return (unsent_data); } static int sctp_process_init(struct sctp_init_chunk *cp, struct sctp_tcb *stcb) { struct sctp_init *init; struct sctp_association *asoc; struct sctp_nets *lnet; unsigned int i; init = &cp->init; asoc = &stcb->asoc; /* save off parameters */ asoc->peer_vtag = ntohl(init->initiate_tag); asoc->peers_rwnd = ntohl(init->a_rwnd); /* init tsn's */ asoc->highest_tsn_inside_map = asoc->asconf_seq_in = ntohl(init->initial_tsn) - 1; if (!TAILQ_EMPTY(&asoc->nets)) { /* update any ssthresh's that may have a default */ TAILQ_FOREACH(lnet, &asoc->nets, sctp_next) { lnet->ssthresh = asoc->peers_rwnd; if (SCTP_BASE_SYSCTL(sctp_logging_level) & (SCTP_CWND_MONITOR_ENABLE | SCTP_CWND_LOGGING_ENABLE)) { sctp_log_cwnd(stcb, lnet, 0, SCTP_CWND_INITIALIZATION); } } } SCTP_TCB_SEND_LOCK(stcb); if (asoc->pre_open_streams > ntohs(init->num_inbound_streams)) { unsigned int newcnt; struct sctp_stream_out *outs; struct sctp_stream_queue_pending *sp, *nsp; struct sctp_tmit_chunk *chk, *nchk; /* abandon the upper streams */ newcnt = ntohs(init->num_inbound_streams); TAILQ_FOREACH_SAFE(chk, &asoc->send_queue, sctp_next, nchk) { if (chk->rec.data.sid >= newcnt) { TAILQ_REMOVE(&asoc->send_queue, chk, sctp_next); asoc->send_queue_cnt--; if (asoc->strmout[chk->rec.data.sid].chunks_on_queues > 0) { asoc->strmout[chk->rec.data.sid].chunks_on_queues--; #ifdef INVARIANTS } else { panic("No chunks on the queues for sid %u.", chk->rec.data.sid); #endif } if (chk->data != NULL) { sctp_free_bufspace(stcb, asoc, chk, 1); sctp_ulp_notify(SCTP_NOTIFY_UNSENT_DG_FAIL, stcb, 0, chk, SCTP_SO_NOT_LOCKED); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } } sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); /* sa_ignore FREED_MEMORY */ } } if (asoc->strmout) { for (i = newcnt; i < asoc->pre_open_streams; i++) { outs = &asoc->strmout[i]; TAILQ_FOREACH_SAFE(sp, &outs->outqueue, next, nsp) { atomic_subtract_int(&stcb->asoc.stream_queue_cnt, 1); TAILQ_REMOVE(&outs->outqueue, sp, next); stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, outs, sp, 1); sctp_ulp_notify(SCTP_NOTIFY_SPECIAL_SP_FAIL, stcb, 0, sp, SCTP_SO_NOT_LOCKED); if (sp->data) { sctp_m_freem(sp->data); sp->data = NULL; } if (sp->net) { sctp_free_remote_addr(sp->net); sp->net = NULL; } /* Free the chunk */ sctp_free_a_strmoq(stcb, sp, SCTP_SO_NOT_LOCKED); /* sa_ignore FREED_MEMORY */ } outs->state = SCTP_STREAM_CLOSED; } } /* cut back the count */ asoc->pre_open_streams = newcnt; } SCTP_TCB_SEND_UNLOCK(stcb); asoc->streamoutcnt = asoc->pre_open_streams; if (asoc->strmout) { for (i = 0; i < asoc->streamoutcnt; i++) { asoc->strmout[i].state = SCTP_STREAM_OPEN; } } /* EY - nr_sack: initialize highest tsn in nr_mapping_array */ asoc->highest_tsn_inside_nr_map = asoc->highest_tsn_inside_map; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { sctp_log_map(0, 5, asoc->highest_tsn_inside_map, SCTP_MAP_SLIDE_RESULT); } /* This is the next one we expect */ asoc->str_reset_seq_in = asoc->asconf_seq_in + 1; asoc->mapping_array_base_tsn = ntohl(init->initial_tsn); asoc->tsn_last_delivered = asoc->cumulative_tsn = asoc->asconf_seq_in; asoc->advanced_peer_ack_point = asoc->last_acked_seq; /* open the requested streams */ if (asoc->strmin != NULL) { /* Free the old ones */ for (i = 0; i < asoc->streamincnt; i++) { sctp_clean_up_stream(stcb, &asoc->strmin[i].inqueue); sctp_clean_up_stream(stcb, &asoc->strmin[i].uno_inqueue); } SCTP_FREE(asoc->strmin, SCTP_M_STRMI); } if (asoc->max_inbound_streams > ntohs(init->num_outbound_streams)) { asoc->streamincnt = ntohs(init->num_outbound_streams); } else { asoc->streamincnt = asoc->max_inbound_streams; } SCTP_MALLOC(asoc->strmin, struct sctp_stream_in *, asoc->streamincnt * sizeof(struct sctp_stream_in), SCTP_M_STRMI); if (asoc->strmin == NULL) { /* we didn't get memory for the streams! */ SCTPDBG(SCTP_DEBUG_INPUT2, "process_init: couldn't get memory for the streams!\n"); return (-1); } for (i = 0; i < asoc->streamincnt; i++) { asoc->strmin[i].sid = i; asoc->strmin[i].last_mid_delivered = 0xffffffff; TAILQ_INIT(&asoc->strmin[i].inqueue); TAILQ_INIT(&asoc->strmin[i].uno_inqueue); asoc->strmin[i].pd_api_started = 0; asoc->strmin[i].delivery_started = 0; } /* * load_address_from_init will put the addresses into the * association when the COOKIE is processed or the INIT-ACK is * processed. Both types of COOKIE's existing and new call this * routine. It will remove addresses that are no longer in the * association (for the restarting case where addresses are * removed). Up front when the INIT arrives we will discard it if it * is a restart and new addresses have been added. */ /* sa_ignore MEMLEAK */ return (0); } /* * INIT-ACK message processing/consumption returns value < 0 on error */ static int sctp_process_init_ack(struct mbuf *m, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_init_ack_chunk *cp, struct sctp_tcb *stcb, struct sctp_nets *net, int *abort_no_unlock, uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id) { struct sctp_association *asoc; struct mbuf *op_err; int retval, abort_flag, cookie_found; int initack_limit; int nat_friendly = 0; /* First verify that we have no illegal param's */ abort_flag = 0; cookie_found = 0; op_err = sctp_arethere_unrecognized_parameters(m, (offset + sizeof(struct sctp_init_chunk)), &abort_flag, (struct sctp_chunkhdr *)cp, &nat_friendly, &cookie_found); if (abort_flag) { /* Send an abort and notify peer */ sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); *abort_no_unlock = 1; return (-1); } if (!cookie_found) { uint16_t len; /* Only report the missing cookie parameter */ if (op_err != NULL) { sctp_m_freem(op_err); } len = (uint16_t)(sizeof(struct sctp_error_missing_param) + sizeof(uint16_t)); /* We abort with an error of missing mandatory param */ op_err = sctp_get_mbuf_for_msg(len, 0, M_NOWAIT, 1, MT_DATA); if (op_err != NULL) { struct sctp_error_missing_param *cause; SCTP_BUF_LEN(op_err) = len; cause = mtod(op_err, struct sctp_error_missing_param *); /* Subtract the reserved param */ cause->cause.code = htons(SCTP_CAUSE_MISSING_PARAM); cause->cause.length = htons(len); cause->num_missing_params = htonl(1); cause->type[0] = htons(SCTP_STATE_COOKIE); } sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; return (-3); } asoc = &stcb->asoc; asoc->peer_supports_nat = (uint8_t)nat_friendly; /* process the peer's parameters in the INIT-ACK */ retval = sctp_process_init((struct sctp_init_chunk *)cp, stcb); if (retval < 0) { if (op_err != NULL) { sctp_m_freem(op_err); } return (retval); } initack_limit = offset + ntohs(cp->ch.chunk_length); /* load all addresses */ if ((retval = sctp_load_addresses_from_init(stcb, m, (offset + sizeof(struct sctp_init_chunk)), initack_limit, src, dst, NULL, stcb->asoc.port))) { if (op_err != NULL) { sctp_m_freem(op_err); } op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), "Problem with address parameters"); SCTPDBG(SCTP_DEBUG_INPUT1, "Load addresses from INIT causes an abort %d\n", retval); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; return (-1); } /* if the peer doesn't support asconf, flush the asconf queue */ if (asoc->asconf_supported == 0) { struct sctp_asconf_addr *param, *nparam; TAILQ_FOREACH_SAFE(param, &asoc->asconf_queue, next, nparam) { TAILQ_REMOVE(&asoc->asconf_queue, param, next); SCTP_FREE(param, SCTP_M_ASC_ADDR); } } stcb->asoc.peer_hmac_id = sctp_negotiate_hmacid(stcb->asoc.peer_hmacs, stcb->asoc.local_hmacs); if (op_err) { sctp_queue_op_err(stcb, op_err); /* queuing will steal away the mbuf chain to the out queue */ op_err = NULL; } /* extract the cookie and queue it to "echo" it back... */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; net->error_count = 0; /* * Cancel the INIT timer, We do this first before queueing the * cookie. We always cancel at the primary to assue that we are * canceling the timer started by the INIT which always goes to the * primary. */ sctp_timer_stop(SCTP_TIMER_TYPE_INIT, stcb->sctp_ep, stcb, asoc->primary_destination, SCTP_FROM_SCTP_INPUT + SCTP_LOC_3); /* calculate the RTO */ sctp_calculate_rto(stcb, asoc, net, &asoc->time_entered, SCTP_RTT_FROM_NON_DATA); retval = sctp_send_cookie_echo(m, offset, initack_limit, stcb, net); return (retval); } static void sctp_handle_heartbeat_ack(struct sctp_heartbeat_chunk *cp, struct sctp_tcb *stcb, struct sctp_nets *net) { union sctp_sockstore store; struct sctp_nets *r_net, *f_net; struct timeval tv; int req_prim = 0; uint16_t old_error_counter; if (ntohs(cp->ch.chunk_length) != sizeof(struct sctp_heartbeat_chunk)) { /* Invalid length */ return; } memset(&store, 0, sizeof(store)); switch (cp->heartbeat.hb_info.addr_family) { #ifdef INET case AF_INET: if (cp->heartbeat.hb_info.addr_len == sizeof(struct sockaddr_in)) { store.sin.sin_family = cp->heartbeat.hb_info.addr_family; store.sin.sin_len = cp->heartbeat.hb_info.addr_len; store.sin.sin_port = stcb->rport; memcpy(&store.sin.sin_addr, cp->heartbeat.hb_info.address, sizeof(store.sin.sin_addr)); } else { return; } break; #endif #ifdef INET6 case AF_INET6: if (cp->heartbeat.hb_info.addr_len == sizeof(struct sockaddr_in6)) { store.sin6.sin6_family = cp->heartbeat.hb_info.addr_family; store.sin6.sin6_len = cp->heartbeat.hb_info.addr_len; store.sin6.sin6_port = stcb->rport; memcpy(&store.sin6.sin6_addr, cp->heartbeat.hb_info.address, sizeof(struct in6_addr)); } else { return; } break; #endif default: return; } r_net = sctp_findnet(stcb, &store.sa); if (r_net == NULL) { SCTPDBG(SCTP_DEBUG_INPUT1, "Huh? I can't find the address I sent it to, discard\n"); return; } if ((r_net && (r_net->dest_state & SCTP_ADDR_UNCONFIRMED)) && (r_net->heartbeat_random1 == cp->heartbeat.hb_info.random_value1) && (r_net->heartbeat_random2 == cp->heartbeat.hb_info.random_value2)) { /* * If the its a HB and it's random value is correct when can * confirm the destination. */ r_net->dest_state &= ~SCTP_ADDR_UNCONFIRMED; if (r_net->dest_state & SCTP_ADDR_REQ_PRIMARY) { stcb->asoc.primary_destination = r_net; r_net->dest_state &= ~SCTP_ADDR_REQ_PRIMARY; f_net = TAILQ_FIRST(&stcb->asoc.nets); if (f_net != r_net) { /* * first one on the list is NOT the primary * sctp_cmpaddr() is much more efficient if * the primary is the first on the list, * make it so. */ TAILQ_REMOVE(&stcb->asoc.nets, r_net, sctp_next); TAILQ_INSERT_HEAD(&stcb->asoc.nets, r_net, sctp_next); } req_prim = 1; } sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED, stcb, 0, (void *)r_net, SCTP_SO_NOT_LOCKED); sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, r_net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_4); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, r_net); } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; old_error_counter = r_net->error_count; r_net->error_count = 0; r_net->hb_responded = 1; tv.tv_sec = cp->heartbeat.hb_info.time_value_1; tv.tv_usec = cp->heartbeat.hb_info.time_value_2; /* Now lets do a RTO with this */ sctp_calculate_rto(stcb, &stcb->asoc, r_net, &tv, SCTP_RTT_FROM_NON_DATA); if (!(r_net->dest_state & SCTP_ADDR_REACHABLE)) { r_net->dest_state |= SCTP_ADDR_REACHABLE; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb, 0, (void *)r_net, SCTP_SO_NOT_LOCKED); } if (r_net->dest_state & SCTP_ADDR_PF) { r_net->dest_state &= ~SCTP_ADDR_PF; stcb->asoc.cc_functions.sctp_cwnd_update_exit_pf(stcb, net); } if (old_error_counter > 0) { sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, r_net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_5); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, r_net); } if (r_net == stcb->asoc.primary_destination) { if (stcb->asoc.alternate) { /* release the alternate, primary is good */ sctp_free_remote_addr(stcb->asoc.alternate); stcb->asoc.alternate = NULL; } } /* Mobility adaptation */ if (req_prim) { if ((sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_BASE) || sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_FASTHANDOFF)) && sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_PRIM_DELETED)) { sctp_timer_stop(SCTP_TIMER_TYPE_PRIM_DELETED, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_INPUT + SCTP_LOC_6); if (sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_FASTHANDOFF)) { sctp_assoc_immediate_retrans(stcb, stcb->asoc.primary_destination); } if (sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_BASE)) { sctp_move_chunks_from_net(stcb, stcb->asoc.deleted_primary); } sctp_delete_prim_timer(stcb->sctp_ep, stcb); } } } static int sctp_handle_nat_colliding_state(struct sctp_tcb *stcb) { /* * Return 0 means we want you to proceed with the abort non-zero * means no abort processing. */ uint32_t new_vtag; struct sctpasochead *head; if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { new_vtag = sctp_select_a_tag(stcb->sctp_ep, stcb->sctp_ep->sctp_lport, stcb->rport, 1); atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_INP_INFO_WLOCK(); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); } else { return (0); } if (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) { /* generate a new vtag and send init */ LIST_REMOVE(stcb, sctp_asocs); stcb->asoc.my_vtag = new_vtag; head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))]; /* * put it in the bucket in the vtag hash of assoc's for the * system */ LIST_INSERT_HEAD(head, stcb, sctp_asocs); SCTP_INP_INFO_WUNLOCK(); sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); return (1); } else { /* * treat like a case where the cookie expired i.e.: - dump * current cookie. - generate a new vtag. - resend init. */ /* generate a new vtag and send init */ LIST_REMOVE(stcb, sctp_asocs); SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_WAIT); sctp_stop_all_cookie_timers(stcb); sctp_toss_old_cookies(stcb, &stcb->asoc); stcb->asoc.my_vtag = new_vtag; head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))]; /* * put it in the bucket in the vtag hash of assoc's for the * system */ LIST_INSERT_HEAD(head, stcb, sctp_asocs); SCTP_INP_INFO_WUNLOCK(); sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); return (1); } return (0); } static int sctp_handle_nat_missing_state(struct sctp_tcb *stcb, struct sctp_nets *net) { /* * return 0 means we want you to proceed with the abort non-zero * means no abort processing */ if (stcb->asoc.auth_supported == 0) { SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_nat_missing_state: Peer does not support AUTH, cannot send an asconf\n"); return (0); } sctp_asconf_send_nat_state_update(stcb, net); return (1); } /* Returns 1 if the stcb was aborted, 0 otherwise */ static int sctp_handle_abort(struct sctp_abort_chunk *abort, struct sctp_tcb *stcb, struct sctp_nets *net) { uint16_t len; uint16_t error; SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_abort: handling ABORT\n"); if (stcb == NULL) return (0); len = ntohs(abort->ch.chunk_length); if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_error_cause)) { /* * Need to check the cause codes for our two magic nat * aborts which don't kill the assoc necessarily. */ struct sctp_error_cause *cause; cause = (struct sctp_error_cause *)(abort + 1); error = ntohs(cause->code); if (error == SCTP_CAUSE_NAT_COLLIDING_STATE) { SCTPDBG(SCTP_DEBUG_INPUT2, "Received Colliding state, ABORT flags:%x\n", abort->ch.chunk_flags); if (sctp_handle_nat_colliding_state(stcb)) { return (0); } } else if (error == SCTP_CAUSE_NAT_MISSING_STATE) { SCTPDBG(SCTP_DEBUG_INPUT2, "Received missing state, ABORT flags:%x\n", abort->ch.chunk_flags); if (sctp_handle_nat_missing_state(stcb, net)) { return (0); } } } else { error = 0; } /* stop any receive timers */ sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_INPUT + SCTP_LOC_7); /* notify user of the abort and clean up... */ sctp_abort_notification(stcb, 1, error, abort, SCTP_SO_NOT_LOCKED); /* free the tcb */ SCTP_STAT_INCR_COUNTER32(sctps_aborted); if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } #ifdef SCTP_ASOCLOG_OF_TSNS sctp_print_out_track_log(stcb); #endif - SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_WAS_ABORTED); (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_8); SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_abort: finished\n"); return (1); } static void sctp_start_net_timers(struct sctp_tcb *stcb) { uint32_t cnt_hb_sent; struct sctp_nets *net; cnt_hb_sent = 0; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { /* * For each network start: 1) A pmtu timer. 2) A HB timer 3) * If the dest in unconfirmed send a hb as well if under * max_hb_burst have been sent. */ sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, stcb->sctp_ep, stcb, net); sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net); if ((net->dest_state & SCTP_ADDR_UNCONFIRMED) && (cnt_hb_sent < SCTP_BASE_SYSCTL(sctp_hb_maxburst))) { sctp_send_hb(stcb, net, SCTP_SO_NOT_LOCKED); cnt_hb_sent++; } } if (cnt_hb_sent) { sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_COOKIE_ACK, SCTP_SO_NOT_LOCKED); } } static void sctp_handle_shutdown(struct sctp_shutdown_chunk *cp, struct sctp_tcb *stcb, struct sctp_nets *net, int *abort_flag) { struct sctp_association *asoc; int some_on_streamwheel; int old_state; SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_shutdown: handling SHUTDOWN\n"); if (stcb == NULL) return; asoc = &stcb->asoc; if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { return; } if (ntohs(cp->ch.chunk_length) != sizeof(struct sctp_shutdown_chunk)) { /* Shutdown NOT the expected size */ return; } old_state = SCTP_GET_STATE(stcb); sctp_update_acked(stcb, cp, abort_flag); if (*abort_flag) { return; } if (asoc->control_pdapi) { /* * With a normal shutdown we assume the end of last record. */ SCTP_INP_READ_LOCK(stcb->sctp_ep); if (asoc->control_pdapi->on_strm_q) { struct sctp_stream_in *strm; strm = &asoc->strmin[asoc->control_pdapi->sinfo_stream]; if (asoc->control_pdapi->on_strm_q == SCTP_ON_UNORDERED) { /* Unordered */ TAILQ_REMOVE(&strm->uno_inqueue, asoc->control_pdapi, next_instrm); asoc->control_pdapi->on_strm_q = 0; } else if (asoc->control_pdapi->on_strm_q == SCTP_ON_ORDERED) { /* Ordered */ TAILQ_REMOVE(&strm->inqueue, asoc->control_pdapi, next_instrm); asoc->control_pdapi->on_strm_q = 0; #ifdef INVARIANTS } else { panic("Unknown state on ctrl:%p on_strm_q:%d", asoc->control_pdapi, asoc->control_pdapi->on_strm_q); #endif } } asoc->control_pdapi->end_added = 1; asoc->control_pdapi->pdapi_aborted = 1; asoc->control_pdapi = NULL; SCTP_INP_READ_UNLOCK(stcb->sctp_ep); if (stcb->sctp_socket) { sctp_sorwakeup(stcb->sctp_ep, stcb->sctp_socket); } } /* goto SHUTDOWN_RECEIVED state to block new requests */ if (stcb->sctp_socket) { if ((SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_RECEIVED) && (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT) && (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT)) { SCTP_SET_STATE(stcb, SCTP_STATE_SHUTDOWN_RECEIVED); /* * notify upper layer that peer has initiated a * shutdown */ sctp_ulp_notify(SCTP_NOTIFY_PEER_SHUTDOWN, stcb, 0, NULL, SCTP_SO_NOT_LOCKED); /* reset time */ (void)SCTP_GETTIME_TIMEVAL(&asoc->time_entered); } } if (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_SENT) { /* * stop the shutdown timer, since we WILL move to * SHUTDOWN-ACK-SENT. */ sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_9); } /* Now is there unsent data on a stream somewhere? */ some_on_streamwheel = sctp_is_there_unsent_data(stcb, SCTP_SO_NOT_LOCKED); if (!TAILQ_EMPTY(&asoc->send_queue) || !TAILQ_EMPTY(&asoc->sent_queue) || some_on_streamwheel) { /* By returning we will push more data out */ return; } else { /* no outstanding data to send, so move on... */ /* send SHUTDOWN-ACK */ /* move to SHUTDOWN-ACK-SENT state */ if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } if (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT) { SCTP_SET_STATE(stcb, SCTP_STATE_SHUTDOWN_ACK_SENT); sctp_stop_timers_for_shutdown(stcb); sctp_send_shutdown_ack(stcb, net); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNACK, stcb->sctp_ep, stcb, net); } else if (old_state == SCTP_STATE_SHUTDOWN_ACK_SENT) { sctp_send_shutdown_ack(stcb, net); } } } static void sctp_handle_shutdown_ack(struct sctp_shutdown_ack_chunk *cp SCTP_UNUSED, struct sctp_tcb *stcb, struct sctp_nets *net) { struct sctp_association *asoc; SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_shutdown_ack: handling SHUTDOWN ACK\n"); if (stcb == NULL) return; asoc = &stcb->asoc; /* process according to association state */ if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { /* unexpected SHUTDOWN-ACK... do OOTB handling... */ sctp_send_shutdown_complete(stcb, net, 1); SCTP_TCB_UNLOCK(stcb); return; } if ((SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT) && (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { /* unexpected SHUTDOWN-ACK... so ignore... */ SCTP_TCB_UNLOCK(stcb); return; } if (asoc->control_pdapi) { /* * With a normal shutdown we assume the end of last record. */ SCTP_INP_READ_LOCK(stcb->sctp_ep); asoc->control_pdapi->end_added = 1; asoc->control_pdapi->pdapi_aborted = 1; asoc->control_pdapi = NULL; SCTP_INP_READ_UNLOCK(stcb->sctp_ep); sctp_sorwakeup(stcb->sctp_ep, stcb->sctp_socket); } #ifdef INVARIANTS if (!TAILQ_EMPTY(&asoc->send_queue) || !TAILQ_EMPTY(&asoc->sent_queue) || sctp_is_there_unsent_data(stcb, SCTP_SO_NOT_LOCKED)) { panic("Queues are not empty when handling SHUTDOWN-ACK"); } #endif /* stop the timer */ sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_10); /* send SHUTDOWN-COMPLETE */ sctp_send_shutdown_complete(stcb, net, 0); /* notify upper layer protocol */ if (stcb->sctp_socket) { if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { stcb->sctp_socket->so_snd.sb_cc = 0; } sctp_ulp_notify(SCTP_NOTIFY_ASSOC_DOWN, stcb, 0, NULL, SCTP_SO_NOT_LOCKED); } SCTP_STAT_INCR_COUNTER32(sctps_shutdown); /* free the TCB but first save off the ep */ (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_11); } static void sctp_process_unrecog_chunk(struct sctp_tcb *stcb, uint8_t chunk_type) { switch (chunk_type) { case SCTP_ASCONF_ACK: case SCTP_ASCONF: sctp_asconf_cleanup(stcb); break; case SCTP_IFORWARD_CUM_TSN: case SCTP_FORWARD_CUM_TSN: stcb->asoc.prsctp_supported = 0; break; default: SCTPDBG(SCTP_DEBUG_INPUT2, "Peer does not support chunk type %d (0x%x).\n", chunk_type, chunk_type); break; } } /* * Skip past the param header and then we will find the param that caused the * problem. There are a number of param's in a ASCONF OR the prsctp param * these will turn of specific features. * XXX: Is this the right thing to do? */ static void sctp_process_unrecog_param(struct sctp_tcb *stcb, uint16_t parameter_type) { switch (parameter_type) { /* pr-sctp draft */ case SCTP_PRSCTP_SUPPORTED: stcb->asoc.prsctp_supported = 0; break; case SCTP_SUPPORTED_CHUNK_EXT: break; /* draft-ietf-tsvwg-addip-sctp */ case SCTP_HAS_NAT_SUPPORT: stcb->asoc.peer_supports_nat = 0; break; case SCTP_ADD_IP_ADDRESS: case SCTP_DEL_IP_ADDRESS: case SCTP_SET_PRIM_ADDR: stcb->asoc.asconf_supported = 0; break; case SCTP_SUCCESS_REPORT: case SCTP_ERROR_CAUSE_IND: SCTPDBG(SCTP_DEBUG_INPUT2, "Huh, the peer does not support success? or error cause?\n"); SCTPDBG(SCTP_DEBUG_INPUT2, "Turning off ASCONF to this strange peer\n"); stcb->asoc.asconf_supported = 0; break; default: SCTPDBG(SCTP_DEBUG_INPUT2, "Peer does not support param type %d (0x%x)??\n", parameter_type, parameter_type); break; } } static int sctp_handle_error(struct sctp_chunkhdr *ch, struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t limit) { struct sctp_error_cause *cause; struct sctp_association *asoc; uint32_t remaining_length, adjust; uint16_t code, cause_code, cause_length; /* parse through all of the errors and process */ asoc = &stcb->asoc; cause = (struct sctp_error_cause *)((caddr_t)ch + sizeof(struct sctp_chunkhdr)); remaining_length = ntohs(ch->chunk_length); if (remaining_length > limit) { remaining_length = limit; } if (remaining_length >= sizeof(struct sctp_chunkhdr)) { remaining_length -= sizeof(struct sctp_chunkhdr); } else { remaining_length = 0; } code = 0; while (remaining_length >= sizeof(struct sctp_error_cause)) { /* Process an Error Cause */ cause_code = ntohs(cause->code); cause_length = ntohs(cause->length); if ((cause_length > remaining_length) || (cause_length == 0)) { /* Invalid cause length, possibly due to truncation. */ SCTPDBG(SCTP_DEBUG_INPUT1, "Bogus length in cause - bytes left: %u cause length: %u\n", remaining_length, cause_length); return (0); } if (code == 0) { /* report the first error cause */ code = cause_code; } switch (cause_code) { case SCTP_CAUSE_INVALID_STREAM: case SCTP_CAUSE_MISSING_PARAM: case SCTP_CAUSE_INVALID_PARAM: case SCTP_CAUSE_NO_USER_DATA: SCTPDBG(SCTP_DEBUG_INPUT1, "Software error we got a %u back? We have a bug :/ (or do they?)\n", cause_code); break; case SCTP_CAUSE_NAT_COLLIDING_STATE: SCTPDBG(SCTP_DEBUG_INPUT2, "Received Colliding state, ERROR flags: %x\n", ch->chunk_flags); if (sctp_handle_nat_colliding_state(stcb)) { return (0); } break; case SCTP_CAUSE_NAT_MISSING_STATE: SCTPDBG(SCTP_DEBUG_INPUT2, "Received missing state, ERROR flags: %x\n", ch->chunk_flags); if (sctp_handle_nat_missing_state(stcb, net)) { return (0); } break; case SCTP_CAUSE_STALE_COOKIE: /* * We only act if we have echoed a cookie and are * waiting. */ if ((cause_length >= sizeof(struct sctp_error_stale_cookie)) && (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { struct sctp_error_stale_cookie *stale_cookie; stale_cookie = (struct sctp_error_stale_cookie *)cause; asoc->cookie_preserve_req = ntohl(stale_cookie->stale_time); /* Double it to be more robust on RTX */ if (asoc->cookie_preserve_req <= UINT32_MAX / 2) { asoc->cookie_preserve_req *= 2; } else { asoc->cookie_preserve_req = UINT32_MAX; } asoc->stale_cookie_count++; if (asoc->stale_cookie_count > asoc->max_init_times) { sctp_abort_notification(stcb, 0, 0, NULL, SCTP_SO_NOT_LOCKED); /* now free the asoc */ (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_12); return (-1); } /* blast back to INIT state */ sctp_toss_old_cookies(stcb, &stcb->asoc); SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_WAIT); sctp_stop_all_cookie_timers(stcb); sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); } break; case SCTP_CAUSE_UNRESOLVABLE_ADDR: /* * Nothing we can do here, we don't do hostname * addresses so if the peer does not like my IPv6 * (or IPv4 for that matter) it does not matter. If * they don't support that type of address, they can * NOT possibly get that packet type... i.e. with no * IPv6 you can't receive a IPv6 packet. so we can * safely ignore this one. If we ever added support * for HOSTNAME Addresses, then we would need to do * something here. */ break; case SCTP_CAUSE_UNRECOG_CHUNK: if (cause_length >= sizeof(struct sctp_error_unrecognized_chunk)) { struct sctp_error_unrecognized_chunk *unrec_chunk; unrec_chunk = (struct sctp_error_unrecognized_chunk *)cause; sctp_process_unrecog_chunk(stcb, unrec_chunk->ch.chunk_type); } break; case SCTP_CAUSE_UNRECOG_PARAM: /* XXX: We only consider the first parameter */ if (cause_length >= sizeof(struct sctp_error_cause) + sizeof(struct sctp_paramhdr)) { struct sctp_paramhdr *unrec_parameter; unrec_parameter = (struct sctp_paramhdr *)(cause + 1); sctp_process_unrecog_param(stcb, ntohs(unrec_parameter->param_type)); } break; case SCTP_CAUSE_COOKIE_IN_SHUTDOWN: /* * We ignore this since the timer will drive out a * new cookie anyway and there timer will drive us * to send a SHUTDOWN_COMPLETE. We can't send one * here since we don't have their tag. */ break; case SCTP_CAUSE_DELETING_LAST_ADDR: case SCTP_CAUSE_RESOURCE_SHORTAGE: case SCTP_CAUSE_DELETING_SRC_ADDR: /* * We should NOT get these here, but in a * ASCONF-ACK. */ SCTPDBG(SCTP_DEBUG_INPUT2, "Peer sends ASCONF errors in a error cause with code %u.\n", cause_code); break; case SCTP_CAUSE_OUT_OF_RESC: /* * And what, pray tell do we do with the fact that * the peer is out of resources? Not really sure we * could do anything but abort. I suspect this * should have came WITH an abort instead of in a * OP-ERROR. */ break; default: SCTPDBG(SCTP_DEBUG_INPUT1, "sctp_handle_error: unknown code 0x%x\n", cause_code); break; } adjust = SCTP_SIZE32(cause_length); if (remaining_length >= adjust) { remaining_length -= adjust; } else { remaining_length = 0; } cause = (struct sctp_error_cause *)((caddr_t)cause + adjust); } sctp_ulp_notify(SCTP_NOTIFY_REMOTE_ERROR, stcb, code, ch, SCTP_SO_NOT_LOCKED); return (0); } static int sctp_handle_init_ack(struct mbuf *m, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_init_ack_chunk *cp, struct sctp_tcb *stcb, struct sctp_nets *net, int *abort_no_unlock, uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id) { struct sctp_init_ack *init_ack; struct mbuf *op_err; SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_init_ack: handling INIT-ACK\n"); if (stcb == NULL) { SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_init_ack: TCB is null\n"); return (-1); } if (ntohs(cp->ch.chunk_length) < sizeof(struct sctp_init_ack_chunk)) { /* Invalid length */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; return (-1); } init_ack = &cp->init; /* validate parameters */ if (init_ack->initiate_tag == 0) { /* protocol error... send an abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; return (-1); } if (ntohl(init_ack->a_rwnd) < SCTP_MIN_RWND) { /* protocol error... send an abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; return (-1); } if (init_ack->num_inbound_streams == 0) { /* protocol error... send an abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; return (-1); } if (init_ack->num_outbound_streams == 0) { /* protocol error... send an abort */ op_err = sctp_generate_cause(SCTP_CAUSE_INVALID_PARAM, ""); sctp_abort_association(stcb->sctp_ep, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, net->port); *abort_no_unlock = 1; return (-1); } /* process according to association state... */ switch (SCTP_GET_STATE(stcb)) { case SCTP_STATE_COOKIE_WAIT: /* this is the expected state for this chunk */ /* process the INIT-ACK parameters */ if (stcb->asoc.primary_destination->dest_state & SCTP_ADDR_UNCONFIRMED) { /* * The primary is where we sent the INIT, we can * always consider it confirmed when the INIT-ACK is * returned. Do this before we load addresses * though. */ stcb->asoc.primary_destination->dest_state &= ~SCTP_ADDR_UNCONFIRMED; sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED, stcb, 0, (void *)stcb->asoc.primary_destination, SCTP_SO_NOT_LOCKED); } if (sctp_process_init_ack(m, iphlen, offset, src, dst, sh, cp, stcb, net, abort_no_unlock, mflowtype, mflowid, vrf_id) < 0) { /* error in parsing parameters */ return (-1); } /* update our state */ SCTPDBG(SCTP_DEBUG_INPUT2, "moving to COOKIE-ECHOED state\n"); SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_ECHOED); /* reset the RTO calc */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered); /* * collapse the init timer back in case of a exponential * backoff */ sctp_timer_start(SCTP_TIMER_TYPE_COOKIE, stcb->sctp_ep, stcb, net); /* * the send at the end of the inbound data processing will * cause the cookie to be sent */ break; case SCTP_STATE_SHUTDOWN_SENT: /* incorrect state... discard */ break; case SCTP_STATE_COOKIE_ECHOED: /* incorrect state... discard */ break; case SCTP_STATE_OPEN: /* incorrect state... discard */ break; case SCTP_STATE_EMPTY: case SCTP_STATE_INUSE: default: /* incorrect state... discard */ return (-1); break; } SCTPDBG(SCTP_DEBUG_INPUT1, "Leaving handle-init-ack end\n"); return (0); } static struct sctp_tcb * sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_state_cookie *cookie, int cookie_len, struct sctp_inpcb *inp, struct sctp_nets **netp, struct sockaddr *init_src, int *notification, int auth_skipped, uint32_t auth_offset, uint32_t auth_len, uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id, uint16_t port); /* * handle a state cookie for an existing association m: input packet mbuf * chain-- assumes a pullup on IP/SCTP/COOKIE-ECHO chunk note: this is a * "split" mbuf and the cookie signature does not exist offset: offset into * mbuf to the cookie-echo chunk */ static struct sctp_tcb * sctp_process_cookie_existing(struct mbuf *m, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_state_cookie *cookie, int cookie_len, struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets **netp, struct sockaddr *init_src, int *notification, int auth_skipped, uint32_t auth_offset, uint32_t auth_len, uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id, uint16_t port) { struct sctp_association *asoc; struct sctp_init_chunk *init_cp, init_buf; struct sctp_init_ack_chunk *initack_cp, initack_buf; struct sctp_nets *net; struct mbuf *op_err; struct timeval old; int init_offset, initack_offset, i; int retval; int spec_flag = 0; uint32_t how_indx; #if defined(SCTP_DETAILED_STR_STATS) int j; #endif net = *netp; /* I know that the TCB is non-NULL from the caller */ asoc = &stcb->asoc; for (how_indx = 0; how_indx < sizeof(asoc->cookie_how); how_indx++) { if (asoc->cookie_how[how_indx] == 0) break; } if (how_indx < sizeof(asoc->cookie_how)) { asoc->cookie_how[how_indx] = 1; } if (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_ACK_SENT) { /* SHUTDOWN came in after sending INIT-ACK */ sctp_send_shutdown_ack(stcb, stcb->asoc.primary_destination); op_err = sctp_generate_cause(SCTP_CAUSE_COOKIE_IN_SHUTDOWN, ""); sctp_send_operr_to(src, dst, sh, cookie->peers_vtag, op_err, mflowtype, mflowid, inp->fibnum, vrf_id, net->port); if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 2; return (NULL); } /* * find and validate the INIT chunk in the cookie (peer's info) the * INIT should start after the cookie-echo header struct (chunk * header, state cookie header struct) */ init_offset = offset += sizeof(struct sctp_cookie_echo_chunk); init_cp = (struct sctp_init_chunk *) sctp_m_getptr(m, init_offset, sizeof(struct sctp_init_chunk), (uint8_t *)&init_buf); if (init_cp == NULL) { /* could not pull a INIT chunk in cookie */ return (NULL); } if (init_cp->ch.chunk_type != SCTP_INITIATION) { return (NULL); } /* * find and validate the INIT-ACK chunk in the cookie (my info) the * INIT-ACK follows the INIT chunk */ initack_offset = init_offset + SCTP_SIZE32(ntohs(init_cp->ch.chunk_length)); initack_cp = (struct sctp_init_ack_chunk *) sctp_m_getptr(m, initack_offset, sizeof(struct sctp_init_ack_chunk), (uint8_t *)&initack_buf); if (initack_cp == NULL) { /* could not pull INIT-ACK chunk in cookie */ return (NULL); } if (initack_cp->ch.chunk_type != SCTP_INITIATION_ACK) { return (NULL); } if ((ntohl(initack_cp->init.initiate_tag) == asoc->my_vtag) && (ntohl(init_cp->init.initiate_tag) == asoc->peer_vtag)) { /* * case D in Section 5.2.4 Table 2: MMAA process accordingly * to get into the OPEN state */ if (ntohl(initack_cp->init.initial_tsn) != asoc->init_seq_number) { /*- * Opps, this means that we somehow generated two vtag's * the same. I.e. we did: * Us Peer * <---INIT(tag=a)------ * ----INIT-ACK(tag=t)--> * ----INIT(tag=t)------> *1 * <---INIT-ACK(tag=a)--- * <----CE(tag=t)------------- *2 * * At point *1 we should be generating a different * tag t'. Which means we would throw away the CE and send * ours instead. Basically this is case C (throw away side). */ if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 17; return (NULL); } switch (SCTP_GET_STATE(stcb)) { case SCTP_STATE_COOKIE_WAIT: case SCTP_STATE_COOKIE_ECHOED: /* * INIT was sent but got a COOKIE_ECHO with the * correct tags... just accept it...but we must * process the init so that we can make sure we have * the right seq no's. */ /* First we must process the INIT !! */ retval = sctp_process_init(init_cp, stcb); if (retval < 0) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 3; return (NULL); } /* we have already processed the INIT so no problem */ sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_13); sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_14); /* update current state */ if (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED) SCTP_STAT_INCR_COUNTER32(sctps_activeestab); else SCTP_STAT_INCR_COUNTER32(sctps_collisionestab); SCTP_SET_STATE(stcb, SCTP_STATE_OPEN); if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) { sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, NULL); } SCTP_STAT_INCR_GAUGE32(sctps_currestab); sctp_stop_all_cookie_timers(stcb); if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) && (!SCTP_IS_LISTENING(inp))) { /* * Here is where collision would go if we * did a connect() and instead got a * init/init-ack/cookie done before the * init-ack came back.. */ stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED; soisconnected(stcb->sctp_socket); } /* notify upper layer */ *notification = SCTP_NOTIFY_ASSOC_UP; /* * since we did not send a HB make sure we don't * double things */ old.tv_sec = cookie->time_entered.tv_sec; old.tv_usec = cookie->time_entered.tv_usec; net->hb_responded = 1; sctp_calculate_rto(stcb, asoc, net, &old, SCTP_RTT_FROM_NON_DATA); if (stcb->asoc.sctp_autoclose_ticks && (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE))) { sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE, inp, stcb, NULL); } break; default: /* * we're in the OPEN state (or beyond), so peer must * have simply lost the COOKIE-ACK */ break; } /* end switch */ sctp_stop_all_cookie_timers(stcb); /* * We ignore the return code here.. not sure if we should * somehow abort.. but we do have an existing asoc. This * really should not fail. */ if (sctp_load_addresses_from_init(stcb, m, init_offset + sizeof(struct sctp_init_chunk), initack_offset, src, dst, init_src, stcb->asoc.port)) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 4; return (NULL); } /* respond with a COOKIE-ACK */ sctp_toss_old_cookies(stcb, asoc); sctp_send_cookie_ack(stcb); if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 5; return (stcb); } if (ntohl(initack_cp->init.initiate_tag) != asoc->my_vtag && ntohl(init_cp->init.initiate_tag) == asoc->peer_vtag && cookie->tie_tag_my_vtag == 0 && cookie->tie_tag_peer_vtag == 0) { /* * case C in Section 5.2.4 Table 2: XMOO silently discard */ if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 6; return (NULL); } /* * If nat support, and the below and stcb is established, send back * a ABORT(colliding state) if we are established. */ if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) && (asoc->peer_supports_nat) && ((ntohl(initack_cp->init.initiate_tag) == asoc->my_vtag) && ((ntohl(init_cp->init.initiate_tag) != asoc->peer_vtag) || (asoc->peer_vtag == 0)))) { /* * Special case - Peer's support nat. We may have two init's * that we gave out the same tag on since one was not * established.. i.e. we get INIT from host-1 behind the nat * and we respond tag-a, we get a INIT from host-2 behind * the nat and we get tag-a again. Then we bring up host-1 * (or 2's) assoc, Then comes the cookie from hsot-2 (or 1). * Now we have colliding state. We must send an abort here * with colliding state indication. */ op_err = sctp_generate_cause(SCTP_CAUSE_NAT_COLLIDING_STATE, ""); sctp_send_abort(m, iphlen, src, dst, sh, 0, op_err, mflowtype, mflowid, inp->fibnum, vrf_id, port); return (NULL); } if ((ntohl(initack_cp->init.initiate_tag) == asoc->my_vtag) && ((ntohl(init_cp->init.initiate_tag) != asoc->peer_vtag) || (asoc->peer_vtag == 0))) { /* * case B in Section 5.2.4 Table 2: MXAA or MOAA my info * should be ok, re-accept peer info */ if (ntohl(initack_cp->init.initial_tsn) != asoc->init_seq_number) { /* * Extension of case C. If we hit this, then the * random number generator returned the same vtag * when we first sent our INIT-ACK and when we later * sent our INIT. The side with the seq numbers that * are different will be the one that normnally * would have hit case C. This in effect "extends" * our vtags in this collision case to be 64 bits. * The same collision could occur aka you get both * vtag and seq number the same twice in a row.. but * is much less likely. If it did happen then we * would proceed through and bring up the assoc.. we * may end up with the wrong stream setup however.. * which would be bad.. but there is no way to * tell.. until we send on a stream that does not * exist :-) */ if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 7; return (NULL); } if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 8; sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_15); sctp_stop_all_cookie_timers(stcb); /* * since we did not send a HB make sure we don't double * things */ net->hb_responded = 1; if (stcb->asoc.sctp_autoclose_ticks && sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE)) { sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE, inp, stcb, NULL); } asoc->my_rwnd = ntohl(initack_cp->init.a_rwnd); asoc->pre_open_streams = ntohs(initack_cp->init.num_outbound_streams); if (ntohl(init_cp->init.initiate_tag) != asoc->peer_vtag) { /* * Ok the peer probably discarded our data (if we * echoed a cookie+data). So anything on the * sent_queue should be marked for retransmit, we * may not get something to kick us so it COULD * still take a timeout to move these.. but it can't * hurt to mark them. */ struct sctp_tmit_chunk *chk; TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) { if (chk->sent < SCTP_DATAGRAM_RESEND) { chk->sent = SCTP_DATAGRAM_RESEND; sctp_flight_size_decrease(chk); sctp_total_flight_decrease(stcb, chk); sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); spec_flag++; } } } /* process the INIT info (peer's info) */ retval = sctp_process_init(init_cp, stcb); if (retval < 0) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 9; return (NULL); } if (sctp_load_addresses_from_init(stcb, m, init_offset + sizeof(struct sctp_init_chunk), initack_offset, src, dst, init_src, stcb->asoc.port)) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 10; return (NULL); } if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { *notification = SCTP_NOTIFY_ASSOC_UP; if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) && (!SCTP_IS_LISTENING(inp))) { stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED; soisconnected(stcb->sctp_socket); } if (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED) SCTP_STAT_INCR_COUNTER32(sctps_activeestab); else SCTP_STAT_INCR_COUNTER32(sctps_collisionestab); SCTP_STAT_INCR_GAUGE32(sctps_currestab); } else if (SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) { SCTP_STAT_INCR_COUNTER32(sctps_restartestab); } else { SCTP_STAT_INCR_COUNTER32(sctps_collisionestab); } SCTP_SET_STATE(stcb, SCTP_STATE_OPEN); if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) { sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, NULL); } sctp_stop_all_cookie_timers(stcb); sctp_toss_old_cookies(stcb, asoc); sctp_send_cookie_ack(stcb); if (spec_flag) { /* * only if we have retrans set do we do this. What * this call does is get only the COOKIE-ACK out and * then when we return the normal call to * sctp_chunk_output will get the retrans out behind * this. */ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_COOKIE_ACK, SCTP_SO_NOT_LOCKED); } if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 11; return (stcb); } if ((ntohl(initack_cp->init.initiate_tag) != asoc->my_vtag && ntohl(init_cp->init.initiate_tag) != asoc->peer_vtag) && cookie->tie_tag_my_vtag == asoc->my_vtag_nonce && cookie->tie_tag_peer_vtag == asoc->peer_vtag_nonce && cookie->tie_tag_peer_vtag != 0) { struct sctpasochead *head; if (asoc->peer_supports_nat) { /* * This is a gross gross hack. Just call the * cookie_new code since we are allowing a duplicate * association. I hope this works... */ return (sctp_process_cookie_new(m, iphlen, offset, src, dst, sh, cookie, cookie_len, inp, netp, init_src, notification, auth_skipped, auth_offset, auth_len, mflowtype, mflowid, vrf_id, port)); } /* * case A in Section 5.2.4 Table 2: XXMM (peer restarted) */ /* temp code */ if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 12; sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_16); sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_17); /* notify upper layer */ *notification = SCTP_NOTIFY_ASSOC_RESTART; atomic_add_int(&stcb->asoc.refcnt, 1); if ((SCTP_GET_STATE(stcb) != SCTP_STATE_OPEN) && (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_RECEIVED) && (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT)) { SCTP_STAT_INCR_GAUGE32(sctps_currestab); } if (SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) { SCTP_STAT_INCR_GAUGE32(sctps_restartestab); } else if (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT) { SCTP_STAT_INCR_GAUGE32(sctps_collisionestab); } if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) { SCTP_SET_STATE(stcb, SCTP_STATE_OPEN); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, NULL); } else if (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT) { /* move to OPEN state, if not in SHUTDOWN_SENT */ SCTP_SET_STATE(stcb, SCTP_STATE_OPEN); } asoc->pre_open_streams = ntohs(initack_cp->init.num_outbound_streams); asoc->init_seq_number = ntohl(initack_cp->init.initial_tsn); asoc->sending_seq = asoc->asconf_seq_out = asoc->str_reset_seq_out = asoc->init_seq_number; asoc->asconf_seq_out_acked = asoc->asconf_seq_out - 1; asoc->asconf_seq_in = asoc->last_acked_seq = asoc->init_seq_number - 1; asoc->str_reset_seq_in = asoc->init_seq_number; asoc->advanced_peer_ack_point = asoc->last_acked_seq; if (asoc->mapping_array) { memset(asoc->mapping_array, 0, asoc->mapping_array_size); } if (asoc->nr_mapping_array) { memset(asoc->nr_mapping_array, 0, asoc->mapping_array_size); } SCTP_TCB_UNLOCK(stcb); SCTP_INP_INFO_WLOCK(); SCTP_INP_WLOCK(stcb->sctp_ep); SCTP_TCB_LOCK(stcb); atomic_add_int(&stcb->asoc.refcnt, -1); /* send up all the data */ SCTP_TCB_SEND_LOCK(stcb); - sctp_report_all_outbound(stcb, 0, 1, SCTP_SO_LOCKED); + sctp_report_all_outbound(stcb, 0, SCTP_SO_LOCKED); for (i = 0; i < stcb->asoc.streamoutcnt; i++) { stcb->asoc.strmout[i].chunks_on_queues = 0; #if defined(SCTP_DETAILED_STR_STATS) for (j = 0; j < SCTP_PR_SCTP_MAX + 1; j++) { asoc->strmout[i].abandoned_sent[j] = 0; asoc->strmout[i].abandoned_unsent[j] = 0; } #else asoc->strmout[i].abandoned_sent[0] = 0; asoc->strmout[i].abandoned_unsent[0] = 0; #endif stcb->asoc.strmout[i].sid = i; stcb->asoc.strmout[i].next_mid_ordered = 0; stcb->asoc.strmout[i].next_mid_unordered = 0; stcb->asoc.strmout[i].last_msg_incomplete = 0; } /* process the INIT-ACK info (my info) */ asoc->my_vtag = ntohl(initack_cp->init.initiate_tag); asoc->my_rwnd = ntohl(initack_cp->init.a_rwnd); /* pull from vtag hash */ LIST_REMOVE(stcb, sctp_asocs); /* re-insert to new vtag position */ head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))]; /* * put it in the bucket in the vtag hash of assoc's for the * system */ LIST_INSERT_HEAD(head, stcb, sctp_asocs); SCTP_TCB_SEND_UNLOCK(stcb); SCTP_INP_WUNLOCK(stcb->sctp_ep); SCTP_INP_INFO_WUNLOCK(); asoc->total_flight = 0; asoc->total_flight_count = 0; /* process the INIT info (peer's info) */ retval = sctp_process_init(init_cp, stcb); if (retval < 0) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 13; return (NULL); } /* * since we did not send a HB make sure we don't double * things */ net->hb_responded = 1; if (sctp_load_addresses_from_init(stcb, m, init_offset + sizeof(struct sctp_init_chunk), initack_offset, src, dst, init_src, stcb->asoc.port)) { if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 14; return (NULL); } /* respond with a COOKIE-ACK */ sctp_stop_all_cookie_timers(stcb); sctp_toss_old_cookies(stcb, asoc); sctp_send_cookie_ack(stcb); if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 15; return (stcb); } if (how_indx < sizeof(asoc->cookie_how)) asoc->cookie_how[how_indx] = 16; /* all other cases... */ return (NULL); } /* * handle a state cookie for a new association m: input packet mbuf chain-- * assumes a pullup on IP/SCTP/COOKIE-ECHO chunk note: this is a "split" mbuf * and the cookie signature does not exist offset: offset into mbuf to the * cookie-echo chunk length: length of the cookie chunk to: where the init * was from returns a new TCB */ static struct sctp_tcb * sctp_process_cookie_new(struct mbuf *m, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_state_cookie *cookie, int cookie_len, struct sctp_inpcb *inp, struct sctp_nets **netp, struct sockaddr *init_src, int *notification, int auth_skipped, uint32_t auth_offset, uint32_t auth_len, uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id, uint16_t port) { struct sctp_tcb *stcb; struct sctp_init_chunk *init_cp, init_buf; struct sctp_init_ack_chunk *initack_cp, initack_buf; union sctp_sockstore store; struct sctp_association *asoc; int init_offset, initack_offset, initack_limit; int retval; int error = 0; uint8_t auth_chunk_buf[SCTP_CHUNK_BUFFER_SIZE]; /* * find and validate the INIT chunk in the cookie (peer's info) the * INIT should start after the cookie-echo header struct (chunk * header, state cookie header struct) */ init_offset = offset + sizeof(struct sctp_cookie_echo_chunk); init_cp = (struct sctp_init_chunk *) sctp_m_getptr(m, init_offset, sizeof(struct sctp_init_chunk), (uint8_t *)&init_buf); if (init_cp == NULL) { /* could not pull a INIT chunk in cookie */ SCTPDBG(SCTP_DEBUG_INPUT1, "process_cookie_new: could not pull INIT chunk hdr\n"); return (NULL); } if (init_cp->ch.chunk_type != SCTP_INITIATION) { SCTPDBG(SCTP_DEBUG_INPUT1, "HUH? process_cookie_new: could not find INIT chunk!\n"); return (NULL); } initack_offset = init_offset + SCTP_SIZE32(ntohs(init_cp->ch.chunk_length)); /* * find and validate the INIT-ACK chunk in the cookie (my info) the * INIT-ACK follows the INIT chunk */ initack_cp = (struct sctp_init_ack_chunk *) sctp_m_getptr(m, initack_offset, sizeof(struct sctp_init_ack_chunk), (uint8_t *)&initack_buf); if (initack_cp == NULL) { /* could not pull INIT-ACK chunk in cookie */ SCTPDBG(SCTP_DEBUG_INPUT1, "process_cookie_new: could not pull INIT-ACK chunk hdr\n"); return (NULL); } if (initack_cp->ch.chunk_type != SCTP_INITIATION_ACK) { return (NULL); } /* * NOTE: We can't use the INIT_ACK's chk_length to determine the * "initack_limit" value. This is because the chk_length field * includes the length of the cookie, but the cookie is omitted when * the INIT and INIT_ACK are tacked onto the cookie... */ initack_limit = offset + cookie_len; /* * now that we know the INIT/INIT-ACK are in place, create a new TCB * and popluate */ /* * Here we do a trick, we set in NULL for the proc/thread argument. * We do this since in effect we only use the p argument when the * socket is unbound and we must do an implicit bind. Since we are * getting a cookie, we cannot be unbound. */ stcb = sctp_aloc_assoc(inp, init_src, &error, ntohl(initack_cp->init.initiate_tag), vrf_id, ntohs(initack_cp->init.num_outbound_streams), port, (struct thread *)NULL, SCTP_DONT_INITIALIZE_AUTH_PARAMS); if (stcb == NULL) { struct mbuf *op_err; /* memory problem? */ SCTPDBG(SCTP_DEBUG_INPUT1, "process_cookie_new: no room for another TCB!\n"); op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); sctp_abort_association(inp, (struct sctp_tcb *)NULL, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); return (NULL); } /* get the correct sctp_nets */ if (netp) *netp = sctp_findnet(stcb, init_src); asoc = &stcb->asoc; /* get scope variables out of cookie */ asoc->scope.ipv4_local_scope = cookie->ipv4_scope; asoc->scope.site_scope = cookie->site_scope; asoc->scope.local_scope = cookie->local_scope; asoc->scope.loopback_scope = cookie->loopback_scope; if ((asoc->scope.ipv4_addr_legal != cookie->ipv4_addr_legal) || (asoc->scope.ipv6_addr_legal != cookie->ipv6_addr_legal)) { struct mbuf *op_err; /* * Houston we have a problem. The EP changed while the * cookie was in flight. Only recourse is to abort the * association. */ op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); sctp_abort_association(inp, (struct sctp_tcb *)NULL, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_18); return (NULL); } /* process the INIT-ACK info (my info) */ asoc->my_vtag = ntohl(initack_cp->init.initiate_tag); asoc->my_rwnd = ntohl(initack_cp->init.a_rwnd); asoc->pre_open_streams = ntohs(initack_cp->init.num_outbound_streams); asoc->init_seq_number = ntohl(initack_cp->init.initial_tsn); asoc->sending_seq = asoc->asconf_seq_out = asoc->str_reset_seq_out = asoc->init_seq_number; asoc->asconf_seq_out_acked = asoc->asconf_seq_out - 1; asoc->asconf_seq_in = asoc->last_acked_seq = asoc->init_seq_number - 1; asoc->str_reset_seq_in = asoc->init_seq_number; asoc->advanced_peer_ack_point = asoc->last_acked_seq; /* process the INIT info (peer's info) */ if (netp) retval = sctp_process_init(init_cp, stcb); else retval = 0; if (retval < 0) { (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_19); return (NULL); } /* load all addresses */ if (sctp_load_addresses_from_init(stcb, m, init_offset + sizeof(struct sctp_init_chunk), initack_offset, src, dst, init_src, port)) { (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_20); return (NULL); } /* * verify any preceding AUTH chunk that was skipped */ /* pull the local authentication parameters from the cookie/init-ack */ sctp_auth_get_cookie_params(stcb, m, initack_offset + sizeof(struct sctp_init_ack_chunk), initack_limit - (initack_offset + sizeof(struct sctp_init_ack_chunk))); if (auth_skipped) { struct sctp_auth_chunk *auth; if (auth_len <= SCTP_CHUNK_BUFFER_SIZE) { auth = (struct sctp_auth_chunk *)sctp_m_getptr(m, auth_offset, auth_len, auth_chunk_buf); } else { auth = NULL; } if ((auth == NULL) || sctp_handle_auth(stcb, auth, m, auth_offset)) { /* auth HMAC failed, dump the assoc and packet */ SCTPDBG(SCTP_DEBUG_AUTH1, "COOKIE-ECHO: AUTH failed\n"); (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_21); return (NULL); } else { /* remaining chunks checked... good to go */ stcb->asoc.authenticated = 1; } } /* * if we're doing ASCONFs, check to see if we have any new local * addresses that need to get added to the peer (eg. addresses * changed while cookie echo in flight). This needs to be done * after we go to the OPEN state to do the correct asconf * processing. else, make sure we have the correct addresses in our * lists */ /* warning, we re-use sin, sin6, sa_store here! */ /* pull in local_address (our "from" address) */ switch (cookie->laddr_type) { #ifdef INET case SCTP_IPV4_ADDRESS: /* source addr is IPv4 */ memset(&store.sin, 0, sizeof(struct sockaddr_in)); store.sin.sin_family = AF_INET; store.sin.sin_len = sizeof(struct sockaddr_in); store.sin.sin_addr.s_addr = cookie->laddress[0]; break; #endif #ifdef INET6 case SCTP_IPV6_ADDRESS: /* source addr is IPv6 */ memset(&store.sin6, 0, sizeof(struct sockaddr_in6)); store.sin6.sin6_family = AF_INET6; store.sin6.sin6_len = sizeof(struct sockaddr_in6); store.sin6.sin6_scope_id = cookie->scope_id; memcpy(&store.sin6.sin6_addr, cookie->laddress, sizeof(struct in6_addr)); break; #endif default: (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_22); return (NULL); } /* update current state */ SCTPDBG(SCTP_DEBUG_INPUT2, "moving to OPEN state\n"); SCTP_SET_STATE(stcb, SCTP_STATE_OPEN); if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) { sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, NULL); } sctp_stop_all_cookie_timers(stcb); SCTP_STAT_INCR_COUNTER32(sctps_passiveestab); SCTP_STAT_INCR_GAUGE32(sctps_currestab); /* set up to notify upper layer */ *notification = SCTP_NOTIFY_ASSOC_UP; if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) && (!SCTP_IS_LISTENING(inp))) { /* * This is an endpoint that called connect() how it got a * cookie that is NEW is a bit of a mystery. It must be that * the INIT was sent, but before it got there.. a complete * INIT/INIT-ACK/COOKIE arrived. But of course then it * should have went to the other code.. not here.. oh well.. * a bit of protection is worth having.. */ stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED; soisconnected(stcb->sctp_socket); } else if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && (SCTP_IS_LISTENING(inp))) { /* * We don't want to do anything with this one. Since it is * the listening guy. The timer will get started for * accepted connections in the caller. */ ; } /* since we did not send a HB make sure we don't double things */ if ((netp) && (*netp)) (*netp)->hb_responded = 1; if (stcb->asoc.sctp_autoclose_ticks && sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE)) { sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE, inp, stcb, NULL); } (void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered); if ((netp != NULL) && (*netp != NULL)) { struct timeval old; /* calculate the RTT and set the encaps port */ old.tv_sec = cookie->time_entered.tv_sec; old.tv_usec = cookie->time_entered.tv_usec; sctp_calculate_rto(stcb, asoc, *netp, &old, SCTP_RTT_FROM_NON_DATA); } /* respond with a COOKIE-ACK */ sctp_send_cookie_ack(stcb); /* * check the address lists for any ASCONFs that need to be sent * AFTER the cookie-ack is sent */ sctp_check_address_list(stcb, m, initack_offset + sizeof(struct sctp_init_ack_chunk), initack_limit - (initack_offset + sizeof(struct sctp_init_ack_chunk)), &store.sa, cookie->local_scope, cookie->site_scope, cookie->ipv4_scope, cookie->loopback_scope); return (stcb); } /* * CODE LIKE THIS NEEDS TO RUN IF the peer supports the NAT extension, i.e * we NEED to make sure we are not already using the vtag. If so we * need to send back an ABORT-TRY-AGAIN-WITH-NEW-TAG No middle box bit! head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(tag, SCTP_BASE_INFO(hashasocmark))]; LIST_FOREACH(stcb, head, sctp_asocs) { if ((stcb->asoc.my_vtag == tag) && (stcb->rport == rport) && (inp == stcb->sctp_ep)) { -- SEND ABORT - TRY AGAIN -- } } */ /* * handles a COOKIE-ECHO message stcb: modified to either a new or left as * existing (non-NULL) TCB */ static struct mbuf * sctp_handle_cookie_echo(struct mbuf *m, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_cookie_echo_chunk *cp, struct sctp_inpcb **inp_p, struct sctp_tcb **stcb, struct sctp_nets **netp, int auth_skipped, uint32_t auth_offset, uint32_t auth_len, struct sctp_tcb **locked_tcb, uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id, uint16_t port) { struct sctp_state_cookie *cookie; struct sctp_tcb *l_stcb = *stcb; struct sctp_inpcb *l_inp; struct sockaddr *to; struct sctp_pcb *ep; struct mbuf *m_sig; uint8_t calc_sig[SCTP_SIGNATURE_SIZE], tmp_sig[SCTP_SIGNATURE_SIZE]; uint8_t *sig; uint8_t cookie_ok = 0; unsigned int sig_offset, cookie_offset; unsigned int cookie_len; struct timeval now; struct timeval time_expires; int notification = 0; struct sctp_nets *netl; int had_a_existing_tcb = 0; int send_int_conf = 0; #ifdef INET struct sockaddr_in sin; #endif #ifdef INET6 struct sockaddr_in6 sin6; #endif SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_cookie: handling COOKIE-ECHO\n"); if (inp_p == NULL) { return (NULL); } cookie = &cp->cookie; cookie_offset = offset + sizeof(struct sctp_chunkhdr); cookie_len = ntohs(cp->ch.chunk_length); if (cookie_len < sizeof(struct sctp_cookie_echo_chunk) + sizeof(struct sctp_init_chunk) + sizeof(struct sctp_init_ack_chunk) + SCTP_SIGNATURE_SIZE) { /* cookie too small */ return (NULL); } if ((cookie->peerport != sh->src_port) || (cookie->myport != sh->dest_port) || (cookie->my_vtag != sh->v_tag)) { /* * invalid ports or bad tag. Note that we always leave the * v_tag in the header in network order and when we stored * it in the my_vtag slot we also left it in network order. * This maintains the match even though it may be in the * opposite byte order of the machine :-> */ return (NULL); } /* * split off the signature into its own mbuf (since it should not be * calculated in the sctp_hmac_m() call). */ sig_offset = offset + cookie_len - SCTP_SIGNATURE_SIZE; m_sig = m_split(m, sig_offset, M_NOWAIT); if (m_sig == NULL) { /* out of memory or ?? */ return (NULL); } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(m_sig, SCTP_MBUF_SPLIT); } #endif /* * compute the signature/digest for the cookie */ ep = &(*inp_p)->sctp_ep; l_inp = *inp_p; if (l_stcb) { SCTP_TCB_UNLOCK(l_stcb); } SCTP_INP_RLOCK(l_inp); if (l_stcb) { SCTP_TCB_LOCK(l_stcb); } /* which cookie is it? */ if ((cookie->time_entered.tv_sec < (long)ep->time_of_secret_change) && (ep->current_secret_number != ep->last_secret_number)) { /* it's the old cookie */ (void)sctp_hmac_m(SCTP_HMAC, (uint8_t *)ep->secret_key[(int)ep->last_secret_number], SCTP_SECRET_SIZE, m, cookie_offset, calc_sig, 0); } else { /* it's the current cookie */ (void)sctp_hmac_m(SCTP_HMAC, (uint8_t *)ep->secret_key[(int)ep->current_secret_number], SCTP_SECRET_SIZE, m, cookie_offset, calc_sig, 0); } /* get the signature */ SCTP_INP_RUNLOCK(l_inp); sig = (uint8_t *)sctp_m_getptr(m_sig, 0, SCTP_SIGNATURE_SIZE, (uint8_t *)&tmp_sig); if (sig == NULL) { /* couldn't find signature */ sctp_m_freem(m_sig); return (NULL); } /* compare the received digest with the computed digest */ if (timingsafe_bcmp(calc_sig, sig, SCTP_SIGNATURE_SIZE) != 0) { /* try the old cookie? */ if ((cookie->time_entered.tv_sec == (long)ep->time_of_secret_change) && (ep->current_secret_number != ep->last_secret_number)) { /* compute digest with old */ (void)sctp_hmac_m(SCTP_HMAC, (uint8_t *)ep->secret_key[(int)ep->last_secret_number], SCTP_SECRET_SIZE, m, cookie_offset, calc_sig, 0); /* compare */ if (timingsafe_bcmp(calc_sig, sig, SCTP_SIGNATURE_SIZE) == 0) cookie_ok = 1; } } else { cookie_ok = 1; } /* * Now before we continue we must reconstruct our mbuf so that * normal processing of any other chunks will work. */ { struct mbuf *m_at; m_at = m; while (SCTP_BUF_NEXT(m_at) != NULL) { m_at = SCTP_BUF_NEXT(m_at); } SCTP_BUF_NEXT(m_at) = m_sig; } if (cookie_ok == 0) { SCTPDBG(SCTP_DEBUG_INPUT2, "handle_cookie_echo: cookie signature validation failed!\n"); SCTPDBG(SCTP_DEBUG_INPUT2, "offset = %u, cookie_offset = %u, sig_offset = %u\n", (uint32_t)offset, cookie_offset, sig_offset); return (NULL); } /* * check the cookie timestamps to be sure it's not stale */ (void)SCTP_GETTIME_TIMEVAL(&now); /* Expire time is in Ticks, so we convert to seconds */ time_expires.tv_sec = cookie->time_entered.tv_sec + sctp_ticks_to_secs(cookie->cookie_life); time_expires.tv_usec = cookie->time_entered.tv_usec; if (timevalcmp(&now, &time_expires, >)) { /* cookie is stale! */ struct mbuf *op_err; struct sctp_error_stale_cookie *cause; struct timeval diff; uint32_t staleness; op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_error_stale_cookie), 0, M_NOWAIT, 1, MT_DATA); if (op_err == NULL) { /* FOOBAR */ return (NULL); } /* Set the len */ SCTP_BUF_LEN(op_err) = sizeof(struct sctp_error_stale_cookie); cause = mtod(op_err, struct sctp_error_stale_cookie *); cause->cause.code = htons(SCTP_CAUSE_STALE_COOKIE); cause->cause.length = htons((sizeof(struct sctp_paramhdr) + (sizeof(uint32_t)))); diff = now; timevalsub(&diff, &time_expires); if ((uint32_t)diff.tv_sec > UINT32_MAX / 1000000) { staleness = UINT32_MAX; } else { staleness = diff.tv_sec * 1000000; } if (UINT32_MAX - staleness >= (uint32_t)diff.tv_usec) { staleness += diff.tv_usec; } else { staleness = UINT32_MAX; } cause->stale_time = htonl(staleness); sctp_send_operr_to(src, dst, sh, cookie->peers_vtag, op_err, mflowtype, mflowid, l_inp->fibnum, vrf_id, port); return (NULL); } /* * Now we must see with the lookup address if we have an existing * asoc. This will only happen if we were in the COOKIE-WAIT state * and a INIT collided with us and somewhere the peer sent the * cookie on another address besides the single address our assoc * had for him. In this case we will have one of the tie-tags set at * least AND the address field in the cookie can be used to look it * up. */ to = NULL; switch (cookie->addr_type) { #ifdef INET6 case SCTP_IPV6_ADDRESS: memset(&sin6, 0, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(sin6); sin6.sin6_port = sh->src_port; sin6.sin6_scope_id = cookie->scope_id; memcpy(&sin6.sin6_addr.s6_addr, cookie->address, sizeof(sin6.sin6_addr.s6_addr)); to = (struct sockaddr *)&sin6; break; #endif #ifdef INET case SCTP_IPV4_ADDRESS: memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); sin.sin_port = sh->src_port; sin.sin_addr.s_addr = cookie->address[0]; to = (struct sockaddr *)&sin; break; #endif default: /* This should not happen */ return (NULL); } if (*stcb == NULL) { /* Yep, lets check */ *stcb = sctp_findassociation_ep_addr(inp_p, to, netp, dst, NULL); if (*stcb == NULL) { /* * We should have only got back the same inp. If we * got back a different ep we have a problem. The * original findep got back l_inp and now */ if (l_inp != *inp_p) { SCTP_PRINTF("Bad problem find_ep got a diff inp then special_locate?\n"); } } else { if (*locked_tcb == NULL) { /* * In this case we found the assoc only * after we locked the create lock. This * means we are in a colliding case and we * must make sure that we unlock the tcb if * its one of the cases where we throw away * the incoming packets. */ *locked_tcb = *stcb; /* * We must also increment the inp ref count * since the ref_count flags was set when we * did not find the TCB, now we found it * which reduces the refcount.. we must * raise it back out to balance it all :-) */ SCTP_INP_INCR_REF((*stcb)->sctp_ep); if ((*stcb)->sctp_ep != l_inp) { SCTP_PRINTF("Huh? ep:%p diff then l_inp:%p?\n", (void *)(*stcb)->sctp_ep, (void *)l_inp); } } } } cookie_len -= SCTP_SIGNATURE_SIZE; if (*stcb == NULL) { /* this is the "normal" case... get a new TCB */ *stcb = sctp_process_cookie_new(m, iphlen, offset, src, dst, sh, cookie, cookie_len, *inp_p, netp, to, ¬ification, auth_skipped, auth_offset, auth_len, mflowtype, mflowid, vrf_id, port); } else { /* this is abnormal... cookie-echo on existing TCB */ had_a_existing_tcb = 1; *stcb = sctp_process_cookie_existing(m, iphlen, offset, src, dst, sh, cookie, cookie_len, *inp_p, *stcb, netp, to, ¬ification, auth_skipped, auth_offset, auth_len, mflowtype, mflowid, vrf_id, port); } if (*stcb == NULL) { /* still no TCB... must be bad cookie-echo */ return (NULL); } if (*netp != NULL) { (*netp)->flowtype = mflowtype; (*netp)->flowid = mflowid; } /* * Ok, we built an association so confirm the address we sent the * INIT-ACK to. */ netl = sctp_findnet(*stcb, to); /* * This code should in theory NOT run but */ if (netl == NULL) { /* TSNH! Huh, why do I need to add this address here? */ if (sctp_add_remote_addr(*stcb, to, NULL, port, SCTP_DONOT_SETSCOPE, SCTP_IN_COOKIE_PROC)) { return (NULL); } netl = sctp_findnet(*stcb, to); } if (netl) { if (netl->dest_state & SCTP_ADDR_UNCONFIRMED) { netl->dest_state &= ~SCTP_ADDR_UNCONFIRMED; (void)sctp_set_primary_addr((*stcb), (struct sockaddr *)NULL, netl); send_int_conf = 1; } } sctp_start_net_timers(*stcb); if ((*inp_p)->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) { if (!had_a_existing_tcb || (((*inp_p)->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) == 0)) { /* * If we have a NEW cookie or the connect never * reached the connected state during collision we * must do the TCP accept thing. */ struct socket *so, *oso; struct sctp_inpcb *inp; if (notification == SCTP_NOTIFY_ASSOC_RESTART) { /* * For a restart we will keep the same * socket, no need to do anything. I THINK!! */ sctp_ulp_notify(notification, *stcb, 0, NULL, SCTP_SO_NOT_LOCKED); if (send_int_conf) { sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED, (*stcb), 0, (void *)netl, SCTP_SO_NOT_LOCKED); } return (m); } oso = (*inp_p)->sctp_socket; atomic_add_int(&(*stcb)->asoc.refcnt, 1); SCTP_TCB_UNLOCK((*stcb)); CURVNET_SET(oso->so_vnet); so = sonewconn(oso, 0 ); CURVNET_RESTORE(); SCTP_TCB_LOCK((*stcb)); atomic_subtract_int(&(*stcb)->asoc.refcnt, 1); if (so == NULL) { struct mbuf *op_err; /* Too many sockets */ SCTPDBG(SCTP_DEBUG_INPUT1, "process_cookie_new: no room for another socket!\n"); op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); sctp_abort_association(*inp_p, NULL, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); (void)sctp_free_assoc(*inp_p, *stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_23); return (NULL); } inp = (struct sctp_inpcb *)so->so_pcb; SCTP_INP_INCR_REF(inp); /* * We add the unbound flag here so that if we get an * soabort() before we get the move_pcb done, we * will properly cleanup. */ inp->sctp_flags = (SCTP_PCB_FLAGS_TCPTYPE | SCTP_PCB_FLAGS_CONNECTED | SCTP_PCB_FLAGS_IN_TCPPOOL | SCTP_PCB_FLAGS_UNBOUND | (SCTP_PCB_COPY_FLAGS & (*inp_p)->sctp_flags) | SCTP_PCB_FLAGS_DONT_WAKE); inp->sctp_features = (*inp_p)->sctp_features; inp->sctp_mobility_features = (*inp_p)->sctp_mobility_features; inp->sctp_socket = so; inp->sctp_frag_point = (*inp_p)->sctp_frag_point; inp->max_cwnd = (*inp_p)->max_cwnd; inp->sctp_cmt_on_off = (*inp_p)->sctp_cmt_on_off; inp->ecn_supported = (*inp_p)->ecn_supported; inp->prsctp_supported = (*inp_p)->prsctp_supported; inp->auth_supported = (*inp_p)->auth_supported; inp->asconf_supported = (*inp_p)->asconf_supported; inp->reconfig_supported = (*inp_p)->reconfig_supported; inp->nrsack_supported = (*inp_p)->nrsack_supported; inp->pktdrop_supported = (*inp_p)->pktdrop_supported; inp->partial_delivery_point = (*inp_p)->partial_delivery_point; inp->sctp_context = (*inp_p)->sctp_context; inp->local_strreset_support = (*inp_p)->local_strreset_support; inp->fibnum = (*inp_p)->fibnum; inp->inp_starting_point_for_iterator = NULL; /* * copy in the authentication parameters from the * original endpoint */ if (inp->sctp_ep.local_hmacs) sctp_free_hmaclist(inp->sctp_ep.local_hmacs); inp->sctp_ep.local_hmacs = sctp_copy_hmaclist((*inp_p)->sctp_ep.local_hmacs); if (inp->sctp_ep.local_auth_chunks) sctp_free_chunklist(inp->sctp_ep.local_auth_chunks); inp->sctp_ep.local_auth_chunks = sctp_copy_chunklist((*inp_p)->sctp_ep.local_auth_chunks); /* * Now we must move it from one hash table to * another and get the tcb in the right place. */ /* * This is where the one-2-one socket is put into * the accept state waiting for the accept! */ if (*stcb) { SCTP_ADD_SUBSTATE(*stcb, SCTP_STATE_IN_ACCEPT_QUEUE); } sctp_move_pcb_and_assoc(*inp_p, inp, *stcb); atomic_add_int(&(*stcb)->asoc.refcnt, 1); SCTP_TCB_UNLOCK((*stcb)); sctp_pull_off_control_to_new_inp((*inp_p), inp, *stcb, 0); SCTP_TCB_LOCK((*stcb)); atomic_subtract_int(&(*stcb)->asoc.refcnt, 1); /* * now we must check to see if we were aborted while * the move was going on and the lock/unlock * happened. */ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { /* * yep it was, we leave the assoc attached * to the socket since the sctp_inpcb_free() * call will send an abort for us. */ SCTP_INP_DECR_REF(inp); return (NULL); } SCTP_INP_DECR_REF(inp); /* Switch over to the new guy */ *inp_p = inp; sctp_ulp_notify(notification, *stcb, 0, NULL, SCTP_SO_NOT_LOCKED); if (send_int_conf) { sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED, (*stcb), 0, (void *)netl, SCTP_SO_NOT_LOCKED); } /* * Pull it from the incomplete queue and wake the * guy */ soisconnected(so); return (m); } } if (notification) { sctp_ulp_notify(notification, *stcb, 0, NULL, SCTP_SO_NOT_LOCKED); } if (send_int_conf) { sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_CONFIRMED, (*stcb), 0, (void *)netl, SCTP_SO_NOT_LOCKED); } return (m); } static void sctp_handle_cookie_ack(struct sctp_cookie_ack_chunk *cp SCTP_UNUSED, struct sctp_tcb *stcb, struct sctp_nets *net) { /* cp must not be used, others call this without a c-ack :-) */ struct sctp_association *asoc; struct sctp_tmit_chunk *chk; SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_cookie_ack: handling COOKIE-ACK\n"); if ((stcb == NULL) || (net == NULL)) { return; } asoc = &stcb->asoc; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, asoc->overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } asoc->overall_error_count = 0; sctp_stop_all_cookie_timers(stcb); /* process according to association state */ if (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED) { /* state change only needed when I am in right state */ SCTPDBG(SCTP_DEBUG_INPUT2, "moving to OPEN state\n"); SCTP_SET_STATE(stcb, SCTP_STATE_OPEN); sctp_start_net_timers(stcb); if (asoc->state & SCTP_STATE_SHUTDOWN_PENDING) { sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, NULL); } /* update RTO */ SCTP_STAT_INCR_COUNTER32(sctps_activeestab); SCTP_STAT_INCR_GAUGE32(sctps_currestab); if (asoc->overall_error_count == 0) { sctp_calculate_rto(stcb, asoc, net, &asoc->time_entered, SCTP_RTT_FROM_NON_DATA); } (void)SCTP_GETTIME_TIMEVAL(&asoc->time_entered); sctp_ulp_notify(SCTP_NOTIFY_ASSOC_UP, stcb, 0, NULL, SCTP_SO_NOT_LOCKED); if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED; if ((stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) == 0) { soisconnected(stcb->sctp_socket); } } /* * since we did not send a HB make sure we don't double * things */ net->hb_responded = 1; if (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET) { /* * We don't need to do the asconf thing, nor hb or * autoclose if the socket is closed. */ goto closed_socket; } sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net); if (stcb->asoc.sctp_autoclose_ticks && sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_AUTOCLOSE)) { sctp_timer_start(SCTP_TIMER_TYPE_AUTOCLOSE, stcb->sctp_ep, stcb, NULL); } /* * send ASCONF if parameters are pending and ASCONFs are * allowed (eg. addresses changed when init/cookie echo were * in flight) */ if ((sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_DO_ASCONF)) && (stcb->asoc.asconf_supported == 1) && (!TAILQ_EMPTY(&stcb->asoc.asconf_queue))) { #ifdef SCTP_TIMER_BASED_ASCONF sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, stcb->sctp_ep, stcb, stcb->asoc.primary_destination); #else sctp_send_asconf(stcb, stcb->asoc.primary_destination, SCTP_ADDR_NOT_LOCKED); #endif } } closed_socket: /* Toss the cookie if I can */ sctp_toss_old_cookies(stcb, asoc); /* Restart the timer if we have pending data */ TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) { if (chk->whoTo != NULL) { break; } } if (chk != NULL) { sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, chk->whoTo); } } static void sctp_handle_ecn_echo(struct sctp_ecne_chunk *cp, struct sctp_tcb *stcb) { struct sctp_nets *net; struct sctp_tmit_chunk *lchk; struct sctp_ecne_chunk bkup; uint8_t override_bit; uint32_t tsn, window_data_tsn; int len; unsigned int pkt_cnt; len = ntohs(cp->ch.chunk_length); if ((len != sizeof(struct sctp_ecne_chunk)) && (len != sizeof(struct old_sctp_ecne_chunk))) { return; } if (len == sizeof(struct old_sctp_ecne_chunk)) { /* Its the old format */ memcpy(&bkup, cp, sizeof(struct old_sctp_ecne_chunk)); bkup.num_pkts_since_cwr = htonl(1); cp = &bkup; } SCTP_STAT_INCR(sctps_recvecne); tsn = ntohl(cp->tsn); pkt_cnt = ntohl(cp->num_pkts_since_cwr); lchk = TAILQ_LAST(&stcb->asoc.send_queue, sctpchunk_listhead); if (lchk == NULL) { window_data_tsn = stcb->asoc.sending_seq - 1; } else { window_data_tsn = lchk->rec.data.tsn; } /* Find where it was sent to if possible. */ net = NULL; TAILQ_FOREACH(lchk, &stcb->asoc.sent_queue, sctp_next) { if (lchk->rec.data.tsn == tsn) { net = lchk->whoTo; net->ecn_prev_cwnd = lchk->rec.data.cwnd_at_send; break; } if (SCTP_TSN_GT(lchk->rec.data.tsn, tsn)) { break; } } if (net == NULL) { /* * What to do. A previous send of a CWR was possibly lost. * See how old it is, we may have it marked on the actual * net. */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (tsn == net->last_cwr_tsn) { /* Found him, send it off */ break; } } if (net == NULL) { /* * If we reach here, we need to send a special CWR * that says hey, we did this a long time ago and * you lost the response. */ net = TAILQ_FIRST(&stcb->asoc.nets); if (net == NULL) { /* TSNH */ return; } override_bit = SCTP_CWR_REDUCE_OVERRIDE; } else { override_bit = 0; } } else { override_bit = 0; } if (SCTP_TSN_GT(tsn, net->cwr_window_tsn) && ((override_bit & SCTP_CWR_REDUCE_OVERRIDE) == 0)) { /* * JRS - Use the congestion control given in the pluggable * CC module */ stcb->asoc.cc_functions.sctp_cwnd_update_after_ecn_echo(stcb, net, 0, pkt_cnt); /* * We reduce once every RTT. So we will only lower cwnd at * the next sending seq i.e. the window_data_tsn */ net->cwr_window_tsn = window_data_tsn; net->ecn_ce_pkt_cnt += pkt_cnt; net->lost_cnt = pkt_cnt; net->last_cwr_tsn = tsn; } else { override_bit |= SCTP_CWR_IN_SAME_WINDOW; if (SCTP_TSN_GT(tsn, net->last_cwr_tsn) && ((override_bit & SCTP_CWR_REDUCE_OVERRIDE) == 0)) { /* * Another loss in the same window update how many * marks/packets lost we have had. */ int cnt = 1; if (pkt_cnt > net->lost_cnt) { /* Should be the case */ cnt = (pkt_cnt - net->lost_cnt); net->ecn_ce_pkt_cnt += cnt; } net->lost_cnt = pkt_cnt; net->last_cwr_tsn = tsn; /* * Most CC functions will ignore this call, since we * are in-window yet of the initial CE the peer saw. */ stcb->asoc.cc_functions.sctp_cwnd_update_after_ecn_echo(stcb, net, 1, cnt); } } /* * We always send a CWR this way if our previous one was lost our * peer will get an update, or if it is not time again to reduce we * still get the cwr to the peer. Note we set the override when we * could not find the TSN on the chunk or the destination network. */ sctp_send_cwr(stcb, net, net->last_cwr_tsn, override_bit); } static void sctp_handle_ecn_cwr(struct sctp_cwr_chunk *cp, struct sctp_tcb *stcb, struct sctp_nets *net) { /* * Here we get a CWR from the peer. We must look in the outqueue and * make sure that we have a covered ECNE in the control chunk part. * If so remove it. */ struct sctp_tmit_chunk *chk, *nchk; struct sctp_ecne_chunk *ecne; int override; uint32_t cwr_tsn; cwr_tsn = ntohl(cp->tsn); override = cp->ch.chunk_flags & SCTP_CWR_REDUCE_OVERRIDE; TAILQ_FOREACH_SAFE(chk, &stcb->asoc.control_send_queue, sctp_next, nchk) { if (chk->rec.chunk_id.id != SCTP_ECN_ECHO) { continue; } if ((override == 0) && (chk->whoTo != net)) { /* Must be from the right src unless override is set */ continue; } ecne = mtod(chk->data, struct sctp_ecne_chunk *); if (SCTP_TSN_GE(cwr_tsn, ntohl(ecne->tsn))) { /* this covers this ECNE, we can remove it */ stcb->asoc.ecn_echo_cnt_onq--; TAILQ_REMOVE(&stcb->asoc.control_send_queue, chk, sctp_next); stcb->asoc.ctrl_queue_cnt--; sctp_m_freem(chk->data); chk->data = NULL; sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); if (override == 0) { break; } } } } static void sctp_handle_shutdown_complete(struct sctp_shutdown_complete_chunk *cp SCTP_UNUSED, struct sctp_tcb *stcb, struct sctp_nets *net) { SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_shutdown_complete: handling SHUTDOWN-COMPLETE\n"); if (stcb == NULL) return; /* process according to association state */ if (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT) { /* unexpected SHUTDOWN-COMPLETE... so ignore... */ SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_shutdown_complete: not in SCTP_STATE_SHUTDOWN_ACK_SENT --- ignore\n"); SCTP_TCB_UNLOCK(stcb); return; } /* notify upper layer protocol */ if (stcb->sctp_socket) { sctp_ulp_notify(SCTP_NOTIFY_ASSOC_DOWN, stcb, 0, NULL, SCTP_SO_NOT_LOCKED); } #ifdef INVARIANTS if (!TAILQ_EMPTY(&stcb->asoc.send_queue) || !TAILQ_EMPTY(&stcb->asoc.sent_queue) || sctp_is_there_unsent_data(stcb, SCTP_SO_NOT_LOCKED)) { panic("Queues are not empty when handling SHUTDOWN-COMPLETE"); } #endif /* stop the timer */ sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWNACK, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_24); SCTP_STAT_INCR_COUNTER32(sctps_shutdown); /* free the TCB */ SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_handle_shutdown_complete: calls free-asoc\n"); (void)sctp_free_assoc(stcb->sctp_ep, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_25); return; } static int process_chunk_drop(struct sctp_tcb *stcb, struct sctp_chunk_desc *desc, struct sctp_nets *net, uint8_t flg) { switch (desc->chunk_type) { case SCTP_DATA: case SCTP_IDATA: /* find the tsn to resend (possibly) */ { uint32_t tsn; struct sctp_tmit_chunk *tp1; tsn = ntohl(desc->tsn_ifany); TAILQ_FOREACH(tp1, &stcb->asoc.sent_queue, sctp_next) { if (tp1->rec.data.tsn == tsn) { /* found it */ break; } if (SCTP_TSN_GT(tp1->rec.data.tsn, tsn)) { /* not found */ tp1 = NULL; break; } } if (tp1 == NULL) { /* * Do it the other way , aka without paying * attention to queue seq order. */ SCTP_STAT_INCR(sctps_pdrpdnfnd); TAILQ_FOREACH(tp1, &stcb->asoc.sent_queue, sctp_next) { if (tp1->rec.data.tsn == tsn) { /* found it */ break; } } } if (tp1 == NULL) { SCTP_STAT_INCR(sctps_pdrptsnnf); } if ((tp1) && (tp1->sent < SCTP_DATAGRAM_ACKED)) { if (((flg & SCTP_BADCRC) == 0) && ((flg & SCTP_FROM_MIDDLE_BOX) == 0)) { return (0); } if ((stcb->asoc.peers_rwnd == 0) && ((flg & SCTP_FROM_MIDDLE_BOX) == 0)) { SCTP_STAT_INCR(sctps_pdrpdiwnp); return (0); } if (stcb->asoc.peers_rwnd == 0 && (flg & SCTP_FROM_MIDDLE_BOX)) { SCTP_STAT_INCR(sctps_pdrpdizrw); return (0); } if ((uint32_t)SCTP_BUF_LEN(tp1->data) < SCTP_DATA_CHUNK_OVERHEAD(stcb) + SCTP_NUM_DB_TO_VERIFY) { /* Payload not matching. */ SCTP_STAT_INCR(sctps_pdrpbadd); return (-1); } if (memcmp(mtod(tp1->data, caddr_t)+SCTP_DATA_CHUNK_OVERHEAD(stcb), desc->data_bytes, SCTP_NUM_DB_TO_VERIFY) != 0) { /* Payload not matching. */ SCTP_STAT_INCR(sctps_pdrpbadd); return (-1); } if (tp1->do_rtt) { /* * this guy had a RTO calculation * pending on it, cancel it */ if (tp1->whoTo->rto_needed == 0) { tp1->whoTo->rto_needed = 1; } tp1->do_rtt = 0; } SCTP_STAT_INCR(sctps_pdrpmark); if (tp1->sent != SCTP_DATAGRAM_RESEND) sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); /* * mark it as if we were doing a FR, since * we will be getting gap ack reports behind * the info from the router. */ tp1->rec.data.doing_fast_retransmit = 1; /* * mark the tsn with what sequences can * cause a new FR. */ if (TAILQ_EMPTY(&stcb->asoc.send_queue)) { tp1->rec.data.fast_retran_tsn = stcb->asoc.sending_seq; } else { tp1->rec.data.fast_retran_tsn = (TAILQ_FIRST(&stcb->asoc.send_queue))->rec.data.tsn; } /* restart the timer */ sctp_timer_stop(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, tp1->whoTo, SCTP_FROM_SCTP_INPUT + SCTP_LOC_26); sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, tp1->whoTo); /* fix counts and things */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_PDRP, tp1->whoTo->flight_size, tp1->book_size, (uint32_t)(uintptr_t)stcb, tp1->rec.data.tsn); } if (tp1->sent < SCTP_DATAGRAM_RESEND) { sctp_flight_size_decrease(tp1); sctp_total_flight_decrease(stcb, tp1); } tp1->sent = SCTP_DATAGRAM_RESEND; } { /* audit code */ unsigned int audit; audit = 0; TAILQ_FOREACH(tp1, &stcb->asoc.sent_queue, sctp_next) { if (tp1->sent == SCTP_DATAGRAM_RESEND) audit++; } TAILQ_FOREACH(tp1, &stcb->asoc.control_send_queue, sctp_next) { if (tp1->sent == SCTP_DATAGRAM_RESEND) audit++; } if (audit != stcb->asoc.sent_queue_retran_cnt) { SCTP_PRINTF("**Local Audit finds cnt:%d asoc cnt:%d\n", audit, stcb->asoc.sent_queue_retran_cnt); #ifndef SCTP_AUDITING_ENABLED stcb->asoc.sent_queue_retran_cnt = audit; #endif } } } break; case SCTP_ASCONF: { struct sctp_tmit_chunk *asconf; TAILQ_FOREACH(asconf, &stcb->asoc.control_send_queue, sctp_next) { if (asconf->rec.chunk_id.id == SCTP_ASCONF) { break; } } if (asconf) { if (asconf->sent != SCTP_DATAGRAM_RESEND) sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); asconf->sent = SCTP_DATAGRAM_RESEND; asconf->snd_count--; } } break; case SCTP_INITIATION: /* resend the INIT */ stcb->asoc.dropped_special_cnt++; if (stcb->asoc.dropped_special_cnt < SCTP_RETRY_DROPPED_THRESH) { /* * If we can get it in, in a few attempts we do * this, otherwise we let the timer fire. */ sctp_timer_stop(SCTP_TIMER_TYPE_INIT, stcb->sctp_ep, stcb, net, SCTP_FROM_SCTP_INPUT + SCTP_LOC_27); sctp_send_initiate(stcb->sctp_ep, stcb, SCTP_SO_NOT_LOCKED); } break; case SCTP_SELECTIVE_ACK: case SCTP_NR_SELECTIVE_ACK: /* resend the sack */ sctp_send_sack(stcb, SCTP_SO_NOT_LOCKED); break; case SCTP_HEARTBEAT_REQUEST: /* resend a demand HB */ if ((stcb->asoc.overall_error_count + 3) < stcb->asoc.max_send_times) { /* * Only retransmit if we KNOW we wont destroy the * tcb */ sctp_send_hb(stcb, net, SCTP_SO_NOT_LOCKED); } break; case SCTP_SHUTDOWN: sctp_send_shutdown(stcb, net); break; case SCTP_SHUTDOWN_ACK: sctp_send_shutdown_ack(stcb, net); break; case SCTP_COOKIE_ECHO: { struct sctp_tmit_chunk *cookie; cookie = NULL; TAILQ_FOREACH(cookie, &stcb->asoc.control_send_queue, sctp_next) { if (cookie->rec.chunk_id.id == SCTP_COOKIE_ECHO) { break; } } if (cookie) { if (cookie->sent != SCTP_DATAGRAM_RESEND) sctp_ucount_incr(stcb->asoc.sent_queue_retran_cnt); cookie->sent = SCTP_DATAGRAM_RESEND; sctp_stop_all_cookie_timers(stcb); } } break; case SCTP_COOKIE_ACK: sctp_send_cookie_ack(stcb); break; case SCTP_ASCONF_ACK: /* resend last asconf ack */ sctp_send_asconf_ack(stcb); break; case SCTP_IFORWARD_CUM_TSN: case SCTP_FORWARD_CUM_TSN: send_forward_tsn(stcb, &stcb->asoc); break; /* can't do anything with these */ case SCTP_PACKET_DROPPED: case SCTP_INITIATION_ACK: /* this should not happen */ case SCTP_HEARTBEAT_ACK: case SCTP_ABORT_ASSOCIATION: case SCTP_OPERATION_ERROR: case SCTP_SHUTDOWN_COMPLETE: case SCTP_ECN_ECHO: case SCTP_ECN_CWR: default: break; } return (0); } void sctp_reset_in_stream(struct sctp_tcb *stcb, uint32_t number_entries, uint16_t *list) { uint32_t i; uint16_t temp; /* * We set things to 0xffffffff since this is the last delivered * sequence and we will be sending in 0 after the reset. */ if (number_entries) { for (i = 0; i < number_entries; i++) { temp = ntohs(list[i]); if (temp >= stcb->asoc.streamincnt) { continue; } stcb->asoc.strmin[temp].last_mid_delivered = 0xffffffff; } } else { list = NULL; for (i = 0; i < stcb->asoc.streamincnt; i++) { stcb->asoc.strmin[i].last_mid_delivered = 0xffffffff; } } sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_RECV, stcb, number_entries, (void *)list, SCTP_SO_NOT_LOCKED); } static void sctp_reset_out_streams(struct sctp_tcb *stcb, uint32_t number_entries, uint16_t *list) { uint32_t i; uint16_t temp; if (number_entries > 0) { for (i = 0; i < number_entries; i++) { temp = ntohs(list[i]); if (temp >= stcb->asoc.streamoutcnt) { /* no such stream */ continue; } stcb->asoc.strmout[temp].next_mid_ordered = 0; stcb->asoc.strmout[temp].next_mid_unordered = 0; } } else { for (i = 0; i < stcb->asoc.streamoutcnt; i++) { stcb->asoc.strmout[i].next_mid_ordered = 0; stcb->asoc.strmout[i].next_mid_unordered = 0; } } sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_SEND, stcb, number_entries, (void *)list, SCTP_SO_NOT_LOCKED); } static void sctp_reset_clear_pending(struct sctp_tcb *stcb, uint32_t number_entries, uint16_t *list) { uint32_t i; uint16_t temp; if (number_entries > 0) { for (i = 0; i < number_entries; i++) { temp = ntohs(list[i]); if (temp >= stcb->asoc.streamoutcnt) { /* no such stream */ continue; } stcb->asoc.strmout[temp].state = SCTP_STREAM_OPEN; } } else { for (i = 0; i < stcb->asoc.streamoutcnt; i++) { stcb->asoc.strmout[i].state = SCTP_STREAM_OPEN; } } } struct sctp_stream_reset_request * sctp_find_stream_reset(struct sctp_tcb *stcb, uint32_t seq, struct sctp_tmit_chunk **bchk) { struct sctp_association *asoc; struct sctp_chunkhdr *ch; struct sctp_stream_reset_request *r; struct sctp_tmit_chunk *chk; int len, clen; asoc = &stcb->asoc; if (TAILQ_EMPTY(&stcb->asoc.control_send_queue)) { asoc->stream_reset_outstanding = 0; return (NULL); } if (stcb->asoc.str_reset == NULL) { asoc->stream_reset_outstanding = 0; return (NULL); } chk = stcb->asoc.str_reset; if (chk->data == NULL) { return (NULL); } if (bchk) { /* he wants a copy of the chk pointer */ *bchk = chk; } clen = chk->send_size; ch = mtod(chk->data, struct sctp_chunkhdr *); r = (struct sctp_stream_reset_request *)(ch + 1); if (ntohl(r->request_seq) == seq) { /* found it */ return (r); } len = SCTP_SIZE32(ntohs(r->ph.param_length)); if (clen > (len + (int)sizeof(struct sctp_chunkhdr))) { /* move to the next one, there can only be a max of two */ r = (struct sctp_stream_reset_request *)((caddr_t)r + len); if (ntohl(r->request_seq) == seq) { return (r); } } /* that seq is not here */ return (NULL); } static void sctp_clean_up_stream_reset(struct sctp_tcb *stcb) { struct sctp_association *asoc; struct sctp_tmit_chunk *chk; asoc = &stcb->asoc; chk = asoc->str_reset; if (chk == NULL) { return; } asoc->str_reset = NULL; sctp_timer_stop(SCTP_TIMER_TYPE_STRRESET, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_INPUT + SCTP_LOC_28); TAILQ_REMOVE(&asoc->control_send_queue, chk, sctp_next); asoc->ctrl_queue_cnt--; if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); } static int sctp_handle_stream_reset_response(struct sctp_tcb *stcb, uint32_t seq, uint32_t action, struct sctp_stream_reset_response *respin) { uint16_t type; int lparam_len; struct sctp_association *asoc = &stcb->asoc; struct sctp_tmit_chunk *chk; struct sctp_stream_reset_request *req_param; struct sctp_stream_reset_out_request *req_out_param; struct sctp_stream_reset_in_request *req_in_param; uint32_t number_entries; if (asoc->stream_reset_outstanding == 0) { /* duplicate */ return (0); } if (seq == stcb->asoc.str_reset_seq_out) { req_param = sctp_find_stream_reset(stcb, seq, &chk); if (req_param != NULL) { stcb->asoc.str_reset_seq_out++; type = ntohs(req_param->ph.param_type); lparam_len = ntohs(req_param->ph.param_length); if (type == SCTP_STR_RESET_OUT_REQUEST) { int no_clear = 0; req_out_param = (struct sctp_stream_reset_out_request *)req_param; number_entries = (lparam_len - sizeof(struct sctp_stream_reset_out_request)) / sizeof(uint16_t); asoc->stream_reset_out_is_outstanding = 0; if (asoc->stream_reset_outstanding) asoc->stream_reset_outstanding--; if (action == SCTP_STREAM_RESET_RESULT_PERFORMED) { /* do it */ sctp_reset_out_streams(stcb, number_entries, req_out_param->list_of_streams); } else if (action == SCTP_STREAM_RESET_RESULT_DENIED) { sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_DENIED_OUT, stcb, number_entries, req_out_param->list_of_streams, SCTP_SO_NOT_LOCKED); } else if (action == SCTP_STREAM_RESET_RESULT_IN_PROGRESS) { /* * Set it up so we don't stop * retransmitting */ asoc->stream_reset_outstanding++; stcb->asoc.str_reset_seq_out--; asoc->stream_reset_out_is_outstanding = 1; no_clear = 1; } else { sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_FAILED_OUT, stcb, number_entries, req_out_param->list_of_streams, SCTP_SO_NOT_LOCKED); } if (no_clear == 0) { sctp_reset_clear_pending(stcb, number_entries, req_out_param->list_of_streams); } } else if (type == SCTP_STR_RESET_IN_REQUEST) { req_in_param = (struct sctp_stream_reset_in_request *)req_param; number_entries = (lparam_len - sizeof(struct sctp_stream_reset_in_request)) / sizeof(uint16_t); if (asoc->stream_reset_outstanding) asoc->stream_reset_outstanding--; if (action == SCTP_STREAM_RESET_RESULT_DENIED) { sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_DENIED_IN, stcb, number_entries, req_in_param->list_of_streams, SCTP_SO_NOT_LOCKED); } else if (action != SCTP_STREAM_RESET_RESULT_PERFORMED) { sctp_ulp_notify(SCTP_NOTIFY_STR_RESET_FAILED_IN, stcb, number_entries, req_in_param->list_of_streams, SCTP_SO_NOT_LOCKED); } } else if (type == SCTP_STR_RESET_ADD_OUT_STREAMS) { /* Ok we now may have more streams */ int num_stream; num_stream = stcb->asoc.strm_pending_add_size; if (num_stream > (stcb->asoc.strm_realoutsize - stcb->asoc.streamoutcnt)) { /* TSNH */ num_stream = stcb->asoc.strm_realoutsize - stcb->asoc.streamoutcnt; } stcb->asoc.strm_pending_add_size = 0; if (asoc->stream_reset_outstanding) asoc->stream_reset_outstanding--; if (action == SCTP_STREAM_RESET_RESULT_PERFORMED) { /* Put the new streams into effect */ int i; for (i = asoc->streamoutcnt; i < (asoc->streamoutcnt + num_stream); i++) { asoc->strmout[i].state = SCTP_STREAM_OPEN; } asoc->streamoutcnt += num_stream; sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt, 0); } else if (action == SCTP_STREAM_RESET_RESULT_DENIED) { sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt, SCTP_STREAM_CHANGE_DENIED); } else { sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt, SCTP_STREAM_CHANGE_FAILED); } } else if (type == SCTP_STR_RESET_ADD_IN_STREAMS) { if (asoc->stream_reset_outstanding) asoc->stream_reset_outstanding--; if (action == SCTP_STREAM_RESET_RESULT_DENIED) { sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt, SCTP_STREAM_CHANGE_DENIED); } else if (action != SCTP_STREAM_RESET_RESULT_PERFORMED) { sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt, SCTP_STREAM_CHANGE_FAILED); } } else if (type == SCTP_STR_RESET_TSN_REQUEST) { /** * a) Adopt the new in tsn. * b) reset the map * c) Adopt the new out-tsn */ struct sctp_stream_reset_response_tsn *resp; struct sctp_forward_tsn_chunk fwdtsn; int abort_flag = 0; if (respin == NULL) { /* huh ? */ return (0); } if (ntohs(respin->ph.param_length) < sizeof(struct sctp_stream_reset_response_tsn)) { return (0); } if (action == SCTP_STREAM_RESET_RESULT_PERFORMED) { resp = (struct sctp_stream_reset_response_tsn *)respin; asoc->stream_reset_outstanding--; fwdtsn.ch.chunk_length = htons(sizeof(struct sctp_forward_tsn_chunk)); fwdtsn.ch.chunk_type = SCTP_FORWARD_CUM_TSN; fwdtsn.new_cumulative_tsn = htonl(ntohl(resp->senders_next_tsn) - 1); sctp_handle_forward_tsn(stcb, &fwdtsn, &abort_flag, NULL, 0); if (abort_flag) { return (1); } stcb->asoc.highest_tsn_inside_map = (ntohl(resp->senders_next_tsn) - 1); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { sctp_log_map(0, 7, asoc->highest_tsn_inside_map, SCTP_MAP_SLIDE_RESULT); } stcb->asoc.tsn_last_delivered = stcb->asoc.cumulative_tsn = stcb->asoc.highest_tsn_inside_map; stcb->asoc.mapping_array_base_tsn = ntohl(resp->senders_next_tsn); memset(stcb->asoc.mapping_array, 0, stcb->asoc.mapping_array_size); stcb->asoc.highest_tsn_inside_nr_map = stcb->asoc.highest_tsn_inside_map; memset(stcb->asoc.nr_mapping_array, 0, stcb->asoc.mapping_array_size); stcb->asoc.sending_seq = ntohl(resp->receivers_next_tsn); stcb->asoc.last_acked_seq = stcb->asoc.cumulative_tsn; sctp_reset_out_streams(stcb, 0, (uint16_t *)NULL); sctp_reset_in_stream(stcb, 0, (uint16_t *)NULL); sctp_notify_stream_reset_tsn(stcb, stcb->asoc.sending_seq, (stcb->asoc.mapping_array_base_tsn + 1), 0); } else if (action == SCTP_STREAM_RESET_RESULT_DENIED) { sctp_notify_stream_reset_tsn(stcb, stcb->asoc.sending_seq, (stcb->asoc.mapping_array_base_tsn + 1), SCTP_ASSOC_RESET_DENIED); } else { sctp_notify_stream_reset_tsn(stcb, stcb->asoc.sending_seq, (stcb->asoc.mapping_array_base_tsn + 1), SCTP_ASSOC_RESET_FAILED); } } /* get rid of the request and get the request flags */ if (asoc->stream_reset_outstanding == 0) { sctp_clean_up_stream_reset(stcb); } } } if (asoc->stream_reset_outstanding == 0) { sctp_send_stream_reset_out_if_possible(stcb, SCTP_SO_NOT_LOCKED); } return (0); } static void sctp_handle_str_reset_request_in(struct sctp_tcb *stcb, struct sctp_tmit_chunk *chk, struct sctp_stream_reset_in_request *req, int trunc) { uint32_t seq; int len, i; int number_entries; uint16_t temp; /* * peer wants me to send a str-reset to him for my outgoing seq's if * seq_in is right. */ struct sctp_association *asoc = &stcb->asoc; seq = ntohl(req->request_seq); if (asoc->str_reset_seq_in == seq) { asoc->last_reset_action[1] = asoc->last_reset_action[0]; if (!(asoc->local_strreset_support & SCTP_ENABLE_RESET_STREAM_REQ)) { asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } else if (trunc) { /* Can't do it, since they exceeded our buffer size */ asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } else if (stcb->asoc.stream_reset_out_is_outstanding == 0) { len = ntohs(req->ph.param_length); number_entries = ((len - sizeof(struct sctp_stream_reset_in_request)) / sizeof(uint16_t)); if (number_entries) { for (i = 0; i < number_entries; i++) { temp = ntohs(req->list_of_streams[i]); if (temp >= stcb->asoc.streamoutcnt) { asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; goto bad_boy; } req->list_of_streams[i] = temp; } for (i = 0; i < number_entries; i++) { if (stcb->asoc.strmout[req->list_of_streams[i]].state == SCTP_STREAM_OPEN) { stcb->asoc.strmout[req->list_of_streams[i]].state = SCTP_STREAM_RESET_PENDING; } } } else { /* Its all */ for (i = 0; i < stcb->asoc.streamoutcnt; i++) { if (stcb->asoc.strmout[i].state == SCTP_STREAM_OPEN) stcb->asoc.strmout[i].state = SCTP_STREAM_RESET_PENDING; } } asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_PERFORMED; } else { /* Can't do it, since we have sent one out */ asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_ERR_IN_PROGRESS; } bad_boy: sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); asoc->str_reset_seq_in++; } else if (asoc->str_reset_seq_in - 1 == seq) { sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); } else if (asoc->str_reset_seq_in - 2 == seq) { sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[1]); } else { sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_RESULT_ERR_BAD_SEQNO); } sctp_send_stream_reset_out_if_possible(stcb, SCTP_SO_NOT_LOCKED); } static int sctp_handle_str_reset_request_tsn(struct sctp_tcb *stcb, struct sctp_tmit_chunk *chk, struct sctp_stream_reset_tsn_request *req) { /* reset all in and out and update the tsn */ /* * A) reset my str-seq's on in and out. B) Select a receive next, * and set cum-ack to it. Also process this selected number as a * fwd-tsn as well. C) set in the response my next sending seq. */ struct sctp_forward_tsn_chunk fwdtsn; struct sctp_association *asoc = &stcb->asoc; int abort_flag = 0; uint32_t seq; seq = ntohl(req->request_seq); if (asoc->str_reset_seq_in == seq) { asoc->last_reset_action[1] = stcb->asoc.last_reset_action[0]; if (!(asoc->local_strreset_support & SCTP_ENABLE_CHANGE_ASSOC_REQ)) { asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } else { fwdtsn.ch.chunk_length = htons(sizeof(struct sctp_forward_tsn_chunk)); fwdtsn.ch.chunk_type = SCTP_FORWARD_CUM_TSN; fwdtsn.ch.chunk_flags = 0; fwdtsn.new_cumulative_tsn = htonl(stcb->asoc.highest_tsn_inside_map + 1); sctp_handle_forward_tsn(stcb, &fwdtsn, &abort_flag, NULL, 0); if (abort_flag) { return (1); } asoc->highest_tsn_inside_map += SCTP_STREAM_RESET_TSN_DELTA; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MAP_LOGGING_ENABLE) { sctp_log_map(0, 10, asoc->highest_tsn_inside_map, SCTP_MAP_SLIDE_RESULT); } asoc->tsn_last_delivered = asoc->cumulative_tsn = asoc->highest_tsn_inside_map; asoc->mapping_array_base_tsn = asoc->highest_tsn_inside_map + 1; memset(asoc->mapping_array, 0, asoc->mapping_array_size); asoc->highest_tsn_inside_nr_map = asoc->highest_tsn_inside_map; memset(asoc->nr_mapping_array, 0, asoc->mapping_array_size); atomic_add_int(&asoc->sending_seq, 1); /* save off historical data for retrans */ asoc->last_sending_seq[1] = asoc->last_sending_seq[0]; asoc->last_sending_seq[0] = asoc->sending_seq; asoc->last_base_tsnsent[1] = asoc->last_base_tsnsent[0]; asoc->last_base_tsnsent[0] = asoc->mapping_array_base_tsn; sctp_reset_out_streams(stcb, 0, (uint16_t *)NULL); sctp_reset_in_stream(stcb, 0, (uint16_t *)NULL); asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_PERFORMED; sctp_notify_stream_reset_tsn(stcb, asoc->sending_seq, (asoc->mapping_array_base_tsn + 1), 0); } sctp_add_stream_reset_result_tsn(chk, seq, asoc->last_reset_action[0], asoc->last_sending_seq[0], asoc->last_base_tsnsent[0]); asoc->str_reset_seq_in++; } else if (asoc->str_reset_seq_in - 1 == seq) { sctp_add_stream_reset_result_tsn(chk, seq, asoc->last_reset_action[0], asoc->last_sending_seq[0], asoc->last_base_tsnsent[0]); } else if (asoc->str_reset_seq_in - 2 == seq) { sctp_add_stream_reset_result_tsn(chk, seq, asoc->last_reset_action[1], asoc->last_sending_seq[1], asoc->last_base_tsnsent[1]); } else { sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_RESULT_ERR_BAD_SEQNO); } return (0); } static void sctp_handle_str_reset_request_out(struct sctp_tcb *stcb, struct sctp_tmit_chunk *chk, struct sctp_stream_reset_out_request *req, int trunc) { uint32_t seq, tsn; int number_entries, len; struct sctp_association *asoc = &stcb->asoc; seq = ntohl(req->request_seq); /* now if its not a duplicate we process it */ if (asoc->str_reset_seq_in == seq) { len = ntohs(req->ph.param_length); number_entries = ((len - sizeof(struct sctp_stream_reset_out_request)) / sizeof(uint16_t)); /* * the sender is resetting, handle the list issue.. we must * a) verify if we can do the reset, if so no problem b) If * we can't do the reset we must copy the request. c) queue * it, and setup the data in processor to trigger it off * when needed and dequeue all the queued data. */ tsn = ntohl(req->send_reset_at_tsn); /* move the reset action back one */ asoc->last_reset_action[1] = asoc->last_reset_action[0]; if (!(asoc->local_strreset_support & SCTP_ENABLE_RESET_STREAM_REQ)) { asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } else if (trunc) { asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } else if (SCTP_TSN_GE(asoc->cumulative_tsn, tsn)) { /* we can do it now */ sctp_reset_in_stream(stcb, number_entries, req->list_of_streams); asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_PERFORMED; } else { /* * we must queue it up and thus wait for the TSN's * to arrive that are at or before tsn */ struct sctp_stream_reset_list *liste; int siz; siz = sizeof(struct sctp_stream_reset_list) + (number_entries * sizeof(uint16_t)); SCTP_MALLOC(liste, struct sctp_stream_reset_list *, siz, SCTP_M_STRESET); if (liste == NULL) { /* gak out of memory */ asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); return; } liste->seq = seq; liste->tsn = tsn; liste->number_entries = number_entries; memcpy(&liste->list_of_streams, req->list_of_streams, number_entries * sizeof(uint16_t)); TAILQ_INSERT_TAIL(&asoc->resetHead, liste, next_resp); asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_IN_PROGRESS; } sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); asoc->str_reset_seq_in++; } else if ((asoc->str_reset_seq_in - 1) == seq) { /* * one seq back, just echo back last action since my * response was lost. */ sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); } else if ((asoc->str_reset_seq_in - 2) == seq) { /* * two seq back, just echo back last action since my * response was lost. */ sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[1]); } else { sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_RESULT_ERR_BAD_SEQNO); } } static void sctp_handle_str_reset_add_strm(struct sctp_tcb *stcb, struct sctp_tmit_chunk *chk, struct sctp_stream_reset_add_strm *str_add) { /* * Peer is requesting to add more streams. If its within our * max-streams we will allow it. */ uint32_t num_stream, i; uint32_t seq; struct sctp_association *asoc = &stcb->asoc; struct sctp_queued_to_read *ctl, *nctl; /* Get the number. */ seq = ntohl(str_add->request_seq); num_stream = ntohs(str_add->number_of_streams); /* Now what would be the new total? */ if (asoc->str_reset_seq_in == seq) { num_stream += stcb->asoc.streamincnt; stcb->asoc.last_reset_action[1] = stcb->asoc.last_reset_action[0]; if (!(asoc->local_strreset_support & SCTP_ENABLE_CHANGE_ASSOC_REQ)) { asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } else if ((num_stream > stcb->asoc.max_inbound_streams) || (num_stream > 0xffff)) { /* We must reject it they ask for to many */ denied: stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } else { /* Ok, we can do that :-) */ struct sctp_stream_in *oldstrm; /* save off the old */ oldstrm = stcb->asoc.strmin; SCTP_MALLOC(stcb->asoc.strmin, struct sctp_stream_in *, (num_stream * sizeof(struct sctp_stream_in)), SCTP_M_STRMI); if (stcb->asoc.strmin == NULL) { stcb->asoc.strmin = oldstrm; goto denied; } /* copy off the old data */ for (i = 0; i < stcb->asoc.streamincnt; i++) { TAILQ_INIT(&stcb->asoc.strmin[i].inqueue); TAILQ_INIT(&stcb->asoc.strmin[i].uno_inqueue); stcb->asoc.strmin[i].sid = i; stcb->asoc.strmin[i].last_mid_delivered = oldstrm[i].last_mid_delivered; stcb->asoc.strmin[i].delivery_started = oldstrm[i].delivery_started; stcb->asoc.strmin[i].pd_api_started = oldstrm[i].pd_api_started; /* now anything on those queues? */ TAILQ_FOREACH_SAFE(ctl, &oldstrm[i].inqueue, next_instrm, nctl) { TAILQ_REMOVE(&oldstrm[i].inqueue, ctl, next_instrm); TAILQ_INSERT_TAIL(&stcb->asoc.strmin[i].inqueue, ctl, next_instrm); } TAILQ_FOREACH_SAFE(ctl, &oldstrm[i].uno_inqueue, next_instrm, nctl) { TAILQ_REMOVE(&oldstrm[i].uno_inqueue, ctl, next_instrm); TAILQ_INSERT_TAIL(&stcb->asoc.strmin[i].uno_inqueue, ctl, next_instrm); } } /* Init the new streams */ for (i = stcb->asoc.streamincnt; i < num_stream; i++) { TAILQ_INIT(&stcb->asoc.strmin[i].inqueue); TAILQ_INIT(&stcb->asoc.strmin[i].uno_inqueue); stcb->asoc.strmin[i].sid = i; stcb->asoc.strmin[i].last_mid_delivered = 0xffffffff; stcb->asoc.strmin[i].pd_api_started = 0; stcb->asoc.strmin[i].delivery_started = 0; } SCTP_FREE(oldstrm, SCTP_M_STRMI); /* update the size */ stcb->asoc.streamincnt = num_stream; stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_PERFORMED; sctp_notify_stream_reset_add(stcb, stcb->asoc.streamincnt, stcb->asoc.streamoutcnt, 0); } sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); asoc->str_reset_seq_in++; } else if ((asoc->str_reset_seq_in - 1) == seq) { /* * one seq back, just echo back last action since my * response was lost. */ sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); } else if ((asoc->str_reset_seq_in - 2) == seq) { /* * two seq back, just echo back last action since my * response was lost. */ sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[1]); } else { sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_RESULT_ERR_BAD_SEQNO); } } static void sctp_handle_str_reset_add_out_strm(struct sctp_tcb *stcb, struct sctp_tmit_chunk *chk, struct sctp_stream_reset_add_strm *str_add) { /* * Peer is requesting to add more streams. If its within our * max-streams we will allow it. */ uint16_t num_stream; uint32_t seq; struct sctp_association *asoc = &stcb->asoc; /* Get the number. */ seq = ntohl(str_add->request_seq); num_stream = ntohs(str_add->number_of_streams); /* Now what would be the new total? */ if (asoc->str_reset_seq_in == seq) { stcb->asoc.last_reset_action[1] = stcb->asoc.last_reset_action[0]; if (!(asoc->local_strreset_support & SCTP_ENABLE_CHANGE_ASSOC_REQ)) { asoc->last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } else if (stcb->asoc.stream_reset_outstanding) { /* We must reject it we have something pending */ stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_ERR_IN_PROGRESS; } else { /* Ok, we can do that :-) */ int mychk; mychk = stcb->asoc.streamoutcnt; mychk += num_stream; if (mychk < 0x10000) { stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_PERFORMED; if (sctp_send_str_reset_req(stcb, 0, NULL, 0, 0, 1, num_stream, 0, 1)) { stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } } else { stcb->asoc.last_reset_action[0] = SCTP_STREAM_RESET_RESULT_DENIED; } } sctp_add_stream_reset_result(chk, seq, stcb->asoc.last_reset_action[0]); asoc->str_reset_seq_in++; } else if ((asoc->str_reset_seq_in - 1) == seq) { /* * one seq back, just echo back last action since my * response was lost. */ sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[0]); } else if ((asoc->str_reset_seq_in - 2) == seq) { /* * two seq back, just echo back last action since my * response was lost. */ sctp_add_stream_reset_result(chk, seq, asoc->last_reset_action[1]); } else { sctp_add_stream_reset_result(chk, seq, SCTP_STREAM_RESET_RESULT_ERR_BAD_SEQNO); } } #ifdef __GNUC__ __attribute__((noinline)) #endif static int sctp_handle_stream_reset(struct sctp_tcb *stcb, struct mbuf *m, int offset, struct sctp_chunkhdr *ch_req) { uint16_t remaining_length, param_len, ptype; struct sctp_paramhdr pstore; uint8_t cstore[SCTP_CHUNK_BUFFER_SIZE]; uint32_t seq = 0; int num_req = 0; int trunc = 0; struct sctp_tmit_chunk *chk; struct sctp_chunkhdr *ch; struct sctp_paramhdr *ph; int ret_code = 0; int num_param = 0; /* now it may be a reset or a reset-response */ remaining_length = ntohs(ch_req->chunk_length) - sizeof(struct sctp_chunkhdr); /* setup for adding the response */ sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { return (ret_code); } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_STREAM_RESET; chk->rec.chunk_id.can_take_data = 0; chk->flags = 0; chk->asoc = &stcb->asoc; chk->no_fr_allowed = 0; chk->book_size = chk->send_size = sizeof(struct sctp_chunkhdr); chk->book_size_scale = 0; chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (chk->data == NULL) { strres_nochunk: if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); return (ret_code); } SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); /* setup chunk parameters */ chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->whoTo = NULL; ch = mtod(chk->data, struct sctp_chunkhdr *); ch->chunk_type = SCTP_STREAM_RESET; ch->chunk_flags = 0; ch->chunk_length = htons(chk->send_size); SCTP_BUF_LEN(chk->data) = SCTP_SIZE32(chk->send_size); offset += sizeof(struct sctp_chunkhdr); while (remaining_length >= sizeof(struct sctp_paramhdr)) { ph = (struct sctp_paramhdr *)sctp_m_getptr(m, offset, sizeof(pstore), (uint8_t *)&pstore); if (ph == NULL) { /* TSNH */ break; } param_len = ntohs(ph->param_length); if ((param_len > remaining_length) || (param_len < (sizeof(struct sctp_paramhdr) + sizeof(uint32_t)))) { /* bad parameter length */ break; } ph = (struct sctp_paramhdr *)sctp_m_getptr(m, offset, min(param_len, sizeof(cstore)), (uint8_t *)&cstore); if (ph == NULL) { /* TSNH */ break; } ptype = ntohs(ph->param_type); num_param++; if (param_len > sizeof(cstore)) { trunc = 1; } else { trunc = 0; } if (num_param > SCTP_MAX_RESET_PARAMS) { /* hit the max of parameters already sorry.. */ break; } if (ptype == SCTP_STR_RESET_OUT_REQUEST) { struct sctp_stream_reset_out_request *req_out; if (param_len < sizeof(struct sctp_stream_reset_out_request)) { break; } req_out = (struct sctp_stream_reset_out_request *)ph; num_req++; if (stcb->asoc.stream_reset_outstanding) { seq = ntohl(req_out->response_seq); if (seq == stcb->asoc.str_reset_seq_out) { /* implicit ack */ (void)sctp_handle_stream_reset_response(stcb, seq, SCTP_STREAM_RESET_RESULT_PERFORMED, NULL); } } sctp_handle_str_reset_request_out(stcb, chk, req_out, trunc); } else if (ptype == SCTP_STR_RESET_ADD_OUT_STREAMS) { struct sctp_stream_reset_add_strm *str_add; if (param_len < sizeof(struct sctp_stream_reset_add_strm)) { break; } str_add = (struct sctp_stream_reset_add_strm *)ph; num_req++; sctp_handle_str_reset_add_strm(stcb, chk, str_add); } else if (ptype == SCTP_STR_RESET_ADD_IN_STREAMS) { struct sctp_stream_reset_add_strm *str_add; if (param_len < sizeof(struct sctp_stream_reset_add_strm)) { break; } str_add = (struct sctp_stream_reset_add_strm *)ph; num_req++; sctp_handle_str_reset_add_out_strm(stcb, chk, str_add); } else if (ptype == SCTP_STR_RESET_IN_REQUEST) { struct sctp_stream_reset_in_request *req_in; num_req++; req_in = (struct sctp_stream_reset_in_request *)ph; sctp_handle_str_reset_request_in(stcb, chk, req_in, trunc); } else if (ptype == SCTP_STR_RESET_TSN_REQUEST) { struct sctp_stream_reset_tsn_request *req_tsn; num_req++; req_tsn = (struct sctp_stream_reset_tsn_request *)ph; if (sctp_handle_str_reset_request_tsn(stcb, chk, req_tsn)) { ret_code = 1; goto strres_nochunk; } /* no more */ break; } else if (ptype == SCTP_STR_RESET_RESPONSE) { struct sctp_stream_reset_response *resp; uint32_t result; if (param_len < sizeof(struct sctp_stream_reset_response)) { break; } resp = (struct sctp_stream_reset_response *)ph; seq = ntohl(resp->response_seq); result = ntohl(resp->result); if (sctp_handle_stream_reset_response(stcb, seq, result, resp)) { ret_code = 1; goto strres_nochunk; } } else { break; } offset += SCTP_SIZE32(param_len); if (remaining_length >= SCTP_SIZE32(param_len)) { remaining_length -= SCTP_SIZE32(param_len); } else { remaining_length = 0; } } if (num_req == 0) { /* we have no response free the stuff */ goto strres_nochunk; } /* ok we have a chunk to link in */ TAILQ_INSERT_TAIL(&stcb->asoc.control_send_queue, chk, sctp_next); stcb->asoc.ctrl_queue_cnt++; return (ret_code); } /* * Handle a router or endpoints report of a packet loss, there are two ways * to handle this, either we get the whole packet and must disect it * ourselves (possibly with truncation and or corruption) or it is a summary * from a middle box that did the disectting for us. */ static void sctp_handle_packet_dropped(struct sctp_pktdrop_chunk *cp, struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t limit) { struct sctp_chunk_desc desc; struct sctp_chunkhdr *chk_hdr; struct sctp_data_chunk *data_chunk; struct sctp_idata_chunk *idata_chunk; uint32_t bottle_bw, on_queue; uint32_t offset, chk_len; uint16_t trunc_len; uint16_t pktdrp_len; uint8_t pktdrp_flags; KASSERT(sizeof(struct sctp_pktdrop_chunk) <= limit, ("PKTDROP chunk too small")); pktdrp_flags = cp->ch.chunk_flags; pktdrp_len = ntohs(cp->ch.chunk_length); KASSERT(limit <= pktdrp_len, ("Inconsistent limit")); if (pktdrp_flags & SCTP_PACKET_TRUNCATED) { trunc_len = ntohs(cp->trunc_len); if (trunc_len <= pktdrp_len - sizeof(struct sctp_pktdrop_chunk)) { /* The peer plays games with us. */ return; } } else { trunc_len = 0; } limit -= sizeof(struct sctp_pktdrop_chunk); offset = 0; if (offset == limit) { if (pktdrp_flags & SCTP_FROM_MIDDLE_BOX) { SCTP_STAT_INCR(sctps_pdrpbwrpt); } } else if (offset + sizeof(struct sctphdr) > limit) { /* Only a partial SCTP common header. */ SCTP_STAT_INCR(sctps_pdrpcrupt); offset = limit; } else { /* XXX: Check embedded SCTP common header. */ offset += sizeof(struct sctphdr); } /* Now parse through the chunks themselves. */ while (offset < limit) { if (offset + sizeof(struct sctp_chunkhdr) > limit) { SCTP_STAT_INCR(sctps_pdrpcrupt); break; } chk_hdr = (struct sctp_chunkhdr *)(cp->data + offset); desc.chunk_type = chk_hdr->chunk_type; /* get amount we need to move */ chk_len = (uint32_t)ntohs(chk_hdr->chunk_length); if (chk_len < sizeof(struct sctp_chunkhdr)) { /* Someone is lying... */ break; } if (desc.chunk_type == SCTP_DATA) { if (stcb->asoc.idata_supported) { /* Some is playing games with us. */ break; } if (chk_len <= sizeof(struct sctp_data_chunk)) { /* Some is playing games with us. */ break; } if (chk_len < sizeof(struct sctp_data_chunk) + SCTP_NUM_DB_TO_VERIFY) { /* * Not enough data bytes available in the * chunk. */ SCTP_STAT_INCR(sctps_pdrpnedat); goto next_chunk; } if (offset + sizeof(struct sctp_data_chunk) + SCTP_NUM_DB_TO_VERIFY > limit) { /* Not enough data in buffer. */ break; } data_chunk = (struct sctp_data_chunk *)(cp->data + offset); memcpy(desc.data_bytes, data_chunk + 1, SCTP_NUM_DB_TO_VERIFY); desc.tsn_ifany = data_chunk->dp.tsn; if (pktdrp_flags & SCTP_FROM_MIDDLE_BOX) { SCTP_STAT_INCR(sctps_pdrpmbda); } } else if (desc.chunk_type == SCTP_IDATA) { if (!stcb->asoc.idata_supported) { /* Some is playing games with us. */ break; } if (chk_len <= sizeof(struct sctp_idata_chunk)) { /* Some is playing games with us. */ break; } if (chk_len < sizeof(struct sctp_idata_chunk) + SCTP_NUM_DB_TO_VERIFY) { /* * Not enough data bytes available in the * chunk. */ SCTP_STAT_INCR(sctps_pdrpnedat); goto next_chunk; } if (offset + sizeof(struct sctp_idata_chunk) + SCTP_NUM_DB_TO_VERIFY > limit) { /* Not enough data in buffer. */ break; } idata_chunk = (struct sctp_idata_chunk *)(cp->data + offset); memcpy(desc.data_bytes, idata_chunk + 1, SCTP_NUM_DB_TO_VERIFY); desc.tsn_ifany = idata_chunk->dp.tsn; if (pktdrp_flags & SCTP_FROM_MIDDLE_BOX) { SCTP_STAT_INCR(sctps_pdrpmbda); } } else { if (pktdrp_flags & SCTP_FROM_MIDDLE_BOX) { SCTP_STAT_INCR(sctps_pdrpmbct); } } if (process_chunk_drop(stcb, &desc, net, pktdrp_flags)) { SCTP_STAT_INCR(sctps_pdrppdbrk); break; } next_chunk: offset += SCTP_SIZE32(chk_len); } /* Now update any rwnd --- possibly */ if ((pktdrp_flags & SCTP_FROM_MIDDLE_BOX) == 0) { /* From a peer, we get a rwnd report */ uint32_t a_rwnd; SCTP_STAT_INCR(sctps_pdrpfehos); bottle_bw = ntohl(cp->bottle_bw); on_queue = ntohl(cp->current_onq); if (bottle_bw && on_queue) { /* a rwnd report is in here */ if (bottle_bw > on_queue) a_rwnd = bottle_bw - on_queue; else a_rwnd = 0; if (a_rwnd == 0) stcb->asoc.peers_rwnd = 0; else { if (a_rwnd > stcb->asoc.total_flight) { stcb->asoc.peers_rwnd = a_rwnd - stcb->asoc.total_flight; } else { stcb->asoc.peers_rwnd = 0; } if (stcb->asoc.peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) { /* SWS sender side engages */ stcb->asoc.peers_rwnd = 0; } } } } else { SCTP_STAT_INCR(sctps_pdrpfmbox); } /* now middle boxes in sat networks get a cwnd bump */ if ((pktdrp_flags & SCTP_FROM_MIDDLE_BOX) && (stcb->asoc.sat_t3_loss_recovery == 0) && (stcb->asoc.sat_network)) { /* * This is debatable but for sat networks it makes sense * Note if a T3 timer has went off, we will prohibit any * changes to cwnd until we exit the t3 loss recovery. */ stcb->asoc.cc_functions.sctp_cwnd_update_after_packet_dropped(stcb, net, cp, &bottle_bw, &on_queue); } } /* * handles all control chunks in a packet inputs: - m: mbuf chain, assumed to * still contain IP/SCTP header - stcb: is the tcb found for this packet - * offset: offset into the mbuf chain to first chunkhdr - length: is the * length of the complete packet outputs: - length: modified to remaining * length after control processing - netp: modified to new sctp_nets after * cookie-echo processing - return NULL to discard the packet (ie. no asoc, * bad packet,...) otherwise return the tcb for this packet */ #ifdef __GNUC__ __attribute__((noinline)) #endif static struct sctp_tcb * sctp_process_control(struct mbuf *m, int iphlen, int *offset, int length, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_chunkhdr *ch, struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets **netp, int *fwd_tsn_seen, uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum, uint32_t vrf_id, uint16_t port) { struct sctp_association *asoc; struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; uint32_t vtag_in; int num_chunks = 0; /* number of control chunks processed */ uint32_t chk_length, contiguous; int ret; int abort_no_unlock = 0; int ecne_seen = 0; /* * How big should this be, and should it be alloc'd? Lets try the * d-mtu-ceiling for now (2k) and that should hopefully work ... * until we get into jumbo grams and such.. */ uint8_t chunk_buf[SCTP_CHUNK_BUFFER_SIZE]; int got_auth = 0; uint32_t auth_offset = 0, auth_len = 0; int auth_skipped = 0; int asconf_cnt = 0; SCTPDBG(SCTP_DEBUG_INPUT1, "sctp_process_control: iphlen=%u, offset=%u, length=%u stcb:%p\n", iphlen, *offset, length, (void *)stcb); if (stcb) { SCTP_TCB_LOCK_ASSERT(stcb); } /* validate chunk header length... */ if (ntohs(ch->chunk_length) < sizeof(*ch)) { SCTPDBG(SCTP_DEBUG_INPUT1, "Invalid header length %d\n", ntohs(ch->chunk_length)); *offset = length; return (stcb); } /* * validate the verification tag */ vtag_in = ntohl(sh->v_tag); if (ch->chunk_type == SCTP_INITIATION) { SCTPDBG(SCTP_DEBUG_INPUT1, "Its an INIT of len:%d vtag:%x\n", ntohs(ch->chunk_length), vtag_in); if (vtag_in != 0) { /* protocol error- silently discard... */ SCTP_STAT_INCR(sctps_badvtag); if (stcb != NULL) { SCTP_TCB_UNLOCK(stcb); } return (NULL); } } else if (ch->chunk_type != SCTP_COOKIE_ECHO) { /* * If there is no stcb, skip the AUTH chunk and process * later after a stcb is found (to validate the lookup was * valid. */ if ((ch->chunk_type == SCTP_AUTHENTICATION) && (stcb == NULL) && (inp->auth_supported == 1)) { /* save this chunk for later processing */ auth_skipped = 1; auth_offset = *offset; auth_len = ntohs(ch->chunk_length); /* (temporarily) move past this chunk */ *offset += SCTP_SIZE32(auth_len); if (*offset >= length) { /* no more data left in the mbuf chain */ *offset = length; return (NULL); } ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset, sizeof(struct sctp_chunkhdr), chunk_buf); } if (ch == NULL) { /* Help */ *offset = length; return (stcb); } if (ch->chunk_type == SCTP_COOKIE_ECHO) { goto process_control_chunks; } /* * first check if it's an ASCONF with an unknown src addr we * need to look inside to find the association */ if (ch->chunk_type == SCTP_ASCONF && stcb == NULL) { struct sctp_chunkhdr *asconf_ch = ch; uint32_t asconf_offset = 0, asconf_len = 0; /* inp's refcount may be reduced */ SCTP_INP_INCR_REF(inp); asconf_offset = *offset; do { asconf_len = ntohs(asconf_ch->chunk_length); if (asconf_len < sizeof(struct sctp_asconf_paramhdr)) break; stcb = sctp_findassociation_ep_asconf(m, *offset, dst, sh, &inp, netp, vrf_id); if (stcb != NULL) break; asconf_offset += SCTP_SIZE32(asconf_len); asconf_ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, asconf_offset, sizeof(struct sctp_chunkhdr), chunk_buf); } while (asconf_ch != NULL && asconf_ch->chunk_type == SCTP_ASCONF); if (stcb == NULL) { /* * reduce inp's refcount if not reduced in * sctp_findassociation_ep_asconf(). */ SCTP_INP_DECR_REF(inp); } /* now go back and verify any auth chunk to be sure */ if (auth_skipped && (stcb != NULL)) { struct sctp_auth_chunk *auth; if (auth_len <= SCTP_CHUNK_BUFFER_SIZE) { auth = (struct sctp_auth_chunk *)sctp_m_getptr(m, auth_offset, auth_len, chunk_buf); got_auth = 1; auth_skipped = 0; } else { auth = NULL; } if ((auth == NULL) || sctp_handle_auth(stcb, auth, m, auth_offset)) { /* auth HMAC failed so dump it */ *offset = length; return (stcb); } else { /* remaining chunks are HMAC checked */ stcb->asoc.authenticated = 1; } } } if (stcb == NULL) { SCTP_SNPRINTF(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); /* no association, so it's out of the blue... */ sctp_handle_ootb(m, iphlen, *offset, src, dst, sh, inp, op_err, mflowtype, mflowid, inp->fibnum, vrf_id, port); *offset = length; return (NULL); } asoc = &stcb->asoc; /* ABORT and SHUTDOWN can use either v_tag... */ if ((ch->chunk_type == SCTP_ABORT_ASSOCIATION) || (ch->chunk_type == SCTP_SHUTDOWN_COMPLETE) || (ch->chunk_type == SCTP_PACKET_DROPPED)) { /* Take the T-bit always into account. */ if ((((ch->chunk_flags & SCTP_HAD_NO_TCB) == 0) && (vtag_in == asoc->my_vtag)) || (((ch->chunk_flags & SCTP_HAD_NO_TCB) == SCTP_HAD_NO_TCB) && (asoc->peer_vtag != htonl(0)) && (vtag_in == asoc->peer_vtag))) { /* this is valid */ } else { /* drop this packet... */ SCTP_STAT_INCR(sctps_badvtag); if (stcb != NULL) { SCTP_TCB_UNLOCK(stcb); } return (NULL); } } else if (ch->chunk_type == SCTP_SHUTDOWN_ACK) { if (vtag_in != asoc->my_vtag) { /* * this could be a stale SHUTDOWN-ACK or the * peer never got the SHUTDOWN-COMPLETE and * is still hung; we have started a new asoc * but it won't complete until the shutdown * is completed */ if (stcb != NULL) { SCTP_TCB_UNLOCK(stcb); } SCTP_SNPRINTF(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); sctp_handle_ootb(m, iphlen, *offset, src, dst, sh, inp, op_err, mflowtype, mflowid, fibnum, vrf_id, port); return (NULL); } } else { /* for all other chunks, vtag must match */ if (vtag_in != asoc->my_vtag) { /* invalid vtag... */ SCTPDBG(SCTP_DEBUG_INPUT3, "invalid vtag: %xh, expect %xh\n", vtag_in, asoc->my_vtag); SCTP_STAT_INCR(sctps_badvtag); if (stcb != NULL) { SCTP_TCB_UNLOCK(stcb); } *offset = length; return (NULL); } } } /* end if !SCTP_COOKIE_ECHO */ /* * process all control chunks... */ if (((ch->chunk_type == SCTP_SELECTIVE_ACK) || (ch->chunk_type == SCTP_NR_SELECTIVE_ACK) || (ch->chunk_type == SCTP_HEARTBEAT_REQUEST)) && (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { /* implied cookie-ack.. we must have lost the ack */ sctp_handle_cookie_ack((struct sctp_cookie_ack_chunk *)ch, stcb, *netp); } process_control_chunks: while (IS_SCTP_CONTROL(ch)) { /* validate chunk length */ chk_length = ntohs(ch->chunk_length); SCTPDBG(SCTP_DEBUG_INPUT2, "sctp_process_control: processing a chunk type=%u, len=%u\n", ch->chunk_type, chk_length); SCTP_LTRACE_CHK(inp, stcb, ch->chunk_type, chk_length); if (chk_length < sizeof(*ch) || (*offset + (int)chk_length) > length) { *offset = length; return (stcb); } SCTP_STAT_INCR_COUNTER64(sctps_incontrolchunks); /* * INIT and INIT-ACK only gets the init ack "header" portion * only because we don't have to process the peer's COOKIE. * All others get a complete chunk. */ switch (ch->chunk_type) { case SCTP_INITIATION: contiguous = sizeof(struct sctp_init_chunk); break; case SCTP_INITIATION_ACK: contiguous = sizeof(struct sctp_init_ack_chunk); break; default: contiguous = min(chk_length, sizeof(chunk_buf)); break; } ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset, contiguous, chunk_buf); if (ch == NULL) { *offset = length; if (stcb != NULL) { SCTP_TCB_UNLOCK(stcb); } return (NULL); } num_chunks++; /* Save off the last place we got a control from */ if (stcb != NULL) { if (((netp != NULL) && (*netp != NULL)) || (ch->chunk_type == SCTP_ASCONF)) { /* * allow last_control to be NULL if * ASCONF... ASCONF processing will find the * right net later */ if ((netp != NULL) && (*netp != NULL)) stcb->asoc.last_control_chunk_from = *netp; } } #ifdef SCTP_AUDITING_ENABLED sctp_audit_log(0xB0, ch->chunk_type); #endif /* check to see if this chunk required auth, but isn't */ if ((stcb != NULL) && sctp_auth_is_required_chunk(ch->chunk_type, stcb->asoc.local_auth_chunks) && !stcb->asoc.authenticated) { /* "silently" ignore */ SCTP_STAT_INCR(sctps_recvauthmissing); goto next_chunk; } switch (ch->chunk_type) { case SCTP_INITIATION: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_INIT\n"); /* The INIT chunk must be the only chunk. */ if ((num_chunks > 1) || (length - *offset > (int)SCTP_SIZE32(chk_length))) { /* RFC 4960 requires that no ABORT is sent */ *offset = length; if (stcb != NULL) { SCTP_TCB_UNLOCK(stcb); } return (NULL); } /* Honor our resource limit. */ if (chk_length > SCTP_LARGEST_INIT_ACCEPTED) { op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); *offset = length; return (NULL); } sctp_handle_init(m, iphlen, *offset, src, dst, sh, (struct sctp_init_chunk *)ch, inp, stcb, *netp, &abort_no_unlock, mflowtype, mflowid, vrf_id, port); *offset = length; if ((!abort_no_unlock) && (stcb != NULL)) { SCTP_TCB_UNLOCK(stcb); } return (NULL); break; case SCTP_PAD_CHUNK: break; case SCTP_INITIATION_ACK: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_INIT_ACK\n"); if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { /* We are not interested anymore */ if ((stcb != NULL) && (stcb->asoc.total_output_queue_size)) { ; } else { *offset = length; if (stcb != NULL) { (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_29); } return (NULL); } } /* The INIT-ACK chunk must be the only chunk. */ if ((num_chunks > 1) || (length - *offset > (int)SCTP_SIZE32(chk_length))) { *offset = length; return (stcb); } if ((netp != NULL) && (*netp != NULL)) { ret = sctp_handle_init_ack(m, iphlen, *offset, src, dst, sh, (struct sctp_init_ack_chunk *)ch, stcb, *netp, &abort_no_unlock, mflowtype, mflowid, vrf_id); } else { ret = -1; } *offset = length; if (abort_no_unlock) { return (NULL); } /* * Special case, I must call the output routine to * get the cookie echoed */ if ((stcb != NULL) && (ret == 0)) { sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_CONTROL_PROC, SCTP_SO_NOT_LOCKED); } return (stcb); break; case SCTP_SELECTIVE_ACK: case SCTP_NR_SELECTIVE_ACK: { int abort_now = 0; uint32_t a_rwnd, cum_ack; uint16_t num_seg, num_nr_seg, num_dup; uint8_t flags; int offset_seg, offset_dup; SCTPDBG(SCTP_DEBUG_INPUT3, "%s\n", ch->chunk_type == SCTP_SELECTIVE_ACK ? "SCTP_SACK" : "SCTP_NR_SACK"); SCTP_STAT_INCR(sctps_recvsacks); if (stcb == NULL) { SCTPDBG(SCTP_DEBUG_INDATA1, "No stcb when processing %s chunk\n", (ch->chunk_type == SCTP_SELECTIVE_ACK) ? "SCTP_SACK" : "SCTP_NR_SACK"); break; } if (ch->chunk_type == SCTP_SELECTIVE_ACK) { if (chk_length < sizeof(struct sctp_sack_chunk)) { SCTPDBG(SCTP_DEBUG_INDATA1, "Bad size on SACK chunk, too small\n"); break; } } else { if (stcb->asoc.nrsack_supported == 0) { goto unknown_chunk; } if (chk_length < sizeof(struct sctp_nr_sack_chunk)) { SCTPDBG(SCTP_DEBUG_INDATA1, "Bad size on NR_SACK chunk, too small\n"); break; } } if (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_ACK_SENT) { /*- * If we have sent a shutdown-ack, we will pay no * attention to a sack sent in to us since * we don't care anymore. */ break; } flags = ch->chunk_flags; if (ch->chunk_type == SCTP_SELECTIVE_ACK) { struct sctp_sack_chunk *sack; sack = (struct sctp_sack_chunk *)ch; cum_ack = ntohl(sack->sack.cum_tsn_ack); num_seg = ntohs(sack->sack.num_gap_ack_blks); num_nr_seg = 0; num_dup = ntohs(sack->sack.num_dup_tsns); a_rwnd = ntohl(sack->sack.a_rwnd); if (sizeof(struct sctp_sack_chunk) + num_seg * sizeof(struct sctp_gap_ack_block) + num_dup * sizeof(uint32_t) != chk_length) { SCTPDBG(SCTP_DEBUG_INDATA1, "Bad size of SACK chunk\n"); break; } offset_seg = *offset + sizeof(struct sctp_sack_chunk); offset_dup = offset_seg + num_seg * sizeof(struct sctp_gap_ack_block); } else { struct sctp_nr_sack_chunk *nr_sack; nr_sack = (struct sctp_nr_sack_chunk *)ch; cum_ack = ntohl(nr_sack->nr_sack.cum_tsn_ack); num_seg = ntohs(nr_sack->nr_sack.num_gap_ack_blks); num_nr_seg = ntohs(nr_sack->nr_sack.num_nr_gap_ack_blks); num_dup = ntohs(nr_sack->nr_sack.num_dup_tsns); a_rwnd = ntohl(nr_sack->nr_sack.a_rwnd); if (sizeof(struct sctp_nr_sack_chunk) + (num_seg + num_nr_seg) * sizeof(struct sctp_gap_ack_block) + num_dup * sizeof(uint32_t) != chk_length) { SCTPDBG(SCTP_DEBUG_INDATA1, "Bad size of NR_SACK chunk\n"); break; } offset_seg = *offset + sizeof(struct sctp_nr_sack_chunk); offset_dup = offset_seg + (num_seg + num_nr_seg) * sizeof(struct sctp_gap_ack_block); } SCTPDBG(SCTP_DEBUG_INPUT3, "%s process cum_ack:%x num_seg:%d a_rwnd:%d\n", (ch->chunk_type == SCTP_SELECTIVE_ACK) ? "SCTP_SACK" : "SCTP_NR_SACK", cum_ack, num_seg, a_rwnd); stcb->asoc.seen_a_sack_this_pkt = 1; if ((stcb->asoc.pr_sctp_cnt == 0) && (num_seg == 0) && (num_nr_seg == 0) && SCTP_TSN_GE(cum_ack, stcb->asoc.last_acked_seq) && (stcb->asoc.saw_sack_with_frags == 0) && (stcb->asoc.saw_sack_with_nr_frags == 0) && (!TAILQ_EMPTY(&stcb->asoc.sent_queue))) { /* * We have a SIMPLE sack having no * prior segments and data on sent * queue to be acked. Use the faster * path sack processing. We also * allow window update sacks with no * missing segments to go this way * too. */ sctp_express_handle_sack(stcb, cum_ack, a_rwnd, &abort_now, ecne_seen); } else { if ((netp != NULL) && (*netp != NULL)) { sctp_handle_sack(m, offset_seg, offset_dup, stcb, num_seg, num_nr_seg, num_dup, &abort_now, flags, cum_ack, a_rwnd, ecne_seen); } } if (abort_now) { /* ABORT signal from sack processing */ *offset = length; return (NULL); } if (TAILQ_EMPTY(&stcb->asoc.send_queue) && TAILQ_EMPTY(&stcb->asoc.sent_queue) && (stcb->asoc.stream_queue_cnt == 0)) { sctp_ulp_notify(SCTP_NOTIFY_SENDER_DRY, stcb, 0, NULL, SCTP_SO_NOT_LOCKED); } break; } case SCTP_HEARTBEAT_REQUEST: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_HEARTBEAT\n"); if ((stcb != NULL) && (netp != NULL) && (*netp != NULL)) { SCTP_STAT_INCR(sctps_recvheartbeat); sctp_send_heartbeat_ack(stcb, m, *offset, chk_length, *netp); } break; case SCTP_HEARTBEAT_ACK: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_HEARTBEAT_ACK\n"); if ((stcb == NULL) || (chk_length != sizeof(struct sctp_heartbeat_chunk))) { /* Its not ours */ *offset = length; return (stcb); } SCTP_STAT_INCR(sctps_recvheartbeatack); if ((netp != NULL) && (*netp != NULL)) { sctp_handle_heartbeat_ack((struct sctp_heartbeat_chunk *)ch, stcb, *netp); } break; case SCTP_ABORT_ASSOCIATION: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ABORT, stcb %p\n", (void *)stcb); *offset = length; if ((stcb != NULL) && (netp != NULL) && (*netp != NULL)) { if (sctp_handle_abort((struct sctp_abort_chunk *)ch, stcb, *netp)) { return (NULL); } else { return (stcb); } } else { return (NULL); } break; case SCTP_SHUTDOWN: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SHUTDOWN, stcb %p\n", (void *)stcb); if ((stcb == NULL) || (chk_length != sizeof(struct sctp_shutdown_chunk))) { *offset = length; return (stcb); } if ((netp != NULL) && (*netp != NULL)) { int abort_flag = 0; sctp_handle_shutdown((struct sctp_shutdown_chunk *)ch, stcb, *netp, &abort_flag); if (abort_flag) { *offset = length; return (NULL); } } break; case SCTP_SHUTDOWN_ACK: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SHUTDOWN_ACK, stcb %p\n", (void *)stcb); if ((stcb != NULL) && (netp != NULL) && (*netp != NULL)) { sctp_handle_shutdown_ack((struct sctp_shutdown_ack_chunk *)ch, stcb, *netp); } *offset = length; return (NULL); break; case SCTP_OPERATION_ERROR: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_OP_ERR\n"); if ((stcb != NULL) && (netp != NULL) && (*netp != NULL) && sctp_handle_error(ch, stcb, *netp, contiguous) < 0) { *offset = length; return (NULL); } break; case SCTP_COOKIE_ECHO: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_COOKIE_ECHO, stcb %p\n", (void *)stcb); if ((stcb != NULL) && (stcb->asoc.total_output_queue_size > 0)) { ; } else { if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { /* We are not interested anymore */ abend: if (stcb != NULL) { SCTP_TCB_UNLOCK(stcb); } *offset = length; return (NULL); } } /*- * First are we accepting? We do this again here * since it is possible that a previous endpoint WAS * listening responded to a INIT-ACK and then * closed. We opened and bound.. and are now no * longer listening. * * XXXGL: notes on checking listen queue length. * 1) SCTP_IS_LISTENING() doesn't necessarily mean * SOLISTENING(), because a listening "UDP type" * socket isn't listening in terms of the socket * layer. It is a normal data flow socket, that * can fork off new connections. Thus, we should * look into sol_qlen only in case we are !UDP. * 2) Checking sol_qlen in general requires locking * the socket, and this code lacks that. */ if ((stcb == NULL) && (!SCTP_IS_LISTENING(inp) || (!(inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) && inp->sctp_socket->sol_qlen >= inp->sctp_socket->sol_qlimit))) { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && (SCTP_BASE_SYSCTL(sctp_abort_if_one_2_one_hits_limit))) { op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); sctp_abort_association(inp, stcb, m, iphlen, src, dst, sh, op_err, mflowtype, mflowid, vrf_id, port); } *offset = length; return (NULL); } else { struct mbuf *ret_buf; struct sctp_inpcb *linp; struct sctp_tmit_chunk *chk; if (stcb) { linp = NULL; } else { linp = inp; } if (linp != NULL) { SCTP_ASOC_CREATE_LOCK(linp); if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE)) { SCTP_ASOC_CREATE_UNLOCK(linp); goto abend; } } if (netp != NULL) { struct sctp_tcb *locked_stcb; locked_stcb = stcb; ret_buf = sctp_handle_cookie_echo(m, iphlen, *offset, src, dst, sh, (struct sctp_cookie_echo_chunk *)ch, &inp, &stcb, netp, auth_skipped, auth_offset, auth_len, &locked_stcb, mflowtype, mflowid, vrf_id, port); if ((locked_stcb != NULL) && (locked_stcb != stcb)) { SCTP_TCB_UNLOCK(locked_stcb); } if (stcb != NULL) { SCTP_TCB_LOCK_ASSERT(stcb); } } else { ret_buf = NULL; } if (linp != NULL) { SCTP_ASOC_CREATE_UNLOCK(linp); } if (ret_buf == NULL) { if (stcb != NULL) { SCTP_TCB_UNLOCK(stcb); } SCTPDBG(SCTP_DEBUG_INPUT3, "GAK, null buffer\n"); *offset = length; return (NULL); } /* if AUTH skipped, see if it verified... */ if (auth_skipped) { got_auth = 1; auth_skipped = 0; } /* Restart the timer if we have pending data */ TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) { if (chk->whoTo != NULL) { break; } } if (chk != NULL) { sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, chk->whoTo); } } break; case SCTP_COOKIE_ACK: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_COOKIE_ACK, stcb %p\n", (void *)stcb); if ((stcb == NULL) || chk_length != sizeof(struct sctp_cookie_ack_chunk)) { return (stcb); } if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { /* We are not interested anymore */ if ((stcb) && (stcb->asoc.total_output_queue_size)) { ; } else if (stcb) { (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_30); *offset = length; return (NULL); } } if ((netp != NULL) && (*netp != NULL)) { sctp_handle_cookie_ack((struct sctp_cookie_ack_chunk *)ch, stcb, *netp); } break; case SCTP_ECN_ECHO: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ECN_ECHO\n"); if ((stcb == NULL) || (chk_length != sizeof(struct sctp_ecne_chunk))) { /* Its not ours */ *offset = length; return (stcb); } if (stcb->asoc.ecn_supported == 0) { goto unknown_chunk; } sctp_handle_ecn_echo((struct sctp_ecne_chunk *)ch, stcb); ecne_seen = 1; break; case SCTP_ECN_CWR: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ECN_CWR\n"); if ((stcb == NULL) || (chk_length != sizeof(struct sctp_cwr_chunk))) { *offset = length; return (stcb); } if (stcb->asoc.ecn_supported == 0) { goto unknown_chunk; } sctp_handle_ecn_cwr((struct sctp_cwr_chunk *)ch, stcb, *netp); break; case SCTP_SHUTDOWN_COMPLETE: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_SHUTDOWN_COMPLETE, stcb %p\n", (void *)stcb); /* must be first and only chunk */ if ((num_chunks > 1) || (length - *offset > (int)SCTP_SIZE32(chk_length))) { *offset = length; return (stcb); } if ((stcb != NULL) && (netp != NULL) && (*netp != NULL)) { sctp_handle_shutdown_complete((struct sctp_shutdown_complete_chunk *)ch, stcb, *netp); } *offset = length; return (NULL); break; case SCTP_ASCONF: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ASCONF\n"); if (stcb != NULL) { if (stcb->asoc.asconf_supported == 0) { goto unknown_chunk; } sctp_handle_asconf(m, *offset, src, (struct sctp_asconf_chunk *)ch, stcb, asconf_cnt == 0); asconf_cnt++; } break; case SCTP_ASCONF_ACK: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_ASCONF_ACK\n"); if (chk_length < sizeof(struct sctp_asconf_ack_chunk)) { /* Its not ours */ *offset = length; return (stcb); } if ((stcb != NULL) && (netp != NULL) && (*netp != NULL)) { if (stcb->asoc.asconf_supported == 0) { goto unknown_chunk; } /* He's alive so give him credit */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_THRESHOLD_LOGGING) { sctp_misc_ints(SCTP_THRESHOLD_CLEAR, stcb->asoc.overall_error_count, 0, SCTP_FROM_SCTP_INPUT, __LINE__); } stcb->asoc.overall_error_count = 0; sctp_handle_asconf_ack(m, *offset, (struct sctp_asconf_ack_chunk *)ch, stcb, *netp, &abort_no_unlock); if (abort_no_unlock) return (NULL); } break; case SCTP_FORWARD_CUM_TSN: case SCTP_IFORWARD_CUM_TSN: SCTPDBG(SCTP_DEBUG_INPUT3, "%s\n", ch->chunk_type == SCTP_FORWARD_CUM_TSN ? "FORWARD_TSN" : "I_FORWARD_TSN"); if (chk_length < sizeof(struct sctp_forward_tsn_chunk)) { /* Its not ours */ *offset = length; return (stcb); } if (stcb != NULL) { int abort_flag = 0; if (stcb->asoc.prsctp_supported == 0) { goto unknown_chunk; } if (((stcb->asoc.idata_supported == 1) && (ch->chunk_type == SCTP_FORWARD_CUM_TSN)) || ((stcb->asoc.idata_supported == 0) && (ch->chunk_type == SCTP_IFORWARD_CUM_TSN))) { if (ch->chunk_type == SCTP_FORWARD_CUM_TSN) { SCTP_SNPRINTF(msg, sizeof(msg), "%s", "FORWARD-TSN chunk received when I-FORWARD-TSN was negotiated"); } else { SCTP_SNPRINTF(msg, sizeof(msg), "%s", "I-FORWARD-TSN chunk received when FORWARD-TSN was negotiated"); } op_err = sctp_generate_cause(SCTP_CAUSE_PROTOCOL_VIOLATION, msg); sctp_abort_an_association(inp, stcb, op_err, SCTP_SO_NOT_LOCKED); *offset = length; return (NULL); } *fwd_tsn_seen = 1; if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { /* We are not interested anymore */ (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTP_INPUT + SCTP_LOC_31); *offset = length; return (NULL); } /* * For sending a SACK this looks like DATA * chunks. */ stcb->asoc.last_data_chunk_from = stcb->asoc.last_control_chunk_from; sctp_handle_forward_tsn(stcb, (struct sctp_forward_tsn_chunk *)ch, &abort_flag, m, *offset); if (abort_flag) { *offset = length; return (NULL); } } break; case SCTP_STREAM_RESET: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_STREAM_RESET\n"); if ((stcb == NULL) || (chk_length < sizeof(struct sctp_stream_reset_tsn_req))) { /* Its not ours */ *offset = length; return (stcb); } if (stcb->asoc.reconfig_supported == 0) { goto unknown_chunk; } if (sctp_handle_stream_reset(stcb, m, *offset, ch)) { /* stop processing */ *offset = length; return (NULL); } break; case SCTP_PACKET_DROPPED: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_PACKET_DROPPED\n"); /* re-get it all please */ if (chk_length < sizeof(struct sctp_pktdrop_chunk)) { /* Its not ours */ *offset = length; return (stcb); } if ((stcb != NULL) && (netp != NULL) && (*netp != NULL)) { if (stcb->asoc.pktdrop_supported == 0) { goto unknown_chunk; } sctp_handle_packet_dropped((struct sctp_pktdrop_chunk *)ch, stcb, *netp, min(chk_length, contiguous)); } break; case SCTP_AUTHENTICATION: SCTPDBG(SCTP_DEBUG_INPUT3, "SCTP_AUTHENTICATION\n"); if (stcb == NULL) { /* save the first AUTH for later processing */ if (auth_skipped == 0) { auth_offset = *offset; auth_len = chk_length; auth_skipped = 1; } /* skip this chunk (temporarily) */ goto next_chunk; } if (stcb->asoc.auth_supported == 0) { goto unknown_chunk; } if ((chk_length < (sizeof(struct sctp_auth_chunk))) || (chk_length > (sizeof(struct sctp_auth_chunk) + SCTP_AUTH_DIGEST_LEN_MAX))) { /* Its not ours */ *offset = length; return (stcb); } if (got_auth == 1) { /* skip this chunk... it's already auth'd */ goto next_chunk; } got_auth = 1; if (sctp_handle_auth(stcb, (struct sctp_auth_chunk *)ch, m, *offset)) { /* auth HMAC failed so dump the packet */ *offset = length; return (stcb); } else { /* remaining chunks are HMAC checked */ stcb->asoc.authenticated = 1; } break; default: unknown_chunk: /* it's an unknown chunk! */ if ((ch->chunk_type & 0x40) && (stcb != NULL) && (SCTP_GET_STATE(stcb) != SCTP_STATE_EMPTY) && (SCTP_GET_STATE(stcb) != SCTP_STATE_INUSE) && (SCTP_GET_STATE(stcb) != SCTP_STATE_COOKIE_WAIT)) { struct sctp_gen_error_cause *cause; int len; op_err = sctp_get_mbuf_for_msg(sizeof(struct sctp_gen_error_cause), 0, M_NOWAIT, 1, MT_DATA); if (op_err != NULL) { len = min(SCTP_SIZE32(chk_length), (uint32_t)(length - *offset)); cause = mtod(op_err, struct sctp_gen_error_cause *); cause->code = htons(SCTP_CAUSE_UNRECOG_CHUNK); cause->length = htons((uint16_t)(len + sizeof(struct sctp_gen_error_cause))); SCTP_BUF_LEN(op_err) = sizeof(struct sctp_gen_error_cause); SCTP_BUF_NEXT(op_err) = SCTP_M_COPYM(m, *offset, len, M_NOWAIT); if (SCTP_BUF_NEXT(op_err) != NULL) { #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(SCTP_BUF_NEXT(op_err), SCTP_MBUF_ICOPY); } #endif sctp_queue_op_err(stcb, op_err); } else { sctp_m_freem(op_err); } } } if ((ch->chunk_type & 0x80) == 0) { /* discard this packet */ *offset = length; return (stcb); } /* else skip this bad chunk and continue... */ break; } /* switch (ch->chunk_type) */ next_chunk: /* get the next chunk */ *offset += SCTP_SIZE32(chk_length); if (*offset >= length) { /* no more data left in the mbuf chain */ break; } ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, *offset, sizeof(struct sctp_chunkhdr), chunk_buf); if (ch == NULL) { *offset = length; return (stcb); } } /* while */ if ((asconf_cnt > 0) && (stcb != NULL)) { sctp_send_asconf_ack(stcb); } return (stcb); } /* * common input chunk processing (v4 and v6) */ void sctp_common_input_processing(struct mbuf **mm, int iphlen, int offset, int length, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_chunkhdr *ch, uint8_t compute_crc, uint8_t ecn_bits, uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum, uint32_t vrf_id, uint16_t port) { uint32_t high_tsn; int fwd_tsn_seen = 0, data_processed = 0; struct mbuf *m = *mm, *op_err; char msg[SCTP_DIAG_INFO_LEN]; int un_sent; int cnt_ctrl_ready = 0; struct sctp_inpcb *inp = NULL, *inp_decr = NULL; struct sctp_tcb *stcb = NULL; struct sctp_nets *net = NULL; SCTP_STAT_INCR(sctps_recvdatagrams); #ifdef SCTP_AUDITING_ENABLED sctp_audit_log(0xE0, 1); sctp_auditing(0, inp, stcb, net); #endif if (compute_crc != 0) { uint32_t check, calc_check; check = sh->checksum; sh->checksum = 0; calc_check = sctp_calculate_cksum(m, iphlen); sh->checksum = check; if (calc_check != check) { SCTPDBG(SCTP_DEBUG_INPUT1, "Bad CSUM on SCTP packet calc_check:%x check:%x m:%p mlen:%d iphlen:%d\n", calc_check, check, (void *)m, length, iphlen); stcb = sctp_findassociation_addr(m, offset, src, dst, sh, ch, &inp, &net, vrf_id); #if defined(INET) || defined(INET6) if ((ch->chunk_type != SCTP_INITIATION) && (net != NULL) && (net->port != port)) { if (net->port == 0) { /* UDP encapsulation turned on. */ net->mtu -= sizeof(struct udphdr); if (stcb->asoc.smallest_mtu > net->mtu) { sctp_pathmtu_adjustment(stcb, net->mtu); } } else if (port == 0) { /* UDP encapsulation turned off. */ net->mtu += sizeof(struct udphdr); /* XXX Update smallest_mtu */ } net->port = port; } #endif if (net != NULL) { net->flowtype = mflowtype; net->flowid = mflowid; } SCTP_PROBE5(receive, NULL, stcb, m, stcb, sh); if ((inp != NULL) && (stcb != NULL)) { sctp_send_packet_dropped(stcb, net, m, length, iphlen, 1); sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_INPUT_ERROR, SCTP_SO_NOT_LOCKED); } else if ((inp != NULL) && (stcb == NULL)) { inp_decr = inp; } SCTP_STAT_INCR(sctps_badsum); SCTP_STAT_INCR_COUNTER32(sctps_checksumerrors); goto out; } } /* Destination port of 0 is illegal, based on RFC4960. */ if (sh->dest_port == 0) { SCTP_STAT_INCR(sctps_hdrops); goto out; } stcb = sctp_findassociation_addr(m, offset, src, dst, sh, ch, &inp, &net, vrf_id); #if defined(INET) || defined(INET6) if ((ch->chunk_type != SCTP_INITIATION) && (net != NULL) && (net->port != port)) { if (net->port == 0) { /* UDP encapsulation turned on. */ net->mtu -= sizeof(struct udphdr); if (stcb->asoc.smallest_mtu > net->mtu) { sctp_pathmtu_adjustment(stcb, net->mtu); } } else if (port == 0) { /* UDP encapsulation turned off. */ net->mtu += sizeof(struct udphdr); /* XXX Update smallest_mtu */ } net->port = port; } #endif if (net != NULL) { net->flowtype = mflowtype; net->flowid = mflowid; } if (inp == NULL) { SCTP_PROBE5(receive, NULL, stcb, m, stcb, sh); SCTP_STAT_INCR(sctps_noport); if (badport_bandlim(BANDLIM_SCTP_OOTB) < 0) { goto out; } if (ch->chunk_type == SCTP_SHUTDOWN_ACK) { sctp_send_shutdown_complete2(src, dst, sh, mflowtype, mflowid, fibnum, vrf_id, port); goto out; } if (ch->chunk_type == SCTP_SHUTDOWN_COMPLETE) { goto out; } if (ch->chunk_type != SCTP_ABORT_ASSOCIATION) { if ((SCTP_BASE_SYSCTL(sctp_blackhole) == 0) || ((SCTP_BASE_SYSCTL(sctp_blackhole) == 1) && (ch->chunk_type != SCTP_INIT))) { op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), "Out of the blue"); sctp_send_abort(m, iphlen, src, dst, sh, 0, op_err, mflowtype, mflowid, fibnum, vrf_id, port); } } goto out; } else if (stcb == NULL) { inp_decr = inp; } SCTPDBG(SCTP_DEBUG_INPUT1, "Ok, Common input processing called, m:%p iphlen:%d offset:%d length:%d stcb:%p\n", (void *)m, iphlen, offset, length, (void *)stcb); if (stcb) { /* always clear this before beginning a packet */ stcb->asoc.authenticated = 0; stcb->asoc.seen_a_sack_this_pkt = 0; SCTPDBG(SCTP_DEBUG_INPUT1, "stcb:%p state:%x\n", (void *)stcb, stcb->asoc.state); if ((stcb->asoc.state & SCTP_STATE_WAS_ABORTED) || (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED)) { /*- * If we hit here, we had a ref count * up when the assoc was aborted and the * timer is clearing out the assoc, we should * NOT respond to any packet.. its OOTB. */ SCTP_TCB_UNLOCK(stcb); stcb = NULL; SCTP_PROBE5(receive, NULL, stcb, m, stcb, sh); SCTP_SNPRINTF(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); sctp_handle_ootb(m, iphlen, offset, src, dst, sh, inp, op_err, mflowtype, mflowid, inp->fibnum, vrf_id, port); goto out; } } if (IS_SCTP_CONTROL(ch)) { /* process the control portion of the SCTP packet */ /* sa_ignore NO_NULL_CHK */ stcb = sctp_process_control(m, iphlen, &offset, length, src, dst, sh, ch, inp, stcb, &net, &fwd_tsn_seen, mflowtype, mflowid, fibnum, vrf_id, port); if (stcb) { /* * This covers us if the cookie-echo was there and * it changes our INP. */ inp = stcb->sctp_ep; #if defined(INET) || defined(INET6) if ((ch->chunk_type != SCTP_INITIATION) && (net != NULL) && (net->port != port)) { if (net->port == 0) { /* UDP encapsulation turned on. */ net->mtu -= sizeof(struct udphdr); if (stcb->asoc.smallest_mtu > net->mtu) { sctp_pathmtu_adjustment(stcb, net->mtu); } } else if (port == 0) { /* UDP encapsulation turned off. */ net->mtu += sizeof(struct udphdr); /* XXX Update smallest_mtu */ } net->port = port; } #endif } } else { /* * no control chunks, so pre-process DATA chunks (these * checks are taken care of by control processing) */ /* * if DATA only packet, and auth is required, then punt... * can't have authenticated without any AUTH (control) * chunks */ if ((stcb != NULL) && sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.local_auth_chunks)) { /* "silently" ignore */ SCTP_PROBE5(receive, NULL, stcb, m, stcb, sh); SCTP_STAT_INCR(sctps_recvauthmissing); goto out; } if (stcb == NULL) { /* out of the blue DATA chunk */ SCTP_PROBE5(receive, NULL, NULL, m, NULL, sh); SCTP_SNPRINTF(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); sctp_handle_ootb(m, iphlen, offset, src, dst, sh, inp, op_err, mflowtype, mflowid, fibnum, vrf_id, port); goto out; } if (stcb->asoc.my_vtag != ntohl(sh->v_tag)) { /* v_tag mismatch! */ SCTP_PROBE5(receive, NULL, stcb, m, stcb, sh); SCTP_STAT_INCR(sctps_badvtag); goto out; } } SCTP_PROBE5(receive, NULL, stcb, m, stcb, sh); if (stcb == NULL) { /* * no valid TCB for this packet, or we found it's a bad * packet while processing control, or we're done with this * packet (done or skip rest of data), so we drop it... */ goto out; } /* * DATA chunk processing */ /* plow through the data chunks while length > offset */ /* * Rest should be DATA only. Check authentication state if AUTH for * DATA is required. */ if ((length > offset) && (stcb != NULL) && sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.local_auth_chunks) && !stcb->asoc.authenticated) { /* "silently" ignore */ SCTP_STAT_INCR(sctps_recvauthmissing); SCTPDBG(SCTP_DEBUG_AUTH1, "Data chunk requires AUTH, skipped\n"); goto trigger_send; } if (length > offset) { int retval; /* * First check to make sure our state is correct. We would * not get here unless we really did have a tag, so we don't * abort if this happens, just dump the chunk silently. */ switch (SCTP_GET_STATE(stcb)) { case SCTP_STATE_COOKIE_ECHOED: /* * we consider data with valid tags in this state * shows us the cookie-ack was lost. Imply it was * there. */ sctp_handle_cookie_ack((struct sctp_cookie_ack_chunk *)ch, stcb, net); break; case SCTP_STATE_COOKIE_WAIT: /* * We consider OOTB any data sent during asoc setup. */ SCTP_SNPRINTF(msg, sizeof(msg), "OOTB, %s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); sctp_handle_ootb(m, iphlen, offset, src, dst, sh, inp, op_err, mflowtype, mflowid, inp->fibnum, vrf_id, port); goto out; /* sa_ignore NOTREACHED */ break; case SCTP_STATE_EMPTY: /* should not happen */ case SCTP_STATE_INUSE: /* should not happen */ case SCTP_STATE_SHUTDOWN_RECEIVED: /* This is a peer error */ case SCTP_STATE_SHUTDOWN_ACK_SENT: default: goto out; /* sa_ignore NOTREACHED */ break; case SCTP_STATE_OPEN: case SCTP_STATE_SHUTDOWN_SENT: break; } /* plow through the data chunks while length > offset */ retval = sctp_process_data(mm, iphlen, &offset, length, inp, stcb, net, &high_tsn); if (retval == 2) { /* * The association aborted, NO UNLOCK needed since * the association is destroyed. */ stcb = NULL; goto out; } data_processed = 1; /* * Anything important needs to have been m_copy'ed in * process_data */ } /* take care of ecn */ if ((data_processed == 1) && (stcb->asoc.ecn_supported == 1) && ((ecn_bits & SCTP_CE_BITS) == SCTP_CE_BITS)) { /* Yep, we need to add a ECNE */ sctp_send_ecn_echo(stcb, net, high_tsn); } if ((data_processed == 0) && (fwd_tsn_seen)) { int was_a_gap; uint32_t highest_tsn; if (SCTP_TSN_GT(stcb->asoc.highest_tsn_inside_nr_map, stcb->asoc.highest_tsn_inside_map)) { highest_tsn = stcb->asoc.highest_tsn_inside_nr_map; } else { highest_tsn = stcb->asoc.highest_tsn_inside_map; } was_a_gap = SCTP_TSN_GT(highest_tsn, stcb->asoc.cumulative_tsn); stcb->asoc.send_sack = 1; sctp_sack_check(stcb, was_a_gap); } else if (fwd_tsn_seen) { stcb->asoc.send_sack = 1; } /* trigger send of any chunks in queue... */ trigger_send: #ifdef SCTP_AUDITING_ENABLED sctp_audit_log(0xE0, 2); sctp_auditing(1, inp, stcb, net); #endif SCTPDBG(SCTP_DEBUG_INPUT1, "Check for chunk output prw:%d tqe:%d tf=%d\n", stcb->asoc.peers_rwnd, TAILQ_EMPTY(&stcb->asoc.control_send_queue), stcb->asoc.total_flight); un_sent = (stcb->asoc.total_output_queue_size - stcb->asoc.total_flight); if (!TAILQ_EMPTY(&stcb->asoc.control_send_queue)) { cnt_ctrl_ready = stcb->asoc.ctrl_queue_cnt - stcb->asoc.ecn_echo_cnt_onq; } if (!TAILQ_EMPTY(&stcb->asoc.asconf_send_queue) || cnt_ctrl_ready || stcb->asoc.trigger_reset || ((un_sent > 0) && (stcb->asoc.peers_rwnd > 0 || stcb->asoc.total_flight == 0))) { SCTPDBG(SCTP_DEBUG_INPUT3, "Calling chunk OUTPUT\n"); sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CONTROL_PROC, SCTP_SO_NOT_LOCKED); SCTPDBG(SCTP_DEBUG_INPUT3, "chunk OUTPUT returns\n"); } #ifdef SCTP_AUDITING_ENABLED sctp_audit_log(0xE0, 3); sctp_auditing(2, inp, stcb, net); #endif out: if (stcb != NULL) { SCTP_TCB_UNLOCK(stcb); } if (inp_decr != NULL) { /* reduce ref-count */ SCTP_INP_WLOCK(inp_decr); SCTP_INP_DECR_REF(inp_decr); SCTP_INP_WUNLOCK(inp_decr); } return; } #ifdef INET void sctp_input_with_port(struct mbuf *i_pak, int off, uint16_t port) { struct mbuf *m; int iphlen; uint32_t vrf_id = 0; uint8_t ecn_bits; struct sockaddr_in src, dst; struct ip *ip; struct sctphdr *sh; struct sctp_chunkhdr *ch; int length, offset; uint8_t compute_crc; uint32_t mflowid; uint8_t mflowtype; uint16_t fibnum; iphlen = off; if (SCTP_GET_PKT_VRFID(i_pak, vrf_id)) { SCTP_RELEASE_PKT(i_pak); return; } m = SCTP_HEADER_TO_CHAIN(i_pak); #ifdef SCTP_MBUF_LOGGING /* Log in any input mbufs */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(m, SCTP_MBUF_INPUT); } #endif #ifdef SCTP_PACKET_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING) { sctp_packet_log(m); } #endif SCTPDBG(SCTP_DEBUG_CRCOFFLOAD, "sctp_input(): Packet of length %d received on %s with csum_flags 0x%b.\n", m->m_pkthdr.len, if_name(m->m_pkthdr.rcvif), (int)m->m_pkthdr.csum_flags, CSUM_BITS); mflowid = m->m_pkthdr.flowid; mflowtype = M_HASHTYPE_GET(m); fibnum = M_GETFIB(m); SCTP_STAT_INCR(sctps_recvpackets); SCTP_STAT_INCR_COUNTER64(sctps_inpackets); /* Get IP, SCTP, and first chunk header together in the first mbuf. */ offset = iphlen + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); if (SCTP_BUF_LEN(m) < offset) { if ((m = m_pullup(m, offset)) == NULL) { SCTP_STAT_INCR(sctps_hdrops); return; } } ip = mtod(m, struct ip *); sh = (struct sctphdr *)((caddr_t)ip + iphlen); ch = (struct sctp_chunkhdr *)((caddr_t)sh + sizeof(struct sctphdr)); offset -= sizeof(struct sctp_chunkhdr); memset(&src, 0, sizeof(struct sockaddr_in)); src.sin_family = AF_INET; src.sin_len = sizeof(struct sockaddr_in); src.sin_port = sh->src_port; src.sin_addr = ip->ip_src; memset(&dst, 0, sizeof(struct sockaddr_in)); dst.sin_family = AF_INET; dst.sin_len = sizeof(struct sockaddr_in); dst.sin_port = sh->dest_port; dst.sin_addr = ip->ip_dst; length = ntohs(ip->ip_len); /* Validate mbuf chain length with IP payload length. */ if (SCTP_HEADER_LEN(m) != length) { SCTPDBG(SCTP_DEBUG_INPUT1, "sctp_input() length:%d reported length:%d\n", length, SCTP_HEADER_LEN(m)); SCTP_STAT_INCR(sctps_hdrops); goto out; } /* SCTP does not allow broadcasts or multicasts */ if (IN_MULTICAST(ntohl(dst.sin_addr.s_addr))) { goto out; } if (SCTP_IS_IT_BROADCAST(dst.sin_addr, m)) { goto out; } ecn_bits = ip->ip_tos; if (m->m_pkthdr.csum_flags & CSUM_SCTP_VALID) { SCTP_STAT_INCR(sctps_recvhwcrc); compute_crc = 0; } else { SCTP_STAT_INCR(sctps_recvswcrc); compute_crc = 1; } sctp_common_input_processing(&m, iphlen, offset, length, (struct sockaddr *)&src, (struct sockaddr *)&dst, sh, ch, compute_crc, ecn_bits, mflowtype, mflowid, fibnum, vrf_id, port); out: if (m) { sctp_m_freem(m); } return; } #if defined(SCTP_MCORE_INPUT) && defined(SMP) extern int *sctp_cpuarry; #endif int sctp_input(struct mbuf **mp, int *offp, int proto SCTP_UNUSED) { struct mbuf *m; int off; m = *mp; off = *offp; #if defined(SCTP_MCORE_INPUT) && defined(SMP) if (mp_ncpus > 1) { struct ip *ip; struct sctphdr *sh; int offset; int cpu_to_use; uint32_t flowid, tag; if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { flowid = m->m_pkthdr.flowid; } else { /* * No flow id built by lower layers fix it so we * create one. */ offset = off + sizeof(struct sctphdr); if (SCTP_BUF_LEN(m) < offset) { if ((m = m_pullup(m, offset)) == NULL) { SCTP_STAT_INCR(sctps_hdrops); return (IPPROTO_DONE); } } ip = mtod(m, struct ip *); sh = (struct sctphdr *)((caddr_t)ip + off); tag = htonl(sh->v_tag); flowid = tag ^ ntohs(sh->dest_port) ^ ntohs(sh->src_port); m->m_pkthdr.flowid = flowid; M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE_HASH); } cpu_to_use = sctp_cpuarry[flowid % mp_ncpus]; sctp_queue_to_mcore(m, off, cpu_to_use); return (IPPROTO_DONE); } #endif sctp_input_with_port(m, off, 0); return (IPPROTO_DONE); } #endif Index: projects/clang1100-import/sys/netinet/sctp_output.c =================================================================== --- projects/clang1100-import/sys/netinet/sctp_output.c (revision 364278) +++ projects/clang1100-import/sys/netinet/sctp_output.c (revision 364279) @@ -1,13818 +1,13830 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #endif #include #include #define SCTP_MAX_GAPS_INARRAY 4 struct sack_track { uint8_t right_edge; /* mergable on the right edge */ uint8_t left_edge; /* mergable on the left edge */ uint8_t num_entries; uint8_t spare; struct sctp_gap_ack_block gaps[SCTP_MAX_GAPS_INARRAY]; }; const struct sack_track sack_array[256] = { {0, 0, 0, 0, /* 0x00 */ {{0, 0}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 1, 0, /* 0x01 */ {{0, 0}, {0, 0}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x02 */ {{1, 1}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 1, 0, /* 0x03 */ {{0, 1}, {0, 0}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x04 */ {{2, 2}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x05 */ {{0, 0}, {2, 2}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x06 */ {{1, 2}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 1, 0, /* 0x07 */ {{0, 2}, {0, 0}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x08 */ {{3, 3}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x09 */ {{0, 0}, {3, 3}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x0a */ {{1, 1}, {3, 3}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x0b */ {{0, 1}, {3, 3}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x0c */ {{2, 3}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x0d */ {{0, 0}, {2, 3}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x0e */ {{1, 3}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 1, 0, /* 0x0f */ {{0, 3}, {0, 0}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x10 */ {{4, 4}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x11 */ {{0, 0}, {4, 4}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x12 */ {{1, 1}, {4, 4}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x13 */ {{0, 1}, {4, 4}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x14 */ {{2, 2}, {4, 4}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x15 */ {{0, 0}, {2, 2}, {4, 4}, {0, 0} } }, {0, 0, 2, 0, /* 0x16 */ {{1, 2}, {4, 4}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x17 */ {{0, 2}, {4, 4}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x18 */ {{3, 4}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x19 */ {{0, 0}, {3, 4}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x1a */ {{1, 1}, {3, 4}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x1b */ {{0, 1}, {3, 4}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x1c */ {{2, 4}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x1d */ {{0, 0}, {2, 4}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x1e */ {{1, 4}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 1, 0, /* 0x1f */ {{0, 4}, {0, 0}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x20 */ {{5, 5}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x21 */ {{0, 0}, {5, 5}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x22 */ {{1, 1}, {5, 5}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x23 */ {{0, 1}, {5, 5}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x24 */ {{2, 2}, {5, 5}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x25 */ {{0, 0}, {2, 2}, {5, 5}, {0, 0} } }, {0, 0, 2, 0, /* 0x26 */ {{1, 2}, {5, 5}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x27 */ {{0, 2}, {5, 5}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x28 */ {{3, 3}, {5, 5}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x29 */ {{0, 0}, {3, 3}, {5, 5}, {0, 0} } }, {0, 0, 3, 0, /* 0x2a */ {{1, 1}, {3, 3}, {5, 5}, {0, 0} } }, {1, 0, 3, 0, /* 0x2b */ {{0, 1}, {3, 3}, {5, 5}, {0, 0} } }, {0, 0, 2, 0, /* 0x2c */ {{2, 3}, {5, 5}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x2d */ {{0, 0}, {2, 3}, {5, 5}, {0, 0} } }, {0, 0, 2, 0, /* 0x2e */ {{1, 3}, {5, 5}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x2f */ {{0, 3}, {5, 5}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x30 */ {{4, 5}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x31 */ {{0, 0}, {4, 5}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x32 */ {{1, 1}, {4, 5}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x33 */ {{0, 1}, {4, 5}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x34 */ {{2, 2}, {4, 5}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x35 */ {{0, 0}, {2, 2}, {4, 5}, {0, 0} } }, {0, 0, 2, 0, /* 0x36 */ {{1, 2}, {4, 5}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x37 */ {{0, 2}, {4, 5}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x38 */ {{3, 5}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x39 */ {{0, 0}, {3, 5}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x3a */ {{1, 1}, {3, 5}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x3b */ {{0, 1}, {3, 5}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x3c */ {{2, 5}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x3d */ {{0, 0}, {2, 5}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x3e */ {{1, 5}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 1, 0, /* 0x3f */ {{0, 5}, {0, 0}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x40 */ {{6, 6}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x41 */ {{0, 0}, {6, 6}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x42 */ {{1, 1}, {6, 6}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x43 */ {{0, 1}, {6, 6}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x44 */ {{2, 2}, {6, 6}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x45 */ {{0, 0}, {2, 2}, {6, 6}, {0, 0} } }, {0, 0, 2, 0, /* 0x46 */ {{1, 2}, {6, 6}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x47 */ {{0, 2}, {6, 6}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x48 */ {{3, 3}, {6, 6}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x49 */ {{0, 0}, {3, 3}, {6, 6}, {0, 0} } }, {0, 0, 3, 0, /* 0x4a */ {{1, 1}, {3, 3}, {6, 6}, {0, 0} } }, {1, 0, 3, 0, /* 0x4b */ {{0, 1}, {3, 3}, {6, 6}, {0, 0} } }, {0, 0, 2, 0, /* 0x4c */ {{2, 3}, {6, 6}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x4d */ {{0, 0}, {2, 3}, {6, 6}, {0, 0} } }, {0, 0, 2, 0, /* 0x4e */ {{1, 3}, {6, 6}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x4f */ {{0, 3}, {6, 6}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x50 */ {{4, 4}, {6, 6}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x51 */ {{0, 0}, {4, 4}, {6, 6}, {0, 0} } }, {0, 0, 3, 0, /* 0x52 */ {{1, 1}, {4, 4}, {6, 6}, {0, 0} } }, {1, 0, 3, 0, /* 0x53 */ {{0, 1}, {4, 4}, {6, 6}, {0, 0} } }, {0, 0, 3, 0, /* 0x54 */ {{2, 2}, {4, 4}, {6, 6}, {0, 0} } }, {1, 0, 4, 0, /* 0x55 */ {{0, 0}, {2, 2}, {4, 4}, {6, 6} } }, {0, 0, 3, 0, /* 0x56 */ {{1, 2}, {4, 4}, {6, 6}, {0, 0} } }, {1, 0, 3, 0, /* 0x57 */ {{0, 2}, {4, 4}, {6, 6}, {0, 0} } }, {0, 0, 2, 0, /* 0x58 */ {{3, 4}, {6, 6}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x59 */ {{0, 0}, {3, 4}, {6, 6}, {0, 0} } }, {0, 0, 3, 0, /* 0x5a */ {{1, 1}, {3, 4}, {6, 6}, {0, 0} } }, {1, 0, 3, 0, /* 0x5b */ {{0, 1}, {3, 4}, {6, 6}, {0, 0} } }, {0, 0, 2, 0, /* 0x5c */ {{2, 4}, {6, 6}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x5d */ {{0, 0}, {2, 4}, {6, 6}, {0, 0} } }, {0, 0, 2, 0, /* 0x5e */ {{1, 4}, {6, 6}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x5f */ {{0, 4}, {6, 6}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x60 */ {{5, 6}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x61 */ {{0, 0}, {5, 6}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x62 */ {{1, 1}, {5, 6}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x63 */ {{0, 1}, {5, 6}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x64 */ {{2, 2}, {5, 6}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x65 */ {{0, 0}, {2, 2}, {5, 6}, {0, 0} } }, {0, 0, 2, 0, /* 0x66 */ {{1, 2}, {5, 6}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x67 */ {{0, 2}, {5, 6}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x68 */ {{3, 3}, {5, 6}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x69 */ {{0, 0}, {3, 3}, {5, 6}, {0, 0} } }, {0, 0, 3, 0, /* 0x6a */ {{1, 1}, {3, 3}, {5, 6}, {0, 0} } }, {1, 0, 3, 0, /* 0x6b */ {{0, 1}, {3, 3}, {5, 6}, {0, 0} } }, {0, 0, 2, 0, /* 0x6c */ {{2, 3}, {5, 6}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x6d */ {{0, 0}, {2, 3}, {5, 6}, {0, 0} } }, {0, 0, 2, 0, /* 0x6e */ {{1, 3}, {5, 6}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x6f */ {{0, 3}, {5, 6}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x70 */ {{4, 6}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x71 */ {{0, 0}, {4, 6}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x72 */ {{1, 1}, {4, 6}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x73 */ {{0, 1}, {4, 6}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x74 */ {{2, 2}, {4, 6}, {0, 0}, {0, 0} } }, {1, 0, 3, 0, /* 0x75 */ {{0, 0}, {2, 2}, {4, 6}, {0, 0} } }, {0, 0, 2, 0, /* 0x76 */ {{1, 2}, {4, 6}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x77 */ {{0, 2}, {4, 6}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x78 */ {{3, 6}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x79 */ {{0, 0}, {3, 6}, {0, 0}, {0, 0} } }, {0, 0, 2, 0, /* 0x7a */ {{1, 1}, {3, 6}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x7b */ {{0, 1}, {3, 6}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x7c */ {{2, 6}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 2, 0, /* 0x7d */ {{0, 0}, {2, 6}, {0, 0}, {0, 0} } }, {0, 0, 1, 0, /* 0x7e */ {{1, 6}, {0, 0}, {0, 0}, {0, 0} } }, {1, 0, 1, 0, /* 0x7f */ {{0, 6}, {0, 0}, {0, 0}, {0, 0} } }, {0, 1, 1, 0, /* 0x80 */ {{7, 7}, {0, 0}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0x81 */ {{0, 0}, {7, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0x82 */ {{1, 1}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0x83 */ {{0, 1}, {7, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0x84 */ {{2, 2}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0x85 */ {{0, 0}, {2, 2}, {7, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0x86 */ {{1, 2}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0x87 */ {{0, 2}, {7, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0x88 */ {{3, 3}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0x89 */ {{0, 0}, {3, 3}, {7, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0x8a */ {{1, 1}, {3, 3}, {7, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0x8b */ {{0, 1}, {3, 3}, {7, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0x8c */ {{2, 3}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0x8d */ {{0, 0}, {2, 3}, {7, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0x8e */ {{1, 3}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0x8f */ {{0, 3}, {7, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0x90 */ {{4, 4}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0x91 */ {{0, 0}, {4, 4}, {7, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0x92 */ {{1, 1}, {4, 4}, {7, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0x93 */ {{0, 1}, {4, 4}, {7, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0x94 */ {{2, 2}, {4, 4}, {7, 7}, {0, 0} } }, {1, 1, 4, 0, /* 0x95 */ {{0, 0}, {2, 2}, {4, 4}, {7, 7} } }, {0, 1, 3, 0, /* 0x96 */ {{1, 2}, {4, 4}, {7, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0x97 */ {{0, 2}, {4, 4}, {7, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0x98 */ {{3, 4}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0x99 */ {{0, 0}, {3, 4}, {7, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0x9a */ {{1, 1}, {3, 4}, {7, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0x9b */ {{0, 1}, {3, 4}, {7, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0x9c */ {{2, 4}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0x9d */ {{0, 0}, {2, 4}, {7, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0x9e */ {{1, 4}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0x9f */ {{0, 4}, {7, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0xa0 */ {{5, 5}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xa1 */ {{0, 0}, {5, 5}, {7, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0xa2 */ {{1, 1}, {5, 5}, {7, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0xa3 */ {{0, 1}, {5, 5}, {7, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0xa4 */ {{2, 2}, {5, 5}, {7, 7}, {0, 0} } }, {1, 1, 4, 0, /* 0xa5 */ {{0, 0}, {2, 2}, {5, 5}, {7, 7} } }, {0, 1, 3, 0, /* 0xa6 */ {{1, 2}, {5, 5}, {7, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0xa7 */ {{0, 2}, {5, 5}, {7, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0xa8 */ {{3, 3}, {5, 5}, {7, 7}, {0, 0} } }, {1, 1, 4, 0, /* 0xa9 */ {{0, 0}, {3, 3}, {5, 5}, {7, 7} } }, {0, 1, 4, 0, /* 0xaa */ {{1, 1}, {3, 3}, {5, 5}, {7, 7} } }, {1, 1, 4, 0, /* 0xab */ {{0, 1}, {3, 3}, {5, 5}, {7, 7} } }, {0, 1, 3, 0, /* 0xac */ {{2, 3}, {5, 5}, {7, 7}, {0, 0} } }, {1, 1, 4, 0, /* 0xad */ {{0, 0}, {2, 3}, {5, 5}, {7, 7} } }, {0, 1, 3, 0, /* 0xae */ {{1, 3}, {5, 5}, {7, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0xaf */ {{0, 3}, {5, 5}, {7, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xb0 */ {{4, 5}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xb1 */ {{0, 0}, {4, 5}, {7, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0xb2 */ {{1, 1}, {4, 5}, {7, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0xb3 */ {{0, 1}, {4, 5}, {7, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0xb4 */ {{2, 2}, {4, 5}, {7, 7}, {0, 0} } }, {1, 1, 4, 0, /* 0xb5 */ {{0, 0}, {2, 2}, {4, 5}, {7, 7} } }, {0, 1, 3, 0, /* 0xb6 */ {{1, 2}, {4, 5}, {7, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0xb7 */ {{0, 2}, {4, 5}, {7, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xb8 */ {{3, 5}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xb9 */ {{0, 0}, {3, 5}, {7, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0xba */ {{1, 1}, {3, 5}, {7, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0xbb */ {{0, 1}, {3, 5}, {7, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xbc */ {{2, 5}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xbd */ {{0, 0}, {2, 5}, {7, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xbe */ {{1, 5}, {7, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xbf */ {{0, 5}, {7, 7}, {0, 0}, {0, 0} } }, {0, 1, 1, 0, /* 0xc0 */ {{6, 7}, {0, 0}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xc1 */ {{0, 0}, {6, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0xc2 */ {{1, 1}, {6, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xc3 */ {{0, 1}, {6, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0xc4 */ {{2, 2}, {6, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xc5 */ {{0, 0}, {2, 2}, {6, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xc6 */ {{1, 2}, {6, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xc7 */ {{0, 2}, {6, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0xc8 */ {{3, 3}, {6, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xc9 */ {{0, 0}, {3, 3}, {6, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0xca */ {{1, 1}, {3, 3}, {6, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0xcb */ {{0, 1}, {3, 3}, {6, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xcc */ {{2, 3}, {6, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xcd */ {{0, 0}, {2, 3}, {6, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xce */ {{1, 3}, {6, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xcf */ {{0, 3}, {6, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0xd0 */ {{4, 4}, {6, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xd1 */ {{0, 0}, {4, 4}, {6, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0xd2 */ {{1, 1}, {4, 4}, {6, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0xd3 */ {{0, 1}, {4, 4}, {6, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0xd4 */ {{2, 2}, {4, 4}, {6, 7}, {0, 0} } }, {1, 1, 4, 0, /* 0xd5 */ {{0, 0}, {2, 2}, {4, 4}, {6, 7} } }, {0, 1, 3, 0, /* 0xd6 */ {{1, 2}, {4, 4}, {6, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0xd7 */ {{0, 2}, {4, 4}, {6, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xd8 */ {{3, 4}, {6, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xd9 */ {{0, 0}, {3, 4}, {6, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0xda */ {{1, 1}, {3, 4}, {6, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0xdb */ {{0, 1}, {3, 4}, {6, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xdc */ {{2, 4}, {6, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xdd */ {{0, 0}, {2, 4}, {6, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xde */ {{1, 4}, {6, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xdf */ {{0, 4}, {6, 7}, {0, 0}, {0, 0} } }, {0, 1, 1, 0, /* 0xe0 */ {{5, 7}, {0, 0}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xe1 */ {{0, 0}, {5, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0xe2 */ {{1, 1}, {5, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xe3 */ {{0, 1}, {5, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0xe4 */ {{2, 2}, {5, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xe5 */ {{0, 0}, {2, 2}, {5, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xe6 */ {{1, 2}, {5, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xe7 */ {{0, 2}, {5, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0xe8 */ {{3, 3}, {5, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xe9 */ {{0, 0}, {3, 3}, {5, 7}, {0, 0} } }, {0, 1, 3, 0, /* 0xea */ {{1, 1}, {3, 3}, {5, 7}, {0, 0} } }, {1, 1, 3, 0, /* 0xeb */ {{0, 1}, {3, 3}, {5, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xec */ {{2, 3}, {5, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xed */ {{0, 0}, {2, 3}, {5, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xee */ {{1, 3}, {5, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xef */ {{0, 3}, {5, 7}, {0, 0}, {0, 0} } }, {0, 1, 1, 0, /* 0xf0 */ {{4, 7}, {0, 0}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xf1 */ {{0, 0}, {4, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0xf2 */ {{1, 1}, {4, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xf3 */ {{0, 1}, {4, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0xf4 */ {{2, 2}, {4, 7}, {0, 0}, {0, 0} } }, {1, 1, 3, 0, /* 0xf5 */ {{0, 0}, {2, 2}, {4, 7}, {0, 0} } }, {0, 1, 2, 0, /* 0xf6 */ {{1, 2}, {4, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xf7 */ {{0, 2}, {4, 7}, {0, 0}, {0, 0} } }, {0, 1, 1, 0, /* 0xf8 */ {{3, 7}, {0, 0}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xf9 */ {{0, 0}, {3, 7}, {0, 0}, {0, 0} } }, {0, 1, 2, 0, /* 0xfa */ {{1, 1}, {3, 7}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xfb */ {{0, 1}, {3, 7}, {0, 0}, {0, 0} } }, {0, 1, 1, 0, /* 0xfc */ {{2, 7}, {0, 0}, {0, 0}, {0, 0} } }, {1, 1, 2, 0, /* 0xfd */ {{0, 0}, {2, 7}, {0, 0}, {0, 0} } }, {0, 1, 1, 0, /* 0xfe */ {{1, 7}, {0, 0}, {0, 0}, {0, 0} } }, {1, 1, 1, 0, /* 0xff */ {{0, 7}, {0, 0}, {0, 0}, {0, 0} } } }; int sctp_is_address_in_scope(struct sctp_ifa *ifa, struct sctp_scoping *scope, int do_update) { if ((scope->loopback_scope == 0) && (ifa->ifn_p) && SCTP_IFN_IS_IFT_LOOP(ifa->ifn_p)) { /* * skip loopback if not in scope * */ return (0); } switch (ifa->address.sa.sa_family) { #ifdef INET case AF_INET: if (scope->ipv4_addr_legal) { struct sockaddr_in *sin; sin = &ifa->address.sin; if (sin->sin_addr.s_addr == 0) { /* not in scope , unspecified */ return (0); } if ((scope->ipv4_local_scope == 0) && (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr))) { /* private address not in scope */ return (0); } } else { return (0); } break; #endif #ifdef INET6 case AF_INET6: if (scope->ipv6_addr_legal) { struct sockaddr_in6 *sin6; /* * Must update the flags, bummer, which means any * IFA locks must now be applied HERE <-> */ if (do_update) { sctp_gather_internal_ifa_flags(ifa); } if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { return (0); } /* ok to use deprecated addresses? */ sin6 = &ifa->address.sin6; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* skip unspecifed addresses */ return (0); } if ( /* (local_scope == 0) && */ (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr))) { return (0); } if ((scope->site_scope == 0) && (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))) { return (0); } } else { return (0); } break; #endif default: return (0); } return (1); } static struct mbuf * sctp_add_addr_to_mbuf(struct mbuf *m, struct sctp_ifa *ifa, uint16_t *len) { #if defined(INET) || defined(INET6) struct sctp_paramhdr *paramh; struct mbuf *mret; uint16_t plen; #endif switch (ifa->address.sa.sa_family) { #ifdef INET case AF_INET: plen = (uint16_t)sizeof(struct sctp_ipv4addr_param); break; #endif #ifdef INET6 case AF_INET6: plen = (uint16_t)sizeof(struct sctp_ipv6addr_param); break; #endif default: return (m); } #if defined(INET) || defined(INET6) if (M_TRAILINGSPACE(m) >= plen) { /* easy side we just drop it on the end */ paramh = (struct sctp_paramhdr *)(SCTP_BUF_AT(m, SCTP_BUF_LEN(m))); mret = m; } else { /* Need more space */ mret = m; while (SCTP_BUF_NEXT(mret) != NULL) { mret = SCTP_BUF_NEXT(mret); } SCTP_BUF_NEXT(mret) = sctp_get_mbuf_for_msg(plen, 0, M_NOWAIT, 1, MT_DATA); if (SCTP_BUF_NEXT(mret) == NULL) { /* We are hosed, can't add more addresses */ return (m); } mret = SCTP_BUF_NEXT(mret); paramh = mtod(mret, struct sctp_paramhdr *); } /* now add the parameter */ switch (ifa->address.sa.sa_family) { #ifdef INET case AF_INET: { struct sctp_ipv4addr_param *ipv4p; struct sockaddr_in *sin; sin = &ifa->address.sin; ipv4p = (struct sctp_ipv4addr_param *)paramh; paramh->param_type = htons(SCTP_IPV4_ADDRESS); paramh->param_length = htons(plen); ipv4p->addr = sin->sin_addr.s_addr; SCTP_BUF_LEN(mret) += plen; break; } #endif #ifdef INET6 case AF_INET6: { struct sctp_ipv6addr_param *ipv6p; struct sockaddr_in6 *sin6; sin6 = &ifa->address.sin6; ipv6p = (struct sctp_ipv6addr_param *)paramh; paramh->param_type = htons(SCTP_IPV6_ADDRESS); paramh->param_length = htons(plen); memcpy(ipv6p->addr, &sin6->sin6_addr, sizeof(ipv6p->addr)); /* clear embedded scope in the address */ in6_clearscope((struct in6_addr *)ipv6p->addr); SCTP_BUF_LEN(mret) += plen; break; } #endif default: return (m); } if (len != NULL) { *len += plen; } return (mret); #endif } struct mbuf * sctp_add_addresses_to_i_ia(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_scoping *scope, struct mbuf *m_at, int cnt_inits_to, uint16_t *padding_len, uint16_t *chunk_len) { struct sctp_vrf *vrf = NULL; int cnt, limit_out = 0, total_count; uint32_t vrf_id; vrf_id = inp->def_vrf_id; SCTP_IPI_ADDR_RLOCK(); vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) { SCTP_IPI_ADDR_RUNLOCK(); return (m_at); } if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { struct sctp_ifa *sctp_ifap; struct sctp_ifn *sctp_ifnp; cnt = cnt_inits_to; if (vrf->total_ifa_count > SCTP_COUNT_LIMIT) { limit_out = 1; cnt = SCTP_ADDRESS_LIMIT; goto skip_count; } LIST_FOREACH(sctp_ifnp, &vrf->ifnlist, next_ifn) { if ((scope->loopback_scope == 0) && SCTP_IFN_IS_IFT_LOOP(sctp_ifnp)) { /* * Skip loopback devices if loopback_scope * not set */ continue; } LIST_FOREACH(sctp_ifap, &sctp_ifnp->ifalist, next_ifa) { #ifdef INET if ((sctp_ifap->address.sa.sa_family == AF_INET) && (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sctp_ifap->address.sin.sin_addr) != 0)) { continue; } #endif #ifdef INET6 if ((sctp_ifap->address.sa.sa_family == AF_INET6) && (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sctp_ifap->address.sin6.sin6_addr) != 0)) { continue; } #endif if (sctp_is_addr_restricted(stcb, sctp_ifap)) { continue; } if (sctp_is_address_in_scope(sctp_ifap, scope, 1) == 0) { continue; } cnt++; if (cnt > SCTP_ADDRESS_LIMIT) { break; } } if (cnt > SCTP_ADDRESS_LIMIT) { break; } } skip_count: if (cnt > 1) { total_count = 0; LIST_FOREACH(sctp_ifnp, &vrf->ifnlist, next_ifn) { cnt = 0; if ((scope->loopback_scope == 0) && SCTP_IFN_IS_IFT_LOOP(sctp_ifnp)) { /* * Skip loopback devices if * loopback_scope not set */ continue; } LIST_FOREACH(sctp_ifap, &sctp_ifnp->ifalist, next_ifa) { #ifdef INET if ((sctp_ifap->address.sa.sa_family == AF_INET) && (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sctp_ifap->address.sin.sin_addr) != 0)) { continue; } #endif #ifdef INET6 if ((sctp_ifap->address.sa.sa_family == AF_INET6) && (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sctp_ifap->address.sin6.sin6_addr) != 0)) { continue; } #endif if (sctp_is_addr_restricted(stcb, sctp_ifap)) { continue; } if (sctp_is_address_in_scope(sctp_ifap, scope, 0) == 0) { continue; } if ((chunk_len != NULL) && (padding_len != NULL) && (*padding_len > 0)) { memset(mtod(m_at, caddr_t)+*chunk_len, 0, *padding_len); SCTP_BUF_LEN(m_at) += *padding_len; *chunk_len += *padding_len; *padding_len = 0; } m_at = sctp_add_addr_to_mbuf(m_at, sctp_ifap, chunk_len); if (limit_out) { cnt++; total_count++; if (cnt >= 2) { /* * two from each * address */ break; } if (total_count > SCTP_ADDRESS_LIMIT) { /* No more addresses */ break; } } } } } } else { struct sctp_laddr *laddr; cnt = cnt_inits_to; /* First, how many ? */ LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { continue; } if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) /* * Address being deleted by the system, dont * list. */ continue; if (laddr->action == SCTP_DEL_IP_ADDRESS) { /* * Address being deleted on this ep don't * list. */ continue; } if (sctp_is_address_in_scope(laddr->ifa, scope, 1) == 0) { continue; } cnt++; } /* * To get through a NAT we only list addresses if we have * more than one. That way if you just bind a single address * we let the source of the init dictate our address. */ if (cnt > 1) { cnt = cnt_inits_to; LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { continue; } if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) { continue; } if (sctp_is_address_in_scope(laddr->ifa, scope, 0) == 0) { continue; } if ((chunk_len != NULL) && (padding_len != NULL) && (*padding_len > 0)) { memset(mtod(m_at, caddr_t)+*chunk_len, 0, *padding_len); SCTP_BUF_LEN(m_at) += *padding_len; *chunk_len += *padding_len; *padding_len = 0; } m_at = sctp_add_addr_to_mbuf(m_at, laddr->ifa, chunk_len); cnt++; if (cnt >= SCTP_ADDRESS_LIMIT) { break; } } } } SCTP_IPI_ADDR_RUNLOCK(); return (m_at); } static struct sctp_ifa * sctp_is_ifa_addr_preferred(struct sctp_ifa *ifa, uint8_t dest_is_loop, uint8_t dest_is_priv, sa_family_t fam) { uint8_t dest_is_global = 0; /* dest_is_priv is true if destination is a private address */ /* dest_is_loop is true if destination is a loopback addresses */ /** * Here we determine if its a preferred address. A preferred address * means it is the same scope or higher scope then the destination. * L = loopback, P = private, G = global * ----------------------------------------- * src | dest | result * ---------------------------------------- * L | L | yes * ----------------------------------------- * P | L | yes-v4 no-v6 * ----------------------------------------- * G | L | yes-v4 no-v6 * ----------------------------------------- * L | P | no * ----------------------------------------- * P | P | yes * ----------------------------------------- * G | P | no * ----------------------------------------- * L | G | no * ----------------------------------------- * P | G | no * ----------------------------------------- * G | G | yes * ----------------------------------------- */ if (ifa->address.sa.sa_family != fam) { /* forget mis-matched family */ return (NULL); } if ((dest_is_priv == 0) && (dest_is_loop == 0)) { dest_is_global = 1; } SCTPDBG(SCTP_DEBUG_OUTPUT2, "Is destination preferred:"); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &ifa->address.sa); /* Ok the address may be ok */ #ifdef INET6 if (fam == AF_INET6) { /* ok to use deprecated addresses? no lets not! */ if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:1\n"); return (NULL); } if (ifa->src_is_priv && !ifa->src_is_loop) { if (dest_is_loop) { SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:2\n"); return (NULL); } } if (ifa->src_is_glob) { if (dest_is_loop) { SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:3\n"); return (NULL); } } } #endif /* * Now that we know what is what, implement or table this could in * theory be done slicker (it used to be), but this is * straightforward and easier to validate :-) */ SCTPDBG(SCTP_DEBUG_OUTPUT3, "src_loop:%d src_priv:%d src_glob:%d\n", ifa->src_is_loop, ifa->src_is_priv, ifa->src_is_glob); SCTPDBG(SCTP_DEBUG_OUTPUT3, "dest_loop:%d dest_priv:%d dest_glob:%d\n", dest_is_loop, dest_is_priv, dest_is_global); if ((ifa->src_is_loop) && (dest_is_priv)) { SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:4\n"); return (NULL); } if ((ifa->src_is_glob) && (dest_is_priv)) { SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:5\n"); return (NULL); } if ((ifa->src_is_loop) && (dest_is_global)) { SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:6\n"); return (NULL); } if ((ifa->src_is_priv) && (dest_is_global)) { SCTPDBG(SCTP_DEBUG_OUTPUT3, "NO:7\n"); return (NULL); } SCTPDBG(SCTP_DEBUG_OUTPUT3, "YES\n"); /* its a preferred address */ return (ifa); } static struct sctp_ifa * sctp_is_ifa_addr_acceptable(struct sctp_ifa *ifa, uint8_t dest_is_loop, uint8_t dest_is_priv, sa_family_t fam) { uint8_t dest_is_global = 0; /** * Here we determine if its a acceptable address. A acceptable * address means it is the same scope or higher scope but we can * allow for NAT which means its ok to have a global dest and a * private src. * * L = loopback, P = private, G = global * ----------------------------------------- * src | dest | result * ----------------------------------------- * L | L | yes * ----------------------------------------- * P | L | yes-v4 no-v6 * ----------------------------------------- * G | L | yes * ----------------------------------------- * L | P | no * ----------------------------------------- * P | P | yes * ----------------------------------------- * G | P | yes - May not work * ----------------------------------------- * L | G | no * ----------------------------------------- * P | G | yes - May not work * ----------------------------------------- * G | G | yes * ----------------------------------------- */ if (ifa->address.sa.sa_family != fam) { /* forget non matching family */ SCTPDBG(SCTP_DEBUG_OUTPUT3, "ifa_fam:%d fam:%d\n", ifa->address.sa.sa_family, fam); return (NULL); } /* Ok the address may be ok */ SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT3, &ifa->address.sa); SCTPDBG(SCTP_DEBUG_OUTPUT3, "dst_is_loop:%d dest_is_priv:%d\n", dest_is_loop, dest_is_priv); if ((dest_is_loop == 0) && (dest_is_priv == 0)) { dest_is_global = 1; } #ifdef INET6 if (fam == AF_INET6) { /* ok to use deprecated addresses? */ if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { return (NULL); } if (ifa->src_is_priv) { /* Special case, linklocal to loop */ if (dest_is_loop) return (NULL); } } #endif /* * Now that we know what is what, implement our table. This could in * theory be done slicker (it used to be), but this is * straightforward and easier to validate :-) */ SCTPDBG(SCTP_DEBUG_OUTPUT3, "ifa->src_is_loop:%d dest_is_priv:%d\n", ifa->src_is_loop, dest_is_priv); if ((ifa->src_is_loop == 1) && (dest_is_priv)) { return (NULL); } SCTPDBG(SCTP_DEBUG_OUTPUT3, "ifa->src_is_loop:%d dest_is_glob:%d\n", ifa->src_is_loop, dest_is_global); if ((ifa->src_is_loop == 1) && (dest_is_global)) { return (NULL); } SCTPDBG(SCTP_DEBUG_OUTPUT3, "address is acceptable\n"); /* its an acceptable address */ return (ifa); } int sctp_is_addr_restricted(struct sctp_tcb *stcb, struct sctp_ifa *ifa) { struct sctp_laddr *laddr; if (stcb == NULL) { /* There are no restrictions, no TCB :-) */ return (0); } LIST_FOREACH(laddr, &stcb->asoc.sctp_restricted_addrs, sctp_nxt_addr) { if (laddr->ifa == NULL) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "%s: NULL ifa\n", __func__); continue; } if (laddr->ifa == ifa) { /* Yes it is on the list */ return (1); } } return (0); } int sctp_is_addr_in_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa) { struct sctp_laddr *laddr; if (ifa == NULL) return (0); LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "%s: NULL ifa\n", __func__); continue; } if ((laddr->ifa == ifa) && laddr->action == 0) /* same pointer */ return (1); } return (0); } static struct sctp_ifa * sctp_choose_boundspecific_inp(struct sctp_inpcb *inp, sctp_route_t *ro, uint32_t vrf_id, int non_asoc_addr_ok, uint8_t dest_is_priv, uint8_t dest_is_loop, sa_family_t fam) { struct sctp_laddr *laddr, *starting_point; void *ifn; int resettotop = 0; struct sctp_ifn *sctp_ifn; struct sctp_ifa *sctp_ifa, *sifa; struct sctp_vrf *vrf; uint32_t ifn_index; vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) return (NULL); ifn = SCTP_GET_IFN_VOID_FROM_ROUTE(ro); ifn_index = SCTP_GET_IF_INDEX_FROM_ROUTE(ro); sctp_ifn = sctp_find_ifn(ifn, ifn_index); /* * first question, is the ifn we will emit on in our list, if so, we * want such an address. Note that we first looked for a preferred * address. */ if (sctp_ifn) { /* is a preferred one on the interface we route out? */ LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { #ifdef INET if ((sctp_ifa->address.sa.sa_family == AF_INET) && (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sctp_ifa->address.sin.sin_addr) != 0)) { continue; } #endif #ifdef INET6 if ((sctp_ifa->address.sa.sa_family == AF_INET6) && (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sctp_ifa->address.sin6.sin6_addr) != 0)) { continue; } #endif if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) && (non_asoc_addr_ok == 0)) continue; sifa = sctp_is_ifa_addr_preferred(sctp_ifa, dest_is_loop, dest_is_priv, fam); if (sifa == NULL) continue; if (sctp_is_addr_in_ep(inp, sifa)) { atomic_add_int(&sifa->refcount, 1); return (sifa); } } } /* * ok, now we now need to find one on the list of the addresses. We * can't get one on the emitting interface so let's find first a * preferred one. If not that an acceptable one otherwise... we * return NULL. */ starting_point = inp->next_addr_touse; once_again: if (inp->next_addr_touse == NULL) { inp->next_addr_touse = LIST_FIRST(&inp->sctp_addr_list); resettotop = 1; } for (laddr = inp->next_addr_touse; laddr; laddr = LIST_NEXT(laddr, sctp_nxt_addr)) { if (laddr->ifa == NULL) { /* address has been removed */ continue; } if (laddr->action == SCTP_DEL_IP_ADDRESS) { /* address is being deleted */ continue; } sifa = sctp_is_ifa_addr_preferred(laddr->ifa, dest_is_loop, dest_is_priv, fam); if (sifa == NULL) continue; atomic_add_int(&sifa->refcount, 1); return (sifa); } if (resettotop == 0) { inp->next_addr_touse = NULL; goto once_again; } inp->next_addr_touse = starting_point; resettotop = 0; once_again_too: if (inp->next_addr_touse == NULL) { inp->next_addr_touse = LIST_FIRST(&inp->sctp_addr_list); resettotop = 1; } /* ok, what about an acceptable address in the inp */ for (laddr = inp->next_addr_touse; laddr; laddr = LIST_NEXT(laddr, sctp_nxt_addr)) { if (laddr->ifa == NULL) { /* address has been removed */ continue; } if (laddr->action == SCTP_DEL_IP_ADDRESS) { /* address is being deleted */ continue; } sifa = sctp_is_ifa_addr_acceptable(laddr->ifa, dest_is_loop, dest_is_priv, fam); if (sifa == NULL) continue; atomic_add_int(&sifa->refcount, 1); return (sifa); } if (resettotop == 0) { inp->next_addr_touse = NULL; goto once_again_too; } /* * no address bound can be a source for the destination we are in * trouble */ return (NULL); } static struct sctp_ifa * sctp_choose_boundspecific_stcb(struct sctp_inpcb *inp, struct sctp_tcb *stcb, sctp_route_t *ro, uint32_t vrf_id, uint8_t dest_is_priv, uint8_t dest_is_loop, int non_asoc_addr_ok, sa_family_t fam) { struct sctp_laddr *laddr, *starting_point; void *ifn; struct sctp_ifn *sctp_ifn; struct sctp_ifa *sctp_ifa, *sifa; uint8_t start_at_beginning = 0; struct sctp_vrf *vrf; uint32_t ifn_index; /* * first question, is the ifn we will emit on in our list, if so, we * want that one. */ vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) return (NULL); ifn = SCTP_GET_IFN_VOID_FROM_ROUTE(ro); ifn_index = SCTP_GET_IF_INDEX_FROM_ROUTE(ro); sctp_ifn = sctp_find_ifn(ifn, ifn_index); /* * first question, is the ifn we will emit on in our list? If so, * we want that one. First we look for a preferred. Second, we go * for an acceptable. */ if (sctp_ifn) { /* first try for a preferred address on the ep */ LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { #ifdef INET if ((sctp_ifa->address.sa.sa_family == AF_INET) && (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sctp_ifa->address.sin.sin_addr) != 0)) { continue; } #endif #ifdef INET6 if ((sctp_ifa->address.sa.sa_family == AF_INET6) && (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sctp_ifa->address.sin6.sin6_addr) != 0)) { continue; } #endif if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) && (non_asoc_addr_ok == 0)) continue; if (sctp_is_addr_in_ep(inp, sctp_ifa)) { sifa = sctp_is_ifa_addr_preferred(sctp_ifa, dest_is_loop, dest_is_priv, fam); if (sifa == NULL) continue; if (((non_asoc_addr_ok == 0) && (sctp_is_addr_restricted(stcb, sifa))) || (non_asoc_addr_ok && (sctp_is_addr_restricted(stcb, sifa)) && (!sctp_is_addr_pending(stcb, sifa)))) { /* on the no-no list */ continue; } atomic_add_int(&sifa->refcount, 1); return (sifa); } } /* next try for an acceptable address on the ep */ LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { #ifdef INET if ((sctp_ifa->address.sa.sa_family == AF_INET) && (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sctp_ifa->address.sin.sin_addr) != 0)) { continue; } #endif #ifdef INET6 if ((sctp_ifa->address.sa.sa_family == AF_INET6) && (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sctp_ifa->address.sin6.sin6_addr) != 0)) { continue; } #endif if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) && (non_asoc_addr_ok == 0)) continue; if (sctp_is_addr_in_ep(inp, sctp_ifa)) { sifa = sctp_is_ifa_addr_acceptable(sctp_ifa, dest_is_loop, dest_is_priv, fam); if (sifa == NULL) continue; if (((non_asoc_addr_ok == 0) && (sctp_is_addr_restricted(stcb, sifa))) || (non_asoc_addr_ok && (sctp_is_addr_restricted(stcb, sifa)) && (!sctp_is_addr_pending(stcb, sifa)))) { /* on the no-no list */ continue; } atomic_add_int(&sifa->refcount, 1); return (sifa); } } } /* * if we can't find one like that then we must look at all addresses * bound to pick one at first preferable then secondly acceptable. */ starting_point = stcb->asoc.last_used_address; sctp_from_the_top: if (stcb->asoc.last_used_address == NULL) { start_at_beginning = 1; stcb->asoc.last_used_address = LIST_FIRST(&inp->sctp_addr_list); } /* search beginning with the last used address */ for (laddr = stcb->asoc.last_used_address; laddr; laddr = LIST_NEXT(laddr, sctp_nxt_addr)) { if (laddr->ifa == NULL) { /* address has been removed */ continue; } if (laddr->action == SCTP_DEL_IP_ADDRESS) { /* address is being deleted */ continue; } sifa = sctp_is_ifa_addr_preferred(laddr->ifa, dest_is_loop, dest_is_priv, fam); if (sifa == NULL) continue; if (((non_asoc_addr_ok == 0) && (sctp_is_addr_restricted(stcb, sifa))) || (non_asoc_addr_ok && (sctp_is_addr_restricted(stcb, sifa)) && (!sctp_is_addr_pending(stcb, sifa)))) { /* on the no-no list */ continue; } stcb->asoc.last_used_address = laddr; atomic_add_int(&sifa->refcount, 1); return (sifa); } if (start_at_beginning == 0) { stcb->asoc.last_used_address = NULL; goto sctp_from_the_top; } /* now try for any higher scope than the destination */ stcb->asoc.last_used_address = starting_point; start_at_beginning = 0; sctp_from_the_top2: if (stcb->asoc.last_used_address == NULL) { start_at_beginning = 1; stcb->asoc.last_used_address = LIST_FIRST(&inp->sctp_addr_list); } /* search beginning with the last used address */ for (laddr = stcb->asoc.last_used_address; laddr; laddr = LIST_NEXT(laddr, sctp_nxt_addr)) { if (laddr->ifa == NULL) { /* address has been removed */ continue; } if (laddr->action == SCTP_DEL_IP_ADDRESS) { /* address is being deleted */ continue; } sifa = sctp_is_ifa_addr_acceptable(laddr->ifa, dest_is_loop, dest_is_priv, fam); if (sifa == NULL) continue; if (((non_asoc_addr_ok == 0) && (sctp_is_addr_restricted(stcb, sifa))) || (non_asoc_addr_ok && (sctp_is_addr_restricted(stcb, sifa)) && (!sctp_is_addr_pending(stcb, sifa)))) { /* on the no-no list */ continue; } stcb->asoc.last_used_address = laddr; atomic_add_int(&sifa->refcount, 1); return (sifa); } if (start_at_beginning == 0) { stcb->asoc.last_used_address = NULL; goto sctp_from_the_top2; } return (NULL); } static struct sctp_ifa * sctp_select_nth_preferred_addr_from_ifn_boundall(struct sctp_ifn *ifn, struct sctp_inpcb *inp, struct sctp_tcb *stcb, int non_asoc_addr_ok, uint8_t dest_is_loop, uint8_t dest_is_priv, int addr_wanted, sa_family_t fam, sctp_route_t *ro ) { struct sctp_ifa *ifa, *sifa; int num_eligible_addr = 0; #ifdef INET6 struct sockaddr_in6 sin6, lsa6; if (fam == AF_INET6) { memcpy(&sin6, &ro->ro_dst, sizeof(struct sockaddr_in6)); (void)sa6_recoverscope(&sin6); } #endif /* INET6 */ LIST_FOREACH(ifa, &ifn->ifalist, next_ifa) { #ifdef INET if ((ifa->address.sa.sa_family == AF_INET) && (prison_check_ip4(inp->ip_inp.inp.inp_cred, &ifa->address.sin.sin_addr) != 0)) { continue; } #endif #ifdef INET6 if ((ifa->address.sa.sa_family == AF_INET6) && (prison_check_ip6(inp->ip_inp.inp.inp_cred, &ifa->address.sin6.sin6_addr) != 0)) { continue; } #endif if ((ifa->localifa_flags & SCTP_ADDR_DEFER_USE) && (non_asoc_addr_ok == 0)) continue; sifa = sctp_is_ifa_addr_preferred(ifa, dest_is_loop, dest_is_priv, fam); if (sifa == NULL) continue; #ifdef INET6 if (fam == AF_INET6 && dest_is_loop && sifa->src_is_loop && sifa->src_is_priv) { /* * don't allow fe80::1 to be a src on loop ::1, we * don't list it to the peer so we will get an * abort. */ continue; } if (fam == AF_INET6 && IN6_IS_ADDR_LINKLOCAL(&sifa->address.sin6.sin6_addr) && IN6_IS_ADDR_LINKLOCAL(&sin6.sin6_addr)) { /* * link-local <-> link-local must belong to the same * scope. */ memcpy(&lsa6, &sifa->address.sin6, sizeof(struct sockaddr_in6)); (void)sa6_recoverscope(&lsa6); if (sin6.sin6_scope_id != lsa6.sin6_scope_id) { continue; } } #endif /* INET6 */ /* * Check if the IPv6 address matches to next-hop. In the * mobile case, old IPv6 address may be not deleted from the * interface. Then, the interface has previous and new * addresses. We should use one corresponding to the * next-hop. (by micchie) */ #ifdef INET6 if (stcb && fam == AF_INET6 && sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_BASE)) { if (sctp_v6src_match_nexthop(&sifa->address.sin6, ro) == 0) { continue; } } #endif #ifdef INET /* Avoid topologically incorrect IPv4 address */ if (stcb && fam == AF_INET && sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_BASE)) { if (sctp_v4src_match_nexthop(sifa, ro) == 0) { continue; } } #endif if (stcb) { if (sctp_is_address_in_scope(ifa, &stcb->asoc.scope, 0) == 0) { continue; } if (((non_asoc_addr_ok == 0) && (sctp_is_addr_restricted(stcb, sifa))) || (non_asoc_addr_ok && (sctp_is_addr_restricted(stcb, sifa)) && (!sctp_is_addr_pending(stcb, sifa)))) { /* * It is restricted for some reason.. * probably not yet added. */ continue; } } if (num_eligible_addr >= addr_wanted) { return (sifa); } num_eligible_addr++; } return (NULL); } static int sctp_count_num_preferred_boundall(struct sctp_ifn *ifn, struct sctp_inpcb *inp, struct sctp_tcb *stcb, int non_asoc_addr_ok, uint8_t dest_is_loop, uint8_t dest_is_priv, sa_family_t fam) { struct sctp_ifa *ifa, *sifa; int num_eligible_addr = 0; LIST_FOREACH(ifa, &ifn->ifalist, next_ifa) { #ifdef INET if ((ifa->address.sa.sa_family == AF_INET) && (prison_check_ip4(inp->ip_inp.inp.inp_cred, &ifa->address.sin.sin_addr) != 0)) { continue; } #endif #ifdef INET6 if ((ifa->address.sa.sa_family == AF_INET6) && (stcb != NULL) && (prison_check_ip6(inp->ip_inp.inp.inp_cred, &ifa->address.sin6.sin6_addr) != 0)) { continue; } #endif if ((ifa->localifa_flags & SCTP_ADDR_DEFER_USE) && (non_asoc_addr_ok == 0)) { continue; } sifa = sctp_is_ifa_addr_preferred(ifa, dest_is_loop, dest_is_priv, fam); if (sifa == NULL) { continue; } if (stcb) { if (sctp_is_address_in_scope(ifa, &stcb->asoc.scope, 0) == 0) { continue; } if (((non_asoc_addr_ok == 0) && (sctp_is_addr_restricted(stcb, sifa))) || (non_asoc_addr_ok && (sctp_is_addr_restricted(stcb, sifa)) && (!sctp_is_addr_pending(stcb, sifa)))) { /* * It is restricted for some reason.. * probably not yet added. */ continue; } } num_eligible_addr++; } return (num_eligible_addr); } static struct sctp_ifa * sctp_choose_boundall(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets *net, sctp_route_t *ro, uint32_t vrf_id, uint8_t dest_is_priv, uint8_t dest_is_loop, int non_asoc_addr_ok, sa_family_t fam) { int cur_addr_num = 0, num_preferred = 0; void *ifn; struct sctp_ifn *sctp_ifn, *looked_at = NULL, *emit_ifn; struct sctp_ifa *sctp_ifa, *sifa; uint32_t ifn_index; struct sctp_vrf *vrf; #ifdef INET int retried = 0; #endif /*- * For boundall we can use any address in the association. * If non_asoc_addr_ok is set we can use any address (at least in * theory). So we look for preferred addresses first. If we find one, * we use it. Otherwise we next try to get an address on the * interface, which we should be able to do (unless non_asoc_addr_ok * is false and we are routed out that way). In these cases where we * can't use the address of the interface we go through all the * ifn's looking for an address we can use and fill that in. Punting * means we send back address 0, which will probably cause problems * actually since then IP will fill in the address of the route ifn, * which means we probably already rejected it.. i.e. here comes an * abort :-<. */ vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) return (NULL); ifn = SCTP_GET_IFN_VOID_FROM_ROUTE(ro); ifn_index = SCTP_GET_IF_INDEX_FROM_ROUTE(ro); SCTPDBG(SCTP_DEBUG_OUTPUT2, "ifn from route:%p ifn_index:%d\n", ifn, ifn_index); emit_ifn = looked_at = sctp_ifn = sctp_find_ifn(ifn, ifn_index); if (sctp_ifn == NULL) { /* ?? We don't have this guy ?? */ SCTPDBG(SCTP_DEBUG_OUTPUT2, "No ifn emit interface?\n"); goto bound_all_plan_b; } SCTPDBG(SCTP_DEBUG_OUTPUT2, "ifn_index:%d name:%s is emit interface\n", ifn_index, sctp_ifn->ifn_name); if (net) { cur_addr_num = net->indx_of_eligible_next_to_use; } num_preferred = sctp_count_num_preferred_boundall(sctp_ifn, inp, stcb, non_asoc_addr_ok, dest_is_loop, dest_is_priv, fam); SCTPDBG(SCTP_DEBUG_OUTPUT2, "Found %d preferred source addresses for intf:%s\n", num_preferred, sctp_ifn->ifn_name); if (num_preferred == 0) { /* * no eligible addresses, we must use some other interface * address if we can find one. */ goto bound_all_plan_b; } /* * Ok we have num_eligible_addr set with how many we can use, this * may vary from call to call due to addresses being deprecated * etc.. */ if (cur_addr_num >= num_preferred) { cur_addr_num = 0; } /* * select the nth address from the list (where cur_addr_num is the * nth) and 0 is the first one, 1 is the second one etc... */ SCTPDBG(SCTP_DEBUG_OUTPUT2, "cur_addr_num:%d\n", cur_addr_num); sctp_ifa = sctp_select_nth_preferred_addr_from_ifn_boundall(sctp_ifn, inp, stcb, non_asoc_addr_ok, dest_is_loop, dest_is_priv, cur_addr_num, fam, ro); /* if sctp_ifa is NULL something changed??, fall to plan b. */ if (sctp_ifa) { atomic_add_int(&sctp_ifa->refcount, 1); if (net) { /* save off where the next one we will want */ net->indx_of_eligible_next_to_use = cur_addr_num + 1; } return (sctp_ifa); } /* * plan_b: Look at all interfaces and find a preferred address. If * no preferred fall through to plan_c. */ bound_all_plan_b: SCTPDBG(SCTP_DEBUG_OUTPUT2, "Trying Plan B\n"); LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { SCTPDBG(SCTP_DEBUG_OUTPUT2, "Examine interface %s\n", sctp_ifn->ifn_name); if (dest_is_loop == 0 && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) { /* wrong base scope */ SCTPDBG(SCTP_DEBUG_OUTPUT2, "skip\n"); continue; } if ((sctp_ifn == looked_at) && looked_at) { /* already looked at this guy */ SCTPDBG(SCTP_DEBUG_OUTPUT2, "already seen\n"); continue; } num_preferred = sctp_count_num_preferred_boundall(sctp_ifn, inp, stcb, non_asoc_addr_ok, dest_is_loop, dest_is_priv, fam); SCTPDBG(SCTP_DEBUG_OUTPUT2, "Found ifn:%p %d preferred source addresses\n", ifn, num_preferred); if (num_preferred == 0) { /* None on this interface. */ SCTPDBG(SCTP_DEBUG_OUTPUT2, "No preferred -- skipping to next\n"); continue; } SCTPDBG(SCTP_DEBUG_OUTPUT2, "num preferred:%d on interface:%p cur_addr_num:%d\n", num_preferred, (void *)sctp_ifn, cur_addr_num); /* * Ok we have num_eligible_addr set with how many we can * use, this may vary from call to call due to addresses * being deprecated etc.. */ if (cur_addr_num >= num_preferred) { cur_addr_num = 0; } sifa = sctp_select_nth_preferred_addr_from_ifn_boundall(sctp_ifn, inp, stcb, non_asoc_addr_ok, dest_is_loop, dest_is_priv, cur_addr_num, fam, ro); if (sifa == NULL) continue; if (net) { net->indx_of_eligible_next_to_use = cur_addr_num + 1; SCTPDBG(SCTP_DEBUG_OUTPUT2, "we selected %d\n", cur_addr_num); SCTPDBG(SCTP_DEBUG_OUTPUT2, "Source:"); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &sifa->address.sa); SCTPDBG(SCTP_DEBUG_OUTPUT2, "Dest:"); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &net->ro._l_addr.sa); } atomic_add_int(&sifa->refcount, 1); return (sifa); } #ifdef INET again_with_private_addresses_allowed: #endif /* plan_c: do we have an acceptable address on the emit interface */ sifa = NULL; SCTPDBG(SCTP_DEBUG_OUTPUT2, "Trying Plan C: find acceptable on interface\n"); if (emit_ifn == NULL) { SCTPDBG(SCTP_DEBUG_OUTPUT2, "Jump to Plan D - no emit_ifn\n"); goto plan_d; } LIST_FOREACH(sctp_ifa, &emit_ifn->ifalist, next_ifa) { SCTPDBG(SCTP_DEBUG_OUTPUT2, "ifa:%p\n", (void *)sctp_ifa); #ifdef INET if ((sctp_ifa->address.sa.sa_family == AF_INET) && (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sctp_ifa->address.sin.sin_addr) != 0)) { SCTPDBG(SCTP_DEBUG_OUTPUT2, "Jailed\n"); continue; } #endif #ifdef INET6 if ((sctp_ifa->address.sa.sa_family == AF_INET6) && (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sctp_ifa->address.sin6.sin6_addr) != 0)) { SCTPDBG(SCTP_DEBUG_OUTPUT2, "Jailed\n"); continue; } #endif if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) && (non_asoc_addr_ok == 0)) { SCTPDBG(SCTP_DEBUG_OUTPUT2, "Defer\n"); continue; } sifa = sctp_is_ifa_addr_acceptable(sctp_ifa, dest_is_loop, dest_is_priv, fam); if (sifa == NULL) { SCTPDBG(SCTP_DEBUG_OUTPUT2, "IFA not acceptable\n"); continue; } if (stcb) { if (sctp_is_address_in_scope(sifa, &stcb->asoc.scope, 0) == 0) { SCTPDBG(SCTP_DEBUG_OUTPUT2, "NOT in scope\n"); sifa = NULL; continue; } if (((non_asoc_addr_ok == 0) && (sctp_is_addr_restricted(stcb, sifa))) || (non_asoc_addr_ok && (sctp_is_addr_restricted(stcb, sifa)) && (!sctp_is_addr_pending(stcb, sifa)))) { /* * It is restricted for some reason.. * probably not yet added. */ SCTPDBG(SCTP_DEBUG_OUTPUT2, "Its restricted\n"); sifa = NULL; continue; } } atomic_add_int(&sifa->refcount, 1); goto out; } plan_d: /* * plan_d: We are in trouble. No preferred address on the emit * interface. And not even a preferred address on all interfaces. Go * out and see if we can find an acceptable address somewhere * amongst all interfaces. */ SCTPDBG(SCTP_DEBUG_OUTPUT2, "Trying Plan D looked_at is %p\n", (void *)looked_at); LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { if (dest_is_loop == 0 && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) { /* wrong base scope */ continue; } LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { #ifdef INET if ((sctp_ifa->address.sa.sa_family == AF_INET) && (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sctp_ifa->address.sin.sin_addr) != 0)) { continue; } #endif #ifdef INET6 if ((sctp_ifa->address.sa.sa_family == AF_INET6) && (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sctp_ifa->address.sin6.sin6_addr) != 0)) { continue; } #endif if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) && (non_asoc_addr_ok == 0)) continue; sifa = sctp_is_ifa_addr_acceptable(sctp_ifa, dest_is_loop, dest_is_priv, fam); if (sifa == NULL) continue; if (stcb) { if (sctp_is_address_in_scope(sifa, &stcb->asoc.scope, 0) == 0) { sifa = NULL; continue; } if (((non_asoc_addr_ok == 0) && (sctp_is_addr_restricted(stcb, sifa))) || (non_asoc_addr_ok && (sctp_is_addr_restricted(stcb, sifa)) && (!sctp_is_addr_pending(stcb, sifa)))) { /* * It is restricted for some * reason.. probably not yet added. */ sifa = NULL; continue; } } goto out; } } #ifdef INET if (stcb) { if ((retried == 0) && (stcb->asoc.scope.ipv4_local_scope == 0)) { stcb->asoc.scope.ipv4_local_scope = 1; retried = 1; goto again_with_private_addresses_allowed; } else if (retried == 1) { stcb->asoc.scope.ipv4_local_scope = 0; } } #endif out: #ifdef INET if (sifa) { if (retried == 1) { LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { if (dest_is_loop == 0 && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) { /* wrong base scope */ continue; } LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { struct sctp_ifa *tmp_sifa; #ifdef INET if ((sctp_ifa->address.sa.sa_family == AF_INET) && (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sctp_ifa->address.sin.sin_addr) != 0)) { continue; } #endif #ifdef INET6 if ((sctp_ifa->address.sa.sa_family == AF_INET6) && (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sctp_ifa->address.sin6.sin6_addr) != 0)) { continue; } #endif if ((sctp_ifa->localifa_flags & SCTP_ADDR_DEFER_USE) && (non_asoc_addr_ok == 0)) continue; tmp_sifa = sctp_is_ifa_addr_acceptable(sctp_ifa, dest_is_loop, dest_is_priv, fam); if (tmp_sifa == NULL) { continue; } if (tmp_sifa == sifa) { continue; } if (stcb) { if (sctp_is_address_in_scope(tmp_sifa, &stcb->asoc.scope, 0) == 0) { continue; } if (((non_asoc_addr_ok == 0) && (sctp_is_addr_restricted(stcb, tmp_sifa))) || (non_asoc_addr_ok && (sctp_is_addr_restricted(stcb, tmp_sifa)) && (!sctp_is_addr_pending(stcb, tmp_sifa)))) { /* * It is restricted * for some reason.. * probably not yet * added. */ continue; } } if ((tmp_sifa->address.sin.sin_family == AF_INET) && (IN4_ISPRIVATE_ADDRESS(&(tmp_sifa->address.sin.sin_addr)))) { sctp_add_local_addr_restricted(stcb, tmp_sifa); } } } } atomic_add_int(&sifa->refcount, 1); } #endif return (sifa); } /* tcb may be NULL */ struct sctp_ifa * sctp_source_address_selection(struct sctp_inpcb *inp, struct sctp_tcb *stcb, sctp_route_t *ro, struct sctp_nets *net, int non_asoc_addr_ok, uint32_t vrf_id) { struct sctp_ifa *answer; uint8_t dest_is_priv, dest_is_loop; sa_family_t fam; #ifdef INET struct sockaddr_in *to = (struct sockaddr_in *)&ro->ro_dst; #endif #ifdef INET6 struct sockaddr_in6 *to6 = (struct sockaddr_in6 *)&ro->ro_dst; #endif /** * Rules: * - Find the route if needed, cache if I can. * - Look at interface address in route, Is it in the bound list. If so we * have the best source. * - If not we must rotate amongst the addresses. * * Cavets and issues * * Do we need to pay attention to scope. We can have a private address * or a global address we are sourcing or sending to. So if we draw * it out * zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz * For V4 * ------------------------------------------ * source * dest * result * ----------------------------------------- * Private * Global * NAT * ----------------------------------------- * Private * Private * No problem * ----------------------------------------- * Global * Private * Huh, How will this work? * ----------------------------------------- * Global * Global * No Problem *------------------------------------------ * zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz * For V6 *------------------------------------------ * source * dest * result * ----------------------------------------- * Linklocal * Global * * ----------------------------------------- * Linklocal * Linklocal * No problem * ----------------------------------------- * Global * Linklocal * Huh, How will this work? * ----------------------------------------- * Global * Global * No Problem *------------------------------------------ * zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz * * And then we add to that what happens if there are multiple addresses * assigned to an interface. Remember the ifa on a ifn is a linked * list of addresses. So one interface can have more than one IP * address. What happens if we have both a private and a global * address? Do we then use context of destination to sort out which * one is best? And what about NAT's sending P->G may get you a NAT * translation, or should you select the G thats on the interface in * preference. * * Decisions: * * - count the number of addresses on the interface. * - if it is one, no problem except case . * For we will assume a NAT out there. * - if there are more than one, then we need to worry about scope P * or G. We should prefer G -> G and P -> P if possible. * Then as a secondary fall back to mixed types G->P being a last * ditch one. * - The above all works for bound all, but bound specific we need to * use the same concept but instead only consider the bound * addresses. If the bound set is NOT assigned to the interface then * we must use rotation amongst the bound addresses.. */ if (ro->ro_nh == NULL) { /* * Need a route to cache. */ SCTP_RTALLOC(ro, vrf_id, inp->fibnum); } if (ro->ro_nh == NULL) { return (NULL); } fam = ro->ro_dst.sa_family; dest_is_priv = dest_is_loop = 0; /* Setup our scopes for the destination */ switch (fam) { #ifdef INET case AF_INET: /* Scope based on outbound address */ if (IN4_ISLOOPBACK_ADDRESS(&to->sin_addr)) { dest_is_loop = 1; if (net != NULL) { /* mark it as local */ net->addr_is_local = 1; } } else if ((IN4_ISPRIVATE_ADDRESS(&to->sin_addr))) { dest_is_priv = 1; } break; #endif #ifdef INET6 case AF_INET6: /* Scope based on outbound address */ if (IN6_IS_ADDR_LOOPBACK(&to6->sin6_addr) || SCTP_ROUTE_IS_REAL_LOOP(ro)) { /* * If the address is a loopback address, which * consists of "::1" OR "fe80::1%lo0", we are * loopback scope. But we don't use dest_is_priv * (link local addresses). */ dest_is_loop = 1; if (net != NULL) { /* mark it as local */ net->addr_is_local = 1; } } else if (IN6_IS_ADDR_LINKLOCAL(&to6->sin6_addr)) { dest_is_priv = 1; } break; #endif } SCTPDBG(SCTP_DEBUG_OUTPUT2, "Select source addr for:"); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, (struct sockaddr *)&ro->ro_dst); SCTP_IPI_ADDR_RLOCK(); if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { /* * Bound all case */ answer = sctp_choose_boundall(inp, stcb, net, ro, vrf_id, dest_is_priv, dest_is_loop, non_asoc_addr_ok, fam); SCTP_IPI_ADDR_RUNLOCK(); return (answer); } /* * Subset bound case */ if (stcb) { answer = sctp_choose_boundspecific_stcb(inp, stcb, ro, vrf_id, dest_is_priv, dest_is_loop, non_asoc_addr_ok, fam); } else { answer = sctp_choose_boundspecific_inp(inp, ro, vrf_id, non_asoc_addr_ok, dest_is_priv, dest_is_loop, fam); } SCTP_IPI_ADDR_RUNLOCK(); return (answer); } static int sctp_find_cmsg(int c_type, void *data, struct mbuf *control, size_t cpsize) { struct cmsghdr cmh; struct sctp_sndinfo sndinfo; struct sctp_prinfo prinfo; struct sctp_authinfo authinfo; int tot_len, rem_len, cmsg_data_len, cmsg_data_off, off; int found; /* * Independent of how many mbufs, find the c_type inside the control * structure and copy out the data. */ found = 0; tot_len = SCTP_BUF_LEN(control); for (off = 0; off < tot_len; off += CMSG_ALIGN(cmh.cmsg_len)) { rem_len = tot_len - off; if (rem_len < (int)CMSG_ALIGN(sizeof(cmh))) { /* There is not enough room for one more. */ return (found); } m_copydata(control, off, sizeof(cmh), (caddr_t)&cmh); if (cmh.cmsg_len < CMSG_ALIGN(sizeof(cmh))) { /* We dont't have a complete CMSG header. */ return (found); } if ((cmh.cmsg_len > INT_MAX) || ((int)cmh.cmsg_len > rem_len)) { /* We don't have the complete CMSG. */ return (found); } cmsg_data_len = (int)cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh)); cmsg_data_off = off + CMSG_ALIGN(sizeof(cmh)); if ((cmh.cmsg_level == IPPROTO_SCTP) && ((c_type == cmh.cmsg_type) || ((c_type == SCTP_SNDRCV) && ((cmh.cmsg_type == SCTP_SNDINFO) || (cmh.cmsg_type == SCTP_PRINFO) || (cmh.cmsg_type == SCTP_AUTHINFO))))) { if (c_type == cmh.cmsg_type) { if (cpsize > INT_MAX) { return (found); } if (cmsg_data_len < (int)cpsize) { return (found); } /* It is exactly what we want. Copy it out. */ m_copydata(control, cmsg_data_off, (int)cpsize, (caddr_t)data); return (1); } else { struct sctp_sndrcvinfo *sndrcvinfo; sndrcvinfo = (struct sctp_sndrcvinfo *)data; if (found == 0) { if (cpsize < sizeof(struct sctp_sndrcvinfo)) { return (found); } memset(sndrcvinfo, 0, sizeof(struct sctp_sndrcvinfo)); } switch (cmh.cmsg_type) { case SCTP_SNDINFO: if (cmsg_data_len < (int)sizeof(struct sctp_sndinfo)) { return (found); } m_copydata(control, cmsg_data_off, sizeof(struct sctp_sndinfo), (caddr_t)&sndinfo); sndrcvinfo->sinfo_stream = sndinfo.snd_sid; sndrcvinfo->sinfo_flags = sndinfo.snd_flags; sndrcvinfo->sinfo_ppid = sndinfo.snd_ppid; sndrcvinfo->sinfo_context = sndinfo.snd_context; sndrcvinfo->sinfo_assoc_id = sndinfo.snd_assoc_id; break; case SCTP_PRINFO: if (cmsg_data_len < (int)sizeof(struct sctp_prinfo)) { return (found); } m_copydata(control, cmsg_data_off, sizeof(struct sctp_prinfo), (caddr_t)&prinfo); if (prinfo.pr_policy != SCTP_PR_SCTP_NONE) { sndrcvinfo->sinfo_timetolive = prinfo.pr_value; } else { sndrcvinfo->sinfo_timetolive = 0; } sndrcvinfo->sinfo_flags |= prinfo.pr_policy; break; case SCTP_AUTHINFO: if (cmsg_data_len < (int)sizeof(struct sctp_authinfo)) { return (found); } m_copydata(control, cmsg_data_off, sizeof(struct sctp_authinfo), (caddr_t)&authinfo); sndrcvinfo->sinfo_keynumber_valid = 1; sndrcvinfo->sinfo_keynumber = authinfo.auth_keynumber; break; default: return (found); } found = 1; } } } return (found); } static int sctp_process_cmsgs_for_init(struct sctp_tcb *stcb, struct mbuf *control, int *error) { struct cmsghdr cmh; struct sctp_initmsg initmsg; #ifdef INET struct sockaddr_in sin; #endif #ifdef INET6 struct sockaddr_in6 sin6; #endif int tot_len, rem_len, cmsg_data_len, cmsg_data_off, off; tot_len = SCTP_BUF_LEN(control); for (off = 0; off < tot_len; off += CMSG_ALIGN(cmh.cmsg_len)) { rem_len = tot_len - off; if (rem_len < (int)CMSG_ALIGN(sizeof(cmh))) { /* There is not enough room for one more. */ *error = EINVAL; return (1); } m_copydata(control, off, sizeof(cmh), (caddr_t)&cmh); if (cmh.cmsg_len < CMSG_ALIGN(sizeof(cmh))) { /* We dont't have a complete CMSG header. */ *error = EINVAL; return (1); } if ((cmh.cmsg_len > INT_MAX) || ((int)cmh.cmsg_len > rem_len)) { /* We don't have the complete CMSG. */ *error = EINVAL; return (1); } cmsg_data_len = (int)cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh)); cmsg_data_off = off + CMSG_ALIGN(sizeof(cmh)); if (cmh.cmsg_level == IPPROTO_SCTP) { switch (cmh.cmsg_type) { case SCTP_INIT: if (cmsg_data_len < (int)sizeof(struct sctp_initmsg)) { *error = EINVAL; return (1); } m_copydata(control, cmsg_data_off, sizeof(struct sctp_initmsg), (caddr_t)&initmsg); if (initmsg.sinit_max_attempts) stcb->asoc.max_init_times = initmsg.sinit_max_attempts; if (initmsg.sinit_num_ostreams) stcb->asoc.pre_open_streams = initmsg.sinit_num_ostreams; if (initmsg.sinit_max_instreams) stcb->asoc.max_inbound_streams = initmsg.sinit_max_instreams; if (initmsg.sinit_max_init_timeo) stcb->asoc.initial_init_rto_max = initmsg.sinit_max_init_timeo; if (stcb->asoc.streamoutcnt < stcb->asoc.pre_open_streams) { struct sctp_stream_out *tmp_str; unsigned int i; #if defined(SCTP_DETAILED_STR_STATS) int j; #endif /* Default is NOT correct */ SCTPDBG(SCTP_DEBUG_OUTPUT1, "Ok, default:%d pre_open:%d\n", stcb->asoc.streamoutcnt, stcb->asoc.pre_open_streams); SCTP_TCB_UNLOCK(stcb); SCTP_MALLOC(tmp_str, struct sctp_stream_out *, (stcb->asoc.pre_open_streams * sizeof(struct sctp_stream_out)), SCTP_M_STRMO); SCTP_TCB_LOCK(stcb); if (tmp_str != NULL) { SCTP_FREE(stcb->asoc.strmout, SCTP_M_STRMO); stcb->asoc.strmout = tmp_str; stcb->asoc.strm_realoutsize = stcb->asoc.streamoutcnt = stcb->asoc.pre_open_streams; } else { stcb->asoc.pre_open_streams = stcb->asoc.streamoutcnt; } for (i = 0; i < stcb->asoc.streamoutcnt; i++) { TAILQ_INIT(&stcb->asoc.strmout[i].outqueue); stcb->asoc.strmout[i].chunks_on_queues = 0; stcb->asoc.strmout[i].next_mid_ordered = 0; stcb->asoc.strmout[i].next_mid_unordered = 0; #if defined(SCTP_DETAILED_STR_STATS) for (j = 0; j < SCTP_PR_SCTP_MAX + 1; j++) { stcb->asoc.strmout[i].abandoned_sent[j] = 0; stcb->asoc.strmout[i].abandoned_unsent[j] = 0; } #else stcb->asoc.strmout[i].abandoned_sent[0] = 0; stcb->asoc.strmout[i].abandoned_unsent[0] = 0; #endif stcb->asoc.strmout[i].sid = i; stcb->asoc.strmout[i].last_msg_incomplete = 0; stcb->asoc.strmout[i].state = SCTP_STREAM_OPENING; stcb->asoc.ss_functions.sctp_ss_init_stream(stcb, &stcb->asoc.strmout[i], NULL); } } break; #ifdef INET case SCTP_DSTADDRV4: if (cmsg_data_len < (int)sizeof(struct in_addr)) { *error = EINVAL; return (1); } memset(&sin, 0, sizeof(struct sockaddr_in)); sin.sin_family = AF_INET; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_port = stcb->rport; m_copydata(control, cmsg_data_off, sizeof(struct in_addr), (caddr_t)&sin.sin_addr); if ((sin.sin_addr.s_addr == INADDR_ANY) || (sin.sin_addr.s_addr == INADDR_BROADCAST) || IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { *error = EINVAL; return (1); } if (sctp_add_remote_addr(stcb, (struct sockaddr *)&sin, NULL, stcb->asoc.port, SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) { *error = ENOBUFS; return (1); } break; #endif #ifdef INET6 case SCTP_DSTADDRV6: if (cmsg_data_len < (int)sizeof(struct in6_addr)) { *error = EINVAL; return (1); } memset(&sin6, 0, sizeof(struct sockaddr_in6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_port = stcb->rport; m_copydata(control, cmsg_data_off, sizeof(struct in6_addr), (caddr_t)&sin6.sin6_addr); if (IN6_IS_ADDR_UNSPECIFIED(&sin6.sin6_addr) || IN6_IS_ADDR_MULTICAST(&sin6.sin6_addr)) { *error = EINVAL; return (1); } #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&sin6.sin6_addr)) { in6_sin6_2_sin(&sin, &sin6); if ((sin.sin_addr.s_addr == INADDR_ANY) || (sin.sin_addr.s_addr == INADDR_BROADCAST) || IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { *error = EINVAL; return (1); } if (sctp_add_remote_addr(stcb, (struct sockaddr *)&sin, NULL, stcb->asoc.port, SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) { *error = ENOBUFS; return (1); } } else #endif if (sctp_add_remote_addr(stcb, (struct sockaddr *)&sin6, NULL, stcb->asoc.port, SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) { *error = ENOBUFS; return (1); } break; #endif default: break; } } } return (0); } #if defined(INET) || defined(INET6) static struct sctp_tcb * sctp_findassociation_cmsgs(struct sctp_inpcb **inp_p, uint16_t port, struct mbuf *control, struct sctp_nets **net_p, int *error) { struct cmsghdr cmh; struct sctp_tcb *stcb; struct sockaddr *addr; #ifdef INET struct sockaddr_in sin; #endif #ifdef INET6 struct sockaddr_in6 sin6; #endif int tot_len, rem_len, cmsg_data_len, cmsg_data_off, off; tot_len = SCTP_BUF_LEN(control); for (off = 0; off < tot_len; off += CMSG_ALIGN(cmh.cmsg_len)) { rem_len = tot_len - off; if (rem_len < (int)CMSG_ALIGN(sizeof(cmh))) { /* There is not enough room for one more. */ *error = EINVAL; return (NULL); } m_copydata(control, off, sizeof(cmh), (caddr_t)&cmh); if (cmh.cmsg_len < CMSG_ALIGN(sizeof(cmh))) { /* We dont't have a complete CMSG header. */ *error = EINVAL; return (NULL); } if ((cmh.cmsg_len > INT_MAX) || ((int)cmh.cmsg_len > rem_len)) { /* We don't have the complete CMSG. */ *error = EINVAL; return (NULL); } cmsg_data_len = (int)cmh.cmsg_len - CMSG_ALIGN(sizeof(cmh)); cmsg_data_off = off + CMSG_ALIGN(sizeof(cmh)); if (cmh.cmsg_level == IPPROTO_SCTP) { switch (cmh.cmsg_type) { #ifdef INET case SCTP_DSTADDRV4: if (cmsg_data_len < (int)sizeof(struct in_addr)) { *error = EINVAL; return (NULL); } memset(&sin, 0, sizeof(struct sockaddr_in)); sin.sin_family = AF_INET; sin.sin_len = sizeof(struct sockaddr_in); sin.sin_port = port; m_copydata(control, cmsg_data_off, sizeof(struct in_addr), (caddr_t)&sin.sin_addr); addr = (struct sockaddr *)&sin; break; #endif #ifdef INET6 case SCTP_DSTADDRV6: if (cmsg_data_len < (int)sizeof(struct in6_addr)) { *error = EINVAL; return (NULL); } memset(&sin6, 0, sizeof(struct sockaddr_in6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_port = port; m_copydata(control, cmsg_data_off, sizeof(struct in6_addr), (caddr_t)&sin6.sin6_addr); #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&sin6.sin6_addr)) { in6_sin6_2_sin(&sin, &sin6); addr = (struct sockaddr *)&sin; } else #endif addr = (struct sockaddr *)&sin6; break; #endif default: addr = NULL; break; } if (addr) { stcb = sctp_findassociation_ep_addr(inp_p, addr, net_p, NULL, NULL); if (stcb != NULL) { return (stcb); } } } } return (NULL); } #endif static struct mbuf * sctp_add_cookie(struct mbuf *init, int init_offset, struct mbuf *initack, int initack_offset, struct sctp_state_cookie *stc_in, uint8_t **signature) { struct mbuf *copy_init, *copy_initack, *m_at, *sig, *mret; struct sctp_state_cookie *stc; struct sctp_paramhdr *ph; uint16_t cookie_sz; mret = sctp_get_mbuf_for_msg((sizeof(struct sctp_state_cookie) + sizeof(struct sctp_paramhdr)), 0, M_NOWAIT, 1, MT_DATA); if (mret == NULL) { return (NULL); } copy_init = SCTP_M_COPYM(init, init_offset, M_COPYALL, M_NOWAIT); if (copy_init == NULL) { sctp_m_freem(mret); return (NULL); } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(copy_init, SCTP_MBUF_ICOPY); } #endif copy_initack = SCTP_M_COPYM(initack, initack_offset, M_COPYALL, M_NOWAIT); if (copy_initack == NULL) { sctp_m_freem(mret); sctp_m_freem(copy_init); return (NULL); } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(copy_initack, SCTP_MBUF_ICOPY); } #endif /* easy side we just drop it on the end */ ph = mtod(mret, struct sctp_paramhdr *); SCTP_BUF_LEN(mret) = sizeof(struct sctp_state_cookie) + sizeof(struct sctp_paramhdr); stc = (struct sctp_state_cookie *)((caddr_t)ph + sizeof(struct sctp_paramhdr)); ph->param_type = htons(SCTP_STATE_COOKIE); ph->param_length = 0; /* fill in at the end */ /* Fill in the stc cookie data */ memcpy(stc, stc_in, sizeof(struct sctp_state_cookie)); /* tack the INIT and then the INIT-ACK onto the chain */ cookie_sz = 0; for (m_at = mret; m_at; m_at = SCTP_BUF_NEXT(m_at)) { cookie_sz += SCTP_BUF_LEN(m_at); if (SCTP_BUF_NEXT(m_at) == NULL) { SCTP_BUF_NEXT(m_at) = copy_init; break; } } for (m_at = copy_init; m_at; m_at = SCTP_BUF_NEXT(m_at)) { cookie_sz += SCTP_BUF_LEN(m_at); if (SCTP_BUF_NEXT(m_at) == NULL) { SCTP_BUF_NEXT(m_at) = copy_initack; break; } } for (m_at = copy_initack; m_at; m_at = SCTP_BUF_NEXT(m_at)) { cookie_sz += SCTP_BUF_LEN(m_at); if (SCTP_BUF_NEXT(m_at) == NULL) { break; } } sig = sctp_get_mbuf_for_msg(SCTP_SIGNATURE_SIZE, 0, M_NOWAIT, 1, MT_DATA); if (sig == NULL) { /* no space, so free the entire chain */ sctp_m_freem(mret); return (NULL); } SCTP_BUF_NEXT(m_at) = sig; SCTP_BUF_LEN(sig) = SCTP_SIGNATURE_SIZE; cookie_sz += SCTP_SIGNATURE_SIZE; ph->param_length = htons(cookie_sz); *signature = (uint8_t *)mtod(sig, caddr_t); memset(*signature, 0, SCTP_SIGNATURE_SIZE); return (mret); } static uint8_t sctp_get_ect(struct sctp_tcb *stcb) { if ((stcb != NULL) && (stcb->asoc.ecn_supported == 1)) { return (SCTP_ECT0_BIT); } else { return (0); } } #if defined(INET) || defined(INET6) static void sctp_handle_no_route(struct sctp_tcb *stcb, struct sctp_nets *net, int so_locked) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "dropped packet - no valid source addr\n"); if (net) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Destination was "); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT1, &net->ro._l_addr.sa); if (net->dest_state & SCTP_ADDR_CONFIRMED) { if ((net->dest_state & SCTP_ADDR_REACHABLE) && stcb) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "no route takes interface %p down\n", (void *)net); sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN, stcb, 0, (void *)net, so_locked); net->dest_state &= ~SCTP_ADDR_REACHABLE; net->dest_state &= ~SCTP_ADDR_PF; } } if (stcb) { if (net == stcb->asoc.primary_destination) { /* need a new primary */ struct sctp_nets *alt; alt = sctp_find_alternate_net(stcb, net, 0); if (alt != net) { if (stcb->asoc.alternate) { sctp_free_remote_addr(stcb->asoc.alternate); } stcb->asoc.alternate = alt; atomic_add_int(&stcb->asoc.alternate->ref_count, 1); if (net->ro._s_addr) { sctp_free_ifa(net->ro._s_addr); net->ro._s_addr = NULL; } net->src_addr_selected = 0; } } } } } #endif static int sctp_lowlevel_chunk_output(struct sctp_inpcb *inp, struct sctp_tcb *stcb, /* may be NULL */ struct sctp_nets *net, struct sockaddr *to, struct mbuf *m, uint32_t auth_offset, struct sctp_auth_chunk *auth, uint16_t auth_keyid, int nofragment_flag, int ecn_ok, int out_of_asoc_ok, uint16_t src_port, uint16_t dest_port, uint32_t v_tag, uint16_t port, union sctp_sockstore *over_addr, uint8_t mflowtype, uint32_t mflowid, int so_locked) { /* nofragment_flag to tell if IP_DF should be set (IPv4 only) */ /** * Given a mbuf chain (via SCTP_BUF_NEXT()) that holds a packet header * WITH an SCTPHDR but no IP header, endpoint inp and sa structure: * - fill in the HMAC digest of any AUTH chunk in the packet. * - calculate and fill in the SCTP checksum. * - prepend an IP address header. * - if boundall use INADDR_ANY. * - if boundspecific do source address selection. * - set fragmentation option for ipV4. * - On return from IP output, check/adjust mtu size of output * interface and smallest_mtu size as well. */ /* Will need ifdefs around this */ struct mbuf *newm; struct sctphdr *sctphdr; int packet_length; int ret; #if defined(INET) || defined(INET6) uint32_t vrf_id; #endif #if defined(INET) || defined(INET6) struct mbuf *o_pak; sctp_route_t *ro = NULL; struct udphdr *udp = NULL; #endif uint8_t tos_value; if ((net) && (net->dest_state & SCTP_ADDR_OUT_OF_SCOPE)) { SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EFAULT); sctp_m_freem(m); return (EFAULT); } #if defined(INET) || defined(INET6) if (stcb) { vrf_id = stcb->asoc.vrf_id; } else { vrf_id = inp->def_vrf_id; } #endif /* fill in the HMAC digest for any AUTH chunk in the packet */ if ((auth != NULL) && (stcb != NULL)) { sctp_fill_hmac_digest_m(m, auth_offset, auth, stcb, auth_keyid); } if (net) { tos_value = net->dscp; } else if (stcb) { tos_value = stcb->asoc.default_dscp; } else { tos_value = inp->sctp_ep.default_dscp; } switch (to->sa_family) { #ifdef INET case AF_INET: { struct ip *ip = NULL; sctp_route_t iproute; int len; len = SCTP_MIN_V4_OVERHEAD; if (port) { len += sizeof(struct udphdr); } newm = sctp_get_mbuf_for_msg(len, 1, M_NOWAIT, 1, MT_DATA); if (newm == NULL) { sctp_m_freem(m); SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } SCTP_ALIGN_TO_END(newm, len); SCTP_BUF_LEN(newm) = len; SCTP_BUF_NEXT(newm) = m; m = newm; if (net != NULL) { m->m_pkthdr.flowid = net->flowid; M_HASHTYPE_SET(m, net->flowtype); } else { m->m_pkthdr.flowid = mflowid; M_HASHTYPE_SET(m, mflowtype); } packet_length = sctp_calculate_len(m); ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = (sizeof(struct ip) >> 2); if (tos_value == 0) { /* * This means especially, that it is not set * at the SCTP layer. So use the value from * the IP layer. */ tos_value = inp->ip_inp.inp.inp_ip_tos; } tos_value &= 0xfc; if (ecn_ok) { tos_value |= sctp_get_ect(stcb); } if ((nofragment_flag) && (port == 0)) { ip->ip_off = htons(IP_DF); } else { ip->ip_off = htons(0); } /* FreeBSD has a function for ip_id's */ ip_fillid(ip); ip->ip_ttl = inp->ip_inp.inp.inp_ip_ttl; ip->ip_len = htons(packet_length); ip->ip_tos = tos_value; if (port) { ip->ip_p = IPPROTO_UDP; } else { ip->ip_p = IPPROTO_SCTP; } ip->ip_sum = 0; if (net == NULL) { ro = &iproute; memset(&iproute, 0, sizeof(iproute)); memcpy(&ro->ro_dst, to, to->sa_len); } else { ro = (sctp_route_t *)&net->ro; } /* Now the address selection part */ ip->ip_dst.s_addr = ((struct sockaddr_in *)to)->sin_addr.s_addr; /* call the routine to select the src address */ if (net && out_of_asoc_ok == 0) { if (net->ro._s_addr && (net->ro._s_addr->localifa_flags & (SCTP_BEING_DELETED | SCTP_ADDR_IFA_UNUSEABLE))) { sctp_free_ifa(net->ro._s_addr); net->ro._s_addr = NULL; net->src_addr_selected = 0; RO_NHFREE(ro); } if (net->src_addr_selected == 0) { /* Cache the source address */ net->ro._s_addr = sctp_source_address_selection(inp, stcb, ro, net, 0, vrf_id); net->src_addr_selected = 1; } if (net->ro._s_addr == NULL) { /* No route to host */ net->src_addr_selected = 0; sctp_handle_no_route(stcb, net, so_locked); SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EHOSTUNREACH); sctp_m_freem(m); return (EHOSTUNREACH); } ip->ip_src = net->ro._s_addr->address.sin.sin_addr; } else { if (over_addr == NULL) { struct sctp_ifa *_lsrc; _lsrc = sctp_source_address_selection(inp, stcb, ro, net, out_of_asoc_ok, vrf_id); if (_lsrc == NULL) { sctp_handle_no_route(stcb, net, so_locked); SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EHOSTUNREACH); sctp_m_freem(m); return (EHOSTUNREACH); } ip->ip_src = _lsrc->address.sin.sin_addr; sctp_free_ifa(_lsrc); } else { ip->ip_src = over_addr->sin.sin_addr; SCTP_RTALLOC(ro, vrf_id, inp->fibnum); } } if (port) { if (htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)) == 0) { sctp_handle_no_route(stcb, net, so_locked); SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EHOSTUNREACH); sctp_m_freem(m); return (EHOSTUNREACH); } udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip)); udp->uh_sport = htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)); udp->uh_dport = port; udp->uh_ulen = htons((uint16_t)(packet_length - sizeof(struct ip))); if (V_udp_cksum) { udp->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, udp->uh_ulen + htons(IPPROTO_UDP)); } else { udp->uh_sum = 0; } sctphdr = (struct sctphdr *)((caddr_t)udp + sizeof(struct udphdr)); } else { sctphdr = (struct sctphdr *)((caddr_t)ip + sizeof(struct ip)); } sctphdr->src_port = src_port; sctphdr->dest_port = dest_port; sctphdr->v_tag = v_tag; sctphdr->checksum = 0; /* * If source address selection fails and we find no * route then the ip_output should fail as well with * a NO_ROUTE_TO_HOST type error. We probably should * catch that somewhere and abort the association * right away (assuming this is an INIT being sent). */ if (ro->ro_nh == NULL) { /* * src addr selection failed to find a route * (or valid source addr), so we can't get * there from here (yet)! */ sctp_handle_no_route(stcb, net, so_locked); SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EHOSTUNREACH); sctp_m_freem(m); return (EHOSTUNREACH); } if (ro != &iproute) { memcpy(&iproute, ro, sizeof(*ro)); } SCTPDBG(SCTP_DEBUG_OUTPUT3, "Calling ipv4 output routine from low level src addr:%x\n", (uint32_t)(ntohl(ip->ip_src.s_addr))); SCTPDBG(SCTP_DEBUG_OUTPUT3, "Destination is %x\n", (uint32_t)(ntohl(ip->ip_dst.s_addr))); SCTPDBG(SCTP_DEBUG_OUTPUT3, "RTP route is %p through\n", (void *)ro->ro_nh); if (SCTP_GET_HEADER_FOR_OUTPUT(o_pak)) { /* failed to prepend data, give up */ SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); sctp_m_freem(m); return (ENOMEM); } SCTP_ATTACH_CHAIN(o_pak, m, packet_length); if (port) { sctphdr->checksum = sctp_calculate_cksum(m, sizeof(struct ip) + sizeof(struct udphdr)); SCTP_STAT_INCR(sctps_sendswcrc); if (V_udp_cksum) { SCTP_ENABLE_UDP_CSUM(o_pak); } } else { m->m_pkthdr.csum_flags = CSUM_SCTP; m->m_pkthdr.csum_data = offsetof(struct sctphdr, checksum); SCTP_STAT_INCR(sctps_sendhwcrc); } #ifdef SCTP_PACKET_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING) sctp_packet_log(o_pak); #endif /* send it out. table id is taken from stcb */ SCTP_PROBE5(send, NULL, stcb, ip, stcb, sctphdr); SCTP_IP_OUTPUT(ret, o_pak, ro, stcb, vrf_id); if (port) { UDPSTAT_INC(udps_opackets); } SCTP_STAT_INCR(sctps_sendpackets); SCTP_STAT_INCR_COUNTER64(sctps_outpackets); if (ret) SCTP_STAT_INCR(sctps_senderrors); SCTPDBG(SCTP_DEBUG_OUTPUT3, "IP output returns %d\n", ret); if (net == NULL) { /* free tempy routes */ RO_NHFREE(ro); } else { if ((ro->ro_nh != NULL) && (net->ro._s_addr) && ((net->dest_state & SCTP_ADDR_NO_PMTUD) == 0)) { uint32_t mtu; mtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, ro->ro_nh); if (mtu > 0) { if (net->port) { mtu -= sizeof(struct udphdr); } if (mtu < net->mtu) { if ((stcb != NULL) && (stcb->asoc.smallest_mtu > mtu)) { sctp_mtu_size_reset(inp, &stcb->asoc, mtu); } net->mtu = mtu; } } } else if (ro->ro_nh == NULL) { /* route was freed */ if (net->ro._s_addr && net->src_addr_selected) { sctp_free_ifa(net->ro._s_addr); net->ro._s_addr = NULL; } net->src_addr_selected = 0; } } return (ret); } #endif #ifdef INET6 case AF_INET6: { uint32_t flowlabel, flowinfo; struct ip6_hdr *ip6h; struct route_in6 ip6route; struct ifnet *ifp; struct sockaddr_in6 *sin6, tmp, *lsa6, lsa6_tmp; int prev_scope = 0; struct sockaddr_in6 lsa6_storage; int error; u_short prev_port = 0; int len; if (net) { flowlabel = net->flowlabel; } else if (stcb) { flowlabel = stcb->asoc.default_flowlabel; } else { flowlabel = inp->sctp_ep.default_flowlabel; } if (flowlabel == 0) { /* * This means especially, that it is not set * at the SCTP layer. So use the value from * the IP layer. */ flowlabel = ntohl(((struct inpcb *)inp)->inp_flow); } flowlabel &= 0x000fffff; len = SCTP_MIN_OVERHEAD; if (port) { len += sizeof(struct udphdr); } newm = sctp_get_mbuf_for_msg(len, 1, M_NOWAIT, 1, MT_DATA); if (newm == NULL) { sctp_m_freem(m); SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } SCTP_ALIGN_TO_END(newm, len); SCTP_BUF_LEN(newm) = len; SCTP_BUF_NEXT(newm) = m; m = newm; if (net != NULL) { m->m_pkthdr.flowid = net->flowid; M_HASHTYPE_SET(m, net->flowtype); } else { m->m_pkthdr.flowid = mflowid; M_HASHTYPE_SET(m, mflowtype); } packet_length = sctp_calculate_len(m); ip6h = mtod(m, struct ip6_hdr *); /* protect *sin6 from overwrite */ sin6 = (struct sockaddr_in6 *)to; tmp = *sin6; sin6 = &tmp; /* KAME hack: embed scopeid */ if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) { SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); sctp_m_freem(m); return (EINVAL); } if (net == NULL) { memset(&ip6route, 0, sizeof(ip6route)); ro = (sctp_route_t *)&ip6route; memcpy(&ro->ro_dst, sin6, sin6->sin6_len); } else { ro = (sctp_route_t *)&net->ro; } /* * We assume here that inp_flow is in host byte * order within the TCB! */ if (tos_value == 0) { /* * This means especially, that it is not set * at the SCTP layer. So use the value from * the IP layer. */ tos_value = (ntohl(((struct inpcb *)inp)->inp_flow) >> 20) & 0xff; } tos_value &= 0xfc; if (ecn_ok) { tos_value |= sctp_get_ect(stcb); } flowinfo = 0x06; flowinfo <<= 8; flowinfo |= tos_value; flowinfo <<= 20; flowinfo |= flowlabel; ip6h->ip6_flow = htonl(flowinfo); if (port) { ip6h->ip6_nxt = IPPROTO_UDP; } else { ip6h->ip6_nxt = IPPROTO_SCTP; } ip6h->ip6_plen = htons((uint16_t)(packet_length - sizeof(struct ip6_hdr))); ip6h->ip6_dst = sin6->sin6_addr; /* * Add SRC address selection here: we can only reuse * to a limited degree the kame src-addr-sel, since * we can try their selection but it may not be * bound. */ memset(&lsa6_tmp, 0, sizeof(lsa6_tmp)); lsa6_tmp.sin6_family = AF_INET6; lsa6_tmp.sin6_len = sizeof(lsa6_tmp); lsa6 = &lsa6_tmp; if (net && out_of_asoc_ok == 0) { if (net->ro._s_addr && (net->ro._s_addr->localifa_flags & (SCTP_BEING_DELETED | SCTP_ADDR_IFA_UNUSEABLE))) { sctp_free_ifa(net->ro._s_addr); net->ro._s_addr = NULL; net->src_addr_selected = 0; RO_NHFREE(ro); } if (net->src_addr_selected == 0) { sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; /* KAME hack: embed scopeid */ if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) { SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); sctp_m_freem(m); return (EINVAL); } /* Cache the source address */ net->ro._s_addr = sctp_source_address_selection(inp, stcb, ro, net, 0, vrf_id); (void)sa6_recoverscope(sin6); net->src_addr_selected = 1; } if (net->ro._s_addr == NULL) { SCTPDBG(SCTP_DEBUG_OUTPUT3, "V6:No route to host\n"); net->src_addr_selected = 0; sctp_handle_no_route(stcb, net, so_locked); SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EHOSTUNREACH); sctp_m_freem(m); return (EHOSTUNREACH); } lsa6->sin6_addr = net->ro._s_addr->address.sin6.sin6_addr; } else { sin6 = (struct sockaddr_in6 *)&ro->ro_dst; /* KAME hack: embed scopeid */ if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) { SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); sctp_m_freem(m); return (EINVAL); } if (over_addr == NULL) { struct sctp_ifa *_lsrc; _lsrc = sctp_source_address_selection(inp, stcb, ro, net, out_of_asoc_ok, vrf_id); if (_lsrc == NULL) { sctp_handle_no_route(stcb, net, so_locked); SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EHOSTUNREACH); sctp_m_freem(m); return (EHOSTUNREACH); } lsa6->sin6_addr = _lsrc->address.sin6.sin6_addr; sctp_free_ifa(_lsrc); } else { lsa6->sin6_addr = over_addr->sin6.sin6_addr; SCTP_RTALLOC(ro, vrf_id, inp->fibnum); } (void)sa6_recoverscope(sin6); } lsa6->sin6_port = inp->sctp_lport; if (ro->ro_nh == NULL) { /* * src addr selection failed to find a route * (or valid source addr), so we can't get * there from here! */ sctp_handle_no_route(stcb, net, so_locked); SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EHOSTUNREACH); sctp_m_freem(m); return (EHOSTUNREACH); } /* * XXX: sa6 may not have a valid sin6_scope_id in * the non-SCOPEDROUTING case. */ memset(&lsa6_storage, 0, sizeof(lsa6_storage)); lsa6_storage.sin6_family = AF_INET6; lsa6_storage.sin6_len = sizeof(lsa6_storage); lsa6_storage.sin6_addr = lsa6->sin6_addr; if ((error = sa6_recoverscope(&lsa6_storage)) != 0) { SCTPDBG(SCTP_DEBUG_OUTPUT3, "recover scope fails error %d\n", error); sctp_m_freem(m); return (error); } /* XXX */ lsa6_storage.sin6_addr = lsa6->sin6_addr; lsa6_storage.sin6_port = inp->sctp_lport; lsa6 = &lsa6_storage; ip6h->ip6_src = lsa6->sin6_addr; if (port) { if (htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)) == 0) { sctp_handle_no_route(stcb, net, so_locked); SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EHOSTUNREACH); sctp_m_freem(m); return (EHOSTUNREACH); } udp = (struct udphdr *)((caddr_t)ip6h + sizeof(struct ip6_hdr)); udp->uh_sport = htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)); udp->uh_dport = port; udp->uh_ulen = htons((uint16_t)(packet_length - sizeof(struct ip6_hdr))); udp->uh_sum = 0; sctphdr = (struct sctphdr *)((caddr_t)udp + sizeof(struct udphdr)); } else { sctphdr = (struct sctphdr *)((caddr_t)ip6h + sizeof(struct ip6_hdr)); } sctphdr->src_port = src_port; sctphdr->dest_port = dest_port; sctphdr->v_tag = v_tag; sctphdr->checksum = 0; /* * We set the hop limit now since there is a good * chance that our ro pointer is now filled */ ip6h->ip6_hlim = SCTP_GET_HLIM(inp, ro); ifp = SCTP_GET_IFN_VOID_FROM_ROUTE(ro); #ifdef SCTP_DEBUG /* Copy to be sure something bad is not happening */ sin6->sin6_addr = ip6h->ip6_dst; lsa6->sin6_addr = ip6h->ip6_src; #endif SCTPDBG(SCTP_DEBUG_OUTPUT3, "Calling ipv6 output routine from low level\n"); SCTPDBG(SCTP_DEBUG_OUTPUT3, "src: "); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT3, (struct sockaddr *)lsa6); SCTPDBG(SCTP_DEBUG_OUTPUT3, "dst: "); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT3, (struct sockaddr *)sin6); if (net) { sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; /* * preserve the port and scope for link * local send */ prev_scope = sin6->sin6_scope_id; prev_port = sin6->sin6_port; } if (SCTP_GET_HEADER_FOR_OUTPUT(o_pak)) { /* failed to prepend data, give up */ sctp_m_freem(m); SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } SCTP_ATTACH_CHAIN(o_pak, m, packet_length); if (port) { sctphdr->checksum = sctp_calculate_cksum(m, sizeof(struct ip6_hdr) + sizeof(struct udphdr)); SCTP_STAT_INCR(sctps_sendswcrc); if ((udp->uh_sum = in6_cksum(o_pak, IPPROTO_UDP, sizeof(struct ip6_hdr), packet_length - sizeof(struct ip6_hdr))) == 0) { udp->uh_sum = 0xffff; } } else { m->m_pkthdr.csum_flags = CSUM_SCTP_IPV6; m->m_pkthdr.csum_data = offsetof(struct sctphdr, checksum); SCTP_STAT_INCR(sctps_sendhwcrc); } /* send it out. table id is taken from stcb */ #ifdef SCTP_PACKET_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING) sctp_packet_log(o_pak); #endif SCTP_PROBE5(send, NULL, stcb, ip6h, stcb, sctphdr); SCTP_IP6_OUTPUT(ret, o_pak, (struct route_in6 *)ro, &ifp, stcb, vrf_id); if (net) { /* for link local this must be done */ sin6->sin6_scope_id = prev_scope; sin6->sin6_port = prev_port; } SCTPDBG(SCTP_DEBUG_OUTPUT3, "return from send is %d\n", ret); if (port) { UDPSTAT_INC(udps_opackets); } SCTP_STAT_INCR(sctps_sendpackets); SCTP_STAT_INCR_COUNTER64(sctps_outpackets); if (ret) { SCTP_STAT_INCR(sctps_senderrors); } if (net == NULL) { /* Now if we had a temp route free it */ RO_NHFREE(ro); } else { /* * PMTU check versus smallest asoc MTU goes * here */ if (ro->ro_nh == NULL) { /* Route was freed */ if (net->ro._s_addr && net->src_addr_selected) { sctp_free_ifa(net->ro._s_addr); net->ro._s_addr = NULL; } net->src_addr_selected = 0; } if ((ro->ro_nh != NULL) && (net->ro._s_addr) && ((net->dest_state & SCTP_ADDR_NO_PMTUD) == 0)) { uint32_t mtu; mtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, ro->ro_nh); if (mtu > 0) { if (net->port) { mtu -= sizeof(struct udphdr); } if (mtu < net->mtu) { if ((stcb != NULL) && (stcb->asoc.smallest_mtu > mtu)) { sctp_mtu_size_reset(inp, &stcb->asoc, mtu); } net->mtu = mtu; } } } else if (ifp) { if (ND_IFINFO(ifp)->linkmtu && (stcb->asoc.smallest_mtu > ND_IFINFO(ifp)->linkmtu)) { sctp_mtu_size_reset(inp, &stcb->asoc, ND_IFINFO(ifp)->linkmtu); } } } return (ret); } #endif default: SCTPDBG(SCTP_DEBUG_OUTPUT1, "Unknown protocol (TSNH) type %d\n", ((struct sockaddr *)to)->sa_family); sctp_m_freem(m); SCTP_LTRACE_ERR_RET_PKT(m, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EFAULT); return (EFAULT); } } void sctp_send_initiate(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked) { struct mbuf *m, *m_last; struct sctp_nets *net; struct sctp_init_chunk *init; struct sctp_supported_addr_param *sup_addr; struct sctp_adaptation_layer_indication *ali; struct sctp_supported_chunk_types_param *pr_supported; struct sctp_paramhdr *ph; int cnt_inits_to = 0; int error; uint16_t num_ext, chunk_len, padding_len, parameter_len; /* INIT's always go to the primary (and usually ONLY address) */ net = stcb->asoc.primary_destination; if (net == NULL) { net = TAILQ_FIRST(&stcb->asoc.nets); if (net == NULL) { /* TSNH */ return; } /* we confirm any address we send an INIT to */ net->dest_state &= ~SCTP_ADDR_UNCONFIRMED; (void)sctp_set_primary_addr(stcb, NULL, net); } else { /* we confirm any address we send an INIT to */ net->dest_state &= ~SCTP_ADDR_UNCONFIRMED; } SCTPDBG(SCTP_DEBUG_OUTPUT4, "Sending INIT\n"); #ifdef INET6 if (net->ro._l_addr.sa.sa_family == AF_INET6) { /* * special hook, if we are sending to link local it will not * show up in our private address count. */ if (IN6_IS_ADDR_LINKLOCAL(&net->ro._l_addr.sin6.sin6_addr)) cnt_inits_to = 1; } #endif if (SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) { /* This case should not happen */ SCTPDBG(SCTP_DEBUG_OUTPUT4, "Sending INIT - failed timer?\n"); return; } /* start the INIT timer */ sctp_timer_start(SCTP_TIMER_TYPE_INIT, inp, stcb, net); m = sctp_get_mbuf_for_msg(MCLBYTES, 1, M_NOWAIT, 1, MT_DATA); if (m == NULL) { /* No memory, INIT timer will re-attempt. */ SCTPDBG(SCTP_DEBUG_OUTPUT4, "Sending INIT - mbuf?\n"); return; } chunk_len = (uint16_t)sizeof(struct sctp_init_chunk); padding_len = 0; /* Now lets put the chunk header in place */ init = mtod(m, struct sctp_init_chunk *); /* now the chunk header */ init->ch.chunk_type = SCTP_INITIATION; init->ch.chunk_flags = 0; /* fill in later from mbuf we build */ init->ch.chunk_length = 0; /* place in my tag */ init->init.initiate_tag = htonl(stcb->asoc.my_vtag); /* set up some of the credits. */ init->init.a_rwnd = htonl(max(inp->sctp_socket ? SCTP_SB_LIMIT_RCV(inp->sctp_socket) : 0, SCTP_MINIMAL_RWND)); init->init.num_outbound_streams = htons(stcb->asoc.pre_open_streams); init->init.num_inbound_streams = htons(stcb->asoc.max_inbound_streams); init->init.initial_tsn = htonl(stcb->asoc.init_seq_number); /* Adaptation layer indication parameter */ if (inp->sctp_ep.adaptation_layer_indicator_provided) { parameter_len = (uint16_t)sizeof(struct sctp_adaptation_layer_indication); ali = (struct sctp_adaptation_layer_indication *)(mtod(m, caddr_t)+chunk_len); ali->ph.param_type = htons(SCTP_ULP_ADAPTATION); ali->ph.param_length = htons(parameter_len); ali->indication = htonl(inp->sctp_ep.adaptation_layer_indicator); chunk_len += parameter_len; } /* ECN parameter */ if (stcb->asoc.ecn_supported == 1) { parameter_len = (uint16_t)sizeof(struct sctp_paramhdr); ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len); ph->param_type = htons(SCTP_ECN_CAPABLE); ph->param_length = htons(parameter_len); chunk_len += parameter_len; } /* PR-SCTP supported parameter */ if (stcb->asoc.prsctp_supported == 1) { parameter_len = (uint16_t)sizeof(struct sctp_paramhdr); ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len); ph->param_type = htons(SCTP_PRSCTP_SUPPORTED); ph->param_length = htons(parameter_len); chunk_len += parameter_len; } /* Add NAT friendly parameter. */ if (SCTP_BASE_SYSCTL(sctp_inits_include_nat_friendly)) { parameter_len = (uint16_t)sizeof(struct sctp_paramhdr); ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len); ph->param_type = htons(SCTP_HAS_NAT_SUPPORT); ph->param_length = htons(parameter_len); chunk_len += parameter_len; } /* And now tell the peer which extensions we support */ num_ext = 0; pr_supported = (struct sctp_supported_chunk_types_param *)(mtod(m, caddr_t)+chunk_len); if (stcb->asoc.prsctp_supported == 1) { pr_supported->chunk_types[num_ext++] = SCTP_FORWARD_CUM_TSN; if (stcb->asoc.idata_supported) { pr_supported->chunk_types[num_ext++] = SCTP_IFORWARD_CUM_TSN; } } if (stcb->asoc.auth_supported == 1) { pr_supported->chunk_types[num_ext++] = SCTP_AUTHENTICATION; } if (stcb->asoc.asconf_supported == 1) { pr_supported->chunk_types[num_ext++] = SCTP_ASCONF; pr_supported->chunk_types[num_ext++] = SCTP_ASCONF_ACK; } if (stcb->asoc.reconfig_supported == 1) { pr_supported->chunk_types[num_ext++] = SCTP_STREAM_RESET; } if (stcb->asoc.idata_supported) { pr_supported->chunk_types[num_ext++] = SCTP_IDATA; } if (stcb->asoc.nrsack_supported == 1) { pr_supported->chunk_types[num_ext++] = SCTP_NR_SELECTIVE_ACK; } if (stcb->asoc.pktdrop_supported == 1) { pr_supported->chunk_types[num_ext++] = SCTP_PACKET_DROPPED; } if (num_ext > 0) { parameter_len = (uint16_t)sizeof(struct sctp_supported_chunk_types_param) + num_ext; pr_supported->ph.param_type = htons(SCTP_SUPPORTED_CHUNK_EXT); pr_supported->ph.param_length = htons(parameter_len); padding_len = SCTP_SIZE32(parameter_len) - parameter_len; chunk_len += parameter_len; } /* add authentication parameters */ if (stcb->asoc.auth_supported) { /* attach RANDOM parameter, if available */ if (stcb->asoc.authinfo.random != NULL) { struct sctp_auth_random *randp; if (padding_len > 0) { memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); chunk_len += padding_len; padding_len = 0; } randp = (struct sctp_auth_random *)(mtod(m, caddr_t)+chunk_len); parameter_len = (uint16_t)sizeof(struct sctp_auth_random) + stcb->asoc.authinfo.random_len; /* random key already contains the header */ memcpy(randp, stcb->asoc.authinfo.random->key, parameter_len); padding_len = SCTP_SIZE32(parameter_len) - parameter_len; chunk_len += parameter_len; } /* add HMAC_ALGO parameter */ if (stcb->asoc.local_hmacs != NULL) { struct sctp_auth_hmac_algo *hmacs; if (padding_len > 0) { memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); chunk_len += padding_len; padding_len = 0; } hmacs = (struct sctp_auth_hmac_algo *)(mtod(m, caddr_t)+chunk_len); parameter_len = (uint16_t)(sizeof(struct sctp_auth_hmac_algo) + stcb->asoc.local_hmacs->num_algo * sizeof(uint16_t)); hmacs->ph.param_type = htons(SCTP_HMAC_LIST); hmacs->ph.param_length = htons(parameter_len); sctp_serialize_hmaclist(stcb->asoc.local_hmacs, (uint8_t *)hmacs->hmac_ids); padding_len = SCTP_SIZE32(parameter_len) - parameter_len; chunk_len += parameter_len; } /* add CHUNKS parameter */ if (stcb->asoc.local_auth_chunks != NULL) { struct sctp_auth_chunk_list *chunks; if (padding_len > 0) { memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); chunk_len += padding_len; padding_len = 0; } chunks = (struct sctp_auth_chunk_list *)(mtod(m, caddr_t)+chunk_len); parameter_len = (uint16_t)(sizeof(struct sctp_auth_chunk_list) + sctp_auth_get_chklist_size(stcb->asoc.local_auth_chunks)); chunks->ph.param_type = htons(SCTP_CHUNK_LIST); chunks->ph.param_length = htons(parameter_len); sctp_serialize_auth_chunks(stcb->asoc.local_auth_chunks, chunks->chunk_types); padding_len = SCTP_SIZE32(parameter_len) - parameter_len; chunk_len += parameter_len; } } /* now any cookie time extensions */ if (stcb->asoc.cookie_preserve_req) { struct sctp_cookie_perserve_param *cookie_preserve; if (padding_len > 0) { memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); chunk_len += padding_len; padding_len = 0; } parameter_len = (uint16_t)sizeof(struct sctp_cookie_perserve_param); cookie_preserve = (struct sctp_cookie_perserve_param *)(mtod(m, caddr_t)+chunk_len); cookie_preserve->ph.param_type = htons(SCTP_COOKIE_PRESERVE); cookie_preserve->ph.param_length = htons(parameter_len); cookie_preserve->time = htonl(stcb->asoc.cookie_preserve_req); stcb->asoc.cookie_preserve_req = 0; chunk_len += parameter_len; } if (stcb->asoc.scope.ipv4_addr_legal || stcb->asoc.scope.ipv6_addr_legal) { uint8_t i; if (padding_len > 0) { memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); chunk_len += padding_len; padding_len = 0; } parameter_len = (uint16_t)sizeof(struct sctp_paramhdr); if (stcb->asoc.scope.ipv4_addr_legal) { parameter_len += (uint16_t)sizeof(uint16_t); } if (stcb->asoc.scope.ipv6_addr_legal) { parameter_len += (uint16_t)sizeof(uint16_t); } sup_addr = (struct sctp_supported_addr_param *)(mtod(m, caddr_t)+chunk_len); sup_addr->ph.param_type = htons(SCTP_SUPPORTED_ADDRTYPE); sup_addr->ph.param_length = htons(parameter_len); i = 0; if (stcb->asoc.scope.ipv4_addr_legal) { sup_addr->addr_type[i++] = htons(SCTP_IPV4_ADDRESS); } if (stcb->asoc.scope.ipv6_addr_legal) { sup_addr->addr_type[i++] = htons(SCTP_IPV6_ADDRESS); } padding_len = 4 - 2 * i; chunk_len += parameter_len; } SCTP_BUF_LEN(m) = chunk_len; /* now the addresses */ /* * To optimize this we could put the scoping stuff into a structure * and remove the individual uint8's from the assoc structure. Then * we could just sifa in the address within the stcb. But for now * this is a quick hack to get the address stuff teased apart. */ m_last = sctp_add_addresses_to_i_ia(inp, stcb, &stcb->asoc.scope, m, cnt_inits_to, &padding_len, &chunk_len); init->ch.chunk_length = htons(chunk_len); if (padding_len > 0) { if (sctp_add_pad_tombuf(m_last, padding_len) == NULL) { sctp_m_freem(m); return; } } SCTPDBG(SCTP_DEBUG_OUTPUT4, "Sending INIT - calls lowlevel_output\n"); if ((error = sctp_lowlevel_chunk_output(inp, stcb, net, (struct sockaddr *)&net->ro._l_addr, m, 0, NULL, 0, 0, 0, 0, inp->sctp_lport, stcb->rport, htonl(0), net->port, NULL, 0, 0, so_locked))) { SCTPDBG(SCTP_DEBUG_OUTPUT4, "Gak send error %d\n", error); if (error == ENOBUFS) { stcb->asoc.ifp_had_enobuf = 1; SCTP_STAT_INCR(sctps_lowlevelerr); } } else { stcb->asoc.ifp_had_enobuf = 0; } SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time); } struct mbuf * sctp_arethere_unrecognized_parameters(struct mbuf *in_initpkt, int param_offset, int *abort_processing, struct sctp_chunkhdr *cp, int *nat_friendly, int *cookie_found) { /* * Given a mbuf containing an INIT or INIT-ACK with the param_offset * being equal to the beginning of the params i.e. (iphlen + * sizeof(struct sctp_init_msg) parse through the parameters to the * end of the mbuf verifying that all parameters are known. * * For unknown parameters build and return a mbuf with * UNRECOGNIZED_PARAMETER errors. If the flags indicate to stop * processing this chunk stop, and set *abort_processing to 1. * * By having param_offset be pre-set to where parameters begin it is * hoped that this routine may be reused in the future by new * features. */ struct sctp_paramhdr *phdr, params; struct mbuf *mat, *m_tmp, *op_err, *op_err_last; int at, limit, pad_needed; uint16_t ptype, plen, padded_size; *abort_processing = 0; if (cookie_found != NULL) { *cookie_found = 0; } mat = in_initpkt; limit = ntohs(cp->chunk_length) - sizeof(struct sctp_init_chunk); at = param_offset; op_err = NULL; op_err_last = NULL; pad_needed = 0; SCTPDBG(SCTP_DEBUG_OUTPUT1, "Check for unrecognized param's\n"); phdr = sctp_get_next_param(mat, at, ¶ms, sizeof(params)); while ((phdr != NULL) && ((size_t)limit >= sizeof(struct sctp_paramhdr))) { ptype = ntohs(phdr->param_type); plen = ntohs(phdr->param_length); if ((plen > limit) || (plen < sizeof(struct sctp_paramhdr))) { /* wacked parameter */ SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error %d\n", plen); goto invalid_size; } limit -= SCTP_SIZE32(plen); /*- * All parameters for all chunks that we know/understand are * listed here. We process them other places and make * appropriate stop actions per the upper bits. However this * is the generic routine processor's can call to get back * an operr.. to either incorporate (init-ack) or send. */ padded_size = SCTP_SIZE32(plen); switch (ptype) { /* Param's with variable size */ case SCTP_HEARTBEAT_INFO: case SCTP_UNRECOG_PARAM: case SCTP_ERROR_CAUSE_IND: /* ok skip fwd */ at += padded_size; break; case SCTP_STATE_COOKIE: if (cookie_found != NULL) { *cookie_found = 1; } at += padded_size; break; /* Param's with variable size within a range */ case SCTP_CHUNK_LIST: case SCTP_SUPPORTED_CHUNK_EXT: if (padded_size > (sizeof(struct sctp_supported_chunk_types_param) + (sizeof(uint8_t) * SCTP_MAX_SUPPORTED_EXT))) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error chklist %d\n", plen); goto invalid_size; } at += padded_size; break; case SCTP_SUPPORTED_ADDRTYPE: if (padded_size > SCTP_MAX_ADDR_PARAMS_SIZE) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error supaddrtype %d\n", plen); goto invalid_size; } at += padded_size; break; case SCTP_RANDOM: if (padded_size > (sizeof(struct sctp_auth_random) + SCTP_RANDOM_MAX_SIZE)) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error random %d\n", plen); goto invalid_size; } at += padded_size; break; case SCTP_SET_PRIM_ADDR: case SCTP_DEL_IP_ADDRESS: case SCTP_ADD_IP_ADDRESS: if ((padded_size != sizeof(struct sctp_asconf_addrv4_param)) && (padded_size != sizeof(struct sctp_asconf_addr_param))) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error setprim %d\n", plen); goto invalid_size; } at += padded_size; break; /* Param's with a fixed size */ case SCTP_IPV4_ADDRESS: if (padded_size != sizeof(struct sctp_ipv4addr_param)) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error ipv4 addr %d\n", plen); goto invalid_size; } at += padded_size; break; case SCTP_IPV6_ADDRESS: if (padded_size != sizeof(struct sctp_ipv6addr_param)) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error ipv6 addr %d\n", plen); goto invalid_size; } at += padded_size; break; case SCTP_COOKIE_PRESERVE: if (padded_size != sizeof(struct sctp_cookie_perserve_param)) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error cookie-preserve %d\n", plen); goto invalid_size; } at += padded_size; break; case SCTP_HAS_NAT_SUPPORT: *nat_friendly = 1; /* fall through */ case SCTP_PRSCTP_SUPPORTED: if (padded_size != sizeof(struct sctp_paramhdr)) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error prsctp/nat support %d\n", plen); goto invalid_size; } at += padded_size; break; case SCTP_ECN_CAPABLE: if (padded_size != sizeof(struct sctp_paramhdr)) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error ecn %d\n", plen); goto invalid_size; } at += padded_size; break; case SCTP_ULP_ADAPTATION: if (padded_size != sizeof(struct sctp_adaptation_layer_indication)) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error adapatation %d\n", plen); goto invalid_size; } at += padded_size; break; case SCTP_SUCCESS_REPORT: if (padded_size != sizeof(struct sctp_asconf_paramhdr)) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Invalid size - error success %d\n", plen); goto invalid_size; } at += padded_size; break; case SCTP_HOSTNAME_ADDRESS: { /* Hostname parameters are deprecated. */ struct sctp_gen_error_cause *cause; int l_len; SCTPDBG(SCTP_DEBUG_OUTPUT1, "Can't handle hostname addresses.. abort processing\n"); *abort_processing = 1; sctp_m_freem(op_err); op_err = NULL; op_err_last = NULL; #ifdef INET6 l_len = SCTP_MIN_OVERHEAD; #else l_len = SCTP_MIN_V4_OVERHEAD; #endif l_len += sizeof(struct sctp_chunkhdr); l_len += sizeof(struct sctp_gen_error_cause); op_err = sctp_get_mbuf_for_msg(l_len, 0, M_NOWAIT, 1, MT_DATA); if (op_err != NULL) { /* * Pre-reserve space for IP, SCTP, * and chunk header. */ #ifdef INET6 SCTP_BUF_RESV_UF(op_err, sizeof(struct ip6_hdr)); #else SCTP_BUF_RESV_UF(op_err, sizeof(struct ip)); #endif SCTP_BUF_RESV_UF(op_err, sizeof(struct sctphdr)); SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr)); SCTP_BUF_LEN(op_err) = sizeof(struct sctp_gen_error_cause); cause = mtod(op_err, struct sctp_gen_error_cause *); cause->code = htons(SCTP_CAUSE_UNRESOLVABLE_ADDR); cause->length = htons((uint16_t)(sizeof(struct sctp_gen_error_cause) + plen)); SCTP_BUF_NEXT(op_err) = SCTP_M_COPYM(mat, at, plen, M_NOWAIT); if (SCTP_BUF_NEXT(op_err) == NULL) { sctp_m_freem(op_err); op_err = NULL; op_err_last = NULL; } } return (op_err); } default: /* * we do not recognize the parameter figure out what * we do. */ SCTPDBG(SCTP_DEBUG_OUTPUT1, "Hit default param %x\n", ptype); if ((ptype & 0x4000) == 0x4000) { /* Report bit is set?? */ SCTPDBG(SCTP_DEBUG_OUTPUT1, "report op err\n"); if (op_err == NULL) { int l_len; /* Ok need to try to get an mbuf */ #ifdef INET6 l_len = SCTP_MIN_OVERHEAD; #else l_len = SCTP_MIN_V4_OVERHEAD; #endif l_len += sizeof(struct sctp_chunkhdr); l_len += sizeof(struct sctp_paramhdr); op_err = sctp_get_mbuf_for_msg(l_len, 0, M_NOWAIT, 1, MT_DATA); if (op_err) { SCTP_BUF_LEN(op_err) = 0; #ifdef INET6 SCTP_BUF_RESV_UF(op_err, sizeof(struct ip6_hdr)); #else SCTP_BUF_RESV_UF(op_err, sizeof(struct ip)); #endif SCTP_BUF_RESV_UF(op_err, sizeof(struct sctphdr)); SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr)); op_err_last = op_err; } } if (op_err != NULL) { /* If we have space */ struct sctp_paramhdr *param; if (pad_needed > 0) { op_err_last = sctp_add_pad_tombuf(op_err_last, pad_needed); } if (op_err_last == NULL) { sctp_m_freem(op_err); op_err = NULL; op_err_last = NULL; goto more_processing; } if (M_TRAILINGSPACE(op_err_last) < (int)sizeof(struct sctp_paramhdr)) { m_tmp = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), 0, M_NOWAIT, 1, MT_DATA); if (m_tmp == NULL) { sctp_m_freem(op_err); op_err = NULL; op_err_last = NULL; goto more_processing; } SCTP_BUF_LEN(m_tmp) = 0; SCTP_BUF_NEXT(m_tmp) = NULL; SCTP_BUF_NEXT(op_err_last) = m_tmp; op_err_last = m_tmp; } param = (struct sctp_paramhdr *)(mtod(op_err_last, caddr_t)+SCTP_BUF_LEN(op_err_last)); param->param_type = htons(SCTP_UNRECOG_PARAM); param->param_length = htons((uint16_t)sizeof(struct sctp_paramhdr) + plen); SCTP_BUF_LEN(op_err_last) += sizeof(struct sctp_paramhdr); SCTP_BUF_NEXT(op_err_last) = SCTP_M_COPYM(mat, at, plen, M_NOWAIT); if (SCTP_BUF_NEXT(op_err_last) == NULL) { sctp_m_freem(op_err); op_err = NULL; op_err_last = NULL; goto more_processing; } else { while (SCTP_BUF_NEXT(op_err_last) != NULL) { op_err_last = SCTP_BUF_NEXT(op_err_last); } } if (plen % 4 != 0) { pad_needed = 4 - (plen % 4); } else { pad_needed = 0; } } } more_processing: if ((ptype & 0x8000) == 0x0000) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "stop proc\n"); return (op_err); } else { /* skip this chunk and continue processing */ SCTPDBG(SCTP_DEBUG_OUTPUT1, "move on\n"); at += SCTP_SIZE32(plen); } break; } phdr = sctp_get_next_param(mat, at, ¶ms, sizeof(params)); } return (op_err); invalid_size: SCTPDBG(SCTP_DEBUG_OUTPUT1, "abort flag set\n"); *abort_processing = 1; sctp_m_freem(op_err); op_err = NULL; op_err_last = NULL; if (phdr != NULL) { struct sctp_paramhdr *param; int l_len; #ifdef INET6 l_len = SCTP_MIN_OVERHEAD; #else l_len = SCTP_MIN_V4_OVERHEAD; #endif l_len += sizeof(struct sctp_chunkhdr); l_len += (2 * sizeof(struct sctp_paramhdr)); op_err = sctp_get_mbuf_for_msg(l_len, 0, M_NOWAIT, 1, MT_DATA); if (op_err) { SCTP_BUF_LEN(op_err) = 0; #ifdef INET6 SCTP_BUF_RESV_UF(op_err, sizeof(struct ip6_hdr)); #else SCTP_BUF_RESV_UF(op_err, sizeof(struct ip)); #endif SCTP_BUF_RESV_UF(op_err, sizeof(struct sctphdr)); SCTP_BUF_RESV_UF(op_err, sizeof(struct sctp_chunkhdr)); SCTP_BUF_LEN(op_err) = 2 * sizeof(struct sctp_paramhdr); param = mtod(op_err, struct sctp_paramhdr *); param->param_type = htons(SCTP_CAUSE_PROTOCOL_VIOLATION); param->param_length = htons(2 * sizeof(struct sctp_paramhdr)); param++; param->param_type = htons(ptype); param->param_length = htons(plen); } } return (op_err); } static int sctp_are_there_new_addresses(struct sctp_association *asoc, struct mbuf *in_initpkt, int offset, struct sockaddr *src) { /* * Given a INIT packet, look through the packet to verify that there * are NO new addresses. As we go through the parameters add reports * of any un-understood parameters that require an error. Also we * must return (1) to drop the packet if we see a un-understood * parameter that tells us to drop the chunk. */ struct sockaddr *sa_touse; struct sockaddr *sa; struct sctp_paramhdr *phdr, params; uint16_t ptype, plen; uint8_t fnd; struct sctp_nets *net; int check_src; #ifdef INET struct sockaddr_in sin4, *sa4; #endif #ifdef INET6 struct sockaddr_in6 sin6, *sa6; #endif #ifdef INET memset(&sin4, 0, sizeof(sin4)); sin4.sin_family = AF_INET; sin4.sin_len = sizeof(sin4); #endif #ifdef INET6 memset(&sin6, 0, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(sin6); #endif /* First what about the src address of the pkt ? */ check_src = 0; switch (src->sa_family) { #ifdef INET case AF_INET: if (asoc->scope.ipv4_addr_legal) { check_src = 1; } break; #endif #ifdef INET6 case AF_INET6: if (asoc->scope.ipv6_addr_legal) { check_src = 1; } break; #endif default: /* TSNH */ break; } if (check_src) { fnd = 0; TAILQ_FOREACH(net, &asoc->nets, sctp_next) { sa = (struct sockaddr *)&net->ro._l_addr; if (sa->sa_family == src->sa_family) { #ifdef INET if (sa->sa_family == AF_INET) { struct sockaddr_in *src4; sa4 = (struct sockaddr_in *)sa; src4 = (struct sockaddr_in *)src; if (sa4->sin_addr.s_addr == src4->sin_addr.s_addr) { fnd = 1; break; } } #endif #ifdef INET6 if (sa->sa_family == AF_INET6) { struct sockaddr_in6 *src6; sa6 = (struct sockaddr_in6 *)sa; src6 = (struct sockaddr_in6 *)src; if (SCTP6_ARE_ADDR_EQUAL(sa6, src6)) { fnd = 1; break; } } #endif } } if (fnd == 0) { /* New address added! no need to look further. */ return (1); } } /* Ok so far lets munge through the rest of the packet */ offset += sizeof(struct sctp_init_chunk); phdr = sctp_get_next_param(in_initpkt, offset, ¶ms, sizeof(params)); while (phdr) { sa_touse = NULL; ptype = ntohs(phdr->param_type); plen = ntohs(phdr->param_length); switch (ptype) { #ifdef INET case SCTP_IPV4_ADDRESS: { struct sctp_ipv4addr_param *p4, p4_buf; if (plen != sizeof(struct sctp_ipv4addr_param)) { return (1); } phdr = sctp_get_next_param(in_initpkt, offset, (struct sctp_paramhdr *)&p4_buf, sizeof(p4_buf)); if (phdr == NULL) { return (1); } if (asoc->scope.ipv4_addr_legal) { p4 = (struct sctp_ipv4addr_param *)phdr; sin4.sin_addr.s_addr = p4->addr; sa_touse = (struct sockaddr *)&sin4; } break; } #endif #ifdef INET6 case SCTP_IPV6_ADDRESS: { struct sctp_ipv6addr_param *p6, p6_buf; if (plen != sizeof(struct sctp_ipv6addr_param)) { return (1); } phdr = sctp_get_next_param(in_initpkt, offset, (struct sctp_paramhdr *)&p6_buf, sizeof(p6_buf)); if (phdr == NULL) { return (1); } if (asoc->scope.ipv6_addr_legal) { p6 = (struct sctp_ipv6addr_param *)phdr; memcpy((caddr_t)&sin6.sin6_addr, p6->addr, sizeof(p6->addr)); sa_touse = (struct sockaddr *)&sin6; } break; } #endif default: sa_touse = NULL; break; } if (sa_touse) { /* ok, sa_touse points to one to check */ fnd = 0; TAILQ_FOREACH(net, &asoc->nets, sctp_next) { sa = (struct sockaddr *)&net->ro._l_addr; if (sa->sa_family != sa_touse->sa_family) { continue; } #ifdef INET if (sa->sa_family == AF_INET) { sa4 = (struct sockaddr_in *)sa; if (sa4->sin_addr.s_addr == sin4.sin_addr.s_addr) { fnd = 1; break; } } #endif #ifdef INET6 if (sa->sa_family == AF_INET6) { sa6 = (struct sockaddr_in6 *)sa; if (SCTP6_ARE_ADDR_EQUAL( sa6, &sin6)) { fnd = 1; break; } } #endif } if (!fnd) { /* New addr added! no need to look further */ return (1); } } offset += SCTP_SIZE32(plen); phdr = sctp_get_next_param(in_initpkt, offset, ¶ms, sizeof(params)); } return (0); } /* * Given a MBUF chain that was sent into us containing an INIT. Build a * INIT-ACK with COOKIE and send back. We assume that the in_initpkt has done * a pullup to include IPv6/4header, SCTP header and initial part of INIT * message (i.e. the struct sctp_init_msg). */ void sctp_send_initiate_ack(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets *src_net, struct mbuf *init_pkt, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_init_chunk *init_chk, uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id, uint16_t port) { struct sctp_association *asoc; struct mbuf *m, *m_tmp, *m_last, *m_cookie, *op_err; struct sctp_init_ack_chunk *initack; struct sctp_adaptation_layer_indication *ali; struct sctp_supported_chunk_types_param *pr_supported; struct sctp_paramhdr *ph; union sctp_sockstore *over_addr; struct sctp_scoping scp; struct timeval now; #ifdef INET struct sockaddr_in *dst4 = (struct sockaddr_in *)dst; struct sockaddr_in *src4 = (struct sockaddr_in *)src; struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *dst6 = (struct sockaddr_in6 *)dst; struct sockaddr_in6 *src6 = (struct sockaddr_in6 *)src; struct sockaddr_in6 *sin6; #endif struct sockaddr *to; struct sctp_state_cookie stc; struct sctp_nets *net = NULL; uint8_t *signature = NULL; int cnt_inits_to = 0; uint16_t his_limit, i_want; int abort_flag; int nat_friendly = 0; int error; struct socket *so; uint16_t num_ext, chunk_len, padding_len, parameter_len; if (stcb) { asoc = &stcb->asoc; } else { asoc = NULL; } if ((asoc != NULL) && (SCTP_GET_STATE(stcb) != SCTP_STATE_COOKIE_WAIT)) { if (sctp_are_there_new_addresses(asoc, init_pkt, offset, src)) { /* * new addresses, out of here in non-cookie-wait * states * * Send an ABORT, without the new address error * cause. This looks no different than if no * listener was present. */ op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), "Address added"); sctp_send_abort(init_pkt, iphlen, src, dst, sh, 0, op_err, mflowtype, mflowid, inp->fibnum, vrf_id, port); return; } if (src_net != NULL && (src_net->port != port)) { /* * change of remote encapsulation port, out of here * in non-cookie-wait states * * Send an ABORT, without an specific error cause. * This looks no different than if no listener was * present. */ op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), "Remote encapsulation port changed"); sctp_send_abort(init_pkt, iphlen, src, dst, sh, 0, op_err, mflowtype, mflowid, inp->fibnum, vrf_id, port); return; } } abort_flag = 0; op_err = sctp_arethere_unrecognized_parameters(init_pkt, (offset + sizeof(struct sctp_init_chunk)), &abort_flag, (struct sctp_chunkhdr *)init_chk, &nat_friendly, NULL); if (abort_flag) { do_a_abort: if (op_err == NULL) { char msg[SCTP_DIAG_INFO_LEN]; SCTP_SNPRINTF(msg, sizeof(msg), "%s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); } sctp_send_abort(init_pkt, iphlen, src, dst, sh, init_chk->init.initiate_tag, op_err, mflowtype, mflowid, inp->fibnum, vrf_id, port); return; } m = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (m == NULL) { /* No memory, INIT timer will re-attempt. */ sctp_m_freem(op_err); return; } chunk_len = (uint16_t)sizeof(struct sctp_init_ack_chunk); padding_len = 0; /* * We might not overwrite the identification[] completely and on * some platforms time_entered will contain some padding. Therefore * zero out the cookie to avoid putting uninitialized memory on the * wire. */ memset(&stc, 0, sizeof(struct sctp_state_cookie)); /* the time I built cookie */ (void)SCTP_GETTIME_TIMEVAL(&now); stc.time_entered.tv_sec = now.tv_sec; stc.time_entered.tv_usec = now.tv_usec; /* populate any tie tags */ if (asoc != NULL) { /* unlock before tag selections */ stc.tie_tag_my_vtag = asoc->my_vtag_nonce; stc.tie_tag_peer_vtag = asoc->peer_vtag_nonce; stc.cookie_life = asoc->cookie_life; net = asoc->primary_destination; } else { stc.tie_tag_my_vtag = 0; stc.tie_tag_peer_vtag = 0; /* life I will award this cookie */ stc.cookie_life = inp->sctp_ep.def_cookie_life; } /* copy in the ports for later check */ stc.myport = sh->dest_port; stc.peerport = sh->src_port; /* * If we wanted to honor cookie life extensions, we would add to * stc.cookie_life. For now we should NOT honor any extension */ stc.site_scope = stc.local_scope = stc.loopback_scope = 0; if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { stc.ipv6_addr_legal = 1; if (SCTP_IPV6_V6ONLY(inp)) { stc.ipv4_addr_legal = 0; } else { stc.ipv4_addr_legal = 1; } } else { stc.ipv6_addr_legal = 0; stc.ipv4_addr_legal = 1; } stc.ipv4_scope = 0; if (net == NULL) { to = src; switch (dst->sa_family) { #ifdef INET case AF_INET: { /* lookup address */ stc.address[0] = src4->sin_addr.s_addr; stc.address[1] = 0; stc.address[2] = 0; stc.address[3] = 0; stc.addr_type = SCTP_IPV4_ADDRESS; /* local from address */ stc.laddress[0] = dst4->sin_addr.s_addr; stc.laddress[1] = 0; stc.laddress[2] = 0; stc.laddress[3] = 0; stc.laddr_type = SCTP_IPV4_ADDRESS; /* scope_id is only for v6 */ stc.scope_id = 0; if ((IN4_ISPRIVATE_ADDRESS(&src4->sin_addr)) || (IN4_ISPRIVATE_ADDRESS(&dst4->sin_addr))) { stc.ipv4_scope = 1; } /* Must use the address in this case */ if (sctp_is_address_on_local_host(src, vrf_id)) { stc.loopback_scope = 1; stc.ipv4_scope = 1; stc.site_scope = 1; stc.local_scope = 0; } break; } #endif #ifdef INET6 case AF_INET6: { stc.addr_type = SCTP_IPV6_ADDRESS; memcpy(&stc.address, &src6->sin6_addr, sizeof(struct in6_addr)); stc.scope_id = ntohs(in6_getscope(&src6->sin6_addr)); if (sctp_is_address_on_local_host(src, vrf_id)) { stc.loopback_scope = 1; stc.local_scope = 0; stc.site_scope = 1; stc.ipv4_scope = 1; } else if (IN6_IS_ADDR_LINKLOCAL(&src6->sin6_addr) || IN6_IS_ADDR_LINKLOCAL(&dst6->sin6_addr)) { /* * If the new destination or source * is a LINK_LOCAL we must have * common both site and local scope. * Don't set local scope though * since we must depend on the * source to be added implicitly. We * cannot assure just because we * share one link that all links are * common. */ stc.local_scope = 0; stc.site_scope = 1; stc.ipv4_scope = 1; /* * we start counting for the private * address stuff at 1. since the * link local we source from won't * show up in our scoped count. */ cnt_inits_to = 1; /* * pull out the scope_id from * incoming pkt */ } else if (IN6_IS_ADDR_SITELOCAL(&src6->sin6_addr) || IN6_IS_ADDR_SITELOCAL(&dst6->sin6_addr)) { /* * If the new destination or source * is SITE_LOCAL then we must have * site scope in common. */ stc.site_scope = 1; } memcpy(&stc.laddress, &dst6->sin6_addr, sizeof(struct in6_addr)); stc.laddr_type = SCTP_IPV6_ADDRESS; break; } #endif default: /* TSNH */ goto do_a_abort; break; } } else { /* set the scope per the existing tcb */ #ifdef INET6 struct sctp_nets *lnet; #endif stc.loopback_scope = asoc->scope.loopback_scope; stc.ipv4_scope = asoc->scope.ipv4_local_scope; stc.site_scope = asoc->scope.site_scope; stc.local_scope = asoc->scope.local_scope; #ifdef INET6 /* Why do we not consider IPv4 LL addresses? */ TAILQ_FOREACH(lnet, &asoc->nets, sctp_next) { if (lnet->ro._l_addr.sin6.sin6_family == AF_INET6) { if (IN6_IS_ADDR_LINKLOCAL(&lnet->ro._l_addr.sin6.sin6_addr)) { /* * if we have a LL address, start * counting at 1. */ cnt_inits_to = 1; } } } #endif /* use the net pointer */ to = (struct sockaddr *)&net->ro._l_addr; switch (to->sa_family) { #ifdef INET case AF_INET: sin = (struct sockaddr_in *)to; stc.address[0] = sin->sin_addr.s_addr; stc.address[1] = 0; stc.address[2] = 0; stc.address[3] = 0; stc.addr_type = SCTP_IPV4_ADDRESS; if (net->src_addr_selected == 0) { /* * strange case here, the INIT should have * did the selection. */ net->ro._s_addr = sctp_source_address_selection(inp, stcb, (sctp_route_t *)&net->ro, net, 0, vrf_id); if (net->ro._s_addr == NULL) { sctp_m_freem(op_err); sctp_m_freem(m); return; } net->src_addr_selected = 1; } stc.laddress[0] = net->ro._s_addr->address.sin.sin_addr.s_addr; stc.laddress[1] = 0; stc.laddress[2] = 0; stc.laddress[3] = 0; stc.laddr_type = SCTP_IPV4_ADDRESS; /* scope_id is only for v6 */ stc.scope_id = 0; break; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)to; memcpy(&stc.address, &sin6->sin6_addr, sizeof(struct in6_addr)); stc.addr_type = SCTP_IPV6_ADDRESS; stc.scope_id = sin6->sin6_scope_id; if (net->src_addr_selected == 0) { /* * strange case here, the INIT should have * done the selection. */ net->ro._s_addr = sctp_source_address_selection(inp, stcb, (sctp_route_t *)&net->ro, net, 0, vrf_id); if (net->ro._s_addr == NULL) { sctp_m_freem(op_err); sctp_m_freem(m); return; } net->src_addr_selected = 1; } memcpy(&stc.laddress, &net->ro._s_addr->address.sin6.sin6_addr, sizeof(struct in6_addr)); stc.laddr_type = SCTP_IPV6_ADDRESS; break; #endif } } /* Now lets put the SCTP header in place */ initack = mtod(m, struct sctp_init_ack_chunk *); /* Save it off for quick ref */ stc.peers_vtag = ntohl(init_chk->init.initiate_tag); /* who are we */ memcpy(stc.identification, SCTP_VERSION_STRING, min(strlen(SCTP_VERSION_STRING), sizeof(stc.identification))); memset(stc.reserved, 0, SCTP_RESERVE_SPACE); /* now the chunk header */ initack->ch.chunk_type = SCTP_INITIATION_ACK; initack->ch.chunk_flags = 0; /* fill in later from mbuf we build */ initack->ch.chunk_length = 0; /* place in my tag */ if ((asoc != NULL) && ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || (SCTP_GET_STATE(stcb) == SCTP_STATE_INUSE) || (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED))) { /* re-use the v-tags and init-seq here */ initack->init.initiate_tag = htonl(asoc->my_vtag); initack->init.initial_tsn = htonl(asoc->init_seq_number); } else { uint32_t vtag, itsn; if (asoc) { atomic_add_int(&asoc->refcnt, 1); SCTP_TCB_UNLOCK(stcb); new_tag: vtag = sctp_select_a_tag(inp, inp->sctp_lport, sh->src_port, 1); if ((asoc->peer_supports_nat) && (vtag == asoc->my_vtag)) { /* * Got a duplicate vtag on some guy behind a * nat make sure we don't use it. */ goto new_tag; } initack->init.initiate_tag = htonl(vtag); /* get a TSN to use too */ itsn = sctp_select_initial_TSN(&inp->sctp_ep); initack->init.initial_tsn = htonl(itsn); SCTP_TCB_LOCK(stcb); atomic_add_int(&asoc->refcnt, -1); } else { SCTP_INP_INCR_REF(inp); SCTP_INP_RUNLOCK(inp); vtag = sctp_select_a_tag(inp, inp->sctp_lport, sh->src_port, 1); initack->init.initiate_tag = htonl(vtag); /* get a TSN to use too */ initack->init.initial_tsn = htonl(sctp_select_initial_TSN(&inp->sctp_ep)); SCTP_INP_RLOCK(inp); SCTP_INP_DECR_REF(inp); } } /* save away my tag to */ stc.my_vtag = initack->init.initiate_tag; /* set up some of the credits. */ so = inp->sctp_socket; if (so == NULL) { /* memory problem */ sctp_m_freem(op_err); sctp_m_freem(m); return; } else { initack->init.a_rwnd = htonl(max(SCTP_SB_LIMIT_RCV(so), SCTP_MINIMAL_RWND)); } /* set what I want */ his_limit = ntohs(init_chk->init.num_inbound_streams); /* choose what I want */ if (asoc != NULL) { if (asoc->streamoutcnt > asoc->pre_open_streams) { i_want = asoc->streamoutcnt; } else { i_want = asoc->pre_open_streams; } } else { i_want = inp->sctp_ep.pre_open_stream_count; } if (his_limit < i_want) { /* I Want more :< */ initack->init.num_outbound_streams = init_chk->init.num_inbound_streams; } else { /* I can have what I want :> */ initack->init.num_outbound_streams = htons(i_want); } /* tell him his limit. */ initack->init.num_inbound_streams = htons(inp->sctp_ep.max_open_streams_intome); /* adaptation layer indication parameter */ if (inp->sctp_ep.adaptation_layer_indicator_provided) { parameter_len = (uint16_t)sizeof(struct sctp_adaptation_layer_indication); ali = (struct sctp_adaptation_layer_indication *)(mtod(m, caddr_t)+chunk_len); ali->ph.param_type = htons(SCTP_ULP_ADAPTATION); ali->ph.param_length = htons(parameter_len); ali->indication = htonl(inp->sctp_ep.adaptation_layer_indicator); chunk_len += parameter_len; } /* ECN parameter */ if (((asoc != NULL) && (asoc->ecn_supported == 1)) || ((asoc == NULL) && (inp->ecn_supported == 1))) { parameter_len = (uint16_t)sizeof(struct sctp_paramhdr); ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len); ph->param_type = htons(SCTP_ECN_CAPABLE); ph->param_length = htons(parameter_len); chunk_len += parameter_len; } /* PR-SCTP supported parameter */ if (((asoc != NULL) && (asoc->prsctp_supported == 1)) || ((asoc == NULL) && (inp->prsctp_supported == 1))) { parameter_len = (uint16_t)sizeof(struct sctp_paramhdr); ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len); ph->param_type = htons(SCTP_PRSCTP_SUPPORTED); ph->param_length = htons(parameter_len); chunk_len += parameter_len; } /* Add NAT friendly parameter */ if (nat_friendly) { parameter_len = (uint16_t)sizeof(struct sctp_paramhdr); ph = (struct sctp_paramhdr *)(mtod(m, caddr_t)+chunk_len); ph->param_type = htons(SCTP_HAS_NAT_SUPPORT); ph->param_length = htons(parameter_len); chunk_len += parameter_len; } /* And now tell the peer which extensions we support */ num_ext = 0; pr_supported = (struct sctp_supported_chunk_types_param *)(mtod(m, caddr_t)+chunk_len); if (((asoc != NULL) && (asoc->prsctp_supported == 1)) || ((asoc == NULL) && (inp->prsctp_supported == 1))) { pr_supported->chunk_types[num_ext++] = SCTP_FORWARD_CUM_TSN; if (((asoc != NULL) && (asoc->idata_supported == 1)) || ((asoc == NULL) && (inp->idata_supported == 1))) { pr_supported->chunk_types[num_ext++] = SCTP_IFORWARD_CUM_TSN; } } if (((asoc != NULL) && (asoc->auth_supported == 1)) || ((asoc == NULL) && (inp->auth_supported == 1))) { pr_supported->chunk_types[num_ext++] = SCTP_AUTHENTICATION; } if (((asoc != NULL) && (asoc->asconf_supported == 1)) || ((asoc == NULL) && (inp->asconf_supported == 1))) { pr_supported->chunk_types[num_ext++] = SCTP_ASCONF; pr_supported->chunk_types[num_ext++] = SCTP_ASCONF_ACK; } if (((asoc != NULL) && (asoc->reconfig_supported == 1)) || ((asoc == NULL) && (inp->reconfig_supported == 1))) { pr_supported->chunk_types[num_ext++] = SCTP_STREAM_RESET; } if (((asoc != NULL) && (asoc->idata_supported == 1)) || ((asoc == NULL) && (inp->idata_supported == 1))) { pr_supported->chunk_types[num_ext++] = SCTP_IDATA; } if (((asoc != NULL) && (asoc->nrsack_supported == 1)) || ((asoc == NULL) && (inp->nrsack_supported == 1))) { pr_supported->chunk_types[num_ext++] = SCTP_NR_SELECTIVE_ACK; } if (((asoc != NULL) && (asoc->pktdrop_supported == 1)) || ((asoc == NULL) && (inp->pktdrop_supported == 1))) { pr_supported->chunk_types[num_ext++] = SCTP_PACKET_DROPPED; } if (num_ext > 0) { parameter_len = (uint16_t)sizeof(struct sctp_supported_chunk_types_param) + num_ext; pr_supported->ph.param_type = htons(SCTP_SUPPORTED_CHUNK_EXT); pr_supported->ph.param_length = htons(parameter_len); padding_len = SCTP_SIZE32(parameter_len) - parameter_len; chunk_len += parameter_len; } /* add authentication parameters */ if (((asoc != NULL) && (asoc->auth_supported == 1)) || ((asoc == NULL) && (inp->auth_supported == 1))) { struct sctp_auth_random *randp; struct sctp_auth_hmac_algo *hmacs; struct sctp_auth_chunk_list *chunks; if (padding_len > 0) { memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); chunk_len += padding_len; padding_len = 0; } /* generate and add RANDOM parameter */ randp = (struct sctp_auth_random *)(mtod(m, caddr_t)+chunk_len); parameter_len = (uint16_t)sizeof(struct sctp_auth_random) + SCTP_AUTH_RANDOM_SIZE_DEFAULT; randp->ph.param_type = htons(SCTP_RANDOM); randp->ph.param_length = htons(parameter_len); SCTP_READ_RANDOM(randp->random_data, SCTP_AUTH_RANDOM_SIZE_DEFAULT); padding_len = SCTP_SIZE32(parameter_len) - parameter_len; chunk_len += parameter_len; if (padding_len > 0) { memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); chunk_len += padding_len; padding_len = 0; } /* add HMAC_ALGO parameter */ hmacs = (struct sctp_auth_hmac_algo *)(mtod(m, caddr_t)+chunk_len); parameter_len = (uint16_t)sizeof(struct sctp_auth_hmac_algo) + sctp_serialize_hmaclist(inp->sctp_ep.local_hmacs, (uint8_t *)hmacs->hmac_ids); hmacs->ph.param_type = htons(SCTP_HMAC_LIST); hmacs->ph.param_length = htons(parameter_len); padding_len = SCTP_SIZE32(parameter_len) - parameter_len; chunk_len += parameter_len; if (padding_len > 0) { memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); chunk_len += padding_len; padding_len = 0; } /* add CHUNKS parameter */ chunks = (struct sctp_auth_chunk_list *)(mtod(m, caddr_t)+chunk_len); parameter_len = (uint16_t)sizeof(struct sctp_auth_chunk_list) + sctp_serialize_auth_chunks(inp->sctp_ep.local_auth_chunks, chunks->chunk_types); chunks->ph.param_type = htons(SCTP_CHUNK_LIST); chunks->ph.param_length = htons(parameter_len); padding_len = SCTP_SIZE32(parameter_len) - parameter_len; chunk_len += parameter_len; } SCTP_BUF_LEN(m) = chunk_len; m_last = m; /* now the addresses */ /* * To optimize this we could put the scoping stuff into a structure * and remove the individual uint8's from the stc structure. Then we * could just sifa in the address within the stc.. but for now this * is a quick hack to get the address stuff teased apart. */ scp.ipv4_addr_legal = stc.ipv4_addr_legal; scp.ipv6_addr_legal = stc.ipv6_addr_legal; scp.loopback_scope = stc.loopback_scope; scp.ipv4_local_scope = stc.ipv4_scope; scp.local_scope = stc.local_scope; scp.site_scope = stc.site_scope; m_last = sctp_add_addresses_to_i_ia(inp, stcb, &scp, m_last, cnt_inits_to, &padding_len, &chunk_len); /* padding_len can only be positive, if no addresses have been added */ if (padding_len > 0) { memset(mtod(m, caddr_t)+chunk_len, 0, padding_len); chunk_len += padding_len; SCTP_BUF_LEN(m) += padding_len; padding_len = 0; } /* tack on the operational error if present */ if (op_err) { parameter_len = 0; for (m_tmp = op_err; m_tmp != NULL; m_tmp = SCTP_BUF_NEXT(m_tmp)) { parameter_len += SCTP_BUF_LEN(m_tmp); } padding_len = SCTP_SIZE32(parameter_len) - parameter_len; SCTP_BUF_NEXT(m_last) = op_err; while (SCTP_BUF_NEXT(m_last) != NULL) { m_last = SCTP_BUF_NEXT(m_last); } chunk_len += parameter_len; } if (padding_len > 0) { m_last = sctp_add_pad_tombuf(m_last, padding_len); if (m_last == NULL) { /* Houston we have a problem, no space */ sctp_m_freem(m); return; } chunk_len += padding_len; padding_len = 0; } /* Now we must build a cookie */ m_cookie = sctp_add_cookie(init_pkt, offset, m, 0, &stc, &signature); if (m_cookie == NULL) { /* memory problem */ sctp_m_freem(m); return; } /* Now append the cookie to the end and update the space/size */ SCTP_BUF_NEXT(m_last) = m_cookie; parameter_len = 0; for (m_tmp = m_cookie; m_tmp != NULL; m_tmp = SCTP_BUF_NEXT(m_tmp)) { parameter_len += SCTP_BUF_LEN(m_tmp); if (SCTP_BUF_NEXT(m_tmp) == NULL) { m_last = m_tmp; } } padding_len = SCTP_SIZE32(parameter_len) - parameter_len; chunk_len += parameter_len; /* * Place in the size, but we don't include the last pad (if any) in * the INIT-ACK. */ initack->ch.chunk_length = htons(chunk_len); /* * Time to sign the cookie, we don't sign over the cookie signature * though thus we set trailer. */ (void)sctp_hmac_m(SCTP_HMAC, (uint8_t *)inp->sctp_ep.secret_key[(int)(inp->sctp_ep.current_secret_number)], SCTP_SECRET_SIZE, m_cookie, sizeof(struct sctp_paramhdr), (uint8_t *)signature, SCTP_SIGNATURE_SIZE); /* * We sifa 0 here to NOT set IP_DF if its IPv4, we ignore the return * here since the timer will drive a retranmission. */ if (padding_len > 0) { if (sctp_add_pad_tombuf(m_last, padding_len) == NULL) { sctp_m_freem(m); return; } } if (stc.loopback_scope) { over_addr = (union sctp_sockstore *)dst; } else { over_addr = NULL; } if ((error = sctp_lowlevel_chunk_output(inp, NULL, NULL, to, m, 0, NULL, 0, 0, 0, 0, inp->sctp_lport, sh->src_port, init_chk->init.initiate_tag, port, over_addr, mflowtype, mflowid, SCTP_SO_NOT_LOCKED))) { SCTPDBG(SCTP_DEBUG_OUTPUT4, "Gak send error %d\n", error); if (error == ENOBUFS) { if (asoc != NULL) { asoc->ifp_had_enobuf = 1; } SCTP_STAT_INCR(sctps_lowlevelerr); } } else { if (asoc != NULL) { asoc->ifp_had_enobuf = 0; } } SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); } static void sctp_prune_prsctp(struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_sndrcvinfo *srcv, int dataout) { int freed_spc = 0; struct sctp_tmit_chunk *chk, *nchk; SCTP_TCB_LOCK_ASSERT(stcb); if ((asoc->prsctp_supported) && (asoc->sent_queue_cnt_removeable > 0)) { TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) { /* * Look for chunks marked with the PR_SCTP flag AND * the buffer space flag. If the one being sent is * equal or greater priority then purge the old one * and free some space. */ if (PR_SCTP_BUF_ENABLED(chk->flags)) { /* * This one is PR-SCTP AND buffer space * limited type */ if (chk->rec.data.timetodrop.tv_sec > (long)srcv->sinfo_timetolive) { /* * Lower numbers equates to higher * priority. So if the one we are * looking at has a larger priority, * we want to drop the data and NOT * retransmit it. */ if (chk->data) { /* * We release the book_size * if the mbuf is here */ int ret_spc; uint8_t sent; if (chk->sent > SCTP_DATAGRAM_UNSENT) sent = 1; else sent = 0; ret_spc = sctp_release_pr_sctp_chunk(stcb, chk, sent, SCTP_SO_LOCKED); freed_spc += ret_spc; if (freed_spc >= dataout) { return; } } /* if chunk was present */ } /* if of sufficient priority */ } /* if chunk has enabled */ } /* tailqforeach */ TAILQ_FOREACH_SAFE(chk, &asoc->send_queue, sctp_next, nchk) { /* Here we must move to the sent queue and mark */ if (PR_SCTP_BUF_ENABLED(chk->flags)) { if (chk->rec.data.timetodrop.tv_sec > (long)srcv->sinfo_timetolive) { if (chk->data) { /* * We release the book_size * if the mbuf is here */ int ret_spc; ret_spc = sctp_release_pr_sctp_chunk(stcb, chk, 0, SCTP_SO_LOCKED); freed_spc += ret_spc; if (freed_spc >= dataout) { return; } } /* end if chk->data */ } /* end if right class */ } /* end if chk pr-sctp */ } /* tailqforeachsafe (chk) */ } /* if enabled in asoc */ } int sctp_get_frag_point(struct sctp_tcb *stcb, struct sctp_association *asoc) { int siz, ovh; /* * For endpoints that have both v6 and v4 addresses we must reserve * room for the ipv6 header, for those that are only dealing with V4 * we use a larger frag point. */ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { ovh = SCTP_MIN_OVERHEAD; } else { ovh = SCTP_MIN_V4_OVERHEAD; } ovh += SCTP_DATA_CHUNK_OVERHEAD(stcb); if (stcb->asoc.sctp_frag_point > asoc->smallest_mtu) siz = asoc->smallest_mtu - ovh; else siz = (stcb->asoc.sctp_frag_point - ovh); /* * if (siz > (MCLBYTES-sizeof(struct sctp_data_chunk))) { */ /* A data chunk MUST fit in a cluster */ /* siz = (MCLBYTES - sizeof(struct sctp_data_chunk)); */ /* } */ /* adjust for an AUTH chunk if DATA requires auth */ if (sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks)) siz -= sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id); if (siz % 4) { /* make it an even word boundary please */ siz -= (siz % 4); } return (siz); } static void sctp_set_prsctp_policy(struct sctp_stream_queue_pending *sp) { /* * We assume that the user wants PR_SCTP_TTL if the user provides a * positive lifetime but does not specify any PR_SCTP policy. */ if (PR_SCTP_ENABLED(sp->sinfo_flags)) { sp->act_flags |= PR_SCTP_POLICY(sp->sinfo_flags); } else if (sp->timetolive > 0) { sp->sinfo_flags |= SCTP_PR_SCTP_TTL; sp->act_flags |= PR_SCTP_POLICY(sp->sinfo_flags); } else { return; } switch (PR_SCTP_POLICY(sp->sinfo_flags)) { case CHUNK_FLAGS_PR_SCTP_BUF: /* * Time to live is a priority stored in tv_sec when doing * the buffer drop thing. */ sp->ts.tv_sec = sp->timetolive; sp->ts.tv_usec = 0; break; case CHUNK_FLAGS_PR_SCTP_TTL: { struct timeval tv; (void)SCTP_GETTIME_TIMEVAL(&sp->ts); tv.tv_sec = sp->timetolive / 1000; tv.tv_usec = (sp->timetolive * 1000) % 1000000; /* * TODO sctp_constants.h needs alternative time * macros when _KERNEL is undefined. */ timevaladd(&sp->ts, &tv); } break; case CHUNK_FLAGS_PR_SCTP_RTX: /* * Time to live is a the number or retransmissions stored in * tv_sec. */ sp->ts.tv_sec = sp->timetolive; sp->ts.tv_usec = 0; break; default: SCTPDBG(SCTP_DEBUG_USRREQ1, "Unknown PR_SCTP policy %u.\n", PR_SCTP_POLICY(sp->sinfo_flags)); break; } } static int sctp_msg_append(struct sctp_tcb *stcb, struct sctp_nets *net, struct mbuf *m, struct sctp_sndrcvinfo *srcv, int hold_stcb_lock) { int error = 0; struct mbuf *at; struct sctp_stream_queue_pending *sp = NULL; struct sctp_stream_out *strm; /* * Given an mbuf chain, put it into the association send queue and * place it on the wheel */ if (srcv->sinfo_stream >= stcb->asoc.streamoutcnt) { /* Invalid stream number */ SCTP_LTRACE_ERR_RET_PKT(m, NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out_now; } if ((stcb->asoc.stream_locked) && (stcb->asoc.stream_locked_on != srcv->sinfo_stream)) { SCTP_LTRACE_ERR_RET_PKT(m, NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out_now; } strm = &stcb->asoc.strmout[srcv->sinfo_stream]; /* Now can we send this? */ if ((SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_SENT) || (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_ACK_SENT) || (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED) || (stcb->asoc.state & SCTP_STATE_SHUTDOWN_PENDING)) { /* got data while shutting down */ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET); error = ECONNRESET; goto out_now; } sctp_alloc_a_strmoq(stcb, sp); if (sp == NULL) { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); error = ENOMEM; goto out_now; } sp->sinfo_flags = srcv->sinfo_flags; sp->timetolive = srcv->sinfo_timetolive; sp->ppid = srcv->sinfo_ppid; sp->context = srcv->sinfo_context; sp->fsn = 0; if (sp->sinfo_flags & SCTP_ADDR_OVER) { sp->net = net; atomic_add_int(&sp->net->ref_count, 1); } else { sp->net = NULL; } (void)SCTP_GETTIME_TIMEVAL(&sp->ts); sp->sid = srcv->sinfo_stream; sp->msg_is_complete = 1; sp->sender_all_done = 1; sp->some_taken = 0; sp->data = m; sp->tail_mbuf = NULL; sctp_set_prsctp_policy(sp); /* * We could in theory (for sendall) sifa the length in, but we would * still have to hunt through the chain since we need to setup the * tail_mbuf */ sp->length = 0; for (at = m; at; at = SCTP_BUF_NEXT(at)) { if (SCTP_BUF_NEXT(at) == NULL) sp->tail_mbuf = at; sp->length += SCTP_BUF_LEN(at); } if (srcv->sinfo_keynumber_valid) { sp->auth_keyid = srcv->sinfo_keynumber; } else { sp->auth_keyid = stcb->asoc.authinfo.active_keyid; } if (sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks)) { sctp_auth_key_acquire(stcb, sp->auth_keyid); sp->holds_key_ref = 1; } if (hold_stcb_lock == 0) { SCTP_TCB_SEND_LOCK(stcb); } sctp_snd_sb_alloc(stcb, sp->length); atomic_add_int(&stcb->asoc.stream_queue_cnt, 1); TAILQ_INSERT_TAIL(&strm->outqueue, sp, next); stcb->asoc.ss_functions.sctp_ss_add_to_stream(stcb, &stcb->asoc, strm, sp, 1); m = NULL; if (hold_stcb_lock == 0) { SCTP_TCB_SEND_UNLOCK(stcb); } out_now: if (m) { sctp_m_freem(m); } return (error); } static struct mbuf * sctp_copy_mbufchain(struct mbuf *clonechain, struct mbuf *outchain, struct mbuf **endofchain, int can_take_mbuf, int sizeofcpy, uint8_t copy_by_ref) { struct mbuf *m; struct mbuf *appendchain; caddr_t cp; int len; if (endofchain == NULL) { /* error */ error_out: if (outchain) sctp_m_freem(outchain); return (NULL); } if (can_take_mbuf) { appendchain = clonechain; } else { if (!copy_by_ref && (sizeofcpy <= (int)((((SCTP_BASE_SYSCTL(sctp_mbuf_threshold_count) - 1) * MLEN) + MHLEN)))) { /* Its not in a cluster */ if (*endofchain == NULL) { /* lets get a mbuf cluster */ if (outchain == NULL) { /* This is the general case */ new_mbuf: outchain = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_HEADER); if (outchain == NULL) { goto error_out; } SCTP_BUF_LEN(outchain) = 0; *endofchain = outchain; /* get the prepend space */ SCTP_BUF_RESV_UF(outchain, (SCTP_FIRST_MBUF_RESV + 4)); } else { /* * We really should not get a NULL * in endofchain */ /* find end */ m = outchain; while (m) { if (SCTP_BUF_NEXT(m) == NULL) { *endofchain = m; break; } m = SCTP_BUF_NEXT(m); } /* sanity */ if (*endofchain == NULL) { /* * huh, TSNH XXX maybe we * should panic */ sctp_m_freem(outchain); goto new_mbuf; } } /* get the new end of length */ len = (int)M_TRAILINGSPACE(*endofchain); } else { /* how much is left at the end? */ len = (int)M_TRAILINGSPACE(*endofchain); } /* Find the end of the data, for appending */ cp = (mtod((*endofchain), caddr_t)+SCTP_BUF_LEN((*endofchain))); /* Now lets copy it out */ if (len >= sizeofcpy) { /* It all fits, copy it in */ m_copydata(clonechain, 0, sizeofcpy, cp); SCTP_BUF_LEN((*endofchain)) += sizeofcpy; } else { /* fill up the end of the chain */ if (len > 0) { m_copydata(clonechain, 0, len, cp); SCTP_BUF_LEN((*endofchain)) += len; /* now we need another one */ sizeofcpy -= len; } m = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_HEADER); if (m == NULL) { /* We failed */ goto error_out; } SCTP_BUF_NEXT((*endofchain)) = m; *endofchain = m; cp = mtod((*endofchain), caddr_t); m_copydata(clonechain, len, sizeofcpy, cp); SCTP_BUF_LEN((*endofchain)) += sizeofcpy; } return (outchain); } else { /* copy the old fashion way */ appendchain = SCTP_M_COPYM(clonechain, 0, M_COPYALL, M_NOWAIT); #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(appendchain, SCTP_MBUF_ICOPY); } #endif } } if (appendchain == NULL) { /* error */ if (outchain) sctp_m_freem(outchain); return (NULL); } if (outchain) { /* tack on to the end */ if (*endofchain != NULL) { SCTP_BUF_NEXT(((*endofchain))) = appendchain; } else { m = outchain; while (m) { if (SCTP_BUF_NEXT(m) == NULL) { SCTP_BUF_NEXT(m) = appendchain; break; } m = SCTP_BUF_NEXT(m); } } /* * save off the end and update the end-chain position */ m = appendchain; while (m) { if (SCTP_BUF_NEXT(m) == NULL) { *endofchain = m; break; } m = SCTP_BUF_NEXT(m); } return (outchain); } else { /* save off the end and update the end-chain position */ m = appendchain; while (m) { if (SCTP_BUF_NEXT(m) == NULL) { *endofchain = m; break; } m = SCTP_BUF_NEXT(m); } return (appendchain); } } static int sctp_med_chunk_output(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_association *asoc, int *num_out, int *reason_code, int control_only, int from_where, struct timeval *now, int *now_filled, int frag_point, int so_locked); static void sctp_sendall_iterator(struct sctp_inpcb *inp, struct sctp_tcb *stcb, void *ptr, uint32_t val SCTP_UNUSED) { struct sctp_copy_all *ca; struct mbuf *m; int ret = 0; int added_control = 0; int un_sent, do_chunk_output = 1; struct sctp_association *asoc; struct sctp_nets *net; ca = (struct sctp_copy_all *)ptr; if (ca->m == NULL) { return; } if (ca->inp != inp) { /* TSNH */ return; } if (ca->sndlen > 0) { m = SCTP_M_COPYM(ca->m, 0, M_COPYALL, M_NOWAIT); if (m == NULL) { /* can't copy so we are done */ ca->cnt_failed++; return; } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(m, SCTP_MBUF_ICOPY); } #endif } else { m = NULL; } SCTP_TCB_LOCK_ASSERT(stcb); if (stcb->asoc.alternate) { net = stcb->asoc.alternate; } else { net = stcb->asoc.primary_destination; } if (ca->sndrcv.sinfo_flags & SCTP_ABORT) { /* Abort this assoc with m as the user defined reason */ if (m != NULL) { SCTP_BUF_PREPEND(m, sizeof(struct sctp_paramhdr), M_NOWAIT); } else { m = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), 0, M_NOWAIT, 1, MT_DATA); SCTP_BUF_LEN(m) = sizeof(struct sctp_paramhdr); } if (m != NULL) { struct sctp_paramhdr *ph; ph = mtod(m, struct sctp_paramhdr *); ph->param_type = htons(SCTP_CAUSE_USER_INITIATED_ABT); ph->param_length = htons((uint16_t)(sizeof(struct sctp_paramhdr) + ca->sndlen)); } /* * We add one here to keep the assoc from dis-appearing on * us. */ atomic_add_int(&stcb->asoc.refcnt, 1); sctp_abort_an_association(inp, stcb, m, SCTP_SO_NOT_LOCKED); /* * sctp_abort_an_association calls sctp_free_asoc() free * association will NOT free it since we incremented the * refcnt .. we do this to prevent it being freed and things * getting tricky since we could end up (from free_asoc) * calling inpcb_free which would get a recursive lock call * to the iterator lock.. But as a consequence of that the * stcb will return to us un-locked.. since free_asoc * returns with either no TCB or the TCB unlocked, we must * relock.. to unlock in the iterator timer :-0 */ SCTP_TCB_LOCK(stcb); atomic_add_int(&stcb->asoc.refcnt, -1); goto no_chunk_output; } else { if (m) { ret = sctp_msg_append(stcb, net, m, &ca->sndrcv, 1); } asoc = &stcb->asoc; if (ca->sndrcv.sinfo_flags & SCTP_EOF) { /* shutdown this assoc */ if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && sctp_is_there_unsent_data(stcb, SCTP_SO_NOT_LOCKED) == 0) { if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) { goto abort_anyway; } /* * there is nothing queued to send, so I'm * done... */ if ((SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT) && (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_RECEIVED) && (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { /* * only send SHUTDOWN the first time * through */ if (SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } SCTP_SET_STATE(stcb, SCTP_STATE_SHUTDOWN_SENT); sctp_stop_timers_for_shutdown(stcb); sctp_send_shutdown(stcb, net); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, net); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, NULL); added_control = 1; do_chunk_output = 0; } } else { /* * we still got (or just got) data to send, * so set SHUTDOWN_PENDING */ /* * XXX sockets draft says that SCTP_EOF * should be sent with no data. currently, * we will allow user data to be sent first * and move to SHUTDOWN-PENDING */ if ((SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT) && (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_RECEIVED) && (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) { SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_PARTIAL_MSG_LEFT); } SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_SHUTDOWN_PENDING); if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) { struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; abort_anyway: SCTP_SNPRINTF(msg, sizeof(msg), "%s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); atomic_add_int(&stcb->asoc.refcnt, 1); sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_NOT_LOCKED); atomic_add_int(&stcb->asoc.refcnt, -1); goto no_chunk_output; } sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, NULL); } } } } un_sent = ((stcb->asoc.total_output_queue_size - stcb->asoc.total_flight) + (stcb->asoc.stream_queue_cnt * SCTP_DATA_CHUNK_OVERHEAD(stcb))); if ((sctp_is_feature_off(inp, SCTP_PCB_FLAGS_NODELAY)) && (stcb->asoc.total_flight > 0) && (un_sent < (int)(stcb->asoc.smallest_mtu - SCTP_MIN_OVERHEAD))) { do_chunk_output = 0; } if (do_chunk_output) sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_NOT_LOCKED); else if (added_control) { int num_out, reason, now_filled = 0; struct timeval now; int frag_point; frag_point = sctp_get_frag_point(stcb, &stcb->asoc); (void)sctp_med_chunk_output(inp, stcb, &stcb->asoc, &num_out, &reason, 1, 1, &now, &now_filled, frag_point, SCTP_SO_NOT_LOCKED); } no_chunk_output: if (ret) { ca->cnt_failed++; } else { ca->cnt_sent++; } } static void sctp_sendall_completes(void *ptr, uint32_t val SCTP_UNUSED) { struct sctp_copy_all *ca; ca = (struct sctp_copy_all *)ptr; /* * Do a notify here? Kacheong suggests that the notify be done at * the send time.. so you would push up a notification if any send * failed. Don't know if this is feasible since the only failures we * have is "memory" related and if you cannot get an mbuf to send * the data you surely can't get an mbuf to send up to notify the * user you can't send the data :-> */ /* now free everything */ if (ca->inp) { /* Lets clear the flag to allow others to run. */ ca->inp->sctp_flags &= ~SCTP_PCB_FLAGS_SND_ITERATOR_UP; } sctp_m_freem(ca->m); SCTP_FREE(ca, SCTP_M_COPYAL); } static struct mbuf * sctp_copy_out_all(struct uio *uio, ssize_t len) { struct mbuf *ret, *at; ssize_t left, willcpy, cancpy, error; ret = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_WAITOK, 1, MT_DATA); if (ret == NULL) { /* TSNH */ return (NULL); } left = len; SCTP_BUF_LEN(ret) = 0; /* save space for the data chunk header */ cancpy = (int)M_TRAILINGSPACE(ret); willcpy = min(cancpy, left); at = ret; while (left > 0) { /* Align data to the end */ error = uiomove(mtod(at, caddr_t), (int)willcpy, uio); if (error) { err_out_now: sctp_m_freem(at); return (NULL); } SCTP_BUF_LEN(at) = (int)willcpy; SCTP_BUF_NEXT_PKT(at) = SCTP_BUF_NEXT(at) = 0; left -= willcpy; if (left > 0) { SCTP_BUF_NEXT(at) = sctp_get_mbuf_for_msg((unsigned int)left, 0, M_WAITOK, 1, MT_DATA); if (SCTP_BUF_NEXT(at) == NULL) { goto err_out_now; } at = SCTP_BUF_NEXT(at); SCTP_BUF_LEN(at) = 0; cancpy = (int)M_TRAILINGSPACE(at); willcpy = min(cancpy, left); } } return (ret); } static int sctp_sendall(struct sctp_inpcb *inp, struct uio *uio, struct mbuf *m, struct sctp_sndrcvinfo *srcv) { int ret; struct sctp_copy_all *ca; if (inp->sctp_flags & SCTP_PCB_FLAGS_SND_ITERATOR_UP) { /* There is another. */ return (EBUSY); } if (uio->uio_resid > (ssize_t)SCTP_BASE_SYSCTL(sctp_sendall_limit)) { /* You must not be larger than the limit! */ return (EMSGSIZE); } SCTP_MALLOC(ca, struct sctp_copy_all *, sizeof(struct sctp_copy_all), SCTP_M_COPYAL); if (ca == NULL) { sctp_m_freem(m); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } memset(ca, 0, sizeof(struct sctp_copy_all)); ca->inp = inp; if (srcv) { memcpy(&ca->sndrcv, srcv, sizeof(struct sctp_nonpad_sndrcvinfo)); } /* * take off the sendall flag, it would be bad if we failed to do * this :-0 */ ca->sndrcv.sinfo_flags &= ~SCTP_SENDALL; /* get length and mbuf chain */ if (uio) { ca->sndlen = uio->uio_resid; ca->m = sctp_copy_out_all(uio, ca->sndlen); if (ca->m == NULL) { SCTP_FREE(ca, SCTP_M_COPYAL); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } } else { /* Gather the length of the send */ struct mbuf *mat; ca->sndlen = 0; for (mat = m; mat; mat = SCTP_BUF_NEXT(mat)) { ca->sndlen += SCTP_BUF_LEN(mat); } } inp->sctp_flags |= SCTP_PCB_FLAGS_SND_ITERATOR_UP; ret = sctp_initiate_iterator(NULL, sctp_sendall_iterator, NULL, SCTP_PCB_ANY_FLAGS, SCTP_PCB_ANY_FEATURES, SCTP_ASOC_ANY_STATE, (void *)ca, 0, sctp_sendall_completes, inp, 1); if (ret) { inp->sctp_flags &= ~SCTP_PCB_FLAGS_SND_ITERATOR_UP; SCTP_FREE(ca, SCTP_M_COPYAL); SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EFAULT); return (EFAULT); } return (0); } void sctp_toss_old_cookies(struct sctp_tcb *stcb, struct sctp_association *asoc) { struct sctp_tmit_chunk *chk, *nchk; TAILQ_FOREACH_SAFE(chk, &asoc->control_send_queue, sctp_next, nchk) { if (chk->rec.chunk_id.id == SCTP_COOKIE_ECHO) { TAILQ_REMOVE(&asoc->control_send_queue, chk, sctp_next); asoc->ctrl_queue_cnt--; if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); } } } void sctp_toss_old_asconf(struct sctp_tcb *stcb) { struct sctp_association *asoc; struct sctp_tmit_chunk *chk, *nchk; struct sctp_asconf_chunk *acp; asoc = &stcb->asoc; TAILQ_FOREACH_SAFE(chk, &asoc->asconf_send_queue, sctp_next, nchk) { /* find SCTP_ASCONF chunk in queue */ if (chk->rec.chunk_id.id == SCTP_ASCONF) { if (chk->data) { acp = mtod(chk->data, struct sctp_asconf_chunk *); if (SCTP_TSN_GT(ntohl(acp->serial_number), asoc->asconf_seq_out_acked)) { /* Not Acked yet */ break; } } TAILQ_REMOVE(&asoc->asconf_send_queue, chk, sctp_next); asoc->ctrl_queue_cnt--; if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); } } } static void sctp_clean_up_datalist(struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_tmit_chunk **data_list, int bundle_at, struct sctp_nets *net) { int i; struct sctp_tmit_chunk *tp1; for (i = 0; i < bundle_at; i++) { /* off of the send queue */ TAILQ_REMOVE(&asoc->send_queue, data_list[i], sctp_next); asoc->send_queue_cnt--; if (i > 0) { /* * Any chunk NOT 0 you zap the time chunk 0 gets * zapped or set based on if a RTO measurment is * needed. */ data_list[i]->do_rtt = 0; } /* record time */ data_list[i]->sent_rcv_time = net->last_sent_time; data_list[i]->rec.data.cwnd_at_send = net->cwnd; data_list[i]->rec.data.fast_retran_tsn = data_list[i]->rec.data.tsn; if (data_list[i]->whoTo == NULL) { data_list[i]->whoTo = net; atomic_add_int(&net->ref_count, 1); } /* on to the sent queue */ tp1 = TAILQ_LAST(&asoc->sent_queue, sctpchunk_listhead); if ((tp1) && SCTP_TSN_GT(tp1->rec.data.tsn, data_list[i]->rec.data.tsn)) { struct sctp_tmit_chunk *tpp; /* need to move back */ back_up_more: tpp = TAILQ_PREV(tp1, sctpchunk_listhead, sctp_next); if (tpp == NULL) { TAILQ_INSERT_BEFORE(tp1, data_list[i], sctp_next); goto all_done; } tp1 = tpp; if (SCTP_TSN_GT(tp1->rec.data.tsn, data_list[i]->rec.data.tsn)) { goto back_up_more; } TAILQ_INSERT_AFTER(&asoc->sent_queue, tp1, data_list[i], sctp_next); } else { TAILQ_INSERT_TAIL(&asoc->sent_queue, data_list[i], sctp_next); } all_done: /* This does not lower until the cum-ack passes it */ asoc->sent_queue_cnt++; if ((asoc->peers_rwnd <= 0) && (asoc->total_flight == 0) && (bundle_at == 1)) { /* Mark the chunk as being a window probe */ SCTP_STAT_INCR(sctps_windowprobed); } #ifdef SCTP_AUDITING_ENABLED sctp_audit_log(0xC2, 3); #endif data_list[i]->sent = SCTP_DATAGRAM_SENT; data_list[i]->snd_count = 1; data_list[i]->rec.data.chunk_was_revoked = 0; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { sctp_misc_ints(SCTP_FLIGHT_LOG_UP, data_list[i]->whoTo->flight_size, data_list[i]->book_size, (uint32_t)(uintptr_t)data_list[i]->whoTo, data_list[i]->rec.data.tsn); } sctp_flight_size_increase(data_list[i]); sctp_total_flight_increase(stcb, data_list[i]); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_RWND_ENABLE) { sctp_log_rwnd(SCTP_DECREASE_PEER_RWND, asoc->peers_rwnd, data_list[i]->send_size, SCTP_BASE_SYSCTL(sctp_peer_chunk_oh)); } asoc->peers_rwnd = sctp_sbspace_sub(asoc->peers_rwnd, (uint32_t)(data_list[i]->send_size + SCTP_BASE_SYSCTL(sctp_peer_chunk_oh))); if (asoc->peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) { /* SWS sender side engages */ asoc->peers_rwnd = 0; } } if (asoc->cc_functions.sctp_cwnd_update_packet_transmitted) { (*asoc->cc_functions.sctp_cwnd_update_packet_transmitted) (stcb, net); } } static void sctp_clean_up_ctl(struct sctp_tcb *stcb, struct sctp_association *asoc, int so_locked) { struct sctp_tmit_chunk *chk, *nchk; TAILQ_FOREACH_SAFE(chk, &asoc->control_send_queue, sctp_next, nchk) { if ((chk->rec.chunk_id.id == SCTP_SELECTIVE_ACK) || (chk->rec.chunk_id.id == SCTP_NR_SELECTIVE_ACK) || /* EY */ (chk->rec.chunk_id.id == SCTP_HEARTBEAT_REQUEST) || (chk->rec.chunk_id.id == SCTP_HEARTBEAT_ACK) || (chk->rec.chunk_id.id == SCTP_FORWARD_CUM_TSN) || (chk->rec.chunk_id.id == SCTP_SHUTDOWN) || (chk->rec.chunk_id.id == SCTP_SHUTDOWN_ACK) || (chk->rec.chunk_id.id == SCTP_OPERATION_ERROR) || (chk->rec.chunk_id.id == SCTP_PACKET_DROPPED) || (chk->rec.chunk_id.id == SCTP_COOKIE_ACK) || (chk->rec.chunk_id.id == SCTP_ECN_CWR) || (chk->rec.chunk_id.id == SCTP_ASCONF_ACK)) { /* Stray chunks must be cleaned up */ clean_up_anyway: TAILQ_REMOVE(&asoc->control_send_queue, chk, sctp_next); asoc->ctrl_queue_cnt--; if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } if (chk->rec.chunk_id.id == SCTP_FORWARD_CUM_TSN) { asoc->fwd_tsn_cnt--; } sctp_free_a_chunk(stcb, chk, so_locked); } else if (chk->rec.chunk_id.id == SCTP_STREAM_RESET) { /* special handling, we must look into the param */ if (chk != asoc->str_reset) { goto clean_up_anyway; } } } } static uint32_t sctp_can_we_split_this(struct sctp_tcb *stcb, uint32_t length, uint32_t space_left, uint32_t frag_point, int eeor_on) { /* * Make a decision on if I should split a msg into multiple parts. * This is only asked of incomplete messages. */ if (eeor_on) { /* * If we are doing EEOR we need to always send it if its the * entire thing, since it might be all the guy is putting in * the hopper. */ if (space_left >= length) { /*- * If we have data outstanding, * we get another chance when the sack * arrives to transmit - wait for more data */ if (stcb->asoc.total_flight == 0) { /* * If nothing is in flight, we zero the * packet counter. */ return (length); } return (0); } else { /* You can fill the rest */ return (space_left); } } /*- * For those strange folk that make the send buffer * smaller than our fragmentation point, we can't * get a full msg in so we have to allow splitting. */ if (SCTP_SB_LIMIT_SND(stcb->sctp_socket) < frag_point) { return (length); } if ((length <= space_left) || ((length - space_left) < SCTP_BASE_SYSCTL(sctp_min_residual))) { /* Sub-optimial residual don't split in non-eeor mode. */ return (0); } /* * If we reach here length is larger than the space_left. Do we wish * to split it for the sake of packet putting together? */ if (space_left >= min(SCTP_BASE_SYSCTL(sctp_min_split_point), frag_point)) { /* Its ok to split it */ return (min(space_left, frag_point)); } /* Nope, can't split */ return (0); } static uint32_t sctp_move_to_outqueue(struct sctp_tcb *stcb, struct sctp_stream_out *strq, uint32_t space_left, uint32_t frag_point, int *giveup, int eeor_mode, int *bail, int so_locked) { /* Move from the stream to the send_queue keeping track of the total */ struct sctp_association *asoc; struct sctp_stream_queue_pending *sp; struct sctp_tmit_chunk *chk; struct sctp_data_chunk *dchkh = NULL; struct sctp_idata_chunk *ndchkh = NULL; uint32_t to_move, length; int leading; uint8_t rcv_flags = 0; uint8_t some_taken; uint8_t send_lock_up = 0; SCTP_TCB_LOCK_ASSERT(stcb); asoc = &stcb->asoc; one_more_time: /* sa_ignore FREED_MEMORY */ sp = TAILQ_FIRST(&strq->outqueue); if (sp == NULL) { if (send_lock_up == 0) { SCTP_TCB_SEND_LOCK(stcb); send_lock_up = 1; } sp = TAILQ_FIRST(&strq->outqueue); if (sp) { goto one_more_time; } if ((sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_EXPLICIT_EOR) == 0) && (stcb->asoc.idata_supported == 0) && (strq->last_msg_incomplete)) { SCTP_PRINTF("Huh? Stream:%d lm_in_c=%d but queue is NULL\n", strq->sid, strq->last_msg_incomplete); strq->last_msg_incomplete = 0; } to_move = 0; if (send_lock_up) { SCTP_TCB_SEND_UNLOCK(stcb); send_lock_up = 0; } goto out_of; } if ((sp->msg_is_complete) && (sp->length == 0)) { if (sp->sender_all_done) { /* * We are doing deferred cleanup. Last time through * when we took all the data the sender_all_done was * not set. */ if ((sp->put_last_out == 0) && (sp->discard_rest == 0)) { SCTP_PRINTF("Gak, put out entire msg with NO end!-1\n"); SCTP_PRINTF("sender_done:%d len:%d msg_comp:%d put_last_out:%d send_lock:%d\n", sp->sender_all_done, sp->length, sp->msg_is_complete, sp->put_last_out, send_lock_up); } if ((TAILQ_NEXT(sp, next) == NULL) && (send_lock_up == 0)) { SCTP_TCB_SEND_LOCK(stcb); send_lock_up = 1; } atomic_subtract_int(&asoc->stream_queue_cnt, 1); TAILQ_REMOVE(&strq->outqueue, sp, next); stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, strq, sp, send_lock_up); if ((strq->state == SCTP_STREAM_RESET_PENDING) && (strq->chunks_on_queues == 0) && TAILQ_EMPTY(&strq->outqueue)) { stcb->asoc.trigger_reset = 1; } if (sp->net) { sctp_free_remote_addr(sp->net); sp->net = NULL; } if (sp->data) { sctp_m_freem(sp->data); sp->data = NULL; } sctp_free_a_strmoq(stcb, sp, so_locked); /* we can't be locked to it */ if (send_lock_up) { SCTP_TCB_SEND_UNLOCK(stcb); send_lock_up = 0; } /* back to get the next msg */ goto one_more_time; } else { /* * sender just finished this but still holds a * reference */ *giveup = 1; to_move = 0; goto out_of; } } else { /* is there some to get */ if (sp->length == 0) { /* no */ *giveup = 1; to_move = 0; goto out_of; } else if (sp->discard_rest) { if (send_lock_up == 0) { SCTP_TCB_SEND_LOCK(stcb); send_lock_up = 1; } /* Whack down the size */ atomic_subtract_int(&stcb->asoc.total_output_queue_size, sp->length); if ((stcb->sctp_socket != NULL) && ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { atomic_subtract_int(&stcb->sctp_socket->so_snd.sb_cc, sp->length); } if (sp->data) { sctp_m_freem(sp->data); sp->data = NULL; sp->tail_mbuf = NULL; } sp->length = 0; sp->some_taken = 1; *giveup = 1; to_move = 0; goto out_of; } } some_taken = sp->some_taken; re_look: length = sp->length; if (sp->msg_is_complete) { /* The message is complete */ to_move = min(length, frag_point); if (to_move == length) { /* All of it fits in the MTU */ if (sp->some_taken) { rcv_flags |= SCTP_DATA_LAST_FRAG; } else { rcv_flags |= SCTP_DATA_NOT_FRAG; } sp->put_last_out = 1; if (sp->sinfo_flags & SCTP_SACK_IMMEDIATELY) { rcv_flags |= SCTP_DATA_SACK_IMMEDIATELY; } } else { /* Not all of it fits, we fragment */ if (sp->some_taken == 0) { rcv_flags |= SCTP_DATA_FIRST_FRAG; } sp->some_taken = 1; } } else { to_move = sctp_can_we_split_this(stcb, length, space_left, frag_point, eeor_mode); if (to_move) { /*- * We use a snapshot of length in case it * is expanding during the compare. */ uint32_t llen; llen = length; if (to_move >= llen) { to_move = llen; if (send_lock_up == 0) { /*- * We are taking all of an incomplete msg * thus we need a send lock. */ SCTP_TCB_SEND_LOCK(stcb); send_lock_up = 1; if (sp->msg_is_complete) { /* * the sender finished the * msg */ goto re_look; } } } if (sp->some_taken == 0) { rcv_flags |= SCTP_DATA_FIRST_FRAG; sp->some_taken = 1; } } else { /* Nothing to take. */ *giveup = 1; to_move = 0; goto out_of; } } /* If we reach here, we can copy out a chunk */ sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { /* No chunk memory */ *giveup = 1; to_move = 0; goto out_of; } /* * Setup for unordered if needed by looking at the user sent info * flags. */ if (sp->sinfo_flags & SCTP_UNORDERED) { rcv_flags |= SCTP_DATA_UNORDERED; } if (SCTP_BASE_SYSCTL(sctp_enable_sack_immediately) && (sp->sinfo_flags & SCTP_EOF) == SCTP_EOF) { rcv_flags |= SCTP_DATA_SACK_IMMEDIATELY; } /* clear out the chunk before setting up */ memset(chk, 0, sizeof(*chk)); chk->rec.data.rcv_flags = rcv_flags; if (to_move >= length) { /* we think we can steal the whole thing */ if ((sp->sender_all_done == 0) && (send_lock_up == 0)) { SCTP_TCB_SEND_LOCK(stcb); send_lock_up = 1; } if (to_move < sp->length) { /* bail, it changed */ goto dont_do_it; } chk->data = sp->data; chk->last_mbuf = sp->tail_mbuf; /* register the stealing */ sp->data = sp->tail_mbuf = NULL; } else { struct mbuf *m; dont_do_it: chk->data = SCTP_M_COPYM(sp->data, 0, to_move, M_NOWAIT); chk->last_mbuf = NULL; if (chk->data == NULL) { sp->some_taken = some_taken; sctp_free_a_chunk(stcb, chk, so_locked); *bail = 1; to_move = 0; goto out_of; } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(chk->data, SCTP_MBUF_ICOPY); } #endif /* Pull off the data */ m_adj(sp->data, to_move); /* Now lets work our way down and compact it */ m = sp->data; while (m && (SCTP_BUF_LEN(m) == 0)) { sp->data = SCTP_BUF_NEXT(m); SCTP_BUF_NEXT(m) = NULL; if (sp->tail_mbuf == m) { /*- * Freeing tail? TSNH since * we supposedly were taking less * than the sp->length. */ #ifdef INVARIANTS panic("Huh, freing tail? - TSNH"); #else SCTP_PRINTF("Huh, freeing tail? - TSNH\n"); sp->tail_mbuf = sp->data = NULL; sp->length = 0; #endif } sctp_m_free(m); m = sp->data; } } if (SCTP_BUF_IS_EXTENDED(chk->data)) { chk->copy_by_ref = 1; } else { chk->copy_by_ref = 0; } /* * get last_mbuf and counts of mb usage This is ugly but hopefully * its only one mbuf. */ if (chk->last_mbuf == NULL) { chk->last_mbuf = chk->data; while (SCTP_BUF_NEXT(chk->last_mbuf) != NULL) { chk->last_mbuf = SCTP_BUF_NEXT(chk->last_mbuf); } } if (to_move > length) { /*- This should not happen either * since we always lower to_move to the size * of sp->length if its larger. */ #ifdef INVARIANTS panic("Huh, how can to_move be larger?"); #else SCTP_PRINTF("Huh, how can to_move be larger?\n"); sp->length = 0; #endif } else { atomic_subtract_int(&sp->length, to_move); } leading = SCTP_DATA_CHUNK_OVERHEAD(stcb); if (M_LEADINGSPACE(chk->data) < leading) { /* Not enough room for a chunk header, get some */ struct mbuf *m; m = sctp_get_mbuf_for_msg(1, 0, M_NOWAIT, 1, MT_DATA); if (m == NULL) { /* * we're in trouble here. _PREPEND below will free * all the data if there is no leading space, so we * must put the data back and restore. */ if (send_lock_up == 0) { SCTP_TCB_SEND_LOCK(stcb); send_lock_up = 1; } if (sp->data == NULL) { /* unsteal the data */ sp->data = chk->data; sp->tail_mbuf = chk->last_mbuf; } else { struct mbuf *m_tmp; /* reassemble the data */ m_tmp = sp->data; sp->data = chk->data; SCTP_BUF_NEXT(chk->last_mbuf) = m_tmp; } sp->some_taken = some_taken; atomic_add_int(&sp->length, to_move); chk->data = NULL; *bail = 1; sctp_free_a_chunk(stcb, chk, so_locked); to_move = 0; goto out_of; } else { SCTP_BUF_LEN(m) = 0; SCTP_BUF_NEXT(m) = chk->data; chk->data = m; M_ALIGN(chk->data, 4); } } SCTP_BUF_PREPEND(chk->data, SCTP_DATA_CHUNK_OVERHEAD(stcb), M_NOWAIT); if (chk->data == NULL) { /* HELP, TSNH since we assured it would not above? */ #ifdef INVARIANTS panic("prepend failes HELP?"); #else SCTP_PRINTF("prepend fails HELP?\n"); sctp_free_a_chunk(stcb, chk, so_locked); #endif *bail = 1; to_move = 0; goto out_of; } sctp_snd_sb_alloc(stcb, SCTP_DATA_CHUNK_OVERHEAD(stcb)); chk->book_size = chk->send_size = (uint16_t)(to_move + SCTP_DATA_CHUNK_OVERHEAD(stcb)); chk->book_size_scale = 0; chk->sent = SCTP_DATAGRAM_UNSENT; chk->flags = 0; chk->asoc = &stcb->asoc; chk->pad_inplace = 0; chk->no_fr_allowed = 0; if (stcb->asoc.idata_supported == 0) { if (rcv_flags & SCTP_DATA_UNORDERED) { /* Just use 0. The receiver ignores the values. */ chk->rec.data.mid = 0; } else { chk->rec.data.mid = strq->next_mid_ordered; if (rcv_flags & SCTP_DATA_LAST_FRAG) { strq->next_mid_ordered++; } } } else { if (rcv_flags & SCTP_DATA_UNORDERED) { chk->rec.data.mid = strq->next_mid_unordered; if (rcv_flags & SCTP_DATA_LAST_FRAG) { strq->next_mid_unordered++; } } else { chk->rec.data.mid = strq->next_mid_ordered; if (rcv_flags & SCTP_DATA_LAST_FRAG) { strq->next_mid_ordered++; } } } chk->rec.data.sid = sp->sid; chk->rec.data.ppid = sp->ppid; chk->rec.data.context = sp->context; chk->rec.data.doing_fast_retransmit = 0; chk->rec.data.timetodrop = sp->ts; chk->flags = sp->act_flags; if (sp->net) { chk->whoTo = sp->net; atomic_add_int(&chk->whoTo->ref_count, 1); } else chk->whoTo = NULL; if (sp->holds_key_ref) { chk->auth_keyid = sp->auth_keyid; sctp_auth_key_acquire(stcb, chk->auth_keyid); chk->holds_key_ref = 1; } chk->rec.data.tsn = atomic_fetchadd_int(&asoc->sending_seq, 1); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_AT_SEND_2_OUTQ) { sctp_misc_ints(SCTP_STRMOUT_LOG_SEND, (uint32_t)(uintptr_t)stcb, sp->length, (uint32_t)((chk->rec.data.sid << 16) | (0x0000ffff & chk->rec.data.mid)), chk->rec.data.tsn); } if (stcb->asoc.idata_supported == 0) { dchkh = mtod(chk->data, struct sctp_data_chunk *); } else { ndchkh = mtod(chk->data, struct sctp_idata_chunk *); } /* * Put the rest of the things in place now. Size was done earlier in * previous loop prior to padding. */ #ifdef SCTP_ASOCLOG_OF_TSNS SCTP_TCB_LOCK_ASSERT(stcb); if (asoc->tsn_out_at >= SCTP_TSN_LOG_SIZE) { asoc->tsn_out_at = 0; asoc->tsn_out_wrapped = 1; } asoc->out_tsnlog[asoc->tsn_out_at].tsn = chk->rec.data.tsn; asoc->out_tsnlog[asoc->tsn_out_at].strm = chk->rec.data.sid; asoc->out_tsnlog[asoc->tsn_out_at].seq = chk->rec.data.mid; asoc->out_tsnlog[asoc->tsn_out_at].sz = chk->send_size; asoc->out_tsnlog[asoc->tsn_out_at].flgs = chk->rec.data.rcv_flags; asoc->out_tsnlog[asoc->tsn_out_at].stcb = (void *)stcb; asoc->out_tsnlog[asoc->tsn_out_at].in_pos = asoc->tsn_out_at; asoc->out_tsnlog[asoc->tsn_out_at].in_out = 2; asoc->tsn_out_at++; #endif if (stcb->asoc.idata_supported == 0) { dchkh->ch.chunk_type = SCTP_DATA; dchkh->ch.chunk_flags = chk->rec.data.rcv_flags; dchkh->dp.tsn = htonl(chk->rec.data.tsn); dchkh->dp.sid = htons(strq->sid); dchkh->dp.ssn = htons((uint16_t)chk->rec.data.mid); dchkh->dp.ppid = chk->rec.data.ppid; dchkh->ch.chunk_length = htons(chk->send_size); } else { ndchkh->ch.chunk_type = SCTP_IDATA; ndchkh->ch.chunk_flags = chk->rec.data.rcv_flags; ndchkh->dp.tsn = htonl(chk->rec.data.tsn); ndchkh->dp.sid = htons(strq->sid); ndchkh->dp.reserved = htons(0); ndchkh->dp.mid = htonl(chk->rec.data.mid); if (sp->fsn == 0) ndchkh->dp.ppid_fsn.ppid = chk->rec.data.ppid; else ndchkh->dp.ppid_fsn.fsn = htonl(sp->fsn); sp->fsn++; ndchkh->ch.chunk_length = htons(chk->send_size); } /* Now advance the chk->send_size by the actual pad needed. */ if (chk->send_size < SCTP_SIZE32(chk->book_size)) { /* need a pad */ struct mbuf *lm; int pads; pads = SCTP_SIZE32(chk->book_size) - chk->send_size; lm = sctp_pad_lastmbuf(chk->data, pads, chk->last_mbuf); if (lm != NULL) { chk->last_mbuf = lm; chk->pad_inplace = 1; } chk->send_size += pads; } if (PR_SCTP_ENABLED(chk->flags)) { asoc->pr_sctp_cnt++; } if (sp->msg_is_complete && (sp->length == 0) && (sp->sender_all_done)) { /* All done pull and kill the message */ if (sp->put_last_out == 0) { SCTP_PRINTF("Gak, put out entire msg with NO end!-2\n"); SCTP_PRINTF("sender_done:%d len:%d msg_comp:%d put_last_out:%d send_lock:%d\n", sp->sender_all_done, sp->length, sp->msg_is_complete, sp->put_last_out, send_lock_up); } if ((send_lock_up == 0) && (TAILQ_NEXT(sp, next) == NULL)) { SCTP_TCB_SEND_LOCK(stcb); send_lock_up = 1; } atomic_subtract_int(&asoc->stream_queue_cnt, 1); TAILQ_REMOVE(&strq->outqueue, sp, next); stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, strq, sp, send_lock_up); if ((strq->state == SCTP_STREAM_RESET_PENDING) && (strq->chunks_on_queues == 0) && TAILQ_EMPTY(&strq->outqueue)) { stcb->asoc.trigger_reset = 1; } if (sp->net) { sctp_free_remote_addr(sp->net); sp->net = NULL; } if (sp->data) { sctp_m_freem(sp->data); sp->data = NULL; } sctp_free_a_strmoq(stcb, sp, so_locked); } asoc->chunks_on_out_queue++; strq->chunks_on_queues++; TAILQ_INSERT_TAIL(&asoc->send_queue, chk, sctp_next); asoc->send_queue_cnt++; out_of: if (send_lock_up) { SCTP_TCB_SEND_UNLOCK(stcb); } return (to_move); } static void sctp_fill_outqueue(struct sctp_tcb *stcb, struct sctp_nets *net, int frag_point, int eeor_mode, int *quit_now, int so_locked) { struct sctp_association *asoc; struct sctp_stream_out *strq; uint32_t space_left, moved, total_moved; int bail, giveup; SCTP_TCB_LOCK_ASSERT(stcb); asoc = &stcb->asoc; total_moved = 0; switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: space_left = net->mtu - SCTP_MIN_V4_OVERHEAD; break; #endif #ifdef INET6 case AF_INET6: space_left = net->mtu - SCTP_MIN_OVERHEAD; break; #endif default: /* TSNH */ space_left = net->mtu; break; } /* Need an allowance for the data chunk header too */ space_left -= SCTP_DATA_CHUNK_OVERHEAD(stcb); /* must make even word boundary */ space_left &= 0xfffffffc; strq = stcb->asoc.ss_functions.sctp_ss_select_stream(stcb, net, asoc); giveup = 0; bail = 0; while ((space_left > 0) && (strq != NULL)) { moved = sctp_move_to_outqueue(stcb, strq, space_left, frag_point, &giveup, eeor_mode, &bail, so_locked); stcb->asoc.ss_functions.sctp_ss_scheduled(stcb, net, asoc, strq, moved); if ((giveup != 0) || (bail != 0)) { break; } strq = stcb->asoc.ss_functions.sctp_ss_select_stream(stcb, net, asoc); total_moved += moved; if (space_left >= moved) { space_left -= moved; } else { space_left = 0; } if (space_left >= SCTP_DATA_CHUNK_OVERHEAD(stcb)) { space_left -= SCTP_DATA_CHUNK_OVERHEAD(stcb); } else { space_left = 0; } space_left &= 0xfffffffc; } if (bail != 0) *quit_now = 1; stcb->asoc.ss_functions.sctp_ss_packet_done(stcb, net, asoc); if (total_moved == 0) { if ((stcb->asoc.sctp_cmt_on_off == 0) && (net == stcb->asoc.primary_destination)) { /* ran dry for primary network net */ SCTP_STAT_INCR(sctps_primary_randry); } else if (stcb->asoc.sctp_cmt_on_off > 0) { /* ran dry with CMT on */ SCTP_STAT_INCR(sctps_cmt_randry); } } } void sctp_fix_ecn_echo(struct sctp_association *asoc) { struct sctp_tmit_chunk *chk; TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) { if (chk->rec.chunk_id.id == SCTP_ECN_ECHO) { chk->sent = SCTP_DATAGRAM_UNSENT; } } } void sctp_move_chunks_from_net(struct sctp_tcb *stcb, struct sctp_nets *net) { struct sctp_association *asoc; struct sctp_tmit_chunk *chk; struct sctp_stream_queue_pending *sp; unsigned int i; if (net == NULL) { return; } asoc = &stcb->asoc; for (i = 0; i < stcb->asoc.streamoutcnt; i++) { TAILQ_FOREACH(sp, &stcb->asoc.strmout[i].outqueue, next) { if (sp->net == net) { sctp_free_remote_addr(sp->net); sp->net = NULL; } } } TAILQ_FOREACH(chk, &asoc->send_queue, sctp_next) { if (chk->whoTo == net) { sctp_free_remote_addr(chk->whoTo); chk->whoTo = NULL; } } } int sctp_med_chunk_output(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_association *asoc, int *num_out, int *reason_code, int control_only, int from_where, struct timeval *now, int *now_filled, int frag_point, int so_locked) { /** * Ok this is the generic chunk service queue. we must do the * following: * - Service the stream queue that is next, moving any * message (note I must get a complete message i.e. FIRST/MIDDLE and * LAST to the out queue in one pass) and assigning TSN's. This * only applys though if the peer does not support NDATA. For NDATA * chunks its ok to not send the entire message ;-) * - Check to see if the cwnd/rwnd allows any output, if so we go ahead and * fomulate and send the low level chunks. Making sure to combine * any control in the control chunk queue also. */ struct sctp_nets *net, *start_at, *sack_goes_to = NULL, *old_start_at = NULL; struct mbuf *outchain, *endoutchain; struct sctp_tmit_chunk *chk, *nchk; /* temp arrays for unlinking */ struct sctp_tmit_chunk *data_list[SCTP_MAX_DATA_BUNDLING]; int no_fragmentflg, error; unsigned int max_rwnd_per_dest, max_send_per_dest; int one_chunk, hbflag, skip_data_for_this_net; int asconf, cookie, no_out_cnt; int bundle_at, ctl_cnt, no_data_chunks, eeor_mode; unsigned int mtu, r_mtu, omtu, mx_mtu, to_out; int tsns_sent = 0; uint32_t auth_offset; struct sctp_auth_chunk *auth; uint16_t auth_keyid; int override_ok = 1; int skip_fill_up = 0; int data_auth_reqd = 0; /* * JRS 5/14/07 - Add flag for whether a heartbeat is sent to the * destination. */ int quit_now = 0; *num_out = 0; *reason_code = 0; auth_keyid = stcb->asoc.authinfo.active_keyid; if ((asoc->state & SCTP_STATE_SHUTDOWN_PENDING) || (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED) || (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR))) { eeor_mode = 1; } else { eeor_mode = 0; } ctl_cnt = no_out_cnt = asconf = cookie = 0; /* * First lets prime the pump. For each destination, if there is room * in the flight size, attempt to pull an MTU's worth out of the * stream queues into the general send_queue */ #ifdef SCTP_AUDITING_ENABLED sctp_audit_log(0xC2, 2); #endif SCTP_TCB_LOCK_ASSERT(stcb); hbflag = 0; if (control_only) no_data_chunks = 1; else no_data_chunks = 0; /* Nothing to possible to send? */ if ((TAILQ_EMPTY(&asoc->control_send_queue) || (asoc->ctrl_queue_cnt == stcb->asoc.ecn_echo_cnt_onq)) && TAILQ_EMPTY(&asoc->asconf_send_queue) && TAILQ_EMPTY(&asoc->send_queue) && sctp_is_there_unsent_data(stcb, so_locked) == 0) { nothing_to_send: *reason_code = 9; return (0); } if (asoc->peers_rwnd == 0) { /* No room in peers rwnd */ *reason_code = 1; if (asoc->total_flight > 0) { /* we are allowed one chunk in flight */ no_data_chunks = 1; } } if (stcb->asoc.ecn_echo_cnt_onq) { /* Record where a sack goes, if any */ if (no_data_chunks && (asoc->ctrl_queue_cnt == stcb->asoc.ecn_echo_cnt_onq)) { /* Nothing but ECNe to send - we don't do that */ goto nothing_to_send; } TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) { if ((chk->rec.chunk_id.id == SCTP_SELECTIVE_ACK) || (chk->rec.chunk_id.id == SCTP_NR_SELECTIVE_ACK)) { sack_goes_to = chk->whoTo; break; } } } max_rwnd_per_dest = ((asoc->peers_rwnd + asoc->total_flight) / asoc->numnets); if (stcb->sctp_socket) max_send_per_dest = SCTP_SB_LIMIT_SND(stcb->sctp_socket) / asoc->numnets; else max_send_per_dest = 0; if (no_data_chunks == 0) { /* How many non-directed chunks are there? */ TAILQ_FOREACH(chk, &asoc->send_queue, sctp_next) { if (chk->whoTo == NULL) { /* * We already have non-directed chunks on * the queue, no need to do a fill-up. */ skip_fill_up = 1; break; } } } if ((no_data_chunks == 0) && (skip_fill_up == 0) && (!stcb->asoc.ss_functions.sctp_ss_is_empty(stcb, asoc))) { TAILQ_FOREACH(net, &asoc->nets, sctp_next) { /* * This for loop we are in takes in each net, if * its's got space in cwnd and has data sent to it * (when CMT is off) then it calls * sctp_fill_outqueue for the net. This gets data on * the send queue for that network. * * In sctp_fill_outqueue TSN's are assigned and data * is copied out of the stream buffers. Note mostly * copy by reference (we hope). */ net->window_probe = 0; if ((net != stcb->asoc.alternate) && ((net->dest_state & SCTP_ADDR_PF) || (!(net->dest_state & SCTP_ADDR_REACHABLE)) || (net->dest_state & SCTP_ADDR_UNCONFIRMED))) { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, net, 1, SCTP_CWND_LOG_FILL_OUTQ_CALLED); } continue; } if ((stcb->asoc.cc_functions.sctp_cwnd_new_transmission_begins) && (net->flight_size == 0)) { (*stcb->asoc.cc_functions.sctp_cwnd_new_transmission_begins) (stcb, net); } if (net->flight_size >= net->cwnd) { /* skip this network, no room - can't fill */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, net, 3, SCTP_CWND_LOG_FILL_OUTQ_CALLED); } continue; } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, net, 4, SCTP_CWND_LOG_FILL_OUTQ_CALLED); } sctp_fill_outqueue(stcb, net, frag_point, eeor_mode, &quit_now, so_locked); if (quit_now) { /* memory alloc failure */ no_data_chunks = 1; break; } } } /* now service each destination and send out what we can for it */ /* Nothing to send? */ if (TAILQ_EMPTY(&asoc->control_send_queue) && TAILQ_EMPTY(&asoc->asconf_send_queue) && TAILQ_EMPTY(&asoc->send_queue)) { *reason_code = 8; return (0); } if (asoc->sctp_cmt_on_off > 0) { /* get the last start point */ start_at = asoc->last_net_cmt_send_started; if (start_at == NULL) { /* null so to beginning */ start_at = TAILQ_FIRST(&asoc->nets); } else { start_at = TAILQ_NEXT(asoc->last_net_cmt_send_started, sctp_next); if (start_at == NULL) { start_at = TAILQ_FIRST(&asoc->nets); } } asoc->last_net_cmt_send_started = start_at; } else { start_at = TAILQ_FIRST(&asoc->nets); } TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) { if (chk->whoTo == NULL) { if (asoc->alternate) { chk->whoTo = asoc->alternate; } else { chk->whoTo = asoc->primary_destination; } atomic_add_int(&chk->whoTo->ref_count, 1); } } old_start_at = NULL; again_one_more_time: for (net = start_at; net != NULL; net = TAILQ_NEXT(net, sctp_next)) { /* how much can we send? */ /* SCTPDBG("Examine for sending net:%x\n", (uint32_t)net); */ if (old_start_at && (old_start_at == net)) { /* through list ocmpletely. */ break; } tsns_sent = 0xa; if (TAILQ_EMPTY(&asoc->control_send_queue) && TAILQ_EMPTY(&asoc->asconf_send_queue) && (net->flight_size >= net->cwnd)) { /* * Nothing on control or asconf and flight is full, * we can skip even in the CMT case. */ continue; } bundle_at = 0; endoutchain = outchain = NULL; auth = NULL; auth_offset = 0; no_fragmentflg = 1; one_chunk = 0; if (net->dest_state & SCTP_ADDR_UNCONFIRMED) { skip_data_for_this_net = 1; } else { skip_data_for_this_net = 0; } switch (((struct sockaddr *)&net->ro._l_addr)->sa_family) { #ifdef INET case AF_INET: mtu = net->mtu - SCTP_MIN_V4_OVERHEAD; break; #endif #ifdef INET6 case AF_INET6: mtu = net->mtu - SCTP_MIN_OVERHEAD; break; #endif default: /* TSNH */ mtu = net->mtu; break; } mx_mtu = mtu; to_out = 0; if (mtu > asoc->peers_rwnd) { if (asoc->total_flight > 0) { /* We have a packet in flight somewhere */ r_mtu = asoc->peers_rwnd; } else { /* We are always allowed to send one MTU out */ one_chunk = 1; r_mtu = mtu; } } else { r_mtu = mtu; } error = 0; /************************/ /* ASCONF transmission */ /************************/ /* Now first lets go through the asconf queue */ TAILQ_FOREACH_SAFE(chk, &asoc->asconf_send_queue, sctp_next, nchk) { if (chk->rec.chunk_id.id != SCTP_ASCONF) { continue; } if (chk->whoTo == NULL) { if (asoc->alternate == NULL) { if (asoc->primary_destination != net) { break; } } else { if (asoc->alternate != net) { break; } } } else { if (chk->whoTo != net) { break; } } if (chk->data == NULL) { break; } if (chk->sent != SCTP_DATAGRAM_UNSENT && chk->sent != SCTP_DATAGRAM_RESEND) { break; } /* * if no AUTH is yet included and this chunk * requires it, make sure to account for it. We * don't apply the size until the AUTH chunk is * actually added below in case there is no room for * this chunk. NOTE: we overload the use of "omtu" * here */ if ((auth == NULL) && sctp_auth_is_required_chunk(chk->rec.chunk_id.id, stcb->asoc.peer_auth_chunks)) { omtu = sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id); } else omtu = 0; /* Here we do NOT factor the r_mtu */ if ((chk->send_size < (int)(mtu - omtu)) || (chk->flags & CHUNK_FLAGS_FRAGMENT_OK)) { /* * We probably should glom the mbuf chain * from the chk->data for control but the * problem is it becomes yet one more level * of tracking to do if for some reason * output fails. Then I have got to * reconstruct the merged control chain.. el * yucko.. for now we take the easy way and * do the copy */ /* * Add an AUTH chunk, if chunk requires it * save the offset into the chain for AUTH */ if ((auth == NULL) && (sctp_auth_is_required_chunk(chk->rec.chunk_id.id, stcb->asoc.peer_auth_chunks))) { outchain = sctp_add_auth_chunk(outchain, &endoutchain, &auth, &auth_offset, stcb, chk->rec.chunk_id.id); SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); } outchain = sctp_copy_mbufchain(chk->data, outchain, &endoutchain, (int)chk->rec.chunk_id.can_take_data, chk->send_size, chk->copy_by_ref); if (outchain == NULL) { *reason_code = 8; SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); /* update our MTU size */ if (mtu > (chk->send_size + omtu)) mtu -= (chk->send_size + omtu); else mtu = 0; to_out += (chk->send_size + omtu); /* Do clear IP_DF ? */ if (chk->flags & CHUNK_FLAGS_FRAGMENT_OK) { no_fragmentflg = 0; } if (chk->rec.chunk_id.can_take_data) chk->data = NULL; /* * set hb flag since we can use these for * RTO */ hbflag = 1; asconf = 1; /* * should sysctl this: don't bundle data * with ASCONF since it requires AUTH */ no_data_chunks = 1; chk->sent = SCTP_DATAGRAM_SENT; if (chk->whoTo == NULL) { chk->whoTo = net; atomic_add_int(&net->ref_count, 1); } chk->snd_count++; if (mtu == 0) { /* * Ok we are out of room but we can * output without effecting the * flight size since this little guy * is a control only packet. */ sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, inp, stcb, net); /* * do NOT clear the asconf flag as * it is used to do appropriate * source address selection. */ if (*now_filled == 0) { (void)SCTP_GETTIME_TIMEVAL(now); *now_filled = 1; } net->last_sent_time = *now; hbflag = 0; if ((error = sctp_lowlevel_chunk_output(inp, stcb, net, (struct sockaddr *)&net->ro._l_addr, outchain, auth_offset, auth, stcb->asoc.authinfo.active_keyid, no_fragmentflg, 0, asconf, inp->sctp_lport, stcb->rport, htonl(stcb->asoc.peer_vtag), net->port, NULL, 0, 0, so_locked))) { /* * error, we could not * output */ SCTPDBG(SCTP_DEBUG_OUTPUT3, "Gak send error %d\n", error); if (from_where == 0) { SCTP_STAT_INCR(sctps_lowlevelerrusr); } if (error == ENOBUFS) { asoc->ifp_had_enobuf = 1; SCTP_STAT_INCR(sctps_lowlevelerr); } /* error, could not output */ if (error == EHOSTUNREACH) { /* * Destination went * unreachable * during this send */ sctp_move_chunks_from_net(stcb, net); } *reason_code = 7; break; } else { asoc->ifp_had_enobuf = 0; } /* * increase the number we sent, if a * cookie is sent we don't tell them * any was sent out. */ outchain = endoutchain = NULL; auth = NULL; auth_offset = 0; if (!no_out_cnt) *num_out += ctl_cnt; /* recalc a clean slate and setup */ switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: mtu = net->mtu - SCTP_MIN_V4_OVERHEAD; break; #endif #ifdef INET6 case AF_INET6: mtu = net->mtu - SCTP_MIN_OVERHEAD; break; #endif default: /* TSNH */ mtu = net->mtu; break; } to_out = 0; no_fragmentflg = 1; } } } if (error != 0) { /* try next net */ continue; } /************************/ /* Control transmission */ /************************/ /* Now first lets go through the control queue */ TAILQ_FOREACH_SAFE(chk, &asoc->control_send_queue, sctp_next, nchk) { if ((sack_goes_to) && (chk->rec.chunk_id.id == SCTP_ECN_ECHO) && (chk->whoTo != sack_goes_to)) { /* * if we have a sack in queue, and we are * looking at an ecn echo that is NOT queued * to where the sack is going.. */ if (chk->whoTo == net) { /* * Don't transmit it to where its * going (current net) */ continue; } else if (sack_goes_to == net) { /* * But do transmit it to this * address */ goto skip_net_check; } } if (chk->whoTo == NULL) { if (asoc->alternate == NULL) { if (asoc->primary_destination != net) { continue; } } else { if (asoc->alternate != net) { continue; } } } else { if (chk->whoTo != net) { continue; } } skip_net_check: if (chk->data == NULL) { continue; } if (chk->sent != SCTP_DATAGRAM_UNSENT) { /* * It must be unsent. Cookies and ASCONF's * hang around but there timers will force * when marked for resend. */ continue; } /* * if no AUTH is yet included and this chunk * requires it, make sure to account for it. We * don't apply the size until the AUTH chunk is * actually added below in case there is no room for * this chunk. NOTE: we overload the use of "omtu" * here */ if ((auth == NULL) && sctp_auth_is_required_chunk(chk->rec.chunk_id.id, stcb->asoc.peer_auth_chunks)) { omtu = sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id); } else omtu = 0; /* Here we do NOT factor the r_mtu */ if ((chk->send_size <= (int)(mtu - omtu)) || (chk->flags & CHUNK_FLAGS_FRAGMENT_OK)) { /* * We probably should glom the mbuf chain * from the chk->data for control but the * problem is it becomes yet one more level * of tracking to do if for some reason * output fails. Then I have got to * reconstruct the merged control chain.. el * yucko.. for now we take the easy way and * do the copy */ /* * Add an AUTH chunk, if chunk requires it * save the offset into the chain for AUTH */ if ((auth == NULL) && (sctp_auth_is_required_chunk(chk->rec.chunk_id.id, stcb->asoc.peer_auth_chunks))) { outchain = sctp_add_auth_chunk(outchain, &endoutchain, &auth, &auth_offset, stcb, chk->rec.chunk_id.id); SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); } outchain = sctp_copy_mbufchain(chk->data, outchain, &endoutchain, (int)chk->rec.chunk_id.can_take_data, chk->send_size, chk->copy_by_ref); if (outchain == NULL) { *reason_code = 8; SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); /* update our MTU size */ if (mtu > (chk->send_size + omtu)) mtu -= (chk->send_size + omtu); else mtu = 0; to_out += (chk->send_size + omtu); /* Do clear IP_DF ? */ if (chk->flags & CHUNK_FLAGS_FRAGMENT_OK) { no_fragmentflg = 0; } if (chk->rec.chunk_id.can_take_data) chk->data = NULL; /* Mark things to be removed, if needed */ if ((chk->rec.chunk_id.id == SCTP_SELECTIVE_ACK) || (chk->rec.chunk_id.id == SCTP_NR_SELECTIVE_ACK) || /* EY */ (chk->rec.chunk_id.id == SCTP_HEARTBEAT_REQUEST) || (chk->rec.chunk_id.id == SCTP_HEARTBEAT_ACK) || (chk->rec.chunk_id.id == SCTP_SHUTDOWN) || (chk->rec.chunk_id.id == SCTP_SHUTDOWN_ACK) || (chk->rec.chunk_id.id == SCTP_OPERATION_ERROR) || (chk->rec.chunk_id.id == SCTP_COOKIE_ACK) || (chk->rec.chunk_id.id == SCTP_ECN_CWR) || (chk->rec.chunk_id.id == SCTP_PACKET_DROPPED) || (chk->rec.chunk_id.id == SCTP_ASCONF_ACK)) { if (chk->rec.chunk_id.id == SCTP_HEARTBEAT_REQUEST) { hbflag = 1; } /* remove these chunks at the end */ if ((chk->rec.chunk_id.id == SCTP_SELECTIVE_ACK) || (chk->rec.chunk_id.id == SCTP_NR_SELECTIVE_ACK)) { /* turn off the timer */ if (SCTP_OS_TIMER_PENDING(&stcb->asoc.dack_timer.timer)) { sctp_timer_stop(SCTP_TIMER_TYPE_RECV, inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_1); } } ctl_cnt++; } else { /* * Other chunks, since they have * timers running (i.e. COOKIE) we * just "trust" that it gets sent or * retransmitted. */ ctl_cnt++; if (chk->rec.chunk_id.id == SCTP_COOKIE_ECHO) { cookie = 1; no_out_cnt = 1; } else if (chk->rec.chunk_id.id == SCTP_ECN_ECHO) { /* * Increment ecne send count * here this means we may be * over-zealous in our * counting if the send * fails, but its the best * place to do it (we used * to do it in the queue of * the chunk, but that did * not tell how many times * it was sent. */ SCTP_STAT_INCR(sctps_sendecne); } chk->sent = SCTP_DATAGRAM_SENT; if (chk->whoTo == NULL) { chk->whoTo = net; atomic_add_int(&net->ref_count, 1); } chk->snd_count++; } if (mtu == 0) { /* * Ok we are out of room but we can * output without effecting the * flight size since this little guy * is a control only packet. */ if (asconf) { sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, inp, stcb, net); /* * do NOT clear the asconf * flag as it is used to do * appropriate source * address selection. */ } if (cookie) { sctp_timer_start(SCTP_TIMER_TYPE_COOKIE, inp, stcb, net); cookie = 0; } /* Only HB or ASCONF advances time */ if (hbflag) { if (*now_filled == 0) { (void)SCTP_GETTIME_TIMEVAL(now); *now_filled = 1; } net->last_sent_time = *now; hbflag = 0; } if ((error = sctp_lowlevel_chunk_output(inp, stcb, net, (struct sockaddr *)&net->ro._l_addr, outchain, auth_offset, auth, stcb->asoc.authinfo.active_keyid, no_fragmentflg, 0, asconf, inp->sctp_lport, stcb->rport, htonl(stcb->asoc.peer_vtag), net->port, NULL, 0, 0, so_locked))) { /* * error, we could not * output */ SCTPDBG(SCTP_DEBUG_OUTPUT3, "Gak send error %d\n", error); if (from_where == 0) { SCTP_STAT_INCR(sctps_lowlevelerrusr); } if (error == ENOBUFS) { asoc->ifp_had_enobuf = 1; SCTP_STAT_INCR(sctps_lowlevelerr); } if (error == EHOSTUNREACH) { /* * Destination went * unreachable * during this send */ sctp_move_chunks_from_net(stcb, net); } *reason_code = 7; break; } else { asoc->ifp_had_enobuf = 0; } /* * increase the number we sent, if a * cookie is sent we don't tell them * any was sent out. */ outchain = endoutchain = NULL; auth = NULL; auth_offset = 0; if (!no_out_cnt) *num_out += ctl_cnt; /* recalc a clean slate and setup */ switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: mtu = net->mtu - SCTP_MIN_V4_OVERHEAD; break; #endif #ifdef INET6 case AF_INET6: mtu = net->mtu - SCTP_MIN_OVERHEAD; break; #endif default: /* TSNH */ mtu = net->mtu; break; } to_out = 0; no_fragmentflg = 1; } } } if (error != 0) { /* try next net */ continue; } /* JRI: if dest is in PF state, do not send data to it */ if ((asoc->sctp_cmt_on_off > 0) && (net != stcb->asoc.alternate) && (net->dest_state & SCTP_ADDR_PF)) { goto no_data_fill; } if (net->flight_size >= net->cwnd) { goto no_data_fill; } if ((asoc->sctp_cmt_on_off > 0) && (SCTP_BASE_SYSCTL(sctp_buffer_splitting) & SCTP_RECV_BUFFER_SPLITTING) && (net->flight_size > max_rwnd_per_dest)) { goto no_data_fill; } /* * We need a specific accounting for the usage of the send * buffer. We also need to check the number of messages per * net. For now, this is better than nothing and it disabled * by default... */ if ((asoc->sctp_cmt_on_off > 0) && (SCTP_BASE_SYSCTL(sctp_buffer_splitting) & SCTP_SEND_BUFFER_SPLITTING) && (max_send_per_dest > 0) && (net->flight_size > max_send_per_dest)) { goto no_data_fill; } /*********************/ /* Data transmission */ /*********************/ /* * if AUTH for DATA is required and no AUTH has been added * yet, account for this in the mtu now... if no data can be * bundled, this adjustment won't matter anyways since the * packet will be going out... */ data_auth_reqd = sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks); if (data_auth_reqd && (auth == NULL)) { mtu -= sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id); } /* now lets add any data within the MTU constraints */ switch (((struct sockaddr *)&net->ro._l_addr)->sa_family) { #ifdef INET case AF_INET: if (net->mtu > SCTP_MIN_V4_OVERHEAD) omtu = net->mtu - SCTP_MIN_V4_OVERHEAD; else omtu = 0; break; #endif #ifdef INET6 case AF_INET6: if (net->mtu > SCTP_MIN_OVERHEAD) omtu = net->mtu - SCTP_MIN_OVERHEAD; else omtu = 0; break; #endif default: /* TSNH */ omtu = 0; break; } if ((((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) && (skip_data_for_this_net == 0)) || (cookie)) { TAILQ_FOREACH_SAFE(chk, &asoc->send_queue, sctp_next, nchk) { if (no_data_chunks) { /* let only control go out */ *reason_code = 1; break; } if (net->flight_size >= net->cwnd) { /* skip this net, no room for data */ *reason_code = 2; break; } if ((chk->whoTo != NULL) && (chk->whoTo != net)) { /* Don't send the chunk on this net */ continue; } if (asoc->sctp_cmt_on_off == 0) { if ((asoc->alternate) && (asoc->alternate != net) && (chk->whoTo == NULL)) { continue; } else if ((net != asoc->primary_destination) && (asoc->alternate == NULL) && (chk->whoTo == NULL)) { continue; } } if ((chk->send_size > omtu) && ((chk->flags & CHUNK_FLAGS_FRAGMENT_OK) == 0)) { /*- * strange, we have a chunk that is * to big for its destination and * yet no fragment ok flag. * Something went wrong when the * PMTU changed...we did not mark * this chunk for some reason?? I * will fix it here by letting IP * fragment it for now and printing * a warning. This really should not * happen ... */ SCTP_PRINTF("Warning chunk of %d bytes > mtu:%d and yet PMTU disc missed\n", chk->send_size, mtu); chk->flags |= CHUNK_FLAGS_FRAGMENT_OK; } if (SCTP_BASE_SYSCTL(sctp_enable_sack_immediately) && (asoc->state & SCTP_STATE_SHUTDOWN_PENDING)) { struct sctp_data_chunk *dchkh; dchkh = mtod(chk->data, struct sctp_data_chunk *); dchkh->ch.chunk_flags |= SCTP_DATA_SACK_IMMEDIATELY; } if (((chk->send_size <= mtu) && (chk->send_size <= r_mtu)) || ((chk->flags & CHUNK_FLAGS_FRAGMENT_OK) && (chk->send_size <= asoc->peers_rwnd))) { /* ok we will add this one */ /* * Add an AUTH chunk, if chunk * requires it, save the offset into * the chain for AUTH */ if (data_auth_reqd) { if (auth == NULL) { outchain = sctp_add_auth_chunk(outchain, &endoutchain, &auth, &auth_offset, stcb, SCTP_DATA); auth_keyid = chk->auth_keyid; override_ok = 0; SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); } else if (override_ok) { /* * use this data's * keyid */ auth_keyid = chk->auth_keyid; override_ok = 0; } else if (auth_keyid != chk->auth_keyid) { /* * different keyid, * so done bundling */ break; } } outchain = sctp_copy_mbufchain(chk->data, outchain, &endoutchain, 0, chk->send_size, chk->copy_by_ref); if (outchain == NULL) { SCTPDBG(SCTP_DEBUG_OUTPUT3, "No memory?\n"); if (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) { sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net); } *reason_code = 3; SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } /* upate our MTU size */ /* Do clear IP_DF ? */ if (chk->flags & CHUNK_FLAGS_FRAGMENT_OK) { no_fragmentflg = 0; } /* unsigned subtraction of mtu */ if (mtu > chk->send_size) mtu -= chk->send_size; else mtu = 0; /* unsigned subtraction of r_mtu */ if (r_mtu > chk->send_size) r_mtu -= chk->send_size; else r_mtu = 0; to_out += chk->send_size; if ((to_out > mx_mtu) && no_fragmentflg) { #ifdef INVARIANTS panic("Exceeding mtu of %d out size is %d", mx_mtu, to_out); #else SCTP_PRINTF("Exceeding mtu of %d out size is %d\n", mx_mtu, to_out); #endif } chk->window_probe = 0; data_list[bundle_at++] = chk; if (bundle_at >= SCTP_MAX_DATA_BUNDLING) { break; } if (chk->sent == SCTP_DATAGRAM_UNSENT) { if ((chk->rec.data.rcv_flags & SCTP_DATA_UNORDERED) == 0) { SCTP_STAT_INCR_COUNTER64(sctps_outorderchunks); } else { SCTP_STAT_INCR_COUNTER64(sctps_outunorderchunks); } if (((chk->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) == SCTP_DATA_LAST_FRAG) && ((chk->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG) == 0)) /* * Count number of * user msg's that * were fragmented * we do this by * counting when we * see a LAST * fragment only. */ SCTP_STAT_INCR_COUNTER64(sctps_fragusrmsgs); } if ((mtu == 0) || (r_mtu == 0) || (one_chunk)) { if ((one_chunk) && (stcb->asoc.total_flight == 0)) { data_list[0]->window_probe = 1; net->window_probe = 1; } break; } } else { /* * Must be sent in order of the * TSN's (on a network) */ break; } } /* for (chunk gather loop for this net) */ } /* if asoc.state OPEN */ no_data_fill: /* Is there something to send for this destination? */ if (outchain) { /* We may need to start a control timer or two */ if (asconf) { sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, inp, stcb, net); /* * do NOT clear the asconf flag as it is * used to do appropriate source address * selection. */ } if (cookie) { sctp_timer_start(SCTP_TIMER_TYPE_COOKIE, inp, stcb, net); cookie = 0; } /* must start a send timer if data is being sent */ if (bundle_at && (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer))) { /* * no timer running on this destination * restart it. */ sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net); } if (bundle_at || hbflag) { /* For data/asconf and hb set time */ if (*now_filled == 0) { (void)SCTP_GETTIME_TIMEVAL(now); *now_filled = 1; } net->last_sent_time = *now; } /* Now send it, if there is anything to send :> */ if ((error = sctp_lowlevel_chunk_output(inp, stcb, net, (struct sockaddr *)&net->ro._l_addr, outchain, auth_offset, auth, auth_keyid, no_fragmentflg, bundle_at, asconf, inp->sctp_lport, stcb->rport, htonl(stcb->asoc.peer_vtag), net->port, NULL, 0, 0, so_locked))) { /* error, we could not output */ SCTPDBG(SCTP_DEBUG_OUTPUT3, "Gak send error %d\n", error); if (from_where == 0) { SCTP_STAT_INCR(sctps_lowlevelerrusr); } if (error == ENOBUFS) { asoc->ifp_had_enobuf = 1; SCTP_STAT_INCR(sctps_lowlevelerr); } if (error == EHOSTUNREACH) { /* * Destination went unreachable * during this send */ sctp_move_chunks_from_net(stcb, net); } *reason_code = 6; /*- * I add this line to be paranoid. As far as * I can tell the continue, takes us back to * the top of the for, but just to make sure * I will reset these again here. */ ctl_cnt = bundle_at = 0; continue; /* This takes us back to the * for() for the nets. */ } else { asoc->ifp_had_enobuf = 0; } endoutchain = NULL; auth = NULL; auth_offset = 0; if (!no_out_cnt) { *num_out += (ctl_cnt + bundle_at); } if (bundle_at) { /* setup for a RTO measurement */ tsns_sent = data_list[0]->rec.data.tsn; /* fill time if not already filled */ if (*now_filled == 0) { (void)SCTP_GETTIME_TIMEVAL(&asoc->time_last_sent); *now_filled = 1; *now = asoc->time_last_sent; } else { asoc->time_last_sent = *now; } if (net->rto_needed) { data_list[0]->do_rtt = 1; net->rto_needed = 0; } SCTP_STAT_INCR_BY(sctps_senddata, bundle_at); sctp_clean_up_datalist(stcb, asoc, data_list, bundle_at, net); } if (one_chunk) { break; } } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, net, tsns_sent, SCTP_CWND_LOG_FROM_SEND); } } if (old_start_at == NULL) { old_start_at = start_at; start_at = TAILQ_FIRST(&asoc->nets); if (old_start_at) goto again_one_more_time; } /* * At the end there should be no NON timed chunks hanging on this * queue. */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, net, *num_out, SCTP_CWND_LOG_FROM_SEND); } if ((*num_out == 0) && (*reason_code == 0)) { *reason_code = 4; } else { *reason_code = 5; } sctp_clean_up_ctl(stcb, asoc, so_locked); return (0); } void sctp_queue_op_err(struct sctp_tcb *stcb, struct mbuf *op_err) { /*- * Prepend a OPERATIONAL_ERROR chunk header and put on the end of * the control chunk queue. */ struct sctp_chunkhdr *hdr; struct sctp_tmit_chunk *chk; struct mbuf *mat, *last_mbuf; uint32_t chunk_length; uint16_t padding_length; SCTP_TCB_LOCK_ASSERT(stcb); SCTP_BUF_PREPEND(op_err, sizeof(struct sctp_chunkhdr), M_NOWAIT); if (op_err == NULL) { return; } last_mbuf = NULL; chunk_length = 0; for (mat = op_err; mat != NULL; mat = SCTP_BUF_NEXT(mat)) { chunk_length += SCTP_BUF_LEN(mat); if (SCTP_BUF_NEXT(mat) == NULL) { last_mbuf = mat; } } if (chunk_length > SCTP_MAX_CHUNK_LENGTH) { sctp_m_freem(op_err); return; } padding_length = chunk_length % 4; if (padding_length != 0) { padding_length = 4 - padding_length; } if (padding_length != 0) { if (sctp_add_pad_tombuf(last_mbuf, padding_length) == NULL) { sctp_m_freem(op_err); return; } } sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { /* no memory */ sctp_m_freem(op_err); return; } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_OPERATION_ERROR; chk->rec.chunk_id.can_take_data = 0; chk->flags = 0; chk->send_size = (uint16_t)chunk_length; chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->asoc = &stcb->asoc; chk->data = op_err; chk->whoTo = NULL; hdr = mtod(op_err, struct sctp_chunkhdr *); hdr->chunk_type = SCTP_OPERATION_ERROR; hdr->chunk_flags = 0; hdr->chunk_length = htons(chk->send_size); TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next); chk->asoc->ctrl_queue_cnt++; } int sctp_send_cookie_echo(struct mbuf *m, int offset, int limit, struct sctp_tcb *stcb, struct sctp_nets *net) { /*- * pull out the cookie and put it at the front of the control chunk * queue. */ int at; struct mbuf *cookie; struct sctp_paramhdr param, *phdr; struct sctp_chunkhdr *hdr; struct sctp_tmit_chunk *chk; uint16_t ptype, plen; SCTP_TCB_LOCK_ASSERT(stcb); /* First find the cookie in the param area */ cookie = NULL; at = offset + sizeof(struct sctp_init_chunk); for (;;) { phdr = sctp_get_next_param(m, at, ¶m, sizeof(param)); if (phdr == NULL) { return (-3); } ptype = ntohs(phdr->param_type); plen = ntohs(phdr->param_length); if (plen < sizeof(struct sctp_paramhdr)) { return (-6); } if (ptype == SCTP_STATE_COOKIE) { int pad; /* found the cookie */ if (at + plen > limit) { return (-7); } cookie = SCTP_M_COPYM(m, at, plen, M_NOWAIT); if (cookie == NULL) { /* No memory */ return (-2); } if ((pad = (plen % 4)) > 0) { pad = 4 - pad; } if (pad > 0) { if (sctp_pad_lastmbuf(cookie, pad, NULL) == NULL) { return (-8); } } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(cookie, SCTP_MBUF_ICOPY); } #endif break; } at += SCTP_SIZE32(plen); } /* ok, we got the cookie lets change it into a cookie echo chunk */ /* first the change from param to cookie */ hdr = mtod(cookie, struct sctp_chunkhdr *); hdr->chunk_type = SCTP_COOKIE_ECHO; hdr->chunk_flags = 0; /* get the chunk stuff now and place it in the FRONT of the queue */ sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { /* no memory */ sctp_m_freem(cookie); return (-5); } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_COOKIE_ECHO; chk->rec.chunk_id.can_take_data = 0; chk->flags = CHUNK_FLAGS_FRAGMENT_OK; chk->send_size = SCTP_SIZE32(plen); chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->asoc = &stcb->asoc; chk->data = cookie; chk->whoTo = net; atomic_add_int(&chk->whoTo->ref_count, 1); TAILQ_INSERT_HEAD(&chk->asoc->control_send_queue, chk, sctp_next); chk->asoc->ctrl_queue_cnt++; return (0); } void sctp_send_heartbeat_ack(struct sctp_tcb *stcb, struct mbuf *m, int offset, int chk_length, struct sctp_nets *net) { /* * take a HB request and make it into a HB ack and send it. */ struct mbuf *outchain; struct sctp_chunkhdr *chdr; struct sctp_tmit_chunk *chk; if (net == NULL) /* must have a net pointer */ return; outchain = SCTP_M_COPYM(m, offset, chk_length, M_NOWAIT); if (outchain == NULL) { /* gak out of memory */ return; } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(outchain, SCTP_MBUF_ICOPY); } #endif chdr = mtod(outchain, struct sctp_chunkhdr *); chdr->chunk_type = SCTP_HEARTBEAT_ACK; chdr->chunk_flags = 0; if (chk_length % 4 != 0) { sctp_pad_lastmbuf(outchain, 4 - (chk_length % 4), NULL); } sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { /* no memory */ sctp_m_freem(outchain); return; } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_HEARTBEAT_ACK; chk->rec.chunk_id.can_take_data = 1; chk->flags = 0; chk->send_size = chk_length; chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->asoc = &stcb->asoc; chk->data = outchain; chk->whoTo = net; atomic_add_int(&chk->whoTo->ref_count, 1); TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next); chk->asoc->ctrl_queue_cnt++; } void sctp_send_cookie_ack(struct sctp_tcb *stcb) { /* formulate and queue a cookie-ack back to sender */ struct mbuf *cookie_ack; struct sctp_chunkhdr *hdr; struct sctp_tmit_chunk *chk; SCTP_TCB_LOCK_ASSERT(stcb); cookie_ack = sctp_get_mbuf_for_msg(sizeof(struct sctp_chunkhdr), 0, M_NOWAIT, 1, MT_HEADER); if (cookie_ack == NULL) { /* no mbuf's */ return; } SCTP_BUF_RESV_UF(cookie_ack, SCTP_MIN_OVERHEAD); sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { /* no memory */ sctp_m_freem(cookie_ack); return; } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_COOKIE_ACK; chk->rec.chunk_id.can_take_data = 1; chk->flags = 0; chk->send_size = sizeof(struct sctp_chunkhdr); chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->asoc = &stcb->asoc; chk->data = cookie_ack; if (chk->asoc->last_control_chunk_from != NULL) { chk->whoTo = chk->asoc->last_control_chunk_from; atomic_add_int(&chk->whoTo->ref_count, 1); } else { chk->whoTo = NULL; } hdr = mtod(cookie_ack, struct sctp_chunkhdr *); hdr->chunk_type = SCTP_COOKIE_ACK; hdr->chunk_flags = 0; hdr->chunk_length = htons(chk->send_size); SCTP_BUF_LEN(cookie_ack) = chk->send_size; TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next); chk->asoc->ctrl_queue_cnt++; return; } void sctp_send_shutdown_ack(struct sctp_tcb *stcb, struct sctp_nets *net) { /* formulate and queue a SHUTDOWN-ACK back to the sender */ struct mbuf *m_shutdown_ack; struct sctp_shutdown_ack_chunk *ack_cp; struct sctp_tmit_chunk *chk; m_shutdown_ack = sctp_get_mbuf_for_msg(sizeof(struct sctp_shutdown_ack_chunk), 0, M_NOWAIT, 1, MT_HEADER); if (m_shutdown_ack == NULL) { /* no mbuf's */ return; } SCTP_BUF_RESV_UF(m_shutdown_ack, SCTP_MIN_OVERHEAD); sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { /* no memory */ sctp_m_freem(m_shutdown_ack); return; } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_SHUTDOWN_ACK; chk->rec.chunk_id.can_take_data = 1; chk->flags = 0; chk->send_size = sizeof(struct sctp_chunkhdr); chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->asoc = &stcb->asoc; chk->data = m_shutdown_ack; chk->whoTo = net; if (chk->whoTo) { atomic_add_int(&chk->whoTo->ref_count, 1); } ack_cp = mtod(m_shutdown_ack, struct sctp_shutdown_ack_chunk *); ack_cp->ch.chunk_type = SCTP_SHUTDOWN_ACK; ack_cp->ch.chunk_flags = 0; ack_cp->ch.chunk_length = htons(chk->send_size); SCTP_BUF_LEN(m_shutdown_ack) = chk->send_size; TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next); chk->asoc->ctrl_queue_cnt++; return; } void sctp_send_shutdown(struct sctp_tcb *stcb, struct sctp_nets *net) { /* formulate and queue a SHUTDOWN to the sender */ struct mbuf *m_shutdown; struct sctp_shutdown_chunk *shutdown_cp; struct sctp_tmit_chunk *chk; TAILQ_FOREACH(chk, &stcb->asoc.control_send_queue, sctp_next) { if (chk->rec.chunk_id.id == SCTP_SHUTDOWN) { /* We already have a SHUTDOWN queued. Reuse it. */ if (chk->whoTo) { sctp_free_remote_addr(chk->whoTo); chk->whoTo = NULL; } break; } } if (chk == NULL) { m_shutdown = sctp_get_mbuf_for_msg(sizeof(struct sctp_shutdown_chunk), 0, M_NOWAIT, 1, MT_HEADER); if (m_shutdown == NULL) { /* no mbuf's */ return; } SCTP_BUF_RESV_UF(m_shutdown, SCTP_MIN_OVERHEAD); sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { /* no memory */ sctp_m_freem(m_shutdown); return; } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_SHUTDOWN; chk->rec.chunk_id.can_take_data = 1; chk->flags = 0; chk->send_size = sizeof(struct sctp_shutdown_chunk); chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->asoc = &stcb->asoc; chk->data = m_shutdown; chk->whoTo = net; if (chk->whoTo) { atomic_add_int(&chk->whoTo->ref_count, 1); } shutdown_cp = mtod(m_shutdown, struct sctp_shutdown_chunk *); shutdown_cp->ch.chunk_type = SCTP_SHUTDOWN; shutdown_cp->ch.chunk_flags = 0; shutdown_cp->ch.chunk_length = htons(chk->send_size); shutdown_cp->cumulative_tsn_ack = htonl(stcb->asoc.cumulative_tsn); SCTP_BUF_LEN(m_shutdown) = chk->send_size; TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next); chk->asoc->ctrl_queue_cnt++; } else { TAILQ_REMOVE(&stcb->asoc.control_send_queue, chk, sctp_next); chk->whoTo = net; if (chk->whoTo) { atomic_add_int(&chk->whoTo->ref_count, 1); } shutdown_cp = mtod(chk->data, struct sctp_shutdown_chunk *); shutdown_cp->cumulative_tsn_ack = htonl(stcb->asoc.cumulative_tsn); TAILQ_INSERT_TAIL(&stcb->asoc.control_send_queue, chk, sctp_next); } return; } void sctp_send_asconf(struct sctp_tcb *stcb, struct sctp_nets *net, int addr_locked) { /* * formulate and queue an ASCONF to the peer. ASCONF parameters * should be queued on the assoc queue. */ struct sctp_tmit_chunk *chk; struct mbuf *m_asconf; int len; SCTP_TCB_LOCK_ASSERT(stcb); if ((!TAILQ_EMPTY(&stcb->asoc.asconf_send_queue)) && (!sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_MULTIPLE_ASCONFS))) { /* can't send a new one if there is one in flight already */ return; } /* compose an ASCONF chunk, maximum length is PMTU */ m_asconf = sctp_compose_asconf(stcb, &len, addr_locked); if (m_asconf == NULL) { return; } sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { /* no memory */ sctp_m_freem(m_asconf); return; } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_ASCONF; chk->rec.chunk_id.can_take_data = 0; chk->flags = CHUNK_FLAGS_FRAGMENT_OK; chk->data = m_asconf; chk->send_size = len; chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->asoc = &stcb->asoc; chk->whoTo = net; if (chk->whoTo) { atomic_add_int(&chk->whoTo->ref_count, 1); } TAILQ_INSERT_TAIL(&chk->asoc->asconf_send_queue, chk, sctp_next); chk->asoc->ctrl_queue_cnt++; return; } void sctp_send_asconf_ack(struct sctp_tcb *stcb) { /* * formulate and queue a asconf-ack back to sender. the asconf-ack * must be stored in the tcb. */ struct sctp_tmit_chunk *chk; struct sctp_asconf_ack *ack, *latest_ack; struct mbuf *m_ack; struct sctp_nets *net = NULL; SCTP_TCB_LOCK_ASSERT(stcb); /* Get the latest ASCONF-ACK */ latest_ack = TAILQ_LAST(&stcb->asoc.asconf_ack_sent, sctp_asconf_ackhead); if (latest_ack == NULL) { return; } if (latest_ack->last_sent_to != NULL && latest_ack->last_sent_to == stcb->asoc.last_control_chunk_from) { /* we're doing a retransmission */ net = sctp_find_alternate_net(stcb, stcb->asoc.last_control_chunk_from, 0); if (net == NULL) { /* no alternate */ if (stcb->asoc.last_control_chunk_from == NULL) { if (stcb->asoc.alternate) { net = stcb->asoc.alternate; } else { net = stcb->asoc.primary_destination; } } else { net = stcb->asoc.last_control_chunk_from; } } } else { /* normal case */ if (stcb->asoc.last_control_chunk_from == NULL) { if (stcb->asoc.alternate) { net = stcb->asoc.alternate; } else { net = stcb->asoc.primary_destination; } } else { net = stcb->asoc.last_control_chunk_from; } } latest_ack->last_sent_to = net; TAILQ_FOREACH(ack, &stcb->asoc.asconf_ack_sent, next) { if (ack->data == NULL) { continue; } /* copy the asconf_ack */ m_ack = SCTP_M_COPYM(ack->data, 0, M_COPYALL, M_NOWAIT); if (m_ack == NULL) { /* couldn't copy it */ return; } #ifdef SCTP_MBUF_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mbc(m_ack, SCTP_MBUF_ICOPY); } #endif sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { /* no memory */ if (m_ack) sctp_m_freem(m_ack); return; } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_ASCONF_ACK; chk->rec.chunk_id.can_take_data = 1; chk->flags = CHUNK_FLAGS_FRAGMENT_OK; chk->whoTo = net; if (chk->whoTo) { atomic_add_int(&chk->whoTo->ref_count, 1); } chk->data = m_ack; chk->send_size = ack->len; chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->asoc = &stcb->asoc; TAILQ_INSERT_TAIL(&chk->asoc->control_send_queue, chk, sctp_next); chk->asoc->ctrl_queue_cnt++; } return; } static int sctp_chunk_retransmission(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_association *asoc, int *cnt_out, struct timeval *now, int *now_filled, int *fr_done, int so_locked) { /*- * send out one MTU of retransmission. If fast_retransmit is * happening we ignore the cwnd. Otherwise we obey the cwnd and * rwnd. For a Cookie or Asconf in the control chunk queue we * retransmit them by themselves. * * For data chunks we will pick out the lowest TSN's in the sent_queue * marked for resend and bundle them all together (up to a MTU of * destination). The address to send to should have been * selected/changed where the retransmission was marked (i.e. in FR * or t3-timeout routines). */ struct sctp_tmit_chunk *data_list[SCTP_MAX_DATA_BUNDLING]; struct sctp_tmit_chunk *chk, *fwd; struct mbuf *m, *endofchain; struct sctp_nets *net = NULL; uint32_t tsns_sent = 0; int no_fragmentflg, bundle_at, cnt_thru; unsigned int mtu; int error, i, one_chunk, fwd_tsn, ctl_cnt, tmr_started; struct sctp_auth_chunk *auth = NULL; uint32_t auth_offset = 0; uint16_t auth_keyid; int override_ok = 1; int data_auth_reqd = 0; uint32_t dmtu = 0; SCTP_TCB_LOCK_ASSERT(stcb); tmr_started = ctl_cnt = bundle_at = error = 0; no_fragmentflg = 1; fwd_tsn = 0; *cnt_out = 0; fwd = NULL; endofchain = m = NULL; auth_keyid = stcb->asoc.authinfo.active_keyid; #ifdef SCTP_AUDITING_ENABLED sctp_audit_log(0xC3, 1); #endif if ((TAILQ_EMPTY(&asoc->sent_queue)) && (TAILQ_EMPTY(&asoc->control_send_queue))) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "SCTP hits empty queue with cnt set to %d?\n", asoc->sent_queue_retran_cnt); asoc->sent_queue_cnt = 0; asoc->sent_queue_cnt_removeable = 0; /* send back 0/0 so we enter normal transmission */ *cnt_out = 0; return (0); } TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) { if ((chk->rec.chunk_id.id == SCTP_COOKIE_ECHO) || (chk->rec.chunk_id.id == SCTP_STREAM_RESET) || (chk->rec.chunk_id.id == SCTP_FORWARD_CUM_TSN)) { if (chk->sent != SCTP_DATAGRAM_RESEND) { continue; } if (chk->rec.chunk_id.id == SCTP_STREAM_RESET) { if (chk != asoc->str_reset) { /* * not eligible for retran if its * not ours */ continue; } } ctl_cnt++; if (chk->rec.chunk_id.id == SCTP_FORWARD_CUM_TSN) { fwd_tsn = 1; } /* * Add an AUTH chunk, if chunk requires it save the * offset into the chain for AUTH */ if ((auth == NULL) && (sctp_auth_is_required_chunk(chk->rec.chunk_id.id, stcb->asoc.peer_auth_chunks))) { m = sctp_add_auth_chunk(m, &endofchain, &auth, &auth_offset, stcb, chk->rec.chunk_id.id); SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); } m = sctp_copy_mbufchain(chk->data, m, &endofchain, 0, chk->send_size, chk->copy_by_ref); break; } } one_chunk = 0; cnt_thru = 0; /* do we have control chunks to retransmit? */ if (m != NULL) { /* Start a timer no matter if we succeed or fail */ if (chk->rec.chunk_id.id == SCTP_COOKIE_ECHO) { sctp_timer_start(SCTP_TIMER_TYPE_COOKIE, inp, stcb, chk->whoTo); } else if (chk->rec.chunk_id.id == SCTP_ASCONF) sctp_timer_start(SCTP_TIMER_TYPE_ASCONF, inp, stcb, chk->whoTo); chk->snd_count++; /* update our count */ if ((error = sctp_lowlevel_chunk_output(inp, stcb, chk->whoTo, (struct sockaddr *)&chk->whoTo->ro._l_addr, m, auth_offset, auth, stcb->asoc.authinfo.active_keyid, no_fragmentflg, 0, 0, inp->sctp_lport, stcb->rport, htonl(stcb->asoc.peer_vtag), chk->whoTo->port, NULL, 0, 0, so_locked))) { SCTPDBG(SCTP_DEBUG_OUTPUT3, "Gak send error %d\n", error); if (error == ENOBUFS) { asoc->ifp_had_enobuf = 1; SCTP_STAT_INCR(sctps_lowlevelerr); } return (error); } else { asoc->ifp_had_enobuf = 0; } endofchain = NULL; auth = NULL; auth_offset = 0; /* * We don't want to mark the net->sent time here since this * we use this for HB and retrans cannot measure RTT */ /* (void)SCTP_GETTIME_TIMEVAL(&chk->whoTo->last_sent_time); */ *cnt_out += 1; chk->sent = SCTP_DATAGRAM_SENT; sctp_ucount_decr(stcb->asoc.sent_queue_retran_cnt); if (fwd_tsn == 0) { return (0); } else { /* Clean up the fwd-tsn list */ sctp_clean_up_ctl(stcb, asoc, so_locked); return (0); } } /* * Ok, it is just data retransmission we need to do or that and a * fwd-tsn with it all. */ if (TAILQ_EMPTY(&asoc->sent_queue)) { return (SCTP_RETRAN_DONE); } if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED) || (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT)) { /* not yet open, resend the cookie and that is it */ return (1); } #ifdef SCTP_AUDITING_ENABLED sctp_auditing(20, inp, stcb, NULL); #endif data_auth_reqd = sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks); TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) { if (chk->sent != SCTP_DATAGRAM_RESEND) { /* No, not sent to this net or not ready for rtx */ continue; } if (chk->data == NULL) { SCTP_PRINTF("TSN:%x chk->snd_count:%d chk->sent:%d can't retran - no data\n", chk->rec.data.tsn, chk->snd_count, chk->sent); continue; } if ((SCTP_BASE_SYSCTL(sctp_max_retran_chunk)) && (chk->snd_count >= SCTP_BASE_SYSCTL(sctp_max_retran_chunk))) { struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; SCTP_SNPRINTF(msg, sizeof(msg), "TSN %8.8x retransmitted %d times, giving up", chk->rec.data.tsn, chk->snd_count); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); atomic_add_int(&stcb->asoc.refcnt, 1); sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, so_locked); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); return (SCTP_RETRAN_EXIT); } /* pick up the net */ net = chk->whoTo; switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: mtu = net->mtu - SCTP_MIN_V4_OVERHEAD; break; #endif #ifdef INET6 case AF_INET6: mtu = net->mtu - SCTP_MIN_OVERHEAD; break; #endif default: /* TSNH */ mtu = net->mtu; break; } if ((asoc->peers_rwnd < mtu) && (asoc->total_flight > 0)) { /* No room in peers rwnd */ uint32_t tsn; tsn = asoc->last_acked_seq + 1; if (tsn == chk->rec.data.tsn) { /* * we make a special exception for this * case. The peer has no rwnd but is missing * the lowest chunk.. which is probably what * is holding up the rwnd. */ goto one_chunk_around; } return (1); } one_chunk_around: if (asoc->peers_rwnd < mtu) { one_chunk = 1; if ((asoc->peers_rwnd == 0) && (asoc->total_flight == 0)) { chk->window_probe = 1; chk->whoTo->window_probe = 1; } } #ifdef SCTP_AUDITING_ENABLED sctp_audit_log(0xC3, 2); #endif bundle_at = 0; m = NULL; net->fast_retran_ip = 0; if (chk->rec.data.doing_fast_retransmit == 0) { /* * if no FR in progress skip destination that have * flight_size > cwnd. */ if (net->flight_size >= net->cwnd) { continue; } } else { /* * Mark the destination net to have FR recovery * limits put on it. */ *fr_done = 1; net->fast_retran_ip = 1; } /* * if no AUTH is yet included and this chunk requires it, * make sure to account for it. We don't apply the size * until the AUTH chunk is actually added below in case * there is no room for this chunk. */ if (data_auth_reqd && (auth == NULL)) { dmtu = sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id); } else dmtu = 0; if ((chk->send_size <= (mtu - dmtu)) || (chk->flags & CHUNK_FLAGS_FRAGMENT_OK)) { /* ok we will add this one */ if (data_auth_reqd) { if (auth == NULL) { m = sctp_add_auth_chunk(m, &endofchain, &auth, &auth_offset, stcb, SCTP_DATA); auth_keyid = chk->auth_keyid; override_ok = 0; SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); } else if (override_ok) { auth_keyid = chk->auth_keyid; override_ok = 0; } else if (chk->auth_keyid != auth_keyid) { /* different keyid, so done bundling */ break; } } m = sctp_copy_mbufchain(chk->data, m, &endofchain, 0, chk->send_size, chk->copy_by_ref); if (m == NULL) { SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } /* Do clear IP_DF ? */ if (chk->flags & CHUNK_FLAGS_FRAGMENT_OK) { no_fragmentflg = 0; } /* upate our MTU size */ if (mtu > (chk->send_size + dmtu)) mtu -= (chk->send_size + dmtu); else mtu = 0; data_list[bundle_at++] = chk; if (one_chunk && (asoc->total_flight <= 0)) { SCTP_STAT_INCR(sctps_windowprobed); } } if (one_chunk == 0) { /* * now are there anymore forward from chk to pick * up? */ for (fwd = TAILQ_NEXT(chk, sctp_next); fwd != NULL; fwd = TAILQ_NEXT(fwd, sctp_next)) { if (fwd->sent != SCTP_DATAGRAM_RESEND) { /* Nope, not for retran */ continue; } if (fwd->whoTo != net) { /* Nope, not the net in question */ continue; } if (data_auth_reqd && (auth == NULL)) { dmtu = sctp_get_auth_chunk_len(stcb->asoc.peer_hmac_id); } else dmtu = 0; if (fwd->send_size <= (mtu - dmtu)) { if (data_auth_reqd) { if (auth == NULL) { m = sctp_add_auth_chunk(m, &endofchain, &auth, &auth_offset, stcb, SCTP_DATA); auth_keyid = fwd->auth_keyid; override_ok = 0; SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); } else if (override_ok) { auth_keyid = fwd->auth_keyid; override_ok = 0; } else if (fwd->auth_keyid != auth_keyid) { /* * different keyid, * so done bundling */ break; } } m = sctp_copy_mbufchain(fwd->data, m, &endofchain, 0, fwd->send_size, fwd->copy_by_ref); if (m == NULL) { SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } /* Do clear IP_DF ? */ if (fwd->flags & CHUNK_FLAGS_FRAGMENT_OK) { no_fragmentflg = 0; } /* upate our MTU size */ if (mtu > (fwd->send_size + dmtu)) mtu -= (fwd->send_size + dmtu); else mtu = 0; data_list[bundle_at++] = fwd; if (bundle_at >= SCTP_MAX_DATA_BUNDLING) { break; } } else { /* can't fit so we are done */ break; } } } /* Is there something to send for this destination? */ if (m) { /* * No matter if we fail/or succeed we should start a * timer. A failure is like a lost IP packet :-) */ if (!SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) { /* * no timer running on this destination * restart it. */ sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net); tmr_started = 1; } /* Now lets send it, if there is anything to send :> */ if ((error = sctp_lowlevel_chunk_output(inp, stcb, net, (struct sockaddr *)&net->ro._l_addr, m, auth_offset, auth, auth_keyid, no_fragmentflg, 0, 0, inp->sctp_lport, stcb->rport, htonl(stcb->asoc.peer_vtag), net->port, NULL, 0, 0, so_locked))) { /* error, we could not output */ SCTPDBG(SCTP_DEBUG_OUTPUT3, "Gak send error %d\n", error); if (error == ENOBUFS) { asoc->ifp_had_enobuf = 1; SCTP_STAT_INCR(sctps_lowlevelerr); } return (error); } else { asoc->ifp_had_enobuf = 0; } endofchain = NULL; auth = NULL; auth_offset = 0; /* For HB's */ /* * We don't want to mark the net->sent time here * since this we use this for HB and retrans cannot * measure RTT */ /* (void)SCTP_GETTIME_TIMEVAL(&net->last_sent_time); */ /* For auto-close */ cnt_thru++; if (*now_filled == 0) { (void)SCTP_GETTIME_TIMEVAL(&asoc->time_last_sent); *now = asoc->time_last_sent; *now_filled = 1; } else { asoc->time_last_sent = *now; } *cnt_out += bundle_at; #ifdef SCTP_AUDITING_ENABLED sctp_audit_log(0xC4, bundle_at); #endif if (bundle_at) { tsns_sent = data_list[0]->rec.data.tsn; } for (i = 0; i < bundle_at; i++) { SCTP_STAT_INCR(sctps_sendretransdata); data_list[i]->sent = SCTP_DATAGRAM_SENT; /* * When we have a revoked data, and we * retransmit it, then we clear the revoked * flag since this flag dictates if we * subtracted from the fs */ if (data_list[i]->rec.data.chunk_was_revoked) { /* Deflate the cwnd */ data_list[i]->whoTo->cwnd -= data_list[i]->book_size; data_list[i]->rec.data.chunk_was_revoked = 0; } data_list[i]->snd_count++; sctp_ucount_decr(asoc->sent_queue_retran_cnt); /* record the time */ data_list[i]->sent_rcv_time = asoc->time_last_sent; if (data_list[i]->book_size_scale) { /* * need to double the book size on * this one */ data_list[i]->book_size_scale = 0; /* * Since we double the booksize, we * must also double the output queue * size, since this get shrunk when * we free by this amount. */ atomic_add_int(&((asoc)->total_output_queue_size), data_list[i]->book_size); data_list[i]->book_size *= 2; } else { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_RWND_ENABLE) { sctp_log_rwnd(SCTP_DECREASE_PEER_RWND, asoc->peers_rwnd, data_list[i]->send_size, SCTP_BASE_SYSCTL(sctp_peer_chunk_oh)); } asoc->peers_rwnd = sctp_sbspace_sub(asoc->peers_rwnd, (uint32_t)(data_list[i]->send_size + SCTP_BASE_SYSCTL(sctp_peer_chunk_oh))); } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) { sctp_misc_ints(SCTP_FLIGHT_LOG_UP_RSND, data_list[i]->whoTo->flight_size, data_list[i]->book_size, (uint32_t)(uintptr_t)data_list[i]->whoTo, data_list[i]->rec.data.tsn); } sctp_flight_size_increase(data_list[i]); sctp_total_flight_increase(stcb, data_list[i]); if (asoc->peers_rwnd < stcb->sctp_ep->sctp_ep.sctp_sws_sender) { /* SWS sender side engages */ asoc->peers_rwnd = 0; } if ((i == 0) && (data_list[i]->rec.data.doing_fast_retransmit)) { SCTP_STAT_INCR(sctps_sendfastretrans); if ((data_list[i] == TAILQ_FIRST(&asoc->sent_queue)) && (tmr_started == 0)) { /*- * ok we just fast-retrans'd * the lowest TSN, i.e the * first on the list. In * this case we want to give * some more time to get a * SACK back without a * t3-expiring. */ sctp_timer_stop(SCTP_TIMER_TYPE_SEND, inp, stcb, net, SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_2); sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, net); } } } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, net, tsns_sent, SCTP_CWND_LOG_FROM_RESEND); } #ifdef SCTP_AUDITING_ENABLED sctp_auditing(21, inp, stcb, NULL); #endif } else { /* None will fit */ return (1); } if (asoc->sent_queue_retran_cnt <= 0) { /* all done we have no more to retran */ asoc->sent_queue_retran_cnt = 0; break; } if (one_chunk) { /* No more room in rwnd */ return (1); } /* stop the for loop here. we sent out a packet */ break; } return (0); } static void sctp_timer_validation(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_association *asoc) { struct sctp_nets *net; /* Validate that a timer is running somewhere */ TAILQ_FOREACH(net, &asoc->nets, sctp_next) { if (SCTP_OS_TIMER_PENDING(&net->rxt_timer.timer)) { /* Here is a timer */ return; } } SCTP_TCB_LOCK_ASSERT(stcb); /* Gak, we did not have a timer somewhere */ SCTPDBG(SCTP_DEBUG_OUTPUT3, "Deadlock avoided starting timer on a dest at retran\n"); if (asoc->alternate) { sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, asoc->alternate); } else { sctp_timer_start(SCTP_TIMER_TYPE_SEND, inp, stcb, asoc->primary_destination); } return; } void sctp_chunk_output(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_where, int so_locked) { /*- * Ok this is the generic chunk service queue. we must do the * following: * - See if there are retransmits pending, if so we must * do these first. * - Service the stream queue that is next, moving any * message (note I must get a complete message i.e. * FIRST/MIDDLE and LAST to the out queue in one pass) and assigning * TSN's * - Check to see if the cwnd/rwnd allows any output, if so we * go ahead and fomulate and send the low level chunks. Making sure * to combine any control in the control chunk queue also. */ struct sctp_association *asoc; struct sctp_nets *net; int error = 0, num_out, tot_out = 0, ret = 0, reason_code; unsigned int burst_cnt = 0; struct timeval now; int now_filled = 0; int nagle_on; int frag_point = sctp_get_frag_point(stcb, &stcb->asoc); int un_sent = 0; int fr_done; unsigned int tot_frs = 0; asoc = &stcb->asoc; do_it_again: /* The Nagle algorithm is only applied when handling a send call. */ if (from_where == SCTP_OUTPUT_FROM_USR_SEND) { if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NODELAY)) { nagle_on = 0; } else { nagle_on = 1; } } else { nagle_on = 0; } SCTP_TCB_LOCK_ASSERT(stcb); un_sent = (stcb->asoc.total_output_queue_size - stcb->asoc.total_flight); if ((un_sent <= 0) && (TAILQ_EMPTY(&asoc->control_send_queue)) && (TAILQ_EMPTY(&asoc->asconf_send_queue)) && (asoc->sent_queue_retran_cnt == 0) && (asoc->trigger_reset == 0)) { /* Nothing to do unless there is something to be sent left */ return; } /* * Do we have something to send, data or control AND a sack timer * running, if so piggy-back the sack. */ if (SCTP_OS_TIMER_PENDING(&stcb->asoc.dack_timer.timer)) { sctp_send_sack(stcb, so_locked); sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_3); } while (asoc->sent_queue_retran_cnt) { /*- * Ok, it is retransmission time only, we send out only ONE * packet with a single call off to the retran code. */ if (from_where == SCTP_OUTPUT_FROM_COOKIE_ACK) { /*- * Special hook for handling cookiess discarded * by peer that carried data. Send cookie-ack only * and then the next call with get the retran's. */ (void)sctp_med_chunk_output(inp, stcb, asoc, &num_out, &reason_code, 1, from_where, &now, &now_filled, frag_point, so_locked); return; } else if (from_where != SCTP_OUTPUT_FROM_HB_TMR) { /* if its not from a HB then do it */ fr_done = 0; ret = sctp_chunk_retransmission(inp, stcb, asoc, &num_out, &now, &now_filled, &fr_done, so_locked); if (fr_done) { tot_frs++; } } else { /* * its from any other place, we don't allow retran * output (only control) */ ret = 1; } if (ret > 0) { /* Can't send anymore */ /*- * now lets push out control by calling med-level * output once. this assures that we WILL send HB's * if queued too. */ (void)sctp_med_chunk_output(inp, stcb, asoc, &num_out, &reason_code, 1, from_where, &now, &now_filled, frag_point, so_locked); #ifdef SCTP_AUDITING_ENABLED sctp_auditing(8, inp, stcb, NULL); #endif sctp_timer_validation(inp, stcb, asoc); return; } if (ret < 0) { /*- * The count was off.. retran is not happening so do * the normal retransmission. */ #ifdef SCTP_AUDITING_ENABLED sctp_auditing(9, inp, stcb, NULL); #endif if (ret == SCTP_RETRAN_EXIT) { return; } break; } if (from_where == SCTP_OUTPUT_FROM_T3) { /* Only one transmission allowed out of a timeout */ #ifdef SCTP_AUDITING_ENABLED sctp_auditing(10, inp, stcb, NULL); #endif /* Push out any control */ (void)sctp_med_chunk_output(inp, stcb, asoc, &num_out, &reason_code, 1, from_where, &now, &now_filled, frag_point, so_locked); return; } if ((asoc->fr_max_burst > 0) && (tot_frs >= asoc->fr_max_burst)) { /* Hit FR burst limit */ return; } if ((num_out == 0) && (ret == 0)) { /* No more retrans to send */ break; } } #ifdef SCTP_AUDITING_ENABLED sctp_auditing(12, inp, stcb, NULL); #endif /* Check for bad destinations, if they exist move chunks around. */ TAILQ_FOREACH(net, &asoc->nets, sctp_next) { if (!(net->dest_state & SCTP_ADDR_REACHABLE)) { /*- * if possible move things off of this address we * still may send below due to the dormant state but * we try to find an alternate address to send to * and if we have one we move all queued data on the * out wheel to this alternate address. */ if (net->ref_count > 1) sctp_move_chunks_from_net(stcb, net); } else { /*- * if ((asoc->sat_network) || (net->addr_is_local)) * { burst_limit = asoc->max_burst * * SCTP_SAT_NETWORK_BURST_INCR; } */ if (asoc->max_burst > 0) { if (SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst)) { if ((net->flight_size + (asoc->max_burst * net->mtu)) < net->cwnd) { /* * JRS - Use the congestion * control given in the * congestion control module */ asoc->cc_functions.sctp_cwnd_update_after_output(stcb, net, asoc->max_burst); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_MAXBURST_ENABLE) { sctp_log_maxburst(stcb, net, 0, asoc->max_burst, SCTP_MAX_BURST_APPLIED); } SCTP_STAT_INCR(sctps_maxburstqueued); } net->fast_retran_ip = 0; } else { if (net->flight_size == 0) { /* * Should be decaying the * cwnd here */ ; } } } } } burst_cnt = 0; do { error = sctp_med_chunk_output(inp, stcb, asoc, &num_out, &reason_code, 0, from_where, &now, &now_filled, frag_point, so_locked); if (error) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "Error %d was returned from med-c-op\n", error); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_MAXBURST_ENABLE) { sctp_log_maxburst(stcb, asoc->primary_destination, error, burst_cnt, SCTP_MAX_BURST_ERROR_STOP); } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, NULL, error, SCTP_SEND_NOW_COMPLETES); sctp_log_cwnd(stcb, NULL, 0xdeadbeef, SCTP_SEND_NOW_COMPLETES); } break; } SCTPDBG(SCTP_DEBUG_OUTPUT3, "m-c-o put out %d\n", num_out); tot_out += num_out; burst_cnt++; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, NULL, num_out, SCTP_SEND_NOW_COMPLETES); if (num_out == 0) { sctp_log_cwnd(stcb, NULL, reason_code, SCTP_SEND_NOW_COMPLETES); } } if (nagle_on) { /* * When the Nagle algorithm is used, look at how * much is unsent, then if its smaller than an MTU * and we have data in flight we stop, except if we * are handling a fragmented user message. */ un_sent = stcb->asoc.total_output_queue_size - stcb->asoc.total_flight; if ((un_sent < (int)(stcb->asoc.smallest_mtu - SCTP_MIN_OVERHEAD)) && (stcb->asoc.total_flight > 0)) { /* && sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR))) {*/ break; } } if (TAILQ_EMPTY(&asoc->control_send_queue) && TAILQ_EMPTY(&asoc->send_queue) && sctp_is_there_unsent_data(stcb, so_locked) == 0) { /* Nothing left to send */ break; } if ((stcb->asoc.total_output_queue_size - stcb->asoc.total_flight) <= 0) { /* Nothing left to send */ break; } } while (num_out && ((asoc->max_burst == 0) || SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst) || (burst_cnt < asoc->max_burst))); if (SCTP_BASE_SYSCTL(sctp_use_cwnd_based_maxburst) == 0) { if ((asoc->max_burst > 0) && (burst_cnt >= asoc->max_burst)) { SCTP_STAT_INCR(sctps_maxburstqueued); asoc->burst_limit_applied = 1; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_MAXBURST_ENABLE) { sctp_log_maxburst(stcb, asoc->primary_destination, 0, burst_cnt, SCTP_MAX_BURST_APPLIED); } } else { asoc->burst_limit_applied = 0; } } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_CWND_LOGGING_ENABLE) { sctp_log_cwnd(stcb, NULL, tot_out, SCTP_SEND_NOW_COMPLETES); } SCTPDBG(SCTP_DEBUG_OUTPUT1, "Ok, we have put out %d chunks\n", tot_out); /*- * Now we need to clean up the control chunk chain if a ECNE is on * it. It must be marked as UNSENT again so next call will continue * to send it until such time that we get a CWR, to remove it. */ if (stcb->asoc.ecn_echo_cnt_onq) sctp_fix_ecn_echo(asoc); if (stcb->asoc.trigger_reset) { if (sctp_send_stream_reset_out_if_possible(stcb, so_locked) == 0) { goto do_it_again; } } return; } int sctp_output( struct sctp_inpcb *inp, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *p, int flags) { if (inp == NULL) { SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL); return (EINVAL); } if (inp->sctp_socket == NULL) { SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL); return (EINVAL); } return (sctp_sosend(inp->sctp_socket, addr, (struct uio *)NULL, m, control, flags, p )); } void send_forward_tsn(struct sctp_tcb *stcb, struct sctp_association *asoc) { struct sctp_tmit_chunk *chk, *at, *tp1, *last; struct sctp_forward_tsn_chunk *fwdtsn; struct sctp_strseq *strseq; struct sctp_strseq_mid *strseq_m; uint32_t advance_peer_ack_point; unsigned int cnt_of_space, i, ovh; unsigned int space_needed; unsigned int cnt_of_skipped = 0; SCTP_TCB_LOCK_ASSERT(stcb); TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) { if (chk->rec.chunk_id.id == SCTP_FORWARD_CUM_TSN) { /* mark it to unsent */ chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; /* Do we correct its output location? */ if (chk->whoTo) { sctp_free_remote_addr(chk->whoTo); chk->whoTo = NULL; } goto sctp_fill_in_rest; } } /* Ok if we reach here we must build one */ sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { return; } asoc->fwd_tsn_cnt++; chk->copy_by_ref = 0; /* * We don't do the old thing here since this is used not for on-wire * but to tell if we are sending a fwd-tsn by the stack during * output. And if its a IFORWARD or a FORWARD it is a fwd-tsn. */ chk->rec.chunk_id.id = SCTP_FORWARD_CUM_TSN; chk->rec.chunk_id.can_take_data = 0; chk->flags = 0; chk->asoc = asoc; chk->whoTo = NULL; chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (chk->data == NULL) { sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); return; } SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; TAILQ_INSERT_TAIL(&asoc->control_send_queue, chk, sctp_next); asoc->ctrl_queue_cnt++; sctp_fill_in_rest: /*- * Here we go through and fill out the part that deals with * stream/seq of the ones we skip. */ SCTP_BUF_LEN(chk->data) = 0; TAILQ_FOREACH(at, &asoc->sent_queue, sctp_next) { if ((at->sent != SCTP_FORWARD_TSN_SKIP) && (at->sent != SCTP_DATAGRAM_NR_ACKED)) { /* no more to look at */ break; } if (!asoc->idata_supported && (at->rec.data.rcv_flags & SCTP_DATA_UNORDERED)) { /* We don't report these */ continue; } cnt_of_skipped++; } if (asoc->idata_supported) { space_needed = (sizeof(struct sctp_forward_tsn_chunk) + (cnt_of_skipped * sizeof(struct sctp_strseq_mid))); } else { space_needed = (sizeof(struct sctp_forward_tsn_chunk) + (cnt_of_skipped * sizeof(struct sctp_strseq))); } cnt_of_space = (unsigned int)M_TRAILINGSPACE(chk->data); if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { ovh = SCTP_MIN_OVERHEAD; } else { ovh = SCTP_MIN_V4_OVERHEAD; } if (cnt_of_space > (asoc->smallest_mtu - ovh)) { /* trim to a mtu size */ cnt_of_space = asoc->smallest_mtu - ovh; } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) { sctp_misc_ints(SCTP_FWD_TSN_CHECK, 0xff, 0, cnt_of_skipped, asoc->advanced_peer_ack_point); } advance_peer_ack_point = asoc->advanced_peer_ack_point; if (cnt_of_space < space_needed) { /*- * ok we must trim down the chunk by lowering the * advance peer ack point. */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) { sctp_misc_ints(SCTP_FWD_TSN_CHECK, 0xff, 0xff, cnt_of_space, space_needed); } cnt_of_skipped = cnt_of_space - sizeof(struct sctp_forward_tsn_chunk); if (asoc->idata_supported) { cnt_of_skipped /= sizeof(struct sctp_strseq_mid); } else { cnt_of_skipped /= sizeof(struct sctp_strseq); } /*- * Go through and find the TSN that will be the one * we report. */ at = TAILQ_FIRST(&asoc->sent_queue); if (at != NULL) { for (i = 0; i < cnt_of_skipped; i++) { tp1 = TAILQ_NEXT(at, sctp_next); if (tp1 == NULL) { break; } at = tp1; } } if (at && SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOG_TRY_ADVANCE) { sctp_misc_ints(SCTP_FWD_TSN_CHECK, 0xff, cnt_of_skipped, at->rec.data.tsn, asoc->advanced_peer_ack_point); } last = at; /*- * last now points to last one I can report, update * peer ack point */ if (last) { advance_peer_ack_point = last->rec.data.tsn; } if (asoc->idata_supported) { space_needed = sizeof(struct sctp_forward_tsn_chunk) + cnt_of_skipped * sizeof(struct sctp_strseq_mid); } else { space_needed = sizeof(struct sctp_forward_tsn_chunk) + cnt_of_skipped * sizeof(struct sctp_strseq); } } chk->send_size = space_needed; /* Setup the chunk */ fwdtsn = mtod(chk->data, struct sctp_forward_tsn_chunk *); fwdtsn->ch.chunk_length = htons(chk->send_size); fwdtsn->ch.chunk_flags = 0; if (asoc->idata_supported) { fwdtsn->ch.chunk_type = SCTP_IFORWARD_CUM_TSN; } else { fwdtsn->ch.chunk_type = SCTP_FORWARD_CUM_TSN; } fwdtsn->new_cumulative_tsn = htonl(advance_peer_ack_point); SCTP_BUF_LEN(chk->data) = chk->send_size; fwdtsn++; /*- * Move pointer to after the fwdtsn and transfer to the * strseq pointer. */ if (asoc->idata_supported) { strseq_m = (struct sctp_strseq_mid *)fwdtsn; strseq = NULL; } else { strseq = (struct sctp_strseq *)fwdtsn; strseq_m = NULL; } /*- * Now populate the strseq list. This is done blindly * without pulling out duplicate stream info. This is * inefficent but won't harm the process since the peer will * look at these in sequence and will thus release anything. * It could mean we exceed the PMTU and chop off some that * we could have included.. but this is unlikely (aka 1432/4 * would mean 300+ stream seq's would have to be reported in * one FWD-TSN. With a bit of work we can later FIX this to * optimize and pull out duplicates.. but it does add more * overhead. So for now... not! */ i = 0; TAILQ_FOREACH(at, &asoc->sent_queue, sctp_next) { if (i >= cnt_of_skipped) { break; } if (!asoc->idata_supported && (at->rec.data.rcv_flags & SCTP_DATA_UNORDERED)) { /* We don't report these */ continue; } if (at->rec.data.tsn == advance_peer_ack_point) { at->rec.data.fwd_tsn_cnt = 0; } if (asoc->idata_supported) { strseq_m->sid = htons(at->rec.data.sid); if (at->rec.data.rcv_flags & SCTP_DATA_UNORDERED) { strseq_m->flags = htons(PR_SCTP_UNORDERED_FLAG); } else { strseq_m->flags = 0; } strseq_m->mid = htonl(at->rec.data.mid); strseq_m++; } else { strseq->sid = htons(at->rec.data.sid); strseq->ssn = htons((uint16_t)at->rec.data.mid); strseq++; } i++; } return; } void sctp_send_sack(struct sctp_tcb *stcb, int so_locked) { /*- * Queue up a SACK or NR-SACK in the control queue. * We must first check to see if a SACK or NR-SACK is * somehow on the control queue. * If so, we will take and and remove the old one. */ struct sctp_association *asoc; struct sctp_tmit_chunk *chk, *a_chk; struct sctp_sack_chunk *sack; struct sctp_nr_sack_chunk *nr_sack; struct sctp_gap_ack_block *gap_descriptor; const struct sack_track *selector; int mergeable = 0; int offset; caddr_t limit; uint32_t *dup; int limit_reached = 0; unsigned int i, siz, j; unsigned int num_gap_blocks = 0, num_nr_gap_blocks = 0, space; int num_dups = 0; int space_req; uint32_t highest_tsn; uint8_t flags; uint8_t type; uint8_t tsn_map; if (stcb->asoc.nrsack_supported == 1) { type = SCTP_NR_SELECTIVE_ACK; } else { type = SCTP_SELECTIVE_ACK; } a_chk = NULL; asoc = &stcb->asoc; SCTP_TCB_LOCK_ASSERT(stcb); if (asoc->last_data_chunk_from == NULL) { /* Hmm we never received anything */ return; } sctp_slide_mapping_arrays(stcb); sctp_set_rwnd(stcb, asoc); TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) { if (chk->rec.chunk_id.id == type) { /* Hmm, found a sack already on queue, remove it */ TAILQ_REMOVE(&asoc->control_send_queue, chk, sctp_next); asoc->ctrl_queue_cnt--; a_chk = chk; if (a_chk->data) { sctp_m_freem(a_chk->data); a_chk->data = NULL; } if (a_chk->whoTo) { sctp_free_remote_addr(a_chk->whoTo); a_chk->whoTo = NULL; } break; } } if (a_chk == NULL) { sctp_alloc_a_chunk(stcb, a_chk); if (a_chk == NULL) { /* No memory so we drop the idea, and set a timer */ if (stcb->asoc.delayed_ack) { sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_4); sctp_timer_start(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL); } else { stcb->asoc.send_sack = 1; } return; } a_chk->copy_by_ref = 0; a_chk->rec.chunk_id.id = type; a_chk->rec.chunk_id.can_take_data = 1; } /* Clear our pkt counts */ asoc->data_pkts_seen = 0; a_chk->flags = 0; a_chk->asoc = asoc; a_chk->snd_count = 0; a_chk->send_size = 0; /* fill in later */ a_chk->sent = SCTP_DATAGRAM_UNSENT; a_chk->whoTo = NULL; if (!(asoc->last_data_chunk_from->dest_state & SCTP_ADDR_REACHABLE)) { /*- * Ok, the destination for the SACK is unreachable, lets see if * we can select an alternate to asoc->last_data_chunk_from */ a_chk->whoTo = sctp_find_alternate_net(stcb, asoc->last_data_chunk_from, 0); if (a_chk->whoTo == NULL) { /* Nope, no alternate */ a_chk->whoTo = asoc->last_data_chunk_from; } } else { a_chk->whoTo = asoc->last_data_chunk_from; } if (a_chk->whoTo) { atomic_add_int(&a_chk->whoTo->ref_count, 1); } if (SCTP_TSN_GT(asoc->highest_tsn_inside_map, asoc->highest_tsn_inside_nr_map)) { highest_tsn = asoc->highest_tsn_inside_map; } else { highest_tsn = asoc->highest_tsn_inside_nr_map; } if (highest_tsn == asoc->cumulative_tsn) { /* no gaps */ if (type == SCTP_SELECTIVE_ACK) { space_req = sizeof(struct sctp_sack_chunk); } else { space_req = sizeof(struct sctp_nr_sack_chunk); } } else { /* gaps get a cluster */ space_req = MCLBYTES; } /* Ok now lets formulate a MBUF with our sack */ a_chk->data = sctp_get_mbuf_for_msg(space_req, 0, M_NOWAIT, 1, MT_DATA); if ((a_chk->data == NULL) || (a_chk->whoTo == NULL)) { /* rats, no mbuf memory */ if (a_chk->data) { /* was a problem with the destination */ sctp_m_freem(a_chk->data); a_chk->data = NULL; } sctp_free_a_chunk(stcb, a_chk, so_locked); /* sa_ignore NO_NULL_CHK */ if (stcb->asoc.delayed_ack) { sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_5); sctp_timer_start(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL); } else { stcb->asoc.send_sack = 1; } return; } /* ok, lets go through and fill it in */ SCTP_BUF_RESV_UF(a_chk->data, SCTP_MIN_OVERHEAD); space = (unsigned int)M_TRAILINGSPACE(a_chk->data); if (space > (a_chk->whoTo->mtu - SCTP_MIN_OVERHEAD)) { space = (a_chk->whoTo->mtu - SCTP_MIN_OVERHEAD); } limit = mtod(a_chk->data, caddr_t); limit += space; flags = 0; if ((asoc->sctp_cmt_on_off > 0) && SCTP_BASE_SYSCTL(sctp_cmt_use_dac)) { /*- * CMT DAC algorithm: If 2 (i.e., 0x10) packets have been * received, then set high bit to 1, else 0. Reset * pkts_rcvd. */ flags |= (asoc->cmt_dac_pkts_rcvd << 6); asoc->cmt_dac_pkts_rcvd = 0; } #ifdef SCTP_ASOCLOG_OF_TSNS stcb->asoc.cumack_logsnt[stcb->asoc.cumack_log_atsnt] = asoc->cumulative_tsn; stcb->asoc.cumack_log_atsnt++; if (stcb->asoc.cumack_log_atsnt >= SCTP_TSN_LOG_SIZE) { stcb->asoc.cumack_log_atsnt = 0; } #endif /* reset the readers interpretation */ stcb->freed_by_sorcv_sincelast = 0; if (type == SCTP_SELECTIVE_ACK) { sack = mtod(a_chk->data, struct sctp_sack_chunk *); nr_sack = NULL; gap_descriptor = (struct sctp_gap_ack_block *)((caddr_t)sack + sizeof(struct sctp_sack_chunk)); if (highest_tsn > asoc->mapping_array_base_tsn) { siz = (((highest_tsn - asoc->mapping_array_base_tsn) + 1) + 7) / 8; } else { siz = (((MAX_TSN - asoc->mapping_array_base_tsn) + 1) + highest_tsn + 7) / 8; } } else { sack = NULL; nr_sack = mtod(a_chk->data, struct sctp_nr_sack_chunk *); gap_descriptor = (struct sctp_gap_ack_block *)((caddr_t)nr_sack + sizeof(struct sctp_nr_sack_chunk)); if (asoc->highest_tsn_inside_map > asoc->mapping_array_base_tsn) { siz = (((asoc->highest_tsn_inside_map - asoc->mapping_array_base_tsn) + 1) + 7) / 8; } else { siz = (((MAX_TSN - asoc->mapping_array_base_tsn) + 1) + asoc->highest_tsn_inside_map + 7) / 8; } } if (SCTP_TSN_GT(asoc->mapping_array_base_tsn, asoc->cumulative_tsn)) { offset = 1; } else { offset = asoc->mapping_array_base_tsn - asoc->cumulative_tsn; } if (((type == SCTP_SELECTIVE_ACK) && SCTP_TSN_GT(highest_tsn, asoc->cumulative_tsn)) || ((type == SCTP_NR_SELECTIVE_ACK) && SCTP_TSN_GT(asoc->highest_tsn_inside_map, asoc->cumulative_tsn))) { /* we have a gap .. maybe */ for (i = 0; i < siz; i++) { tsn_map = asoc->mapping_array[i]; if (type == SCTP_SELECTIVE_ACK) { tsn_map |= asoc->nr_mapping_array[i]; } if (i == 0) { /* * Clear all bits corresponding to TSNs * smaller or equal to the cumulative TSN. */ tsn_map &= (~0U << (1 - offset)); } selector = &sack_array[tsn_map]; if (mergeable && selector->right_edge) { /* * Backup, left and right edges were ok to * merge. */ num_gap_blocks--; gap_descriptor--; } if (selector->num_entries == 0) mergeable = 0; else { for (j = 0; j < selector->num_entries; j++) { if (mergeable && selector->right_edge) { /* * do a merge by NOT setting * the left side */ mergeable = 0; } else { /* * no merge, set the left * side */ mergeable = 0; gap_descriptor->start = htons((selector->gaps[j].start + offset)); } gap_descriptor->end = htons((selector->gaps[j].end + offset)); num_gap_blocks++; gap_descriptor++; if (((caddr_t)gap_descriptor + sizeof(struct sctp_gap_ack_block)) > limit) { /* no more room */ limit_reached = 1; break; } } if (selector->left_edge) { mergeable = 1; } } if (limit_reached) { /* Reached the limit stop */ break; } offset += 8; } } if ((type == SCTP_NR_SELECTIVE_ACK) && (limit_reached == 0)) { mergeable = 0; if (asoc->highest_tsn_inside_nr_map > asoc->mapping_array_base_tsn) { siz = (((asoc->highest_tsn_inside_nr_map - asoc->mapping_array_base_tsn) + 1) + 7) / 8; } else { siz = (((MAX_TSN - asoc->mapping_array_base_tsn) + 1) + asoc->highest_tsn_inside_nr_map + 7) / 8; } if (SCTP_TSN_GT(asoc->mapping_array_base_tsn, asoc->cumulative_tsn)) { offset = 1; } else { offset = asoc->mapping_array_base_tsn - asoc->cumulative_tsn; } if (SCTP_TSN_GT(asoc->highest_tsn_inside_nr_map, asoc->cumulative_tsn)) { /* we have a gap .. maybe */ for (i = 0; i < siz; i++) { tsn_map = asoc->nr_mapping_array[i]; if (i == 0) { /* * Clear all bits corresponding to * TSNs smaller or equal to the * cumulative TSN. */ tsn_map &= (~0U << (1 - offset)); } selector = &sack_array[tsn_map]; if (mergeable && selector->right_edge) { /* * Backup, left and right edges were * ok to merge. */ num_nr_gap_blocks--; gap_descriptor--; } if (selector->num_entries == 0) mergeable = 0; else { for (j = 0; j < selector->num_entries; j++) { if (mergeable && selector->right_edge) { /* * do a merge by NOT * setting the left * side */ mergeable = 0; } else { /* * no merge, set the * left side */ mergeable = 0; gap_descriptor->start = htons((selector->gaps[j].start + offset)); } gap_descriptor->end = htons((selector->gaps[j].end + offset)); num_nr_gap_blocks++; gap_descriptor++; if (((caddr_t)gap_descriptor + sizeof(struct sctp_gap_ack_block)) > limit) { /* no more room */ limit_reached = 1; break; } } if (selector->left_edge) { mergeable = 1; } } if (limit_reached) { /* Reached the limit stop */ break; } offset += 8; } } } /* now we must add any dups we are going to report. */ if ((limit_reached == 0) && (asoc->numduptsns)) { dup = (uint32_t *)gap_descriptor; for (i = 0; i < asoc->numduptsns; i++) { *dup = htonl(asoc->dup_tsns[i]); dup++; num_dups++; if (((caddr_t)dup + sizeof(uint32_t)) > limit) { /* no more room */ break; } } asoc->numduptsns = 0; } /* * now that the chunk is prepared queue it to the control chunk * queue. */ if (type == SCTP_SELECTIVE_ACK) { a_chk->send_size = (uint16_t)(sizeof(struct sctp_sack_chunk) + (num_gap_blocks + num_nr_gap_blocks) * sizeof(struct sctp_gap_ack_block) + num_dups * sizeof(int32_t)); SCTP_BUF_LEN(a_chk->data) = a_chk->send_size; sack->sack.cum_tsn_ack = htonl(asoc->cumulative_tsn); sack->sack.a_rwnd = htonl(asoc->my_rwnd); sack->sack.num_gap_ack_blks = htons(num_gap_blocks); sack->sack.num_dup_tsns = htons(num_dups); sack->ch.chunk_type = type; sack->ch.chunk_flags = flags; sack->ch.chunk_length = htons(a_chk->send_size); } else { a_chk->send_size = (uint16_t)(sizeof(struct sctp_nr_sack_chunk) + (num_gap_blocks + num_nr_gap_blocks) * sizeof(struct sctp_gap_ack_block) + num_dups * sizeof(int32_t)); SCTP_BUF_LEN(a_chk->data) = a_chk->send_size; nr_sack->nr_sack.cum_tsn_ack = htonl(asoc->cumulative_tsn); nr_sack->nr_sack.a_rwnd = htonl(asoc->my_rwnd); nr_sack->nr_sack.num_gap_ack_blks = htons(num_gap_blocks); nr_sack->nr_sack.num_nr_gap_ack_blks = htons(num_nr_gap_blocks); nr_sack->nr_sack.num_dup_tsns = htons(num_dups); nr_sack->nr_sack.reserved = 0; nr_sack->ch.chunk_type = type; nr_sack->ch.chunk_flags = flags; nr_sack->ch.chunk_length = htons(a_chk->send_size); } TAILQ_INSERT_TAIL(&asoc->control_send_queue, a_chk, sctp_next); asoc->my_last_reported_rwnd = asoc->my_rwnd; asoc->ctrl_queue_cnt++; asoc->send_sack = 0; SCTP_STAT_INCR(sctps_sendsacks); return; } void sctp_send_abort_tcb(struct sctp_tcb *stcb, struct mbuf *operr, int so_locked) { struct mbuf *m_abort, *m, *m_last; struct mbuf *m_out, *m_end = NULL; struct sctp_abort_chunk *abort; struct sctp_auth_chunk *auth = NULL; struct sctp_nets *net; uint32_t vtag; uint32_t auth_offset = 0; int error; uint16_t cause_len, chunk_len, padding_len; SCTP_TCB_LOCK_ASSERT(stcb); /*- * Add an AUTH chunk, if chunk requires it and save the offset into * the chain for AUTH */ if (sctp_auth_is_required_chunk(SCTP_ABORT_ASSOCIATION, stcb->asoc.peer_auth_chunks)) { m_out = sctp_add_auth_chunk(NULL, &m_end, &auth, &auth_offset, stcb, SCTP_ABORT_ASSOCIATION); SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); } else { m_out = NULL; } m_abort = sctp_get_mbuf_for_msg(sizeof(struct sctp_abort_chunk), 0, M_NOWAIT, 1, MT_HEADER); if (m_abort == NULL) { if (m_out) { sctp_m_freem(m_out); } if (operr) { sctp_m_freem(operr); } return; } /* link in any error */ SCTP_BUF_NEXT(m_abort) = operr; cause_len = 0; m_last = NULL; for (m = operr; m; m = SCTP_BUF_NEXT(m)) { cause_len += (uint16_t)SCTP_BUF_LEN(m); if (SCTP_BUF_NEXT(m) == NULL) { m_last = m; } } SCTP_BUF_LEN(m_abort) = sizeof(struct sctp_abort_chunk); chunk_len = (uint16_t)sizeof(struct sctp_abort_chunk) + cause_len; padding_len = SCTP_SIZE32(chunk_len) - chunk_len; if (m_out == NULL) { /* NO Auth chunk prepended, so reserve space in front */ SCTP_BUF_RESV_UF(m_abort, SCTP_MIN_OVERHEAD); m_out = m_abort; } else { /* Put AUTH chunk at the front of the chain */ SCTP_BUF_NEXT(m_end) = m_abort; } if (stcb->asoc.alternate) { net = stcb->asoc.alternate; } else { net = stcb->asoc.primary_destination; } /* Fill in the ABORT chunk header. */ abort = mtod(m_abort, struct sctp_abort_chunk *); abort->ch.chunk_type = SCTP_ABORT_ASSOCIATION; if (stcb->asoc.peer_vtag == 0) { /* This happens iff the assoc is in COOKIE-WAIT state. */ vtag = stcb->asoc.my_vtag; abort->ch.chunk_flags = SCTP_HAD_NO_TCB; } else { vtag = stcb->asoc.peer_vtag; abort->ch.chunk_flags = 0; } abort->ch.chunk_length = htons(chunk_len); /* Add padding, if necessary. */ if (padding_len > 0) { if ((m_last == NULL) || (sctp_add_pad_tombuf(m_last, padding_len) == NULL)) { sctp_m_freem(m_out); return; } } if ((error = sctp_lowlevel_chunk_output(stcb->sctp_ep, stcb, net, (struct sockaddr *)&net->ro._l_addr, m_out, auth_offset, auth, stcb->asoc.authinfo.active_keyid, 1, 0, 0, stcb->sctp_ep->sctp_lport, stcb->rport, htonl(vtag), stcb->asoc.primary_destination->port, NULL, 0, 0, so_locked))) { SCTPDBG(SCTP_DEBUG_OUTPUT3, "Gak send error %d\n", error); if (error == ENOBUFS) { stcb->asoc.ifp_had_enobuf = 1; SCTP_STAT_INCR(sctps_lowlevelerr); } } else { stcb->asoc.ifp_had_enobuf = 0; } SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); } void sctp_send_shutdown_complete(struct sctp_tcb *stcb, struct sctp_nets *net, int reflect_vtag) { /* formulate and SEND a SHUTDOWN-COMPLETE */ struct mbuf *m_shutdown_comp; struct sctp_shutdown_complete_chunk *shutdown_complete; uint32_t vtag; int error; uint8_t flags; m_shutdown_comp = sctp_get_mbuf_for_msg(sizeof(struct sctp_chunkhdr), 0, M_NOWAIT, 1, MT_HEADER); if (m_shutdown_comp == NULL) { /* no mbuf's */ return; } if (reflect_vtag) { flags = SCTP_HAD_NO_TCB; vtag = stcb->asoc.my_vtag; } else { flags = 0; vtag = stcb->asoc.peer_vtag; } shutdown_complete = mtod(m_shutdown_comp, struct sctp_shutdown_complete_chunk *); shutdown_complete->ch.chunk_type = SCTP_SHUTDOWN_COMPLETE; shutdown_complete->ch.chunk_flags = flags; shutdown_complete->ch.chunk_length = htons(sizeof(struct sctp_shutdown_complete_chunk)); SCTP_BUF_LEN(m_shutdown_comp) = sizeof(struct sctp_shutdown_complete_chunk); if ((error = sctp_lowlevel_chunk_output(stcb->sctp_ep, stcb, net, (struct sockaddr *)&net->ro._l_addr, m_shutdown_comp, 0, NULL, 0, 1, 0, 0, stcb->sctp_ep->sctp_lport, stcb->rport, htonl(vtag), net->port, NULL, 0, 0, SCTP_SO_NOT_LOCKED))) { SCTPDBG(SCTP_DEBUG_OUTPUT3, "Gak send error %d\n", error); if (error == ENOBUFS) { stcb->asoc.ifp_had_enobuf = 1; SCTP_STAT_INCR(sctps_lowlevelerr); } } else { stcb->asoc.ifp_had_enobuf = 0; } SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); return; } static void sctp_send_resp_msg(struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, uint32_t vtag, uint8_t type, struct mbuf *cause, uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum, uint32_t vrf_id, uint16_t port) { struct mbuf *o_pak; struct mbuf *mout; struct sctphdr *shout; struct sctp_chunkhdr *ch; #if defined(INET) || defined(INET6) struct udphdr *udp; #endif int ret, len, cause_len, padding_len; #ifdef INET struct sockaddr_in *src_sin, *dst_sin; struct ip *ip; #endif #ifdef INET6 struct sockaddr_in6 *src_sin6, *dst_sin6; struct ip6_hdr *ip6; #endif /* Compute the length of the cause and add final padding. */ cause_len = 0; if (cause != NULL) { struct mbuf *m_at, *m_last = NULL; for (m_at = cause; m_at; m_at = SCTP_BUF_NEXT(m_at)) { if (SCTP_BUF_NEXT(m_at) == NULL) m_last = m_at; cause_len += SCTP_BUF_LEN(m_at); } padding_len = cause_len % 4; if (padding_len != 0) { padding_len = 4 - padding_len; } if (padding_len != 0) { if (sctp_add_pad_tombuf(m_last, padding_len) == NULL) { sctp_m_freem(cause); return; } } } else { padding_len = 0; } /* Get an mbuf for the header. */ len = sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr); switch (dst->sa_family) { #ifdef INET case AF_INET: len += sizeof(struct ip); break; #endif #ifdef INET6 case AF_INET6: len += sizeof(struct ip6_hdr); break; #endif default: break; } #if defined(INET) || defined(INET6) if (port) { len += sizeof(struct udphdr); } #endif mout = sctp_get_mbuf_for_msg(len + max_linkhdr, 1, M_NOWAIT, 1, MT_DATA); if (mout == NULL) { if (cause) { sctp_m_freem(cause); } return; } SCTP_BUF_RESV_UF(mout, max_linkhdr); SCTP_BUF_LEN(mout) = len; SCTP_BUF_NEXT(mout) = cause; M_SETFIB(mout, fibnum); mout->m_pkthdr.flowid = mflowid; M_HASHTYPE_SET(mout, mflowtype); #ifdef INET ip = NULL; #endif #ifdef INET6 ip6 = NULL; #endif switch (dst->sa_family) { #ifdef INET case AF_INET: src_sin = (struct sockaddr_in *)src; dst_sin = (struct sockaddr_in *)dst; ip = mtod(mout, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = (sizeof(struct ip) >> 2); ip->ip_tos = 0; ip->ip_off = htons(IP_DF); ip_fillid(ip); ip->ip_ttl = MODULE_GLOBAL(ip_defttl); if (port) { ip->ip_p = IPPROTO_UDP; } else { ip->ip_p = IPPROTO_SCTP; } ip->ip_src.s_addr = dst_sin->sin_addr.s_addr; ip->ip_dst.s_addr = src_sin->sin_addr.s_addr; ip->ip_sum = 0; len = sizeof(struct ip); shout = (struct sctphdr *)((caddr_t)ip + len); break; #endif #ifdef INET6 case AF_INET6: src_sin6 = (struct sockaddr_in6 *)src; dst_sin6 = (struct sockaddr_in6 *)dst; ip6 = mtod(mout, struct ip6_hdr *); ip6->ip6_flow = htonl(0x60000000); if (V_ip6_auto_flowlabel) { ip6->ip6_flow |= (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); } ip6->ip6_hlim = MODULE_GLOBAL(ip6_defhlim); if (port) { ip6->ip6_nxt = IPPROTO_UDP; } else { ip6->ip6_nxt = IPPROTO_SCTP; } ip6->ip6_src = dst_sin6->sin6_addr; ip6->ip6_dst = src_sin6->sin6_addr; len = sizeof(struct ip6_hdr); shout = (struct sctphdr *)((caddr_t)ip6 + len); break; #endif default: len = 0; shout = mtod(mout, struct sctphdr *); break; } #if defined(INET) || defined(INET6) if (port) { if (htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)) == 0) { sctp_m_freem(mout); return; } udp = (struct udphdr *)shout; udp->uh_sport = htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)); udp->uh_dport = port; udp->uh_sum = 0; udp->uh_ulen = htons((uint16_t)(sizeof(struct udphdr) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + cause_len + padding_len)); len += sizeof(struct udphdr); shout = (struct sctphdr *)((caddr_t)shout + sizeof(struct udphdr)); } else { udp = NULL; } #endif shout->src_port = sh->dest_port; shout->dest_port = sh->src_port; shout->checksum = 0; if (vtag) { shout->v_tag = htonl(vtag); } else { shout->v_tag = sh->v_tag; } len += sizeof(struct sctphdr); ch = (struct sctp_chunkhdr *)((caddr_t)shout + sizeof(struct sctphdr)); ch->chunk_type = type; if (vtag) { ch->chunk_flags = 0; } else { ch->chunk_flags = SCTP_HAD_NO_TCB; } ch->chunk_length = htons((uint16_t)(sizeof(struct sctp_chunkhdr) + cause_len)); len += sizeof(struct sctp_chunkhdr); len += cause_len + padding_len; if (SCTP_GET_HEADER_FOR_OUTPUT(o_pak)) { sctp_m_freem(mout); return; } SCTP_ATTACH_CHAIN(o_pak, mout, len); switch (dst->sa_family) { #ifdef INET case AF_INET: if (port) { if (V_udp_cksum) { udp->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, udp->uh_ulen + htons(IPPROTO_UDP)); } else { udp->uh_sum = 0; } } ip->ip_len = htons(len); if (port) { shout->checksum = sctp_calculate_cksum(mout, sizeof(struct ip) + sizeof(struct udphdr)); SCTP_STAT_INCR(sctps_sendswcrc); if (V_udp_cksum) { SCTP_ENABLE_UDP_CSUM(o_pak); } } else { mout->m_pkthdr.csum_flags = CSUM_SCTP; mout->m_pkthdr.csum_data = offsetof(struct sctphdr, checksum); SCTP_STAT_INCR(sctps_sendhwcrc); } #ifdef SCTP_PACKET_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING) { sctp_packet_log(o_pak); } #endif SCTP_PROBE5(send, NULL, NULL, ip, NULL, shout); SCTP_IP_OUTPUT(ret, o_pak, NULL, NULL, vrf_id); break; #endif #ifdef INET6 case AF_INET6: ip6->ip6_plen = htons((uint16_t)(len - sizeof(struct ip6_hdr))); if (port) { shout->checksum = sctp_calculate_cksum(mout, sizeof(struct ip6_hdr) + sizeof(struct udphdr)); SCTP_STAT_INCR(sctps_sendswcrc); if ((udp->uh_sum = in6_cksum(o_pak, IPPROTO_UDP, sizeof(struct ip6_hdr), len - sizeof(struct ip6_hdr))) == 0) { udp->uh_sum = 0xffff; } } else { mout->m_pkthdr.csum_flags = CSUM_SCTP_IPV6; mout->m_pkthdr.csum_data = offsetof(struct sctphdr, checksum); SCTP_STAT_INCR(sctps_sendhwcrc); } #ifdef SCTP_PACKET_LOGGING if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING) { sctp_packet_log(o_pak); } #endif SCTP_PROBE5(send, NULL, NULL, ip6, NULL, shout); SCTP_IP6_OUTPUT(ret, o_pak, NULL, NULL, NULL, vrf_id); break; #endif default: SCTPDBG(SCTP_DEBUG_OUTPUT1, "Unknown protocol (TSNH) type %d\n", dst->sa_family); sctp_m_freem(mout); SCTP_LTRACE_ERR_RET_PKT(mout, NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EFAULT); return; } SCTPDBG(SCTP_DEBUG_OUTPUT3, "return from send is %d\n", ret); if (port) { UDPSTAT_INC(udps_opackets); } SCTP_STAT_INCR(sctps_sendpackets); SCTP_STAT_INCR_COUNTER64(sctps_outpackets); SCTP_STAT_INCR_COUNTER64(sctps_outcontrolchunks); if (ret) { SCTP_STAT_INCR(sctps_senderrors); } return; } void sctp_send_shutdown_complete2(struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum, uint32_t vrf_id, uint16_t port) { sctp_send_resp_msg(src, dst, sh, 0, SCTP_SHUTDOWN_COMPLETE, NULL, mflowtype, mflowid, fibnum, vrf_id, port); } void sctp_send_hb(struct sctp_tcb *stcb, struct sctp_nets *net, int so_locked) { struct sctp_tmit_chunk *chk; struct sctp_heartbeat_chunk *hb; struct timeval now; SCTP_TCB_LOCK_ASSERT(stcb); if (net == NULL) { return; } (void)SCTP_GETTIME_TIMEVAL(&now); switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: break; #endif #ifdef INET6 case AF_INET6: break; #endif default: return; } sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { SCTPDBG(SCTP_DEBUG_OUTPUT4, "Gak, can't get a chunk for hb\n"); return; } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_HEARTBEAT_REQUEST; chk->rec.chunk_id.can_take_data = 1; chk->flags = 0; chk->asoc = &stcb->asoc; chk->send_size = sizeof(struct sctp_heartbeat_chunk); chk->data = sctp_get_mbuf_for_msg(chk->send_size, 0, M_NOWAIT, 1, MT_HEADER); if (chk->data == NULL) { sctp_free_a_chunk(stcb, chk, so_locked); return; } SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); SCTP_BUF_LEN(chk->data) = chk->send_size; chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->whoTo = net; atomic_add_int(&chk->whoTo->ref_count, 1); /* Now we have a mbuf that we can fill in with the details */ hb = mtod(chk->data, struct sctp_heartbeat_chunk *); memset(hb, 0, sizeof(struct sctp_heartbeat_chunk)); /* fill out chunk header */ hb->ch.chunk_type = SCTP_HEARTBEAT_REQUEST; hb->ch.chunk_flags = 0; hb->ch.chunk_length = htons(chk->send_size); /* Fill out hb parameter */ hb->heartbeat.hb_info.ph.param_type = htons(SCTP_HEARTBEAT_INFO); hb->heartbeat.hb_info.ph.param_length = htons(sizeof(struct sctp_heartbeat_info_param)); hb->heartbeat.hb_info.time_value_1 = now.tv_sec; hb->heartbeat.hb_info.time_value_2 = now.tv_usec; /* Did our user request this one, put it in */ hb->heartbeat.hb_info.addr_family = (uint8_t)net->ro._l_addr.sa.sa_family; hb->heartbeat.hb_info.addr_len = net->ro._l_addr.sa.sa_len; if (net->dest_state & SCTP_ADDR_UNCONFIRMED) { /* * we only take from the entropy pool if the address is not * confirmed. */ net->heartbeat_random1 = hb->heartbeat.hb_info.random_value1 = sctp_select_initial_TSN(&stcb->sctp_ep->sctp_ep); net->heartbeat_random2 = hb->heartbeat.hb_info.random_value2 = sctp_select_initial_TSN(&stcb->sctp_ep->sctp_ep); } else { net->heartbeat_random1 = hb->heartbeat.hb_info.random_value1 = 0; net->heartbeat_random2 = hb->heartbeat.hb_info.random_value2 = 0; } switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: memcpy(hb->heartbeat.hb_info.address, &net->ro._l_addr.sin.sin_addr, sizeof(net->ro._l_addr.sin.sin_addr)); break; #endif #ifdef INET6 case AF_INET6: memcpy(hb->heartbeat.hb_info.address, &net->ro._l_addr.sin6.sin6_addr, sizeof(net->ro._l_addr.sin6.sin6_addr)); break; #endif default: if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } sctp_free_a_chunk(stcb, chk, so_locked); return; break; } net->hb_responded = 0; TAILQ_INSERT_TAIL(&stcb->asoc.control_send_queue, chk, sctp_next); stcb->asoc.ctrl_queue_cnt++; SCTP_STAT_INCR(sctps_sendheartbeat); return; } void sctp_send_ecn_echo(struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t high_tsn) { struct sctp_association *asoc; struct sctp_ecne_chunk *ecne; struct sctp_tmit_chunk *chk; if (net == NULL) { return; } asoc = &stcb->asoc; SCTP_TCB_LOCK_ASSERT(stcb); TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) { if ((chk->rec.chunk_id.id == SCTP_ECN_ECHO) && (net == chk->whoTo)) { /* found a previous ECN_ECHO update it if needed */ uint32_t cnt, ctsn; ecne = mtod(chk->data, struct sctp_ecne_chunk *); ctsn = ntohl(ecne->tsn); if (SCTP_TSN_GT(high_tsn, ctsn)) { ecne->tsn = htonl(high_tsn); SCTP_STAT_INCR(sctps_queue_upd_ecne); } cnt = ntohl(ecne->num_pkts_since_cwr); cnt++; ecne->num_pkts_since_cwr = htonl(cnt); return; } } /* nope could not find one to update so we must build one */ sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { return; } SCTP_STAT_INCR(sctps_queue_upd_ecne); chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_ECN_ECHO; chk->rec.chunk_id.can_take_data = 0; chk->flags = 0; chk->asoc = &stcb->asoc; chk->send_size = sizeof(struct sctp_ecne_chunk); chk->data = sctp_get_mbuf_for_msg(chk->send_size, 0, M_NOWAIT, 1, MT_HEADER); if (chk->data == NULL) { sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); return; } SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); SCTP_BUF_LEN(chk->data) = chk->send_size; chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->whoTo = net; atomic_add_int(&chk->whoTo->ref_count, 1); stcb->asoc.ecn_echo_cnt_onq++; ecne = mtod(chk->data, struct sctp_ecne_chunk *); ecne->ch.chunk_type = SCTP_ECN_ECHO; ecne->ch.chunk_flags = 0; ecne->ch.chunk_length = htons(sizeof(struct sctp_ecne_chunk)); ecne->tsn = htonl(high_tsn); ecne->num_pkts_since_cwr = htonl(1); TAILQ_INSERT_HEAD(&stcb->asoc.control_send_queue, chk, sctp_next); asoc->ctrl_queue_cnt++; } void sctp_send_packet_dropped(struct sctp_tcb *stcb, struct sctp_nets *net, struct mbuf *m, int len, int iphlen, int bad_crc) { struct sctp_association *asoc; struct sctp_pktdrop_chunk *drp; struct sctp_tmit_chunk *chk; uint8_t *datap; int was_trunc = 0; int fullsz = 0; long spc; int offset; struct sctp_chunkhdr *ch, chunk_buf; unsigned int chk_length; if (!stcb) { return; } asoc = &stcb->asoc; SCTP_TCB_LOCK_ASSERT(stcb); if (asoc->pktdrop_supported == 0) { /*- * peer must declare support before I send one. */ return; } if (stcb->sctp_socket == NULL) { return; } sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { return; } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_PACKET_DROPPED; chk->rec.chunk_id.can_take_data = 1; chk->flags = 0; len -= iphlen; chk->send_size = len; /* Validate that we do not have an ABORT in here. */ offset = iphlen + sizeof(struct sctphdr); ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, offset, sizeof(*ch), (uint8_t *)&chunk_buf); while (ch != NULL) { chk_length = ntohs(ch->chunk_length); if (chk_length < sizeof(*ch)) { /* break to abort land */ break; } switch (ch->chunk_type) { case SCTP_PACKET_DROPPED: case SCTP_ABORT_ASSOCIATION: case SCTP_INITIATION_ACK: /** * We don't respond with an PKT-DROP to an ABORT * or PKT-DROP. We also do not respond to an * INIT-ACK, because we can't know if the initiation * tag is correct or not. */ sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); return; default: break; } offset += SCTP_SIZE32(chk_length); ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, offset, sizeof(*ch), (uint8_t *)&chunk_buf); } if ((len + SCTP_MAX_OVERHEAD + sizeof(struct sctp_pktdrop_chunk)) > min(stcb->asoc.smallest_mtu, MCLBYTES)) { /* * only send 1 mtu worth, trim off the excess on the end. */ fullsz = len; len = min(stcb->asoc.smallest_mtu, MCLBYTES) - SCTP_MAX_OVERHEAD; was_trunc = 1; } chk->asoc = &stcb->asoc; chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (chk->data == NULL) { jump_out: sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); return; } SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); drp = mtod(chk->data, struct sctp_pktdrop_chunk *); if (drp == NULL) { sctp_m_freem(chk->data); chk->data = NULL; goto jump_out; } chk->book_size = SCTP_SIZE32((chk->send_size + sizeof(struct sctp_pktdrop_chunk) + sizeof(struct sctphdr) + SCTP_MED_OVERHEAD)); chk->book_size_scale = 0; if (was_trunc) { drp->ch.chunk_flags = SCTP_PACKET_TRUNCATED; drp->trunc_len = htons(fullsz); /* * Len is already adjusted to size minus overhead above take * out the pkt_drop chunk itself from it. */ chk->send_size = (uint16_t)(len - sizeof(struct sctp_pktdrop_chunk)); len = chk->send_size; } else { /* no truncation needed */ drp->ch.chunk_flags = 0; drp->trunc_len = htons(0); } if (bad_crc) { drp->ch.chunk_flags |= SCTP_BADCRC; } chk->send_size += sizeof(struct sctp_pktdrop_chunk); SCTP_BUF_LEN(chk->data) = chk->send_size; chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; if (net) { /* we should hit here */ chk->whoTo = net; atomic_add_int(&chk->whoTo->ref_count, 1); } else { chk->whoTo = NULL; } drp->ch.chunk_type = SCTP_PACKET_DROPPED; drp->ch.chunk_length = htons(chk->send_size); spc = SCTP_SB_LIMIT_RCV(stcb->sctp_socket); if (spc < 0) { spc = 0; } drp->bottle_bw = htonl(spc); if (asoc->my_rwnd) { drp->current_onq = htonl(asoc->size_on_reasm_queue + asoc->size_on_all_streams + asoc->my_rwnd_control_len + stcb->sctp_socket->so_rcv.sb_cc); } else { /*- * If my rwnd is 0, possibly from mbuf depletion as well as * space used, tell the peer there is NO space aka onq == bw */ drp->current_onq = htonl(spc); } drp->reserved = 0; datap = drp->data; m_copydata(m, iphlen, len, (caddr_t)datap); TAILQ_INSERT_TAIL(&stcb->asoc.control_send_queue, chk, sctp_next); asoc->ctrl_queue_cnt++; } void sctp_send_cwr(struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t high_tsn, uint8_t override) { struct sctp_association *asoc; struct sctp_cwr_chunk *cwr; struct sctp_tmit_chunk *chk; SCTP_TCB_LOCK_ASSERT(stcb); if (net == NULL) { return; } asoc = &stcb->asoc; TAILQ_FOREACH(chk, &asoc->control_send_queue, sctp_next) { if ((chk->rec.chunk_id.id == SCTP_ECN_CWR) && (net == chk->whoTo)) { /* * found a previous CWR queued to same destination * update it if needed */ uint32_t ctsn; cwr = mtod(chk->data, struct sctp_cwr_chunk *); ctsn = ntohl(cwr->tsn); if (SCTP_TSN_GT(high_tsn, ctsn)) { cwr->tsn = htonl(high_tsn); } if (override & SCTP_CWR_REDUCE_OVERRIDE) { /* Make sure override is carried */ cwr->ch.chunk_flags |= SCTP_CWR_REDUCE_OVERRIDE; } return; } } sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { return; } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_ECN_CWR; chk->rec.chunk_id.can_take_data = 1; chk->flags = 0; chk->asoc = &stcb->asoc; chk->send_size = sizeof(struct sctp_cwr_chunk); chk->data = sctp_get_mbuf_for_msg(chk->send_size, 0, M_NOWAIT, 1, MT_HEADER); if (chk->data == NULL) { sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); return; } SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); SCTP_BUF_LEN(chk->data) = chk->send_size; chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; chk->whoTo = net; atomic_add_int(&chk->whoTo->ref_count, 1); cwr = mtod(chk->data, struct sctp_cwr_chunk *); cwr->ch.chunk_type = SCTP_ECN_CWR; cwr->ch.chunk_flags = override; cwr->ch.chunk_length = htons(sizeof(struct sctp_cwr_chunk)); cwr->tsn = htonl(high_tsn); TAILQ_INSERT_TAIL(&stcb->asoc.control_send_queue, chk, sctp_next); asoc->ctrl_queue_cnt++; } static int sctp_add_stream_reset_out(struct sctp_tcb *stcb, struct sctp_tmit_chunk *chk, uint32_t seq, uint32_t resp_seq, uint32_t last_sent) { uint16_t len, old_len, i; struct sctp_stream_reset_out_request *req_out; struct sctp_chunkhdr *ch; int at; int number_entries = 0; ch = mtod(chk->data, struct sctp_chunkhdr *); old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length)); /* get to new offset for the param. */ req_out = (struct sctp_stream_reset_out_request *)((caddr_t)ch + len); /* now how long will this param be? */ for (i = 0; i < stcb->asoc.streamoutcnt; i++) { if ((stcb->asoc.strmout[i].state == SCTP_STREAM_RESET_PENDING) && (stcb->asoc.strmout[i].chunks_on_queues == 0) && TAILQ_EMPTY(&stcb->asoc.strmout[i].outqueue)) { number_entries++; } } if (number_entries == 0) { return (0); } if (number_entries == stcb->asoc.streamoutcnt) { number_entries = 0; } if (number_entries > SCTP_MAX_STREAMS_AT_ONCE_RESET) { number_entries = SCTP_MAX_STREAMS_AT_ONCE_RESET; } len = (uint16_t)(sizeof(struct sctp_stream_reset_out_request) + (sizeof(uint16_t) * number_entries)); req_out->ph.param_type = htons(SCTP_STR_RESET_OUT_REQUEST); req_out->ph.param_length = htons(len); req_out->request_seq = htonl(seq); req_out->response_seq = htonl(resp_seq); req_out->send_reset_at_tsn = htonl(last_sent); at = 0; if (number_entries) { for (i = 0; i < stcb->asoc.streamoutcnt; i++) { if ((stcb->asoc.strmout[i].state == SCTP_STREAM_RESET_PENDING) && (stcb->asoc.strmout[i].chunks_on_queues == 0) && TAILQ_EMPTY(&stcb->asoc.strmout[i].outqueue)) { req_out->list_of_streams[at] = htons(i); at++; stcb->asoc.strmout[i].state = SCTP_STREAM_RESET_IN_FLIGHT; if (at >= number_entries) { break; } } } } else { for (i = 0; i < stcb->asoc.streamoutcnt; i++) { stcb->asoc.strmout[i].state = SCTP_STREAM_RESET_IN_FLIGHT; } } if (SCTP_SIZE32(len) > len) { /*- * Need to worry about the pad we may end up adding to the * end. This is easy since the struct is either aligned to 4 * bytes or 2 bytes off. */ req_out->list_of_streams[number_entries] = 0; } /* now fix the chunk length */ ch->chunk_length = htons(len + old_len); chk->book_size = len + old_len; chk->book_size_scale = 0; chk->send_size = SCTP_SIZE32(chk->book_size); SCTP_BUF_LEN(chk->data) = chk->send_size; return (1); } static void sctp_add_stream_reset_in(struct sctp_tmit_chunk *chk, int number_entries, uint16_t *list, uint32_t seq) { uint16_t len, old_len, i; struct sctp_stream_reset_in_request *req_in; struct sctp_chunkhdr *ch; ch = mtod(chk->data, struct sctp_chunkhdr *); old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length)); /* get to new offset for the param. */ req_in = (struct sctp_stream_reset_in_request *)((caddr_t)ch + len); /* now how long will this param be? */ len = (uint16_t)(sizeof(struct sctp_stream_reset_in_request) + (sizeof(uint16_t) * number_entries)); req_in->ph.param_type = htons(SCTP_STR_RESET_IN_REQUEST); req_in->ph.param_length = htons(len); req_in->request_seq = htonl(seq); if (number_entries) { for (i = 0; i < number_entries; i++) { req_in->list_of_streams[i] = htons(list[i]); } } if (SCTP_SIZE32(len) > len) { /*- * Need to worry about the pad we may end up adding to the * end. This is easy since the struct is either aligned to 4 * bytes or 2 bytes off. */ req_in->list_of_streams[number_entries] = 0; } /* now fix the chunk length */ ch->chunk_length = htons(len + old_len); chk->book_size = len + old_len; chk->book_size_scale = 0; chk->send_size = SCTP_SIZE32(chk->book_size); SCTP_BUF_LEN(chk->data) = chk->send_size; return; } static void sctp_add_stream_reset_tsn(struct sctp_tmit_chunk *chk, uint32_t seq) { uint16_t len, old_len; struct sctp_stream_reset_tsn_request *req_tsn; struct sctp_chunkhdr *ch; ch = mtod(chk->data, struct sctp_chunkhdr *); old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length)); /* get to new offset for the param. */ req_tsn = (struct sctp_stream_reset_tsn_request *)((caddr_t)ch + len); /* now how long will this param be? */ len = sizeof(struct sctp_stream_reset_tsn_request); req_tsn->ph.param_type = htons(SCTP_STR_RESET_TSN_REQUEST); req_tsn->ph.param_length = htons(len); req_tsn->request_seq = htonl(seq); /* now fix the chunk length */ ch->chunk_length = htons(len + old_len); chk->send_size = len + old_len; chk->book_size = SCTP_SIZE32(chk->send_size); chk->book_size_scale = 0; SCTP_BUF_LEN(chk->data) = SCTP_SIZE32(chk->send_size); return; } void sctp_add_stream_reset_result(struct sctp_tmit_chunk *chk, uint32_t resp_seq, uint32_t result) { uint16_t len, old_len; struct sctp_stream_reset_response *resp; struct sctp_chunkhdr *ch; ch = mtod(chk->data, struct sctp_chunkhdr *); old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length)); /* get to new offset for the param. */ resp = (struct sctp_stream_reset_response *)((caddr_t)ch + len); /* now how long will this param be? */ len = sizeof(struct sctp_stream_reset_response); resp->ph.param_type = htons(SCTP_STR_RESET_RESPONSE); resp->ph.param_length = htons(len); resp->response_seq = htonl(resp_seq); resp->result = ntohl(result); /* now fix the chunk length */ ch->chunk_length = htons(len + old_len); chk->book_size = len + old_len; chk->book_size_scale = 0; chk->send_size = SCTP_SIZE32(chk->book_size); SCTP_BUF_LEN(chk->data) = chk->send_size; return; } void sctp_send_deferred_reset_response(struct sctp_tcb *stcb, struct sctp_stream_reset_list *ent, int response) { struct sctp_association *asoc; struct sctp_tmit_chunk *chk; struct sctp_chunkhdr *ch; asoc = &stcb->asoc; /* * Reset our last reset action to the new one IP -> response * (PERFORMED probably). This assures that if we fail to send, a * retran from the peer will get the new response. */ asoc->last_reset_action[0] = response; if (asoc->stream_reset_outstanding) { return; } sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return; } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_STREAM_RESET; chk->rec.chunk_id.can_take_data = 0; chk->flags = 0; chk->asoc = &stcb->asoc; chk->book_size = sizeof(struct sctp_chunkhdr); chk->send_size = SCTP_SIZE32(chk->book_size); chk->book_size_scale = 0; chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (chk->data == NULL) { sctp_free_a_chunk(stcb, chk, SCTP_SO_LOCKED); SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return; } SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); /* setup chunk parameters */ chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; if (stcb->asoc.alternate) { chk->whoTo = stcb->asoc.alternate; } else { chk->whoTo = stcb->asoc.primary_destination; } ch = mtod(chk->data, struct sctp_chunkhdr *); ch->chunk_type = SCTP_STREAM_RESET; ch->chunk_flags = 0; ch->chunk_length = htons(chk->book_size); atomic_add_int(&chk->whoTo->ref_count, 1); SCTP_BUF_LEN(chk->data) = chk->send_size; sctp_add_stream_reset_result(chk, ent->seq, response); /* insert the chunk for sending */ TAILQ_INSERT_TAIL(&asoc->control_send_queue, chk, sctp_next); asoc->ctrl_queue_cnt++; } void sctp_add_stream_reset_result_tsn(struct sctp_tmit_chunk *chk, uint32_t resp_seq, uint32_t result, uint32_t send_una, uint32_t recv_next) { uint16_t len, old_len; struct sctp_stream_reset_response_tsn *resp; struct sctp_chunkhdr *ch; ch = mtod(chk->data, struct sctp_chunkhdr *); old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length)); /* get to new offset for the param. */ resp = (struct sctp_stream_reset_response_tsn *)((caddr_t)ch + len); /* now how long will this param be? */ len = sizeof(struct sctp_stream_reset_response_tsn); resp->ph.param_type = htons(SCTP_STR_RESET_RESPONSE); resp->ph.param_length = htons(len); resp->response_seq = htonl(resp_seq); resp->result = htonl(result); resp->senders_next_tsn = htonl(send_una); resp->receivers_next_tsn = htonl(recv_next); /* now fix the chunk length */ ch->chunk_length = htons(len + old_len); chk->book_size = len + old_len; chk->send_size = SCTP_SIZE32(chk->book_size); chk->book_size_scale = 0; SCTP_BUF_LEN(chk->data) = chk->send_size; return; } static void sctp_add_an_out_stream(struct sctp_tmit_chunk *chk, uint32_t seq, uint16_t adding) { uint16_t len, old_len; struct sctp_chunkhdr *ch; struct sctp_stream_reset_add_strm *addstr; ch = mtod(chk->data, struct sctp_chunkhdr *); old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length)); /* get to new offset for the param. */ addstr = (struct sctp_stream_reset_add_strm *)((caddr_t)ch + len); /* now how long will this param be? */ len = sizeof(struct sctp_stream_reset_add_strm); /* Fill it out. */ addstr->ph.param_type = htons(SCTP_STR_RESET_ADD_OUT_STREAMS); addstr->ph.param_length = htons(len); addstr->request_seq = htonl(seq); addstr->number_of_streams = htons(adding); addstr->reserved = 0; /* now fix the chunk length */ ch->chunk_length = htons(len + old_len); chk->send_size = len + old_len; chk->book_size = SCTP_SIZE32(chk->send_size); chk->book_size_scale = 0; SCTP_BUF_LEN(chk->data) = SCTP_SIZE32(chk->send_size); return; } static void sctp_add_an_in_stream(struct sctp_tmit_chunk *chk, uint32_t seq, uint16_t adding) { uint16_t len, old_len; struct sctp_chunkhdr *ch; struct sctp_stream_reset_add_strm *addstr; ch = mtod(chk->data, struct sctp_chunkhdr *); old_len = len = SCTP_SIZE32(ntohs(ch->chunk_length)); /* get to new offset for the param. */ addstr = (struct sctp_stream_reset_add_strm *)((caddr_t)ch + len); /* now how long will this param be? */ len = sizeof(struct sctp_stream_reset_add_strm); /* Fill it out. */ addstr->ph.param_type = htons(SCTP_STR_RESET_ADD_IN_STREAMS); addstr->ph.param_length = htons(len); addstr->request_seq = htonl(seq); addstr->number_of_streams = htons(adding); addstr->reserved = 0; /* now fix the chunk length */ ch->chunk_length = htons(len + old_len); chk->send_size = len + old_len; chk->book_size = SCTP_SIZE32(chk->send_size); chk->book_size_scale = 0; SCTP_BUF_LEN(chk->data) = SCTP_SIZE32(chk->send_size); return; } int sctp_send_stream_reset_out_if_possible(struct sctp_tcb *stcb, int so_locked) { struct sctp_association *asoc; struct sctp_tmit_chunk *chk; struct sctp_chunkhdr *ch; uint32_t seq; asoc = &stcb->asoc; asoc->trigger_reset = 0; if (asoc->stream_reset_outstanding) { return (EALREADY); } sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_STREAM_RESET; chk->rec.chunk_id.can_take_data = 0; chk->flags = 0; chk->asoc = &stcb->asoc; chk->book_size = sizeof(struct sctp_chunkhdr); chk->send_size = SCTP_SIZE32(chk->book_size); chk->book_size_scale = 0; chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (chk->data == NULL) { sctp_free_a_chunk(stcb, chk, so_locked); SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); /* setup chunk parameters */ chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; if (stcb->asoc.alternate) { chk->whoTo = stcb->asoc.alternate; } else { chk->whoTo = stcb->asoc.primary_destination; } ch = mtod(chk->data, struct sctp_chunkhdr *); ch->chunk_type = SCTP_STREAM_RESET; ch->chunk_flags = 0; ch->chunk_length = htons(chk->book_size); atomic_add_int(&chk->whoTo->ref_count, 1); SCTP_BUF_LEN(chk->data) = chk->send_size; seq = stcb->asoc.str_reset_seq_out; if (sctp_add_stream_reset_out(stcb, chk, seq, (stcb->asoc.str_reset_seq_in - 1), (stcb->asoc.sending_seq - 1))) { seq++; asoc->stream_reset_outstanding++; } else { m_freem(chk->data); chk->data = NULL; sctp_free_a_chunk(stcb, chk, so_locked); return (ENOENT); } asoc->str_reset = chk; /* insert the chunk for sending */ TAILQ_INSERT_TAIL(&asoc->control_send_queue, chk, sctp_next); asoc->ctrl_queue_cnt++; if (stcb->asoc.send_sack) { sctp_send_sack(stcb, so_locked); } sctp_timer_start(SCTP_TIMER_TYPE_STRRESET, stcb->sctp_ep, stcb, chk->whoTo); return (0); } int sctp_send_str_reset_req(struct sctp_tcb *stcb, uint16_t number_entries, uint16_t *list, uint8_t send_in_req, uint8_t send_tsn_req, uint8_t add_stream, uint16_t adding_o, uint16_t adding_i, uint8_t peer_asked) { struct sctp_association *asoc; struct sctp_tmit_chunk *chk; struct sctp_chunkhdr *ch; int can_send_out_req = 0; uint32_t seq; asoc = &stcb->asoc; if (asoc->stream_reset_outstanding) { /*- * Already one pending, must get ACK back to clear the flag. */ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EBUSY); return (EBUSY); } if ((send_in_req == 0) && (send_tsn_req == 0) && (add_stream == 0)) { /* nothing to do */ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL); return (EINVAL); } if (send_tsn_req && send_in_req) { /* error, can't do that */ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL); return (EINVAL); } else if (send_in_req) { can_send_out_req = 1; } if (number_entries > (MCLBYTES - SCTP_MIN_OVERHEAD - sizeof(struct sctp_chunkhdr) - sizeof(struct sctp_stream_reset_out_request)) / sizeof(uint16_t)) { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } chk->copy_by_ref = 0; chk->rec.chunk_id.id = SCTP_STREAM_RESET; chk->rec.chunk_id.can_take_data = 0; chk->flags = 0; chk->asoc = &stcb->asoc; chk->book_size = sizeof(struct sctp_chunkhdr); chk->send_size = SCTP_SIZE32(chk->book_size); chk->book_size_scale = 0; chk->data = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (chk->data == NULL) { sctp_free_a_chunk(stcb, chk, SCTP_SO_LOCKED); SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ENOMEM); return (ENOMEM); } SCTP_BUF_RESV_UF(chk->data, SCTP_MIN_OVERHEAD); /* setup chunk parameters */ chk->sent = SCTP_DATAGRAM_UNSENT; chk->snd_count = 0; if (stcb->asoc.alternate) { chk->whoTo = stcb->asoc.alternate; } else { chk->whoTo = stcb->asoc.primary_destination; } atomic_add_int(&chk->whoTo->ref_count, 1); ch = mtod(chk->data, struct sctp_chunkhdr *); ch->chunk_type = SCTP_STREAM_RESET; ch->chunk_flags = 0; ch->chunk_length = htons(chk->book_size); SCTP_BUF_LEN(chk->data) = chk->send_size; seq = stcb->asoc.str_reset_seq_out; if (can_send_out_req) { int ret; ret = sctp_add_stream_reset_out(stcb, chk, seq, (stcb->asoc.str_reset_seq_in - 1), (stcb->asoc.sending_seq - 1)); if (ret) { seq++; asoc->stream_reset_outstanding++; } } if ((add_stream & 1) && ((stcb->asoc.strm_realoutsize - stcb->asoc.streamoutcnt) < adding_o)) { /* Need to allocate more */ struct sctp_stream_out *oldstream; struct sctp_stream_queue_pending *sp, *nsp; int i; #if defined(SCTP_DETAILED_STR_STATS) int j; #endif oldstream = stcb->asoc.strmout; /* get some more */ SCTP_MALLOC(stcb->asoc.strmout, struct sctp_stream_out *, (stcb->asoc.streamoutcnt + adding_o) * sizeof(struct sctp_stream_out), SCTP_M_STRMO); if (stcb->asoc.strmout == NULL) { uint8_t x; stcb->asoc.strmout = oldstream; /* Turn off the bit */ x = add_stream & 0xfe; add_stream = x; goto skip_stuff; } /* * Ok now we proceed with copying the old out stuff and * initializing the new stuff. */ SCTP_TCB_SEND_LOCK(stcb); stcb->asoc.ss_functions.sctp_ss_clear(stcb, &stcb->asoc, 0, 1); for (i = 0; i < stcb->asoc.streamoutcnt; i++) { TAILQ_INIT(&stcb->asoc.strmout[i].outqueue); stcb->asoc.strmout[i].chunks_on_queues = oldstream[i].chunks_on_queues; stcb->asoc.strmout[i].next_mid_ordered = oldstream[i].next_mid_ordered; stcb->asoc.strmout[i].next_mid_unordered = oldstream[i].next_mid_unordered; stcb->asoc.strmout[i].last_msg_incomplete = oldstream[i].last_msg_incomplete; stcb->asoc.strmout[i].sid = i; stcb->asoc.strmout[i].state = oldstream[i].state; /* FIX ME FIX ME */ /* * This should be a SS_COPY operation FIX ME STREAM * SCHEDULER EXPERT */ stcb->asoc.ss_functions.sctp_ss_init_stream(stcb, &stcb->asoc.strmout[i], &oldstream[i]); /* now anything on those queues? */ TAILQ_FOREACH_SAFE(sp, &oldstream[i].outqueue, next, nsp) { TAILQ_REMOVE(&oldstream[i].outqueue, sp, next); TAILQ_INSERT_TAIL(&stcb->asoc.strmout[i].outqueue, sp, next); } } /* now the new streams */ stcb->asoc.ss_functions.sctp_ss_init(stcb, &stcb->asoc, 1); for (i = stcb->asoc.streamoutcnt; i < (stcb->asoc.streamoutcnt + adding_o); i++) { TAILQ_INIT(&stcb->asoc.strmout[i].outqueue); stcb->asoc.strmout[i].chunks_on_queues = 0; #if defined(SCTP_DETAILED_STR_STATS) for (j = 0; j < SCTP_PR_SCTP_MAX + 1; j++) { stcb->asoc.strmout[i].abandoned_sent[j] = 0; stcb->asoc.strmout[i].abandoned_unsent[j] = 0; } #else stcb->asoc.strmout[i].abandoned_sent[0] = 0; stcb->asoc.strmout[i].abandoned_unsent[0] = 0; #endif stcb->asoc.strmout[i].next_mid_ordered = 0; stcb->asoc.strmout[i].next_mid_unordered = 0; stcb->asoc.strmout[i].sid = i; stcb->asoc.strmout[i].last_msg_incomplete = 0; stcb->asoc.ss_functions.sctp_ss_init_stream(stcb, &stcb->asoc.strmout[i], NULL); stcb->asoc.strmout[i].state = SCTP_STREAM_CLOSED; } stcb->asoc.strm_realoutsize = stcb->asoc.streamoutcnt + adding_o; SCTP_FREE(oldstream, SCTP_M_STRMO); SCTP_TCB_SEND_UNLOCK(stcb); } skip_stuff: if ((add_stream & 1) && (adding_o > 0)) { asoc->strm_pending_add_size = adding_o; asoc->peer_req_out = peer_asked; sctp_add_an_out_stream(chk, seq, adding_o); seq++; asoc->stream_reset_outstanding++; } if ((add_stream & 2) && (adding_i > 0)) { sctp_add_an_in_stream(chk, seq, adding_i); seq++; asoc->stream_reset_outstanding++; } if (send_in_req) { sctp_add_stream_reset_in(chk, number_entries, list, seq); seq++; asoc->stream_reset_outstanding++; } if (send_tsn_req) { sctp_add_stream_reset_tsn(chk, seq); asoc->stream_reset_outstanding++; } asoc->str_reset = chk; /* insert the chunk for sending */ TAILQ_INSERT_TAIL(&asoc->control_send_queue, chk, sctp_next); asoc->ctrl_queue_cnt++; if (stcb->asoc.send_sack) { sctp_send_sack(stcb, SCTP_SO_LOCKED); } sctp_timer_start(SCTP_TIMER_TYPE_STRRESET, stcb->sctp_ep, stcb, chk->whoTo); return (0); } void sctp_send_abort(struct mbuf *m, int iphlen, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, uint32_t vtag, struct mbuf *cause, uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum, uint32_t vrf_id, uint16_t port) { /* Don't respond to an ABORT with an ABORT. */ if (sctp_is_there_an_abort_here(m, iphlen, &vtag)) { if (cause) sctp_m_freem(cause); return; } sctp_send_resp_msg(src, dst, sh, vtag, SCTP_ABORT_ASSOCIATION, cause, mflowtype, mflowid, fibnum, vrf_id, port); return; } void sctp_send_operr_to(struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, uint32_t vtag, struct mbuf *cause, uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum, uint32_t vrf_id, uint16_t port) { sctp_send_resp_msg(src, dst, sh, vtag, SCTP_OPERATION_ERROR, cause, mflowtype, mflowid, fibnum, vrf_id, port); return; } static struct mbuf * sctp_copy_resume(struct uio *uio, int max_send_len, int user_marks_eor, int *error, uint32_t *sndout, struct mbuf **new_tail) { struct mbuf *m; m = m_uiotombuf(uio, M_WAITOK, max_send_len, 0, (M_PKTHDR | (user_marks_eor ? M_EOR : 0))); if (m == NULL) { SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, ENOBUFS); *error = ENOBUFS; } else { *sndout = m_length(m, NULL); *new_tail = m_last(m); } return (m); } static int sctp_copy_one(struct sctp_stream_queue_pending *sp, struct uio *uio, int resv_upfront) { sp->data = m_uiotombuf(uio, M_WAITOK, sp->length, resv_upfront, 0); if (sp->data == NULL) { SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, ENOBUFS); return (ENOBUFS); } sp->tail_mbuf = m_last(sp->data); return (0); } static struct sctp_stream_queue_pending * sctp_copy_it_in(struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_sndrcvinfo *srcv, struct uio *uio, struct sctp_nets *net, ssize_t max_send_len, int user_marks_eor, int *error) { /*- * This routine must be very careful in its work. Protocol * processing is up and running so care must be taken to spl...() * when you need to do something that may effect the stcb/asoc. The * sb is locked however. When data is copied the protocol processing * should be enabled since this is a slower operation... */ struct sctp_stream_queue_pending *sp = NULL; int resv_in_first; *error = 0; /* Now can we send this? */ if ((SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_SENT) || (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_ACK_SENT) || (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED) || (asoc->state & SCTP_STATE_SHUTDOWN_PENDING)) { /* got data while shutting down */ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET); *error = ECONNRESET; goto out_now; } sctp_alloc_a_strmoq(stcb, sp); if (sp == NULL) { SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOMEM); *error = ENOMEM; goto out_now; } sp->act_flags = 0; sp->sender_all_done = 0; sp->sinfo_flags = srcv->sinfo_flags; sp->timetolive = srcv->sinfo_timetolive; sp->ppid = srcv->sinfo_ppid; sp->context = srcv->sinfo_context; sp->fsn = 0; (void)SCTP_GETTIME_TIMEVAL(&sp->ts); sp->sid = srcv->sinfo_stream; sp->length = (uint32_t)min(uio->uio_resid, max_send_len); if ((sp->length == (uint32_t)uio->uio_resid) && ((user_marks_eor == 0) || (srcv->sinfo_flags & SCTP_EOF) || (user_marks_eor && (srcv->sinfo_flags & SCTP_EOR)))) { sp->msg_is_complete = 1; } else { sp->msg_is_complete = 0; } sp->sender_all_done = 0; sp->some_taken = 0; sp->put_last_out = 0; resv_in_first = SCTP_DATA_CHUNK_OVERHEAD(stcb); sp->data = sp->tail_mbuf = NULL; if (sp->length == 0) { goto skip_copy; } if (srcv->sinfo_keynumber_valid) { sp->auth_keyid = srcv->sinfo_keynumber; } else { sp->auth_keyid = stcb->asoc.authinfo.active_keyid; } if (sctp_auth_is_required_chunk(SCTP_DATA, stcb->asoc.peer_auth_chunks)) { sctp_auth_key_acquire(stcb, sp->auth_keyid); sp->holds_key_ref = 1; } *error = sctp_copy_one(sp, uio, resv_in_first); skip_copy: if (*error) { sctp_free_a_strmoq(stcb, sp, SCTP_SO_LOCKED); sp = NULL; } else { if (sp->sinfo_flags & SCTP_ADDR_OVER) { sp->net = net; atomic_add_int(&sp->net->ref_count, 1); } else { sp->net = NULL; } sctp_set_prsctp_policy(sp); } out_now: return (sp); } int sctp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *p ) { int error, use_sndinfo = 0; struct sctp_sndrcvinfo sndrcvninfo; struct sockaddr *addr_to_use; #if defined(INET) && defined(INET6) struct sockaddr_in sin; #endif if (control) { /* process cmsg snd/rcv info (maybe a assoc-id) */ if (sctp_find_cmsg(SCTP_SNDRCV, (void *)&sndrcvninfo, control, sizeof(sndrcvninfo))) { /* got one */ use_sndinfo = 1; } } addr_to_use = addr; #if defined(INET) && defined(INET6) if ((addr) && (addr->sa_family == AF_INET6)) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)addr; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { in6_sin6_2_sin(&sin, sin6); addr_to_use = (struct sockaddr *)&sin; } } #endif error = sctp_lower_sosend(so, addr_to_use, uio, top, control, flags, use_sndinfo ? &sndrcvninfo : NULL ,p ); return (error); } int sctp_lower_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *i_pak, struct mbuf *control, int flags, struct sctp_sndrcvinfo *srcv , struct thread *p ) { struct epoch_tracker et; ssize_t sndlen = 0, max_len, local_add_more; int error, len; struct mbuf *top = NULL; int queue_only = 0, queue_only_for_init = 0; int free_cnt_applied = 0; int un_sent; int now_filled = 0; unsigned int inqueue_bytes = 0; struct sctp_block_entry be; struct sctp_inpcb *inp; struct sctp_tcb *stcb = NULL; struct timeval now; struct sctp_nets *net; struct sctp_association *asoc; struct sctp_inpcb *t_inp; int user_marks_eor; int create_lock_applied = 0; int nagle_applies = 0; int some_on_control = 0; int got_all_of_the_send = 0; int hold_tcblock = 0; int non_blocking = 0; ssize_t local_soresv = 0; uint16_t port; uint16_t sinfo_flags; sctp_assoc_t sinfo_assoc_id; error = 0; net = NULL; stcb = NULL; asoc = NULL; t_inp = inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; if (i_pak) { SCTP_RELEASE_PKT(i_pak); } return (error); } if ((uio == NULL) && (i_pak == NULL)) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); return (EINVAL); } user_marks_eor = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR); atomic_add_int(&inp->total_sends, 1); if (uio) { if (uio->uio_resid < 0) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); return (EINVAL); } sndlen = uio->uio_resid; } else { top = SCTP_HEADER_TO_CHAIN(i_pak); sndlen = SCTP_HEADER_LEN(i_pak); } SCTPDBG(SCTP_DEBUG_OUTPUT1, "Send called addr:%p send length %zd\n", (void *)addr, sndlen); if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && SCTP_IS_LISTENING(inp)) { /* The listener can NOT send */ SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, ENOTCONN); error = ENOTCONN; goto out_unlocked; } /** * Pre-screen address, if one is given the sin-len * must be set correctly! */ if (addr) { union sctp_sockstore *raddr = (union sctp_sockstore *)addr; switch (raddr->sa.sa_family) { #ifdef INET case AF_INET: if (raddr->sin.sin_len != sizeof(struct sockaddr_in)) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out_unlocked; } port = raddr->sin.sin_port; break; #endif #ifdef INET6 case AF_INET6: if (raddr->sin6.sin6_len != sizeof(struct sockaddr_in6)) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out_unlocked; } port = raddr->sin6.sin6_port; break; #endif default: SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EAFNOSUPPORT); error = EAFNOSUPPORT; goto out_unlocked; } } else port = 0; if (srcv) { sinfo_flags = srcv->sinfo_flags; sinfo_assoc_id = srcv->sinfo_assoc_id; if (INVALID_SINFO_FLAG(sinfo_flags) || PR_SCTP_INVALID_POLICY(sinfo_flags)) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out_unlocked; } if (srcv->sinfo_flags) SCTP_STAT_INCR(sctps_sends_with_flags); } else { sinfo_flags = inp->def_send.sinfo_flags; sinfo_assoc_id = inp->def_send.sinfo_assoc_id; } if (flags & MSG_EOR) { sinfo_flags |= SCTP_EOR; } if (flags & MSG_EOF) { sinfo_flags |= SCTP_EOF; } if (sinfo_flags & SCTP_SENDALL) { /* its a sendall */ error = sctp_sendall(inp, uio, top, srcv); top = NULL; goto out_unlocked; } if ((sinfo_flags & SCTP_ADDR_OVER) && (addr == NULL)) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out_unlocked; } /* now we must find the assoc */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { SCTP_INP_RLOCK(inp); stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb) { SCTP_TCB_LOCK(stcb); hold_tcblock = 1; } SCTP_INP_RUNLOCK(inp); } else if (sinfo_assoc_id) { stcb = sctp_findassociation_ep_asocid(inp, sinfo_assoc_id, 1); if (stcb != NULL) { hold_tcblock = 1; } } else if (addr) { /*- * Since we did not use findep we must * increment it, and if we don't find a tcb * decrement it. */ SCTP_INP_WLOCK(inp); SCTP_INP_INCR_REF(inp); SCTP_INP_WUNLOCK(inp); stcb = sctp_findassociation_ep_addr(&t_inp, addr, &net, NULL, NULL); if (stcb == NULL) { SCTP_INP_WLOCK(inp); SCTP_INP_DECR_REF(inp); SCTP_INP_WUNLOCK(inp); } else { hold_tcblock = 1; } } if ((stcb == NULL) && (addr)) { /* Possible implicit send? */ SCTP_ASOC_CREATE_LOCK(inp); create_lock_applied = 1; if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE)) { /* Should I really unlock ? */ SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out_unlocked; } if (((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) && (addr->sa_family == AF_INET6)) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out_unlocked; } SCTP_INP_WLOCK(inp); SCTP_INP_INCR_REF(inp); SCTP_INP_WUNLOCK(inp); /* With the lock applied look again */ stcb = sctp_findassociation_ep_addr(&t_inp, addr, &net, NULL, NULL); #if defined(INET) || defined(INET6) if ((stcb == NULL) && (control != NULL) && (port > 0)) { stcb = sctp_findassociation_cmsgs(&t_inp, port, control, &net, &error); } #endif if (stcb == NULL) { SCTP_INP_WLOCK(inp); SCTP_INP_DECR_REF(inp); SCTP_INP_WUNLOCK(inp); } else { hold_tcblock = 1; } if (error) { goto out_unlocked; } if (t_inp != inp) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOTCONN); error = ENOTCONN; goto out_unlocked; } } if (stcb == NULL) { if (addr == NULL) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOENT); error = ENOENT; goto out_unlocked; } else { /* We must go ahead and start the INIT process */ uint32_t vrf_id; if ((sinfo_flags & SCTP_ABORT) || ((sinfo_flags & SCTP_EOF) && (sndlen == 0))) { /*- * User asks to abort a non-existant assoc, * or EOF a non-existant assoc with no data */ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOENT); error = ENOENT; goto out_unlocked; } /* get an asoc/stcb struct */ vrf_id = inp->def_vrf_id; #ifdef INVARIANTS if (create_lock_applied == 0) { panic("Error, should hold create lock and I don't?"); } #endif stcb = sctp_aloc_assoc(inp, addr, &error, 0, vrf_id, inp->sctp_ep.pre_open_stream_count, inp->sctp_ep.port, p, SCTP_INITIALIZE_AUTH_PARAMS); if (stcb == NULL) { /* Error is setup for us in the call */ goto out_unlocked; } if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) { stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_CONNECTED; /* * Set the connected flag so we can queue * data */ soisconnecting(so); } hold_tcblock = 1; if (create_lock_applied) { SCTP_ASOC_CREATE_UNLOCK(inp); create_lock_applied = 0; } else { SCTP_PRINTF("Huh-3? create lock should have been on??\n"); } /* * Turn on queue only flag to prevent data from * being sent */ queue_only = 1; asoc = &stcb->asoc; SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_WAIT); (void)SCTP_GETTIME_TIMEVAL(&asoc->time_entered); if (control) { if (sctp_process_cmsgs_for_init(stcb, control, &error)) { sctp_free_assoc(inp, stcb, SCTP_PCBFREE_FORCE, SCTP_FROM_SCTP_OUTPUT + SCTP_LOC_6); hold_tcblock = 0; stcb = NULL; goto out_unlocked; } } /* out with the INIT */ queue_only_for_init = 1; /*- * we may want to dig in after this call and adjust the MTU * value. It defaulted to 1500 (constant) but the ro * structure may now have an update and thus we may need to * change it BEFORE we append the message. */ } } else asoc = &stcb->asoc; if (srcv == NULL) { srcv = (struct sctp_sndrcvinfo *)&asoc->def_send; sinfo_flags = srcv->sinfo_flags; if (flags & MSG_EOR) { sinfo_flags |= SCTP_EOR; } if (flags & MSG_EOF) { sinfo_flags |= SCTP_EOF; } } if (sinfo_flags & SCTP_ADDR_OVER) { if (addr) net = sctp_findnet(stcb, addr); else net = NULL; if ((net == NULL) || ((port != 0) && (port != stcb->rport))) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out_unlocked; } } else { if (stcb->asoc.alternate) { net = stcb->asoc.alternate; } else { net = stcb->asoc.primary_destination; } } atomic_add_int(&stcb->total_sends, 1); /* Keep the stcb from being freed under our feet */ atomic_add_int(&asoc->refcnt, 1); free_cnt_applied = 1; if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NO_FRAGMENT)) { if (sndlen > (ssize_t)asoc->smallest_mtu) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EMSGSIZE); error = EMSGSIZE; goto out_unlocked; } } if (SCTP_SO_IS_NBIO(so) || (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0 ) { non_blocking = 1; } /* would we block? */ if (non_blocking) { ssize_t amount; if (hold_tcblock == 0) { SCTP_TCB_LOCK(stcb); hold_tcblock = 1; } inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * SCTP_DATA_CHUNK_OVERHEAD(stcb)); if (user_marks_eor == 0) { amount = sndlen; } else { amount = 1; } if ((SCTP_SB_LIMIT_SND(so) < (amount + inqueue_bytes + stcb->asoc.sb_send_resv)) || (stcb->asoc.chunks_on_out_queue >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue))) { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EWOULDBLOCK); if (sndlen > (ssize_t)SCTP_SB_LIMIT_SND(so)) error = EMSGSIZE; else error = EWOULDBLOCK; goto out_unlocked; } stcb->asoc.sb_send_resv += (uint32_t)sndlen; SCTP_TCB_UNLOCK(stcb); hold_tcblock = 0; } else { atomic_add_int(&stcb->asoc.sb_send_resv, sndlen); } local_soresv = sndlen; if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET); error = ECONNRESET; goto out_unlocked; } if (create_lock_applied) { SCTP_ASOC_CREATE_UNLOCK(inp); create_lock_applied = 0; } /* Is the stream no. valid? */ if (srcv->sinfo_stream >= asoc->streamoutcnt) { /* Invalid stream number */ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out_unlocked; } if ((asoc->strmout[srcv->sinfo_stream].state != SCTP_STREAM_OPEN) && (asoc->strmout[srcv->sinfo_stream].state != SCTP_STREAM_OPENING)) { /* * Can't queue any data while stream reset is underway. */ if (asoc->strmout[srcv->sinfo_stream].state > SCTP_STREAM_OPEN) { error = EAGAIN; } else { error = EINVAL; } SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, error); goto out_unlocked; } if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { queue_only = 1; } /* we are now done with all control */ if (control) { sctp_m_freem(control); control = NULL; } if ((SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_SENT) || (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED) || (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_ACK_SENT) || (asoc->state & SCTP_STATE_SHUTDOWN_PENDING)) { if (sinfo_flags & SCTP_ABORT) { ; } else { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET); error = ECONNRESET; goto out_unlocked; } } /* Ok, we will attempt a msgsnd :> */ if (p) { p->td_ru.ru_msgsnd++; } /* Are we aborting? */ if (sinfo_flags & SCTP_ABORT) { struct mbuf *mm; ssize_t tot_demand, tot_out = 0, max_out; SCTP_STAT_INCR(sctps_sends_with_abort); if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { /* It has to be up before we abort */ /* how big is the user initiated abort? */ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out; } if (hold_tcblock) { SCTP_TCB_UNLOCK(stcb); hold_tcblock = 0; } if (top) { struct mbuf *cntm = NULL; mm = sctp_get_mbuf_for_msg(sizeof(struct sctp_paramhdr), 0, M_WAITOK, 1, MT_DATA); if (sndlen != 0) { for (cntm = top; cntm; cntm = SCTP_BUF_NEXT(cntm)) { tot_out += SCTP_BUF_LEN(cntm); } } } else { /* Must fit in a MTU */ tot_out = sndlen; tot_demand = (tot_out + sizeof(struct sctp_paramhdr)); if (tot_demand > SCTP_DEFAULT_ADD_MORE) { /* To big */ SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, EMSGSIZE); error = EMSGSIZE; goto out; } mm = sctp_get_mbuf_for_msg((unsigned int)tot_demand, 0, M_WAITOK, 1, MT_DATA); } if (mm == NULL) { SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, ENOMEM); error = ENOMEM; goto out; } max_out = asoc->smallest_mtu - sizeof(struct sctp_paramhdr); max_out -= sizeof(struct sctp_abort_msg); if (tot_out > max_out) { tot_out = max_out; } if (mm) { struct sctp_paramhdr *ph; /* now move forward the data pointer */ ph = mtod(mm, struct sctp_paramhdr *); ph->param_type = htons(SCTP_CAUSE_USER_INITIATED_ABT); ph->param_length = htons((uint16_t)(sizeof(struct sctp_paramhdr) + tot_out)); ph++; SCTP_BUF_LEN(mm) = (int)(tot_out + sizeof(struct sctp_paramhdr)); if (top == NULL) { error = uiomove((caddr_t)ph, (int)tot_out, uio); if (error) { /*- * Here if we can't get his data we * still abort we just don't get to * send the users note :-0 */ sctp_m_freem(mm); mm = NULL; } } else { if (sndlen != 0) { SCTP_BUF_NEXT(mm) = top; } } } if (hold_tcblock == 0) { SCTP_TCB_LOCK(stcb); } atomic_add_int(&stcb->asoc.refcnt, -1); free_cnt_applied = 0; /* release this lock, otherwise we hang on ourselves */ NET_EPOCH_ENTER(et); sctp_abort_an_association(stcb->sctp_ep, stcb, mm, SCTP_SO_LOCKED); NET_EPOCH_EXIT(et); /* now relock the stcb so everything is sane */ hold_tcblock = 0; stcb = NULL; /* * In this case top is already chained to mm avoid double * free, since we free it below if top != NULL and driver * would free it after sending the packet out */ if (sndlen != 0) { top = NULL; } goto out_unlocked; } /* Calculate the maximum we can send */ inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * SCTP_DATA_CHUNK_OVERHEAD(stcb)); if (SCTP_SB_LIMIT_SND(so) > inqueue_bytes) { max_len = SCTP_SB_LIMIT_SND(so) - inqueue_bytes; } else { max_len = 0; } if (hold_tcblock) { SCTP_TCB_UNLOCK(stcb); hold_tcblock = 0; } if (asoc->strmout == NULL) { /* huh? software error */ SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EFAULT); error = EFAULT; goto out_unlocked; } /* Unless E_EOR mode is on, we must make a send FIT in one call. */ if ((user_marks_eor == 0) && (sndlen > (ssize_t)SCTP_SB_LIMIT_SND(stcb->sctp_socket))) { /* It will NEVER fit */ SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, EMSGSIZE); error = EMSGSIZE; goto out_unlocked; } if ((uio == NULL) && user_marks_eor) { /*- * We do not support eeor mode for * sending with mbuf chains (like sendfile). */ SCTP_LTRACE_ERR_RET(NULL, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out_unlocked; } if (user_marks_eor) { local_add_more = (ssize_t)min(SCTP_SB_LIMIT_SND(so), SCTP_BASE_SYSCTL(sctp_add_more_threshold)); } else { /*- * For non-eeor the whole message must fit in * the socket send buffer. */ local_add_more = sndlen; } len = 0; if (non_blocking) { goto skip_preblock; } if (((max_len <= local_add_more) && ((ssize_t)SCTP_SB_LIMIT_SND(so) >= local_add_more)) || (max_len == 0) || ((stcb->asoc.chunks_on_out_queue + stcb->asoc.stream_queue_cnt) >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue))) { /* No room right now ! */ SOCKBUF_LOCK(&so->so_snd); inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * SCTP_DATA_CHUNK_OVERHEAD(stcb)); while ((SCTP_SB_LIMIT_SND(so) < (inqueue_bytes + local_add_more)) || ((stcb->asoc.stream_queue_cnt + stcb->asoc.chunks_on_out_queue) >= SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue))) { SCTPDBG(SCTP_DEBUG_OUTPUT1, "pre_block limit:%u <(inq:%d + %zd) || (%d+%d > %d)\n", (unsigned int)SCTP_SB_LIMIT_SND(so), inqueue_bytes, local_add_more, stcb->asoc.stream_queue_cnt, stcb->asoc.chunks_on_out_queue, SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue)); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) { sctp_log_block(SCTP_BLOCK_LOG_INTO_BLKA, asoc, sndlen); } be.error = 0; stcb->block_entry = &be; error = sbwait(&so->so_snd); stcb->block_entry = NULL; if (error || so->so_error || be.error) { if (error == 0) { if (so->so_error) error = so->so_error; if (be.error) { error = be.error; } } SOCKBUF_UNLOCK(&so->so_snd); goto out_unlocked; } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) { sctp_log_block(SCTP_BLOCK_LOG_OUTOF_BLK, asoc, stcb->asoc.total_output_queue_size); } if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { SOCKBUF_UNLOCK(&so->so_snd); goto out_unlocked; } inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * SCTP_DATA_CHUNK_OVERHEAD(stcb)); } if (SCTP_SB_LIMIT_SND(so) > inqueue_bytes) { max_len = SCTP_SB_LIMIT_SND(so) - inqueue_bytes; } else { max_len = 0; } SOCKBUF_UNLOCK(&so->so_snd); } skip_preblock: if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { goto out_unlocked; } /* * sndlen covers for mbuf case uio_resid covers for the non-mbuf * case NOTE: uio will be null when top/mbuf is passed */ if (sndlen == 0) { if (sinfo_flags & SCTP_EOF) { got_all_of_the_send = 1; goto dataless_eof; } else { SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out; } } if (top == NULL) { struct sctp_stream_queue_pending *sp; struct sctp_stream_out *strm; uint32_t sndout; SCTP_TCB_SEND_LOCK(stcb); if ((asoc->stream_locked) && (asoc->stream_locked_on != srcv->sinfo_stream)) { SCTP_TCB_SEND_UNLOCK(stcb); SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); error = EINVAL; goto out; } SCTP_TCB_SEND_UNLOCK(stcb); strm = &stcb->asoc.strmout[srcv->sinfo_stream]; if (strm->last_msg_incomplete == 0) { do_a_copy_in: sp = sctp_copy_it_in(stcb, asoc, srcv, uio, net, max_len, user_marks_eor, &error); if (error) { goto out; } SCTP_TCB_SEND_LOCK(stcb); if (sp->msg_is_complete) { strm->last_msg_incomplete = 0; asoc->stream_locked = 0; } else { /* * Just got locked to this guy in case of an * interrupt. */ strm->last_msg_incomplete = 1; if (stcb->asoc.idata_supported == 0) { asoc->stream_locked = 1; asoc->stream_locked_on = srcv->sinfo_stream; } sp->sender_all_done = 0; } sctp_snd_sb_alloc(stcb, sp->length); atomic_add_int(&asoc->stream_queue_cnt, 1); if (sinfo_flags & SCTP_UNORDERED) { SCTP_STAT_INCR(sctps_sends_with_unord); } + sp->processing = 1; TAILQ_INSERT_TAIL(&strm->outqueue, sp, next); stcb->asoc.ss_functions.sctp_ss_add_to_stream(stcb, asoc, strm, sp, 1); SCTP_TCB_SEND_UNLOCK(stcb); } else { SCTP_TCB_SEND_LOCK(stcb); sp = TAILQ_LAST(&strm->outqueue, sctp_streamhead); + if (sp->processing) { + SCTP_TCB_SEND_UNLOCK(stcb); + SCTP_LTRACE_ERR_RET(inp, stcb, net, SCTP_FROM_SCTP_OUTPUT, EINVAL); + error = EINVAL; + goto out; + } else { + sp->processing = 1; + } SCTP_TCB_SEND_UNLOCK(stcb); if (sp == NULL) { /* ???? Huh ??? last msg is gone */ #ifdef INVARIANTS panic("Warning: Last msg marked incomplete, yet nothing left?"); #else SCTP_PRINTF("Warning: Last msg marked incomplete, yet nothing left?\n"); strm->last_msg_incomplete = 0; #endif goto do_a_copy_in; } } while (uio->uio_resid > 0) { /* How much room do we have? */ struct mbuf *new_tail, *mm; inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * SCTP_DATA_CHUNK_OVERHEAD(stcb)); if (SCTP_SB_LIMIT_SND(so) > inqueue_bytes) max_len = SCTP_SB_LIMIT_SND(so) - inqueue_bytes; else max_len = 0; if ((max_len > (ssize_t)SCTP_BASE_SYSCTL(sctp_add_more_threshold)) || (max_len && (SCTP_SB_LIMIT_SND(so) < SCTP_BASE_SYSCTL(sctp_add_more_threshold))) || (uio->uio_resid && (uio->uio_resid <= max_len))) { sndout = 0; new_tail = NULL; if (hold_tcblock) { SCTP_TCB_UNLOCK(stcb); hold_tcblock = 0; } mm = sctp_copy_resume(uio, (int)max_len, user_marks_eor, &error, &sndout, &new_tail); if ((mm == NULL) || error) { if (mm) { sctp_m_freem(mm); } goto out; } /* Update the mbuf and count */ SCTP_TCB_SEND_LOCK(stcb); - if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { + if ((stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) || + (stcb->asoc.state & SCTP_STATE_WAS_ABORTED)) { /* * we need to get out. Peer probably * aborted. */ sctp_m_freem(mm); - if (stcb->asoc.state & SCTP_PCB_FLAGS_WAS_ABORTED) { + if (stcb->asoc.state & SCTP_STATE_WAS_ABORTED) { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTP_OUTPUT, ECONNRESET); error = ECONNRESET; } SCTP_TCB_SEND_UNLOCK(stcb); goto out; } if (sp->tail_mbuf) { /* tack it to the end */ SCTP_BUF_NEXT(sp->tail_mbuf) = mm; sp->tail_mbuf = new_tail; } else { /* A stolen mbuf */ sp->data = mm; sp->tail_mbuf = new_tail; } sctp_snd_sb_alloc(stcb, sndout); atomic_add_int(&sp->length, sndout); len += sndout; if (sinfo_flags & SCTP_SACK_IMMEDIATELY) { sp->sinfo_flags |= SCTP_SACK_IMMEDIATELY; } /* Did we reach EOR? */ if ((uio->uio_resid == 0) && ((user_marks_eor == 0) || (sinfo_flags & SCTP_EOF) || (user_marks_eor && (sinfo_flags & SCTP_EOR)))) { sp->msg_is_complete = 1; } else { sp->msg_is_complete = 0; } SCTP_TCB_SEND_UNLOCK(stcb); } if (uio->uio_resid == 0) { /* got it all? */ continue; } /* PR-SCTP? */ if ((asoc->prsctp_supported) && (asoc->sent_queue_cnt_removeable > 0)) { /* * This is ugly but we must assure locking * order */ if (hold_tcblock == 0) { SCTP_TCB_LOCK(stcb); hold_tcblock = 1; } sctp_prune_prsctp(stcb, asoc, srcv, (int)sndlen); inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * SCTP_DATA_CHUNK_OVERHEAD(stcb)); if (SCTP_SB_LIMIT_SND(so) > inqueue_bytes) max_len = SCTP_SB_LIMIT_SND(so) - inqueue_bytes; else max_len = 0; if (max_len > 0) { continue; } SCTP_TCB_UNLOCK(stcb); hold_tcblock = 0; } /* wait for space now */ if (non_blocking) { /* Non-blocking io in place out */ goto skip_out_eof; } /* What about the INIT, send it maybe */ if (queue_only_for_init) { if (hold_tcblock == 0) { SCTP_TCB_LOCK(stcb); hold_tcblock = 1; } if (SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) { /* a collision took us forward? */ queue_only = 0; } else { NET_EPOCH_ENTER(et); sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED); NET_EPOCH_EXIT(et); SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_WAIT); queue_only = 1; } } if ((net->flight_size > net->cwnd) && (asoc->sctp_cmt_on_off == 0)) { SCTP_STAT_INCR(sctps_send_cwnd_avoid); queue_only = 1; } else if (asoc->ifp_had_enobuf) { SCTP_STAT_INCR(sctps_ifnomemqueued); if (net->flight_size > (2 * net->mtu)) { queue_only = 1; } asoc->ifp_had_enobuf = 0; } un_sent = stcb->asoc.total_output_queue_size - stcb->asoc.total_flight; if ((sctp_is_feature_off(inp, SCTP_PCB_FLAGS_NODELAY)) && (stcb->asoc.total_flight > 0) && (stcb->asoc.stream_queue_cnt < SCTP_MAX_DATA_BUNDLING) && (un_sent < (int)(stcb->asoc.smallest_mtu - SCTP_MIN_OVERHEAD))) { /*- * Ok, Nagle is set on and we have data outstanding. * Don't send anything and let SACKs drive out the * data unless we have a "full" segment to send. */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_NAGLE_LOGGING_ENABLE) { sctp_log_nagle_event(stcb, SCTP_NAGLE_APPLIED); } SCTP_STAT_INCR(sctps_naglequeued); nagle_applies = 1; } else { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_NAGLE_LOGGING_ENABLE) { if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_NODELAY)) sctp_log_nagle_event(stcb, SCTP_NAGLE_SKIPPED); } SCTP_STAT_INCR(sctps_naglesent); nagle_applies = 0; } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) { sctp_misc_ints(SCTP_CWNDLOG_PRESEND, queue_only_for_init, queue_only, nagle_applies, un_sent); sctp_misc_ints(SCTP_CWNDLOG_PRESEND, stcb->asoc.total_output_queue_size, stcb->asoc.total_flight, stcb->asoc.chunks_on_out_queue, stcb->asoc.total_flight_count); } if (queue_only_for_init) queue_only_for_init = 0; if ((queue_only == 0) && (nagle_applies == 0)) { /*- * need to start chunk output * before blocking.. note that if * a lock is already applied, then * the input via the net is happening * and I don't need to start output :-D */ NET_EPOCH_ENTER(et); if (hold_tcblock == 0) { if (SCTP_TCB_TRYLOCK(stcb)) { hold_tcblock = 1; sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED); } } else { sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED); } NET_EPOCH_EXIT(et); } if (hold_tcblock == 1) { SCTP_TCB_UNLOCK(stcb); hold_tcblock = 0; } SOCKBUF_LOCK(&so->so_snd); /*- * This is a bit strange, but I think it will * work. The total_output_queue_size is locked and * protected by the TCB_LOCK, which we just released. * There is a race that can occur between releasing it * above, and me getting the socket lock, where sacks * come in but we have not put the SB_WAIT on the * so_snd buffer to get the wakeup. After the LOCK * is applied the sack_processing will also need to * LOCK the so->so_snd to do the actual sowwakeup(). So * once we have the socket buffer lock if we recheck the * size we KNOW we will get to sleep safely with the * wakeup flag in place. */ inqueue_bytes = stcb->asoc.total_output_queue_size - (stcb->asoc.chunks_on_out_queue * SCTP_DATA_CHUNK_OVERHEAD(stcb)); if (SCTP_SB_LIMIT_SND(so) <= (inqueue_bytes + min(SCTP_BASE_SYSCTL(sctp_add_more_threshold), SCTP_SB_LIMIT_SND(so)))) { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) { sctp_log_block(SCTP_BLOCK_LOG_INTO_BLK, asoc, uio->uio_resid); } be.error = 0; stcb->block_entry = &be; error = sbwait(&so->so_snd); stcb->block_entry = NULL; if (error || so->so_error || be.error) { if (error == 0) { if (so->so_error) error = so->so_error; if (be.error) { error = be.error; } } SOCKBUF_UNLOCK(&so->so_snd); goto out_unlocked; } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) { sctp_log_block(SCTP_BLOCK_LOG_OUTOF_BLK, asoc, stcb->asoc.total_output_queue_size); } } SOCKBUF_UNLOCK(&so->so_snd); if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { goto out_unlocked; } } SCTP_TCB_SEND_LOCK(stcb); - if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { + if ((stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) || + (stcb->asoc.state & SCTP_STATE_WAS_ABORTED)) { SCTP_TCB_SEND_UNLOCK(stcb); goto out_unlocked; } if (sp) { if (sp->msg_is_complete == 0) { strm->last_msg_incomplete = 1; if (stcb->asoc.idata_supported == 0) { asoc->stream_locked = 1; asoc->stream_locked_on = srcv->sinfo_stream; } } else { sp->sender_all_done = 1; strm->last_msg_incomplete = 0; asoc->stream_locked = 0; } + sp->processing = 0; } else { SCTP_PRINTF("Huh no sp TSNH?\n"); strm->last_msg_incomplete = 0; asoc->stream_locked = 0; } SCTP_TCB_SEND_UNLOCK(stcb); if (uio->uio_resid == 0) { got_all_of_the_send = 1; } } else { /* We send in a 0, since we do NOT have any locks */ error = sctp_msg_append(stcb, net, top, srcv, 0); top = NULL; if (sinfo_flags & SCTP_EOF) { got_all_of_the_send = 1; } } if (error) { goto out; } dataless_eof: /* EOF thing ? */ if ((sinfo_flags & SCTP_EOF) && (got_all_of_the_send == 1)) { SCTP_STAT_INCR(sctps_sends_with_eof); error = 0; if (hold_tcblock == 0) { SCTP_TCB_LOCK(stcb); hold_tcblock = 1; } if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && sctp_is_there_unsent_data(stcb, SCTP_SO_LOCKED) == 0) { if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) { goto abort_anyway; } /* there is nothing queued to send, so I'm done... */ if ((SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT) && (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_RECEIVED) && (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { struct sctp_nets *netp; /* only send SHUTDOWN the first time through */ if (SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } SCTP_SET_STATE(stcb, SCTP_STATE_SHUTDOWN_SENT); sctp_stop_timers_for_shutdown(stcb); if (stcb->asoc.alternate) { netp = stcb->asoc.alternate; } else { netp = stcb->asoc.primary_destination; } sctp_send_shutdown(stcb, netp); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, stcb->sctp_ep, stcb, netp); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, NULL); } } else { /*- * we still got (or just got) data to send, so set * SHUTDOWN_PENDING */ /*- * XXX sockets draft says that SCTP_EOF should be * sent with no data. currently, we will allow user * data to be sent first and move to * SHUTDOWN-PENDING */ if ((SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT) && (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_RECEIVED) && (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { if (hold_tcblock == 0) { SCTP_TCB_LOCK(stcb); hold_tcblock = 1; } if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) { SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_PARTIAL_MSG_LEFT); } SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_SHUTDOWN_PENDING); if (TAILQ_EMPTY(&asoc->send_queue) && TAILQ_EMPTY(&asoc->sent_queue) && (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) { struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; abort_anyway: if (free_cnt_applied) { atomic_add_int(&stcb->asoc.refcnt, -1); free_cnt_applied = 0; } SCTP_SNPRINTF(msg, sizeof(msg), "%s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); NET_EPOCH_ENTER(et); sctp_abort_an_association(stcb->sctp_ep, stcb, op_err, SCTP_SO_LOCKED); NET_EPOCH_EXIT(et); /* * now relock the stcb so everything * is sane */ hold_tcblock = 0; stcb = NULL; goto out; } sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, NULL); sctp_feature_off(inp, SCTP_PCB_FLAGS_NODELAY); } } } skip_out_eof: if (!TAILQ_EMPTY(&stcb->asoc.control_send_queue)) { some_on_control = 1; } if (queue_only_for_init) { if (hold_tcblock == 0) { SCTP_TCB_LOCK(stcb); hold_tcblock = 1; } if (SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) { /* a collision took us forward? */ queue_only = 0; } else { NET_EPOCH_ENTER(et); sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED); NET_EPOCH_EXIT(et); SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_WAIT); queue_only = 1; } } if ((net->flight_size > net->cwnd) && (stcb->asoc.sctp_cmt_on_off == 0)) { SCTP_STAT_INCR(sctps_send_cwnd_avoid); queue_only = 1; } else if (asoc->ifp_had_enobuf) { SCTP_STAT_INCR(sctps_ifnomemqueued); if (net->flight_size > (2 * net->mtu)) { queue_only = 1; } asoc->ifp_had_enobuf = 0; } un_sent = stcb->asoc.total_output_queue_size - stcb->asoc.total_flight; if ((sctp_is_feature_off(inp, SCTP_PCB_FLAGS_NODELAY)) && (stcb->asoc.total_flight > 0) && (stcb->asoc.stream_queue_cnt < SCTP_MAX_DATA_BUNDLING) && (un_sent < (int)(stcb->asoc.smallest_mtu - SCTP_MIN_OVERHEAD))) { /*- * Ok, Nagle is set on and we have data outstanding. * Don't send anything and let SACKs drive out the * data unless wen have a "full" segment to send. */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_NAGLE_LOGGING_ENABLE) { sctp_log_nagle_event(stcb, SCTP_NAGLE_APPLIED); } SCTP_STAT_INCR(sctps_naglequeued); nagle_applies = 1; } else { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_NAGLE_LOGGING_ENABLE) { if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_NODELAY)) sctp_log_nagle_event(stcb, SCTP_NAGLE_SKIPPED); } SCTP_STAT_INCR(sctps_naglesent); nagle_applies = 0; } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_BLK_LOGGING_ENABLE) { sctp_misc_ints(SCTP_CWNDLOG_PRESEND, queue_only_for_init, queue_only, nagle_applies, un_sent); sctp_misc_ints(SCTP_CWNDLOG_PRESEND, stcb->asoc.total_output_queue_size, stcb->asoc.total_flight, stcb->asoc.chunks_on_out_queue, stcb->asoc.total_flight_count); } NET_EPOCH_ENTER(et); if ((queue_only == 0) && (nagle_applies == 0) && (stcb->asoc.peers_rwnd && un_sent)) { /* we can attempt to send too. */ if (hold_tcblock == 0) { /* * If there is activity recv'ing sacks no need to * send */ if (SCTP_TCB_TRYLOCK(stcb)) { sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED); hold_tcblock = 1; } } else { sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED); } } else if ((queue_only == 0) && (stcb->asoc.peers_rwnd == 0) && (stcb->asoc.total_flight == 0)) { /* We get to have a probe outstanding */ if (hold_tcblock == 0) { hold_tcblock = 1; SCTP_TCB_LOCK(stcb); } sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_USR_SEND, SCTP_SO_LOCKED); } else if (some_on_control) { int num_out, reason, frag_point; /* Here we do control only */ if (hold_tcblock == 0) { hold_tcblock = 1; SCTP_TCB_LOCK(stcb); } frag_point = sctp_get_frag_point(stcb, &stcb->asoc); (void)sctp_med_chunk_output(inp, stcb, &stcb->asoc, &num_out, &reason, 1, 1, &now, &now_filled, frag_point, SCTP_SO_LOCKED); } NET_EPOCH_EXIT(et); SCTPDBG(SCTP_DEBUG_OUTPUT1, "USR Send complete qo:%d prw:%d unsent:%d tf:%d cooq:%d toqs:%d err:%d\n", queue_only, stcb->asoc.peers_rwnd, un_sent, stcb->asoc.total_flight, stcb->asoc.chunks_on_out_queue, stcb->asoc.total_output_queue_size, error); out: out_unlocked: if (local_soresv && stcb) { atomic_subtract_int(&stcb->asoc.sb_send_resv, sndlen); } if (create_lock_applied) { SCTP_ASOC_CREATE_UNLOCK(inp); } if ((stcb) && hold_tcblock) { SCTP_TCB_UNLOCK(stcb); } if (stcb && free_cnt_applied) { atomic_add_int(&stcb->asoc.refcnt, -1); } #ifdef INVARIANTS if (stcb) { if (mtx_owned(&stcb->tcb_mtx)) { panic("Leaving with tcb mtx owned?"); } if (mtx_owned(&stcb->tcb_send_mtx)) { panic("Leaving with tcb send mtx owned?"); } } #endif if (top) { sctp_m_freem(top); } if (control) { sctp_m_freem(control); } return (error); } /* * generate an AUTHentication chunk, if required */ struct mbuf * sctp_add_auth_chunk(struct mbuf *m, struct mbuf **m_end, struct sctp_auth_chunk **auth_ret, uint32_t *offset, struct sctp_tcb *stcb, uint8_t chunk) { struct mbuf *m_auth; struct sctp_auth_chunk *auth; int chunk_len; struct mbuf *cn; if ((m_end == NULL) || (auth_ret == NULL) || (offset == NULL) || (stcb == NULL)) return (m); if (stcb->asoc.auth_supported == 0) { return (m); } /* does the requested chunk require auth? */ if (!sctp_auth_is_required_chunk(chunk, stcb->asoc.peer_auth_chunks)) { return (m); } m_auth = sctp_get_mbuf_for_msg(sizeof(*auth), 0, M_NOWAIT, 1, MT_HEADER); if (m_auth == NULL) { /* no mbuf's */ return (m); } /* reserve some space if this will be the first mbuf */ if (m == NULL) SCTP_BUF_RESV_UF(m_auth, SCTP_MIN_OVERHEAD); /* fill in the AUTH chunk details */ auth = mtod(m_auth, struct sctp_auth_chunk *); memset(auth, 0, sizeof(*auth)); auth->ch.chunk_type = SCTP_AUTHENTICATION; auth->ch.chunk_flags = 0; chunk_len = sizeof(*auth) + sctp_get_hmac_digest_len(stcb->asoc.peer_hmac_id); auth->ch.chunk_length = htons(chunk_len); auth->hmac_id = htons(stcb->asoc.peer_hmac_id); /* key id and hmac digest will be computed and filled in upon send */ /* save the offset where the auth was inserted into the chain */ *offset = 0; for (cn = m; cn; cn = SCTP_BUF_NEXT(cn)) { *offset += SCTP_BUF_LEN(cn); } /* update length and return pointer to the auth chunk */ SCTP_BUF_LEN(m_auth) = chunk_len; m = sctp_copy_mbufchain(m_auth, m, m_end, 1, chunk_len, 0); if (auth_ret != NULL) *auth_ret = auth; return (m); } #ifdef INET6 int sctp_v6src_match_nexthop(struct sockaddr_in6 *src6, sctp_route_t *ro) { struct nd_prefix *pfx = NULL; struct nd_pfxrouter *pfxrtr = NULL; struct sockaddr_in6 gw6; if (ro == NULL || ro->ro_nh == NULL || src6->sin6_family != AF_INET6) return (0); /* get prefix entry of address */ ND6_RLOCK(); LIST_FOREACH(pfx, &MODULE_GLOBAL(nd_prefix), ndpr_entry) { if (pfx->ndpr_stateflags & NDPRF_DETACHED) continue; if (IN6_ARE_MASKED_ADDR_EQUAL(&pfx->ndpr_prefix.sin6_addr, &src6->sin6_addr, &pfx->ndpr_mask)) break; } /* no prefix entry in the prefix list */ if (pfx == NULL) { ND6_RUNLOCK(); SCTPDBG(SCTP_DEBUG_OUTPUT2, "No prefix entry for "); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, (struct sockaddr *)src6); return (0); } SCTPDBG(SCTP_DEBUG_OUTPUT2, "v6src_match_nexthop(), Prefix entry is "); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, (struct sockaddr *)src6); /* search installed gateway from prefix entry */ LIST_FOREACH(pfxrtr, &pfx->ndpr_advrtrs, pfr_entry) { memset(&gw6, 0, sizeof(struct sockaddr_in6)); gw6.sin6_family = AF_INET6; gw6.sin6_len = sizeof(struct sockaddr_in6); memcpy(&gw6.sin6_addr, &pfxrtr->router->rtaddr, sizeof(struct in6_addr)); SCTPDBG(SCTP_DEBUG_OUTPUT2, "prefix router is "); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, (struct sockaddr *)&gw6); SCTPDBG(SCTP_DEBUG_OUTPUT2, "installed router is "); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &ro->ro_nh->gw_sa); if (sctp_cmpaddr((struct sockaddr *)&gw6, &ro->ro_nh->gw_sa)) { ND6_RUNLOCK(); SCTPDBG(SCTP_DEBUG_OUTPUT2, "pfxrouter is installed\n"); return (1); } } ND6_RUNLOCK(); SCTPDBG(SCTP_DEBUG_OUTPUT2, "pfxrouter is not installed\n"); return (0); } #endif int sctp_v4src_match_nexthop(struct sctp_ifa *sifa, sctp_route_t *ro) { #ifdef INET struct sockaddr_in *sin, *mask; struct ifaddr *ifa; struct in_addr srcnetaddr, gwnetaddr; if (ro == NULL || ro->ro_nh == NULL || sifa->address.sa.sa_family != AF_INET) { return (0); } ifa = (struct ifaddr *)sifa->ifa; mask = (struct sockaddr_in *)(ifa->ifa_netmask); sin = &sifa->address.sin; srcnetaddr.s_addr = (sin->sin_addr.s_addr & mask->sin_addr.s_addr); SCTPDBG(SCTP_DEBUG_OUTPUT1, "match_nexthop4: src address is "); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &sifa->address.sa); SCTPDBG(SCTP_DEBUG_OUTPUT1, "network address is %x\n", srcnetaddr.s_addr); sin = &ro->ro_nh->gw4_sa; gwnetaddr.s_addr = (sin->sin_addr.s_addr & mask->sin_addr.s_addr); SCTPDBG(SCTP_DEBUG_OUTPUT1, "match_nexthop4: nexthop is "); SCTPDBG_ADDR(SCTP_DEBUG_OUTPUT2, &ro->ro_nh->gw_sa); SCTPDBG(SCTP_DEBUG_OUTPUT1, "network address is %x\n", gwnetaddr.s_addr); if (srcnetaddr.s_addr == gwnetaddr.s_addr) { return (1); } #endif return (0); } Index: projects/clang1100-import/sys/netinet/sctp_pcb.c =================================================================== --- projects/clang1100-import/sys/netinet/sctp_pcb.c (revision 364278) +++ projects/clang1100-import/sys/netinet/sctp_pcb.c (revision 364279) @@ -1,7138 +1,7143 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #endif #ifdef INET6 #include #endif #include #include #include /* FIX: we don't handle multiple link local scopes */ /* "scopeless" replacement IN6_ARE_ADDR_EQUAL */ #ifdef INET6 int SCTP6_ARE_ADDR_EQUAL(struct sockaddr_in6 *a, struct sockaddr_in6 *b) { struct sockaddr_in6 tmp_a, tmp_b; memcpy(&tmp_a, a, sizeof(struct sockaddr_in6)); if (sa6_embedscope(&tmp_a, MODULE_GLOBAL(ip6_use_defzone)) != 0) { return (0); } memcpy(&tmp_b, b, sizeof(struct sockaddr_in6)); if (sa6_embedscope(&tmp_b, MODULE_GLOBAL(ip6_use_defzone)) != 0) { return (0); } return (IN6_ARE_ADDR_EQUAL(&tmp_a.sin6_addr, &tmp_b.sin6_addr)); } #endif void sctp_fill_pcbinfo(struct sctp_pcbinfo *spcb) { /* * We really don't need to lock this, but I will just because it * does not hurt. */ SCTP_INP_INFO_RLOCK(); spcb->ep_count = SCTP_BASE_INFO(ipi_count_ep); spcb->asoc_count = SCTP_BASE_INFO(ipi_count_asoc); spcb->laddr_count = SCTP_BASE_INFO(ipi_count_laddr); spcb->raddr_count = SCTP_BASE_INFO(ipi_count_raddr); spcb->chk_count = SCTP_BASE_INFO(ipi_count_chunk); spcb->readq_count = SCTP_BASE_INFO(ipi_count_readq); spcb->stream_oque = SCTP_BASE_INFO(ipi_count_strmoq); spcb->free_chunks = SCTP_BASE_INFO(ipi_free_chunks); SCTP_INP_INFO_RUNLOCK(); } /*- * Addresses are added to VRF's (Virtual Router's). For BSD we * have only the default VRF 0. We maintain a hash list of * VRF's. Each VRF has its own list of sctp_ifn's. Each of * these has a list of addresses. When we add a new address * to a VRF we lookup the ifn/ifn_index, if the ifn does * not exist we create it and add it to the list of IFN's * within the VRF. Once we have the sctp_ifn, we add the * address to the list. So we look something like: * * hash-vrf-table * vrf-> ifn-> ifn -> ifn * vrf | * ... +--ifa-> ifa -> ifa * vrf * * We keep these separate lists since the SCTP subsystem will * point to these from its source address selection nets structure. * When an address is deleted it does not happen right away on * the SCTP side, it gets scheduled. What we do when a * delete happens is immediately remove the address from * the master list and decrement the refcount. As our * addip iterator works through and frees the src address * selection pointing to the sctp_ifa, eventually the refcount * will reach 0 and we will delete it. Note that it is assumed * that any locking on system level ifn/ifa is done at the * caller of these functions and these routines will only * lock the SCTP structures as they add or delete things. * * Other notes on VRF concepts. * - An endpoint can be in multiple VRF's * - An association lives within a VRF and only one VRF. * - Any incoming packet we can deduce the VRF for by * looking at the mbuf/pak inbound (for BSD its VRF=0 :D) * - Any downward send call or connect call must supply the * VRF via ancillary data or via some sort of set default * VRF socket option call (again for BSD no brainer since * the VRF is always 0). * - An endpoint may add multiple VRF's to it. * - Listening sockets can accept associations in any * of the VRF's they are in but the assoc will end up * in only one VRF (gotten from the packet or connect/send). * */ struct sctp_vrf * sctp_allocate_vrf(int vrf_id) { struct sctp_vrf *vrf = NULL; struct sctp_vrflist *bucket; /* First allocate the VRF structure */ vrf = sctp_find_vrf(vrf_id); if (vrf) { /* Already allocated */ return (vrf); } SCTP_MALLOC(vrf, struct sctp_vrf *, sizeof(struct sctp_vrf), SCTP_M_VRF); if (vrf == NULL) { /* No memory */ #ifdef INVARIANTS panic("No memory for VRF:%d", vrf_id); #endif return (NULL); } /* setup the VRF */ memset(vrf, 0, sizeof(struct sctp_vrf)); vrf->vrf_id = vrf_id; LIST_INIT(&vrf->ifnlist); vrf->total_ifa_count = 0; vrf->refcount = 0; /* now also setup table ids */ SCTP_INIT_VRF_TABLEID(vrf); /* Init the HASH of addresses */ vrf->vrf_addr_hash = SCTP_HASH_INIT(SCTP_VRF_ADDR_HASH_SIZE, &vrf->vrf_addr_hashmark); if (vrf->vrf_addr_hash == NULL) { /* No memory */ #ifdef INVARIANTS panic("No memory for VRF:%d", vrf_id); #endif SCTP_FREE(vrf, SCTP_M_VRF); return (NULL); } /* Add it to the hash table */ bucket = &SCTP_BASE_INFO(sctp_vrfhash)[(vrf_id & SCTP_BASE_INFO(hashvrfmark))]; LIST_INSERT_HEAD(bucket, vrf, next_vrf); atomic_add_int(&SCTP_BASE_INFO(ipi_count_vrfs), 1); return (vrf); } struct sctp_ifn * sctp_find_ifn(void *ifn, uint32_t ifn_index) { struct sctp_ifn *sctp_ifnp; struct sctp_ifnlist *hash_ifn_head; /* * We assume the lock is held for the addresses if that's wrong * problems could occur :-) */ SCTP_IPI_ADDR_LOCK_ASSERT(); hash_ifn_head = &SCTP_BASE_INFO(vrf_ifn_hash)[(ifn_index & SCTP_BASE_INFO(vrf_ifn_hashmark))]; LIST_FOREACH(sctp_ifnp, hash_ifn_head, next_bucket) { if (sctp_ifnp->ifn_index == ifn_index) { return (sctp_ifnp); } if (sctp_ifnp->ifn_p && ifn && (sctp_ifnp->ifn_p == ifn)) { return (sctp_ifnp); } } return (NULL); } struct sctp_vrf * sctp_find_vrf(uint32_t vrf_id) { struct sctp_vrflist *bucket; struct sctp_vrf *liste; bucket = &SCTP_BASE_INFO(sctp_vrfhash)[(vrf_id & SCTP_BASE_INFO(hashvrfmark))]; LIST_FOREACH(liste, bucket, next_vrf) { if (vrf_id == liste->vrf_id) { return (liste); } } return (NULL); } void sctp_free_vrf(struct sctp_vrf *vrf) { if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&vrf->refcount)) { if (vrf->vrf_addr_hash) { SCTP_HASH_FREE(vrf->vrf_addr_hash, vrf->vrf_addr_hashmark); vrf->vrf_addr_hash = NULL; } /* We zero'd the count */ LIST_REMOVE(vrf, next_vrf); SCTP_FREE(vrf, SCTP_M_VRF); atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_vrfs), 1); } } void sctp_free_ifn(struct sctp_ifn *sctp_ifnp) { if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&sctp_ifnp->refcount)) { /* We zero'd the count */ if (sctp_ifnp->vrf) { sctp_free_vrf(sctp_ifnp->vrf); } SCTP_FREE(sctp_ifnp, SCTP_M_IFN); atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_ifns), 1); } } void sctp_update_ifn_mtu(uint32_t ifn_index, uint32_t mtu) { struct sctp_ifn *sctp_ifnp; sctp_ifnp = sctp_find_ifn((void *)NULL, ifn_index); if (sctp_ifnp != NULL) { sctp_ifnp->ifn_mtu = mtu; } } void sctp_free_ifa(struct sctp_ifa *sctp_ifap) { if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&sctp_ifap->refcount)) { /* We zero'd the count */ if (sctp_ifap->ifn_p) { sctp_free_ifn(sctp_ifap->ifn_p); } SCTP_FREE(sctp_ifap, SCTP_M_IFA); atomic_subtract_int(&SCTP_BASE_INFO(ipi_count_ifas), 1); } } static void sctp_delete_ifn(struct sctp_ifn *sctp_ifnp, int hold_addr_lock) { struct sctp_ifn *found; found = sctp_find_ifn(sctp_ifnp->ifn_p, sctp_ifnp->ifn_index); if (found == NULL) { /* Not in the list.. sorry */ return; } if (hold_addr_lock == 0) { SCTP_IPI_ADDR_WLOCK(); } else { SCTP_IPI_ADDR_WLOCK_ASSERT(); } LIST_REMOVE(sctp_ifnp, next_bucket); LIST_REMOVE(sctp_ifnp, next_ifn); if (hold_addr_lock == 0) { SCTP_IPI_ADDR_WUNLOCK(); } /* Take away the reference, and possibly free it */ sctp_free_ifn(sctp_ifnp); } void sctp_mark_ifa_addr_down(uint32_t vrf_id, struct sockaddr *addr, const char *if_name, uint32_t ifn_index) { struct sctp_vrf *vrf; struct sctp_ifa *sctp_ifap; SCTP_IPI_ADDR_RLOCK(); vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) { SCTPDBG(SCTP_DEBUG_PCB4, "Can't find vrf_id 0x%x\n", vrf_id); goto out; } sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED); if (sctp_ifap == NULL) { SCTPDBG(SCTP_DEBUG_PCB4, "Can't find sctp_ifap for address\n"); goto out; } if (sctp_ifap->ifn_p == NULL) { SCTPDBG(SCTP_DEBUG_PCB4, "IFA has no IFN - can't mark unusable\n"); goto out; } if (if_name) { if (strncmp(if_name, sctp_ifap->ifn_p->ifn_name, SCTP_IFNAMSIZ) != 0) { SCTPDBG(SCTP_DEBUG_PCB4, "IFN %s of IFA not the same as %s\n", sctp_ifap->ifn_p->ifn_name, if_name); goto out; } } else { if (sctp_ifap->ifn_p->ifn_index != ifn_index) { SCTPDBG(SCTP_DEBUG_PCB4, "IFA owned by ifn_index:%d down command for ifn_index:%d - ignored\n", sctp_ifap->ifn_p->ifn_index, ifn_index); goto out; } } sctp_ifap->localifa_flags &= (~SCTP_ADDR_VALID); sctp_ifap->localifa_flags |= SCTP_ADDR_IFA_UNUSEABLE; out: SCTP_IPI_ADDR_RUNLOCK(); } void sctp_mark_ifa_addr_up(uint32_t vrf_id, struct sockaddr *addr, const char *if_name, uint32_t ifn_index) { struct sctp_vrf *vrf; struct sctp_ifa *sctp_ifap; SCTP_IPI_ADDR_RLOCK(); vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) { SCTPDBG(SCTP_DEBUG_PCB4, "Can't find vrf_id 0x%x\n", vrf_id); goto out; } sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED); if (sctp_ifap == NULL) { SCTPDBG(SCTP_DEBUG_PCB4, "Can't find sctp_ifap for address\n"); goto out; } if (sctp_ifap->ifn_p == NULL) { SCTPDBG(SCTP_DEBUG_PCB4, "IFA has no IFN - can't mark unusable\n"); goto out; } if (if_name) { if (strncmp(if_name, sctp_ifap->ifn_p->ifn_name, SCTP_IFNAMSIZ) != 0) { SCTPDBG(SCTP_DEBUG_PCB4, "IFN %s of IFA not the same as %s\n", sctp_ifap->ifn_p->ifn_name, if_name); goto out; } } else { if (sctp_ifap->ifn_p->ifn_index != ifn_index) { SCTPDBG(SCTP_DEBUG_PCB4, "IFA owned by ifn_index:%d down command for ifn_index:%d - ignored\n", sctp_ifap->ifn_p->ifn_index, ifn_index); goto out; } } sctp_ifap->localifa_flags &= (~SCTP_ADDR_IFA_UNUSEABLE); sctp_ifap->localifa_flags |= SCTP_ADDR_VALID; out: SCTP_IPI_ADDR_RUNLOCK(); } /*- * Add an ifa to an ifn. * Register the interface as necessary. * NOTE: ADDR write lock MUST be held. */ static void sctp_add_ifa_to_ifn(struct sctp_ifn *sctp_ifnp, struct sctp_ifa *sctp_ifap) { int ifa_af; LIST_INSERT_HEAD(&sctp_ifnp->ifalist, sctp_ifap, next_ifa); sctp_ifap->ifn_p = sctp_ifnp; atomic_add_int(&sctp_ifap->ifn_p->refcount, 1); /* update address counts */ sctp_ifnp->ifa_count++; ifa_af = sctp_ifap->address.sa.sa_family; switch (ifa_af) { #ifdef INET case AF_INET: sctp_ifnp->num_v4++; break; #endif #ifdef INET6 case AF_INET6: sctp_ifnp->num_v6++; break; #endif default: break; } if (sctp_ifnp->ifa_count == 1) { /* register the new interface */ sctp_ifnp->registered_af = ifa_af; } } /*- * Remove an ifa from its ifn. * If no more addresses exist, remove the ifn too. Otherwise, re-register * the interface based on the remaining address families left. * NOTE: ADDR write lock MUST be held. */ static void sctp_remove_ifa_from_ifn(struct sctp_ifa *sctp_ifap) { LIST_REMOVE(sctp_ifap, next_ifa); if (sctp_ifap->ifn_p) { /* update address counts */ sctp_ifap->ifn_p->ifa_count--; switch (sctp_ifap->address.sa.sa_family) { #ifdef INET case AF_INET: sctp_ifap->ifn_p->num_v4--; break; #endif #ifdef INET6 case AF_INET6: sctp_ifap->ifn_p->num_v6--; break; #endif default: break; } if (LIST_EMPTY(&sctp_ifap->ifn_p->ifalist)) { /* remove the ifn, possibly freeing it */ sctp_delete_ifn(sctp_ifap->ifn_p, SCTP_ADDR_LOCKED); } else { /* re-register address family type, if needed */ if ((sctp_ifap->ifn_p->num_v6 == 0) && (sctp_ifap->ifn_p->registered_af == AF_INET6)) { sctp_ifap->ifn_p->registered_af = AF_INET; } else if ((sctp_ifap->ifn_p->num_v4 == 0) && (sctp_ifap->ifn_p->registered_af == AF_INET)) { sctp_ifap->ifn_p->registered_af = AF_INET6; } /* free the ifn refcount */ sctp_free_ifn(sctp_ifap->ifn_p); } sctp_ifap->ifn_p = NULL; } } struct sctp_ifa * sctp_add_addr_to_vrf(uint32_t vrf_id, void *ifn, uint32_t ifn_index, uint32_t ifn_type, const char *if_name, void *ifa, struct sockaddr *addr, uint32_t ifa_flags, int dynamic_add) { struct sctp_vrf *vrf; struct sctp_ifn *sctp_ifnp, *new_sctp_ifnp; struct sctp_ifa *sctp_ifap, *new_sctp_ifap; struct sctp_ifalist *hash_addr_head; struct sctp_ifnlist *hash_ifn_head; uint32_t hash_of_addr; int new_ifn_af = 0; #ifdef SCTP_DEBUG SCTPDBG(SCTP_DEBUG_PCB4, "vrf_id 0x%x: adding address: ", vrf_id); SCTPDBG_ADDR(SCTP_DEBUG_PCB4, addr); #endif SCTP_MALLOC(new_sctp_ifnp, struct sctp_ifn *, sizeof(struct sctp_ifn), SCTP_M_IFN); if (new_sctp_ifnp == NULL) { #ifdef INVARIANTS panic("No memory for IFN"); #endif return (NULL); } SCTP_MALLOC(new_sctp_ifap, struct sctp_ifa *, sizeof(struct sctp_ifa), SCTP_M_IFA); if (new_sctp_ifap == NULL) { #ifdef INVARIANTS panic("No memory for IFA"); #endif SCTP_FREE(new_sctp_ifnp, SCTP_M_IFN); return (NULL); } SCTP_IPI_ADDR_WLOCK(); sctp_ifnp = sctp_find_ifn(ifn, ifn_index); if (sctp_ifnp) { vrf = sctp_ifnp->vrf; } else { vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) { vrf = sctp_allocate_vrf(vrf_id); if (vrf == NULL) { SCTP_IPI_ADDR_WUNLOCK(); SCTP_FREE(new_sctp_ifnp, SCTP_M_IFN); SCTP_FREE(new_sctp_ifap, SCTP_M_IFA); return (NULL); } } } if (sctp_ifnp == NULL) { /* * build one and add it, can't hold lock until after malloc * done though. */ sctp_ifnp = new_sctp_ifnp; new_sctp_ifnp = NULL; memset(sctp_ifnp, 0, sizeof(struct sctp_ifn)); sctp_ifnp->ifn_index = ifn_index; sctp_ifnp->ifn_p = ifn; sctp_ifnp->ifn_type = ifn_type; sctp_ifnp->refcount = 0; sctp_ifnp->vrf = vrf; atomic_add_int(&vrf->refcount, 1); sctp_ifnp->ifn_mtu = SCTP_GATHER_MTU_FROM_IFN_INFO(ifn, ifn_index, addr->sa_family); if (if_name != NULL) { SCTP_SNPRINTF(sctp_ifnp->ifn_name, SCTP_IFNAMSIZ, "%s", if_name); } else { SCTP_SNPRINTF(sctp_ifnp->ifn_name, SCTP_IFNAMSIZ, "%s", "unknown"); } hash_ifn_head = &SCTP_BASE_INFO(vrf_ifn_hash)[(ifn_index & SCTP_BASE_INFO(vrf_ifn_hashmark))]; LIST_INIT(&sctp_ifnp->ifalist); LIST_INSERT_HEAD(hash_ifn_head, sctp_ifnp, next_bucket); LIST_INSERT_HEAD(&vrf->ifnlist, sctp_ifnp, next_ifn); atomic_add_int(&SCTP_BASE_INFO(ipi_count_ifns), 1); new_ifn_af = 1; } sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED); if (sctp_ifap) { /* Hmm, it already exists? */ if ((sctp_ifap->ifn_p) && (sctp_ifap->ifn_p->ifn_index == ifn_index)) { SCTPDBG(SCTP_DEBUG_PCB4, "Using existing ifn %s (0x%x) for ifa %p\n", sctp_ifap->ifn_p->ifn_name, ifn_index, (void *)sctp_ifap); if (new_ifn_af) { /* Remove the created one that we don't want */ sctp_delete_ifn(sctp_ifnp, SCTP_ADDR_LOCKED); } if (sctp_ifap->localifa_flags & SCTP_BEING_DELETED) { /* easy to solve, just switch back to active */ SCTPDBG(SCTP_DEBUG_PCB4, "Clearing deleted ifa flag\n"); sctp_ifap->localifa_flags = SCTP_ADDR_VALID; sctp_ifap->ifn_p = sctp_ifnp; atomic_add_int(&sctp_ifap->ifn_p->refcount, 1); } exit_stage_left: SCTP_IPI_ADDR_WUNLOCK(); if (new_sctp_ifnp != NULL) { SCTP_FREE(new_sctp_ifnp, SCTP_M_IFN); } SCTP_FREE(new_sctp_ifap, SCTP_M_IFA); return (sctp_ifap); } else { if (sctp_ifap->ifn_p) { /* * The last IFN gets the address, remove the * old one */ SCTPDBG(SCTP_DEBUG_PCB4, "Moving ifa %p from %s (0x%x) to %s (0x%x)\n", (void *)sctp_ifap, sctp_ifap->ifn_p->ifn_name, sctp_ifap->ifn_p->ifn_index, if_name, ifn_index); /* remove the address from the old ifn */ sctp_remove_ifa_from_ifn(sctp_ifap); /* move the address over to the new ifn */ sctp_add_ifa_to_ifn(sctp_ifnp, sctp_ifap); goto exit_stage_left; } else { /* repair ifnp which was NULL ? */ sctp_ifap->localifa_flags = SCTP_ADDR_VALID; SCTPDBG(SCTP_DEBUG_PCB4, "Repairing ifn %p for ifa %p\n", (void *)sctp_ifnp, (void *)sctp_ifap); sctp_add_ifa_to_ifn(sctp_ifnp, sctp_ifap); } goto exit_stage_left; } } sctp_ifap = new_sctp_ifap; memset(sctp_ifap, 0, sizeof(struct sctp_ifa)); sctp_ifap->ifn_p = sctp_ifnp; atomic_add_int(&sctp_ifnp->refcount, 1); sctp_ifap->vrf_id = vrf_id; sctp_ifap->ifa = ifa; memcpy(&sctp_ifap->address, addr, addr->sa_len); sctp_ifap->localifa_flags = SCTP_ADDR_VALID | SCTP_ADDR_DEFER_USE; sctp_ifap->flags = ifa_flags; /* Set scope */ switch (sctp_ifap->address.sa.sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; sin = &sctp_ifap->address.sin; if (SCTP_IFN_IS_IFT_LOOP(sctp_ifap->ifn_p) || (IN4_ISLOOPBACK_ADDRESS(&sin->sin_addr))) { sctp_ifap->src_is_loop = 1; } if ((IN4_ISPRIVATE_ADDRESS(&sin->sin_addr))) { sctp_ifap->src_is_priv = 1; } sctp_ifnp->num_v4++; if (new_ifn_af) new_ifn_af = AF_INET; break; } #endif #ifdef INET6 case AF_INET6: { /* ok to use deprecated addresses? */ struct sockaddr_in6 *sin6; sin6 = &sctp_ifap->address.sin6; if (SCTP_IFN_IS_IFT_LOOP(sctp_ifap->ifn_p) || (IN6_IS_ADDR_LOOPBACK(&sin6->sin6_addr))) { sctp_ifap->src_is_loop = 1; } if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { sctp_ifap->src_is_priv = 1; } sctp_ifnp->num_v6++; if (new_ifn_af) new_ifn_af = AF_INET6; break; } #endif default: new_ifn_af = 0; break; } hash_of_addr = sctp_get_ifa_hash_val(&sctp_ifap->address.sa); if ((sctp_ifap->src_is_priv == 0) && (sctp_ifap->src_is_loop == 0)) { sctp_ifap->src_is_glob = 1; } hash_addr_head = &vrf->vrf_addr_hash[(hash_of_addr & vrf->vrf_addr_hashmark)]; LIST_INSERT_HEAD(hash_addr_head, sctp_ifap, next_bucket); sctp_ifap->refcount = 1; LIST_INSERT_HEAD(&sctp_ifnp->ifalist, sctp_ifap, next_ifa); sctp_ifnp->ifa_count++; vrf->total_ifa_count++; atomic_add_int(&SCTP_BASE_INFO(ipi_count_ifas), 1); if (new_ifn_af) { sctp_ifnp->registered_af = new_ifn_af; } SCTP_IPI_ADDR_WUNLOCK(); if (new_sctp_ifnp != NULL) { SCTP_FREE(new_sctp_ifnp, SCTP_M_IFN); } if (dynamic_add) { /* * Bump up the refcount so that when the timer completes it * will drop back down. */ struct sctp_laddr *wi; atomic_add_int(&sctp_ifap->refcount, 1); wi = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); if (wi == NULL) { /* * Gak, what can we do? We have lost an address * change can you say HOSED? */ SCTPDBG(SCTP_DEBUG_PCB4, "Lost an address change?\n"); /* Opps, must decrement the count */ sctp_del_addr_from_vrf(vrf_id, addr, ifn_index, if_name); return (NULL); } SCTP_INCR_LADDR_COUNT(); memset(wi, 0, sizeof(*wi)); (void)SCTP_GETTIME_TIMEVAL(&wi->start_time); wi->ifa = sctp_ifap; wi->action = SCTP_ADD_IP_ADDRESS; SCTP_WQ_ADDR_LOCK(); LIST_INSERT_HEAD(&SCTP_BASE_INFO(addr_wq), wi, sctp_nxt_addr); sctp_timer_start(SCTP_TIMER_TYPE_ADDR_WQ, (struct sctp_inpcb *)NULL, (struct sctp_tcb *)NULL, (struct sctp_nets *)NULL); SCTP_WQ_ADDR_UNLOCK(); } else { /* it's ready for use */ sctp_ifap->localifa_flags &= ~SCTP_ADDR_DEFER_USE; } return (sctp_ifap); } void sctp_del_addr_from_vrf(uint32_t vrf_id, struct sockaddr *addr, uint32_t ifn_index, const char *if_name) { struct sctp_vrf *vrf; struct sctp_ifa *sctp_ifap = NULL; SCTP_IPI_ADDR_WLOCK(); vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) { SCTPDBG(SCTP_DEBUG_PCB4, "Can't find vrf_id 0x%x\n", vrf_id); goto out_now; } #ifdef SCTP_DEBUG SCTPDBG(SCTP_DEBUG_PCB4, "vrf_id 0x%x: deleting address:", vrf_id); SCTPDBG_ADDR(SCTP_DEBUG_PCB4, addr); #endif sctp_ifap = sctp_find_ifa_by_addr(addr, vrf->vrf_id, SCTP_ADDR_LOCKED); if (sctp_ifap) { /* Validate the delete */ if (sctp_ifap->ifn_p) { int valid = 0; /*- * The name has priority over the ifn_index * if its given. */ if (if_name) { if (strncmp(if_name, sctp_ifap->ifn_p->ifn_name, SCTP_IFNAMSIZ) == 0) { /* They match its a correct delete */ valid = 1; } } if (!valid) { /* last ditch check ifn_index */ if (ifn_index == sctp_ifap->ifn_p->ifn_index) { valid = 1; } } if (!valid) { SCTPDBG(SCTP_DEBUG_PCB4, "ifn:%d ifname:%s does not match addresses\n", ifn_index, ((if_name == NULL) ? "NULL" : if_name)); SCTPDBG(SCTP_DEBUG_PCB4, "ifn:%d ifname:%s - ignoring delete\n", sctp_ifap->ifn_p->ifn_index, sctp_ifap->ifn_p->ifn_name); SCTP_IPI_ADDR_WUNLOCK(); return; } } SCTPDBG(SCTP_DEBUG_PCB4, "Deleting ifa %p\n", (void *)sctp_ifap); sctp_ifap->localifa_flags &= SCTP_ADDR_VALID; /* * We don't set the flag. This means that the structure will * hang around in EP's that have bound specific to it until * they close. This gives us TCP like behavior if someone * removes an address (or for that matter adds it right * back). */ /* sctp_ifap->localifa_flags |= SCTP_BEING_DELETED; */ vrf->total_ifa_count--; LIST_REMOVE(sctp_ifap, next_bucket); sctp_remove_ifa_from_ifn(sctp_ifap); } #ifdef SCTP_DEBUG else { SCTPDBG(SCTP_DEBUG_PCB4, "Del Addr-ifn:%d Could not find address:", ifn_index); SCTPDBG_ADDR(SCTP_DEBUG_PCB1, addr); } #endif out_now: SCTP_IPI_ADDR_WUNLOCK(); if (sctp_ifap) { struct sctp_laddr *wi; wi = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); if (wi == NULL) { /* * Gak, what can we do? We have lost an address * change can you say HOSED? */ SCTPDBG(SCTP_DEBUG_PCB4, "Lost an address change?\n"); /* Oops, must decrement the count */ sctp_free_ifa(sctp_ifap); return; } SCTP_INCR_LADDR_COUNT(); memset(wi, 0, sizeof(*wi)); (void)SCTP_GETTIME_TIMEVAL(&wi->start_time); wi->ifa = sctp_ifap; wi->action = SCTP_DEL_IP_ADDRESS; SCTP_WQ_ADDR_LOCK(); /* * Should this really be a tailq? As it is we will process * the newest first :-0 */ LIST_INSERT_HEAD(&SCTP_BASE_INFO(addr_wq), wi, sctp_nxt_addr); sctp_timer_start(SCTP_TIMER_TYPE_ADDR_WQ, (struct sctp_inpcb *)NULL, (struct sctp_tcb *)NULL, (struct sctp_nets *)NULL); SCTP_WQ_ADDR_UNLOCK(); } return; } static int sctp_does_stcb_own_this_addr(struct sctp_tcb *stcb, struct sockaddr *to) { int loopback_scope; #if defined(INET) int ipv4_local_scope, ipv4_addr_legal; #endif #if defined(INET6) int local_scope, site_scope, ipv6_addr_legal; #endif struct sctp_vrf *vrf; struct sctp_ifn *sctp_ifn; struct sctp_ifa *sctp_ifa; loopback_scope = stcb->asoc.scope.loopback_scope; #if defined(INET) ipv4_local_scope = stcb->asoc.scope.ipv4_local_scope; ipv4_addr_legal = stcb->asoc.scope.ipv4_addr_legal; #endif #if defined(INET6) local_scope = stcb->asoc.scope.local_scope; site_scope = stcb->asoc.scope.site_scope; ipv6_addr_legal = stcb->asoc.scope.ipv6_addr_legal; #endif SCTP_IPI_ADDR_RLOCK(); vrf = sctp_find_vrf(stcb->asoc.vrf_id); if (vrf == NULL) { /* no vrf, no addresses */ SCTP_IPI_ADDR_RUNLOCK(); return (0); } if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { if ((loopback_scope == 0) && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) { continue; } LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { if (sctp_is_addr_restricted(stcb, sctp_ifa) && (!sctp_is_addr_pending(stcb, sctp_ifa))) { /* * We allow pending addresses, where * we have sent an asconf-add to be * considered valid. */ continue; } if (sctp_ifa->address.sa.sa_family != to->sa_family) { continue; } switch (sctp_ifa->address.sa.sa_family) { #ifdef INET case AF_INET: if (ipv4_addr_legal) { struct sockaddr_in *sin, *rsin; sin = &sctp_ifa->address.sin; rsin = (struct sockaddr_in *)to; if ((ipv4_local_scope == 0) && IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) { continue; } if (prison_check_ip4(stcb->sctp_ep->ip_inp.inp.inp_cred, &sin->sin_addr) != 0) { continue; } if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) { SCTP_IPI_ADDR_RUNLOCK(); return (1); } } break; #endif #ifdef INET6 case AF_INET6: if (ipv6_addr_legal) { struct sockaddr_in6 *sin6, *rsin6; sin6 = &sctp_ifa->address.sin6; rsin6 = (struct sockaddr_in6 *)to; if (prison_check_ip6(stcb->sctp_ep->ip_inp.inp.inp_cred, &sin6->sin6_addr) != 0) { continue; } if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { if (local_scope == 0) continue; if (sin6->sin6_scope_id == 0) { if (sa6_recoverscope(sin6) != 0) continue; } } if ((site_scope == 0) && (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))) { continue; } if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) { SCTP_IPI_ADDR_RUNLOCK(); return (1); } } break; #endif default: /* TSNH */ break; } } } } else { struct sctp_laddr *laddr; LIST_FOREACH(laddr, &stcb->sctp_ep->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) { SCTPDBG(SCTP_DEBUG_PCB1, "ifa being deleted\n"); continue; } if (sctp_is_addr_restricted(stcb, laddr->ifa) && (!sctp_is_addr_pending(stcb, laddr->ifa))) { /* * We allow pending addresses, where we have * sent an asconf-add to be considered * valid. */ continue; } if (laddr->ifa->address.sa.sa_family != to->sa_family) { continue; } switch (to->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin, *rsin; sin = &laddr->ifa->address.sin; rsin = (struct sockaddr_in *)to; if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) { SCTP_IPI_ADDR_RUNLOCK(); return (1); } break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6, *rsin6; sin6 = &laddr->ifa->address.sin6; rsin6 = (struct sockaddr_in6 *)to; if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) { SCTP_IPI_ADDR_RUNLOCK(); return (1); } break; } #endif default: /* TSNH */ break; } } } SCTP_IPI_ADDR_RUNLOCK(); return (0); } static struct sctp_tcb * sctp_tcb_special_locate(struct sctp_inpcb **inp_p, struct sockaddr *from, struct sockaddr *to, struct sctp_nets **netp, uint32_t vrf_id) { /**** ASSUMES THE CALLER holds the INP_INFO_RLOCK */ /* * If we support the TCP model, then we must now dig through to see * if we can find our endpoint in the list of tcp ep's. */ uint16_t lport, rport; struct sctppcbhead *ephead; struct sctp_inpcb *inp; struct sctp_laddr *laddr; struct sctp_tcb *stcb; struct sctp_nets *net; if ((to == NULL) || (from == NULL)) { return (NULL); } switch (to->sa_family) { #ifdef INET case AF_INET: if (from->sa_family == AF_INET) { lport = ((struct sockaddr_in *)to)->sin_port; rport = ((struct sockaddr_in *)from)->sin_port; } else { return (NULL); } break; #endif #ifdef INET6 case AF_INET6: if (from->sa_family == AF_INET6) { lport = ((struct sockaddr_in6 *)to)->sin6_port; rport = ((struct sockaddr_in6 *)from)->sin6_port; } else { return (NULL); } break; #endif default: return (NULL); } ephead = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR((lport | rport), SCTP_BASE_INFO(hashtcpmark))]; /* * Ok now for each of the guys in this bucket we must look and see: * - Does the remote port match. - Does there single association's * addresses match this address (to). If so we update p_ep to point * to this ep and return the tcb from it. */ LIST_FOREACH(inp, ephead, sctp_hash) { SCTP_INP_RLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { SCTP_INP_RUNLOCK(inp); continue; } if (lport != inp->sctp_lport) { SCTP_INP_RUNLOCK(inp); continue; } switch (to->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; sin = (struct sockaddr_in *)to; if (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sin->sin_addr) != 0) { SCTP_INP_RUNLOCK(inp); continue; } break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)to; if (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sin6->sin6_addr) != 0) { SCTP_INP_RUNLOCK(inp); continue; } break; } #endif default: SCTP_INP_RUNLOCK(inp); continue; } if (inp->def_vrf_id != vrf_id) { SCTP_INP_RUNLOCK(inp); continue; } /* check to see if the ep has one of the addresses */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) { /* We are NOT bound all, so look further */ int match = 0; LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n", __func__); continue; } if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) { SCTPDBG(SCTP_DEBUG_PCB1, "ifa being deleted\n"); continue; } if (laddr->ifa->address.sa.sa_family == to->sa_family) { /* see if it matches */ #ifdef INET if (from->sa_family == AF_INET) { struct sockaddr_in *intf_addr, *sin; intf_addr = &laddr->ifa->address.sin; sin = (struct sockaddr_in *)to; if (sin->sin_addr.s_addr == intf_addr->sin_addr.s_addr) { match = 1; break; } } #endif #ifdef INET6 if (from->sa_family == AF_INET6) { struct sockaddr_in6 *intf_addr6; struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *) to; intf_addr6 = &laddr->ifa->address.sin6; if (SCTP6_ARE_ADDR_EQUAL(sin6, intf_addr6)) { match = 1; break; } } #endif } } if (match == 0) { /* This endpoint does not have this address */ SCTP_INP_RUNLOCK(inp); continue; } } /* * Ok if we hit here the ep has the address, does it hold * the tcb? */ /* XXX: Why don't we TAILQ_FOREACH through sctp_asoc_list? */ stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb == NULL) { SCTP_INP_RUNLOCK(inp); continue; } SCTP_TCB_LOCK(stcb); if (!sctp_does_stcb_own_this_addr(stcb, to)) { SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); continue; } if (stcb->rport != rport) { /* remote port does not match. */ SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); continue; } if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); continue; } if (!sctp_does_stcb_own_this_addr(stcb, to)) { SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); continue; } /* Does this TCB have a matching address? */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (net->ro._l_addr.sa.sa_family != from->sa_family) { /* not the same family, can't be a match */ continue; } switch (from->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin, *rsin; sin = (struct sockaddr_in *)&net->ro._l_addr; rsin = (struct sockaddr_in *)from; if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) { /* found it */ if (netp != NULL) { *netp = net; } /* * Update the endpoint * pointer */ *inp_p = inp; SCTP_INP_RUNLOCK(inp); return (stcb); } break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6, *rsin6; sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; rsin6 = (struct sockaddr_in6 *)from; if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) { /* found it */ if (netp != NULL) { *netp = net; } /* * Update the endpoint * pointer */ *inp_p = inp; SCTP_INP_RUNLOCK(inp); return (stcb); } break; } #endif default: /* TSNH */ break; } } SCTP_TCB_UNLOCK(stcb); SCTP_INP_RUNLOCK(inp); } return (NULL); } /* * rules for use * * 1) If I return a NULL you must decrement any INP ref cnt. 2) If I find an * stcb, both will be locked (locked_tcb and stcb) but decrement will be done * (if locked == NULL). 3) Decrement happens on return ONLY if locked == * NULL. */ struct sctp_tcb * sctp_findassociation_ep_addr(struct sctp_inpcb **inp_p, struct sockaddr *remote, struct sctp_nets **netp, struct sockaddr *local, struct sctp_tcb *locked_tcb) { struct sctpasochead *head; struct sctp_inpcb *inp; struct sctp_tcb *stcb = NULL; struct sctp_nets *net; uint16_t rport; inp = *inp_p; switch (remote->sa_family) { #ifdef INET case AF_INET: rport = (((struct sockaddr_in *)remote)->sin_port); break; #endif #ifdef INET6 case AF_INET6: rport = (((struct sockaddr_in6 *)remote)->sin6_port); break; #endif default: return (NULL); } if (locked_tcb) { /* * UN-lock so we can do proper locking here this occurs when * called from load_addresses_from_init. */ atomic_add_int(&locked_tcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(locked_tcb); } SCTP_INP_INFO_RLOCK(); if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { /*- * Now either this guy is our listener or it's the * connector. If it is the one that issued the connect, then * it's only chance is to be the first TCB in the list. If * it is the acceptor, then do the special_lookup to hash * and find the real inp. */ if ((inp->sctp_socket) && SCTP_IS_LISTENING(inp)) { /* to is peer addr, from is my addr */ stcb = sctp_tcb_special_locate(inp_p, remote, local, netp, inp->def_vrf_id); if ((stcb != NULL) && (locked_tcb == NULL)) { /* we have a locked tcb, lower refcount */ SCTP_INP_DECR_REF(inp); } if ((locked_tcb != NULL) && (locked_tcb != stcb)) { SCTP_INP_RLOCK(locked_tcb->sctp_ep); SCTP_TCB_LOCK(locked_tcb); atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); SCTP_INP_RUNLOCK(locked_tcb->sctp_ep); } SCTP_INP_INFO_RUNLOCK(); return (stcb); } else { SCTP_INP_WLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { goto null_return; } stcb = LIST_FIRST(&inp->sctp_asoc_list); if (stcb == NULL) { goto null_return; } SCTP_TCB_LOCK(stcb); if (stcb->rport != rport) { /* remote port does not match. */ SCTP_TCB_UNLOCK(stcb); goto null_return; } if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { SCTP_TCB_UNLOCK(stcb); goto null_return; } if (local && !sctp_does_stcb_own_this_addr(stcb, local)) { SCTP_TCB_UNLOCK(stcb); goto null_return; } /* now look at the list of remote addresses */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { #ifdef INVARIANTS if (net == (TAILQ_NEXT(net, sctp_next))) { panic("Corrupt net list"); } #endif if (net->ro._l_addr.sa.sa_family != remote->sa_family) { /* not the same family */ continue; } switch (remote->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin, *rsin; sin = (struct sockaddr_in *) &net->ro._l_addr; rsin = (struct sockaddr_in *)remote; if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) { /* found it */ if (netp != NULL) { *netp = net; } if (locked_tcb == NULL) { SCTP_INP_DECR_REF(inp); } else if (locked_tcb != stcb) { SCTP_TCB_LOCK(locked_tcb); } if (locked_tcb) { atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); } SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); return (stcb); } break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6, *rsin6; sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; rsin6 = (struct sockaddr_in6 *)remote; if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) { /* found it */ if (netp != NULL) { *netp = net; } if (locked_tcb == NULL) { SCTP_INP_DECR_REF(inp); } else if (locked_tcb != stcb) { SCTP_TCB_LOCK(locked_tcb); } if (locked_tcb) { atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); } SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); return (stcb); } break; } #endif default: /* TSNH */ break; } } SCTP_TCB_UNLOCK(stcb); } } else { SCTP_INP_WLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { goto null_return; } head = &inp->sctp_tcbhash[SCTP_PCBHASH_ALLADDR(rport, inp->sctp_hashmark)]; LIST_FOREACH(stcb, head, sctp_tcbhash) { if (stcb->rport != rport) { /* remote port does not match */ continue; } SCTP_TCB_LOCK(stcb); if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { SCTP_TCB_UNLOCK(stcb); continue; } if (local && !sctp_does_stcb_own_this_addr(stcb, local)) { SCTP_TCB_UNLOCK(stcb); continue; } /* now look at the list of remote addresses */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { #ifdef INVARIANTS if (net == (TAILQ_NEXT(net, sctp_next))) { panic("Corrupt net list"); } #endif if (net->ro._l_addr.sa.sa_family != remote->sa_family) { /* not the same family */ continue; } switch (remote->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin, *rsin; sin = (struct sockaddr_in *) &net->ro._l_addr; rsin = (struct sockaddr_in *)remote; if (sin->sin_addr.s_addr == rsin->sin_addr.s_addr) { /* found it */ if (netp != NULL) { *netp = net; } if (locked_tcb == NULL) { SCTP_INP_DECR_REF(inp); } else if (locked_tcb != stcb) { SCTP_TCB_LOCK(locked_tcb); } if (locked_tcb) { atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); } SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); return (stcb); } break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6, *rsin6; sin6 = (struct sockaddr_in6 *) &net->ro._l_addr; rsin6 = (struct sockaddr_in6 *)remote; if (SCTP6_ARE_ADDR_EQUAL(sin6, rsin6)) { /* found it */ if (netp != NULL) { *netp = net; } if (locked_tcb == NULL) { SCTP_INP_DECR_REF(inp); } else if (locked_tcb != stcb) { SCTP_TCB_LOCK(locked_tcb); } if (locked_tcb) { atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); } SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); return (stcb); } break; } #endif default: /* TSNH */ break; } } SCTP_TCB_UNLOCK(stcb); } } null_return: /* clean up for returning null */ if (locked_tcb) { SCTP_TCB_LOCK(locked_tcb); atomic_subtract_int(&locked_tcb->asoc.refcnt, 1); } SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_RUNLOCK(); /* not found */ return (NULL); } /* * Find an association for a specific endpoint using the association id given * out in the COMM_UP notification */ struct sctp_tcb * sctp_findasoc_ep_asocid_locked(struct sctp_inpcb *inp, sctp_assoc_t asoc_id, int want_lock) { /* * Use my the assoc_id to find a endpoint */ struct sctpasochead *head; struct sctp_tcb *stcb; uint32_t id; if (inp == NULL) { SCTP_PRINTF("TSNH ep_associd\n"); return (NULL); } if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { SCTP_PRINTF("TSNH ep_associd0\n"); return (NULL); } id = (uint32_t)asoc_id; head = &inp->sctp_asocidhash[SCTP_PCBHASH_ASOC(id, inp->hashasocidmark)]; if (head == NULL) { /* invalid id TSNH */ SCTP_PRINTF("TSNH ep_associd1\n"); return (NULL); } LIST_FOREACH(stcb, head, sctp_tcbasocidhash) { if (stcb->asoc.assoc_id == id) { if (inp != stcb->sctp_ep) { /* * some other guy has the same id active (id * collision ??). */ SCTP_PRINTF("TSNH ep_associd2\n"); continue; } if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { continue; } if (want_lock) { SCTP_TCB_LOCK(stcb); } return (stcb); } } return (NULL); } struct sctp_tcb * sctp_findassociation_ep_asocid(struct sctp_inpcb *inp, sctp_assoc_t asoc_id, int want_lock) { struct sctp_tcb *stcb; SCTP_INP_RLOCK(inp); stcb = sctp_findasoc_ep_asocid_locked(inp, asoc_id, want_lock); SCTP_INP_RUNLOCK(inp); return (stcb); } /* * Endpoint probe expects that the INP_INFO is locked. */ static struct sctp_inpcb * sctp_endpoint_probe(struct sockaddr *nam, struct sctppcbhead *head, uint16_t lport, uint32_t vrf_id) { struct sctp_inpcb *inp; struct sctp_laddr *laddr; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; struct sockaddr_in6 *intf_addr6; #endif int fnd; #ifdef INET sin = NULL; #endif #ifdef INET6 sin6 = NULL; #endif switch (nam->sa_family) { #ifdef INET case AF_INET: sin = (struct sockaddr_in *)nam; break; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)nam; break; #endif default: /* unsupported family */ return (NULL); } if (head == NULL) return (NULL); LIST_FOREACH(inp, head, sctp_hash) { SCTP_INP_RLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { SCTP_INP_RUNLOCK(inp); continue; } if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) && (inp->sctp_lport == lport)) { /* got it */ switch (nam->sa_family) { #ifdef INET case AF_INET: if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && SCTP_IPV6_V6ONLY(inp)) { /* * IPv4 on a IPv6 socket with ONLY * IPv6 set */ SCTP_INP_RUNLOCK(inp); continue; } if (prison_check_ip4(inp->ip_inp.inp.inp_cred, &sin->sin_addr) != 0) { SCTP_INP_RUNLOCK(inp); continue; } break; #endif #ifdef INET6 case AF_INET6: /* * A V6 address and the endpoint is NOT * bound V6 */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) { SCTP_INP_RUNLOCK(inp); continue; } if (prison_check_ip6(inp->ip_inp.inp.inp_cred, &sin6->sin6_addr) != 0) { SCTP_INP_RUNLOCK(inp); continue; } break; #endif default: break; } /* does a VRF id match? */ fnd = 0; if (inp->def_vrf_id == vrf_id) fnd = 1; SCTP_INP_RUNLOCK(inp); if (!fnd) continue; return (inp); } SCTP_INP_RUNLOCK(inp); } switch (nam->sa_family) { #ifdef INET case AF_INET: if (sin->sin_addr.s_addr == INADDR_ANY) { /* Can't hunt for one that has no address specified */ return (NULL); } break; #endif #ifdef INET6 case AF_INET6: if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* Can't hunt for one that has no address specified */ return (NULL); } break; #endif default: break; } /* * ok, not bound to all so see if we can find a EP bound to this * address. */ LIST_FOREACH(inp, head, sctp_hash) { SCTP_INP_RLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { SCTP_INP_RUNLOCK(inp); continue; } if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL)) { SCTP_INP_RUNLOCK(inp); continue; } /* * Ok this could be a likely candidate, look at all of its * addresses */ if (inp->sctp_lport != lport) { SCTP_INP_RUNLOCK(inp); continue; } /* does a VRF id match? */ fnd = 0; if (inp->def_vrf_id == vrf_id) fnd = 1; if (!fnd) { SCTP_INP_RUNLOCK(inp); continue; } LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n", __func__); continue; } SCTPDBG(SCTP_DEBUG_PCB1, "Ok laddr->ifa:%p is possible, ", (void *)laddr->ifa); if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) { SCTPDBG(SCTP_DEBUG_PCB1, "Huh IFA being deleted\n"); continue; } if (laddr->ifa->address.sa.sa_family == nam->sa_family) { /* possible, see if it matches */ switch (nam->sa_family) { #ifdef INET case AF_INET: if (sin->sin_addr.s_addr == laddr->ifa->address.sin.sin_addr.s_addr) { SCTP_INP_RUNLOCK(inp); return (inp); } break; #endif #ifdef INET6 case AF_INET6: intf_addr6 = &laddr->ifa->address.sin6; if (SCTP6_ARE_ADDR_EQUAL(sin6, intf_addr6)) { SCTP_INP_RUNLOCK(inp); return (inp); } break; #endif } } } SCTP_INP_RUNLOCK(inp); } return (NULL); } static struct sctp_inpcb * sctp_isport_inuse(struct sctp_inpcb *inp, uint16_t lport, uint32_t vrf_id) { struct sctppcbhead *head; struct sctp_inpcb *t_inp; int fnd; head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(lport, SCTP_BASE_INFO(hashmark))]; LIST_FOREACH(t_inp, head, sctp_hash) { if (t_inp->sctp_lport != lport) { continue; } /* is it in the VRF in question */ fnd = 0; if (t_inp->def_vrf_id == vrf_id) fnd = 1; if (!fnd) continue; /* This one is in use. */ /* check the v6/v4 binding issue */ if ((t_inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && SCTP_IPV6_V6ONLY(t_inp)) { if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { /* collision in V6 space */ return (t_inp); } else { /* inp is BOUND_V4 no conflict */ continue; } } else if (t_inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { /* t_inp is bound v4 and v6, conflict always */ return (t_inp); } else { /* t_inp is bound only V4 */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && SCTP_IPV6_V6ONLY(inp)) { /* no conflict */ continue; } /* else fall through to conflict */ } return (t_inp); } return (NULL); } int sctp_swap_inpcb_for_listen(struct sctp_inpcb *inp) { /* For 1-2-1 with port reuse */ struct sctppcbhead *head; struct sctp_inpcb *tinp, *ninp; if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE)) { /* only works with port reuse on */ return (-1); } if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) == 0) { return (0); } SCTP_INP_RUNLOCK(inp); SCTP_INP_INFO_WLOCK(); head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(inp->sctp_lport, SCTP_BASE_INFO(hashmark))]; /* Kick out all non-listeners to the TCP hash */ LIST_FOREACH_SAFE(tinp, head, sctp_hash, ninp) { if (tinp->sctp_lport != inp->sctp_lport) { continue; } if (tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { continue; } if (tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { continue; } if (SCTP_IS_LISTENING(tinp)) { continue; } SCTP_INP_WLOCK(tinp); LIST_REMOVE(tinp, sctp_hash); head = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR(tinp->sctp_lport, SCTP_BASE_INFO(hashtcpmark))]; tinp->sctp_flags |= SCTP_PCB_FLAGS_IN_TCPPOOL; LIST_INSERT_HEAD(head, tinp, sctp_hash); SCTP_INP_WUNLOCK(tinp); } SCTP_INP_WLOCK(inp); /* Pull from where he was */ LIST_REMOVE(inp, sctp_hash); inp->sctp_flags &= ~SCTP_PCB_FLAGS_IN_TCPPOOL; head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(inp->sctp_lport, SCTP_BASE_INFO(hashmark))]; LIST_INSERT_HEAD(head, inp, sctp_hash); SCTP_INP_WUNLOCK(inp); SCTP_INP_RLOCK(inp); SCTP_INP_INFO_WUNLOCK(); return (0); } struct sctp_inpcb * sctp_pcb_findep(struct sockaddr *nam, int find_tcp_pool, int have_lock, uint32_t vrf_id) { /* * First we check the hash table to see if someone has this port * bound with just the port. */ struct sctp_inpcb *inp; struct sctppcbhead *head; int lport; unsigned int i; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif switch (nam->sa_family) { #ifdef INET case AF_INET: sin = (struct sockaddr_in *)nam; lport = sin->sin_port; break; #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)nam; lport = sin6->sin6_port; break; #endif default: return (NULL); } /* * I could cheat here and just cast to one of the types but we will * do it right. It also provides the check against an Unsupported * type too. */ /* Find the head of the ALLADDR chain */ if (have_lock == 0) { SCTP_INP_INFO_RLOCK(); } head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(lport, SCTP_BASE_INFO(hashmark))]; inp = sctp_endpoint_probe(nam, head, lport, vrf_id); /* * If the TCP model exists it could be that the main listening * endpoint is gone but there still exists a connected socket for * this guy. If so we can return the first one that we find. This * may NOT be the correct one so the caller should be wary on the * returned INP. Currently the only caller that sets find_tcp_pool * is in bindx where we are verifying that a user CAN bind the * address. He either has bound it already, or someone else has, or * its open to bind, so this is good enough. */ if (inp == NULL && find_tcp_pool) { for (i = 0; i < SCTP_BASE_INFO(hashtcpmark) + 1; i++) { head = &SCTP_BASE_INFO(sctp_tcpephash)[i]; inp = sctp_endpoint_probe(nam, head, lport, vrf_id); if (inp) { break; } } } if (inp) { SCTP_INP_INCR_REF(inp); } if (have_lock == 0) { SCTP_INP_INFO_RUNLOCK(); } return (inp); } /* * Find an association for an endpoint with the pointer to whom you want to * send to and the endpoint pointer. The address can be IPv4 or IPv6. We may * need to change the *to to some other struct like a mbuf... */ struct sctp_tcb * sctp_findassociation_addr_sa(struct sockaddr *from, struct sockaddr *to, struct sctp_inpcb **inp_p, struct sctp_nets **netp, int find_tcp_pool, uint32_t vrf_id) { struct sctp_inpcb *inp = NULL; struct sctp_tcb *stcb; SCTP_INP_INFO_RLOCK(); if (find_tcp_pool) { if (inp_p != NULL) { stcb = sctp_tcb_special_locate(inp_p, from, to, netp, vrf_id); } else { stcb = sctp_tcb_special_locate(&inp, from, to, netp, vrf_id); } if (stcb != NULL) { SCTP_INP_INFO_RUNLOCK(); return (stcb); } } inp = sctp_pcb_findep(to, 0, 1, vrf_id); if (inp_p != NULL) { *inp_p = inp; } SCTP_INP_INFO_RUNLOCK(); if (inp == NULL) { return (NULL); } /* * ok, we have an endpoint, now lets find the assoc for it (if any) * we now place the source address or from in the to of the find * endpoint call. Since in reality this chain is used from the * inbound packet side. */ if (inp_p != NULL) { stcb = sctp_findassociation_ep_addr(inp_p, from, netp, to, NULL); } else { stcb = sctp_findassociation_ep_addr(&inp, from, netp, to, NULL); } return (stcb); } /* * This routine will grub through the mbuf that is a INIT or INIT-ACK and * find all addresses that the sender has specified in any address list. Each * address will be used to lookup the TCB and see if one exits. */ static struct sctp_tcb * sctp_findassociation_special_addr(struct mbuf *m, int offset, struct sctphdr *sh, struct sctp_inpcb **inp_p, struct sctp_nets **netp, struct sockaddr *dst) { struct sctp_paramhdr *phdr, param_buf; #if defined(INET) || defined(INET6) struct sctp_tcb *stcb; uint16_t ptype; #endif uint16_t plen; #ifdef INET struct sockaddr_in sin4; #endif #ifdef INET6 struct sockaddr_in6 sin6; #endif #ifdef INET memset(&sin4, 0, sizeof(sin4)); sin4.sin_len = sizeof(sin4); sin4.sin_family = AF_INET; sin4.sin_port = sh->src_port; #endif #ifdef INET6 memset(&sin6, 0, sizeof(sin6)); sin6.sin6_len = sizeof(sin6); sin6.sin6_family = AF_INET6; sin6.sin6_port = sh->src_port; #endif offset += sizeof(struct sctp_init_chunk); phdr = sctp_get_next_param(m, offset, ¶m_buf, sizeof(param_buf)); while (phdr != NULL) { /* now we must see if we want the parameter */ #if defined(INET) || defined(INET6) ptype = ntohs(phdr->param_type); #endif plen = ntohs(phdr->param_length); if (plen == 0) { break; } #ifdef INET if (ptype == SCTP_IPV4_ADDRESS && plen == sizeof(struct sctp_ipv4addr_param)) { /* Get the rest of the address */ struct sctp_ipv4addr_param ip4_param, *p4; phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&ip4_param, sizeof(ip4_param)); if (phdr == NULL) { return (NULL); } p4 = (struct sctp_ipv4addr_param *)phdr; memcpy(&sin4.sin_addr, &p4->addr, sizeof(p4->addr)); /* look it up */ stcb = sctp_findassociation_ep_addr(inp_p, (struct sockaddr *)&sin4, netp, dst, NULL); if (stcb != NULL) { return (stcb); } } #endif #ifdef INET6 if (ptype == SCTP_IPV6_ADDRESS && plen == sizeof(struct sctp_ipv6addr_param)) { /* Get the rest of the address */ struct sctp_ipv6addr_param ip6_param, *p6; phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&ip6_param, sizeof(ip6_param)); if (phdr == NULL) { return (NULL); } p6 = (struct sctp_ipv6addr_param *)phdr; memcpy(&sin6.sin6_addr, &p6->addr, sizeof(p6->addr)); /* look it up */ stcb = sctp_findassociation_ep_addr(inp_p, (struct sockaddr *)&sin6, netp, dst, NULL); if (stcb != NULL) { return (stcb); } } #endif offset += SCTP_SIZE32(plen); phdr = sctp_get_next_param(m, offset, ¶m_buf, sizeof(param_buf)); } return (NULL); } static struct sctp_tcb * sctp_findassoc_by_vtag(struct sockaddr *from, struct sockaddr *to, uint32_t vtag, struct sctp_inpcb **inp_p, struct sctp_nets **netp, uint16_t rport, uint16_t lport, int skip_src_check, uint32_t vrf_id, uint32_t remote_tag) { /* * Use my vtag to hash. If we find it we then verify the source addr * is in the assoc. If all goes well we save a bit on rec of a * packet. */ struct sctpasochead *head; struct sctp_nets *net; struct sctp_tcb *stcb; SCTP_INP_INFO_RLOCK(); head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(vtag, SCTP_BASE_INFO(hashasocmark))]; LIST_FOREACH(stcb, head, sctp_asocs) { SCTP_INP_RLOCK(stcb->sctp_ep); if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { SCTP_INP_RUNLOCK(stcb->sctp_ep); continue; } if (stcb->sctp_ep->def_vrf_id != vrf_id) { SCTP_INP_RUNLOCK(stcb->sctp_ep); continue; } SCTP_TCB_LOCK(stcb); SCTP_INP_RUNLOCK(stcb->sctp_ep); if (stcb->asoc.my_vtag == vtag) { /* candidate */ if (stcb->rport != rport) { SCTP_TCB_UNLOCK(stcb); continue; } if (stcb->sctp_ep->sctp_lport != lport) { SCTP_TCB_UNLOCK(stcb); continue; } if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { SCTP_TCB_UNLOCK(stcb); continue; } /* RRS:Need toaddr check here */ if (sctp_does_stcb_own_this_addr(stcb, to) == 0) { /* Endpoint does not own this address */ SCTP_TCB_UNLOCK(stcb); continue; } if (remote_tag) { /* * If we have both vtags that's all we match * on */ if (stcb->asoc.peer_vtag == remote_tag) { /* * If both tags match we consider it * conclusive and check NO * source/destination addresses */ goto conclusive; } } if (skip_src_check) { conclusive: if (from) { *netp = sctp_findnet(stcb, from); } else { *netp = NULL; /* unknown */ } if (inp_p) *inp_p = stcb->sctp_ep; SCTP_INP_INFO_RUNLOCK(); return (stcb); } net = sctp_findnet(stcb, from); if (net) { /* yep its him. */ *netp = net; SCTP_STAT_INCR(sctps_vtagexpress); *inp_p = stcb->sctp_ep; SCTP_INP_INFO_RUNLOCK(); return (stcb); } else { /* * not him, this should only happen in rare * cases so I peg it. */ SCTP_STAT_INCR(sctps_vtagbogus); } } SCTP_TCB_UNLOCK(stcb); } SCTP_INP_INFO_RUNLOCK(); return (NULL); } /* * Find an association with the pointer to the inbound IP packet. This can be * a IPv4 or IPv6 packet. */ struct sctp_tcb * sctp_findassociation_addr(struct mbuf *m, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_chunkhdr *ch, struct sctp_inpcb **inp_p, struct sctp_nets **netp, uint32_t vrf_id) { struct sctp_tcb *stcb; struct sctp_inpcb *inp; if (sh->v_tag) { /* we only go down this path if vtag is non-zero */ stcb = sctp_findassoc_by_vtag(src, dst, ntohl(sh->v_tag), inp_p, netp, sh->src_port, sh->dest_port, 0, vrf_id, 0); if (stcb) { return (stcb); } } if (inp_p) { stcb = sctp_findassociation_addr_sa(src, dst, inp_p, netp, 1, vrf_id); inp = *inp_p; } else { stcb = sctp_findassociation_addr_sa(src, dst, &inp, netp, 1, vrf_id); } SCTPDBG(SCTP_DEBUG_PCB1, "stcb:%p inp:%p\n", (void *)stcb, (void *)inp); if (stcb == NULL && inp) { /* Found a EP but not this address */ if ((ch->chunk_type == SCTP_INITIATION) || (ch->chunk_type == SCTP_INITIATION_ACK)) { /*- * special hook, we do NOT return linp or an * association that is linked to an existing * association that is under the TCP pool (i.e. no * listener exists). The endpoint finding routine * will always find a listener before examining the * TCP pool. */ if (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) { if (inp_p) { *inp_p = NULL; } return (NULL); } stcb = sctp_findassociation_special_addr(m, offset, sh, &inp, netp, dst); if (inp_p != NULL) { *inp_p = inp; } } } SCTPDBG(SCTP_DEBUG_PCB1, "stcb is %p\n", (void *)stcb); return (stcb); } /* * lookup an association by an ASCONF lookup address. * if the lookup address is 0.0.0.0 or ::0, use the vtag to do the lookup */ struct sctp_tcb * sctp_findassociation_ep_asconf(struct mbuf *m, int offset, struct sockaddr *dst, struct sctphdr *sh, struct sctp_inpcb **inp_p, struct sctp_nets **netp, uint32_t vrf_id) { struct sctp_tcb *stcb; union sctp_sockstore remote_store; struct sctp_paramhdr param_buf, *phdr; int ptype; int zero_address = 0; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif memset(&remote_store, 0, sizeof(remote_store)); phdr = sctp_get_next_param(m, offset + sizeof(struct sctp_asconf_chunk), ¶m_buf, sizeof(struct sctp_paramhdr)); if (phdr == NULL) { SCTPDBG(SCTP_DEBUG_INPUT3, "%s: failed to get asconf lookup addr\n", __func__); return NULL; } ptype = (int)((uint32_t)ntohs(phdr->param_type)); /* get the correlation address */ switch (ptype) { #ifdef INET6 case SCTP_IPV6_ADDRESS: { /* ipv6 address param */ struct sctp_ipv6addr_param *p6, p6_buf; if (ntohs(phdr->param_length) != sizeof(struct sctp_ipv6addr_param)) { return NULL; } p6 = (struct sctp_ipv6addr_param *)sctp_get_next_param(m, offset + sizeof(struct sctp_asconf_chunk), &p6_buf.ph, sizeof(p6_buf)); if (p6 == NULL) { SCTPDBG(SCTP_DEBUG_INPUT3, "%s: failed to get asconf v6 lookup addr\n", __func__); return (NULL); } sin6 = &remote_store.sin6; sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); sin6->sin6_port = sh->src_port; memcpy(&sin6->sin6_addr, &p6->addr, sizeof(struct in6_addr)); if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) zero_address = 1; break; } #endif #ifdef INET case SCTP_IPV4_ADDRESS: { /* ipv4 address param */ struct sctp_ipv4addr_param *p4, p4_buf; if (ntohs(phdr->param_length) != sizeof(struct sctp_ipv4addr_param)) { return NULL; } p4 = (struct sctp_ipv4addr_param *)sctp_get_next_param(m, offset + sizeof(struct sctp_asconf_chunk), &p4_buf.ph, sizeof(p4_buf)); if (p4 == NULL) { SCTPDBG(SCTP_DEBUG_INPUT3, "%s: failed to get asconf v4 lookup addr\n", __func__); return (NULL); } sin = &remote_store.sin; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_port = sh->src_port; memcpy(&sin->sin_addr, &p4->addr, sizeof(struct in_addr)); if (sin->sin_addr.s_addr == INADDR_ANY) zero_address = 1; break; } #endif default: /* invalid address param type */ return NULL; } if (zero_address) { stcb = sctp_findassoc_by_vtag(NULL, dst, ntohl(sh->v_tag), inp_p, netp, sh->src_port, sh->dest_port, 1, vrf_id, 0); if (stcb != NULL) { SCTP_INP_DECR_REF(*inp_p); } } else { stcb = sctp_findassociation_ep_addr(inp_p, &remote_store.sa, netp, dst, NULL); } return (stcb); } /* * allocate a sctp_inpcb and setup a temporary binding to a port/all * addresses. This way if we don't get a bind we by default pick a ephemeral * port with all addresses bound. */ int sctp_inpcb_alloc(struct socket *so, uint32_t vrf_id) { /* * we get called when a new endpoint starts up. We need to allocate * the sctp_inpcb structure from the zone and init it. Mark it as * unbound and find a port that we can use as an ephemeral with * INADDR_ANY. If the user binds later no problem we can then add in * the specific addresses. And setup the default parameters for the * EP. */ int i, error; struct sctp_inpcb *inp; struct sctp_pcb *m; struct timeval time; sctp_sharedkey_t *null_key; error = 0; SCTP_INP_INFO_WLOCK(); inp = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_ep), struct sctp_inpcb); if (inp == NULL) { SCTP_PRINTF("Out of SCTP-INPCB structures - no resources\n"); SCTP_INP_INFO_WUNLOCK(); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS); return (ENOBUFS); } /* zap it */ memset(inp, 0, sizeof(*inp)); /* bump generations */ /* setup socket pointers */ inp->sctp_socket = so; inp->ip_inp.inp.inp_socket = so; inp->ip_inp.inp.inp_cred = crhold(so->so_cred); #ifdef INET6 if (INP_SOCKAF(so) == AF_INET6) { if (MODULE_GLOBAL(ip6_auto_flowlabel)) { inp->ip_inp.inp.inp_flags |= IN6P_AUTOFLOWLABEL; } if (MODULE_GLOBAL(ip6_v6only)) { inp->ip_inp.inp.inp_flags |= IN6P_IPV6_V6ONLY; } } #endif inp->sctp_associd_counter = 1; inp->partial_delivery_point = SCTP_SB_LIMIT_RCV(so) >> SCTP_PARTIAL_DELIVERY_SHIFT; inp->sctp_frag_point = SCTP_DEFAULT_MAXSEGMENT; inp->max_cwnd = 0; inp->sctp_cmt_on_off = SCTP_BASE_SYSCTL(sctp_cmt_on_off); inp->ecn_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_ecn_enable); inp->prsctp_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_pr_enable); inp->auth_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_auth_enable); inp->asconf_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_asconf_enable); inp->reconfig_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_reconfig_enable); inp->nrsack_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_nrsack_enable); inp->pktdrop_supported = (uint8_t)SCTP_BASE_SYSCTL(sctp_pktdrop_enable); inp->idata_supported = 0; inp->fibnum = so->so_fibnum; /* init the small hash table we use to track asocid <-> tcb */ inp->sctp_asocidhash = SCTP_HASH_INIT(SCTP_STACK_VTAG_HASH_SIZE, &inp->hashasocidmark); if (inp->sctp_asocidhash == NULL) { crfree(inp->ip_inp.inp.inp_cred); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp); SCTP_INP_INFO_WUNLOCK(); return (ENOBUFS); } SCTP_INCR_EP_COUNT(); inp->ip_inp.inp.inp_ip_ttl = MODULE_GLOBAL(ip_defttl); SCTP_INP_INFO_WUNLOCK(); so->so_pcb = (caddr_t)inp; if (SCTP_SO_TYPE(so) == SOCK_SEQPACKET) { /* UDP style socket */ inp->sctp_flags = (SCTP_PCB_FLAGS_UDPTYPE | SCTP_PCB_FLAGS_UNBOUND); /* Be sure it is NON-BLOCKING IO for UDP */ /* SCTP_SET_SO_NBIO(so); */ } else if (SCTP_SO_TYPE(so) == SOCK_STREAM) { /* TCP style socket */ inp->sctp_flags = (SCTP_PCB_FLAGS_TCPTYPE | SCTP_PCB_FLAGS_UNBOUND); /* Be sure we have blocking IO by default */ SOCK_LOCK(so); SCTP_CLEAR_SO_NBIO(so); SOCK_UNLOCK(so); } else { /* * unsupported socket type (RAW, etc)- in case we missed it * in protosw */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EOPNOTSUPP); so->so_pcb = NULL; crfree(inp->ip_inp.inp.inp_cred); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp); return (EOPNOTSUPP); } if (SCTP_BASE_SYSCTL(sctp_default_frag_interleave) == SCTP_FRAG_LEVEL_1) { sctp_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE); sctp_feature_off(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS); } else if (SCTP_BASE_SYSCTL(sctp_default_frag_interleave) == SCTP_FRAG_LEVEL_2) { sctp_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE); sctp_feature_on(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS); } else if (SCTP_BASE_SYSCTL(sctp_default_frag_interleave) == SCTP_FRAG_LEVEL_0) { sctp_feature_off(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE); sctp_feature_off(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS); } inp->sctp_tcbhash = SCTP_HASH_INIT(SCTP_BASE_SYSCTL(sctp_pcbtblsize), &inp->sctp_hashmark); if (inp->sctp_tcbhash == NULL) { SCTP_PRINTF("Out of SCTP-INPCB->hashinit - no resources\n"); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS); so->so_pcb = NULL; crfree(inp->ip_inp.inp.inp_cred); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp); return (ENOBUFS); } inp->def_vrf_id = vrf_id; SCTP_INP_INFO_WLOCK(); SCTP_INP_LOCK_INIT(inp); INP_LOCK_INIT(&inp->ip_inp.inp, "inp", "sctpinp"); SCTP_INP_READ_INIT(inp); SCTP_ASOC_CREATE_LOCK_INIT(inp); /* lock the new ep */ SCTP_INP_WLOCK(inp); /* add it to the info area */ LIST_INSERT_HEAD(&SCTP_BASE_INFO(listhead), inp, sctp_list); SCTP_INP_INFO_WUNLOCK(); TAILQ_INIT(&inp->read_queue); LIST_INIT(&inp->sctp_addr_list); LIST_INIT(&inp->sctp_asoc_list); #ifdef SCTP_TRACK_FREED_ASOCS /* TEMP CODE */ LIST_INIT(&inp->sctp_asoc_free_list); #endif /* Init the timer structure for signature change */ SCTP_OS_TIMER_INIT(&inp->sctp_ep.signature_change.timer); inp->sctp_ep.signature_change.type = SCTP_TIMER_TYPE_NEWCOOKIE; /* now init the actual endpoint default data */ m = &inp->sctp_ep; /* setup the base timeout information */ m->sctp_timeoutticks[SCTP_TIMER_SEND] = sctp_secs_to_ticks(SCTP_SEND_SEC); /* needed ? */ m->sctp_timeoutticks[SCTP_TIMER_INIT] = sctp_secs_to_ticks(SCTP_INIT_SEC); /* needed ? */ m->sctp_timeoutticks[SCTP_TIMER_RECV] = sctp_msecs_to_ticks(SCTP_BASE_SYSCTL(sctp_delayed_sack_time_default)); m->sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = sctp_msecs_to_ticks(SCTP_BASE_SYSCTL(sctp_heartbeat_interval_default)); m->sctp_timeoutticks[SCTP_TIMER_PMTU] = sctp_secs_to_ticks(SCTP_BASE_SYSCTL(sctp_pmtu_raise_time_default)); m->sctp_timeoutticks[SCTP_TIMER_MAXSHUTDOWN] = sctp_secs_to_ticks(SCTP_BASE_SYSCTL(sctp_shutdown_guard_time_default)); m->sctp_timeoutticks[SCTP_TIMER_SIGNATURE] = sctp_secs_to_ticks(SCTP_BASE_SYSCTL(sctp_secret_lifetime_default)); /* all max/min max are in ms */ m->sctp_maxrto = SCTP_BASE_SYSCTL(sctp_rto_max_default); m->sctp_minrto = SCTP_BASE_SYSCTL(sctp_rto_min_default); m->initial_rto = SCTP_BASE_SYSCTL(sctp_rto_initial_default); m->initial_init_rto_max = SCTP_BASE_SYSCTL(sctp_init_rto_max_default); m->sctp_sack_freq = SCTP_BASE_SYSCTL(sctp_sack_freq_default); m->max_init_times = SCTP_BASE_SYSCTL(sctp_init_rtx_max_default); m->max_send_times = SCTP_BASE_SYSCTL(sctp_assoc_rtx_max_default); m->def_net_failure = SCTP_BASE_SYSCTL(sctp_path_rtx_max_default); m->def_net_pf_threshold = SCTP_BASE_SYSCTL(sctp_path_pf_threshold); m->sctp_sws_sender = SCTP_SWS_SENDER_DEF; m->sctp_sws_receiver = SCTP_SWS_RECEIVER_DEF; m->max_burst = SCTP_BASE_SYSCTL(sctp_max_burst_default); m->fr_max_burst = SCTP_BASE_SYSCTL(sctp_fr_max_burst_default); m->sctp_default_cc_module = SCTP_BASE_SYSCTL(sctp_default_cc_module); m->sctp_default_ss_module = SCTP_BASE_SYSCTL(sctp_default_ss_module); m->max_open_streams_intome = SCTP_BASE_SYSCTL(sctp_nr_incoming_streams_default); /* number of streams to pre-open on a association */ m->pre_open_stream_count = SCTP_BASE_SYSCTL(sctp_nr_outgoing_streams_default); m->default_mtu = 0; /* Add adaptation cookie */ m->adaptation_layer_indicator = 0; m->adaptation_layer_indicator_provided = 0; /* seed random number generator */ m->random_counter = 1; m->store_at = SCTP_SIGNATURE_SIZE; SCTP_READ_RANDOM(m->random_numbers, sizeof(m->random_numbers)); sctp_fill_random_store(m); /* Minimum cookie size */ m->size_of_a_cookie = (sizeof(struct sctp_init_msg) * 2) + sizeof(struct sctp_state_cookie); m->size_of_a_cookie += SCTP_SIGNATURE_SIZE; /* Setup the initial secret */ (void)SCTP_GETTIME_TIMEVAL(&time); m->time_of_secret_change = time.tv_sec; for (i = 0; i < SCTP_NUMBER_OF_SECRETS; i++) { m->secret_key[0][i] = sctp_select_initial_TSN(m); } sctp_timer_start(SCTP_TIMER_TYPE_NEWCOOKIE, inp, NULL, NULL); /* How long is a cookie good for ? */ m->def_cookie_life = sctp_msecs_to_ticks(SCTP_BASE_SYSCTL(sctp_valid_cookie_life_default)); /* * Initialize authentication parameters */ m->local_hmacs = sctp_default_supported_hmaclist(); m->local_auth_chunks = sctp_alloc_chunklist(); if (inp->asconf_supported) { sctp_auth_add_chunk(SCTP_ASCONF, m->local_auth_chunks); sctp_auth_add_chunk(SCTP_ASCONF_ACK, m->local_auth_chunks); } m->default_dscp = 0; #ifdef INET6 m->default_flowlabel = 0; #endif m->port = 0; /* encapsulation disabled by default */ LIST_INIT(&m->shared_keys); /* add default NULL key as key id 0 */ null_key = sctp_alloc_sharedkey(); sctp_insert_sharedkey(&m->shared_keys, null_key); SCTP_INP_WUNLOCK(inp); #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 12); #endif return (error); } void sctp_move_pcb_and_assoc(struct sctp_inpcb *old_inp, struct sctp_inpcb *new_inp, struct sctp_tcb *stcb) { struct sctp_nets *net; uint16_t lport, rport; struct sctppcbhead *head; struct sctp_laddr *laddr, *oladdr; atomic_add_int(&stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(stcb); SCTP_INP_INFO_WLOCK(); SCTP_INP_WLOCK(old_inp); SCTP_INP_WLOCK(new_inp); SCTP_TCB_LOCK(stcb); atomic_subtract_int(&stcb->asoc.refcnt, 1); new_inp->sctp_ep.time_of_secret_change = old_inp->sctp_ep.time_of_secret_change; memcpy(new_inp->sctp_ep.secret_key, old_inp->sctp_ep.secret_key, sizeof(old_inp->sctp_ep.secret_key)); new_inp->sctp_ep.current_secret_number = old_inp->sctp_ep.current_secret_number; new_inp->sctp_ep.last_secret_number = old_inp->sctp_ep.last_secret_number; new_inp->sctp_ep.size_of_a_cookie = old_inp->sctp_ep.size_of_a_cookie; /* make it so new data pours into the new socket */ stcb->sctp_socket = new_inp->sctp_socket; stcb->sctp_ep = new_inp; /* Copy the port across */ lport = new_inp->sctp_lport = old_inp->sctp_lport; rport = stcb->rport; /* Pull the tcb from the old association */ LIST_REMOVE(stcb, sctp_tcbhash); LIST_REMOVE(stcb, sctp_tcblist); if (stcb->asoc.in_asocid_hash) { LIST_REMOVE(stcb, sctp_tcbasocidhash); } /* Now insert the new_inp into the TCP connected hash */ head = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR((lport | rport), SCTP_BASE_INFO(hashtcpmark))]; LIST_INSERT_HEAD(head, new_inp, sctp_hash); /* Its safe to access */ new_inp->sctp_flags &= ~SCTP_PCB_FLAGS_UNBOUND; /* Now move the tcb into the endpoint list */ LIST_INSERT_HEAD(&new_inp->sctp_asoc_list, stcb, sctp_tcblist); /* * Question, do we even need to worry about the ep-hash since we * only have one connection? Probably not :> so lets get rid of it * and not suck up any kernel memory in that. */ if (stcb->asoc.in_asocid_hash) { struct sctpasochead *lhd; lhd = &new_inp->sctp_asocidhash[SCTP_PCBHASH_ASOC(stcb->asoc.assoc_id, new_inp->hashasocidmark)]; LIST_INSERT_HEAD(lhd, stcb, sctp_tcbasocidhash); } /* Ok. Let's restart timer. */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, new_inp, stcb, net); } SCTP_INP_INFO_WUNLOCK(); if (new_inp->sctp_tcbhash != NULL) { SCTP_HASH_FREE(new_inp->sctp_tcbhash, new_inp->sctp_hashmark); new_inp->sctp_tcbhash = NULL; } if ((new_inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) { /* Subset bound, so copy in the laddr list from the old_inp */ LIST_FOREACH(oladdr, &old_inp->sctp_addr_list, sctp_nxt_addr) { laddr = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); if (laddr == NULL) { /* * Gak, what can we do? This assoc is really * HOSED. We probably should send an abort * here. */ SCTPDBG(SCTP_DEBUG_PCB1, "Association hosed in TCP model, out of laddr memory\n"); continue; } SCTP_INCR_LADDR_COUNT(); memset(laddr, 0, sizeof(*laddr)); (void)SCTP_GETTIME_TIMEVAL(&laddr->start_time); laddr->ifa = oladdr->ifa; atomic_add_int(&laddr->ifa->refcount, 1); LIST_INSERT_HEAD(&new_inp->sctp_addr_list, laddr, sctp_nxt_addr); new_inp->laddr_count++; if (oladdr == stcb->asoc.last_used_address) { stcb->asoc.last_used_address = laddr; } } } /* Now any running timers need to be adjusted. */ if (stcb->asoc.dack_timer.ep == old_inp) { SCTP_INP_DECR_REF(old_inp); stcb->asoc.dack_timer.ep = new_inp; SCTP_INP_INCR_REF(new_inp); } if (stcb->asoc.asconf_timer.ep == old_inp) { SCTP_INP_DECR_REF(old_inp); stcb->asoc.asconf_timer.ep = new_inp; SCTP_INP_INCR_REF(new_inp); } if (stcb->asoc.strreset_timer.ep == old_inp) { SCTP_INP_DECR_REF(old_inp); stcb->asoc.strreset_timer.ep = new_inp; SCTP_INP_INCR_REF(new_inp); } if (stcb->asoc.shut_guard_timer.ep == old_inp) { SCTP_INP_DECR_REF(old_inp); stcb->asoc.shut_guard_timer.ep = new_inp; SCTP_INP_INCR_REF(new_inp); } if (stcb->asoc.autoclose_timer.ep == old_inp) { SCTP_INP_DECR_REF(old_inp); stcb->asoc.autoclose_timer.ep = new_inp; SCTP_INP_INCR_REF(new_inp); } if (stcb->asoc.delete_prim_timer.ep == old_inp) { SCTP_INP_DECR_REF(old_inp); stcb->asoc.delete_prim_timer.ep = new_inp; SCTP_INP_INCR_REF(new_inp); } /* now what about the nets? */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (net->pmtu_timer.ep == old_inp) { SCTP_INP_DECR_REF(old_inp); net->pmtu_timer.ep = new_inp; SCTP_INP_INCR_REF(new_inp); } if (net->hb_timer.ep == old_inp) { SCTP_INP_DECR_REF(old_inp); net->hb_timer.ep = new_inp; SCTP_INP_INCR_REF(new_inp); } if (net->rxt_timer.ep == old_inp) { SCTP_INP_DECR_REF(old_inp); net->rxt_timer.ep = new_inp; SCTP_INP_INCR_REF(new_inp); } } SCTP_INP_WUNLOCK(new_inp); SCTP_INP_WUNLOCK(old_inp); } /* * insert an laddr entry with the given ifa for the desired list */ static int sctp_insert_laddr(struct sctpladdr *list, struct sctp_ifa *ifa, uint32_t act) { struct sctp_laddr *laddr; laddr = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); if (laddr == NULL) { /* out of memory? */ SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); return (EINVAL); } SCTP_INCR_LADDR_COUNT(); memset(laddr, 0, sizeof(*laddr)); (void)SCTP_GETTIME_TIMEVAL(&laddr->start_time); laddr->ifa = ifa; laddr->action = act; atomic_add_int(&ifa->refcount, 1); /* insert it */ LIST_INSERT_HEAD(list, laddr, sctp_nxt_addr); return (0); } /* * Remove an laddr entry from the local address list (on an assoc) */ static void sctp_remove_laddr(struct sctp_laddr *laddr) { /* remove from the list */ LIST_REMOVE(laddr, sctp_nxt_addr); sctp_free_ifa(laddr->ifa); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_laddr), laddr); SCTP_DECR_LADDR_COUNT(); } /* sctp_ifap is used to bypass normal local address validation checks */ int sctp_inpcb_bind(struct socket *so, struct sockaddr *addr, struct sctp_ifa *sctp_ifap, struct thread *p) { /* bind a ep to a socket address */ struct sctppcbhead *head; struct sctp_inpcb *inp, *inp_tmp; struct inpcb *ip_inp; int port_reuse_active = 0; int bindall; uint16_t lport; int error; uint32_t vrf_id; lport = 0; bindall = 1; inp = (struct sctp_inpcb *)so->so_pcb; ip_inp = (struct inpcb *)so->so_pcb; #ifdef SCTP_DEBUG if (addr) { SCTPDBG(SCTP_DEBUG_PCB1, "Bind called port: %d\n", ntohs(((struct sockaddr_in *)addr)->sin_port)); SCTPDBG(SCTP_DEBUG_PCB1, "Addr: "); SCTPDBG_ADDR(SCTP_DEBUG_PCB1, addr); } #endif if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) == 0) { /* already did a bind, subsequent binds NOT allowed ! */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); return (EINVAL); } #ifdef INVARIANTS if (p == NULL) panic("null proc/thread"); #endif if (addr != NULL) { switch (addr->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; /* IPV6_V6ONLY socket? */ if (SCTP_IPV6_V6ONLY(inp)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); return (EINVAL); } if (addr->sa_len != sizeof(*sin)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); return (EINVAL); } sin = (struct sockaddr_in *)addr; lport = sin->sin_port; /* * For LOOPBACK the prison_local_ip4() call * will transmute the ip address to the * proper value. */ if (p && (error = prison_local_ip4(p->td_ucred, &sin->sin_addr)) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); return (error); } if (sin->sin_addr.s_addr != INADDR_ANY) { bindall = 0; } break; } #endif #ifdef INET6 case AF_INET6: { /* * Only for pure IPv6 Address. (No IPv4 * Mapped!) */ struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)addr; if (addr->sa_len != sizeof(*sin6)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); return (EINVAL); } lport = sin6->sin6_port; /* * For LOOPBACK the prison_local_ip6() call * will transmute the ipv6 address to the * proper value. */ if (p && (error = prison_local_ip6(p->td_ucred, &sin6->sin6_addr, (SCTP_IPV6_V6ONLY(inp) != 0))) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); return (error); } if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { bindall = 0; /* KAME hack: embed scopeid */ if (sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)) != 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); return (EINVAL); } } /* this must be cleared for ifa_ifwithaddr() */ sin6->sin6_scope_id = 0; break; } #endif default: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EAFNOSUPPORT); return (EAFNOSUPPORT); } } SCTP_INP_INFO_WLOCK(); SCTP_INP_WLOCK(inp); /* Setup a vrf_id to be the default for the non-bind-all case. */ vrf_id = inp->def_vrf_id; /* increase our count due to the unlock we do */ SCTP_INP_INCR_REF(inp); if (lport) { /* * Did the caller specify a port? if so we must see if an ep * already has this one bound. */ /* got to be root to get at low ports */ if (ntohs(lport) < IPPORT_RESERVED) { if ((p != NULL) && ((error = priv_check(p, PRIV_NETINET_RESERVEDPORT) ) != 0)) { SCTP_INP_DECR_REF(inp); SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); return (error); } } SCTP_INP_WUNLOCK(inp); if (bindall) { vrf_id = inp->def_vrf_id; inp_tmp = sctp_pcb_findep(addr, 0, 1, vrf_id); if (inp_tmp != NULL) { /* * lock guy returned and lower count note * that we are not bound so inp_tmp should * NEVER be inp. And it is this inp * (inp_tmp) that gets the reference bump, * so we must lower it. */ SCTP_INP_DECR_REF(inp_tmp); /* unlock info */ if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) && (sctp_is_feature_on(inp_tmp, SCTP_PCB_FLAGS_PORTREUSE))) { /* * Ok, must be one-2-one and * allowing port re-use */ port_reuse_active = 1; goto continue_anyway; } SCTP_INP_DECR_REF(inp); SCTP_INP_INFO_WUNLOCK(); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EADDRINUSE); return (EADDRINUSE); } } else { inp_tmp = sctp_pcb_findep(addr, 0, 1, vrf_id); if (inp_tmp != NULL) { /* * lock guy returned and lower count note * that we are not bound so inp_tmp should * NEVER be inp. And it is this inp * (inp_tmp) that gets the reference bump, * so we must lower it. */ SCTP_INP_DECR_REF(inp_tmp); /* unlock info */ if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) && (sctp_is_feature_on(inp_tmp, SCTP_PCB_FLAGS_PORTREUSE))) { /* * Ok, must be one-2-one and * allowing port re-use */ port_reuse_active = 1; goto continue_anyway; } SCTP_INP_DECR_REF(inp); SCTP_INP_INFO_WUNLOCK(); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EADDRINUSE); return (EADDRINUSE); } } continue_anyway: SCTP_INP_WLOCK(inp); if (bindall) { /* verify that no lport is not used by a singleton */ if ((port_reuse_active == 0) && (inp_tmp = sctp_isport_inuse(inp, lport, vrf_id))) { /* Sorry someone already has this one bound */ if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) && (sctp_is_feature_on(inp_tmp, SCTP_PCB_FLAGS_PORTREUSE))) { port_reuse_active = 1; } else { SCTP_INP_DECR_REF(inp); SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EADDRINUSE); return (EADDRINUSE); } } } } else { uint16_t first, last, candidate; uint16_t count; int done; if (ip_inp->inp_flags & INP_HIGHPORT) { first = MODULE_GLOBAL(ipport_hifirstauto); last = MODULE_GLOBAL(ipport_hilastauto); } else if (ip_inp->inp_flags & INP_LOWPORT) { if (p && (error = priv_check(p, PRIV_NETINET_RESERVEDPORT) )) { SCTP_INP_DECR_REF(inp); SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, error); return (error); } first = MODULE_GLOBAL(ipport_lowfirstauto); last = MODULE_GLOBAL(ipport_lowlastauto); } else { first = MODULE_GLOBAL(ipport_firstauto); last = MODULE_GLOBAL(ipport_lastauto); } if (first > last) { uint16_t temp; temp = first; first = last; last = temp; } count = last - first + 1; /* number of candidates */ candidate = first + sctp_select_initial_TSN(&inp->sctp_ep) % (count); done = 0; while (!done) { if (sctp_isport_inuse(inp, htons(candidate), inp->def_vrf_id) == NULL) { done = 1; } if (!done) { if (--count == 0) { SCTP_INP_DECR_REF(inp); SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EADDRINUSE); return (EADDRINUSE); } if (candidate == last) candidate = first; else candidate = candidate + 1; } } lport = htons(candidate); } SCTP_INP_DECR_REF(inp); if (inp->sctp_flags & (SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_SOCKET_ALLGONE)) { /* * this really should not happen. The guy did a non-blocking * bind and then did a close at the same time. */ SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); return (EINVAL); } /* ok we look clear to give out this port, so lets setup the binding */ if (bindall) { /* binding to all addresses, so just set in the proper flags */ inp->sctp_flags |= SCTP_PCB_FLAGS_BOUNDALL; /* set the automatic addr changes from kernel flag */ if (SCTP_BASE_SYSCTL(sctp_auto_asconf) == 0) { sctp_feature_off(inp, SCTP_PCB_FLAGS_DO_ASCONF); sctp_feature_off(inp, SCTP_PCB_FLAGS_AUTO_ASCONF); } else { sctp_feature_on(inp, SCTP_PCB_FLAGS_DO_ASCONF); sctp_feature_on(inp, SCTP_PCB_FLAGS_AUTO_ASCONF); } if (SCTP_BASE_SYSCTL(sctp_multiple_asconfs) == 0) { sctp_feature_off(inp, SCTP_PCB_FLAGS_MULTIPLE_ASCONFS); } else { sctp_feature_on(inp, SCTP_PCB_FLAGS_MULTIPLE_ASCONFS); } /* * set the automatic mobility_base from kernel flag (by * micchie) */ if (SCTP_BASE_SYSCTL(sctp_mobility_base) == 0) { sctp_mobility_feature_off(inp, SCTP_MOBILITY_BASE); sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED); } else { sctp_mobility_feature_on(inp, SCTP_MOBILITY_BASE); sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED); } /* * set the automatic mobility_fasthandoff from kernel flag * (by micchie) */ if (SCTP_BASE_SYSCTL(sctp_mobility_fasthandoff) == 0) { sctp_mobility_feature_off(inp, SCTP_MOBILITY_FASTHANDOFF); sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED); } else { sctp_mobility_feature_on(inp, SCTP_MOBILITY_FASTHANDOFF); sctp_mobility_feature_off(inp, SCTP_MOBILITY_PRIM_DELETED); } } else { /* * bind specific, make sure flags is off and add a new * address structure to the sctp_addr_list inside the ep * structure. * * We will need to allocate one and insert it at the head. * The socketopt call can just insert new addresses in there * as well. It will also have to do the embed scope kame * hack too (before adding). */ struct sctp_ifa *ifa; union sctp_sockstore store; memset(&store, 0, sizeof(store)); switch (addr->sa_family) { #ifdef INET case AF_INET: memcpy(&store.sin, addr, sizeof(struct sockaddr_in)); store.sin.sin_port = 0; break; #endif #ifdef INET6 case AF_INET6: memcpy(&store.sin6, addr, sizeof(struct sockaddr_in6)); store.sin6.sin6_port = 0; break; #endif default: break; } /* * first find the interface with the bound address need to * zero out the port to find the address! yuck! can't do * this earlier since need port for sctp_pcb_findep() */ if (sctp_ifap != NULL) { ifa = sctp_ifap; } else { /* * Note for BSD we hit here always other O/S's will * pass things in via the sctp_ifap argument. */ ifa = sctp_find_ifa_by_addr(&store.sa, vrf_id, SCTP_ADDR_NOT_LOCKED); } if (ifa == NULL) { /* Can't find an interface with that address */ SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EADDRNOTAVAIL); return (EADDRNOTAVAIL); } #ifdef INET6 if (addr->sa_family == AF_INET6) { /* GAK, more FIXME IFA lock? */ if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { /* Can't bind a non-existent addr. */ SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); return (EINVAL); } } #endif /* we're not bound all */ inp->sctp_flags &= ~SCTP_PCB_FLAGS_BOUNDALL; /* allow bindx() to send ASCONF's for binding changes */ sctp_feature_on(inp, SCTP_PCB_FLAGS_DO_ASCONF); /* clear automatic addr changes from kernel flag */ sctp_feature_off(inp, SCTP_PCB_FLAGS_AUTO_ASCONF); /* add this address to the endpoint list */ error = sctp_insert_laddr(&inp->sctp_addr_list, ifa, 0); if (error != 0) { SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); return (error); } inp->laddr_count++; } /* find the bucket */ if (port_reuse_active) { /* Put it into tcp 1-2-1 hash */ head = &SCTP_BASE_INFO(sctp_tcpephash)[SCTP_PCBHASH_ALLADDR(lport, SCTP_BASE_INFO(hashtcpmark))]; inp->sctp_flags |= SCTP_PCB_FLAGS_IN_TCPPOOL; } else { head = &SCTP_BASE_INFO(sctp_ephash)[SCTP_PCBHASH_ALLADDR(lport, SCTP_BASE_INFO(hashmark))]; } /* put it in the bucket */ LIST_INSERT_HEAD(head, inp, sctp_hash); SCTPDBG(SCTP_DEBUG_PCB1, "Main hash to bind at head:%p, bound port:%d - in tcp_pool=%d\n", (void *)head, ntohs(lport), port_reuse_active); /* set in the port */ inp->sctp_lport = lport; /* turn off just the unbound flag */ inp->sctp_flags &= ~SCTP_PCB_FLAGS_UNBOUND; SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); return (0); } static void sctp_iterator_inp_being_freed(struct sctp_inpcb *inp) { struct sctp_iterator *it, *nit; /* * We enter with the only the ITERATOR_LOCK in place and a write * lock on the inp_info stuff. */ it = sctp_it_ctl.cur_it; if (it && (it->vn != curvnet)) { /* Its not looking at our VNET */ return; } if (it && (it->inp == inp)) { /* * This is tricky and we hold the iterator lock, but when it * returns and gets the lock (when we release it) the * iterator will try to operate on inp. We need to stop that * from happening. But of course the iterator has a * reference on the stcb and inp. We can mark it and it will * stop. * * If its a single iterator situation, we set the end * iterator flag. Otherwise we set the iterator to go to the * next inp. * */ if (it->iterator_flags & SCTP_ITERATOR_DO_SINGLE_INP) { sctp_it_ctl.iterator_flags |= SCTP_ITERATOR_STOP_CUR_IT; } else { sctp_it_ctl.iterator_flags |= SCTP_ITERATOR_STOP_CUR_INP; } } /* * Now go through and remove any single reference to our inp that * may be still pending on the list */ SCTP_IPI_ITERATOR_WQ_LOCK(); TAILQ_FOREACH_SAFE(it, &sctp_it_ctl.iteratorhead, sctp_nxt_itr, nit) { if (it->vn != curvnet) { continue; } if (it->inp == inp) { /* This one points to me is it inp specific? */ if (it->iterator_flags & SCTP_ITERATOR_DO_SINGLE_INP) { /* Remove and free this one */ TAILQ_REMOVE(&sctp_it_ctl.iteratorhead, it, sctp_nxt_itr); if (it->function_atend != NULL) { (*it->function_atend) (it->pointer, it->val); } SCTP_FREE(it, SCTP_M_ITER); } else { it->inp = LIST_NEXT(it->inp, sctp_list); if (it->inp) { SCTP_INP_INCR_REF(it->inp); } } /* * When its put in the refcnt is incremented so decr * it */ SCTP_INP_DECR_REF(inp); } } SCTP_IPI_ITERATOR_WQ_UNLOCK(); } /* release sctp_inpcb unbind the port */ void sctp_inpcb_free(struct sctp_inpcb *inp, int immediate, int from) { /* * Here we free a endpoint. We must find it (if it is in the Hash * table) and remove it from there. Then we must also find it in the * overall list and remove it from there. After all removals are * complete then any timer has to be stopped. Then start the actual * freeing. a) Any local lists. b) Any associations. c) The hash of * all associations. d) finally the ep itself. */ struct sctp_tcb *asoc, *nasoc; struct sctp_laddr *laddr, *nladdr; struct inpcb *ip_pcb; struct socket *so; int being_refed = 0; struct sctp_queued_to_read *sq, *nsq; int cnt; sctp_sharedkey_t *shared_key, *nshared_key; #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 0); #endif SCTP_ITERATOR_LOCK(); /* mark any iterators on the list or being processed */ sctp_iterator_inp_being_freed(inp); SCTP_ITERATOR_UNLOCK(); so = inp->sctp_socket; if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { /* been here before.. eeks.. get out of here */ SCTP_PRINTF("This conflict in free SHOULD not be happening! from %d, imm %d\n", from, immediate); #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 1); #endif return; } SCTP_ASOC_CREATE_LOCK(inp); SCTP_INP_INFO_WLOCK(); SCTP_INP_WLOCK(inp); if (from == SCTP_CALLED_AFTER_CMPSET_OFCLOSE) { inp->sctp_flags &= ~SCTP_PCB_FLAGS_CLOSE_IP; /* socket is gone, so no more wakeups allowed */ inp->sctp_flags |= SCTP_PCB_FLAGS_DONT_WAKE; inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEINPUT; inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEOUTPUT; } /* First time through we have the socket lock, after that no more. */ sctp_timer_stop(SCTP_TIMER_TYPE_NEWCOOKIE, inp, NULL, NULL, SCTP_FROM_SCTP_PCB + SCTP_LOC_1); if (inp->control) { sctp_m_freem(inp->control); inp->control = NULL; } if (inp->pkt) { sctp_m_freem(inp->pkt); inp->pkt = NULL; } ip_pcb = &inp->ip_inp.inp; /* we could just cast the main pointer * here but I will be nice :> (i.e. * ip_pcb = ep;) */ if (immediate == SCTP_FREE_SHOULD_USE_GRACEFUL_CLOSE) { int cnt_in_sd; cnt_in_sd = 0; LIST_FOREACH_SAFE(asoc, &inp->sctp_asoc_list, sctp_tcblist, nasoc) { SCTP_TCB_LOCK(asoc); if (asoc->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { /* Skip guys being freed */ cnt_in_sd++; if (asoc->asoc.state & SCTP_STATE_IN_ACCEPT_QUEUE) { /* * Special case - we did not start a * kill timer on the asoc due to it * was not closed. So go ahead and * start it now. */ SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_IN_ACCEPT_QUEUE); sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, asoc, NULL); } SCTP_TCB_UNLOCK(asoc); continue; } if (((SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_WAIT) || (SCTP_GET_STATE(asoc) == SCTP_STATE_COOKIE_ECHOED)) && (asoc->asoc.total_output_queue_size == 0)) { /* * If we have data in queue, we don't want * to just free since the app may have done, * send()/close or connect/send/close. And * it wants the data to get across first. */ /* Just abandon things in the front states */ if (sctp_free_assoc(inp, asoc, SCTP_PCBFREE_NOFORCE, SCTP_FROM_SCTP_PCB + SCTP_LOC_2) == 0) { cnt_in_sd++; } continue; } /* Disconnect the socket please */ asoc->sctp_socket = NULL; SCTP_ADD_SUBSTATE(asoc, SCTP_STATE_CLOSED_SOCKET); if ((asoc->asoc.size_on_reasm_queue > 0) || (asoc->asoc.control_pdapi) || (asoc->asoc.size_on_all_streams > 0) || (so && (so->so_rcv.sb_cc > 0))) { /* Left with Data unread */ struct mbuf *op_err; op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, ""); asoc->sctp_ep->last_abort_code = SCTP_FROM_SCTP_PCB + SCTP_LOC_3; sctp_send_abort_tcb(asoc, op_err, SCTP_SO_LOCKED); SCTP_STAT_INCR_COUNTER32(sctps_aborted); if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } if (sctp_free_assoc(inp, asoc, SCTP_PCBFREE_NOFORCE, SCTP_FROM_SCTP_PCB + SCTP_LOC_4) == 0) { cnt_in_sd++; } continue; } else if (TAILQ_EMPTY(&asoc->asoc.send_queue) && TAILQ_EMPTY(&asoc->asoc.sent_queue) && (asoc->asoc.stream_queue_cnt == 0)) { if ((*asoc->asoc.ss_functions.sctp_ss_is_user_msgs_incomplete) (asoc, &asoc->asoc)) { goto abort_anyway; } if ((SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_SENT) && (SCTP_GET_STATE(asoc) != SCTP_STATE_SHUTDOWN_ACK_SENT)) { struct sctp_nets *netp; /* * there is nothing queued to send, * so I send shutdown */ if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } SCTP_SET_STATE(asoc, SCTP_STATE_SHUTDOWN_SENT); sctp_stop_timers_for_shutdown(asoc); if (asoc->asoc.alternate) { netp = asoc->asoc.alternate; } else { netp = asoc->asoc.primary_destination; } sctp_send_shutdown(asoc, netp); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN, asoc->sctp_ep, asoc, netp); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, asoc->sctp_ep, asoc, NULL); sctp_chunk_output(inp, asoc, SCTP_OUTPUT_FROM_SHUT_TMR, SCTP_SO_LOCKED); } } else { /* mark into shutdown pending */ SCTP_ADD_SUBSTATE(asoc, SCTP_STATE_SHUTDOWN_PENDING); sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, asoc->sctp_ep, asoc, NULL); if ((*asoc->asoc.ss_functions.sctp_ss_is_user_msgs_incomplete) (asoc, &asoc->asoc)) { SCTP_ADD_SUBSTATE(asoc, SCTP_STATE_PARTIAL_MSG_LEFT); } if (TAILQ_EMPTY(&asoc->asoc.send_queue) && TAILQ_EMPTY(&asoc->asoc.sent_queue) && (asoc->asoc.state & SCTP_STATE_PARTIAL_MSG_LEFT)) { struct mbuf *op_err; abort_anyway: op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, ""); asoc->sctp_ep->last_abort_code = SCTP_FROM_SCTP_PCB + SCTP_LOC_5; sctp_send_abort_tcb(asoc, op_err, SCTP_SO_LOCKED); SCTP_STAT_INCR_COUNTER32(sctps_aborted); if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } if (sctp_free_assoc(inp, asoc, SCTP_PCBFREE_NOFORCE, SCTP_FROM_SCTP_PCB + SCTP_LOC_6) == 0) { cnt_in_sd++; } continue; } else { sctp_chunk_output(inp, asoc, SCTP_OUTPUT_FROM_CLOSING, SCTP_SO_LOCKED); } } cnt_in_sd++; SCTP_TCB_UNLOCK(asoc); } /* now is there some left in our SHUTDOWN state? */ if (cnt_in_sd) { #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 2); #endif inp->sctp_socket = NULL; SCTP_INP_WUNLOCK(inp); SCTP_ASOC_CREATE_UNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); return; } } inp->sctp_socket = NULL; if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) != SCTP_PCB_FLAGS_UNBOUND) { /* * ok, this guy has been bound. It's port is somewhere in * the SCTP_BASE_INFO(hash table). Remove it! */ LIST_REMOVE(inp, sctp_hash); inp->sctp_flags |= SCTP_PCB_FLAGS_UNBOUND; } /* * If there is a timer running to kill us, forget it, since it may * have a contest on the INP lock.. which would cause us to die ... */ cnt = 0; LIST_FOREACH_SAFE(asoc, &inp->sctp_asoc_list, sctp_tcblist, nasoc) { SCTP_TCB_LOCK(asoc); if (immediate != SCTP_FREE_SHOULD_USE_GRACEFUL_CLOSE) { /* Disconnect the socket please */ asoc->sctp_socket = NULL; SCTP_ADD_SUBSTATE(asoc, SCTP_STATE_CLOSED_SOCKET); } if (asoc->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { if (asoc->asoc.state & SCTP_STATE_IN_ACCEPT_QUEUE) { SCTP_CLEAR_SUBSTATE(asoc, SCTP_STATE_IN_ACCEPT_QUEUE); sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, asoc, NULL); } cnt++; SCTP_TCB_UNLOCK(asoc); continue; } /* Free associations that are NOT killing us */ if ((SCTP_GET_STATE(asoc) != SCTP_STATE_COOKIE_WAIT) && ((asoc->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) == 0)) { struct mbuf *op_err; op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, ""); asoc->sctp_ep->last_abort_code = SCTP_FROM_SCTP_PCB + SCTP_LOC_7; sctp_send_abort_tcb(asoc, op_err, SCTP_SO_LOCKED); SCTP_STAT_INCR_COUNTER32(sctps_aborted); } else if (asoc->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { cnt++; SCTP_TCB_UNLOCK(asoc); continue; } if ((SCTP_GET_STATE(asoc) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(asoc) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } if (sctp_free_assoc(inp, asoc, SCTP_PCBFREE_FORCE, SCTP_FROM_SCTP_PCB + SCTP_LOC_8) == 0) { cnt++; } } if (cnt) { /* Ok we have someone out there that will kill us */ #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 3); #endif SCTP_INP_WUNLOCK(inp); SCTP_ASOC_CREATE_UNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); return; } if (SCTP_INP_LOCK_CONTENDED(inp)) being_refed++; if (SCTP_INP_READ_CONTENDED(inp)) being_refed++; if (SCTP_ASOC_CREATE_LOCK_CONTENDED(inp)) being_refed++; /* NOTE: 0 refcount also means no timers are referencing us. */ if ((inp->refcount) || (being_refed) || (inp->sctp_flags & SCTP_PCB_FLAGS_CLOSE_IP)) { #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 4); #endif sctp_timer_start(SCTP_TIMER_TYPE_INPKILL, inp, NULL, NULL); SCTP_INP_WUNLOCK(inp); SCTP_ASOC_CREATE_UNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); return; } inp->sctp_ep.signature_change.type = 0; inp->sctp_flags |= SCTP_PCB_FLAGS_SOCKET_ALLGONE; /* * Remove it from the list .. last thing we need a lock for. */ LIST_REMOVE(inp, sctp_list); SCTP_INP_WUNLOCK(inp); SCTP_ASOC_CREATE_UNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 5); #endif if ((inp->sctp_asocidhash) != NULL) { SCTP_HASH_FREE(inp->sctp_asocidhash, inp->hashasocidmark); inp->sctp_asocidhash = NULL; } /* sa_ignore FREED_MEMORY */ TAILQ_FOREACH_SAFE(sq, &inp->read_queue, next, nsq) { /* Its only abandoned if it had data left */ if (sq->length) SCTP_STAT_INCR(sctps_left_abandon); TAILQ_REMOVE(&inp->read_queue, sq, next); sctp_free_remote_addr(sq->whoFrom); if (so) so->so_rcv.sb_cc -= sq->length; if (sq->data) { sctp_m_freem(sq->data); sq->data = NULL; } /* * no need to free the net count, since at this point all * assoc's are gone. */ sctp_free_a_readq(NULL, sq); } /* Now the sctp_pcb things */ /* * free each asoc if it is not already closed/free. we can't use the * macro here since le_next will get freed as part of the * sctp_free_assoc() call. */ if (ip_pcb->inp_options) { (void)sctp_m_free(ip_pcb->inp_options); ip_pcb->inp_options = 0; } #ifdef INET6 if (ip_pcb->inp_vflag & INP_IPV6) { ip6_freepcbopts(ip_pcb->in6p_outputopts); } #endif /* INET6 */ ip_pcb->inp_vflag = 0; /* free up authentication fields */ if (inp->sctp_ep.local_auth_chunks != NULL) sctp_free_chunklist(inp->sctp_ep.local_auth_chunks); if (inp->sctp_ep.local_hmacs != NULL) sctp_free_hmaclist(inp->sctp_ep.local_hmacs); LIST_FOREACH_SAFE(shared_key, &inp->sctp_ep.shared_keys, next, nshared_key) { LIST_REMOVE(shared_key, next); sctp_free_sharedkey(shared_key); /* sa_ignore FREED_MEMORY */ } /* * if we have an address list the following will free the list of * ifaddr's that are set into this ep. Again macro limitations here, * since the LIST_FOREACH could be a bad idea. */ LIST_FOREACH_SAFE(laddr, &inp->sctp_addr_list, sctp_nxt_addr, nladdr) { sctp_remove_laddr(laddr); } #ifdef SCTP_TRACK_FREED_ASOCS /* TEMP CODE */ LIST_FOREACH_SAFE(asoc, &inp->sctp_asoc_free_list, sctp_tcblist, nasoc) { LIST_REMOVE(asoc, sctp_tcblist); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), asoc); SCTP_DECR_ASOC_COUNT(); } /* *** END TEMP CODE *** */ #endif /* Now lets see about freeing the EP hash table. */ if (inp->sctp_tcbhash != NULL) { SCTP_HASH_FREE(inp->sctp_tcbhash, inp->sctp_hashmark); inp->sctp_tcbhash = NULL; } /* Now we must put the ep memory back into the zone pool */ crfree(inp->ip_inp.inp.inp_cred); INP_LOCK_DESTROY(&inp->ip_inp.inp); SCTP_INP_LOCK_DESTROY(inp); SCTP_INP_READ_DESTROY(inp); SCTP_ASOC_CREATE_LOCK_DESTROY(inp); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_ep), inp); SCTP_DECR_EP_COUNT(); } struct sctp_nets * sctp_findnet(struct sctp_tcb *stcb, struct sockaddr *addr) { struct sctp_nets *net; /* locate the address */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (sctp_cmpaddr(addr, (struct sockaddr *)&net->ro._l_addr)) return (net); } return (NULL); } int sctp_is_address_on_local_host(struct sockaddr *addr, uint32_t vrf_id) { struct sctp_ifa *sctp_ifa; sctp_ifa = sctp_find_ifa_by_addr(addr, vrf_id, SCTP_ADDR_NOT_LOCKED); if (sctp_ifa) { return (1); } else { return (0); } } /* * add's a remote endpoint address, done with the INIT/INIT-ACK as well as * when a ASCONF arrives that adds it. It will also initialize all the cwnd * stats of stuff. */ int sctp_add_remote_addr(struct sctp_tcb *stcb, struct sockaddr *newaddr, struct sctp_nets **netp, uint16_t port, int set_scope, int from) { /* * The following is redundant to the same lines in the * sctp_aloc_assoc() but is needed since others call the add address * function */ struct sctp_nets *net, *netfirst; int addr_inscope; SCTPDBG(SCTP_DEBUG_PCB1, "Adding an address (from:%d) to the peer: ", from); SCTPDBG_ADDR(SCTP_DEBUG_PCB1, newaddr); netfirst = sctp_findnet(stcb, newaddr); if (netfirst) { /* * Lie and return ok, we don't want to make the association * go away for this behavior. It will happen in the TCP * model in a connected socket. It does not reach the hash * table until after the association is built so it can't be * found. Mark as reachable, since the initial creation will * have been cleared and the NOT_IN_ASSOC flag will have * been added... and we don't want to end up removing it * back out. */ if (netfirst->dest_state & SCTP_ADDR_UNCONFIRMED) { netfirst->dest_state = (SCTP_ADDR_REACHABLE | SCTP_ADDR_UNCONFIRMED); } else { netfirst->dest_state = SCTP_ADDR_REACHABLE; } return (0); } addr_inscope = 1; switch (newaddr->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; sin = (struct sockaddr_in *)newaddr; if (sin->sin_addr.s_addr == 0) { /* Invalid address */ return (-1); } /* zero out the zero area */ memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); /* assure len is set */ sin->sin_len = sizeof(struct sockaddr_in); if (set_scope) { if (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) { stcb->asoc.scope.ipv4_local_scope = 1; } } else { /* Validate the address is in scope */ if ((IN4_ISPRIVATE_ADDRESS(&sin->sin_addr)) && (stcb->asoc.scope.ipv4_local_scope == 0)) { addr_inscope = 0; } } break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)newaddr; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { /* Invalid address */ return (-1); } /* assure len is set */ sin6->sin6_len = sizeof(struct sockaddr_in6); if (set_scope) { if (sctp_is_address_on_local_host(newaddr, stcb->asoc.vrf_id)) { stcb->asoc.scope.loopback_scope = 1; stcb->asoc.scope.local_scope = 0; stcb->asoc.scope.ipv4_local_scope = 1; stcb->asoc.scope.site_scope = 1; } else if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { /* * If the new destination is a * LINK_LOCAL we must have common * site scope. Don't set the local * scope since we may not share all * links, only loopback can do this. * Links on the local network would * also be on our private network * for v4 too. */ stcb->asoc.scope.ipv4_local_scope = 1; stcb->asoc.scope.site_scope = 1; } else if (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr)) { /* * If the new destination is * SITE_LOCAL then we must have site * scope in common. */ stcb->asoc.scope.site_scope = 1; } } else { /* Validate the address is in scope */ if (IN6_IS_ADDR_LOOPBACK(&sin6->sin6_addr) && (stcb->asoc.scope.loopback_scope == 0)) { addr_inscope = 0; } else if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr) && (stcb->asoc.scope.local_scope == 0)) { addr_inscope = 0; } else if (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr) && (stcb->asoc.scope.site_scope == 0)) { addr_inscope = 0; } } break; } #endif default: /* not supported family type */ return (-1); } net = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_net), struct sctp_nets); if (net == NULL) { return (-1); } SCTP_INCR_RADDR_COUNT(); memset(net, 0, sizeof(struct sctp_nets)); (void)SCTP_GETTIME_TIMEVAL(&net->start_time); memcpy(&net->ro._l_addr, newaddr, newaddr->sa_len); switch (newaddr->sa_family) { #ifdef INET case AF_INET: ((struct sockaddr_in *)&net->ro._l_addr)->sin_port = stcb->rport; break; #endif #ifdef INET6 case AF_INET6: ((struct sockaddr_in6 *)&net->ro._l_addr)->sin6_port = stcb->rport; break; #endif default: break; } net->addr_is_local = sctp_is_address_on_local_host(newaddr, stcb->asoc.vrf_id); if (net->addr_is_local && ((set_scope || (from == SCTP_ADDR_IS_CONFIRMED)))) { stcb->asoc.scope.loopback_scope = 1; stcb->asoc.scope.ipv4_local_scope = 1; stcb->asoc.scope.local_scope = 0; stcb->asoc.scope.site_scope = 1; addr_inscope = 1; } net->failure_threshold = stcb->asoc.def_net_failure; net->pf_threshold = stcb->asoc.def_net_pf_threshold; if (addr_inscope == 0) { net->dest_state = (SCTP_ADDR_REACHABLE | SCTP_ADDR_OUT_OF_SCOPE); } else { if (from == SCTP_ADDR_IS_CONFIRMED) /* SCTP_ADDR_IS_CONFIRMED is passed by connect_x */ net->dest_state = SCTP_ADDR_REACHABLE; else net->dest_state = SCTP_ADDR_REACHABLE | SCTP_ADDR_UNCONFIRMED; } /* * We set this to 0, the timer code knows that this means its an * initial value */ net->rto_needed = 1; net->RTO = 0; net->RTO_measured = 0; stcb->asoc.numnets++; net->ref_count = 1; net->cwr_window_tsn = net->last_cwr_tsn = stcb->asoc.sending_seq - 1; net->port = port; net->dscp = stcb->asoc.default_dscp; #ifdef INET6 net->flowlabel = stcb->asoc.default_flowlabel; #endif if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_DONOT_HEARTBEAT)) { net->dest_state |= SCTP_ADDR_NOHB; } else { net->dest_state &= ~SCTP_ADDR_NOHB; } if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_DO_NOT_PMTUD)) { net->dest_state |= SCTP_ADDR_NO_PMTUD; } else { net->dest_state &= ~SCTP_ADDR_NO_PMTUD; } net->heart_beat_delay = stcb->asoc.heart_beat_delay; /* Init the timer structure */ SCTP_OS_TIMER_INIT(&net->rxt_timer.timer); SCTP_OS_TIMER_INIT(&net->pmtu_timer.timer); SCTP_OS_TIMER_INIT(&net->hb_timer.timer); /* Now generate a route for this guy */ #ifdef INET6 /* KAME hack: embed scopeid */ if (newaddr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; (void)sa6_embedscope(sin6, MODULE_GLOBAL(ip6_use_defzone)); sin6->sin6_scope_id = 0; } #endif SCTP_RTALLOC((sctp_route_t *)&net->ro, stcb->asoc.vrf_id, stcb->sctp_ep->fibnum); net->src_addr_selected = 0; if (SCTP_ROUTE_HAS_VALID_IFN(&net->ro)) { /* Get source address */ net->ro._s_addr = sctp_source_address_selection(stcb->sctp_ep, stcb, (sctp_route_t *)&net->ro, net, 0, stcb->asoc.vrf_id); if (stcb->asoc.default_mtu > 0) { net->mtu = stcb->asoc.default_mtu; switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: net->mtu += SCTP_MIN_V4_OVERHEAD; break; #endif #ifdef INET6 case AF_INET6: net->mtu += SCTP_MIN_OVERHEAD; break; #endif default: break; } #if defined(INET) || defined(INET6) if (net->port) { net->mtu += (uint32_t)sizeof(struct udphdr); } #endif } else if (net->ro._s_addr != NULL) { uint32_t imtu, rmtu, hcmtu; net->src_addr_selected = 1; /* Now get the interface MTU */ if (net->ro._s_addr->ifn_p != NULL) { imtu = SCTP_GATHER_MTU_FROM_INTFC(net->ro._s_addr->ifn_p); } else { imtu = 0; } rmtu = SCTP_GATHER_MTU_FROM_ROUTE(net->ro._s_addr, &net->ro._l_addr.sa, net->ro.ro_nh); hcmtu = sctp_hc_get_mtu(&net->ro._l_addr, stcb->sctp_ep->fibnum); net->mtu = sctp_min_mtu(hcmtu, rmtu, imtu); } } if (net->mtu == 0) { if (stcb->asoc.default_mtu > 0) { net->mtu = stcb->asoc.default_mtu; switch (net->ro._l_addr.sa.sa_family) { #ifdef INET case AF_INET: net->mtu += SCTP_MIN_V4_OVERHEAD; break; #endif #ifdef INET6 case AF_INET6: net->mtu += SCTP_MIN_OVERHEAD; break; #endif default: break; } #if defined(INET) || defined(INET6) if (net->port) { net->mtu += (uint32_t)sizeof(struct udphdr); } #endif } else { switch (newaddr->sa_family) { #ifdef INET case AF_INET: net->mtu = SCTP_DEFAULT_MTU; break; #endif #ifdef INET6 case AF_INET6: net->mtu = 1280; break; #endif default: break; } } } #if defined(INET) || defined(INET6) if (net->port) { net->mtu -= (uint32_t)sizeof(struct udphdr); } #endif if (from == SCTP_ALLOC_ASOC) { stcb->asoc.smallest_mtu = net->mtu; } if (stcb->asoc.smallest_mtu > net->mtu) { sctp_pathmtu_adjustment(stcb, net->mtu); } #ifdef INET6 if (newaddr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)&net->ro._l_addr; (void)sa6_recoverscope(sin6); } #endif /* JRS - Use the congestion control given in the CC module */ if (stcb->asoc.cc_functions.sctp_set_initial_cc_param != NULL) (*stcb->asoc.cc_functions.sctp_set_initial_cc_param) (stcb, net); /* * CMT: CUC algo - set find_pseudo_cumack to TRUE (1) at beginning * of assoc (2005/06/27, iyengar@cis.udel.edu) */ net->find_pseudo_cumack = 1; net->find_rtx_pseudo_cumack = 1; /* Choose an initial flowid. */ net->flowid = stcb->asoc.my_vtag ^ ntohs(stcb->rport) ^ ntohs(stcb->sctp_ep->sctp_lport); net->flowtype = M_HASHTYPE_OPAQUE_HASH; if (netp) { *netp = net; } netfirst = TAILQ_FIRST(&stcb->asoc.nets); if (net->ro.ro_nh == NULL) { /* Since we have no route put it at the back */ TAILQ_INSERT_TAIL(&stcb->asoc.nets, net, sctp_next); } else if (netfirst == NULL) { /* We are the first one in the pool. */ TAILQ_INSERT_HEAD(&stcb->asoc.nets, net, sctp_next); } else if (netfirst->ro.ro_nh == NULL) { /* * First one has NO route. Place this one ahead of the first * one. */ TAILQ_INSERT_HEAD(&stcb->asoc.nets, net, sctp_next); } else if (net->ro.ro_nh->nh_ifp != netfirst->ro.ro_nh->nh_ifp) { /* * This one has a different interface than the one at the * top of the list. Place it ahead. */ TAILQ_INSERT_HEAD(&stcb->asoc.nets, net, sctp_next); } else { /* * Ok we have the same interface as the first one. Move * forward until we find either a) one with a NULL route... * insert ahead of that b) one with a different ifp.. insert * after that. c) end of the list.. insert at the tail. */ struct sctp_nets *netlook; do { netlook = TAILQ_NEXT(netfirst, sctp_next); if (netlook == NULL) { /* End of the list */ TAILQ_INSERT_TAIL(&stcb->asoc.nets, net, sctp_next); break; } else if (netlook->ro.ro_nh == NULL) { /* next one has NO route */ TAILQ_INSERT_BEFORE(netfirst, net, sctp_next); break; } else if (netlook->ro.ro_nh->nh_ifp != net->ro.ro_nh->nh_ifp) { TAILQ_INSERT_AFTER(&stcb->asoc.nets, netlook, net, sctp_next); break; } /* Shift forward */ netfirst = netlook; } while (netlook != NULL); } /* got to have a primary set */ if (stcb->asoc.primary_destination == 0) { stcb->asoc.primary_destination = net; } else if ((stcb->asoc.primary_destination->ro.ro_nh == NULL) && (net->ro.ro_nh) && ((net->dest_state & SCTP_ADDR_UNCONFIRMED) == 0)) { /* No route to current primary adopt new primary */ stcb->asoc.primary_destination = net; } /* Validate primary is first */ net = TAILQ_FIRST(&stcb->asoc.nets); if ((net != stcb->asoc.primary_destination) && (stcb->asoc.primary_destination)) { /* * first one on the list is NOT the primary sctp_cmpaddr() * is much more efficient if the primary is the first on the * list, make it so. */ TAILQ_REMOVE(&stcb->asoc.nets, stcb->asoc.primary_destination, sctp_next); TAILQ_INSERT_HEAD(&stcb->asoc.nets, stcb->asoc.primary_destination, sctp_next); } return (0); } static uint32_t sctp_aloc_a_assoc_id(struct sctp_inpcb *inp, struct sctp_tcb *stcb) { uint32_t id; struct sctpasochead *head; struct sctp_tcb *lstcb; try_again: if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { /* TSNH */ return (0); } /* * We don't allow assoc id to be one of SCTP_FUTURE_ASSOC, * SCTP_CURRENT_ASSOC and SCTP_ALL_ASSOC. */ if (inp->sctp_associd_counter <= SCTP_ALL_ASSOC) { inp->sctp_associd_counter = SCTP_ALL_ASSOC + 1; } id = inp->sctp_associd_counter; inp->sctp_associd_counter++; lstcb = sctp_findasoc_ep_asocid_locked(inp, (sctp_assoc_t)id, 0); if (lstcb) { goto try_again; } head = &inp->sctp_asocidhash[SCTP_PCBHASH_ASOC(id, inp->hashasocidmark)]; LIST_INSERT_HEAD(head, stcb, sctp_tcbasocidhash); stcb->asoc.in_asocid_hash = 1; return (id); } /* * allocate an association and add it to the endpoint. The caller must be * careful to add all additional addresses once they are know right away or * else the assoc will be may experience a blackout scenario. */ struct sctp_tcb * sctp_aloc_assoc(struct sctp_inpcb *inp, struct sockaddr *firstaddr, int *error, uint32_t override_tag, uint32_t vrf_id, uint16_t o_streams, uint16_t port, struct thread *p, int initialize_auth_params) { /* note the p argument is only valid in unbound sockets */ struct sctp_tcb *stcb; struct sctp_association *asoc; struct sctpasochead *head; uint16_t rport; int err; /* * Assumption made here: Caller has done a * sctp_findassociation_ep_addr(ep, addr's); to make sure the * address does not exist already. */ if (SCTP_BASE_INFO(ipi_count_asoc) >= SCTP_MAX_NUM_OF_ASOC) { /* Hit max assoc, sorry no more */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS); *error = ENOBUFS; return (NULL); } if (firstaddr == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); *error = EINVAL; return (NULL); } SCTP_INP_RLOCK(inp); if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) && ((sctp_is_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE)) || (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED))) { /* * If its in the TCP pool, its NOT allowed to create an * association. The parent listener needs to call * sctp_aloc_assoc.. or the one-2-many socket. If a peeled * off, or connected one does this.. its an error. */ SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); *error = EINVAL; return (NULL); } if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || (inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE)) { if ((inp->sctp_flags & SCTP_PCB_FLAGS_WAS_CONNECTED) || (inp->sctp_flags & SCTP_PCB_FLAGS_WAS_ABORTED)) { SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); *error = EINVAL; return (NULL); } } SCTPDBG(SCTP_DEBUG_PCB3, "Allocate an association for peer:"); #ifdef SCTP_DEBUG if (firstaddr) { SCTPDBG_ADDR(SCTP_DEBUG_PCB3, firstaddr); switch (firstaddr->sa_family) { #ifdef INET case AF_INET: SCTPDBG(SCTP_DEBUG_PCB3, "Port:%d\n", ntohs(((struct sockaddr_in *)firstaddr)->sin_port)); break; #endif #ifdef INET6 case AF_INET6: SCTPDBG(SCTP_DEBUG_PCB3, "Port:%d\n", ntohs(((struct sockaddr_in6 *)firstaddr)->sin6_port)); break; #endif default: break; } } else { SCTPDBG(SCTP_DEBUG_PCB3, "None\n"); } #endif /* SCTP_DEBUG */ switch (firstaddr->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; sin = (struct sockaddr_in *)firstaddr; if ((ntohs(sin->sin_port) == 0) || (sin->sin_addr.s_addr == INADDR_ANY) || (sin->sin_addr.s_addr == INADDR_BROADCAST) || IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { /* Invalid address */ SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); *error = EINVAL; return (NULL); } rport = sin->sin_port; break; } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)firstaddr; if ((ntohs(sin6->sin6_port) == 0) || IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { /* Invalid address */ SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); *error = EINVAL; return (NULL); } rport = sin6->sin6_port; break; } #endif default: /* not supported family type */ SCTP_INP_RUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); *error = EINVAL; return (NULL); } SCTP_INP_RUNLOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) { /* * If you have not performed a bind, then we need to do the * ephemeral bind for you. */ if ((err = sctp_inpcb_bind(inp->sctp_socket, NULL, NULL, p))) { /* bind error, probably perm */ *error = err; return (NULL); } } stcb = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_asoc), struct sctp_tcb); if (stcb == NULL) { /* out of memory? */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOMEM); *error = ENOMEM; return (NULL); } SCTP_INCR_ASOC_COUNT(); memset(stcb, 0, sizeof(*stcb)); asoc = &stcb->asoc; SCTP_TCB_LOCK_INIT(stcb); SCTP_TCB_SEND_LOCK_INIT(stcb); stcb->rport = rport; /* setup back pointer's */ stcb->sctp_ep = inp; stcb->sctp_socket = inp->sctp_socket; if ((err = sctp_init_asoc(inp, stcb, override_tag, vrf_id, o_streams))) { /* failed */ SCTP_TCB_LOCK_DESTROY(stcb); SCTP_TCB_SEND_LOCK_DESTROY(stcb); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb); SCTP_DECR_ASOC_COUNT(); *error = err; return (NULL); } /* and the port */ SCTP_INP_INFO_WLOCK(); SCTP_INP_WLOCK(inp); if (inp->sctp_flags & (SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_SOCKET_ALLGONE)) { /* inpcb freed while alloc going on */ SCTP_TCB_LOCK_DESTROY(stcb); SCTP_TCB_SEND_LOCK_DESTROY(stcb); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb); SCTP_INP_WUNLOCK(inp); SCTP_INP_INFO_WUNLOCK(); SCTP_DECR_ASOC_COUNT(); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, EINVAL); *error = EINVAL; return (NULL); } SCTP_TCB_LOCK(stcb); asoc->assoc_id = sctp_aloc_a_assoc_id(inp, stcb); /* now that my_vtag is set, add it to the hash */ head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(stcb->asoc.my_vtag, SCTP_BASE_INFO(hashasocmark))]; /* put it in the bucket in the vtag hash of assoc's for the system */ LIST_INSERT_HEAD(head, stcb, sctp_asocs); SCTP_INP_INFO_WUNLOCK(); if ((err = sctp_add_remote_addr(stcb, firstaddr, NULL, port, SCTP_DO_SETSCOPE, SCTP_ALLOC_ASOC))) { /* failure.. memory error? */ if (asoc->strmout) { SCTP_FREE(asoc->strmout, SCTP_M_STRMO); asoc->strmout = NULL; } if (asoc->mapping_array) { SCTP_FREE(asoc->mapping_array, SCTP_M_MAP); asoc->mapping_array = NULL; } if (asoc->nr_mapping_array) { SCTP_FREE(asoc->nr_mapping_array, SCTP_M_MAP); asoc->nr_mapping_array = NULL; } SCTP_DECR_ASOC_COUNT(); SCTP_TCB_UNLOCK(stcb); SCTP_TCB_LOCK_DESTROY(stcb); SCTP_TCB_SEND_LOCK_DESTROY(stcb); LIST_REMOVE(stcb, sctp_tcbasocidhash); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb); SCTP_INP_WUNLOCK(inp); SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOBUFS); *error = ENOBUFS; return (NULL); } /* Init all the timers */ SCTP_OS_TIMER_INIT(&asoc->dack_timer.timer); SCTP_OS_TIMER_INIT(&asoc->strreset_timer.timer); SCTP_OS_TIMER_INIT(&asoc->asconf_timer.timer); SCTP_OS_TIMER_INIT(&asoc->shut_guard_timer.timer); SCTP_OS_TIMER_INIT(&asoc->autoclose_timer.timer); SCTP_OS_TIMER_INIT(&asoc->delete_prim_timer.timer); LIST_INSERT_HEAD(&inp->sctp_asoc_list, stcb, sctp_tcblist); /* now file the port under the hash as well */ if (inp->sctp_tcbhash != NULL) { head = &inp->sctp_tcbhash[SCTP_PCBHASH_ALLADDR(stcb->rport, inp->sctp_hashmark)]; LIST_INSERT_HEAD(head, stcb, sctp_tcbhash); } if (initialize_auth_params == SCTP_INITIALIZE_AUTH_PARAMS) { sctp_initialize_auth_params(inp, stcb); } SCTP_INP_WUNLOCK(inp); SCTPDBG(SCTP_DEBUG_PCB1, "Association %p now allocated\n", (void *)stcb); return (stcb); } void sctp_remove_net(struct sctp_tcb *stcb, struct sctp_nets *net) { struct sctp_inpcb *inp; struct sctp_association *asoc; inp = stcb->sctp_ep; asoc = &stcb->asoc; asoc->numnets--; TAILQ_REMOVE(&asoc->nets, net, sctp_next); if (net == asoc->primary_destination) { /* Reset primary */ struct sctp_nets *lnet; lnet = TAILQ_FIRST(&asoc->nets); /* * Mobility adaptation Ideally, if deleted destination is * the primary, it becomes a fast retransmission trigger by * the subsequent SET PRIMARY. (by micchie) */ if (sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_BASE) || sctp_is_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_FASTHANDOFF)) { SCTPDBG(SCTP_DEBUG_ASCONF1, "remove_net: primary dst is deleting\n"); if (asoc->deleted_primary != NULL) { SCTPDBG(SCTP_DEBUG_ASCONF1, "remove_net: deleted primary may be already stored\n"); goto out; } asoc->deleted_primary = net; atomic_add_int(&net->ref_count, 1); memset(&net->lastsa, 0, sizeof(net->lastsa)); memset(&net->lastsv, 0, sizeof(net->lastsv)); sctp_mobility_feature_on(stcb->sctp_ep, SCTP_MOBILITY_PRIM_DELETED); sctp_timer_start(SCTP_TIMER_TYPE_PRIM_DELETED, stcb->sctp_ep, stcb, NULL); } out: /* Try to find a confirmed primary */ asoc->primary_destination = sctp_find_alternate_net(stcb, lnet, 0); } if (net == asoc->last_data_chunk_from) { /* Reset primary */ asoc->last_data_chunk_from = TAILQ_FIRST(&asoc->nets); } if (net == asoc->last_control_chunk_from) { /* Clear net */ asoc->last_control_chunk_from = NULL; } if (net == stcb->asoc.alternate) { sctp_free_remote_addr(stcb->asoc.alternate); stcb->asoc.alternate = NULL; } sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net, SCTP_FROM_SCTP_PCB + SCTP_LOC_9); sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTP_PCB + SCTP_LOC_10); net->dest_state |= SCTP_ADDR_BEING_DELETED; sctp_free_remote_addr(net); } /* * remove a remote endpoint address from an association, it will fail if the * address does not exist. */ int sctp_del_remote_addr(struct sctp_tcb *stcb, struct sockaddr *remaddr) { /* * Here we need to remove a remote address. This is quite simple, we * first find it in the list of address for the association * (tasoc->asoc.nets) and then if it is there, we do a LIST_REMOVE * on that item. Note we do not allow it to be removed if there are * no other addresses. */ struct sctp_association *asoc; struct sctp_nets *net, *nnet; asoc = &stcb->asoc; /* locate the address */ TAILQ_FOREACH_SAFE(net, &asoc->nets, sctp_next, nnet) { if (net->ro._l_addr.sa.sa_family != remaddr->sa_family) { continue; } if (sctp_cmpaddr((struct sockaddr *)&net->ro._l_addr, remaddr)) { /* we found the guy */ if (asoc->numnets < 2) { /* Must have at LEAST two remote addresses */ return (-1); } else { sctp_remove_net(stcb, net); return (0); } } } /* not found. */ return (-2); } void sctp_delete_from_timewait(uint32_t tag, uint16_t lport, uint16_t rport) { struct sctpvtaghead *chain; struct sctp_tagblock *twait_block; int found = 0; int i; chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)]; LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) { if ((twait_block->vtag_block[i].v_tag == tag) && (twait_block->vtag_block[i].lport == lport) && (twait_block->vtag_block[i].rport == rport)) { twait_block->vtag_block[i].tv_sec_at_expire = 0; twait_block->vtag_block[i].v_tag = 0; twait_block->vtag_block[i].lport = 0; twait_block->vtag_block[i].rport = 0; found = 1; break; } } if (found) break; } } int sctp_is_in_timewait(uint32_t tag, uint16_t lport, uint16_t rport) { struct sctpvtaghead *chain; struct sctp_tagblock *twait_block; int found = 0; int i; SCTP_INP_INFO_WLOCK(); chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)]; LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) { if ((twait_block->vtag_block[i].v_tag == tag) && (twait_block->vtag_block[i].lport == lport) && (twait_block->vtag_block[i].rport == rport)) { found = 1; break; } } if (found) break; } SCTP_INP_INFO_WUNLOCK(); return (found); } void sctp_add_vtag_to_timewait(uint32_t tag, uint32_t time, uint16_t lport, uint16_t rport) { struct sctpvtaghead *chain; struct sctp_tagblock *twait_block; struct timeval now; int set, i; if (time == 0) { /* Its disabled */ return; } (void)SCTP_GETTIME_TIMEVAL(&now); chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)]; set = 0; LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { /* Block(s) present, lets find space, and expire on the fly */ for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) { if ((twait_block->vtag_block[i].v_tag == 0) && !set) { twait_block->vtag_block[i].tv_sec_at_expire = now.tv_sec + time; twait_block->vtag_block[i].v_tag = tag; twait_block->vtag_block[i].lport = lport; twait_block->vtag_block[i].rport = rport; set = 1; } else if ((twait_block->vtag_block[i].v_tag) && ((long)twait_block->vtag_block[i].tv_sec_at_expire < now.tv_sec)) { /* Audit expires this guy */ twait_block->vtag_block[i].tv_sec_at_expire = 0; twait_block->vtag_block[i].v_tag = 0; twait_block->vtag_block[i].lport = 0; twait_block->vtag_block[i].rport = 0; if (set == 0) { /* Reuse it for my new tag */ twait_block->vtag_block[i].tv_sec_at_expire = now.tv_sec + time; twait_block->vtag_block[i].v_tag = tag; twait_block->vtag_block[i].lport = lport; twait_block->vtag_block[i].rport = rport; set = 1; } } } if (set) { /* * We only do up to the block where we can place our * tag for audits */ break; } } /* Need to add a new block to chain */ if (!set) { SCTP_MALLOC(twait_block, struct sctp_tagblock *, sizeof(struct sctp_tagblock), SCTP_M_TIMW); if (twait_block == NULL) { return; } memset(twait_block, 0, sizeof(struct sctp_tagblock)); LIST_INSERT_HEAD(chain, twait_block, sctp_nxt_tagblock); twait_block->vtag_block[0].tv_sec_at_expire = now.tv_sec + time; twait_block->vtag_block[0].v_tag = tag; twait_block->vtag_block[0].lport = lport; twait_block->vtag_block[0].rport = rport; } } void sctp_clean_up_stream(struct sctp_tcb *stcb, struct sctp_readhead *rh) { struct sctp_tmit_chunk *chk, *nchk; struct sctp_queued_to_read *control, *ncontrol; TAILQ_FOREACH_SAFE(control, rh, next_instrm, ncontrol) { TAILQ_REMOVE(rh, control, next_instrm); control->on_strm_q = 0; if (control->on_read_q == 0) { sctp_free_remote_addr(control->whoFrom); if (control->data) { sctp_m_freem(control->data); control->data = NULL; } } /* Reassembly free? */ TAILQ_FOREACH_SAFE(chk, &control->reasm, sctp_next, nchk) { TAILQ_REMOVE(&control->reasm, chk, sctp_next); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } if (chk->holds_key_ref) sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED); sctp_free_remote_addr(chk->whoTo); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); SCTP_DECR_CHK_COUNT(); /* sa_ignore FREED_MEMORY */ } /* * We don't free the address here since all the net's were * freed above. */ if (control->on_read_q == 0) { sctp_free_a_readq(stcb, control); } } } /*- * Free the association after un-hashing the remote port. This * function ALWAYS returns holding NO LOCK on the stcb. It DOES * expect that the input to this function IS a locked TCB. * It will return 0, if it did NOT destroy the association (instead * it unlocks it. It will return NON-zero if it either destroyed the * association OR the association is already destroyed. */ int sctp_free_assoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int from_inpcbfree, int from_location) { int i; struct sctp_association *asoc; struct sctp_nets *net, *nnet; struct sctp_laddr *laddr, *naddr; struct sctp_tmit_chunk *chk, *nchk; struct sctp_asconf_addr *aparam, *naparam; struct sctp_asconf_ack *aack, *naack; struct sctp_stream_reset_list *strrst, *nstrrst; struct sctp_queued_to_read *sq, *nsq; struct sctp_stream_queue_pending *sp, *nsp; sctp_sharedkey_t *shared_key, *nshared_key; struct socket *so; /* first, lets purge the entry from the hash table. */ #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, stcb, 6); #endif if (stcb->asoc.state == 0) { #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 7); #endif /* there is no asoc, really TSNH :-0 */ return (1); } + SCTP_TCB_SEND_LOCK(stcb); if (stcb->asoc.alternate) { sctp_free_remote_addr(stcb->asoc.alternate); stcb->asoc.alternate = NULL; } /* TEMP CODE */ if (stcb->freed_from_where == 0) { /* Only record the first place free happened from */ stcb->freed_from_where = from_location; } /* TEMP CODE */ asoc = &stcb->asoc; if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) /* nothing around */ so = NULL; else so = inp->sctp_socket; /* * We used timer based freeing if a reader or writer is in the way. * So we first check if we are actually being called from a timer, * if so we abort early if a reader or writer is still in the way. */ if ((stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) && (from_inpcbfree == SCTP_NORMAL_PROC)) { /* * is it the timer driving us? if so are the reader/writers * gone? */ if (stcb->asoc.refcnt) { /* nope, reader or writer in the way */ sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL); /* no asoc destroyed */ + SCTP_TCB_SEND_UNLOCK(stcb); SCTP_TCB_UNLOCK(stcb); #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, stcb, 8); #endif return (0); } } /* Now clean up any other timers */ sctp_stop_association_timers(stcb, false); /* Now the read queue needs to be cleaned up (only once) */ if ((stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) == 0) { SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_ABOUT_TO_BE_FREED); SCTP_INP_READ_LOCK(inp); TAILQ_FOREACH(sq, &inp->read_queue, next) { if (sq->stcb == stcb) { sq->do_not_ref_stcb = 1; sq->sinfo_cumtsn = stcb->asoc.cumulative_tsn; /* * If there is no end, there never will be * now. */ if (sq->end_added == 0) { /* Held for PD-API clear that. */ sq->pdapi_aborted = 1; sq->held_length = 0; if (sctp_stcb_is_feature_on(inp, stcb, SCTP_PCB_FLAGS_PDAPIEVNT) && (so != NULL)) { /* * Need to add a PD-API * aborted indication. * Setting the control_pdapi * assures that it will be * added right after this * msg. */ uint32_t strseq; stcb->asoc.control_pdapi = sq; strseq = (sq->sinfo_stream << 16) | (sq->mid & 0x0000ffff); sctp_ulp_notify(SCTP_NOTIFY_PARTIAL_DELVIERY_INDICATION, stcb, SCTP_PARTIAL_DELIVERY_ABORTED, (void *)&strseq, SCTP_SO_LOCKED); stcb->asoc.control_pdapi = NULL; } } /* Add an end to wake them */ sq->end_added = 1; } } SCTP_INP_READ_UNLOCK(inp); if (stcb->block_entry) { SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_PCB, ECONNRESET); stcb->block_entry->error = ECONNRESET; stcb->block_entry = NULL; } } if ((stcb->asoc.refcnt) || (stcb->asoc.state & SCTP_STATE_IN_ACCEPT_QUEUE)) { /* * Someone holds a reference OR the socket is unaccepted * yet. */ if ((stcb->asoc.refcnt) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) { SCTP_CLEAR_SUBSTATE(stcb, SCTP_STATE_IN_ACCEPT_QUEUE); sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL); } + SCTP_TCB_SEND_UNLOCK(stcb); SCTP_TCB_UNLOCK(stcb); if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) /* nothing around */ so = NULL; if (so) { /* Wake any reader/writers */ sctp_sorwakeup(inp, so); sctp_sowwakeup(inp, so); } #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, stcb, 9); #endif /* no asoc destroyed */ return (0); } #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, stcb, 10); #endif /* * When I reach here, no others want to kill the assoc yet.. and I * own the lock. Now its possible an abort comes in when I do the * lock exchange below to grab all the locks to do the final take * out. to prevent this we increment the count, which will start a * timer and blow out above thus assuring us that we hold exclusive * killing of the asoc. Note that after getting back the TCB lock we * will go ahead and increment the counter back up and stop any * timer a passing stranger may have started :-S */ if (from_inpcbfree == SCTP_NORMAL_PROC) { atomic_add_int(&stcb->asoc.refcnt, 1); + SCTP_TCB_SEND_UNLOCK(stcb); SCTP_TCB_UNLOCK(stcb); SCTP_INP_INFO_WLOCK(); SCTP_INP_WLOCK(inp); SCTP_TCB_LOCK(stcb); + SCTP_TCB_SEND_LOCK(stcb); } /* Double check the GONE flag */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) /* nothing around */ so = NULL; if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { /* * For TCP type we need special handling when we are * connected. We also include the peel'ed off ones to. */ if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) { inp->sctp_flags &= ~SCTP_PCB_FLAGS_CONNECTED; inp->sctp_flags |= SCTP_PCB_FLAGS_WAS_CONNECTED; if (so) { SOCKBUF_LOCK(&so->so_rcv); so->so_state &= ~(SS_ISCONNECTING | SS_ISDISCONNECTING | SS_ISCONFIRMING | SS_ISCONNECTED); so->so_state |= SS_ISDISCONNECTED; socantrcvmore_locked(so); socantsendmore(so); sctp_sowwakeup(inp, so); sctp_sorwakeup(inp, so); SCTP_SOWAKEUP(so); } } } /* * Make it invalid too, that way if its about to run it will abort * and return. */ /* re-increment the lock */ if (from_inpcbfree == SCTP_NORMAL_PROC) { atomic_add_int(&stcb->asoc.refcnt, -1); } if (stcb->asoc.refcnt) { SCTP_CLEAR_SUBSTATE(stcb, SCTP_STATE_IN_ACCEPT_QUEUE); sctp_timer_start(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL); if (from_inpcbfree == SCTP_NORMAL_PROC) { SCTP_INP_INFO_WUNLOCK(); SCTP_INP_WUNLOCK(inp); } + SCTP_TCB_SEND_UNLOCK(stcb); SCTP_TCB_UNLOCK(stcb); return (0); } asoc->state = 0; if (inp->sctp_tcbhash) { LIST_REMOVE(stcb, sctp_tcbhash); } if (stcb->asoc.in_asocid_hash) { LIST_REMOVE(stcb, sctp_tcbasocidhash); } /* Now lets remove it from the list of ALL associations in the EP */ LIST_REMOVE(stcb, sctp_tcblist); if (from_inpcbfree == SCTP_NORMAL_PROC) { SCTP_INP_INCR_REF(inp); SCTP_INP_WUNLOCK(inp); } /* pull from vtag hash */ LIST_REMOVE(stcb, sctp_asocs); sctp_add_vtag_to_timewait(asoc->my_vtag, SCTP_BASE_SYSCTL(sctp_vtag_time_wait), inp->sctp_lport, stcb->rport); /* * Now restop the timers to be sure this is paranoia at is finest! */ sctp_stop_association_timers(stcb, true); /* * The chunk lists and such SHOULD be empty but we check them just * in case. */ /* anything on the wheel needs to be removed */ - SCTP_TCB_SEND_LOCK(stcb); for (i = 0; i < asoc->streamoutcnt; i++) { struct sctp_stream_out *outs; outs = &asoc->strmout[i]; /* now clean up any chunks here */ TAILQ_FOREACH_SAFE(sp, &outs->outqueue, next, nsp) { atomic_subtract_int(&asoc->stream_queue_cnt, 1); TAILQ_REMOVE(&outs->outqueue, sp, next); stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, outs, sp, 1); sctp_free_spbufspace(stcb, asoc, sp); if (sp->data) { if (so) { /* Still an open socket - report */ sctp_ulp_notify(SCTP_NOTIFY_SPECIAL_SP_FAIL, stcb, 0, (void *)sp, SCTP_SO_LOCKED); } if (sp->data) { sctp_m_freem(sp->data); sp->data = NULL; sp->tail_mbuf = NULL; sp->length = 0; } } if (sp->net) { sctp_free_remote_addr(sp->net); sp->net = NULL; } sctp_free_a_strmoq(stcb, sp, SCTP_SO_LOCKED); } } - SCTP_TCB_SEND_UNLOCK(stcb); /* sa_ignore FREED_MEMORY */ TAILQ_FOREACH_SAFE(strrst, &asoc->resetHead, next_resp, nstrrst) { TAILQ_REMOVE(&asoc->resetHead, strrst, next_resp); SCTP_FREE(strrst, SCTP_M_STRESET); } TAILQ_FOREACH_SAFE(sq, &asoc->pending_reply_queue, next, nsq) { TAILQ_REMOVE(&asoc->pending_reply_queue, sq, next); if (sq->data) { sctp_m_freem(sq->data); sq->data = NULL; } sctp_free_remote_addr(sq->whoFrom); sq->whoFrom = NULL; sq->stcb = NULL; /* Free the ctl entry */ sctp_free_a_readq(stcb, sq); /* sa_ignore FREED_MEMORY */ } TAILQ_FOREACH_SAFE(chk, &asoc->free_chunks, sctp_next, nchk) { TAILQ_REMOVE(&asoc->free_chunks, chk, sctp_next); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } if (chk->holds_key_ref) sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); SCTP_DECR_CHK_COUNT(); atomic_subtract_int(&SCTP_BASE_INFO(ipi_free_chunks), 1); asoc->free_chunk_cnt--; /* sa_ignore FREED_MEMORY */ } /* pending send queue SHOULD be empty */ TAILQ_FOREACH_SAFE(chk, &asoc->send_queue, sctp_next, nchk) { if (asoc->strmout[chk->rec.data.sid].chunks_on_queues > 0) { asoc->strmout[chk->rec.data.sid].chunks_on_queues--; #ifdef INVARIANTS } else { panic("No chunks on the queues for sid %u.", chk->rec.data.sid); #endif } TAILQ_REMOVE(&asoc->send_queue, chk, sctp_next); if (chk->data) { if (so) { /* Still a socket? */ sctp_ulp_notify(SCTP_NOTIFY_UNSENT_DG_FAIL, stcb, 0, chk, SCTP_SO_LOCKED); } if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } } if (chk->holds_key_ref) sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED); if (chk->whoTo) { sctp_free_remote_addr(chk->whoTo); chk->whoTo = NULL; } SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); SCTP_DECR_CHK_COUNT(); /* sa_ignore FREED_MEMORY */ } /* sent queue SHOULD be empty */ TAILQ_FOREACH_SAFE(chk, &asoc->sent_queue, sctp_next, nchk) { if (chk->sent != SCTP_DATAGRAM_NR_ACKED) { if (asoc->strmout[chk->rec.data.sid].chunks_on_queues > 0) { asoc->strmout[chk->rec.data.sid].chunks_on_queues--; #ifdef INVARIANTS } else { panic("No chunks on the queues for sid %u.", chk->rec.data.sid); #endif } } TAILQ_REMOVE(&asoc->sent_queue, chk, sctp_next); if (chk->data) { if (so) { /* Still a socket? */ sctp_ulp_notify(SCTP_NOTIFY_SENT_DG_FAIL, stcb, 0, chk, SCTP_SO_LOCKED); } if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } } if (chk->holds_key_ref) sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED); sctp_free_remote_addr(chk->whoTo); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); SCTP_DECR_CHK_COUNT(); /* sa_ignore FREED_MEMORY */ } #ifdef INVARIANTS for (i = 0; i < stcb->asoc.streamoutcnt; i++) { if (stcb->asoc.strmout[i].chunks_on_queues > 0) { panic("%u chunks left for stream %u.", stcb->asoc.strmout[i].chunks_on_queues, i); } } #endif /* control queue MAY not be empty */ TAILQ_FOREACH_SAFE(chk, &asoc->control_send_queue, sctp_next, nchk) { TAILQ_REMOVE(&asoc->control_send_queue, chk, sctp_next); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } if (chk->holds_key_ref) sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED); sctp_free_remote_addr(chk->whoTo); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); SCTP_DECR_CHK_COUNT(); /* sa_ignore FREED_MEMORY */ } /* ASCONF queue MAY not be empty */ TAILQ_FOREACH_SAFE(chk, &asoc->asconf_send_queue, sctp_next, nchk) { TAILQ_REMOVE(&asoc->asconf_send_queue, chk, sctp_next); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } if (chk->holds_key_ref) sctp_auth_key_release(stcb, chk->auth_keyid, SCTP_SO_LOCKED); sctp_free_remote_addr(chk->whoTo); SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), chk); SCTP_DECR_CHK_COUNT(); /* sa_ignore FREED_MEMORY */ } if (asoc->mapping_array) { SCTP_FREE(asoc->mapping_array, SCTP_M_MAP); asoc->mapping_array = NULL; } if (asoc->nr_mapping_array) { SCTP_FREE(asoc->nr_mapping_array, SCTP_M_MAP); asoc->nr_mapping_array = NULL; } /* the stream outs */ if (asoc->strmout) { SCTP_FREE(asoc->strmout, SCTP_M_STRMO); asoc->strmout = NULL; } asoc->strm_realoutsize = asoc->streamoutcnt = 0; if (asoc->strmin) { for (i = 0; i < asoc->streamincnt; i++) { sctp_clean_up_stream(stcb, &asoc->strmin[i].inqueue); sctp_clean_up_stream(stcb, &asoc->strmin[i].uno_inqueue); } SCTP_FREE(asoc->strmin, SCTP_M_STRMI); asoc->strmin = NULL; } asoc->streamincnt = 0; TAILQ_FOREACH_SAFE(net, &asoc->nets, sctp_next, nnet) { #ifdef INVARIANTS if (SCTP_BASE_INFO(ipi_count_raddr) == 0) { panic("no net's left alloc'ed, or list points to itself"); } #endif TAILQ_REMOVE(&asoc->nets, net, sctp_next); sctp_free_remote_addr(net); } LIST_FOREACH_SAFE(laddr, &asoc->sctp_restricted_addrs, sctp_nxt_addr, naddr) { /* sa_ignore FREED_MEMORY */ sctp_remove_laddr(laddr); } /* pending asconf (address) parameters */ TAILQ_FOREACH_SAFE(aparam, &asoc->asconf_queue, next, naparam) { /* sa_ignore FREED_MEMORY */ TAILQ_REMOVE(&asoc->asconf_queue, aparam, next); SCTP_FREE(aparam, SCTP_M_ASC_ADDR); } TAILQ_FOREACH_SAFE(aack, &asoc->asconf_ack_sent, next, naack) { /* sa_ignore FREED_MEMORY */ TAILQ_REMOVE(&asoc->asconf_ack_sent, aack, next); if (aack->data != NULL) { sctp_m_freem(aack->data); } SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asconf_ack), aack); } /* clean up auth stuff */ if (asoc->local_hmacs) sctp_free_hmaclist(asoc->local_hmacs); if (asoc->peer_hmacs) sctp_free_hmaclist(asoc->peer_hmacs); if (asoc->local_auth_chunks) sctp_free_chunklist(asoc->local_auth_chunks); if (asoc->peer_auth_chunks) sctp_free_chunklist(asoc->peer_auth_chunks); sctp_free_authinfo(&asoc->authinfo); LIST_FOREACH_SAFE(shared_key, &asoc->shared_keys, next, nshared_key) { LIST_REMOVE(shared_key, next); sctp_free_sharedkey(shared_key); /* sa_ignore FREED_MEMORY */ } /* Insert new items here :> */ /* Get rid of LOCK */ + SCTP_TCB_SEND_UNLOCK(stcb); SCTP_TCB_UNLOCK(stcb); SCTP_TCB_LOCK_DESTROY(stcb); SCTP_TCB_SEND_LOCK_DESTROY(stcb); if (from_inpcbfree == SCTP_NORMAL_PROC) { SCTP_INP_INFO_WUNLOCK(); SCTP_INP_RLOCK(inp); } #ifdef SCTP_TRACK_FREED_ASOCS if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { /* now clean up the tasoc itself */ SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb); SCTP_DECR_ASOC_COUNT(); } else { LIST_INSERT_HEAD(&inp->sctp_asoc_free_list, stcb, sctp_tcblist); } #else SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_asoc), stcb); SCTP_DECR_ASOC_COUNT(); #endif if (from_inpcbfree == SCTP_NORMAL_PROC) { if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { /* * If its NOT the inp_free calling us AND sctp_close * as been called, we call back... */ SCTP_INP_RUNLOCK(inp); /* * This will start the kill timer (if we are the * last one) since we hold an increment yet. But * this is the only safe way to do this since * otherwise if the socket closes at the same time * we are here we might collide in the cleanup. */ sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_GRACEFUL_CLOSE, SCTP_CALLED_DIRECTLY_NOCMPSET); SCTP_INP_DECR_REF(inp); } else { /* The socket is still open. */ SCTP_INP_DECR_REF(inp); SCTP_INP_RUNLOCK(inp); } } /* destroyed the asoc */ #ifdef SCTP_LOG_CLOSING sctp_log_closing(inp, NULL, 11); #endif return (1); } /* * determine if a destination is "reachable" based upon the addresses bound * to the current endpoint (e.g. only v4 or v6 currently bound) */ /* * FIX: if we allow assoc-level bindx(), then this needs to be fixed to use * assoc level v4/v6 flags, as the assoc *may* not have the same address * types bound as its endpoint */ int sctp_destination_is_reachable(struct sctp_tcb *stcb, struct sockaddr *destaddr) { struct sctp_inpcb *inp; int answer; /* * No locks here, the TCB, in all cases is already locked and an * assoc is up. There is either a INP lock by the caller applied (in * asconf case when deleting an address) or NOT in the HB case, * however if HB then the INP increment is up and the INP will not * be removed (on top of the fact that we have a TCB lock). So we * only want to read the sctp_flags, which is either bound-all or * not.. no protection needed since once an assoc is up you can't be * changing your binding. */ inp = stcb->sctp_ep; if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { /* if bound all, destination is not restricted */ /* * RRS: Question during lock work: Is this correct? If you * are bound-all you still might need to obey the V4--V6 * flags??? IMO this bound-all stuff needs to be removed! */ return (1); } /* NOTE: all "scope" checks are done when local addresses are added */ switch (destaddr->sa_family) { #ifdef INET6 case AF_INET6: answer = inp->ip_inp.inp.inp_vflag & INP_IPV6; break; #endif #ifdef INET case AF_INET: answer = inp->ip_inp.inp.inp_vflag & INP_IPV4; break; #endif default: /* invalid family, so it's unreachable */ answer = 0; break; } return (answer); } /* * update the inp_vflags on an endpoint */ static void sctp_update_ep_vflag(struct sctp_inpcb *inp) { struct sctp_laddr *laddr; /* first clear the flag */ inp->ip_inp.inp.inp_vflag = 0; /* set the flag based on addresses on the ep list */ LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) { SCTPDBG(SCTP_DEBUG_PCB1, "%s: NULL ifa\n", __func__); continue; } if (laddr->ifa->localifa_flags & SCTP_BEING_DELETED) { continue; } switch (laddr->ifa->address.sa.sa_family) { #ifdef INET6 case AF_INET6: inp->ip_inp.inp.inp_vflag |= INP_IPV6; break; #endif #ifdef INET case AF_INET: inp->ip_inp.inp.inp_vflag |= INP_IPV4; break; #endif default: break; } } } /* * Add the address to the endpoint local address list There is nothing to be * done if we are bound to all addresses */ void sctp_add_local_addr_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa, uint32_t action) { struct sctp_laddr *laddr; struct sctp_tcb *stcb; int fnd, error = 0; fnd = 0; if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { /* You are already bound to all. You have it already */ return; } #ifdef INET6 if (ifa->address.sa.sa_family == AF_INET6) { if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { /* Can't bind a non-useable addr. */ return; } } #endif /* first, is it already present? */ LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == ifa) { fnd = 1; break; } } if (fnd == 0) { /* Not in the ep list */ error = sctp_insert_laddr(&inp->sctp_addr_list, ifa, action); if (error != 0) return; inp->laddr_count++; /* update inp_vflag flags */ switch (ifa->address.sa.sa_family) { #ifdef INET6 case AF_INET6: inp->ip_inp.inp.inp_vflag |= INP_IPV6; break; #endif #ifdef INET case AF_INET: inp->ip_inp.inp.inp_vflag |= INP_IPV4; break; #endif default: break; } LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { sctp_add_local_addr_restricted(stcb, ifa); } } return; } /* * select a new (hopefully reachable) destination net (should only be used * when we deleted an ep addr that is the only usable source address to reach * the destination net) */ static void sctp_select_primary_destination(struct sctp_tcb *stcb) { struct sctp_nets *net; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { /* for now, we'll just pick the first reachable one we find */ if (net->dest_state & SCTP_ADDR_UNCONFIRMED) continue; if (sctp_destination_is_reachable(stcb, (struct sockaddr *)&net->ro._l_addr)) { /* found a reachable destination */ stcb->asoc.primary_destination = net; } } /* I can't there from here! ...we're gonna die shortly... */ } /* * Delete the address from the endpoint local address list. There is nothing * to be done if we are bound to all addresses */ void sctp_del_local_addr_ep(struct sctp_inpcb *inp, struct sctp_ifa *ifa) { struct sctp_laddr *laddr; int fnd; fnd = 0; if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { /* You are already bound to all. You have it already */ return; } LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == ifa) { fnd = 1; break; } } if (fnd && (inp->laddr_count < 2)) { /* can't delete unless there are at LEAST 2 addresses */ return; } if (fnd) { /* * clean up any use of this address go through our * associations and clear any last_used_address that match * this one for each assoc, see if a new primary_destination * is needed */ struct sctp_tcb *stcb; /* clean up "next_addr_touse" */ if (inp->next_addr_touse == laddr) /* delete this address */ inp->next_addr_touse = NULL; /* clean up "last_used_address" */ LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { struct sctp_nets *net; SCTP_TCB_LOCK(stcb); if (stcb->asoc.last_used_address == laddr) /* delete this address */ stcb->asoc.last_used_address = NULL; /* * Now spin through all the nets and purge any ref * to laddr */ TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { if (net->ro._s_addr == laddr->ifa) { /* Yep, purge src address selected */ RO_NHFREE(&net->ro); sctp_free_ifa(net->ro._s_addr); net->ro._s_addr = NULL; net->src_addr_selected = 0; } } SCTP_TCB_UNLOCK(stcb); } /* for each tcb */ /* remove it from the ep list */ sctp_remove_laddr(laddr); inp->laddr_count--; /* update inp_vflag flags */ sctp_update_ep_vflag(inp); } return; } /* * Add the address to the TCB local address restricted list. * This is a "pending" address list (eg. addresses waiting for an * ASCONF-ACK response) and cannot be used as a valid source address. */ void sctp_add_local_addr_restricted(struct sctp_tcb *stcb, struct sctp_ifa *ifa) { struct sctp_laddr *laddr; struct sctpladdr *list; /* * Assumes TCB is locked.. and possibly the INP. May need to * confirm/fix that if we need it and is not the case. */ list = &stcb->asoc.sctp_restricted_addrs; #ifdef INET6 if (ifa->address.sa.sa_family == AF_INET6) { if (ifa->localifa_flags & SCTP_ADDR_IFA_UNUSEABLE) { /* Can't bind a non-existent addr. */ return; } } #endif /* does the address already exist? */ LIST_FOREACH(laddr, list, sctp_nxt_addr) { if (laddr->ifa == ifa) { return; } } /* add to the list */ (void)sctp_insert_laddr(list, ifa, 0); return; } /* * Remove a local address from the TCB local address restricted list */ void sctp_del_local_addr_restricted(struct sctp_tcb *stcb, struct sctp_ifa *ifa) { struct sctp_inpcb *inp; struct sctp_laddr *laddr; /* * This is called by asconf work. It is assumed that a) The TCB is * locked and b) The INP is locked. This is true in as much as I can * trace through the entry asconf code where I did these locks. * Again, the ASCONF code is a bit different in that it does lock * the INP during its work often times. This must be since we don't * want other proc's looking up things while what they are looking * up is changing :-D */ inp = stcb->sctp_ep; /* if subset bound and don't allow ASCONF's, can't delete last */ if (((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) && sctp_is_feature_off(inp, SCTP_PCB_FLAGS_DO_ASCONF)) { if (stcb->sctp_ep->laddr_count < 2) { /* can't delete last address */ return; } } LIST_FOREACH(laddr, &stcb->asoc.sctp_restricted_addrs, sctp_nxt_addr) { /* remove the address if it exists */ if (laddr->ifa == NULL) continue; if (laddr->ifa == ifa) { sctp_remove_laddr(laddr); return; } } /* address not found! */ return; } /* sysctl */ static int sctp_max_number_of_assoc = SCTP_MAX_NUM_OF_ASOC; static int sctp_scale_up_for_address = SCTP_SCALE_FOR_ADDR; #if defined(SCTP_MCORE_INPUT) && defined(SMP) struct sctp_mcore_ctrl *sctp_mcore_workers = NULL; int *sctp_cpuarry = NULL; void sctp_queue_to_mcore(struct mbuf *m, int off, int cpu_to_use) { /* Queue a packet to a processor for the specified core */ struct sctp_mcore_queue *qent; struct sctp_mcore_ctrl *wkq; int need_wake = 0; if (sctp_mcore_workers == NULL) { /* Something went way bad during setup */ sctp_input_with_port(m, off, 0); return; } SCTP_MALLOC(qent, struct sctp_mcore_queue *, (sizeof(struct sctp_mcore_queue)), SCTP_M_MCORE); if (qent == NULL) { /* This is trouble */ sctp_input_with_port(m, off, 0); return; } qent->vn = curvnet; qent->m = m; qent->off = off; qent->v6 = 0; wkq = &sctp_mcore_workers[cpu_to_use]; SCTP_MCORE_QLOCK(wkq); TAILQ_INSERT_TAIL(&wkq->que, qent, next); if (wkq->running == 0) { need_wake = 1; } SCTP_MCORE_QUNLOCK(wkq); if (need_wake) { wakeup(&wkq->running); } } static void sctp_mcore_thread(void *arg) { struct sctp_mcore_ctrl *wkq; struct sctp_mcore_queue *qent; wkq = (struct sctp_mcore_ctrl *)arg; struct mbuf *m; int off, v6; /* Wait for first tickle */ SCTP_MCORE_LOCK(wkq); wkq->running = 0; msleep(&wkq->running, &wkq->core_mtx, 0, "wait for pkt", 0); SCTP_MCORE_UNLOCK(wkq); /* Bind to our cpu */ thread_lock(curthread); sched_bind(curthread, wkq->cpuid); thread_unlock(curthread); /* Now lets start working */ SCTP_MCORE_LOCK(wkq); /* Now grab lock and go */ for (;;) { SCTP_MCORE_QLOCK(wkq); skip_sleep: wkq->running = 1; qent = TAILQ_FIRST(&wkq->que); if (qent) { TAILQ_REMOVE(&wkq->que, qent, next); SCTP_MCORE_QUNLOCK(wkq); CURVNET_SET(qent->vn); m = qent->m; off = qent->off; v6 = qent->v6; SCTP_FREE(qent, SCTP_M_MCORE); if (v6 == 0) { sctp_input_with_port(m, off, 0); } else { SCTP_PRINTF("V6 not yet supported\n"); sctp_m_freem(m); } CURVNET_RESTORE(); SCTP_MCORE_QLOCK(wkq); } wkq->running = 0; if (!TAILQ_EMPTY(&wkq->que)) { goto skip_sleep; } SCTP_MCORE_QUNLOCK(wkq); msleep(&wkq->running, &wkq->core_mtx, 0, "wait for pkt", 0); } } static void sctp_startup_mcore_threads(void) { int i, cpu; if (mp_ncpus == 1) return; if (sctp_mcore_workers != NULL) { /* * Already been here in some previous vnet? */ return; } SCTP_MALLOC(sctp_mcore_workers, struct sctp_mcore_ctrl *, ((mp_maxid + 1) * sizeof(struct sctp_mcore_ctrl)), SCTP_M_MCORE); if (sctp_mcore_workers == NULL) { /* TSNH I hope */ return; } memset(sctp_mcore_workers, 0, ((mp_maxid + 1) * sizeof(struct sctp_mcore_ctrl))); /* Init the structures */ for (i = 0; i <= mp_maxid; i++) { TAILQ_INIT(&sctp_mcore_workers[i].que); SCTP_MCORE_LOCK_INIT(&sctp_mcore_workers[i]); SCTP_MCORE_QLOCK_INIT(&sctp_mcore_workers[i]); sctp_mcore_workers[i].cpuid = i; } if (sctp_cpuarry == NULL) { SCTP_MALLOC(sctp_cpuarry, int *, (mp_ncpus * sizeof(int)), SCTP_M_MCORE); i = 0; CPU_FOREACH(cpu) { sctp_cpuarry[i] = cpu; i++; } } /* Now start them all */ CPU_FOREACH(cpu) { (void)kproc_create(sctp_mcore_thread, (void *)&sctp_mcore_workers[cpu], &sctp_mcore_workers[cpu].thread_proc, RFPROC, SCTP_KTHREAD_PAGES, SCTP_MCORE_NAME); } } #endif void sctp_pcb_init(void) { /* * SCTP initialization for the PCB structures should be called by * the sctp_init() function. */ int i; struct timeval tv; if (SCTP_BASE_VAR(sctp_pcb_initialized) != 0) { /* error I was called twice */ return; } SCTP_BASE_VAR(sctp_pcb_initialized) = 1; #if defined(SCTP_LOCAL_TRACE_BUF) memset(&SCTP_BASE_SYSCTL(sctp_log), 0, sizeof(struct sctp_log)); #endif #if defined(SMP) && defined(SCTP_USE_PERCPU_STAT) SCTP_MALLOC(SCTP_BASE_STATS, struct sctpstat *, ((mp_maxid + 1) * sizeof(struct sctpstat)), SCTP_M_MCORE); #endif (void)SCTP_GETTIME_TIMEVAL(&tv); #if defined(SMP) && defined(SCTP_USE_PERCPU_STAT) memset(SCTP_BASE_STATS, 0, sizeof(struct sctpstat) * (mp_maxid + 1)); SCTP_BASE_STATS[PCPU_GET(cpuid)].sctps_discontinuitytime.tv_sec = (uint32_t)tv.tv_sec; SCTP_BASE_STATS[PCPU_GET(cpuid)].sctps_discontinuitytime.tv_usec = (uint32_t)tv.tv_usec; #else memset(&SCTP_BASE_STATS, 0, sizeof(struct sctpstat)); SCTP_BASE_STAT(sctps_discontinuitytime).tv_sec = (uint32_t)tv.tv_sec; SCTP_BASE_STAT(sctps_discontinuitytime).tv_usec = (uint32_t)tv.tv_usec; #endif /* init the empty list of (All) Endpoints */ LIST_INIT(&SCTP_BASE_INFO(listhead)); /* init the hash table of endpoints */ TUNABLE_INT_FETCH("net.inet.sctp.tcbhashsize", &SCTP_BASE_SYSCTL(sctp_hashtblsize)); TUNABLE_INT_FETCH("net.inet.sctp.pcbhashsize", &SCTP_BASE_SYSCTL(sctp_pcbtblsize)); TUNABLE_INT_FETCH("net.inet.sctp.chunkscale", &SCTP_BASE_SYSCTL(sctp_chunkscale)); SCTP_BASE_INFO(sctp_asochash) = SCTP_HASH_INIT((SCTP_BASE_SYSCTL(sctp_hashtblsize) * 31), &SCTP_BASE_INFO(hashasocmark)); SCTP_BASE_INFO(sctp_ephash) = SCTP_HASH_INIT(SCTP_BASE_SYSCTL(sctp_hashtblsize), &SCTP_BASE_INFO(hashmark)); SCTP_BASE_INFO(sctp_tcpephash) = SCTP_HASH_INIT(SCTP_BASE_SYSCTL(sctp_hashtblsize), &SCTP_BASE_INFO(hashtcpmark)); SCTP_BASE_INFO(hashtblsize) = SCTP_BASE_SYSCTL(sctp_hashtblsize); SCTP_BASE_INFO(sctp_vrfhash) = SCTP_HASH_INIT(SCTP_SIZE_OF_VRF_HASH, &SCTP_BASE_INFO(hashvrfmark)); SCTP_BASE_INFO(vrf_ifn_hash) = SCTP_HASH_INIT(SCTP_VRF_IFN_HASH_SIZE, &SCTP_BASE_INFO(vrf_ifn_hashmark)); /* init the zones */ /* * FIX ME: Should check for NULL returns, but if it does fail we are * doomed to panic anyways... add later maybe. */ SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_ep), "sctp_ep", sizeof(struct sctp_inpcb), maxsockets); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_asoc), "sctp_asoc", sizeof(struct sctp_tcb), sctp_max_number_of_assoc); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_laddr), "sctp_laddr", sizeof(struct sctp_laddr), (sctp_max_number_of_assoc * sctp_scale_up_for_address)); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_net), "sctp_raddr", sizeof(struct sctp_nets), (sctp_max_number_of_assoc * sctp_scale_up_for_address)); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_chunk), "sctp_chunk", sizeof(struct sctp_tmit_chunk), (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale))); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_readq), "sctp_readq", sizeof(struct sctp_queued_to_read), (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale))); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_strmoq), "sctp_stream_msg_out", sizeof(struct sctp_stream_queue_pending), (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale))); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_asconf), "sctp_asconf", sizeof(struct sctp_asconf), (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale))); SCTP_ZONE_INIT(SCTP_BASE_INFO(ipi_zone_asconf_ack), "sctp_asconf_ack", sizeof(struct sctp_asconf_ack), (sctp_max_number_of_assoc * SCTP_BASE_SYSCTL(sctp_chunkscale))); /* Master Lock INIT for info structure */ SCTP_INP_INFO_LOCK_INIT(); SCTP_STATLOG_INIT_LOCK(); SCTP_IPI_COUNT_INIT(); SCTP_IPI_ADDR_INIT(); #ifdef SCTP_PACKET_LOGGING SCTP_IP_PKTLOG_INIT(); #endif LIST_INIT(&SCTP_BASE_INFO(addr_wq)); SCTP_WQ_ADDR_INIT(); /* not sure if we need all the counts */ SCTP_BASE_INFO(ipi_count_ep) = 0; /* assoc/tcb zone info */ SCTP_BASE_INFO(ipi_count_asoc) = 0; /* local addrlist zone info */ SCTP_BASE_INFO(ipi_count_laddr) = 0; /* remote addrlist zone info */ SCTP_BASE_INFO(ipi_count_raddr) = 0; /* chunk info */ SCTP_BASE_INFO(ipi_count_chunk) = 0; /* socket queue zone info */ SCTP_BASE_INFO(ipi_count_readq) = 0; /* stream out queue cont */ SCTP_BASE_INFO(ipi_count_strmoq) = 0; SCTP_BASE_INFO(ipi_free_strmoq) = 0; SCTP_BASE_INFO(ipi_free_chunks) = 0; SCTP_OS_TIMER_INIT(&SCTP_BASE_INFO(addr_wq_timer.timer)); /* Init the TIMEWAIT list */ for (i = 0; i < SCTP_STACK_VTAG_HASH_SIZE; i++) { LIST_INIT(&SCTP_BASE_INFO(vtag_timewait)[i]); } sctp_startup_iterator(); #if defined(SCTP_MCORE_INPUT) && defined(SMP) sctp_startup_mcore_threads(); #endif /* * INIT the default VRF which for BSD is the only one, other O/S's * may have more. But initially they must start with one and then * add the VRF's as addresses are added. */ sctp_init_vrf_list(SCTP_DEFAULT_VRF); } /* * Assumes that the SCTP_BASE_INFO() lock is NOT held. */ void sctp_pcb_finish(void) { struct sctp_vrflist *vrf_bucket; struct sctp_vrf *vrf, *nvrf; struct sctp_ifn *ifn, *nifn; struct sctp_ifa *ifa, *nifa; struct sctpvtaghead *chain; struct sctp_tagblock *twait_block, *prev_twait_block; struct sctp_laddr *wi, *nwi; int i; struct sctp_iterator *it, *nit; if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) { SCTP_PRINTF("%s: race condition on teardown.\n", __func__); return; } SCTP_BASE_VAR(sctp_pcb_initialized) = 0; /* * In FreeBSD the iterator thread never exits but we do clean up. * The only way FreeBSD reaches here is if we have VRF's but we * still add the ifdef to make it compile on old versions. */ retry: SCTP_IPI_ITERATOR_WQ_LOCK(); /* * sctp_iterator_worker() might be working on an it entry without * holding the lock. We won't find it on the list either and * continue and free/destroy it. While holding the lock, spin, to * avoid the race condition as sctp_iterator_worker() will have to * wait to re-acquire the lock. */ if (sctp_it_ctl.iterator_running != 0 || sctp_it_ctl.cur_it != NULL) { SCTP_IPI_ITERATOR_WQ_UNLOCK(); SCTP_PRINTF("%s: Iterator running while we held the lock. Retry. " "cur_it=%p\n", __func__, sctp_it_ctl.cur_it); DELAY(10); goto retry; } TAILQ_FOREACH_SAFE(it, &sctp_it_ctl.iteratorhead, sctp_nxt_itr, nit) { if (it->vn != curvnet) { continue; } TAILQ_REMOVE(&sctp_it_ctl.iteratorhead, it, sctp_nxt_itr); if (it->function_atend != NULL) { (*it->function_atend) (it->pointer, it->val); } SCTP_FREE(it, SCTP_M_ITER); } SCTP_IPI_ITERATOR_WQ_UNLOCK(); SCTP_ITERATOR_LOCK(); if ((sctp_it_ctl.cur_it) && (sctp_it_ctl.cur_it->vn == curvnet)) { sctp_it_ctl.iterator_flags |= SCTP_ITERATOR_STOP_CUR_IT; } SCTP_ITERATOR_UNLOCK(); SCTP_OS_TIMER_STOP_DRAIN(&SCTP_BASE_INFO(addr_wq_timer.timer)); SCTP_WQ_ADDR_LOCK(); LIST_FOREACH_SAFE(wi, &SCTP_BASE_INFO(addr_wq), sctp_nxt_addr, nwi) { LIST_REMOVE(wi, sctp_nxt_addr); SCTP_DECR_LADDR_COUNT(); if (wi->action == SCTP_DEL_IP_ADDRESS) { SCTP_FREE(wi->ifa, SCTP_M_IFA); } SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_laddr), wi); } SCTP_WQ_ADDR_UNLOCK(); /* * free the vrf/ifn/ifa lists and hashes (be sure address monitor is * destroyed first). */ SCTP_IPI_ADDR_WLOCK(); vrf_bucket = &SCTP_BASE_INFO(sctp_vrfhash)[(SCTP_DEFAULT_VRFID & SCTP_BASE_INFO(hashvrfmark))]; LIST_FOREACH_SAFE(vrf, vrf_bucket, next_vrf, nvrf) { LIST_FOREACH_SAFE(ifn, &vrf->ifnlist, next_ifn, nifn) { LIST_FOREACH_SAFE(ifa, &ifn->ifalist, next_ifa, nifa) { /* free the ifa */ LIST_REMOVE(ifa, next_bucket); LIST_REMOVE(ifa, next_ifa); SCTP_FREE(ifa, SCTP_M_IFA); } /* free the ifn */ LIST_REMOVE(ifn, next_bucket); LIST_REMOVE(ifn, next_ifn); SCTP_FREE(ifn, SCTP_M_IFN); } SCTP_HASH_FREE(vrf->vrf_addr_hash, vrf->vrf_addr_hashmark); /* free the vrf */ LIST_REMOVE(vrf, next_vrf); SCTP_FREE(vrf, SCTP_M_VRF); } SCTP_IPI_ADDR_WUNLOCK(); /* free the vrf hashes */ SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_vrfhash), SCTP_BASE_INFO(hashvrfmark)); SCTP_HASH_FREE(SCTP_BASE_INFO(vrf_ifn_hash), SCTP_BASE_INFO(vrf_ifn_hashmark)); /* * free the TIMEWAIT list elements malloc'd in the function * sctp_add_vtag_to_timewait()... */ for (i = 0; i < SCTP_STACK_VTAG_HASH_SIZE; i++) { chain = &SCTP_BASE_INFO(vtag_timewait)[i]; if (!LIST_EMPTY(chain)) { prev_twait_block = NULL; LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { if (prev_twait_block) { SCTP_FREE(prev_twait_block, SCTP_M_TIMW); } prev_twait_block = twait_block; } SCTP_FREE(prev_twait_block, SCTP_M_TIMW); } } /* free the locks and mutexes */ #ifdef SCTP_PACKET_LOGGING SCTP_IP_PKTLOG_DESTROY(); #endif SCTP_IPI_ADDR_DESTROY(); SCTP_STATLOG_DESTROY(); SCTP_INP_INFO_LOCK_DESTROY(); SCTP_WQ_ADDR_DESTROY(); /* Get rid of other stuff too. */ if (SCTP_BASE_INFO(sctp_asochash) != NULL) SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_asochash), SCTP_BASE_INFO(hashasocmark)); if (SCTP_BASE_INFO(sctp_ephash) != NULL) SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_ephash), SCTP_BASE_INFO(hashmark)); if (SCTP_BASE_INFO(sctp_tcpephash) != NULL) SCTP_HASH_FREE(SCTP_BASE_INFO(sctp_tcpephash), SCTP_BASE_INFO(hashtcpmark)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_ep)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_asoc)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_laddr)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_net)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_chunk)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_readq)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_strmoq)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_asconf)); SCTP_ZONE_DESTROY(SCTP_BASE_INFO(ipi_zone_asconf_ack)); #if defined(SMP) && defined(SCTP_USE_PERCPU_STAT) SCTP_FREE(SCTP_BASE_STATS, SCTP_M_MCORE); #endif } int sctp_load_addresses_from_init(struct sctp_tcb *stcb, struct mbuf *m, int offset, int limit, struct sockaddr *src, struct sockaddr *dst, struct sockaddr *altsa, uint16_t port) { /* * grub through the INIT pulling addresses and loading them to the * nets structure in the asoc. The from address in the mbuf should * also be loaded (if it is not already). This routine can be called * with either INIT or INIT-ACK's as long as the m points to the IP * packet and the offset points to the beginning of the parameters. */ struct sctp_inpcb *inp; struct sctp_nets *net, *nnet, *net_tmp; struct sctp_paramhdr *phdr, param_buf; struct sctp_tcb *stcb_tmp; uint16_t ptype, plen; struct sockaddr *sa; uint8_t random_store[SCTP_PARAM_BUFFER_SIZE]; struct sctp_auth_random *p_random = NULL; uint16_t random_len = 0; uint8_t hmacs_store[SCTP_PARAM_BUFFER_SIZE]; struct sctp_auth_hmac_algo *hmacs = NULL; uint16_t hmacs_len = 0; uint8_t saw_asconf = 0; uint8_t saw_asconf_ack = 0; uint8_t chunks_store[SCTP_PARAM_BUFFER_SIZE]; struct sctp_auth_chunk_list *chunks = NULL; uint16_t num_chunks = 0; sctp_key_t *new_key; uint32_t keylen; int got_random = 0, got_hmacs = 0, got_chklist = 0; uint8_t peer_supports_ecn; uint8_t peer_supports_prsctp; uint8_t peer_supports_auth; uint8_t peer_supports_asconf; uint8_t peer_supports_asconf_ack; uint8_t peer_supports_reconfig; uint8_t peer_supports_nrsack; uint8_t peer_supports_pktdrop; uint8_t peer_supports_idata; #ifdef INET struct sockaddr_in sin; #endif #ifdef INET6 struct sockaddr_in6 sin6; #endif /* First get the destination address setup too. */ #ifdef INET memset(&sin, 0, sizeof(sin)); sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); sin.sin_port = stcb->rport; #endif #ifdef INET6 memset(&sin6, 0, sizeof(sin6)); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_port = stcb->rport; #endif if (altsa) { sa = altsa; } else { sa = src; } peer_supports_idata = 0; peer_supports_ecn = 0; peer_supports_prsctp = 0; peer_supports_auth = 0; peer_supports_asconf = 0; peer_supports_reconfig = 0; peer_supports_nrsack = 0; peer_supports_pktdrop = 0; TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { /* mark all addresses that we have currently on the list */ net->dest_state |= SCTP_ADDR_NOT_IN_ASSOC; } /* does the source address already exist? if so skip it */ inp = stcb->sctp_ep; atomic_add_int(&stcb->asoc.refcnt, 1); stcb_tmp = sctp_findassociation_ep_addr(&inp, sa, &net_tmp, dst, stcb); atomic_add_int(&stcb->asoc.refcnt, -1); if ((stcb_tmp == NULL && inp == stcb->sctp_ep) || inp == NULL) { /* we must add the source address */ /* no scope set here since we have a tcb already. */ switch (sa->sa_family) { #ifdef INET case AF_INET: if (stcb->asoc.scope.ipv4_addr_legal) { if (sctp_add_remote_addr(stcb, sa, NULL, port, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_2)) { return (-1); } } break; #endif #ifdef INET6 case AF_INET6: if (stcb->asoc.scope.ipv6_addr_legal) { if (sctp_add_remote_addr(stcb, sa, NULL, port, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_3)) { return (-2); } } break; #endif default: break; } } else { if (net_tmp != NULL && stcb_tmp == stcb) { net_tmp->dest_state &= ~SCTP_ADDR_NOT_IN_ASSOC; } else if (stcb_tmp != stcb) { /* It belongs to another association? */ if (stcb_tmp) SCTP_TCB_UNLOCK(stcb_tmp); return (-3); } } if (stcb->asoc.state == 0) { /* the assoc was freed? */ return (-4); } /* now we must go through each of the params. */ phdr = sctp_get_next_param(m, offset, ¶m_buf, sizeof(param_buf)); while (phdr) { ptype = ntohs(phdr->param_type); plen = ntohs(phdr->param_length); /* * SCTP_PRINTF("ptype => %0x, plen => %d\n", * (uint32_t)ptype, (int)plen); */ if (offset + plen > limit) { break; } if (plen < sizeof(struct sctp_paramhdr)) { break; } #ifdef INET if (ptype == SCTP_IPV4_ADDRESS) { if (stcb->asoc.scope.ipv4_addr_legal) { struct sctp_ipv4addr_param *p4, p4_buf; /* ok get the v4 address and check/add */ phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&p4_buf, sizeof(p4_buf)); if (plen != sizeof(struct sctp_ipv4addr_param) || phdr == NULL) { return (-5); } p4 = (struct sctp_ipv4addr_param *)phdr; sin.sin_addr.s_addr = p4->addr; if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) { /* Skip multi-cast addresses */ goto next_param; } if ((sin.sin_addr.s_addr == INADDR_BROADCAST) || (sin.sin_addr.s_addr == INADDR_ANY)) { goto next_param; } sa = (struct sockaddr *)&sin; inp = stcb->sctp_ep; atomic_add_int(&stcb->asoc.refcnt, 1); stcb_tmp = sctp_findassociation_ep_addr(&inp, sa, &net, dst, stcb); atomic_add_int(&stcb->asoc.refcnt, -1); if ((stcb_tmp == NULL && inp == stcb->sctp_ep) || inp == NULL) { /* we must add the source address */ /* * no scope set since we have a tcb * already */ /* * we must validate the state again * here */ add_it_now: if (stcb->asoc.state == 0) { /* the assoc was freed? */ return (-7); } if (sctp_add_remote_addr(stcb, sa, NULL, port, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_4)) { return (-8); } } else if (stcb_tmp == stcb) { if (stcb->asoc.state == 0) { /* the assoc was freed? */ return (-10); } if (net != NULL) { /* clear flag */ net->dest_state &= ~SCTP_ADDR_NOT_IN_ASSOC; } } else { /* * strange, address is in another * assoc? straighten out locks. */ if (stcb_tmp) { if (SCTP_GET_STATE(stcb_tmp) == SCTP_STATE_COOKIE_WAIT) { struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; /* * in setup state we * abort this guy */ SCTP_SNPRINTF(msg, sizeof(msg), "%s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); sctp_abort_an_association(stcb_tmp->sctp_ep, stcb_tmp, op_err, SCTP_SO_NOT_LOCKED); goto add_it_now; } SCTP_TCB_UNLOCK(stcb_tmp); } if (stcb->asoc.state == 0) { /* the assoc was freed? */ return (-12); } return (-13); } } } else #endif #ifdef INET6 if (ptype == SCTP_IPV6_ADDRESS) { if (stcb->asoc.scope.ipv6_addr_legal) { /* ok get the v6 address and check/add */ struct sctp_ipv6addr_param *p6, p6_buf; phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&p6_buf, sizeof(p6_buf)); if (plen != sizeof(struct sctp_ipv6addr_param) || phdr == NULL) { return (-14); } p6 = (struct sctp_ipv6addr_param *)phdr; memcpy((caddr_t)&sin6.sin6_addr, p6->addr, sizeof(p6->addr)); if (IN6_IS_ADDR_MULTICAST(&sin6.sin6_addr)) { /* Skip multi-cast addresses */ goto next_param; } if (IN6_IS_ADDR_LINKLOCAL(&sin6.sin6_addr)) { /* * Link local make no sense without * scope */ goto next_param; } sa = (struct sockaddr *)&sin6; inp = stcb->sctp_ep; atomic_add_int(&stcb->asoc.refcnt, 1); stcb_tmp = sctp_findassociation_ep_addr(&inp, sa, &net, dst, stcb); atomic_add_int(&stcb->asoc.refcnt, -1); if (stcb_tmp == NULL && (inp == stcb->sctp_ep || inp == NULL)) { /* * we must validate the state again * here */ add_it_now6: if (stcb->asoc.state == 0) { /* the assoc was freed? */ return (-16); } /* * we must add the address, no scope * set */ if (sctp_add_remote_addr(stcb, sa, NULL, port, SCTP_DONOT_SETSCOPE, SCTP_LOAD_ADDR_5)) { return (-17); } } else if (stcb_tmp == stcb) { /* * we must validate the state again * here */ if (stcb->asoc.state == 0) { /* the assoc was freed? */ return (-19); } if (net != NULL) { /* clear flag */ net->dest_state &= ~SCTP_ADDR_NOT_IN_ASSOC; } } else { /* * strange, address is in another * assoc? straighten out locks. */ if (stcb_tmp) { if (SCTP_GET_STATE(stcb_tmp) == SCTP_STATE_COOKIE_WAIT) { struct mbuf *op_err; char msg[SCTP_DIAG_INFO_LEN]; /* * in setup state we * abort this guy */ SCTP_SNPRINTF(msg, sizeof(msg), "%s:%d at %s", __FILE__, __LINE__, __func__); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), msg); sctp_abort_an_association(stcb_tmp->sctp_ep, stcb_tmp, op_err, SCTP_SO_NOT_LOCKED); goto add_it_now6; } SCTP_TCB_UNLOCK(stcb_tmp); } if (stcb->asoc.state == 0) { /* the assoc was freed? */ return (-21); } return (-22); } } } else #endif if (ptype == SCTP_ECN_CAPABLE) { peer_supports_ecn = 1; } else if (ptype == SCTP_ULP_ADAPTATION) { if (stcb->asoc.state != SCTP_STATE_OPEN) { struct sctp_adaptation_layer_indication ai, *aip; phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&ai, sizeof(ai)); aip = (struct sctp_adaptation_layer_indication *)phdr; if (aip) { stcb->asoc.peers_adaptation = ntohl(aip->indication); stcb->asoc.adaptation_needed = 1; } } } else if (ptype == SCTP_SET_PRIM_ADDR) { struct sctp_asconf_addr_param lstore, *fee; int lptype; struct sockaddr *lsa = NULL; #ifdef INET struct sctp_asconf_addrv4_param *fii; #endif if (stcb->asoc.asconf_supported == 0) { return (-100); } if (plen > sizeof(lstore)) { return (-23); } if (plen < sizeof(struct sctp_asconf_addrv4_param)) { return (-101); } phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&lstore, plen); if (phdr == NULL) { return (-24); } fee = (struct sctp_asconf_addr_param *)phdr; lptype = ntohs(fee->addrp.ph.param_type); switch (lptype) { #ifdef INET case SCTP_IPV4_ADDRESS: if (plen != sizeof(struct sctp_asconf_addrv4_param)) { SCTP_PRINTF("Sizeof setprim in init/init ack not %d but %d - ignored\n", (int)sizeof(struct sctp_asconf_addrv4_param), plen); } else { fii = (struct sctp_asconf_addrv4_param *)fee; sin.sin_addr.s_addr = fii->addrp.addr; lsa = (struct sockaddr *)&sin; } break; #endif #ifdef INET6 case SCTP_IPV6_ADDRESS: if (plen != sizeof(struct sctp_asconf_addr_param)) { SCTP_PRINTF("Sizeof setprim (v6) in init/init ack not %d but %d - ignored\n", (int)sizeof(struct sctp_asconf_addr_param), plen); } else { memcpy(sin6.sin6_addr.s6_addr, fee->addrp.addr, sizeof(fee->addrp.addr)); lsa = (struct sockaddr *)&sin6; } break; #endif default: break; } if (lsa) { (void)sctp_set_primary_addr(stcb, sa, NULL); } } else if (ptype == SCTP_HAS_NAT_SUPPORT) { stcb->asoc.peer_supports_nat = 1; } else if (ptype == SCTP_PRSCTP_SUPPORTED) { /* Peer supports pr-sctp */ peer_supports_prsctp = 1; } else if (ptype == SCTP_SUPPORTED_CHUNK_EXT) { /* A supported extension chunk */ struct sctp_supported_chunk_types_param *pr_supported; uint8_t local_store[SCTP_PARAM_BUFFER_SIZE]; int num_ent, i; if (plen > sizeof(local_store)) { return (-35); } phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)&local_store, plen); if (phdr == NULL) { return (-25); } pr_supported = (struct sctp_supported_chunk_types_param *)phdr; num_ent = plen - sizeof(struct sctp_paramhdr); for (i = 0; i < num_ent; i++) { switch (pr_supported->chunk_types[i]) { case SCTP_ASCONF: peer_supports_asconf = 1; break; case SCTP_ASCONF_ACK: peer_supports_asconf_ack = 1; break; case SCTP_FORWARD_CUM_TSN: peer_supports_prsctp = 1; break; case SCTP_PACKET_DROPPED: peer_supports_pktdrop = 1; break; case SCTP_NR_SELECTIVE_ACK: peer_supports_nrsack = 1; break; case SCTP_STREAM_RESET: peer_supports_reconfig = 1; break; case SCTP_AUTHENTICATION: peer_supports_auth = 1; break; case SCTP_IDATA: peer_supports_idata = 1; break; default: /* one I have not learned yet */ break; } } } else if (ptype == SCTP_RANDOM) { if (plen > sizeof(random_store)) break; if (got_random) { /* already processed a RANDOM */ goto next_param; } phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)random_store, plen); if (phdr == NULL) return (-26); p_random = (struct sctp_auth_random *)phdr; random_len = plen - sizeof(*p_random); /* enforce the random length */ if (random_len != SCTP_AUTH_RANDOM_SIZE_REQUIRED) { SCTPDBG(SCTP_DEBUG_AUTH1, "SCTP: invalid RANDOM len\n"); return (-27); } got_random = 1; } else if (ptype == SCTP_HMAC_LIST) { uint16_t num_hmacs; uint16_t i; if (plen > sizeof(hmacs_store)) break; if (got_hmacs) { /* already processed a HMAC list */ goto next_param; } phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)hmacs_store, plen); if (phdr == NULL) return (-28); hmacs = (struct sctp_auth_hmac_algo *)phdr; hmacs_len = plen - sizeof(*hmacs); num_hmacs = hmacs_len / sizeof(hmacs->hmac_ids[0]); /* validate the hmac list */ if (sctp_verify_hmac_param(hmacs, num_hmacs)) { return (-29); } if (stcb->asoc.peer_hmacs != NULL) sctp_free_hmaclist(stcb->asoc.peer_hmacs); stcb->asoc.peer_hmacs = sctp_alloc_hmaclist(num_hmacs); if (stcb->asoc.peer_hmacs != NULL) { for (i = 0; i < num_hmacs; i++) { (void)sctp_auth_add_hmacid(stcb->asoc.peer_hmacs, ntohs(hmacs->hmac_ids[i])); } } got_hmacs = 1; } else if (ptype == SCTP_CHUNK_LIST) { int i; if (plen > sizeof(chunks_store)) break; if (got_chklist) { /* already processed a Chunks list */ goto next_param; } phdr = sctp_get_next_param(m, offset, (struct sctp_paramhdr *)chunks_store, plen); if (phdr == NULL) return (-30); chunks = (struct sctp_auth_chunk_list *)phdr; num_chunks = plen - sizeof(*chunks); if (stcb->asoc.peer_auth_chunks != NULL) sctp_clear_chunklist(stcb->asoc.peer_auth_chunks); else stcb->asoc.peer_auth_chunks = sctp_alloc_chunklist(); for (i = 0; i < num_chunks; i++) { (void)sctp_auth_add_chunk(chunks->chunk_types[i], stcb->asoc.peer_auth_chunks); /* record asconf/asconf-ack if listed */ if (chunks->chunk_types[i] == SCTP_ASCONF) saw_asconf = 1; if (chunks->chunk_types[i] == SCTP_ASCONF_ACK) saw_asconf_ack = 1; } got_chklist = 1; } else if ((ptype == SCTP_HEARTBEAT_INFO) || (ptype == SCTP_STATE_COOKIE) || (ptype == SCTP_UNRECOG_PARAM) || (ptype == SCTP_COOKIE_PRESERVE) || (ptype == SCTP_SUPPORTED_ADDRTYPE) || (ptype == SCTP_ADD_IP_ADDRESS) || (ptype == SCTP_DEL_IP_ADDRESS) || (ptype == SCTP_ERROR_CAUSE_IND) || (ptype == SCTP_SUCCESS_REPORT)) { /* don't care */ ; } else { if ((ptype & 0x8000) == 0x0000) { /* * must stop processing the rest of the * param's. Any report bits were handled * with the call to * sctp_arethere_unrecognized_parameters() * when the INIT or INIT-ACK was first seen. */ break; } } next_param: offset += SCTP_SIZE32(plen); if (offset >= limit) { break; } phdr = sctp_get_next_param(m, offset, ¶m_buf, sizeof(param_buf)); } /* Now check to see if we need to purge any addresses */ TAILQ_FOREACH_SAFE(net, &stcb->asoc.nets, sctp_next, nnet) { if ((net->dest_state & SCTP_ADDR_NOT_IN_ASSOC) == SCTP_ADDR_NOT_IN_ASSOC) { /* This address has been removed from the asoc */ /* remove and free it */ stcb->asoc.numnets--; TAILQ_REMOVE(&stcb->asoc.nets, net, sctp_next); sctp_free_remote_addr(net); if (net == stcb->asoc.primary_destination) { stcb->asoc.primary_destination = NULL; sctp_select_primary_destination(stcb); } } } if ((stcb->asoc.ecn_supported == 1) && (peer_supports_ecn == 0)) { stcb->asoc.ecn_supported = 0; } if ((stcb->asoc.prsctp_supported == 1) && (peer_supports_prsctp == 0)) { stcb->asoc.prsctp_supported = 0; } if ((stcb->asoc.auth_supported == 1) && ((peer_supports_auth == 0) || (got_random == 0) || (got_hmacs == 0))) { stcb->asoc.auth_supported = 0; } if ((stcb->asoc.asconf_supported == 1) && ((peer_supports_asconf == 0) || (peer_supports_asconf_ack == 0) || (stcb->asoc.auth_supported == 0) || (saw_asconf == 0) || (saw_asconf_ack == 0))) { stcb->asoc.asconf_supported = 0; } if ((stcb->asoc.reconfig_supported == 1) && (peer_supports_reconfig == 0)) { stcb->asoc.reconfig_supported = 0; } if ((stcb->asoc.idata_supported == 1) && (peer_supports_idata == 0)) { stcb->asoc.idata_supported = 0; } if ((stcb->asoc.nrsack_supported == 1) && (peer_supports_nrsack == 0)) { stcb->asoc.nrsack_supported = 0; } if ((stcb->asoc.pktdrop_supported == 1) && (peer_supports_pktdrop == 0)) { stcb->asoc.pktdrop_supported = 0; } /* validate authentication required parameters */ if ((peer_supports_auth == 0) && (got_chklist == 1)) { /* peer does not support auth but sent a chunks list? */ return (-31); } if ((peer_supports_asconf == 1) && (peer_supports_auth == 0)) { /* peer supports asconf but not auth? */ return (-32); } else if ((peer_supports_asconf == 1) && (peer_supports_auth == 1) && ((saw_asconf == 0) || (saw_asconf_ack == 0))) { return (-33); } /* concatenate the full random key */ keylen = sizeof(*p_random) + random_len + sizeof(*hmacs) + hmacs_len; if (chunks != NULL) { keylen += sizeof(*chunks) + num_chunks; } new_key = sctp_alloc_key(keylen); if (new_key != NULL) { /* copy in the RANDOM */ if (p_random != NULL) { keylen = sizeof(*p_random) + random_len; memcpy(new_key->key, p_random, keylen); } else { keylen = 0; } /* append in the AUTH chunks */ if (chunks != NULL) { memcpy(new_key->key + keylen, chunks, sizeof(*chunks) + num_chunks); keylen += sizeof(*chunks) + num_chunks; } /* append in the HMACs */ if (hmacs != NULL) { memcpy(new_key->key + keylen, hmacs, sizeof(*hmacs) + hmacs_len); } } else { /* failed to get memory for the key */ return (-34); } if (stcb->asoc.authinfo.peer_random != NULL) sctp_free_key(stcb->asoc.authinfo.peer_random); stcb->asoc.authinfo.peer_random = new_key; sctp_clear_cachedkeys(stcb, stcb->asoc.authinfo.assoc_keyid); sctp_clear_cachedkeys(stcb, stcb->asoc.authinfo.recv_keyid); return (0); } int sctp_set_primary_addr(struct sctp_tcb *stcb, struct sockaddr *sa, struct sctp_nets *net) { /* make sure the requested primary address exists in the assoc */ if (net == NULL && sa) net = sctp_findnet(stcb, sa); if (net == NULL) { /* didn't find the requested primary address! */ return (-1); } else { /* set the primary address */ if (net->dest_state & SCTP_ADDR_UNCONFIRMED) { /* Must be confirmed, so queue to set */ net->dest_state |= SCTP_ADDR_REQ_PRIMARY; return (0); } stcb->asoc.primary_destination = net; if (!(net->dest_state & SCTP_ADDR_PF) && (stcb->asoc.alternate)) { sctp_free_remote_addr(stcb->asoc.alternate); stcb->asoc.alternate = NULL; } net = TAILQ_FIRST(&stcb->asoc.nets); if (net != stcb->asoc.primary_destination) { /* * first one on the list is NOT the primary * sctp_cmpaddr() is much more efficient if the * primary is the first on the list, make it so. */ TAILQ_REMOVE(&stcb->asoc.nets, stcb->asoc.primary_destination, sctp_next); TAILQ_INSERT_HEAD(&stcb->asoc.nets, stcb->asoc.primary_destination, sctp_next); } return (0); } } int sctp_is_vtag_good(uint32_t tag, uint16_t lport, uint16_t rport, struct timeval *now) { /* * This function serves two purposes. It will see if a TAG can be * re-used and return 1 for yes it is ok and 0 for don't use that * tag. A secondary function it will do is purge out old tags that * can be removed. */ struct sctpvtaghead *chain; struct sctp_tagblock *twait_block; struct sctpasochead *head; struct sctp_tcb *stcb; int i; SCTP_INP_INFO_RLOCK(); head = &SCTP_BASE_INFO(sctp_asochash)[SCTP_PCBHASH_ASOC(tag, SCTP_BASE_INFO(hashasocmark))]; LIST_FOREACH(stcb, head, sctp_asocs) { /* * We choose not to lock anything here. TCB's can't be * removed since we have the read lock, so they can't be * freed on us, same thing for the INP. I may be wrong with * this assumption, but we will go with it for now :-) */ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) { continue; } if (stcb->asoc.my_vtag == tag) { /* candidate */ if (stcb->rport != rport) { continue; } if (stcb->sctp_ep->sctp_lport != lport) { continue; } /* Its a used tag set */ SCTP_INP_INFO_RUNLOCK(); return (0); } } chain = &SCTP_BASE_INFO(vtag_timewait)[(tag % SCTP_STACK_VTAG_HASH_SIZE)]; /* Now what about timed wait ? */ LIST_FOREACH(twait_block, chain, sctp_nxt_tagblock) { /* * Block(s) are present, lets see if we have this tag in the * list */ for (i = 0; i < SCTP_NUMBER_IN_VTAG_BLOCK; i++) { if (twait_block->vtag_block[i].v_tag == 0) { /* not used */ continue; } else if ((long)twait_block->vtag_block[i].tv_sec_at_expire < now->tv_sec) { /* Audit expires this guy */ twait_block->vtag_block[i].tv_sec_at_expire = 0; twait_block->vtag_block[i].v_tag = 0; twait_block->vtag_block[i].lport = 0; twait_block->vtag_block[i].rport = 0; } else if ((twait_block->vtag_block[i].v_tag == tag) && (twait_block->vtag_block[i].lport == lport) && (twait_block->vtag_block[i].rport == rport)) { /* Bad tag, sorry :< */ SCTP_INP_INFO_RUNLOCK(); return (0); } } } SCTP_INP_INFO_RUNLOCK(); return (1); } static void sctp_drain_mbufs(struct sctp_tcb *stcb) { /* * We must hunt this association for MBUF's past the cumack (i.e. * out of order data that we can renege on). */ struct sctp_association *asoc; struct sctp_tmit_chunk *chk, *nchk; uint32_t cumulative_tsn_p1; struct sctp_queued_to_read *control, *ncontrol; int cnt, strmat; uint32_t gap, i; int fnd = 0; /* We look for anything larger than the cum-ack + 1 */ asoc = &stcb->asoc; if (asoc->cumulative_tsn == asoc->highest_tsn_inside_map) { /* none we can reneg on. */ return; } SCTP_STAT_INCR(sctps_protocol_drains_done); cumulative_tsn_p1 = asoc->cumulative_tsn + 1; cnt = 0; /* Ok that was fun, now we will drain all the inbound streams? */ for (strmat = 0; strmat < asoc->streamincnt; strmat++) { TAILQ_FOREACH_SAFE(control, &asoc->strmin[strmat].inqueue, next_instrm, ncontrol) { #ifdef INVARIANTS if (control->on_strm_q != SCTP_ON_ORDERED) { panic("Huh control: %p on_q: %d -- not ordered?", control, control->on_strm_q); } #endif if (SCTP_TSN_GT(control->sinfo_tsn, cumulative_tsn_p1)) { /* Yep it is above cum-ack */ cnt++; SCTP_CALC_TSN_TO_GAP(gap, control->sinfo_tsn, asoc->mapping_array_base_tsn); KASSERT(control->length > 0, ("control has zero length")); if (asoc->size_on_all_streams >= control->length) { asoc->size_on_all_streams -= control->length; } else { #ifdef INVARIANTS panic("size_on_all_streams = %u smaller than control length %u", asoc->size_on_all_streams, control->length); #else asoc->size_on_all_streams = 0; #endif } sctp_ucount_decr(asoc->cnt_on_all_streams); SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap); if (control->on_read_q) { TAILQ_REMOVE(&stcb->sctp_ep->read_queue, control, next); control->on_read_q = 0; } TAILQ_REMOVE(&asoc->strmin[strmat].inqueue, control, next_instrm); control->on_strm_q = 0; if (control->data) { sctp_m_freem(control->data); control->data = NULL; } sctp_free_remote_addr(control->whoFrom); /* Now its reasm? */ TAILQ_FOREACH_SAFE(chk, &control->reasm, sctp_next, nchk) { cnt++; SCTP_CALC_TSN_TO_GAP(gap, chk->rec.data.tsn, asoc->mapping_array_base_tsn); KASSERT(chk->send_size > 0, ("chunk has zero length")); if (asoc->size_on_reasm_queue >= chk->send_size) { asoc->size_on_reasm_queue -= chk->send_size; } else { #ifdef INVARIANTS panic("size_on_reasm_queue = %u smaller than chunk length %u", asoc->size_on_reasm_queue, chk->send_size); #else asoc->size_on_reasm_queue = 0; #endif } sctp_ucount_decr(asoc->cnt_on_reasm_queue); SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap); TAILQ_REMOVE(&control->reasm, chk, sctp_next); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); } sctp_free_a_readq(stcb, control); } } TAILQ_FOREACH_SAFE(control, &asoc->strmin[strmat].uno_inqueue, next_instrm, ncontrol) { #ifdef INVARIANTS if (control->on_strm_q != SCTP_ON_UNORDERED) { panic("Huh control: %p on_q: %d -- not unordered?", control, control->on_strm_q); } #endif if (SCTP_TSN_GT(control->sinfo_tsn, cumulative_tsn_p1)) { /* Yep it is above cum-ack */ cnt++; SCTP_CALC_TSN_TO_GAP(gap, control->sinfo_tsn, asoc->mapping_array_base_tsn); KASSERT(control->length > 0, ("control has zero length")); if (asoc->size_on_all_streams >= control->length) { asoc->size_on_all_streams -= control->length; } else { #ifdef INVARIANTS panic("size_on_all_streams = %u smaller than control length %u", asoc->size_on_all_streams, control->length); #else asoc->size_on_all_streams = 0; #endif } sctp_ucount_decr(asoc->cnt_on_all_streams); SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap); if (control->on_read_q) { TAILQ_REMOVE(&stcb->sctp_ep->read_queue, control, next); control->on_read_q = 0; } TAILQ_REMOVE(&asoc->strmin[strmat].uno_inqueue, control, next_instrm); control->on_strm_q = 0; if (control->data) { sctp_m_freem(control->data); control->data = NULL; } sctp_free_remote_addr(control->whoFrom); /* Now its reasm? */ TAILQ_FOREACH_SAFE(chk, &control->reasm, sctp_next, nchk) { cnt++; SCTP_CALC_TSN_TO_GAP(gap, chk->rec.data.tsn, asoc->mapping_array_base_tsn); KASSERT(chk->send_size > 0, ("chunk has zero length")); if (asoc->size_on_reasm_queue >= chk->send_size) { asoc->size_on_reasm_queue -= chk->send_size; } else { #ifdef INVARIANTS panic("size_on_reasm_queue = %u smaller than chunk length %u", asoc->size_on_reasm_queue, chk->send_size); #else asoc->size_on_reasm_queue = 0; #endif } sctp_ucount_decr(asoc->cnt_on_reasm_queue); SCTP_UNSET_TSN_PRESENT(asoc->mapping_array, gap); TAILQ_REMOVE(&control->reasm, chk, sctp_next); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } sctp_free_a_chunk(stcb, chk, SCTP_SO_NOT_LOCKED); } sctp_free_a_readq(stcb, control); } } } if (cnt) { /* We must back down to see what the new highest is */ for (i = asoc->highest_tsn_inside_map; SCTP_TSN_GE(i, asoc->mapping_array_base_tsn); i--) { SCTP_CALC_TSN_TO_GAP(gap, i, asoc->mapping_array_base_tsn); if (SCTP_IS_TSN_PRESENT(asoc->mapping_array, gap)) { asoc->highest_tsn_inside_map = i; fnd = 1; break; } } if (!fnd) { asoc->highest_tsn_inside_map = asoc->mapping_array_base_tsn - 1; } /* * Question, should we go through the delivery queue? The * only reason things are on here is the app not reading OR * a p-d-api up. An attacker COULD send enough in to * initiate the PD-API and then send a bunch of stuff to * other streams... these would wind up on the delivery * queue.. and then we would not get to them. But in order * to do this I then have to back-track and un-deliver * sequence numbers in streams.. el-yucko. I think for now * we will NOT look at the delivery queue and leave it to be * something to consider later. An alternative would be to * abort the P-D-API with a notification and then deliver * the data.... Or another method might be to keep track of * how many times the situation occurs and if we see a * possible attack underway just abort the association. */ #ifdef SCTP_DEBUG SCTPDBG(SCTP_DEBUG_PCB1, "Freed %d chunks from reneg harvest\n", cnt); #endif /* * Now do we need to find a new * asoc->highest_tsn_inside_map? */ asoc->last_revoke_count = cnt; sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTP_PCB + SCTP_LOC_11); /* sa_ignore NO_NULL_CHK */ sctp_send_sack(stcb, SCTP_SO_NOT_LOCKED); sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_DRAIN, SCTP_SO_NOT_LOCKED); } /* * Another issue, in un-setting the TSN's in the mapping array we * DID NOT adjust the highest_tsn marker. This will cause one of * two things to occur. It may cause us to do extra work in checking * for our mapping array movement. More importantly it may cause us * to SACK every datagram. This may not be a bad thing though since * we will recover once we get our cum-ack above and all this stuff * we dumped recovered. */ } void sctp_drain() { /* * We must walk the PCB lists for ALL associations here. The system * is LOW on MBUF's and needs help. This is where reneging will * occur. We really hope this does NOT happen! */ VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); struct sctp_inpcb *inp; struct sctp_tcb *stcb; SCTP_STAT_INCR(sctps_protocol_drain_calls); if (SCTP_BASE_SYSCTL(sctp_do_drain) == 0) { #ifdef VIMAGE continue; #else return; #endif } SCTP_INP_INFO_RLOCK(); LIST_FOREACH(inp, &SCTP_BASE_INFO(listhead), sctp_list) { /* For each endpoint */ SCTP_INP_RLOCK(inp); LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) { /* For each association */ SCTP_TCB_LOCK(stcb); sctp_drain_mbufs(stcb); SCTP_TCB_UNLOCK(stcb); } SCTP_INP_RUNLOCK(inp); } SCTP_INP_INFO_RUNLOCK(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * start a new iterator * iterates through all endpoints and associations based on the pcb_state * flags and asoc_state. "af" (mandatory) is executed for all matching * assocs and "ef" (optional) is executed when the iterator completes. * "inpf" (optional) is executed for each new endpoint as it is being * iterated through. inpe (optional) is called when the inp completes * its way through all the stcbs. */ int sctp_initiate_iterator(inp_func inpf, asoc_func af, inp_func inpe, uint32_t pcb_state, uint32_t pcb_features, uint32_t asoc_state, void *argp, uint32_t argi, end_func ef, struct sctp_inpcb *s_inp, uint8_t chunk_output_off) { struct sctp_iterator *it = NULL; if (af == NULL) { return (-1); } if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) { SCTP_PRINTF("%s: abort on initialize being %d\n", __func__, SCTP_BASE_VAR(sctp_pcb_initialized)); return (-1); } SCTP_MALLOC(it, struct sctp_iterator *, sizeof(struct sctp_iterator), SCTP_M_ITER); if (it == NULL) { SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_PCB, ENOMEM); return (-1); } memset(it, 0, sizeof(*it)); it->function_assoc = af; it->function_inp = inpf; if (inpf) it->done_current_ep = 0; else it->done_current_ep = 1; it->function_atend = ef; it->pointer = argp; it->val = argi; it->pcb_flags = pcb_state; it->pcb_features = pcb_features; it->asoc_state = asoc_state; it->function_inp_end = inpe; it->no_chunk_output = chunk_output_off; it->vn = curvnet; if (s_inp) { /* Assume lock is held here */ it->inp = s_inp; SCTP_INP_INCR_REF(it->inp); it->iterator_flags = SCTP_ITERATOR_DO_SINGLE_INP; } else { SCTP_INP_INFO_RLOCK(); it->inp = LIST_FIRST(&SCTP_BASE_INFO(listhead)); if (it->inp) { SCTP_INP_INCR_REF(it->inp); } SCTP_INP_INFO_RUNLOCK(); it->iterator_flags = SCTP_ITERATOR_DO_ALL_INP; } SCTP_IPI_ITERATOR_WQ_LOCK(); if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) { SCTP_IPI_ITERATOR_WQ_UNLOCK(); SCTP_PRINTF("%s: rollback on initialize being %d it=%p\n", __func__, SCTP_BASE_VAR(sctp_pcb_initialized), it); SCTP_FREE(it, SCTP_M_ITER); return (-1); } TAILQ_INSERT_TAIL(&sctp_it_ctl.iteratorhead, it, sctp_nxt_itr); if (sctp_it_ctl.iterator_running == 0) { sctp_wakeup_iterator(); } SCTP_IPI_ITERATOR_WQ_UNLOCK(); /* sa_ignore MEMLEAK {memory is put on the tailq for the iterator} */ return (0); } Index: projects/clang1100-import/sys/netinet/sctp_structs.h =================================================================== --- projects/clang1100-import/sys/netinet/sctp_structs.h (revision 364278) +++ projects/clang1100-import/sys/netinet/sctp_structs.h (revision 364279) @@ -1,1247 +1,1248 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #ifndef _NETINET_SCTP_STRUCTS_H_ #define _NETINET_SCTP_STRUCTS_H_ #include #include #include struct sctp_timer { sctp_os_timer_t timer; int type; /* * Depending on the timer type these will be setup and cast with the * appropriate entity. */ void *ep; void *tcb; void *net; void *vnet; /* for sanity checking */ void *self; uint32_t ticks; uint32_t stopped_from; }; struct sctp_foo_stuff { struct sctp_inpcb *inp; uint32_t lineno; uint32_t ticks; int updown; }; /* * This is the information we track on each interface that we know about from * the distant end. */ TAILQ_HEAD(sctpnetlisthead, sctp_nets); struct sctp_stream_reset_list { TAILQ_ENTRY(sctp_stream_reset_list) next_resp; uint32_t seq; uint32_t tsn; uint32_t number_entries; uint16_t list_of_streams[]; }; TAILQ_HEAD(sctp_resethead, sctp_stream_reset_list); /* * Users of the iterator need to malloc a iterator with a call to * sctp_initiate_iterator(inp_func, assoc_func, inp_func, pcb_flags, pcb_features, * asoc_state, void-ptr-arg, uint32-arg, end_func, inp); * * Use the following two defines if you don't care what pcb flags are on the EP * and/or you don't care what state the association is in. * * Note that if you specify an INP as the last argument then ONLY each * association of that single INP will be executed upon. Note that the pcb * flags STILL apply so if the inp you specify has different pcb_flags then * what you put in pcb_flags nothing will happen. use SCTP_PCB_ANY_FLAGS to * assure the inp you specify gets treated. */ #define SCTP_PCB_ANY_FLAGS 0x00000000 #define SCTP_PCB_ANY_FEATURES 0x00000000 #define SCTP_ASOC_ANY_STATE 0x00000000 typedef void (*asoc_func) (struct sctp_inpcb *, struct sctp_tcb *, void *ptr, uint32_t val); typedef int (*inp_func) (struct sctp_inpcb *, void *ptr, uint32_t val); typedef void (*end_func) (void *ptr, uint32_t val); #if defined(SCTP_MCORE_INPUT) && defined(SMP) /* whats on the mcore control struct */ struct sctp_mcore_queue { TAILQ_ENTRY(sctp_mcore_queue) next; struct vnet *vn; struct mbuf *m; int off; int v6; }; TAILQ_HEAD(sctp_mcore_qhead, sctp_mcore_queue); struct sctp_mcore_ctrl { SCTP_PROCESS_STRUCT thread_proc; struct sctp_mcore_qhead que; struct mtx core_mtx; struct mtx que_mtx; int running; int cpuid; }; #endif struct sctp_iterator { TAILQ_ENTRY(sctp_iterator) sctp_nxt_itr; struct vnet *vn; struct sctp_timer tmr; struct sctp_inpcb *inp; /* current endpoint */ struct sctp_tcb *stcb; /* current* assoc */ struct sctp_inpcb *next_inp; /* special hook to skip to */ asoc_func function_assoc; /* per assoc function */ inp_func function_inp; /* per endpoint function */ inp_func function_inp_end; /* end INP function */ end_func function_atend; /* iterator completion function */ void *pointer; /* pointer for apply func to use */ uint32_t val; /* value for apply func to use */ uint32_t pcb_flags; /* endpoint flags being checked */ uint32_t pcb_features; /* endpoint features being checked */ uint32_t asoc_state; /* assoc state being checked */ uint32_t iterator_flags; uint8_t no_chunk_output; uint8_t done_current_ep; }; /* iterator_flags values */ #define SCTP_ITERATOR_DO_ALL_INP 0x00000001 #define SCTP_ITERATOR_DO_SINGLE_INP 0x00000002 TAILQ_HEAD(sctpiterators, sctp_iterator); struct sctp_copy_all { struct sctp_inpcb *inp; /* ep */ struct mbuf *m; struct sctp_sndrcvinfo sndrcv; ssize_t sndlen; int cnt_sent; int cnt_failed; }; struct sctp_asconf_iterator { struct sctpladdr list_of_work; int cnt; }; struct iterator_control { struct mtx ipi_iterator_wq_mtx; struct mtx it_mtx; SCTP_PROCESS_STRUCT thread_proc; struct sctpiterators iteratorhead; struct sctp_iterator *cur_it; uint32_t iterator_running; uint32_t iterator_flags; }; #define SCTP_ITERATOR_STOP_CUR_IT 0x00000004 #define SCTP_ITERATOR_STOP_CUR_INP 0x00000008 struct sctp_net_route { struct nhop_object *ro_nh; struct llentry *ro_lle; char *ro_prepend; uint16_t ro_plen; uint16_t ro_flags; uint16_t ro_mtu; uint16_t spare; union sctp_sockstore _l_addr; /* remote peer addr */ struct sctp_ifa *_s_addr; /* our selected src addr */ }; struct htcp { uint16_t alpha; /* Fixed point arith, << 7 */ uint8_t beta; /* Fixed point arith, << 7 */ uint8_t modeswitch; /* Delay modeswitch until we had at least one * congestion event */ uint32_t last_cong; /* Time since last congestion event end */ uint32_t undo_last_cong; uint16_t bytes_acked; uint32_t bytecount; uint32_t minRTT; uint32_t maxRTT; uint32_t undo_maxRTT; uint32_t undo_old_maxB; /* Bandwidth estimation */ uint32_t minB; uint32_t maxB; uint32_t old_maxB; uint32_t Bi; uint32_t lasttime; }; struct rtcc_cc { struct timeval tls; /* The time we started the sending */ uint64_t lbw; /* Our last estimated bw */ uint64_t lbw_rtt; /* RTT at bw estimate */ uint64_t bw_bytes; /* The total bytes since this sending began */ uint64_t bw_tot_time; /* The total time since sending began */ uint64_t new_tot_time; /* temp holding the new value */ uint64_t bw_bytes_at_last_rttc; /* What bw_bytes was at last rtt calc */ uint32_t cwnd_at_bw_set; /* Cwnd at last bw saved - lbw */ uint32_t vol_reduce; /* cnt of voluntary reductions */ uint16_t steady_step; /* The number required to be in steady state */ uint16_t step_cnt; /* The current number */ uint8_t ret_from_eq; /* When all things are equal what do I return * 0/1 - 1 no cc advance */ uint8_t use_dccc_ecn; /* Flag to enable DCCC ECN */ uint8_t tls_needs_set; /* Flag to indicate we need to set tls 0 or 1 * means set at send 2 not */ uint8_t last_step_state; /* Last state if steady state stepdown * is on */ uint8_t rtt_set_this_sack; /* Flag saying this sack had RTT calc * on it */ uint8_t last_inst_ind; /* Last saved inst indication */ }; struct sctp_nets { TAILQ_ENTRY(sctp_nets) sctp_next; /* next link */ /* * Things on the top half may be able to be split into a common * structure shared by all. */ struct sctp_timer pmtu_timer; struct sctp_timer hb_timer; /* * The following two in combination equate to a route entry for v6 * or v4. */ struct sctp_net_route ro; /* mtu discovered so far */ uint32_t mtu; uint32_t ssthresh; /* not sure about this one for split */ uint32_t last_cwr_tsn; uint32_t cwr_window_tsn; uint32_t ecn_ce_pkt_cnt; uint32_t lost_cnt; /* smoothed average things for RTT and RTO itself */ int lastsa; int lastsv; uint64_t rtt; /* last measured rtt value in us */ uint32_t RTO; /* This is used for SHUTDOWN/SHUTDOWN-ACK/SEND or INIT timers */ struct sctp_timer rxt_timer; /* last time in seconds I sent to it */ struct timeval last_sent_time; union cc_control_data { struct htcp htcp_ca; /* JRS - struct used in HTCP algorithm */ struct rtcc_cc rtcc; /* rtcc module cc stuff */ } cc_mod; int ref_count; /* Congestion stats per destination */ /* * flight size variables and such, sorry Vern, I could not avoid * this if I wanted performance :> */ uint32_t flight_size; uint32_t cwnd; /* actual cwnd */ uint32_t prev_cwnd; /* cwnd before any processing */ uint32_t ecn_prev_cwnd; /* ECN prev cwnd at first ecn_echo seen in new * window */ uint32_t partial_bytes_acked; /* in CA tracks when to incr a MTU */ /* tracking variables to avoid the aloc/free in sack processing */ unsigned int net_ack; unsigned int net_ack2; /* * JRS - 5/8/07 - Variable to track last time a destination was * active for CMT PF */ uint32_t last_active; /* * CMT variables (iyengar@cis.udel.edu) */ uint32_t this_sack_highest_newack; /* tracks highest TSN newly * acked for a given dest in * the current SACK. Used in * SFR and HTNA algos */ uint32_t pseudo_cumack; /* CMT CUC algorithm. Maintains next expected * pseudo-cumack for this destination */ uint32_t rtx_pseudo_cumack; /* CMT CUC algorithm. Maintains next * expected pseudo-cumack for this * destination */ /* CMT fast recovery variables */ uint32_t fast_recovery_tsn; uint32_t heartbeat_random1; uint32_t heartbeat_random2; #ifdef INET6 uint32_t flowlabel; #endif uint8_t dscp; struct timeval start_time; /* time when this net was created */ uint32_t marked_retrans; /* number or DATA chunks marked for * timer based retransmissions */ uint32_t marked_fastretrans; uint32_t heart_beat_delay; /* Heart Beat delay in ms */ /* if this guy is ok or not ... status */ uint16_t dest_state; /* number of timeouts to consider the destination unreachable */ uint16_t failure_threshold; /* number of timeouts to consider the destination potentially failed */ uint16_t pf_threshold; /* error stats on the destination */ uint16_t error_count; /* UDP port number in case of UDP tunneling */ uint16_t port; uint8_t fast_retran_loss_recovery; uint8_t will_exit_fast_recovery; /* Flags that probably can be combined into dest_state */ uint8_t fast_retran_ip; /* fast retransmit in progress */ uint8_t hb_responded; uint8_t saw_newack; /* CMT's SFR algorithm flag */ uint8_t src_addr_selected; /* if we split we move */ uint8_t indx_of_eligible_next_to_use; uint8_t addr_is_local; /* its a local address (if known) could move * in split */ /* * CMT variables (iyengar@cis.udel.edu) */ uint8_t find_pseudo_cumack; /* CMT CUC algorithm. Flag used to * find a new pseudocumack. This flag * is set after a new pseudo-cumack * has been received and indicates * that the sender should find the * next pseudo-cumack expected for * this destination */ uint8_t find_rtx_pseudo_cumack; /* CMT CUCv2 algorithm. Flag used to * find a new rtx-pseudocumack. This * flag is set after a new * rtx-pseudo-cumack has been received * and indicates that the sender * should find the next * rtx-pseudo-cumack expected for this * destination */ uint8_t new_pseudo_cumack; /* CMT CUC algorithm. Flag used to * indicate if a new pseudo-cumack or * rtx-pseudo-cumack has been received */ uint8_t window_probe; /* Doing a window probe? */ uint8_t RTO_measured; /* Have we done the first measure */ uint8_t last_hs_used; /* index into the last HS table entry we used */ uint8_t lan_type; uint8_t rto_needed; uint32_t flowid; uint8_t flowtype; }; struct sctp_data_chunkrec { uint32_t tsn; /* the TSN of this transmit */ uint32_t mid; /* the message identifier of this transmit */ uint16_t sid; /* the stream number of this guy */ uint32_t ppid; uint32_t context; /* from send */ uint32_t cwnd_at_send; /* * part of the Highest sacked algorithm to be able to stroke counts * on ones that are FR'd. */ uint32_t fast_retran_tsn; /* sending_seq at the time of FR */ struct timeval timetodrop; /* time we drop it from queue */ uint32_t fsn; /* Fragment Sequence Number */ uint8_t doing_fast_retransmit; uint8_t rcv_flags; /* flags pulled from data chunk on inbound for * outbound holds sending flags for PR-SCTP. */ uint8_t state_flags; uint8_t chunk_was_revoked; uint8_t fwd_tsn_cnt; }; TAILQ_HEAD(sctpchunk_listhead, sctp_tmit_chunk); /* The lower byte is used to enumerate PR_SCTP policies */ #define CHUNK_FLAGS_PR_SCTP_TTL SCTP_PR_SCTP_TTL #define CHUNK_FLAGS_PR_SCTP_BUF SCTP_PR_SCTP_BUF #define CHUNK_FLAGS_PR_SCTP_RTX SCTP_PR_SCTP_RTX /* The upper byte is used as a bit mask */ #define CHUNK_FLAGS_FRAGMENT_OK 0x0100 struct chk_id { uint8_t id; uint8_t can_take_data; }; struct sctp_tmit_chunk { union { struct sctp_data_chunkrec data; struct chk_id chunk_id; } rec; struct sctp_association *asoc; /* bp to asoc this belongs to */ struct timeval sent_rcv_time; /* filled in if RTT being calculated */ struct mbuf *data; /* pointer to mbuf chain of data */ struct mbuf *last_mbuf; /* pointer to last mbuf in chain */ struct sctp_nets *whoTo; TAILQ_ENTRY(sctp_tmit_chunk) sctp_next; /* next link */ int32_t sent; /* the send status */ uint16_t snd_count; /* number of times I sent */ uint16_t flags; /* flags, such as FRAGMENT_OK */ uint16_t send_size; uint16_t book_size; uint16_t mbcnt; uint16_t auth_keyid; uint8_t holds_key_ref; /* flag if auth keyid refcount is held */ uint8_t pad_inplace; uint8_t do_rtt; uint8_t book_size_scale; uint8_t no_fr_allowed; uint8_t copy_by_ref; uint8_t window_probe; }; struct sctp_queued_to_read { /* sinfo structure Pluse more */ uint16_t sinfo_stream; /* off the wire */ uint16_t sinfo_flags; /* SCTP_UNORDERED from wire use SCTP_EOF for * EOR */ uint32_t sinfo_ppid; /* off the wire */ uint32_t sinfo_context; /* pick this up from assoc def context? */ uint32_t sinfo_timetolive; /* not used by kernel */ uint32_t sinfo_tsn; /* Use this in reassembly as first TSN */ uint32_t sinfo_cumtsn; /* Use this in reassembly as last TSN */ sctp_assoc_t sinfo_assoc_id; /* our assoc id */ /* Non sinfo stuff */ uint32_t mid; /* Fragment Index */ uint32_t length; /* length of data */ uint32_t held_length; /* length held in sb */ uint32_t top_fsn; /* Highest FSN in queue */ uint32_t fsn_included; /* Highest FSN in *data portion */ struct sctp_nets *whoFrom; /* where it came from */ struct mbuf *data; /* front of the mbuf chain of data with * PKT_HDR */ struct mbuf *tail_mbuf; /* used for multi-part data */ struct mbuf *aux_data; /* used to hold/cache control if o/s does not * take it from us */ struct sctp_tcb *stcb; /* assoc, used for window update */ TAILQ_ENTRY(sctp_queued_to_read) next; TAILQ_ENTRY(sctp_queued_to_read) next_instrm; struct sctpchunk_listhead reasm; uint16_t port_from; uint16_t spec_flags; /* Flags to hold the notification field */ uint8_t do_not_ref_stcb; uint8_t end_added; uint8_t pdapi_aborted; uint8_t pdapi_started; uint8_t some_taken; uint8_t last_frag_seen; uint8_t first_frag_seen; uint8_t on_read_q; uint8_t on_strm_q; }; #define SCTP_ON_ORDERED 1 #define SCTP_ON_UNORDERED 2 /* This data structure will be on the outbound * stream queues. Data will be pulled off from * the front of the mbuf data and chunk-ified * by the output routines. We will custom * fit every chunk we pull to the send/sent * queue to make up the next full packet * if we can. An entry cannot be removed * from the stream_out queue until * the msg_is_complete flag is set. This * means at times data/tail_mbuf MIGHT * be NULL.. If that occurs it happens * for one of two reasons. Either the user * is blocked on a send() call and has not * awoken to copy more data down... OR * the user is in the explict MSG_EOR mode * and wrote some data, but has not completed * sending. */ struct sctp_stream_queue_pending { struct mbuf *data; struct mbuf *tail_mbuf; struct timeval ts; struct sctp_nets *net; TAILQ_ENTRY(sctp_stream_queue_pending) next; TAILQ_ENTRY(sctp_stream_queue_pending) ss_next; uint32_t fsn; uint32_t length; uint32_t timetolive; uint32_t ppid; uint32_t context; uint16_t sinfo_flags; uint16_t sid; uint16_t act_flags; uint16_t auth_keyid; uint8_t holds_key_ref; uint8_t msg_is_complete; uint8_t some_taken; uint8_t sender_all_done; uint8_t put_last_out; uint8_t discard_rest; + uint8_t processing; }; /* * this struct contains info that is used to track inbound stream data and * help with ordering. */ TAILQ_HEAD(sctpwheelunrel_listhead, sctp_stream_in); struct sctp_stream_in { struct sctp_readhead inqueue; struct sctp_readhead uno_inqueue; uint32_t last_mid_delivered; /* used for re-order */ uint16_t sid; uint8_t delivery_started; uint8_t pd_api_started; }; TAILQ_HEAD(sctpwheel_listhead, sctp_stream_out); TAILQ_HEAD(sctplist_listhead, sctp_stream_queue_pending); /* Round-robin schedulers */ struct ss_rr { /* next link in wheel */ TAILQ_ENTRY(sctp_stream_out) next_spoke; }; /* Priority scheduler */ struct ss_prio { /* next link in wheel */ TAILQ_ENTRY(sctp_stream_out) next_spoke; /* priority id */ uint16_t priority; }; /* Fair Bandwidth scheduler */ struct ss_fb { /* next link in wheel */ TAILQ_ENTRY(sctp_stream_out) next_spoke; /* stores message size */ int32_t rounds; }; /* * This union holds all data necessary for * different stream schedulers. */ struct scheduling_data { struct sctp_stream_out *locked_on_sending; /* circular looking for output selection */ struct sctp_stream_out *last_out_stream; union { struct sctpwheel_listhead wheel; struct sctplist_listhead list; } out; }; /* * This union holds all parameters per stream * necessary for different stream schedulers. */ union scheduling_parameters { struct ss_rr rr; struct ss_prio prio; struct ss_fb fb; }; /* States for outgoing streams */ #define SCTP_STREAM_CLOSED 0x00 #define SCTP_STREAM_OPENING 0x01 #define SCTP_STREAM_OPEN 0x02 #define SCTP_STREAM_RESET_PENDING 0x03 #define SCTP_STREAM_RESET_IN_FLIGHT 0x04 #define SCTP_MAX_STREAMS_AT_ONCE_RESET 200 /* This struct is used to track the traffic on outbound streams */ struct sctp_stream_out { struct sctp_streamhead outqueue; union scheduling_parameters ss_params; uint32_t chunks_on_queues; /* send queue and sent queue */ #if defined(SCTP_DETAILED_STR_STATS) uint32_t abandoned_unsent[SCTP_PR_SCTP_MAX + 1]; uint32_t abandoned_sent[SCTP_PR_SCTP_MAX + 1]; #else /* Only the aggregation */ uint32_t abandoned_unsent[1]; uint32_t abandoned_sent[1]; #endif /* * For associations using DATA chunks, the lower 16-bit of * next_mid_ordered are used as the next SSN. */ uint32_t next_mid_ordered; uint32_t next_mid_unordered; uint16_t sid; uint8_t last_msg_incomplete; uint8_t state; }; /* used to keep track of the addresses yet to try to add/delete */ TAILQ_HEAD(sctp_asconf_addrhead, sctp_asconf_addr); struct sctp_asconf_addr { TAILQ_ENTRY(sctp_asconf_addr) next; struct sctp_asconf_addr_param ap; struct sctp_ifa *ifa; /* save the ifa for add/del ip */ uint8_t sent; /* has this been sent yet? */ uint8_t special_del; /* not to be used in lookup */ }; struct sctp_scoping { uint8_t ipv4_addr_legal; uint8_t ipv6_addr_legal; uint8_t loopback_scope; uint8_t ipv4_local_scope; uint8_t local_scope; uint8_t site_scope; }; #define SCTP_TSN_LOG_SIZE 40 struct sctp_tsn_log { void *stcb; uint32_t tsn; uint32_t seq; uint16_t strm; uint16_t sz; uint16_t flgs; uint16_t in_pos; uint16_t in_out; uint16_t resv; }; #define SCTP_FS_SPEC_LOG_SIZE 200 struct sctp_fs_spec_log { uint32_t sent; uint32_t total_flight; uint32_t tsn; uint16_t book; uint8_t incr; uint8_t decr; }; /* This struct is here to cut out the compatiabilty * pad that bulks up both the inp and stcb. The non * pad portion MUST stay in complete sync with * sctp_sndrcvinfo... i.e. if sinfo_xxxx is added * this must be done here too. */ struct sctp_nonpad_sndrcvinfo { uint16_t sinfo_stream; uint16_t sinfo_ssn; uint16_t sinfo_flags; uint32_t sinfo_ppid; uint32_t sinfo_context; uint32_t sinfo_timetolive; uint32_t sinfo_tsn; uint32_t sinfo_cumtsn; sctp_assoc_t sinfo_assoc_id; uint16_t sinfo_keynumber; uint16_t sinfo_keynumber_valid; }; /* * JRS - Structure to hold function pointers to the functions responsible * for congestion control. */ struct sctp_cc_functions { void (*sctp_set_initial_cc_param) (struct sctp_tcb *stcb, struct sctp_nets *net); void (*sctp_cwnd_update_after_sack) (struct sctp_tcb *stcb, struct sctp_association *asoc, int accum_moved, int reneged_all, int will_exit); void (*sctp_cwnd_update_exit_pf) (struct sctp_tcb *stcb, struct sctp_nets *net); void (*sctp_cwnd_update_after_fr) (struct sctp_tcb *stcb, struct sctp_association *asoc); void (*sctp_cwnd_update_after_timeout) (struct sctp_tcb *stcb, struct sctp_nets *net); void (*sctp_cwnd_update_after_ecn_echo) (struct sctp_tcb *stcb, struct sctp_nets *net, int in_window, int num_pkt_lost); void (*sctp_cwnd_update_after_packet_dropped) (struct sctp_tcb *stcb, struct sctp_nets *net, struct sctp_pktdrop_chunk *cp, uint32_t *bottle_bw, uint32_t *on_queue); void (*sctp_cwnd_update_after_output) (struct sctp_tcb *stcb, struct sctp_nets *net, int burst_limit); void (*sctp_cwnd_update_packet_transmitted) (struct sctp_tcb *stcb, struct sctp_nets *net); void (*sctp_cwnd_update_tsn_acknowledged) (struct sctp_nets *net, struct sctp_tmit_chunk *); void (*sctp_cwnd_new_transmission_begins) (struct sctp_tcb *stcb, struct sctp_nets *net); void (*sctp_cwnd_prepare_net_for_sack) (struct sctp_tcb *stcb, struct sctp_nets *net); int (*sctp_cwnd_socket_option) (struct sctp_tcb *stcb, int set, struct sctp_cc_option *); void (*sctp_rtt_calculated) (struct sctp_tcb *, struct sctp_nets *, struct timeval *); }; /* * RS - Structure to hold function pointers to the functions responsible * for stream scheduling. */ struct sctp_ss_functions { void (*sctp_ss_init) (struct sctp_tcb *stcb, struct sctp_association *asoc, int holds_lock); void (*sctp_ss_clear) (struct sctp_tcb *stcb, struct sctp_association *asoc, int clear_values, int holds_lock); void (*sctp_ss_init_stream) (struct sctp_tcb *stcb, struct sctp_stream_out *strq, struct sctp_stream_out *with_strq); void (*sctp_ss_add_to_stream) (struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_stream_out *strq, struct sctp_stream_queue_pending *sp, int holds_lock); int (*sctp_ss_is_empty) (struct sctp_tcb *stcb, struct sctp_association *asoc); void (*sctp_ss_remove_from_stream) (struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_stream_out *strq, struct sctp_stream_queue_pending *sp, int holds_lock); struct sctp_stream_out *(*sctp_ss_select_stream) (struct sctp_tcb *stcb, struct sctp_nets *net, struct sctp_association *asoc); void (*sctp_ss_scheduled) (struct sctp_tcb *stcb, struct sctp_nets *net, struct sctp_association *asoc, struct sctp_stream_out *strq, int moved_how_much); void (*sctp_ss_packet_done) (struct sctp_tcb *stcb, struct sctp_nets *net, struct sctp_association *asoc); int (*sctp_ss_get_value) (struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_stream_out *strq, uint16_t *value); int (*sctp_ss_set_value) (struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_stream_out *strq, uint16_t value); int (*sctp_ss_is_user_msgs_incomplete) (struct sctp_tcb *stcb, struct sctp_association *asoc); }; /* used to save ASCONF chunks for retransmission */ TAILQ_HEAD(sctp_asconf_head, sctp_asconf); struct sctp_asconf { TAILQ_ENTRY(sctp_asconf) next; uint32_t serial_number; uint16_t snd_count; struct mbuf *data; uint16_t len; }; /* used to save ASCONF-ACK chunks for retransmission */ TAILQ_HEAD(sctp_asconf_ackhead, sctp_asconf_ack); struct sctp_asconf_ack { TAILQ_ENTRY(sctp_asconf_ack) next; uint32_t serial_number; struct sctp_nets *last_sent_to; struct mbuf *data; uint16_t len; }; /* * Here we have information about each individual association that we track. * We probably in production would be more dynamic. But for ease of * implementation we will have a fixed array that we hunt for in a linear * fashion. */ struct sctp_association { /* association state */ int state; /* queue of pending addrs to add/delete */ struct sctp_asconf_addrhead asconf_queue; struct timeval time_entered; /* time we entered state */ struct timeval time_last_rcvd; struct timeval time_last_sent; struct timeval time_last_sat_advance; struct sctp_nonpad_sndrcvinfo def_send; /* timers and such */ struct sctp_timer dack_timer; /* Delayed ack timer */ struct sctp_timer asconf_timer; /* asconf */ struct sctp_timer strreset_timer; /* stream reset */ struct sctp_timer shut_guard_timer; /* shutdown guard */ struct sctp_timer autoclose_timer; /* automatic close timer */ struct sctp_timer delete_prim_timer; /* deleting primary dst */ /* list of restricted local addresses */ struct sctpladdr sctp_restricted_addrs; /* last local address pending deletion (waiting for an address add) */ struct sctp_ifa *asconf_addr_del_pending; /* Deleted primary destination (used to stop timer) */ struct sctp_nets *deleted_primary; struct sctpnetlisthead nets; /* remote address list */ /* Free chunk list */ struct sctpchunk_listhead free_chunks; /* Control chunk queue */ struct sctpchunk_listhead control_send_queue; /* ASCONF chunk queue */ struct sctpchunk_listhead asconf_send_queue; /* * Once a TSN hits the wire it is moved to the sent_queue. We * maintain two counts here (don't know if any but retran_cnt is * needed). The idea is that the sent_queue_retran_cnt reflects how * many chunks have been marked for retranmission by either T3-rxt * or FR. */ struct sctpchunk_listhead sent_queue; struct sctpchunk_listhead send_queue; /* Scheduling queues */ struct scheduling_data ss_data; /* If an iterator is looking at me, this is it */ struct sctp_iterator *stcb_starting_point_for_iterator; /* ASCONF save the last ASCONF-ACK so we can resend it if necessary */ struct sctp_asconf_ackhead asconf_ack_sent; /* * pointer to last stream reset queued to control queue by us with * requests. */ struct sctp_tmit_chunk *str_reset; /* * if Source Address Selection happening, this will rotate through * the link list. */ struct sctp_laddr *last_used_address; /* stream arrays */ struct sctp_stream_in *strmin; struct sctp_stream_out *strmout; uint8_t *mapping_array; /* primary destination to use */ struct sctp_nets *primary_destination; struct sctp_nets *alternate; /* If primary is down or PF */ /* For CMT */ struct sctp_nets *last_net_cmt_send_started; /* last place I got a data chunk from */ struct sctp_nets *last_data_chunk_from; /* last place I got a control from */ struct sctp_nets *last_control_chunk_from; /* * wait to the point the cum-ack passes req->send_reset_at_tsn for * any req on the list. */ struct sctp_resethead resetHead; /* queue of chunks waiting to be sent into the local stack */ struct sctp_readhead pending_reply_queue; /* JRS - the congestion control functions are in this struct */ struct sctp_cc_functions cc_functions; /* * JRS - value to store the currently loaded congestion control * module */ uint32_t congestion_control_module; /* RS - the stream scheduling functions are in this struct */ struct sctp_ss_functions ss_functions; /* RS - value to store the currently loaded stream scheduling module */ uint32_t stream_scheduling_module; uint32_t vrf_id; uint32_t cookie_preserve_req; /* ASCONF next seq I am sending out, inits at init-tsn */ uint32_t asconf_seq_out; uint32_t asconf_seq_out_acked; /* ASCONF last received ASCONF from peer, starts at peer's TSN-1 */ uint32_t asconf_seq_in; /* next seq I am sending in str reset messages */ uint32_t str_reset_seq_out; /* next seq I am expecting in str reset messages */ uint32_t str_reset_seq_in; /* various verification tag information */ uint32_t my_vtag; /* The tag to be used. if assoc is re-initited * by remote end, and I have unlocked this * will be regenerated to a new random value. */ uint32_t peer_vtag; /* The peers last tag */ uint32_t my_vtag_nonce; uint32_t peer_vtag_nonce; uint32_t assoc_id; /* This is the SCTP fragmentation threshold */ uint32_t smallest_mtu; /* * Special hook for Fast retransmit, allows us to track the highest * TSN that is NEW in this SACK if gap ack blocks are present. */ uint32_t this_sack_highest_gap; /* * The highest consecutive TSN that has been acked by peer on my * sends */ uint32_t last_acked_seq; /* The next TSN that I will use in sending. */ uint32_t sending_seq; /* Original seq number I used ??questionable to keep?? */ uint32_t init_seq_number; /* The Advanced Peer Ack Point, as required by the PR-SCTP */ /* (A1 in Section 4.2) */ uint32_t advanced_peer_ack_point; /* * The highest consequetive TSN at the bottom of the mapping array * (for his sends). */ uint32_t cumulative_tsn; /* * Used to track the mapping array and its offset bits. This MAY be * lower then cumulative_tsn. */ uint32_t mapping_array_base_tsn; /* * used to track highest TSN we have received and is listed in the * mapping array. */ uint32_t highest_tsn_inside_map; /* EY - new NR variables used for nr_sack based on mapping_array */ uint8_t *nr_mapping_array; uint32_t highest_tsn_inside_nr_map; uint32_t fast_recovery_tsn; uint32_t sat_t3_recovery_tsn; uint32_t tsn_last_delivered; /* * For the pd-api we should re-write this a bit more efficient. We * could have multiple sctp_queued_to_read's that we are building at * once. Now we only do this when we get ready to deliver to the * socket buffer. Note that we depend on the fact that the struct is * "stuck" on the read queue until we finish all the pd-api. */ struct sctp_queued_to_read *control_pdapi; uint32_t tsn_of_pdapi_last_delivered; uint32_t pdapi_ppid; uint32_t context; uint32_t last_reset_action[SCTP_MAX_RESET_PARAMS]; uint32_t last_sending_seq[SCTP_MAX_RESET_PARAMS]; uint32_t last_base_tsnsent[SCTP_MAX_RESET_PARAMS]; #ifdef SCTP_ASOCLOG_OF_TSNS /* * special log - This adds considerable size to the asoc, but * provides a log that you can use to detect problems via kgdb. */ struct sctp_tsn_log in_tsnlog[SCTP_TSN_LOG_SIZE]; struct sctp_tsn_log out_tsnlog[SCTP_TSN_LOG_SIZE]; uint32_t cumack_log[SCTP_TSN_LOG_SIZE]; uint32_t cumack_logsnt[SCTP_TSN_LOG_SIZE]; uint16_t tsn_in_at; uint16_t tsn_out_at; uint16_t tsn_in_wrapped; uint16_t tsn_out_wrapped; uint16_t cumack_log_at; uint16_t cumack_log_atsnt; #endif /* SCTP_ASOCLOG_OF_TSNS */ #ifdef SCTP_FS_SPEC_LOG struct sctp_fs_spec_log fslog[SCTP_FS_SPEC_LOG_SIZE]; uint16_t fs_index; #endif /* * window state information and smallest MTU that I use to bound * segmentation */ uint32_t peers_rwnd; uint32_t my_rwnd; uint32_t my_last_reported_rwnd; uint32_t sctp_frag_point; uint32_t total_output_queue_size; uint32_t sb_cc; /* shadow of sb_cc */ uint32_t sb_send_resv; /* amount reserved on a send */ uint32_t my_rwnd_control_len; /* shadow of sb_mbcnt used for rwnd * control */ #ifdef INET6 uint32_t default_flowlabel; #endif uint32_t pr_sctp_cnt; int ctrl_queue_cnt; /* could be removed REM - NO IT CAN'T!! RRS */ /* * All outbound datagrams queue into this list from the individual * stream queue. Here they get assigned a TSN and then await * sending. The stream seq comes when it is first put in the * individual str queue */ unsigned int stream_queue_cnt; unsigned int send_queue_cnt; unsigned int sent_queue_cnt; unsigned int sent_queue_cnt_removeable; /* * Number on sent queue that are marked for retran until this value * is 0 we only send one packet of retran'ed data. */ unsigned int sent_queue_retran_cnt; unsigned int size_on_reasm_queue; unsigned int cnt_on_reasm_queue; unsigned int fwd_tsn_cnt; /* amount of data (bytes) currently in flight (on all destinations) */ unsigned int total_flight; /* Total book size in flight */ unsigned int total_flight_count; /* count of chunks used with * book total */ /* count of destinaton nets and list of destination nets */ unsigned int numnets; /* Total error count on this association */ unsigned int overall_error_count; unsigned int cnt_msg_on_sb; /* All stream count of chunks for delivery */ unsigned int size_on_all_streams; unsigned int cnt_on_all_streams; /* Heart Beat delay in ms */ uint32_t heart_beat_delay; /* autoclose */ uint32_t sctp_autoclose_ticks; /* how many preopen streams we have */ unsigned int pre_open_streams; /* How many streams I support coming into me */ unsigned int max_inbound_streams; /* the cookie life I award for any cookie, in seconds */ uint32_t cookie_life; /* time to delay acks for */ unsigned int delayed_ack; unsigned int old_delayed_ack; unsigned int sack_freq; unsigned int data_pkts_seen; unsigned int numduptsns; int dup_tsns[SCTP_MAX_DUP_TSNS]; uint32_t initial_init_rto_max; /* initial RTO for INIT's */ uint32_t initial_rto; /* initial send RTO */ uint32_t minrto; /* per assoc RTO-MIN */ uint32_t maxrto; /* per assoc RTO-MAX */ /* authentication fields */ sctp_auth_chklist_t *local_auth_chunks; sctp_auth_chklist_t *peer_auth_chunks; sctp_hmaclist_t *local_hmacs; /* local HMACs supported */ sctp_hmaclist_t *peer_hmacs; /* peer HMACs supported */ struct sctp_keyhead shared_keys; /* assoc's shared keys */ sctp_authinfo_t authinfo; /* randoms, cached keys */ /* * refcnt to block freeing when a sender or receiver is off coping * user data in. */ uint32_t refcnt; uint32_t chunks_on_out_queue; /* total chunks floating around, * locked by send socket buffer */ uint32_t peers_adaptation; uint32_t default_mtu; uint16_t peer_hmac_id; /* peer HMAC id to send */ /* * Being that we have no bag to collect stale cookies, and that we * really would not want to anyway.. we will count them in this * counter. We of course feed them to the pigeons right away (I have * always thought of pigeons as flying rats). */ uint16_t stale_cookie_count; /* * For the partial delivery API, if up, invoked this is what last * TSN I delivered */ uint16_t str_of_pdapi; uint16_t ssn_of_pdapi; /* counts of actual built streams. Allocation may be more however */ /* could re-arrange to optimize space here. */ uint16_t streamincnt; uint16_t streamoutcnt; uint16_t strm_realoutsize; uint16_t strm_pending_add_size; /* my maximum number of retrans of INIT and SEND */ /* copied from SCTP but should be individually setable */ uint16_t max_init_times; uint16_t max_send_times; uint16_t def_net_failure; uint16_t def_net_pf_threshold; /* * lock flag: 0 is ok to send, 1+ (duals as a retran count) is * awaiting ACK */ uint16_t mapping_array_size; uint16_t last_strm_seq_delivered; uint16_t last_strm_no_delivered; uint16_t last_revoke_count; int16_t num_send_timers_up; uint16_t stream_locked_on; uint16_t ecn_echo_cnt_onq; uint16_t free_chunk_cnt; uint8_t stream_locked; uint8_t authenticated; /* packet authenticated ok */ /* * This flag indicates that a SACK need to be sent. Initially this * is 1 to send the first sACK immediately. */ uint8_t send_sack; /* max burst of new packets into the network */ uint32_t max_burst; /* max burst of fast retransmit packets */ uint32_t fr_max_burst; uint8_t sat_network; /* RTT is in range of sat net or greater */ uint8_t sat_network_lockout; /* lockout code */ uint8_t burst_limit_applied; /* Burst limit in effect at last send? */ /* flag goes on when we are doing a partial delivery api */ uint8_t hb_random_values[4]; uint8_t fragmented_delivery_inprogress; uint8_t fragment_flags; uint8_t last_flags_delivered; uint8_t hb_ect_randombit; uint8_t hb_random_idx; uint8_t default_dscp; uint8_t asconf_del_pending; /* asconf delete last addr pending */ uint8_t trigger_reset; /* * This value, plus all other ack'd but above cum-ack is added * together to cross check against the bit that we have yet to * define (probably in the SACK). When the cum-ack is updated, this * sum is updated as well. */ /* Flags whether an extension is supported or not */ uint8_t ecn_supported; uint8_t prsctp_supported; uint8_t auth_supported; uint8_t asconf_supported; uint8_t reconfig_supported; uint8_t nrsack_supported; uint8_t pktdrop_supported; uint8_t idata_supported; /* Did the peer make the stream config (add out) request */ uint8_t peer_req_out; uint8_t local_strreset_support; uint8_t peer_supports_nat; struct sctp_scoping scope; /* flags to handle send alternate net tracking */ uint8_t used_alt_asconfack; uint8_t fast_retran_loss_recovery; uint8_t sat_t3_loss_recovery; uint8_t dropped_special_cnt; uint8_t seen_a_sack_this_pkt; uint8_t stream_reset_outstanding; uint8_t stream_reset_out_is_outstanding; uint8_t delayed_connection; uint8_t ifp_had_enobuf; uint8_t saw_sack_with_frags; uint8_t saw_sack_with_nr_frags; uint8_t in_asocid_hash; uint8_t assoc_up_sent; uint8_t adaptation_needed; uint8_t adaptation_sent; /* CMT variables */ uint8_t cmt_dac_pkts_rcvd; uint8_t sctp_cmt_on_off; uint8_t iam_blocking; uint8_t cookie_how[8]; /* JRS 5/21/07 - CMT PF variable */ uint8_t sctp_cmt_pf; uint8_t use_precise_time; uint64_t sctp_features; uint32_t max_cwnd; uint16_t port; /* remote UDP encapsulation port */ /* * The mapping array is used to track out of order sequences above * last_acked_seq. 0 indicates packet missing 1 indicates packet * rec'd. We slide it up every time we raise last_acked_seq and 0 * trailing locactions out. If I get a TSN above the array * mappingArraySz, I discard the datagram and let retransmit happen. */ uint32_t marked_retrans; uint32_t timoinit; uint32_t timodata; uint32_t timosack; uint32_t timoshutdown; uint32_t timoheartbeat; uint32_t timocookie; uint32_t timoshutdownack; struct timeval start_time; struct timeval discontinuity_time; uint64_t abandoned_unsent[SCTP_PR_SCTP_MAX + 1]; uint64_t abandoned_sent[SCTP_PR_SCTP_MAX + 1]; }; #endif Index: projects/clang1100-import/sys/netinet/sctputil.c =================================================================== --- projects/clang1100-import/sys/netinet/sctputil.c (revision 364278) +++ projects/clang1100-import/sys/netinet/sctputil.c (revision 364279) @@ -1,7675 +1,7668 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #ifdef INET6 #include #endif #include #include #include #include #include #include #include #include #include #if defined(INET6) || defined(INET) #include #endif #include #include #include #ifdef INET6 #include #endif #ifndef KTR_SCTP #define KTR_SCTP KTR_SUBSYS #endif extern const struct sctp_cc_functions sctp_cc_functions[]; extern const struct sctp_ss_functions sctp_ss_functions[]; void sctp_sblog(struct sockbuf *sb, struct sctp_tcb *stcb, int from, int incr) { #if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; sctp_clog.x.sb.stcb = stcb; sctp_clog.x.sb.so_sbcc = sb->sb_cc; if (stcb) sctp_clog.x.sb.stcb_sbcc = stcb->asoc.sb_cc; else sctp_clog.x.sb.stcb_sbcc = 0; sctp_clog.x.sb.incr = incr; SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", SCTP_LOG_EVENT_SB, from, sctp_clog.x.misc.log1, sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); #endif } void sctp_log_closing(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int16_t loc) { #if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; sctp_clog.x.close.inp = (void *)inp; sctp_clog.x.close.sctp_flags = inp->sctp_flags; if (stcb) { sctp_clog.x.close.stcb = (void *)stcb; sctp_clog.x.close.state = (uint16_t)stcb->asoc.state; } else { sctp_clog.x.close.stcb = 0; sctp_clog.x.close.state = 0; } sctp_clog.x.close.loc = loc; SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", SCTP_LOG_EVENT_CLOSE, 0, sctp_clog.x.misc.log1, sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); #endif } void rto_logging(struct sctp_nets *net, int from) { #if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; memset(&sctp_clog, 0, sizeof(sctp_clog)); sctp_clog.x.rto.net = (void *)net; sctp_clog.x.rto.rtt = net->rtt / 1000; SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", SCTP_LOG_EVENT_RTT, from, sctp_clog.x.misc.log1, sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); #endif } void sctp_log_strm_del_alt(struct sctp_tcb *stcb, uint32_t tsn, uint16_t sseq, uint16_t stream, int from) { #if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; sctp_clog.x.strlog.stcb = stcb; sctp_clog.x.strlog.n_tsn = tsn; sctp_clog.x.strlog.n_sseq = sseq; sctp_clog.x.strlog.e_tsn = 0; sctp_clog.x.strlog.e_sseq = 0; sctp_clog.x.strlog.strm = stream; SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", SCTP_LOG_EVENT_STRM, from, sctp_clog.x.misc.log1, sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); #endif } void sctp_log_nagle_event(struct sctp_tcb *stcb, int action) { #if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; sctp_clog.x.nagle.stcb = (void *)stcb; sctp_clog.x.nagle.total_flight = stcb->asoc.total_flight; sctp_clog.x.nagle.total_in_queue = stcb->asoc.total_output_queue_size; sctp_clog.x.nagle.count_in_queue = stcb->asoc.chunks_on_out_queue; sctp_clog.x.nagle.count_in_flight = stcb->asoc.total_flight_count; SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", SCTP_LOG_EVENT_NAGLE, action, sctp_clog.x.misc.log1, sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); #endif } void sctp_log_sack(uint32_t old_cumack, uint32_t cumack, uint32_t tsn, uint16_t gaps, uint16_t dups, int from) { #if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; sctp_clog.x.sack.cumack = cumack; sctp_clog.x.sack.oldcumack = old_cumack; sctp_clog.x.sack.tsn = tsn; sctp_clog.x.sack.numGaps = gaps; sctp_clog.x.sack.numDups = dups; SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", SCTP_LOG_EVENT_SACK, from, sctp_clog.x.misc.log1, sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); #endif } void sctp_log_map(uint32_t map, uint32_t cum, uint32_t high, int from) { #if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; memset(&sctp_clog, 0, sizeof(sctp_clog)); sctp_clog.x.map.base = map; sctp_clog.x.map.cum = cum; sctp_clog.x.map.high = high; SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", SCTP_LOG_EVENT_MAP, from, sctp_clog.x.misc.log1, sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); #endif } void sctp_log_fr(uint32_t biggest_tsn, uint32_t biggest_new_tsn, uint32_t tsn, int from) { #if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; memset(&sctp_clog, 0, sizeof(sctp_clog)); sctp_clog.x.fr.largest_tsn = biggest_tsn; sctp_clog.x.fr.largest_new_tsn = biggest_new_tsn; sctp_clog.x.fr.tsn = tsn; SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", SCTP_LOG_EVENT_FR, from, sctp_clog.x.misc.log1, sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); #endif } #ifdef SCTP_MBUF_LOGGING void sctp_log_mb(struct mbuf *m, int from) { #if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; sctp_clog.x.mb.mp = m; sctp_clog.x.mb.mbuf_flags = (uint8_t)(SCTP_BUF_GET_FLAGS(m)); sctp_clog.x.mb.size = (uint16_t)(SCTP_BUF_LEN(m)); sctp_clog.x.mb.data = SCTP_BUF_AT(m, 0); if (SCTP_BUF_IS_EXTENDED(m)) { sctp_clog.x.mb.ext = SCTP_BUF_EXTEND_BASE(m); sctp_clog.x.mb.refcnt = (uint8_t)(SCTP_BUF_EXTEND_REFCNT(m)); } else { sctp_clog.x.mb.ext = 0; sctp_clog.x.mb.refcnt = 0; } SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", SCTP_LOG_EVENT_MBUF, from, sctp_clog.x.misc.log1, sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); #endif } void sctp_log_mbc(struct mbuf *m, int from) { struct mbuf *mat; for (mat = m; mat; mat = SCTP_BUF_NEXT(mat)) { sctp_log_mb(mat, from); } } #endif void sctp_log_strm_del(struct sctp_queued_to_read *control, struct sctp_queued_to_read *poschk, int from) { #if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; if (control == NULL) { SCTP_PRINTF("Gak log of NULL?\n"); return; } sctp_clog.x.strlog.stcb = control->stcb; sctp_clog.x.strlog.n_tsn = control->sinfo_tsn; sctp_clog.x.strlog.n_sseq = (uint16_t)control->mid; sctp_clog.x.strlog.strm = control->sinfo_stream; if (poschk != NULL) { sctp_clog.x.strlog.e_tsn = poschk->sinfo_tsn; sctp_clog.x.strlog.e_sseq = (uint16_t)poschk->mid; } else { sctp_clog.x.strlog.e_tsn = 0; sctp_clog.x.strlog.e_sseq = 0; } SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", SCTP_LOG_EVENT_STRM, from, sctp_clog.x.misc.log1, sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); #endif } void sctp_log_cwnd(struct sctp_tcb *stcb, struct sctp_nets *net, int augment, uint8_t from) { #if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; sctp_clog.x.cwnd.net = net; if (stcb->asoc.send_queue_cnt > 255) sctp_clog.x.cwnd.cnt_in_send = 255; else sctp_clog.x.cwnd.cnt_in_send = stcb->asoc.send_queue_cnt; if (stcb->asoc.stream_queue_cnt > 255) sctp_clog.x.cwnd.cnt_in_str = 255; else sctp_clog.x.cwnd.cnt_in_str = stcb->asoc.stream_queue_cnt; if (net) { sctp_clog.x.cwnd.cwnd_new_value = net->cwnd; sctp_clog.x.cwnd.inflight = net->flight_size; sctp_clog.x.cwnd.pseudo_cumack = net->pseudo_cumack; sctp_clog.x.cwnd.meets_pseudo_cumack = net->new_pseudo_cumack; sctp_clog.x.cwnd.need_new_pseudo_cumack = net->find_pseudo_cumack; } if (SCTP_CWNDLOG_PRESEND == from) { sctp_clog.x.cwnd.meets_pseudo_cumack = stcb->asoc.peers_rwnd; } sctp_clog.x.cwnd.cwnd_augment = augment; SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", SCTP_LOG_EVENT_CWND, from, sctp_clog.x.misc.log1, sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); #endif } void sctp_log_lock(struct sctp_inpcb *inp, struct sctp_tcb *stcb, uint8_t from) { #if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; memset(&sctp_clog, 0, sizeof(sctp_clog)); if (inp) { sctp_clog.x.lock.sock = (void *)inp->sctp_socket; } else { sctp_clog.x.lock.sock = (void *)NULL; } sctp_clog.x.lock.inp = (void *)inp; if (stcb) { sctp_clog.x.lock.tcb_lock = mtx_owned(&stcb->tcb_mtx); } else { sctp_clog.x.lock.tcb_lock = SCTP_LOCK_UNKNOWN; } if (inp) { sctp_clog.x.lock.inp_lock = mtx_owned(&inp->inp_mtx); sctp_clog.x.lock.create_lock = mtx_owned(&inp->inp_create_mtx); } else { sctp_clog.x.lock.inp_lock = SCTP_LOCK_UNKNOWN; sctp_clog.x.lock.create_lock = SCTP_LOCK_UNKNOWN; } sctp_clog.x.lock.info_lock = rw_wowned(&SCTP_BASE_INFO(ipi_ep_mtx)); if (inp && (inp->sctp_socket)) { sctp_clog.x.lock.sock_lock = mtx_owned(&(inp->sctp_socket->so_rcv.sb_mtx)); sctp_clog.x.lock.sockrcvbuf_lock = mtx_owned(&(inp->sctp_socket->so_rcv.sb_mtx)); sctp_clog.x.lock.socksndbuf_lock = mtx_owned(&(inp->sctp_socket->so_snd.sb_mtx)); } else { sctp_clog.x.lock.sock_lock = SCTP_LOCK_UNKNOWN; sctp_clog.x.lock.sockrcvbuf_lock = SCTP_LOCK_UNKNOWN; sctp_clog.x.lock.socksndbuf_lock = SCTP_LOCK_UNKNOWN; } SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", SCTP_LOG_LOCK_EVENT, from, sctp_clog.x.misc.log1, sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); #endif } void sctp_log_maxburst(struct sctp_tcb *stcb, struct sctp_nets *net, int error, int burst, uint8_t from) { #if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; memset(&sctp_clog, 0, sizeof(sctp_clog)); sctp_clog.x.cwnd.net = net; sctp_clog.x.cwnd.cwnd_new_value = error; sctp_clog.x.cwnd.inflight = net->flight_size; sctp_clog.x.cwnd.cwnd_augment = burst; if (stcb->asoc.send_queue_cnt > 255) sctp_clog.x.cwnd.cnt_in_send = 255; else sctp_clog.x.cwnd.cnt_in_send = stcb->asoc.send_queue_cnt; if (stcb->asoc.stream_queue_cnt > 255) sctp_clog.x.cwnd.cnt_in_str = 255; else sctp_clog.x.cwnd.cnt_in_str = stcb->asoc.stream_queue_cnt; SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", SCTP_LOG_EVENT_MAXBURST, from, sctp_clog.x.misc.log1, sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); #endif } void sctp_log_rwnd(uint8_t from, uint32_t peers_rwnd, uint32_t snd_size, uint32_t overhead) { #if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; sctp_clog.x.rwnd.rwnd = peers_rwnd; sctp_clog.x.rwnd.send_size = snd_size; sctp_clog.x.rwnd.overhead = overhead; sctp_clog.x.rwnd.new_rwnd = 0; SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", SCTP_LOG_EVENT_RWND, from, sctp_clog.x.misc.log1, sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); #endif } void sctp_log_rwnd_set(uint8_t from, uint32_t peers_rwnd, uint32_t flight_size, uint32_t overhead, uint32_t a_rwndval) { #if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; sctp_clog.x.rwnd.rwnd = peers_rwnd; sctp_clog.x.rwnd.send_size = flight_size; sctp_clog.x.rwnd.overhead = overhead; sctp_clog.x.rwnd.new_rwnd = a_rwndval; SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", SCTP_LOG_EVENT_RWND, from, sctp_clog.x.misc.log1, sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); #endif } #ifdef SCTP_MBCNT_LOGGING static void sctp_log_mbcnt(uint8_t from, uint32_t total_oq, uint32_t book, uint32_t total_mbcnt_q, uint32_t mbcnt) { #if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; sctp_clog.x.mbcnt.total_queue_size = total_oq; sctp_clog.x.mbcnt.size_change = book; sctp_clog.x.mbcnt.total_queue_mb_size = total_mbcnt_q; sctp_clog.x.mbcnt.mbcnt_change = mbcnt; SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", SCTP_LOG_EVENT_MBCNT, from, sctp_clog.x.misc.log1, sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); #endif } #endif void sctp_misc_ints(uint8_t from, uint32_t a, uint32_t b, uint32_t c, uint32_t d) { #if defined(SCTP_LOCAL_TRACE_BUF) SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", SCTP_LOG_MISC_EVENT, from, a, b, c, d); #endif } void sctp_wakeup_log(struct sctp_tcb *stcb, uint32_t wake_cnt, int from) { #if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; sctp_clog.x.wake.stcb = (void *)stcb; sctp_clog.x.wake.wake_cnt = wake_cnt; sctp_clog.x.wake.flight = stcb->asoc.total_flight_count; sctp_clog.x.wake.send_q = stcb->asoc.send_queue_cnt; sctp_clog.x.wake.sent_q = stcb->asoc.sent_queue_cnt; if (stcb->asoc.stream_queue_cnt < 0xff) sctp_clog.x.wake.stream_qcnt = (uint8_t)stcb->asoc.stream_queue_cnt; else sctp_clog.x.wake.stream_qcnt = 0xff; if (stcb->asoc.chunks_on_out_queue < 0xff) sctp_clog.x.wake.chunks_on_oque = (uint8_t)stcb->asoc.chunks_on_out_queue; else sctp_clog.x.wake.chunks_on_oque = 0xff; sctp_clog.x.wake.sctpflags = 0; /* set in the defered mode stuff */ if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_DONT_WAKE) sctp_clog.x.wake.sctpflags |= 1; if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_WAKEOUTPUT) sctp_clog.x.wake.sctpflags |= 2; if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_WAKEINPUT) sctp_clog.x.wake.sctpflags |= 4; /* what about the sb */ if (stcb->sctp_socket) { struct socket *so = stcb->sctp_socket; sctp_clog.x.wake.sbflags = (uint8_t)((so->so_snd.sb_flags & 0x00ff)); } else { sctp_clog.x.wake.sbflags = 0xff; } SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", SCTP_LOG_EVENT_WAKE, from, sctp_clog.x.misc.log1, sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); #endif } void sctp_log_block(uint8_t from, struct sctp_association *asoc, ssize_t sendlen) { #if defined(SCTP_LOCAL_TRACE_BUF) struct sctp_cwnd_log sctp_clog; sctp_clog.x.blk.onsb = asoc->total_output_queue_size; sctp_clog.x.blk.send_sent_qcnt = (uint16_t)(asoc->send_queue_cnt + asoc->sent_queue_cnt); sctp_clog.x.blk.peer_rwnd = asoc->peers_rwnd; sctp_clog.x.blk.stream_qcnt = (uint16_t)asoc->stream_queue_cnt; sctp_clog.x.blk.chunks_on_oque = (uint16_t)asoc->chunks_on_out_queue; sctp_clog.x.blk.flight_size = (uint16_t)(asoc->total_flight / 1024); sctp_clog.x.blk.sndlen = (uint32_t)sendlen; SCTP_CTR6(KTR_SCTP, "SCTP:%d[%d]:%x-%x-%x-%x", SCTP_LOG_EVENT_BLOCK, from, sctp_clog.x.misc.log1, sctp_clog.x.misc.log2, sctp_clog.x.misc.log3, sctp_clog.x.misc.log4); #endif } int sctp_fill_stat_log(void *optval SCTP_UNUSED, size_t *optsize SCTP_UNUSED) { /* May need to fix this if ktrdump does not work */ return (0); } #ifdef SCTP_AUDITING_ENABLED uint8_t sctp_audit_data[SCTP_AUDIT_SIZE][2]; static int sctp_audit_indx = 0; static void sctp_print_audit_report(void) { int i; int cnt; cnt = 0; for (i = sctp_audit_indx; i < SCTP_AUDIT_SIZE; i++) { if ((sctp_audit_data[i][0] == 0xe0) && (sctp_audit_data[i][1] == 0x01)) { cnt = 0; SCTP_PRINTF("\n"); } else if (sctp_audit_data[i][0] == 0xf0) { cnt = 0; SCTP_PRINTF("\n"); } else if ((sctp_audit_data[i][0] == 0xc0) && (sctp_audit_data[i][1] == 0x01)) { SCTP_PRINTF("\n"); cnt = 0; } SCTP_PRINTF("%2.2x%2.2x ", (uint32_t)sctp_audit_data[i][0], (uint32_t)sctp_audit_data[i][1]); cnt++; if ((cnt % 14) == 0) SCTP_PRINTF("\n"); } for (i = 0; i < sctp_audit_indx; i++) { if ((sctp_audit_data[i][0] == 0xe0) && (sctp_audit_data[i][1] == 0x01)) { cnt = 0; SCTP_PRINTF("\n"); } else if (sctp_audit_data[i][0] == 0xf0) { cnt = 0; SCTP_PRINTF("\n"); } else if ((sctp_audit_data[i][0] == 0xc0) && (sctp_audit_data[i][1] == 0x01)) { SCTP_PRINTF("\n"); cnt = 0; } SCTP_PRINTF("%2.2x%2.2x ", (uint32_t)sctp_audit_data[i][0], (uint32_t)sctp_audit_data[i][1]); cnt++; if ((cnt % 14) == 0) SCTP_PRINTF("\n"); } SCTP_PRINTF("\n"); } void sctp_auditing(int from, struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets *net) { int resend_cnt, tot_out, rep, tot_book_cnt; struct sctp_nets *lnet; struct sctp_tmit_chunk *chk; sctp_audit_data[sctp_audit_indx][0] = 0xAA; sctp_audit_data[sctp_audit_indx][1] = 0x000000ff & from; sctp_audit_indx++; if (sctp_audit_indx >= SCTP_AUDIT_SIZE) { sctp_audit_indx = 0; } if (inp == NULL) { sctp_audit_data[sctp_audit_indx][0] = 0xAF; sctp_audit_data[sctp_audit_indx][1] = 0x01; sctp_audit_indx++; if (sctp_audit_indx >= SCTP_AUDIT_SIZE) { sctp_audit_indx = 0; } return; } if (stcb == NULL) { sctp_audit_data[sctp_audit_indx][0] = 0xAF; sctp_audit_data[sctp_audit_indx][1] = 0x02; sctp_audit_indx++; if (sctp_audit_indx >= SCTP_AUDIT_SIZE) { sctp_audit_indx = 0; } return; } sctp_audit_data[sctp_audit_indx][0] = 0xA1; sctp_audit_data[sctp_audit_indx][1] = (0x000000ff & stcb->asoc.sent_queue_retran_cnt); sctp_audit_indx++; if (sctp_audit_indx >= SCTP_AUDIT_SIZE) { sctp_audit_indx = 0; } rep = 0; tot_book_cnt = 0; resend_cnt = tot_out = 0; TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) { if (chk->sent == SCTP_DATAGRAM_RESEND) { resend_cnt++; } else if (chk->sent < SCTP_DATAGRAM_RESEND) { tot_out += chk->book_size; tot_book_cnt++; } } if (resend_cnt != stcb->asoc.sent_queue_retran_cnt) { sctp_audit_data[sctp_audit_indx][0] = 0xAF; sctp_audit_data[sctp_audit_indx][1] = 0xA1; sctp_audit_indx++; if (sctp_audit_indx >= SCTP_AUDIT_SIZE) { sctp_audit_indx = 0; } SCTP_PRINTF("resend_cnt:%d asoc-tot:%d\n", resend_cnt, stcb->asoc.sent_queue_retran_cnt); rep = 1; stcb->asoc.sent_queue_retran_cnt = resend_cnt; sctp_audit_data[sctp_audit_indx][0] = 0xA2; sctp_audit_data[sctp_audit_indx][1] = (0x000000ff & stcb->asoc.sent_queue_retran_cnt); sctp_audit_indx++; if (sctp_audit_indx >= SCTP_AUDIT_SIZE) { sctp_audit_indx = 0; } } if (tot_out != stcb->asoc.total_flight) { sctp_audit_data[sctp_audit_indx][0] = 0xAF; sctp_audit_data[sctp_audit_indx][1] = 0xA2; sctp_audit_indx++; if (sctp_audit_indx >= SCTP_AUDIT_SIZE) { sctp_audit_indx = 0; } rep = 1; SCTP_PRINTF("tot_flt:%d asoc_tot:%d\n", tot_out, (int)stcb->asoc.total_flight); stcb->asoc.total_flight = tot_out; } if (tot_book_cnt != stcb->asoc.total_flight_count) { sctp_audit_data[sctp_audit_indx][0] = 0xAF; sctp_audit_data[sctp_audit_indx][1] = 0xA5; sctp_audit_indx++; if (sctp_audit_indx >= SCTP_AUDIT_SIZE) { sctp_audit_indx = 0; } rep = 1; SCTP_PRINTF("tot_flt_book:%d\n", tot_book_cnt); stcb->asoc.total_flight_count = tot_book_cnt; } tot_out = 0; TAILQ_FOREACH(lnet, &stcb->asoc.nets, sctp_next) { tot_out += lnet->flight_size; } if (tot_out != stcb->asoc.total_flight) { sctp_audit_data[sctp_audit_indx][0] = 0xAF; sctp_audit_data[sctp_audit_indx][1] = 0xA3; sctp_audit_indx++; if (sctp_audit_indx >= SCTP_AUDIT_SIZE) { sctp_audit_indx = 0; } rep = 1; SCTP_PRINTF("real flight:%d net total was %d\n", stcb->asoc.total_flight, tot_out); /* now corrective action */ TAILQ_FOREACH(lnet, &stcb->asoc.nets, sctp_next) { tot_out = 0; TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) { if ((chk->whoTo == lnet) && (chk->sent < SCTP_DATAGRAM_RESEND)) { tot_out += chk->book_size; } } if (lnet->flight_size != tot_out) { SCTP_PRINTF("net:%p flight was %d corrected to %d\n", (void *)lnet, lnet->flight_size, tot_out); lnet->flight_size = tot_out; } } } if (rep) { sctp_print_audit_report(); } } void sctp_audit_log(uint8_t ev, uint8_t fd) { sctp_audit_data[sctp_audit_indx][0] = ev; sctp_audit_data[sctp_audit_indx][1] = fd; sctp_audit_indx++; if (sctp_audit_indx >= SCTP_AUDIT_SIZE) { sctp_audit_indx = 0; } } #endif /* * The conversion from time to ticks and vice versa is done by rounding * upwards. This way we can test in the code the time to be positive and * know that this corresponds to a positive number of ticks. */ uint32_t sctp_msecs_to_ticks(uint32_t msecs) { uint64_t temp; uint32_t ticks; if (hz == 1000) { ticks = msecs; } else { temp = (((uint64_t)msecs * hz) + 999) / 1000; if (temp > UINT32_MAX) { ticks = UINT32_MAX; } else { ticks = (uint32_t)temp; } } return (ticks); } uint32_t sctp_ticks_to_msecs(uint32_t ticks) { uint64_t temp; uint32_t msecs; if (hz == 1000) { msecs = ticks; } else { temp = (((uint64_t)ticks * 1000) + (hz - 1)) / hz; if (temp > UINT32_MAX) { msecs = UINT32_MAX; } else { msecs = (uint32_t)temp; } } return (msecs); } uint32_t sctp_secs_to_ticks(uint32_t secs) { uint64_t temp; uint32_t ticks; temp = (uint64_t)secs * hz; if (temp > UINT32_MAX) { ticks = UINT32_MAX; } else { ticks = (uint32_t)temp; } return (ticks); } uint32_t sctp_ticks_to_secs(uint32_t ticks) { uint64_t temp; uint32_t secs; temp = ((uint64_t)ticks + (hz - 1)) / hz; if (temp > UINT32_MAX) { secs = UINT32_MAX; } else { secs = (uint32_t)temp; } return (secs); } /* * sctp_stop_timers_for_shutdown() should be called * when entering the SHUTDOWN_SENT or SHUTDOWN_ACK_SENT * state to make sure that all timers are stopped. */ void sctp_stop_timers_for_shutdown(struct sctp_tcb *stcb) { struct sctp_inpcb *inp; struct sctp_nets *net; inp = stcb->sctp_ep; sctp_timer_stop(SCTP_TIMER_TYPE_RECV, inp, stcb, NULL, SCTP_FROM_SCTPUTIL + SCTP_LOC_12); sctp_timer_stop(SCTP_TIMER_TYPE_STRRESET, inp, stcb, NULL, SCTP_FROM_SCTPUTIL + SCTP_LOC_13); sctp_timer_stop(SCTP_TIMER_TYPE_ASCONF, inp, stcb, NULL, SCTP_FROM_SCTPUTIL + SCTP_LOC_14); sctp_timer_stop(SCTP_TIMER_TYPE_AUTOCLOSE, inp, stcb, NULL, SCTP_FROM_SCTPUTIL + SCTP_LOC_15); TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net, SCTP_FROM_SCTPUTIL + SCTP_LOC_16); sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTPUTIL + SCTP_LOC_17); } } void sctp_stop_association_timers(struct sctp_tcb *stcb, bool stop_assoc_kill_timer) { struct sctp_inpcb *inp; struct sctp_nets *net; inp = stcb->sctp_ep; sctp_timer_stop(SCTP_TIMER_TYPE_RECV, inp, stcb, NULL, SCTP_FROM_SCTPUTIL + SCTP_LOC_18); sctp_timer_stop(SCTP_TIMER_TYPE_STRRESET, inp, stcb, NULL, SCTP_FROM_SCTPUTIL + SCTP_LOC_19); if (stop_assoc_kill_timer) { sctp_timer_stop(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL, SCTP_FROM_SCTPUTIL + SCTP_LOC_20); } sctp_timer_stop(SCTP_TIMER_TYPE_ASCONF, inp, stcb, NULL, SCTP_FROM_SCTPUTIL + SCTP_LOC_21); sctp_timer_stop(SCTP_TIMER_TYPE_AUTOCLOSE, inp, stcb, NULL, SCTP_FROM_SCTPUTIL + SCTP_LOC_22); sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWNGUARD, inp, stcb, NULL, SCTP_FROM_SCTPUTIL + SCTP_LOC_23); /* Mobility adaptation */ sctp_timer_stop(SCTP_TIMER_TYPE_PRIM_DELETED, inp, stcb, NULL, SCTP_FROM_SCTPUTIL + SCTP_LOC_24); TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) { sctp_timer_stop(SCTP_TIMER_TYPE_SEND, inp, stcb, net, SCTP_FROM_SCTPUTIL + SCTP_LOC_25); sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb, net, SCTP_FROM_SCTPUTIL + SCTP_LOC_26); sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWN, inp, stcb, net, SCTP_FROM_SCTPUTIL + SCTP_LOC_27); sctp_timer_stop(SCTP_TIMER_TYPE_COOKIE, inp, stcb, net, SCTP_FROM_SCTPUTIL + SCTP_LOC_28); sctp_timer_stop(SCTP_TIMER_TYPE_SHUTDOWNACK, inp, stcb, net, SCTP_FROM_SCTPUTIL + SCTP_LOC_29); sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net, SCTP_FROM_SCTPUTIL + SCTP_LOC_30); sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net, SCTP_FROM_SCTPUTIL + SCTP_LOC_31); } } /* * A list of sizes based on typical mtu's, used only if next hop size not * returned. These values MUST be multiples of 4 and MUST be ordered. */ static uint32_t sctp_mtu_sizes[] = { 68, 296, 508, 512, 544, 576, 1004, 1492, 1500, 1536, 2000, 2048, 4352, 4464, 8168, 17912, 32000, 65532 }; /* * Return the largest MTU in sctp_mtu_sizes smaller than val. * If val is smaller than the minimum, just return the largest * multiple of 4 smaller or equal to val. * Ensure that the result is a multiple of 4. */ uint32_t sctp_get_prev_mtu(uint32_t val) { uint32_t i; val &= 0xfffffffc; if (val <= sctp_mtu_sizes[0]) { return (val); } for (i = 1; i < (sizeof(sctp_mtu_sizes) / sizeof(uint32_t)); i++) { if (val <= sctp_mtu_sizes[i]) { break; } } KASSERT((sctp_mtu_sizes[i - 1] & 0x00000003) == 0, ("sctp_mtu_sizes[%u] not a multiple of 4", i - 1)); return (sctp_mtu_sizes[i - 1]); } /* * Return the smallest MTU in sctp_mtu_sizes larger than val. * If val is larger than the maximum, just return the largest multiple of 4 smaller * or equal to val. * Ensure that the result is a multiple of 4. */ uint32_t sctp_get_next_mtu(uint32_t val) { /* select another MTU that is just bigger than this one */ uint32_t i; val &= 0xfffffffc; for (i = 0; i < (sizeof(sctp_mtu_sizes) / sizeof(uint32_t)); i++) { if (val < sctp_mtu_sizes[i]) { KASSERT((sctp_mtu_sizes[i] & 0x00000003) == 0, ("sctp_mtu_sizes[%u] not a multiple of 4", i)); return (sctp_mtu_sizes[i]); } } return (val); } void sctp_fill_random_store(struct sctp_pcb *m) { /* * Here we use the MD5/SHA-1 to hash with our good randomNumbers and * our counter. The result becomes our good random numbers and we * then setup to give these out. Note that we do no locking to * protect this. This is ok, since if competing folks call this we * will get more gobbled gook in the random store which is what we * want. There is a danger that two guys will use the same random * numbers, but thats ok too since that is random as well :-> */ m->store_at = 0; (void)sctp_hmac(SCTP_HMAC, (uint8_t *)m->random_numbers, sizeof(m->random_numbers), (uint8_t *)&m->random_counter, sizeof(m->random_counter), (uint8_t *)m->random_store); m->random_counter++; } uint32_t sctp_select_initial_TSN(struct sctp_pcb *inp) { /* * A true implementation should use random selection process to get * the initial stream sequence number, using RFC1750 as a good * guideline */ uint32_t x, *xp; uint8_t *p; int store_at, new_store; if (inp->initial_sequence_debug != 0) { uint32_t ret; ret = inp->initial_sequence_debug; inp->initial_sequence_debug++; return (ret); } retry: store_at = inp->store_at; new_store = store_at + sizeof(uint32_t); if (new_store >= (SCTP_SIGNATURE_SIZE - 3)) { new_store = 0; } if (!atomic_cmpset_int(&inp->store_at, store_at, new_store)) { goto retry; } if (new_store == 0) { /* Refill the random store */ sctp_fill_random_store(inp); } p = &inp->random_store[store_at]; xp = (uint32_t *)p; x = *xp; return (x); } uint32_t sctp_select_a_tag(struct sctp_inpcb *inp, uint16_t lport, uint16_t rport, int check) { uint32_t x; struct timeval now; if (check) { (void)SCTP_GETTIME_TIMEVAL(&now); } for (;;) { x = sctp_select_initial_TSN(&inp->sctp_ep); if (x == 0) { /* we never use 0 */ continue; } if (!check || sctp_is_vtag_good(x, lport, rport, &now)) { break; } } return (x); } int32_t sctp_map_assoc_state(int kernel_state) { int32_t user_state; if (kernel_state & SCTP_STATE_WAS_ABORTED) { user_state = SCTP_CLOSED; } else if (kernel_state & SCTP_STATE_SHUTDOWN_PENDING) { user_state = SCTP_SHUTDOWN_PENDING; } else { switch (kernel_state & SCTP_STATE_MASK) { case SCTP_STATE_EMPTY: user_state = SCTP_CLOSED; break; case SCTP_STATE_INUSE: user_state = SCTP_CLOSED; break; case SCTP_STATE_COOKIE_WAIT: user_state = SCTP_COOKIE_WAIT; break; case SCTP_STATE_COOKIE_ECHOED: user_state = SCTP_COOKIE_ECHOED; break; case SCTP_STATE_OPEN: user_state = SCTP_ESTABLISHED; break; case SCTP_STATE_SHUTDOWN_SENT: user_state = SCTP_SHUTDOWN_SENT; break; case SCTP_STATE_SHUTDOWN_RECEIVED: user_state = SCTP_SHUTDOWN_RECEIVED; break; case SCTP_STATE_SHUTDOWN_ACK_SENT: user_state = SCTP_SHUTDOWN_ACK_SENT; break; default: user_state = SCTP_CLOSED; break; } } return (user_state); } int sctp_init_asoc(struct sctp_inpcb *inp, struct sctp_tcb *stcb, uint32_t override_tag, uint32_t vrf_id, uint16_t o_strms) { struct sctp_association *asoc; /* * Anything set to zero is taken care of by the allocation routine's * bzero */ /* * Up front select what scoping to apply on addresses I tell my peer * Not sure what to do with these right now, we will need to come up * with a way to set them. We may need to pass them through from the * caller in the sctp_aloc_assoc() function. */ int i; #if defined(SCTP_DETAILED_STR_STATS) int j; #endif asoc = &stcb->asoc; /* init all variables to a known value. */ SCTP_SET_STATE(stcb, SCTP_STATE_INUSE); asoc->max_burst = inp->sctp_ep.max_burst; asoc->fr_max_burst = inp->sctp_ep.fr_max_burst; asoc->heart_beat_delay = sctp_ticks_to_msecs(inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT]); asoc->cookie_life = inp->sctp_ep.def_cookie_life; asoc->sctp_cmt_on_off = inp->sctp_cmt_on_off; asoc->ecn_supported = inp->ecn_supported; asoc->prsctp_supported = inp->prsctp_supported; asoc->auth_supported = inp->auth_supported; asoc->asconf_supported = inp->asconf_supported; asoc->reconfig_supported = inp->reconfig_supported; asoc->nrsack_supported = inp->nrsack_supported; asoc->pktdrop_supported = inp->pktdrop_supported; asoc->idata_supported = inp->idata_supported; asoc->sctp_cmt_pf = (uint8_t)0; asoc->sctp_frag_point = inp->sctp_frag_point; asoc->sctp_features = inp->sctp_features; asoc->default_dscp = inp->sctp_ep.default_dscp; asoc->max_cwnd = inp->max_cwnd; #ifdef INET6 if (inp->sctp_ep.default_flowlabel) { asoc->default_flowlabel = inp->sctp_ep.default_flowlabel; } else { if (inp->ip_inp.inp.inp_flags & IN6P_AUTOFLOWLABEL) { asoc->default_flowlabel = sctp_select_initial_TSN(&inp->sctp_ep); asoc->default_flowlabel &= 0x000fffff; asoc->default_flowlabel |= 0x80000000; } else { asoc->default_flowlabel = 0; } } #endif asoc->sb_send_resv = 0; if (override_tag) { asoc->my_vtag = override_tag; } else { asoc->my_vtag = sctp_select_a_tag(inp, stcb->sctp_ep->sctp_lport, stcb->rport, 1); } /* Get the nonce tags */ asoc->my_vtag_nonce = sctp_select_a_tag(inp, stcb->sctp_ep->sctp_lport, stcb->rport, 0); asoc->peer_vtag_nonce = sctp_select_a_tag(inp, stcb->sctp_ep->sctp_lport, stcb->rport, 0); asoc->vrf_id = vrf_id; #ifdef SCTP_ASOCLOG_OF_TSNS asoc->tsn_in_at = 0; asoc->tsn_out_at = 0; asoc->tsn_in_wrapped = 0; asoc->tsn_out_wrapped = 0; asoc->cumack_log_at = 0; asoc->cumack_log_atsnt = 0; #endif #ifdef SCTP_FS_SPEC_LOG asoc->fs_index = 0; #endif asoc->refcnt = 0; asoc->assoc_up_sent = 0; asoc->asconf_seq_out = asoc->str_reset_seq_out = asoc->init_seq_number = asoc->sending_seq = sctp_select_initial_TSN(&inp->sctp_ep); asoc->asconf_seq_out_acked = asoc->asconf_seq_out - 1; /* we are optimisitic here */ asoc->peer_supports_nat = 0; asoc->sent_queue_retran_cnt = 0; /* for CMT */ asoc->last_net_cmt_send_started = NULL; /* This will need to be adjusted */ asoc->last_acked_seq = asoc->init_seq_number - 1; asoc->advanced_peer_ack_point = asoc->last_acked_seq; asoc->asconf_seq_in = asoc->last_acked_seq; /* here we are different, we hold the next one we expect */ asoc->str_reset_seq_in = asoc->last_acked_seq + 1; asoc->initial_init_rto_max = inp->sctp_ep.initial_init_rto_max; asoc->initial_rto = inp->sctp_ep.initial_rto; asoc->default_mtu = inp->sctp_ep.default_mtu; asoc->max_init_times = inp->sctp_ep.max_init_times; asoc->max_send_times = inp->sctp_ep.max_send_times; asoc->def_net_failure = inp->sctp_ep.def_net_failure; asoc->def_net_pf_threshold = inp->sctp_ep.def_net_pf_threshold; asoc->free_chunk_cnt = 0; asoc->iam_blocking = 0; asoc->context = inp->sctp_context; asoc->local_strreset_support = inp->local_strreset_support; asoc->def_send = inp->def_send; asoc->delayed_ack = sctp_ticks_to_msecs(inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_RECV]); asoc->sack_freq = inp->sctp_ep.sctp_sack_freq; asoc->pr_sctp_cnt = 0; asoc->total_output_queue_size = 0; if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { asoc->scope.ipv6_addr_legal = 1; if (SCTP_IPV6_V6ONLY(inp) == 0) { asoc->scope.ipv4_addr_legal = 1; } else { asoc->scope.ipv4_addr_legal = 0; } } else { asoc->scope.ipv6_addr_legal = 0; asoc->scope.ipv4_addr_legal = 1; } asoc->my_rwnd = max(SCTP_SB_LIMIT_RCV(inp->sctp_socket), SCTP_MINIMAL_RWND); asoc->peers_rwnd = SCTP_SB_LIMIT_RCV(inp->sctp_socket); asoc->smallest_mtu = inp->sctp_frag_point; asoc->minrto = inp->sctp_ep.sctp_minrto; asoc->maxrto = inp->sctp_ep.sctp_maxrto; asoc->stream_locked_on = 0; asoc->ecn_echo_cnt_onq = 0; asoc->stream_locked = 0; asoc->send_sack = 1; LIST_INIT(&asoc->sctp_restricted_addrs); TAILQ_INIT(&asoc->nets); TAILQ_INIT(&asoc->pending_reply_queue); TAILQ_INIT(&asoc->asconf_ack_sent); /* Setup to fill the hb random cache at first HB */ asoc->hb_random_idx = 4; asoc->sctp_autoclose_ticks = inp->sctp_ep.auto_close_time; stcb->asoc.congestion_control_module = inp->sctp_ep.sctp_default_cc_module; stcb->asoc.cc_functions = sctp_cc_functions[inp->sctp_ep.sctp_default_cc_module]; stcb->asoc.stream_scheduling_module = inp->sctp_ep.sctp_default_ss_module; stcb->asoc.ss_functions = sctp_ss_functions[inp->sctp_ep.sctp_default_ss_module]; /* * Now the stream parameters, here we allocate space for all streams * that we request by default. */ asoc->strm_realoutsize = asoc->streamoutcnt = asoc->pre_open_streams = o_strms; SCTP_MALLOC(asoc->strmout, struct sctp_stream_out *, asoc->streamoutcnt * sizeof(struct sctp_stream_out), SCTP_M_STRMO); if (asoc->strmout == NULL) { /* big trouble no memory */ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ENOMEM); return (ENOMEM); } for (i = 0; i < asoc->streamoutcnt; i++) { /* * inbound side must be set to 0xffff, also NOTE when we get * the INIT-ACK back (for INIT sender) we MUST reduce the * count (streamoutcnt) but first check if we sent to any of * the upper streams that were dropped (if some were). Those * that were dropped must be notified to the upper layer as * failed to send. */ asoc->strmout[i].next_mid_ordered = 0; asoc->strmout[i].next_mid_unordered = 0; TAILQ_INIT(&asoc->strmout[i].outqueue); asoc->strmout[i].chunks_on_queues = 0; #if defined(SCTP_DETAILED_STR_STATS) for (j = 0; j < SCTP_PR_SCTP_MAX + 1; j++) { asoc->strmout[i].abandoned_sent[j] = 0; asoc->strmout[i].abandoned_unsent[j] = 0; } #else asoc->strmout[i].abandoned_sent[0] = 0; asoc->strmout[i].abandoned_unsent[0] = 0; #endif asoc->strmout[i].sid = i; asoc->strmout[i].last_msg_incomplete = 0; asoc->strmout[i].state = SCTP_STREAM_OPENING; asoc->ss_functions.sctp_ss_init_stream(stcb, &asoc->strmout[i], NULL); } asoc->ss_functions.sctp_ss_init(stcb, asoc, 0); /* Now the mapping array */ asoc->mapping_array_size = SCTP_INITIAL_MAPPING_ARRAY; SCTP_MALLOC(asoc->mapping_array, uint8_t *, asoc->mapping_array_size, SCTP_M_MAP); if (asoc->mapping_array == NULL) { SCTP_FREE(asoc->strmout, SCTP_M_STRMO); SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ENOMEM); return (ENOMEM); } memset(asoc->mapping_array, 0, asoc->mapping_array_size); SCTP_MALLOC(asoc->nr_mapping_array, uint8_t *, asoc->mapping_array_size, SCTP_M_MAP); if (asoc->nr_mapping_array == NULL) { SCTP_FREE(asoc->strmout, SCTP_M_STRMO); SCTP_FREE(asoc->mapping_array, SCTP_M_MAP); SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ENOMEM); return (ENOMEM); } memset(asoc->nr_mapping_array, 0, asoc->mapping_array_size); /* Now the init of the other outqueues */ TAILQ_INIT(&asoc->free_chunks); TAILQ_INIT(&asoc->control_send_queue); TAILQ_INIT(&asoc->asconf_send_queue); TAILQ_INIT(&asoc->send_queue); TAILQ_INIT(&asoc->sent_queue); TAILQ_INIT(&asoc->resetHead); asoc->max_inbound_streams = inp->sctp_ep.max_open_streams_intome; TAILQ_INIT(&asoc->asconf_queue); /* authentication fields */ asoc->authinfo.random = NULL; asoc->authinfo.active_keyid = 0; asoc->authinfo.assoc_key = NULL; asoc->authinfo.assoc_keyid = 0; asoc->authinfo.recv_key = NULL; asoc->authinfo.recv_keyid = 0; LIST_INIT(&asoc->shared_keys); asoc->marked_retrans = 0; asoc->port = inp->sctp_ep.port; asoc->timoinit = 0; asoc->timodata = 0; asoc->timosack = 0; asoc->timoshutdown = 0; asoc->timoheartbeat = 0; asoc->timocookie = 0; asoc->timoshutdownack = 0; (void)SCTP_GETTIME_TIMEVAL(&asoc->start_time); asoc->discontinuity_time = asoc->start_time; for (i = 0; i < SCTP_PR_SCTP_MAX + 1; i++) { asoc->abandoned_unsent[i] = 0; asoc->abandoned_sent[i] = 0; } /* * sa_ignore MEMLEAK {memory is put in the assoc mapping array and * freed later when the association is freed. */ return (0); } void sctp_print_mapping_array(struct sctp_association *asoc) { unsigned int i, limit; SCTP_PRINTF("Mapping array size: %d, baseTSN: %8.8x, cumAck: %8.8x, highestTSN: (%8.8x, %8.8x).\n", asoc->mapping_array_size, asoc->mapping_array_base_tsn, asoc->cumulative_tsn, asoc->highest_tsn_inside_map, asoc->highest_tsn_inside_nr_map); for (limit = asoc->mapping_array_size; limit > 1; limit--) { if (asoc->mapping_array[limit - 1] != 0) { break; } } SCTP_PRINTF("Renegable mapping array (last %d entries are zero):\n", asoc->mapping_array_size - limit); for (i = 0; i < limit; i++) { SCTP_PRINTF("%2.2x%c", asoc->mapping_array[i], ((i + 1) % 16) ? ' ' : '\n'); } if (limit % 16) SCTP_PRINTF("\n"); for (limit = asoc->mapping_array_size; limit > 1; limit--) { if (asoc->nr_mapping_array[limit - 1]) { break; } } SCTP_PRINTF("Non renegable mapping array (last %d entries are zero):\n", asoc->mapping_array_size - limit); for (i = 0; i < limit; i++) { SCTP_PRINTF("%2.2x%c", asoc->nr_mapping_array[i], ((i + 1) % 16) ? ' ' : '\n'); } if (limit % 16) SCTP_PRINTF("\n"); } int sctp_expand_mapping_array(struct sctp_association *asoc, uint32_t needed) { /* mapping array needs to grow */ uint8_t *new_array1, *new_array2; uint32_t new_size; new_size = asoc->mapping_array_size + ((needed + 7) / 8 + SCTP_MAPPING_ARRAY_INCR); SCTP_MALLOC(new_array1, uint8_t *, new_size, SCTP_M_MAP); SCTP_MALLOC(new_array2, uint8_t *, new_size, SCTP_M_MAP); if ((new_array1 == NULL) || (new_array2 == NULL)) { /* can't get more, forget it */ SCTP_PRINTF("No memory for expansion of SCTP mapping array %d\n", new_size); if (new_array1) { SCTP_FREE(new_array1, SCTP_M_MAP); } if (new_array2) { SCTP_FREE(new_array2, SCTP_M_MAP); } return (-1); } memset(new_array1, 0, new_size); memset(new_array2, 0, new_size); memcpy(new_array1, asoc->mapping_array, asoc->mapping_array_size); memcpy(new_array2, asoc->nr_mapping_array, asoc->mapping_array_size); SCTP_FREE(asoc->mapping_array, SCTP_M_MAP); SCTP_FREE(asoc->nr_mapping_array, SCTP_M_MAP); asoc->mapping_array = new_array1; asoc->nr_mapping_array = new_array2; asoc->mapping_array_size = new_size; return (0); } static void sctp_iterator_work(struct sctp_iterator *it) { struct epoch_tracker et; struct sctp_inpcb *tinp; int iteration_count = 0; int inp_skip = 0; int first_in = 1; NET_EPOCH_ENTER(et); SCTP_INP_INFO_RLOCK(); SCTP_ITERATOR_LOCK(); sctp_it_ctl.cur_it = it; if (it->inp) { SCTP_INP_RLOCK(it->inp); SCTP_INP_DECR_REF(it->inp); } if (it->inp == NULL) { /* iterator is complete */ done_with_iterator: sctp_it_ctl.cur_it = NULL; SCTP_ITERATOR_UNLOCK(); SCTP_INP_INFO_RUNLOCK(); if (it->function_atend != NULL) { (*it->function_atend) (it->pointer, it->val); } SCTP_FREE(it, SCTP_M_ITER); NET_EPOCH_EXIT(et); return; } select_a_new_ep: if (first_in) { first_in = 0; } else { SCTP_INP_RLOCK(it->inp); } while (((it->pcb_flags) && ((it->inp->sctp_flags & it->pcb_flags) != it->pcb_flags)) || ((it->pcb_features) && ((it->inp->sctp_features & it->pcb_features) != it->pcb_features))) { /* endpoint flags or features don't match, so keep looking */ if (it->iterator_flags & SCTP_ITERATOR_DO_SINGLE_INP) { SCTP_INP_RUNLOCK(it->inp); goto done_with_iterator; } tinp = it->inp; it->inp = LIST_NEXT(it->inp, sctp_list); it->stcb = NULL; SCTP_INP_RUNLOCK(tinp); if (it->inp == NULL) { goto done_with_iterator; } SCTP_INP_RLOCK(it->inp); } /* now go through each assoc which is in the desired state */ if (it->done_current_ep == 0) { if (it->function_inp != NULL) inp_skip = (*it->function_inp) (it->inp, it->pointer, it->val); it->done_current_ep = 1; } if (it->stcb == NULL) { /* run the per instance function */ it->stcb = LIST_FIRST(&it->inp->sctp_asoc_list); } if ((inp_skip) || it->stcb == NULL) { if (it->function_inp_end != NULL) { inp_skip = (*it->function_inp_end) (it->inp, it->pointer, it->val); } SCTP_INP_RUNLOCK(it->inp); goto no_stcb; } while (it->stcb) { SCTP_TCB_LOCK(it->stcb); if (it->asoc_state && ((it->stcb->asoc.state & it->asoc_state) != it->asoc_state)) { /* not in the right state... keep looking */ SCTP_TCB_UNLOCK(it->stcb); goto next_assoc; } /* see if we have limited out the iterator loop */ iteration_count++; if (iteration_count > SCTP_ITERATOR_MAX_AT_ONCE) { /* Pause to let others grab the lock */ atomic_add_int(&it->stcb->asoc.refcnt, 1); SCTP_TCB_UNLOCK(it->stcb); SCTP_INP_INCR_REF(it->inp); SCTP_INP_RUNLOCK(it->inp); SCTP_ITERATOR_UNLOCK(); SCTP_INP_INFO_RUNLOCK(); SCTP_INP_INFO_RLOCK(); SCTP_ITERATOR_LOCK(); if (sctp_it_ctl.iterator_flags) { /* We won't be staying here */ SCTP_INP_DECR_REF(it->inp); atomic_add_int(&it->stcb->asoc.refcnt, -1); if (sctp_it_ctl.iterator_flags & SCTP_ITERATOR_STOP_CUR_IT) { sctp_it_ctl.iterator_flags &= ~SCTP_ITERATOR_STOP_CUR_IT; goto done_with_iterator; } if (sctp_it_ctl.iterator_flags & SCTP_ITERATOR_STOP_CUR_INP) { sctp_it_ctl.iterator_flags &= ~SCTP_ITERATOR_STOP_CUR_INP; goto no_stcb; } /* If we reach here huh? */ SCTP_PRINTF("Unknown it ctl flag %x\n", sctp_it_ctl.iterator_flags); sctp_it_ctl.iterator_flags = 0; } SCTP_INP_RLOCK(it->inp); SCTP_INP_DECR_REF(it->inp); SCTP_TCB_LOCK(it->stcb); atomic_add_int(&it->stcb->asoc.refcnt, -1); iteration_count = 0; } KASSERT(it->inp == it->stcb->sctp_ep, ("%s: stcb %p does not belong to inp %p, but inp %p", __func__, it->stcb, it->inp, it->stcb->sctp_ep)); /* run function on this one */ (*it->function_assoc) (it->inp, it->stcb, it->pointer, it->val); /* * we lie here, it really needs to have its own type but * first I must verify that this won't effect things :-0 */ if (it->no_chunk_output == 0) sctp_chunk_output(it->inp, it->stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_NOT_LOCKED); SCTP_TCB_UNLOCK(it->stcb); next_assoc: it->stcb = LIST_NEXT(it->stcb, sctp_tcblist); if (it->stcb == NULL) { /* Run last function */ if (it->function_inp_end != NULL) { inp_skip = (*it->function_inp_end) (it->inp, it->pointer, it->val); } } } SCTP_INP_RUNLOCK(it->inp); no_stcb: /* done with all assocs on this endpoint, move on to next endpoint */ it->done_current_ep = 0; if (it->iterator_flags & SCTP_ITERATOR_DO_SINGLE_INP) { it->inp = NULL; } else { it->inp = LIST_NEXT(it->inp, sctp_list); } it->stcb = NULL; if (it->inp == NULL) { goto done_with_iterator; } goto select_a_new_ep; } void sctp_iterator_worker(void) { struct sctp_iterator *it; /* This function is called with the WQ lock in place */ sctp_it_ctl.iterator_running = 1; while ((it = TAILQ_FIRST(&sctp_it_ctl.iteratorhead)) != NULL) { /* now lets work on this one */ TAILQ_REMOVE(&sctp_it_ctl.iteratorhead, it, sctp_nxt_itr); SCTP_IPI_ITERATOR_WQ_UNLOCK(); CURVNET_SET(it->vn); sctp_iterator_work(it); CURVNET_RESTORE(); SCTP_IPI_ITERATOR_WQ_LOCK(); /* sa_ignore FREED_MEMORY */ } sctp_it_ctl.iterator_running = 0; return; } static void sctp_handle_addr_wq(void) { /* deal with the ADDR wq from the rtsock calls */ struct sctp_laddr *wi, *nwi; struct sctp_asconf_iterator *asc; SCTP_MALLOC(asc, struct sctp_asconf_iterator *, sizeof(struct sctp_asconf_iterator), SCTP_M_ASC_IT); if (asc == NULL) { /* Try later, no memory */ sctp_timer_start(SCTP_TIMER_TYPE_ADDR_WQ, (struct sctp_inpcb *)NULL, (struct sctp_tcb *)NULL, (struct sctp_nets *)NULL); return; } LIST_INIT(&asc->list_of_work); asc->cnt = 0; LIST_FOREACH_SAFE(wi, &SCTP_BASE_INFO(addr_wq), sctp_nxt_addr, nwi) { LIST_REMOVE(wi, sctp_nxt_addr); LIST_INSERT_HEAD(&asc->list_of_work, wi, sctp_nxt_addr); asc->cnt++; } if (asc->cnt == 0) { SCTP_FREE(asc, SCTP_M_ASC_IT); } else { int ret; ret = sctp_initiate_iterator(sctp_asconf_iterator_ep, sctp_asconf_iterator_stcb, NULL, /* No ep end for boundall */ SCTP_PCB_FLAGS_BOUNDALL, SCTP_PCB_ANY_FEATURES, SCTP_ASOC_ANY_STATE, (void *)asc, 0, sctp_asconf_iterator_end, NULL, 0); if (ret) { SCTP_PRINTF("Failed to initiate iterator for handle_addr_wq\n"); /* * Freeing if we are stopping or put back on the * addr_wq. */ if (SCTP_BASE_VAR(sctp_pcb_initialized) == 0) { sctp_asconf_iterator_end(asc, 0); } else { LIST_FOREACH(wi, &asc->list_of_work, sctp_nxt_addr) { LIST_INSERT_HEAD(&SCTP_BASE_INFO(addr_wq), wi, sctp_nxt_addr); } SCTP_FREE(asc, SCTP_M_ASC_IT); } } } } /*- * The following table shows which pointers for the inp, stcb, or net are * stored for each timer after it was started. * *|Name |Timer |inp |stcb|net | *|-----------------------------|-----------------------------|----|----|----| *|SCTP_TIMER_TYPE_SEND |net->rxt_timer |Yes |Yes |Yes | *|SCTP_TIMER_TYPE_INIT |net->rxt_timer |Yes |Yes |Yes | *|SCTP_TIMER_TYPE_RECV |stcb->asoc.dack_timer |Yes |Yes |No | *|SCTP_TIMER_TYPE_SHUTDOWN |net->rxt_timer |Yes |Yes |Yes | *|SCTP_TIMER_TYPE_HEARTBEAT |net->hb_timer |Yes |Yes |Yes | *|SCTP_TIMER_TYPE_COOKIE |net->rxt_timer |Yes |Yes |Yes | *|SCTP_TIMER_TYPE_NEWCOOKIE |inp->sctp_ep.signature_change|Yes |No |No | *|SCTP_TIMER_TYPE_PATHMTURAISE |net->pmtu_timer |Yes |Yes |Yes | *|SCTP_TIMER_TYPE_SHUTDOWNACK |net->rxt_timer |Yes |Yes |Yes | *|SCTP_TIMER_TYPE_ASCONF |stcb->asoc.asconf_timer |Yes |Yes |Yes | *|SCTP_TIMER_TYPE_SHUTDOWNGUARD|stcb->asoc.shut_guard_timer |Yes |Yes |No | *|SCTP_TIMER_TYPE_AUTOCLOSE |stcb->asoc.autoclose_timer |Yes |Yes |No | *|SCTP_TIMER_TYPE_STRRESET |stcb->asoc.strreset_timer |Yes |Yes |No | *|SCTP_TIMER_TYPE_INPKILL |inp->sctp_ep.signature_change|Yes |No |No | *|SCTP_TIMER_TYPE_ASOCKILL |stcb->asoc.strreset_timer |Yes |Yes |No | *|SCTP_TIMER_TYPE_ADDR_WQ |SCTP_BASE_INFO(addr_wq_timer)|No |No |No | *|SCTP_TIMER_TYPE_PRIM_DELETED |stcb->asoc.delete_prim_timer |Yes |Yes |No | */ void sctp_timeout_handler(void *t) { struct epoch_tracker et; struct timeval tv; struct sctp_inpcb *inp; struct sctp_tcb *stcb; struct sctp_nets *net; struct sctp_timer *tmr; struct mbuf *op_err; int type; int i, secret; bool did_output, released_asoc_reference; /* * If inp, stcb or net are not NULL, then references to these were * added when the timer was started, and must be released before * this function returns. */ tmr = (struct sctp_timer *)t; inp = (struct sctp_inpcb *)tmr->ep; stcb = (struct sctp_tcb *)tmr->tcb; net = (struct sctp_nets *)tmr->net; CURVNET_SET((struct vnet *)tmr->vnet); NET_EPOCH_ENTER(et); did_output = 1; released_asoc_reference = false; #ifdef SCTP_AUDITING_ENABLED sctp_audit_log(0xF0, (uint8_t)tmr->type); sctp_auditing(3, inp, stcb, net); #endif /* sanity checks... */ KASSERT(tmr->self == NULL || tmr->self == tmr, ("sctp_timeout_handler: tmr->self corrupted")); KASSERT(SCTP_IS_TIMER_TYPE_VALID(tmr->type), ("sctp_timeout_handler: invalid timer type %d", tmr->type)); type = tmr->type; KASSERT(stcb == NULL || stcb->sctp_ep == inp, ("sctp_timeout_handler of type %d: inp = %p, stcb->sctp_ep %p", type, stcb, stcb->sctp_ep)); tmr->stopped_from = 0xa001; if ((stcb != NULL) && (stcb->asoc.state == SCTP_STATE_EMPTY)) { SCTPDBG(SCTP_DEBUG_TIMER2, "Timer type %d handler exiting due to CLOSED association.\n", type); goto out_decr; } tmr->stopped_from = 0xa002; SCTPDBG(SCTP_DEBUG_TIMER2, "Timer type %d goes off.\n", type); if (!SCTP_OS_TIMER_ACTIVE(&tmr->timer)) { SCTPDBG(SCTP_DEBUG_TIMER2, "Timer type %d handler exiting due to not being active.\n", type); goto out_decr; } tmr->stopped_from = 0xa003; if (stcb) { SCTP_TCB_LOCK(stcb); /* * Release reference so that association can be freed if * necessary below. This is safe now that we have acquired * the lock. */ atomic_add_int(&stcb->asoc.refcnt, -1); released_asoc_reference = true; if ((type != SCTP_TIMER_TYPE_ASOCKILL) && ((stcb->asoc.state == SCTP_STATE_EMPTY) || (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED))) { SCTPDBG(SCTP_DEBUG_TIMER2, "Timer type %d handler exiting due to CLOSED association.\n", type); goto out; } } else if (inp != NULL) { SCTP_INP_WLOCK(inp); } else { SCTP_WQ_ADDR_LOCK(); } /* Record in stopped_from which timeout occurred. */ tmr->stopped_from = type; /* mark as being serviced now */ if (SCTP_OS_TIMER_PENDING(&tmr->timer)) { /* * Callout has been rescheduled. */ goto out; } if (!SCTP_OS_TIMER_ACTIVE(&tmr->timer)) { /* * Not active, so no action. */ goto out; } SCTP_OS_TIMER_DEACTIVATE(&tmr->timer); /* call the handler for the appropriate timer type */ switch (type) { case SCTP_TIMER_TYPE_SEND: KASSERT(inp != NULL && stcb != NULL && net != NULL, ("timeout of type %d: inp = %p, stcb = %p, net = %p", type, inp, stcb, net)); SCTP_STAT_INCR(sctps_timodata); stcb->asoc.timodata++; stcb->asoc.num_send_timers_up--; if (stcb->asoc.num_send_timers_up < 0) { stcb->asoc.num_send_timers_up = 0; } SCTP_TCB_LOCK_ASSERT(stcb); if (sctp_t3rxt_timer(inp, stcb, net)) { /* no need to unlock on tcb its gone */ goto out_decr; } SCTP_TCB_LOCK_ASSERT(stcb); #ifdef SCTP_AUDITING_ENABLED sctp_auditing(4, inp, stcb, net); #endif sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_NOT_LOCKED); did_output = true; if ((stcb->asoc.num_send_timers_up == 0) && (stcb->asoc.sent_queue_cnt > 0)) { struct sctp_tmit_chunk *chk; /* * Safeguard. If there on some on the sent queue * somewhere but no timers running something is * wrong... so we start a timer on the first chunk * on the send queue on whatever net it is sent to. */ TAILQ_FOREACH(chk, &stcb->asoc.sent_queue, sctp_next) { if (chk->whoTo != NULL) { break; } } if (chk != NULL) { sctp_timer_start(SCTP_TIMER_TYPE_SEND, stcb->sctp_ep, stcb, chk->whoTo); } } break; case SCTP_TIMER_TYPE_INIT: KASSERT(inp != NULL && stcb != NULL && net != NULL, ("timeout of type %d: inp = %p, stcb = %p, net = %p", type, inp, stcb, net)); SCTP_STAT_INCR(sctps_timoinit); stcb->asoc.timoinit++; if (sctp_t1init_timer(inp, stcb, net)) { /* no need to unlock on tcb its gone */ goto out_decr; } did_output = false; break; case SCTP_TIMER_TYPE_RECV: KASSERT(inp != NULL && stcb != NULL && net == NULL, ("timeout of type %d: inp = %p, stcb = %p, net = %p", type, inp, stcb, net)); SCTP_STAT_INCR(sctps_timosack); stcb->asoc.timosack++; sctp_send_sack(stcb, SCTP_SO_NOT_LOCKED); #ifdef SCTP_AUDITING_ENABLED sctp_auditing(4, inp, stcb, NULL); #endif sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_SACK_TMR, SCTP_SO_NOT_LOCKED); did_output = true; break; case SCTP_TIMER_TYPE_SHUTDOWN: KASSERT(inp != NULL && stcb != NULL && net != NULL, ("timeout of type %d: inp = %p, stcb = %p, net = %p", type, inp, stcb, net)); SCTP_STAT_INCR(sctps_timoshutdown); stcb->asoc.timoshutdown++; if (sctp_shutdown_timer(inp, stcb, net)) { /* no need to unlock on tcb its gone */ goto out_decr; } #ifdef SCTP_AUDITING_ENABLED sctp_auditing(4, inp, stcb, net); #endif sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_SHUT_TMR, SCTP_SO_NOT_LOCKED); did_output = true; break; case SCTP_TIMER_TYPE_HEARTBEAT: KASSERT(inp != NULL && stcb != NULL && net != NULL, ("timeout of type %d: inp = %p, stcb = %p, net = %p", type, inp, stcb, net)); SCTP_STAT_INCR(sctps_timoheartbeat); stcb->asoc.timoheartbeat++; if (sctp_heartbeat_timer(inp, stcb, net)) { /* no need to unlock on tcb its gone */ goto out_decr; } #ifdef SCTP_AUDITING_ENABLED sctp_auditing(4, inp, stcb, net); #endif if (!(net->dest_state & SCTP_ADDR_NOHB)) { sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net); sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_HB_TMR, SCTP_SO_NOT_LOCKED); did_output = true; } else { did_output = false; } break; case SCTP_TIMER_TYPE_COOKIE: KASSERT(inp != NULL && stcb != NULL && net != NULL, ("timeout of type %d: inp = %p, stcb = %p, net = %p", type, inp, stcb, net)); SCTP_STAT_INCR(sctps_timocookie); stcb->asoc.timocookie++; if (sctp_cookie_timer(inp, stcb, net)) { /* no need to unlock on tcb its gone */ goto out_decr; } #ifdef SCTP_AUDITING_ENABLED sctp_auditing(4, inp, stcb, net); #endif /* * We consider T3 and Cookie timer pretty much the same with * respect to where from in chunk_output. */ sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_NOT_LOCKED); did_output = true; break; case SCTP_TIMER_TYPE_NEWCOOKIE: KASSERT(inp != NULL && stcb == NULL && net == NULL, ("timeout of type %d: inp = %p, stcb = %p, net = %p", type, inp, stcb, net)); SCTP_STAT_INCR(sctps_timosecret); (void)SCTP_GETTIME_TIMEVAL(&tv); inp->sctp_ep.time_of_secret_change = tv.tv_sec; inp->sctp_ep.last_secret_number = inp->sctp_ep.current_secret_number; inp->sctp_ep.current_secret_number++; if (inp->sctp_ep.current_secret_number >= SCTP_HOW_MANY_SECRETS) { inp->sctp_ep.current_secret_number = 0; } secret = (int)inp->sctp_ep.current_secret_number; for (i = 0; i < SCTP_NUMBER_OF_SECRETS; i++) { inp->sctp_ep.secret_key[secret][i] = sctp_select_initial_TSN(&inp->sctp_ep); } sctp_timer_start(SCTP_TIMER_TYPE_NEWCOOKIE, inp, NULL, NULL); did_output = false; break; case SCTP_TIMER_TYPE_PATHMTURAISE: KASSERT(inp != NULL && stcb != NULL && net != NULL, ("timeout of type %d: inp = %p, stcb = %p, net = %p", type, inp, stcb, net)); SCTP_STAT_INCR(sctps_timopathmtu); sctp_pathmtu_timer(inp, stcb, net); did_output = false; break; case SCTP_TIMER_TYPE_SHUTDOWNACK: KASSERT(inp != NULL && stcb != NULL && net != NULL, ("timeout of type %d: inp = %p, stcb = %p, net = %p", type, inp, stcb, net)); if (sctp_shutdownack_timer(inp, stcb, net)) { /* no need to unlock on tcb its gone */ goto out_decr; } SCTP_STAT_INCR(sctps_timoshutdownack); stcb->asoc.timoshutdownack++; #ifdef SCTP_AUDITING_ENABLED sctp_auditing(4, inp, stcb, net); #endif sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_SHUT_ACK_TMR, SCTP_SO_NOT_LOCKED); did_output = true; break; case SCTP_TIMER_TYPE_ASCONF: KASSERT(inp != NULL && stcb != NULL && net != NULL, ("timeout of type %d: inp = %p, stcb = %p, net = %p", type, inp, stcb, net)); SCTP_STAT_INCR(sctps_timoasconf); if (sctp_asconf_timer(inp, stcb, net)) { /* no need to unlock on tcb its gone */ goto out_decr; } #ifdef SCTP_AUDITING_ENABLED sctp_auditing(4, inp, stcb, net); #endif sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_ASCONF_TMR, SCTP_SO_NOT_LOCKED); did_output = true; break; case SCTP_TIMER_TYPE_SHUTDOWNGUARD: KASSERT(inp != NULL && stcb != NULL && net == NULL, ("timeout of type %d: inp = %p, stcb = %p, net = %p", type, inp, stcb, net)); SCTP_STAT_INCR(sctps_timoshutdownguard); op_err = sctp_generate_cause(SCTP_BASE_SYSCTL(sctp_diag_info_code), "Shutdown guard timer expired"); sctp_abort_an_association(inp, stcb, op_err, SCTP_SO_NOT_LOCKED); did_output = true; /* no need to unlock on tcb its gone */ goto out_decr; case SCTP_TIMER_TYPE_AUTOCLOSE: KASSERT(inp != NULL && stcb != NULL && net == NULL, ("timeout of type %d: inp = %p, stcb = %p, net = %p", type, inp, stcb, net)); SCTP_STAT_INCR(sctps_timoautoclose); sctp_autoclose_timer(inp, stcb); sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_AUTOCLOSE_TMR, SCTP_SO_NOT_LOCKED); did_output = true; break; case SCTP_TIMER_TYPE_STRRESET: KASSERT(inp != NULL && stcb != NULL && net == NULL, ("timeout of type %d: inp = %p, stcb = %p, net = %p", type, inp, stcb, net)); SCTP_STAT_INCR(sctps_timostrmrst); if (sctp_strreset_timer(inp, stcb)) { /* no need to unlock on tcb its gone */ goto out_decr; } sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_STRRST_TMR, SCTP_SO_NOT_LOCKED); did_output = true; break; case SCTP_TIMER_TYPE_INPKILL: KASSERT(inp != NULL && stcb == NULL && net == NULL, ("timeout of type %d: inp = %p, stcb = %p, net = %p", type, inp, stcb, net)); SCTP_STAT_INCR(sctps_timoinpkill); /* * special case, take away our increment since WE are the * killer */ sctp_timer_stop(SCTP_TIMER_TYPE_INPKILL, inp, NULL, NULL, SCTP_FROM_SCTPUTIL + SCTP_LOC_3); SCTP_INP_DECR_REF(inp); SCTP_INP_WUNLOCK(inp); sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT, SCTP_CALLED_FROM_INPKILL_TIMER); inp = NULL; goto out_no_decr; case SCTP_TIMER_TYPE_ASOCKILL: KASSERT(inp != NULL && stcb != NULL && net == NULL, ("timeout of type %d: inp = %p, stcb = %p, net = %p", type, inp, stcb, net)); SCTP_STAT_INCR(sctps_timoassockill); /* Can we free it yet? */ SCTP_INP_DECR_REF(inp); sctp_timer_stop(SCTP_TIMER_TYPE_ASOCKILL, inp, stcb, NULL, SCTP_FROM_SCTPUTIL + SCTP_LOC_1); (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTPUTIL + SCTP_LOC_2); /* * free asoc, always unlocks (or destroy's) so prevent * duplicate unlock or unlock of a free mtx :-0 */ stcb = NULL; goto out_no_decr; case SCTP_TIMER_TYPE_ADDR_WQ: KASSERT(inp == NULL && stcb == NULL && net == NULL, ("timeout of type %d: inp = %p, stcb = %p, net = %p", type, inp, stcb, net)); sctp_handle_addr_wq(); did_output = true; break; case SCTP_TIMER_TYPE_PRIM_DELETED: KASSERT(inp != NULL && stcb != NULL && net == NULL, ("timeout of type %d: inp = %p, stcb = %p, net = %p", type, inp, stcb, net)); SCTP_STAT_INCR(sctps_timodelprim); sctp_delete_prim_timer(inp, stcb); did_output = false; break; default: #ifdef INVARIANTS panic("Unknown timer type %d", type); #else did_output = false; goto out; #endif } #ifdef SCTP_AUDITING_ENABLED sctp_audit_log(0xF1, (uint8_t)type); if (inp != NULL) sctp_auditing(5, inp, stcb, net); #endif if (did_output && (stcb != NULL)) { /* * Now we need to clean up the control chunk chain if an * ECNE is on it. It must be marked as UNSENT again so next * call will continue to send it until such time that we get * a CWR, to remove it. It is, however, less likely that we * will find a ecn echo on the chain though. */ sctp_fix_ecn_echo(&stcb->asoc); } out: if (stcb != NULL) { SCTP_TCB_UNLOCK(stcb); } else if (inp != NULL) { SCTP_INP_WUNLOCK(inp); } else { SCTP_WQ_ADDR_UNLOCK(); } out_decr: /* These reference counts were incremented in sctp_timer_start(). */ if (inp != NULL) { SCTP_INP_DECR_REF(inp); } if ((stcb != NULL) && !released_asoc_reference) { atomic_add_int(&stcb->asoc.refcnt, -1); } if (net != NULL) { sctp_free_remote_addr(net); } out_no_decr: SCTPDBG(SCTP_DEBUG_TIMER2, "Timer type %d handler finished.\n", type); CURVNET_RESTORE(); NET_EPOCH_EXIT(et); } /*- * The following table shows which parameters must be provided * when calling sctp_timer_start(). For parameters not being * provided, NULL must be used. * * |Name |inp |stcb|net | * |-----------------------------|----|----|----| * |SCTP_TIMER_TYPE_SEND |Yes |Yes |Yes | * |SCTP_TIMER_TYPE_INIT |Yes |Yes |Yes | * |SCTP_TIMER_TYPE_RECV |Yes |Yes |No | * |SCTP_TIMER_TYPE_SHUTDOWN |Yes |Yes |Yes | * |SCTP_TIMER_TYPE_HEARTBEAT |Yes |Yes |Yes | * |SCTP_TIMER_TYPE_COOKIE |Yes |Yes |Yes | * |SCTP_TIMER_TYPE_NEWCOOKIE |Yes |No |No | * |SCTP_TIMER_TYPE_PATHMTURAISE |Yes |Yes |Yes | * |SCTP_TIMER_TYPE_SHUTDOWNACK |Yes |Yes |Yes | * |SCTP_TIMER_TYPE_ASCONF |Yes |Yes |Yes | * |SCTP_TIMER_TYPE_SHUTDOWNGUARD|Yes |Yes |No | * |SCTP_TIMER_TYPE_AUTOCLOSE |Yes |Yes |No | * |SCTP_TIMER_TYPE_STRRESET |Yes |Yes |Yes | * |SCTP_TIMER_TYPE_INPKILL |Yes |No |No | * |SCTP_TIMER_TYPE_ASOCKILL |Yes |Yes |No | * |SCTP_TIMER_TYPE_ADDR_WQ |No |No |No | * |SCTP_TIMER_TYPE_PRIM_DELETED |Yes |Yes |No | * */ void sctp_timer_start(int t_type, struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets *net) { struct sctp_timer *tmr; uint32_t to_ticks; uint32_t rndval, jitter; KASSERT(stcb == NULL || stcb->sctp_ep == inp, ("sctp_timer_start of type %d: inp = %p, stcb->sctp_ep %p", t_type, stcb, stcb->sctp_ep)); tmr = NULL; to_ticks = 0; if (stcb != NULL) { SCTP_TCB_LOCK_ASSERT(stcb); } else if (inp != NULL) { SCTP_INP_WLOCK_ASSERT(inp); } else { SCTP_WQ_ADDR_LOCK_ASSERT(); } if (stcb != NULL) { /* * Don't restart timer on association that's about to be * killed. */ if ((stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) && (t_type != SCTP_TIMER_TYPE_ASOCKILL)) { SCTPDBG(SCTP_DEBUG_TIMER2, "Timer type %d not started: inp=%p, stcb=%p, net=%p (stcb deleted).\n", t_type, inp, stcb, net); return; } /* Don't restart timer on net that's been removed. */ if (net != NULL && (net->dest_state & SCTP_ADDR_BEING_DELETED)) { SCTPDBG(SCTP_DEBUG_TIMER2, "Timer type %d not started: inp=%p, stcb=%p, net=%p (net deleted).\n", t_type, inp, stcb, net); return; } } switch (t_type) { case SCTP_TIMER_TYPE_SEND: /* Here we use the RTO timer. */ if ((inp == NULL) || (stcb == NULL) || (net == NULL)) { #ifdef INVARIANTS panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &net->rxt_timer; if (net->RTO == 0) { to_ticks = sctp_msecs_to_ticks(stcb->asoc.initial_rto); } else { to_ticks = sctp_msecs_to_ticks(net->RTO); } break; case SCTP_TIMER_TYPE_INIT: /* * Here we use the INIT timer default usually about 1 * second. */ if ((inp == NULL) || (stcb == NULL) || (net == NULL)) { #ifdef INVARIANTS panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &net->rxt_timer; if (net->RTO == 0) { to_ticks = sctp_msecs_to_ticks(stcb->asoc.initial_rto); } else { to_ticks = sctp_msecs_to_ticks(net->RTO); } break; case SCTP_TIMER_TYPE_RECV: /* * Here we use the Delayed-Ack timer value from the inp, * ususually about 200ms. */ if ((inp == NULL) || (stcb == NULL) || (net != NULL)) { #ifdef INVARIANTS panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &stcb->asoc.dack_timer; to_ticks = sctp_msecs_to_ticks(stcb->asoc.delayed_ack); break; case SCTP_TIMER_TYPE_SHUTDOWN: /* Here we use the RTO of the destination. */ if ((inp == NULL) || (stcb == NULL) || (net == NULL)) { #ifdef INVARIANTS panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &net->rxt_timer; if (net->RTO == 0) { to_ticks = sctp_msecs_to_ticks(stcb->asoc.initial_rto); } else { to_ticks = sctp_msecs_to_ticks(net->RTO); } break; case SCTP_TIMER_TYPE_HEARTBEAT: /* * The net is used here so that we can add in the RTO. Even * though we use a different timer. We also add the HB timer * PLUS a random jitter. */ if ((inp == NULL) || (stcb == NULL) || (net == NULL)) { #ifdef INVARIANTS panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } if ((net->dest_state & SCTP_ADDR_NOHB) && !(net->dest_state & SCTP_ADDR_UNCONFIRMED)) { SCTPDBG(SCTP_DEBUG_TIMER2, "Timer type %d not started: inp=%p, stcb=%p, net=%p.\n", t_type, inp, stcb, net); return; } tmr = &net->hb_timer; if (net->RTO == 0) { to_ticks = stcb->asoc.initial_rto; } else { to_ticks = net->RTO; } rndval = sctp_select_initial_TSN(&inp->sctp_ep); jitter = rndval % to_ticks; if (jitter >= (to_ticks >> 1)) { to_ticks = to_ticks + (jitter - (to_ticks >> 1)); } else { to_ticks = to_ticks - jitter; } if (!(net->dest_state & SCTP_ADDR_UNCONFIRMED) && !(net->dest_state & SCTP_ADDR_PF)) { to_ticks += net->heart_beat_delay; } /* * Now we must convert the to_ticks that are now in ms to * ticks. */ to_ticks = sctp_msecs_to_ticks(to_ticks); break; case SCTP_TIMER_TYPE_COOKIE: /* * Here we can use the RTO timer from the network since one * RTT was complete. If a retransmission happened then we * will be using the RTO initial value. */ if ((inp == NULL) || (stcb == NULL) || (net == NULL)) { #ifdef INVARIANTS panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &net->rxt_timer; if (net->RTO == 0) { to_ticks = sctp_msecs_to_ticks(stcb->asoc.initial_rto); } else { to_ticks = sctp_msecs_to_ticks(net->RTO); } break; case SCTP_TIMER_TYPE_NEWCOOKIE: /* * Nothing needed but the endpoint here ususually about 60 * minutes. */ if ((inp == NULL) || (stcb != NULL) || (net != NULL)) { #ifdef INVARIANTS panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &inp->sctp_ep.signature_change; to_ticks = inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_SIGNATURE]; break; case SCTP_TIMER_TYPE_PATHMTURAISE: /* * Here we use the value found in the EP for PMTUD, * ususually about 10 minutes. */ if ((inp == NULL) || (stcb == NULL) || (net == NULL)) { #ifdef INVARIANTS panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } if (net->dest_state & SCTP_ADDR_NO_PMTUD) { SCTPDBG(SCTP_DEBUG_TIMER2, "Timer type %d not started: inp=%p, stcb=%p, net=%p.\n", t_type, inp, stcb, net); return; } tmr = &net->pmtu_timer; to_ticks = inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_PMTU]; break; case SCTP_TIMER_TYPE_SHUTDOWNACK: /* Here we use the RTO of the destination. */ if ((inp == NULL) || (stcb == NULL) || (net == NULL)) { #ifdef INVARIANTS panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &net->rxt_timer; if (net->RTO == 0) { to_ticks = sctp_msecs_to_ticks(stcb->asoc.initial_rto); } else { to_ticks = sctp_msecs_to_ticks(net->RTO); } break; case SCTP_TIMER_TYPE_ASCONF: /* * Here the timer comes from the stcb but its value is from * the net's RTO. */ if ((inp == NULL) || (stcb == NULL) || (net == NULL)) { #ifdef INVARIANTS panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &stcb->asoc.asconf_timer; if (net->RTO == 0) { to_ticks = sctp_msecs_to_ticks(stcb->asoc.initial_rto); } else { to_ticks = sctp_msecs_to_ticks(net->RTO); } break; case SCTP_TIMER_TYPE_SHUTDOWNGUARD: /* * Here we use the endpoints shutdown guard timer usually * about 3 minutes. */ if ((inp == NULL) || (stcb == NULL) || (net != NULL)) { #ifdef INVARIANTS panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &stcb->asoc.shut_guard_timer; if (inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_MAXSHUTDOWN] == 0) { if (stcb->asoc.maxrto < UINT32_MAX / 5) { to_ticks = sctp_msecs_to_ticks(5 * stcb->asoc.maxrto); } else { to_ticks = sctp_msecs_to_ticks(UINT32_MAX); } } else { to_ticks = inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_MAXSHUTDOWN]; } break; case SCTP_TIMER_TYPE_AUTOCLOSE: if ((inp == NULL) || (stcb == NULL) || (net != NULL)) { #ifdef INVARIANTS panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &stcb->asoc.autoclose_timer; to_ticks = stcb->asoc.sctp_autoclose_ticks; break; case SCTP_TIMER_TYPE_STRRESET: /* * Here the timer comes from the stcb but its value is from * the net's RTO. */ if ((inp == NULL) || (stcb == NULL) || (net == NULL)) { #ifdef INVARIANTS panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &stcb->asoc.strreset_timer; if (net->RTO == 0) { to_ticks = sctp_msecs_to_ticks(stcb->asoc.initial_rto); } else { to_ticks = sctp_msecs_to_ticks(net->RTO); } break; case SCTP_TIMER_TYPE_INPKILL: /* * The inp is setup to die. We re-use the signature_chage * timer since that has stopped and we are in the GONE * state. */ if ((inp == NULL) || (stcb != NULL) || (net != NULL)) { #ifdef INVARIANTS panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &inp->sctp_ep.signature_change; to_ticks = sctp_msecs_to_ticks(SCTP_INP_KILL_TIMEOUT); break; case SCTP_TIMER_TYPE_ASOCKILL: if ((inp == NULL) || (stcb == NULL) || (net != NULL)) { #ifdef INVARIANTS panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &stcb->asoc.strreset_timer; to_ticks = sctp_msecs_to_ticks(SCTP_ASOC_KILL_TIMEOUT); break; case SCTP_TIMER_TYPE_ADDR_WQ: if ((inp != NULL) || (stcb != NULL) || (net != NULL)) { #ifdef INVARIANTS panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } /* Only 1 tick away :-) */ tmr = &SCTP_BASE_INFO(addr_wq_timer); to_ticks = SCTP_ADDRESS_TICK_DELAY; break; case SCTP_TIMER_TYPE_PRIM_DELETED: if ((inp == NULL) || (stcb == NULL) || (net != NULL)) { #ifdef INVARIANTS panic("sctp_timer_start of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &stcb->asoc.delete_prim_timer; to_ticks = sctp_msecs_to_ticks(stcb->asoc.initial_rto); break; default: #ifdef INVARIANTS panic("Unknown timer type %d", t_type); #else return; #endif } KASSERT(tmr != NULL, ("tmr is NULL for timer type %d", t_type)); KASSERT(to_ticks > 0, ("to_ticks == 0 for timer type %d", t_type)); if (SCTP_OS_TIMER_PENDING(&tmr->timer)) { /* * We do NOT allow you to have it already running. If it is, * we leave the current one up unchanged. */ SCTPDBG(SCTP_DEBUG_TIMER2, "Timer type %d already running: inp=%p, stcb=%p, net=%p.\n", t_type, inp, stcb, net); return; } /* At this point we can proceed. */ if (t_type == SCTP_TIMER_TYPE_SEND) { stcb->asoc.num_send_timers_up++; } tmr->stopped_from = 0; tmr->type = t_type; tmr->ep = (void *)inp; tmr->tcb = (void *)stcb; if (t_type == SCTP_TIMER_TYPE_STRRESET) { tmr->net = NULL; } else { tmr->net = (void *)net; } tmr->self = (void *)tmr; tmr->vnet = (void *)curvnet; tmr->ticks = sctp_get_tick_count(); if (SCTP_OS_TIMER_START(&tmr->timer, to_ticks, sctp_timeout_handler, tmr) == 0) { SCTPDBG(SCTP_DEBUG_TIMER2, "Timer type %d started: ticks=%u, inp=%p, stcb=%p, net=%p.\n", t_type, to_ticks, inp, stcb, net); /* * If this is a newly scheduled callout, as opposed to a * rescheduled one, increment relevant reference counts. */ if (tmr->ep != NULL) { SCTP_INP_INCR_REF(inp); } if (tmr->tcb != NULL) { atomic_add_int(&stcb->asoc.refcnt, 1); } if (tmr->net != NULL) { atomic_add_int(&net->ref_count, 1); } } else { /* * This should not happen, since we checked for pending * above. */ SCTPDBG(SCTP_DEBUG_TIMER2, "Timer type %d restarted: ticks=%u, inp=%p, stcb=%p, net=%p.\n", t_type, to_ticks, inp, stcb, net); } return; } /*- * The following table shows which parameters must be provided * when calling sctp_timer_stop(). For parameters not being * provided, NULL must be used. * * |Name |inp |stcb|net | * |-----------------------------|----|----|----| * |SCTP_TIMER_TYPE_SEND |Yes |Yes |Yes | * |SCTP_TIMER_TYPE_INIT |Yes |Yes |Yes | * |SCTP_TIMER_TYPE_RECV |Yes |Yes |No | * |SCTP_TIMER_TYPE_SHUTDOWN |Yes |Yes |Yes | * |SCTP_TIMER_TYPE_HEARTBEAT |Yes |Yes |Yes | * |SCTP_TIMER_TYPE_COOKIE |Yes |Yes |Yes | * |SCTP_TIMER_TYPE_NEWCOOKIE |Yes |No |No | * |SCTP_TIMER_TYPE_PATHMTURAISE |Yes |Yes |Yes | * |SCTP_TIMER_TYPE_SHUTDOWNACK |Yes |Yes |Yes | * |SCTP_TIMER_TYPE_ASCONF |Yes |Yes |No | * |SCTP_TIMER_TYPE_SHUTDOWNGUARD|Yes |Yes |No | * |SCTP_TIMER_TYPE_AUTOCLOSE |Yes |Yes |No | * |SCTP_TIMER_TYPE_STRRESET |Yes |Yes |No | * |SCTP_TIMER_TYPE_INPKILL |Yes |No |No | * |SCTP_TIMER_TYPE_ASOCKILL |Yes |Yes |No | * |SCTP_TIMER_TYPE_ADDR_WQ |No |No |No | * |SCTP_TIMER_TYPE_PRIM_DELETED |Yes |Yes |No | * */ void sctp_timer_stop(int t_type, struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_nets *net, uint32_t from) { struct sctp_timer *tmr; KASSERT(stcb == NULL || stcb->sctp_ep == inp, ("sctp_timer_stop of type %d: inp = %p, stcb->sctp_ep %p", t_type, stcb, stcb->sctp_ep)); if (stcb != NULL) { SCTP_TCB_LOCK_ASSERT(stcb); } else if (inp != NULL) { SCTP_INP_WLOCK_ASSERT(inp); } else { SCTP_WQ_ADDR_LOCK_ASSERT(); } tmr = NULL; switch (t_type) { case SCTP_TIMER_TYPE_SEND: if ((inp == NULL) || (stcb == NULL) || (net == NULL)) { #ifdef INVARIANTS panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &net->rxt_timer; break; case SCTP_TIMER_TYPE_INIT: if ((inp == NULL) || (stcb == NULL) || (net == NULL)) { #ifdef INVARIANTS panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &net->rxt_timer; break; case SCTP_TIMER_TYPE_RECV: if ((inp == NULL) || (stcb == NULL) || (net != NULL)) { #ifdef INVARIANTS panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &stcb->asoc.dack_timer; break; case SCTP_TIMER_TYPE_SHUTDOWN: if ((inp == NULL) || (stcb == NULL) || (net == NULL)) { #ifdef INVARIANTS panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &net->rxt_timer; break; case SCTP_TIMER_TYPE_HEARTBEAT: if ((inp == NULL) || (stcb == NULL) || (net == NULL)) { #ifdef INVARIANTS panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &net->hb_timer; break; case SCTP_TIMER_TYPE_COOKIE: if ((inp == NULL) || (stcb == NULL) || (net == NULL)) { #ifdef INVARIANTS panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &net->rxt_timer; break; case SCTP_TIMER_TYPE_NEWCOOKIE: if ((inp == NULL) || (stcb != NULL) || (net != NULL)) { #ifdef INVARIANTS panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &inp->sctp_ep.signature_change; break; case SCTP_TIMER_TYPE_PATHMTURAISE: if ((inp == NULL) || (stcb == NULL) || (net == NULL)) { #ifdef INVARIANTS panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &net->pmtu_timer; break; case SCTP_TIMER_TYPE_SHUTDOWNACK: if ((inp == NULL) || (stcb == NULL) || (net == NULL)) { #ifdef INVARIANTS panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &net->rxt_timer; break; case SCTP_TIMER_TYPE_ASCONF: if ((inp == NULL) || (stcb == NULL) || (net != NULL)) { #ifdef INVARIANTS panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &stcb->asoc.asconf_timer; break; case SCTP_TIMER_TYPE_SHUTDOWNGUARD: if ((inp == NULL) || (stcb == NULL) || (net != NULL)) { #ifdef INVARIANTS panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &stcb->asoc.shut_guard_timer; break; case SCTP_TIMER_TYPE_AUTOCLOSE: if ((inp == NULL) || (stcb == NULL) || (net != NULL)) { #ifdef INVARIANTS panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &stcb->asoc.autoclose_timer; break; case SCTP_TIMER_TYPE_STRRESET: if ((inp == NULL) || (stcb == NULL) || (net != NULL)) { #ifdef INVARIANTS panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &stcb->asoc.strreset_timer; break; case SCTP_TIMER_TYPE_INPKILL: /* * The inp is setup to die. We re-use the signature_chage * timer since that has stopped and we are in the GONE * state. */ if ((inp == NULL) || (stcb != NULL) || (net != NULL)) { #ifdef INVARIANTS panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &inp->sctp_ep.signature_change; break; case SCTP_TIMER_TYPE_ASOCKILL: if ((inp == NULL) || (stcb == NULL) || (net != NULL)) { #ifdef INVARIANTS panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &stcb->asoc.strreset_timer; break; case SCTP_TIMER_TYPE_ADDR_WQ: if ((inp != NULL) || (stcb != NULL) || (net != NULL)) { #ifdef INVARIANTS panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &SCTP_BASE_INFO(addr_wq_timer); break; case SCTP_TIMER_TYPE_PRIM_DELETED: if ((inp == NULL) || (stcb == NULL) || (net != NULL)) { #ifdef INVARIANTS panic("sctp_timer_stop of type %d: inp = %p, stcb = %p, net = %p", t_type, inp, stcb, net); #else return; #endif } tmr = &stcb->asoc.delete_prim_timer; break; default: #ifdef INVARIANTS panic("Unknown timer type %d", t_type); #else return; #endif } KASSERT(tmr != NULL, ("tmr is NULL for timer type %d", t_type)); if ((tmr->type != SCTP_TIMER_TYPE_NONE) && (tmr->type != t_type)) { /* * Ok we have a timer that is under joint use. Cookie timer * per chance with the SEND timer. We therefore are NOT * running the timer that the caller wants stopped. So just * return. */ SCTPDBG(SCTP_DEBUG_TIMER2, "Shared timer type %d not running: inp=%p, stcb=%p, net=%p.\n", t_type, inp, stcb, net); return; } if ((t_type == SCTP_TIMER_TYPE_SEND) && (stcb != NULL)) { stcb->asoc.num_send_timers_up--; if (stcb->asoc.num_send_timers_up < 0) { stcb->asoc.num_send_timers_up = 0; } } tmr->self = NULL; tmr->stopped_from = from; if (SCTP_OS_TIMER_STOP(&tmr->timer) == 1) { KASSERT(tmr->ep == inp, ("sctp_timer_stop of type %d: inp = %p, tmr->inp = %p", t_type, inp, tmr->ep)); KASSERT(tmr->tcb == stcb, ("sctp_timer_stop of type %d: stcb = %p, tmr->stcb = %p", t_type, stcb, tmr->tcb)); KASSERT(((t_type == SCTP_TIMER_TYPE_ASCONF) && (tmr->net != NULL)) || ((t_type != SCTP_TIMER_TYPE_ASCONF) && (tmr->net == net)), ("sctp_timer_stop of type %d: net = %p, tmr->net = %p", t_type, net, tmr->net)); SCTPDBG(SCTP_DEBUG_TIMER2, "Timer type %d stopped: inp=%p, stcb=%p, net=%p.\n", t_type, inp, stcb, net); /* * If the timer was actually stopped, decrement reference * counts that were incremented in sctp_timer_start(). */ if (tmr->ep != NULL) { SCTP_INP_DECR_REF(inp); tmr->ep = NULL; } if (tmr->tcb != NULL) { atomic_add_int(&stcb->asoc.refcnt, -1); tmr->tcb = NULL; } if (tmr->net != NULL) { /* * Can't use net, since it doesn't work for * SCTP_TIMER_TYPE_ASCONF. */ sctp_free_remote_addr((struct sctp_nets *)tmr->net); tmr->net = NULL; } } else { SCTPDBG(SCTP_DEBUG_TIMER2, "Timer type %d not stopped: inp=%p, stcb=%p, net=%p.\n", t_type, inp, stcb, net); } return; } uint32_t sctp_calculate_len(struct mbuf *m) { uint32_t tlen = 0; struct mbuf *at; at = m; while (at) { tlen += SCTP_BUF_LEN(at); at = SCTP_BUF_NEXT(at); } return (tlen); } void sctp_mtu_size_reset(struct sctp_inpcb *inp, struct sctp_association *asoc, uint32_t mtu) { /* * Reset the P-MTU size on this association, this involves changing * the asoc MTU, going through ANY chunk+overhead larger than mtu to * allow the DF flag to be cleared. */ struct sctp_tmit_chunk *chk; unsigned int eff_mtu, ovh; asoc->smallest_mtu = mtu; if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) { ovh = SCTP_MIN_OVERHEAD; } else { ovh = SCTP_MIN_V4_OVERHEAD; } eff_mtu = mtu - ovh; TAILQ_FOREACH(chk, &asoc->send_queue, sctp_next) { if (chk->send_size > eff_mtu) { chk->flags |= CHUNK_FLAGS_FRAGMENT_OK; } } TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) { if (chk->send_size > eff_mtu) { chk->flags |= CHUNK_FLAGS_FRAGMENT_OK; } } } /* * Given an association and starting time of the current RTT period, update * RTO in number of msecs. net should point to the current network. * Return 1, if an RTO update was performed, return 0 if no update was * performed due to invalid starting point. */ int sctp_calculate_rto(struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_nets *net, struct timeval *old, int rtt_from_sack) { struct timeval now; uint64_t rtt_us; /* RTT in us */ int32_t rtt; /* RTT in ms */ uint32_t new_rto; int first_measure = 0; /************************/ /* 1. calculate new RTT */ /************************/ /* get the current time */ if (stcb->asoc.use_precise_time) { (void)SCTP_GETPTIME_TIMEVAL(&now); } else { (void)SCTP_GETTIME_TIMEVAL(&now); } if ((old->tv_sec > now.tv_sec) || ((old->tv_sec == now.tv_sec) && (old->tv_usec > now.tv_usec))) { /* The starting point is in the future. */ return (0); } timevalsub(&now, old); rtt_us = (uint64_t)1000000 * (uint64_t)now.tv_sec + (uint64_t)now.tv_usec; if (rtt_us > SCTP_RTO_UPPER_BOUND * 1000) { /* The RTT is larger than a sane value. */ return (0); } /* store the current RTT in us */ net->rtt = rtt_us; /* compute rtt in ms */ rtt = (int32_t)(net->rtt / 1000); if ((asoc->cc_functions.sctp_rtt_calculated) && (rtt_from_sack == SCTP_RTT_FROM_DATA)) { /* * Tell the CC module that a new update has just occurred * from a sack */ (*asoc->cc_functions.sctp_rtt_calculated) (stcb, net, &now); } /* * Do we need to determine the lan? We do this only on sacks i.e. * RTT being determined from data not non-data (HB/INIT->INITACK). */ if ((rtt_from_sack == SCTP_RTT_FROM_DATA) && (net->lan_type == SCTP_LAN_UNKNOWN)) { if (net->rtt > SCTP_LOCAL_LAN_RTT) { net->lan_type = SCTP_LAN_INTERNET; } else { net->lan_type = SCTP_LAN_LOCAL; } } /***************************/ /* 2. update RTTVAR & SRTT */ /***************************/ /*- * Compute the scaled average lastsa and the * scaled variance lastsv as described in van Jacobson * Paper "Congestion Avoidance and Control", Annex A. * * (net->lastsa >> SCTP_RTT_SHIFT) is the srtt * (net->lastsv >> SCTP_RTT_VAR_SHIFT) is the rttvar */ if (net->RTO_measured) { rtt -= (net->lastsa >> SCTP_RTT_SHIFT); net->lastsa += rtt; if (rtt < 0) { rtt = -rtt; } rtt -= (net->lastsv >> SCTP_RTT_VAR_SHIFT); net->lastsv += rtt; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RTTVAR_LOGGING_ENABLE) { rto_logging(net, SCTP_LOG_RTTVAR); } } else { /* First RTO measurment */ net->RTO_measured = 1; first_measure = 1; net->lastsa = rtt << SCTP_RTT_SHIFT; net->lastsv = (rtt / 2) << SCTP_RTT_VAR_SHIFT; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RTTVAR_LOGGING_ENABLE) { rto_logging(net, SCTP_LOG_INITIAL_RTT); } } if (net->lastsv == 0) { net->lastsv = SCTP_CLOCK_GRANULARITY; } new_rto = (net->lastsa >> SCTP_RTT_SHIFT) + net->lastsv; if ((new_rto > SCTP_SAT_NETWORK_MIN) && (stcb->asoc.sat_network_lockout == 0)) { stcb->asoc.sat_network = 1; } else if ((!first_measure) && stcb->asoc.sat_network) { stcb->asoc.sat_network = 0; stcb->asoc.sat_network_lockout = 1; } /* bound it, per C6/C7 in Section 5.3.1 */ if (new_rto < stcb->asoc.minrto) { new_rto = stcb->asoc.minrto; } if (new_rto > stcb->asoc.maxrto) { new_rto = stcb->asoc.maxrto; } net->RTO = new_rto; return (1); } /* * return a pointer to a contiguous piece of data from the given mbuf chain * starting at 'off' for 'len' bytes. If the desired piece spans more than * one mbuf, a copy is made at 'ptr'. caller must ensure that the buffer size * is >= 'len' returns NULL if there there isn't 'len' bytes in the chain. */ caddr_t sctp_m_getptr(struct mbuf *m, int off, int len, uint8_t *in_ptr) { uint32_t count; uint8_t *ptr; ptr = in_ptr; if ((off < 0) || (len <= 0)) return (NULL); /* find the desired start location */ while ((m != NULL) && (off > 0)) { if (off < SCTP_BUF_LEN(m)) break; off -= SCTP_BUF_LEN(m); m = SCTP_BUF_NEXT(m); } if (m == NULL) return (NULL); /* is the current mbuf large enough (eg. contiguous)? */ if ((SCTP_BUF_LEN(m) - off) >= len) { return (mtod(m, caddr_t)+off); } else { /* else, it spans more than one mbuf, so save a temp copy... */ while ((m != NULL) && (len > 0)) { count = min(SCTP_BUF_LEN(m) - off, len); memcpy(ptr, mtod(m, caddr_t)+off, count); len -= count; ptr += count; off = 0; m = SCTP_BUF_NEXT(m); } if ((m == NULL) && (len > 0)) return (NULL); else return ((caddr_t)in_ptr); } } struct sctp_paramhdr * sctp_get_next_param(struct mbuf *m, int offset, struct sctp_paramhdr *pull, int pull_limit) { /* This just provides a typed signature to Peter's Pull routine */ return ((struct sctp_paramhdr *)sctp_m_getptr(m, offset, pull_limit, (uint8_t *)pull)); } struct mbuf * sctp_add_pad_tombuf(struct mbuf *m, int padlen) { struct mbuf *m_last; caddr_t dp; if (padlen > 3) { return (NULL); } if (padlen <= M_TRAILINGSPACE(m)) { /* * The easy way. We hope the majority of the time we hit * here :) */ m_last = m; } else { /* Hard way we must grow the mbuf chain */ m_last = sctp_get_mbuf_for_msg(padlen, 0, M_NOWAIT, 1, MT_DATA); if (m_last == NULL) { return (NULL); } SCTP_BUF_LEN(m_last) = 0; SCTP_BUF_NEXT(m_last) = NULL; SCTP_BUF_NEXT(m) = m_last; } dp = mtod(m_last, caddr_t)+SCTP_BUF_LEN(m_last); SCTP_BUF_LEN(m_last) += padlen; memset(dp, 0, padlen); return (m_last); } struct mbuf * sctp_pad_lastmbuf(struct mbuf *m, int padval, struct mbuf *last_mbuf) { /* find the last mbuf in chain and pad it */ struct mbuf *m_at; if (last_mbuf != NULL) { return (sctp_add_pad_tombuf(last_mbuf, padval)); } else { for (m_at = m; m_at; m_at = SCTP_BUF_NEXT(m_at)) { if (SCTP_BUF_NEXT(m_at) == NULL) { return (sctp_add_pad_tombuf(m_at, padval)); } } } return (NULL); } static void sctp_notify_assoc_change(uint16_t state, struct sctp_tcb *stcb, uint16_t error, struct sctp_abort_chunk *abort, uint8_t from_peer, int so_locked) { struct mbuf *m_notify; struct sctp_assoc_change *sac; struct sctp_queued_to_read *control; unsigned int notif_len; uint16_t abort_len; unsigned int i; if (stcb == NULL) { return; } if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVASSOCEVNT)) { notif_len = (unsigned int)sizeof(struct sctp_assoc_change); if (abort != NULL) { abort_len = ntohs(abort->ch.chunk_length); /* * Only SCTP_CHUNK_BUFFER_SIZE are guaranteed to be * contiguous. */ if (abort_len > SCTP_CHUNK_BUFFER_SIZE) { abort_len = SCTP_CHUNK_BUFFER_SIZE; } } else { abort_len = 0; } if ((state == SCTP_COMM_UP) || (state == SCTP_RESTART)) { notif_len += SCTP_ASSOC_SUPPORTS_MAX; } else if ((state == SCTP_COMM_LOST) || (state == SCTP_CANT_STR_ASSOC)) { notif_len += abort_len; } m_notify = sctp_get_mbuf_for_msg(notif_len, 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) { /* Retry with smaller value. */ notif_len = (unsigned int)sizeof(struct sctp_assoc_change); m_notify = sctp_get_mbuf_for_msg(notif_len, 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) { goto set_error; } } SCTP_BUF_NEXT(m_notify) = NULL; sac = mtod(m_notify, struct sctp_assoc_change *); memset(sac, 0, notif_len); sac->sac_type = SCTP_ASSOC_CHANGE; sac->sac_flags = 0; sac->sac_length = sizeof(struct sctp_assoc_change); sac->sac_state = state; sac->sac_error = error; /* XXX verify these stream counts */ sac->sac_outbound_streams = stcb->asoc.streamoutcnt; sac->sac_inbound_streams = stcb->asoc.streamincnt; sac->sac_assoc_id = sctp_get_associd(stcb); if (notif_len > sizeof(struct sctp_assoc_change)) { if ((state == SCTP_COMM_UP) || (state == SCTP_RESTART)) { i = 0; if (stcb->asoc.prsctp_supported == 1) { sac->sac_info[i++] = SCTP_ASSOC_SUPPORTS_PR; } if (stcb->asoc.auth_supported == 1) { sac->sac_info[i++] = SCTP_ASSOC_SUPPORTS_AUTH; } if (stcb->asoc.asconf_supported == 1) { sac->sac_info[i++] = SCTP_ASSOC_SUPPORTS_ASCONF; } if (stcb->asoc.idata_supported == 1) { sac->sac_info[i++] = SCTP_ASSOC_SUPPORTS_INTERLEAVING; } sac->sac_info[i++] = SCTP_ASSOC_SUPPORTS_MULTIBUF; if (stcb->asoc.reconfig_supported == 1) { sac->sac_info[i++] = SCTP_ASSOC_SUPPORTS_RE_CONFIG; } sac->sac_length += i; } else if ((state == SCTP_COMM_LOST) || (state == SCTP_CANT_STR_ASSOC)) { memcpy(sac->sac_info, abort, abort_len); sac->sac_length += abort_len; } } SCTP_BUF_LEN(m_notify) = sac->sac_length; control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination, 0, 0, stcb->asoc.context, 0, 0, 0, m_notify); if (control != NULL) { control->length = SCTP_BUF_LEN(m_notify); control->spec_flags = M_NOTIFICATION; /* not that we need this */ control->tail_mbuf = m_notify; sctp_add_to_readq(stcb->sctp_ep, stcb, control, &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, so_locked); } else { sctp_m_freem(m_notify); } } /* * For 1-to-1 style sockets, we send up and error when an ABORT * comes in. */ set_error: if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) && ((state == SCTP_COMM_LOST) || (state == SCTP_CANT_STR_ASSOC))) { SOCK_LOCK(stcb->sctp_socket); if (from_peer) { if (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ECONNREFUSED); stcb->sctp_socket->so_error = ECONNREFUSED; } else { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ECONNRESET); stcb->sctp_socket->so_error = ECONNRESET; } } else { if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ETIMEDOUT); stcb->sctp_socket->so_error = ETIMEDOUT; } else { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ECONNABORTED); stcb->sctp_socket->so_error = ECONNABORTED; } } SOCK_UNLOCK(stcb->sctp_socket); } /* Wake ANY sleepers */ if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) && ((state == SCTP_COMM_LOST) || (state == SCTP_CANT_STR_ASSOC))) { socantrcvmore(stcb->sctp_socket); } sorwakeup(stcb->sctp_socket); sowwakeup(stcb->sctp_socket); } static void sctp_notify_peer_addr_change(struct sctp_tcb *stcb, uint32_t state, struct sockaddr *sa, uint32_t error, int so_locked) { struct mbuf *m_notify; struct sctp_paddr_change *spc; struct sctp_queued_to_read *control; if ((stcb == NULL) || sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVPADDREVNT)) { /* event not enabled */ return; } m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_paddr_change), 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) return; SCTP_BUF_LEN(m_notify) = 0; spc = mtod(m_notify, struct sctp_paddr_change *); memset(spc, 0, sizeof(struct sctp_paddr_change)); spc->spc_type = SCTP_PEER_ADDR_CHANGE; spc->spc_flags = 0; spc->spc_length = sizeof(struct sctp_paddr_change); switch (sa->sa_family) { #ifdef INET case AF_INET: #ifdef INET6 if (sctp_is_feature_on(stcb->sctp_ep, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) { in6_sin_2_v4mapsin6((struct sockaddr_in *)sa, (struct sockaddr_in6 *)&spc->spc_aaddr); } else { memcpy(&spc->spc_aaddr, sa, sizeof(struct sockaddr_in)); } #else memcpy(&spc->spc_aaddr, sa, sizeof(struct sockaddr_in)); #endif break; #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; memcpy(&spc->spc_aaddr, sa, sizeof(struct sockaddr_in6)); sin6 = (struct sockaddr_in6 *)&spc->spc_aaddr; if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr)) { if (sin6->sin6_scope_id == 0) { /* recover scope_id for user */ (void)sa6_recoverscope(sin6); } else { /* clear embedded scope_id for user */ in6_clearscope(&sin6->sin6_addr); } } break; } #endif default: /* TSNH */ break; } spc->spc_state = state; spc->spc_error = error; spc->spc_assoc_id = sctp_get_associd(stcb); SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_paddr_change); SCTP_BUF_NEXT(m_notify) = NULL; /* append to socket */ control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination, 0, 0, stcb->asoc.context, 0, 0, 0, m_notify); if (control == NULL) { /* no memory */ sctp_m_freem(m_notify); return; } control->length = SCTP_BUF_LEN(m_notify); control->spec_flags = M_NOTIFICATION; /* not that we need this */ control->tail_mbuf = m_notify; sctp_add_to_readq(stcb->sctp_ep, stcb, control, &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, so_locked); } static void sctp_notify_send_failed(struct sctp_tcb *stcb, uint8_t sent, uint32_t error, struct sctp_tmit_chunk *chk, int so_locked) { struct mbuf *m_notify; struct sctp_send_failed *ssf; struct sctp_send_failed_event *ssfe; struct sctp_queued_to_read *control; struct sctp_chunkhdr *chkhdr; int notifhdr_len, chk_len, chkhdr_len, padding_len, payload_len; if ((stcb == NULL) || (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVSENDFAILEVNT) && sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVNSENDFAILEVNT))) { /* event not enabled */ return; } if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVNSENDFAILEVNT)) { notifhdr_len = sizeof(struct sctp_send_failed_event); } else { notifhdr_len = sizeof(struct sctp_send_failed); } m_notify = sctp_get_mbuf_for_msg(notifhdr_len, 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) /* no space left */ return; SCTP_BUF_LEN(m_notify) = notifhdr_len; if (stcb->asoc.idata_supported) { chkhdr_len = sizeof(struct sctp_idata_chunk); } else { chkhdr_len = sizeof(struct sctp_data_chunk); } /* Use some defaults in case we can't access the chunk header */ if (chk->send_size >= chkhdr_len) { payload_len = chk->send_size - chkhdr_len; } else { payload_len = 0; } padding_len = 0; if (chk->data != NULL) { chkhdr = mtod(chk->data, struct sctp_chunkhdr *); if (chkhdr != NULL) { chk_len = ntohs(chkhdr->chunk_length); if ((chk_len >= chkhdr_len) && (chk->send_size >= chk_len) && (chk->send_size - chk_len < 4)) { padding_len = chk->send_size - chk_len; payload_len = chk->send_size - chkhdr_len - padding_len; } } } if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVNSENDFAILEVNT)) { ssfe = mtod(m_notify, struct sctp_send_failed_event *); memset(ssfe, 0, notifhdr_len); ssfe->ssfe_type = SCTP_SEND_FAILED_EVENT; if (sent) { ssfe->ssfe_flags = SCTP_DATA_SENT; } else { ssfe->ssfe_flags = SCTP_DATA_UNSENT; } ssfe->ssfe_length = (uint32_t)(notifhdr_len + payload_len); ssfe->ssfe_error = error; /* not exactly what the user sent in, but should be close :) */ ssfe->ssfe_info.snd_sid = chk->rec.data.sid; ssfe->ssfe_info.snd_flags = chk->rec.data.rcv_flags; ssfe->ssfe_info.snd_ppid = chk->rec.data.ppid; ssfe->ssfe_info.snd_context = chk->rec.data.context; ssfe->ssfe_info.snd_assoc_id = sctp_get_associd(stcb); ssfe->ssfe_assoc_id = sctp_get_associd(stcb); } else { ssf = mtod(m_notify, struct sctp_send_failed *); memset(ssf, 0, notifhdr_len); ssf->ssf_type = SCTP_SEND_FAILED; if (sent) { ssf->ssf_flags = SCTP_DATA_SENT; } else { ssf->ssf_flags = SCTP_DATA_UNSENT; } ssf->ssf_length = (uint32_t)(notifhdr_len + payload_len); ssf->ssf_error = error; /* not exactly what the user sent in, but should be close :) */ ssf->ssf_info.sinfo_stream = chk->rec.data.sid; ssf->ssf_info.sinfo_ssn = (uint16_t)chk->rec.data.mid; ssf->ssf_info.sinfo_flags = chk->rec.data.rcv_flags; ssf->ssf_info.sinfo_ppid = chk->rec.data.ppid; ssf->ssf_info.sinfo_context = chk->rec.data.context; ssf->ssf_info.sinfo_assoc_id = sctp_get_associd(stcb); ssf->ssf_assoc_id = sctp_get_associd(stcb); } if (chk->data != NULL) { /* Trim off the sctp chunk header (it should be there) */ if (chk->send_size == chkhdr_len + payload_len + padding_len) { m_adj(chk->data, chkhdr_len); m_adj(chk->data, -padding_len); sctp_mbuf_crush(chk->data); chk->send_size -= (chkhdr_len + padding_len); } } SCTP_BUF_NEXT(m_notify) = chk->data; /* Steal off the mbuf */ chk->data = NULL; /* * For this case, we check the actual socket buffer, since the assoc * is going away we don't want to overfill the socket buffer for a * non-reader */ if (sctp_sbspace_failedmsgs(&stcb->sctp_socket->so_rcv) < SCTP_BUF_LEN(m_notify)) { sctp_m_freem(m_notify); return; } /* append to socket */ control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination, 0, 0, stcb->asoc.context, 0, 0, 0, m_notify); if (control == NULL) { /* no memory */ sctp_m_freem(m_notify); return; } control->length = SCTP_BUF_LEN(m_notify); control->spec_flags = M_NOTIFICATION; /* not that we need this */ control->tail_mbuf = m_notify; sctp_add_to_readq(stcb->sctp_ep, stcb, control, &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, so_locked); } static void sctp_notify_send_failed2(struct sctp_tcb *stcb, uint32_t error, struct sctp_stream_queue_pending *sp, int so_locked) { struct mbuf *m_notify; struct sctp_send_failed *ssf; struct sctp_send_failed_event *ssfe; struct sctp_queued_to_read *control; int notifhdr_len; if ((stcb == NULL) || (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVSENDFAILEVNT) && sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVNSENDFAILEVNT))) { /* event not enabled */ return; } if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVNSENDFAILEVNT)) { notifhdr_len = sizeof(struct sctp_send_failed_event); } else { notifhdr_len = sizeof(struct sctp_send_failed); } m_notify = sctp_get_mbuf_for_msg(notifhdr_len, 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) { /* no space left */ return; } SCTP_BUF_LEN(m_notify) = notifhdr_len; if (sctp_stcb_is_feature_on(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVNSENDFAILEVNT)) { ssfe = mtod(m_notify, struct sctp_send_failed_event *); memset(ssfe, 0, notifhdr_len); ssfe->ssfe_type = SCTP_SEND_FAILED_EVENT; ssfe->ssfe_flags = SCTP_DATA_UNSENT; ssfe->ssfe_length = (uint32_t)(notifhdr_len + sp->length); ssfe->ssfe_error = error; /* not exactly what the user sent in, but should be close :) */ ssfe->ssfe_info.snd_sid = sp->sid; if (sp->some_taken) { ssfe->ssfe_info.snd_flags = SCTP_DATA_LAST_FRAG; } else { ssfe->ssfe_info.snd_flags = SCTP_DATA_NOT_FRAG; } ssfe->ssfe_info.snd_ppid = sp->ppid; ssfe->ssfe_info.snd_context = sp->context; ssfe->ssfe_info.snd_assoc_id = sctp_get_associd(stcb); ssfe->ssfe_assoc_id = sctp_get_associd(stcb); } else { ssf = mtod(m_notify, struct sctp_send_failed *); memset(ssf, 0, notifhdr_len); ssf->ssf_type = SCTP_SEND_FAILED; ssf->ssf_flags = SCTP_DATA_UNSENT; ssf->ssf_length = (uint32_t)(notifhdr_len + sp->length); ssf->ssf_error = error; /* not exactly what the user sent in, but should be close :) */ ssf->ssf_info.sinfo_stream = sp->sid; ssf->ssf_info.sinfo_ssn = 0; if (sp->some_taken) { ssf->ssf_info.sinfo_flags = SCTP_DATA_LAST_FRAG; } else { ssf->ssf_info.sinfo_flags = SCTP_DATA_NOT_FRAG; } ssf->ssf_info.sinfo_ppid = sp->ppid; ssf->ssf_info.sinfo_context = sp->context; ssf->ssf_info.sinfo_assoc_id = sctp_get_associd(stcb); ssf->ssf_assoc_id = sctp_get_associd(stcb); } SCTP_BUF_NEXT(m_notify) = sp->data; /* Steal off the mbuf */ sp->data = NULL; /* * For this case, we check the actual socket buffer, since the assoc * is going away we don't want to overfill the socket buffer for a * non-reader */ if (sctp_sbspace_failedmsgs(&stcb->sctp_socket->so_rcv) < SCTP_BUF_LEN(m_notify)) { sctp_m_freem(m_notify); return; } /* append to socket */ control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination, 0, 0, stcb->asoc.context, 0, 0, 0, m_notify); if (control == NULL) { /* no memory */ sctp_m_freem(m_notify); return; } control->length = SCTP_BUF_LEN(m_notify); control->spec_flags = M_NOTIFICATION; /* not that we need this */ control->tail_mbuf = m_notify; sctp_add_to_readq(stcb->sctp_ep, stcb, control, &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, so_locked); } static void sctp_notify_adaptation_layer(struct sctp_tcb *stcb) { struct mbuf *m_notify; struct sctp_adaptation_event *sai; struct sctp_queued_to_read *control; if ((stcb == NULL) || sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_ADAPTATIONEVNT)) { /* event not enabled */ return; } m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_adaption_event), 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) /* no space left */ return; SCTP_BUF_LEN(m_notify) = 0; sai = mtod(m_notify, struct sctp_adaptation_event *); memset(sai, 0, sizeof(struct sctp_adaptation_event)); sai->sai_type = SCTP_ADAPTATION_INDICATION; sai->sai_flags = 0; sai->sai_length = sizeof(struct sctp_adaptation_event); sai->sai_adaptation_ind = stcb->asoc.peers_adaptation; sai->sai_assoc_id = sctp_get_associd(stcb); SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_adaptation_event); SCTP_BUF_NEXT(m_notify) = NULL; /* append to socket */ control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination, 0, 0, stcb->asoc.context, 0, 0, 0, m_notify); if (control == NULL) { /* no memory */ sctp_m_freem(m_notify); return; } control->length = SCTP_BUF_LEN(m_notify); control->spec_flags = M_NOTIFICATION; /* not that we need this */ control->tail_mbuf = m_notify; sctp_add_to_readq(stcb->sctp_ep, stcb, control, &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED); } /* This always must be called with the read-queue LOCKED in the INP */ static void sctp_notify_partial_delivery_indication(struct sctp_tcb *stcb, uint32_t error, uint32_t val, int so_locked) { struct mbuf *m_notify; struct sctp_pdapi_event *pdapi; struct sctp_queued_to_read *control; struct sockbuf *sb; if ((stcb == NULL) || sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_PDAPIEVNT)) { /* event not enabled */ return; } if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_CANT_READ) { return; } m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_pdapi_event), 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) /* no space left */ return; SCTP_BUF_LEN(m_notify) = 0; pdapi = mtod(m_notify, struct sctp_pdapi_event *); memset(pdapi, 0, sizeof(struct sctp_pdapi_event)); pdapi->pdapi_type = SCTP_PARTIAL_DELIVERY_EVENT; pdapi->pdapi_flags = 0; pdapi->pdapi_length = sizeof(struct sctp_pdapi_event); pdapi->pdapi_indication = error; pdapi->pdapi_stream = (val >> 16); pdapi->pdapi_seq = (val & 0x0000ffff); pdapi->pdapi_assoc_id = sctp_get_associd(stcb); SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_pdapi_event); SCTP_BUF_NEXT(m_notify) = NULL; control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination, 0, 0, stcb->asoc.context, 0, 0, 0, m_notify); if (control == NULL) { /* no memory */ sctp_m_freem(m_notify); return; } control->length = SCTP_BUF_LEN(m_notify); control->spec_flags = M_NOTIFICATION; /* not that we need this */ control->tail_mbuf = m_notify; sb = &stcb->sctp_socket->so_rcv; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { sctp_sblog(sb, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBALLOC, SCTP_BUF_LEN(m_notify)); } sctp_sballoc(stcb, sb, m_notify); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { sctp_sblog(sb, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0); } control->end_added = 1; if (stcb->asoc.control_pdapi) TAILQ_INSERT_AFTER(&stcb->sctp_ep->read_queue, stcb->asoc.control_pdapi, control, next); else { /* we really should not see this case */ TAILQ_INSERT_TAIL(&stcb->sctp_ep->read_queue, control, next); } if (stcb->sctp_ep && stcb->sctp_socket) { /* This should always be the case */ sctp_sorwakeup(stcb->sctp_ep, stcb->sctp_socket); } } static void sctp_notify_shutdown_event(struct sctp_tcb *stcb) { struct mbuf *m_notify; struct sctp_shutdown_event *sse; struct sctp_queued_to_read *control; /* * For TCP model AND UDP connected sockets we will send an error up * when an SHUTDOWN completes */ if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { /* mark socket closed for read/write and wakeup! */ socantsendmore(stcb->sctp_socket); } if (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT)) { /* event not enabled */ return; } m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_shutdown_event), 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) /* no space left */ return; sse = mtod(m_notify, struct sctp_shutdown_event *); memset(sse, 0, sizeof(struct sctp_shutdown_event)); sse->sse_type = SCTP_SHUTDOWN_EVENT; sse->sse_flags = 0; sse->sse_length = sizeof(struct sctp_shutdown_event); sse->sse_assoc_id = sctp_get_associd(stcb); SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_shutdown_event); SCTP_BUF_NEXT(m_notify) = NULL; /* append to socket */ control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination, 0, 0, stcb->asoc.context, 0, 0, 0, m_notify); if (control == NULL) { /* no memory */ sctp_m_freem(m_notify); return; } control->length = SCTP_BUF_LEN(m_notify); control->spec_flags = M_NOTIFICATION; /* not that we need this */ control->tail_mbuf = m_notify; sctp_add_to_readq(stcb->sctp_ep, stcb, control, &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED); } static void sctp_notify_sender_dry_event(struct sctp_tcb *stcb, int so_locked) { struct mbuf *m_notify; struct sctp_sender_dry_event *event; struct sctp_queued_to_read *control; if ((stcb == NULL) || sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_DRYEVNT)) { /* event not enabled */ return; } m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_sender_dry_event), 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) { /* no space left */ return; } SCTP_BUF_LEN(m_notify) = 0; event = mtod(m_notify, struct sctp_sender_dry_event *); memset(event, 0, sizeof(struct sctp_sender_dry_event)); event->sender_dry_type = SCTP_SENDER_DRY_EVENT; event->sender_dry_flags = 0; event->sender_dry_length = sizeof(struct sctp_sender_dry_event); event->sender_dry_assoc_id = sctp_get_associd(stcb); SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_sender_dry_event); SCTP_BUF_NEXT(m_notify) = NULL; /* append to socket */ control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination, 0, 0, stcb->asoc.context, 0, 0, 0, m_notify); if (control == NULL) { /* no memory */ sctp_m_freem(m_notify); return; } control->length = SCTP_BUF_LEN(m_notify); control->spec_flags = M_NOTIFICATION; /* not that we need this */ control->tail_mbuf = m_notify; sctp_add_to_readq(stcb->sctp_ep, stcb, control, &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, so_locked); } void sctp_notify_stream_reset_add(struct sctp_tcb *stcb, uint16_t numberin, uint16_t numberout, int flag) { struct mbuf *m_notify; struct sctp_queued_to_read *control; struct sctp_stream_change_event *stradd; if ((stcb == NULL) || (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_STREAM_CHANGEEVNT))) { /* event not enabled */ return; } if ((stcb->asoc.peer_req_out) && flag) { /* Peer made the request, don't tell the local user */ stcb->asoc.peer_req_out = 0; return; } stcb->asoc.peer_req_out = 0; m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_stream_change_event), 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) /* no space left */ return; SCTP_BUF_LEN(m_notify) = 0; stradd = mtod(m_notify, struct sctp_stream_change_event *); memset(stradd, 0, sizeof(struct sctp_stream_change_event)); stradd->strchange_type = SCTP_STREAM_CHANGE_EVENT; stradd->strchange_flags = flag; stradd->strchange_length = sizeof(struct sctp_stream_change_event); stradd->strchange_assoc_id = sctp_get_associd(stcb); stradd->strchange_instrms = numberin; stradd->strchange_outstrms = numberout; SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_stream_change_event); SCTP_BUF_NEXT(m_notify) = NULL; if (sctp_sbspace(&stcb->asoc, &stcb->sctp_socket->so_rcv) < SCTP_BUF_LEN(m_notify)) { /* no space */ sctp_m_freem(m_notify); return; } /* append to socket */ control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination, 0, 0, stcb->asoc.context, 0, 0, 0, m_notify); if (control == NULL) { /* no memory */ sctp_m_freem(m_notify); return; } control->length = SCTP_BUF_LEN(m_notify); control->spec_flags = M_NOTIFICATION; /* not that we need this */ control->tail_mbuf = m_notify; sctp_add_to_readq(stcb->sctp_ep, stcb, control, &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED); } void sctp_notify_stream_reset_tsn(struct sctp_tcb *stcb, uint32_t sending_tsn, uint32_t recv_tsn, int flag) { struct mbuf *m_notify; struct sctp_queued_to_read *control; struct sctp_assoc_reset_event *strasoc; if ((stcb == NULL) || (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_ASSOC_RESETEVNT))) { /* event not enabled */ return; } m_notify = sctp_get_mbuf_for_msg(sizeof(struct sctp_assoc_reset_event), 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) /* no space left */ return; SCTP_BUF_LEN(m_notify) = 0; strasoc = mtod(m_notify, struct sctp_assoc_reset_event *); memset(strasoc, 0, sizeof(struct sctp_assoc_reset_event)); strasoc->assocreset_type = SCTP_ASSOC_RESET_EVENT; strasoc->assocreset_flags = flag; strasoc->assocreset_length = sizeof(struct sctp_assoc_reset_event); strasoc->assocreset_assoc_id = sctp_get_associd(stcb); strasoc->assocreset_local_tsn = sending_tsn; strasoc->assocreset_remote_tsn = recv_tsn; SCTP_BUF_LEN(m_notify) = sizeof(struct sctp_assoc_reset_event); SCTP_BUF_NEXT(m_notify) = NULL; if (sctp_sbspace(&stcb->asoc, &stcb->sctp_socket->so_rcv) < SCTP_BUF_LEN(m_notify)) { /* no space */ sctp_m_freem(m_notify); return; } /* append to socket */ control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination, 0, 0, stcb->asoc.context, 0, 0, 0, m_notify); if (control == NULL) { /* no memory */ sctp_m_freem(m_notify); return; } control->length = SCTP_BUF_LEN(m_notify); control->spec_flags = M_NOTIFICATION; /* not that we need this */ control->tail_mbuf = m_notify; sctp_add_to_readq(stcb->sctp_ep, stcb, control, &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED); } static void sctp_notify_stream_reset(struct sctp_tcb *stcb, int number_entries, uint16_t *list, int flag) { struct mbuf *m_notify; struct sctp_queued_to_read *control; struct sctp_stream_reset_event *strreset; int len; if ((stcb == NULL) || (sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_STREAM_RESETEVNT))) { /* event not enabled */ return; } m_notify = sctp_get_mbuf_for_msg(MCLBYTES, 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) /* no space left */ return; SCTP_BUF_LEN(m_notify) = 0; len = sizeof(struct sctp_stream_reset_event) + (number_entries * sizeof(uint16_t)); if (len > M_TRAILINGSPACE(m_notify)) { /* never enough room */ sctp_m_freem(m_notify); return; } strreset = mtod(m_notify, struct sctp_stream_reset_event *); memset(strreset, 0, len); strreset->strreset_type = SCTP_STREAM_RESET_EVENT; strreset->strreset_flags = flag; strreset->strreset_length = len; strreset->strreset_assoc_id = sctp_get_associd(stcb); if (number_entries) { int i; for (i = 0; i < number_entries; i++) { strreset->strreset_stream_list[i] = ntohs(list[i]); } } SCTP_BUF_LEN(m_notify) = len; SCTP_BUF_NEXT(m_notify) = NULL; if (sctp_sbspace(&stcb->asoc, &stcb->sctp_socket->so_rcv) < SCTP_BUF_LEN(m_notify)) { /* no space */ sctp_m_freem(m_notify); return; } /* append to socket */ control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination, 0, 0, stcb->asoc.context, 0, 0, 0, m_notify); if (control == NULL) { /* no memory */ sctp_m_freem(m_notify); return; } control->length = SCTP_BUF_LEN(m_notify); control->spec_flags = M_NOTIFICATION; /* not that we need this */ control->tail_mbuf = m_notify; sctp_add_to_readq(stcb->sctp_ep, stcb, control, &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED); } static void sctp_notify_remote_error(struct sctp_tcb *stcb, uint16_t error, struct sctp_error_chunk *chunk) { struct mbuf *m_notify; struct sctp_remote_error *sre; struct sctp_queued_to_read *control; unsigned int notif_len; uint16_t chunk_len; if ((stcb == NULL) || sctp_stcb_is_feature_off(stcb->sctp_ep, stcb, SCTP_PCB_FLAGS_RECVPEERERR)) { return; } if (chunk != NULL) { chunk_len = ntohs(chunk->ch.chunk_length); /* * Only SCTP_CHUNK_BUFFER_SIZE are guaranteed to be * contiguous. */ if (chunk_len > SCTP_CHUNK_BUFFER_SIZE) { chunk_len = SCTP_CHUNK_BUFFER_SIZE; } } else { chunk_len = 0; } notif_len = (unsigned int)(sizeof(struct sctp_remote_error) + chunk_len); m_notify = sctp_get_mbuf_for_msg(notif_len, 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) { /* Retry with smaller value. */ notif_len = (unsigned int)sizeof(struct sctp_remote_error); m_notify = sctp_get_mbuf_for_msg(notif_len, 0, M_NOWAIT, 1, MT_DATA); if (m_notify == NULL) { return; } } SCTP_BUF_NEXT(m_notify) = NULL; sre = mtod(m_notify, struct sctp_remote_error *); memset(sre, 0, notif_len); sre->sre_type = SCTP_REMOTE_ERROR; sre->sre_flags = 0; sre->sre_length = sizeof(struct sctp_remote_error); sre->sre_error = error; sre->sre_assoc_id = sctp_get_associd(stcb); if (notif_len > sizeof(struct sctp_remote_error)) { memcpy(sre->sre_data, chunk, chunk_len); sre->sre_length += chunk_len; } SCTP_BUF_LEN(m_notify) = sre->sre_length; control = sctp_build_readq_entry(stcb, stcb->asoc.primary_destination, 0, 0, stcb->asoc.context, 0, 0, 0, m_notify); if (control != NULL) { control->length = SCTP_BUF_LEN(m_notify); control->spec_flags = M_NOTIFICATION; /* not that we need this */ control->tail_mbuf = m_notify; sctp_add_to_readq(stcb->sctp_ep, stcb, control, &stcb->sctp_socket->so_rcv, 1, SCTP_READ_LOCK_NOT_HELD, SCTP_SO_NOT_LOCKED); } else { sctp_m_freem(m_notify); } } void sctp_ulp_notify(uint32_t notification, struct sctp_tcb *stcb, uint32_t error, void *data, int so_locked) { if ((stcb == NULL) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET)) { /* If the socket is gone we are out of here */ return; } if (stcb->sctp_socket->so_rcv.sb_state & SBS_CANTRCVMORE) { return; } if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { if ((notification == SCTP_NOTIFY_INTERFACE_DOWN) || (notification == SCTP_NOTIFY_INTERFACE_UP) || (notification == SCTP_NOTIFY_INTERFACE_CONFIRMED)) { /* Don't report these in front states */ return; } } switch (notification) { case SCTP_NOTIFY_ASSOC_UP: if (stcb->asoc.assoc_up_sent == 0) { sctp_notify_assoc_change(SCTP_COMM_UP, stcb, error, NULL, 0, so_locked); stcb->asoc.assoc_up_sent = 1; } if (stcb->asoc.adaptation_needed && (stcb->asoc.adaptation_sent == 0)) { sctp_notify_adaptation_layer(stcb); } if (stcb->asoc.auth_supported == 0) { sctp_ulp_notify(SCTP_NOTIFY_NO_PEER_AUTH, stcb, 0, NULL, so_locked); } break; case SCTP_NOTIFY_ASSOC_DOWN: sctp_notify_assoc_change(SCTP_SHUTDOWN_COMP, stcb, error, NULL, 0, so_locked); break; case SCTP_NOTIFY_INTERFACE_DOWN: { struct sctp_nets *net; net = (struct sctp_nets *)data; sctp_notify_peer_addr_change(stcb, SCTP_ADDR_UNREACHABLE, (struct sockaddr *)&net->ro._l_addr, error, so_locked); break; } case SCTP_NOTIFY_INTERFACE_UP: { struct sctp_nets *net; net = (struct sctp_nets *)data; sctp_notify_peer_addr_change(stcb, SCTP_ADDR_AVAILABLE, (struct sockaddr *)&net->ro._l_addr, error, so_locked); break; } case SCTP_NOTIFY_INTERFACE_CONFIRMED: { struct sctp_nets *net; net = (struct sctp_nets *)data; sctp_notify_peer_addr_change(stcb, SCTP_ADDR_CONFIRMED, (struct sockaddr *)&net->ro._l_addr, error, so_locked); break; } case SCTP_NOTIFY_SPECIAL_SP_FAIL: sctp_notify_send_failed2(stcb, error, (struct sctp_stream_queue_pending *)data, so_locked); break; case SCTP_NOTIFY_SENT_DG_FAIL: sctp_notify_send_failed(stcb, 1, error, (struct sctp_tmit_chunk *)data, so_locked); break; case SCTP_NOTIFY_UNSENT_DG_FAIL: sctp_notify_send_failed(stcb, 0, error, (struct sctp_tmit_chunk *)data, so_locked); break; case SCTP_NOTIFY_PARTIAL_DELVIERY_INDICATION: { uint32_t val; val = *((uint32_t *)data); sctp_notify_partial_delivery_indication(stcb, error, val, so_locked); break; } case SCTP_NOTIFY_ASSOC_LOC_ABORTED: if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { sctp_notify_assoc_change(SCTP_CANT_STR_ASSOC, stcb, error, data, 0, so_locked); } else { sctp_notify_assoc_change(SCTP_COMM_LOST, stcb, error, data, 0, so_locked); } break; case SCTP_NOTIFY_ASSOC_REM_ABORTED: if ((SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_WAIT) || (SCTP_GET_STATE(stcb) == SCTP_STATE_COOKIE_ECHOED)) { sctp_notify_assoc_change(SCTP_CANT_STR_ASSOC, stcb, error, data, 1, so_locked); } else { sctp_notify_assoc_change(SCTP_COMM_LOST, stcb, error, data, 1, so_locked); } break; case SCTP_NOTIFY_ASSOC_RESTART: sctp_notify_assoc_change(SCTP_RESTART, stcb, error, NULL, 0, so_locked); if (stcb->asoc.auth_supported == 0) { sctp_ulp_notify(SCTP_NOTIFY_NO_PEER_AUTH, stcb, 0, NULL, so_locked); } break; case SCTP_NOTIFY_STR_RESET_SEND: sctp_notify_stream_reset(stcb, error, ((uint16_t *)data), SCTP_STREAM_RESET_OUTGOING_SSN); break; case SCTP_NOTIFY_STR_RESET_RECV: sctp_notify_stream_reset(stcb, error, ((uint16_t *)data), SCTP_STREAM_RESET_INCOMING); break; case SCTP_NOTIFY_STR_RESET_FAILED_OUT: sctp_notify_stream_reset(stcb, error, ((uint16_t *)data), (SCTP_STREAM_RESET_OUTGOING_SSN | SCTP_STREAM_RESET_FAILED)); break; case SCTP_NOTIFY_STR_RESET_DENIED_OUT: sctp_notify_stream_reset(stcb, error, ((uint16_t *)data), (SCTP_STREAM_RESET_OUTGOING_SSN | SCTP_STREAM_RESET_DENIED)); break; case SCTP_NOTIFY_STR_RESET_FAILED_IN: sctp_notify_stream_reset(stcb, error, ((uint16_t *)data), (SCTP_STREAM_RESET_INCOMING | SCTP_STREAM_RESET_FAILED)); break; case SCTP_NOTIFY_STR_RESET_DENIED_IN: sctp_notify_stream_reset(stcb, error, ((uint16_t *)data), (SCTP_STREAM_RESET_INCOMING | SCTP_STREAM_RESET_DENIED)); break; case SCTP_NOTIFY_ASCONF_ADD_IP: sctp_notify_peer_addr_change(stcb, SCTP_ADDR_ADDED, data, error, so_locked); break; case SCTP_NOTIFY_ASCONF_DELETE_IP: sctp_notify_peer_addr_change(stcb, SCTP_ADDR_REMOVED, data, error, so_locked); break; case SCTP_NOTIFY_ASCONF_SET_PRIMARY: sctp_notify_peer_addr_change(stcb, SCTP_ADDR_MADE_PRIM, data, error, so_locked); break; case SCTP_NOTIFY_PEER_SHUTDOWN: sctp_notify_shutdown_event(stcb); break; case SCTP_NOTIFY_AUTH_NEW_KEY: sctp_notify_authentication(stcb, SCTP_AUTH_NEW_KEY, error, (uint16_t)(uintptr_t)data, so_locked); break; case SCTP_NOTIFY_AUTH_FREE_KEY: sctp_notify_authentication(stcb, SCTP_AUTH_FREE_KEY, error, (uint16_t)(uintptr_t)data, so_locked); break; case SCTP_NOTIFY_NO_PEER_AUTH: sctp_notify_authentication(stcb, SCTP_AUTH_NO_AUTH, error, (uint16_t)(uintptr_t)data, so_locked); break; case SCTP_NOTIFY_SENDER_DRY: sctp_notify_sender_dry_event(stcb, so_locked); break; case SCTP_NOTIFY_REMOTE_ERROR: sctp_notify_remote_error(stcb, error, data); break; default: SCTPDBG(SCTP_DEBUG_UTIL1, "%s: unknown notification %xh (%u)\n", __func__, notification, notification); break; } /* end switch */ } void -sctp_report_all_outbound(struct sctp_tcb *stcb, uint16_t error, int holds_lock, int so_locked) +sctp_report_all_outbound(struct sctp_tcb *stcb, uint16_t error, int so_locked) { struct sctp_association *asoc; struct sctp_stream_out *outs; struct sctp_tmit_chunk *chk, *nchk; struct sctp_stream_queue_pending *sp, *nsp; int i; if (stcb == NULL) { return; } asoc = &stcb->asoc; if (asoc->state & SCTP_STATE_ABOUT_TO_BE_FREED) { /* already being freed */ return; } if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || (asoc->state & SCTP_STATE_CLOSED_SOCKET)) { return; } /* now through all the gunk freeing chunks */ - if (holds_lock == 0) { - SCTP_TCB_SEND_LOCK(stcb); - } /* sent queue SHOULD be empty */ TAILQ_FOREACH_SAFE(chk, &asoc->sent_queue, sctp_next, nchk) { TAILQ_REMOVE(&asoc->sent_queue, chk, sctp_next); asoc->sent_queue_cnt--; if (chk->sent != SCTP_DATAGRAM_NR_ACKED) { if (asoc->strmout[chk->rec.data.sid].chunks_on_queues > 0) { asoc->strmout[chk->rec.data.sid].chunks_on_queues--; #ifdef INVARIANTS } else { panic("No chunks on the queues for sid %u.", chk->rec.data.sid); #endif } } if (chk->data != NULL) { sctp_free_bufspace(stcb, asoc, chk, 1); sctp_ulp_notify(SCTP_NOTIFY_SENT_DG_FAIL, stcb, error, chk, so_locked); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } } sctp_free_a_chunk(stcb, chk, so_locked); /* sa_ignore FREED_MEMORY */ } /* pending send queue SHOULD be empty */ TAILQ_FOREACH_SAFE(chk, &asoc->send_queue, sctp_next, nchk) { TAILQ_REMOVE(&asoc->send_queue, chk, sctp_next); asoc->send_queue_cnt--; if (asoc->strmout[chk->rec.data.sid].chunks_on_queues > 0) { asoc->strmout[chk->rec.data.sid].chunks_on_queues--; #ifdef INVARIANTS } else { panic("No chunks on the queues for sid %u.", chk->rec.data.sid); #endif } if (chk->data != NULL) { sctp_free_bufspace(stcb, asoc, chk, 1); sctp_ulp_notify(SCTP_NOTIFY_UNSENT_DG_FAIL, stcb, error, chk, so_locked); if (chk->data) { sctp_m_freem(chk->data); chk->data = NULL; } } sctp_free_a_chunk(stcb, chk, so_locked); /* sa_ignore FREED_MEMORY */ } for (i = 0; i < asoc->streamoutcnt; i++) { /* For each stream */ outs = &asoc->strmout[i]; /* clean up any sends there */ TAILQ_FOREACH_SAFE(sp, &outs->outqueue, next, nsp) { atomic_subtract_int(&asoc->stream_queue_cnt, 1); TAILQ_REMOVE(&outs->outqueue, sp, next); stcb->asoc.ss_functions.sctp_ss_remove_from_stream(stcb, asoc, outs, sp, 1); sctp_free_spbufspace(stcb, asoc, sp); if (sp->data) { sctp_ulp_notify(SCTP_NOTIFY_SPECIAL_SP_FAIL, stcb, error, (void *)sp, so_locked); if (sp->data) { sctp_m_freem(sp->data); sp->data = NULL; sp->tail_mbuf = NULL; sp->length = 0; } } if (sp->net) { sctp_free_remote_addr(sp->net); sp->net = NULL; } /* Free the chunk */ sctp_free_a_strmoq(stcb, sp, so_locked); /* sa_ignore FREED_MEMORY */ } } - - if (holds_lock == 0) { - SCTP_TCB_SEND_UNLOCK(stcb); - } } void sctp_abort_notification(struct sctp_tcb *stcb, uint8_t from_peer, uint16_t error, struct sctp_abort_chunk *abort, int so_locked) { if (stcb == NULL) { return; } if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) || ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_CONNECTED))) { stcb->sctp_ep->sctp_flags |= SCTP_PCB_FLAGS_WAS_ABORTED; } if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || (stcb->asoc.state & SCTP_STATE_CLOSED_SOCKET)) { return; } + SCTP_TCB_SEND_LOCK(stcb); + SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_WAS_ABORTED); /* Tell them we lost the asoc */ - sctp_report_all_outbound(stcb, error, 0, so_locked); + sctp_report_all_outbound(stcb, error, so_locked); + SCTP_TCB_SEND_UNLOCK(stcb); if (from_peer) { sctp_ulp_notify(SCTP_NOTIFY_ASSOC_REM_ABORTED, stcb, error, abort, so_locked); } else { sctp_ulp_notify(SCTP_NOTIFY_ASSOC_LOC_ABORTED, stcb, error, abort, so_locked); } } void sctp_abort_association(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct mbuf *m, int iphlen, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct mbuf *op_err, uint8_t mflowtype, uint32_t mflowid, uint32_t vrf_id, uint16_t port) { uint32_t vtag; vtag = 0; if (stcb != NULL) { vtag = stcb->asoc.peer_vtag; vrf_id = stcb->asoc.vrf_id; } sctp_send_abort(m, iphlen, src, dst, sh, vtag, op_err, mflowtype, mflowid, inp->fibnum, vrf_id, port); if (stcb != NULL) { /* We have a TCB to abort, send notification too */ sctp_abort_notification(stcb, 0, 0, NULL, SCTP_SO_NOT_LOCKED); - SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_WAS_ABORTED); /* Ok, now lets free it */ SCTP_STAT_INCR_COUNTER32(sctps_aborted); if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTPUTIL + SCTP_LOC_4); } } #ifdef SCTP_ASOCLOG_OF_TSNS void sctp_print_out_track_log(struct sctp_tcb *stcb) { #ifdef NOSIY_PRINTS int i; SCTP_PRINTF("Last ep reason:%x\n", stcb->sctp_ep->last_abort_code); SCTP_PRINTF("IN bound TSN log-aaa\n"); if ((stcb->asoc.tsn_in_at == 0) && (stcb->asoc.tsn_in_wrapped == 0)) { SCTP_PRINTF("None rcvd\n"); goto none_in; } if (stcb->asoc.tsn_in_wrapped) { for (i = stcb->asoc.tsn_in_at; i < SCTP_TSN_LOG_SIZE; i++) { SCTP_PRINTF("TSN:%x strm:%d seq:%d flags:%x sz:%d\n", stcb->asoc.in_tsnlog[i].tsn, stcb->asoc.in_tsnlog[i].strm, stcb->asoc.in_tsnlog[i].seq, stcb->asoc.in_tsnlog[i].flgs, stcb->asoc.in_tsnlog[i].sz); } } if (stcb->asoc.tsn_in_at) { for (i = 0; i < stcb->asoc.tsn_in_at; i++) { SCTP_PRINTF("TSN:%x strm:%d seq:%d flags:%x sz:%d\n", stcb->asoc.in_tsnlog[i].tsn, stcb->asoc.in_tsnlog[i].strm, stcb->asoc.in_tsnlog[i].seq, stcb->asoc.in_tsnlog[i].flgs, stcb->asoc.in_tsnlog[i].sz); } } none_in: SCTP_PRINTF("OUT bound TSN log-aaa\n"); if ((stcb->asoc.tsn_out_at == 0) && (stcb->asoc.tsn_out_wrapped == 0)) { SCTP_PRINTF("None sent\n"); } if (stcb->asoc.tsn_out_wrapped) { for (i = stcb->asoc.tsn_out_at; i < SCTP_TSN_LOG_SIZE; i++) { SCTP_PRINTF("TSN:%x strm:%d seq:%d flags:%x sz:%d\n", stcb->asoc.out_tsnlog[i].tsn, stcb->asoc.out_tsnlog[i].strm, stcb->asoc.out_tsnlog[i].seq, stcb->asoc.out_tsnlog[i].flgs, stcb->asoc.out_tsnlog[i].sz); } } if (stcb->asoc.tsn_out_at) { for (i = 0; i < stcb->asoc.tsn_out_at; i++) { SCTP_PRINTF("TSN:%x strm:%d seq:%d flags:%x sz:%d\n", stcb->asoc.out_tsnlog[i].tsn, stcb->asoc.out_tsnlog[i].strm, stcb->asoc.out_tsnlog[i].seq, stcb->asoc.out_tsnlog[i].flgs, stcb->asoc.out_tsnlog[i].sz); } } #endif } #endif void sctp_abort_an_association(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct mbuf *op_err, int so_locked) { if (stcb == NULL) { /* Got to have a TCB */ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { if (LIST_EMPTY(&inp->sctp_asoc_list)) { sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT, SCTP_CALLED_DIRECTLY_NOCMPSET); } } return; - } else { - SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_WAS_ABORTED); } /* notify the peer */ sctp_send_abort_tcb(stcb, op_err, so_locked); SCTP_STAT_INCR_COUNTER32(sctps_aborted); if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) || (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) { SCTP_STAT_DECR_GAUGE32(sctps_currestab); } /* notify the ulp */ if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) { sctp_abort_notification(stcb, 0, 0, NULL, so_locked); } /* now free the asoc */ #ifdef SCTP_ASOCLOG_OF_TSNS sctp_print_out_track_log(stcb); #endif (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTPUTIL + SCTP_LOC_5); } void sctp_handle_ootb(struct mbuf *m, int iphlen, int offset, struct sockaddr *src, struct sockaddr *dst, struct sctphdr *sh, struct sctp_inpcb *inp, struct mbuf *cause, uint8_t mflowtype, uint32_t mflowid, uint16_t fibnum, uint32_t vrf_id, uint16_t port) { struct sctp_chunkhdr *ch, chunk_buf; unsigned int chk_length; int contains_init_chunk; SCTP_STAT_INCR_COUNTER32(sctps_outoftheblue); /* Generate a TO address for future reference */ if (inp && (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) { if (LIST_EMPTY(&inp->sctp_asoc_list)) { sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT, SCTP_CALLED_DIRECTLY_NOCMPSET); } } contains_init_chunk = 0; ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, offset, sizeof(*ch), (uint8_t *)&chunk_buf); while (ch != NULL) { chk_length = ntohs(ch->chunk_length); if (chk_length < sizeof(*ch)) { /* break to abort land */ break; } switch (ch->chunk_type) { case SCTP_INIT: contains_init_chunk = 1; break; case SCTP_PACKET_DROPPED: /* we don't respond to pkt-dropped */ return; case SCTP_ABORT_ASSOCIATION: /* we don't respond with an ABORT to an ABORT */ return; case SCTP_SHUTDOWN_COMPLETE: /* * we ignore it since we are not waiting for it and * peer is gone */ return; case SCTP_SHUTDOWN_ACK: sctp_send_shutdown_complete2(src, dst, sh, mflowtype, mflowid, fibnum, vrf_id, port); return; default: break; } offset += SCTP_SIZE32(chk_length); ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, offset, sizeof(*ch), (uint8_t *)&chunk_buf); } if ((SCTP_BASE_SYSCTL(sctp_blackhole) == 0) || ((SCTP_BASE_SYSCTL(sctp_blackhole) == 1) && (contains_init_chunk == 0))) { sctp_send_abort(m, iphlen, src, dst, sh, 0, cause, mflowtype, mflowid, fibnum, vrf_id, port); } } /* * check the inbound datagram to make sure there is not an abort inside it, * if there is return 1, else return 0. */ int sctp_is_there_an_abort_here(struct mbuf *m, int iphlen, uint32_t *vtagfill) { struct sctp_chunkhdr *ch; struct sctp_init_chunk *init_chk, chunk_buf; int offset; unsigned int chk_length; offset = iphlen + sizeof(struct sctphdr); ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, offset, sizeof(*ch), (uint8_t *)&chunk_buf); while (ch != NULL) { chk_length = ntohs(ch->chunk_length); if (chk_length < sizeof(*ch)) { /* packet is probably corrupt */ break; } /* we seem to be ok, is it an abort? */ if (ch->chunk_type == SCTP_ABORT_ASSOCIATION) { /* yep, tell them */ return (1); } if (ch->chunk_type == SCTP_INITIATION) { /* need to update the Vtag */ init_chk = (struct sctp_init_chunk *)sctp_m_getptr(m, offset, sizeof(*init_chk), (uint8_t *)&chunk_buf); if (init_chk != NULL) { *vtagfill = ntohl(init_chk->init.initiate_tag); } } /* Nope, move to the next chunk */ offset += SCTP_SIZE32(chk_length); ch = (struct sctp_chunkhdr *)sctp_m_getptr(m, offset, sizeof(*ch), (uint8_t *)&chunk_buf); } return (0); } /* * currently (2/02), ifa_addr embeds scope_id's and don't have sin6_scope_id * set (i.e. it's 0) so, create this function to compare link local scopes */ #ifdef INET6 uint32_t sctp_is_same_scope(struct sockaddr_in6 *addr1, struct sockaddr_in6 *addr2) { struct sockaddr_in6 a, b; /* save copies */ a = *addr1; b = *addr2; if (a.sin6_scope_id == 0) if (sa6_recoverscope(&a)) { /* can't get scope, so can't match */ return (0); } if (b.sin6_scope_id == 0) if (sa6_recoverscope(&b)) { /* can't get scope, so can't match */ return (0); } if (a.sin6_scope_id != b.sin6_scope_id) return (0); return (1); } /* * returns a sockaddr_in6 with embedded scope recovered and removed */ struct sockaddr_in6 * sctp_recover_scope(struct sockaddr_in6 *addr, struct sockaddr_in6 *store) { /* check and strip embedded scope junk */ if (addr->sin6_family == AF_INET6) { if (IN6_IS_SCOPE_LINKLOCAL(&addr->sin6_addr)) { if (addr->sin6_scope_id == 0) { *store = *addr; if (!sa6_recoverscope(store)) { /* use the recovered scope */ addr = store; } } else { /* else, return the original "to" addr */ in6_clearscope(&addr->sin6_addr); } } } return (addr); } #endif /* * are the two addresses the same? currently a "scopeless" check returns: 1 * if same, 0 if not */ int sctp_cmpaddr(struct sockaddr *sa1, struct sockaddr *sa2) { /* must be valid */ if (sa1 == NULL || sa2 == NULL) return (0); /* must be the same family */ if (sa1->sa_family != sa2->sa_family) return (0); switch (sa1->sa_family) { #ifdef INET6 case AF_INET6: { /* IPv6 addresses */ struct sockaddr_in6 *sin6_1, *sin6_2; sin6_1 = (struct sockaddr_in6 *)sa1; sin6_2 = (struct sockaddr_in6 *)sa2; return (SCTP6_ARE_ADDR_EQUAL(sin6_1, sin6_2)); } #endif #ifdef INET case AF_INET: { /* IPv4 addresses */ struct sockaddr_in *sin_1, *sin_2; sin_1 = (struct sockaddr_in *)sa1; sin_2 = (struct sockaddr_in *)sa2; return (sin_1->sin_addr.s_addr == sin_2->sin_addr.s_addr); } #endif default: /* we don't do these... */ return (0); } } void sctp_print_address(struct sockaddr *sa) { #ifdef INET6 char ip6buf[INET6_ADDRSTRLEN]; #endif switch (sa->sa_family) { #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)sa; SCTP_PRINTF("IPv6 address: %s:port:%d scope:%u\n", ip6_sprintf(ip6buf, &sin6->sin6_addr), ntohs(sin6->sin6_port), sin6->sin6_scope_id); break; } #endif #ifdef INET case AF_INET: { struct sockaddr_in *sin; unsigned char *p; sin = (struct sockaddr_in *)sa; p = (unsigned char *)&sin->sin_addr; SCTP_PRINTF("IPv4 address: %u.%u.%u.%u:%d\n", p[0], p[1], p[2], p[3], ntohs(sin->sin_port)); break; } #endif default: SCTP_PRINTF("?\n"); break; } } void sctp_pull_off_control_to_new_inp(struct sctp_inpcb *old_inp, struct sctp_inpcb *new_inp, struct sctp_tcb *stcb, int waitflags) { /* * go through our old INP and pull off any control structures that * belong to stcb and move then to the new inp. */ struct socket *old_so, *new_so; struct sctp_queued_to_read *control, *nctl; struct sctp_readhead tmp_queue; struct mbuf *m; int error = 0; old_so = old_inp->sctp_socket; new_so = new_inp->sctp_socket; TAILQ_INIT(&tmp_queue); error = sblock(&old_so->so_rcv, waitflags); if (error) { /* * Gak, can't get sblock, we have a problem. data will be * left stranded.. and we don't dare look at it since the * other thread may be reading something. Oh well, its a * screwed up app that does a peeloff OR a accept while * reading from the main socket... actually its only the * peeloff() case, since I think read will fail on a * listening socket.. */ return; } /* lock the socket buffers */ SCTP_INP_READ_LOCK(old_inp); TAILQ_FOREACH_SAFE(control, &old_inp->read_queue, next, nctl) { /* Pull off all for out target stcb */ if (control->stcb == stcb) { /* remove it we want it */ TAILQ_REMOVE(&old_inp->read_queue, control, next); TAILQ_INSERT_TAIL(&tmp_queue, control, next); m = control->data; while (m) { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { sctp_sblog(&old_so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBFREE, SCTP_BUF_LEN(m)); } sctp_sbfree(control, stcb, &old_so->so_rcv, m); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { sctp_sblog(&old_so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0); } m = SCTP_BUF_NEXT(m); } } } SCTP_INP_READ_UNLOCK(old_inp); /* Remove the sb-lock on the old socket */ sbunlock(&old_so->so_rcv); /* Now we move them over to the new socket buffer */ SCTP_INP_READ_LOCK(new_inp); TAILQ_FOREACH_SAFE(control, &tmp_queue, next, nctl) { TAILQ_INSERT_TAIL(&new_inp->read_queue, control, next); m = control->data; while (m) { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { sctp_sblog(&new_so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBALLOC, SCTP_BUF_LEN(m)); } sctp_sballoc(stcb, &new_so->so_rcv, m); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { sctp_sblog(&new_so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0); } m = SCTP_BUF_NEXT(m); } } SCTP_INP_READ_UNLOCK(new_inp); } void sctp_wakeup_the_read_socket(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked SCTP_UNUSED ) { if ((inp != NULL) && (inp->sctp_socket != NULL)) { sctp_sorwakeup(inp, inp->sctp_socket); } } void sctp_add_to_readq(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_queued_to_read *control, struct sockbuf *sb, int end, int inp_read_lock_held, int so_locked) { /* * Here we must place the control on the end of the socket read * queue AND increment sb_cc so that select will work properly on * read. */ struct mbuf *m, *prev = NULL; if (inp == NULL) { /* Gak, TSNH!! */ #ifdef INVARIANTS panic("Gak, inp NULL on add_to_readq"); #endif return; } if (inp_read_lock_held == 0) SCTP_INP_READ_LOCK(inp); if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_CANT_READ) { if (!control->on_strm_q) { sctp_free_remote_addr(control->whoFrom); if (control->data) { sctp_m_freem(control->data); control->data = NULL; } sctp_free_a_readq(stcb, control); } if (inp_read_lock_held == 0) SCTP_INP_READ_UNLOCK(inp); return; } if (!(control->spec_flags & M_NOTIFICATION)) { atomic_add_int(&inp->total_recvs, 1); if (!control->do_not_ref_stcb) { atomic_add_int(&stcb->total_recvs, 1); } } m = control->data; control->held_length = 0; control->length = 0; while (m) { if (SCTP_BUF_LEN(m) == 0) { /* Skip mbufs with NO length */ if (prev == NULL) { /* First one */ control->data = sctp_m_free(m); m = control->data; } else { SCTP_BUF_NEXT(prev) = sctp_m_free(m); m = SCTP_BUF_NEXT(prev); } if (m == NULL) { control->tail_mbuf = prev; } continue; } prev = m; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { sctp_sblog(sb, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBALLOC, SCTP_BUF_LEN(m)); } sctp_sballoc(stcb, sb, m); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { sctp_sblog(sb, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0); } atomic_add_int(&control->length, SCTP_BUF_LEN(m)); m = SCTP_BUF_NEXT(m); } if (prev != NULL) { control->tail_mbuf = prev; } else { /* Everything got collapsed out?? */ if (!control->on_strm_q) { sctp_free_remote_addr(control->whoFrom); sctp_free_a_readq(stcb, control); } if (inp_read_lock_held == 0) SCTP_INP_READ_UNLOCK(inp); return; } if (end) { control->end_added = 1; } TAILQ_INSERT_TAIL(&inp->read_queue, control, next); control->on_read_q = 1; if (inp_read_lock_held == 0) SCTP_INP_READ_UNLOCK(inp); if (inp && inp->sctp_socket) { sctp_wakeup_the_read_socket(inp, stcb, so_locked); } } /*************HOLD THIS COMMENT FOR PATCH FILE OF *************ALTERNATE ROUTING CODE */ /*************HOLD THIS COMMENT FOR END OF PATCH FILE OF *************ALTERNATE ROUTING CODE */ struct mbuf * sctp_generate_cause(uint16_t code, char *info) { struct mbuf *m; struct sctp_gen_error_cause *cause; size_t info_len; uint16_t len; if ((code == 0) || (info == NULL)) { return (NULL); } info_len = strlen(info); if (info_len > (SCTP_MAX_CAUSE_LENGTH - sizeof(struct sctp_paramhdr))) { return (NULL); } len = (uint16_t)(sizeof(struct sctp_paramhdr) + info_len); m = sctp_get_mbuf_for_msg(len, 0, M_NOWAIT, 1, MT_DATA); if (m != NULL) { SCTP_BUF_LEN(m) = len; cause = mtod(m, struct sctp_gen_error_cause *); cause->code = htons(code); cause->length = htons(len); memcpy(cause->info, info, info_len); } return (m); } struct mbuf * sctp_generate_no_user_data_cause(uint32_t tsn) { struct mbuf *m; struct sctp_error_no_user_data *no_user_data_cause; uint16_t len; len = (uint16_t)sizeof(struct sctp_error_no_user_data); m = sctp_get_mbuf_for_msg(len, 0, M_NOWAIT, 1, MT_DATA); if (m != NULL) { SCTP_BUF_LEN(m) = len; no_user_data_cause = mtod(m, struct sctp_error_no_user_data *); no_user_data_cause->cause.code = htons(SCTP_CAUSE_NO_USER_DATA); no_user_data_cause->cause.length = htons(len); no_user_data_cause->tsn = htonl(tsn); } return (m); } #ifdef SCTP_MBCNT_LOGGING void sctp_free_bufspace(struct sctp_tcb *stcb, struct sctp_association *asoc, struct sctp_tmit_chunk *tp1, int chk_cnt) { if (tp1->data == NULL) { return; } asoc->chunks_on_out_queue -= chk_cnt; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBCNT_LOGGING_ENABLE) { sctp_log_mbcnt(SCTP_LOG_MBCNT_DECREASE, asoc->total_output_queue_size, tp1->book_size, 0, tp1->mbcnt); } if (asoc->total_output_queue_size >= tp1->book_size) { atomic_add_int(&asoc->total_output_queue_size, -tp1->book_size); } else { asoc->total_output_queue_size = 0; } if (stcb->sctp_socket && (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) || ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE)))) { if (stcb->sctp_socket->so_snd.sb_cc >= tp1->book_size) { stcb->sctp_socket->so_snd.sb_cc -= tp1->book_size; } else { stcb->sctp_socket->so_snd.sb_cc = 0; } } } #endif int sctp_release_pr_sctp_chunk(struct sctp_tcb *stcb, struct sctp_tmit_chunk *tp1, uint8_t sent, int so_locked) { struct sctp_stream_out *strq; struct sctp_tmit_chunk *chk = NULL, *tp2; struct sctp_stream_queue_pending *sp; uint32_t mid; uint16_t sid; uint8_t foundeom = 0; int ret_sz = 0; int notdone; int do_wakeup_routine = 0; sid = tp1->rec.data.sid; mid = tp1->rec.data.mid; if (sent || !(tp1->rec.data.rcv_flags & SCTP_DATA_FIRST_FRAG)) { stcb->asoc.abandoned_sent[0]++; stcb->asoc.abandoned_sent[PR_SCTP_POLICY(tp1->flags)]++; stcb->asoc.strmout[sid].abandoned_sent[0]++; #if defined(SCTP_DETAILED_STR_STATS) stcb->asoc.strmout[sid].abandoned_sent[PR_SCTP_POLICY(tp1->flags)]++; #endif } else { stcb->asoc.abandoned_unsent[0]++; stcb->asoc.abandoned_unsent[PR_SCTP_POLICY(tp1->flags)]++; stcb->asoc.strmout[sid].abandoned_unsent[0]++; #if defined(SCTP_DETAILED_STR_STATS) stcb->asoc.strmout[sid].abandoned_unsent[PR_SCTP_POLICY(tp1->flags)]++; #endif } do { ret_sz += tp1->book_size; if (tp1->data != NULL) { if (tp1->sent < SCTP_DATAGRAM_RESEND) { sctp_flight_size_decrease(tp1); sctp_total_flight_decrease(stcb, tp1); } sctp_free_bufspace(stcb, &stcb->asoc, tp1, 1); stcb->asoc.peers_rwnd += tp1->send_size; stcb->asoc.peers_rwnd += SCTP_BASE_SYSCTL(sctp_peer_chunk_oh); if (sent) { sctp_ulp_notify(SCTP_NOTIFY_SENT_DG_FAIL, stcb, 0, tp1, so_locked); } else { sctp_ulp_notify(SCTP_NOTIFY_UNSENT_DG_FAIL, stcb, 0, tp1, so_locked); } if (tp1->data) { sctp_m_freem(tp1->data); tp1->data = NULL; } do_wakeup_routine = 1; if (PR_SCTP_BUF_ENABLED(tp1->flags)) { stcb->asoc.sent_queue_cnt_removeable--; } } tp1->sent = SCTP_FORWARD_TSN_SKIP; if ((tp1->rec.data.rcv_flags & SCTP_DATA_NOT_FRAG) == SCTP_DATA_NOT_FRAG) { /* not frag'ed we ae done */ notdone = 0; foundeom = 1; } else if (tp1->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) { /* end of frag, we are done */ notdone = 0; foundeom = 1; } else { /* * Its a begin or middle piece, we must mark all of * it */ notdone = 1; tp1 = TAILQ_NEXT(tp1, sctp_next); } } while (tp1 && notdone); if (foundeom == 0) { /* * The multi-part message was scattered across the send and * sent queue. */ TAILQ_FOREACH_SAFE(tp1, &stcb->asoc.send_queue, sctp_next, tp2) { if ((tp1->rec.data.sid != sid) || (!SCTP_MID_EQ(stcb->asoc.idata_supported, tp1->rec.data.mid, mid))) { break; } /* * save to chk in case we have some on stream out * queue. If so and we have an un-transmitted one we * don't have to fudge the TSN. */ chk = tp1; ret_sz += tp1->book_size; sctp_free_bufspace(stcb, &stcb->asoc, tp1, 1); if (sent) { sctp_ulp_notify(SCTP_NOTIFY_SENT_DG_FAIL, stcb, 0, tp1, so_locked); } else { sctp_ulp_notify(SCTP_NOTIFY_UNSENT_DG_FAIL, stcb, 0, tp1, so_locked); } if (tp1->data) { sctp_m_freem(tp1->data); tp1->data = NULL; } /* No flight involved here book the size to 0 */ tp1->book_size = 0; if (tp1->rec.data.rcv_flags & SCTP_DATA_LAST_FRAG) { foundeom = 1; } do_wakeup_routine = 1; tp1->sent = SCTP_FORWARD_TSN_SKIP; TAILQ_REMOVE(&stcb->asoc.send_queue, tp1, sctp_next); /* * on to the sent queue so we can wait for it to be * passed by. */ TAILQ_INSERT_TAIL(&stcb->asoc.sent_queue, tp1, sctp_next); stcb->asoc.send_queue_cnt--; stcb->asoc.sent_queue_cnt++; } } if (foundeom == 0) { /* * Still no eom found. That means there is stuff left on the * stream out queue.. yuck. */ SCTP_TCB_SEND_LOCK(stcb); strq = &stcb->asoc.strmout[sid]; sp = TAILQ_FIRST(&strq->outqueue); if (sp != NULL) { sp->discard_rest = 1; /* * We may need to put a chunk on the queue that * holds the TSN that would have been sent with the * LAST bit. */ if (chk == NULL) { /* Yep, we have to */ sctp_alloc_a_chunk(stcb, chk); if (chk == NULL) { /* * we are hosed. All we can do is * nothing.. which will cause an * abort if the peer is paying * attention. */ goto oh_well; } memset(chk, 0, sizeof(*chk)); chk->rec.data.rcv_flags = 0; chk->sent = SCTP_FORWARD_TSN_SKIP; chk->asoc = &stcb->asoc; if (stcb->asoc.idata_supported == 0) { if (sp->sinfo_flags & SCTP_UNORDERED) { chk->rec.data.mid = 0; } else { chk->rec.data.mid = strq->next_mid_ordered; } } else { if (sp->sinfo_flags & SCTP_UNORDERED) { chk->rec.data.mid = strq->next_mid_unordered; } else { chk->rec.data.mid = strq->next_mid_ordered; } } chk->rec.data.sid = sp->sid; chk->rec.data.ppid = sp->ppid; chk->rec.data.context = sp->context; chk->flags = sp->act_flags; chk->whoTo = NULL; chk->rec.data.tsn = atomic_fetchadd_int(&stcb->asoc.sending_seq, 1); strq->chunks_on_queues++; TAILQ_INSERT_TAIL(&stcb->asoc.sent_queue, chk, sctp_next); stcb->asoc.sent_queue_cnt++; stcb->asoc.pr_sctp_cnt++; } chk->rec.data.rcv_flags |= SCTP_DATA_LAST_FRAG; if (sp->sinfo_flags & SCTP_UNORDERED) { chk->rec.data.rcv_flags |= SCTP_DATA_UNORDERED; } if (stcb->asoc.idata_supported == 0) { if ((sp->sinfo_flags & SCTP_UNORDERED) == 0) { strq->next_mid_ordered++; } } else { if (sp->sinfo_flags & SCTP_UNORDERED) { strq->next_mid_unordered++; } else { strq->next_mid_ordered++; } } oh_well: if (sp->data) { /* * Pull any data to free up the SB and allow * sender to "add more" while we will throw * away :-) */ sctp_free_spbufspace(stcb, &stcb->asoc, sp); ret_sz += sp->length; do_wakeup_routine = 1; sp->some_taken = 1; sctp_m_freem(sp->data); sp->data = NULL; sp->tail_mbuf = NULL; sp->length = 0; } } SCTP_TCB_SEND_UNLOCK(stcb); } if (do_wakeup_routine) { sctp_sowwakeup(stcb->sctp_ep, stcb->sctp_socket); } return (ret_sz); } /* * checks to see if the given address, sa, is one that is currently known by * the kernel note: can't distinguish the same address on multiple interfaces * and doesn't handle multiple addresses with different zone/scope id's note: * ifa_ifwithaddr() compares the entire sockaddr struct */ struct sctp_ifa * sctp_find_ifa_in_ep(struct sctp_inpcb *inp, struct sockaddr *addr, int holds_lock) { struct sctp_laddr *laddr; if (holds_lock == 0) { SCTP_INP_RLOCK(inp); } LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) { if (laddr->ifa == NULL) continue; if (addr->sa_family != laddr->ifa->address.sa.sa_family) continue; #ifdef INET if (addr->sa_family == AF_INET) { if (((struct sockaddr_in *)addr)->sin_addr.s_addr == laddr->ifa->address.sin.sin_addr.s_addr) { /* found him. */ break; } } #endif #ifdef INET6 if (addr->sa_family == AF_INET6) { if (SCTP6_ARE_ADDR_EQUAL((struct sockaddr_in6 *)addr, &laddr->ifa->address.sin6)) { /* found him. */ break; } } #endif } if (holds_lock == 0) { SCTP_INP_RUNLOCK(inp); } if (laddr != NULL) { return (laddr->ifa); } else { return (NULL); } } uint32_t sctp_get_ifa_hash_val(struct sockaddr *addr) { switch (addr->sa_family) { #ifdef INET case AF_INET: { struct sockaddr_in *sin; sin = (struct sockaddr_in *)addr; return (sin->sin_addr.s_addr ^ (sin->sin_addr.s_addr >> 16)); } #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; uint32_t hash_of_addr; sin6 = (struct sockaddr_in6 *)addr; hash_of_addr = (sin6->sin6_addr.s6_addr32[0] + sin6->sin6_addr.s6_addr32[1] + sin6->sin6_addr.s6_addr32[2] + sin6->sin6_addr.s6_addr32[3]); hash_of_addr = (hash_of_addr ^ (hash_of_addr >> 16)); return (hash_of_addr); } #endif default: break; } return (0); } struct sctp_ifa * sctp_find_ifa_by_addr(struct sockaddr *addr, uint32_t vrf_id, int holds_lock) { struct sctp_ifa *sctp_ifap; struct sctp_vrf *vrf; struct sctp_ifalist *hash_head; uint32_t hash_of_addr; if (holds_lock == 0) { SCTP_IPI_ADDR_RLOCK(); } else { SCTP_IPI_ADDR_LOCK_ASSERT(); } vrf = sctp_find_vrf(vrf_id); if (vrf == NULL) { if (holds_lock == 0) SCTP_IPI_ADDR_RUNLOCK(); return (NULL); } hash_of_addr = sctp_get_ifa_hash_val(addr); hash_head = &vrf->vrf_addr_hash[(hash_of_addr & vrf->vrf_addr_hashmark)]; if (hash_head == NULL) { SCTP_PRINTF("hash_of_addr:%x mask:%x table:%x - ", hash_of_addr, (uint32_t)vrf->vrf_addr_hashmark, (uint32_t)(hash_of_addr & vrf->vrf_addr_hashmark)); sctp_print_address(addr); SCTP_PRINTF("No such bucket for address\n"); if (holds_lock == 0) SCTP_IPI_ADDR_RUNLOCK(); return (NULL); } LIST_FOREACH(sctp_ifap, hash_head, next_bucket) { if (addr->sa_family != sctp_ifap->address.sa.sa_family) continue; #ifdef INET if (addr->sa_family == AF_INET) { if (((struct sockaddr_in *)addr)->sin_addr.s_addr == sctp_ifap->address.sin.sin_addr.s_addr) { /* found him. */ break; } } #endif #ifdef INET6 if (addr->sa_family == AF_INET6) { if (SCTP6_ARE_ADDR_EQUAL((struct sockaddr_in6 *)addr, &sctp_ifap->address.sin6)) { /* found him. */ break; } } #endif } if (holds_lock == 0) SCTP_IPI_ADDR_RUNLOCK(); return (sctp_ifap); } static void sctp_user_rcvd(struct sctp_tcb *stcb, uint32_t *freed_so_far, int hold_rlock, uint32_t rwnd_req) { /* User pulled some data, do we need a rwnd update? */ struct epoch_tracker et; int r_unlocked = 0; uint32_t dif, rwnd; struct socket *so = NULL; if (stcb == NULL) return; atomic_add_int(&stcb->asoc.refcnt, 1); if ((SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_ACK_SENT) || (stcb->asoc.state & (SCTP_STATE_ABOUT_TO_BE_FREED | SCTP_STATE_SHUTDOWN_RECEIVED))) { /* Pre-check If we are freeing no update */ goto no_lock; } SCTP_INP_INCR_REF(stcb->sctp_ep); if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE)) { goto out; } so = stcb->sctp_socket; if (so == NULL) { goto out; } atomic_add_int(&stcb->freed_by_sorcv_sincelast, *freed_so_far); /* Have you have freed enough to look */ *freed_so_far = 0; /* Yep, its worth a look and the lock overhead */ /* Figure out what the rwnd would be */ rwnd = sctp_calc_rwnd(stcb, &stcb->asoc); if (rwnd >= stcb->asoc.my_last_reported_rwnd) { dif = rwnd - stcb->asoc.my_last_reported_rwnd; } else { dif = 0; } if (dif >= rwnd_req) { if (hold_rlock) { SCTP_INP_READ_UNLOCK(stcb->sctp_ep); r_unlocked = 1; } if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { /* * One last check before we allow the guy possibly * to get in. There is a race, where the guy has not * reached the gate. In that case */ goto out; } SCTP_TCB_LOCK(stcb); if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { /* No reports here */ SCTP_TCB_UNLOCK(stcb); goto out; } SCTP_STAT_INCR(sctps_wu_sacks_sent); NET_EPOCH_ENTER(et); sctp_send_sack(stcb, SCTP_SO_LOCKED); sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_USR_RCVD, SCTP_SO_LOCKED); /* make sure no timer is running */ NET_EPOCH_EXIT(et); sctp_timer_stop(SCTP_TIMER_TYPE_RECV, stcb->sctp_ep, stcb, NULL, SCTP_FROM_SCTPUTIL + SCTP_LOC_6); SCTP_TCB_UNLOCK(stcb); } else { /* Update how much we have pending */ stcb->freed_by_sorcv_sincelast = dif; } out: if (so && r_unlocked && hold_rlock) { SCTP_INP_READ_LOCK(stcb->sctp_ep); } SCTP_INP_DECR_REF(stcb->sctp_ep); no_lock: atomic_add_int(&stcb->asoc.refcnt, -1); return; } int sctp_sorecvmsg(struct socket *so, struct uio *uio, struct mbuf **mp, struct sockaddr *from, int fromlen, int *msg_flags, struct sctp_sndrcvinfo *sinfo, int filling_sinfo) { /* * MSG flags we will look at MSG_DONTWAIT - non-blocking IO. * MSG_PEEK - Look don't touch :-D (only valid with OUT mbuf copy * mp=NULL thus uio is the copy method to userland) MSG_WAITALL - ?? * On the way out we may send out any combination of: * MSG_NOTIFICATION MSG_EOR * */ struct sctp_inpcb *inp = NULL; ssize_t my_len = 0; ssize_t cp_len = 0; int error = 0; struct sctp_queued_to_read *control = NULL, *ctl = NULL, *nxt = NULL; struct mbuf *m = NULL; struct sctp_tcb *stcb = NULL; int wakeup_read_socket = 0; int freecnt_applied = 0; int out_flags = 0, in_flags = 0; int block_allowed = 1; uint32_t freed_so_far = 0; ssize_t copied_so_far = 0; int in_eeor_mode = 0; int no_rcv_needed = 0; uint32_t rwnd_req = 0; int hold_sblock = 0; int hold_rlock = 0; ssize_t slen = 0; uint32_t held_length = 0; int sockbuf_lock = 0; if (uio == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); return (EINVAL); } if (msg_flags) { in_flags = *msg_flags; if (in_flags & MSG_PEEK) SCTP_STAT_INCR(sctps_read_peeks); } else { in_flags = 0; } slen = uio->uio_resid; /* Pull in and set up our int flags */ if (in_flags & MSG_OOB) { /* Out of band's NOT supported */ return (EOPNOTSUPP); } if ((in_flags & MSG_PEEK) && (mp != NULL)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); return (EINVAL); } if ((in_flags & (MSG_DONTWAIT | MSG_NBIO )) || SCTP_SO_IS_NBIO(so)) { block_allowed = 0; } /* setup the endpoint */ inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTPUTIL, EFAULT); return (EFAULT); } rwnd_req = (SCTP_SB_LIMIT_RCV(so) >> SCTP_RWND_HIWAT_SHIFT); /* Must be at least a MTU's worth */ if (rwnd_req < SCTP_MIN_RWND) rwnd_req = SCTP_MIN_RWND; in_eeor_mode = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RECV_RWND_LOGGING_ENABLE) { sctp_misc_ints(SCTP_SORECV_ENTER, rwnd_req, in_eeor_mode, so->so_rcv.sb_cc, (uint32_t)uio->uio_resid); } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RECV_RWND_LOGGING_ENABLE) { sctp_misc_ints(SCTP_SORECV_ENTERPL, rwnd_req, block_allowed, so->so_rcv.sb_cc, (uint32_t)uio->uio_resid); } error = sblock(&so->so_rcv, (block_allowed ? SBL_WAIT : 0)); if (error) { goto release_unlocked; } sockbuf_lock = 1; restart: restart_nosblocks: if (hold_sblock == 0) { SOCKBUF_LOCK(&so->so_rcv); hold_sblock = 1; } if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE)) { goto out; } if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) && (so->so_rcv.sb_cc == 0)) { if (so->so_error) { error = so->so_error; if ((in_flags & MSG_PEEK) == 0) so->so_error = 0; goto out; } else { if (so->so_rcv.sb_cc == 0) { /* indicate EOF */ error = 0; goto out; } } } if (so->so_rcv.sb_cc <= held_length) { if (so->so_error) { error = so->so_error; if ((in_flags & MSG_PEEK) == 0) { so->so_error = 0; } goto out; } if ((so->so_rcv.sb_cc == 0) && ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { if ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) == 0) { /* * For active open side clear flags for * re-use passive open is blocked by * connect. */ if (inp->sctp_flags & SCTP_PCB_FLAGS_WAS_ABORTED) { /* * You were aborted, passive side * always hits here */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, ECONNRESET); error = ECONNRESET; } so->so_state &= ~(SS_ISCONNECTING | SS_ISDISCONNECTING | SS_ISCONFIRMING | SS_ISCONNECTED); if (error == 0) { if ((inp->sctp_flags & SCTP_PCB_FLAGS_WAS_CONNECTED) == 0) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, ENOTCONN); error = ENOTCONN; } } goto out; } } if (block_allowed) { error = sbwait(&so->so_rcv); if (error) { goto out; } held_length = 0; goto restart_nosblocks; } else { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EWOULDBLOCK); error = EWOULDBLOCK; goto out; } } if (hold_sblock == 1) { SOCKBUF_UNLOCK(&so->so_rcv); hold_sblock = 0; } /* we possibly have data we can read */ /* sa_ignore FREED_MEMORY */ control = TAILQ_FIRST(&inp->read_queue); if (control == NULL) { /* * This could be happening since the appender did the * increment but as not yet did the tailq insert onto the * read_queue */ if (hold_rlock == 0) { SCTP_INP_READ_LOCK(inp); } control = TAILQ_FIRST(&inp->read_queue); if ((control == NULL) && (so->so_rcv.sb_cc != 0)) { #ifdef INVARIANTS panic("Huh, its non zero and nothing on control?"); #endif so->so_rcv.sb_cc = 0; } SCTP_INP_READ_UNLOCK(inp); hold_rlock = 0; goto restart; } if ((control->length == 0) && (control->do_not_ref_stcb)) { /* * Clean up code for freeing assoc that left behind a * pdapi.. maybe a peer in EEOR that just closed after * sending and never indicated a EOR. */ if (hold_rlock == 0) { hold_rlock = 1; SCTP_INP_READ_LOCK(inp); } control->held_length = 0; if (control->data) { /* Hmm there is data here .. fix */ struct mbuf *m_tmp; int cnt = 0; m_tmp = control->data; while (m_tmp) { cnt += SCTP_BUF_LEN(m_tmp); if (SCTP_BUF_NEXT(m_tmp) == NULL) { control->tail_mbuf = m_tmp; control->end_added = 1; } m_tmp = SCTP_BUF_NEXT(m_tmp); } control->length = cnt; } else { /* remove it */ TAILQ_REMOVE(&inp->read_queue, control, next); /* Add back any hiddend data */ sctp_free_remote_addr(control->whoFrom); sctp_free_a_readq(stcb, control); } if (hold_rlock) { hold_rlock = 0; SCTP_INP_READ_UNLOCK(inp); } goto restart; } if ((control->length == 0) && (control->end_added == 1)) { /* * Do we also need to check for (control->pdapi_aborted == * 1)? */ if (hold_rlock == 0) { hold_rlock = 1; SCTP_INP_READ_LOCK(inp); } TAILQ_REMOVE(&inp->read_queue, control, next); if (control->data) { #ifdef INVARIANTS panic("control->data not null but control->length == 0"); #else SCTP_PRINTF("Strange, data left in the control buffer. Cleaning up.\n"); sctp_m_freem(control->data); control->data = NULL; #endif } if (control->aux_data) { sctp_m_free(control->aux_data); control->aux_data = NULL; } #ifdef INVARIANTS if (control->on_strm_q) { panic("About to free ctl:%p so:%p and its in %d", control, so, control->on_strm_q); } #endif sctp_free_remote_addr(control->whoFrom); sctp_free_a_readq(stcb, control); if (hold_rlock) { hold_rlock = 0; SCTP_INP_READ_UNLOCK(inp); } goto restart; } if (control->length == 0) { if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE)) && (filling_sinfo)) { /* find a more suitable one then this */ ctl = TAILQ_NEXT(control, next); while (ctl) { if ((ctl->stcb != control->stcb) && (ctl->length) && (ctl->some_taken || (ctl->spec_flags & M_NOTIFICATION) || ((ctl->do_not_ref_stcb == 0) && (ctl->stcb->asoc.strmin[ctl->sinfo_stream].delivery_started == 0))) ) { /*- * If we have a different TCB next, and there is data * present. If we have already taken some (pdapi), OR we can * ref the tcb and no delivery as started on this stream, we * take it. Note we allow a notification on a different * assoc to be delivered.. */ control = ctl; goto found_one; } else if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS)) && (ctl->length) && ((ctl->some_taken) || ((ctl->do_not_ref_stcb == 0) && ((ctl->spec_flags & M_NOTIFICATION) == 0) && (ctl->stcb->asoc.strmin[ctl->sinfo_stream].delivery_started == 0)))) { /*- * If we have the same tcb, and there is data present, and we * have the strm interleave feature present. Then if we have * taken some (pdapi) or we can refer to tht tcb AND we have * not started a delivery for this stream, we can take it. * Note we do NOT allow a notificaiton on the same assoc to * be delivered. */ control = ctl; goto found_one; } ctl = TAILQ_NEXT(ctl, next); } } /* * if we reach here, not suitable replacement is available * fragment interleave is NOT on. So stuff the sb_cc * into the our held count, and its time to sleep again. */ held_length = so->so_rcv.sb_cc; control->held_length = so->so_rcv.sb_cc; goto restart; } /* Clear the held length since there is something to read */ control->held_length = 0; found_one: /* * If we reach here, control has a some data for us to read off. * Note that stcb COULD be NULL. */ if (hold_rlock == 0) { hold_rlock = 1; SCTP_INP_READ_LOCK(inp); } control->some_taken++; stcb = control->stcb; if (stcb) { if ((control->do_not_ref_stcb == 0) && (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED)) { if (freecnt_applied == 0) stcb = NULL; } else if (control->do_not_ref_stcb == 0) { /* you can't free it on me please */ /* * The lock on the socket buffer protects us so the * free code will stop. But since we used the * socketbuf lock and the sender uses the tcb_lock * to increment, we need to use the atomic add to * the refcnt */ if (freecnt_applied) { #ifdef INVARIANTS panic("refcnt already incremented"); #else SCTP_PRINTF("refcnt already incremented?\n"); #endif } else { atomic_add_int(&stcb->asoc.refcnt, 1); freecnt_applied = 1; } /* * Setup to remember how much we have not yet told * the peer our rwnd has opened up. Note we grab the * value from the tcb from last time. Note too that * sack sending clears this when a sack is sent, * which is fine. Once we hit the rwnd_req, we then * will go to the sctp_user_rcvd() that will not * lock until it KNOWs it MUST send a WUP-SACK. */ freed_so_far = (uint32_t)stcb->freed_by_sorcv_sincelast; stcb->freed_by_sorcv_sincelast = 0; } } if (stcb && ((control->spec_flags & M_NOTIFICATION) == 0) && control->do_not_ref_stcb == 0) { stcb->asoc.strmin[control->sinfo_stream].delivery_started = 1; } /* First lets get off the sinfo and sockaddr info */ if ((sinfo != NULL) && (filling_sinfo != 0)) { sinfo->sinfo_stream = control->sinfo_stream; sinfo->sinfo_ssn = (uint16_t)control->mid; sinfo->sinfo_flags = control->sinfo_flags; sinfo->sinfo_ppid = control->sinfo_ppid; sinfo->sinfo_context = control->sinfo_context; sinfo->sinfo_timetolive = control->sinfo_timetolive; sinfo->sinfo_tsn = control->sinfo_tsn; sinfo->sinfo_cumtsn = control->sinfo_cumtsn; sinfo->sinfo_assoc_id = control->sinfo_assoc_id; nxt = TAILQ_NEXT(control, next); if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO) || sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVNXTINFO)) { struct sctp_extrcvinfo *s_extra; s_extra = (struct sctp_extrcvinfo *)sinfo; if ((nxt) && (nxt->length)) { s_extra->serinfo_next_flags = SCTP_NEXT_MSG_AVAIL; if (nxt->sinfo_flags & SCTP_UNORDERED) { s_extra->serinfo_next_flags |= SCTP_NEXT_MSG_IS_UNORDERED; } if (nxt->spec_flags & M_NOTIFICATION) { s_extra->serinfo_next_flags |= SCTP_NEXT_MSG_IS_NOTIFICATION; } s_extra->serinfo_next_aid = nxt->sinfo_assoc_id; s_extra->serinfo_next_length = nxt->length; s_extra->serinfo_next_ppid = nxt->sinfo_ppid; s_extra->serinfo_next_stream = nxt->sinfo_stream; if (nxt->tail_mbuf != NULL) { if (nxt->end_added) { s_extra->serinfo_next_flags |= SCTP_NEXT_MSG_ISCOMPLETE; } } } else { /* * we explicitly 0 this, since the memcpy * got some other things beyond the older * sinfo_ that is on the control's structure * :-D */ nxt = NULL; s_extra->serinfo_next_flags = SCTP_NO_NEXT_MSG; s_extra->serinfo_next_aid = 0; s_extra->serinfo_next_length = 0; s_extra->serinfo_next_ppid = 0; s_extra->serinfo_next_stream = 0; } } /* * update off the real current cum-ack, if we have an stcb. */ if ((control->do_not_ref_stcb == 0) && stcb) sinfo->sinfo_cumtsn = stcb->asoc.cumulative_tsn; /* * mask off the high bits, we keep the actual chunk bits in * there. */ sinfo->sinfo_flags &= 0x00ff; if ((control->sinfo_flags >> 8) & SCTP_DATA_UNORDERED) { sinfo->sinfo_flags |= SCTP_UNORDERED; } } #ifdef SCTP_ASOCLOG_OF_TSNS { int index, newindex; struct sctp_pcbtsn_rlog *entry; do { index = inp->readlog_index; newindex = index + 1; if (newindex >= SCTP_READ_LOG_SIZE) { newindex = 0; } } while (atomic_cmpset_int(&inp->readlog_index, index, newindex) == 0); entry = &inp->readlog[index]; entry->vtag = control->sinfo_assoc_id; entry->strm = control->sinfo_stream; entry->seq = (uint16_t)control->mid; entry->sz = control->length; entry->flgs = control->sinfo_flags; } #endif if ((fromlen > 0) && (from != NULL)) { union sctp_sockstore store; size_t len; switch (control->whoFrom->ro._l_addr.sa.sa_family) { #ifdef INET6 case AF_INET6: len = sizeof(struct sockaddr_in6); store.sin6 = control->whoFrom->ro._l_addr.sin6; store.sin6.sin6_port = control->port_from; break; #endif #ifdef INET case AF_INET: #ifdef INET6 if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) { len = sizeof(struct sockaddr_in6); in6_sin_2_v4mapsin6(&control->whoFrom->ro._l_addr.sin, &store.sin6); store.sin6.sin6_port = control->port_from; } else { len = sizeof(struct sockaddr_in); store.sin = control->whoFrom->ro._l_addr.sin; store.sin.sin_port = control->port_from; } #else len = sizeof(struct sockaddr_in); store.sin = control->whoFrom->ro._l_addr.sin; store.sin.sin_port = control->port_from; #endif break; #endif default: len = 0; break; } memcpy(from, &store, min((size_t)fromlen, len)); #ifdef INET6 { struct sockaddr_in6 lsa6, *from6; from6 = (struct sockaddr_in6 *)from; sctp_recover_scope_mac(from6, (&lsa6)); } #endif } if (hold_rlock) { SCTP_INP_READ_UNLOCK(inp); hold_rlock = 0; } if (hold_sblock) { SOCKBUF_UNLOCK(&so->so_rcv); hold_sblock = 0; } /* now copy out what data we can */ if (mp == NULL) { /* copy out each mbuf in the chain up to length */ get_more_data: m = control->data; while (m) { /* Move out all we can */ cp_len = uio->uio_resid; my_len = SCTP_BUF_LEN(m); if (cp_len > my_len) { /* not enough in this buf */ cp_len = my_len; } if (hold_rlock) { SCTP_INP_READ_UNLOCK(inp); hold_rlock = 0; } if (cp_len > 0) error = uiomove(mtod(m, char *), (int)cp_len, uio); /* re-read */ if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { goto release; } if ((control->do_not_ref_stcb == 0) && stcb && stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) { no_rcv_needed = 1; } if (error) { /* error we are out of here */ goto release; } SCTP_INP_READ_LOCK(inp); hold_rlock = 1; if (cp_len == SCTP_BUF_LEN(m)) { if ((SCTP_BUF_NEXT(m) == NULL) && (control->end_added)) { out_flags |= MSG_EOR; if ((control->do_not_ref_stcb == 0) && (control->stcb != NULL) && ((control->spec_flags & M_NOTIFICATION) == 0)) control->stcb->asoc.strmin[control->sinfo_stream].delivery_started = 0; } if (control->spec_flags & M_NOTIFICATION) { out_flags |= MSG_NOTIFICATION; } /* we ate up the mbuf */ if (in_flags & MSG_PEEK) { /* just looking */ m = SCTP_BUF_NEXT(m); copied_so_far += cp_len; } else { /* dispose of the mbuf */ if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { sctp_sblog(&so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBFREE, SCTP_BUF_LEN(m)); } sctp_sbfree(control, stcb, &so->so_rcv, m); if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { sctp_sblog(&so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0); } copied_so_far += cp_len; freed_so_far += (uint32_t)cp_len; freed_so_far += MSIZE; atomic_subtract_int(&control->length, cp_len); control->data = sctp_m_free(m); m = control->data; /* * been through it all, must hold sb * lock ok to null tail */ if (control->data == NULL) { #ifdef INVARIANTS if ((control->end_added == 0) || (TAILQ_NEXT(control, next) == NULL)) { /* * If the end is not * added, OR the * next is NOT null * we MUST have the * lock. */ if (mtx_owned(&inp->inp_rdata_mtx) == 0) { panic("Hmm we don't own the lock?"); } } #endif control->tail_mbuf = NULL; #ifdef INVARIANTS if ((control->end_added) && ((out_flags & MSG_EOR) == 0)) { panic("end_added, nothing left and no MSG_EOR"); } #endif } } } else { /* Do we need to trim the mbuf? */ if (control->spec_flags & M_NOTIFICATION) { out_flags |= MSG_NOTIFICATION; } if ((in_flags & MSG_PEEK) == 0) { SCTP_BUF_RESV_UF(m, cp_len); SCTP_BUF_LEN(m) -= (int)cp_len; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { sctp_sblog(&so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBFREE, (int)cp_len); } atomic_subtract_int(&so->so_rcv.sb_cc, cp_len); if ((control->do_not_ref_stcb == 0) && stcb) { atomic_subtract_int(&stcb->asoc.sb_cc, cp_len); } copied_so_far += cp_len; freed_so_far += (uint32_t)cp_len; freed_so_far += MSIZE; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { sctp_sblog(&so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0); } atomic_subtract_int(&control->length, cp_len); } else { copied_so_far += cp_len; } } if ((out_flags & MSG_EOR) || (uio->uio_resid == 0)) { break; } if (((stcb) && (in_flags & MSG_PEEK) == 0) && (control->do_not_ref_stcb == 0) && (freed_so_far >= rwnd_req)) { sctp_user_rcvd(stcb, &freed_so_far, hold_rlock, rwnd_req); } } /* end while(m) */ /* * At this point we have looked at it all and we either have * a MSG_EOR/or read all the user wants... * control->length == 0. */ if ((out_flags & MSG_EOR) && ((in_flags & MSG_PEEK) == 0)) { /* we are done with this control */ if (control->length == 0) { if (control->data) { #ifdef INVARIANTS panic("control->data not null at read eor?"); #else SCTP_PRINTF("Strange, data left in the control buffer .. invarients would panic?\n"); sctp_m_freem(control->data); control->data = NULL; #endif } done_with_control: if (hold_rlock == 0) { SCTP_INP_READ_LOCK(inp); hold_rlock = 1; } TAILQ_REMOVE(&inp->read_queue, control, next); /* Add back any hiddend data */ if (control->held_length) { held_length = 0; control->held_length = 0; wakeup_read_socket = 1; } if (control->aux_data) { sctp_m_free(control->aux_data); control->aux_data = NULL; } no_rcv_needed = control->do_not_ref_stcb; sctp_free_remote_addr(control->whoFrom); control->data = NULL; #ifdef INVARIANTS if (control->on_strm_q) { panic("About to free ctl:%p so:%p and its in %d", control, so, control->on_strm_q); } #endif sctp_free_a_readq(stcb, control); control = NULL; if ((freed_so_far >= rwnd_req) && (no_rcv_needed == 0)) sctp_user_rcvd(stcb, &freed_so_far, hold_rlock, rwnd_req); } else { /* * The user did not read all of this * message, turn off the returned MSG_EOR * since we are leaving more behind on the * control to read. */ #ifdef INVARIANTS if (control->end_added && (control->data == NULL) && (control->tail_mbuf == NULL)) { panic("Gak, control->length is corrupt?"); } #endif no_rcv_needed = control->do_not_ref_stcb; out_flags &= ~MSG_EOR; } } if (out_flags & MSG_EOR) { goto release; } if ((uio->uio_resid == 0) || ((in_eeor_mode) && (copied_so_far >= max(so->so_rcv.sb_lowat, 1)))) { goto release; } /* * If I hit here the receiver wants more and this message is * NOT done (pd-api). So two questions. Can we block? if not * we are done. Did the user NOT set MSG_WAITALL? */ if (block_allowed == 0) { goto release; } /* * We need to wait for more data a few things: - We don't * sbunlock() so we don't get someone else reading. - We * must be sure to account for the case where what is added * is NOT to our control when we wakeup. */ /* * Do we need to tell the transport a rwnd update might be * needed before we go to sleep? */ if (((stcb) && (in_flags & MSG_PEEK) == 0) && ((freed_so_far >= rwnd_req) && (control->do_not_ref_stcb == 0) && (no_rcv_needed == 0))) { sctp_user_rcvd(stcb, &freed_so_far, hold_rlock, rwnd_req); } wait_some_more: if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { goto release; } if (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) goto release; if (hold_rlock == 1) { SCTP_INP_READ_UNLOCK(inp); hold_rlock = 0; } if (hold_sblock == 0) { SOCKBUF_LOCK(&so->so_rcv); hold_sblock = 1; } if ((copied_so_far) && (control->length == 0) && (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE))) { goto release; } if (so->so_rcv.sb_cc <= control->held_length) { error = sbwait(&so->so_rcv); if (error) { goto release; } control->held_length = 0; } if (hold_sblock) { SOCKBUF_UNLOCK(&so->so_rcv); hold_sblock = 0; } if (control->length == 0) { /* still nothing here */ if (control->end_added == 1) { /* he aborted, or is done i.e.did a shutdown */ out_flags |= MSG_EOR; if (control->pdapi_aborted) { if ((control->do_not_ref_stcb == 0) && ((control->spec_flags & M_NOTIFICATION) == 0)) control->stcb->asoc.strmin[control->sinfo_stream].delivery_started = 0; out_flags |= MSG_TRUNC; } else { if ((control->do_not_ref_stcb == 0) && ((control->spec_flags & M_NOTIFICATION) == 0)) control->stcb->asoc.strmin[control->sinfo_stream].delivery_started = 0; } goto done_with_control; } if (so->so_rcv.sb_cc > held_length) { control->held_length = so->so_rcv.sb_cc; held_length = 0; } goto wait_some_more; } else if (control->data == NULL) { /* * we must re-sync since data is probably being * added */ SCTP_INP_READ_LOCK(inp); if ((control->length > 0) && (control->data == NULL)) { /* * big trouble.. we have the lock and its * corrupt? */ #ifdef INVARIANTS panic("Impossible data==NULL length !=0"); #endif out_flags |= MSG_EOR; out_flags |= MSG_TRUNC; control->length = 0; SCTP_INP_READ_UNLOCK(inp); goto done_with_control; } SCTP_INP_READ_UNLOCK(inp); /* We will fall around to get more data */ } goto get_more_data; } else { /*- * Give caller back the mbuf chain, * store in uio_resid the length */ wakeup_read_socket = 0; if ((control->end_added == 0) || (TAILQ_NEXT(control, next) == NULL)) { /* Need to get rlock */ if (hold_rlock == 0) { SCTP_INP_READ_LOCK(inp); hold_rlock = 1; } } if (control->end_added) { out_flags |= MSG_EOR; if ((control->do_not_ref_stcb == 0) && (control->stcb != NULL) && ((control->spec_flags & M_NOTIFICATION) == 0)) control->stcb->asoc.strmin[control->sinfo_stream].delivery_started = 0; } if (control->spec_flags & M_NOTIFICATION) { out_flags |= MSG_NOTIFICATION; } uio->uio_resid = control->length; *mp = control->data; m = control->data; while (m) { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { sctp_sblog(&so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBFREE, SCTP_BUF_LEN(m)); } sctp_sbfree(control, stcb, &so->so_rcv, m); freed_so_far += (uint32_t)SCTP_BUF_LEN(m); freed_so_far += MSIZE; if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_SB_LOGGING_ENABLE) { sctp_sblog(&so->so_rcv, control->do_not_ref_stcb ? NULL : stcb, SCTP_LOG_SBRESULT, 0); } m = SCTP_BUF_NEXT(m); } control->data = control->tail_mbuf = NULL; control->length = 0; if (out_flags & MSG_EOR) { /* Done with this control */ goto done_with_control; } } release: if (hold_rlock == 1) { SCTP_INP_READ_UNLOCK(inp); hold_rlock = 0; } if (hold_sblock == 1) { SOCKBUF_UNLOCK(&so->so_rcv); hold_sblock = 0; } sbunlock(&so->so_rcv); sockbuf_lock = 0; release_unlocked: if (hold_sblock) { SOCKBUF_UNLOCK(&so->so_rcv); hold_sblock = 0; } if ((stcb) && (in_flags & MSG_PEEK) == 0) { if ((freed_so_far >= rwnd_req) && (control && (control->do_not_ref_stcb == 0)) && (no_rcv_needed == 0)) sctp_user_rcvd(stcb, &freed_so_far, hold_rlock, rwnd_req); } out: if (msg_flags) { *msg_flags = out_flags; } if (((out_flags & MSG_EOR) == 0) && ((in_flags & MSG_PEEK) == 0) && (sinfo) && (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO) || sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVNXTINFO))) { struct sctp_extrcvinfo *s_extra; s_extra = (struct sctp_extrcvinfo *)sinfo; s_extra->serinfo_next_flags = SCTP_NO_NEXT_MSG; } if (hold_rlock == 1) { SCTP_INP_READ_UNLOCK(inp); } if (hold_sblock) { SOCKBUF_UNLOCK(&so->so_rcv); } if (sockbuf_lock) { sbunlock(&so->so_rcv); } if (freecnt_applied) { /* * The lock on the socket buffer protects us so the free * code will stop. But since we used the socketbuf lock and * the sender uses the tcb_lock to increment, we need to use * the atomic add to the refcnt. */ if (stcb == NULL) { #ifdef INVARIANTS panic("stcb for refcnt has gone NULL?"); goto stage_left; #else goto stage_left; #endif } /* Save the value back for next time */ stcb->freed_by_sorcv_sincelast = freed_so_far; atomic_add_int(&stcb->asoc.refcnt, -1); } if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_RECV_RWND_LOGGING_ENABLE) { if (stcb) { sctp_misc_ints(SCTP_SORECV_DONE, freed_so_far, (uint32_t)((uio) ? (slen - uio->uio_resid) : slen), stcb->asoc.my_rwnd, so->so_rcv.sb_cc); } else { sctp_misc_ints(SCTP_SORECV_DONE, freed_so_far, (uint32_t)((uio) ? (slen - uio->uio_resid) : slen), 0, so->so_rcv.sb_cc); } } stage_left: if (wakeup_read_socket) { sctp_sorwakeup(inp, so); } return (error); } #ifdef SCTP_MBUF_LOGGING struct mbuf * sctp_m_free(struct mbuf *m) { if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) { sctp_log_mb(m, SCTP_MBUF_IFREE); } return (m_free(m)); } void sctp_m_freem(struct mbuf *mb) { while (mb != NULL) mb = sctp_m_free(mb); } #endif int sctp_dynamic_set_primary(struct sockaddr *sa, uint32_t vrf_id) { /* * Given a local address. For all associations that holds the * address, request a peer-set-primary. */ struct sctp_ifa *ifa; struct sctp_laddr *wi; ifa = sctp_find_ifa_by_addr(sa, vrf_id, SCTP_ADDR_NOT_LOCKED); if (ifa == NULL) { SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTPUTIL, EADDRNOTAVAIL); return (EADDRNOTAVAIL); } /* * Now that we have the ifa we must awaken the iterator with this * message. */ wi = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_laddr), struct sctp_laddr); if (wi == NULL) { SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTPUTIL, ENOMEM); return (ENOMEM); } /* Now incr the count and int wi structure */ SCTP_INCR_LADDR_COUNT(); memset(wi, 0, sizeof(*wi)); (void)SCTP_GETTIME_TIMEVAL(&wi->start_time); wi->ifa = ifa; wi->action = SCTP_SET_PRIM_ADDR; atomic_add_int(&ifa->refcount, 1); /* Now add it to the work queue */ SCTP_WQ_ADDR_LOCK(); /* * Should this really be a tailq? As it is we will process the * newest first :-0 */ LIST_INSERT_HEAD(&SCTP_BASE_INFO(addr_wq), wi, sctp_nxt_addr); sctp_timer_start(SCTP_TIMER_TYPE_ADDR_WQ, (struct sctp_inpcb *)NULL, (struct sctp_tcb *)NULL, (struct sctp_nets *)NULL); SCTP_WQ_ADDR_UNLOCK(); return (0); } int sctp_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { int error, fromlen; uint8_t sockbuf[256]; struct sockaddr *from; struct sctp_extrcvinfo sinfo; int filling_sinfo = 1; int flags; struct sctp_inpcb *inp; inp = (struct sctp_inpcb *)so->so_pcb; /* pickup the assoc we are reading from */ if (inp == NULL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); return (EINVAL); } if ((sctp_is_feature_off(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT) && sctp_is_feature_off(inp, SCTP_PCB_FLAGS_RECVRCVINFO) && sctp_is_feature_off(inp, SCTP_PCB_FLAGS_RECVNXTINFO)) || (controlp == NULL)) { /* user does not want the sndrcv ctl */ filling_sinfo = 0; } if (psa) { from = (struct sockaddr *)sockbuf; fromlen = sizeof(sockbuf); from->sa_len = 0; } else { from = NULL; fromlen = 0; } if (filling_sinfo) { memset(&sinfo, 0, sizeof(struct sctp_extrcvinfo)); } if (flagsp != NULL) { flags = *flagsp; } else { flags = 0; } error = sctp_sorecvmsg(so, uio, mp0, from, fromlen, &flags, (struct sctp_sndrcvinfo *)&sinfo, filling_sinfo); if (flagsp != NULL) { *flagsp = flags; } if (controlp != NULL) { /* copy back the sinfo in a CMSG format */ if (filling_sinfo && ((flags & MSG_NOTIFICATION) == 0)) { *controlp = sctp_build_ctl_nchunk(inp, (struct sctp_sndrcvinfo *)&sinfo); } else { *controlp = NULL; } } if (psa) { /* copy back the address info */ if (from && from->sa_len) { *psa = sodupsockaddr(from, M_NOWAIT); } else { *psa = NULL; } } return (error); } int sctp_connectx_helper_add(struct sctp_tcb *stcb, struct sockaddr *addr, int totaddr, int *error) { int added = 0; int i; struct sctp_inpcb *inp; struct sockaddr *sa; size_t incr = 0; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif sa = addr; inp = stcb->sctp_ep; *error = 0; for (i = 0; i < totaddr; i++) { switch (sa->sa_family) { #ifdef INET case AF_INET: incr = sizeof(struct sockaddr_in); sin = (struct sockaddr_in *)sa; if ((sin->sin_addr.s_addr == INADDR_ANY) || (sin->sin_addr.s_addr == INADDR_BROADCAST) || IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, EINVAL); (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTPUTIL + SCTP_LOC_7); *error = EINVAL; goto out_now; } if (sctp_add_remote_addr(stcb, sa, NULL, stcb->asoc.port, SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) { /* assoc gone no un-lock */ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ENOBUFS); (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTPUTIL + SCTP_LOC_8); *error = ENOBUFS; goto out_now; } added++; break; #endif #ifdef INET6 case AF_INET6: incr = sizeof(struct sockaddr_in6); sin6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, EINVAL); (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTPUTIL + SCTP_LOC_9); *error = EINVAL; goto out_now; } if (sctp_add_remote_addr(stcb, sa, NULL, stcb->asoc.port, SCTP_DONOT_SETSCOPE, SCTP_ADDR_IS_CONFIRMED)) { /* assoc gone no un-lock */ SCTP_LTRACE_ERR_RET(NULL, stcb, NULL, SCTP_FROM_SCTPUTIL, ENOBUFS); (void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC, SCTP_FROM_SCTPUTIL + SCTP_LOC_10); *error = ENOBUFS; goto out_now; } added++; break; #endif default: break; } sa = (struct sockaddr *)((caddr_t)sa + incr); } out_now: return (added); } int sctp_connectx_helper_find(struct sctp_inpcb *inp, struct sockaddr *addr, unsigned int totaddr, unsigned int *num_v4, unsigned int *num_v6, unsigned int limit) { struct sockaddr *sa; struct sctp_tcb *stcb; unsigned int incr, at, i; at = 0; sa = addr; *num_v6 = *num_v4 = 0; /* account and validate addresses */ if (totaddr == 0) { return (EINVAL); } for (i = 0; i < totaddr; i++) { if (at + sizeof(struct sockaddr) > limit) { return (EINVAL); } switch (sa->sa_family) { #ifdef INET case AF_INET: incr = (unsigned int)sizeof(struct sockaddr_in); if (sa->sa_len != incr) { return (EINVAL); } (*num_v4) += 1; break; #endif #ifdef INET6 case AF_INET6: { struct sockaddr_in6 *sin6; sin6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { /* Must be non-mapped for connectx */ return (EINVAL); } incr = (unsigned int)sizeof(struct sockaddr_in6); if (sa->sa_len != incr) { return (EINVAL); } (*num_v6) += 1; break; } #endif default: return (EINVAL); } if ((at + incr) > limit) { return (EINVAL); } SCTP_INP_INCR_REF(inp); stcb = sctp_findassociation_ep_addr(&inp, sa, NULL, NULL, NULL); if (stcb != NULL) { SCTP_TCB_UNLOCK(stcb); return (EALREADY); } else { SCTP_INP_DECR_REF(inp); } at += incr; sa = (struct sockaddr *)((caddr_t)sa + incr); } return (0); } /* * sctp_bindx(ADD) for one address. * assumes all arguments are valid/checked by caller. */ void sctp_bindx_add_address(struct socket *so, struct sctp_inpcb *inp, struct sockaddr *sa, uint32_t vrf_id, int *error, void *p) { #if defined(INET) && defined(INET6) struct sockaddr_in sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif #ifdef INET struct sockaddr_in *sinp; #endif struct sockaddr *addr_to_use; struct sctp_inpcb *lep; uint16_t port; /* see if we're bound all already! */ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); *error = EINVAL; return; } switch (sa->sa_family) { #ifdef INET6 case AF_INET6: if (sa->sa_len != sizeof(struct sockaddr_in6)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); *error = EINVAL; return; } if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) { /* can only bind v6 on PF_INET6 sockets */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); *error = EINVAL; return; } sin6 = (struct sockaddr_in6 *)sa; port = sin6->sin6_port; #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && SCTP_IPV6_V6ONLY(inp)) { /* can't bind v4-mapped on PF_INET sockets */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); *error = EINVAL; return; } in6_sin6_2_sin(&sin, sin6); addr_to_use = (struct sockaddr *)&sin; } else { addr_to_use = sa; } #else addr_to_use = sa; #endif break; #endif #ifdef INET case AF_INET: if (sa->sa_len != sizeof(struct sockaddr_in)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); *error = EINVAL; return; } if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && SCTP_IPV6_V6ONLY(inp)) { /* can't bind v4 on PF_INET sockets */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); *error = EINVAL; return; } sinp = (struct sockaddr_in *)sa; port = sinp->sin_port; addr_to_use = sa; break; #endif default: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); *error = EINVAL; return; } if (inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) { if (p == NULL) { /* Can't get proc for Net/Open BSD */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); *error = EINVAL; return; } *error = sctp_inpcb_bind(so, addr_to_use, NULL, p); return; } /* Validate the incoming port. */ if ((port != 0) && (port != inp->sctp_lport)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); *error = EINVAL; return; } lep = sctp_pcb_findep(addr_to_use, 1, 0, vrf_id); if (lep == NULL) { /* add the address */ *error = sctp_addr_mgmt_ep_sa(inp, addr_to_use, SCTP_ADD_IP_ADDRESS, vrf_id); } else { if (lep != inp) { *error = EADDRINUSE; } SCTP_INP_DECR_REF(lep); } } /* * sctp_bindx(DELETE) for one address. * assumes all arguments are valid/checked by caller. */ void sctp_bindx_delete_address(struct sctp_inpcb *inp, struct sockaddr *sa, uint32_t vrf_id, int *error) { struct sockaddr *addr_to_use; #if defined(INET) && defined(INET6) struct sockaddr_in6 *sin6; struct sockaddr_in sin; #endif /* see if we're bound all already! */ if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); *error = EINVAL; return; } switch (sa->sa_family) { #ifdef INET6 case AF_INET6: if (sa->sa_len != sizeof(struct sockaddr_in6)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); *error = EINVAL; return; } if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) { /* can only bind v6 on PF_INET6 sockets */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); *error = EINVAL; return; } #ifdef INET sin6 = (struct sockaddr_in6 *)sa; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && SCTP_IPV6_V6ONLY(inp)) { /* can't bind mapped-v4 on PF_INET sockets */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); *error = EINVAL; return; } in6_sin6_2_sin(&sin, sin6); addr_to_use = (struct sockaddr *)&sin; } else { addr_to_use = sa; } #else addr_to_use = sa; #endif break; #endif #ifdef INET case AF_INET: if (sa->sa_len != sizeof(struct sockaddr_in)) { SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); *error = EINVAL; return; } if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) && SCTP_IPV6_V6ONLY(inp)) { /* can't bind v4 on PF_INET sockets */ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); *error = EINVAL; return; } addr_to_use = sa; break; #endif default: SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTPUTIL, EINVAL); *error = EINVAL; return; } /* No lock required mgmt_ep_sa does its own locking. */ *error = sctp_addr_mgmt_ep_sa(inp, addr_to_use, SCTP_DEL_IP_ADDRESS, vrf_id); } /* * returns the valid local address count for an assoc, taking into account * all scoping rules */ int sctp_local_addr_count(struct sctp_tcb *stcb) { int loopback_scope; #if defined(INET) int ipv4_local_scope, ipv4_addr_legal; #endif #if defined(INET6) int local_scope, site_scope, ipv6_addr_legal; #endif struct sctp_vrf *vrf; struct sctp_ifn *sctp_ifn; struct sctp_ifa *sctp_ifa; int count = 0; /* Turn on all the appropriate scopes */ loopback_scope = stcb->asoc.scope.loopback_scope; #if defined(INET) ipv4_local_scope = stcb->asoc.scope.ipv4_local_scope; ipv4_addr_legal = stcb->asoc.scope.ipv4_addr_legal; #endif #if defined(INET6) local_scope = stcb->asoc.scope.local_scope; site_scope = stcb->asoc.scope.site_scope; ipv6_addr_legal = stcb->asoc.scope.ipv6_addr_legal; #endif SCTP_IPI_ADDR_RLOCK(); vrf = sctp_find_vrf(stcb->asoc.vrf_id); if (vrf == NULL) { /* no vrf, no addresses */ SCTP_IPI_ADDR_RUNLOCK(); return (0); } if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) { /* * bound all case: go through all ifns on the vrf */ LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) { if ((loopback_scope == 0) && SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) { continue; } LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) { if (sctp_is_addr_restricted(stcb, sctp_ifa)) continue; switch (sctp_ifa->address.sa.sa_family) { #ifdef INET case AF_INET: if (ipv4_addr_legal) { struct sockaddr_in *sin; sin = &sctp_ifa->address.sin; if (sin->sin_addr.s_addr == 0) { /* * skip unspecified * addrs */ continue; } if (prison_check_ip4(stcb->sctp_ep->ip_inp.inp.inp_cred, &sin->sin_addr) != 0) { continue; } if ((ipv4_local_scope == 0) && (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr))) { continue; } /* count this one */ count++; } else { continue; } break; #endif #ifdef INET6 case AF_INET6: if (ipv6_addr_legal) { struct sockaddr_in6 *sin6; sin6 = &sctp_ifa->address.sin6; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { continue; } if (prison_check_ip6(stcb->sctp_ep->ip_inp.inp.inp_cred, &sin6->sin6_addr) != 0) { continue; } if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { if (local_scope == 0) continue; if (sin6->sin6_scope_id == 0) { if (sa6_recoverscope(sin6) != 0) /* * * bad * link * * local * * address */ continue; } } if ((site_scope == 0) && (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))) { continue; } /* count this one */ count++; } break; #endif default: /* TSNH */ break; } } } } else { /* * subset bound case */ struct sctp_laddr *laddr; LIST_FOREACH(laddr, &stcb->sctp_ep->sctp_addr_list, sctp_nxt_addr) { if (sctp_is_addr_restricted(stcb, laddr->ifa)) { continue; } /* count this one */ count++; } } SCTP_IPI_ADDR_RUNLOCK(); return (count); } #if defined(SCTP_LOCAL_TRACE_BUF) void sctp_log_trace(uint32_t subsys, const char *str SCTP_UNUSED, uint32_t a, uint32_t b, uint32_t c, uint32_t d, uint32_t e, uint32_t f) { uint32_t saveindex, newindex; do { saveindex = SCTP_BASE_SYSCTL(sctp_log).index; if (saveindex >= SCTP_MAX_LOGGING_SIZE) { newindex = 1; } else { newindex = saveindex + 1; } } while (atomic_cmpset_int(&SCTP_BASE_SYSCTL(sctp_log).index, saveindex, newindex) == 0); if (saveindex >= SCTP_MAX_LOGGING_SIZE) { saveindex = 0; } SCTP_BASE_SYSCTL(sctp_log).entry[saveindex].timestamp = SCTP_GET_CYCLECOUNT; SCTP_BASE_SYSCTL(sctp_log).entry[saveindex].subsys = subsys; SCTP_BASE_SYSCTL(sctp_log).entry[saveindex].params[0] = a; SCTP_BASE_SYSCTL(sctp_log).entry[saveindex].params[1] = b; SCTP_BASE_SYSCTL(sctp_log).entry[saveindex].params[2] = c; SCTP_BASE_SYSCTL(sctp_log).entry[saveindex].params[3] = d; SCTP_BASE_SYSCTL(sctp_log).entry[saveindex].params[4] = e; SCTP_BASE_SYSCTL(sctp_log).entry[saveindex].params[5] = f; } #endif static void sctp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *inp, const struct sockaddr *sa SCTP_UNUSED, void *ctx SCTP_UNUSED) { struct ip *iph; #ifdef INET6 struct ip6_hdr *ip6; #endif struct mbuf *sp, *last; struct udphdr *uhdr; uint16_t port; if ((m->m_flags & M_PKTHDR) == 0) { /* Can't handle one that is not a pkt hdr */ goto out; } /* Pull the src port */ iph = mtod(m, struct ip *); uhdr = (struct udphdr *)((caddr_t)iph + off); port = uhdr->uh_sport; /* * Split out the mbuf chain. Leave the IP header in m, place the * rest in the sp. */ sp = m_split(m, off, M_NOWAIT); if (sp == NULL) { /* Gak, drop packet, we can't do a split */ goto out; } if (sp->m_pkthdr.len < sizeof(struct udphdr) + sizeof(struct sctphdr)) { /* Gak, packet can't have an SCTP header in it - too small */ m_freem(sp); goto out; } /* Now pull up the UDP header and SCTP header together */ sp = m_pullup(sp, sizeof(struct udphdr) + sizeof(struct sctphdr)); if (sp == NULL) { /* Gak pullup failed */ goto out; } /* Trim out the UDP header */ m_adj(sp, sizeof(struct udphdr)); /* Now reconstruct the mbuf chain */ for (last = m; last->m_next; last = last->m_next); last->m_next = sp; m->m_pkthdr.len += sp->m_pkthdr.len; /* * The CSUM_DATA_VALID flags indicates that the HW checked the UDP * checksum and it was valid. Since CSUM_DATA_VALID == * CSUM_SCTP_VALID this would imply that the HW also verified the * SCTP checksum. Therefore, clear the bit. */ SCTPDBG(SCTP_DEBUG_CRCOFFLOAD, "sctp_recv_udp_tunneled_packet(): Packet of length %d received on %s with csum_flags 0x%b.\n", m->m_pkthdr.len, if_name(m->m_pkthdr.rcvif), (int)m->m_pkthdr.csum_flags, CSUM_BITS); m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID; iph = mtod(m, struct ip *); switch (iph->ip_v) { #ifdef INET case IPVERSION: iph->ip_len = htons(ntohs(iph->ip_len) - sizeof(struct udphdr)); sctp_input_with_port(m, off, port); break; #endif #ifdef INET6 case IPV6_VERSION >> 4: ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - sizeof(struct udphdr)); sctp6_input_with_port(&m, &off, port); break; #endif default: goto out; break; } return; out: m_freem(m); } #ifdef INET static void sctp_recv_icmp_tunneled_packet(int cmd, struct sockaddr *sa, void *vip, void *ctx SCTP_UNUSED) { struct ip *outer_ip, *inner_ip; struct sctphdr *sh; struct icmp *icmp; struct udphdr *udp; struct sctp_inpcb *inp; struct sctp_tcb *stcb; struct sctp_nets *net; struct sctp_init_chunk *ch; struct sockaddr_in src, dst; uint8_t type, code; inner_ip = (struct ip *)vip; icmp = (struct icmp *)((caddr_t)inner_ip - (sizeof(struct icmp) - sizeof(struct ip))); outer_ip = (struct ip *)((caddr_t)icmp - sizeof(struct ip)); if (ntohs(outer_ip->ip_len) < sizeof(struct ip) + 8 + (inner_ip->ip_hl << 2) + sizeof(struct udphdr) + 8) { return; } udp = (struct udphdr *)((caddr_t)inner_ip + (inner_ip->ip_hl << 2)); sh = (struct sctphdr *)(udp + 1); memset(&src, 0, sizeof(struct sockaddr_in)); src.sin_family = AF_INET; src.sin_len = sizeof(struct sockaddr_in); src.sin_port = sh->src_port; src.sin_addr = inner_ip->ip_src; memset(&dst, 0, sizeof(struct sockaddr_in)); dst.sin_family = AF_INET; dst.sin_len = sizeof(struct sockaddr_in); dst.sin_port = sh->dest_port; dst.sin_addr = inner_ip->ip_dst; /* * 'dst' holds the dest of the packet that failed to be sent. 'src' * holds our local endpoint address. Thus we reverse the dst and the * src in the lookup. */ inp = NULL; net = NULL; stcb = sctp_findassociation_addr_sa((struct sockaddr *)&dst, (struct sockaddr *)&src, &inp, &net, 1, SCTP_DEFAULT_VRFID); if ((stcb != NULL) && (net != NULL) && (inp != NULL)) { /* Check the UDP port numbers */ if ((udp->uh_dport != net->port) || (udp->uh_sport != htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)))) { SCTP_TCB_UNLOCK(stcb); return; } /* Check the verification tag */ if (ntohl(sh->v_tag) != 0) { /* * This must be the verification tag used for * sending out packets. We don't consider packets * reflecting the verification tag. */ if (ntohl(sh->v_tag) != stcb->asoc.peer_vtag) { SCTP_TCB_UNLOCK(stcb); return; } } else { if (ntohs(outer_ip->ip_len) >= sizeof(struct ip) + 8 + (inner_ip->ip_hl << 2) + 8 + 20) { /* * In this case we can check if we got an * INIT chunk and if the initiate tag * matches. */ ch = (struct sctp_init_chunk *)(sh + 1); if ((ch->ch.chunk_type != SCTP_INITIATION) || (ntohl(ch->init.initiate_tag) != stcb->asoc.my_vtag)) { SCTP_TCB_UNLOCK(stcb); return; } } else { SCTP_TCB_UNLOCK(stcb); return; } } type = icmp->icmp_type; code = icmp->icmp_code; if ((type == ICMP_UNREACH) && (code == ICMP_UNREACH_PORT)) { code = ICMP_UNREACH_PROTOCOL; } sctp_notify(inp, stcb, net, type, code, ntohs(inner_ip->ip_len), (uint32_t)ntohs(icmp->icmp_nextmtu)); } else { if ((stcb == NULL) && (inp != NULL)) { /* reduce ref-count */ SCTP_INP_WLOCK(inp); SCTP_INP_DECR_REF(inp); SCTP_INP_WUNLOCK(inp); } if (stcb) { SCTP_TCB_UNLOCK(stcb); } } return; } #endif #ifdef INET6 static void sctp_recv_icmp6_tunneled_packet(int cmd, struct sockaddr *sa, void *d, void *ctx SCTP_UNUSED) { struct ip6ctlparam *ip6cp; struct sctp_inpcb *inp; struct sctp_tcb *stcb; struct sctp_nets *net; struct sctphdr sh; struct udphdr udp; struct sockaddr_in6 src, dst; uint8_t type, code; ip6cp = (struct ip6ctlparam *)d; /* * XXX: We assume that when IPV6 is non NULL, M and OFF are valid. */ if (ip6cp->ip6c_m == NULL) { return; } /* * Check if we can safely examine the ports and the verification tag * of the SCTP common header. */ if (ip6cp->ip6c_m->m_pkthdr.len < ip6cp->ip6c_off + sizeof(struct udphdr) + offsetof(struct sctphdr, checksum)) { return; } /* Copy out the UDP header. */ memset(&udp, 0, sizeof(struct udphdr)); m_copydata(ip6cp->ip6c_m, ip6cp->ip6c_off, sizeof(struct udphdr), (caddr_t)&udp); /* Copy out the port numbers and the verification tag. */ memset(&sh, 0, sizeof(struct sctphdr)); m_copydata(ip6cp->ip6c_m, ip6cp->ip6c_off + sizeof(struct udphdr), sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint32_t), (caddr_t)&sh); memset(&src, 0, sizeof(struct sockaddr_in6)); src.sin6_family = AF_INET6; src.sin6_len = sizeof(struct sockaddr_in6); src.sin6_port = sh.src_port; src.sin6_addr = ip6cp->ip6c_ip6->ip6_src; if (in6_setscope(&src.sin6_addr, ip6cp->ip6c_m->m_pkthdr.rcvif, NULL) != 0) { return; } memset(&dst, 0, sizeof(struct sockaddr_in6)); dst.sin6_family = AF_INET6; dst.sin6_len = sizeof(struct sockaddr_in6); dst.sin6_port = sh.dest_port; dst.sin6_addr = ip6cp->ip6c_ip6->ip6_dst; if (in6_setscope(&dst.sin6_addr, ip6cp->ip6c_m->m_pkthdr.rcvif, NULL) != 0) { return; } inp = NULL; net = NULL; stcb = sctp_findassociation_addr_sa((struct sockaddr *)&dst, (struct sockaddr *)&src, &inp, &net, 1, SCTP_DEFAULT_VRFID); if ((stcb != NULL) && (net != NULL) && (inp != NULL)) { /* Check the UDP port numbers */ if ((udp.uh_dport != net->port) || (udp.uh_sport != htons(SCTP_BASE_SYSCTL(sctp_udp_tunneling_port)))) { SCTP_TCB_UNLOCK(stcb); return; } /* Check the verification tag */ if (ntohl(sh.v_tag) != 0) { /* * This must be the verification tag used for * sending out packets. We don't consider packets * reflecting the verification tag. */ if (ntohl(sh.v_tag) != stcb->asoc.peer_vtag) { SCTP_TCB_UNLOCK(stcb); return; } } else { if (ip6cp->ip6c_m->m_pkthdr.len >= ip6cp->ip6c_off + sizeof(struct udphdr) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + offsetof(struct sctp_init, a_rwnd)) { /* * In this case we can check if we got an * INIT chunk and if the initiate tag * matches. */ uint32_t initiate_tag; uint8_t chunk_type; m_copydata(ip6cp->ip6c_m, ip6cp->ip6c_off + sizeof(struct udphdr) + sizeof(struct sctphdr), sizeof(uint8_t), (caddr_t)&chunk_type); m_copydata(ip6cp->ip6c_m, ip6cp->ip6c_off + sizeof(struct udphdr) + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr), sizeof(uint32_t), (caddr_t)&initiate_tag); if ((chunk_type != SCTP_INITIATION) || (ntohl(initiate_tag) != stcb->asoc.my_vtag)) { SCTP_TCB_UNLOCK(stcb); return; } } else { SCTP_TCB_UNLOCK(stcb); return; } } type = ip6cp->ip6c_icmp6->icmp6_type; code = ip6cp->ip6c_icmp6->icmp6_code; if ((type == ICMP6_DST_UNREACH) && (code == ICMP6_DST_UNREACH_NOPORT)) { type = ICMP6_PARAM_PROB; code = ICMP6_PARAMPROB_NEXTHEADER; } sctp6_notify(inp, stcb, net, type, code, ntohl(ip6cp->ip6c_icmp6->icmp6_mtu)); } else { if ((stcb == NULL) && (inp != NULL)) { /* reduce inp's ref-count */ SCTP_INP_WLOCK(inp); SCTP_INP_DECR_REF(inp); SCTP_INP_WUNLOCK(inp); } if (stcb) { SCTP_TCB_UNLOCK(stcb); } } } #endif void sctp_over_udp_stop(void) { /* * This function assumes sysctl caller holds sctp_sysctl_info_lock() * for writting! */ #ifdef INET if (SCTP_BASE_INFO(udp4_tun_socket) != NULL) { soclose(SCTP_BASE_INFO(udp4_tun_socket)); SCTP_BASE_INFO(udp4_tun_socket) = NULL; } #endif #ifdef INET6 if (SCTP_BASE_INFO(udp6_tun_socket) != NULL) { soclose(SCTP_BASE_INFO(udp6_tun_socket)); SCTP_BASE_INFO(udp6_tun_socket) = NULL; } #endif } int sctp_over_udp_start(void) { uint16_t port; int ret; #ifdef INET struct sockaddr_in sin; #endif #ifdef INET6 struct sockaddr_in6 sin6; #endif /* * This function assumes sysctl caller holds sctp_sysctl_info_lock() * for writting! */ port = SCTP_BASE_SYSCTL(sctp_udp_tunneling_port); if (ntohs(port) == 0) { /* Must have a port set */ return (EINVAL); } #ifdef INET if (SCTP_BASE_INFO(udp4_tun_socket) != NULL) { /* Already running -- must stop first */ return (EALREADY); } #endif #ifdef INET6 if (SCTP_BASE_INFO(udp6_tun_socket) != NULL) { /* Already running -- must stop first */ return (EALREADY); } #endif #ifdef INET if ((ret = socreate(PF_INET, &SCTP_BASE_INFO(udp4_tun_socket), SOCK_DGRAM, IPPROTO_UDP, curthread->td_ucred, curthread))) { sctp_over_udp_stop(); return (ret); } /* Call the special UDP hook. */ if ((ret = udp_set_kernel_tunneling(SCTP_BASE_INFO(udp4_tun_socket), sctp_recv_udp_tunneled_packet, sctp_recv_icmp_tunneled_packet, NULL))) { sctp_over_udp_stop(); return (ret); } /* Ok, we have a socket, bind it to the port. */ memset(&sin, 0, sizeof(struct sockaddr_in)); sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_port = htons(port); if ((ret = sobind(SCTP_BASE_INFO(udp4_tun_socket), (struct sockaddr *)&sin, curthread))) { sctp_over_udp_stop(); return (ret); } #endif #ifdef INET6 if ((ret = socreate(PF_INET6, &SCTP_BASE_INFO(udp6_tun_socket), SOCK_DGRAM, IPPROTO_UDP, curthread->td_ucred, curthread))) { sctp_over_udp_stop(); return (ret); } /* Call the special UDP hook. */ if ((ret = udp_set_kernel_tunneling(SCTP_BASE_INFO(udp6_tun_socket), sctp_recv_udp_tunneled_packet, sctp_recv_icmp6_tunneled_packet, NULL))) { sctp_over_udp_stop(); return (ret); } /* Ok, we have a socket, bind it to the port. */ memset(&sin6, 0, sizeof(struct sockaddr_in6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_port = htons(port); if ((ret = sobind(SCTP_BASE_INFO(udp6_tun_socket), (struct sockaddr *)&sin6, curthread))) { sctp_over_udp_stop(); return (ret); } #endif return (0); } /* * sctp_min_mtu ()returns the minimum of all non-zero arguments. * If all arguments are zero, zero is returned. */ uint32_t sctp_min_mtu(uint32_t mtu1, uint32_t mtu2, uint32_t mtu3) { if (mtu1 > 0) { if (mtu2 > 0) { if (mtu3 > 0) { return (min(mtu1, min(mtu2, mtu3))); } else { return (min(mtu1, mtu2)); } } else { if (mtu3 > 0) { return (min(mtu1, mtu3)); } else { return (mtu1); } } } else { if (mtu2 > 0) { if (mtu3 > 0) { return (min(mtu2, mtu3)); } else { return (mtu2); } } else { return (mtu3); } } } void sctp_hc_set_mtu(union sctp_sockstore *addr, uint16_t fibnum, uint32_t mtu) { struct in_conninfo inc; memset(&inc, 0, sizeof(struct in_conninfo)); inc.inc_fibnum = fibnum; switch (addr->sa.sa_family) { #ifdef INET case AF_INET: inc.inc_faddr = addr->sin.sin_addr; break; #endif #ifdef INET6 case AF_INET6: inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = addr->sin6.sin6_addr; break; #endif default: return; } tcp_hc_updatemtu(&inc, (u_long)mtu); } uint32_t sctp_hc_get_mtu(union sctp_sockstore *addr, uint16_t fibnum) { struct in_conninfo inc; memset(&inc, 0, sizeof(struct in_conninfo)); inc.inc_fibnum = fibnum; switch (addr->sa.sa_family) { #ifdef INET case AF_INET: inc.inc_faddr = addr->sin.sin_addr; break; #endif #ifdef INET6 case AF_INET6: inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = addr->sin6.sin6_addr; break; #endif default: return (0); } return ((uint32_t)tcp_hc_getmtu(&inc)); } void sctp_set_state(struct sctp_tcb *stcb, int new_state) { #if defined(KDTRACE_HOOKS) int old_state = stcb->asoc.state; #endif KASSERT((new_state & ~SCTP_STATE_MASK) == 0, ("sctp_set_state: Can't set substate (new_state = %x)", new_state)); stcb->asoc.state = (stcb->asoc.state & ~SCTP_STATE_MASK) | new_state; if ((new_state == SCTP_STATE_SHUTDOWN_RECEIVED) || (new_state == SCTP_STATE_SHUTDOWN_SENT) || (new_state == SCTP_STATE_SHUTDOWN_ACK_SENT)) { SCTP_CLEAR_SUBSTATE(stcb, SCTP_STATE_SHUTDOWN_PENDING); } #if defined(KDTRACE_HOOKS) if (((old_state & SCTP_STATE_MASK) != new_state) && !(((old_state & SCTP_STATE_MASK) == SCTP_STATE_EMPTY) && (new_state == SCTP_STATE_INUSE))) { SCTP_PROBE6(state__change, NULL, stcb, NULL, stcb, NULL, old_state); } #endif } void sctp_add_substate(struct sctp_tcb *stcb, int substate) { #if defined(KDTRACE_HOOKS) int old_state = stcb->asoc.state; #endif KASSERT((substate & SCTP_STATE_MASK) == 0, ("sctp_add_substate: Can't set state (substate = %x)", substate)); stcb->asoc.state |= substate; #if defined(KDTRACE_HOOKS) if (((substate & SCTP_STATE_ABOUT_TO_BE_FREED) && ((old_state & SCTP_STATE_ABOUT_TO_BE_FREED) == 0)) || ((substate & SCTP_STATE_SHUTDOWN_PENDING) && ((old_state & SCTP_STATE_SHUTDOWN_PENDING) == 0))) { SCTP_PROBE6(state__change, NULL, stcb, NULL, stcb, NULL, old_state); } #endif } Index: projects/clang1100-import/sys/netinet/sctputil.h =================================================================== --- projects/clang1100-import/sys/netinet/sctputil.h (revision 364278) +++ projects/clang1100-import/sys/netinet/sctputil.h (revision 364279) @@ -1,380 +1,380 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved. * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved. * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * a) Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * b) Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the distribution. * * c) Neither the name of Cisco Systems, Inc. nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #ifndef _NETINET_SCTP_UTIL_H_ #define _NETINET_SCTP_UTIL_H_ #if defined(_KERNEL) || defined(__Userspace__) #define SCTP_READ_LOCK_HELD 1 #define SCTP_READ_LOCK_NOT_HELD 0 #ifdef SCTP_ASOCLOG_OF_TSNS void sctp_print_out_track_log(struct sctp_tcb *stcb); #endif #ifdef SCTP_MBUF_LOGGING struct mbuf *sctp_m_free(struct mbuf *m); void sctp_m_freem(struct mbuf *m); #else #define sctp_m_free m_free #define sctp_m_freem m_freem #endif #if defined(SCTP_LOCAL_TRACE_BUF) void sctp_log_trace(uint32_t fr, const char *str SCTP_UNUSED, uint32_t a, uint32_t b, uint32_t c, uint32_t d, uint32_t e, uint32_t f); #endif #define sctp_get_associd(stcb) ((sctp_assoc_t)stcb->asoc.assoc_id) /* * Function prototypes */ int32_t sctp_map_assoc_state(int); uint32_t sctp_get_ifa_hash_val(struct sockaddr *addr); struct sctp_ifa *sctp_find_ifa_in_ep(struct sctp_inpcb *inp, struct sockaddr *addr, int hold_lock); struct sctp_ifa *sctp_find_ifa_by_addr(struct sockaddr *addr, uint32_t vrf_id, int holds_lock); uint32_t sctp_select_initial_TSN(struct sctp_pcb *); uint32_t sctp_select_a_tag(struct sctp_inpcb *, uint16_t lport, uint16_t rport, int); int sctp_init_asoc(struct sctp_inpcb *, struct sctp_tcb *, uint32_t, uint32_t, uint16_t); void sctp_fill_random_store(struct sctp_pcb *); void sctp_notify_stream_reset_add(struct sctp_tcb *stcb, uint16_t numberin, uint16_t numberout, int flag); void sctp_notify_stream_reset_tsn(struct sctp_tcb *stcb, uint32_t sending_tsn, uint32_t recv_tsn, int flag); /* * NOTE: sctp_timer_start() will increment the reference count of any relevant * structure the timer is referencing, in order to prevent a race condition * between the timer executing and the structure being freed. * * When the timer fires or sctp_timer_stop() is called, these references are * removed. */ void sctp_timer_start(int, struct sctp_inpcb *, struct sctp_tcb *, struct sctp_nets *); void sctp_timer_stop(int, struct sctp_inpcb *, struct sctp_tcb *, struct sctp_nets *, uint32_t); int sctp_dynamic_set_primary(struct sockaddr *sa, uint32_t vrf_id); void sctp_mtu_size_reset(struct sctp_inpcb *, struct sctp_association *, uint32_t); void sctp_wakeup_the_read_socket(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int so_locked SCTP_UNUSED ); void sctp_add_to_readq(struct sctp_inpcb *inp, struct sctp_tcb *stcb, struct sctp_queued_to_read *control, struct sockbuf *sb, int end, int inpread_locked, int so_locked); void sctp_iterator_worker(void); uint32_t sctp_get_prev_mtu(uint32_t); uint32_t sctp_get_next_mtu(uint32_t); void sctp_timeout_handler(void *); int sctp_calculate_rto(struct sctp_tcb *, struct sctp_association *, struct sctp_nets *, struct timeval *, int); uint32_t sctp_calculate_len(struct mbuf *); caddr_t sctp_m_getptr(struct mbuf *, int, int, uint8_t *); struct sctp_paramhdr * sctp_get_next_param(struct mbuf *, int, struct sctp_paramhdr *, int); struct mbuf *sctp_add_pad_tombuf(struct mbuf *, int); struct mbuf *sctp_pad_lastmbuf(struct mbuf *, int, struct mbuf *); void sctp_ulp_notify(uint32_t, struct sctp_tcb *, uint32_t, void *, int); void sctp_pull_off_control_to_new_inp(struct sctp_inpcb *old_inp, struct sctp_inpcb *new_inp, struct sctp_tcb *stcb, int waitflags); void sctp_stop_timers_for_shutdown(struct sctp_tcb *); /* Stop all timers for association and remote addresses. */ void sctp_stop_association_timers(struct sctp_tcb *, bool); -void sctp_report_all_outbound(struct sctp_tcb *, uint16_t, int, int); +void sctp_report_all_outbound(struct sctp_tcb *, uint16_t, int); int sctp_expand_mapping_array(struct sctp_association *, uint32_t); void sctp_abort_notification(struct sctp_tcb *, uint8_t, uint16_t, struct sctp_abort_chunk *, int); /* We abort responding to an IP packet for some reason */ void sctp_abort_association(struct sctp_inpcb *, struct sctp_tcb *, struct mbuf *, int, struct sockaddr *, struct sockaddr *, struct sctphdr *, struct mbuf *, uint8_t, uint32_t, uint32_t, uint16_t); /* We choose to abort via user input */ void sctp_abort_an_association(struct sctp_inpcb *, struct sctp_tcb *, struct mbuf *, int); void sctp_handle_ootb(struct mbuf *, int, int, struct sockaddr *, struct sockaddr *, struct sctphdr *, struct sctp_inpcb *, struct mbuf *, uint8_t, uint32_t, uint16_t, uint32_t, uint16_t); int sctp_connectx_helper_add(struct sctp_tcb *stcb, struct sockaddr *addr, int totaddr, int *error); int sctp_connectx_helper_find(struct sctp_inpcb *, struct sockaddr *, unsigned int, unsigned int *, unsigned int *, unsigned int); int sctp_is_there_an_abort_here(struct mbuf *, int, uint32_t *); #ifdef INET6 uint32_t sctp_is_same_scope(struct sockaddr_in6 *, struct sockaddr_in6 *); struct sockaddr_in6 *sctp_recover_scope(struct sockaddr_in6 *, struct sockaddr_in6 *); #define sctp_recover_scope_mac(addr, store) do { \ if ((addr->sin6_family == AF_INET6) && \ (IN6_IS_SCOPE_LINKLOCAL(&addr->sin6_addr))) { \ *store = *addr; \ if (addr->sin6_scope_id == 0) { \ if (!sa6_recoverscope(store)) { \ addr = store; \ } \ } else { \ in6_clearscope(&addr->sin6_addr); \ addr = store; \ } \ } \ } while (0) #endif int sctp_cmpaddr(struct sockaddr *, struct sockaddr *); void sctp_print_address(struct sockaddr *); int sctp_release_pr_sctp_chunk(struct sctp_tcb *, struct sctp_tmit_chunk *, uint8_t, int); struct mbuf *sctp_generate_cause(uint16_t, char *); struct mbuf *sctp_generate_no_user_data_cause(uint32_t); void sctp_bindx_add_address(struct socket *so, struct sctp_inpcb *inp, struct sockaddr *sa, uint32_t vrf_id, int *error, void *p); void sctp_bindx_delete_address(struct sctp_inpcb *inp, struct sockaddr *sa, uint32_t vrf_id, int *error); int sctp_local_addr_count(struct sctp_tcb *stcb); #ifdef SCTP_MBCNT_LOGGING void sctp_free_bufspace(struct sctp_tcb *, struct sctp_association *, struct sctp_tmit_chunk *, int); #else #define sctp_free_bufspace(stcb, asoc, tp1, chk_cnt) \ do { \ if (tp1->data != NULL) { \ atomic_subtract_int(&((asoc)->chunks_on_out_queue), chk_cnt); \ if ((asoc)->total_output_queue_size >= tp1->book_size) { \ atomic_subtract_int(&((asoc)->total_output_queue_size), tp1->book_size); \ } else { \ (asoc)->total_output_queue_size = 0; \ } \ if (stcb->sctp_socket && ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || \ (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { \ if (stcb->sctp_socket->so_snd.sb_cc >= tp1->book_size) { \ atomic_subtract_int(&((stcb)->sctp_socket->so_snd.sb_cc), tp1->book_size); \ } else { \ stcb->sctp_socket->so_snd.sb_cc = 0; \ } \ } \ } \ } while (0) #endif #define sctp_free_spbufspace(stcb, asoc, sp) \ do { \ if (sp->data != NULL) { \ if ((asoc)->total_output_queue_size >= sp->length) { \ atomic_subtract_int(&(asoc)->total_output_queue_size, sp->length); \ } else { \ (asoc)->total_output_queue_size = 0; \ } \ if (stcb->sctp_socket && ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || \ (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { \ if (stcb->sctp_socket->so_snd.sb_cc >= sp->length) { \ atomic_subtract_int(&stcb->sctp_socket->so_snd.sb_cc,sp->length); \ } else { \ stcb->sctp_socket->so_snd.sb_cc = 0; \ } \ } \ } \ } while (0) #define sctp_snd_sb_alloc(stcb, sz) \ do { \ atomic_add_int(&stcb->asoc.total_output_queue_size,sz); \ if ((stcb->sctp_socket != NULL) && \ ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || \ (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) { \ atomic_add_int(&stcb->sctp_socket->so_snd.sb_cc,sz); \ } \ } while (0) /* functions to start/stop udp tunneling */ void sctp_over_udp_stop(void); int sctp_over_udp_start(void); int sctp_soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp); void sctp_misc_ints(uint8_t from, uint32_t a, uint32_t b, uint32_t c, uint32_t d); void sctp_wakeup_log(struct sctp_tcb *stcb, uint32_t wake_cnt, int from); void sctp_log_strm_del_alt(struct sctp_tcb *stcb, uint32_t, uint16_t, uint16_t, int); void sctp_log_nagle_event(struct sctp_tcb *stcb, int action); #ifdef SCTP_MBUF_LOGGING void sctp_log_mb(struct mbuf *m, int from); void sctp_log_mbc(struct mbuf *m, int from); #endif void sctp_sblog(struct sockbuf *sb, struct sctp_tcb *stcb, int from, int incr); void sctp_log_strm_del(struct sctp_queued_to_read *control, struct sctp_queued_to_read *poschk, int from); void sctp_log_cwnd(struct sctp_tcb *stcb, struct sctp_nets *, int, uint8_t); void rto_logging(struct sctp_nets *net, int from); void sctp_log_closing(struct sctp_inpcb *inp, struct sctp_tcb *stcb, int16_t loc); void sctp_log_lock(struct sctp_inpcb *inp, struct sctp_tcb *stcb, uint8_t from); void sctp_log_maxburst(struct sctp_tcb *stcb, struct sctp_nets *, int, int, uint8_t); void sctp_log_block(uint8_t, struct sctp_association *, ssize_t); void sctp_log_rwnd(uint8_t, uint32_t, uint32_t, uint32_t); void sctp_log_rwnd_set(uint8_t, uint32_t, uint32_t, uint32_t, uint32_t); int sctp_fill_stat_log(void *, size_t *); void sctp_log_fr(uint32_t, uint32_t, uint32_t, int); void sctp_log_sack(uint32_t, uint32_t, uint32_t, uint16_t, uint16_t, int); void sctp_log_map(uint32_t, uint32_t, uint32_t, int); void sctp_print_mapping_array(struct sctp_association *asoc); void sctp_clr_stat_log(void); #ifdef SCTP_AUDITING_ENABLED void sctp_auditing(int, struct sctp_inpcb *, struct sctp_tcb *, struct sctp_nets *); void sctp_audit_log(uint8_t, uint8_t); #endif uint32_t sctp_min_mtu(uint32_t, uint32_t, uint32_t); void sctp_hc_set_mtu(union sctp_sockstore *, uint16_t, uint32_t); uint32_t sctp_hc_get_mtu(union sctp_sockstore *, uint16_t); void sctp_set_state(struct sctp_tcb *, int); void sctp_add_substate(struct sctp_tcb *, int); uint32_t sctp_ticks_to_msecs(uint32_t); uint32_t sctp_msecs_to_ticks(uint32_t); uint32_t sctp_ticks_to_secs(uint32_t); uint32_t sctp_secs_to_ticks(uint32_t); #endif /* _KERNEL */ #endif Index: projects/clang1100-import/sys/sys/namei.h =================================================================== --- projects/clang1100-import/sys/sys/namei.h (revision 364278) +++ projects/clang1100-import/sys/sys/namei.h (revision 364279) @@ -1,276 +1,277 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1985, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)namei.h 8.5 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _SYS_NAMEI_H_ #define _SYS_NAMEI_H_ #include #include #include #include struct componentname { /* * Arguments to lookup. */ u_long cn_nameiop; /* namei operation */ u_int64_t cn_flags; /* flags to namei */ struct thread *cn_thread;/* thread requesting lookup */ struct ucred *cn_cred; /* credentials */ int cn_lkflags; /* Lock flags LK_EXCLUSIVE or LK_SHARED */ /* * Shared between lookup and commit routines. */ char *cn_pnbuf; /* pathname buffer */ char *cn_nameptr; /* pointer to looked up name */ long cn_namelen; /* length of looked up component */ }; struct nameicap_tracker; TAILQ_HEAD(nameicap_tracker_head, nameicap_tracker); /* * Encapsulation of namei parameters. */ struct nameidata { /* * Arguments to namei/lookup. */ const char *ni_dirp; /* pathname pointer */ enum uio_seg ni_segflg; /* location of pathname */ cap_rights_t *ni_rightsneeded; /* rights required to look up vnode */ /* * Arguments to lookup. */ struct vnode *ni_startdir; /* starting directory */ struct vnode *ni_rootdir; /* logical root directory */ struct vnode *ni_topdir; /* logical top directory */ int ni_dirfd; /* starting directory for *at functions */ int ni_lcf; /* local call flags */ /* * Results: returned from namei */ struct filecaps ni_filecaps; /* rights the *at base has */ /* * Results: returned from/manipulated by lookup */ struct vnode *ni_vp; /* vnode of result */ struct vnode *ni_dvp; /* vnode of intermediate directory */ /* * Results: flags returned from namei */ u_int ni_resflags; /* * Shared between namei and lookup/commit routines. */ size_t ni_pathlen; /* remaining chars in path */ char *ni_next; /* next location in pathname */ u_int ni_loopcnt; /* count of symlinks encountered */ /* * Lookup parameters: this structure describes the subset of * information from the nameidata structure that is passed * through the VOP interface. */ struct componentname ni_cnd; struct nameicap_tracker_head ni_cap_tracker; struct vnode *ni_beneath_latch; }; #ifdef _KERNEL enum cache_fpl_status { CACHE_FPL_STATUS_ABORTED, CACHE_FPL_STATUS_PARTIAL, CACHE_FPL_STATUS_HANDLED, CACHE_FPL_STATUS_UNSET }; int cache_fplookup(struct nameidata *ndp, enum cache_fpl_status *status, struct pwd **pwdp); /* * namei operations */ #define LOOKUP 0 /* perform name lookup only */ #define CREATE 1 /* setup for file creation */ #define DELETE 2 /* setup for file deletion */ #define RENAME 3 /* setup for file renaming */ #define OPMASK 3 /* mask for operation */ /* * namei operational modifier flags, stored in ni_cnd.flags */ #define LOCKLEAF 0x0004 /* lock vnode on return */ #define LOCKPARENT 0x0008 /* want parent vnode returned locked */ #define WANTPARENT 0x0010 /* want parent vnode returned unlocked */ #define NOCACHE 0x0020 /* name must not be left in cache */ #define FOLLOW 0x0040 /* follow symbolic links */ #define BENEATH 0x0080 /* No escape from the start dir */ #define LOCKSHARED 0x0100 /* Shared lock leaf */ #define NOFOLLOW 0x0000 /* do not follow symbolic links (pseudo) */ #define MODMASK 0x01fc /* mask of operational modifiers */ /* * Namei parameter descriptors. * * SAVENAME may be set by either the callers of namei or by VOP_LOOKUP. * If the caller of namei sets the flag (for example execve wants to * know the name of the program that is being executed), then it must * free the buffer. If VOP_LOOKUP sets the flag, then the buffer must * be freed by either the commit routine or the VOP_ABORT routine. * SAVESTART is set only by the callers of namei. It implies SAVENAME * plus the addition of saving the parent directory that contains the * name in ni_startdir. It allows repeated calls to lookup for the * name being sought. The caller is responsible for releasing the * buffer and for vrele'ing ni_startdir. */ #define RDONLY 0x00000200 /* lookup with read-only semantics */ -#define HASBUF 0x00000400 /* has allocated pathname buffer */ -#define SAVENAME 0x00000800 /* save pathname buffer */ -#define SAVESTART 0x00001000 /* save starting directory */ -#define ISWHITEOUT 0x00002000 /* found whiteout */ -#define DOWHITEOUT 0x00004000 /* do whiteouts */ -#define WILLBEDIR 0x00008000 /* new files will be dirs; allow trailing / */ -#define ISOPEN 0x00010000 /* caller is opening; return a real vnode. */ -#define NOCROSSMOUNT 0x00020000 /* do not cross mount points */ -#define NOMACCHECK 0x00040000 /* do not perform MAC checks */ -#define AUDITVNODE1 0x00080000 /* audit the looked up vnode information */ -#define AUDITVNODE2 0x00100000 /* audit the looked up vnode information */ -#define NOCAPCHECK 0x00200000 /* do not perform capability checks */ +#define SAVENAME 0x00000400 /* save pathname buffer */ +#define SAVESTART 0x00000800 /* save starting directory */ +#define ISWHITEOUT 0x00001000 /* found whiteout */ +#define DOWHITEOUT 0x00002000 /* do whiteouts */ +#define WILLBEDIR 0x00004000 /* new files will be dirs; allow trailing / */ +#define ISOPEN 0x00008000 /* caller is opening; return a real vnode. */ +#define NOCROSSMOUNT 0x00010000 /* do not cross mount points */ +#define NOMACCHECK 0x00020000 /* do not perform MAC checks */ +#define AUDITVNODE1 0x00040000 /* audit the looked up vnode information */ +#define AUDITVNODE2 0x00080000 /* audit the looked up vnode information */ +#define NOCAPCHECK 0x00100000 /* do not perform capability checks */ /* UNUSED 0x00400000 */ +/* UNUSED 0x00200000 */ /* UNUSED 0x00800000 */ -/* UNUSED 0x01000000 */ +#define HASBUF 0x01000000 /* has allocated pathname buffer */ #define NOEXECCHECK 0x02000000 /* do not perform exec check on dir */ #define MAKEENTRY 0x04000000 /* entry is to be added to name cache */ #define ISSYMLINK 0x08000000 /* symlink needs interpretation */ #define ISLASTCN 0x10000000 /* this is last component of pathname */ #define ISDOTDOT 0x20000000 /* current component name is .. */ #define TRAILINGSLASH 0x40000000 /* path ended in a slash */ #define PARAMASK 0x7ffffe00 /* mask of parameter descriptors */ /* * Flags which must not be passed in by callers. */ #define NAMEI_INTERNAL_FLAGS \ - (NOEXECCHECK | MAKEENTRY | ISSYMLINK | ISLASTCN | ISDOTDOT | TRAILINGSLASH) + (HASBUF | NOEXECCHECK | MAKEENTRY | ISSYMLINK | ISLASTCN | ISDOTDOT | \ + TRAILINGSLASH) /* * Namei results flags */ #define NIRES_ABS 0x00000001 /* Path was absolute */ /* * Flags in ni_lcf, valid for the duration of the namei call. */ #define NI_LCF_STRICTRELATIVE 0x0001 /* relative lookup only */ #define NI_LCF_CAP_DOTDOT 0x0002 /* ".." in strictrelative case */ #define NI_LCF_BENEATH_ABS 0x0004 /* BENEATH with absolute path */ #define NI_LCF_BENEATH_LATCHED 0x0008 /* BENEATH_ABS traversed starting dir */ #define NI_LCF_LATCH 0x0010 /* ni_beneath_latch valid */ /* * Initialization of a nameidata structure. */ #define NDINIT(ndp, op, flags, segflg, namep, td) \ NDINIT_ALL(ndp, op, flags, segflg, namep, AT_FDCWD, NULL, &cap_no_rights, td) #define NDINIT_AT(ndp, op, flags, segflg, namep, dirfd, td) \ NDINIT_ALL(ndp, op, flags, segflg, namep, dirfd, NULL, &cap_no_rights, td) #define NDINIT_ATRIGHTS(ndp, op, flags, segflg, namep, dirfd, rightsp, td) \ NDINIT_ALL(ndp, op, flags, segflg, namep, dirfd, NULL, rightsp, td) #define NDINIT_ATVP(ndp, op, flags, segflg, namep, vp, td) \ NDINIT_ALL(ndp, op, flags, segflg, namep, AT_FDCWD, vp, &cap_no_rights, td) /* * Note the constant pattern may *hide* bugs. */ #ifdef INVARIANTS #define NDINIT_PREFILL(arg) memset(arg, 0xff, sizeof(*arg)) #else #define NDINIT_PREFILL(arg) do { } while (0) #endif #define NDINIT_ALL(ndp, op, flags, segflg, namep, dirfd, startdir, rightsp, td) \ do { \ struct nameidata *_ndp = (ndp); \ cap_rights_t *_rightsp = (rightsp); \ MPASS(_rightsp != NULL); \ NDINIT_PREFILL(_ndp); \ _ndp->ni_cnd.cn_nameiop = op; \ _ndp->ni_cnd.cn_flags = flags; \ _ndp->ni_segflg = segflg; \ _ndp->ni_dirp = namep; \ _ndp->ni_dirfd = dirfd; \ _ndp->ni_startdir = startdir; \ _ndp->ni_resflags = 0; \ filecaps_init(&_ndp->ni_filecaps); \ _ndp->ni_cnd.cn_thread = td; \ _ndp->ni_rightsneeded = _rightsp; \ } while (0) #define NDF_NO_DVP_RELE 0x00000001 #define NDF_NO_DVP_UNLOCK 0x00000002 #define NDF_NO_DVP_PUT 0x00000003 #define NDF_NO_VP_RELE 0x00000004 #define NDF_NO_VP_UNLOCK 0x00000008 #define NDF_NO_VP_PUT 0x0000000c #define NDF_NO_STARTDIR_RELE 0x00000010 #define NDF_NO_FREE_PNBUF 0x00000020 #define NDF_ONLY_PNBUF (~NDF_NO_FREE_PNBUF) void NDFREE_PNBUF(struct nameidata *); void NDFREE(struct nameidata *, const u_int); #define NDFREE(ndp, flags) do { \ struct nameidata *_ndp = (ndp); \ if (__builtin_constant_p(flags) && flags == NDF_ONLY_PNBUF) \ NDFREE_PNBUF(_ndp); \ else \ NDFREE(_ndp, flags); \ } while (0) int namei(struct nameidata *ndp); int lookup(struct nameidata *ndp); int relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp); #endif /* * Stats on usefulness of namei caches. */ struct nchstats { long ncs_goodhits; /* hits that we can really use */ long ncs_neghits; /* negative hits that we can use */ long ncs_badhits; /* hits we must drop */ long ncs_falsehits; /* hits with id mismatch */ long ncs_miss; /* misses */ long ncs_long; /* long names that ignore cache */ long ncs_pass2; /* names found with passes == 2 */ long ncs_2passes; /* number of times we attempt it */ }; extern struct nchstats nchstats; #endif /* !_SYS_NAMEI_H_ */ Index: projects/clang1100-import/sys/sys/param.h =================================================================== --- projects/clang1100-import/sys/sys/param.h (revision 364278) +++ projects/clang1100-import/sys/sys/param.h (revision 364279) @@ -1,370 +1,370 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)param.h 8.3 (Berkeley) 4/4/95 * $FreeBSD$ */ #ifndef _SYS_PARAM_H_ #define _SYS_PARAM_H_ #include #define BSD 199506 /* System version (year & month). */ #define BSD4_3 1 #define BSD4_4 1 /* * __FreeBSD_version numbers are documented in the Porter's Handbook. * If you bump the version for any reason, you should update the documentation * there. * Currently this lives here in the doc/ repository: * * head/en_US.ISO8859-1/books/porters-handbook/versions/chapter.xml * * scheme is: Rxx * 'R' is in the range 0 to 4 if this is a release branch or * X.0-CURRENT before releng/X.0 is created, otherwise 'R' is * in the range 5 to 9. */ #undef __FreeBSD_version -#define __FreeBSD_version 1300108 /* Master, propagated to newvers */ +#define __FreeBSD_version 1300109 /* Master, propagated to newvers */ /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, * which by definition is always true on FreeBSD. This macro is also defined * on other systems that use the kernel of FreeBSD, such as GNU/kFreeBSD. * * It is tempting to use this macro in userland code when we want to enable * kernel-specific routines, and in fact it's fine to do this in code that * is part of FreeBSD itself. However, be aware that as presence of this * macro is still not widespread (e.g. older FreeBSD versions, 3rd party * compilers, etc), it is STRONGLY DISCOURAGED to check for this macro in * external applications without also checking for __FreeBSD__ as an * alternative. */ #undef __FreeBSD_kernel__ #define __FreeBSD_kernel__ #if defined(_KERNEL) || defined(IN_RTLD) #define P_OSREL_SIGWAIT 700000 #define P_OSREL_SIGSEGV 700004 #define P_OSREL_MAP_ANON 800104 #define P_OSREL_MAP_FSTRICT 1100036 #define P_OSREL_SHUTDOWN_ENOTCONN 1100077 #define P_OSREL_MAP_GUARD 1200035 #define P_OSREL_WRFSBASE 1200041 #define P_OSREL_CK_CYLGRP 1200046 #define P_OSREL_VMTOTAL64 1200054 #define P_OSREL_CK_SUPERBLOCK 1300000 #define P_OSREL_CK_INODE 1300005 #define P_OSREL_POWERPC_NEW_AUX_ARGS 1300070 #define P_OSREL_MAJOR(x) ((x) / 100000) #endif #ifndef LOCORE #include #endif /* * Machine-independent constants (some used in following include files). * Redefined constants are from POSIX 1003.1 limits file. * * MAXCOMLEN should be >= sizeof(ac_comm) (see ) */ #include #define MAXCOMLEN 19 /* max command name remembered */ #define MAXINTERP PATH_MAX /* max interpreter file name length */ #define MAXLOGNAME 33 /* max login name length (incl. NUL) */ #define MAXUPRC CHILD_MAX /* max simultaneous processes */ #define NCARGS ARG_MAX /* max bytes for an exec function */ #define NGROUPS (NGROUPS_MAX+1) /* max number groups */ #define NOFILE OPEN_MAX /* max open files per process */ #define NOGROUP 65535 /* marker for empty group set member */ #define MAXHOSTNAMELEN 256 /* max hostname size */ #define SPECNAMELEN 255 /* max length of devicename */ /* More types and definitions used throughout the kernel. */ #ifdef _KERNEL #include #include #ifndef LOCORE #include #include #endif #ifndef FALSE #define FALSE 0 #endif #ifndef TRUE #define TRUE 1 #endif #endif #ifndef _KERNEL #ifndef LOCORE /* Signals. */ #include #endif #endif /* Machine type dependent parameters. */ #include #ifndef _KERNEL #include #endif #ifndef DEV_BSHIFT #define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ #endif #define DEV_BSIZE (1<>PAGE_SHIFT) #endif /* * btodb() is messy and perhaps slow because `bytes' may be an off_t. We * want to shift an unsigned type to avoid sign extension and we don't * want to widen `bytes' unnecessarily. Assume that the result fits in * a daddr_t. */ #ifndef btodb #define btodb(bytes) /* calculates (bytes / DEV_BSIZE) */ \ (sizeof (bytes) > sizeof(long) \ ? (daddr_t)((unsigned long long)(bytes) >> DEV_BSHIFT) \ : (daddr_t)((unsigned long)(bytes) >> DEV_BSHIFT)) #endif #ifndef dbtob #define dbtob(db) /* calculates (db * DEV_BSIZE) */ \ ((off_t)(db) << DEV_BSHIFT) #endif #define PRIMASK 0x0ff #define PCATCH 0x100 /* OR'd with pri for tsleep to check signals */ #define PDROP 0x200 /* OR'd with pri to stop re-entry of interlock mutex */ #define NZERO 0 /* default "nice" */ #define NBBY 8 /* number of bits in a byte */ #define NBPW sizeof(int) /* number of bytes per word (integer) */ #define CMASK 022 /* default file mask: S_IWGRP|S_IWOTH */ #define NODEV (dev_t)(-1) /* non-existent device */ /* * File system parameters and macros. * * MAXBSIZE - Filesystems are made out of blocks of at most MAXBSIZE bytes * per block. MAXBSIZE may be made larger without effecting * any existing filesystems as long as it does not exceed MAXPHYS, * and may be made smaller at the risk of not being able to use * filesystems which require a block size exceeding MAXBSIZE. * * MAXBCACHEBUF - Maximum size of a buffer in the buffer cache. This must * be >= MAXBSIZE and can be set differently for different * architectures by defining it in . * Making this larger allows NFS to do larger reads/writes. * * BKVASIZE - Nominal buffer space per buffer, in bytes. BKVASIZE is the * minimum KVM memory reservation the kernel is willing to make. * Filesystems can of course request smaller chunks. Actual * backing memory uses a chunk size of a page (PAGE_SIZE). * The default value here can be overridden on a per-architecture * basis by defining it in . * * If you make BKVASIZE too small you risk seriously fragmenting * the buffer KVM map which may slow things down a bit. If you * make it too big the kernel will not be able to optimally use * the KVM memory reserved for the buffer cache and will wind * up with too-few buffers. * * The default is 16384, roughly 2x the block size used by a * normal UFS filesystem. */ #define MAXBSIZE 65536 /* must be power of 2 */ #ifndef MAXBCACHEBUF #define MAXBCACHEBUF MAXBSIZE /* must be a power of 2 >= MAXBSIZE */ #endif #ifndef BKVASIZE #define BKVASIZE 16384 /* must be power of 2 */ #endif #define BKVAMASK (BKVASIZE-1) /* * MAXPATHLEN defines the longest permissible path length after expanding * symbolic links. It is used to allocate a temporary buffer from the buffer * pool in which to do the name expansion, hence should be a power of two, * and must be less than or equal to MAXBSIZE. MAXSYMLINKS defines the * maximum number of symbolic links that may be expanded in a path name. * It should be set high enough to allow all legitimate uses, but halt * infinite loops reasonably quickly. */ #define MAXPATHLEN PATH_MAX #define MAXSYMLINKS 32 /* Bit map related macros. */ #define setbit(a,i) (((unsigned char *)(a))[(i)/NBBY] |= 1<<((i)%NBBY)) #define clrbit(a,i) (((unsigned char *)(a))[(i)/NBBY] &= ~(1<<((i)%NBBY))) #define isset(a,i) \ (((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) #define isclr(a,i) \ ((((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) == 0) /* Macros for counting and rounding. */ #ifndef howmany #define howmany(x, y) (((x)+((y)-1))/(y)) #endif #define nitems(x) (sizeof((x)) / sizeof((x)[0])) #define rounddown(x, y) (((x)/(y))*(y)) #define rounddown2(x, y) ((x)&(~((y)-1))) /* if y is power of two */ #define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */ #define roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */ #define powerof2(x) ((((x)-1)&(x))==0) /* Macros for min/max. */ #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) #ifdef _KERNEL /* * Basic byte order function prototypes for non-inline functions. */ #ifndef LOCORE #ifndef _BYTEORDER_PROTOTYPED #define _BYTEORDER_PROTOTYPED __BEGIN_DECLS __uint32_t htonl(__uint32_t); __uint16_t htons(__uint16_t); __uint32_t ntohl(__uint32_t); __uint16_t ntohs(__uint16_t); __END_DECLS #endif #endif #ifndef _BYTEORDER_FUNC_DEFINED #define _BYTEORDER_FUNC_DEFINED #define htonl(x) __htonl(x) #define htons(x) __htons(x) #define ntohl(x) __ntohl(x) #define ntohs(x) __ntohs(x) #endif /* !_BYTEORDER_FUNC_DEFINED */ #endif /* _KERNEL */ /* * Scale factor for scaled integers used to count %cpu time and load avgs. * * The number of CPU `tick's that map to a unique `%age' can be expressed * by the formula (1 / (2 ^ (FSHIFT - 11))). The maximum load average that * can be calculated (assuming 32 bits) can be closely approximated using * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15). * * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age', * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024. */ #define FSHIFT 11 /* bits to right of fixed binary point */ #define FSCALE (1<> (PAGE_SHIFT - DEV_BSHIFT)) #define ctodb(db) /* calculates pages to devblks */ \ ((db) << (PAGE_SHIFT - DEV_BSHIFT)) /* * Old spelling of __containerof(). */ #define member2struct(s, m, x) \ ((struct s *)(void *)((char *)(x) - offsetof(struct s, m))) /* * Access a variable length array that has been declared as a fixed * length array. */ #define __PAST_END(array, offset) (((__typeof__(*(array)) *)(array))[offset]) #endif /* _SYS_PARAM_H_ */ Index: projects/clang1100-import/sys/sys/vnode.h =================================================================== --- projects/clang1100-import/sys/sys/vnode.h (revision 364278) +++ projects/clang1100-import/sys/sys/vnode.h (revision 364279) @@ -1,1070 +1,1070 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vnode.h 8.7 (Berkeley) 2/4/94 * $FreeBSD$ */ #ifndef _SYS_VNODE_H_ #define _SYS_VNODE_H_ #include #include #include #include #include #include #include #include #include #include #include /* * The vnode is the focus of all file activity in UNIX. There is a * unique vnode allocated for each active file, each current directory, * each mounted-on file, text file, and the root. */ /* * Vnode types. VNON means no type. */ enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD, VMARKER }; enum vgetstate { VGET_NONE, VGET_HOLDCNT, VGET_USECOUNT }; /* * Each underlying filesystem allocates its own private area and hangs * it from v_data. If non-null, this area is freed in getnewvnode(). */ struct namecache; struct vpollinfo { struct mtx vpi_lock; /* lock to protect below */ struct selinfo vpi_selinfo; /* identity of poller(s) */ short vpi_events; /* what they are looking for */ short vpi_revents; /* what has happened */ }; /* * Reading or writing any of these items requires holding the appropriate lock. * * Lock reference: * c - namecache mutex * i - interlock * l - mp mnt_listmtx or freelist mutex * I - updated with atomics, 0->1 and 1->0 transitions with interlock held * m - mount point interlock * p - pollinfo lock * u - Only a reference to the vnode is needed to read. * v - vnode lock * * Vnodes may be found on many lists. The general way to deal with operating * on a vnode that is on a list is: * 1) Lock the list and find the vnode. * 2) Lock interlock so that the vnode does not go away. * 3) Unlock the list to avoid lock order reversals. * 4) vget with LK_INTERLOCK and check for ENOENT, or * 5) Check for DOOMED if the vnode lock is not required. * 6) Perform your operation, then vput(). */ #if defined(_KERNEL) || defined(_KVM_VNODE) struct vnode { /* * Fields which define the identity of the vnode. These fields are * owned by the filesystem (XXX: and vgone() ?) */ enum vtype v_type:8; /* u vnode type */ short v_irflag; /* i frequently read flags */ seqc_t v_seqc; /* i modification count */ uint32_t v_nchash; /* u namecache hash */ struct vop_vector *v_op; /* u vnode operations vector */ void *v_data; /* u private data for fs */ /* * Filesystem instance stuff */ struct mount *v_mount; /* u ptr to vfs we are in */ TAILQ_ENTRY(vnode) v_nmntvnodes; /* m vnodes for mount point */ /* * Type specific fields, only one applies to any given vnode. */ union { struct mount *v_mountedhere; /* v ptr to mountpoint (VDIR) */ struct unpcb *v_unpcb; /* v unix domain net (VSOCK) */ struct cdev *v_rdev; /* v device (VCHR, VBLK) */ struct fifoinfo *v_fifoinfo; /* v fifo (VFIFO) */ }; /* * vfs_hash: (mount + inode) -> vnode hash. The hash value * itself is grouped with other int fields, to avoid padding. */ LIST_ENTRY(vnode) v_hashlist; /* * VFS_namecache stuff */ LIST_HEAD(, namecache) v_cache_src; /* c Cache entries from us */ TAILQ_HEAD(, namecache) v_cache_dst; /* c Cache entries to us */ struct namecache *v_cache_dd; /* c Cache entry for .. vnode */ /* * Locking */ struct lock v_lock; /* u (if fs don't have one) */ struct mtx v_interlock; /* lock for "i" things */ struct lock *v_vnlock; /* u pointer to vnode lock */ /* * The machinery of being a vnode */ TAILQ_ENTRY(vnode) v_vnodelist; /* l vnode lists */ TAILQ_ENTRY(vnode) v_lazylist; /* l vnode lazy list */ struct bufobj v_bufobj; /* * Buffer cache object */ /* * Hooks for various subsystems and features. */ struct vpollinfo *v_pollinfo; /* i Poll events, p for *v_pi */ struct label *v_label; /* MAC label for vnode */ struct lockf *v_lockf; /* Byte-level advisory lock list */ struct rangelock v_rl; /* Byte-range lock */ /* * clustering stuff */ daddr_t v_cstart; /* v start block of cluster */ daddr_t v_lasta; /* v last allocation */ daddr_t v_lastw; /* v last write */ int v_clen; /* v length of cur. cluster */ u_int v_holdcnt; /* I prevents recycling. */ u_int v_usecount; /* I ref count of users */ u_short v_iflag; /* i vnode flags (see below) */ u_short v_vflag; /* v vnode flags */ u_short v_mflag; /* l mnt-specific vnode flags */ short v_dbatchcpu; /* i LRU requeue deferral batch */ int v_writecount; /* I ref count of writers or (negative) text users */ int v_seqc_users; /* i modifications pending */ u_int v_hash; }; #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */ #define bo2vnode(bo) __containerof((bo), struct vnode, v_bufobj) /* XXX: These are temporary to avoid a source sweep at this time */ #define v_object v_bufobj.bo_object /* * Userland version of struct vnode, for sysctl. */ struct xvnode { size_t xv_size; /* sizeof(struct xvnode) */ void *xv_vnode; /* address of real vnode */ u_long xv_flag; /* vnode vflags */ int xv_usecount; /* reference count of users */ int xv_writecount; /* reference count of writers */ int xv_holdcnt; /* page & buffer references */ u_long xv_id; /* capability identifier */ void *xv_mount; /* address of parent mount */ long xv_numoutput; /* num of writes in progress */ enum vtype xv_type; /* vnode type */ union { void *xvu_socket; /* unpcb, if VSOCK */ void *xvu_fifo; /* fifo, if VFIFO */ dev_t xvu_rdev; /* maj/min, if VBLK/VCHR */ struct { dev_t xvu_dev; /* device, if VDIR/VREG/VLNK */ ino_t xvu_ino; /* id, if VDIR/VREG/VLNK */ } xv_uns; } xv_un; }; #define xv_socket xv_un.xvu_socket #define xv_fifo xv_un.xvu_fifo #define xv_rdev xv_un.xvu_rdev #define xv_dev xv_un.xv_uns.xvu_dev #define xv_ino xv_un.xv_uns.xvu_ino /* We don't need to lock the knlist */ #define VN_KNLIST_EMPTY(vp) ((vp)->v_pollinfo == NULL || \ KNLIST_EMPTY(&(vp)->v_pollinfo->vpi_selinfo.si_note)) #define VN_KNOTE(vp, b, a) \ do { \ if (!VN_KNLIST_EMPTY(vp)) \ KNOTE(&vp->v_pollinfo->vpi_selinfo.si_note, (b), \ (a) | KNF_NOKQLOCK); \ } while (0) #define VN_KNOTE_LOCKED(vp, b) VN_KNOTE(vp, b, KNF_LISTLOCKED) #define VN_KNOTE_UNLOCKED(vp, b) VN_KNOTE(vp, b, 0) /* * Vnode flags. * VI flags are protected by interlock and live in v_iflag * VV flags are protected by the vnode lock and live in v_vflag * * VIRF_DOOMED is doubly protected by the interlock and vnode lock. Both * are required for writing but the status may be checked with either. */ #define VHOLD_NO_SMR (1<<29) /* Disable vhold_smr */ #define VHOLD_ALL_FLAGS (VHOLD_NO_SMR) #define VIRF_DOOMED 0x0001 /* This vnode is being recycled */ #define VI_TEXT_REF 0x0001 /* Text ref grabbed use ref */ #define VI_MOUNT 0x0002 /* Mount in progress */ #define VI_DOINGINACT 0x0004 /* VOP_INACTIVE is in progress */ #define VI_OWEINACT 0x0008 /* Need to call inactive */ #define VI_DEFINACT 0x0010 /* deferred inactive */ #define VV_ROOT 0x0001 /* root of its filesystem */ #define VV_ISTTY 0x0002 /* vnode represents a tty */ #define VV_NOSYNC 0x0004 /* unlinked, stop syncing */ #define VV_ETERNALDEV 0x0008 /* device that is never destroyed */ #define VV_CACHEDLABEL 0x0010 /* Vnode has valid cached MAC label */ #define VV_VMSIZEVNLOCK 0x0020 /* object size check requires vnode lock */ #define VV_COPYONWRITE 0x0040 /* vnode is doing copy-on-write */ #define VV_SYSTEM 0x0080 /* vnode being used by kernel */ #define VV_PROCDEP 0x0100 /* vnode is process dependent */ #define VV_NOKNOTE 0x0200 /* don't activate knotes on this vnode */ #define VV_DELETED 0x0400 /* should be removed */ #define VV_MD 0x0800 /* vnode backs the md device */ #define VV_FORCEINSMQ 0x1000 /* force the insmntque to succeed */ #define VV_READLINK 0x2000 /* fdescfs linux vnode */ #define VMP_LAZYLIST 0x0001 /* Vnode is on mnt's lazy list */ /* * Vnode attributes. A field value of VNOVAL represents a field whose value * is unavailable (getattr) or which is not to be changed (setattr). */ struct vattr { enum vtype va_type; /* vnode type (for create) */ u_short va_mode; /* files access mode and type */ u_short va_padding0; uid_t va_uid; /* owner user id */ gid_t va_gid; /* owner group id */ nlink_t va_nlink; /* number of references to file */ dev_t va_fsid; /* filesystem id */ ino_t va_fileid; /* file id */ u_quad_t va_size; /* file size in bytes */ long va_blocksize; /* blocksize preferred for i/o */ struct timespec va_atime; /* time of last access */ struct timespec va_mtime; /* time of last modification */ struct timespec va_ctime; /* time file changed */ struct timespec va_birthtime; /* time file created */ u_long va_gen; /* generation number of file */ u_long va_flags; /* flags defined for file */ dev_t va_rdev; /* device the special file represents */ u_quad_t va_bytes; /* bytes of disk space held by file */ u_quad_t va_filerev; /* file modification number */ u_int va_vaflags; /* operations flags, see below */ long va_spare; /* remain quad aligned */ }; /* * Flags for va_vaflags. */ #define VA_UTIMES_NULL 0x01 /* utimes argument was NULL */ #define VA_EXCLUSIVE 0x02 /* exclusive create request */ #define VA_SYNC 0x04 /* O_SYNC truncation */ /* * Flags for ioflag. (high 16 bits used to ask for read-ahead and * help with write clustering) * NB: IO_NDELAY and IO_DIRECT are linked to fcntl.h */ #define IO_UNIT 0x0001 /* do I/O as atomic unit */ #define IO_APPEND 0x0002 /* append write to end */ #define IO_NDELAY 0x0004 /* FNDELAY flag set in file table */ #define IO_NODELOCKED 0x0008 /* underlying node already locked */ #define IO_ASYNC 0x0010 /* bawrite rather then bdwrite */ #define IO_VMIO 0x0020 /* data already in VMIO space */ #define IO_INVAL 0x0040 /* invalidate after I/O */ #define IO_SYNC 0x0080 /* do I/O synchronously */ #define IO_DIRECT 0x0100 /* attempt to bypass buffer cache */ #define IO_NOREUSE 0x0200 /* VMIO data won't be reused */ #define IO_EXT 0x0400 /* operate on external attributes */ #define IO_NORMAL 0x0800 /* operate on regular data */ #define IO_NOMACCHECK 0x1000 /* MAC checks unnecessary */ #define IO_BUFLOCKED 0x2000 /* ffs flag; indir buf is locked */ #define IO_RANGELOCKED 0x4000 /* range locked */ #define IO_SEQMAX 0x7F /* seq heuristic max value */ #define IO_SEQSHIFT 16 /* seq heuristic in upper 16 bits */ /* * Flags for accmode_t. */ #define VEXEC 000000000100 /* execute/search permission */ #define VWRITE 000000000200 /* write permission */ #define VREAD 000000000400 /* read permission */ #define VADMIN 000000010000 /* being the file owner */ #define VAPPEND 000000040000 /* permission to write/append */ /* * VEXPLICIT_DENY makes VOP_ACCESSX(9) return EPERM or EACCES only * if permission was denied explicitly, by a "deny" rule in NFSv4 ACL, * and 0 otherwise. This never happens with ordinary unix access rights * or POSIX.1e ACLs. Obviously, VEXPLICIT_DENY must be OR-ed with * some other V* constant. */ #define VEXPLICIT_DENY 000000100000 #define VREAD_NAMED_ATTRS 000000200000 /* not used */ #define VWRITE_NAMED_ATTRS 000000400000 /* not used */ #define VDELETE_CHILD 000001000000 #define VREAD_ATTRIBUTES 000002000000 /* permission to stat(2) */ #define VWRITE_ATTRIBUTES 000004000000 /* change {m,c,a}time */ #define VDELETE 000010000000 #define VREAD_ACL 000020000000 /* read ACL and file mode */ #define VWRITE_ACL 000040000000 /* change ACL and/or file mode */ #define VWRITE_OWNER 000100000000 /* change file owner */ #define VSYNCHRONIZE 000200000000 /* not used */ #define VCREAT 000400000000 /* creating new file */ #define VVERIFY 001000000000 /* verification required */ /* * Permissions that were traditionally granted only to the file owner. */ #define VADMIN_PERMS (VADMIN | VWRITE_ATTRIBUTES | VWRITE_ACL | \ VWRITE_OWNER) /* * Permissions that were traditionally granted to everyone. */ #define VSTAT_PERMS (VREAD_ATTRIBUTES | VREAD_ACL) /* * Permissions that allow to change the state of the file in any way. */ #define VMODIFY_PERMS (VWRITE | VAPPEND | VADMIN_PERMS | VDELETE_CHILD | \ VDELETE) /* * Token indicating no attribute value yet assigned. */ #define VNOVAL (-1) /* * LK_TIMELOCK timeout for vnode locks (used mainly by the pageout daemon) */ #define VLKTIMEOUT (hz / 20 + 1) #ifdef _KERNEL #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_VNODE); #endif extern u_int ncsizefactor; /* * Convert between vnode types and inode formats (since POSIX.1 * defines mode word of stat structure in terms of inode formats). */ extern enum vtype iftovt_tab[]; extern int vttoif_tab[]; #define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12]) #define VTTOIF(indx) (vttoif_tab[(int)(indx)]) #define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode)) /* * Flags to various vnode functions. */ #define SKIPSYSTEM 0x0001 /* vflush: skip vnodes marked VSYSTEM */ #define FORCECLOSE 0x0002 /* vflush: force file closure */ #define WRITECLOSE 0x0004 /* vflush: only close writable files */ #define EARLYFLUSH 0x0008 /* vflush: early call for ffs_flushfiles */ #define V_SAVE 0x0001 /* vinvalbuf: sync file first */ #define V_ALT 0x0002 /* vinvalbuf: invalidate only alternate bufs */ #define V_NORMAL 0x0004 /* vinvalbuf: invalidate only regular bufs */ #define V_CLEANONLY 0x0008 /* vinvalbuf: invalidate only clean bufs */ #define V_VMIO 0x0010 /* vinvalbuf: called during pageout */ #define V_ALLOWCLEAN 0x0020 /* vinvalbuf: allow clean buffers after flush */ #define REVOKEALL 0x0001 /* vop_revoke: revoke all aliases */ #define V_WAIT 0x0001 /* vn_start_write: sleep for suspend */ #define V_NOWAIT 0x0002 /* vn_start_write: don't sleep for suspend */ #define V_XSLEEP 0x0004 /* vn_start_write: just return after sleep */ #define V_MNTREF 0x0010 /* vn_start_write: mp is already ref-ed */ #define VR_START_WRITE 0x0001 /* vfs_write_resume: start write atomically */ #define VR_NO_SUSPCLR 0x0002 /* vfs_write_resume: do not clear suspension */ #define VS_SKIP_UNMOUNT 0x0001 /* vfs_write_suspend: fail if the filesystem is being unmounted */ #define VREF(vp) vref(vp) #ifdef DIAGNOSTIC #define VATTR_NULL(vap) vattr_null(vap) #else #define VATTR_NULL(vap) (*(vap) = va_null) /* initialize a vattr */ #endif /* DIAGNOSTIC */ #define NULLVP ((struct vnode *)NULL) /* * Global vnode data. */ extern struct vnode *rootvnode; /* root (i.e. "/") vnode */ extern struct mount *rootdevmp; /* "/dev" mount */ extern u_long desiredvnodes; /* number of vnodes desired */ extern struct uma_zone *namei_zone; extern struct vattr va_null; /* predefined null vattr structure */ #define VI_LOCK(vp) mtx_lock(&(vp)->v_interlock) #define VI_LOCK_FLAGS(vp, flags) mtx_lock_flags(&(vp)->v_interlock, (flags)) #define VI_TRYLOCK(vp) mtx_trylock(&(vp)->v_interlock) #define VI_UNLOCK(vp) mtx_unlock(&(vp)->v_interlock) #define VI_MTX(vp) (&(vp)->v_interlock) #define VN_LOCK_AREC(vp) lockallowrecurse((vp)->v_vnlock) #define VN_LOCK_ASHARE(vp) lockallowshare((vp)->v_vnlock) #define VN_LOCK_DSHARE(vp) lockdisableshare((vp)->v_vnlock) #endif /* _KERNEL */ /* * Mods for extensibility. */ /* * Flags for vdesc_flags: */ #define VDESC_MAX_VPS 16 /* Low order 16 flag bits are reserved for willrele flags for vp arguments. */ #define VDESC_VP0_WILLRELE 0x0001 #define VDESC_VP1_WILLRELE 0x0002 #define VDESC_VP2_WILLRELE 0x0004 #define VDESC_VP3_WILLRELE 0x0008 /* * A generic structure. * This can be used by bypass routines to identify generic arguments. */ struct vop_generic_args { struct vnodeop_desc *a_desc; /* other random data follows, presumably */ }; typedef int vop_bypass_t(struct vop_generic_args *); /* * VDESC_NO_OFFSET is used to identify the end of the offset list * and in places where no such field exists. */ #define VDESC_NO_OFFSET -1 /* * This structure describes the vnode operation taking place. */ struct vnodeop_desc { char *vdesc_name; /* a readable name for debugging */ int vdesc_flags; /* VDESC_* flags */ int vdesc_vop_offset; vop_bypass_t *vdesc_call; /* Function to call */ /* * These ops are used by bypass routines to map and locate arguments. * Creds and procs are not needed in bypass routines, but sometimes * they are useful to (for example) transport layers. * Nameidata is useful because it has a cred in it. */ int *vdesc_vp_offsets; /* list ended by VDESC_NO_OFFSET */ int vdesc_vpp_offset; /* return vpp location */ int vdesc_cred_offset; /* cred location, if any */ int vdesc_thread_offset; /* thread location, if any */ int vdesc_componentname_offset; /* if any */ }; #ifdef _KERNEL /* * A list of all the operation descs. */ extern struct vnodeop_desc *vnodeop_descs[]; #define VOPARG_OFFSETOF(s_type, field) __offsetof(s_type, field) #define VOPARG_OFFSETTO(s_type, s_offset, struct_p) \ ((s_type)(((char*)(struct_p)) + (s_offset))) #ifdef DEBUG_VFS_LOCKS /* * Support code to aid in debugging VFS locking problems. Not totally * reliable since if the thread sleeps between changing the lock * state and checking it with the assert, some other thread could * change the state. They are good enough for debugging a single * filesystem using a single-threaded test. Note that the unreliability is * limited to false negatives; efforts were made to ensure that false * positives cannot occur. */ void assert_vi_locked(struct vnode *vp, const char *str); void assert_vi_unlocked(struct vnode *vp, const char *str); void assert_vop_elocked(struct vnode *vp, const char *str); void assert_vop_locked(struct vnode *vp, const char *str); void assert_vop_unlocked(struct vnode *vp, const char *str); #define ASSERT_VI_LOCKED(vp, str) assert_vi_locked((vp), (str)) #define ASSERT_VI_UNLOCKED(vp, str) assert_vi_unlocked((vp), (str)) #define ASSERT_VOP_ELOCKED(vp, str) assert_vop_elocked((vp), (str)) #define ASSERT_VOP_LOCKED(vp, str) assert_vop_locked((vp), (str)) #define ASSERT_VOP_UNLOCKED(vp, str) assert_vop_unlocked((vp), (str)) #define ASSERT_VOP_IN_SEQC(vp) do { \ struct vnode *_vp = (vp); \ \ VNPASS(seqc_in_modify(_vp->v_seqc), _vp); \ } while (0) #define ASSERT_VOP_NOT_IN_SEQC(vp) do { \ struct vnode *_vp = (vp); \ \ VNPASS(!seqc_in_modify(_vp->v_seqc), _vp); \ } while (0) #else /* !DEBUG_VFS_LOCKS */ #define ASSERT_VI_LOCKED(vp, str) ((void)0) #define ASSERT_VI_UNLOCKED(vp, str) ((void)0) #define ASSERT_VOP_ELOCKED(vp, str) ((void)0) #define ASSERT_VOP_LOCKED(vp, str) ((void)0) #define ASSERT_VOP_UNLOCKED(vp, str) ((void)0) #define ASSERT_VOP_IN_SEQC(vp) ((void)0) #define ASSERT_VOP_NOT_IN_SEQC(vp) ((void)0) #endif /* DEBUG_VFS_LOCKS */ /* * This call works for vnodes in the kernel. */ #define VCALL(c) ((c)->a_desc->vdesc_call(c)) #define DOINGASYNC(vp) \ (((vp)->v_mount->mnt_kern_flag & MNTK_ASYNC) != 0 && \ ((curthread->td_pflags & TDP_SYNCIO) == 0)) /* * VMIO support inline */ extern int vmiodirenable; static __inline int vn_canvmio(struct vnode *vp) { if (vp && (vp->v_type == VREG || (vmiodirenable && vp->v_type == VDIR))) return(TRUE); return(FALSE); } /* * Finally, include the default set of vnode operations. */ typedef void vop_getpages_iodone_t(void *, vm_page_t *, int, int); #include "vnode_if.h" /* vn_open_flags */ #define VN_OPEN_NOAUDIT 0x00000001 #define VN_OPEN_NOCAPCHECK 0x00000002 #define VN_OPEN_NAMECACHE 0x00000004 #define VN_OPEN_INVFS 0x00000008 /* * Public vnode manipulation functions. */ struct componentname; struct file; struct mount; struct nameidata; struct ostat; struct freebsd11_stat; struct thread; struct proc; struct stat; struct nstat; struct ucred; struct uio; struct vattr; struct vfsops; struct vnode; typedef int (*vn_get_ino_t)(struct mount *, void *, int, struct vnode **); int bnoreuselist(struct bufv *bufv, struct bufobj *bo, daddr_t startn, daddr_t endn); /* cache_* may belong in namei.h. */ void cache_changesize(u_long newhashsize); #define cache_enter(dvp, vp, cnp) \ cache_enter_time(dvp, vp, cnp, NULL, NULL) void cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp, struct timespec *tsp, struct timespec *dtsp); int cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct timespec *tsp, int *ticksp); void cache_vnode_init(struct vnode *vp); void cache_purge(struct vnode *vp); void cache_purge_vgone(struct vnode *vp); void cache_purge_negative(struct vnode *vp); void cache_purgevfs(struct mount *mp, bool force); int change_dir(struct vnode *vp, struct thread *td); void cvtstat(struct stat *st, struct ostat *ost); void freebsd11_cvtnstat(struct stat *sb, struct nstat *nsb); int freebsd11_cvtstat(struct stat *st, struct freebsd11_stat *ost); int getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops, struct vnode **vpp); void getnewvnode_reserve(void); void getnewvnode_drop_reserve(void); int insmntque1(struct vnode *vp, struct mount *mp, void (*dtr)(struct vnode *, void *), void *dtr_arg); int insmntque(struct vnode *vp, struct mount *mp); u_quad_t init_va_filerev(void); int speedup_syncer(void); int vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, size_t *buflen); int vn_getcwd(struct thread *td, char *buf, char **retbuf, size_t *buflen); int vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf); int vn_fullpath_global(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf); struct vnode * vn_dir_dd_ino(struct vnode *vp); int vn_commname(struct vnode *vn, char *buf, u_int buflen); int vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path, u_int pathlen); int vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid, accmode_t accmode, struct ucred *cred); int vaccess_vexec_smr(mode_t file_mode, uid_t file_uid, gid_t file_gid, struct ucred *cred); int vaccess_acl_nfs4(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *aclp, accmode_t accmode, struct ucred *cred); int vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *acl, accmode_t accmode, struct ucred *cred); void vattr_null(struct vattr *vap); void vlazy(struct vnode *); void vdrop(struct vnode *); void vdropl(struct vnode *); int vflush(struct mount *mp, int rootrefs, int flags, struct thread *td); -int vget(struct vnode *vp, int flags, struct thread *td); +int vget(struct vnode *vp, int flags); enum vgetstate vget_prep_smr(struct vnode *vp); enum vgetstate vget_prep(struct vnode *vp); int vget_finish(struct vnode *vp, int flags, enum vgetstate vs); void vget_finish_ref(struct vnode *vp, enum vgetstate vs); void vget_abort(struct vnode *vp, enum vgetstate vs); void vgone(struct vnode *vp); void vhold(struct vnode *); void vholdl(struct vnode *); void vholdnz(struct vnode *); bool vhold_smr(struct vnode *); void vinactive(struct vnode *vp); int vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo); int vtruncbuf(struct vnode *vp, off_t length, int blksize); void v_inval_buf_range(struct vnode *vp, daddr_t startlbn, daddr_t endlbn, int blksize); void vunref(struct vnode *); void vn_printf(struct vnode *vp, const char *fmt, ...) __printflike(2,3); int vrecycle(struct vnode *vp); int vrecyclel(struct vnode *vp); int vn_bmap_seekhole(struct vnode *vp, u_long cmd, off_t *off, struct ucred *cred); int vn_close(struct vnode *vp, int flags, struct ucred *file_cred, struct thread *td); int vn_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred, struct ucred *outcred, struct thread *fsize_td); void vn_finished_write(struct mount *mp); void vn_finished_secondary_write(struct mount *mp); int vn_fsync_buf(struct vnode *vp, int waitfor); int vn_generic_copy_file_range(struct vnode *invp, off_t *inoffp, struct vnode *outvp, off_t *outoffp, size_t *lenp, unsigned int flags, struct ucred *incred, struct ucred *outcred, struct thread *fsize_td); int vn_need_pageq_flush(struct vnode *vp); int vn_isdisk(struct vnode *vp, int *errp); int _vn_lock(struct vnode *vp, int flags, const char *file, int line); #define vn_lock(vp, flags) _vn_lock(vp, flags, __FILE__, __LINE__) int vn_open(struct nameidata *ndp, int *flagp, int cmode, struct file *fp); int vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags, struct ucred *cred, struct file *fp); int vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred, struct thread *td, struct file *fp); void vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end); int vn_pollrecord(struct vnode *vp, struct thread *p, int events); int vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, ssize_t *aresid, struct thread *td); int vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, void *base, size_t len, off_t offset, enum uio_seg segflg, int ioflg, struct ucred *active_cred, struct ucred *file_cred, size_t *aresid, struct thread *td); int vn_rlimit_fsize(const struct vnode *vn, const struct uio *uio, struct thread *td); int vn_start_write(struct vnode *vp, struct mount **mpp, int flags); int vn_start_secondary_write(struct vnode *vp, struct mount **mpp, int flags); int vn_truncate_locked(struct vnode *vp, off_t length, bool sync, struct ucred *cred); int vn_writechk(struct vnode *vp); int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct thread *td); int vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int buflen, char *buf, struct thread *td); int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct thread *td); int vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp); int vn_vget_ino_gen(struct vnode *vp, vn_get_ino_t alloc, void *alloc_arg, int lkflags, struct vnode **rvp); int vn_utimes_perm(struct vnode *vp, struct vattr *vap, struct ucred *cred, struct thread *td); int vn_io_fault_uiomove(char *data, int xfersize, struct uio *uio); int vn_io_fault_pgmove(vm_page_t ma[], vm_offset_t offset, int xfersize, struct uio *uio); void vn_seqc_write_begin_unheld_locked(struct vnode *vp); void vn_seqc_write_begin_unheld(struct vnode *vp); void vn_seqc_write_begin_locked(struct vnode *vp); void vn_seqc_write_begin(struct vnode *vp); void vn_seqc_write_end_locked(struct vnode *vp); void vn_seqc_write_end(struct vnode *vp); #define vn_seqc_read_any(vp) seqc_read_any(&(vp)->v_seqc) #define vn_seqc_consistent(vp, seq) seqc_consistent(&(vp)->v_seqc, seq) #define vn_rangelock_unlock(vp, cookie) \ rangelock_unlock(&(vp)->v_rl, (cookie), VI_MTX(vp)) #define vn_rangelock_unlock_range(vp, cookie, start, end) \ rangelock_unlock_range(&(vp)->v_rl, (cookie), (start), (end), \ VI_MTX(vp)) #define vn_rangelock_rlock(vp, start, end) \ rangelock_rlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) #define vn_rangelock_tryrlock(vp, start, end) \ rangelock_tryrlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) #define vn_rangelock_wlock(vp, start, end) \ rangelock_wlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) #define vn_rangelock_trywlock(vp, start, end) \ rangelock_trywlock(&(vp)->v_rl, (start), (end), VI_MTX(vp)) int vfs_cache_lookup(struct vop_lookup_args *ap); int vfs_cache_root(struct mount *mp, int flags, struct vnode **vpp); void vfs_timestamp(struct timespec *); void vfs_write_resume(struct mount *mp, int flags); int vfs_write_suspend(struct mount *mp, int flags); int vfs_write_suspend_umnt(struct mount *mp); void vnlru_free(int, struct vfsops *); int vop_stdbmap(struct vop_bmap_args *); int vop_stdfdatasync_buf(struct vop_fdatasync_args *); int vop_stdfsync(struct vop_fsync_args *); int vop_stdgetwritemount(struct vop_getwritemount_args *); int vop_stdgetpages(struct vop_getpages_args *); int vop_stdinactive(struct vop_inactive_args *); int vop_stdioctl(struct vop_ioctl_args *); int vop_stdneed_inactive(struct vop_need_inactive_args *); int vop_stdkqfilter(struct vop_kqfilter_args *); int vop_stdlock(struct vop_lock1_args *); int vop_stdunlock(struct vop_unlock_args *); int vop_stdislocked(struct vop_islocked_args *); int vop_lock(struct vop_lock1_args *); int vop_unlock(struct vop_unlock_args *); int vop_islocked(struct vop_islocked_args *); int vop_stdputpages(struct vop_putpages_args *); int vop_nopoll(struct vop_poll_args *); int vop_stdaccess(struct vop_access_args *ap); int vop_stdaccessx(struct vop_accessx_args *ap); int vop_stdadvise(struct vop_advise_args *ap); int vop_stdadvlock(struct vop_advlock_args *ap); int vop_stdadvlockasync(struct vop_advlockasync_args *ap); int vop_stdadvlockpurge(struct vop_advlockpurge_args *ap); int vop_stdallocate(struct vop_allocate_args *ap); int vop_stdset_text(struct vop_set_text_args *ap); int vop_stdpathconf(struct vop_pathconf_args *); int vop_stdpoll(struct vop_poll_args *); int vop_stdvptocnp(struct vop_vptocnp_args *ap); int vop_stdvptofh(struct vop_vptofh_args *ap); int vop_stdunp_bind(struct vop_unp_bind_args *ap); int vop_stdunp_connect(struct vop_unp_connect_args *ap); int vop_stdunp_detach(struct vop_unp_detach_args *ap); int vop_eopnotsupp(struct vop_generic_args *ap); int vop_ebadf(struct vop_generic_args *ap); int vop_einval(struct vop_generic_args *ap); int vop_enoent(struct vop_generic_args *ap); int vop_enotty(struct vop_generic_args *ap); int vop_null(struct vop_generic_args *ap); int vop_panic(struct vop_generic_args *ap); int dead_poll(struct vop_poll_args *ap); int dead_read(struct vop_read_args *ap); int dead_write(struct vop_write_args *ap); /* These are called from within the actual VOPS. */ void vop_close_post(void *a, int rc); void vop_create_pre(void *a); void vop_create_post(void *a, int rc); void vop_whiteout_pre(void *a); void vop_whiteout_post(void *a, int rc); void vop_deleteextattr_pre(void *a); void vop_deleteextattr_post(void *a, int rc); void vop_link_pre(void *a); void vop_link_post(void *a, int rc); void vop_lookup_post(void *a, int rc); void vop_lookup_pre(void *a); void vop_mkdir_pre(void *a); void vop_mkdir_post(void *a, int rc); void vop_mknod_pre(void *a); void vop_mknod_post(void *a, int rc); void vop_open_post(void *a, int rc); void vop_read_post(void *a, int rc); void vop_readdir_post(void *a, int rc); void vop_reclaim_post(void *a, int rc); void vop_remove_pre(void *a); void vop_remove_post(void *a, int rc); void vop_rename_post(void *a, int rc); void vop_rename_pre(void *a); void vop_rmdir_pre(void *a); void vop_rmdir_post(void *a, int rc); void vop_setattr_pre(void *a); void vop_setattr_post(void *a, int rc); void vop_setacl_pre(void *a); void vop_setacl_post(void *a, int rc); void vop_setextattr_pre(void *a); void vop_setextattr_post(void *a, int rc); void vop_symlink_pre(void *a); void vop_symlink_post(void *a, int rc); int vop_sigdefer(struct vop_vector *vop, struct vop_generic_args *a); #ifdef DEBUG_VFS_LOCKS void vop_fplookup_vexec_debugpre(void *a); void vop_fplookup_vexec_debugpost(void *a, int rc); void vop_strategy_debugpre(void *a); void vop_lock_debugpre(void *a); void vop_lock_debugpost(void *a, int rc); void vop_unlock_debugpre(void *a); void vop_need_inactive_debugpre(void *a); void vop_need_inactive_debugpost(void *a, int rc); #else #define vop_fplookup_vexec_debugpre(x) do { } while (0) #define vop_fplookup_vexec_debugpost(x, y) do { } while (0) #define vop_strategy_debugpre(x) do { } while (0) #define vop_lock_debugpre(x) do { } while (0) #define vop_lock_debugpost(x, y) do { } while (0) #define vop_unlock_debugpre(x) do { } while (0) #define vop_need_inactive_debugpre(x) do { } while (0) #define vop_need_inactive_debugpost(x, y) do { } while (0) #endif void vop_rename_fail(struct vop_rename_args *ap); #define vop_stat_helper_pre(ap) ({ \ int _error; \ AUDIT_ARG_VNODE1(ap->a_vp); \ _error = mac_vnode_check_stat(ap->a_active_cred, ap->a_file_cred, ap->a_vp);\ if (__predict_true(_error == 0)) \ bzero(ap->a_sb, sizeof(*ap->a_sb)); \ _error; \ }) #define vop_stat_helper_post(ap, error) ({ \ int _error = (error); \ if (priv_check_cred_vfs_generation(ap->a_td->td_ucred)) \ ap->a_sb->st_gen = 0; \ _error; \ }) #define VOP_WRITE_PRE(ap) \ struct vattr va; \ int error; \ off_t osize, ooffset, noffset; \ \ osize = ooffset = noffset = 0; \ if (!VN_KNLIST_EMPTY((ap)->a_vp)) { \ error = VOP_GETATTR((ap)->a_vp, &va, (ap)->a_cred); \ if (error) \ return (error); \ ooffset = (ap)->a_uio->uio_offset; \ osize = (off_t)va.va_size; \ } #define VOP_WRITE_POST(ap, ret) \ noffset = (ap)->a_uio->uio_offset; \ if (noffset > ooffset && !VN_KNLIST_EMPTY((ap)->a_vp)) { \ VFS_KNOTE_LOCKED((ap)->a_vp, NOTE_WRITE \ | (noffset > osize ? NOTE_EXTEND : 0)); \ } #define VOP_LOCK(vp, flags) VOP_LOCK1(vp, flags, __FILE__, __LINE__) #ifdef INVARIANTS #define VOP_ADD_WRITECOUNT_CHECKED(vp, cnt) \ do { \ int error_; \ \ error_ = VOP_ADD_WRITECOUNT((vp), (cnt)); \ VNASSERT(error_ == 0, (vp), ("VOP_ADD_WRITECOUNT returned %d", \ error_)); \ } while (0) #define VOP_SET_TEXT_CHECKED(vp) \ do { \ int error_; \ \ error_ = VOP_SET_TEXT((vp)); \ VNASSERT(error_ == 0, (vp), ("VOP_SET_TEXT returned %d", \ error_)); \ } while (0) #define VOP_UNSET_TEXT_CHECKED(vp) \ do { \ int error_; \ \ error_ = VOP_UNSET_TEXT((vp)); \ VNASSERT(error_ == 0, (vp), ("VOP_UNSET_TEXT returned %d", \ error_)); \ } while (0) #else #define VOP_ADD_WRITECOUNT_CHECKED(vp, cnt) VOP_ADD_WRITECOUNT((vp), (cnt)) #define VOP_SET_TEXT_CHECKED(vp) VOP_SET_TEXT((vp)) #define VOP_UNSET_TEXT_CHECKED(vp) VOP_UNSET_TEXT((vp)) #endif #define VN_IS_DOOMED(vp) __predict_false((vp)->v_irflag & VIRF_DOOMED) void vput(struct vnode *vp); void vrele(struct vnode *vp); void vref(struct vnode *vp); void vrefl(struct vnode *vp); void vrefact(struct vnode *vp); void v_addpollinfo(struct vnode *vp); static __inline int vrefcnt(struct vnode *vp) { return (vp->v_usecount); } int vnode_create_vobject(struct vnode *vp, off_t size, struct thread *td); void vnode_destroy_vobject(struct vnode *vp); extern struct vop_vector fifo_specops; extern struct vop_vector dead_vnodeops; extern struct vop_vector default_vnodeops; #define VOP_PANIC ((void*)(uintptr_t)vop_panic) #define VOP_NULL ((void*)(uintptr_t)vop_null) #define VOP_EBADF ((void*)(uintptr_t)vop_ebadf) #define VOP_ENOTTY ((void*)(uintptr_t)vop_enotty) #define VOP_EINVAL ((void*)(uintptr_t)vop_einval) #define VOP_ENOENT ((void*)(uintptr_t)vop_enoent) #define VOP_EOPNOTSUPP ((void*)(uintptr_t)vop_eopnotsupp) /* fifo_vnops.c */ int fifo_printinfo(struct vnode *); /* vfs_hash.c */ typedef int vfs_hash_cmp_t(struct vnode *vp, void *arg); void vfs_hash_changesize(u_long newhashsize); int vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); u_int vfs_hash_index(struct vnode *vp); int vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); void vfs_hash_ref(const struct mount *mp, u_int hash, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg); void vfs_hash_rehash(struct vnode *vp, u_int hash); void vfs_hash_remove(struct vnode *vp); int vfs_kqfilter(struct vop_kqfilter_args *); struct dirent; int vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off); int vfs_emptydir(struct vnode *vp); int vfs_unixify_accmode(accmode_t *accmode); void vfs_unp_reclaim(struct vnode *vp); int setfmode(struct thread *td, struct ucred *cred, struct vnode *vp, int mode); int setfown(struct thread *td, struct ucred *cred, struct vnode *vp, uid_t uid, gid_t gid); int vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td); int vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td); void vn_fsid(struct vnode *vp, struct vattr *va); int vn_dir_check_exec(struct vnode *vp, struct componentname *cnp); #define VOP_UNLOCK_FLAGS(vp, flags) ({ \ struct vnode *_vp = (vp); \ int _flags = (flags); \ int _error; \ \ if ((_flags & ~(LK_INTERLOCK | LK_RELEASE)) != 0) \ panic("%s: unsupported flags %x\n", __func__, flags); \ _error = VOP_UNLOCK(_vp); \ if (_flags & LK_INTERLOCK) \ VI_UNLOCK(_vp); \ _error; \ }) #include #define VFS_VOP_VECTOR_REGISTER(vnodeops) \ SYSINIT(vfs_vector_##vnodeops##_f, SI_SUB_VFS, SI_ORDER_ANY, \ vfs_vector_op_register, &vnodeops) #define VFS_SMR_DECLARE \ extern smr_t vfs_smr #define VFS_SMR() vfs_smr #define vfs_smr_enter() smr_enter(VFS_SMR()) #define vfs_smr_exit() smr_exit(VFS_SMR()) #define vfs_smr_entered_load(ptr) smr_entered_load((ptr), VFS_SMR()) #define VFS_SMR_ASSERT_ENTERED() SMR_ASSERT_ENTERED(VFS_SMR()) #define VFS_SMR_ASSERT_NOT_ENTERED() SMR_ASSERT_NOT_ENTERED(VFS_SMR()) #define VFS_SMR_ZONE_SET(zone) uma_zone_set_smr((zone), VFS_SMR()) #define vn_load_v_data_smr(vp) ({ \ struct vnode *_vp = (vp); \ \ VFS_SMR_ASSERT_ENTERED(); \ atomic_load_ptr(&(_vp)->v_data); \ }) #endif /* _KERNEL */ #endif /* !_SYS_VNODE_H_ */ Index: projects/clang1100-import/sys/ufs/ffs/ffs_alloc.c =================================================================== --- projects/clang1100-import/sys/ufs/ffs/ffs_alloc.c (revision 364278) +++ projects/clang1100-import/sys/ufs/ffs/ffs_alloc.c (revision 364279) @@ -1,3519 +1,3519 @@ /*- * SPDX-License-Identifier: (BSD-2-Clause-FreeBSD AND BSD-3-Clause) * * Copyright (c) 2002 Networks Associates Technology, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project by Marshall * Kirk McKusick and Network Associates Laboratories, the Security * Research Division of Network Associates, Inc. under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS * research program * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_alloc.c 8.18 (Berkeley) 5/26/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef ufs2_daddr_t allocfcn_t(struct inode *ip, u_int cg, ufs2_daddr_t bpref, int size, int rsize); static ufs2_daddr_t ffs_alloccg(struct inode *, u_int, ufs2_daddr_t, int, int); static ufs2_daddr_t ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int); static void ffs_blkfree_cg(struct ufsmount *, struct fs *, struct vnode *, ufs2_daddr_t, long, ino_t, struct workhead *); #ifdef INVARIANTS static int ffs_checkblk(struct inode *, ufs2_daddr_t, long); #endif static ufs2_daddr_t ffs_clusteralloc(struct inode *, u_int, ufs2_daddr_t, int); static ino_t ffs_dirpref(struct inode *); static ufs2_daddr_t ffs_fragextend(struct inode *, u_int, ufs2_daddr_t, int, int); static ufs2_daddr_t ffs_hashalloc (struct inode *, u_int, ufs2_daddr_t, int, int, allocfcn_t *); static ufs2_daddr_t ffs_nodealloccg(struct inode *, u_int, ufs2_daddr_t, int, int); static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int); static int ffs_reallocblks_ufs1(struct vop_reallocblks_args *); static int ffs_reallocblks_ufs2(struct vop_reallocblks_args *); static void ffs_ckhash_cg(struct buf *); /* * Allocate a block in the filesystem. * * The size of the requested block is given, which must be some * multiple of fs_fsize and <= fs_bsize. * A preference may be optionally specified. If a preference is given * the following hierarchy is used to allocate a block: * 1) allocate the requested block. * 2) allocate a rotationally optimal block in the same cylinder. * 3) allocate a block in the same cylinder group. * 4) quadradically rehash into other cylinder groups, until an * available block is located. * If no block preference is given the following hierarchy is used * to allocate a block: * 1) allocate a block in the cylinder group that contains the * inode for the file. * 2) quadradically rehash into other cylinder groups, until an * available block is located. */ int ffs_alloc(ip, lbn, bpref, size, flags, cred, bnp) struct inode *ip; ufs2_daddr_t lbn, bpref; int size, flags; struct ucred *cred; ufs2_daddr_t *bnp; { struct fs *fs; struct ufsmount *ump; ufs2_daddr_t bno; u_int cg, reclaimed; int64_t delta; #ifdef QUOTA int error; #endif *bnp = 0; ump = ITOUMP(ip); fs = ump->um_fs; mtx_assert(UFS_MTX(ump), MA_OWNED); #ifdef INVARIANTS if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { printf("dev = %s, bsize = %ld, size = %d, fs = %s\n", devtoname(ump->um_dev), (long)fs->fs_bsize, size, fs->fs_fsmnt); panic("ffs_alloc: bad size"); } if (cred == NOCRED) panic("ffs_alloc: missing credential"); #endif /* INVARIANTS */ reclaimed = 0; retry: #ifdef QUOTA UFS_UNLOCK(ump); error = chkdq(ip, btodb(size), cred, 0); if (error) return (error); UFS_LOCK(ump); #endif if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) goto nospace; if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE) && freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0) goto nospace; if (bpref >= fs->fs_size) bpref = 0; if (bpref == 0) cg = ino_to_cg(fs, ip->i_number); else cg = dtog(fs, bpref); bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg); if (bno > 0) { delta = btodb(size); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); if (flags & IO_EXT) UFS_INODE_SET_FLAG(ip, IN_CHANGE); else UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); *bnp = bno; return (0); } nospace: #ifdef QUOTA UFS_UNLOCK(ump); /* * Restore user's disk quota because allocation failed. */ (void) chkdq(ip, -btodb(size), cred, FORCE); UFS_LOCK(ump); #endif if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) { reclaimed = 1; softdep_request_cleanup(fs, ITOV(ip), cred, FLUSH_BLOCKS_WAIT); goto retry; } if (ffs_fsfail_cleanup_locked(ump, 0)) { UFS_UNLOCK(ump); return (ENXIO); } if (reclaimed > 0 && ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) { UFS_UNLOCK(ump); ffs_fserr(fs, ip->i_number, "filesystem full"); uprintf("\n%s: write failed, filesystem is full\n", fs->fs_fsmnt); } else { UFS_UNLOCK(ump); } return (ENOSPC); } /* * Reallocate a fragment to a bigger size * * The number and size of the old block is given, and a preference * and new size is also specified. The allocator attempts to extend * the original block. Failing that, the regular block allocator is * invoked to get an appropriate block. */ int ffs_realloccg(ip, lbprev, bprev, bpref, osize, nsize, flags, cred, bpp) struct inode *ip; ufs2_daddr_t lbprev; ufs2_daddr_t bprev; ufs2_daddr_t bpref; int osize, nsize, flags; struct ucred *cred; struct buf **bpp; { struct vnode *vp; struct fs *fs; struct buf *bp; struct ufsmount *ump; u_int cg, request, reclaimed; int error, gbflags; ufs2_daddr_t bno; int64_t delta; vp = ITOV(ip); ump = ITOUMP(ip); fs = ump->um_fs; bp = NULL; gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0; mtx_assert(UFS_MTX(ump), MA_OWNED); #ifdef INVARIANTS if (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) panic("ffs_realloccg: allocation on suspended filesystem"); if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { printf( "dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n", devtoname(ump->um_dev), (long)fs->fs_bsize, osize, nsize, fs->fs_fsmnt); panic("ffs_realloccg: bad size"); } if (cred == NOCRED) panic("ffs_realloccg: missing credential"); #endif /* INVARIANTS */ reclaimed = 0; retry: if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE) && freespace(fs, fs->fs_minfree) - numfrags(fs, nsize - osize) < 0) { goto nospace; } if (bprev == 0) { printf("dev = %s, bsize = %ld, bprev = %jd, fs = %s\n", devtoname(ump->um_dev), (long)fs->fs_bsize, (intmax_t)bprev, fs->fs_fsmnt); panic("ffs_realloccg: bad bprev"); } UFS_UNLOCK(ump); /* * Allocate the extra space in the buffer. */ error = bread_gb(vp, lbprev, osize, NOCRED, gbflags, &bp); if (error) { return (error); } if (bp->b_blkno == bp->b_lblkno) { if (lbprev >= UFS_NDADDR) panic("ffs_realloccg: lbprev out of range"); bp->b_blkno = fsbtodb(fs, bprev); } #ifdef QUOTA error = chkdq(ip, btodb(nsize - osize), cred, 0); if (error) { brelse(bp); return (error); } #endif /* * Check for extension in the existing location. */ *bpp = NULL; cg = dtog(fs, bprev); UFS_LOCK(ump); bno = ffs_fragextend(ip, cg, bprev, osize, nsize); if (bno) { if (bp->b_blkno != fsbtodb(fs, bno)) panic("ffs_realloccg: bad blockno"); delta = btodb(nsize - osize); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); if (flags & IO_EXT) UFS_INODE_SET_FLAG(ip, IN_CHANGE); else UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); allocbuf(bp, nsize); bp->b_flags |= B_DONE; vfs_bio_bzero_buf(bp, osize, nsize - osize); if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) vfs_bio_set_valid(bp, osize, nsize - osize); *bpp = bp; return (0); } /* * Allocate a new disk location. */ if (bpref >= fs->fs_size) bpref = 0; switch ((int)fs->fs_optim) { case FS_OPTSPACE: /* * Allocate an exact sized fragment. Although this makes * best use of space, we will waste time relocating it if * the file continues to grow. If the fragmentation is * less than half of the minimum free reserve, we choose * to begin optimizing for time. */ request = nsize; if (fs->fs_minfree <= 5 || fs->fs_cstotal.cs_nffree > (off_t)fs->fs_dsize * fs->fs_minfree / (2 * 100)) break; log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n", fs->fs_fsmnt); fs->fs_optim = FS_OPTTIME; break; case FS_OPTTIME: /* * At this point we have discovered a file that is trying to * grow a small fragment to a larger fragment. To save time, * we allocate a full sized block, then free the unused portion. * If the file continues to grow, the `ffs_fragextend' call * above will be able to grow it in place without further * copying. If aberrant programs cause disk fragmentation to * grow within 2% of the free reserve, we choose to begin * optimizing for space. */ request = fs->fs_bsize; if (fs->fs_cstotal.cs_nffree < (off_t)fs->fs_dsize * (fs->fs_minfree - 2) / 100) break; log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n", fs->fs_fsmnt); fs->fs_optim = FS_OPTSPACE; break; default: printf("dev = %s, optim = %ld, fs = %s\n", devtoname(ump->um_dev), (long)fs->fs_optim, fs->fs_fsmnt); panic("ffs_realloccg: bad optim"); /* NOTREACHED */ } bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg); if (bno > 0) { bp->b_blkno = fsbtodb(fs, bno); if (!DOINGSOFTDEP(vp)) /* * The usual case is that a smaller fragment that * was just allocated has been replaced with a bigger * fragment or a full-size block. If it is marked as * B_DELWRI, the current contents have not been written * to disk. It is possible that the block was written * earlier, but very uncommon. If the block has never * been written, there is no need to send a BIO_DELETE * for it when it is freed. The gain from avoiding the * TRIMs for the common case of unwritten blocks far * exceeds the cost of the write amplification for the * uncommon case of failing to send a TRIM for a block * that had been written. */ ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize, ip->i_number, vp->v_type, NULL, (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY); delta = btodb(nsize - osize); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta); if (flags & IO_EXT) UFS_INODE_SET_FLAG(ip, IN_CHANGE); else UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); allocbuf(bp, nsize); bp->b_flags |= B_DONE; vfs_bio_bzero_buf(bp, osize, nsize - osize); if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO) vfs_bio_set_valid(bp, osize, nsize - osize); *bpp = bp; return (0); } #ifdef QUOTA UFS_UNLOCK(ump); /* * Restore user's disk quota because allocation failed. */ (void) chkdq(ip, -btodb(nsize - osize), cred, FORCE); UFS_LOCK(ump); #endif nospace: /* * no space available */ if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) { reclaimed = 1; UFS_UNLOCK(ump); if (bp) { brelse(bp); bp = NULL; } UFS_LOCK(ump); softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT); goto retry; } if (bp) brelse(bp); if (ffs_fsfail_cleanup_locked(ump, 0)) { UFS_UNLOCK(ump); return (ENXIO); } if (reclaimed > 0 && ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) { UFS_UNLOCK(ump); ffs_fserr(fs, ip->i_number, "filesystem full"); uprintf("\n%s: write failed, filesystem is full\n", fs->fs_fsmnt); } else { UFS_UNLOCK(ump); } return (ENOSPC); } /* * Reallocate a sequence of blocks into a contiguous sequence of blocks. * * The vnode and an array of buffer pointers for a range of sequential * logical blocks to be made contiguous is given. The allocator attempts * to find a range of sequential blocks starting as close as possible * from the end of the allocation for the logical block immediately * preceding the current range. If successful, the physical block numbers * in the buffer pointers and in the inode are changed to reflect the new * allocation. If unsuccessful, the allocation is left unchanged. The * success in doing the reallocation is returned. Note that the error * return is not reflected back to the user. Rather the previous block * allocation will be used. */ SYSCTL_NODE(_vfs, OID_AUTO, ffs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "FFS filesystem"); static int doasyncfree = 1; SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0, "do not force synchronous writes when blocks are reallocated"); static int doreallocblks = 1; SYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, "enable block reallocation"); static int dotrimcons = 1; SYSCTL_INT(_vfs_ffs, OID_AUTO, dotrimcons, CTLFLAG_RWTUN, &dotrimcons, 0, "enable BIO_DELETE / TRIM consolidation"); static int maxclustersearch = 10; SYSCTL_INT(_vfs_ffs, OID_AUTO, maxclustersearch, CTLFLAG_RW, &maxclustersearch, 0, "max number of cylinder group to search for contigous blocks"); #ifdef DIAGNOSTIC static int prtrealloc = 0; SYSCTL_INT(_debug, OID_AUTO, ffs_prtrealloc, CTLFLAG_RW, &prtrealloc, 0, "print out FFS filesystem block reallocation operations"); #endif int ffs_reallocblks(ap) struct vop_reallocblks_args /* { struct vnode *a_vp; struct cluster_save *a_buflist; } */ *ap; { struct ufsmount *ump; /* * We used to skip reallocating the blocks of a file into a * contiguous sequence if the underlying flash device requested * BIO_DELETE notifications, because devices that benefit from * BIO_DELETE also benefit from not moving the data. However, * the destination for the data is usually moved before the data * is written to the initially allocated location, so we rarely * suffer the penalty of extra writes. With the addition of the * consolidation of contiguous blocks into single BIO_DELETE * operations, having fewer but larger contiguous blocks reduces * the number of (slow and expensive) BIO_DELETE operations. So * when doing BIO_DELETE consolidation, we do block reallocation. * * Skip if reallocblks has been disabled globally. */ ump = ap->a_vp->v_mount->mnt_data; if ((((ump->um_flags) & UM_CANDELETE) != 0 && dotrimcons == 0) || doreallocblks == 0) return (ENOSPC); /* * We can't wait in softdep prealloc as it may fsync and recurse * here. Instead we simply fail to reallocate blocks if this * rare condition arises. */ if (DOINGSOFTDEP(ap->a_vp)) if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0) return (ENOSPC); if (ump->um_fstype == UFS1) return (ffs_reallocblks_ufs1(ap)); return (ffs_reallocblks_ufs2(ap)); } static int ffs_reallocblks_ufs1(ap) struct vop_reallocblks_args /* { struct vnode *a_vp; struct cluster_save *a_buflist; } */ *ap; { struct fs *fs; struct inode *ip; struct vnode *vp; struct buf *sbp, *ebp, *bp; ufs1_daddr_t *bap, *sbap, *ebap; struct cluster_save *buflist; struct ufsmount *ump; ufs_lbn_t start_lbn, end_lbn; ufs1_daddr_t soff, newblk, blkno; ufs2_daddr_t pref; struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp; int i, cg, len, start_lvl, end_lvl, ssize; vp = ap->a_vp; ip = VTOI(vp); ump = ITOUMP(ip); fs = ump->um_fs; /* * If we are not tracking block clusters or if we have less than 4% * free blocks left, then do not attempt to cluster. Running with * less than 5% free block reserve is not recommended and those that * choose to do so do not expect to have good file layout. */ if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0) return (ENOSPC); buflist = ap->a_buflist; len = buflist->bs_nchildren; start_lbn = buflist->bs_children[0]->b_lblkno; end_lbn = start_lbn + len - 1; #ifdef INVARIANTS for (i = 0; i < len; i++) if (!ffs_checkblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 1"); for (i = 1; i < len; i++) if (buflist->bs_children[i]->b_lblkno != start_lbn + i) panic("ffs_reallocblks: non-logical cluster"); blkno = buflist->bs_children[0]->b_blkno; ssize = fsbtodb(fs, fs->fs_frag); for (i = 1; i < len - 1; i++) if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) panic("ffs_reallocblks: non-physical cluster %d", i); #endif /* * If the cluster crosses the boundary for the first indirect * block, leave space for the indirect block. Indirect blocks * are initially laid out in a position after the last direct * block. Block reallocation would usually destroy locality by * moving the indirect block out of the way to make room for * data blocks if we didn't compensate here. We should also do * this for other indirect block boundaries, but it is only * important for the first one. */ if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR) return (ENOSPC); /* * If the latest allocation is in a new cylinder group, assume that * the filesystem has decided to move and do not force it back to * the previous cylinder group. */ if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) return (ENOSPC); if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) return (ENOSPC); /* * Get the starting offset and block map for the first block. */ if (start_lvl == 0) { sbap = &ip->i_din1->di_db[0]; soff = start_lbn; } else { idp = &start_ap[start_lvl - 1]; if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { brelse(sbp); return (ENOSPC); } sbap = (ufs1_daddr_t *)sbp->b_data; soff = idp->in_off; } /* * If the block range spans two block maps, get the second map. */ ebap = NULL; if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { ssize = len; } else { #ifdef INVARIANTS if (start_lvl > 0 && start_ap[start_lvl - 1].in_lbn == idp->in_lbn) panic("ffs_reallocblk: start == end"); #endif ssize = len - (idp->in_off + 1); if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) goto fail; ebap = (ufs1_daddr_t *)ebp->b_data; } /* * Find the preferred location for the cluster. If we have not * previously failed at this endeavor, then follow our standard * preference calculation. If we have failed at it, then pick up * where we last ended our search. */ UFS_LOCK(ump); if (ip->i_nextclustercg == -1) pref = ffs_blkpref_ufs1(ip, start_lbn, soff, sbap); else pref = cgdata(fs, ip->i_nextclustercg); /* * Search the block map looking for an allocation of the desired size. * To avoid wasting too much time, we limit the number of cylinder * groups that we will search. */ cg = dtog(fs, pref); for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) { if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0) break; cg += 1; if (cg >= fs->fs_ncg) cg = 0; } /* * If we have failed in our search, record where we gave up for * next time. Otherwise, fall back to our usual search citerion. */ if (newblk == 0) { ip->i_nextclustercg = cg; UFS_UNLOCK(ump); goto fail; } ip->i_nextclustercg = -1; /* * We have found a new contiguous block. * * First we have to replace the old block pointers with the new * block pointers in the inode and indirect blocks associated * with the file. */ #ifdef DIAGNOSTIC if (prtrealloc) printf("realloc: ino %ju, lbns %jd-%jd\n\told:", (uintmax_t)ip->i_number, (intmax_t)start_lbn, (intmax_t)end_lbn); #endif blkno = newblk; for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { if (i == ssize) { bap = ebap; soff = -i; } #ifdef INVARIANTS if (!ffs_checkblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 2"); if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) panic("ffs_reallocblks: alloc mismatch"); #endif #ifdef DIAGNOSTIC if (prtrealloc) printf(" %d,", *bap); #endif if (DOINGSOFTDEP(vp)) { if (sbap == &ip->i_din1->di_db[0] && i < ssize) softdep_setup_allocdirect(ip, start_lbn + i, blkno, *bap, fs->fs_bsize, fs->fs_bsize, buflist->bs_children[i]); else softdep_setup_allocindir_page(ip, start_lbn + i, i < ssize ? sbp : ebp, soff + i, blkno, *bap, buflist->bs_children[i]); } *bap++ = blkno; } /* * Next we must write out the modified inode and indirect blocks. * For strict correctness, the writes should be synchronous since * the old block values may have been written to disk. In practise * they are almost never written, but if we are concerned about * strict correctness, the `doasyncfree' flag should be set to zero. * * The test on `doasyncfree' should be changed to test a flag * that shows whether the associated buffers and inodes have * been written. The flag should be set when the cluster is * started and cleared whenever the buffer or inode is flushed. * We can then check below to see if it is set, and do the * synchronous write only when it has been cleared. */ if (sbap != &ip->i_din1->di_db[0]) { if (doasyncfree) bdwrite(sbp); else bwrite(sbp); } else { UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); if (!doasyncfree) ffs_update(vp, 1); } if (ssize < len) { if (doasyncfree) bdwrite(ebp); else bwrite(ebp); } /* * Last, free the old blocks and assign the new blocks to the buffers. */ #ifdef DIAGNOSTIC if (prtrealloc) printf("\n\tnew:"); #endif for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { bp = buflist->bs_children[i]; if (!DOINGSOFTDEP(vp)) /* * The usual case is that a set of N-contiguous blocks * that was just allocated has been replaced with a * set of N+1-contiguous blocks. If they are marked as * B_DELWRI, the current contents have not been written * to disk. It is possible that the blocks were written * earlier, but very uncommon. If the blocks have never * been written, there is no need to send a BIO_DELETE * for them when they are freed. The gain from avoiding * the TRIMs for the common case of unwritten blocks * far exceeds the cost of the write amplification for * the uncommon case of failing to send a TRIM for the * blocks that had been written. */ ffs_blkfree(ump, fs, ump->um_devvp, dbtofsb(fs, bp->b_blkno), fs->fs_bsize, ip->i_number, vp->v_type, NULL, (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY); bp->b_blkno = fsbtodb(fs, blkno); #ifdef INVARIANTS if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 3"); #endif #ifdef DIAGNOSTIC if (prtrealloc) printf(" %d,", blkno); #endif } #ifdef DIAGNOSTIC if (prtrealloc) { prtrealloc--; printf("\n"); } #endif return (0); fail: if (ssize < len) brelse(ebp); if (sbap != &ip->i_din1->di_db[0]) brelse(sbp); return (ENOSPC); } static int ffs_reallocblks_ufs2(ap) struct vop_reallocblks_args /* { struct vnode *a_vp; struct cluster_save *a_buflist; } */ *ap; { struct fs *fs; struct inode *ip; struct vnode *vp; struct buf *sbp, *ebp, *bp; ufs2_daddr_t *bap, *sbap, *ebap; struct cluster_save *buflist; struct ufsmount *ump; ufs_lbn_t start_lbn, end_lbn; ufs2_daddr_t soff, newblk, blkno, pref; struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp; int i, cg, len, start_lvl, end_lvl, ssize; vp = ap->a_vp; ip = VTOI(vp); ump = ITOUMP(ip); fs = ump->um_fs; /* * If we are not tracking block clusters or if we have less than 4% * free blocks left, then do not attempt to cluster. Running with * less than 5% free block reserve is not recommended and those that * choose to do so do not expect to have good file layout. */ if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0) return (ENOSPC); buflist = ap->a_buflist; len = buflist->bs_nchildren; start_lbn = buflist->bs_children[0]->b_lblkno; end_lbn = start_lbn + len - 1; #ifdef INVARIANTS for (i = 0; i < len; i++) if (!ffs_checkblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 1"); for (i = 1; i < len; i++) if (buflist->bs_children[i]->b_lblkno != start_lbn + i) panic("ffs_reallocblks: non-logical cluster"); blkno = buflist->bs_children[0]->b_blkno; ssize = fsbtodb(fs, fs->fs_frag); for (i = 1; i < len - 1; i++) if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) panic("ffs_reallocblks: non-physical cluster %d", i); #endif /* * If the cluster crosses the boundary for the first indirect * block, do not move anything in it. Indirect blocks are * usually initially laid out in a position between the data * blocks. Block reallocation would usually destroy locality by * moving the indirect block out of the way to make room for * data blocks if we didn't compensate here. We should also do * this for other indirect block boundaries, but it is only * important for the first one. */ if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR) return (ENOSPC); /* * If the latest allocation is in a new cylinder group, assume that * the filesystem has decided to move and do not force it back to * the previous cylinder group. */ if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) return (ENOSPC); if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) return (ENOSPC); /* * Get the starting offset and block map for the first block. */ if (start_lvl == 0) { sbap = &ip->i_din2->di_db[0]; soff = start_lbn; } else { idp = &start_ap[start_lvl - 1]; if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { brelse(sbp); return (ENOSPC); } sbap = (ufs2_daddr_t *)sbp->b_data; soff = idp->in_off; } /* * If the block range spans two block maps, get the second map. */ ebap = NULL; if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { ssize = len; } else { #ifdef INVARIANTS if (start_lvl > 0 && start_ap[start_lvl - 1].in_lbn == idp->in_lbn) panic("ffs_reallocblk: start == end"); #endif ssize = len - (idp->in_off + 1); if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) goto fail; ebap = (ufs2_daddr_t *)ebp->b_data; } /* * Find the preferred location for the cluster. If we have not * previously failed at this endeavor, then follow our standard * preference calculation. If we have failed at it, then pick up * where we last ended our search. */ UFS_LOCK(ump); if (ip->i_nextclustercg == -1) pref = ffs_blkpref_ufs2(ip, start_lbn, soff, sbap); else pref = cgdata(fs, ip->i_nextclustercg); /* * Search the block map looking for an allocation of the desired size. * To avoid wasting too much time, we limit the number of cylinder * groups that we will search. */ cg = dtog(fs, pref); for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) { if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0) break; cg += 1; if (cg >= fs->fs_ncg) cg = 0; } /* * If we have failed in our search, record where we gave up for * next time. Otherwise, fall back to our usual search citerion. */ if (newblk == 0) { ip->i_nextclustercg = cg; UFS_UNLOCK(ump); goto fail; } ip->i_nextclustercg = -1; /* * We have found a new contiguous block. * * First we have to replace the old block pointers with the new * block pointers in the inode and indirect blocks associated * with the file. */ #ifdef DIAGNOSTIC if (prtrealloc) printf("realloc: ino %ju, lbns %jd-%jd\n\told:", (uintmax_t)ip->i_number, (intmax_t)start_lbn, (intmax_t)end_lbn); #endif blkno = newblk; for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { if (i == ssize) { bap = ebap; soff = -i; } #ifdef INVARIANTS if (!ffs_checkblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 2"); if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) panic("ffs_reallocblks: alloc mismatch"); #endif #ifdef DIAGNOSTIC if (prtrealloc) printf(" %jd,", (intmax_t)*bap); #endif if (DOINGSOFTDEP(vp)) { if (sbap == &ip->i_din2->di_db[0] && i < ssize) softdep_setup_allocdirect(ip, start_lbn + i, blkno, *bap, fs->fs_bsize, fs->fs_bsize, buflist->bs_children[i]); else softdep_setup_allocindir_page(ip, start_lbn + i, i < ssize ? sbp : ebp, soff + i, blkno, *bap, buflist->bs_children[i]); } *bap++ = blkno; } /* * Next we must write out the modified inode and indirect blocks. * For strict correctness, the writes should be synchronous since * the old block values may have been written to disk. In practise * they are almost never written, but if we are concerned about * strict correctness, the `doasyncfree' flag should be set to zero. * * The test on `doasyncfree' should be changed to test a flag * that shows whether the associated buffers and inodes have * been written. The flag should be set when the cluster is * started and cleared whenever the buffer or inode is flushed. * We can then check below to see if it is set, and do the * synchronous write only when it has been cleared. */ if (sbap != &ip->i_din2->di_db[0]) { if (doasyncfree) bdwrite(sbp); else bwrite(sbp); } else { UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE); if (!doasyncfree) ffs_update(vp, 1); } if (ssize < len) { if (doasyncfree) bdwrite(ebp); else bwrite(ebp); } /* * Last, free the old blocks and assign the new blocks to the buffers. */ #ifdef DIAGNOSTIC if (prtrealloc) printf("\n\tnew:"); #endif for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { bp = buflist->bs_children[i]; if (!DOINGSOFTDEP(vp)) /* * The usual case is that a set of N-contiguous blocks * that was just allocated has been replaced with a * set of N+1-contiguous blocks. If they are marked as * B_DELWRI, the current contents have not been written * to disk. It is possible that the blocks were written * earlier, but very uncommon. If the blocks have never * been written, there is no need to send a BIO_DELETE * for them when they are freed. The gain from avoiding * the TRIMs for the common case of unwritten blocks * far exceeds the cost of the write amplification for * the uncommon case of failing to send a TRIM for the * blocks that had been written. */ ffs_blkfree(ump, fs, ump->um_devvp, dbtofsb(fs, bp->b_blkno), fs->fs_bsize, ip->i_number, vp->v_type, NULL, (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY); bp->b_blkno = fsbtodb(fs, blkno); #ifdef INVARIANTS if (!ffs_checkblk(ip, dbtofsb(fs, bp->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 3"); #endif #ifdef DIAGNOSTIC if (prtrealloc) printf(" %jd,", (intmax_t)blkno); #endif } #ifdef DIAGNOSTIC if (prtrealloc) { prtrealloc--; printf("\n"); } #endif return (0); fail: if (ssize < len) brelse(ebp); if (sbap != &ip->i_din2->di_db[0]) brelse(sbp); return (ENOSPC); } /* * Allocate an inode in the filesystem. * * If allocating a directory, use ffs_dirpref to select the inode. * If allocating in a directory, the following hierarchy is followed: * 1) allocate the preferred inode. * 2) allocate an inode in the same cylinder group. * 3) quadradically rehash into other cylinder groups, until an * available inode is located. * If no inode preference is given the following hierarchy is used * to allocate an inode: * 1) allocate an inode in cylinder group 0. * 2) quadradically rehash into other cylinder groups, until an * available inode is located. */ int ffs_valloc(pvp, mode, cred, vpp) struct vnode *pvp; int mode; struct ucred *cred; struct vnode **vpp; { struct inode *pip; struct fs *fs; struct inode *ip; struct timespec ts; struct ufsmount *ump; ino_t ino, ipref; u_int cg; int error, reclaimed; *vpp = NULL; pip = VTOI(pvp); ump = ITOUMP(pip); fs = ump->um_fs; UFS_LOCK(ump); reclaimed = 0; retry: if (fs->fs_cstotal.cs_nifree == 0) goto noinodes; if ((mode & IFMT) == IFDIR) ipref = ffs_dirpref(pip); else ipref = pip->i_number; if (ipref >= fs->fs_ncg * fs->fs_ipg) ipref = 0; cg = ino_to_cg(fs, ipref); /* * Track number of dirs created one after another * in a same cg without intervening by files. */ if ((mode & IFMT) == IFDIR) { if (fs->fs_contigdirs[cg] < 255) fs->fs_contigdirs[cg]++; } else { if (fs->fs_contigdirs[cg] > 0) fs->fs_contigdirs[cg]--; } ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, (allocfcn_t *)ffs_nodealloccg); if (ino == 0) goto noinodes; /* * Get rid of the cached old vnode, force allocation of a new vnode * for this inode. If this fails, release the allocated ino and * return the error. */ if ((error = ffs_vgetf(pvp->v_mount, ino, LK_EXCLUSIVE, vpp, FFSV_FORCEINSMQ | FFSV_REPLACE)) != 0) { ffs_vfree(pvp, ino, mode); return (error); } /* * We got an inode, so check mode and panic if it is already allocated. */ ip = VTOI(*vpp); if (ip->i_mode) { printf("mode = 0%o, inum = %ju, fs = %s\n", ip->i_mode, (uintmax_t)ip->i_number, fs->fs_fsmnt); panic("ffs_valloc: dup alloc"); } if (DIP(ip, i_blocks) && (fs->fs_flags & FS_UNCLEAN) == 0) { /* XXX */ printf("free inode %s/%lu had %ld blocks\n", fs->fs_fsmnt, (u_long)ino, (long)DIP(ip, i_blocks)); DIP_SET(ip, i_blocks, 0); } ip->i_flags = 0; DIP_SET(ip, i_flags, 0); /* * Set up a new generation number for this inode. */ while (ip->i_gen == 0 || ++ip->i_gen == 0) ip->i_gen = arc4random(); DIP_SET(ip, i_gen, ip->i_gen); if (fs->fs_magic == FS_UFS2_MAGIC) { vfs_timestamp(&ts); ip->i_din2->di_birthtime = ts.tv_sec; ip->i_din2->di_birthnsec = ts.tv_nsec; } ip->i_flag = 0; (*vpp)->v_vflag = 0; (*vpp)->v_type = VNON; if (fs->fs_magic == FS_UFS2_MAGIC) { (*vpp)->v_op = &ffs_vnodeops2; UFS_INODE_SET_FLAG(ip, IN_UFS2); } else { (*vpp)->v_op = &ffs_vnodeops1; } return (0); noinodes: if (reclaimed == 0) { reclaimed = 1; softdep_request_cleanup(fs, pvp, cred, FLUSH_INODES_WAIT); goto retry; } if (ffs_fsfail_cleanup_locked(ump, 0)) { UFS_UNLOCK(ump); return (ENXIO); } if (ppsratecheck(&ump->um_last_fullmsg, &ump->um_secs_fullmsg, 1)) { UFS_UNLOCK(ump); ffs_fserr(fs, pip->i_number, "out of inodes"); uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt); } else { UFS_UNLOCK(ump); } return (ENOSPC); } /* * Find a cylinder group to place a directory. * * The policy implemented by this algorithm is to allocate a * directory inode in the same cylinder group as its parent * directory, but also to reserve space for its files inodes * and data. Restrict the number of directories which may be * allocated one after another in the same cylinder group * without intervening allocation of files. * * If we allocate a first level directory then force allocation * in another cylinder group. */ static ino_t ffs_dirpref(pip) struct inode *pip; { struct fs *fs; int cg, prefcg, dirsize, cgsize; u_int avgifree, avgbfree, avgndir, curdirsize; u_int minifree, minbfree, maxndir; u_int mincg, minndir; u_int maxcontigdirs; mtx_assert(UFS_MTX(ITOUMP(pip)), MA_OWNED); fs = ITOFS(pip); avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg; /* * Force allocation in another cg if creating a first level dir. */ ASSERT_VOP_LOCKED(ITOV(pip), "ffs_dirpref"); if (ITOV(pip)->v_vflag & VV_ROOT) { prefcg = arc4random() % fs->fs_ncg; mincg = prefcg; minndir = fs->fs_ipg; for (cg = prefcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < minndir && fs->fs_cs(fs, cg).cs_nifree >= avgifree && fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { mincg = cg; minndir = fs->fs_cs(fs, cg).cs_ndir; } for (cg = 0; cg < prefcg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < minndir && fs->fs_cs(fs, cg).cs_nifree >= avgifree && fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { mincg = cg; minndir = fs->fs_cs(fs, cg).cs_ndir; } return ((ino_t)(fs->fs_ipg * mincg)); } /* * Count various limits which used for * optimal allocation of a directory inode. */ maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg); minifree = avgifree - avgifree / 4; if (minifree < 1) minifree = 1; minbfree = avgbfree - avgbfree / 4; if (minbfree < 1) minbfree = 1; cgsize = fs->fs_fsize * fs->fs_fpg; dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir; curdirsize = avgndir ? (cgsize - avgbfree * fs->fs_bsize) / avgndir : 0; if (dirsize < curdirsize) dirsize = curdirsize; if (dirsize <= 0) maxcontigdirs = 0; /* dirsize overflowed */ else maxcontigdirs = min((avgbfree * fs->fs_bsize) / dirsize, 255); if (fs->fs_avgfpdir > 0) maxcontigdirs = min(maxcontigdirs, fs->fs_ipg / fs->fs_avgfpdir); if (maxcontigdirs == 0) maxcontigdirs = 1; /* * Limit number of dirs in one cg and reserve space for * regular files, but only if we have no deficit in * inodes or space. * * We are trying to find a suitable cylinder group nearby * our preferred cylinder group to place a new directory. * We scan from our preferred cylinder group forward looking * for a cylinder group that meets our criterion. If we get * to the final cylinder group and do not find anything, * we start scanning forwards from the beginning of the * filesystem. While it might seem sensible to start scanning * backwards or even to alternate looking forward and backward, * this approach fails badly when the filesystem is nearly full. * Specifically, we first search all the areas that have no space * and finally try the one preceding that. We repeat this on * every request and in the case of the final block end up * searching the entire filesystem. By jumping to the front * of the filesystem, our future forward searches always look * in new cylinder groups so finds every possible block after * one pass over the filesystem. */ prefcg = ino_to_cg(fs, pip->i_number); for (cg = prefcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < maxndir && fs->fs_cs(fs, cg).cs_nifree >= minifree && fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { if (fs->fs_contigdirs[cg] < maxcontigdirs) return ((ino_t)(fs->fs_ipg * cg)); } for (cg = 0; cg < prefcg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < maxndir && fs->fs_cs(fs, cg).cs_nifree >= minifree && fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { if (fs->fs_contigdirs[cg] < maxcontigdirs) return ((ino_t)(fs->fs_ipg * cg)); } /* * This is a backstop when we have deficit in space. */ for (cg = prefcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) return ((ino_t)(fs->fs_ipg * cg)); for (cg = 0; cg < prefcg; cg++) if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) break; return ((ino_t)(fs->fs_ipg * cg)); } /* * Select the desired position for the next block in a file. The file is * logically divided into sections. The first section is composed of the * direct blocks and the next fs_maxbpg blocks. Each additional section * contains fs_maxbpg blocks. * * If no blocks have been allocated in the first section, the policy is to * request a block in the same cylinder group as the inode that describes * the file. The first indirect is allocated immediately following the last * direct block and the data blocks for the first indirect immediately * follow it. * * If no blocks have been allocated in any other section, the indirect * block(s) are allocated in the same cylinder group as its inode in an * area reserved immediately following the inode blocks. The policy for * the data blocks is to place them in a cylinder group with a greater than * average number of free blocks. An appropriate cylinder group is found * by using a rotor that sweeps the cylinder groups. When a new group of * blocks is needed, the sweep begins in the cylinder group following the * cylinder group from which the previous allocation was made. The sweep * continues until a cylinder group with greater than the average number * of free blocks is found. If the allocation is for the first block in an * indirect block or the previous block is a hole, then the information on * the previous allocation is unavailable; here a best guess is made based * on the logical block number being allocated. * * If a section is already partially allocated, the policy is to * allocate blocks contiguously within the section if possible. */ ufs2_daddr_t ffs_blkpref_ufs1(ip, lbn, indx, bap) struct inode *ip; ufs_lbn_t lbn; int indx; ufs1_daddr_t *bap; { struct fs *fs; u_int cg, inocg; u_int avgbfree, startcg; ufs2_daddr_t pref, prevbn; KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); fs = ITOFS(ip); /* * Allocation of indirect blocks is indicated by passing negative * values in indx: -1 for single indirect, -2 for double indirect, * -3 for triple indirect. As noted below, we attempt to allocate * the first indirect inline with the file data. For all later * indirect blocks, the data is often allocated in other cylinder * groups. However to speed random file access and to speed up * fsck, the filesystem reserves the first fs_metaspace blocks * (typically half of fs_minfree) of the data area of each cylinder * group to hold these later indirect blocks. */ inocg = ino_to_cg(fs, ip->i_number); if (indx < 0) { /* * Our preference for indirect blocks is the zone at the * beginning of the inode's cylinder group data area that * we try to reserve for indirect blocks. */ pref = cgmeta(fs, inocg); /* * If we are allocating the first indirect block, try to * place it immediately following the last direct block. */ if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) && ip->i_din1->di_db[UFS_NDADDR - 1] != 0) pref = ip->i_din1->di_db[UFS_NDADDR - 1] + fs->fs_frag; return (pref); } /* * If we are allocating the first data block in the first indirect * block and the indirect has been allocated in the data block area, * try to place it immediately following the indirect block. */ if (lbn == UFS_NDADDR) { pref = ip->i_din1->di_ib[0]; if (pref != 0 && pref >= cgdata(fs, inocg) && pref < cgbase(fs, inocg + 1)) return (pref + fs->fs_frag); } /* * If we are at the beginning of a file, or we have already allocated * the maximum number of blocks per cylinder group, or we do not * have a block allocated immediately preceding us, then we need * to decide where to start allocating new blocks. */ if (indx == 0) { prevbn = 0; } else { prevbn = bap[indx - 1]; if (UFS_CHECK_BLKNO(ITOVFS(ip), ip->i_number, prevbn, fs->fs_bsize) != 0) prevbn = 0; } if (indx % fs->fs_maxbpg == 0 || prevbn == 0) { /* * If we are allocating a directory data block, we want * to place it in the metadata area. */ if ((ip->i_mode & IFMT) == IFDIR) return (cgmeta(fs, inocg)); /* * Until we fill all the direct and all the first indirect's * blocks, we try to allocate in the data area of the inode's * cylinder group. */ if (lbn < UFS_NDADDR + NINDIR(fs)) return (cgdata(fs, inocg)); /* * Find a cylinder with greater than average number of * unused data blocks. */ if (indx == 0 || prevbn == 0) startcg = inocg + lbn / fs->fs_maxbpg; else startcg = dtog(fs, prevbn) + 1; startcg %= fs->fs_ncg; avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; for (cg = startcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { fs->fs_cgrotor = cg; return (cgdata(fs, cg)); } for (cg = 0; cg <= startcg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { fs->fs_cgrotor = cg; return (cgdata(fs, cg)); } return (0); } /* * Otherwise, we just always try to lay things out contiguously. */ return (prevbn + fs->fs_frag); } /* * Same as above, but for UFS2 */ ufs2_daddr_t ffs_blkpref_ufs2(ip, lbn, indx, bap) struct inode *ip; ufs_lbn_t lbn; int indx; ufs2_daddr_t *bap; { struct fs *fs; u_int cg, inocg; u_int avgbfree, startcg; ufs2_daddr_t pref, prevbn; KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap")); mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); fs = ITOFS(ip); /* * Allocation of indirect blocks is indicated by passing negative * values in indx: -1 for single indirect, -2 for double indirect, * -3 for triple indirect. As noted below, we attempt to allocate * the first indirect inline with the file data. For all later * indirect blocks, the data is often allocated in other cylinder * groups. However to speed random file access and to speed up * fsck, the filesystem reserves the first fs_metaspace blocks * (typically half of fs_minfree) of the data area of each cylinder * group to hold these later indirect blocks. */ inocg = ino_to_cg(fs, ip->i_number); if (indx < 0) { /* * Our preference for indirect blocks is the zone at the * beginning of the inode's cylinder group data area that * we try to reserve for indirect blocks. */ pref = cgmeta(fs, inocg); /* * If we are allocating the first indirect block, try to * place it immediately following the last direct block. */ if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) && ip->i_din2->di_db[UFS_NDADDR - 1] != 0) pref = ip->i_din2->di_db[UFS_NDADDR - 1] + fs->fs_frag; return (pref); } /* * If we are allocating the first data block in the first indirect * block and the indirect has been allocated in the data block area, * try to place it immediately following the indirect block. */ if (lbn == UFS_NDADDR) { pref = ip->i_din2->di_ib[0]; if (pref != 0 && pref >= cgdata(fs, inocg) && pref < cgbase(fs, inocg + 1)) return (pref + fs->fs_frag); } /* * If we are at the beginning of a file, or we have already allocated * the maximum number of blocks per cylinder group, or we do not * have a block allocated immediately preceding us, then we need * to decide where to start allocating new blocks. */ if (indx == 0) { prevbn = 0; } else { prevbn = bap[indx - 1]; if (UFS_CHECK_BLKNO(ITOVFS(ip), ip->i_number, prevbn, fs->fs_bsize) != 0) prevbn = 0; } if (indx % fs->fs_maxbpg == 0 || prevbn == 0) { /* * If we are allocating a directory data block, we want * to place it in the metadata area. */ if ((ip->i_mode & IFMT) == IFDIR) return (cgmeta(fs, inocg)); /* * Until we fill all the direct and all the first indirect's * blocks, we try to allocate in the data area of the inode's * cylinder group. */ if (lbn < UFS_NDADDR + NINDIR(fs)) return (cgdata(fs, inocg)); /* * Find a cylinder with greater than average number of * unused data blocks. */ if (indx == 0 || prevbn == 0) startcg = inocg + lbn / fs->fs_maxbpg; else startcg = dtog(fs, prevbn) + 1; startcg %= fs->fs_ncg; avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; for (cg = startcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { fs->fs_cgrotor = cg; return (cgdata(fs, cg)); } for (cg = 0; cg <= startcg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { fs->fs_cgrotor = cg; return (cgdata(fs, cg)); } return (0); } /* * Otherwise, we just always try to lay things out contiguously. */ return (prevbn + fs->fs_frag); } /* * Implement the cylinder overflow algorithm. * * The policy implemented by this algorithm is: * 1) allocate the block in its requested cylinder group. * 2) quadradically rehash on the cylinder group number. * 3) brute force search for a free block. * * Must be called with the UFS lock held. Will release the lock on success * and return with it held on failure. */ /*VARARGS5*/ static ufs2_daddr_t ffs_hashalloc(ip, cg, pref, size, rsize, allocator) struct inode *ip; u_int cg; ufs2_daddr_t pref; int size; /* Search size for data blocks, mode for inodes */ int rsize; /* Real allocated size. */ allocfcn_t *allocator; { struct fs *fs; ufs2_daddr_t result; u_int i, icg = cg; mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED); #ifdef INVARIANTS if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED) panic("ffs_hashalloc: allocation on suspended filesystem"); #endif fs = ITOFS(ip); /* * 1: preferred cylinder group */ result = (*allocator)(ip, cg, pref, size, rsize); if (result) return (result); /* * 2: quadratic rehash */ for (i = 1; i < fs->fs_ncg; i *= 2) { cg += i; if (cg >= fs->fs_ncg) cg -= fs->fs_ncg; result = (*allocator)(ip, cg, 0, size, rsize); if (result) return (result); } /* * 3: brute force search * Note that we start at i == 2, since 0 was checked initially, * and 1 is always checked in the quadratic rehash. */ cg = (icg + 2) % fs->fs_ncg; for (i = 2; i < fs->fs_ncg; i++) { result = (*allocator)(ip, cg, 0, size, rsize); if (result) return (result); cg++; if (cg == fs->fs_ncg) cg = 0; } return (0); } /* * Determine whether a fragment can be extended. * * Check to see if the necessary fragments are available, and * if they are, allocate them. */ static ufs2_daddr_t ffs_fragextend(ip, cg, bprev, osize, nsize) struct inode *ip; u_int cg; ufs2_daddr_t bprev; int osize, nsize; { struct fs *fs; struct cg *cgp; struct buf *bp; struct ufsmount *ump; int nffree; long bno; int frags, bbase; int i, error; u_int8_t *blksfree; ump = ITOUMP(ip); fs = ump->um_fs; if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize)) return (0); frags = numfrags(fs, nsize); bbase = fragnum(fs, bprev); if (bbase > fragnum(fs, (bprev + frags - 1))) { /* cannot extend across a block boundary */ return (0); } UFS_UNLOCK(ump); if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) goto fail; bno = dtogd(fs, bprev); blksfree = cg_blksfree(cgp); for (i = numfrags(fs, osize); i < frags; i++) if (isclr(blksfree, bno + i)) goto fail; /* * the current fragment can be extended * deduct the count on fragment being extended into * increase the count on the remaining fragment (if any) * allocate the extended piece */ for (i = frags; i < fs->fs_frag - bbase; i++) if (isclr(blksfree, bno + i)) break; cgp->cg_frsum[i - numfrags(fs, osize)]--; if (i != frags) cgp->cg_frsum[i - frags]++; for (i = numfrags(fs, osize), nffree = 0; i < frags; i++) { clrbit(blksfree, bno + i); cgp->cg_cs.cs_nffree--; nffree++; } UFS_LOCK(ump); fs->fs_cstotal.cs_nffree -= nffree; fs->fs_cs(fs, cg).cs_nffree -= nffree; fs->fs_fmod = 1; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev, frags, numfrags(fs, osize)); bdwrite(bp); return (bprev); fail: brelse(bp); UFS_LOCK(ump); return (0); } /* * Determine whether a block can be allocated. * * Check to see if a block of the appropriate size is available, * and if it is, allocate it. */ static ufs2_daddr_t ffs_alloccg(ip, cg, bpref, size, rsize) struct inode *ip; u_int cg; ufs2_daddr_t bpref; int size; int rsize; { struct fs *fs; struct cg *cgp; struct buf *bp; struct ufsmount *ump; ufs1_daddr_t bno; ufs2_daddr_t blkno; int i, allocsiz, error, frags; u_int8_t *blksfree; ump = ITOUMP(ip); fs = ump->um_fs; if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) return (0); UFS_UNLOCK(ump); if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0 || (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) goto fail; if (size == fs->fs_bsize) { UFS_LOCK(ump); blkno = ffs_alloccgblk(ip, bp, bpref, rsize); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); bdwrite(bp); return (blkno); } /* * check to see if any fragments are already available * allocsiz is the size which will be allocated, hacking * it down to a smaller size if necessary */ blksfree = cg_blksfree(cgp); frags = numfrags(fs, size); for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++) if (cgp->cg_frsum[allocsiz] != 0) break; if (allocsiz == fs->fs_frag) { /* * no fragments were available, so a block will be * allocated, and hacked up */ if (cgp->cg_cs.cs_nbfree == 0) goto fail; UFS_LOCK(ump); blkno = ffs_alloccgblk(ip, bp, bpref, rsize); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); bdwrite(bp); return (blkno); } KASSERT(size == rsize, ("ffs_alloccg: size(%d) != rsize(%d)", size, rsize)); bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); if (bno < 0) goto fail; for (i = 0; i < frags; i++) clrbit(blksfree, bno + i); cgp->cg_cs.cs_nffree -= frags; cgp->cg_frsum[allocsiz]--; if (frags != allocsiz) cgp->cg_frsum[allocsiz - frags]++; UFS_LOCK(ump); fs->fs_cstotal.cs_nffree -= frags; fs->fs_cs(fs, cg).cs_nffree -= frags; fs->fs_fmod = 1; blkno = cgbase(fs, cg) + bno; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0); bdwrite(bp); return (blkno); fail: brelse(bp); UFS_LOCK(ump); return (0); } /* * Allocate a block in a cylinder group. * * This algorithm implements the following policy: * 1) allocate the requested block. * 2) allocate a rotationally optimal block in the same cylinder. * 3) allocate the next available block on the block rotor for the * specified cylinder group. * Note that this routine only allocates fs_bsize blocks; these * blocks may be fragmented by the routine that allocates them. */ static ufs2_daddr_t ffs_alloccgblk(ip, bp, bpref, size) struct inode *ip; struct buf *bp; ufs2_daddr_t bpref; int size; { struct fs *fs; struct cg *cgp; struct ufsmount *ump; ufs1_daddr_t bno; ufs2_daddr_t blkno; u_int8_t *blksfree; int i, cgbpref; ump = ITOUMP(ip); fs = ump->um_fs; mtx_assert(UFS_MTX(ump), MA_OWNED); cgp = (struct cg *)bp->b_data; blksfree = cg_blksfree(cgp); if (bpref == 0) { bpref = cgbase(fs, cgp->cg_cgx) + cgp->cg_rotor + fs->fs_frag; } else if ((cgbpref = dtog(fs, bpref)) != cgp->cg_cgx) { /* map bpref to correct zone in this cg */ if (bpref < cgdata(fs, cgbpref)) bpref = cgmeta(fs, cgp->cg_cgx); else bpref = cgdata(fs, cgp->cg_cgx); } /* * if the requested block is available, use it */ bno = dtogd(fs, blknum(fs, bpref)); if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno))) goto gotit; /* * Take the next available block in this cylinder group. */ bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag); if (bno < 0) return (0); /* Update cg_rotor only if allocated from the data zone */ if (bno >= dtogd(fs, cgdata(fs, cgp->cg_cgx))) cgp->cg_rotor = bno; gotit: blkno = fragstoblks(fs, bno); ffs_clrblock(fs, blksfree, (long)blkno); ffs_clusteracct(fs, cgp, blkno, -1); cgp->cg_cs.cs_nbfree--; fs->fs_cstotal.cs_nbfree--; fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; fs->fs_fmod = 1; blkno = cgbase(fs, cgp->cg_cgx) + bno; /* * If the caller didn't want the whole block free the frags here. */ size = numfrags(fs, size); if (size != fs->fs_frag) { bno = dtogd(fs, blkno); for (i = size; i < fs->fs_frag; i++) setbit(blksfree, bno + i); i = fs->fs_frag - size; cgp->cg_cs.cs_nffree += i; fs->fs_cstotal.cs_nffree += i; fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += i; fs->fs_fmod = 1; cgp->cg_frsum[i]++; } /* XXX Fixme. */ UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, size, 0); UFS_LOCK(ump); return (blkno); } /* * Determine whether a cluster can be allocated. * * We do not currently check for optimal rotational layout if there * are multiple choices in the same cylinder group. Instead we just * take the first one that we find following bpref. */ static ufs2_daddr_t ffs_clusteralloc(ip, cg, bpref, len) struct inode *ip; u_int cg; ufs2_daddr_t bpref; int len; { struct fs *fs; struct cg *cgp; struct buf *bp; struct ufsmount *ump; int i, run, bit, map, got, error; ufs2_daddr_t bno; u_char *mapp; int32_t *lp; u_int8_t *blksfree; ump = ITOUMP(ip); fs = ump->um_fs; if (fs->fs_maxcluster[cg] < len) return (0); UFS_UNLOCK(ump); if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) { UFS_LOCK(ump); return (0); } /* * Check to see if a cluster of the needed size (or bigger) is * available in this cylinder group. */ lp = &cg_clustersum(cgp)[len]; for (i = len; i <= fs->fs_contigsumsize; i++) if (*lp++ > 0) break; if (i > fs->fs_contigsumsize) { /* * This is the first time looking for a cluster in this * cylinder group. Update the cluster summary information * to reflect the true maximum sized cluster so that * future cluster allocation requests can avoid reading * the cylinder group map only to find no clusters. */ lp = &cg_clustersum(cgp)[len - 1]; for (i = len - 1; i > 0; i--) if (*lp-- > 0) break; UFS_LOCK(ump); fs->fs_maxcluster[cg] = i; brelse(bp); return (0); } /* * Search the cluster map to find a big enough cluster. * We take the first one that we find, even if it is larger * than we need as we prefer to get one close to the previous * block allocation. We do not search before the current * preference point as we do not want to allocate a block * that is allocated before the previous one (as we will * then have to wait for another pass of the elevator * algorithm before it will be read). We prefer to fail and * be recalled to try an allocation in the next cylinder group. */ if (dtog(fs, bpref) != cg) bpref = cgdata(fs, cg); else bpref = blknum(fs, bpref); bpref = fragstoblks(fs, dtogd(fs, bpref)); mapp = &cg_clustersfree(cgp)[bpref / NBBY]; map = *mapp++; bit = 1 << (bpref % NBBY); for (run = 0, got = bpref; got < cgp->cg_nclusterblks; got++) { if ((map & bit) == 0) { run = 0; } else { run++; if (run == len) break; } if ((got & (NBBY - 1)) != (NBBY - 1)) { bit <<= 1; } else { map = *mapp++; bit = 1; } } if (got >= cgp->cg_nclusterblks) { UFS_LOCK(ump); brelse(bp); return (0); } /* * Allocate the cluster that we have found. */ blksfree = cg_blksfree(cgp); for (i = 1; i <= len; i++) if (!ffs_isblock(fs, blksfree, got - run + i)) panic("ffs_clusteralloc: map mismatch"); bno = cgbase(fs, cg) + blkstofrags(fs, got - run + 1); if (dtog(fs, bno) != cg) panic("ffs_clusteralloc: allocated out of group"); len = blkstofrags(fs, len); UFS_LOCK(ump); for (i = 0; i < len; i += fs->fs_frag) if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i) panic("ffs_clusteralloc: lost block"); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); bdwrite(bp); return (bno); } static inline struct buf * getinobuf(struct inode *ip, u_int cg, u_int32_t cginoblk, int gbflags) { struct fs *fs; fs = ITOFS(ip); return (getblk(ITODEVVP(ip), fsbtodb(fs, ino_to_fsba(fs, cg * fs->fs_ipg + cginoblk)), (int)fs->fs_bsize, 0, 0, gbflags)); } /* * Synchronous inode initialization is needed only when barrier writes do not * work as advertised, and will impose a heavy cost on file creation in a newly * created filesystem. */ static int doasyncinodeinit = 1; SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncinodeinit, CTLFLAG_RWTUN, &doasyncinodeinit, 0, "Perform inode block initialization using asynchronous writes"); /* * Determine whether an inode can be allocated. * * Check to see if an inode is available, and if it is, * allocate it using the following policy: * 1) allocate the requested inode. * 2) allocate the next available inode after the requested * inode in the specified cylinder group. */ static ufs2_daddr_t ffs_nodealloccg(ip, cg, ipref, mode, unused) struct inode *ip; u_int cg; ufs2_daddr_t ipref; int mode; int unused; { struct fs *fs; struct cg *cgp; struct buf *bp, *ibp; struct ufsmount *ump; u_int8_t *inosused, *loc; struct ufs2_dinode *dp2; int error, start, len, i; u_int32_t old_initediblk; ump = ITOUMP(ip); fs = ump->um_fs; check_nifree: if (fs->fs_cs(fs, cg).cs_nifree == 0) return (0); UFS_UNLOCK(ump); if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) { UFS_LOCK(ump); return (0); } restart: if (cgp->cg_cs.cs_nifree == 0) { brelse(bp); UFS_LOCK(ump); return (0); } inosused = cg_inosused(cgp); if (ipref) { ipref %= fs->fs_ipg; if (isclr(inosused, ipref)) goto gotit; } start = cgp->cg_irotor / NBBY; len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY); loc = memcchr(&inosused[start], 0xff, len); if (loc == NULL) { len = start + 1; start = 0; loc = memcchr(&inosused[start], 0xff, len); if (loc == NULL) { printf("cg = %d, irotor = %ld, fs = %s\n", cg, (long)cgp->cg_irotor, fs->fs_fsmnt); panic("ffs_nodealloccg: map corrupted"); /* NOTREACHED */ } } ipref = (loc - inosused) * NBBY + ffs(~*loc) - 1; gotit: /* * Check to see if we need to initialize more inodes. */ if (fs->fs_magic == FS_UFS2_MAGIC && ipref + INOPB(fs) > cgp->cg_initediblk && cgp->cg_initediblk < cgp->cg_niblk) { old_initediblk = cgp->cg_initediblk; /* * Free the cylinder group lock before writing the * initialized inode block. Entering the * babarrierwrite() with the cylinder group lock * causes lock order violation between the lock and * snaplk. * * Another thread can decide to initialize the same * inode block, but whichever thread first gets the * cylinder group lock after writing the newly * allocated inode block will update it and the other * will realize that it has lost and leave the * cylinder group unchanged. */ ibp = getinobuf(ip, cg, old_initediblk, GB_LOCK_NOWAIT); brelse(bp); if (ibp == NULL) { /* * The inode block buffer is already owned by * another thread, which must initialize it. * Wait on the buffer to allow another thread * to finish the updates, with dropped cg * buffer lock, then retry. */ ibp = getinobuf(ip, cg, old_initediblk, 0); brelse(ibp); UFS_LOCK(ump); goto check_nifree; } bzero(ibp->b_data, (int)fs->fs_bsize); dp2 = (struct ufs2_dinode *)(ibp->b_data); for (i = 0; i < INOPB(fs); i++) { while (dp2->di_gen == 0) dp2->di_gen = arc4random(); dp2++; } /* * Rather than adding a soft updates dependency to ensure * that the new inode block is written before it is claimed * by the cylinder group map, we just do a barrier write * here. The barrier write will ensure that the inode block * gets written before the updated cylinder group map can be * written. The barrier write should only slow down bulk * loading of newly created filesystems. */ if (doasyncinodeinit) babarrierwrite(ibp); else bwrite(ibp); /* * After the inode block is written, try to update the * cg initediblk pointer. If another thread beat us * to it, then leave it unchanged as the other thread * has already set it correctly. */ error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp); UFS_LOCK(ump); ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (error != 0) return (error); if (cgp->cg_initediblk == old_initediblk) cgp->cg_initediblk += INOPB(fs); goto restart; } cgp->cg_irotor = ipref; UFS_LOCK(ump); ACTIVECLEAR(fs, cg); setbit(inosused, ipref); cgp->cg_cs.cs_nifree--; fs->fs_cstotal.cs_nifree--; fs->fs_cs(fs, cg).cs_nifree--; fs->fs_fmod = 1; if ((mode & IFMT) == IFDIR) { cgp->cg_cs.cs_ndir++; fs->fs_cstotal.cs_ndir++; fs->fs_cs(fs, cg).cs_ndir++; } UFS_UNLOCK(ump); if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref, mode); bdwrite(bp); return ((ino_t)(cg * fs->fs_ipg + ipref)); } /* * Free a block or fragment. * * The specified block or fragment is placed back in the * free map. If a fragment is deallocated, a possible * block reassembly is checked. */ static void ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd) struct ufsmount *ump; struct fs *fs; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; struct workhead *dephd; { struct mount *mp; struct cg *cgp; struct buf *bp; daddr_t dbn; ufs1_daddr_t fragno, cgbno; int i, blk, frags, bbase, error; u_int cg; u_int8_t *blksfree; struct cdev *dev; cg = dtog(fs, bno); if (devvp->v_type == VREG) { /* devvp is a snapshot */ MPASS(devvp->v_mount->mnt_data == ump); dev = ump->um_devvp->v_rdev; } else if (devvp->v_type == VCHR) { /* devvp is a normal disk device */ dev = devvp->v_rdev; ASSERT_VOP_LOCKED(devvp, "ffs_blkfree_cg"); } else return; #ifdef INVARIANTS if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 || fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) { printf("dev=%s, bno = %jd, bsize = %ld, size = %ld, fs = %s\n", devtoname(dev), (intmax_t)bno, (long)fs->fs_bsize, size, fs->fs_fsmnt); panic("ffs_blkfree_cg: bad size"); } #endif if ((u_int)bno >= fs->fs_size) { printf("bad block %jd, ino %lu\n", (intmax_t)bno, (u_long)inum); ffs_fserr(fs, inum, "bad block"); return; } if ((error = ffs_getcg(fs, devvp, cg, GB_CVTENXIO, &bp, &cgp)) != 0) { if (!ffs_fsfail_cleanup(ump, error) || !MOUNTEDSOFTDEP(UFSTOVFS(ump)) || devvp->v_type != VCHR) return; if (devvp->v_type == VREG) dbn = fragstoblks(fs, cgtod(fs, cg)); else dbn = fsbtodb(fs, cgtod(fs, cg)); error = getblkx(devvp, dbn, dbn, fs->fs_cgsize, 0, 0, 0, &bp); KASSERT(error == 0, ("getblkx failed")); softdep_setup_blkfree(UFSTOVFS(ump), bp, bno, numfrags(fs, size), dephd); bp->b_flags |= B_RELBUF | B_NOCACHE; bp->b_flags &= ~B_CACHE; bawrite(bp); return; } cgbno = dtogd(fs, bno); blksfree = cg_blksfree(cgp); UFS_LOCK(ump); if (size == fs->fs_bsize) { fragno = fragstoblks(fs, cgbno); if (!ffs_isfreeblock(fs, blksfree, fragno)) { if (devvp->v_type == VREG) { UFS_UNLOCK(ump); /* devvp is a snapshot */ brelse(bp); return; } printf("dev = %s, block = %jd, fs = %s\n", devtoname(dev), (intmax_t)bno, fs->fs_fsmnt); panic("ffs_blkfree_cg: freeing free block"); } ffs_setblock(fs, blksfree, fragno); ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; } else { bbase = cgbno - fragnum(fs, cgbno); /* * decrement the counts associated with the old frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, -1); /* * deallocate the fragment */ frags = numfrags(fs, size); for (i = 0; i < frags; i++) { if (isset(blksfree, cgbno + i)) { printf("dev = %s, block = %jd, fs = %s\n", devtoname(dev), (intmax_t)(bno + i), fs->fs_fsmnt); panic("ffs_blkfree_cg: freeing free frag"); } setbit(blksfree, cgbno + i); } cgp->cg_cs.cs_nffree += i; fs->fs_cstotal.cs_nffree += i; fs->fs_cs(fs, cg).cs_nffree += i; /* * add back in counts associated with the new frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, 1); /* * if a complete block has been reassembled, account for it */ fragno = fragstoblks(fs, bbase); if (ffs_isblock(fs, blksfree, fragno)) { cgp->cg_cs.cs_nffree -= fs->fs_frag; fs->fs_cstotal.cs_nffree -= fs->fs_frag; fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; } } fs->fs_fmod = 1; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); mp = UFSTOVFS(ump); if (MOUNTEDSOFTDEP(mp) && devvp->v_type == VCHR) softdep_setup_blkfree(UFSTOVFS(ump), bp, bno, numfrags(fs, size), dephd); bdwrite(bp); } /* * Structures and routines associated with trim management. * * The following requests are passed to trim_lookup to indicate * the actions that should be taken. */ #define NEW 1 /* if found, error else allocate and hash it */ #define OLD 2 /* if not found, error, else return it */ #define REPLACE 3 /* if not found, error else unhash and reallocate it */ #define DONE 4 /* if not found, error else unhash and return it */ #define SINGLE 5 /* don't look up, just allocate it and don't hash it */ MALLOC_DEFINE(M_TRIM, "ufs_trim", "UFS trim structures"); #define TRIMLIST_HASH(ump, key) \ (&(ump)->um_trimhash[(key) & (ump)->um_trimlisthashsize]) /* * These structures describe each of the block free requests aggregated * together to make up a trim request. */ struct trim_blkreq { TAILQ_ENTRY(trim_blkreq) blkreqlist; ufs2_daddr_t bno; long size; struct workhead *pdephd; struct workhead dephd; }; /* * Description of a trim request. */ struct ffs_blkfree_trim_params { TAILQ_HEAD(, trim_blkreq) blklist; LIST_ENTRY(ffs_blkfree_trim_params) hashlist; struct task task; struct ufsmount *ump; struct vnode *devvp; ino_t inum; ufs2_daddr_t bno; long size; long key; }; static void ffs_blkfree_trim_completed(struct buf *); static void ffs_blkfree_trim_task(void *ctx, int pending __unused); static struct ffs_blkfree_trim_params *trim_lookup(struct ufsmount *, struct vnode *, ufs2_daddr_t, long, ino_t, u_long, int); static void ffs_blkfree_sendtrim(struct ffs_blkfree_trim_params *); /* * Called on trim completion to start a task to free the associated block(s). */ static void ffs_blkfree_trim_completed(bp) struct buf *bp; { struct ffs_blkfree_trim_params *tp; tp = bp->b_fsprivate1; free(bp, M_TRIM); TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp); taskqueue_enqueue(tp->ump->um_trim_tq, &tp->task); } /* * Trim completion task that free associated block(s). */ static void ffs_blkfree_trim_task(ctx, pending) void *ctx; int pending; { struct ffs_blkfree_trim_params *tp; struct trim_blkreq *blkelm; struct ufsmount *ump; tp = ctx; ump = tp->ump; while ((blkelm = TAILQ_FIRST(&tp->blklist)) != NULL) { ffs_blkfree_cg(ump, ump->um_fs, tp->devvp, blkelm->bno, blkelm->size, tp->inum, blkelm->pdephd); TAILQ_REMOVE(&tp->blklist, blkelm, blkreqlist); free(blkelm, M_TRIM); } vn_finished_secondary_write(UFSTOVFS(ump)); UFS_LOCK(ump); ump->um_trim_inflight -= 1; ump->um_trim_inflight_blks -= numfrags(ump->um_fs, tp->size); UFS_UNLOCK(ump); free(tp, M_TRIM); } /* * Lookup a trim request by inode number. * Allocate if requested (NEW, REPLACE, SINGLE). */ static struct ffs_blkfree_trim_params * trim_lookup(ump, devvp, bno, size, inum, key, alloctype) struct ufsmount *ump; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; u_long key; int alloctype; { struct trimlist_hashhead *tphashhead; struct ffs_blkfree_trim_params *tp, *ntp; ntp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TRIM, M_WAITOK); if (alloctype != SINGLE) { KASSERT(key >= FIRST_VALID_KEY, ("trim_lookup: invalid key")); UFS_LOCK(ump); tphashhead = TRIMLIST_HASH(ump, key); LIST_FOREACH(tp, tphashhead, hashlist) if (key == tp->key) break; } switch (alloctype) { case NEW: KASSERT(tp == NULL, ("trim_lookup: found trim")); break; case OLD: KASSERT(tp != NULL, ("trim_lookup: missing call to ffs_blkrelease_start()")); UFS_UNLOCK(ump); free(ntp, M_TRIM); return (tp); case REPLACE: KASSERT(tp != NULL, ("trim_lookup: missing REPLACE trim")); LIST_REMOVE(tp, hashlist); /* tp will be freed by caller */ break; case DONE: KASSERT(tp != NULL, ("trim_lookup: missing DONE trim")); LIST_REMOVE(tp, hashlist); UFS_UNLOCK(ump); free(ntp, M_TRIM); return (tp); } TAILQ_INIT(&ntp->blklist); ntp->ump = ump; ntp->devvp = devvp; ntp->bno = bno; ntp->size = size; ntp->inum = inum; ntp->key = key; if (alloctype != SINGLE) { LIST_INSERT_HEAD(tphashhead, ntp, hashlist); UFS_UNLOCK(ump); } return (ntp); } /* * Dispatch a trim request. */ static void ffs_blkfree_sendtrim(tp) struct ffs_blkfree_trim_params *tp; { struct ufsmount *ump; struct mount *mp; struct buf *bp; /* * Postpone the set of the free bit in the cg bitmap until the * BIO_DELETE is completed. Otherwise, due to disk queue * reordering, TRIM might be issued after we reuse the block * and write some new data into it. */ ump = tp->ump; bp = malloc(sizeof(*bp), M_TRIM, M_WAITOK | M_ZERO); bp->b_iocmd = BIO_DELETE; bp->b_iooffset = dbtob(fsbtodb(ump->um_fs, tp->bno)); bp->b_iodone = ffs_blkfree_trim_completed; bp->b_bcount = tp->size; bp->b_fsprivate1 = tp; UFS_LOCK(ump); ump->um_trim_total += 1; ump->um_trim_inflight += 1; ump->um_trim_inflight_blks += numfrags(ump->um_fs, tp->size); ump->um_trim_total_blks += numfrags(ump->um_fs, tp->size); UFS_UNLOCK(ump); mp = UFSTOVFS(ump); vn_start_secondary_write(NULL, &mp, 0); g_vfs_strategy(ump->um_bo, bp); } /* * Allocate a new key to use to identify a range of blocks. */ u_long ffs_blkrelease_start(ump, devvp, inum) struct ufsmount *ump; struct vnode *devvp; ino_t inum; { static u_long masterkey; u_long key; if (((ump->um_flags & UM_CANDELETE) == 0) || dotrimcons == 0) return (SINGLETON_KEY); do { key = atomic_fetchadd_long(&masterkey, 1); } while (key < FIRST_VALID_KEY); (void) trim_lookup(ump, devvp, 0, 0, inum, key, NEW); return (key); } /* * Deallocate a key that has been used to identify a range of blocks. */ void ffs_blkrelease_finish(ump, key) struct ufsmount *ump; u_long key; { struct ffs_blkfree_trim_params *tp; if (((ump->um_flags & UM_CANDELETE) == 0) || dotrimcons == 0) return; /* * If the vfs.ffs.dotrimcons sysctl option is enabled while * a file deletion is active, specifically after a call * to ffs_blkrelease_start() but before the call to * ffs_blkrelease_finish(), ffs_blkrelease_start() will * have handed out SINGLETON_KEY rather than starting a * collection sequence. Thus if we get a SINGLETON_KEY * passed to ffs_blkrelease_finish(), we just return rather * than trying to finish the nonexistent sequence. */ if (key == SINGLETON_KEY) { #ifdef INVARIANTS printf("%s: vfs.ffs.dotrimcons enabled on active filesystem\n", ump->um_mountp->mnt_stat.f_mntonname); #endif return; } /* * We are done with sending blocks using this key. Look up the key * using the DONE alloctype (in tp) to request that it be unhashed * as we will not be adding to it. If the key has never been used, * tp->size will be zero, so we can just free tp. Otherwise the call * to ffs_blkfree_sendtrim(tp) causes the block range described by * tp to be issued (and then tp to be freed). */ tp = trim_lookup(ump, NULL, 0, 0, 0, key, DONE); if (tp->size == 0) free(tp, M_TRIM); else ffs_blkfree_sendtrim(tp); } /* * Setup to free a block or fragment. * * Check for snapshots that might want to claim the block. * If trims are requested, prepare a trim request. Attempt to * aggregate consecutive blocks into a single trim request. */ void ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd, key) struct ufsmount *ump; struct fs *fs; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; enum vtype vtype; struct workhead *dephd; u_long key; { struct ffs_blkfree_trim_params *tp, *ntp; struct trim_blkreq *blkelm; /* * Check to see if a snapshot wants to claim the block. * Check that devvp is a normal disk device, not a snapshot, * it has a snapshot(s) associated with it, and one of the * snapshots wants to claim the block. */ if (devvp->v_type == VCHR && (devvp->v_vflag & VV_COPYONWRITE) && ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, dephd)) { return; } /* * Nothing to delay if TRIM is not required for this block or TRIM * is disabled or the operation is performed on a snapshot. */ if (key == NOTRIM_KEY || ((ump->um_flags & UM_CANDELETE) == 0) || devvp->v_type == VREG) { ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd); return; } blkelm = malloc(sizeof(struct trim_blkreq), M_TRIM, M_WAITOK); blkelm->bno = bno; blkelm->size = size; if (dephd == NULL) { blkelm->pdephd = NULL; } else { LIST_INIT(&blkelm->dephd); LIST_SWAP(dephd, &blkelm->dephd, worklist, wk_list); blkelm->pdephd = &blkelm->dephd; } if (key == SINGLETON_KEY) { /* * Just a single non-contiguous piece. Use the SINGLE * alloctype to return a trim request that will not be * hashed for future lookup. */ tp = trim_lookup(ump, devvp, bno, size, inum, key, SINGLE); TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist); ffs_blkfree_sendtrim(tp); return; } /* * The callers of this function are not tracking whether or not * the blocks are contiguous. They are just saying that they * are freeing a set of blocks. It is this code that determines * the pieces of that range that are actually contiguous. * * Calling ffs_blkrelease_start() will have created an entry * that we will use. */ tp = trim_lookup(ump, devvp, bno, size, inum, key, OLD); if (tp->size == 0) { /* * First block of a potential range, set block and size * for the trim block. */ tp->bno = bno; tp->size = size; TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist); return; } /* * If this block is a continuation of the range (either * follows at the end or preceeds in the front) then we * add it to the front or back of the list and return. * * If it is not a continuation of the trim that we were * building, using the REPLACE alloctype, we request that * the old trim request (still in tp) be unhashed and a * new range started (in ntp). The ffs_blkfree_sendtrim(tp) * call causes the block range described by tp to be issued * (and then tp to be freed). */ if (bno + numfrags(fs, size) == tp->bno) { TAILQ_INSERT_HEAD(&tp->blklist, blkelm, blkreqlist); tp->bno = bno; tp->size += size; return; } else if (bno == tp->bno + numfrags(fs, tp->size)) { TAILQ_INSERT_TAIL(&tp->blklist, blkelm, blkreqlist); tp->size += size; return; } ntp = trim_lookup(ump, devvp, bno, size, inum, key, REPLACE); TAILQ_INSERT_HEAD(&ntp->blklist, blkelm, blkreqlist); ffs_blkfree_sendtrim(tp); } #ifdef INVARIANTS /* * Verify allocation of a block or fragment. Returns true if block or * fragment is allocated, false if it is free. */ static int ffs_checkblk(ip, bno, size) struct inode *ip; ufs2_daddr_t bno; long size; { struct fs *fs; struct cg *cgp; struct buf *bp; ufs1_daddr_t cgbno; int i, error, frags, free; u_int8_t *blksfree; fs = ITOFS(ip); if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { printf("bsize = %ld, size = %ld, fs = %s\n", (long)fs->fs_bsize, size, fs->fs_fsmnt); panic("ffs_checkblk: bad size"); } if ((u_int)bno >= fs->fs_size) panic("ffs_checkblk: bad block %jd", (intmax_t)bno); error = ffs_getcg(fs, ITODEVVP(ip), dtog(fs, bno), 0, &bp, &cgp); if (error) panic("ffs_checkblk: cylinder group read failed"); blksfree = cg_blksfree(cgp); cgbno = dtogd(fs, bno); if (size == fs->fs_bsize) { free = ffs_isblock(fs, blksfree, fragstoblks(fs, cgbno)); } else { frags = numfrags(fs, size); for (free = 0, i = 0; i < frags; i++) if (isset(blksfree, cgbno + i)) free++; if (free != 0 && free != frags) panic("ffs_checkblk: partially free fragment"); } brelse(bp); return (!free); } #endif /* INVARIANTS */ /* * Free an inode. */ int ffs_vfree(pvp, ino, mode) struct vnode *pvp; ino_t ino; int mode; { struct ufsmount *ump; if (DOINGSOFTDEP(pvp)) { softdep_freefile(pvp, ino, mode); return (0); } ump = VFSTOUFS(pvp->v_mount); return (ffs_freefile(ump, ump->um_fs, ump->um_devvp, ino, mode, NULL)); } /* * Do the actual free operation. * The specified inode is placed back in the free map. */ int ffs_freefile(ump, fs, devvp, ino, mode, wkhd) struct ufsmount *ump; struct fs *fs; struct vnode *devvp; ino_t ino; int mode; struct workhead *wkhd; { struct cg *cgp; struct buf *bp; daddr_t dbn; int error; u_int cg; u_int8_t *inosused; struct cdev *dev; ino_t cgino; cg = ino_to_cg(fs, ino); if (devvp->v_type == VREG) { /* devvp is a snapshot */ MPASS(devvp->v_mount->mnt_data == ump); dev = ump->um_devvp->v_rdev; } else if (devvp->v_type == VCHR) { /* devvp is a normal disk device */ dev = devvp->v_rdev; } else { bp = NULL; return (0); } if (ino >= fs->fs_ipg * fs->fs_ncg) panic("ffs_freefile: range: dev = %s, ino = %ju, fs = %s", devtoname(dev), (uintmax_t)ino, fs->fs_fsmnt); if ((error = ffs_getcg(fs, devvp, cg, GB_CVTENXIO, &bp, &cgp)) != 0) { if (!ffs_fsfail_cleanup(ump, error) || !MOUNTEDSOFTDEP(UFSTOVFS(ump)) || devvp->v_type != VCHR) return (error); if (devvp->v_type == VREG) dbn = fragstoblks(fs, cgtod(fs, cg)); else dbn = fsbtodb(fs, cgtod(fs, cg)); error = getblkx(devvp, dbn, dbn, fs->fs_cgsize, 0, 0, 0, &bp); KASSERT(error == 0, ("getblkx failed")); softdep_setup_inofree(UFSTOVFS(ump), bp, ino, wkhd); bp->b_flags |= B_RELBUF | B_NOCACHE; bp->b_flags &= ~B_CACHE; bawrite(bp); return (error); } inosused = cg_inosused(cgp); cgino = ino % fs->fs_ipg; if (isclr(inosused, cgino)) { printf("dev = %s, ino = %ju, fs = %s\n", devtoname(dev), (uintmax_t)ino, fs->fs_fsmnt); if (fs->fs_ronly == 0) panic("ffs_freefile: freeing free inode"); } clrbit(inosused, cgino); if (cgino < cgp->cg_irotor) cgp->cg_irotor = cgino; cgp->cg_cs.cs_nifree++; UFS_LOCK(ump); fs->fs_cstotal.cs_nifree++; fs->fs_cs(fs, cg).cs_nifree++; if ((mode & IFMT) == IFDIR) { cgp->cg_cs.cs_ndir--; fs->fs_cstotal.cs_ndir--; fs->fs_cs(fs, cg).cs_ndir--; } fs->fs_fmod = 1; ACTIVECLEAR(fs, cg); UFS_UNLOCK(ump); if (MOUNTEDSOFTDEP(UFSTOVFS(ump)) && devvp->v_type == VCHR) softdep_setup_inofree(UFSTOVFS(ump), bp, ino, wkhd); bdwrite(bp); return (0); } /* * Check to see if a file is free. * Used to check for allocated files in snapshots. */ int ffs_checkfreefile(fs, devvp, ino) struct fs *fs; struct vnode *devvp; ino_t ino; { struct cg *cgp; struct buf *bp; int ret, error; u_int cg; u_int8_t *inosused; cg = ino_to_cg(fs, ino); if ((devvp->v_type != VREG) && (devvp->v_type != VCHR)) return (1); if (ino >= fs->fs_ipg * fs->fs_ncg) return (1); if ((error = ffs_getcg(fs, devvp, cg, 0, &bp, &cgp)) != 0) return (1); inosused = cg_inosused(cgp); ino %= fs->fs_ipg; ret = isclr(inosused, ino); brelse(bp); return (ret); } /* * Find a block of the specified size in the specified cylinder group. * * It is a panic if a request is made to find a block if none are * available. */ static ufs1_daddr_t ffs_mapsearch(fs, cgp, bpref, allocsiz) struct fs *fs; struct cg *cgp; ufs2_daddr_t bpref; int allocsiz; { ufs1_daddr_t bno; int start, len, loc, i; int blk, field, subfield, pos; u_int8_t *blksfree; /* * find the fragment by searching through the free block * map for an appropriate bit pattern */ if (bpref) start = dtogd(fs, bpref) / NBBY; else start = cgp->cg_frotor / NBBY; blksfree = cg_blksfree(cgp); len = howmany(fs->fs_fpg, NBBY) - start; loc = scanc((u_int)len, (u_char *)&blksfree[start], fragtbl[fs->fs_frag], (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); if (loc == 0) { len = start + 1; start = 0; loc = scanc((u_int)len, (u_char *)&blksfree[0], fragtbl[fs->fs_frag], (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); if (loc == 0) { printf("start = %d, len = %d, fs = %s\n", start, len, fs->fs_fsmnt); panic("ffs_alloccg: map corrupted"); /* NOTREACHED */ } } bno = (start + len - loc) * NBBY; cgp->cg_frotor = bno; /* * found the byte in the map * sift through the bits to find the selected frag */ for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { blk = blkmap(fs, blksfree, bno); blk <<= 1; field = around[allocsiz]; subfield = inside[allocsiz]; for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) { if ((blk & field) == subfield) return (bno + pos); field <<= 1; subfield <<= 1; } } printf("bno = %lu, fs = %s\n", (u_long)bno, fs->fs_fsmnt); panic("ffs_alloccg: block not in map"); return (-1); } static const struct statfs * ffs_getmntstat(struct vnode *devvp) { if (devvp->v_type == VCHR) return (&devvp->v_rdev->si_mountpt->mnt_stat); return (ffs_getmntstat(VFSTOUFS(devvp->v_mount)->um_devvp)); } /* * Fetch and verify a cylinder group. */ int ffs_getcg(fs, devvp, cg, flags, bpp, cgpp) struct fs *fs; struct vnode *devvp; u_int cg; int flags; struct buf **bpp; struct cg **cgpp; { struct buf *bp; struct cg *cgp; const struct statfs *sfs; daddr_t blkno; int error; *bpp = NULL; *cgpp = NULL; if ((fs->fs_metackhash & CK_CYLGRP) != 0) flags |= GB_CKHASH; if (devvp->v_type == VREG) blkno = fragstoblks(fs, cgtod(fs, cg)); else blkno = fsbtodb(fs, cgtod(fs, cg)); error = breadn_flags(devvp, blkno, blkno, (int)fs->fs_cgsize, NULL, NULL, 0, NOCRED, flags, ffs_ckhash_cg, &bp); if (error != 0) return (error); cgp = (struct cg *)bp->b_data; if ((fs->fs_metackhash & CK_CYLGRP) != 0 && (bp->b_flags & B_CKHASH) != 0 && cgp->cg_ckhash != bp->b_ckhash) { sfs = ffs_getmntstat(devvp); printf("UFS %s%s (%s) cylinder checksum failed: cg %u, cgp: " "0x%x != bp: 0x%jx\n", devvp->v_type == VCHR ? "" : "snapshot of ", sfs->f_mntfromname, sfs->f_mntonname, cg, cgp->cg_ckhash, (uintmax_t)bp->b_ckhash); bp->b_flags &= ~B_CKHASH; bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); return (EIO); } if (!cg_chkmagic(cgp) || cgp->cg_cgx != cg) { sfs = ffs_getmntstat(devvp); printf("UFS %s%s (%s)", devvp->v_type == VCHR ? "" : "snapshot of ", sfs->f_mntfromname, sfs->f_mntonname); if (!cg_chkmagic(cgp)) printf(" cg %u: bad magic number 0x%x should be 0x%x\n", cg, cgp->cg_magic, CG_MAGIC); else printf(": wrong cylinder group cg %u != cgx %u\n", cg, cgp->cg_cgx); bp->b_flags &= ~B_CKHASH; bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); return (EIO); } bp->b_flags &= ~B_CKHASH; bp->b_xflags |= BX_BKGRDWRITE; /* * If we are using check hashes on the cylinder group then we want * to limit changing the cylinder group time to when we are actually * going to write it to disk so that its check hash remains correct * in memory. If the CK_CYLGRP flag is set the time is updated in * ffs_bufwrite() as the buffer is queued for writing. Otherwise we * update the time here as we have done historically. */ if ((fs->fs_metackhash & CK_CYLGRP) != 0) bp->b_xflags |= BX_CYLGRP; else cgp->cg_old_time = cgp->cg_time = time_second; *bpp = bp; *cgpp = cgp; return (0); } static void ffs_ckhash_cg(bp) struct buf *bp; { uint32_t ckhash; struct cg *cgp; cgp = (struct cg *)bp->b_data; ckhash = cgp->cg_ckhash; cgp->cg_ckhash = 0; bp->b_ckhash = calculate_crc32c(~0L, bp->b_data, bp->b_bcount); cgp->cg_ckhash = ckhash; } /* * Fserr prints the name of a filesystem with an error diagnostic. * * The form of the error message is: * fs: error message */ void ffs_fserr(fs, inum, cp) struct fs *fs; ino_t inum; char *cp; { struct thread *td = curthread; /* XXX */ struct proc *p = td->td_proc; log(LOG_ERR, "pid %d (%s), uid %d inumber %ju on %s: %s\n", p->p_pid, p->p_comm, td->td_ucred->cr_uid, (uintmax_t)inum, fs->fs_fsmnt, cp); } /* * This function provides the capability for the fsck program to * update an active filesystem. Fourteen operations are provided: * * adjrefcnt(inode, amt) - adjusts the reference count on the * specified inode by the specified amount. Under normal * operation the count should always go down. Decrementing * the count to zero will cause the inode to be freed. * adjblkcnt(inode, amt) - adjust the number of blocks used by the * inode by the specified amount. * setsize(inode, size) - set the size of the inode to the * specified size. * adjndir, adjbfree, adjifree, adjffree, adjnumclusters(amt) - * adjust the superblock summary. * freedirs(inode, count) - directory inodes [inode..inode + count - 1] * are marked as free. Inodes should never have to be marked * as in use. * freefiles(inode, count) - file inodes [inode..inode + count - 1] * are marked as free. Inodes should never have to be marked * as in use. * freeblks(blockno, size) - blocks [blockno..blockno + size - 1] * are marked as free. Blocks should never have to be marked * as in use. * setflags(flags, set/clear) - the fs_flags field has the specified * flags set (second parameter +1) or cleared (second parameter -1). * setcwd(dirinode) - set the current directory to dirinode in the * filesystem associated with the snapshot. * setdotdot(oldvalue, newvalue) - Verify that the inode number for ".." * in the current directory is oldvalue then change it to newvalue. * unlink(nameptr, oldvalue) - Verify that the inode number associated * with nameptr in the current directory is oldvalue then unlink it. */ static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vfs_ffs, FFS_ADJ_REFCNT, adjrefcnt, CTLFLAG_WR | CTLTYPE_STRUCT | CTLFLAG_NEEDGIANT, 0, 0, sysctl_ffs_fsck, "S,fsck", "Adjust Inode Reference Count"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_BLKCNT, adjblkcnt, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Adjust Inode Used Blocks Count"); static SYSCTL_NODE(_vfs_ffs, FFS_SET_SIZE, setsize, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Set the inode size"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NDIR, adjndir, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Adjust number of directories"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NBFREE, adjnbfree, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Adjust number of free blocks"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NIFREE, adjnifree, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Adjust number of free inodes"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NFFREE, adjnffree, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Adjust number of free frags"); static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NUMCLUSTERS, adjnumclusters, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Adjust number of free clusters"); static SYSCTL_NODE(_vfs_ffs, FFS_DIR_FREE, freedirs, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Free Range of Directory Inodes"); static SYSCTL_NODE(_vfs_ffs, FFS_FILE_FREE, freefiles, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Free Range of File Inodes"); static SYSCTL_NODE(_vfs_ffs, FFS_BLK_FREE, freeblks, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Free Range of Blocks"); static SYSCTL_NODE(_vfs_ffs, FFS_SET_FLAGS, setflags, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Change Filesystem Flags"); static SYSCTL_NODE(_vfs_ffs, FFS_SET_CWD, setcwd, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Set Current Working Directory"); static SYSCTL_NODE(_vfs_ffs, FFS_SET_DOTDOT, setdotdot, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Change Value of .. Entry"); static SYSCTL_NODE(_vfs_ffs, FFS_UNLINK, unlink, CTLFLAG_WR | CTLFLAG_NEEDGIANT, sysctl_ffs_fsck, "Unlink a Duplicate Name"); #ifdef DIAGNOSTIC static int fsckcmds = 0; SYSCTL_INT(_debug, OID_AUTO, ffs_fsckcmds, CTLFLAG_RW, &fsckcmds, 0, "print out fsck_ffs-based filesystem update commands"); #endif /* DIAGNOSTIC */ static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) { struct thread *td = curthread; struct fsck_cmd cmd; struct ufsmount *ump; struct vnode *vp, *dvp, *fdvp; struct inode *ip, *dp; struct mount *mp; struct fs *fs; struct pwd *pwd; ufs2_daddr_t blkno; long blkcnt, blksize; u_long key; struct file *fp; cap_rights_t rights; int filetype, error; if (req->newlen > sizeof cmd) return (EBADRPC); if ((error = SYSCTL_IN(req, &cmd, sizeof cmd)) != 0) return (error); if (cmd.version != FFS_CMD_VERSION) return (ERPCMISMATCH); if ((error = getvnode(td, cmd.handle, cap_rights_init(&rights, CAP_FSCK), &fp)) != 0) return (error); vp = fp->f_data; if (vp->v_type != VREG && vp->v_type != VDIR) { fdrop(fp, td); return (EINVAL); } vn_start_write(vp, &mp, V_WAIT); if (mp == NULL || strncmp(mp->mnt_stat.f_fstypename, "ufs", MFSNAMELEN)) { vn_finished_write(mp); fdrop(fp, td); return (EINVAL); } ump = VFSTOUFS(mp); if ((mp->mnt_flag & MNT_RDONLY) && ump->um_fsckpid != td->td_proc->p_pid) { vn_finished_write(mp); fdrop(fp, td); return (EROFS); } fs = ump->um_fs; filetype = IFREG; switch (oidp->oid_number) { case FFS_SET_FLAGS: #ifdef DIAGNOSTIC if (fsckcmds) printf("%s: %s flags\n", mp->mnt_stat.f_mntonname, cmd.size > 0 ? "set" : "clear"); #endif /* DIAGNOSTIC */ if (cmd.size > 0) fs->fs_flags |= (long)cmd.value; else fs->fs_flags &= ~(long)cmd.value; break; case FFS_ADJ_REFCNT: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: adjust inode %jd link count by %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, (intmax_t)cmd.size); } #endif /* DIAGNOSTIC */ if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) break; ip = VTOI(vp); ip->i_nlink += cmd.size; DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_effnlink += cmd.size; UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED); error = ffs_update(vp, 1); if (DOINGSOFTDEP(vp)) softdep_change_linkcnt(ip); vput(vp); break; case FFS_ADJ_BLKCNT: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: adjust inode %jd block count by %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, (intmax_t)cmd.size); } #endif /* DIAGNOSTIC */ if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) break; ip = VTOI(vp); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + cmd.size); UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_MODIFIED); error = ffs_update(vp, 1); vput(vp); break; case FFS_SET_SIZE: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: set inode %jd size to %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, (intmax_t)cmd.size); } #endif /* DIAGNOSTIC */ if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) break; ip = VTOI(vp); DIP_SET(ip, i_size, cmd.size); UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_MODIFIED); error = ffs_update(vp, 1); vput(vp); break; case FFS_DIR_FREE: filetype = IFDIR; /* fall through */ case FFS_FILE_FREE: #ifdef DIAGNOSTIC if (fsckcmds) { if (cmd.size == 1) printf("%s: free %s inode %ju\n", mp->mnt_stat.f_mntonname, filetype == IFDIR ? "directory" : "file", (uintmax_t)cmd.value); else printf("%s: free %s inodes %ju-%ju\n", mp->mnt_stat.f_mntonname, filetype == IFDIR ? "directory" : "file", (uintmax_t)cmd.value, (uintmax_t)(cmd.value + cmd.size - 1)); } #endif /* DIAGNOSTIC */ while (cmd.size > 0) { if ((error = ffs_freefile(ump, fs, ump->um_devvp, cmd.value, filetype, NULL))) break; cmd.size -= 1; cmd.value += 1; } break; case FFS_BLK_FREE: #ifdef DIAGNOSTIC if (fsckcmds) { if (cmd.size == 1) printf("%s: free block %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); else printf("%s: free blocks %jd-%jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, (intmax_t)cmd.value + cmd.size - 1); } #endif /* DIAGNOSTIC */ blkno = cmd.value; blkcnt = cmd.size; blksize = fs->fs_frag - (blkno % fs->fs_frag); key = ffs_blkrelease_start(ump, ump->um_devvp, UFS_ROOTINO); while (blkcnt > 0) { if (blkcnt < blksize) blksize = blkcnt; ffs_blkfree(ump, fs, ump->um_devvp, blkno, blksize * fs->fs_fsize, UFS_ROOTINO, VDIR, NULL, key); blkno += blksize; blkcnt -= blksize; blksize = fs->fs_frag; } ffs_blkrelease_finish(ump, key); break; /* * Adjust superblock summaries. fsck(8) is expected to * submit deltas when necessary. */ case FFS_ADJ_NDIR: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: adjust number of directories by %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DIAGNOSTIC */ fs->fs_cstotal.cs_ndir += cmd.value; break; case FFS_ADJ_NBFREE: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: adjust number of free blocks by %+jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DIAGNOSTIC */ fs->fs_cstotal.cs_nbfree += cmd.value; break; case FFS_ADJ_NIFREE: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: adjust number of free inodes by %+jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DIAGNOSTIC */ fs->fs_cstotal.cs_nifree += cmd.value; break; case FFS_ADJ_NFFREE: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: adjust number of free frags by %+jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DIAGNOSTIC */ fs->fs_cstotal.cs_nffree += cmd.value; break; case FFS_ADJ_NUMCLUSTERS: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: adjust number of free clusters by %+jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DIAGNOSTIC */ fs->fs_cstotal.cs_numclusters += cmd.value; break; case FFS_SET_CWD: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: set current directory to inode %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); } #endif /* DIAGNOSTIC */ if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_SHARED, &vp))) break; AUDIT_ARG_VNODE1(vp); if ((error = change_dir(vp, td)) != 0) { vput(vp); break; } VOP_UNLOCK(vp); pwd_chdir(td, vp); break; case FFS_SET_DOTDOT: #ifdef DIAGNOSTIC if (fsckcmds) { printf("%s: change .. in cwd from %jd to %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, (intmax_t)cmd.size); } #endif /* DIAGNOSTIC */ /* * First we have to get and lock the parent directory * to which ".." points. */ error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &fdvp); if (error) break; /* * Now we get and lock the child directory containing "..". */ pwd = pwd_hold(td); dvp = pwd->pwd_cdir; - if ((error = vget(dvp, LK_EXCLUSIVE, td)) != 0) { + if ((error = vget(dvp, LK_EXCLUSIVE)) != 0) { vput(fdvp); pwd_drop(pwd); break; } dp = VTOI(dvp); dp->i_offset = 12; /* XXX mastertemplate.dot_reclen */ error = ufs_dirrewrite(dp, VTOI(fdvp), (ino_t)cmd.size, DT_DIR, 0); cache_purge(fdvp); cache_purge(dvp); vput(dvp); vput(fdvp); pwd_drop(pwd); break; case FFS_UNLINK: #ifdef DIAGNOSTIC if (fsckcmds) { char buf[32]; if (copyinstr((char *)(intptr_t)cmd.value, buf,32,NULL)) strncpy(buf, "Name_too_long", 32); printf("%s: unlink %s (inode %jd)\n", mp->mnt_stat.f_mntonname, buf, (intmax_t)cmd.size); } #endif /* DIAGNOSTIC */ /* * kern_funlinkat will do its own start/finish writes and * they do not nest, so drop ours here. Setting mp == NULL * indicates that vn_finished_write is not needed down below. */ vn_finished_write(mp); mp = NULL; error = kern_funlinkat(td, AT_FDCWD, (char *)(intptr_t)cmd.value, FD_NONE, UIO_USERSPACE, 0, (ino_t)cmd.size); break; default: #ifdef DIAGNOSTIC if (fsckcmds) { printf("Invalid request %d from fsck\n", oidp->oid_number); } #endif /* DIAGNOSTIC */ error = EINVAL; break; } fdrop(fp, td); vn_finished_write(mp); return (error); } Index: projects/clang1100-import/sys/ufs/ffs/ffs_softdep.c =================================================================== --- projects/clang1100-import/sys/ufs/ffs/ffs_softdep.c (revision 364278) +++ projects/clang1100-import/sys/ufs/ffs/ffs_softdep.c (revision 364279) @@ -1,14827 +1,14826 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright 1998, 2000 Marshall Kirk McKusick. * Copyright 2009, 2010 Jeffrey W. Roberson * All rights reserved. * * The soft updates code is derived from the appendix of a University * of Michigan technical report (Gregory R. Ganger and Yale N. Patt, * "Soft Updates: A Solution to the Metadata Update Problem in File * Systems", CSE-TR-254-95, August 1995). * * Further information about soft updates can be obtained from: * * Marshall Kirk McKusick http://www.mckusick.com/softdep/ * 1614 Oxford Street mckusick@mckusick.com * Berkeley, CA 94709-1608 +1-510-843-9542 * USA * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * from: @(#)ffs_softdep.c 9.59 (McKusick) 6/21/00 */ #include __FBSDID("$FreeBSD$"); #include "opt_ffs.h" #include "opt_quota.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define KTR_SUJ 0 /* Define to KTR_SPARE. */ #ifndef SOFTUPDATES int softdep_flushfiles(oldmnt, flags, td) struct mount *oldmnt; int flags; struct thread *td; { panic("softdep_flushfiles called"); } int softdep_mount(devvp, mp, fs, cred) struct vnode *devvp; struct mount *mp; struct fs *fs; struct ucred *cred; { return (0); } void softdep_initialize() { return; } void softdep_uninitialize() { return; } void softdep_unmount(mp) struct mount *mp; { panic("softdep_unmount called"); } void softdep_setup_sbupdate(ump, fs, bp) struct ufsmount *ump; struct fs *fs; struct buf *bp; { panic("softdep_setup_sbupdate called"); } void softdep_setup_inomapdep(bp, ip, newinum, mode) struct buf *bp; struct inode *ip; ino_t newinum; int mode; { panic("softdep_setup_inomapdep called"); } void softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) struct buf *bp; struct mount *mp; ufs2_daddr_t newblkno; int frags; int oldfrags; { panic("softdep_setup_blkmapdep called"); } void softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; ufs_lbn_t lbn; ufs2_daddr_t newblkno; ufs2_daddr_t oldblkno; long newsize; long oldsize; struct buf *bp; { panic("softdep_setup_allocdirect called"); } void softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; ufs_lbn_t lbn; ufs2_daddr_t newblkno; ufs2_daddr_t oldblkno; long newsize; long oldsize; struct buf *bp; { panic("softdep_setup_allocext called"); } void softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) struct inode *ip; ufs_lbn_t lbn; struct buf *bp; int ptrno; ufs2_daddr_t newblkno; ufs2_daddr_t oldblkno; struct buf *nbp; { panic("softdep_setup_allocindir_page called"); } void softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) struct buf *nbp; struct inode *ip; struct buf *bp; int ptrno; ufs2_daddr_t newblkno; { panic("softdep_setup_allocindir_meta called"); } void softdep_journal_freeblocks(ip, cred, length, flags) struct inode *ip; struct ucred *cred; off_t length; int flags; { panic("softdep_journal_freeblocks called"); } void softdep_journal_fsync(ip) struct inode *ip; { panic("softdep_journal_fsync called"); } void softdep_setup_freeblocks(ip, length, flags) struct inode *ip; off_t length; int flags; { panic("softdep_setup_freeblocks called"); } void softdep_freefile(pvp, ino, mode) struct vnode *pvp; ino_t ino; int mode; { panic("softdep_freefile called"); } int softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) struct buf *bp; struct inode *dp; off_t diroffset; ino_t newinum; struct buf *newdirbp; int isnewblk; { panic("softdep_setup_directory_add called"); } void softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) struct buf *bp; struct inode *dp; caddr_t base; caddr_t oldloc; caddr_t newloc; int entrysize; { panic("softdep_change_directoryentry_offset called"); } void softdep_setup_remove(bp, dp, ip, isrmdir) struct buf *bp; struct inode *dp; struct inode *ip; int isrmdir; { panic("softdep_setup_remove called"); } void softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) struct buf *bp; struct inode *dp; struct inode *ip; ino_t newinum; int isrmdir; { panic("softdep_setup_directory_change called"); } void softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) struct mount *mp; struct buf *bp; ufs2_daddr_t blkno; int frags; struct workhead *wkhd; { panic("%s called", __FUNCTION__); } void softdep_setup_inofree(mp, bp, ino, wkhd) struct mount *mp; struct buf *bp; ino_t ino; struct workhead *wkhd; { panic("%s called", __FUNCTION__); } void softdep_setup_unlink(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_setup_link(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_revert_link(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_setup_rmdir(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_revert_rmdir(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_setup_create(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_revert_create(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_setup_mkdir(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_revert_mkdir(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } void softdep_setup_dotdot_link(dp, ip) struct inode *dp; struct inode *ip; { panic("%s called", __FUNCTION__); } int softdep_prealloc(vp, waitok) struct vnode *vp; int waitok; { panic("%s called", __FUNCTION__); } int softdep_journal_lookup(mp, vpp) struct mount *mp; struct vnode **vpp; { return (ENOENT); } void softdep_change_linkcnt(ip) struct inode *ip; { panic("softdep_change_linkcnt called"); } void softdep_load_inodeblock(ip) struct inode *ip; { panic("softdep_load_inodeblock called"); } void softdep_update_inodeblock(ip, bp, waitfor) struct inode *ip; struct buf *bp; int waitfor; { panic("softdep_update_inodeblock called"); } int softdep_fsync(vp) struct vnode *vp; /* the "in_core" copy of the inode */ { return (0); } void softdep_fsync_mountdev(vp) struct vnode *vp; { return; } int softdep_flushworklist(oldmnt, countp, td) struct mount *oldmnt; int *countp; struct thread *td; { *countp = 0; return (0); } int softdep_sync_metadata(struct vnode *vp) { panic("softdep_sync_metadata called"); } int softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor) { panic("softdep_sync_buf called"); } int softdep_slowdown(vp) struct vnode *vp; { panic("softdep_slowdown called"); } int softdep_request_cleanup(fs, vp, cred, resource) struct fs *fs; struct vnode *vp; struct ucred *cred; int resource; { return (0); } int softdep_check_suspend(struct mount *mp, struct vnode *devvp, int softdep_depcnt, int softdep_accdepcnt, int secondary_writes, int secondary_accwrites) { struct bufobj *bo; int error; (void) softdep_depcnt, (void) softdep_accdepcnt; bo = &devvp->v_bufobj; ASSERT_BO_WLOCKED(bo); MNT_ILOCK(mp); while (mp->mnt_secondary_writes != 0) { BO_UNLOCK(bo); msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), (PUSER - 1) | PDROP, "secwr", 0); BO_LOCK(bo); MNT_ILOCK(mp); } /* * Reasons for needing more work before suspend: * - Dirty buffers on devvp. * - Secondary writes occurred after start of vnode sync loop */ error = 0; if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0 || secondary_writes != 0 || mp->mnt_secondary_writes != 0 || secondary_accwrites != mp->mnt_secondary_accwrites) error = EAGAIN; BO_UNLOCK(bo); return (error); } void softdep_get_depcounts(struct mount *mp, int *softdepactivep, int *softdepactiveaccp) { (void) mp; *softdepactivep = 0; *softdepactiveaccp = 0; } void softdep_buf_append(bp, wkhd) struct buf *bp; struct workhead *wkhd; { panic("softdep_buf_appendwork called"); } void softdep_inode_append(ip, cred, wkhd) struct inode *ip; struct ucred *cred; struct workhead *wkhd; { panic("softdep_inode_appendwork called"); } void softdep_freework(wkhd) struct workhead *wkhd; { panic("softdep_freework called"); } #else FEATURE(softupdates, "FFS soft-updates support"); static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "soft updates stats"); static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "total dependencies allocated"); static SYSCTL_NODE(_debug_softdep, OID_AUTO, highuse, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "high use dependencies allocated"); static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "current dependencies allocated"); static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "current dependencies written"); unsigned long dep_current[D_LAST + 1]; unsigned long dep_highuse[D_LAST + 1]; unsigned long dep_total[D_LAST + 1]; unsigned long dep_write[D_LAST + 1]; #define SOFTDEP_TYPE(type, str, long) \ static MALLOC_DEFINE(M_ ## type, #str, long); \ SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD, \ &dep_total[D_ ## type], 0, ""); \ SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, \ &dep_current[D_ ## type], 0, ""); \ SYSCTL_ULONG(_debug_softdep_highuse, OID_AUTO, str, CTLFLAG_RD, \ &dep_highuse[D_ ## type], 0, ""); \ SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD, \ &dep_write[D_ ## type], 0, ""); SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies"); SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies"); SOFTDEP_TYPE(BMSAFEMAP, bmsafemap, "Block or frag allocated from cyl group map"); SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency"); SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode"); SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies"); SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block"); SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode"); SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode"); SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated"); SOFTDEP_TYPE(DIRADD, diradd, "New directory entry"); SOFTDEP_TYPE(MKDIR, mkdir, "New directory"); SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted"); SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block"); SOFTDEP_TYPE(FREEWORK, freework, "free an inode block"); SOFTDEP_TYPE(FREEDEP, freedep, "track a block free"); SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add"); SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove"); SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move"); SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block"); SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block"); SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag"); SOFTDEP_TYPE(JSEG, jseg, "Journal segment"); SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete"); SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency"); SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation"); SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete"); static MALLOC_DEFINE(M_SENTINEL, "sentinel", "Worklist sentinel"); static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes"); static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations"); static MALLOC_DEFINE(M_MOUNTDATA, "softdep", "Softdep per-mount data"); #define M_SOFTDEP_FLAGS (M_WAITOK) /* * translate from workitem type to memory type * MUST match the defines above, such that memtype[D_XXX] == M_XXX */ static struct malloc_type *memtype[] = { NULL, M_PAGEDEP, M_INODEDEP, M_BMSAFEMAP, M_NEWBLK, M_ALLOCDIRECT, M_INDIRDEP, M_ALLOCINDIR, M_FREEFRAG, M_FREEBLKS, M_FREEFILE, M_DIRADD, M_MKDIR, M_DIRREM, M_NEWDIRBLK, M_FREEWORK, M_FREEDEP, M_JADDREF, M_JREMREF, M_JMVREF, M_JNEWBLK, M_JFREEBLK, M_JFREEFRAG, M_JSEG, M_JSEGDEP, M_SBDEP, M_JTRUNC, M_JFSYNC, M_SENTINEL }; #define DtoM(type) (memtype[type]) /* * Names of malloc types. */ #define TYPENAME(type) \ ((unsigned)(type) <= D_LAST && (unsigned)(type) >= D_FIRST ? \ memtype[type]->ks_shortdesc : "???") /* * End system adaptation definitions. */ #define DOTDOT_OFFSET offsetof(struct dirtemplate, dotdot_ino) #define DOT_OFFSET offsetof(struct dirtemplate, dot_ino) /* * Internal function prototypes. */ static void check_clear_deps(struct mount *); static void softdep_error(char *, int); static int softdep_process_worklist(struct mount *, int); static int softdep_waitidle(struct mount *, int); static void drain_output(struct vnode *); static struct buf *getdirtybuf(struct buf *, struct rwlock *, int); static int check_inodedep_free(struct inodedep *); static void clear_remove(struct mount *); static void clear_inodedeps(struct mount *); static void unlinked_inodedep(struct mount *, struct inodedep *); static void clear_unlinked_inodedep(struct inodedep *); static struct inodedep *first_unlinked_inodedep(struct ufsmount *); static int flush_pagedep_deps(struct vnode *, struct mount *, struct diraddhd *); static int free_pagedep(struct pagedep *); static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t); static int flush_inodedep_deps(struct vnode *, struct mount *, ino_t); static int flush_deplist(struct allocdirectlst *, int, int *); static int sync_cgs(struct mount *, int); static int handle_written_filepage(struct pagedep *, struct buf *, int); static int handle_written_sbdep(struct sbdep *, struct buf *); static void initiate_write_sbdep(struct sbdep *); static void diradd_inode_written(struct diradd *, struct inodedep *); static int handle_written_indirdep(struct indirdep *, struct buf *, struct buf**, int); static int handle_written_inodeblock(struct inodedep *, struct buf *, int); static int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *, uint8_t *); static int handle_written_bmsafemap(struct bmsafemap *, struct buf *, int); static void handle_written_jaddref(struct jaddref *); static void handle_written_jremref(struct jremref *); static void handle_written_jseg(struct jseg *, struct buf *); static void handle_written_jnewblk(struct jnewblk *); static void handle_written_jblkdep(struct jblkdep *); static void handle_written_jfreefrag(struct jfreefrag *); static void complete_jseg(struct jseg *); static void complete_jsegs(struct jseg *); static void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *); static void jaddref_write(struct jaddref *, struct jseg *, uint8_t *); static void jremref_write(struct jremref *, struct jseg *, uint8_t *); static void jmvref_write(struct jmvref *, struct jseg *, uint8_t *); static void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *); static void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data); static void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *); static void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *); static void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *); static inline void inoref_write(struct inoref *, struct jseg *, struct jrefrec *); static void handle_allocdirect_partdone(struct allocdirect *, struct workhead *); static struct jnewblk *cancel_newblk(struct newblk *, struct worklist *, struct workhead *); static void indirdep_complete(struct indirdep *); static int indirblk_lookup(struct mount *, ufs2_daddr_t); static void indirblk_insert(struct freework *); static void indirblk_remove(struct freework *); static void handle_allocindir_partdone(struct allocindir *); static void initiate_write_filepage(struct pagedep *, struct buf *); static void initiate_write_indirdep(struct indirdep*, struct buf *); static void handle_written_mkdir(struct mkdir *, int); static int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *, uint8_t *); static void initiate_write_bmsafemap(struct bmsafemap *, struct buf *); static void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *); static void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *); static void handle_workitem_freefile(struct freefile *); static int handle_workitem_remove(struct dirrem *, int); static struct dirrem *newdirrem(struct buf *, struct inode *, struct inode *, int, struct dirrem **); static struct indirdep *indirdep_lookup(struct mount *, struct inode *, struct buf *); static void cancel_indirdep(struct indirdep *, struct buf *, struct freeblks *); static void free_indirdep(struct indirdep *); static void free_diradd(struct diradd *, struct workhead *); static void merge_diradd(struct inodedep *, struct diradd *); static void complete_diradd(struct diradd *); static struct diradd *diradd_lookup(struct pagedep *, int); static struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *, struct jremref *); static struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *, struct jremref *); static void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *, struct jremref *, struct jremref *); static void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *, struct jremref *); static void cancel_allocindir(struct allocindir *, struct buf *bp, struct freeblks *, int); static int setup_trunc_indir(struct freeblks *, struct inode *, ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t); static void complete_trunc_indir(struct freework *); static void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *, int); static void complete_mkdir(struct mkdir *); static void free_newdirblk(struct newdirblk *); static void free_jremref(struct jremref *); static void free_jaddref(struct jaddref *); static void free_jsegdep(struct jsegdep *); static void free_jsegs(struct jblocks *); static void rele_jseg(struct jseg *); static void free_jseg(struct jseg *, struct jblocks *); static void free_jnewblk(struct jnewblk *); static void free_jblkdep(struct jblkdep *); static void free_jfreefrag(struct jfreefrag *); static void free_freedep(struct freedep *); static void journal_jremref(struct dirrem *, struct jremref *, struct inodedep *); static void cancel_jnewblk(struct jnewblk *, struct workhead *); static int cancel_jaddref(struct jaddref *, struct inodedep *, struct workhead *); static void cancel_jfreefrag(struct jfreefrag *); static inline void setup_freedirect(struct freeblks *, struct inode *, int, int); static inline void setup_freeext(struct freeblks *, struct inode *, int, int); static inline void setup_freeindir(struct freeblks *, struct inode *, int, ufs_lbn_t, int); static inline struct freeblks *newfreeblks(struct mount *, struct inode *); static void freeblks_free(struct ufsmount *, struct freeblks *, int); static void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t); static ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t); static int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int); static void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t, int, int); static void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int); static int cancel_pagedep(struct pagedep *, struct freeblks *, int); static int deallocate_dependencies(struct buf *, struct freeblks *, int); static void newblk_freefrag(struct newblk*); static void free_newblk(struct newblk *); static void cancel_allocdirect(struct allocdirectlst *, struct allocdirect *, struct freeblks *); static int check_inode_unwritten(struct inodedep *); static int free_inodedep(struct inodedep *); static void freework_freeblock(struct freework *, u_long); static void freework_enqueue(struct freework *); static int handle_workitem_freeblocks(struct freeblks *, int); static int handle_complete_freeblocks(struct freeblks *, int); static void handle_workitem_indirblk(struct freework *); static void handle_written_freework(struct freework *); static void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *); static struct worklist *jnewblk_merge(struct worklist *, struct worklist *, struct workhead *); static struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *, struct inodedep *, struct allocindir *, ufs_lbn_t); static struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t, ufs2_daddr_t, ufs_lbn_t); static void handle_workitem_freefrag(struct freefrag *); static struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long, ufs_lbn_t, u_long); static void allocdirect_merge(struct allocdirectlst *, struct allocdirect *, struct allocdirect *); static struct freefrag *allocindir_merge(struct allocindir *, struct allocindir *); static int bmsafemap_find(struct bmsafemap_hashhead *, int, struct bmsafemap **); static struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *, int cg, struct bmsafemap *); static int newblk_find(struct newblk_hashhead *, ufs2_daddr_t, int, struct newblk **); static int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **); static int inodedep_find(struct inodedep_hashhead *, ino_t, struct inodedep **); static int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **); static int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t, int, struct pagedep **); static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t, struct pagedep **); static void pause_timer(void *); static int request_cleanup(struct mount *, int); static int softdep_request_cleanup_flush(struct mount *, struct ufsmount *); static void schedule_cleanup(struct mount *); static void softdep_ast_cleanup_proc(struct thread *); static struct ufsmount *softdep_bp_to_mp(struct buf *bp); static int process_worklist_item(struct mount *, int, int); static void process_removes(struct vnode *); static void process_truncates(struct vnode *); static void jwork_move(struct workhead *, struct workhead *); static void jwork_insert(struct workhead *, struct jsegdep *); static void add_to_worklist(struct worklist *, int); static void wake_worklist(struct worklist *); static void wait_worklist(struct worklist *, char *); static void remove_from_worklist(struct worklist *); static void softdep_flush(void *); static void softdep_flushjournal(struct mount *); static int softdep_speedup(struct ufsmount *); static void worklist_speedup(struct mount *); static int journal_mount(struct mount *, struct fs *, struct ucred *); static void journal_unmount(struct ufsmount *); static int journal_space(struct ufsmount *, int); static void journal_suspend(struct ufsmount *); static int journal_unsuspend(struct ufsmount *ump); static void softdep_prelink(struct vnode *, struct vnode *); static void add_to_journal(struct worklist *); static void remove_from_journal(struct worklist *); static bool softdep_excess_items(struct ufsmount *, int); static void softdep_process_journal(struct mount *, struct worklist *, int); static struct jremref *newjremref(struct dirrem *, struct inode *, struct inode *ip, off_t, nlink_t); static struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t, uint16_t); static inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t, uint16_t); static inline struct jsegdep *inoref_jseg(struct inoref *); static struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t); static struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t, ufs2_daddr_t, int); static void adjust_newfreework(struct freeblks *, int); static struct jtrunc *newjtrunc(struct freeblks *, off_t, int); static void move_newblock_dep(struct jaddref *, struct inodedep *); static void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t); static struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *, ufs2_daddr_t, long, ufs_lbn_t); static struct freework *newfreework(struct ufsmount *, struct freeblks *, struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int); static int jwait(struct worklist *, int); static struct inodedep *inodedep_lookup_ip(struct inode *); static int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *); static struct freefile *handle_bufwait(struct inodedep *, struct workhead *); static void handle_jwork(struct workhead *); static struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *, struct mkdir **); static struct jblocks *jblocks_create(void); static ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *); static void jblocks_free(struct jblocks *, struct mount *, int); static void jblocks_destroy(struct jblocks *); static void jblocks_add(struct jblocks *, ufs2_daddr_t, int); /* * Exported softdep operations. */ static void softdep_disk_io_initiation(struct buf *); static void softdep_disk_write_complete(struct buf *); static void softdep_deallocate_dependencies(struct buf *); static int softdep_count_dependencies(struct buf *bp, int); /* * Global lock over all of soft updates. */ static struct mtx lk; MTX_SYSINIT(softdep_lock, &lk, "global softdep", MTX_DEF); #define ACQUIRE_GBLLOCK(lk) mtx_lock(lk) #define FREE_GBLLOCK(lk) mtx_unlock(lk) #define GBLLOCK_OWNED(lk) mtx_assert((lk), MA_OWNED) /* * Per-filesystem soft-updates locking. */ #define LOCK_PTR(ump) (&(ump)->um_softdep->sd_fslock) #define TRY_ACQUIRE_LOCK(ump) rw_try_wlock(&(ump)->um_softdep->sd_fslock) #define ACQUIRE_LOCK(ump) rw_wlock(&(ump)->um_softdep->sd_fslock) #define FREE_LOCK(ump) rw_wunlock(&(ump)->um_softdep->sd_fslock) #define LOCK_OWNED(ump) rw_assert(&(ump)->um_softdep->sd_fslock, \ RA_WLOCKED) #define BUF_AREC(bp) lockallowrecurse(&(bp)->b_lock) #define BUF_NOREC(bp) lockdisablerecurse(&(bp)->b_lock) /* * Worklist queue management. * These routines require that the lock be held. */ #ifndef /* NOT */ INVARIANTS #define WORKLIST_INSERT(head, item) do { \ (item)->wk_state |= ONWORKLIST; \ LIST_INSERT_HEAD(head, item, wk_list); \ } while (0) #define WORKLIST_REMOVE(item) do { \ (item)->wk_state &= ~ONWORKLIST; \ LIST_REMOVE(item, wk_list); \ } while (0) #define WORKLIST_INSERT_UNLOCKED WORKLIST_INSERT #define WORKLIST_REMOVE_UNLOCKED WORKLIST_REMOVE #else /* INVARIANTS */ static void worklist_insert(struct workhead *, struct worklist *, int, const char *, int); static void worklist_remove(struct worklist *, int, const char *, int); #define WORKLIST_INSERT(head, item) \ worklist_insert(head, item, 1, __func__, __LINE__) #define WORKLIST_INSERT_UNLOCKED(head, item)\ worklist_insert(head, item, 0, __func__, __LINE__) #define WORKLIST_REMOVE(item)\ worklist_remove(item, 1, __func__, __LINE__) #define WORKLIST_REMOVE_UNLOCKED(item)\ worklist_remove(item, 0, __func__, __LINE__) static void worklist_insert(head, item, locked, func, line) struct workhead *head; struct worklist *item; int locked; const char *func; int line; { if (locked) LOCK_OWNED(VFSTOUFS(item->wk_mp)); if (item->wk_state & ONWORKLIST) panic("worklist_insert: %p %s(0x%X) already on list, " "added in function %s at line %d", item, TYPENAME(item->wk_type), item->wk_state, item->wk_func, item->wk_line); item->wk_state |= ONWORKLIST; item->wk_func = func; item->wk_line = line; LIST_INSERT_HEAD(head, item, wk_list); } static void worklist_remove(item, locked, func, line) struct worklist *item; int locked; const char *func; int line; { if (locked) LOCK_OWNED(VFSTOUFS(item->wk_mp)); if ((item->wk_state & ONWORKLIST) == 0) panic("worklist_remove: %p %s(0x%X) not on list, " "removed in function %s at line %d", item, TYPENAME(item->wk_type), item->wk_state, item->wk_func, item->wk_line); item->wk_state &= ~ONWORKLIST; item->wk_func = func; item->wk_line = line; LIST_REMOVE(item, wk_list); } #endif /* INVARIANTS */ /* * Merge two jsegdeps keeping only the oldest one as newer references * can't be discarded until after older references. */ static inline struct jsegdep * jsegdep_merge(struct jsegdep *one, struct jsegdep *two) { struct jsegdep *swp; if (two == NULL) return (one); if (one->jd_seg->js_seq > two->jd_seg->js_seq) { swp = one; one = two; two = swp; } WORKLIST_REMOVE(&two->jd_list); free_jsegdep(two); return (one); } /* * If two freedeps are compatible free one to reduce list size. */ static inline struct freedep * freedep_merge(struct freedep *one, struct freedep *two) { if (two == NULL) return (one); if (one->fd_freework == two->fd_freework) { WORKLIST_REMOVE(&two->fd_list); free_freedep(two); } return (one); } /* * Move journal work from one list to another. Duplicate freedeps and * jsegdeps are coalesced to keep the lists as small as possible. */ static void jwork_move(dst, src) struct workhead *dst; struct workhead *src; { struct freedep *freedep; struct jsegdep *jsegdep; struct worklist *wkn; struct worklist *wk; KASSERT(dst != src, ("jwork_move: dst == src")); freedep = NULL; jsegdep = NULL; LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) { if (wk->wk_type == D_JSEGDEP) jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); else if (wk->wk_type == D_FREEDEP) freedep = freedep_merge(WK_FREEDEP(wk), freedep); } while ((wk = LIST_FIRST(src)) != NULL) { WORKLIST_REMOVE(wk); WORKLIST_INSERT(dst, wk); if (wk->wk_type == D_JSEGDEP) { jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep); continue; } if (wk->wk_type == D_FREEDEP) freedep = freedep_merge(WK_FREEDEP(wk), freedep); } } static void jwork_insert(dst, jsegdep) struct workhead *dst; struct jsegdep *jsegdep; { struct jsegdep *jsegdepn; struct worklist *wk; LIST_FOREACH(wk, dst, wk_list) if (wk->wk_type == D_JSEGDEP) break; if (wk == NULL) { WORKLIST_INSERT(dst, &jsegdep->jd_list); return; } jsegdepn = WK_JSEGDEP(wk); if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) { WORKLIST_REMOVE(wk); free_jsegdep(jsegdepn); WORKLIST_INSERT(dst, &jsegdep->jd_list); } else free_jsegdep(jsegdep); } /* * Routines for tracking and managing workitems. */ static void workitem_free(struct worklist *, int); static void workitem_alloc(struct worklist *, int, struct mount *); static void workitem_reassign(struct worklist *, int); #define WORKITEM_FREE(item, type) \ workitem_free((struct worklist *)(item), (type)) #define WORKITEM_REASSIGN(item, type) \ workitem_reassign((struct worklist *)(item), (type)) static void workitem_free(item, type) struct worklist *item; int type; { struct ufsmount *ump; #ifdef INVARIANTS if (item->wk_state & ONWORKLIST) panic("workitem_free: %s(0x%X) still on list, " "added in function %s at line %d", TYPENAME(item->wk_type), item->wk_state, item->wk_func, item->wk_line); if (item->wk_type != type && type != D_NEWBLK) panic("workitem_free: type mismatch %s != %s", TYPENAME(item->wk_type), TYPENAME(type)); #endif if (item->wk_state & IOWAITING) wakeup(item); ump = VFSTOUFS(item->wk_mp); LOCK_OWNED(ump); KASSERT(ump->softdep_deps > 0, ("workitem_free: %s: softdep_deps going negative", ump->um_fs->fs_fsmnt)); if (--ump->softdep_deps == 0 && ump->softdep_req) wakeup(&ump->softdep_deps); KASSERT(dep_current[item->wk_type] > 0, ("workitem_free: %s: dep_current[%s] going negative", ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type))); KASSERT(ump->softdep_curdeps[item->wk_type] > 0, ("workitem_free: %s: softdep_curdeps[%s] going negative", ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type))); atomic_subtract_long(&dep_current[item->wk_type], 1); ump->softdep_curdeps[item->wk_type] -= 1; #ifdef INVARIANTS LIST_REMOVE(item, wk_all); #endif free(item, DtoM(type)); } static void workitem_alloc(item, type, mp) struct worklist *item; int type; struct mount *mp; { struct ufsmount *ump; item->wk_type = type; item->wk_mp = mp; item->wk_state = 0; ump = VFSTOUFS(mp); ACQUIRE_GBLLOCK(&lk); dep_current[type]++; if (dep_current[type] > dep_highuse[type]) dep_highuse[type] = dep_current[type]; dep_total[type]++; FREE_GBLLOCK(&lk); ACQUIRE_LOCK(ump); ump->softdep_curdeps[type] += 1; ump->softdep_deps++; ump->softdep_accdeps++; #ifdef INVARIANTS LIST_INSERT_HEAD(&ump->softdep_alldeps[type], item, wk_all); #endif FREE_LOCK(ump); } static void workitem_reassign(item, newtype) struct worklist *item; int newtype; { struct ufsmount *ump; ump = VFSTOUFS(item->wk_mp); LOCK_OWNED(ump); KASSERT(ump->softdep_curdeps[item->wk_type] > 0, ("workitem_reassign: %s: softdep_curdeps[%s] going negative", VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type))); ump->softdep_curdeps[item->wk_type] -= 1; ump->softdep_curdeps[newtype] += 1; KASSERT(dep_current[item->wk_type] > 0, ("workitem_reassign: %s: dep_current[%s] going negative", VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type))); ACQUIRE_GBLLOCK(&lk); dep_current[newtype]++; dep_current[item->wk_type]--; if (dep_current[newtype] > dep_highuse[newtype]) dep_highuse[newtype] = dep_current[newtype]; dep_total[newtype]++; FREE_GBLLOCK(&lk); item->wk_type = newtype; } /* * Workitem queue management */ static int max_softdeps; /* maximum number of structs before slowdown */ static int tickdelay = 2; /* number of ticks to pause during slowdown */ static int proc_waiting; /* tracks whether we have a timeout posted */ static int *stat_countp; /* statistic to count in proc_waiting timeout */ static struct callout softdep_callout; static int req_clear_inodedeps; /* syncer process flush some inodedeps */ static int req_clear_remove; /* syncer process flush some freeblks */ static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */ /* * runtime statistics */ static int stat_flush_threads; /* number of softdep flushing threads */ static int stat_worklist_push; /* number of worklist cleanups */ static int stat_blk_limit_push; /* number of times block limit neared */ static int stat_ino_limit_push; /* number of times inode limit neared */ static int stat_blk_limit_hit; /* number of times block slowdown imposed */ static int stat_ino_limit_hit; /* number of times inode slowdown imposed */ static int stat_sync_limit_hit; /* number of synchronous slowdowns imposed */ static int stat_indir_blk_ptrs; /* bufs redirtied as indir ptrs not written */ static int stat_inode_bitmap; /* bufs redirtied as inode bitmap not written */ static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */ static int stat_dir_entry; /* bufs redirtied as dir entry cannot write */ static int stat_jaddref; /* bufs redirtied as ino bitmap can not write */ static int stat_jnewblk; /* bufs redirtied as blk bitmap can not write */ static int stat_journal_min; /* Times hit journal min threshold */ static int stat_journal_low; /* Times hit journal low threshold */ static int stat_journal_wait; /* Times blocked in jwait(). */ static int stat_jwait_filepage; /* Times blocked in jwait() for filepage. */ static int stat_jwait_freeblks; /* Times blocked in jwait() for freeblks. */ static int stat_jwait_inode; /* Times blocked in jwait() for inodes. */ static int stat_jwait_newblk; /* Times blocked in jwait() for newblks. */ static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */ static int stat_cleanup_blkrequests; /* Number of block cleanup requests */ static int stat_cleanup_inorequests; /* Number of inode cleanup requests */ static int stat_cleanup_retries; /* Number of cleanups that needed to flush */ static int stat_cleanup_failures; /* Number of cleanup requests that failed */ static int stat_emptyjblocks; /* Number of potentially empty journal blocks */ SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW, &max_softdeps, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW, &tickdelay, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, flush_threads, CTLFLAG_RD, &stat_flush_threads, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW | CTLFLAG_STATS, &stat_worklist_push, 0,""); SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW | CTLFLAG_STATS, &stat_blk_limit_push, 0,""); SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW | CTLFLAG_STATS, &stat_ino_limit_push, 0,""); SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW | CTLFLAG_STATS, &stat_blk_limit_hit, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW | CTLFLAG_STATS, &stat_ino_limit_hit, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW | CTLFLAG_STATS, &stat_sync_limit_hit, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW | CTLFLAG_STATS, &stat_indir_blk_ptrs, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW | CTLFLAG_STATS, &stat_inode_bitmap, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW | CTLFLAG_STATS, &stat_direct_blk_ptrs, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW | CTLFLAG_STATS, &stat_dir_entry, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW | CTLFLAG_STATS, &stat_jaddref, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW | CTLFLAG_STATS, &stat_jnewblk, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW | CTLFLAG_STATS, &stat_journal_low, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW | CTLFLAG_STATS, &stat_journal_min, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW | CTLFLAG_STATS, &stat_journal_wait, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_filepage, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_freeblks, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_inode, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_newblk, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_blkrequests, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_inorequests, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_high_delay, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_retries, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_failures, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW, &softdep_flushcache, 0, ""); SYSCTL_INT(_debug_softdep, OID_AUTO, emptyjblocks, CTLFLAG_RD, &stat_emptyjblocks, 0, ""); SYSCTL_DECL(_vfs_ffs); /* Whether to recompute the summary at mount time */ static int compute_summary_at_mount = 0; SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW, &compute_summary_at_mount, 0, "Recompute summary at mount"); static int print_threads = 0; SYSCTL_INT(_debug_softdep, OID_AUTO, print_threads, CTLFLAG_RW, &print_threads, 0, "Notify flusher thread start/stop"); /* List of all filesystems mounted with soft updates */ static TAILQ_HEAD(, mount_softdeps) softdepmounts; /* * This function cleans the worklist for a filesystem. * Each filesystem running with soft dependencies gets its own * thread to run in this function. The thread is started up in * softdep_mount and shutdown in softdep_unmount. They show up * as part of the kernel "bufdaemon" process whose process * entry is available in bufdaemonproc. */ static int searchfailed; extern struct proc *bufdaemonproc; static void softdep_flush(addr) void *addr; { struct mount *mp; struct thread *td; struct ufsmount *ump; td = curthread; td->td_pflags |= TDP_NORUNNINGBUF; mp = (struct mount *)addr; ump = VFSTOUFS(mp); atomic_add_int(&stat_flush_threads, 1); ACQUIRE_LOCK(ump); ump->softdep_flags &= ~FLUSH_STARTING; wakeup(&ump->softdep_flushtd); FREE_LOCK(ump); if (print_threads) { if (stat_flush_threads == 1) printf("Running %s at pid %d\n", bufdaemonproc->p_comm, bufdaemonproc->p_pid); printf("Start thread %s\n", td->td_name); } for (;;) { while (softdep_process_worklist(mp, 0) > 0 || (MOUNTEDSUJ(mp) && VFSTOUFS(mp)->softdep_jblocks->jb_suspended)) kthread_suspend_check(); ACQUIRE_LOCK(ump); if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0) msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, "sdflush", hz / 2); ump->softdep_flags &= ~FLUSH_CLEANUP; /* * Check to see if we are done and need to exit. */ if ((ump->softdep_flags & FLUSH_EXIT) == 0) { FREE_LOCK(ump); continue; } ump->softdep_flags &= ~FLUSH_EXIT; FREE_LOCK(ump); wakeup(&ump->softdep_flags); if (print_threads) printf("Stop thread %s: searchfailed %d, did cleanups %d\n", td->td_name, searchfailed, ump->um_softdep->sd_cleanups); atomic_subtract_int(&stat_flush_threads, 1); kthread_exit(); panic("kthread_exit failed\n"); } } static void worklist_speedup(mp) struct mount *mp; { struct ufsmount *ump; ump = VFSTOUFS(mp); LOCK_OWNED(ump); if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0) ump->softdep_flags |= FLUSH_CLEANUP; wakeup(&ump->softdep_flushtd); } static void softdep_send_speedup(struct ufsmount *ump, size_t shortage, u_int flags) { struct buf *bp; if ((ump->um_flags & UM_CANSPEEDUP) == 0) return; bp = malloc(sizeof(*bp), M_TRIM, M_WAITOK | M_ZERO); bp->b_iocmd = BIO_SPEEDUP; bp->b_ioflags = flags; bp->b_bcount = shortage; g_vfs_strategy(ump->um_bo, bp); bufwait(bp); free(bp, M_TRIM); } static int softdep_speedup(ump) struct ufsmount *ump; { struct ufsmount *altump; struct mount_softdeps *sdp; LOCK_OWNED(ump); worklist_speedup(ump->um_mountp); bd_speedup(); /* * If we have global shortages, then we need other * filesystems to help with the cleanup. Here we wakeup a * flusher thread for a filesystem that is over its fair * share of resources. */ if (req_clear_inodedeps || req_clear_remove) { ACQUIRE_GBLLOCK(&lk); TAILQ_FOREACH(sdp, &softdepmounts, sd_next) { if ((altump = sdp->sd_ump) == ump) continue; if (((req_clear_inodedeps && altump->softdep_curdeps[D_INODEDEP] > max_softdeps / stat_flush_threads) || (req_clear_remove && altump->softdep_curdeps[D_DIRREM] > (max_softdeps / 2) / stat_flush_threads)) && TRY_ACQUIRE_LOCK(altump)) break; } if (sdp == NULL) { searchfailed++; FREE_GBLLOCK(&lk); } else { /* * Move to the end of the list so we pick a * different one on out next try. */ TAILQ_REMOVE(&softdepmounts, sdp, sd_next); TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next); FREE_GBLLOCK(&lk); if ((altump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0) altump->softdep_flags |= FLUSH_CLEANUP; altump->um_softdep->sd_cleanups++; wakeup(&altump->softdep_flushtd); FREE_LOCK(altump); } } return (speedup_syncer()); } /* * Add an item to the end of the work queue. * This routine requires that the lock be held. * This is the only routine that adds items to the list. * The following routine is the only one that removes items * and does so in order from first to last. */ #define WK_HEAD 0x0001 /* Add to HEAD. */ #define WK_NODELAY 0x0002 /* Process immediately. */ static void add_to_worklist(wk, flags) struct worklist *wk; int flags; { struct ufsmount *ump; ump = VFSTOUFS(wk->wk_mp); LOCK_OWNED(ump); if (wk->wk_state & ONWORKLIST) panic("add_to_worklist: %s(0x%X) already on list", TYPENAME(wk->wk_type), wk->wk_state); wk->wk_state |= ONWORKLIST; if (ump->softdep_on_worklist == 0) { LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); ump->softdep_worklist_tail = wk; } else if (flags & WK_HEAD) { LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list); } else { LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list); ump->softdep_worklist_tail = wk; } ump->softdep_on_worklist += 1; if (flags & WK_NODELAY) worklist_speedup(wk->wk_mp); } /* * Remove the item to be processed. If we are removing the last * item on the list, we need to recalculate the tail pointer. */ static void remove_from_worklist(wk) struct worklist *wk; { struct ufsmount *ump; ump = VFSTOUFS(wk->wk_mp); if (ump->softdep_worklist_tail == wk) ump->softdep_worklist_tail = (struct worklist *)wk->wk_list.le_prev; WORKLIST_REMOVE(wk); ump->softdep_on_worklist -= 1; } static void wake_worklist(wk) struct worklist *wk; { if (wk->wk_state & IOWAITING) { wk->wk_state &= ~IOWAITING; wakeup(wk); } } static void wait_worklist(wk, wmesg) struct worklist *wk; char *wmesg; { struct ufsmount *ump; ump = VFSTOUFS(wk->wk_mp); wk->wk_state |= IOWAITING; msleep(wk, LOCK_PTR(ump), PVM, wmesg, 0); } /* * Process that runs once per second to handle items in the background queue. * * Note that we ensure that everything is done in the order in which they * appear in the queue. The code below depends on this property to ensure * that blocks of a file are freed before the inode itself is freed. This * ordering ensures that no new triples will be generated * until all the old ones have been purged from the dependency lists. */ static int softdep_process_worklist(mp, full) struct mount *mp; int full; { int cnt, matchcnt; struct ufsmount *ump; long starttime; KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp")); if (MOUNTEDSOFTDEP(mp) == 0) return (0); matchcnt = 0; ump = VFSTOUFS(mp); ACQUIRE_LOCK(ump); starttime = time_second; softdep_process_journal(mp, NULL, full ? MNT_WAIT : 0); check_clear_deps(mp); while (ump->softdep_on_worklist > 0) { if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0) break; else matchcnt += cnt; check_clear_deps(mp); /* * We do not generally want to stop for buffer space, but if * we are really being a buffer hog, we will stop and wait. */ if (should_yield()) { FREE_LOCK(ump); kern_yield(PRI_USER); bwillwrite(); ACQUIRE_LOCK(ump); } /* * Never allow processing to run for more than one * second. This gives the syncer thread the opportunity * to pause if appropriate. */ if (!full && starttime != time_second) break; } if (full == 0) journal_unsuspend(ump); FREE_LOCK(ump); return (matchcnt); } /* * Process all removes associated with a vnode if we are running out of * journal space. Any other process which attempts to flush these will * be unable as we have the vnodes locked. */ static void process_removes(vp) struct vnode *vp; { struct inodedep *inodedep; struct dirrem *dirrem; struct ufsmount *ump; struct mount *mp; ino_t inum; mp = vp->v_mount; ump = VFSTOUFS(mp); LOCK_OWNED(ump); inum = VTOI(vp)->i_number; for (;;) { top: if (inodedep_lookup(mp, inum, 0, &inodedep) == 0) return; LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) { /* * If another thread is trying to lock this vnode * it will fail but we must wait for it to do so * before we can proceed. */ if (dirrem->dm_state & INPROGRESS) { wait_worklist(&dirrem->dm_list, "pwrwait"); goto top; } if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) == (COMPLETE | ONWORKLIST)) break; } if (dirrem == NULL) return; remove_from_worklist(&dirrem->dm_list); FREE_LOCK(ump); if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) panic("process_removes: suspended filesystem"); handle_workitem_remove(dirrem, 0); vn_finished_secondary_write(mp); ACQUIRE_LOCK(ump); } } /* * Process all truncations associated with a vnode if we are running out * of journal space. This is called when the vnode lock is already held * and no other process can clear the truncation. This function returns * a value greater than zero if it did any work. */ static void process_truncates(vp) struct vnode *vp; { struct inodedep *inodedep; struct freeblks *freeblks; struct ufsmount *ump; struct mount *mp; ino_t inum; int cgwait; mp = vp->v_mount; ump = VFSTOUFS(mp); LOCK_OWNED(ump); inum = VTOI(vp)->i_number; for (;;) { if (inodedep_lookup(mp, inum, 0, &inodedep) == 0) return; cgwait = 0; TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) { /* Journal entries not yet written. */ if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) { jwait(&LIST_FIRST( &freeblks->fb_jblkdephd)->jb_list, MNT_WAIT); break; } /* Another thread is executing this item. */ if (freeblks->fb_state & INPROGRESS) { wait_worklist(&freeblks->fb_list, "ptrwait"); break; } /* Freeblks is waiting on a inode write. */ if ((freeblks->fb_state & COMPLETE) == 0) { FREE_LOCK(ump); ffs_update(vp, 1); ACQUIRE_LOCK(ump); break; } if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) == (ALLCOMPLETE | ONWORKLIST)) { remove_from_worklist(&freeblks->fb_list); freeblks->fb_state |= INPROGRESS; FREE_LOCK(ump); if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) panic("process_truncates: " "suspended filesystem"); handle_workitem_freeblocks(freeblks, 0); vn_finished_secondary_write(mp); ACQUIRE_LOCK(ump); break; } if (freeblks->fb_cgwait) cgwait++; } if (cgwait) { FREE_LOCK(ump); sync_cgs(mp, MNT_WAIT); ffs_sync_snap(mp, MNT_WAIT); ACQUIRE_LOCK(ump); continue; } if (freeblks == NULL) break; } return; } /* * Process one item on the worklist. */ static int process_worklist_item(mp, target, flags) struct mount *mp; int target; int flags; { struct worklist sentinel; struct worklist *wk; struct ufsmount *ump; int matchcnt; int error; KASSERT(mp != NULL, ("process_worklist_item: NULL mp")); /* * If we are being called because of a process doing a * copy-on-write, then it is not safe to write as we may * recurse into the copy-on-write routine. */ if (curthread->td_pflags & TDP_COWINPROGRESS) return (-1); PHOLD(curproc); /* Don't let the stack go away. */ ump = VFSTOUFS(mp); LOCK_OWNED(ump); matchcnt = 0; sentinel.wk_mp = NULL; sentinel.wk_type = D_SENTINEL; LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list); for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL; wk = LIST_NEXT(&sentinel, wk_list)) { if (wk->wk_type == D_SENTINEL) { LIST_REMOVE(&sentinel, wk_list); LIST_INSERT_AFTER(wk, &sentinel, wk_list); continue; } if (wk->wk_state & INPROGRESS) panic("process_worklist_item: %p already in progress.", wk); wk->wk_state |= INPROGRESS; remove_from_worklist(wk); FREE_LOCK(ump); if (vn_start_secondary_write(NULL, &mp, V_NOWAIT)) panic("process_worklist_item: suspended filesystem"); switch (wk->wk_type) { case D_DIRREM: /* removal of a directory entry */ error = handle_workitem_remove(WK_DIRREM(wk), flags); break; case D_FREEBLKS: /* releasing blocks and/or fragments from a file */ error = handle_workitem_freeblocks(WK_FREEBLKS(wk), flags); break; case D_FREEFRAG: /* releasing a fragment when replaced as a file grows */ handle_workitem_freefrag(WK_FREEFRAG(wk)); error = 0; break; case D_FREEFILE: /* releasing an inode when its link count drops to 0 */ handle_workitem_freefile(WK_FREEFILE(wk)); error = 0; break; default: panic("%s_process_worklist: Unknown type %s", "softdep", TYPENAME(wk->wk_type)); /* NOTREACHED */ } vn_finished_secondary_write(mp); ACQUIRE_LOCK(ump); if (error == 0) { if (++matchcnt == target) break; continue; } /* * We have to retry the worklist item later. Wake up any * waiters who may be able to complete it immediately and * add the item back to the head so we don't try to execute * it again. */ wk->wk_state &= ~INPROGRESS; wake_worklist(wk); add_to_worklist(wk, WK_HEAD); } /* Sentinal could've become the tail from remove_from_worklist. */ if (ump->softdep_worklist_tail == &sentinel) ump->softdep_worklist_tail = (struct worklist *)sentinel.wk_list.le_prev; LIST_REMOVE(&sentinel, wk_list); PRELE(curproc); return (matchcnt); } /* * Move dependencies from one buffer to another. */ int softdep_move_dependencies(oldbp, newbp) struct buf *oldbp; struct buf *newbp; { struct worklist *wk, *wktail; struct ufsmount *ump; int dirty; if ((wk = LIST_FIRST(&oldbp->b_dep)) == NULL) return (0); KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0, ("softdep_move_dependencies called on non-softdep filesystem")); dirty = 0; wktail = NULL; ump = VFSTOUFS(wk->wk_mp); ACQUIRE_LOCK(ump); while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) { LIST_REMOVE(wk, wk_list); if (wk->wk_type == D_BMSAFEMAP && bmsafemap_backgroundwrite(WK_BMSAFEMAP(wk), newbp)) dirty = 1; if (wktail == NULL) LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list); else LIST_INSERT_AFTER(wktail, wk, wk_list); wktail = wk; } FREE_LOCK(ump); return (dirty); } /* * Purge the work list of all items associated with a particular mount point. */ int softdep_flushworklist(oldmnt, countp, td) struct mount *oldmnt; int *countp; struct thread *td; { struct vnode *devvp; struct ufsmount *ump; int count, error; /* * Alternately flush the block device associated with the mount * point and process any dependencies that the flushing * creates. We continue until no more worklist dependencies * are found. */ *countp = 0; error = 0; ump = VFSTOUFS(oldmnt); devvp = ump->um_devvp; while ((count = softdep_process_worklist(oldmnt, 1)) > 0) { *countp += count; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(devvp, MNT_WAIT, td); VOP_UNLOCK(devvp); if (error != 0) break; } return (error); } #define SU_WAITIDLE_RETRIES 20 static int softdep_waitidle(struct mount *mp, int flags __unused) { struct ufsmount *ump; struct vnode *devvp; struct thread *td; int error, i; ump = VFSTOUFS(mp); devvp = ump->um_devvp; td = curthread; error = 0; ACQUIRE_LOCK(ump); for (i = 0; i < SU_WAITIDLE_RETRIES && ump->softdep_deps != 0; i++) { ump->softdep_req = 1; KASSERT((flags & FORCECLOSE) == 0 || ump->softdep_on_worklist == 0, ("softdep_waitidle: work added after flush")); msleep(&ump->softdep_deps, LOCK_PTR(ump), PVM | PDROP, "softdeps", 10 * hz); vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(devvp, MNT_WAIT, td); VOP_UNLOCK(devvp); ACQUIRE_LOCK(ump); if (error != 0) break; } ump->softdep_req = 0; if (i == SU_WAITIDLE_RETRIES && error == 0 && ump->softdep_deps != 0) { error = EBUSY; printf("softdep_waitidle: Failed to flush worklist for %p\n", mp); } FREE_LOCK(ump); return (error); } /* * Flush all vnodes and worklist items associated with a specified mount point. */ int softdep_flushfiles(oldmnt, flags, td) struct mount *oldmnt; int flags; struct thread *td; { #ifdef QUOTA struct ufsmount *ump; int i; #endif int error, early, depcount, loopcnt, retry_flush_count, retry; int morework; KASSERT(MOUNTEDSOFTDEP(oldmnt) != 0, ("softdep_flushfiles called on non-softdep filesystem")); loopcnt = 10; retry_flush_count = 3; retry_flush: error = 0; /* * Alternately flush the vnodes associated with the mount * point and process any dependencies that the flushing * creates. In theory, this loop can happen at most twice, * but we give it a few extra just to be sure. */ for (; loopcnt > 0; loopcnt--) { /* * Do another flush in case any vnodes were brought in * as part of the cleanup operations. */ early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) == 0 ? 0 : EARLYFLUSH; if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0) break; if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 || depcount == 0) break; } /* * If we are unmounting then it is an error to fail. If we * are simply trying to downgrade to read-only, then filesystem * activity can keep us busy forever, so we just fail with EBUSY. */ if (loopcnt == 0) { if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) panic("softdep_flushfiles: looping"); error = EBUSY; } if (!error) error = softdep_waitidle(oldmnt, flags); if (!error) { if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) { retry = 0; MNT_ILOCK(oldmnt); morework = oldmnt->mnt_nvnodelistsize > 0; #ifdef QUOTA ump = VFSTOUFS(oldmnt); UFS_LOCK(ump); for (i = 0; i < MAXQUOTAS; i++) { if (ump->um_quotas[i] != NULLVP) morework = 1; } UFS_UNLOCK(ump); #endif if (morework) { if (--retry_flush_count > 0) { retry = 1; loopcnt = 3; } else error = EBUSY; } MNT_IUNLOCK(oldmnt); if (retry) goto retry_flush; } } return (error); } /* * Structure hashing. * * There are four types of structures that can be looked up: * 1) pagedep structures identified by mount point, inode number, * and logical block. * 2) inodedep structures identified by mount point and inode number. * 3) newblk structures identified by mount point and * physical block number. * 4) bmsafemap structures identified by mount point and * cylinder group number. * * The "pagedep" and "inodedep" dependency structures are hashed * separately from the file blocks and inodes to which they correspond. * This separation helps when the in-memory copy of an inode or * file block must be replaced. It also obviates the need to access * an inode or file page when simply updating (or de-allocating) * dependency structures. Lookup of newblk structures is needed to * find newly allocated blocks when trying to associate them with * their allocdirect or allocindir structure. * * The lookup routines optionally create and hash a new instance when * an existing entry is not found. The bmsafemap lookup routine always * allocates a new structure if an existing one is not found. */ #define DEPALLOC 0x0001 /* allocate structure if lookup fails */ /* * Structures and routines associated with pagedep caching. */ #define PAGEDEP_HASH(ump, inum, lbn) \ (&(ump)->pagedep_hashtbl[((inum) + (lbn)) & (ump)->pagedep_hash_size]) static int pagedep_find(pagedephd, ino, lbn, pagedeppp) struct pagedep_hashhead *pagedephd; ino_t ino; ufs_lbn_t lbn; struct pagedep **pagedeppp; { struct pagedep *pagedep; LIST_FOREACH(pagedep, pagedephd, pd_hash) { if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn) { *pagedeppp = pagedep; return (1); } } *pagedeppp = NULL; return (0); } /* * Look up a pagedep. Return 1 if found, 0 otherwise. * If not found, allocate if DEPALLOC flag is passed. * Found or allocated entry is returned in pagedeppp. */ static int pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp) struct mount *mp; struct buf *bp; ino_t ino; ufs_lbn_t lbn; int flags; struct pagedep **pagedeppp; { struct pagedep *pagedep; struct pagedep_hashhead *pagedephd; struct worklist *wk; struct ufsmount *ump; int ret; int i; ump = VFSTOUFS(mp); LOCK_OWNED(ump); if (bp) { LIST_FOREACH(wk, &bp->b_dep, wk_list) { if (wk->wk_type == D_PAGEDEP) { *pagedeppp = WK_PAGEDEP(wk); return (1); } } } pagedephd = PAGEDEP_HASH(ump, ino, lbn); ret = pagedep_find(pagedephd, ino, lbn, pagedeppp); if (ret) { if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp) WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list); return (1); } if ((flags & DEPALLOC) == 0) return (0); FREE_LOCK(ump); pagedep = malloc(sizeof(struct pagedep), M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp); ACQUIRE_LOCK(ump); ret = pagedep_find(pagedephd, ino, lbn, pagedeppp); if (*pagedeppp) { /* * This should never happen since we only create pagedeps * with the vnode lock held. Could be an assert. */ WORKITEM_FREE(pagedep, D_PAGEDEP); return (ret); } pagedep->pd_ino = ino; pagedep->pd_lbn = lbn; LIST_INIT(&pagedep->pd_dirremhd); LIST_INIT(&pagedep->pd_pendinghd); for (i = 0; i < DAHASHSZ; i++) LIST_INIT(&pagedep->pd_diraddhd[i]); LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash); WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list); *pagedeppp = pagedep; return (0); } /* * Structures and routines associated with inodedep caching. */ #define INODEDEP_HASH(ump, inum) \ (&(ump)->inodedep_hashtbl[(inum) & (ump)->inodedep_hash_size]) static int inodedep_find(inodedephd, inum, inodedeppp) struct inodedep_hashhead *inodedephd; ino_t inum; struct inodedep **inodedeppp; { struct inodedep *inodedep; LIST_FOREACH(inodedep, inodedephd, id_hash) if (inum == inodedep->id_ino) break; if (inodedep) { *inodedeppp = inodedep; return (1); } *inodedeppp = NULL; return (0); } /* * Look up an inodedep. Return 1 if found, 0 if not found. * If not found, allocate if DEPALLOC flag is passed. * Found or allocated entry is returned in inodedeppp. */ static int inodedep_lookup(mp, inum, flags, inodedeppp) struct mount *mp; ino_t inum; int flags; struct inodedep **inodedeppp; { struct inodedep *inodedep; struct inodedep_hashhead *inodedephd; struct ufsmount *ump; struct fs *fs; ump = VFSTOUFS(mp); LOCK_OWNED(ump); fs = ump->um_fs; inodedephd = INODEDEP_HASH(ump, inum); if (inodedep_find(inodedephd, inum, inodedeppp)) return (1); if ((flags & DEPALLOC) == 0) return (0); /* * If the system is over its limit and our filesystem is * responsible for more than our share of that usage and * we are not in a rush, request some inodedep cleanup. */ if (softdep_excess_items(ump, D_INODEDEP)) schedule_cleanup(mp); else FREE_LOCK(ump); inodedep = malloc(sizeof(struct inodedep), M_INODEDEP, M_SOFTDEP_FLAGS); workitem_alloc(&inodedep->id_list, D_INODEDEP, mp); ACQUIRE_LOCK(ump); if (inodedep_find(inodedephd, inum, inodedeppp)) { WORKITEM_FREE(inodedep, D_INODEDEP); return (1); } inodedep->id_fs = fs; inodedep->id_ino = inum; inodedep->id_state = ALLCOMPLETE; inodedep->id_nlinkdelta = 0; inodedep->id_nlinkwrote = -1; inodedep->id_savedino1 = NULL; inodedep->id_savedsize = -1; inodedep->id_savedextsize = -1; inodedep->id_savednlink = -1; inodedep->id_bmsafemap = NULL; inodedep->id_mkdiradd = NULL; LIST_INIT(&inodedep->id_dirremhd); LIST_INIT(&inodedep->id_pendinghd); LIST_INIT(&inodedep->id_inowait); LIST_INIT(&inodedep->id_bufwait); TAILQ_INIT(&inodedep->id_inoreflst); TAILQ_INIT(&inodedep->id_inoupdt); TAILQ_INIT(&inodedep->id_newinoupdt); TAILQ_INIT(&inodedep->id_extupdt); TAILQ_INIT(&inodedep->id_newextupdt); TAILQ_INIT(&inodedep->id_freeblklst); LIST_INSERT_HEAD(inodedephd, inodedep, id_hash); *inodedeppp = inodedep; return (0); } /* * Structures and routines associated with newblk caching. */ #define NEWBLK_HASH(ump, inum) \ (&(ump)->newblk_hashtbl[(inum) & (ump)->newblk_hash_size]) static int newblk_find(newblkhd, newblkno, flags, newblkpp) struct newblk_hashhead *newblkhd; ufs2_daddr_t newblkno; int flags; struct newblk **newblkpp; { struct newblk *newblk; LIST_FOREACH(newblk, newblkhd, nb_hash) { if (newblkno != newblk->nb_newblkno) continue; /* * If we're creating a new dependency don't match those that * have already been converted to allocdirects. This is for * a frag extend. */ if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK) continue; break; } if (newblk) { *newblkpp = newblk; return (1); } *newblkpp = NULL; return (0); } /* * Look up a newblk. Return 1 if found, 0 if not found. * If not found, allocate if DEPALLOC flag is passed. * Found or allocated entry is returned in newblkpp. */ static int newblk_lookup(mp, newblkno, flags, newblkpp) struct mount *mp; ufs2_daddr_t newblkno; int flags; struct newblk **newblkpp; { struct newblk *newblk; struct newblk_hashhead *newblkhd; struct ufsmount *ump; ump = VFSTOUFS(mp); LOCK_OWNED(ump); newblkhd = NEWBLK_HASH(ump, newblkno); if (newblk_find(newblkhd, newblkno, flags, newblkpp)) return (1); if ((flags & DEPALLOC) == 0) return (0); if (softdep_excess_items(ump, D_NEWBLK) || softdep_excess_items(ump, D_ALLOCDIRECT) || softdep_excess_items(ump, D_ALLOCINDIR)) schedule_cleanup(mp); else FREE_LOCK(ump); newblk = malloc(sizeof(union allblk), M_NEWBLK, M_SOFTDEP_FLAGS | M_ZERO); workitem_alloc(&newblk->nb_list, D_NEWBLK, mp); ACQUIRE_LOCK(ump); if (newblk_find(newblkhd, newblkno, flags, newblkpp)) { WORKITEM_FREE(newblk, D_NEWBLK); return (1); } newblk->nb_freefrag = NULL; LIST_INIT(&newblk->nb_indirdeps); LIST_INIT(&newblk->nb_newdirblk); LIST_INIT(&newblk->nb_jwork); newblk->nb_state = ATTACHED; newblk->nb_newblkno = newblkno; LIST_INSERT_HEAD(newblkhd, newblk, nb_hash); *newblkpp = newblk; return (0); } /* * Structures and routines associated with freed indirect block caching. */ #define INDIR_HASH(ump, blkno) \ (&(ump)->indir_hashtbl[(blkno) & (ump)->indir_hash_size]) /* * Lookup an indirect block in the indir hash table. The freework is * removed and potentially freed. The caller must do a blocking journal * write before writing to the blkno. */ static int indirblk_lookup(mp, blkno) struct mount *mp; ufs2_daddr_t blkno; { struct freework *freework; struct indir_hashhead *wkhd; struct ufsmount *ump; ump = VFSTOUFS(mp); wkhd = INDIR_HASH(ump, blkno); TAILQ_FOREACH(freework, wkhd, fw_next) { if (freework->fw_blkno != blkno) continue; indirblk_remove(freework); return (1); } return (0); } /* * Insert an indirect block represented by freework into the indirblk * hash table so that it may prevent the block from being re-used prior * to the journal being written. */ static void indirblk_insert(freework) struct freework *freework; { struct jblocks *jblocks; struct jseg *jseg; struct ufsmount *ump; ump = VFSTOUFS(freework->fw_list.wk_mp); jblocks = ump->softdep_jblocks; jseg = TAILQ_LAST(&jblocks->jb_segs, jseglst); if (jseg == NULL) return; LIST_INSERT_HEAD(&jseg->js_indirs, freework, fw_segs); TAILQ_INSERT_HEAD(INDIR_HASH(ump, freework->fw_blkno), freework, fw_next); freework->fw_state &= ~DEPCOMPLETE; } static void indirblk_remove(freework) struct freework *freework; { struct ufsmount *ump; ump = VFSTOUFS(freework->fw_list.wk_mp); LIST_REMOVE(freework, fw_segs); TAILQ_REMOVE(INDIR_HASH(ump, freework->fw_blkno), freework, fw_next); freework->fw_state |= DEPCOMPLETE; if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE) WORKITEM_FREE(freework, D_FREEWORK); } /* * Executed during filesystem system initialization before * mounting any filesystems. */ void softdep_initialize() { TAILQ_INIT(&softdepmounts); #ifdef __LP64__ max_softdeps = desiredvnodes * 4; #else max_softdeps = desiredvnodes * 2; #endif /* initialise bioops hack */ bioops.io_start = softdep_disk_io_initiation; bioops.io_complete = softdep_disk_write_complete; bioops.io_deallocate = softdep_deallocate_dependencies; bioops.io_countdeps = softdep_count_dependencies; softdep_ast_cleanup = softdep_ast_cleanup_proc; /* Initialize the callout with an mtx. */ callout_init_mtx(&softdep_callout, &lk, 0); } /* * Executed after all filesystems have been unmounted during * filesystem module unload. */ void softdep_uninitialize() { /* clear bioops hack */ bioops.io_start = NULL; bioops.io_complete = NULL; bioops.io_deallocate = NULL; bioops.io_countdeps = NULL; softdep_ast_cleanup = NULL; callout_drain(&softdep_callout); } /* * Called at mount time to notify the dependency code that a * filesystem wishes to use it. */ int softdep_mount(devvp, mp, fs, cred) struct vnode *devvp; struct mount *mp; struct fs *fs; struct ucred *cred; { struct csum_total cstotal; struct mount_softdeps *sdp; struct ufsmount *ump; struct cg *cgp; struct buf *bp; u_int cyl, i; int error; sdp = malloc(sizeof(struct mount_softdeps), M_MOUNTDATA, M_WAITOK | M_ZERO); MNT_ILOCK(mp); mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP; if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) { mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) | MNTK_SOFTDEP | MNTK_NOASYNC; } ump = VFSTOUFS(mp); ump->um_softdep = sdp; MNT_IUNLOCK(mp); rw_init(LOCK_PTR(ump), "per-fs softdep"); sdp->sd_ump = ump; LIST_INIT(&ump->softdep_workitem_pending); LIST_INIT(&ump->softdep_journal_pending); TAILQ_INIT(&ump->softdep_unlinked); LIST_INIT(&ump->softdep_dirtycg); ump->softdep_worklist_tail = NULL; ump->softdep_on_worklist = 0; ump->softdep_deps = 0; LIST_INIT(&ump->softdep_mkdirlisthd); ump->pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP, &ump->pagedep_hash_size); ump->pagedep_nextclean = 0; ump->inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP, &ump->inodedep_hash_size); ump->inodedep_nextclean = 0; ump->newblk_hashtbl = hashinit(max_softdeps / 2, M_NEWBLK, &ump->newblk_hash_size); ump->bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP, &ump->bmsafemap_hash_size); i = 1 << (ffs(desiredvnodes / 10) - 1); ump->indir_hashtbl = malloc(i * sizeof(struct indir_hashhead), M_FREEWORK, M_WAITOK); ump->indir_hash_size = i - 1; for (i = 0; i <= ump->indir_hash_size; i++) TAILQ_INIT(&ump->indir_hashtbl[i]); #ifdef INVARIANTS for (i = 0; i <= D_LAST; i++) LIST_INIT(&ump->softdep_alldeps[i]); #endif ACQUIRE_GBLLOCK(&lk); TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next); FREE_GBLLOCK(&lk); if ((fs->fs_flags & FS_SUJ) && (error = journal_mount(mp, fs, cred)) != 0) { printf("Failed to start journal: %d\n", error); softdep_unmount(mp); return (error); } /* * Start our flushing thread in the bufdaemon process. */ ACQUIRE_LOCK(ump); ump->softdep_flags |= FLUSH_STARTING; FREE_LOCK(ump); kproc_kthread_add(&softdep_flush, mp, &bufdaemonproc, &ump->softdep_flushtd, 0, 0, "softdepflush", "%s worker", mp->mnt_stat.f_mntonname); ACQUIRE_LOCK(ump); while ((ump->softdep_flags & FLUSH_STARTING) != 0) { msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, "sdstart", hz / 2); } FREE_LOCK(ump); /* * When doing soft updates, the counters in the * superblock may have gotten out of sync. Recomputation * can take a long time and can be deferred for background * fsck. However, the old behavior of scanning the cylinder * groups and recalculating them at mount time is available * by setting vfs.ffs.compute_summary_at_mount to one. */ if (compute_summary_at_mount == 0 || fs->fs_clean != 0) return (0); bzero(&cstotal, sizeof cstotal); for (cyl = 0; cyl < fs->fs_ncg; cyl++) { if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)), fs->fs_cgsize, cred, &bp)) != 0) { brelse(bp); softdep_unmount(mp); return (error); } cgp = (struct cg *)bp->b_data; cstotal.cs_nffree += cgp->cg_cs.cs_nffree; cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree; cstotal.cs_nifree += cgp->cg_cs.cs_nifree; cstotal.cs_ndir += cgp->cg_cs.cs_ndir; fs->fs_cs(fs, cyl) = cgp->cg_cs; brelse(bp); } #ifdef INVARIANTS if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal)) printf("%s: superblock summary recomputed\n", fs->fs_fsmnt); #endif bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal); return (0); } void softdep_unmount(mp) struct mount *mp; { struct ufsmount *ump; #ifdef INVARIANTS int i; #endif KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_unmount called on non-softdep filesystem")); ump = VFSTOUFS(mp); MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_SOFTDEP; if (MOUNTEDSUJ(mp) == 0) { MNT_IUNLOCK(mp); } else { mp->mnt_flag &= ~MNT_SUJ; MNT_IUNLOCK(mp); journal_unmount(ump); } /* * Shut down our flushing thread. Check for NULL is if * softdep_mount errors out before the thread has been created. */ if (ump->softdep_flushtd != NULL) { ACQUIRE_LOCK(ump); ump->softdep_flags |= FLUSH_EXIT; wakeup(&ump->softdep_flushtd); msleep(&ump->softdep_flags, LOCK_PTR(ump), PVM | PDROP, "sdwait", 0); KASSERT((ump->softdep_flags & FLUSH_EXIT) == 0, ("Thread shutdown failed")); } /* * Free up our resources. */ ACQUIRE_GBLLOCK(&lk); TAILQ_REMOVE(&softdepmounts, ump->um_softdep, sd_next); FREE_GBLLOCK(&lk); rw_destroy(LOCK_PTR(ump)); hashdestroy(ump->pagedep_hashtbl, M_PAGEDEP, ump->pagedep_hash_size); hashdestroy(ump->inodedep_hashtbl, M_INODEDEP, ump->inodedep_hash_size); hashdestroy(ump->newblk_hashtbl, M_NEWBLK, ump->newblk_hash_size); hashdestroy(ump->bmsafemap_hashtbl, M_BMSAFEMAP, ump->bmsafemap_hash_size); free(ump->indir_hashtbl, M_FREEWORK); #ifdef INVARIANTS for (i = 0; i <= D_LAST; i++) { KASSERT(ump->softdep_curdeps[i] == 0, ("Unmount %s: Dep type %s != 0 (%ld)", ump->um_fs->fs_fsmnt, TYPENAME(i), ump->softdep_curdeps[i])); KASSERT(LIST_EMPTY(&ump->softdep_alldeps[i]), ("Unmount %s: Dep type %s not empty (%p)", ump->um_fs->fs_fsmnt, TYPENAME(i), LIST_FIRST(&ump->softdep_alldeps[i]))); } #endif free(ump->um_softdep, M_MOUNTDATA); } static struct jblocks * jblocks_create(void) { struct jblocks *jblocks; jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO); TAILQ_INIT(&jblocks->jb_segs); jblocks->jb_avail = 10; jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail, M_JBLOCKS, M_WAITOK | M_ZERO); return (jblocks); } static ufs2_daddr_t jblocks_alloc(jblocks, bytes, actual) struct jblocks *jblocks; int bytes; int *actual; { ufs2_daddr_t daddr; struct jextent *jext; int freecnt; int blocks; blocks = bytes / DEV_BSIZE; jext = &jblocks->jb_extent[jblocks->jb_head]; freecnt = jext->je_blocks - jblocks->jb_off; if (freecnt == 0) { jblocks->jb_off = 0; if (++jblocks->jb_head > jblocks->jb_used) jblocks->jb_head = 0; jext = &jblocks->jb_extent[jblocks->jb_head]; freecnt = jext->je_blocks; } if (freecnt > blocks) freecnt = blocks; *actual = freecnt * DEV_BSIZE; daddr = jext->je_daddr + jblocks->jb_off; jblocks->jb_off += freecnt; jblocks->jb_free -= freecnt; return (daddr); } static void jblocks_free(jblocks, mp, bytes) struct jblocks *jblocks; struct mount *mp; int bytes; { LOCK_OWNED(VFSTOUFS(mp)); jblocks->jb_free += bytes / DEV_BSIZE; if (jblocks->jb_suspended) worklist_speedup(mp); wakeup(jblocks); } static void jblocks_destroy(jblocks) struct jblocks *jblocks; { if (jblocks->jb_extent) free(jblocks->jb_extent, M_JBLOCKS); free(jblocks, M_JBLOCKS); } static void jblocks_add(jblocks, daddr, blocks) struct jblocks *jblocks; ufs2_daddr_t daddr; int blocks; { struct jextent *jext; jblocks->jb_blocks += blocks; jblocks->jb_free += blocks; jext = &jblocks->jb_extent[jblocks->jb_used]; /* Adding the first block. */ if (jext->je_daddr == 0) { jext->je_daddr = daddr; jext->je_blocks = blocks; return; } /* Extending the last extent. */ if (jext->je_daddr + jext->je_blocks == daddr) { jext->je_blocks += blocks; return; } /* Adding a new extent. */ if (++jblocks->jb_used == jblocks->jb_avail) { jblocks->jb_avail *= 2; jext = malloc(sizeof(struct jextent) * jblocks->jb_avail, M_JBLOCKS, M_WAITOK | M_ZERO); memcpy(jext, jblocks->jb_extent, sizeof(struct jextent) * jblocks->jb_used); free(jblocks->jb_extent, M_JBLOCKS); jblocks->jb_extent = jext; } jext = &jblocks->jb_extent[jblocks->jb_used]; jext->je_daddr = daddr; jext->je_blocks = blocks; return; } int softdep_journal_lookup(mp, vpp) struct mount *mp; struct vnode **vpp; { struct componentname cnp; struct vnode *dvp; ino_t sujournal; int error; error = VFS_VGET(mp, UFS_ROOTINO, LK_EXCLUSIVE, &dvp); if (error) return (error); bzero(&cnp, sizeof(cnp)); cnp.cn_nameiop = LOOKUP; cnp.cn_flags = ISLASTCN; cnp.cn_thread = curthread; cnp.cn_cred = curthread->td_ucred; cnp.cn_pnbuf = SUJ_FILE; cnp.cn_nameptr = SUJ_FILE; cnp.cn_namelen = strlen(SUJ_FILE); error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal); vput(dvp); if (error != 0) return (error); error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp); return (error); } /* * Open and verify the journal file. */ static int journal_mount(mp, fs, cred) struct mount *mp; struct fs *fs; struct ucred *cred; { struct jblocks *jblocks; struct ufsmount *ump; struct vnode *vp; struct inode *ip; ufs2_daddr_t blkno; int bcount; int error; int i; ump = VFSTOUFS(mp); ump->softdep_journal_tail = NULL; ump->softdep_on_journal = 0; ump->softdep_accdeps = 0; ump->softdep_req = 0; ump->softdep_jblocks = NULL; error = softdep_journal_lookup(mp, &vp); if (error != 0) { printf("Failed to find journal. Use tunefs to create one\n"); return (error); } ip = VTOI(vp); if (ip->i_size < SUJ_MIN) { error = ENOSPC; goto out; } bcount = lblkno(fs, ip->i_size); /* Only use whole blocks. */ jblocks = jblocks_create(); for (i = 0; i < bcount; i++) { error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL); if (error) break; jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag)); } if (error) { jblocks_destroy(jblocks); goto out; } jblocks->jb_low = jblocks->jb_free / 3; /* Reserve 33%. */ jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */ ump->softdep_jblocks = jblocks; out: if (error == 0) { MNT_ILOCK(mp); mp->mnt_flag |= MNT_SUJ; mp->mnt_flag &= ~MNT_SOFTDEP; MNT_IUNLOCK(mp); /* * Only validate the journal contents if the * filesystem is clean, otherwise we write the logs * but they'll never be used. If the filesystem was * still dirty when we mounted it the journal is * invalid and a new journal can only be valid if it * starts from a clean mount. */ if (fs->fs_clean) { DIP_SET(ip, i_modrev, fs->fs_mtime); ip->i_flags |= IN_MODIFIED; ffs_update(vp, 1); } } vput(vp); return (error); } static void journal_unmount(ump) struct ufsmount *ump; { if (ump->softdep_jblocks) jblocks_destroy(ump->softdep_jblocks); ump->softdep_jblocks = NULL; } /* * Called when a journal record is ready to be written. Space is allocated * and the journal entry is created when the journal is flushed to stable * store. */ static void add_to_journal(wk) struct worklist *wk; { struct ufsmount *ump; ump = VFSTOUFS(wk->wk_mp); LOCK_OWNED(ump); if (wk->wk_state & ONWORKLIST) panic("add_to_journal: %s(0x%X) already on list", TYPENAME(wk->wk_type), wk->wk_state); wk->wk_state |= ONWORKLIST | DEPCOMPLETE; if (LIST_EMPTY(&ump->softdep_journal_pending)) { ump->softdep_jblocks->jb_age = ticks; LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list); } else LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list); ump->softdep_journal_tail = wk; ump->softdep_on_journal += 1; } /* * Remove an arbitrary item for the journal worklist maintain the tail * pointer. This happens when a new operation obviates the need to * journal an old operation. */ static void remove_from_journal(wk) struct worklist *wk; { struct ufsmount *ump; ump = VFSTOUFS(wk->wk_mp); LOCK_OWNED(ump); #ifdef INVARIANTS { struct worklist *wkn; LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list) if (wkn == wk) break; if (wkn == NULL) panic("remove_from_journal: %p is not in journal", wk); } #endif /* * We emulate a TAILQ to save space in most structures which do not * require TAILQ semantics. Here we must update the tail position * when removing the tail which is not the final entry. This works * only if the worklist linkage are at the beginning of the structure. */ if (ump->softdep_journal_tail == wk) ump->softdep_journal_tail = (struct worklist *)wk->wk_list.le_prev; WORKLIST_REMOVE(wk); ump->softdep_on_journal -= 1; } /* * Check for journal space as well as dependency limits so the prelink * code can throttle both journaled and non-journaled filesystems. * Threshold is 0 for low and 1 for min. */ static int journal_space(ump, thresh) struct ufsmount *ump; int thresh; { struct jblocks *jblocks; int limit, avail; jblocks = ump->softdep_jblocks; if (jblocks == NULL) return (1); /* * We use a tighter restriction here to prevent request_cleanup() * running in threads from running into locks we currently hold. * We have to be over the limit and our filesystem has to be * responsible for more than our share of that usage. */ limit = (max_softdeps / 10) * 9; if (dep_current[D_INODEDEP] > limit && ump->softdep_curdeps[D_INODEDEP] > limit / stat_flush_threads) return (0); if (thresh) thresh = jblocks->jb_min; else thresh = jblocks->jb_low; avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE; avail = jblocks->jb_free - avail; return (avail > thresh); } static void journal_suspend(ump) struct ufsmount *ump; { struct jblocks *jblocks; struct mount *mp; bool set; mp = UFSTOVFS(ump); if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) return; jblocks = ump->softdep_jblocks; vfs_op_enter(mp); set = false; MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) { stat_journal_min++; mp->mnt_kern_flag |= MNTK_SUSPEND; mp->mnt_susp_owner = ump->softdep_flushtd; set = true; } jblocks->jb_suspended = 1; MNT_IUNLOCK(mp); if (!set) vfs_op_exit(mp); } static int journal_unsuspend(struct ufsmount *ump) { struct jblocks *jblocks; struct mount *mp; mp = UFSTOVFS(ump); jblocks = ump->softdep_jblocks; if (jblocks != NULL && jblocks->jb_suspended && journal_space(ump, jblocks->jb_min)) { jblocks->jb_suspended = 0; FREE_LOCK(ump); mp->mnt_susp_owner = curthread; vfs_write_resume(mp, 0); ACQUIRE_LOCK(ump); return (1); } return (0); } /* * Called before any allocation function to be certain that there is * sufficient space in the journal prior to creating any new records. * Since in the case of block allocation we may have multiple locked * buffers at the time of the actual allocation we can not block * when the journal records are created. Doing so would create a deadlock * if any of these buffers needed to be flushed to reclaim space. Instead * we require a sufficiently large amount of available space such that * each thread in the system could have passed this allocation check and * still have sufficient free space. With 20% of a minimum journal size * of 1MB we have 6553 records available. */ int softdep_prealloc(vp, waitok) struct vnode *vp; int waitok; { struct ufsmount *ump; KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0, ("softdep_prealloc called on non-softdep filesystem")); /* * Nothing to do if we are not running journaled soft updates. * If we currently hold the snapshot lock, we must avoid * handling other resources that could cause deadlock. Do not * touch quotas vnode since it is typically recursed with * other vnode locks held. */ if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)) || (vp->v_vflag & VV_SYSTEM) != 0) return (0); ump = VFSTOUFS(vp->v_mount); ACQUIRE_LOCK(ump); if (journal_space(ump, 0)) { FREE_LOCK(ump); return (0); } stat_journal_low++; FREE_LOCK(ump); if (waitok == MNT_NOWAIT) return (ENOSPC); /* * Attempt to sync this vnode once to flush any journal * work attached to it. */ if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0) ffs_syncvnode(vp, waitok, 0); ACQUIRE_LOCK(ump); process_removes(vp); process_truncates(vp); if (journal_space(ump, 0) == 0) { softdep_speedup(ump); if (journal_space(ump, 1) == 0) journal_suspend(ump); } FREE_LOCK(ump); return (0); } /* * Before adjusting a link count on a vnode verify that we have sufficient * journal space. If not, process operations that depend on the currently * locked pair of vnodes to try to flush space as the syncer, buf daemon, * and softdep flush threads can not acquire these locks to reclaim space. */ static void softdep_prelink(dvp, vp) struct vnode *dvp; struct vnode *vp; { struct ufsmount *ump; ump = VFSTOUFS(dvp->v_mount); LOCK_OWNED(ump); /* * Nothing to do if we have sufficient journal space. * If we currently hold the snapshot lock, we must avoid * handling other resources that could cause deadlock. */ if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp)))) return; stat_journal_low++; FREE_LOCK(ump); if (vp) ffs_syncvnode(vp, MNT_NOWAIT, 0); ffs_syncvnode(dvp, MNT_WAIT, 0); ACQUIRE_LOCK(ump); /* Process vp before dvp as it may create .. removes. */ if (vp) { process_removes(vp); process_truncates(vp); } process_removes(dvp); process_truncates(dvp); softdep_speedup(ump); process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT); if (journal_space(ump, 0) == 0) { softdep_speedup(ump); if (journal_space(ump, 1) == 0) journal_suspend(ump); } } static void jseg_write(ump, jseg, data) struct ufsmount *ump; struct jseg *jseg; uint8_t *data; { struct jsegrec *rec; rec = (struct jsegrec *)data; rec->jsr_seq = jseg->js_seq; rec->jsr_oldest = jseg->js_oldseq; rec->jsr_cnt = jseg->js_cnt; rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize; rec->jsr_crc = 0; rec->jsr_time = ump->um_fs->fs_mtime; } static inline void inoref_write(inoref, jseg, rec) struct inoref *inoref; struct jseg *jseg; struct jrefrec *rec; { inoref->if_jsegdep->jd_seg = jseg; rec->jr_ino = inoref->if_ino; rec->jr_parent = inoref->if_parent; rec->jr_nlink = inoref->if_nlink; rec->jr_mode = inoref->if_mode; rec->jr_diroff = inoref->if_diroff; } static void jaddref_write(jaddref, jseg, data) struct jaddref *jaddref; struct jseg *jseg; uint8_t *data; { struct jrefrec *rec; rec = (struct jrefrec *)data; rec->jr_op = JOP_ADDREF; inoref_write(&jaddref->ja_ref, jseg, rec); } static void jremref_write(jremref, jseg, data) struct jremref *jremref; struct jseg *jseg; uint8_t *data; { struct jrefrec *rec; rec = (struct jrefrec *)data; rec->jr_op = JOP_REMREF; inoref_write(&jremref->jr_ref, jseg, rec); } static void jmvref_write(jmvref, jseg, data) struct jmvref *jmvref; struct jseg *jseg; uint8_t *data; { struct jmvrec *rec; rec = (struct jmvrec *)data; rec->jm_op = JOP_MVREF; rec->jm_ino = jmvref->jm_ino; rec->jm_parent = jmvref->jm_parent; rec->jm_oldoff = jmvref->jm_oldoff; rec->jm_newoff = jmvref->jm_newoff; } static void jnewblk_write(jnewblk, jseg, data) struct jnewblk *jnewblk; struct jseg *jseg; uint8_t *data; { struct jblkrec *rec; jnewblk->jn_jsegdep->jd_seg = jseg; rec = (struct jblkrec *)data; rec->jb_op = JOP_NEWBLK; rec->jb_ino = jnewblk->jn_ino; rec->jb_blkno = jnewblk->jn_blkno; rec->jb_lbn = jnewblk->jn_lbn; rec->jb_frags = jnewblk->jn_frags; rec->jb_oldfrags = jnewblk->jn_oldfrags; } static void jfreeblk_write(jfreeblk, jseg, data) struct jfreeblk *jfreeblk; struct jseg *jseg; uint8_t *data; { struct jblkrec *rec; jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg; rec = (struct jblkrec *)data; rec->jb_op = JOP_FREEBLK; rec->jb_ino = jfreeblk->jf_ino; rec->jb_blkno = jfreeblk->jf_blkno; rec->jb_lbn = jfreeblk->jf_lbn; rec->jb_frags = jfreeblk->jf_frags; rec->jb_oldfrags = 0; } static void jfreefrag_write(jfreefrag, jseg, data) struct jfreefrag *jfreefrag; struct jseg *jseg; uint8_t *data; { struct jblkrec *rec; jfreefrag->fr_jsegdep->jd_seg = jseg; rec = (struct jblkrec *)data; rec->jb_op = JOP_FREEBLK; rec->jb_ino = jfreefrag->fr_ino; rec->jb_blkno = jfreefrag->fr_blkno; rec->jb_lbn = jfreefrag->fr_lbn; rec->jb_frags = jfreefrag->fr_frags; rec->jb_oldfrags = 0; } static void jtrunc_write(jtrunc, jseg, data) struct jtrunc *jtrunc; struct jseg *jseg; uint8_t *data; { struct jtrncrec *rec; jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg; rec = (struct jtrncrec *)data; rec->jt_op = JOP_TRUNC; rec->jt_ino = jtrunc->jt_ino; rec->jt_size = jtrunc->jt_size; rec->jt_extsize = jtrunc->jt_extsize; } static void jfsync_write(jfsync, jseg, data) struct jfsync *jfsync; struct jseg *jseg; uint8_t *data; { struct jtrncrec *rec; rec = (struct jtrncrec *)data; rec->jt_op = JOP_SYNC; rec->jt_ino = jfsync->jfs_ino; rec->jt_size = jfsync->jfs_size; rec->jt_extsize = jfsync->jfs_extsize; } static void softdep_flushjournal(mp) struct mount *mp; { struct jblocks *jblocks; struct ufsmount *ump; if (MOUNTEDSUJ(mp) == 0) return; ump = VFSTOUFS(mp); jblocks = ump->softdep_jblocks; ACQUIRE_LOCK(ump); while (ump->softdep_on_journal) { jblocks->jb_needseg = 1; softdep_process_journal(mp, NULL, MNT_WAIT); } FREE_LOCK(ump); } static void softdep_synchronize_completed(struct bio *); static void softdep_synchronize(struct bio *, struct ufsmount *, void *); static void softdep_synchronize_completed(bp) struct bio *bp; { struct jseg *oldest; struct jseg *jseg; struct ufsmount *ump; /* * caller1 marks the last segment written before we issued the * synchronize cache. */ jseg = bp->bio_caller1; if (jseg == NULL) { g_destroy_bio(bp); return; } ump = VFSTOUFS(jseg->js_list.wk_mp); ACQUIRE_LOCK(ump); oldest = NULL; /* * Mark all the journal entries waiting on the synchronize cache * as completed so they may continue on. */ while (jseg != NULL && (jseg->js_state & COMPLETE) == 0) { jseg->js_state |= COMPLETE; oldest = jseg; jseg = TAILQ_PREV(jseg, jseglst, js_next); } /* * Restart deferred journal entry processing from the oldest * completed jseg. */ if (oldest) complete_jsegs(oldest); FREE_LOCK(ump); g_destroy_bio(bp); } /* * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write ordering * barriers. The journal must be written prior to any blocks that depend * on it and the journal can not be released until the blocks have be * written. This code handles both barriers simultaneously. */ static void softdep_synchronize(bp, ump, caller1) struct bio *bp; struct ufsmount *ump; void *caller1; { bp->bio_cmd = BIO_FLUSH; bp->bio_flags |= BIO_ORDERED; bp->bio_data = NULL; bp->bio_offset = ump->um_cp->provider->mediasize; bp->bio_length = 0; bp->bio_done = softdep_synchronize_completed; bp->bio_caller1 = caller1; g_io_request(bp, ump->um_cp); } /* * Flush some journal records to disk. */ static void softdep_process_journal(mp, needwk, flags) struct mount *mp; struct worklist *needwk; int flags; { struct jblocks *jblocks; struct ufsmount *ump; struct worklist *wk; struct jseg *jseg; struct buf *bp; struct bio *bio; uint8_t *data; struct fs *fs; int shouldflush; int segwritten; int jrecmin; /* Minimum records per block. */ int jrecmax; /* Maximum records per block. */ int size; int cnt; int off; int devbsize; if (MOUNTEDSUJ(mp) == 0) return; shouldflush = softdep_flushcache; bio = NULL; jseg = NULL; ump = VFSTOUFS(mp); LOCK_OWNED(ump); fs = ump->um_fs; jblocks = ump->softdep_jblocks; devbsize = ump->um_devvp->v_bufobj.bo_bsize; /* * We write anywhere between a disk block and fs block. The upper * bound is picked to prevent buffer cache fragmentation and limit * processing time per I/O. */ jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */ jrecmax = (fs->fs_bsize / devbsize) * jrecmin; segwritten = 0; for (;;) { cnt = ump->softdep_on_journal; /* * Criteria for writing a segment: * 1) We have a full block. * 2) We're called from jwait() and haven't found the * journal item yet. * 3) Always write if needseg is set. * 4) If we are called from process_worklist and have * not yet written anything we write a partial block * to enforce a 1 second maximum latency on journal * entries. */ if (cnt < (jrecmax - 1) && needwk == NULL && jblocks->jb_needseg == 0 && (segwritten || cnt == 0)) break; cnt++; /* * Verify some free journal space. softdep_prealloc() should * guarantee that we don't run out so this is indicative of * a problem with the flow control. Try to recover * gracefully in any event. */ while (jblocks->jb_free == 0) { if (flags != MNT_WAIT) break; printf("softdep: Out of journal space!\n"); softdep_speedup(ump); msleep(jblocks, LOCK_PTR(ump), PRIBIO, "jblocks", hz); } FREE_LOCK(ump); jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS); workitem_alloc(&jseg->js_list, D_JSEG, mp); LIST_INIT(&jseg->js_entries); LIST_INIT(&jseg->js_indirs); jseg->js_state = ATTACHED; if (shouldflush == 0) jseg->js_state |= COMPLETE; else if (bio == NULL) bio = g_alloc_bio(); jseg->js_jblocks = jblocks; bp = geteblk(fs->fs_bsize, 0); ACQUIRE_LOCK(ump); /* * If there was a race while we were allocating the block * and jseg the entry we care about was likely written. * We bail out in both the WAIT and NOWAIT case and assume * the caller will loop if the entry it cares about is * not written. */ cnt = ump->softdep_on_journal; if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) { bp->b_flags |= B_INVAL | B_NOCACHE; WORKITEM_FREE(jseg, D_JSEG); FREE_LOCK(ump); brelse(bp); ACQUIRE_LOCK(ump); break; } /* * Calculate the disk block size required for the available * records rounded to the min size. */ if (cnt == 0) size = devbsize; else if (cnt < jrecmax) size = howmany(cnt, jrecmin) * devbsize; else size = fs->fs_bsize; /* * Allocate a disk block for this journal data and account * for truncation of the requested size if enough contiguous * space was not available. */ bp->b_blkno = jblocks_alloc(jblocks, size, &size); bp->b_lblkno = bp->b_blkno; bp->b_offset = bp->b_blkno * DEV_BSIZE; bp->b_bcount = size; bp->b_flags &= ~B_INVAL; bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY; /* * Initialize our jseg with cnt records. Assign the next * sequence number to it and link it in-order. */ cnt = MIN(cnt, (size / devbsize) * jrecmin); jseg->js_buf = bp; jseg->js_cnt = cnt; jseg->js_refs = cnt + 1; /* Self ref. */ jseg->js_size = size; jseg->js_seq = jblocks->jb_nextseq++; if (jblocks->jb_oldestseg == NULL) jblocks->jb_oldestseg = jseg; jseg->js_oldseq = jblocks->jb_oldestseg->js_seq; TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next); if (jblocks->jb_writeseg == NULL) jblocks->jb_writeseg = jseg; /* * Start filling in records from the pending list. */ data = bp->b_data; off = 0; /* * Always put a header on the first block. * XXX As with below, there might not be a chance to get * into the loop. Ensure that something valid is written. */ jseg_write(ump, jseg, data); off += JREC_SIZE; data = bp->b_data + off; /* * XXX Something is wrong here. There's no work to do, * but we need to perform and I/O and allow it to complete * anyways. */ if (LIST_EMPTY(&ump->softdep_journal_pending)) stat_emptyjblocks++; while ((wk = LIST_FIRST(&ump->softdep_journal_pending)) != NULL) { if (cnt == 0) break; /* Place a segment header on every device block. */ if ((off % devbsize) == 0) { jseg_write(ump, jseg, data); off += JREC_SIZE; data = bp->b_data + off; } if (wk == needwk) needwk = NULL; remove_from_journal(wk); wk->wk_state |= INPROGRESS; WORKLIST_INSERT(&jseg->js_entries, wk); switch (wk->wk_type) { case D_JADDREF: jaddref_write(WK_JADDREF(wk), jseg, data); break; case D_JREMREF: jremref_write(WK_JREMREF(wk), jseg, data); break; case D_JMVREF: jmvref_write(WK_JMVREF(wk), jseg, data); break; case D_JNEWBLK: jnewblk_write(WK_JNEWBLK(wk), jseg, data); break; case D_JFREEBLK: jfreeblk_write(WK_JFREEBLK(wk), jseg, data); break; case D_JFREEFRAG: jfreefrag_write(WK_JFREEFRAG(wk), jseg, data); break; case D_JTRUNC: jtrunc_write(WK_JTRUNC(wk), jseg, data); break; case D_JFSYNC: jfsync_write(WK_JFSYNC(wk), jseg, data); break; default: panic("process_journal: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } off += JREC_SIZE; data = bp->b_data + off; cnt--; } /* Clear any remaining space so we don't leak kernel data */ if (size > off) bzero(data, size - off); /* * Write this one buffer and continue. */ segwritten = 1; jblocks->jb_needseg = 0; WORKLIST_INSERT(&bp->b_dep, &jseg->js_list); FREE_LOCK(ump); bp->b_xflags |= BX_CVTENXIO; pbgetvp(ump->um_devvp, bp); /* * We only do the blocking wait once we find the journal * entry we're looking for. */ if (needwk == NULL && flags == MNT_WAIT) bwrite(bp); else bawrite(bp); ACQUIRE_LOCK(ump); } /* * If we wrote a segment issue a synchronize cache so the journal * is reflected on disk before the data is written. Since reclaiming * journal space also requires writing a journal record this * process also enforces a barrier before reclamation. */ if (segwritten && shouldflush) { softdep_synchronize(bio, ump, TAILQ_LAST(&jblocks->jb_segs, jseglst)); } else if (bio) g_destroy_bio(bio); /* * If we've suspended the filesystem because we ran out of journal * space either try to sync it here to make some progress or * unsuspend it if we already have. */ if (flags == 0 && jblocks->jb_suspended) { if (journal_unsuspend(ump)) return; FREE_LOCK(ump); VFS_SYNC(mp, MNT_NOWAIT); ffs_sbupdate(ump, MNT_WAIT, 0); ACQUIRE_LOCK(ump); } } /* * Complete a jseg, allowing all dependencies awaiting journal writes * to proceed. Each journal dependency also attaches a jsegdep to dependent * structures so that the journal segment can be freed to reclaim space. */ static void complete_jseg(jseg) struct jseg *jseg; { struct worklist *wk; struct jmvref *jmvref; #ifdef INVARIANTS int i = 0; #endif while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) { WORKLIST_REMOVE(wk); wk->wk_state &= ~INPROGRESS; wk->wk_state |= COMPLETE; KASSERT(i++ < jseg->js_cnt, ("handle_written_jseg: overflow %d >= %d", i - 1, jseg->js_cnt)); switch (wk->wk_type) { case D_JADDREF: handle_written_jaddref(WK_JADDREF(wk)); break; case D_JREMREF: handle_written_jremref(WK_JREMREF(wk)); break; case D_JMVREF: rele_jseg(jseg); /* No jsegdep. */ jmvref = WK_JMVREF(wk); LIST_REMOVE(jmvref, jm_deps); if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0) free_pagedep(jmvref->jm_pagedep); WORKITEM_FREE(jmvref, D_JMVREF); break; case D_JNEWBLK: handle_written_jnewblk(WK_JNEWBLK(wk)); break; case D_JFREEBLK: handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep); break; case D_JTRUNC: handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep); break; case D_JFSYNC: rele_jseg(jseg); /* No jsegdep. */ WORKITEM_FREE(wk, D_JFSYNC); break; case D_JFREEFRAG: handle_written_jfreefrag(WK_JFREEFRAG(wk)); break; default: panic("handle_written_jseg: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } /* Release the self reference so the structure may be freed. */ rele_jseg(jseg); } /* * Determine which jsegs are ready for completion processing. Waits for * synchronize cache to complete as well as forcing in-order completion * of journal entries. */ static void complete_jsegs(jseg) struct jseg *jseg; { struct jblocks *jblocks; struct jseg *jsegn; jblocks = jseg->js_jblocks; /* * Don't allow out of order completions. If this isn't the first * block wait for it to write before we're done. */ if (jseg != jblocks->jb_writeseg) return; /* Iterate through available jsegs processing their entries. */ while (jseg && (jseg->js_state & ALLCOMPLETE) == ALLCOMPLETE) { jblocks->jb_oldestwrseq = jseg->js_oldseq; jsegn = TAILQ_NEXT(jseg, js_next); complete_jseg(jseg); jseg = jsegn; } jblocks->jb_writeseg = jseg; /* * Attempt to free jsegs now that oldestwrseq may have advanced. */ free_jsegs(jblocks); } /* * Mark a jseg as DEPCOMPLETE and throw away the buffer. Attempt to handle * the final completions. */ static void handle_written_jseg(jseg, bp) struct jseg *jseg; struct buf *bp; { if (jseg->js_refs == 0) panic("handle_written_jseg: No self-reference on %p", jseg); jseg->js_state |= DEPCOMPLETE; /* * We'll never need this buffer again, set flags so it will be * discarded. */ bp->b_flags |= B_INVAL | B_NOCACHE; pbrelvp(bp); complete_jsegs(jseg); } static inline struct jsegdep * inoref_jseg(inoref) struct inoref *inoref; { struct jsegdep *jsegdep; jsegdep = inoref->if_jsegdep; inoref->if_jsegdep = NULL; return (jsegdep); } /* * Called once a jremref has made it to stable store. The jremref is marked * complete and we attempt to free it. Any pagedeps writes sleeping waiting * for the jremref to complete will be awoken by free_jremref. */ static void handle_written_jremref(jremref) struct jremref *jremref; { struct inodedep *inodedep; struct jsegdep *jsegdep; struct dirrem *dirrem; /* Grab the jsegdep. */ jsegdep = inoref_jseg(&jremref->jr_ref); /* * Remove us from the inoref list. */ if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0, &inodedep) == 0) panic("handle_written_jremref: Lost inodedep"); TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); /* * Complete the dirrem. */ dirrem = jremref->jr_dirrem; jremref->jr_dirrem = NULL; LIST_REMOVE(jremref, jr_deps); jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT; jwork_insert(&dirrem->dm_jwork, jsegdep); if (LIST_EMPTY(&dirrem->dm_jremrefhd) && (dirrem->dm_state & COMPLETE) != 0) add_to_worklist(&dirrem->dm_list, 0); free_jremref(jremref); } /* * Called once a jaddref has made it to stable store. The dependency is * marked complete and any dependent structures are added to the inode * bufwait list to be completed as soon as it is written. If a bitmap write * depends on this entry we move the inode into the inodedephd of the * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap. */ static void handle_written_jaddref(jaddref) struct jaddref *jaddref; { struct jsegdep *jsegdep; struct inodedep *inodedep; struct diradd *diradd; struct mkdir *mkdir; /* Grab the jsegdep. */ jsegdep = inoref_jseg(&jaddref->ja_ref); mkdir = NULL; diradd = NULL; if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, 0, &inodedep) == 0) panic("handle_written_jaddref: Lost inodedep."); if (jaddref->ja_diradd == NULL) panic("handle_written_jaddref: No dependency"); if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) { diradd = jaddref->ja_diradd; WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list); } else if (jaddref->ja_state & MKDIR_PARENT) { mkdir = jaddref->ja_mkdir; WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list); } else if (jaddref->ja_state & MKDIR_BODY) mkdir = jaddref->ja_mkdir; else panic("handle_written_jaddref: Unknown dependency %p", jaddref->ja_diradd); jaddref->ja_diradd = NULL; /* also clears ja_mkdir */ /* * Remove us from the inode list. */ TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); /* * The mkdir may be waiting on the jaddref to clear before freeing. */ if (mkdir) { KASSERT(mkdir->md_list.wk_type == D_MKDIR, ("handle_written_jaddref: Incorrect type for mkdir %s", TYPENAME(mkdir->md_list.wk_type))); mkdir->md_jaddref = NULL; diradd = mkdir->md_diradd; mkdir->md_state |= DEPCOMPLETE; complete_mkdir(mkdir); } jwork_insert(&diradd->da_jwork, jsegdep); if (jaddref->ja_state & NEWBLOCK) { inodedep->id_state |= ONDEPLIST; LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd, inodedep, id_deps); } free_jaddref(jaddref); } /* * Called once a jnewblk journal is written. The allocdirect or allocindir * is placed in the bmsafemap to await notification of a written bitmap. If * the operation was canceled we add the segdep to the appropriate * dependency to free the journal space once the canceling operation * completes. */ static void handle_written_jnewblk(jnewblk) struct jnewblk *jnewblk; { struct bmsafemap *bmsafemap; struct freefrag *freefrag; struct freework *freework; struct jsegdep *jsegdep; struct newblk *newblk; /* Grab the jsegdep. */ jsegdep = jnewblk->jn_jsegdep; jnewblk->jn_jsegdep = NULL; if (jnewblk->jn_dep == NULL) panic("handle_written_jnewblk: No dependency for the segdep."); switch (jnewblk->jn_dep->wk_type) { case D_NEWBLK: case D_ALLOCDIRECT: case D_ALLOCINDIR: /* * Add the written block to the bmsafemap so it can * be notified when the bitmap is on disk. */ newblk = WK_NEWBLK(jnewblk->jn_dep); newblk->nb_jnewblk = NULL; if ((newblk->nb_state & GOINGAWAY) == 0) { bmsafemap = newblk->nb_bmsafemap; newblk->nb_state |= ONDEPLIST; LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); } jwork_insert(&newblk->nb_jwork, jsegdep); break; case D_FREEFRAG: /* * A newblock being removed by a freefrag when replaced by * frag extension. */ freefrag = WK_FREEFRAG(jnewblk->jn_dep); freefrag->ff_jdep = NULL; jwork_insert(&freefrag->ff_jwork, jsegdep); break; case D_FREEWORK: /* * A direct block was removed by truncate. */ freework = WK_FREEWORK(jnewblk->jn_dep); freework->fw_jnewblk = NULL; jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep); break; default: panic("handle_written_jnewblk: Unknown type %d.", jnewblk->jn_dep->wk_type); } jnewblk->jn_dep = NULL; free_jnewblk(jnewblk); } /* * Cancel a jfreefrag that won't be needed, probably due to colliding with * an in-flight allocation that has not yet been committed. Divorce us * from the freefrag and mark it DEPCOMPLETE so that it may be added * to the worklist. */ static void cancel_jfreefrag(jfreefrag) struct jfreefrag *jfreefrag; { struct freefrag *freefrag; if (jfreefrag->fr_jsegdep) { free_jsegdep(jfreefrag->fr_jsegdep); jfreefrag->fr_jsegdep = NULL; } freefrag = jfreefrag->fr_freefrag; jfreefrag->fr_freefrag = NULL; free_jfreefrag(jfreefrag); freefrag->ff_state |= DEPCOMPLETE; CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", freefrag->ff_blkno); } /* * Free a jfreefrag when the parent freefrag is rendered obsolete. */ static void free_jfreefrag(jfreefrag) struct jfreefrag *jfreefrag; { if (jfreefrag->fr_state & INPROGRESS) WORKLIST_REMOVE(&jfreefrag->fr_list); else if (jfreefrag->fr_state & ONWORKLIST) remove_from_journal(&jfreefrag->fr_list); if (jfreefrag->fr_freefrag != NULL) panic("free_jfreefrag: Still attached to a freefrag."); WORKITEM_FREE(jfreefrag, D_JFREEFRAG); } /* * Called when the journal write for a jfreefrag completes. The parent * freefrag is added to the worklist if this completes its dependencies. */ static void handle_written_jfreefrag(jfreefrag) struct jfreefrag *jfreefrag; { struct jsegdep *jsegdep; struct freefrag *freefrag; /* Grab the jsegdep. */ jsegdep = jfreefrag->fr_jsegdep; jfreefrag->fr_jsegdep = NULL; freefrag = jfreefrag->fr_freefrag; if (freefrag == NULL) panic("handle_written_jfreefrag: No freefrag."); freefrag->ff_state |= DEPCOMPLETE; freefrag->ff_jdep = NULL; jwork_insert(&freefrag->ff_jwork, jsegdep); if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) add_to_worklist(&freefrag->ff_list, 0); jfreefrag->fr_freefrag = NULL; free_jfreefrag(jfreefrag); } /* * Called when the journal write for a jfreeblk completes. The jfreeblk * is removed from the freeblks list of pending journal writes and the * jsegdep is moved to the freeblks jwork to be completed when all blocks * have been reclaimed. */ static void handle_written_jblkdep(jblkdep) struct jblkdep *jblkdep; { struct freeblks *freeblks; struct jsegdep *jsegdep; /* Grab the jsegdep. */ jsegdep = jblkdep->jb_jsegdep; jblkdep->jb_jsegdep = NULL; freeblks = jblkdep->jb_freeblks; LIST_REMOVE(jblkdep, jb_deps); jwork_insert(&freeblks->fb_jwork, jsegdep); /* * If the freeblks is all journaled, we can add it to the worklist. */ if (LIST_EMPTY(&freeblks->fb_jblkdephd) && (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) add_to_worklist(&freeblks->fb_list, WK_NODELAY); free_jblkdep(jblkdep); } static struct jsegdep * newjsegdep(struct worklist *wk) { struct jsegdep *jsegdep; jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS); workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp); jsegdep->jd_seg = NULL; return (jsegdep); } static struct jmvref * newjmvref(dp, ino, oldoff, newoff) struct inode *dp; ino_t ino; off_t oldoff; off_t newoff; { struct jmvref *jmvref; jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS); workitem_alloc(&jmvref->jm_list, D_JMVREF, ITOVFS(dp)); jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE; jmvref->jm_parent = dp->i_number; jmvref->jm_ino = ino; jmvref->jm_oldoff = oldoff; jmvref->jm_newoff = newoff; return (jmvref); } /* * Allocate a new jremref that tracks the removal of ip from dp with the * directory entry offset of diroff. Mark the entry as ATTACHED and * DEPCOMPLETE as we have all the information required for the journal write * and the directory has already been removed from the buffer. The caller * is responsible for linking the jremref into the pagedep and adding it * to the journal to write. The MKDIR_PARENT flag is set if we're doing * a DOTDOT addition so handle_workitem_remove() can properly assign * the jsegdep when we're done. */ static struct jremref * newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip, off_t diroff, nlink_t nlink) { struct jremref *jremref; jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS); workitem_alloc(&jremref->jr_list, D_JREMREF, ITOVFS(dp)); jremref->jr_state = ATTACHED; newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff, nlink, ip->i_mode); jremref->jr_dirrem = dirrem; return (jremref); } static inline void newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff, nlink_t nlink, uint16_t mode) { inoref->if_jsegdep = newjsegdep(&inoref->if_list); inoref->if_diroff = diroff; inoref->if_ino = ino; inoref->if_parent = parent; inoref->if_nlink = nlink; inoref->if_mode = mode; } /* * Allocate a new jaddref to track the addition of ino to dp at diroff. The * directory offset may not be known until later. The caller is responsible * adding the entry to the journal when this information is available. nlink * should be the link count prior to the addition and mode is only required * to have the correct FMT. */ static struct jaddref * newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink, uint16_t mode) { struct jaddref *jaddref; jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS); workitem_alloc(&jaddref->ja_list, D_JADDREF, ITOVFS(dp)); jaddref->ja_state = ATTACHED; jaddref->ja_mkdir = NULL; newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode); return (jaddref); } /* * Create a new free dependency for a freework. The caller is responsible * for adjusting the reference count when it has the lock held. The freedep * will track an outstanding bitmap write that will ultimately clear the * freework to continue. */ static struct freedep * newfreedep(struct freework *freework) { struct freedep *freedep; freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS); workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp); freedep->fd_freework = freework; return (freedep); } /* * Free a freedep structure once the buffer it is linked to is written. If * this is the last reference to the freework schedule it for completion. */ static void free_freedep(freedep) struct freedep *freedep; { struct freework *freework; freework = freedep->fd_freework; freework->fw_freeblks->fb_cgwait--; if (--freework->fw_ref == 0) freework_enqueue(freework); WORKITEM_FREE(freedep, D_FREEDEP); } /* * Allocate a new freework structure that may be a level in an indirect * when parent is not NULL or a top level block when it is. The top level * freework structures are allocated without the per-filesystem lock held * and before the freeblks is visible outside of softdep_setup_freeblocks(). */ static struct freework * newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal) struct ufsmount *ump; struct freeblks *freeblks; struct freework *parent; ufs_lbn_t lbn; ufs2_daddr_t nb; int frags; int off; int journal; { struct freework *freework; freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS); workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp); freework->fw_state = ATTACHED; freework->fw_jnewblk = NULL; freework->fw_freeblks = freeblks; freework->fw_parent = parent; freework->fw_lbn = lbn; freework->fw_blkno = nb; freework->fw_frags = frags; freework->fw_indir = NULL; freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 || lbn >= -UFS_NXADDR) ? 0 : NINDIR(ump->um_fs) + 1; freework->fw_start = freework->fw_off = off; if (journal) newjfreeblk(freeblks, lbn, nb, frags); if (parent == NULL) { ACQUIRE_LOCK(ump); WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list); freeblks->fb_ref++; FREE_LOCK(ump); } return (freework); } /* * Eliminate a jfreeblk for a block that does not need journaling. */ static void cancel_jfreeblk(freeblks, blkno) struct freeblks *freeblks; ufs2_daddr_t blkno; { struct jfreeblk *jfreeblk; struct jblkdep *jblkdep; LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) { if (jblkdep->jb_list.wk_type != D_JFREEBLK) continue; jfreeblk = WK_JFREEBLK(&jblkdep->jb_list); if (jfreeblk->jf_blkno == blkno) break; } if (jblkdep == NULL) return; CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno); free_jsegdep(jblkdep->jb_jsegdep); LIST_REMOVE(jblkdep, jb_deps); WORKITEM_FREE(jfreeblk, D_JFREEBLK); } /* * Allocate a new jfreeblk to journal top level block pointer when truncating * a file. The caller must add this to the worklist when the per-filesystem * lock is held. */ static struct jfreeblk * newjfreeblk(freeblks, lbn, blkno, frags) struct freeblks *freeblks; ufs_lbn_t lbn; ufs2_daddr_t blkno; int frags; { struct jfreeblk *jfreeblk; jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS); workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK, freeblks->fb_list.wk_mp); jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list); jfreeblk->jf_dep.jb_freeblks = freeblks; jfreeblk->jf_ino = freeblks->fb_inum; jfreeblk->jf_lbn = lbn; jfreeblk->jf_blkno = blkno; jfreeblk->jf_frags = frags; LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps); return (jfreeblk); } /* * The journal is only prepared to handle full-size block numbers, so we * have to adjust the record to reflect the change to a full-size block. * For example, suppose we have a block made up of fragments 8-15 and * want to free its last two fragments. We are given a request that says: * FREEBLK ino=5, blkno=14, lbn=0, frags=2, oldfrags=0 * where frags are the number of fragments to free and oldfrags are the * number of fragments to keep. To block align it, we have to change it to * have a valid full-size blkno, so it becomes: * FREEBLK ino=5, blkno=8, lbn=0, frags=2, oldfrags=6 */ static void adjust_newfreework(freeblks, frag_offset) struct freeblks *freeblks; int frag_offset; { struct jfreeblk *jfreeblk; KASSERT((LIST_FIRST(&freeblks->fb_jblkdephd) != NULL && LIST_FIRST(&freeblks->fb_jblkdephd)->jb_list.wk_type == D_JFREEBLK), ("adjust_newfreework: Missing freeblks dependency")); jfreeblk = WK_JFREEBLK(LIST_FIRST(&freeblks->fb_jblkdephd)); jfreeblk->jf_blkno -= frag_offset; jfreeblk->jf_frags += frag_offset; } /* * Allocate a new jtrunc to track a partial truncation. */ static struct jtrunc * newjtrunc(freeblks, size, extsize) struct freeblks *freeblks; off_t size; int extsize; { struct jtrunc *jtrunc; jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS); workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC, freeblks->fb_list.wk_mp); jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list); jtrunc->jt_dep.jb_freeblks = freeblks; jtrunc->jt_ino = freeblks->fb_inum; jtrunc->jt_size = size; jtrunc->jt_extsize = extsize; LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps); return (jtrunc); } /* * If we're canceling a new bitmap we have to search for another ref * to move into the bmsafemap dep. This might be better expressed * with another structure. */ static void move_newblock_dep(jaddref, inodedep) struct jaddref *jaddref; struct inodedep *inodedep; { struct inoref *inoref; struct jaddref *jaddrefn; jaddrefn = NULL; for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; inoref = TAILQ_NEXT(inoref, if_deps)) { if ((jaddref->ja_state & NEWBLOCK) && inoref->if_list.wk_type == D_JADDREF) { jaddrefn = (struct jaddref *)inoref; break; } } if (jaddrefn == NULL) return; jaddrefn->ja_state &= ~(ATTACHED | UNDONE); jaddrefn->ja_state |= jaddref->ja_state & (ATTACHED | UNDONE | NEWBLOCK); jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK); jaddref->ja_state |= ATTACHED; LIST_REMOVE(jaddref, ja_bmdeps); LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn, ja_bmdeps); } /* * Cancel a jaddref either before it has been written or while it is being * written. This happens when a link is removed before the add reaches * the disk. The jaddref dependency is kept linked into the bmsafemap * and inode to prevent the link count or bitmap from reaching the disk * until handle_workitem_remove() re-adjusts the counts and bitmaps as * required. * * Returns 1 if the canceled addref requires journaling of the remove and * 0 otherwise. */ static int cancel_jaddref(jaddref, inodedep, wkhd) struct jaddref *jaddref; struct inodedep *inodedep; struct workhead *wkhd; { struct inoref *inoref; struct jsegdep *jsegdep; int needsj; KASSERT((jaddref->ja_state & COMPLETE) == 0, ("cancel_jaddref: Canceling complete jaddref")); if (jaddref->ja_state & (INPROGRESS | COMPLETE)) needsj = 1; else needsj = 0; if (inodedep == NULL) if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino, 0, &inodedep) == 0) panic("cancel_jaddref: Lost inodedep"); /* * We must adjust the nlink of any reference operation that follows * us so that it is consistent with the in-memory reference. This * ensures that inode nlink rollbacks always have the correct link. */ if (needsj == 0) { for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref; inoref = TAILQ_NEXT(inoref, if_deps)) { if (inoref->if_state & GOINGAWAY) break; inoref->if_nlink--; } } jsegdep = inoref_jseg(&jaddref->ja_ref); if (jaddref->ja_state & NEWBLOCK) move_newblock_dep(jaddref, inodedep); wake_worklist(&jaddref->ja_list); jaddref->ja_mkdir = NULL; if (jaddref->ja_state & INPROGRESS) { jaddref->ja_state &= ~INPROGRESS; WORKLIST_REMOVE(&jaddref->ja_list); jwork_insert(wkhd, jsegdep); } else { free_jsegdep(jsegdep); if (jaddref->ja_state & DEPCOMPLETE) remove_from_journal(&jaddref->ja_list); } jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE); /* * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove * can arrange for them to be freed with the bitmap. Otherwise we * no longer need this addref attached to the inoreflst and it * will incorrectly adjust nlink if we leave it. */ if ((jaddref->ja_state & NEWBLOCK) == 0) { TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); jaddref->ja_state |= COMPLETE; free_jaddref(jaddref); return (needsj); } /* * Leave the head of the list for jsegdeps for fast merging. */ if (LIST_FIRST(wkhd) != NULL) { jaddref->ja_state |= ONWORKLIST; LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list); } else WORKLIST_INSERT(wkhd, &jaddref->ja_list); return (needsj); } /* * Attempt to free a jaddref structure when some work completes. This * should only succeed once the entry is written and all dependencies have * been notified. */ static void free_jaddref(jaddref) struct jaddref *jaddref; { if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE) return; if (jaddref->ja_ref.if_jsegdep) panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n", jaddref, jaddref->ja_state); if (jaddref->ja_state & NEWBLOCK) LIST_REMOVE(jaddref, ja_bmdeps); if (jaddref->ja_state & (INPROGRESS | ONWORKLIST)) panic("free_jaddref: Bad state %p(0x%X)", jaddref, jaddref->ja_state); if (jaddref->ja_mkdir != NULL) panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state); WORKITEM_FREE(jaddref, D_JADDREF); } /* * Free a jremref structure once it has been written or discarded. */ static void free_jremref(jremref) struct jremref *jremref; { if (jremref->jr_ref.if_jsegdep) free_jsegdep(jremref->jr_ref.if_jsegdep); if (jremref->jr_state & INPROGRESS) panic("free_jremref: IO still pending"); WORKITEM_FREE(jremref, D_JREMREF); } /* * Free a jnewblk structure. */ static void free_jnewblk(jnewblk) struct jnewblk *jnewblk; { if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE) return; LIST_REMOVE(jnewblk, jn_deps); if (jnewblk->jn_dep != NULL) panic("free_jnewblk: Dependency still attached."); WORKITEM_FREE(jnewblk, D_JNEWBLK); } /* * Cancel a jnewblk which has been been made redundant by frag extension. */ static void cancel_jnewblk(jnewblk, wkhd) struct jnewblk *jnewblk; struct workhead *wkhd; { struct jsegdep *jsegdep; CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno); jsegdep = jnewblk->jn_jsegdep; if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL) panic("cancel_jnewblk: Invalid state"); jnewblk->jn_jsegdep = NULL; jnewblk->jn_dep = NULL; jnewblk->jn_state |= GOINGAWAY; if (jnewblk->jn_state & INPROGRESS) { jnewblk->jn_state &= ~INPROGRESS; WORKLIST_REMOVE(&jnewblk->jn_list); jwork_insert(wkhd, jsegdep); } else { free_jsegdep(jsegdep); remove_from_journal(&jnewblk->jn_list); } wake_worklist(&jnewblk->jn_list); WORKLIST_INSERT(wkhd, &jnewblk->jn_list); } static void free_jblkdep(jblkdep) struct jblkdep *jblkdep; { if (jblkdep->jb_list.wk_type == D_JFREEBLK) WORKITEM_FREE(jblkdep, D_JFREEBLK); else if (jblkdep->jb_list.wk_type == D_JTRUNC) WORKITEM_FREE(jblkdep, D_JTRUNC); else panic("free_jblkdep: Unexpected type %s", TYPENAME(jblkdep->jb_list.wk_type)); } /* * Free a single jseg once it is no longer referenced in memory or on * disk. Reclaim journal blocks and dependencies waiting for the segment * to disappear. */ static void free_jseg(jseg, jblocks) struct jseg *jseg; struct jblocks *jblocks; { struct freework *freework; /* * Free freework structures that were lingering to indicate freed * indirect blocks that forced journal write ordering on reallocate. */ while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL) indirblk_remove(freework); if (jblocks->jb_oldestseg == jseg) jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next); TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next); jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size); KASSERT(LIST_EMPTY(&jseg->js_entries), ("free_jseg: Freed jseg has valid entries.")); WORKITEM_FREE(jseg, D_JSEG); } /* * Free all jsegs that meet the criteria for being reclaimed and update * oldestseg. */ static void free_jsegs(jblocks) struct jblocks *jblocks; { struct jseg *jseg; /* * Free only those jsegs which have none allocated before them to * preserve the journal space ordering. */ while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) { /* * Only reclaim space when nothing depends on this journal * set and another set has written that it is no longer * valid. */ if (jseg->js_refs != 0) { jblocks->jb_oldestseg = jseg; return; } if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE) break; if (jseg->js_seq > jblocks->jb_oldestwrseq) break; /* * We can free jsegs that didn't write entries when * oldestwrseq == js_seq. */ if (jseg->js_seq == jblocks->jb_oldestwrseq && jseg->js_cnt != 0) break; free_jseg(jseg, jblocks); } /* * If we exited the loop above we still must discover the * oldest valid segment. */ if (jseg) for (jseg = jblocks->jb_oldestseg; jseg != NULL; jseg = TAILQ_NEXT(jseg, js_next)) if (jseg->js_refs != 0) break; jblocks->jb_oldestseg = jseg; /* * The journal has no valid records but some jsegs may still be * waiting on oldestwrseq to advance. We force a small record * out to permit these lingering records to be reclaimed. */ if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs)) jblocks->jb_needseg = 1; } /* * Release one reference to a jseg and free it if the count reaches 0. This * should eventually reclaim journal space as well. */ static void rele_jseg(jseg) struct jseg *jseg; { KASSERT(jseg->js_refs > 0, ("free_jseg: Invalid refcnt %d", jseg->js_refs)); if (--jseg->js_refs != 0) return; free_jsegs(jseg->js_jblocks); } /* * Release a jsegdep and decrement the jseg count. */ static void free_jsegdep(jsegdep) struct jsegdep *jsegdep; { if (jsegdep->jd_seg) rele_jseg(jsegdep->jd_seg); WORKITEM_FREE(jsegdep, D_JSEGDEP); } /* * Wait for a journal item to make it to disk. Initiate journal processing * if required. */ static int jwait(wk, waitfor) struct worklist *wk; int waitfor; { LOCK_OWNED(VFSTOUFS(wk->wk_mp)); /* * Blocking journal waits cause slow synchronous behavior. Record * stats on the frequency of these blocking operations. */ if (waitfor == MNT_WAIT) { stat_journal_wait++; switch (wk->wk_type) { case D_JREMREF: case D_JMVREF: stat_jwait_filepage++; break; case D_JTRUNC: case D_JFREEBLK: stat_jwait_freeblks++; break; case D_JNEWBLK: stat_jwait_newblk++; break; case D_JADDREF: stat_jwait_inode++; break; default: break; } } /* * If IO has not started we process the journal. We can't mark the * worklist item as IOWAITING because we drop the lock while * processing the journal and the worklist entry may be freed after * this point. The caller may call back in and re-issue the request. */ if ((wk->wk_state & INPROGRESS) == 0) { softdep_process_journal(wk->wk_mp, wk, waitfor); if (waitfor != MNT_WAIT) return (EBUSY); return (0); } if (waitfor != MNT_WAIT) return (EBUSY); wait_worklist(wk, "jwait"); return (0); } /* * Lookup an inodedep based on an inode pointer and set the nlinkdelta as * appropriate. This is a convenience function to reduce duplicate code * for the setup and revert functions below. */ static struct inodedep * inodedep_lookup_ip(ip) struct inode *ip; { struct inodedep *inodedep; KASSERT(ip->i_nlink >= ip->i_effnlink, ("inodedep_lookup_ip: bad delta")); (void) inodedep_lookup(ITOVFS(ip), ip->i_number, DEPALLOC, &inodedep); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked")); return (inodedep); } /* * Called prior to creating a new inode and linking it to a directory. The * jaddref structure must already be allocated by softdep_setup_inomapdep * and it is discovered here so we can initialize the mode and update * nlinkdelta. */ void softdep_setup_create(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *jaddref; struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_setup_create called on non-softdep filesystem")); KASSERT(ip->i_nlink == 1, ("softdep_setup_create: Invalid link count.")); dvp = ITOV(dp); ACQUIRE_LOCK(ITOUMP(dp)); inodedep = inodedep_lookup_ip(ip); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, ("softdep_setup_create: No addref structure present.")); } softdep_prelink(dvp, NULL); FREE_LOCK(ITOUMP(dp)); } /* * Create a jaddref structure to track the addition of a DOTDOT link when * we are reparenting an inode as part of a rename. This jaddref will be * found by softdep_setup_directory_change. Adjusts nlinkdelta for * non-journaling softdep. */ void softdep_setup_dotdot_link(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *jaddref; struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_setup_dotdot_link called on non-softdep filesystem")); dvp = ITOV(dp); jaddref = NULL; /* * We don't set MKDIR_PARENT as this is not tied to a mkdir and * is used as a normal link would be. */ if (DOINGSUJ(dvp)) jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, dp->i_effnlink - 1, dp->i_mode); ACQUIRE_LOCK(ITOUMP(dp)); inodedep = inodedep_lookup_ip(dp); if (jaddref) TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); softdep_prelink(dvp, ITOV(ip)); FREE_LOCK(ITOUMP(dp)); } /* * Create a jaddref structure to track a new link to an inode. The directory * offset is not known until softdep_setup_directory_add or * softdep_setup_directory_change. Adjusts nlinkdelta for non-journaling * softdep. */ void softdep_setup_link(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *jaddref; struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_setup_link called on non-softdep filesystem")); dvp = ITOV(dp); jaddref = NULL; if (DOINGSUJ(dvp)) jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1, ip->i_mode); ACQUIRE_LOCK(ITOUMP(dp)); inodedep = inodedep_lookup_ip(ip); if (jaddref) TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); softdep_prelink(dvp, ITOV(ip)); FREE_LOCK(ITOUMP(dp)); } /* * Called to create the jaddref structures to track . and .. references as * well as lookup and further initialize the incomplete jaddref created * by softdep_setup_inomapdep when the inode was allocated. Adjusts * nlinkdelta for non-journaling softdep. */ void softdep_setup_mkdir(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *dotdotaddref; struct jaddref *dotaddref; struct jaddref *jaddref; struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_setup_mkdir called on non-softdep filesystem")); dvp = ITOV(dp); dotaddref = dotdotaddref = NULL; if (DOINGSUJ(dvp)) { dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1, ip->i_mode); dotaddref->ja_state |= MKDIR_BODY; dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET, dp->i_effnlink - 1, dp->i_mode); dotdotaddref->ja_state |= MKDIR_PARENT; } ACQUIRE_LOCK(ITOUMP(dp)); inodedep = inodedep_lookup_ip(ip); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref != NULL, ("softdep_setup_mkdir: No addref structure present.")); KASSERT(jaddref->ja_parent == dp->i_number, ("softdep_setup_mkdir: bad parent %ju", (uintmax_t)jaddref->ja_parent)); TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref, if_deps); } inodedep = inodedep_lookup_ip(dp); if (DOINGSUJ(dvp)) TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &dotdotaddref->ja_ref, if_deps); softdep_prelink(ITOV(dp), NULL); FREE_LOCK(ITOUMP(dp)); } /* * Called to track nlinkdelta of the inode and parent directories prior to * unlinking a directory. */ void softdep_setup_rmdir(dp, ip) struct inode *dp; struct inode *ip; { struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_setup_rmdir called on non-softdep filesystem")); dvp = ITOV(dp); ACQUIRE_LOCK(ITOUMP(dp)); (void) inodedep_lookup_ip(ip); (void) inodedep_lookup_ip(dp); softdep_prelink(dvp, ITOV(ip)); FREE_LOCK(ITOUMP(dp)); } /* * Called to track nlinkdelta of the inode and parent directories prior to * unlink. */ void softdep_setup_unlink(dp, ip) struct inode *dp; struct inode *ip; { struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_setup_unlink called on non-softdep filesystem")); dvp = ITOV(dp); ACQUIRE_LOCK(ITOUMP(dp)); (void) inodedep_lookup_ip(ip); (void) inodedep_lookup_ip(dp); softdep_prelink(dvp, ITOV(ip)); FREE_LOCK(ITOUMP(dp)); } /* * Called to release the journal structures created by a failed non-directory * creation. Adjusts nlinkdelta for non-journaling softdep. */ void softdep_revert_create(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *jaddref; struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(ITOVFS((dp))) != 0, ("softdep_revert_create called on non-softdep filesystem")); dvp = ITOV(dp); ACQUIRE_LOCK(ITOUMP(dp)); inodedep = inodedep_lookup_ip(ip); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref->ja_parent == dp->i_number, ("softdep_revert_create: addref parent mismatch")); cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); } FREE_LOCK(ITOUMP(dp)); } /* * Called to release the journal structures created by a failed link * addition. Adjusts nlinkdelta for non-journaling softdep. */ void softdep_revert_link(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *jaddref; struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_revert_link called on non-softdep filesystem")); dvp = ITOV(dp); ACQUIRE_LOCK(ITOUMP(dp)); inodedep = inodedep_lookup_ip(ip); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref->ja_parent == dp->i_number, ("softdep_revert_link: addref parent mismatch")); cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); } FREE_LOCK(ITOUMP(dp)); } /* * Called to release the journal structures created by a failed mkdir * attempt. Adjusts nlinkdelta for non-journaling softdep. */ void softdep_revert_mkdir(dp, ip) struct inode *dp; struct inode *ip; { struct inodedep *inodedep; struct jaddref *jaddref; struct jaddref *dotaddref; struct vnode *dvp; KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_revert_mkdir called on non-softdep filesystem")); dvp = ITOV(dp); ACQUIRE_LOCK(ITOUMP(dp)); inodedep = inodedep_lookup_ip(dp); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref->ja_parent == ip->i_number, ("softdep_revert_mkdir: dotdot addref parent mismatch")); cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); } inodedep = inodedep_lookup_ip(ip); if (DOINGSUJ(dvp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref->ja_parent == dp->i_number, ("softdep_revert_mkdir: addref parent mismatch")); dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref, inoreflst, if_deps); cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait); KASSERT(dotaddref->ja_parent == ip->i_number, ("softdep_revert_mkdir: dot addref parent mismatch")); cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait); } FREE_LOCK(ITOUMP(dp)); } /* * Called to correct nlinkdelta after a failed rmdir. */ void softdep_revert_rmdir(dp, ip) struct inode *dp; struct inode *ip; { KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0, ("softdep_revert_rmdir called on non-softdep filesystem")); ACQUIRE_LOCK(ITOUMP(dp)); (void) inodedep_lookup_ip(ip); (void) inodedep_lookup_ip(dp); FREE_LOCK(ITOUMP(dp)); } /* * Protecting the freemaps (or bitmaps). * * To eliminate the need to execute fsck before mounting a filesystem * after a power failure, one must (conservatively) guarantee that the * on-disk copy of the bitmaps never indicate that a live inode or block is * free. So, when a block or inode is allocated, the bitmap should be * updated (on disk) before any new pointers. When a block or inode is * freed, the bitmap should not be updated until all pointers have been * reset. The latter dependency is handled by the delayed de-allocation * approach described below for block and inode de-allocation. The former * dependency is handled by calling the following procedure when a block or * inode is allocated. When an inode is allocated an "inodedep" is created * with its DEPCOMPLETE flag cleared until its bitmap is written to disk. * Each "inodedep" is also inserted into the hash indexing structure so * that any additional link additions can be made dependent on the inode * allocation. * * The ufs filesystem maintains a number of free block counts (e.g., per * cylinder group, per cylinder and per pair) * in addition to the bitmaps. These counts are used to improve efficiency * during allocation and therefore must be consistent with the bitmaps. * There is no convenient way to guarantee post-crash consistency of these * counts with simple update ordering, for two main reasons: (1) The counts * and bitmaps for a single cylinder group block are not in the same disk * sector. If a disk write is interrupted (e.g., by power failure), one may * be written and the other not. (2) Some of the counts are located in the * superblock rather than the cylinder group block. So, we focus our soft * updates implementation on protecting the bitmaps. When mounting a * filesystem, we recompute the auxiliary counts from the bitmaps. */ /* * Called just after updating the cylinder group block to allocate an inode. */ void softdep_setup_inomapdep(bp, ip, newinum, mode) struct buf *bp; /* buffer for cylgroup block with inode map */ struct inode *ip; /* inode related to allocation */ ino_t newinum; /* new inode number being allocated */ int mode; { struct inodedep *inodedep; struct bmsafemap *bmsafemap; struct jaddref *jaddref; struct mount *mp; struct fs *fs; mp = ITOVFS(ip); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_inomapdep called on non-softdep filesystem")); fs = VFSTOUFS(mp)->um_fs; jaddref = NULL; /* * Allocate the journal reference add structure so that the bitmap * can be dependent on it. */ if (MOUNTEDSUJ(mp)) { jaddref = newjaddref(ip, newinum, 0, 0, mode); jaddref->ja_state |= NEWBLOCK; } /* * Create a dependency for the newly allocated inode. * Panic if it already exists as something is seriously wrong. * Otherwise add it to the dependency list for the buffer holding * the cylinder group map from which it was allocated. * * We have to preallocate a bmsafemap entry in case it is needed * in bmsafemap_lookup since once we allocate the inodedep, we * have to finish initializing it before we can FREE_LOCK(). * By preallocating, we avoid FREE_LOCK() while doing a malloc * in bmsafemap_lookup. We cannot call bmsafemap_lookup before * creating the inodedep as it can be freed during the time * that we FREE_LOCK() while allocating the inodedep. We must * call workitem_alloc() before entering the locked section as * it also acquires the lock and we must avoid trying doing so * recursively. */ bmsafemap = malloc(sizeof(struct bmsafemap), M_BMSAFEMAP, M_SOFTDEP_FLAGS); workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); ACQUIRE_LOCK(ITOUMP(ip)); if ((inodedep_lookup(mp, newinum, DEPALLOC, &inodedep))) panic("softdep_setup_inomapdep: dependency %p for new" "inode already exists", inodedep); bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap); if (jaddref) { LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps); TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); } else { inodedep->id_state |= ONDEPLIST; LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps); } inodedep->id_bmsafemap = bmsafemap; inodedep->id_state &= ~DEPCOMPLETE; FREE_LOCK(ITOUMP(ip)); } /* * Called just after updating the cylinder group block to * allocate block or fragment. */ void softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags) struct buf *bp; /* buffer for cylgroup block with block map */ struct mount *mp; /* filesystem doing allocation */ ufs2_daddr_t newblkno; /* number of newly allocated block */ int frags; /* Number of fragments. */ int oldfrags; /* Previous number of fragments for extend. */ { struct newblk *newblk; struct bmsafemap *bmsafemap; struct jnewblk *jnewblk; struct ufsmount *ump; struct fs *fs; KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_blkmapdep called on non-softdep filesystem")); ump = VFSTOUFS(mp); fs = ump->um_fs; jnewblk = NULL; /* * Create a dependency for the newly allocated block. * Add it to the dependency list for the buffer holding * the cylinder group map from which it was allocated. */ if (MOUNTEDSUJ(mp)) { jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS); workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp); jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list); jnewblk->jn_state = ATTACHED; jnewblk->jn_blkno = newblkno; jnewblk->jn_frags = frags; jnewblk->jn_oldfrags = oldfrags; #ifdef INVARIANTS { struct cg *cgp; uint8_t *blksfree; long bno; int i; cgp = (struct cg *)bp->b_data; blksfree = cg_blksfree(cgp); bno = dtogd(fs, jnewblk->jn_blkno); for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) { if (isset(blksfree, bno + i)) panic("softdep_setup_blkmapdep: " "free fragment %d from %d-%d " "state 0x%X dep %p", i, jnewblk->jn_oldfrags, jnewblk->jn_frags, jnewblk->jn_state, jnewblk->jn_dep); } } #endif } CTR3(KTR_SUJ, "softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d", newblkno, frags, oldfrags); ACQUIRE_LOCK(ump); if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0) panic("softdep_setup_blkmapdep: found block"); newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, newblkno), NULL); if (jnewblk) { jnewblk->jn_dep = (struct worklist *)newblk; LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps); } else { newblk->nb_state |= ONDEPLIST; LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps); } newblk->nb_bmsafemap = bmsafemap; newblk->nb_jnewblk = jnewblk; FREE_LOCK(ump); } #define BMSAFEMAP_HASH(ump, cg) \ (&(ump)->bmsafemap_hashtbl[(cg) & (ump)->bmsafemap_hash_size]) static int bmsafemap_find(bmsafemaphd, cg, bmsafemapp) struct bmsafemap_hashhead *bmsafemaphd; int cg; struct bmsafemap **bmsafemapp; { struct bmsafemap *bmsafemap; LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash) if (bmsafemap->sm_cg == cg) break; if (bmsafemap) { *bmsafemapp = bmsafemap; return (1); } *bmsafemapp = NULL; return (0); } /* * Find the bmsafemap associated with a cylinder group buffer. * If none exists, create one. The buffer must be locked when * this routine is called and this routine must be called with * the softdep lock held. To avoid giving up the lock while * allocating a new bmsafemap, a preallocated bmsafemap may be * provided. If it is provided but not needed, it is freed. */ static struct bmsafemap * bmsafemap_lookup(mp, bp, cg, newbmsafemap) struct mount *mp; struct buf *bp; int cg; struct bmsafemap *newbmsafemap; { struct bmsafemap_hashhead *bmsafemaphd; struct bmsafemap *bmsafemap, *collision; struct worklist *wk; struct ufsmount *ump; ump = VFSTOUFS(mp); LOCK_OWNED(ump); KASSERT(bp != NULL, ("bmsafemap_lookup: missing buffer")); LIST_FOREACH(wk, &bp->b_dep, wk_list) { if (wk->wk_type == D_BMSAFEMAP) { if (newbmsafemap) WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP); return (WK_BMSAFEMAP(wk)); } } bmsafemaphd = BMSAFEMAP_HASH(ump, cg); if (bmsafemap_find(bmsafemaphd, cg, &bmsafemap) == 1) { if (newbmsafemap) WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP); return (bmsafemap); } if (newbmsafemap) { bmsafemap = newbmsafemap; } else { FREE_LOCK(ump); bmsafemap = malloc(sizeof(struct bmsafemap), M_BMSAFEMAP, M_SOFTDEP_FLAGS); workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp); ACQUIRE_LOCK(ump); } bmsafemap->sm_buf = bp; LIST_INIT(&bmsafemap->sm_inodedephd); LIST_INIT(&bmsafemap->sm_inodedepwr); LIST_INIT(&bmsafemap->sm_newblkhd); LIST_INIT(&bmsafemap->sm_newblkwr); LIST_INIT(&bmsafemap->sm_jaddrefhd); LIST_INIT(&bmsafemap->sm_jnewblkhd); LIST_INIT(&bmsafemap->sm_freehd); LIST_INIT(&bmsafemap->sm_freewr); if (bmsafemap_find(bmsafemaphd, cg, &collision) == 1) { WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); return (collision); } bmsafemap->sm_cg = cg; LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash); LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next); WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list); return (bmsafemap); } /* * Direct block allocation dependencies. * * When a new block is allocated, the corresponding disk locations must be * initialized (with zeros or new data) before the on-disk inode points to * them. Also, the freemap from which the block was allocated must be * updated (on disk) before the inode's pointer. These two dependencies are * independent of each other and are needed for all file blocks and indirect * blocks that are pointed to directly by the inode. Just before the * "in-core" version of the inode is updated with a newly allocated block * number, a procedure (below) is called to setup allocation dependency * structures. These structures are removed when the corresponding * dependencies are satisfied or when the block allocation becomes obsolete * (i.e., the file is deleted, the block is de-allocated, or the block is a * fragment that gets upgraded). All of these cases are handled in * procedures described later. * * When a file extension causes a fragment to be upgraded, either to a larger * fragment or to a full block, the on-disk location may change (if the * previous fragment could not simply be extended). In this case, the old * fragment must be de-allocated, but not until after the inode's pointer has * been updated. In most cases, this is handled by later procedures, which * will construct a "freefrag" structure to be added to the workitem queue * when the inode update is complete (or obsolete). The main exception to * this is when an allocation occurs while a pending allocation dependency * (for the same block pointer) remains. This case is handled in the main * allocation dependency setup procedure by immediately freeing the * unreferenced fragments. */ void softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; /* inode to which block is being added */ ufs_lbn_t off; /* block pointer within inode */ ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 unless frag */ long newsize; /* size of new block */ long oldsize; /* size of new block */ struct buf *bp; /* bp for allocated block */ { struct allocdirect *adp, *oldadp; struct allocdirectlst *adphead; struct freefrag *freefrag; struct inodedep *inodedep; struct pagedep *pagedep; struct jnewblk *jnewblk; struct newblk *newblk; struct mount *mp; ufs_lbn_t lbn; lbn = bp->b_lblkno; mp = ITOVFS(ip); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_allocdirect called on non-softdep filesystem")); if (oldblkno && oldblkno != newblkno) /* * The usual case is that a smaller fragment that * was just allocated has been replaced with a bigger * fragment or a full-size block. If it is marked as * B_DELWRI, the current contents have not been written * to disk. It is possible that the block was written * earlier, but very uncommon. If the block has never * been written, there is no need to send a BIO_DELETE * for it when it is freed. The gain from avoiding the * TRIMs for the common case of unwritten blocks far * exceeds the cost of the write amplification for the * uncommon case of failing to send a TRIM for a block * that had been written. */ freefrag = newfreefrag(ip, oldblkno, oldsize, lbn, (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY); else freefrag = NULL; CTR6(KTR_SUJ, "softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd " "off %jd newsize %ld oldsize %d", ip->i_number, newblkno, oldblkno, off, newsize, oldsize); ACQUIRE_LOCK(ITOUMP(ip)); if (off >= UFS_NDADDR) { if (lbn > 0) panic("softdep_setup_allocdirect: bad lbn %jd, off %jd", lbn, off); /* allocating an indirect block */ if (oldblkno != 0) panic("softdep_setup_allocdirect: non-zero indir"); } else { if (off != lbn) panic("softdep_setup_allocdirect: lbn %jd != off %jd", lbn, off); /* * Allocating a direct block. * * If we are allocating a directory block, then we must * allocate an associated pagedep to track additions and * deletions. */ if ((ip->i_mode & IFMT) == IFDIR) pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC, &pagedep); } if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) panic("softdep_setup_allocdirect: lost block"); KASSERT(newblk->nb_list.wk_type == D_NEWBLK, ("softdep_setup_allocdirect: newblk already initialized")); /* * Convert the newblk to an allocdirect. */ WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT); adp = (struct allocdirect *)newblk; newblk->nb_freefrag = freefrag; adp->ad_offset = off; adp->ad_oldblkno = oldblkno; adp->ad_newsize = newsize; adp->ad_oldsize = oldsize; /* * Finish initializing the journal. */ if ((jnewblk = newblk->nb_jnewblk) != NULL) { jnewblk->jn_ino = ip->i_number; jnewblk->jn_lbn = lbn; add_to_journal(&jnewblk->jn_list); } if (freefrag && freefrag->ff_jdep != NULL && freefrag->ff_jdep->wk_type == D_JFREEFRAG) add_to_journal(freefrag->ff_jdep); inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); adp->ad_inodedep = inodedep; WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); /* * The list of allocdirects must be kept in sorted and ascending * order so that the rollback routines can quickly determine the * first uncommitted block (the size of the file stored on disk * ends at the end of the lowest committed fragment, or if there * are no fragments, at the end of the highest committed block). * Since files generally grow, the typical case is that the new * block is to be added at the end of the list. We speed this * special case by checking against the last allocdirect in the * list before laboriously traversing the list looking for the * insertion point. */ adphead = &inodedep->id_newinoupdt; oldadp = TAILQ_LAST(adphead, allocdirectlst); if (oldadp == NULL || oldadp->ad_offset <= off) { /* insert at end of list */ TAILQ_INSERT_TAIL(adphead, adp, ad_next); if (oldadp != NULL && oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(ITOUMP(ip)); return; } TAILQ_FOREACH(oldadp, adphead, ad_next) { if (oldadp->ad_offset >= off) break; } if (oldadp == NULL) panic("softdep_setup_allocdirect: lost entry"); /* insert in middle of list */ TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); if (oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(ITOUMP(ip)); } /* * Merge a newer and older journal record to be stored either in a * newblock or freefrag. This handles aggregating journal records for * fragment allocation into a second record as well as replacing a * journal free with an aborted journal allocation. A segment for the * oldest record will be placed on wkhd if it has been written. If not * the segment for the newer record will suffice. */ static struct worklist * jnewblk_merge(new, old, wkhd) struct worklist *new; struct worklist *old; struct workhead *wkhd; { struct jnewblk *njnewblk; struct jnewblk *jnewblk; /* Handle NULLs to simplify callers. */ if (new == NULL) return (old); if (old == NULL) return (new); /* Replace a jfreefrag with a jnewblk. */ if (new->wk_type == D_JFREEFRAG) { if (WK_JNEWBLK(old)->jn_blkno != WK_JFREEFRAG(new)->fr_blkno) panic("jnewblk_merge: blkno mismatch: %p, %p", old, new); cancel_jfreefrag(WK_JFREEFRAG(new)); return (old); } if (old->wk_type != D_JNEWBLK || new->wk_type != D_JNEWBLK) panic("jnewblk_merge: Bad type: old %d new %d\n", old->wk_type, new->wk_type); /* * Handle merging of two jnewblk records that describe * different sets of fragments in the same block. */ jnewblk = WK_JNEWBLK(old); njnewblk = WK_JNEWBLK(new); if (jnewblk->jn_blkno != njnewblk->jn_blkno) panic("jnewblk_merge: Merging disparate blocks."); /* * The record may be rolled back in the cg. */ if (jnewblk->jn_state & UNDONE) { jnewblk->jn_state &= ~UNDONE; njnewblk->jn_state |= UNDONE; njnewblk->jn_state &= ~ATTACHED; } /* * We modify the newer addref and free the older so that if neither * has been written the most up-to-date copy will be on disk. If * both have been written but rolled back we only temporarily need * one of them to fix the bits when the cg write completes. */ jnewblk->jn_state |= ATTACHED | COMPLETE; njnewblk->jn_oldfrags = jnewblk->jn_oldfrags; cancel_jnewblk(jnewblk, wkhd); WORKLIST_REMOVE(&jnewblk->jn_list); free_jnewblk(jnewblk); return (new); } /* * Replace an old allocdirect dependency with a newer one. */ static void allocdirect_merge(adphead, newadp, oldadp) struct allocdirectlst *adphead; /* head of list holding allocdirects */ struct allocdirect *newadp; /* allocdirect being added */ struct allocdirect *oldadp; /* existing allocdirect being checked */ { struct worklist *wk; struct freefrag *freefrag; freefrag = NULL; LOCK_OWNED(VFSTOUFS(newadp->ad_list.wk_mp)); if (newadp->ad_oldblkno != oldadp->ad_newblkno || newadp->ad_oldsize != oldadp->ad_newsize || newadp->ad_offset >= UFS_NDADDR) panic("%s %jd != new %jd || old size %ld != new %ld", "allocdirect_merge: old blkno", (intmax_t)newadp->ad_oldblkno, (intmax_t)oldadp->ad_newblkno, newadp->ad_oldsize, oldadp->ad_newsize); newadp->ad_oldblkno = oldadp->ad_oldblkno; newadp->ad_oldsize = oldadp->ad_oldsize; /* * If the old dependency had a fragment to free or had never * previously had a block allocated, then the new dependency * can immediately post its freefrag and adopt the old freefrag. * This action is done by swapping the freefrag dependencies. * The new dependency gains the old one's freefrag, and the * old one gets the new one and then immediately puts it on * the worklist when it is freed by free_newblk. It is * not possible to do this swap when the old dependency had a * non-zero size but no previous fragment to free. This condition * arises when the new block is an extension of the old block. * Here, the first part of the fragment allocated to the new * dependency is part of the block currently claimed on disk by * the old dependency, so cannot legitimately be freed until the * conditions for the new dependency are fulfilled. */ freefrag = newadp->ad_freefrag; if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) { newadp->ad_freefrag = oldadp->ad_freefrag; oldadp->ad_freefrag = freefrag; } /* * If we are tracking a new directory-block allocation, * move it from the old allocdirect to the new allocdirect. */ if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) { WORKLIST_REMOVE(wk); if (!LIST_EMPTY(&oldadp->ad_newdirblk)) panic("allocdirect_merge: extra newdirblk"); WORKLIST_INSERT(&newadp->ad_newdirblk, wk); } TAILQ_REMOVE(adphead, oldadp, ad_next); /* * We need to move any journal dependencies over to the freefrag * that releases this block if it exists. Otherwise we are * extending an existing block and we'll wait until that is * complete to release the journal space and extend the * new journal to cover this old space as well. */ if (freefrag == NULL) { if (oldadp->ad_newblkno != newadp->ad_newblkno) panic("allocdirect_merge: %jd != %jd", oldadp->ad_newblkno, newadp->ad_newblkno); newadp->ad_block.nb_jnewblk = (struct jnewblk *) jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list, &oldadp->ad_block.nb_jnewblk->jn_list, &newadp->ad_block.nb_jwork); oldadp->ad_block.nb_jnewblk = NULL; cancel_newblk(&oldadp->ad_block, NULL, &newadp->ad_block.nb_jwork); } else { wk = (struct worklist *) cancel_newblk(&oldadp->ad_block, &freefrag->ff_list, &freefrag->ff_jwork); freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk, &freefrag->ff_jwork); } free_newblk(&oldadp->ad_block); } /* * Allocate a jfreefrag structure to journal a single block free. */ static struct jfreefrag * newjfreefrag(freefrag, ip, blkno, size, lbn) struct freefrag *freefrag; struct inode *ip; ufs2_daddr_t blkno; long size; ufs_lbn_t lbn; { struct jfreefrag *jfreefrag; struct fs *fs; fs = ITOFS(ip); jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG, M_SOFTDEP_FLAGS); workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, ITOVFS(ip)); jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list); jfreefrag->fr_state = ATTACHED | DEPCOMPLETE; jfreefrag->fr_ino = ip->i_number; jfreefrag->fr_lbn = lbn; jfreefrag->fr_blkno = blkno; jfreefrag->fr_frags = numfrags(fs, size); jfreefrag->fr_freefrag = freefrag; return (jfreefrag); } /* * Allocate a new freefrag structure. */ static struct freefrag * newfreefrag(ip, blkno, size, lbn, key) struct inode *ip; ufs2_daddr_t blkno; long size; ufs_lbn_t lbn; u_long key; { struct freefrag *freefrag; struct ufsmount *ump; struct fs *fs; CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd", ip->i_number, blkno, size, lbn); ump = ITOUMP(ip); fs = ump->um_fs; if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag) panic("newfreefrag: frag size"); freefrag = malloc(sizeof(struct freefrag), M_FREEFRAG, M_SOFTDEP_FLAGS); workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ump)); freefrag->ff_state = ATTACHED; LIST_INIT(&freefrag->ff_jwork); freefrag->ff_inum = ip->i_number; freefrag->ff_vtype = ITOV(ip)->v_type; freefrag->ff_blkno = blkno; freefrag->ff_fragsize = size; freefrag->ff_key = key; if (MOUNTEDSUJ(UFSTOVFS(ump))) { freefrag->ff_jdep = (struct worklist *) newjfreefrag(freefrag, ip, blkno, size, lbn); } else { freefrag->ff_state |= DEPCOMPLETE; freefrag->ff_jdep = NULL; } return (freefrag); } /* * This workitem de-allocates fragments that were replaced during * file block allocation. */ static void handle_workitem_freefrag(freefrag) struct freefrag *freefrag; { struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp); struct workhead wkhd; CTR3(KTR_SUJ, "handle_workitem_freefrag: ino %d blkno %jd size %ld", freefrag->ff_inum, freefrag->ff_blkno, freefrag->ff_fragsize); /* * It would be illegal to add new completion items to the * freefrag after it was schedule to be done so it must be * safe to modify the list head here. */ LIST_INIT(&wkhd); ACQUIRE_LOCK(ump); LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list); /* * If the journal has not been written we must cancel it here. */ if (freefrag->ff_jdep) { if (freefrag->ff_jdep->wk_type != D_JNEWBLK) panic("handle_workitem_freefrag: Unexpected type %d\n", freefrag->ff_jdep->wk_type); cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd); } FREE_LOCK(ump); ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno, freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd, freefrag->ff_key); ACQUIRE_LOCK(ump); WORKITEM_FREE(freefrag, D_FREEFRAG); FREE_LOCK(ump); } /* * Set up a dependency structure for an external attributes data block. * This routine follows much of the structure of softdep_setup_allocdirect. * See the description of softdep_setup_allocdirect above for details. */ void softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp) struct inode *ip; ufs_lbn_t off; ufs2_daddr_t newblkno; ufs2_daddr_t oldblkno; long newsize; long oldsize; struct buf *bp; { struct allocdirect *adp, *oldadp; struct allocdirectlst *adphead; struct freefrag *freefrag; struct inodedep *inodedep; struct jnewblk *jnewblk; struct newblk *newblk; struct mount *mp; struct ufsmount *ump; ufs_lbn_t lbn; mp = ITOVFS(ip); ump = VFSTOUFS(mp); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_allocext called on non-softdep filesystem")); KASSERT(off < UFS_NXADDR, ("softdep_setup_allocext: lbn %lld > UFS_NXADDR", (long long)off)); lbn = bp->b_lblkno; if (oldblkno && oldblkno != newblkno) /* * The usual case is that a smaller fragment that * was just allocated has been replaced with a bigger * fragment or a full-size block. If it is marked as * B_DELWRI, the current contents have not been written * to disk. It is possible that the block was written * earlier, but very uncommon. If the block has never * been written, there is no need to send a BIO_DELETE * for it when it is freed. The gain from avoiding the * TRIMs for the common case of unwritten blocks far * exceeds the cost of the write amplification for the * uncommon case of failing to send a TRIM for a block * that had been written. */ freefrag = newfreefrag(ip, oldblkno, oldsize, lbn, (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY); else freefrag = NULL; ACQUIRE_LOCK(ump); if (newblk_lookup(mp, newblkno, 0, &newblk) == 0) panic("softdep_setup_allocext: lost block"); KASSERT(newblk->nb_list.wk_type == D_NEWBLK, ("softdep_setup_allocext: newblk already initialized")); /* * Convert the newblk to an allocdirect. */ WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT); adp = (struct allocdirect *)newblk; newblk->nb_freefrag = freefrag; adp->ad_offset = off; adp->ad_oldblkno = oldblkno; adp->ad_newsize = newsize; adp->ad_oldsize = oldsize; adp->ad_state |= EXTDATA; /* * Finish initializing the journal. */ if ((jnewblk = newblk->nb_jnewblk) != NULL) { jnewblk->jn_ino = ip->i_number; jnewblk->jn_lbn = lbn; add_to_journal(&jnewblk->jn_list); } if (freefrag && freefrag->ff_jdep != NULL && freefrag->ff_jdep->wk_type == D_JFREEFRAG) add_to_journal(freefrag->ff_jdep); inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); adp->ad_inodedep = inodedep; WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list); /* * The list of allocdirects must be kept in sorted and ascending * order so that the rollback routines can quickly determine the * first uncommitted block (the size of the file stored on disk * ends at the end of the lowest committed fragment, or if there * are no fragments, at the end of the highest committed block). * Since files generally grow, the typical case is that the new * block is to be added at the end of the list. We speed this * special case by checking against the last allocdirect in the * list before laboriously traversing the list looking for the * insertion point. */ adphead = &inodedep->id_newextupdt; oldadp = TAILQ_LAST(adphead, allocdirectlst); if (oldadp == NULL || oldadp->ad_offset <= off) { /* insert at end of list */ TAILQ_INSERT_TAIL(adphead, adp, ad_next); if (oldadp != NULL && oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(ump); return; } TAILQ_FOREACH(oldadp, adphead, ad_next) { if (oldadp->ad_offset >= off) break; } if (oldadp == NULL) panic("softdep_setup_allocext: lost entry"); /* insert in middle of list */ TAILQ_INSERT_BEFORE(oldadp, adp, ad_next); if (oldadp->ad_offset == off) allocdirect_merge(adphead, adp, oldadp); FREE_LOCK(ump); } /* * Indirect block allocation dependencies. * * The same dependencies that exist for a direct block also exist when * a new block is allocated and pointed to by an entry in a block of * indirect pointers. The undo/redo states described above are also * used here. Because an indirect block contains many pointers that * may have dependencies, a second copy of the entire in-memory indirect * block is kept. The buffer cache copy is always completely up-to-date. * The second copy, which is used only as a source for disk writes, * contains only the safe pointers (i.e., those that have no remaining * update dependencies). The second copy is freed when all pointers * are safe. The cache is not allowed to replace indirect blocks with * pending update dependencies. If a buffer containing an indirect * block with dependencies is written, these routines will mark it * dirty again. It can only be successfully written once all the * dependencies are removed. The ffs_fsync routine in conjunction with * softdep_sync_metadata work together to get all the dependencies * removed so that a file can be successfully written to disk. Three * procedures are used when setting up indirect block pointer * dependencies. The division is necessary because of the organization * of the "balloc" routine and because of the distinction between file * pages and file metadata blocks. */ /* * Allocate a new allocindir structure. */ static struct allocindir * newallocindir(ip, ptrno, newblkno, oldblkno, lbn) struct inode *ip; /* inode for file being extended */ int ptrno; /* offset of pointer in indirect block */ ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ ufs_lbn_t lbn; { struct newblk *newblk; struct allocindir *aip; struct freefrag *freefrag; struct jnewblk *jnewblk; if (oldblkno) freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn, SINGLETON_KEY); else freefrag = NULL; ACQUIRE_LOCK(ITOUMP(ip)); if (newblk_lookup(ITOVFS(ip), newblkno, 0, &newblk) == 0) panic("new_allocindir: lost block"); KASSERT(newblk->nb_list.wk_type == D_NEWBLK, ("newallocindir: newblk already initialized")); WORKITEM_REASSIGN(newblk, D_ALLOCINDIR); newblk->nb_freefrag = freefrag; aip = (struct allocindir *)newblk; aip->ai_offset = ptrno; aip->ai_oldblkno = oldblkno; aip->ai_lbn = lbn; if ((jnewblk = newblk->nb_jnewblk) != NULL) { jnewblk->jn_ino = ip->i_number; jnewblk->jn_lbn = lbn; add_to_journal(&jnewblk->jn_list); } if (freefrag && freefrag->ff_jdep != NULL && freefrag->ff_jdep->wk_type == D_JFREEFRAG) add_to_journal(freefrag->ff_jdep); return (aip); } /* * Called just before setting an indirect block pointer * to a newly allocated file page. */ void softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp) struct inode *ip; /* inode for file being extended */ ufs_lbn_t lbn; /* allocated block number within file */ struct buf *bp; /* buffer with indirect blk referencing page */ int ptrno; /* offset of pointer in indirect block */ ufs2_daddr_t newblkno; /* disk block number being added */ ufs2_daddr_t oldblkno; /* previous block number, 0 if none */ struct buf *nbp; /* buffer holding allocated page */ { struct inodedep *inodedep; struct freefrag *freefrag; struct allocindir *aip; struct pagedep *pagedep; struct mount *mp; struct ufsmount *ump; mp = ITOVFS(ip); ump = VFSTOUFS(mp); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_allocindir_page called on non-softdep filesystem")); KASSERT(lbn == nbp->b_lblkno, ("softdep_setup_allocindir_page: lbn %jd != lblkno %jd", lbn, bp->b_lblkno)); CTR4(KTR_SUJ, "softdep_setup_allocindir_page: ino %d blkno %jd oldblkno %jd " "lbn %jd", ip->i_number, newblkno, oldblkno, lbn); ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page"); aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn); (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); /* * If we are allocating a directory page, then we must * allocate an associated pagedep to track additions and * deletions. */ if ((ip->i_mode & IFMT) == IFDIR) pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep); WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn); FREE_LOCK(ump); if (freefrag) handle_workitem_freefrag(freefrag); } /* * Called just before setting an indirect block pointer to a * newly allocated indirect block. */ void softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno) struct buf *nbp; /* newly allocated indirect block */ struct inode *ip; /* inode for file being extended */ struct buf *bp; /* indirect block referencing allocated block */ int ptrno; /* offset of pointer in indirect block */ ufs2_daddr_t newblkno; /* disk block number being added */ { struct inodedep *inodedep; struct allocindir *aip; struct ufsmount *ump; ufs_lbn_t lbn; ump = ITOUMP(ip); KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_setup_allocindir_meta called on non-softdep filesystem")); CTR3(KTR_SUJ, "softdep_setup_allocindir_meta: ino %d blkno %jd ptrno %d", ip->i_number, newblkno, ptrno); lbn = nbp->b_lblkno; ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta"); aip = newallocindir(ip, ptrno, newblkno, 0, lbn); inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep); WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list); if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)) panic("softdep_setup_allocindir_meta: Block already existed"); FREE_LOCK(ump); } static void indirdep_complete(indirdep) struct indirdep *indirdep; { struct allocindir *aip; LIST_REMOVE(indirdep, ir_next); indirdep->ir_state |= DEPCOMPLETE; while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) { LIST_REMOVE(aip, ai_next); free_newblk(&aip->ai_block); } /* * If this indirdep is not attached to a buf it was simply waiting * on completion to clear completehd. free_indirdep() asserts * that nothing is dangling. */ if ((indirdep->ir_state & ONWORKLIST) == 0) free_indirdep(indirdep); } static struct indirdep * indirdep_lookup(mp, ip, bp) struct mount *mp; struct inode *ip; struct buf *bp; { struct indirdep *indirdep, *newindirdep; struct newblk *newblk; struct ufsmount *ump; struct worklist *wk; struct fs *fs; ufs2_daddr_t blkno; ump = VFSTOUFS(mp); LOCK_OWNED(ump); indirdep = NULL; newindirdep = NULL; fs = ump->um_fs; for (;;) { LIST_FOREACH(wk, &bp->b_dep, wk_list) { if (wk->wk_type != D_INDIRDEP) continue; indirdep = WK_INDIRDEP(wk); break; } /* Found on the buffer worklist, no new structure to free. */ if (indirdep != NULL && newindirdep == NULL) return (indirdep); if (indirdep != NULL && newindirdep != NULL) panic("indirdep_lookup: simultaneous create"); /* None found on the buffer and a new structure is ready. */ if (indirdep == NULL && newindirdep != NULL) break; /* None found and no new structure available. */ FREE_LOCK(ump); newindirdep = malloc(sizeof(struct indirdep), M_INDIRDEP, M_SOFTDEP_FLAGS); workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp); newindirdep->ir_state = ATTACHED; if (I_IS_UFS1(ip)) newindirdep->ir_state |= UFS1FMT; TAILQ_INIT(&newindirdep->ir_trunc); newindirdep->ir_saveddata = NULL; LIST_INIT(&newindirdep->ir_deplisthd); LIST_INIT(&newindirdep->ir_donehd); LIST_INIT(&newindirdep->ir_writehd); LIST_INIT(&newindirdep->ir_completehd); if (bp->b_blkno == bp->b_lblkno) { ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp, NULL, NULL); bp->b_blkno = blkno; } newindirdep->ir_freeblks = NULL; newindirdep->ir_savebp = getblk(ump->um_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0); newindirdep->ir_bp = bp; BUF_KERNPROC(newindirdep->ir_savebp); bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount); ACQUIRE_LOCK(ump); } indirdep = newindirdep; WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list); /* * If the block is not yet allocated we don't set DEPCOMPLETE so * that we don't free dependencies until the pointers are valid. * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather * than using the hash. */ if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)) LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next); else indirdep->ir_state |= DEPCOMPLETE; return (indirdep); } /* * Called to finish the allocation of the "aip" allocated * by one of the two routines above. */ static struct freefrag * setup_allocindir_phase2(bp, ip, inodedep, aip, lbn) struct buf *bp; /* in-memory copy of the indirect block */ struct inode *ip; /* inode for file being extended */ struct inodedep *inodedep; /* Inodedep for ip */ struct allocindir *aip; /* allocindir allocated by the above routines */ ufs_lbn_t lbn; /* Logical block number for this block. */ { struct fs *fs; struct indirdep *indirdep; struct allocindir *oldaip; struct freefrag *freefrag; struct mount *mp; struct ufsmount *ump; mp = ITOVFS(ip); ump = VFSTOUFS(mp); LOCK_OWNED(ump); fs = ump->um_fs; if (bp->b_lblkno >= 0) panic("setup_allocindir_phase2: not indir blk"); KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs), ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset)); indirdep = indirdep_lookup(mp, ip, bp); KASSERT(indirdep->ir_savebp != NULL, ("setup_allocindir_phase2 NULL ir_savebp")); aip->ai_indirdep = indirdep; /* * Check for an unwritten dependency for this indirect offset. If * there is, merge the old dependency into the new one. This happens * as a result of reallocblk only. */ freefrag = NULL; if (aip->ai_oldblkno != 0) { LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) { if (oldaip->ai_offset == aip->ai_offset) { freefrag = allocindir_merge(aip, oldaip); goto done; } } LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) { if (oldaip->ai_offset == aip->ai_offset) { freefrag = allocindir_merge(aip, oldaip); goto done; } } } done: LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next); return (freefrag); } /* * Merge two allocindirs which refer to the same block. Move newblock * dependencies and setup the freefrags appropriately. */ static struct freefrag * allocindir_merge(aip, oldaip) struct allocindir *aip; struct allocindir *oldaip; { struct freefrag *freefrag; struct worklist *wk; if (oldaip->ai_newblkno != aip->ai_oldblkno) panic("allocindir_merge: blkno"); aip->ai_oldblkno = oldaip->ai_oldblkno; freefrag = aip->ai_freefrag; aip->ai_freefrag = oldaip->ai_freefrag; oldaip->ai_freefrag = NULL; KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag")); /* * If we are tracking a new directory-block allocation, * move it from the old allocindir to the new allocindir. */ if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) { WORKLIST_REMOVE(wk); if (!LIST_EMPTY(&oldaip->ai_newdirblk)) panic("allocindir_merge: extra newdirblk"); WORKLIST_INSERT(&aip->ai_newdirblk, wk); } /* * We can skip journaling for this freefrag and just complete * any pending journal work for the allocindir that is being * removed after the freefrag completes. */ if (freefrag->ff_jdep) cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep)); LIST_REMOVE(oldaip, ai_next); freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block, &freefrag->ff_list, &freefrag->ff_jwork); free_newblk(&oldaip->ai_block); return (freefrag); } static inline void setup_freedirect(freeblks, ip, i, needj) struct freeblks *freeblks; struct inode *ip; int i; int needj; { struct ufsmount *ump; ufs2_daddr_t blkno; int frags; blkno = DIP(ip, i_db[i]); if (blkno == 0) return; DIP_SET(ip, i_db[i], 0); ump = ITOUMP(ip); frags = sblksize(ump->um_fs, ip->i_size, i); frags = numfrags(ump->um_fs, frags); newfreework(ump, freeblks, NULL, i, blkno, frags, 0, needj); } static inline void setup_freeext(freeblks, ip, i, needj) struct freeblks *freeblks; struct inode *ip; int i; int needj; { struct ufsmount *ump; ufs2_daddr_t blkno; int frags; blkno = ip->i_din2->di_extb[i]; if (blkno == 0) return; ip->i_din2->di_extb[i] = 0; ump = ITOUMP(ip); frags = sblksize(ump->um_fs, ip->i_din2->di_extsize, i); frags = numfrags(ump->um_fs, frags); newfreework(ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj); } static inline void setup_freeindir(freeblks, ip, i, lbn, needj) struct freeblks *freeblks; struct inode *ip; int i; ufs_lbn_t lbn; int needj; { struct ufsmount *ump; ufs2_daddr_t blkno; blkno = DIP(ip, i_ib[i]); if (blkno == 0) return; DIP_SET(ip, i_ib[i], 0); ump = ITOUMP(ip); newfreework(ump, freeblks, NULL, lbn, blkno, ump->um_fs->fs_frag, 0, needj); } static inline struct freeblks * newfreeblks(mp, ip) struct mount *mp; struct inode *ip; { struct freeblks *freeblks; freeblks = malloc(sizeof(struct freeblks), M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp); LIST_INIT(&freeblks->fb_jblkdephd); LIST_INIT(&freeblks->fb_jwork); freeblks->fb_ref = 0; freeblks->fb_cgwait = 0; freeblks->fb_state = ATTACHED; freeblks->fb_uid = ip->i_uid; freeblks->fb_inum = ip->i_number; freeblks->fb_vtype = ITOV(ip)->v_type; freeblks->fb_modrev = DIP(ip, i_modrev); freeblks->fb_devvp = ITODEVVP(ip); freeblks->fb_chkcnt = 0; freeblks->fb_len = 0; return (freeblks); } static void trunc_indirdep(indirdep, freeblks, bp, off) struct indirdep *indirdep; struct freeblks *freeblks; struct buf *bp; int off; { struct allocindir *aip, *aipn; /* * The first set of allocindirs won't be in savedbp. */ LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn) if (aip->ai_offset > off) cancel_allocindir(aip, bp, freeblks, 1); LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn) if (aip->ai_offset > off) cancel_allocindir(aip, bp, freeblks, 1); /* * These will exist in savedbp. */ LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn) if (aip->ai_offset > off) cancel_allocindir(aip, NULL, freeblks, 0); LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn) if (aip->ai_offset > off) cancel_allocindir(aip, NULL, freeblks, 0); } /* * Follow the chain of indirects down to lastlbn creating a freework * structure for each. This will be used to start indir_trunc() at * the right offset and create the journal records for the parrtial * truncation. A second step will handle the truncated dependencies. */ static int setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno) struct freeblks *freeblks; struct inode *ip; ufs_lbn_t lbn; ufs_lbn_t lastlbn; ufs2_daddr_t blkno; { struct indirdep *indirdep; struct indirdep *indirn; struct freework *freework; struct newblk *newblk; struct mount *mp; struct ufsmount *ump; struct buf *bp; uint8_t *start; uint8_t *end; ufs_lbn_t lbnadd; int level; int error; int off; freework = NULL; if (blkno == 0) return (0); mp = freeblks->fb_list.wk_mp; ump = VFSTOUFS(mp); /* * Here, calls to VOP_BMAP() will fail. However, we already have * the on-disk address, so we just pass it to bread() instead of * having bread() attempt to calculate it using VOP_BMAP(). */ error = ffs_breadz(ump, ITOV(ip), lbn, blkptrtodb(ump, blkno), (int)mp->mnt_stat.f_iosize, NULL, NULL, 0, NOCRED, 0, NULL, &bp); if (error) return (error); level = lbn_level(lbn); lbnadd = lbn_offset(ump->um_fs, level); /* * Compute the offset of the last block we want to keep. Store * in the freework the first block we want to completely free. */ off = (lastlbn - -(lbn + level)) / lbnadd; if (off + 1 == NINDIR(ump->um_fs)) goto nowork; freework = newfreework(ump, freeblks, NULL, lbn, blkno, 0, off + 1, 0); /* * Link the freework into the indirdep. This will prevent any new * allocations from proceeding until we are finished with the * truncate and the block is written. */ ACQUIRE_LOCK(ump); indirdep = indirdep_lookup(mp, ip, bp); if (indirdep->ir_freeblks) panic("setup_trunc_indir: indirdep already truncated."); TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next); freework->fw_indir = indirdep; /* * Cancel any allocindirs that will not make it to disk. * We have to do this for all copies of the indirdep that * live on this newblk. */ if ((indirdep->ir_state & DEPCOMPLETE) == 0) { if (newblk_lookup(mp, dbtofsb(ump->um_fs, bp->b_blkno), 0, &newblk) == 0) panic("setup_trunc_indir: lost block"); LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next) trunc_indirdep(indirn, freeblks, bp, off); } else trunc_indirdep(indirdep, freeblks, bp, off); FREE_LOCK(ump); /* * Creation is protected by the buf lock. The saveddata is only * needed if a full truncation follows a partial truncation but it * is difficult to allocate in that case so we fetch it anyway. */ if (indirdep->ir_saveddata == NULL) indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP, M_SOFTDEP_FLAGS); nowork: /* Fetch the blkno of the child and the zero start offset. */ if (I_IS_UFS1(ip)) { blkno = ((ufs1_daddr_t *)bp->b_data)[off]; start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1]; } else { blkno = ((ufs2_daddr_t *)bp->b_data)[off]; start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1]; } if (freework) { /* Zero the truncated pointers. */ end = bp->b_data + bp->b_bcount; bzero(start, end - start); bdwrite(bp); } else bqrelse(bp); if (level == 0) return (0); lbn++; /* adjust level */ lbn -= (off * lbnadd); return setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno); } /* * Complete the partial truncation of an indirect block setup by * setup_trunc_indir(). This zeros the truncated pointers in the saved * copy and writes them to disk before the freeblks is allowed to complete. */ static void complete_trunc_indir(freework) struct freework *freework; { struct freework *fwn; struct indirdep *indirdep; struct ufsmount *ump; struct buf *bp; uintptr_t start; int count; ump = VFSTOUFS(freework->fw_list.wk_mp); LOCK_OWNED(ump); indirdep = freework->fw_indir; for (;;) { bp = indirdep->ir_bp; /* See if the block was discarded. */ if (bp == NULL) break; /* Inline part of getdirtybuf(). We dont want bremfree. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) break; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, LOCK_PTR(ump)) == 0) BUF_UNLOCK(bp); ACQUIRE_LOCK(ump); } freework->fw_state |= DEPCOMPLETE; TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next); /* * Zero the pointers in the saved copy. */ if (indirdep->ir_state & UFS1FMT) start = sizeof(ufs1_daddr_t); else start = sizeof(ufs2_daddr_t); start *= freework->fw_start; count = indirdep->ir_savebp->b_bcount - start; start += (uintptr_t)indirdep->ir_savebp->b_data; bzero((char *)start, count); /* * We need to start the next truncation in the list if it has not * been started yet. */ fwn = TAILQ_FIRST(&indirdep->ir_trunc); if (fwn != NULL) { if (fwn->fw_freeblks == indirdep->ir_freeblks) TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next); if ((fwn->fw_state & ONWORKLIST) == 0) freework_enqueue(fwn); } /* * If bp is NULL the block was fully truncated, restore * the saved block list otherwise free it if it is no * longer needed. */ if (TAILQ_EMPTY(&indirdep->ir_trunc)) { if (bp == NULL) bcopy(indirdep->ir_saveddata, indirdep->ir_savebp->b_data, indirdep->ir_savebp->b_bcount); free(indirdep->ir_saveddata, M_INDIRDEP); indirdep->ir_saveddata = NULL; } /* * When bp is NULL there is a full truncation pending. We * must wait for this full truncation to be journaled before * we can release this freework because the disk pointers will * never be written as zero. */ if (bp == NULL) { if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd)) handle_written_freework(freework); else WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd, &freework->fw_list); if (fwn == NULL) { freework->fw_indir = (void *)0x0000deadbeef0000; bp = indirdep->ir_savebp; indirdep->ir_savebp = NULL; free_indirdep(indirdep); FREE_LOCK(ump); brelse(bp); ACQUIRE_LOCK(ump); } } else { /* Complete when the real copy is written. */ WORKLIST_INSERT(&bp->b_dep, &freework->fw_list); BUF_UNLOCK(bp); } } /* * Calculate the number of blocks we are going to release where datablocks * is the current total and length is the new file size. */ static ufs2_daddr_t blkcount(fs, datablocks, length) struct fs *fs; ufs2_daddr_t datablocks; off_t length; { off_t totblks, numblks; totblks = 0; numblks = howmany(length, fs->fs_bsize); if (numblks <= UFS_NDADDR) { totblks = howmany(length, fs->fs_fsize); goto out; } totblks = blkstofrags(fs, numblks); numblks -= UFS_NDADDR; /* * Count all single, then double, then triple indirects required. * Subtracting one indirects worth of blocks for each pass * acknowledges one of each pointed to by the inode. */ for (;;) { totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs))); numblks -= NINDIR(fs); if (numblks <= 0) break; numblks = howmany(numblks, NINDIR(fs)); } out: totblks = fsbtodb(fs, totblks); /* * Handle sparse files. We can't reclaim more blocks than the inode * references. We will correct it later in handle_complete_freeblks() * when we know the real count. */ if (totblks > datablocks) return (0); return (datablocks - totblks); } /* * Handle freeblocks for journaled softupdate filesystems. * * Contrary to normal softupdates, we must preserve the block pointers in * indirects until their subordinates are free. This is to avoid journaling * every block that is freed which may consume more space than the journal * itself. The recovery program will see the free block journals at the * base of the truncated area and traverse them to reclaim space. The * pointers in the inode may be cleared immediately after the journal * records are written because each direct and indirect pointer in the * inode is recorded in a journal. This permits full truncation to proceed * asynchronously. The write order is journal -> inode -> cgs -> indirects. * * The algorithm is as follows: * 1) Traverse the in-memory state and create journal entries to release * the relevant blocks and full indirect trees. * 2) Traverse the indirect block chain adding partial truncation freework * records to indirects in the path to lastlbn. The freework will * prevent new allocation dependencies from being satisfied in this * indirect until the truncation completes. * 3) Read and lock the inode block, performing an update with the new size * and pointers. This prevents truncated data from becoming valid on * disk through step 4. * 4) Reap unsatisfied dependencies that are beyond the truncated area, * eliminate journal work for those records that do not require it. * 5) Schedule the journal records to be written followed by the inode block. * 6) Allocate any necessary frags for the end of file. * 7) Zero any partially truncated blocks. * * From this truncation proceeds asynchronously using the freework and * indir_trunc machinery. The file will not be extended again into a * partially truncated indirect block until all work is completed but * the normal dependency mechanism ensures that it is rolled back/forward * as appropriate. Further truncation may occur without delay and is * serialized in indir_trunc(). */ void softdep_journal_freeblocks(ip, cred, length, flags) struct inode *ip; /* The inode whose length is to be reduced */ struct ucred *cred; off_t length; /* The new length for the file */ int flags; /* IO_EXT and/or IO_NORMAL */ { struct freeblks *freeblks, *fbn; struct worklist *wk, *wkn; struct inodedep *inodedep; struct jblkdep *jblkdep; struct allocdirect *adp, *adpn; struct ufsmount *ump; struct fs *fs; struct buf *bp; struct vnode *vp; struct mount *mp; daddr_t dbn; ufs2_daddr_t extblocks, datablocks; ufs_lbn_t tmpval, lbn, lastlbn; int frags, lastoff, iboff, allocblock, needj, error, i; ump = ITOUMP(ip); mp = UFSTOVFS(ump); fs = ump->um_fs; KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_journal_freeblocks called on non-softdep filesystem")); vp = ITOV(ip); needj = 1; iboff = -1; allocblock = 0; extblocks = 0; datablocks = 0; frags = 0; freeblks = newfreeblks(mp, ip); ACQUIRE_LOCK(ump); /* * If we're truncating a removed file that will never be written * we don't need to journal the block frees. The canceled journals * for the allocations will suffice. */ inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED && length == 0) needj = 0; CTR3(KTR_SUJ, "softdep_journal_freeblks: ip %d length %ld needj %d", ip->i_number, length, needj); FREE_LOCK(ump); /* * Calculate the lbn that we are truncating to. This results in -1 * if we're truncating the 0 bytes. So it is the last lbn we want * to keep, not the first lbn we want to truncate. */ lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1; lastoff = blkoff(fs, length); /* * Compute frags we are keeping in lastlbn. 0 means all. */ if (lastlbn >= 0 && lastlbn < UFS_NDADDR) { frags = fragroundup(fs, lastoff); /* adp offset of last valid allocdirect. */ iboff = lastlbn; } else if (lastlbn > 0) iboff = UFS_NDADDR; if (fs->fs_magic == FS_UFS2_MAGIC) extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); /* * Handle normal data blocks and indirects. This section saves * values used after the inode update to complete frag and indirect * truncation. */ if ((flags & IO_NORMAL) != 0) { /* * Handle truncation of whole direct and indirect blocks. */ for (i = iboff + 1; i < UFS_NDADDR; i++) setup_freedirect(freeblks, ip, i, needj); for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR; i < UFS_NIADDR; i++, lbn += tmpval, tmpval *= NINDIR(fs)) { /* Release a whole indirect tree. */ if (lbn > lastlbn) { setup_freeindir(freeblks, ip, i, -lbn -i, needj); continue; } iboff = i + UFS_NDADDR; /* * Traverse partially truncated indirect tree. */ if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn) setup_trunc_indir(freeblks, ip, -lbn - i, lastlbn, DIP(ip, i_ib[i])); } /* * Handle partial truncation to a frag boundary. */ if (frags) { ufs2_daddr_t blkno; long oldfrags; oldfrags = blksize(fs, ip, lastlbn); blkno = DIP(ip, i_db[lastlbn]); if (blkno && oldfrags != frags) { oldfrags -= frags; oldfrags = numfrags(fs, oldfrags); blkno += numfrags(fs, frags); newfreework(ump, freeblks, NULL, lastlbn, blkno, oldfrags, 0, needj); if (needj) adjust_newfreework(freeblks, numfrags(fs, frags)); } else if (blkno == 0) allocblock = 1; } /* * Add a journal record for partial truncate if we are * handling indirect blocks. Non-indirects need no extra * journaling. */ if (length != 0 && lastlbn >= UFS_NDADDR) { UFS_INODE_SET_FLAG(ip, IN_TRUNCATED); newjtrunc(freeblks, length, 0); } ip->i_size = length; DIP_SET(ip, i_size, ip->i_size); UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE); datablocks = DIP(ip, i_blocks) - extblocks; if (length != 0) datablocks = blkcount(fs, datablocks, length); freeblks->fb_len = length; } if ((flags & IO_EXT) != 0) { for (i = 0; i < UFS_NXADDR; i++) setup_freeext(freeblks, ip, i, needj); ip->i_din2->di_extsize = 0; datablocks += extblocks; UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE); } #ifdef QUOTA /* Reference the quotas in case the block count is wrong in the end. */ quotaref(vp, freeblks->fb_quota); (void) chkdq(ip, -datablocks, NOCRED, FORCE); #endif freeblks->fb_chkcnt = -datablocks; UFS_LOCK(ump); fs->fs_pendingblocks += datablocks; UFS_UNLOCK(ump); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks); /* * Handle truncation of incomplete alloc direct dependencies. We * hold the inode block locked to prevent incomplete dependencies * from reaching the disk while we are eliminating those that * have been truncated. This is a partially inlined ffs_update(). */ ufs_itimes(vp); ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED); dbn = fsbtodb(fs, ino_to_fsba(fs, ip->i_number)); error = ffs_breadz(ump, ump->um_devvp, dbn, dbn, (int)fs->fs_bsize, NULL, NULL, 0, cred, 0, NULL, &bp); if (error) { softdep_error("softdep_journal_freeblocks", error); return; } if (bp->b_bufsize == fs->fs_bsize) bp->b_flags |= B_CLUSTEROK; softdep_update_inodeblock(ip, bp, 0); if (ump->um_fstype == UFS1) { *((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1; } else { ffs_update_dinode_ckhash(fs, ip->i_din2); *((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2; } ACQUIRE_LOCK(ump); (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); if ((inodedep->id_state & IOSTARTED) != 0) panic("softdep_setup_freeblocks: inode busy"); /* * Add the freeblks structure to the list of operations that * must await the zero'ed inode being written to disk. If we * still have a bitmap dependency (needj), then the inode * has never been written to disk, so we can process the * freeblks below once we have deleted the dependencies. */ if (needj) WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list); else freeblks->fb_state |= COMPLETE; if ((flags & IO_NORMAL) != 0) { TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) { if (adp->ad_offset > iboff) cancel_allocdirect(&inodedep->id_inoupdt, adp, freeblks); /* * Truncate the allocdirect. We could eliminate * or modify journal records as well. */ else if (adp->ad_offset == iboff && frags) adp->ad_newsize = frags; } } if ((flags & IO_EXT) != 0) while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) cancel_allocdirect(&inodedep->id_extupdt, adp, freeblks); /* * Scan the bufwait list for newblock dependencies that will never * make it to disk. */ LIST_FOREACH_SAFE(wk, &inodedep->id_bufwait, wk_list, wkn) { if (wk->wk_type != D_ALLOCDIRECT) continue; adp = WK_ALLOCDIRECT(wk); if (((flags & IO_NORMAL) != 0 && (adp->ad_offset > iboff)) || ((flags & IO_EXT) != 0 && (adp->ad_state & EXTDATA))) { cancel_jfreeblk(freeblks, adp->ad_newblkno); cancel_newblk(WK_NEWBLK(wk), NULL, &freeblks->fb_jwork); WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk); } } /* * Add journal work. */ LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) add_to_journal(&jblkdep->jb_list); FREE_LOCK(ump); bdwrite(bp); /* * Truncate dependency structures beyond length. */ trunc_dependencies(ip, freeblks, lastlbn, frags, flags); /* * This is only set when we need to allocate a fragment because * none existed at the end of a frag-sized file. It handles only * allocating a new, zero filled block. */ if (allocblock) { ip->i_size = length - lastoff; DIP_SET(ip, i_size, ip->i_size); error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp); if (error != 0) { softdep_error("softdep_journal_freeblks", error); return; } ip->i_size = length; DIP_SET(ip, i_size, length); UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_UPDATE); allocbuf(bp, frags); ffs_update(vp, 0); bawrite(bp); } else if (lastoff != 0 && vp->v_type != VDIR) { int size; /* * Zero the end of a truncated frag or block. */ size = sblksize(fs, length, lastlbn); error = bread(vp, lastlbn, size, cred, &bp); if (error == 0) { bzero((char *)bp->b_data + lastoff, size - lastoff); bawrite(bp); } else if (!ffs_fsfail_cleanup(ump, error)) { softdep_error("softdep_journal_freeblks", error); return; } } ACQUIRE_LOCK(ump); inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next); freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST; /* * We zero earlier truncations so they don't erroneously * update i_blocks. */ if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0) TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next) fbn->fb_len = 0; if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd)) freeblks->fb_state |= INPROGRESS; else freeblks = NULL; FREE_LOCK(ump); if (freeblks) handle_workitem_freeblocks(freeblks, 0); trunc_pages(ip, length, extblocks, flags); } /* * Flush a JOP_SYNC to the journal. */ void softdep_journal_fsync(ip) struct inode *ip; { struct jfsync *jfsync; struct ufsmount *ump; ump = ITOUMP(ip); KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_journal_fsync called on non-softdep filesystem")); if ((ip->i_flag & IN_TRUNCATED) == 0) return; ip->i_flag &= ~IN_TRUNCATED; jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO); workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ump)); jfsync->jfs_size = ip->i_size; jfsync->jfs_ino = ip->i_number; ACQUIRE_LOCK(ump); add_to_journal(&jfsync->jfs_list); jwait(&jfsync->jfs_list, MNT_WAIT); FREE_LOCK(ump); } /* * Block de-allocation dependencies. * * When blocks are de-allocated, the on-disk pointers must be nullified before * the blocks are made available for use by other files. (The true * requirement is that old pointers must be nullified before new on-disk * pointers are set. We chose this slightly more stringent requirement to * reduce complexity.) Our implementation handles this dependency by updating * the inode (or indirect block) appropriately but delaying the actual block * de-allocation (i.e., freemap and free space count manipulation) until * after the updated versions reach stable storage. After the disk is * updated, the blocks can be safely de-allocated whenever it is convenient. * This implementation handles only the common case of reducing a file's * length to zero. Other cases are handled by the conventional synchronous * write approach. * * The ffs implementation with which we worked double-checks * the state of the block pointers and file size as it reduces * a file's length. Some of this code is replicated here in our * soft updates implementation. The freeblks->fb_chkcnt field is * used to transfer a part of this information to the procedure * that eventually de-allocates the blocks. * * This routine should be called from the routine that shortens * a file's length, before the inode's size or block pointers * are modified. It will save the block pointer information for * later release and zero the inode so that the calling routine * can release it. */ void softdep_setup_freeblocks(ip, length, flags) struct inode *ip; /* The inode whose length is to be reduced */ off_t length; /* The new length for the file */ int flags; /* IO_EXT and/or IO_NORMAL */ { struct ufs1_dinode *dp1; struct ufs2_dinode *dp2; struct freeblks *freeblks; struct inodedep *inodedep; struct allocdirect *adp; struct ufsmount *ump; struct buf *bp; struct fs *fs; ufs2_daddr_t extblocks, datablocks; struct mount *mp; int i, delay, error; ufs_lbn_t tmpval; ufs_lbn_t lbn; ump = ITOUMP(ip); mp = UFSTOVFS(ump); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_freeblocks called on non-softdep filesystem")); CTR2(KTR_SUJ, "softdep_setup_freeblks: ip %d length %ld", ip->i_number, length); KASSERT(length == 0, ("softdep_setup_freeblocks: non-zero length")); fs = ump->um_fs; if ((error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp)) != 0) { if (!ffs_fsfail_cleanup(ump, error)) softdep_error("softdep_setup_freeblocks", error); return; } freeblks = newfreeblks(mp, ip); extblocks = 0; datablocks = 0; if (fs->fs_magic == FS_UFS2_MAGIC) extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize)); if ((flags & IO_NORMAL) != 0) { for (i = 0; i < UFS_NDADDR; i++) setup_freedirect(freeblks, ip, i, 0); for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR; i < UFS_NIADDR; i++, lbn += tmpval, tmpval *= NINDIR(fs)) setup_freeindir(freeblks, ip, i, -lbn -i, 0); ip->i_size = 0; DIP_SET(ip, i_size, 0); UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE); datablocks = DIP(ip, i_blocks) - extblocks; } if ((flags & IO_EXT) != 0) { for (i = 0; i < UFS_NXADDR; i++) setup_freeext(freeblks, ip, i, 0); ip->i_din2->di_extsize = 0; datablocks += extblocks; UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE); } #ifdef QUOTA /* Reference the quotas in case the block count is wrong in the end. */ quotaref(ITOV(ip), freeblks->fb_quota); (void) chkdq(ip, -datablocks, NOCRED, FORCE); #endif freeblks->fb_chkcnt = -datablocks; UFS_LOCK(ump); fs->fs_pendingblocks += datablocks; UFS_UNLOCK(ump); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks); /* * Push the zero'ed inode to its disk buffer so that we are free * to delete its dependencies below. Once the dependencies are gone * the buffer can be safely released. */ if (ump->um_fstype == UFS1) { dp1 = ((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)); ip->i_din1->di_freelink = dp1->di_freelink; *dp1 = *ip->i_din1; } else { dp2 = ((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number)); ip->i_din2->di_freelink = dp2->di_freelink; ffs_update_dinode_ckhash(fs, ip->i_din2); *dp2 = *ip->i_din2; } /* * Find and eliminate any inode dependencies. */ ACQUIRE_LOCK(ump); (void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep); if ((inodedep->id_state & IOSTARTED) != 0) panic("softdep_setup_freeblocks: inode busy"); /* * Add the freeblks structure to the list of operations that * must await the zero'ed inode being written to disk. If we * still have a bitmap dependency (delay == 0), then the inode * has never been written to disk, so we can process the * freeblks below once we have deleted the dependencies. */ delay = (inodedep->id_state & DEPCOMPLETE); if (delay) WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list); else freeblks->fb_state |= COMPLETE; /* * Because the file length has been truncated to zero, any * pending block allocation dependency structures associated * with this inode are obsolete and can simply be de-allocated. * We must first merge the two dependency lists to get rid of * any duplicate freefrag structures, then purge the merged list. * If we still have a bitmap dependency, then the inode has never * been written to disk, so we can free any fragments without delay. */ if (flags & IO_NORMAL) { merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) cancel_allocdirect(&inodedep->id_inoupdt, adp, freeblks); } if (flags & IO_EXT) { merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) cancel_allocdirect(&inodedep->id_extupdt, adp, freeblks); } FREE_LOCK(ump); bdwrite(bp); trunc_dependencies(ip, freeblks, -1, 0, flags); ACQUIRE_LOCK(ump); if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) (void) free_inodedep(inodedep); freeblks->fb_state |= DEPCOMPLETE; /* * If the inode with zeroed block pointers is now on disk * we can start freeing blocks. */ if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE) freeblks->fb_state |= INPROGRESS; else freeblks = NULL; FREE_LOCK(ump); if (freeblks) handle_workitem_freeblocks(freeblks, 0); trunc_pages(ip, length, extblocks, flags); } /* * Eliminate pages from the page cache that back parts of this inode and * adjust the vnode pager's idea of our size. This prevents stale data * from hanging around in the page cache. */ static void trunc_pages(ip, length, extblocks, flags) struct inode *ip; off_t length; ufs2_daddr_t extblocks; int flags; { struct vnode *vp; struct fs *fs; ufs_lbn_t lbn; off_t end, extend; vp = ITOV(ip); fs = ITOFS(ip); extend = OFF_TO_IDX(lblktosize(fs, -extblocks)); if ((flags & IO_EXT) != 0) vn_pages_remove(vp, extend, 0); if ((flags & IO_NORMAL) == 0) return; BO_LOCK(&vp->v_bufobj); drain_output(vp); BO_UNLOCK(&vp->v_bufobj); /* * The vnode pager eliminates file pages we eliminate indirects * below. */ vnode_pager_setsize(vp, length); /* * Calculate the end based on the last indirect we want to keep. If * the block extends into indirects we can just use the negative of * its lbn. Doubles and triples exist at lower numbers so we must * be careful not to remove those, if they exist. double and triple * indirect lbns do not overlap with others so it is not important * to verify how many levels are required. */ lbn = lblkno(fs, length); if (lbn >= UFS_NDADDR) { /* Calculate the virtual lbn of the triple indirect. */ lbn = -lbn - (UFS_NIADDR - 1); end = OFF_TO_IDX(lblktosize(fs, lbn)); } else end = extend; vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end); } /* * See if the buf bp is in the range eliminated by truncation. */ static int trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags) struct buf *bp; int *blkoffp; ufs_lbn_t lastlbn; int lastoff; int flags; { ufs_lbn_t lbn; *blkoffp = 0; /* Only match ext/normal blocks as appropriate. */ if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) || ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0)) return (0); /* ALTDATA is always a full truncation. */ if ((bp->b_xflags & BX_ALTDATA) != 0) return (1); /* -1 is full truncation. */ if (lastlbn == -1) return (1); /* * If this is a partial truncate we only want those * blocks and indirect blocks that cover the range * we're after. */ lbn = bp->b_lblkno; if (lbn < 0) lbn = -(lbn + lbn_level(lbn)); if (lbn < lastlbn) return (0); /* Here we only truncate lblkno if it's partial. */ if (lbn == lastlbn) { if (lastoff == 0) return (0); *blkoffp = lastoff; } return (1); } /* * Eliminate any dependencies that exist in memory beyond lblkno:off */ static void trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags) struct inode *ip; struct freeblks *freeblks; ufs_lbn_t lastlbn; int lastoff; int flags; { struct bufobj *bo; struct vnode *vp; struct buf *bp; int blkoff; /* * We must wait for any I/O in progress to finish so that * all potential buffers on the dirty list will be visible. * Once they are all there, walk the list and get rid of * any dependencies. */ vp = ITOV(ip); bo = &vp->v_bufobj; BO_LOCK(bo); drain_output(vp); TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) bp->b_vflags &= ~BV_SCANNED; restart: TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) { if (bp->b_vflags & BV_SCANNED) continue; if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) { bp->b_vflags |= BV_SCANNED; continue; } KASSERT(bp->b_bufobj == bo, ("Wrong object in buffer")); if ((bp = getdirtybuf(bp, BO_LOCKPTR(bo), MNT_WAIT)) == NULL) goto restart; BO_UNLOCK(bo); if (deallocate_dependencies(bp, freeblks, blkoff)) bqrelse(bp); else brelse(bp); BO_LOCK(bo); goto restart; } /* * Now do the work of vtruncbuf while also matching indirect blocks. */ TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) bp->b_vflags &= ~BV_SCANNED; cleanrestart: TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) { if (bp->b_vflags & BV_SCANNED) continue; if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) { bp->b_vflags |= BV_SCANNED; continue; } if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) { BO_LOCK(bo); goto cleanrestart; } bp->b_vflags |= BV_SCANNED; bremfree(bp); if (blkoff != 0) { allocbuf(bp, blkoff); bqrelse(bp); } else { bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF; brelse(bp); } BO_LOCK(bo); goto cleanrestart; } drain_output(vp); BO_UNLOCK(bo); } static int cancel_pagedep(pagedep, freeblks, blkoff) struct pagedep *pagedep; struct freeblks *freeblks; int blkoff; { struct jremref *jremref; struct jmvref *jmvref; struct dirrem *dirrem, *tmp; int i; /* * Copy any directory remove dependencies to the list * to be processed after the freeblks proceeds. If * directory entry never made it to disk they * can be dumped directly onto the work list. */ LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) { /* Skip this directory removal if it is intended to remain. */ if (dirrem->dm_offset < blkoff) continue; /* * If there are any dirrems we wait for the journal write * to complete and then restart the buf scan as the lock * has been dropped. */ while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) { jwait(&jremref->jr_list, MNT_WAIT); return (ERESTART); } LIST_REMOVE(dirrem, dm_next); dirrem->dm_dirinum = pagedep->pd_ino; WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list); } while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) { jwait(&jmvref->jm_list, MNT_WAIT); return (ERESTART); } /* * When we're partially truncating a pagedep we just want to flush * journal entries and return. There can not be any adds in the * truncated portion of the directory and newblk must remain if * part of the block remains. */ if (blkoff != 0) { struct diradd *dap; LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) if (dap->da_offset > blkoff) panic("cancel_pagedep: diradd %p off %d > %d", dap, dap->da_offset, blkoff); for (i = 0; i < DAHASHSZ; i++) LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) if (dap->da_offset > blkoff) panic("cancel_pagedep: diradd %p off %d > %d", dap, dap->da_offset, blkoff); return (0); } /* * There should be no directory add dependencies present * as the directory could not be truncated until all * children were removed. */ KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL, ("deallocate_dependencies: pendinghd != NULL")); for (i = 0; i < DAHASHSZ; i++) KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL, ("deallocate_dependencies: diraddhd != NULL")); if ((pagedep->pd_state & NEWBLOCK) != 0) free_newdirblk(pagedep->pd_newdirblk); if (free_pagedep(pagedep) == 0) panic("Failed to free pagedep %p", pagedep); return (0); } /* * Reclaim any dependency structures from a buffer that is about to * be reallocated to a new vnode. The buffer must be locked, thus, * no I/O completion operations can occur while we are manipulating * its associated dependencies. The mutex is held so that other I/O's * associated with related dependencies do not occur. */ static int deallocate_dependencies(bp, freeblks, off) struct buf *bp; struct freeblks *freeblks; int off; { struct indirdep *indirdep; struct pagedep *pagedep; struct worklist *wk, *wkn; struct ufsmount *ump; ump = softdep_bp_to_mp(bp); if (ump == NULL) goto done; ACQUIRE_LOCK(ump); LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) { switch (wk->wk_type) { case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); if (bp->b_lblkno >= 0 || bp->b_blkno != indirdep->ir_savebp->b_lblkno) panic("deallocate_dependencies: not indir"); cancel_indirdep(indirdep, bp, freeblks); continue; case D_PAGEDEP: pagedep = WK_PAGEDEP(wk); if (cancel_pagedep(pagedep, freeblks, off)) { FREE_LOCK(ump); return (ERESTART); } continue; case D_ALLOCINDIR: /* * Simply remove the allocindir, we'll find it via * the indirdep where we can clear pointers if * needed. */ WORKLIST_REMOVE(wk); continue; case D_FREEWORK: /* * A truncation is waiting for the zero'd pointers * to be written. It can be freed when the freeblks * is journaled. */ WORKLIST_REMOVE(wk); wk->wk_state |= ONDEPLIST; WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk); break; case D_ALLOCDIRECT: if (off != 0) continue; /* FALLTHROUGH */ default: panic("deallocate_dependencies: Unexpected type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } FREE_LOCK(ump); done: /* * Don't throw away this buf, we were partially truncating and * some deps may always remain. */ if (off) { allocbuf(bp, off); bp->b_vflags |= BV_SCANNED; return (EBUSY); } bp->b_flags |= B_INVAL | B_NOCACHE; return (0); } /* * An allocdirect is being canceled due to a truncate. We must make sure * the journal entry is released in concert with the blkfree that releases * the storage. Completed journal entries must not be released until the * space is no longer pointed to by the inode or in the bitmap. */ static void cancel_allocdirect(adphead, adp, freeblks) struct allocdirectlst *adphead; struct allocdirect *adp; struct freeblks *freeblks; { struct freework *freework; struct newblk *newblk; struct worklist *wk; TAILQ_REMOVE(adphead, adp, ad_next); newblk = (struct newblk *)adp; freework = NULL; /* * Find the correct freework structure. */ LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) { if (wk->wk_type != D_FREEWORK) continue; freework = WK_FREEWORK(wk); if (freework->fw_blkno == newblk->nb_newblkno) break; } if (freework == NULL) panic("cancel_allocdirect: Freework not found"); /* * If a newblk exists at all we still have the journal entry that * initiated the allocation so we do not need to journal the free. */ cancel_jfreeblk(freeblks, freework->fw_blkno); /* * If the journal hasn't been written the jnewblk must be passed * to the call to ffs_blkfree that reclaims the space. We accomplish * this by linking the journal dependency into the freework to be * freed when freework_freeblock() is called. If the journal has * been written we can simply reclaim the journal space when the * freeblks work is complete. */ freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list, &freeblks->fb_jwork); WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list); } /* * Cancel a new block allocation. May be an indirect or direct block. We * remove it from various lists and return any journal record that needs to * be resolved by the caller. * * A special consideration is made for indirects which were never pointed * at on disk and will never be found once this block is released. */ static struct jnewblk * cancel_newblk(newblk, wk, wkhd) struct newblk *newblk; struct worklist *wk; struct workhead *wkhd; { struct jnewblk *jnewblk; CTR1(KTR_SUJ, "cancel_newblk: blkno %jd", newblk->nb_newblkno); newblk->nb_state |= GOINGAWAY; /* * Previously we traversed the completedhd on each indirdep * attached to this newblk to cancel them and gather journal * work. Since we need only the oldest journal segment and * the lowest point on the tree will always have the oldest * journal segment we are free to release the segments * of any subordinates and may leave the indirdep list to * indirdep_complete() when this newblk is freed. */ if (newblk->nb_state & ONDEPLIST) { newblk->nb_state &= ~ONDEPLIST; LIST_REMOVE(newblk, nb_deps); } if (newblk->nb_state & ONWORKLIST) WORKLIST_REMOVE(&newblk->nb_list); /* * If the journal entry hasn't been written we save a pointer to * the dependency that frees it until it is written or the * superseding operation completes. */ jnewblk = newblk->nb_jnewblk; if (jnewblk != NULL && wk != NULL) { newblk->nb_jnewblk = NULL; jnewblk->jn_dep = wk; } if (!LIST_EMPTY(&newblk->nb_jwork)) jwork_move(wkhd, &newblk->nb_jwork); /* * When truncating we must free the newdirblk early to remove * the pagedep from the hash before returning. */ if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) free_newdirblk(WK_NEWDIRBLK(wk)); if (!LIST_EMPTY(&newblk->nb_newdirblk)) panic("cancel_newblk: extra newdirblk"); return (jnewblk); } /* * Schedule the freefrag associated with a newblk to be released once * the pointers are written and the previous block is no longer needed. */ static void newblk_freefrag(newblk) struct newblk *newblk; { struct freefrag *freefrag; if (newblk->nb_freefrag == NULL) return; freefrag = newblk->nb_freefrag; newblk->nb_freefrag = NULL; freefrag->ff_state |= COMPLETE; if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE) add_to_worklist(&freefrag->ff_list, 0); } /* * Free a newblk. Generate a new freefrag work request if appropriate. * This must be called after the inode pointer and any direct block pointers * are valid or fully removed via truncate or frag extension. */ static void free_newblk(newblk) struct newblk *newblk; { struct indirdep *indirdep; struct worklist *wk; KASSERT(newblk->nb_jnewblk == NULL, ("free_newblk: jnewblk %p still attached", newblk->nb_jnewblk)); KASSERT(newblk->nb_list.wk_type != D_NEWBLK, ("free_newblk: unclaimed newblk")); LOCK_OWNED(VFSTOUFS(newblk->nb_list.wk_mp)); newblk_freefrag(newblk); if (newblk->nb_state & ONDEPLIST) LIST_REMOVE(newblk, nb_deps); if (newblk->nb_state & ONWORKLIST) WORKLIST_REMOVE(&newblk->nb_list); LIST_REMOVE(newblk, nb_hash); if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL) free_newdirblk(WK_NEWDIRBLK(wk)); if (!LIST_EMPTY(&newblk->nb_newdirblk)) panic("free_newblk: extra newdirblk"); while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL) indirdep_complete(indirdep); handle_jwork(&newblk->nb_jwork); WORKITEM_FREE(newblk, D_NEWBLK); } /* * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep. */ static void free_newdirblk(newdirblk) struct newdirblk *newdirblk; { struct pagedep *pagedep; struct diradd *dap; struct worklist *wk; LOCK_OWNED(VFSTOUFS(newdirblk->db_list.wk_mp)); WORKLIST_REMOVE(&newdirblk->db_list); /* * If the pagedep is still linked onto the directory buffer * dependency chain, then some of the entries on the * pd_pendinghd list may not be committed to disk yet. In * this case, we will simply clear the NEWBLOCK flag and * let the pd_pendinghd list be processed when the pagedep * is next written. If the pagedep is no longer on the buffer * dependency chain, then all the entries on the pd_pending * list are committed to disk and we can free them here. */ pagedep = newdirblk->db_pagedep; pagedep->pd_state &= ~NEWBLOCK; if ((pagedep->pd_state & ONWORKLIST) == 0) { while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) free_diradd(dap, NULL); /* * If no dependencies remain, the pagedep will be freed. */ free_pagedep(pagedep); } /* Should only ever be one item in the list. */ while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) { WORKLIST_REMOVE(wk); handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); } WORKITEM_FREE(newdirblk, D_NEWDIRBLK); } /* * Prepare an inode to be freed. The actual free operation is not * done until the zero'ed inode has been written to disk. */ void softdep_freefile(pvp, ino, mode) struct vnode *pvp; ino_t ino; int mode; { struct inode *ip = VTOI(pvp); struct inodedep *inodedep; struct freefile *freefile; struct freeblks *freeblks; struct ufsmount *ump; ump = ITOUMP(ip); KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_freefile called on non-softdep filesystem")); /* * This sets up the inode de-allocation dependency. */ freefile = malloc(sizeof(struct freefile), M_FREEFILE, M_SOFTDEP_FLAGS); workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount); freefile->fx_mode = mode; freefile->fx_oldinum = ino; freefile->fx_devvp = ump->um_devvp; LIST_INIT(&freefile->fx_jwork); UFS_LOCK(ump); ump->um_fs->fs_pendinginodes += 1; UFS_UNLOCK(ump); /* * If the inodedep does not exist, then the zero'ed inode has * been written to disk. If the allocated inode has never been * written to disk, then the on-disk inode is zero'ed. In either * case we can free the file immediately. If the journal was * canceled before being written the inode will never make it to * disk and we must send the canceled journal entrys to * ffs_freefile() to be cleared in conjunction with the bitmap. * Any blocks waiting on the inode to write can be safely freed * here as it will never been written. */ ACQUIRE_LOCK(ump); inodedep_lookup(pvp->v_mount, ino, 0, &inodedep); if (inodedep) { /* * Clear out freeblks that no longer need to reference * this inode. */ while ((freeblks = TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) { TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next); freeblks->fb_state &= ~ONDEPLIST; } /* * Remove this inode from the unlinked list. */ if (inodedep->id_state & UNLINKED) { /* * Save the journal work to be freed with the bitmap * before we clear UNLINKED. Otherwise it can be lost * if the inode block is written. */ handle_bufwait(inodedep, &freefile->fx_jwork); clear_unlinked_inodedep(inodedep); /* * Re-acquire inodedep as we've dropped the * per-filesystem lock in clear_unlinked_inodedep(). */ inodedep_lookup(pvp->v_mount, ino, 0, &inodedep); } } if (inodedep == NULL || check_inode_unwritten(inodedep)) { FREE_LOCK(ump); handle_workitem_freefile(freefile); return; } if ((inodedep->id_state & DEPCOMPLETE) == 0) inodedep->id_state |= GOINGAWAY; WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list); FREE_LOCK(ump); if (ip->i_number == ino) UFS_INODE_SET_FLAG(ip, IN_MODIFIED); } /* * Check to see if an inode has never been written to disk. If * so free the inodedep and return success, otherwise return failure. * * If we still have a bitmap dependency, then the inode has never * been written to disk. Drop the dependency as it is no longer * necessary since the inode is being deallocated. We set the * ALLCOMPLETE flags since the bitmap now properly shows that the * inode is not allocated. Even if the inode is actively being * written, it has been rolled back to its zero'ed state, so we * are ensured that a zero inode is what is on the disk. For short * lived files, this change will usually result in removing all the * dependencies from the inode so that it can be freed immediately. */ static int check_inode_unwritten(inodedep) struct inodedep *inodedep; { LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp)); if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 || !LIST_EMPTY(&inodedep->id_dirremhd) || !LIST_EMPTY(&inodedep->id_pendinghd) || !LIST_EMPTY(&inodedep->id_bufwait) || !LIST_EMPTY(&inodedep->id_inowait) || !TAILQ_EMPTY(&inodedep->id_inoreflst) || !TAILQ_EMPTY(&inodedep->id_inoupdt) || !TAILQ_EMPTY(&inodedep->id_newinoupdt) || !TAILQ_EMPTY(&inodedep->id_extupdt) || !TAILQ_EMPTY(&inodedep->id_newextupdt) || !TAILQ_EMPTY(&inodedep->id_freeblklst) || inodedep->id_mkdiradd != NULL || inodedep->id_nlinkdelta != 0) return (0); /* * Another process might be in initiate_write_inodeblock_ufs[12] * trying to allocate memory without holding "Softdep Lock". */ if ((inodedep->id_state & IOSTARTED) != 0 && inodedep->id_savedino1 == NULL) return (0); if (inodedep->id_state & ONDEPLIST) LIST_REMOVE(inodedep, id_deps); inodedep->id_state &= ~ONDEPLIST; inodedep->id_state |= ALLCOMPLETE; inodedep->id_bmsafemap = NULL; if (inodedep->id_state & ONWORKLIST) WORKLIST_REMOVE(&inodedep->id_list); if (inodedep->id_savedino1 != NULL) { free(inodedep->id_savedino1, M_SAVEDINO); inodedep->id_savedino1 = NULL; } if (free_inodedep(inodedep) == 0) panic("check_inode_unwritten: busy inode"); return (1); } static int check_inodedep_free(inodedep) struct inodedep *inodedep; { LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp)); if ((inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE || !LIST_EMPTY(&inodedep->id_dirremhd) || !LIST_EMPTY(&inodedep->id_pendinghd) || !LIST_EMPTY(&inodedep->id_bufwait) || !LIST_EMPTY(&inodedep->id_inowait) || !TAILQ_EMPTY(&inodedep->id_inoreflst) || !TAILQ_EMPTY(&inodedep->id_inoupdt) || !TAILQ_EMPTY(&inodedep->id_newinoupdt) || !TAILQ_EMPTY(&inodedep->id_extupdt) || !TAILQ_EMPTY(&inodedep->id_newextupdt) || !TAILQ_EMPTY(&inodedep->id_freeblklst) || inodedep->id_mkdiradd != NULL || inodedep->id_nlinkdelta != 0 || inodedep->id_savedino1 != NULL) return (0); return (1); } /* * Try to free an inodedep structure. Return 1 if it could be freed. */ static int free_inodedep(inodedep) struct inodedep *inodedep; { LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp)); if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 || !check_inodedep_free(inodedep)) return (0); if (inodedep->id_state & ONDEPLIST) LIST_REMOVE(inodedep, id_deps); LIST_REMOVE(inodedep, id_hash); WORKITEM_FREE(inodedep, D_INODEDEP); return (1); } /* * Free the block referenced by a freework structure. The parent freeblks * structure is released and completed when the final cg bitmap reaches * the disk. This routine may be freeing a jnewblk which never made it to * disk in which case we do not have to wait as the operation is undone * in memory immediately. */ static void freework_freeblock(freework, key) struct freework *freework; u_long key; { struct freeblks *freeblks; struct jnewblk *jnewblk; struct ufsmount *ump; struct workhead wkhd; struct fs *fs; int bsize; int needj; ump = VFSTOUFS(freework->fw_list.wk_mp); LOCK_OWNED(ump); /* * Handle partial truncate separately. */ if (freework->fw_indir) { complete_trunc_indir(freework); return; } freeblks = freework->fw_freeblks; fs = ump->um_fs; needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0; bsize = lfragtosize(fs, freework->fw_frags); LIST_INIT(&wkhd); /* * DEPCOMPLETE is cleared in indirblk_insert() if the block lives * on the indirblk hashtable and prevents premature freeing. */ freework->fw_state |= DEPCOMPLETE; /* * SUJ needs to wait for the segment referencing freed indirect * blocks to expire so that we know the checker will not confuse * a re-allocated indirect block with its old contents. */ if (needj && freework->fw_lbn <= -UFS_NDADDR) indirblk_insert(freework); /* * If we are canceling an existing jnewblk pass it to the free * routine, otherwise pass the freeblk which will ultimately * release the freeblks. If we're not journaling, we can just * free the freeblks immediately. */ jnewblk = freework->fw_jnewblk; if (jnewblk != NULL) { cancel_jnewblk(jnewblk, &wkhd); needj = 0; } else if (needj) { freework->fw_state |= DELAYEDFREE; freeblks->fb_cgwait++; WORKLIST_INSERT(&wkhd, &freework->fw_list); } FREE_LOCK(ump); freeblks_free(ump, freeblks, btodb(bsize)); CTR4(KTR_SUJ, "freework_freeblock: ino %jd blkno %jd lbn %jd size %d", freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize); ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize, freeblks->fb_inum, freeblks->fb_vtype, &wkhd, key); ACQUIRE_LOCK(ump); /* * The jnewblk will be discarded and the bits in the map never * made it to disk. We can immediately free the freeblk. */ if (needj == 0) handle_written_freework(freework); } /* * We enqueue freework items that need processing back on the freeblks and * add the freeblks to the worklist. This makes it easier to find all work * required to flush a truncation in process_truncates(). */ static void freework_enqueue(freework) struct freework *freework; { struct freeblks *freeblks; freeblks = freework->fw_freeblks; if ((freework->fw_state & INPROGRESS) == 0) WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list); if ((freeblks->fb_state & (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd)) add_to_worklist(&freeblks->fb_list, WK_NODELAY); } /* * Start, continue, or finish the process of freeing an indirect block tree. * The free operation may be paused at any point with fw_off containing the * offset to restart from. This enables us to implement some flow control * for large truncates which may fan out and generate a huge number of * dependencies. */ static void handle_workitem_indirblk(freework) struct freework *freework; { struct freeblks *freeblks; struct ufsmount *ump; struct fs *fs; freeblks = freework->fw_freeblks; ump = VFSTOUFS(freeblks->fb_list.wk_mp); fs = ump->um_fs; if (freework->fw_state & DEPCOMPLETE) { handle_written_freework(freework); return; } if (freework->fw_off == NINDIR(fs)) { freework_freeblock(freework, SINGLETON_KEY); return; } freework->fw_state |= INPROGRESS; FREE_LOCK(ump); indir_trunc(freework, fsbtodb(fs, freework->fw_blkno), freework->fw_lbn); ACQUIRE_LOCK(ump); } /* * Called when a freework structure attached to a cg buf is written. The * ref on either the parent or the freeblks structure is released and * the freeblks is added back to the worklist if there is more work to do. */ static void handle_written_freework(freework) struct freework *freework; { struct freeblks *freeblks; struct freework *parent; freeblks = freework->fw_freeblks; parent = freework->fw_parent; if (freework->fw_state & DELAYEDFREE) freeblks->fb_cgwait--; freework->fw_state |= COMPLETE; if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE) WORKITEM_FREE(freework, D_FREEWORK); if (parent) { if (--parent->fw_ref == 0) freework_enqueue(parent); return; } if (--freeblks->fb_ref != 0) return; if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) == ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd)) add_to_worklist(&freeblks->fb_list, WK_NODELAY); } /* * This workitem routine performs the block de-allocation. * The workitem is added to the pending list after the updated * inode block has been written to disk. As mentioned above, * checks regarding the number of blocks de-allocated (compared * to the number of blocks allocated for the file) are also * performed in this function. */ static int handle_workitem_freeblocks(freeblks, flags) struct freeblks *freeblks; int flags; { struct freework *freework; struct newblk *newblk; struct allocindir *aip; struct ufsmount *ump; struct worklist *wk; u_long key; KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd), ("handle_workitem_freeblocks: Journal entries not written.")); ump = VFSTOUFS(freeblks->fb_list.wk_mp); key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum); ACQUIRE_LOCK(ump); while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) { WORKLIST_REMOVE(wk); switch (wk->wk_type) { case D_DIRREM: wk->wk_state |= COMPLETE; add_to_worklist(wk, 0); continue; case D_ALLOCDIRECT: free_newblk(WK_NEWBLK(wk)); continue; case D_ALLOCINDIR: aip = WK_ALLOCINDIR(wk); freework = NULL; if (aip->ai_state & DELAYEDFREE) { FREE_LOCK(ump); freework = newfreework(ump, freeblks, NULL, aip->ai_lbn, aip->ai_newblkno, ump->um_fs->fs_frag, 0, 0); ACQUIRE_LOCK(ump); } newblk = WK_NEWBLK(wk); if (newblk->nb_jnewblk) { freework->fw_jnewblk = newblk->nb_jnewblk; newblk->nb_jnewblk->jn_dep = &freework->fw_list; newblk->nb_jnewblk = NULL; } free_newblk(newblk); continue; case D_FREEWORK: freework = WK_FREEWORK(wk); if (freework->fw_lbn <= -UFS_NDADDR) handle_workitem_indirblk(freework); else freework_freeblock(freework, key); continue; default: panic("handle_workitem_freeblocks: Unknown type %s", TYPENAME(wk->wk_type)); } } if (freeblks->fb_ref != 0) { freeblks->fb_state &= ~INPROGRESS; wake_worklist(&freeblks->fb_list); freeblks = NULL; } FREE_LOCK(ump); ffs_blkrelease_finish(ump, key); if (freeblks) return handle_complete_freeblocks(freeblks, flags); return (0); } /* * Handle completion of block free via truncate. This allows fs_pending * to track the actual free block count more closely than if we only updated * it at the end. We must be careful to handle cases where the block count * on free was incorrect. */ static void freeblks_free(ump, freeblks, blocks) struct ufsmount *ump; struct freeblks *freeblks; int blocks; { struct fs *fs; ufs2_daddr_t remain; UFS_LOCK(ump); remain = -freeblks->fb_chkcnt; freeblks->fb_chkcnt += blocks; if (remain > 0) { if (remain < blocks) blocks = remain; fs = ump->um_fs; fs->fs_pendingblocks -= blocks; } UFS_UNLOCK(ump); } /* * Once all of the freework workitems are complete we can retire the * freeblocks dependency and any journal work awaiting completion. This * can not be called until all other dependencies are stable on disk. */ static int handle_complete_freeblocks(freeblks, flags) struct freeblks *freeblks; int flags; { struct inodedep *inodedep; struct inode *ip; struct vnode *vp; struct fs *fs; struct ufsmount *ump; ufs2_daddr_t spare; ump = VFSTOUFS(freeblks->fb_list.wk_mp); fs = ump->um_fs; flags = LK_EXCLUSIVE | flags; spare = freeblks->fb_chkcnt; /* * If we did not release the expected number of blocks we may have * to adjust the inode block count here. Only do so if it wasn't * a truncation to zero and the modrev still matches. */ if (spare && freeblks->fb_len != 0) { if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum, flags, &vp, FFSV_FORCEINSMQ) != 0) return (EBUSY); ip = VTOI(vp); if (ip->i_mode == 0) { vgone(vp); } else if (DIP(ip, i_modrev) == freeblks->fb_modrev) { DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare); UFS_INODE_SET_FLAG(ip, IN_CHANGE); /* * We must wait so this happens before the * journal is reclaimed. */ ffs_update(vp, 1); } vput(vp); } if (spare < 0) { UFS_LOCK(ump); fs->fs_pendingblocks += spare; UFS_UNLOCK(ump); } #ifdef QUOTA /* Handle spare. */ if (spare) quotaadj(freeblks->fb_quota, ump, -spare); quotarele(freeblks->fb_quota); #endif ACQUIRE_LOCK(ump); if (freeblks->fb_state & ONDEPLIST) { inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum, 0, &inodedep); TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next); freeblks->fb_state &= ~ONDEPLIST; if (TAILQ_EMPTY(&inodedep->id_freeblklst)) free_inodedep(inodedep); } /* * All of the freeblock deps must be complete prior to this call * so it's now safe to complete earlier outstanding journal entries. */ handle_jwork(&freeblks->fb_jwork); WORKITEM_FREE(freeblks, D_FREEBLKS); FREE_LOCK(ump); return (0); } /* * Release blocks associated with the freeblks and stored in the indirect * block dbn. If level is greater than SINGLE, the block is an indirect block * and recursive calls to indirtrunc must be used to cleanse other indirect * blocks. * * This handles partial and complete truncation of blocks. Partial is noted * with goingaway == 0. In this case the freework is completed after the * zero'd indirects are written to disk. For full truncation the freework * is completed after the block is freed. */ static void indir_trunc(freework, dbn, lbn) struct freework *freework; ufs2_daddr_t dbn; ufs_lbn_t lbn; { struct freework *nfreework; struct workhead wkhd; struct freeblks *freeblks; struct buf *bp; struct fs *fs; struct indirdep *indirdep; struct mount *mp; struct ufsmount *ump; ufs1_daddr_t *bap1; ufs2_daddr_t nb, nnb, *bap2; ufs_lbn_t lbnadd, nlbn; u_long key; int nblocks, ufs1fmt, freedblocks; int goingaway, freedeps, needj, level, cnt, i, error; freeblks = freework->fw_freeblks; mp = freeblks->fb_list.wk_mp; ump = VFSTOUFS(mp); fs = ump->um_fs; /* * Get buffer of block pointers to be freed. There are three cases: * * 1) Partial truncate caches the indirdep pointer in the freework * which provides us a back copy to the save bp which holds the * pointers we want to clear. When this completes the zero * pointers are written to the real copy. * 2) The indirect is being completely truncated, cancel_indirdep() * eliminated the real copy and placed the indirdep on the saved * copy. The indirdep and buf are discarded when this completes. * 3) The indirect was not in memory, we read a copy off of the disk * using the devvp and drop and invalidate the buffer when we're * done. */ goingaway = 1; indirdep = NULL; if (freework->fw_indir != NULL) { goingaway = 0; indirdep = freework->fw_indir; bp = indirdep->ir_savebp; if (bp == NULL || bp->b_blkno != dbn) panic("indir_trunc: Bad saved buf %p blkno %jd", bp, (intmax_t)dbn); } else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) { /* * The lock prevents the buf dep list from changing and * indirects on devvp should only ever have one dependency. */ indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep)); if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0) panic("indir_trunc: Bad indirdep %p from buf %p", indirdep, bp); } else { error = ffs_breadz(ump, freeblks->fb_devvp, dbn, dbn, (int)fs->fs_bsize, NULL, NULL, 0, NOCRED, 0, NULL, &bp); if (error) return; } ACQUIRE_LOCK(ump); /* Protects against a race with complete_trunc_indir(). */ freework->fw_state &= ~INPROGRESS; /* * If we have an indirdep we need to enforce the truncation order * and discard it when it is complete. */ if (indirdep) { if (freework != TAILQ_FIRST(&indirdep->ir_trunc) && !TAILQ_EMPTY(&indirdep->ir_trunc)) { /* * Add the complete truncate to the list on the * indirdep to enforce in-order processing. */ if (freework->fw_indir == NULL) TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next); FREE_LOCK(ump); return; } /* * If we're goingaway, free the indirdep. Otherwise it will * linger until the write completes. */ if (goingaway) { KASSERT(indirdep->ir_savebp == bp, ("indir_trunc: losing ir_savebp %p", indirdep->ir_savebp)); indirdep->ir_savebp = NULL; free_indirdep(indirdep); } } FREE_LOCK(ump); /* Initialize pointers depending on block size. */ if (ump->um_fstype == UFS1) { bap1 = (ufs1_daddr_t *)bp->b_data; nb = bap1[freework->fw_off]; ufs1fmt = 1; bap2 = NULL; } else { bap2 = (ufs2_daddr_t *)bp->b_data; nb = bap2[freework->fw_off]; ufs1fmt = 0; bap1 = NULL; } level = lbn_level(lbn); needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0; lbnadd = lbn_offset(fs, level); nblocks = btodb(fs->fs_bsize); nfreework = freework; freedeps = 0; cnt = 0; /* * Reclaim blocks. Traverses into nested indirect levels and * arranges for the current level to be freed when subordinates * are free when journaling. */ key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum); for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) { if (UFS_CHECK_BLKNO(mp, freeblks->fb_inum, nb, fs->fs_bsize) != 0) nb = 0; if (i != NINDIR(fs) - 1) { if (ufs1fmt) nnb = bap1[i+1]; else nnb = bap2[i+1]; } else nnb = 0; if (nb == 0) continue; cnt++; if (level != 0) { nlbn = (lbn + 1) - (i * lbnadd); if (needj != 0) { nfreework = newfreework(ump, freeblks, freework, nlbn, nb, fs->fs_frag, 0, 0); freedeps++; } indir_trunc(nfreework, fsbtodb(fs, nb), nlbn); } else { struct freedep *freedep; /* * Attempt to aggregate freedep dependencies for * all blocks being released to the same CG. */ LIST_INIT(&wkhd); if (needj != 0 && (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) { freedep = newfreedep(freework); WORKLIST_INSERT_UNLOCKED(&wkhd, &freedep->fd_list); freedeps++; } CTR3(KTR_SUJ, "indir_trunc: ino %jd blkno %jd size %d", freeblks->fb_inum, nb, fs->fs_bsize); ffs_blkfree(ump, fs, freeblks->fb_devvp, nb, fs->fs_bsize, freeblks->fb_inum, freeblks->fb_vtype, &wkhd, key); } } ffs_blkrelease_finish(ump, key); if (goingaway) { bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); } freedblocks = 0; if (level == 0) freedblocks = (nblocks * cnt); if (needj == 0) freedblocks += nblocks; freeblks_free(ump, freeblks, freedblocks); /* * If we are journaling set up the ref counts and offset so this * indirect can be completed when its children are free. */ if (needj) { ACQUIRE_LOCK(ump); freework->fw_off = i; freework->fw_ref += freedeps; freework->fw_ref -= NINDIR(fs) + 1; if (level == 0) freeblks->fb_cgwait += freedeps; if (freework->fw_ref == 0) freework_freeblock(freework, SINGLETON_KEY); FREE_LOCK(ump); return; } /* * If we're not journaling we can free the indirect now. */ dbn = dbtofsb(fs, dbn); CTR3(KTR_SUJ, "indir_trunc 2: ino %jd blkno %jd size %d", freeblks->fb_inum, dbn, fs->fs_bsize); ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize, freeblks->fb_inum, freeblks->fb_vtype, NULL, SINGLETON_KEY); /* Non SUJ softdep does single-threaded truncations. */ if (freework->fw_blkno == dbn) { freework->fw_state |= ALLCOMPLETE; ACQUIRE_LOCK(ump); handle_written_freework(freework); FREE_LOCK(ump); } return; } /* * Cancel an allocindir when it is removed via truncation. When bp is not * NULL the indirect never appeared on disk and is scheduled to be freed * independently of the indir so we can more easily track journal work. */ static void cancel_allocindir(aip, bp, freeblks, trunc) struct allocindir *aip; struct buf *bp; struct freeblks *freeblks; int trunc; { struct indirdep *indirdep; struct freefrag *freefrag; struct newblk *newblk; newblk = (struct newblk *)aip; LIST_REMOVE(aip, ai_next); /* * We must eliminate the pointer in bp if it must be freed on its * own due to partial truncate or pending journal work. */ if (bp && (trunc || newblk->nb_jnewblk)) { /* * Clear the pointer and mark the aip to be freed * directly if it never existed on disk. */ aip->ai_state |= DELAYEDFREE; indirdep = aip->ai_indirdep; if (indirdep->ir_state & UFS1FMT) ((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0; else ((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0; } /* * When truncating the previous pointer will be freed via * savedbp. Eliminate the freefrag which would dup free. */ if (trunc && (freefrag = newblk->nb_freefrag) != NULL) { newblk->nb_freefrag = NULL; if (freefrag->ff_jdep) cancel_jfreefrag( WK_JFREEFRAG(freefrag->ff_jdep)); jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork); WORKITEM_FREE(freefrag, D_FREEFRAG); } /* * If the journal hasn't been written the jnewblk must be passed * to the call to ffs_blkfree that reclaims the space. We accomplish * this by leaving the journal dependency on the newblk to be freed * when a freework is created in handle_workitem_freeblocks(). */ cancel_newblk(newblk, NULL, &freeblks->fb_jwork); WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list); } /* * Create the mkdir dependencies for . and .. in a new directory. Link them * in to a newdirblk so any subsequent additions are tracked properly. The * caller is responsible for adding the mkdir1 dependency to the journal * and updating id_mkdiradd. This function returns with the per-filesystem * lock held. */ static struct mkdir * setup_newdir(dap, newinum, dinum, newdirbp, mkdirp) struct diradd *dap; ino_t newinum; ino_t dinum; struct buf *newdirbp; struct mkdir **mkdirp; { struct newblk *newblk; struct pagedep *pagedep; struct inodedep *inodedep; struct newdirblk *newdirblk; struct mkdir *mkdir1, *mkdir2; struct worklist *wk; struct jaddref *jaddref; struct ufsmount *ump; struct mount *mp; mp = dap->da_list.wk_mp; ump = VFSTOUFS(mp); newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK, M_SOFTDEP_FLAGS); workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); LIST_INIT(&newdirblk->db_mkdir); mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); workitem_alloc(&mkdir1->md_list, D_MKDIR, mp); mkdir1->md_state = ATTACHED | MKDIR_BODY; mkdir1->md_diradd = dap; mkdir1->md_jaddref = NULL; mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS); workitem_alloc(&mkdir2->md_list, D_MKDIR, mp); mkdir2->md_state = ATTACHED | MKDIR_PARENT; mkdir2->md_diradd = dap; mkdir2->md_jaddref = NULL; if (MOUNTEDSUJ(mp) == 0) { mkdir1->md_state |= DEPCOMPLETE; mkdir2->md_state |= DEPCOMPLETE; } /* * Dependency on "." and ".." being written to disk. */ mkdir1->md_buf = newdirbp; ACQUIRE_LOCK(VFSTOUFS(mp)); LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir1, md_mkdirs); /* * We must link the pagedep, allocdirect, and newdirblk for * the initial file page so the pointer to the new directory * is not written until the directory contents are live and * any subsequent additions are not marked live until the * block is reachable via the inode. */ if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0) panic("setup_newdir: lost pagedep"); LIST_FOREACH(wk, &newdirbp->b_dep, wk_list) if (wk->wk_type == D_ALLOCDIRECT) break; if (wk == NULL) panic("setup_newdir: lost allocdirect"); if (pagedep->pd_state & NEWBLOCK) panic("setup_newdir: NEWBLOCK already set"); newblk = WK_NEWBLK(wk); pagedep->pd_state |= NEWBLOCK; pagedep->pd_newdirblk = newdirblk; newdirblk->db_pagedep = pagedep; WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list); /* * Look up the inodedep for the parent directory so that we * can link mkdir2 into the pending dotdot jaddref or * the inode write if there is none. If the inode is * ALLCOMPLETE and no jaddref is present all dependencies have * been satisfied and mkdir2 can be freed. */ inodedep_lookup(mp, dinum, 0, &inodedep); if (MOUNTEDSUJ(mp)) { if (inodedep == NULL) panic("setup_newdir: Lost parent."); jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref != NULL && jaddref->ja_parent == newinum && (jaddref->ja_state & MKDIR_PARENT), ("setup_newdir: bad dotdot jaddref %p", jaddref)); LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs); mkdir2->md_jaddref = jaddref; jaddref->ja_mkdir = mkdir2; } else if (inodedep == NULL || (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { dap->da_state &= ~MKDIR_PARENT; WORKITEM_FREE(mkdir2, D_MKDIR); mkdir2 = NULL; } else { LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs); WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list); } *mkdirp = mkdir2; return (mkdir1); } /* * Directory entry addition dependencies. * * When adding a new directory entry, the inode (with its incremented link * count) must be written to disk before the directory entry's pointer to it. * Also, if the inode is newly allocated, the corresponding freemap must be * updated (on disk) before the directory entry's pointer. These requirements * are met via undo/redo on the directory entry's pointer, which consists * simply of the inode number. * * As directory entries are added and deleted, the free space within a * directory block can become fragmented. The ufs filesystem will compact * a fragmented directory block to make space for a new entry. When this * occurs, the offsets of previously added entries change. Any "diradd" * dependency structures corresponding to these entries must be updated with * the new offsets. */ /* * This routine is called after the in-memory inode's link * count has been incremented, but before the directory entry's * pointer to the inode has been set. */ int softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for directory */ off_t diroffset; /* offset of new entry in directory */ ino_t newinum; /* inode referenced by new directory entry */ struct buf *newdirbp; /* non-NULL => contents of new mkdir */ int isnewblk; /* entry is in a newly allocated block */ { int offset; /* offset of new entry within directory block */ ufs_lbn_t lbn; /* block in directory containing new entry */ struct fs *fs; struct diradd *dap; struct newblk *newblk; struct pagedep *pagedep; struct inodedep *inodedep; struct newdirblk *newdirblk; struct mkdir *mkdir1, *mkdir2; struct jaddref *jaddref; struct ufsmount *ump; struct mount *mp; int isindir; mp = ITOVFS(dp); ump = VFSTOUFS(mp); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_directory_add called on non-softdep filesystem")); /* * Whiteouts have no dependencies. */ if (newinum == UFS_WINO) { if (newdirbp != NULL) bdwrite(newdirbp); return (0); } jaddref = NULL; mkdir1 = mkdir2 = NULL; fs = ump->um_fs; lbn = lblkno(fs, diroffset); offset = blkoff(fs, diroffset); dap = malloc(sizeof(struct diradd), M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&dap->da_list, D_DIRADD, mp); dap->da_offset = offset; dap->da_newinum = newinum; dap->da_state = ATTACHED; LIST_INIT(&dap->da_jwork); isindir = bp->b_lblkno >= UFS_NDADDR; newdirblk = NULL; if (isnewblk && (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) { newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK, M_SOFTDEP_FLAGS); workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp); LIST_INIT(&newdirblk->db_mkdir); } /* * If we're creating a new directory setup the dependencies and set * the dap state to wait for them. Otherwise it's COMPLETE and * we can move on. */ if (newdirbp == NULL) { dap->da_state |= DEPCOMPLETE; ACQUIRE_LOCK(ump); } else { dap->da_state |= MKDIR_BODY | MKDIR_PARENT; mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp, &mkdir2); } /* * Link into parent directory pagedep to await its being written. */ pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep); #ifdef INVARIANTS if (diradd_lookup(pagedep, offset) != NULL) panic("softdep_setup_directory_add: %p already at off %d\n", diradd_lookup(pagedep, offset), offset); #endif dap->da_pagedep = pagedep; LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); /* * If we're journaling, link the diradd into the jaddref so it * may be completed after the journal entry is written. Otherwise, * link the diradd into its inodedep. If the inode is not yet * written place it on the bufwait list, otherwise do the post-inode * write processing to put it on the id_pendinghd list. */ if (MOUNTEDSUJ(mp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, ("softdep_setup_directory_add: bad jaddref %p", jaddref)); jaddref->ja_diroff = diroffset; jaddref->ja_diradd = dap; add_to_journal(&jaddref->ja_list); } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) diradd_inode_written(dap, inodedep); else WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); /* * Add the journal entries for . and .. links now that the primary * link is written. */ if (mkdir1 != NULL && MOUNTEDSUJ(mp)) { jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref, inoreflst, if_deps); KASSERT(jaddref != NULL && jaddref->ja_ino == jaddref->ja_parent && (jaddref->ja_state & MKDIR_BODY), ("softdep_setup_directory_add: bad dot jaddref %p", jaddref)); mkdir1->md_jaddref = jaddref; jaddref->ja_mkdir = mkdir1; /* * It is important that the dotdot journal entry * is added prior to the dot entry since dot writes * both the dot and dotdot links. These both must * be added after the primary link for the journal * to remain consistent. */ add_to_journal(&mkdir2->md_jaddref->ja_list); add_to_journal(&jaddref->ja_list); } /* * If we are adding a new directory remember this diradd so that if * we rename it we can keep the dot and dotdot dependencies. If * we are adding a new name for an inode that has a mkdiradd we * must be in rename and we have to move the dot and dotdot * dependencies to this new name. The old name is being orphaned * soon. */ if (mkdir1 != NULL) { if (inodedep->id_mkdiradd != NULL) panic("softdep_setup_directory_add: Existing mkdir"); inodedep->id_mkdiradd = dap; } else if (inodedep->id_mkdiradd) merge_diradd(inodedep, dap); if (newdirblk != NULL) { /* * There is nothing to do if we are already tracking * this block. */ if ((pagedep->pd_state & NEWBLOCK) != 0) { WORKITEM_FREE(newdirblk, D_NEWDIRBLK); FREE_LOCK(ump); return (0); } if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk) == 0) panic("softdep_setup_directory_add: lost entry"); WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list); pagedep->pd_state |= NEWBLOCK; pagedep->pd_newdirblk = newdirblk; newdirblk->db_pagedep = pagedep; FREE_LOCK(ump); /* * If we extended into an indirect signal direnter to sync. */ if (isindir) return (1); return (0); } FREE_LOCK(ump); return (0); } /* * This procedure is called to change the offset of a directory * entry when compacting a directory block which must be owned * exclusively by the caller. Note that the actual entry movement * must be done in this procedure to ensure that no I/O completions * occur while the move is in progress. */ void softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize) struct buf *bp; /* Buffer holding directory block. */ struct inode *dp; /* inode for directory */ caddr_t base; /* address of dp->i_offset */ caddr_t oldloc; /* address of old directory location */ caddr_t newloc; /* address of new directory location */ int entrysize; /* size of directory entry */ { int offset, oldoffset, newoffset; struct pagedep *pagedep; struct jmvref *jmvref; struct diradd *dap; struct direct *de; struct mount *mp; struct ufsmount *ump; ufs_lbn_t lbn; int flags; mp = ITOVFS(dp); ump = VFSTOUFS(mp); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_change_directoryentry_offset called on " "non-softdep filesystem")); de = (struct direct *)oldloc; jmvref = NULL; flags = 0; /* * Moves are always journaled as it would be too complex to * determine if any affected adds or removes are present in the * journal. */ if (MOUNTEDSUJ(mp)) { flags = DEPALLOC; jmvref = newjmvref(dp, de->d_ino, dp->i_offset + (oldloc - base), dp->i_offset + (newloc - base)); } lbn = lblkno(ump->um_fs, dp->i_offset); offset = blkoff(ump->um_fs, dp->i_offset); oldoffset = offset + (oldloc - base); newoffset = offset + (newloc - base); ACQUIRE_LOCK(ump); if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0) goto done; dap = diradd_lookup(pagedep, oldoffset); if (dap) { dap->da_offset = newoffset; newoffset = DIRADDHASH(newoffset); oldoffset = DIRADDHASH(oldoffset); if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE && newoffset != oldoffset) { LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset], dap, da_pdlist); } } done: if (jmvref) { jmvref->jm_pagedep = pagedep; LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps); add_to_journal(&jmvref->jm_list); } bcopy(oldloc, newloc, entrysize); FREE_LOCK(ump); } /* * Move the mkdir dependencies and journal work from one diradd to another * when renaming a directory. The new name must depend on the mkdir deps * completing as the old name did. Directories can only have one valid link * at a time so one must be canonical. */ static void merge_diradd(inodedep, newdap) struct inodedep *inodedep; struct diradd *newdap; { struct diradd *olddap; struct mkdir *mkdir, *nextmd; struct ufsmount *ump; short state; olddap = inodedep->id_mkdiradd; inodedep->id_mkdiradd = newdap; if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { newdap->da_state &= ~DEPCOMPLETE; ump = VFSTOUFS(inodedep->id_list.wk_mp); for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir; mkdir = nextmd) { nextmd = LIST_NEXT(mkdir, md_mkdirs); if (mkdir->md_diradd != olddap) continue; mkdir->md_diradd = newdap; state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY); newdap->da_state |= state; olddap->da_state &= ~state; if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) break; } if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) panic("merge_diradd: unfound ref"); } /* * Any mkdir related journal items are not safe to be freed until * the new name is stable. */ jwork_move(&newdap->da_jwork, &olddap->da_jwork); olddap->da_state |= DEPCOMPLETE; complete_diradd(olddap); } /* * Move the diradd to the pending list when all diradd dependencies are * complete. */ static void complete_diradd(dap) struct diradd *dap; { struct pagedep *pagedep; if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { if (dap->da_state & DIRCHG) pagedep = dap->da_previous->dm_pagedep; else pagedep = dap->da_pagedep; LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); } } /* * Cancel a diradd when a dirrem overlaps with it. We must cancel the journal * add entries and conditonally journal the remove. */ static void cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref) struct diradd *dap; struct dirrem *dirrem; struct jremref *jremref; struct jremref *dotremref; struct jremref *dotdotremref; { struct inodedep *inodedep; struct jaddref *jaddref; struct inoref *inoref; struct ufsmount *ump; struct mkdir *mkdir; /* * If no remove references were allocated we're on a non-journaled * filesystem and can skip the cancel step. */ if (jremref == NULL) { free_diradd(dap, NULL); return; } /* * Cancel the primary name an free it if it does not require * journaling. */ if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum, 0, &inodedep) != 0) { /* Abort the addref that reference this diradd. */ TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { if (inoref->if_list.wk_type != D_JADDREF) continue; jaddref = (struct jaddref *)inoref; if (jaddref->ja_diradd != dap) continue; if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork) == 0) { free_jremref(jremref); jremref = NULL; } break; } } /* * Cancel subordinate names and free them if they do not require * journaling. */ if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { ump = VFSTOUFS(dap->da_list.wk_mp); LIST_FOREACH(mkdir, &ump->softdep_mkdirlisthd, md_mkdirs) { if (mkdir->md_diradd != dap) continue; if ((jaddref = mkdir->md_jaddref) == NULL) continue; mkdir->md_jaddref = NULL; if (mkdir->md_state & MKDIR_PARENT) { if (cancel_jaddref(jaddref, NULL, &dirrem->dm_jwork) == 0) { free_jremref(dotdotremref); dotdotremref = NULL; } } else { if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork) == 0) { free_jremref(dotremref); dotremref = NULL; } } } } if (jremref) journal_jremref(dirrem, jremref, inodedep); if (dotremref) journal_jremref(dirrem, dotremref, inodedep); if (dotdotremref) journal_jremref(dirrem, dotdotremref, NULL); jwork_move(&dirrem->dm_jwork, &dap->da_jwork); free_diradd(dap, &dirrem->dm_jwork); } /* * Free a diradd dependency structure. */ static void free_diradd(dap, wkhd) struct diradd *dap; struct workhead *wkhd; { struct dirrem *dirrem; struct pagedep *pagedep; struct inodedep *inodedep; struct mkdir *mkdir, *nextmd; struct ufsmount *ump; ump = VFSTOUFS(dap->da_list.wk_mp); LOCK_OWNED(ump); LIST_REMOVE(dap, da_pdlist); if (dap->da_state & ONWORKLIST) WORKLIST_REMOVE(&dap->da_list); if ((dap->da_state & DIRCHG) == 0) { pagedep = dap->da_pagedep; } else { dirrem = dap->da_previous; pagedep = dirrem->dm_pagedep; dirrem->dm_dirinum = pagedep->pd_ino; dirrem->dm_state |= COMPLETE; if (LIST_EMPTY(&dirrem->dm_jremrefhd)) add_to_worklist(&dirrem->dm_list, 0); } if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum, 0, &inodedep) != 0) if (inodedep->id_mkdiradd == dap) inodedep->id_mkdiradd = NULL; if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) { for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir; mkdir = nextmd) { nextmd = LIST_NEXT(mkdir, md_mkdirs); if (mkdir->md_diradd != dap) continue; dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); LIST_REMOVE(mkdir, md_mkdirs); if (mkdir->md_state & ONWORKLIST) WORKLIST_REMOVE(&mkdir->md_list); if (mkdir->md_jaddref != NULL) panic("free_diradd: Unexpected jaddref"); WORKITEM_FREE(mkdir, D_MKDIR); if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) break; } if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) panic("free_diradd: unfound ref"); } if (inodedep) free_inodedep(inodedep); /* * Free any journal segments waiting for the directory write. */ handle_jwork(&dap->da_jwork); WORKITEM_FREE(dap, D_DIRADD); } /* * Directory entry removal dependencies. * * When removing a directory entry, the entry's inode pointer must be * zero'ed on disk before the corresponding inode's link count is decremented * (possibly freeing the inode for re-use). This dependency is handled by * updating the directory entry but delaying the inode count reduction until * after the directory block has been written to disk. After this point, the * inode count can be decremented whenever it is convenient. */ /* * This routine should be called immediately after removing * a directory entry. The inode's link count should not be * decremented by the calling procedure -- the soft updates * code will do this task when it is safe. */ void softdep_setup_remove(bp, dp, ip, isrmdir) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for the directory being modified */ struct inode *ip; /* inode for directory entry being removed */ int isrmdir; /* indicates if doing RMDIR */ { struct dirrem *dirrem, *prevdirrem; struct inodedep *inodedep; struct ufsmount *ump; int direct; ump = ITOUMP(ip); KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_setup_remove called on non-softdep filesystem")); /* * Allocate a new dirrem if appropriate and ACQUIRE_LOCK. We want * newdirrem() to setup the full directory remove which requires * isrmdir > 1. */ dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); /* * Add the dirrem to the inodedep's pending remove list for quick * discovery later. */ if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0) panic("softdep_setup_remove: Lost inodedep."); KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked")); dirrem->dm_state |= ONDEPLIST; LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); /* * If the COMPLETE flag is clear, then there were no active * entries and we want to roll back to a zeroed entry until * the new inode is committed to disk. If the COMPLETE flag is * set then we have deleted an entry that never made it to * disk. If the entry we deleted resulted from a name change, * then the old name still resides on disk. We cannot delete * its inode (returned to us in prevdirrem) until the zeroed * directory entry gets to disk. The new inode has never been * referenced on the disk, so can be deleted immediately. */ if ((dirrem->dm_state & COMPLETE) == 0) { LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem, dm_next); FREE_LOCK(ump); } else { if (prevdirrem != NULL) LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, prevdirrem, dm_next); dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino; direct = LIST_EMPTY(&dirrem->dm_jremrefhd); FREE_LOCK(ump); if (direct) handle_workitem_remove(dirrem, 0); } } /* * Check for an entry matching 'offset' on both the pd_dirraddhd list and the * pd_pendinghd list of a pagedep. */ static struct diradd * diradd_lookup(pagedep, offset) struct pagedep *pagedep; int offset; { struct diradd *dap; LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist) if (dap->da_offset == offset) return (dap); LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist) if (dap->da_offset == offset) return (dap); return (NULL); } /* * Search for a .. diradd dependency in a directory that is being removed. * If the directory was renamed to a new parent we have a diradd rather * than a mkdir for the .. entry. We need to cancel it now before * it is found in truncate(). */ static struct jremref * cancel_diradd_dotdot(ip, dirrem, jremref) struct inode *ip; struct dirrem *dirrem; struct jremref *jremref; { struct pagedep *pagedep; struct diradd *dap; struct worklist *wk; if (pagedep_lookup(ITOVFS(ip), NULL, ip->i_number, 0, 0, &pagedep) == 0) return (jremref); dap = diradd_lookup(pagedep, DOTDOT_OFFSET); if (dap == NULL) return (jremref); cancel_diradd(dap, dirrem, jremref, NULL, NULL); /* * Mark any journal work as belonging to the parent so it is freed * with the .. reference. */ LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) wk->wk_state |= MKDIR_PARENT; return (NULL); } /* * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to * replace it with a dirrem/diradd pair as a result of re-parenting a * directory. This ensures that we don't simultaneously have a mkdir and * a diradd for the same .. entry. */ static struct jremref * cancel_mkdir_dotdot(ip, dirrem, jremref) struct inode *ip; struct dirrem *dirrem; struct jremref *jremref; { struct inodedep *inodedep; struct jaddref *jaddref; struct ufsmount *ump; struct mkdir *mkdir; struct diradd *dap; struct mount *mp; mp = ITOVFS(ip); if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) return (jremref); dap = inodedep->id_mkdiradd; if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0) return (jremref); ump = VFSTOUFS(inodedep->id_list.wk_mp); for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir; mkdir = LIST_NEXT(mkdir, md_mkdirs)) if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT) break; if (mkdir == NULL) panic("cancel_mkdir_dotdot: Unable to find mkdir\n"); if ((jaddref = mkdir->md_jaddref) != NULL) { mkdir->md_jaddref = NULL; jaddref->ja_state &= ~MKDIR_PARENT; if (inodedep_lookup(mp, jaddref->ja_ino, 0, &inodedep) == 0) panic("cancel_mkdir_dotdot: Lost parent inodedep"); if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) { journal_jremref(dirrem, jremref, inodedep); jremref = NULL; } } if (mkdir->md_state & ONWORKLIST) WORKLIST_REMOVE(&mkdir->md_list); mkdir->md_state |= ALLCOMPLETE; complete_mkdir(mkdir); return (jremref); } static void journal_jremref(dirrem, jremref, inodedep) struct dirrem *dirrem; struct jremref *jremref; struct inodedep *inodedep; { if (inodedep == NULL) if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0, &inodedep) == 0) panic("journal_jremref: Lost inodedep"); LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps); TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps); add_to_journal(&jremref->jr_list); } static void dirrem_journal(dirrem, jremref, dotremref, dotdotremref) struct dirrem *dirrem; struct jremref *jremref; struct jremref *dotremref; struct jremref *dotdotremref; { struct inodedep *inodedep; if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0, &inodedep) == 0) panic("dirrem_journal: Lost inodedep"); journal_jremref(dirrem, jremref, inodedep); if (dotremref) journal_jremref(dirrem, dotremref, inodedep); if (dotdotremref) journal_jremref(dirrem, dotdotremref, NULL); } /* * Allocate a new dirrem if appropriate and return it along with * its associated pagedep. Called without a lock, returns with lock. */ static struct dirrem * newdirrem(bp, dp, ip, isrmdir, prevdirremp) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for the directory being modified */ struct inode *ip; /* inode for directory entry being removed */ int isrmdir; /* indicates if doing RMDIR */ struct dirrem **prevdirremp; /* previously referenced inode, if any */ { int offset; ufs_lbn_t lbn; struct diradd *dap; struct dirrem *dirrem; struct pagedep *pagedep; struct jremref *jremref; struct jremref *dotremref; struct jremref *dotdotremref; struct vnode *dvp; struct ufsmount *ump; /* * Whiteouts have no deletion dependencies. */ if (ip == NULL) panic("newdirrem: whiteout"); dvp = ITOV(dp); ump = ITOUMP(dp); /* * If the system is over its limit and our filesystem is * responsible for more than our share of that usage and * we are not a snapshot, request some inodedep cleanup. * Limiting the number of dirrem structures will also limit * the number of freefile and freeblks structures. */ ACQUIRE_LOCK(ump); if (!IS_SNAPSHOT(ip) && softdep_excess_items(ump, D_DIRREM)) schedule_cleanup(UFSTOVFS(ump)); else FREE_LOCK(ump); dirrem = malloc(sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS | M_ZERO); workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount); LIST_INIT(&dirrem->dm_jremrefhd); LIST_INIT(&dirrem->dm_jwork); dirrem->dm_state = isrmdir ? RMDIR : 0; dirrem->dm_oldinum = ip->i_number; *prevdirremp = NULL; /* * Allocate remove reference structures to track journal write * dependencies. We will always have one for the link and * when doing directories we will always have one more for dot. * When renaming a directory we skip the dotdot link change so * this is not needed. */ jremref = dotremref = dotdotremref = NULL; if (DOINGSUJ(dvp)) { if (isrmdir) { jremref = newjremref(dirrem, dp, ip, dp->i_offset, ip->i_effnlink + 2); dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET, ip->i_effnlink + 1); dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET, dp->i_effnlink + 1); dotdotremref->jr_state |= MKDIR_PARENT; } else jremref = newjremref(dirrem, dp, ip, dp->i_offset, ip->i_effnlink + 1); } ACQUIRE_LOCK(ump); lbn = lblkno(ump->um_fs, dp->i_offset); offset = blkoff(ump->um_fs, dp->i_offset); pagedep_lookup(UFSTOVFS(ump), bp, dp->i_number, lbn, DEPALLOC, &pagedep); dirrem->dm_pagedep = pagedep; dirrem->dm_offset = offset; /* * If we're renaming a .. link to a new directory, cancel any * existing MKDIR_PARENT mkdir. If it has already been canceled * the jremref is preserved for any potential diradd in this * location. This can not coincide with a rmdir. */ if (dp->i_offset == DOTDOT_OFFSET) { if (isrmdir) panic("newdirrem: .. directory change during remove?"); jremref = cancel_mkdir_dotdot(dp, dirrem, jremref); } /* * If we're removing a directory search for the .. dependency now and * cancel it. Any pending journal work will be added to the dirrem * to be completed when the workitem remove completes. */ if (isrmdir) dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref); /* * Check for a diradd dependency for the same directory entry. * If present, then both dependencies become obsolete and can * be de-allocated. */ dap = diradd_lookup(pagedep, offset); if (dap == NULL) { /* * Link the jremref structures into the dirrem so they are * written prior to the pagedep. */ if (jremref) dirrem_journal(dirrem, jremref, dotremref, dotdotremref); return (dirrem); } /* * Must be ATTACHED at this point. */ if ((dap->da_state & ATTACHED) == 0) panic("newdirrem: not ATTACHED"); if (dap->da_newinum != ip->i_number) panic("newdirrem: inum %ju should be %ju", (uintmax_t)ip->i_number, (uintmax_t)dap->da_newinum); /* * If we are deleting a changed name that never made it to disk, * then return the dirrem describing the previous inode (which * represents the inode currently referenced from this entry on disk). */ if ((dap->da_state & DIRCHG) != 0) { *prevdirremp = dap->da_previous; dap->da_state &= ~DIRCHG; dap->da_pagedep = pagedep; } /* * We are deleting an entry that never made it to disk. * Mark it COMPLETE so we can delete its inode immediately. */ dirrem->dm_state |= COMPLETE; cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref); #ifdef INVARIANTS if (isrmdir == 0) { struct worklist *wk; LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list) if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT)) panic("bad wk %p (0x%X)\n", wk, wk->wk_state); } #endif return (dirrem); } /* * Directory entry change dependencies. * * Changing an existing directory entry requires that an add operation * be completed first followed by a deletion. The semantics for the addition * are identical to the description of adding a new entry above except * that the rollback is to the old inode number rather than zero. Once * the addition dependency is completed, the removal is done as described * in the removal routine above. */ /* * This routine should be called immediately after changing * a directory entry. The inode's link count should not be * decremented by the calling procedure -- the soft updates * code will perform this task when it is safe. */ void softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir) struct buf *bp; /* buffer containing directory block */ struct inode *dp; /* inode for the directory being modified */ struct inode *ip; /* inode for directory entry being removed */ ino_t newinum; /* new inode number for changed entry */ int isrmdir; /* indicates if doing RMDIR */ { int offset; struct diradd *dap = NULL; struct dirrem *dirrem, *prevdirrem; struct pagedep *pagedep; struct inodedep *inodedep; struct jaddref *jaddref; struct mount *mp; struct ufsmount *ump; mp = ITOVFS(dp); ump = VFSTOUFS(mp); offset = blkoff(ump->um_fs, dp->i_offset); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_directory_change called on non-softdep filesystem")); /* * Whiteouts do not need diradd dependencies. */ if (newinum != UFS_WINO) { dap = malloc(sizeof(struct diradd), M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO); workitem_alloc(&dap->da_list, D_DIRADD, mp); dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE; dap->da_offset = offset; dap->da_newinum = newinum; LIST_INIT(&dap->da_jwork); } /* * Allocate a new dirrem and ACQUIRE_LOCK. */ dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem); pagedep = dirrem->dm_pagedep; /* * The possible values for isrmdir: * 0 - non-directory file rename * 1 - directory rename within same directory * inum - directory rename to new directory of given inode number * When renaming to a new directory, we are both deleting and * creating a new directory entry, so the link count on the new * directory should not change. Thus we do not need the followup * dirrem which is usually done in handle_workitem_remove. We set * the DIRCHG flag to tell handle_workitem_remove to skip the * followup dirrem. */ if (isrmdir > 1) dirrem->dm_state |= DIRCHG; /* * Whiteouts have no additional dependencies, * so just put the dirrem on the correct list. */ if (newinum == UFS_WINO) { if ((dirrem->dm_state & COMPLETE) == 0) { LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem, dm_next); } else { dirrem->dm_dirinum = pagedep->pd_ino; if (LIST_EMPTY(&dirrem->dm_jremrefhd)) add_to_worklist(&dirrem->dm_list, 0); } FREE_LOCK(ump); return; } /* * Add the dirrem to the inodedep's pending remove list for quick * discovery later. A valid nlinkdelta ensures that this lookup * will not fail. */ if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) panic("softdep_setup_directory_change: Lost inodedep."); dirrem->dm_state |= ONDEPLIST; LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); /* * If the COMPLETE flag is clear, then there were no active * entries and we want to roll back to the previous inode until * the new inode is committed to disk. If the COMPLETE flag is * set, then we have deleted an entry that never made it to disk. * If the entry we deleted resulted from a name change, then the old * inode reference still resides on disk. Any rollback that we do * needs to be to that old inode (returned to us in prevdirrem). If * the entry we deleted resulted from a create, then there is * no entry on the disk, so we want to roll back to zero rather * than the uncommitted inode. In either of the COMPLETE cases we * want to immediately free the unwritten and unreferenced inode. */ if ((dirrem->dm_state & COMPLETE) == 0) { dap->da_previous = dirrem; } else { if (prevdirrem != NULL) { dap->da_previous = prevdirrem; } else { dap->da_state &= ~DIRCHG; dap->da_pagedep = pagedep; } dirrem->dm_dirinum = pagedep->pd_ino; if (LIST_EMPTY(&dirrem->dm_jremrefhd)) add_to_worklist(&dirrem->dm_list, 0); } /* * Lookup the jaddref for this journal entry. We must finish * initializing it and make the diradd write dependent on it. * If we're not journaling, put it on the id_bufwait list if the * inode is not yet written. If it is written, do the post-inode * write processing to put it on the id_pendinghd list. */ inodedep_lookup(mp, newinum, DEPALLOC, &inodedep); if (MOUNTEDSUJ(mp)) { jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst, inoreflst); KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number, ("softdep_setup_directory_change: bad jaddref %p", jaddref)); jaddref->ja_diroff = dp->i_offset; jaddref->ja_diradd = dap; LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); add_to_journal(&jaddref->ja_list); } else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) { dap->da_state |= COMPLETE; LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); } else { LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap, da_pdlist); WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list); } /* * If we're making a new name for a directory that has not been * committed when need to move the dot and dotdot references to * this new name. */ if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET) merge_diradd(inodedep, dap); FREE_LOCK(ump); } /* * Called whenever the link count on an inode is changed. * It creates an inode dependency so that the new reference(s) * to the inode cannot be committed to disk until the updated * inode has been written. */ void softdep_change_linkcnt(ip) struct inode *ip; /* the inode with the increased link count */ { struct inodedep *inodedep; struct ufsmount *ump; ump = ITOUMP(ip); KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_change_linkcnt called on non-softdep filesystem")); ACQUIRE_LOCK(ump); inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep); if (ip->i_nlink < ip->i_effnlink) panic("softdep_change_linkcnt: bad delta"); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; FREE_LOCK(ump); } /* * Attach a sbdep dependency to the superblock buf so that we can keep * track of the head of the linked list of referenced but unlinked inodes. */ void softdep_setup_sbupdate(ump, fs, bp) struct ufsmount *ump; struct fs *fs; struct buf *bp; { struct sbdep *sbdep; struct worklist *wk; KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_setup_sbupdate called on non-softdep filesystem")); LIST_FOREACH(wk, &bp->b_dep, wk_list) if (wk->wk_type == D_SBDEP) break; if (wk != NULL) return; sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS); workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump)); sbdep->sb_fs = fs; sbdep->sb_ump = ump; ACQUIRE_LOCK(ump); WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list); FREE_LOCK(ump); } /* * Return the first unlinked inodedep which is ready to be the head of the * list. The inodedep and all those after it must have valid next pointers. */ static struct inodedep * first_unlinked_inodedep(ump) struct ufsmount *ump; { struct inodedep *inodedep; struct inodedep *idp; LOCK_OWNED(ump); for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst); inodedep; inodedep = idp) { if ((inodedep->id_state & UNLINKNEXT) == 0) return (NULL); idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0) break; if ((inodedep->id_state & UNLINKPREV) == 0) break; } return (inodedep); } /* * Set the sujfree unlinked head pointer prior to writing a superblock. */ static void initiate_write_sbdep(sbdep) struct sbdep *sbdep; { struct inodedep *inodedep; struct fs *bpfs; struct fs *fs; bpfs = sbdep->sb_fs; fs = sbdep->sb_ump->um_fs; inodedep = first_unlinked_inodedep(sbdep->sb_ump); if (inodedep) { fs->fs_sujfree = inodedep->id_ino; inodedep->id_state |= UNLINKPREV; } else fs->fs_sujfree = 0; bpfs->fs_sujfree = fs->fs_sujfree; /* * Because we have made changes to the superblock, we need to * recompute its check-hash. */ bpfs->fs_ckhash = ffs_calc_sbhash(bpfs); } /* * After a superblock is written determine whether it must be written again * due to a changing unlinked list head. */ static int handle_written_sbdep(sbdep, bp) struct sbdep *sbdep; struct buf *bp; { struct inodedep *inodedep; struct fs *fs; LOCK_OWNED(sbdep->sb_ump); fs = sbdep->sb_fs; /* * If the superblock doesn't match the in-memory list start over. */ inodedep = first_unlinked_inodedep(sbdep->sb_ump); if ((inodedep && fs->fs_sujfree != inodedep->id_ino) || (inodedep == NULL && fs->fs_sujfree != 0)) { bdirty(bp); return (1); } WORKITEM_FREE(sbdep, D_SBDEP); if (fs->fs_sujfree == 0) return (0); /* * Now that we have a record of this inode in stable store allow it * to be written to free up pending work. Inodes may see a lot of * write activity after they are unlinked which we must not hold up. */ for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) { if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS) panic("handle_written_sbdep: Bad inodedep %p (0x%X)", inodedep, inodedep->id_state); if (inodedep->id_state & UNLINKONLIST) break; inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST; } return (0); } /* * Mark an inodedep as unlinked and insert it into the in-memory unlinked list. */ static void unlinked_inodedep(mp, inodedep) struct mount *mp; struct inodedep *inodedep; { struct ufsmount *ump; ump = VFSTOUFS(mp); LOCK_OWNED(ump); if (MOUNTEDSUJ(mp) == 0) return; ump->um_fs->fs_fmod = 1; if (inodedep->id_state & UNLINKED) panic("unlinked_inodedep: %p already unlinked\n", inodedep); inodedep->id_state |= UNLINKED; TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked); } /* * Remove an inodedep from the unlinked inodedep list. This may require * disk writes if the inode has made it that far. */ static void clear_unlinked_inodedep(inodedep) struct inodedep *inodedep; { struct ufs2_dinode *dip; struct ufsmount *ump; struct inodedep *idp; struct inodedep *idn; struct fs *fs, *bpfs; struct buf *bp; daddr_t dbn; ino_t ino; ino_t nino; ino_t pino; int error; ump = VFSTOUFS(inodedep->id_list.wk_mp); fs = ump->um_fs; ino = inodedep->id_ino; error = 0; for (;;) { LOCK_OWNED(ump); KASSERT((inodedep->id_state & UNLINKED) != 0, ("clear_unlinked_inodedep: inodedep %p not unlinked", inodedep)); /* * If nothing has yet been written simply remove us from * the in memory list and return. This is the most common * case where handle_workitem_remove() loses the final * reference. */ if ((inodedep->id_state & UNLINKLINKS) == 0) break; /* * If we have a NEXT pointer and no PREV pointer we can simply * clear NEXT's PREV and remove ourselves from the list. Be * careful not to clear PREV if the superblock points at * next as well. */ idn = TAILQ_NEXT(inodedep, id_unlinked); if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) { if (idn && fs->fs_sujfree != idn->id_ino) idn->id_state &= ~UNLINKPREV; break; } /* * Here we have an inodedep which is actually linked into * the list. We must remove it by forcing a write to the * link before us, whether it be the superblock or an inode. * Unfortunately the list may change while we're waiting * on the buf lock for either resource so we must loop until * we lock the right one. If both the superblock and an * inode point to this inode we must clear the inode first * followed by the superblock. */ idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); pino = 0; if (idp && (idp->id_state & UNLINKNEXT)) pino = idp->id_ino; FREE_LOCK(ump); if (pino == 0) { bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize, 0, 0, 0); } else { dbn = fsbtodb(fs, ino_to_fsba(fs, pino)); error = ffs_breadz(ump, ump->um_devvp, dbn, dbn, (int)fs->fs_bsize, NULL, NULL, 0, NOCRED, 0, NULL, &bp); } ACQUIRE_LOCK(ump); if (error) break; /* If the list has changed restart the loop. */ idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked); nino = 0; if (idp && (idp->id_state & UNLINKNEXT)) nino = idp->id_ino; if (nino != pino || (inodedep->id_state & UNLINKPREV) != UNLINKPREV) { FREE_LOCK(ump); brelse(bp); ACQUIRE_LOCK(ump); continue; } nino = 0; idn = TAILQ_NEXT(inodedep, id_unlinked); if (idn) nino = idn->id_ino; /* * Remove us from the in memory list. After this we cannot * access the inodedep. */ KASSERT((inodedep->id_state & UNLINKED) != 0, ("clear_unlinked_inodedep: inodedep %p not unlinked", inodedep)); inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST); TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); FREE_LOCK(ump); /* * The predecessor's next pointer is manually updated here * so that the NEXT flag is never cleared for an element * that is in the list. */ if (pino == 0) { bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); bpfs = (struct fs *)bp->b_data; ffs_oldfscompat_write(bpfs, ump); softdep_setup_sbupdate(ump, bpfs, bp); /* * Because we may have made changes to the superblock, * we need to recompute its check-hash. */ bpfs->fs_ckhash = ffs_calc_sbhash(bpfs); } else if (fs->fs_magic == FS_UFS1_MAGIC) { ((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, pino))->di_freelink = nino; } else { dip = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, pino); dip->di_freelink = nino; ffs_update_dinode_ckhash(fs, dip); } /* * If the bwrite fails we have no recourse to recover. The * filesystem is corrupted already. */ bwrite(bp); ACQUIRE_LOCK(ump); /* * If the superblock pointer still needs to be cleared force * a write here. */ if (fs->fs_sujfree == ino) { FREE_LOCK(ump); bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize, 0, 0, 0); bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); bpfs = (struct fs *)bp->b_data; ffs_oldfscompat_write(bpfs, ump); softdep_setup_sbupdate(ump, bpfs, bp); /* * Because we may have made changes to the superblock, * we need to recompute its check-hash. */ bpfs->fs_ckhash = ffs_calc_sbhash(bpfs); bwrite(bp); ACQUIRE_LOCK(ump); } if (fs->fs_sujfree != ino) return; panic("clear_unlinked_inodedep: Failed to clear free head"); } if (inodedep->id_ino == fs->fs_sujfree) panic("clear_unlinked_inodedep: Freeing head of free list"); inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST); TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked); return; } /* * This workitem decrements the inode's link count. * If the link count reaches zero, the file is removed. */ static int handle_workitem_remove(dirrem, flags) struct dirrem *dirrem; int flags; { struct inodedep *inodedep; struct workhead dotdotwk; struct worklist *wk; struct ufsmount *ump; struct mount *mp; struct vnode *vp; struct inode *ip; ino_t oldinum; if (dirrem->dm_state & ONWORKLIST) panic("handle_workitem_remove: dirrem %p still on worklist", dirrem); oldinum = dirrem->dm_oldinum; mp = dirrem->dm_list.wk_mp; ump = VFSTOUFS(mp); flags |= LK_EXCLUSIVE; if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ) != 0) return (EBUSY); ip = VTOI(vp); MPASS(ip->i_mode != 0); ACQUIRE_LOCK(ump); if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0) panic("handle_workitem_remove: lost inodedep"); if (dirrem->dm_state & ONDEPLIST) LIST_REMOVE(dirrem, dm_inonext); KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), ("handle_workitem_remove: Journal entries not written.")); /* * Move all dependencies waiting on the remove to complete * from the dirrem to the inode inowait list to be completed * after the inode has been updated and written to disk. * * Any marked MKDIR_PARENT are saved to be completed when the * dotdot ref is removed unless DIRCHG is specified. For * directory change operations there will be no further * directory writes and the jsegdeps need to be moved along * with the rest to be completed when the inode is free or * stable in the inode free list. */ LIST_INIT(&dotdotwk); while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) { WORKLIST_REMOVE(wk); if ((dirrem->dm_state & DIRCHG) == 0 && wk->wk_state & MKDIR_PARENT) { wk->wk_state &= ~MKDIR_PARENT; WORKLIST_INSERT(&dotdotwk, wk); continue; } WORKLIST_INSERT(&inodedep->id_inowait, wk); } LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list); /* * Normal file deletion. */ if ((dirrem->dm_state & RMDIR) == 0) { ip->i_nlink--; KASSERT(ip->i_nlink >= 0, ("handle_workitem_remove: file ino " "%ju negative i_nlink %d", (intmax_t)ip->i_number, ip->i_nlink)); DIP_SET(ip, i_nlink, ip->i_nlink); UFS_INODE_SET_FLAG(ip, IN_CHANGE); if (ip->i_nlink < ip->i_effnlink) panic("handle_workitem_remove: bad file delta"); if (ip->i_nlink == 0) unlinked_inodedep(mp, inodedep); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; KASSERT(LIST_EMPTY(&dirrem->dm_jwork), ("handle_workitem_remove: worklist not empty. %s", TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type))); WORKITEM_FREE(dirrem, D_DIRREM); FREE_LOCK(ump); goto out; } /* * Directory deletion. Decrement reference count for both the * just deleted parent directory entry and the reference for ".". * Arrange to have the reference count on the parent decremented * to account for the loss of "..". */ ip->i_nlink -= 2; KASSERT(ip->i_nlink >= 0, ("handle_workitem_remove: directory ino " "%ju negative i_nlink %d", (intmax_t)ip->i_number, ip->i_nlink)); DIP_SET(ip, i_nlink, ip->i_nlink); UFS_INODE_SET_FLAG(ip, IN_CHANGE); if (ip->i_nlink < ip->i_effnlink) panic("handle_workitem_remove: bad dir delta"); if (ip->i_nlink == 0) unlinked_inodedep(mp, inodedep); inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink; /* * Rename a directory to a new parent. Since, we are both deleting * and creating a new directory entry, the link count on the new * directory should not change. Thus we skip the followup dirrem. */ if (dirrem->dm_state & DIRCHG) { KASSERT(LIST_EMPTY(&dirrem->dm_jwork), ("handle_workitem_remove: DIRCHG and worklist not empty.")); WORKITEM_FREE(dirrem, D_DIRREM); FREE_LOCK(ump); goto out; } dirrem->dm_state = ONDEPLIST; dirrem->dm_oldinum = dirrem->dm_dirinum; /* * Place the dirrem on the parent's diremhd list. */ if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0) panic("handle_workitem_remove: lost dir inodedep"); LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext); /* * If the allocated inode has never been written to disk, then * the on-disk inode is zero'ed and we can remove the file * immediately. When journaling if the inode has been marked * unlinked and not DEPCOMPLETE we know it can never be written. */ inodedep_lookup(mp, oldinum, 0, &inodedep); if (inodedep == NULL || (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED || check_inode_unwritten(inodedep)) { FREE_LOCK(ump); vput(vp); return handle_workitem_remove(dirrem, flags); } WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list); FREE_LOCK(ump); UFS_INODE_SET_FLAG(ip, IN_CHANGE); out: ffs_update(vp, 0); vput(vp); return (0); } /* * Inode de-allocation dependencies. * * When an inode's link count is reduced to zero, it can be de-allocated. We * found it convenient to postpone de-allocation until after the inode is * written to disk with its new link count (zero). At this point, all of the * on-disk inode's block pointers are nullified and, with careful dependency * list ordering, all dependencies related to the inode will be satisfied and * the corresponding dependency structures de-allocated. So, if/when the * inode is reused, there will be no mixing of old dependencies with new * ones. This artificial dependency is set up by the block de-allocation * procedure above (softdep_setup_freeblocks) and completed by the * following procedure. */ static void handle_workitem_freefile(freefile) struct freefile *freefile; { struct workhead wkhd; struct fs *fs; struct ufsmount *ump; int error; #ifdef INVARIANTS struct inodedep *idp; #endif ump = VFSTOUFS(freefile->fx_list.wk_mp); fs = ump->um_fs; #ifdef INVARIANTS ACQUIRE_LOCK(ump); error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp); FREE_LOCK(ump); if (error) panic("handle_workitem_freefile: inodedep %p survived", idp); #endif UFS_LOCK(ump); fs->fs_pendinginodes -= 1; UFS_UNLOCK(ump); LIST_INIT(&wkhd); LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list); if ((error = ffs_freefile(ump, fs, freefile->fx_devvp, freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0) softdep_error("handle_workitem_freefile", error); ACQUIRE_LOCK(ump); WORKITEM_FREE(freefile, D_FREEFILE); FREE_LOCK(ump); } /* * Helper function which unlinks marker element from work list and returns * the next element on the list. */ static __inline struct worklist * markernext(struct worklist *marker) { struct worklist *next; next = LIST_NEXT(marker, wk_list); LIST_REMOVE(marker, wk_list); return next; } /* * Disk writes. * * The dependency structures constructed above are most actively used when file * system blocks are written to disk. No constraints are placed on when a * block can be written, but unsatisfied update dependencies are made safe by * modifying (or replacing) the source memory for the duration of the disk * write. When the disk write completes, the memory block is again brought * up-to-date. * * In-core inode structure reclamation. * * Because there are a finite number of "in-core" inode structures, they are * reused regularly. By transferring all inode-related dependencies to the * in-memory inode block and indexing them separately (via "inodedep"s), we * can allow "in-core" inode structures to be reused at any time and avoid * any increase in contention. * * Called just before entering the device driver to initiate a new disk I/O. * The buffer must be locked, thus, no I/O completion operations can occur * while we are manipulating its associated dependencies. */ static void softdep_disk_io_initiation(bp) struct buf *bp; /* structure describing disk write to occur */ { struct worklist *wk; struct worklist marker; struct inodedep *inodedep; struct freeblks *freeblks; struct jblkdep *jblkdep; struct newblk *newblk; struct ufsmount *ump; /* * We only care about write operations. There should never * be dependencies for reads. */ if (bp->b_iocmd != BIO_WRITE) panic("softdep_disk_io_initiation: not write"); if (bp->b_vflags & BV_BKGRDINPROG) panic("softdep_disk_io_initiation: Writing buffer with " "background write in progress: %p", bp); ump = softdep_bp_to_mp(bp); if (ump == NULL) return; marker.wk_type = D_LAST + 1; /* Not a normal workitem */ PHOLD(curproc); /* Don't swap out kernel stack */ ACQUIRE_LOCK(ump); /* * Do any necessary pre-I/O processing. */ for (wk = LIST_FIRST(&bp->b_dep); wk != NULL; wk = markernext(&marker)) { LIST_INSERT_AFTER(wk, &marker, wk_list); switch (wk->wk_type) { case D_PAGEDEP: initiate_write_filepage(WK_PAGEDEP(wk), bp); continue; case D_INODEDEP: inodedep = WK_INODEDEP(wk); if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) initiate_write_inodeblock_ufs1(inodedep, bp); else initiate_write_inodeblock_ufs2(inodedep, bp); continue; case D_INDIRDEP: initiate_write_indirdep(WK_INDIRDEP(wk), bp); continue; case D_BMSAFEMAP: initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp); continue; case D_JSEG: WK_JSEG(wk)->js_buf = NULL; continue; case D_FREEBLKS: freeblks = WK_FREEBLKS(wk); jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd); /* * We have to wait for the freeblks to be journaled * before we can write an inodeblock with updated * pointers. Be careful to arrange the marker so * we revisit the freeblks if it's not removed by * the first jwait(). */ if (jblkdep != NULL) { LIST_REMOVE(&marker, wk_list); LIST_INSERT_BEFORE(wk, &marker, wk_list); jwait(&jblkdep->jb_list, MNT_WAIT); } continue; case D_ALLOCDIRECT: case D_ALLOCINDIR: /* * We have to wait for the jnewblk to be journaled * before we can write to a block if the contents * may be confused with an earlier file's indirect * at recovery time. Handle the marker as described * above. */ newblk = WK_NEWBLK(wk); if (newblk->nb_jnewblk != NULL && indirblk_lookup(newblk->nb_list.wk_mp, newblk->nb_newblkno)) { LIST_REMOVE(&marker, wk_list); LIST_INSERT_BEFORE(wk, &marker, wk_list); jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); } continue; case D_SBDEP: initiate_write_sbdep(WK_SBDEP(wk)); continue; case D_MKDIR: case D_FREEWORK: case D_FREEDEP: case D_JSEGDEP: continue; default: panic("handle_disk_io_initiation: Unexpected type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } FREE_LOCK(ump); PRELE(curproc); /* Allow swapout of kernel stack */ } /* * Called from within the procedure above to deal with unsatisfied * allocation dependencies in a directory. The buffer must be locked, * thus, no I/O completion operations can occur while we are * manipulating its associated dependencies. */ static void initiate_write_filepage(pagedep, bp) struct pagedep *pagedep; struct buf *bp; { struct jremref *jremref; struct jmvref *jmvref; struct dirrem *dirrem; struct diradd *dap; struct direct *ep; int i; if (pagedep->pd_state & IOSTARTED) { /* * This can only happen if there is a driver that does not * understand chaining. Here biodone will reissue the call * to strategy for the incomplete buffers. */ printf("initiate_write_filepage: already started\n"); return; } pagedep->pd_state |= IOSTARTED; /* * Wait for all journal remove dependencies to hit the disk. * We can not allow any potentially conflicting directory adds * to be visible before removes and rollback is too difficult. * The per-filesystem lock may be dropped and re-acquired, however * we hold the buf locked so the dependency can not go away. */ LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) jwait(&jremref->jr_list, MNT_WAIT); while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) jwait(&jmvref->jm_list, MNT_WAIT); for (i = 0; i < DAHASHSZ; i++) { LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { ep = (struct direct *) ((char *)bp->b_data + dap->da_offset); if (ep->d_ino != dap->da_newinum) panic("%s: dir inum %ju != new %ju", "initiate_write_filepage", (uintmax_t)ep->d_ino, (uintmax_t)dap->da_newinum); if (dap->da_state & DIRCHG) ep->d_ino = dap->da_previous->dm_oldinum; else ep->d_ino = 0; dap->da_state &= ~ATTACHED; dap->da_state |= UNDONE; } } } /* * Version of initiate_write_inodeblock that handles UFS1 dinodes. * Note that any bug fixes made to this routine must be done in the * version found below. * * Called from within the procedure above to deal with unsatisfied * allocation dependencies in an inodeblock. The buffer must be * locked, thus, no I/O completion operations can occur while we * are manipulating its associated dependencies. */ static void initiate_write_inodeblock_ufs1(inodedep, bp) struct inodedep *inodedep; struct buf *bp; /* The inode block */ { struct allocdirect *adp, *lastadp; struct ufs1_dinode *dp; struct ufs1_dinode *sip; struct inoref *inoref; struct ufsmount *ump; struct fs *fs; ufs_lbn_t i; #ifdef INVARIANTS ufs_lbn_t prevlbn = 0; #endif int deplist; if (inodedep->id_state & IOSTARTED) panic("initiate_write_inodeblock_ufs1: already started"); inodedep->id_state |= IOSTARTED; fs = inodedep->id_fs; ump = VFSTOUFS(inodedep->id_list.wk_mp); LOCK_OWNED(ump); dp = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, inodedep->id_ino); /* * If we're on the unlinked list but have not yet written our * next pointer initialize it here. */ if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { struct inodedep *inon; inon = TAILQ_NEXT(inodedep, id_unlinked); dp->di_freelink = inon ? inon->id_ino : 0; } /* * If the bitmap is not yet written, then the allocated * inode cannot be written to disk. */ if ((inodedep->id_state & DEPCOMPLETE) == 0) { if (inodedep->id_savedino1 != NULL) panic("initiate_write_inodeblock_ufs1: I/O underway"); FREE_LOCK(ump); sip = malloc(sizeof(struct ufs1_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS); ACQUIRE_LOCK(ump); inodedep->id_savedino1 = sip; *inodedep->id_savedino1 = *dp; bzero((caddr_t)dp, sizeof(struct ufs1_dinode)); dp->di_gen = inodedep->id_savedino1->di_gen; dp->di_freelink = inodedep->id_savedino1->di_freelink; return; } /* * If no dependencies, then there is nothing to roll back. */ inodedep->id_savedsize = dp->di_size; inodedep->id_savedextsize = 0; inodedep->id_savednlink = dp->di_nlink; if (TAILQ_EMPTY(&inodedep->id_inoupdt) && TAILQ_EMPTY(&inodedep->id_inoreflst)) return; /* * Revert the link count to that of the first unwritten journal entry. */ inoref = TAILQ_FIRST(&inodedep->id_inoreflst); if (inoref) dp->di_nlink = inoref->if_nlink; /* * Set the dependencies to busy. */ for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS if (deplist != 0 && prevlbn >= adp->ad_offset) panic("softdep_write_inodeblock: lbn order"); prevlbn = adp->ad_offset; if (adp->ad_offset < UFS_NDADDR && dp->di_db[adp->ad_offset] != adp->ad_newblkno) panic("initiate_write_inodeblock_ufs1: " "direct pointer #%jd mismatch %d != %jd", (intmax_t)adp->ad_offset, dp->di_db[adp->ad_offset], (intmax_t)adp->ad_newblkno); if (adp->ad_offset >= UFS_NDADDR && dp->di_ib[adp->ad_offset - UFS_NDADDR] != adp->ad_newblkno) panic("initiate_write_inodeblock_ufs1: " "indirect pointer #%jd mismatch %d != %jd", (intmax_t)adp->ad_offset - UFS_NDADDR, dp->di_ib[adp->ad_offset - UFS_NDADDR], (intmax_t)adp->ad_newblkno); deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) panic("initiate_write_inodeblock_ufs1: " "Unknown state 0x%x", adp->ad_state); #endif /* INVARIANTS */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; } /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode * might have fragments that were not the last block in the file * which would corrupt the filesystem. */ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { if (adp->ad_offset >= UFS_NDADDR) break; dp->di_db[adp->ad_offset] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; for (i = adp->ad_offset + 1; i < UFS_NDADDR; i++) { #ifdef INVARIANTS if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) panic("initiate_write_inodeblock_ufs1: " "lost dep1"); #endif /* INVARIANTS */ dp->di_db[i] = 0; } for (i = 0; i < UFS_NIADDR; i++) { #ifdef INVARIANTS if (dp->di_ib[i] != 0 && (deplist & ((1 << UFS_NDADDR) << i)) == 0) panic("initiate_write_inodeblock_ufs1: " "lost dep2"); #endif /* INVARIANTS */ dp->di_ib[i] = 0; } return; } /* * If we have zero'ed out the last allocated block of the file, * roll back the size to the last currently allocated block. * We know that this last allocated block is a full-sized as * we already checked for fragments in the loop above. */ if (lastadp != NULL && dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { for (i = lastadp->ad_offset; i >= 0; i--) if (dp->di_db[i] != 0) break; dp->di_size = (i + 1) * fs->fs_bsize; } /* * The only dependencies are for indirect blocks. * * The file size for indirect block additions is not guaranteed. * Such a guarantee would be non-trivial to achieve. The conventional * synchronous write implementation also does not make this guarantee. * Fsck should catch and fix discrepancies. Arguably, the file size * can be over-estimated without destroying integrity when the file * moves into the indirect blocks (i.e., is large). If we want to * postpone fsck, we are stuck with this argument. */ for (; adp; adp = TAILQ_NEXT(adp, ad_next)) dp->di_ib[adp->ad_offset - UFS_NDADDR] = 0; } /* * Version of initiate_write_inodeblock that handles UFS2 dinodes. * Note that any bug fixes made to this routine must be done in the * version found above. * * Called from within the procedure above to deal with unsatisfied * allocation dependencies in an inodeblock. The buffer must be * locked, thus, no I/O completion operations can occur while we * are manipulating its associated dependencies. */ static void initiate_write_inodeblock_ufs2(inodedep, bp) struct inodedep *inodedep; struct buf *bp; /* The inode block */ { struct allocdirect *adp, *lastadp; struct ufs2_dinode *dp; struct ufs2_dinode *sip; struct inoref *inoref; struct ufsmount *ump; struct fs *fs; ufs_lbn_t i; #ifdef INVARIANTS ufs_lbn_t prevlbn = 0; #endif int deplist; if (inodedep->id_state & IOSTARTED) panic("initiate_write_inodeblock_ufs2: already started"); inodedep->id_state |= IOSTARTED; fs = inodedep->id_fs; ump = VFSTOUFS(inodedep->id_list.wk_mp); LOCK_OWNED(ump); dp = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, inodedep->id_ino); /* * If we're on the unlinked list but have not yet written our * next pointer initialize it here. */ if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) { struct inodedep *inon; inon = TAILQ_NEXT(inodedep, id_unlinked); dp->di_freelink = inon ? inon->id_ino : 0; ffs_update_dinode_ckhash(fs, dp); } /* * If the bitmap is not yet written, then the allocated * inode cannot be written to disk. */ if ((inodedep->id_state & DEPCOMPLETE) == 0) { if (inodedep->id_savedino2 != NULL) panic("initiate_write_inodeblock_ufs2: I/O underway"); FREE_LOCK(ump); sip = malloc(sizeof(struct ufs2_dinode), M_SAVEDINO, M_SOFTDEP_FLAGS); ACQUIRE_LOCK(ump); inodedep->id_savedino2 = sip; *inodedep->id_savedino2 = *dp; bzero((caddr_t)dp, sizeof(struct ufs2_dinode)); dp->di_gen = inodedep->id_savedino2->di_gen; dp->di_freelink = inodedep->id_savedino2->di_freelink; return; } /* * If no dependencies, then there is nothing to roll back. */ inodedep->id_savedsize = dp->di_size; inodedep->id_savedextsize = dp->di_extsize; inodedep->id_savednlink = dp->di_nlink; if (TAILQ_EMPTY(&inodedep->id_inoupdt) && TAILQ_EMPTY(&inodedep->id_extupdt) && TAILQ_EMPTY(&inodedep->id_inoreflst)) return; /* * Revert the link count to that of the first unwritten journal entry. */ inoref = TAILQ_FIRST(&inodedep->id_inoreflst); if (inoref) dp->di_nlink = inoref->if_nlink; /* * Set the ext data dependencies to busy. */ for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS if (deplist != 0 && prevlbn >= adp->ad_offset) panic("initiate_write_inodeblock_ufs2: lbn order"); prevlbn = adp->ad_offset; if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno) panic("initiate_write_inodeblock_ufs2: " "ext pointer #%jd mismatch %jd != %jd", (intmax_t)adp->ad_offset, (intmax_t)dp->di_extb[adp->ad_offset], (intmax_t)adp->ad_newblkno); deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) panic("initiate_write_inodeblock_ufs2: Unknown " "state 0x%x", adp->ad_state); #endif /* INVARIANTS */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; } /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode * might have fragments that were not the last block in the ext * data which would corrupt the filesystem. */ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { dp->di_extb[adp->ad_offset] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; for (i = adp->ad_offset + 1; i < UFS_NXADDR; i++) { #ifdef INVARIANTS if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0) panic("initiate_write_inodeblock_ufs2: " "lost dep1"); #endif /* INVARIANTS */ dp->di_extb[i] = 0; } lastadp = NULL; break; } /* * If we have zero'ed out the last allocated block of the ext * data, roll back the size to the last currently allocated block. * We know that this last allocated block is a full-sized as * we already checked for fragments in the loop above. */ if (lastadp != NULL && dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) { for (i = lastadp->ad_offset; i >= 0; i--) if (dp->di_extb[i] != 0) break; dp->di_extsize = (i + 1) * fs->fs_bsize; } /* * Set the file data dependencies to busy. */ for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = TAILQ_NEXT(adp, ad_next)) { #ifdef INVARIANTS if (deplist != 0 && prevlbn >= adp->ad_offset) panic("softdep_write_inodeblock: lbn order"); if ((adp->ad_state & ATTACHED) == 0) panic("inodedep %p and adp %p not attached", inodedep, adp); prevlbn = adp->ad_offset; if (!ffs_fsfail_cleanup(ump, 0) && adp->ad_offset < UFS_NDADDR && dp->di_db[adp->ad_offset] != adp->ad_newblkno) panic("initiate_write_inodeblock_ufs2: " "direct pointer #%jd mismatch %jd != %jd", (intmax_t)adp->ad_offset, (intmax_t)dp->di_db[adp->ad_offset], (intmax_t)adp->ad_newblkno); if (!ffs_fsfail_cleanup(ump, 0) && adp->ad_offset >= UFS_NDADDR && dp->di_ib[adp->ad_offset - UFS_NDADDR] != adp->ad_newblkno) panic("initiate_write_inodeblock_ufs2: " "indirect pointer #%jd mismatch %jd != %jd", (intmax_t)adp->ad_offset - UFS_NDADDR, (intmax_t)dp->di_ib[adp->ad_offset - UFS_NDADDR], (intmax_t)adp->ad_newblkno); deplist |= 1 << adp->ad_offset; if ((adp->ad_state & ATTACHED) == 0) panic("initiate_write_inodeblock_ufs2: Unknown " "state 0x%x", adp->ad_state); #endif /* INVARIANTS */ adp->ad_state &= ~ATTACHED; adp->ad_state |= UNDONE; } /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode * might have fragments that were not the last block in the file * which would corrupt the filesystem. */ for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) { if (adp->ad_offset >= UFS_NDADDR) break; dp->di_db[adp->ad_offset] = adp->ad_oldblkno; /* keep going until hitting a rollback to a frag */ if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize) continue; dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize; for (i = adp->ad_offset + 1; i < UFS_NDADDR; i++) { #ifdef INVARIANTS if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0) panic("initiate_write_inodeblock_ufs2: " "lost dep2"); #endif /* INVARIANTS */ dp->di_db[i] = 0; } for (i = 0; i < UFS_NIADDR; i++) { #ifdef INVARIANTS if (dp->di_ib[i] != 0 && (deplist & ((1 << UFS_NDADDR) << i)) == 0) panic("initiate_write_inodeblock_ufs2: " "lost dep3"); #endif /* INVARIANTS */ dp->di_ib[i] = 0; } ffs_update_dinode_ckhash(fs, dp); return; } /* * If we have zero'ed out the last allocated block of the file, * roll back the size to the last currently allocated block. * We know that this last allocated block is a full-sized as * we already checked for fragments in the loop above. */ if (lastadp != NULL && dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) { for (i = lastadp->ad_offset; i >= 0; i--) if (dp->di_db[i] != 0) break; dp->di_size = (i + 1) * fs->fs_bsize; } /* * The only dependencies are for indirect blocks. * * The file size for indirect block additions is not guaranteed. * Such a guarantee would be non-trivial to achieve. The conventional * synchronous write implementation also does not make this guarantee. * Fsck should catch and fix discrepancies. Arguably, the file size * can be over-estimated without destroying integrity when the file * moves into the indirect blocks (i.e., is large). If we want to * postpone fsck, we are stuck with this argument. */ for (; adp; adp = TAILQ_NEXT(adp, ad_next)) dp->di_ib[adp->ad_offset - UFS_NDADDR] = 0; ffs_update_dinode_ckhash(fs, dp); } /* * Cancel an indirdep as a result of truncation. Release all of the * children allocindirs and place their journal work on the appropriate * list. */ static void cancel_indirdep(indirdep, bp, freeblks) struct indirdep *indirdep; struct buf *bp; struct freeblks *freeblks; { struct allocindir *aip; /* * None of the indirect pointers will ever be visible, * so they can simply be tossed. GOINGAWAY ensures * that allocated pointers will be saved in the buffer * cache until they are freed. Note that they will * only be able to be found by their physical address * since the inode mapping the logical address will * be gone. The save buffer used for the safe copy * was allocated in setup_allocindir_phase2 using * the physical address so it could be used for this * purpose. Hence we swap the safe copy with the real * copy, allowing the safe copy to be freed and holding * on to the real copy for later use in indir_trunc. */ if (indirdep->ir_state & GOINGAWAY) panic("cancel_indirdep: already gone"); if ((indirdep->ir_state & DEPCOMPLETE) == 0) { indirdep->ir_state |= DEPCOMPLETE; LIST_REMOVE(indirdep, ir_next); } indirdep->ir_state |= GOINGAWAY; /* * Pass in bp for blocks still have journal writes * pending so we can cancel them on their own. */ while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != NULL) cancel_allocindir(aip, bp, freeblks, 0); while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL) cancel_allocindir(aip, NULL, freeblks, 0); while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL) cancel_allocindir(aip, NULL, freeblks, 0); while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) cancel_allocindir(aip, NULL, freeblks, 0); /* * If there are pending partial truncations we need to keep the * old block copy around until they complete. This is because * the current b_data is not a perfect superset of the available * blocks. */ if (TAILQ_EMPTY(&indirdep->ir_trunc)) bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount); else bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); WORKLIST_REMOVE(&indirdep->ir_list); WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list); indirdep->ir_bp = NULL; indirdep->ir_freeblks = freeblks; } /* * Free an indirdep once it no longer has new pointers to track. */ static void free_indirdep(indirdep) struct indirdep *indirdep; { KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc), ("free_indirdep: Indir trunc list not empty.")); KASSERT(LIST_EMPTY(&indirdep->ir_completehd), ("free_indirdep: Complete head not empty.")); KASSERT(LIST_EMPTY(&indirdep->ir_writehd), ("free_indirdep: write head not empty.")); KASSERT(LIST_EMPTY(&indirdep->ir_donehd), ("free_indirdep: done head not empty.")); KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd), ("free_indirdep: deplist head not empty.")); KASSERT((indirdep->ir_state & DEPCOMPLETE), ("free_indirdep: %p still on newblk list.", indirdep)); KASSERT(indirdep->ir_saveddata == NULL, ("free_indirdep: %p still has saved data.", indirdep)); KASSERT(indirdep->ir_savebp == NULL, ("free_indirdep: %p still has savebp buffer.", indirdep)); if (indirdep->ir_state & ONWORKLIST) WORKLIST_REMOVE(&indirdep->ir_list); WORKITEM_FREE(indirdep, D_INDIRDEP); } /* * Called before a write to an indirdep. This routine is responsible for * rolling back pointers to a safe state which includes only those * allocindirs which have been completed. */ static void initiate_write_indirdep(indirdep, bp) struct indirdep *indirdep; struct buf *bp; { struct ufsmount *ump; indirdep->ir_state |= IOSTARTED; if (indirdep->ir_state & GOINGAWAY) panic("disk_io_initiation: indirdep gone"); /* * If there are no remaining dependencies, this will be writing * the real pointers. */ if (LIST_EMPTY(&indirdep->ir_deplisthd) && TAILQ_EMPTY(&indirdep->ir_trunc)) return; /* * Replace up-to-date version with safe version. */ if (indirdep->ir_saveddata == NULL) { ump = VFSTOUFS(indirdep->ir_list.wk_mp); LOCK_OWNED(ump); FREE_LOCK(ump); indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP, M_SOFTDEP_FLAGS); ACQUIRE_LOCK(ump); } indirdep->ir_state &= ~ATTACHED; indirdep->ir_state |= UNDONE; bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount); bcopy(indirdep->ir_savebp->b_data, bp->b_data, bp->b_bcount); } /* * Called when an inode has been cleared in a cg bitmap. This finally * eliminates any canceled jaddrefs */ void softdep_setup_inofree(mp, bp, ino, wkhd) struct mount *mp; struct buf *bp; ino_t ino; struct workhead *wkhd; { struct worklist *wk, *wkn; struct inodedep *inodedep; struct ufsmount *ump; uint8_t *inosused; struct cg *cgp; struct fs *fs; KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_setup_inofree called on non-softdep filesystem")); ump = VFSTOUFS(mp); ACQUIRE_LOCK(ump); if (!ffs_fsfail_cleanup(ump, 0)) { fs = ump->um_fs; cgp = (struct cg *)bp->b_data; inosused = cg_inosused(cgp); if (isset(inosused, ino % fs->fs_ipg)) panic("softdep_setup_inofree: inode %ju not freed.", (uintmax_t)ino); } if (inodedep_lookup(mp, ino, 0, &inodedep)) panic("softdep_setup_inofree: ino %ju has existing inodedep %p", (uintmax_t)ino, inodedep); if (wkhd) { LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) { if (wk->wk_type != D_JADDREF) continue; WORKLIST_REMOVE(wk); /* * We can free immediately even if the jaddref * isn't attached in a background write as now * the bitmaps are reconciled. */ wk->wk_state |= COMPLETE | ATTACHED; free_jaddref(WK_JADDREF(wk)); } jwork_move(&bp->b_dep, wkhd); } FREE_LOCK(ump); } /* * Called via ffs_blkfree() after a set of frags has been cleared from a cg * map. Any dependencies waiting for the write to clear are added to the * buf's list and any jnewblks that are being canceled are discarded * immediately. */ void softdep_setup_blkfree(mp, bp, blkno, frags, wkhd) struct mount *mp; struct buf *bp; ufs2_daddr_t blkno; int frags; struct workhead *wkhd; { struct bmsafemap *bmsafemap; struct jnewblk *jnewblk; struct ufsmount *ump; struct worklist *wk; struct fs *fs; #ifdef INVARIANTS uint8_t *blksfree; struct cg *cgp; ufs2_daddr_t jstart; ufs2_daddr_t jend; ufs2_daddr_t end; long bno; int i; #endif CTR3(KTR_SUJ, "softdep_setup_blkfree: blkno %jd frags %d wk head %p", blkno, frags, wkhd); ump = VFSTOUFS(mp); KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_setup_blkfree called on non-softdep filesystem")); ACQUIRE_LOCK(ump); /* Lookup the bmsafemap so we track when it is dirty. */ fs = ump->um_fs; bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL); /* * Detach any jnewblks which have been canceled. They must linger * until the bitmap is cleared again by ffs_blkfree() to prevent * an unjournaled allocation from hitting the disk. */ if (wkhd) { while ((wk = LIST_FIRST(wkhd)) != NULL) { CTR2(KTR_SUJ, "softdep_setup_blkfree: blkno %jd wk type %d", blkno, wk->wk_type); WORKLIST_REMOVE(wk); if (wk->wk_type != D_JNEWBLK) { WORKLIST_INSERT(&bmsafemap->sm_freehd, wk); continue; } jnewblk = WK_JNEWBLK(wk); KASSERT(jnewblk->jn_state & GOINGAWAY, ("softdep_setup_blkfree: jnewblk not canceled.")); #ifdef INVARIANTS /* * Assert that this block is free in the bitmap * before we discard the jnewblk. */ cgp = (struct cg *)bp->b_data; blksfree = cg_blksfree(cgp); bno = dtogd(fs, jnewblk->jn_blkno); for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) { if (isset(blksfree, bno + i)) continue; panic("softdep_setup_blkfree: not free"); } #endif /* * Even if it's not attached we can free immediately * as the new bitmap is correct. */ wk->wk_state |= COMPLETE | ATTACHED; free_jnewblk(jnewblk); } } #ifdef INVARIANTS /* * Assert that we are not freeing a block which has an outstanding * allocation dependency. */ fs = VFSTOUFS(mp)->um_fs; bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL); end = blkno + frags; LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { /* * Don't match against blocks that will be freed when the * background write is done. */ if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) == (COMPLETE | DEPCOMPLETE)) continue; jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags; jend = jnewblk->jn_blkno + jnewblk->jn_frags; if ((blkno >= jstart && blkno < jend) || (end > jstart && end <= jend)) { printf("state 0x%X %jd - %d %d dep %p\n", jnewblk->jn_state, jnewblk->jn_blkno, jnewblk->jn_oldfrags, jnewblk->jn_frags, jnewblk->jn_dep); panic("softdep_setup_blkfree: " "%jd-%jd(%d) overlaps with %jd-%jd", blkno, end, frags, jstart, jend); } } #endif FREE_LOCK(ump); } /* * Revert a block allocation when the journal record that describes it * is not yet written. */ static int jnewblk_rollback(jnewblk, fs, cgp, blksfree) struct jnewblk *jnewblk; struct fs *fs; struct cg *cgp; uint8_t *blksfree; { ufs1_daddr_t fragno; long cgbno, bbase; int frags, blk; int i; frags = 0; cgbno = dtogd(fs, jnewblk->jn_blkno); /* * We have to test which frags need to be rolled back. We may * be operating on a stale copy when doing background writes. */ for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) if (isclr(blksfree, cgbno + i)) frags++; if (frags == 0) return (0); /* * This is mostly ffs_blkfree() sans some validation and * superblock updates. */ if (frags == fs->fs_frag) { fragno = fragstoblks(fs, cgbno); ffs_setblock(fs, blksfree, fragno); ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; } else { cgbno += jnewblk->jn_oldfrags; bbase = cgbno - fragnum(fs, cgbno); /* Decrement the old frags. */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, -1); /* Deallocate the fragment */ for (i = 0; i < frags; i++) setbit(blksfree, cgbno + i); cgp->cg_cs.cs_nffree += frags; /* Add back in counts associated with the new frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, 1); /* If a complete block has been reassembled, account for it. */ fragno = fragstoblks(fs, bbase); if (ffs_isblock(fs, blksfree, fragno)) { cgp->cg_cs.cs_nffree -= fs->fs_frag; ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; } } stat_jnewblk++; jnewblk->jn_state &= ~ATTACHED; jnewblk->jn_state |= UNDONE; return (frags); } static void initiate_write_bmsafemap(bmsafemap, bp) struct bmsafemap *bmsafemap; struct buf *bp; /* The cg block. */ { struct jaddref *jaddref; struct jnewblk *jnewblk; uint8_t *inosused; uint8_t *blksfree; struct cg *cgp; struct fs *fs; ino_t ino; /* * If this is a background write, we did this at the time that * the copy was made, so do not need to do it again. */ if (bmsafemap->sm_state & IOSTARTED) return; bmsafemap->sm_state |= IOSTARTED; /* * Clear any inode allocations which are pending journal writes. */ if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) { cgp = (struct cg *)bp->b_data; fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; inosused = cg_inosused(cgp); LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) { ino = jaddref->ja_ino % fs->fs_ipg; if (isset(inosused, ino)) { if ((jaddref->ja_mode & IFMT) == IFDIR) cgp->cg_cs.cs_ndir--; cgp->cg_cs.cs_nifree++; clrbit(inosused, ino); jaddref->ja_state &= ~ATTACHED; jaddref->ja_state |= UNDONE; stat_jaddref++; } else panic("initiate_write_bmsafemap: inode %ju " "marked free", (uintmax_t)jaddref->ja_ino); } } /* * Clear any block allocations which are pending journal writes. */ if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { cgp = (struct cg *)bp->b_data; fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; blksfree = cg_blksfree(cgp); LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) { if (jnewblk_rollback(jnewblk, fs, cgp, blksfree)) continue; panic("initiate_write_bmsafemap: block %jd " "marked free", jnewblk->jn_blkno); } } /* * Move allocation lists to the written lists so they can be * cleared once the block write is complete. */ LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr, inodedep, id_deps); LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr, newblk, nb_deps); LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist, wk_list); } void softdep_handle_error(struct buf *bp) { struct ufsmount *ump; ump = softdep_bp_to_mp(bp); if (ump == NULL) return; if (ffs_fsfail_cleanup(ump, bp->b_error)) { /* * No future writes will succeed, so the on-disk image is safe. * Pretend that this write succeeded so that the softdep state * will be cleaned up naturally. */ bp->b_ioflags &= ~BIO_ERROR; bp->b_error = 0; } } /* * This routine is called during the completion interrupt * service routine for a disk write (from the procedure called * by the device driver to inform the filesystem caches of * a request completion). It should be called early in this * procedure, before the block is made available to other * processes or other routines are called. * */ static void softdep_disk_write_complete(bp) struct buf *bp; /* describes the completed disk write */ { struct worklist *wk; struct worklist *owk; struct ufsmount *ump; struct workhead reattach; struct freeblks *freeblks; struct buf *sbp; ump = softdep_bp_to_mp(bp); KASSERT(LIST_EMPTY(&bp->b_dep) || ump != NULL, ("softdep_disk_write_complete: softdep_bp_to_mp returned NULL " "with outstanding dependencies for buffer %p", bp)); if (ump == NULL) return; if ((bp->b_ioflags & BIO_ERROR) != 0) softdep_handle_error(bp); /* * If an error occurred while doing the write, then the data * has not hit the disk and the dependencies cannot be processed. * But we do have to go through and roll forward any dependencies * that were rolled back before the disk write. */ sbp = NULL; ACQUIRE_LOCK(ump); if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) { LIST_FOREACH(wk, &bp->b_dep, wk_list) { switch (wk->wk_type) { case D_PAGEDEP: handle_written_filepage(WK_PAGEDEP(wk), bp, 0); continue; case D_INODEDEP: handle_written_inodeblock(WK_INODEDEP(wk), bp, 0); continue; case D_BMSAFEMAP: handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp, 0); continue; case D_INDIRDEP: handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp, 0); continue; default: /* nothing to roll forward */ continue; } } FREE_LOCK(ump); if (sbp) brelse(sbp); return; } LIST_INIT(&reattach); /* * Ump SU lock must not be released anywhere in this code segment. */ owk = NULL; while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) { WORKLIST_REMOVE(wk); atomic_add_long(&dep_write[wk->wk_type], 1); if (wk == owk) panic("duplicate worklist: %p\n", wk); owk = wk; switch (wk->wk_type) { case D_PAGEDEP: if (handle_written_filepage(WK_PAGEDEP(wk), bp, WRITESUCCEEDED)) WORKLIST_INSERT(&reattach, wk); continue; case D_INODEDEP: if (handle_written_inodeblock(WK_INODEDEP(wk), bp, WRITESUCCEEDED)) WORKLIST_INSERT(&reattach, wk); continue; case D_BMSAFEMAP: if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp, WRITESUCCEEDED)) WORKLIST_INSERT(&reattach, wk); continue; case D_MKDIR: handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY); continue; case D_ALLOCDIRECT: wk->wk_state |= COMPLETE; handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL); continue; case D_ALLOCINDIR: wk->wk_state |= COMPLETE; handle_allocindir_partdone(WK_ALLOCINDIR(wk)); continue; case D_INDIRDEP: if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp, WRITESUCCEEDED)) WORKLIST_INSERT(&reattach, wk); continue; case D_FREEBLKS: wk->wk_state |= COMPLETE; freeblks = WK_FREEBLKS(wk); if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd)) add_to_worklist(wk, WK_NODELAY); continue; case D_FREEWORK: handle_written_freework(WK_FREEWORK(wk)); break; case D_JSEGDEP: free_jsegdep(WK_JSEGDEP(wk)); continue; case D_JSEG: handle_written_jseg(WK_JSEG(wk), bp); continue; case D_SBDEP: if (handle_written_sbdep(WK_SBDEP(wk), bp)) WORKLIST_INSERT(&reattach, wk); continue; case D_FREEDEP: free_freedep(WK_FREEDEP(wk)); continue; default: panic("handle_disk_write_complete: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } /* * Reattach any requests that must be redone. */ while ((wk = LIST_FIRST(&reattach)) != NULL) { WORKLIST_REMOVE(wk); WORKLIST_INSERT(&bp->b_dep, wk); } FREE_LOCK(ump); if (sbp) brelse(sbp); } /* * Called from within softdep_disk_write_complete above. */ static void handle_allocdirect_partdone(adp, wkhd) struct allocdirect *adp; /* the completed allocdirect */ struct workhead *wkhd; /* Work to do when inode is writtne. */ { struct allocdirectlst *listhead; struct allocdirect *listadp; struct inodedep *inodedep; long bsize; LOCK_OWNED(VFSTOUFS(adp->ad_block.nb_list.wk_mp)); if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) return; /* * The on-disk inode cannot claim to be any larger than the last * fragment that has been written. Otherwise, the on-disk inode * might have fragments that were not the last block in the file * which would corrupt the filesystem. Thus, we cannot free any * allocdirects after one whose ad_oldblkno claims a fragment as * these blocks must be rolled back to zero before writing the inode. * We check the currently active set of allocdirects in id_inoupdt * or id_extupdt as appropriate. */ inodedep = adp->ad_inodedep; bsize = inodedep->id_fs->fs_bsize; if (adp->ad_state & EXTDATA) listhead = &inodedep->id_extupdt; else listhead = &inodedep->id_inoupdt; TAILQ_FOREACH(listadp, listhead, ad_next) { /* found our block */ if (listadp == adp) break; /* continue if ad_oldlbn is not a fragment */ if (listadp->ad_oldsize == 0 || listadp->ad_oldsize == bsize) continue; /* hit a fragment */ return; } /* * If we have reached the end of the current list without * finding the just finished dependency, then it must be * on the future dependency list. Future dependencies cannot * be freed until they are moved to the current list. */ if (listadp == NULL) { #ifdef INVARIANTS if (adp->ad_state & EXTDATA) listhead = &inodedep->id_newextupdt; else listhead = &inodedep->id_newinoupdt; TAILQ_FOREACH(listadp, listhead, ad_next) /* found our block */ if (listadp == adp) break; if (listadp == NULL) panic("handle_allocdirect_partdone: lost dep"); #endif /* INVARIANTS */ return; } /* * If we have found the just finished dependency, then queue * it along with anything that follows it that is complete. * Since the pointer has not yet been written in the inode * as the dependency prevents it, place the allocdirect on the * bufwait list where it will be freed once the pointer is * valid. */ if (wkhd == NULL) wkhd = &inodedep->id_bufwait; for (; adp; adp = listadp) { listadp = TAILQ_NEXT(adp, ad_next); if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE) return; TAILQ_REMOVE(listhead, adp, ad_next); WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list); } } /* * Called from within softdep_disk_write_complete above. This routine * completes successfully written allocindirs. */ static void handle_allocindir_partdone(aip) struct allocindir *aip; /* the completed allocindir */ { struct indirdep *indirdep; if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE) return; indirdep = aip->ai_indirdep; LIST_REMOVE(aip, ai_next); /* * Don't set a pointer while the buffer is undergoing IO or while * we have active truncations. */ if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) { LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next); return; } if (indirdep->ir_state & UFS1FMT) ((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = aip->ai_newblkno; else ((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] = aip->ai_newblkno; /* * Await the pointer write before freeing the allocindir. */ LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next); } /* * Release segments held on a jwork list. */ static void handle_jwork(wkhd) struct workhead *wkhd; { struct worklist *wk; while ((wk = LIST_FIRST(wkhd)) != NULL) { WORKLIST_REMOVE(wk); switch (wk->wk_type) { case D_JSEGDEP: free_jsegdep(WK_JSEGDEP(wk)); continue; case D_FREEDEP: free_freedep(WK_FREEDEP(wk)); continue; case D_FREEFRAG: rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep)); WORKITEM_FREE(wk, D_FREEFRAG); continue; case D_FREEWORK: handle_written_freework(WK_FREEWORK(wk)); continue; default: panic("handle_jwork: Unknown type %s\n", TYPENAME(wk->wk_type)); } } } /* * Handle the bufwait list on an inode when it is safe to release items * held there. This normally happens after an inode block is written but * may be delayed and handled later if there are pending journal items that * are not yet safe to be released. */ static struct freefile * handle_bufwait(inodedep, refhd) struct inodedep *inodedep; struct workhead *refhd; { struct jaddref *jaddref; struct freefile *freefile; struct worklist *wk; freefile = NULL; while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) { WORKLIST_REMOVE(wk); switch (wk->wk_type) { case D_FREEFILE: /* * We defer adding freefile to the worklist * until all other additions have been made to * ensure that it will be done after all the * old blocks have been freed. */ if (freefile != NULL) panic("handle_bufwait: freefile"); freefile = WK_FREEFILE(wk); continue; case D_MKDIR: handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT); continue; case D_DIRADD: diradd_inode_written(WK_DIRADD(wk), inodedep); continue; case D_FREEFRAG: wk->wk_state |= COMPLETE; if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE) add_to_worklist(wk, 0); continue; case D_DIRREM: wk->wk_state |= COMPLETE; add_to_worklist(wk, 0); continue; case D_ALLOCDIRECT: case D_ALLOCINDIR: free_newblk(WK_NEWBLK(wk)); continue; case D_JNEWBLK: wk->wk_state |= COMPLETE; free_jnewblk(WK_JNEWBLK(wk)); continue; /* * Save freed journal segments and add references on * the supplied list which will delay their release * until the cg bitmap is cleared on disk. */ case D_JSEGDEP: if (refhd == NULL) free_jsegdep(WK_JSEGDEP(wk)); else WORKLIST_INSERT(refhd, wk); continue; case D_JADDREF: jaddref = WK_JADDREF(wk); TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps); /* * Transfer any jaddrefs to the list to be freed with * the bitmap if we're handling a removed file. */ if (refhd == NULL) { wk->wk_state |= COMPLETE; free_jaddref(jaddref); } else WORKLIST_INSERT(refhd, wk); continue; default: panic("handle_bufwait: Unknown type %p(%s)", wk, TYPENAME(wk->wk_type)); /* NOTREACHED */ } } return (freefile); } /* * Called from within softdep_disk_write_complete above to restore * in-memory inode block contents to their most up-to-date state. Note * that this routine is always called from interrupt level with further * interrupts from this device blocked. * * If the write did not succeed, we will do all the roll-forward * operations, but we will not take the actions that will allow its * dependencies to be processed. */ static int handle_written_inodeblock(inodedep, bp, flags) struct inodedep *inodedep; struct buf *bp; /* buffer containing the inode block */ int flags; { struct freefile *freefile; struct allocdirect *adp, *nextadp; struct ufs1_dinode *dp1 = NULL; struct ufs2_dinode *dp2 = NULL; struct workhead wkhd; int hadchanges, fstype; ino_t freelink; LIST_INIT(&wkhd); hadchanges = 0; freefile = NULL; if ((inodedep->id_state & IOSTARTED) == 0) panic("handle_written_inodeblock: not started"); inodedep->id_state &= ~IOSTARTED; if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) { fstype = UFS1; dp1 = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); freelink = dp1->di_freelink; } else { fstype = UFS2; dp2 = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(inodedep->id_fs, inodedep->id_ino); freelink = dp2->di_freelink; } /* * Leave this inodeblock dirty until it's in the list. */ if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED && (flags & WRITESUCCEEDED)) { struct inodedep *inon; inon = TAILQ_NEXT(inodedep, id_unlinked); if ((inon == NULL && freelink == 0) || (inon && inon->id_ino == freelink)) { if (inon) inon->id_state |= UNLINKPREV; inodedep->id_state |= UNLINKNEXT; } hadchanges = 1; } /* * If we had to rollback the inode allocation because of * bitmaps being incomplete, then simply restore it. * Keep the block dirty so that it will not be reclaimed until * all associated dependencies have been cleared and the * corresponding updates written to disk. */ if (inodedep->id_savedino1 != NULL) { hadchanges = 1; if (fstype == UFS1) *dp1 = *inodedep->id_savedino1; else *dp2 = *inodedep->id_savedino2; free(inodedep->id_savedino1, M_SAVEDINO); inodedep->id_savedino1 = NULL; if ((bp->b_flags & B_DELWRI) == 0) stat_inode_bitmap++; bdirty(bp); /* * If the inode is clear here and GOINGAWAY it will never * be written. Process the bufwait and clear any pending * work which may include the freefile. */ if (inodedep->id_state & GOINGAWAY) goto bufwait; return (1); } if (flags & WRITESUCCEEDED) inodedep->id_state |= COMPLETE; /* * Roll forward anything that had to be rolled back before * the inode could be updated. */ for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) { nextadp = TAILQ_NEXT(adp, ad_next); if (adp->ad_state & ATTACHED) panic("handle_written_inodeblock: new entry"); if (fstype == UFS1) { if (adp->ad_offset < UFS_NDADDR) { if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno) panic("%s %s #%jd mismatch %d != %jd", "handle_written_inodeblock:", "direct pointer", (intmax_t)adp->ad_offset, dp1->di_db[adp->ad_offset], (intmax_t)adp->ad_oldblkno); dp1->di_db[adp->ad_offset] = adp->ad_newblkno; } else { if (dp1->di_ib[adp->ad_offset - UFS_NDADDR] != 0) panic("%s: %s #%jd allocated as %d", "handle_written_inodeblock", "indirect pointer", (intmax_t)adp->ad_offset - UFS_NDADDR, dp1->di_ib[adp->ad_offset - UFS_NDADDR]); dp1->di_ib[adp->ad_offset - UFS_NDADDR] = adp->ad_newblkno; } } else { if (adp->ad_offset < UFS_NDADDR) { if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno) panic("%s: %s #%jd %s %jd != %jd", "handle_written_inodeblock", "direct pointer", (intmax_t)adp->ad_offset, "mismatch", (intmax_t)dp2->di_db[adp->ad_offset], (intmax_t)adp->ad_oldblkno); dp2->di_db[adp->ad_offset] = adp->ad_newblkno; } else { if (dp2->di_ib[adp->ad_offset - UFS_NDADDR] != 0) panic("%s: %s #%jd allocated as %jd", "handle_written_inodeblock", "indirect pointer", (intmax_t)adp->ad_offset - UFS_NDADDR, (intmax_t) dp2->di_ib[adp->ad_offset - UFS_NDADDR]); dp2->di_ib[adp->ad_offset - UFS_NDADDR] = adp->ad_newblkno; } } adp->ad_state &= ~UNDONE; adp->ad_state |= ATTACHED; hadchanges = 1; } for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) { nextadp = TAILQ_NEXT(adp, ad_next); if (adp->ad_state & ATTACHED) panic("handle_written_inodeblock: new entry"); if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno) panic("%s: direct pointers #%jd %s %jd != %jd", "handle_written_inodeblock", (intmax_t)adp->ad_offset, "mismatch", (intmax_t)dp2->di_extb[adp->ad_offset], (intmax_t)adp->ad_oldblkno); dp2->di_extb[adp->ad_offset] = adp->ad_newblkno; adp->ad_state &= ~UNDONE; adp->ad_state |= ATTACHED; hadchanges = 1; } if (hadchanges && (bp->b_flags & B_DELWRI) == 0) stat_direct_blk_ptrs++; /* * Reset the file size to its most up-to-date value. */ if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1) panic("handle_written_inodeblock: bad size"); if (inodedep->id_savednlink > UFS_LINK_MAX) panic("handle_written_inodeblock: Invalid link count " "%jd for inodedep %p", (uintmax_t)inodedep->id_savednlink, inodedep); if (fstype == UFS1) { if (dp1->di_nlink != inodedep->id_savednlink) { dp1->di_nlink = inodedep->id_savednlink; hadchanges = 1; } if (dp1->di_size != inodedep->id_savedsize) { dp1->di_size = inodedep->id_savedsize; hadchanges = 1; } } else { if (dp2->di_nlink != inodedep->id_savednlink) { dp2->di_nlink = inodedep->id_savednlink; hadchanges = 1; } if (dp2->di_size != inodedep->id_savedsize) { dp2->di_size = inodedep->id_savedsize; hadchanges = 1; } if (dp2->di_extsize != inodedep->id_savedextsize) { dp2->di_extsize = inodedep->id_savedextsize; hadchanges = 1; } } inodedep->id_savedsize = -1; inodedep->id_savedextsize = -1; inodedep->id_savednlink = -1; /* * If there were any rollbacks in the inode block, then it must be * marked dirty so that its will eventually get written back in * its correct form. */ if (hadchanges) { if (fstype == UFS2) ffs_update_dinode_ckhash(inodedep->id_fs, dp2); bdirty(bp); } bufwait: /* * If the write did not succeed, we have done all the roll-forward * operations, but we cannot take the actions that will allow its * dependencies to be processed. */ if ((flags & WRITESUCCEEDED) == 0) return (hadchanges); /* * Process any allocdirects that completed during the update. */ if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL) handle_allocdirect_partdone(adp, &wkhd); if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL) handle_allocdirect_partdone(adp, &wkhd); /* * Process deallocations that were held pending until the * inode had been written to disk. Freeing of the inode * is delayed until after all blocks have been freed to * avoid creation of new triples * before the old ones have been deleted. Completely * unlinked inodes are not processed until the unlinked * inode list is written or the last reference is removed. */ if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) { freefile = handle_bufwait(inodedep, NULL); if (freefile && !LIST_EMPTY(&wkhd)) { WORKLIST_INSERT(&wkhd, &freefile->fx_list); freefile = NULL; } } /* * Move rolled forward dependency completions to the bufwait list * now that those that were already written have been processed. */ if (!LIST_EMPTY(&wkhd) && hadchanges == 0) panic("handle_written_inodeblock: bufwait but no changes"); jwork_move(&inodedep->id_bufwait, &wkhd); if (freefile != NULL) { /* * If the inode is goingaway it was never written. Fake up * the state here so free_inodedep() can succeed. */ if (inodedep->id_state & GOINGAWAY) inodedep->id_state |= COMPLETE | DEPCOMPLETE; if (free_inodedep(inodedep) == 0) panic("handle_written_inodeblock: live inodedep %p", inodedep); add_to_worklist(&freefile->fx_list, 0); return (0); } /* * If no outstanding dependencies, free it. */ if (free_inodedep(inodedep) || (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 && TAILQ_FIRST(&inodedep->id_inoupdt) == 0 && TAILQ_FIRST(&inodedep->id_extupdt) == 0 && LIST_FIRST(&inodedep->id_bufwait) == 0)) return (0); return (hadchanges); } /* * Perform needed roll-forwards and kick off any dependencies that * can now be processed. * * If the write did not succeed, we will do all the roll-forward * operations, but we will not take the actions that will allow its * dependencies to be processed. */ static int handle_written_indirdep(indirdep, bp, bpp, flags) struct indirdep *indirdep; struct buf *bp; struct buf **bpp; int flags; { struct allocindir *aip; struct buf *sbp; int chgs; if (indirdep->ir_state & GOINGAWAY) panic("handle_written_indirdep: indirdep gone"); if ((indirdep->ir_state & IOSTARTED) == 0) panic("handle_written_indirdep: IO not started"); chgs = 0; /* * If there were rollbacks revert them here. */ if (indirdep->ir_saveddata) { bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount); if (TAILQ_EMPTY(&indirdep->ir_trunc)) { free(indirdep->ir_saveddata, M_INDIRDEP); indirdep->ir_saveddata = NULL; } chgs = 1; } indirdep->ir_state &= ~(UNDONE | IOSTARTED); indirdep->ir_state |= ATTACHED; /* * If the write did not succeed, we have done all the roll-forward * operations, but we cannot take the actions that will allow its * dependencies to be processed. */ if ((flags & WRITESUCCEEDED) == 0) { stat_indir_blk_ptrs++; bdirty(bp); return (1); } /* * Move allocindirs with written pointers to the completehd if * the indirdep's pointer is not yet written. Otherwise * free them here. */ while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL) { LIST_REMOVE(aip, ai_next); if ((indirdep->ir_state & DEPCOMPLETE) == 0) { LIST_INSERT_HEAD(&indirdep->ir_completehd, aip, ai_next); newblk_freefrag(&aip->ai_block); continue; } free_newblk(&aip->ai_block); } /* * Move allocindirs that have finished dependency processing from * the done list to the write list after updating the pointers. */ if (TAILQ_EMPTY(&indirdep->ir_trunc)) { while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL) { handle_allocindir_partdone(aip); if (aip == LIST_FIRST(&indirdep->ir_donehd)) panic("disk_write_complete: not gone"); chgs = 1; } } /* * Preserve the indirdep if there were any changes or if it is not * yet valid on disk. */ if (chgs) { stat_indir_blk_ptrs++; bdirty(bp); return (1); } /* * If there were no changes we can discard the savedbp and detach * ourselves from the buf. We are only carrying completed pointers * in this case. */ sbp = indirdep->ir_savebp; sbp->b_flags |= B_INVAL | B_NOCACHE; indirdep->ir_savebp = NULL; indirdep->ir_bp = NULL; if (*bpp != NULL) panic("handle_written_indirdep: bp already exists."); *bpp = sbp; /* * The indirdep may not be freed until its parent points at it. */ if (indirdep->ir_state & DEPCOMPLETE) free_indirdep(indirdep); return (0); } /* * Process a diradd entry after its dependent inode has been written. */ static void diradd_inode_written(dap, inodedep) struct diradd *dap; struct inodedep *inodedep; { LOCK_OWNED(VFSTOUFS(dap->da_list.wk_mp)); dap->da_state |= COMPLETE; complete_diradd(dap); WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list); } /* * Returns true if the bmsafemap will have rollbacks when written. Must only * be called with the per-filesystem lock and the buf lock on the cg held. */ static int bmsafemap_backgroundwrite(bmsafemap, bp) struct bmsafemap *bmsafemap; struct buf *bp; { int dirty; LOCK_OWNED(VFSTOUFS(bmsafemap->sm_list.wk_mp)); dirty = !LIST_EMPTY(&bmsafemap->sm_jaddrefhd) | !LIST_EMPTY(&bmsafemap->sm_jnewblkhd); /* * If we're initiating a background write we need to process the * rollbacks as they exist now, not as they exist when IO starts. * No other consumers will look at the contents of the shadowed * buf so this is safe to do here. */ if (bp->b_xflags & BX_BKGRDMARKER) initiate_write_bmsafemap(bmsafemap, bp); return (dirty); } /* * Re-apply an allocation when a cg write is complete. */ static int jnewblk_rollforward(jnewblk, fs, cgp, blksfree) struct jnewblk *jnewblk; struct fs *fs; struct cg *cgp; uint8_t *blksfree; { ufs1_daddr_t fragno; ufs2_daddr_t blkno; long cgbno, bbase; int frags, blk; int i; frags = 0; cgbno = dtogd(fs, jnewblk->jn_blkno); for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) { if (isclr(blksfree, cgbno + i)) panic("jnewblk_rollforward: re-allocated fragment"); frags++; } if (frags == fs->fs_frag) { blkno = fragstoblks(fs, cgbno); ffs_clrblock(fs, blksfree, (long)blkno); ffs_clusteracct(fs, cgp, blkno, -1); cgp->cg_cs.cs_nbfree--; } else { bbase = cgbno - fragnum(fs, cgbno); cgbno += jnewblk->jn_oldfrags; /* If a complete block had been reassembled, account for it. */ fragno = fragstoblks(fs, bbase); if (ffs_isblock(fs, blksfree, fragno)) { cgp->cg_cs.cs_nffree += fs->fs_frag; ffs_clusteracct(fs, cgp, fragno, -1); cgp->cg_cs.cs_nbfree--; } /* Decrement the old frags. */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, -1); /* Allocate the fragment */ for (i = 0; i < frags; i++) clrbit(blksfree, cgbno + i); cgp->cg_cs.cs_nffree -= frags; /* Add back in counts associated with the new frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, 1); } return (frags); } /* * Complete a write to a bmsafemap structure. Roll forward any bitmap * changes if it's not a background write. Set all written dependencies * to DEPCOMPLETE and free the structure if possible. * * If the write did not succeed, we will do all the roll-forward * operations, but we will not take the actions that will allow its * dependencies to be processed. */ static int handle_written_bmsafemap(bmsafemap, bp, flags) struct bmsafemap *bmsafemap; struct buf *bp; int flags; { struct newblk *newblk; struct inodedep *inodedep; struct jaddref *jaddref, *jatmp; struct jnewblk *jnewblk, *jntmp; struct ufsmount *ump; uint8_t *inosused; uint8_t *blksfree; struct cg *cgp; struct fs *fs; ino_t ino; int foreground; int chgs; if ((bmsafemap->sm_state & IOSTARTED) == 0) panic("handle_written_bmsafemap: Not started\n"); ump = VFSTOUFS(bmsafemap->sm_list.wk_mp); chgs = 0; bmsafemap->sm_state &= ~IOSTARTED; foreground = (bp->b_xflags & BX_BKGRDMARKER) == 0; /* * If write was successful, release journal work that was waiting * on the write. Otherwise move the work back. */ if (flags & WRITESUCCEEDED) handle_jwork(&bmsafemap->sm_freewr); else LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist, wk_list); /* * Restore unwritten inode allocation pending jaddref writes. */ if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) { cgp = (struct cg *)bp->b_data; fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; inosused = cg_inosused(cgp); LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps, jatmp) { if ((jaddref->ja_state & UNDONE) == 0) continue; ino = jaddref->ja_ino % fs->fs_ipg; if (isset(inosused, ino)) panic("handle_written_bmsafemap: " "re-allocated inode"); /* Do the roll-forward only if it's a real copy. */ if (foreground) { if ((jaddref->ja_mode & IFMT) == IFDIR) cgp->cg_cs.cs_ndir++; cgp->cg_cs.cs_nifree--; setbit(inosused, ino); chgs = 1; } jaddref->ja_state &= ~UNDONE; jaddref->ja_state |= ATTACHED; free_jaddref(jaddref); } } /* * Restore any block allocations which are pending journal writes. */ if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) { cgp = (struct cg *)bp->b_data; fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs; blksfree = cg_blksfree(cgp); LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps, jntmp) { if ((jnewblk->jn_state & UNDONE) == 0) continue; /* Do the roll-forward only if it's a real copy. */ if (foreground && jnewblk_rollforward(jnewblk, fs, cgp, blksfree)) chgs = 1; jnewblk->jn_state &= ~(UNDONE | NEWBLOCK); jnewblk->jn_state |= ATTACHED; free_jnewblk(jnewblk); } } /* * If the write did not succeed, we have done all the roll-forward * operations, but we cannot take the actions that will allow its * dependencies to be processed. */ if ((flags & WRITESUCCEEDED) == 0) { LIST_CONCAT(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr, newblk, nb_deps); LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist, wk_list); if (foreground) bdirty(bp); return (1); } while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) { newblk->nb_state |= DEPCOMPLETE; newblk->nb_state &= ~ONDEPLIST; newblk->nb_bmsafemap = NULL; LIST_REMOVE(newblk, nb_deps); if (newblk->nb_list.wk_type == D_ALLOCDIRECT) handle_allocdirect_partdone( WK_ALLOCDIRECT(&newblk->nb_list), NULL); else if (newblk->nb_list.wk_type == D_ALLOCINDIR) handle_allocindir_partdone( WK_ALLOCINDIR(&newblk->nb_list)); else if (newblk->nb_list.wk_type != D_NEWBLK) panic("handle_written_bmsafemap: Unexpected type: %s", TYPENAME(newblk->nb_list.wk_type)); } while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) { inodedep->id_state |= DEPCOMPLETE; inodedep->id_state &= ~ONDEPLIST; LIST_REMOVE(inodedep, id_deps); inodedep->id_bmsafemap = NULL; } LIST_REMOVE(bmsafemap, sm_next); if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) && LIST_EMPTY(&bmsafemap->sm_jnewblkhd) && LIST_EMPTY(&bmsafemap->sm_newblkhd) && LIST_EMPTY(&bmsafemap->sm_inodedephd) && LIST_EMPTY(&bmsafemap->sm_freehd)) { LIST_REMOVE(bmsafemap, sm_hash); WORKITEM_FREE(bmsafemap, D_BMSAFEMAP); return (0); } LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next); if (foreground) bdirty(bp); return (1); } /* * Try to free a mkdir dependency. */ static void complete_mkdir(mkdir) struct mkdir *mkdir; { struct diradd *dap; if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE) return; LIST_REMOVE(mkdir, md_mkdirs); dap = mkdir->md_diradd; dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)); if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) { dap->da_state |= DEPCOMPLETE; complete_diradd(dap); } WORKITEM_FREE(mkdir, D_MKDIR); } /* * Handle the completion of a mkdir dependency. */ static void handle_written_mkdir(mkdir, type) struct mkdir *mkdir; int type; { if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type) panic("handle_written_mkdir: bad type"); mkdir->md_state |= COMPLETE; complete_mkdir(mkdir); } static int free_pagedep(pagedep) struct pagedep *pagedep; { int i; if (pagedep->pd_state & NEWBLOCK) return (0); if (!LIST_EMPTY(&pagedep->pd_dirremhd)) return (0); for (i = 0; i < DAHASHSZ; i++) if (!LIST_EMPTY(&pagedep->pd_diraddhd[i])) return (0); if (!LIST_EMPTY(&pagedep->pd_pendinghd)) return (0); if (!LIST_EMPTY(&pagedep->pd_jmvrefhd)) return (0); if (pagedep->pd_state & ONWORKLIST) WORKLIST_REMOVE(&pagedep->pd_list); LIST_REMOVE(pagedep, pd_hash); WORKITEM_FREE(pagedep, D_PAGEDEP); return (1); } /* * Called from within softdep_disk_write_complete above. * A write operation was just completed. Removed inodes can * now be freed and associated block pointers may be committed. * Note that this routine is always called from interrupt level * with further interrupts from this device blocked. * * If the write did not succeed, we will do all the roll-forward * operations, but we will not take the actions that will allow its * dependencies to be processed. */ static int handle_written_filepage(pagedep, bp, flags) struct pagedep *pagedep; struct buf *bp; /* buffer containing the written page */ int flags; { struct dirrem *dirrem; struct diradd *dap, *nextdap; struct direct *ep; int i, chgs; if ((pagedep->pd_state & IOSTARTED) == 0) panic("handle_written_filepage: not started"); pagedep->pd_state &= ~IOSTARTED; if ((flags & WRITESUCCEEDED) == 0) goto rollforward; /* * Process any directory removals that have been committed. */ while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) { LIST_REMOVE(dirrem, dm_next); dirrem->dm_state |= COMPLETE; dirrem->dm_dirinum = pagedep->pd_ino; KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd), ("handle_written_filepage: Journal entries not written.")); add_to_worklist(&dirrem->dm_list, 0); } /* * Free any directory additions that have been committed. * If it is a newly allocated block, we have to wait until * the on-disk directory inode claims the new block. */ if ((pagedep->pd_state & NEWBLOCK) == 0) while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL) free_diradd(dap, NULL); rollforward: /* * Uncommitted directory entries must be restored. */ for (chgs = 0, i = 0; i < DAHASHSZ; i++) { for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap; dap = nextdap) { nextdap = LIST_NEXT(dap, da_pdlist); if (dap->da_state & ATTACHED) panic("handle_written_filepage: attached"); ep = (struct direct *) ((char *)bp->b_data + dap->da_offset); ep->d_ino = dap->da_newinum; dap->da_state &= ~UNDONE; dap->da_state |= ATTACHED; chgs = 1; /* * If the inode referenced by the directory has * been written out, then the dependency can be * moved to the pending list. */ if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) { LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist); } } } /* * If there were any rollbacks in the directory, then it must be * marked dirty so that its will eventually get written back in * its correct form. */ if (chgs || (flags & WRITESUCCEEDED) == 0) { if ((bp->b_flags & B_DELWRI) == 0) stat_dir_entry++; bdirty(bp); return (1); } /* * If we are not waiting for a new directory block to be * claimed by its inode, then the pagedep will be freed. * Otherwise it will remain to track any new entries on * the page in case they are fsync'ed. */ free_pagedep(pagedep); return (0); } /* * Writing back in-core inode structures. * * The filesystem only accesses an inode's contents when it occupies an * "in-core" inode structure. These "in-core" structures are separate from * the page frames used to cache inode blocks. Only the latter are * transferred to/from the disk. So, when the updated contents of the * "in-core" inode structure are copied to the corresponding in-memory inode * block, the dependencies are also transferred. The following procedure is * called when copying a dirty "in-core" inode to a cached inode block. */ /* * Called when an inode is loaded from disk. If the effective link count * differed from the actual link count when it was last flushed, then we * need to ensure that the correct effective link count is put back. */ void softdep_load_inodeblock(ip) struct inode *ip; /* the "in_core" copy of the inode */ { struct inodedep *inodedep; struct ufsmount *ump; ump = ITOUMP(ip); KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_load_inodeblock called on non-softdep filesystem")); /* * Check for alternate nlink count. */ ip->i_effnlink = ip->i_nlink; ACQUIRE_LOCK(ump); if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0) { FREE_LOCK(ump); return; } if (ip->i_nlink != inodedep->id_nlinkwrote && inodedep->id_nlinkwrote != -1) { KASSERT(ip->i_nlink == 0 && (ump->um_flags & UM_FSFAIL_CLEANUP) != 0, ("read bad i_nlink value")); ip->i_effnlink = ip->i_nlink = inodedep->id_nlinkwrote; } ip->i_effnlink -= inodedep->id_nlinkdelta; KASSERT(ip->i_effnlink >= 0, ("softdep_load_inodeblock: negative i_effnlink")); FREE_LOCK(ump); } /* * This routine is called just before the "in-core" inode * information is to be copied to the in-memory inode block. * Recall that an inode block contains several inodes. If * the force flag is set, then the dependencies will be * cleared so that the update can always be made. Note that * the buffer is locked when this routine is called, so we * will never be in the middle of writing the inode block * to disk. */ void softdep_update_inodeblock(ip, bp, waitfor) struct inode *ip; /* the "in_core" copy of the inode */ struct buf *bp; /* the buffer containing the inode block */ int waitfor; /* nonzero => update must be allowed */ { struct inodedep *inodedep; struct inoref *inoref; struct ufsmount *ump; struct worklist *wk; struct mount *mp; struct buf *ibp; struct fs *fs; int error; ump = ITOUMP(ip); mp = UFSTOVFS(ump); KASSERT(MOUNTEDSOFTDEP(mp) != 0, ("softdep_update_inodeblock called on non-softdep filesystem")); fs = ump->um_fs; /* * Preserve the freelink that is on disk. clear_unlinked_inodedep() * does not have access to the in-core ip so must write directly into * the inode block buffer when setting freelink. */ if (fs->fs_magic == FS_UFS1_MAGIC) DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number))->di_freelink); else DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ip->i_number))->di_freelink); /* * If the effective link count is not equal to the actual link * count, then we must track the difference in an inodedep while * the inode is (potentially) tossed out of the cache. Otherwise, * if there is no existing inodedep, then there are no dependencies * to track. */ ACQUIRE_LOCK(ump); again: if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { FREE_LOCK(ump); if (ip->i_effnlink != ip->i_nlink) panic("softdep_update_inodeblock: bad link count"); return; } KASSERT(ip->i_nlink >= inodedep->id_nlinkdelta, ("softdep_update_inodeblock inconsistent ip %p i_nlink %d " "inodedep %p id_nlinkdelta %jd", ip, ip->i_nlink, inodedep, (intmax_t)inodedep->id_nlinkdelta)); inodedep->id_nlinkwrote = ip->i_nlink; if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink) panic("softdep_update_inodeblock: bad delta"); /* * If we're flushing all dependencies we must also move any waiting * for journal writes onto the bufwait list prior to I/O. */ if (waitfor) { TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) == DEPCOMPLETE) { jwait(&inoref->if_list, MNT_WAIT); goto again; } } } /* * Changes have been initiated. Anything depending on these * changes cannot occur until this inode has been written. */ inodedep->id_state &= ~COMPLETE; if ((inodedep->id_state & ONWORKLIST) == 0) WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list); /* * Any new dependencies associated with the incore inode must * now be moved to the list associated with the buffer holding * the in-memory copy of the inode. Once merged process any * allocdirects that are completed by the merger. */ merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt); if (!TAILQ_EMPTY(&inodedep->id_inoupdt)) handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt), NULL); merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt); if (!TAILQ_EMPTY(&inodedep->id_extupdt)) handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt), NULL); /* * Now that the inode has been pushed into the buffer, the * operations dependent on the inode being written to disk * can be moved to the id_bufwait so that they will be * processed when the buffer I/O completes. */ while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) { WORKLIST_REMOVE(wk); WORKLIST_INSERT(&inodedep->id_bufwait, wk); } /* * Newly allocated inodes cannot be written until the bitmap * that allocates them have been written (indicated by * DEPCOMPLETE being set in id_state). If we are doing a * forced sync (e.g., an fsync on a file), we force the bitmap * to be written so that the update can be done. */ if (waitfor == 0) { FREE_LOCK(ump); return; } retry: if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) { FREE_LOCK(ump); return; } ibp = inodedep->id_bmsafemap->sm_buf; ibp = getdirtybuf(ibp, LOCK_PTR(ump), MNT_WAIT); if (ibp == NULL) { /* * If ibp came back as NULL, the dependency could have been * freed while we slept. Look it up again, and check to see * that it has completed. */ if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) goto retry; FREE_LOCK(ump); return; } FREE_LOCK(ump); if ((error = bwrite(ibp)) != 0) softdep_error("softdep_update_inodeblock: bwrite", error); } /* * Merge the a new inode dependency list (such as id_newinoupdt) into an * old inode dependency list (such as id_inoupdt). */ static void merge_inode_lists(newlisthead, oldlisthead) struct allocdirectlst *newlisthead; struct allocdirectlst *oldlisthead; { struct allocdirect *listadp, *newadp; newadp = TAILQ_FIRST(newlisthead); if (newadp != NULL) LOCK_OWNED(VFSTOUFS(newadp->ad_block.nb_list.wk_mp)); for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) { if (listadp->ad_offset < newadp->ad_offset) { listadp = TAILQ_NEXT(listadp, ad_next); continue; } TAILQ_REMOVE(newlisthead, newadp, ad_next); TAILQ_INSERT_BEFORE(listadp, newadp, ad_next); if (listadp->ad_offset == newadp->ad_offset) { allocdirect_merge(oldlisthead, newadp, listadp); listadp = newadp; } newadp = TAILQ_FIRST(newlisthead); } while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) { TAILQ_REMOVE(newlisthead, newadp, ad_next); TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next); } } /* * If we are doing an fsync, then we must ensure that any directory * entries for the inode have been written after the inode gets to disk. */ int softdep_fsync(vp) struct vnode *vp; /* the "in_core" copy of the inode */ { struct inodedep *inodedep; struct pagedep *pagedep; struct inoref *inoref; struct ufsmount *ump; struct worklist *wk; struct diradd *dap; struct mount *mp; struct vnode *pvp; struct inode *ip; struct buf *bp; struct fs *fs; struct thread *td = curthread; int error, flushparent, pagedep_new_block; ino_t parentino; ufs_lbn_t lbn; ip = VTOI(vp); mp = vp->v_mount; ump = VFSTOUFS(mp); fs = ump->um_fs; if (MOUNTEDSOFTDEP(mp) == 0) return (0); ACQUIRE_LOCK(ump); restart: if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) { FREE_LOCK(ump); return (0); } TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) == DEPCOMPLETE) { jwait(&inoref->if_list, MNT_WAIT); goto restart; } } if (!LIST_EMPTY(&inodedep->id_inowait) || !TAILQ_EMPTY(&inodedep->id_extupdt) || !TAILQ_EMPTY(&inodedep->id_newextupdt) || !TAILQ_EMPTY(&inodedep->id_inoupdt) || !TAILQ_EMPTY(&inodedep->id_newinoupdt)) panic("softdep_fsync: pending ops %p", inodedep); for (error = 0, flushparent = 0; ; ) { if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL) break; if (wk->wk_type != D_DIRADD) panic("softdep_fsync: Unexpected type %s", TYPENAME(wk->wk_type)); dap = WK_DIRADD(wk); /* * Flush our parent if this directory entry has a MKDIR_PARENT * dependency or is contained in a newly allocated block. */ if (dap->da_state & DIRCHG) pagedep = dap->da_previous->dm_pagedep; else pagedep = dap->da_pagedep; parentino = pagedep->pd_ino; lbn = pagedep->pd_lbn; if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE) panic("softdep_fsync: dirty"); if ((dap->da_state & MKDIR_PARENT) || (pagedep->pd_state & NEWBLOCK)) flushparent = 1; else flushparent = 0; /* * If we are being fsync'ed as part of vgone'ing this vnode, * then we will not be able to release and recover the * vnode below, so we just have to give up on writing its * directory entry out. It will eventually be written, just * not now, but then the user was not asking to have it * written, so we are not breaking any promises. */ if (VN_IS_DOOMED(vp)) break; /* * We prevent deadlock by always fetching inodes from the * root, moving down the directory tree. Thus, when fetching * our parent directory, we first try to get the lock. If * that fails, we must unlock ourselves before requesting * the lock on our parent. See the comment in ufs_lookup * for details on possible races. */ FREE_LOCK(ump); if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp, FFSV_FORCEINSMQ)) { /* * Unmount cannot proceed after unlock because * caller must have called vn_start_write(). */ VOP_UNLOCK(vp); error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE, &pvp, FFSV_FORCEINSMQ); MPASS(VTOI(pvp)->i_mode != 0); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (VN_IS_DOOMED(vp)) { if (error == 0) vput(pvp); error = ENOENT; } if (error != 0) return (error); } /* * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps * that are contained in direct blocks will be resolved by * doing a ffs_update. Pagedeps contained in indirect blocks * may require a complete sync'ing of the directory. So, we * try the cheap and fast ffs_update first, and if that fails, * then we do the slower ffs_syncvnode of the directory. */ if (flushparent) { int locked; if ((error = ffs_update(pvp, 1)) != 0) { vput(pvp); return (error); } ACQUIRE_LOCK(ump); locked = 1; if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) { if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) { if (wk->wk_type != D_DIRADD) panic("softdep_fsync: Unexpected type %s", TYPENAME(wk->wk_type)); dap = WK_DIRADD(wk); if (dap->da_state & DIRCHG) pagedep = dap->da_previous->dm_pagedep; else pagedep = dap->da_pagedep; pagedep_new_block = pagedep->pd_state & NEWBLOCK; FREE_LOCK(ump); locked = 0; if (pagedep_new_block && (error = ffs_syncvnode(pvp, MNT_WAIT, 0))) { vput(pvp); return (error); } } } if (locked) FREE_LOCK(ump); } /* * Flush directory page containing the inode's name. */ error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred, &bp); if (error == 0) error = bwrite(bp); else brelse(bp); vput(pvp); if (!ffs_fsfail_cleanup(ump, error)) return (error); ACQUIRE_LOCK(ump); if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) break; } FREE_LOCK(ump); return (0); } /* * Flush all the dirty bitmaps associated with the block device * before flushing the rest of the dirty blocks so as to reduce * the number of dependencies that will have to be rolled back. * * XXX Unused? */ void softdep_fsync_mountdev(vp) struct vnode *vp; { struct buf *bp, *nbp; struct worklist *wk; struct bufobj *bo; if (!vn_isdisk(vp, NULL)) panic("softdep_fsync_mountdev: vnode not a disk"); bo = &vp->v_bufobj; restart: BO_LOCK(bo); TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) { /* * If it is already scheduled, skip to the next buffer. */ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if ((bp->b_flags & B_DELWRI) == 0) panic("softdep_fsync_mountdev: not dirty"); /* * We are only interested in bitmaps with outstanding * dependencies. */ if ((wk = LIST_FIRST(&bp->b_dep)) == NULL || wk->wk_type != D_BMSAFEMAP || (bp->b_vflags & BV_BKGRDINPROG)) { BUF_UNLOCK(bp); continue; } BO_UNLOCK(bo); bremfree(bp); (void) bawrite(bp); goto restart; } drain_output(vp); BO_UNLOCK(bo); } /* * Sync all cylinder groups that were dirty at the time this function is * called. Newly dirtied cgs will be inserted before the sentinel. This * is used to flush freedep activity that may be holding up writes to a * indirect block. */ static int sync_cgs(mp, waitfor) struct mount *mp; int waitfor; { struct bmsafemap *bmsafemap; struct bmsafemap *sentinel; struct ufsmount *ump; struct buf *bp; int error; sentinel = malloc(sizeof(*sentinel), M_BMSAFEMAP, M_ZERO | M_WAITOK); sentinel->sm_cg = -1; ump = VFSTOUFS(mp); error = 0; ACQUIRE_LOCK(ump); LIST_INSERT_HEAD(&ump->softdep_dirtycg, sentinel, sm_next); for (bmsafemap = LIST_NEXT(sentinel, sm_next); bmsafemap != NULL; bmsafemap = LIST_NEXT(sentinel, sm_next)) { /* Skip sentinels and cgs with no work to release. */ if (bmsafemap->sm_cg == -1 || (LIST_EMPTY(&bmsafemap->sm_freehd) && LIST_EMPTY(&bmsafemap->sm_freewr))) { LIST_REMOVE(sentinel, sm_next); LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next); continue; } /* * If we don't get the lock and we're waiting try again, if * not move on to the next buf and try to sync it. */ bp = getdirtybuf(bmsafemap->sm_buf, LOCK_PTR(ump), waitfor); if (bp == NULL && waitfor == MNT_WAIT) continue; LIST_REMOVE(sentinel, sm_next); LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next); if (bp == NULL) continue; FREE_LOCK(ump); if (waitfor == MNT_NOWAIT) bawrite(bp); else error = bwrite(bp); ACQUIRE_LOCK(ump); if (error) break; } LIST_REMOVE(sentinel, sm_next); FREE_LOCK(ump); free(sentinel, M_BMSAFEMAP); return (error); } /* * This routine is called when we are trying to synchronously flush a * file. This routine must eliminate any filesystem metadata dependencies * so that the syncing routine can succeed. */ int softdep_sync_metadata(struct vnode *vp) { struct inode *ip; int error; ip = VTOI(vp); KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0, ("softdep_sync_metadata called on non-softdep filesystem")); /* * Ensure that any direct block dependencies have been cleared, * truncations are started, and inode references are journaled. */ ACQUIRE_LOCK(VFSTOUFS(vp->v_mount)); /* * Write all journal records to prevent rollbacks on devvp. */ if (vp->v_type == VCHR) softdep_flushjournal(vp->v_mount); error = flush_inodedep_deps(vp, vp->v_mount, ip->i_number); /* * Ensure that all truncates are written so we won't find deps on * indirect blocks. */ process_truncates(vp); FREE_LOCK(VFSTOUFS(vp->v_mount)); return (error); } /* * This routine is called when we are attempting to sync a buf with * dependencies. If waitfor is MNT_NOWAIT it attempts to schedule any * other IO it can but returns EBUSY if the buffer is not yet able to * be written. Dependencies which will not cause rollbacks will always * return 0. */ int softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor) { struct indirdep *indirdep; struct pagedep *pagedep; struct allocindir *aip; struct newblk *newblk; struct ufsmount *ump; struct buf *nbp; struct worklist *wk; int i, error; KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0, ("softdep_sync_buf called on non-softdep filesystem")); /* * For VCHR we just don't want to force flush any dependencies that * will cause rollbacks. */ if (vp->v_type == VCHR) { if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0)) return (EBUSY); return (0); } ump = VFSTOUFS(vp->v_mount); ACQUIRE_LOCK(ump); /* * As we hold the buffer locked, none of its dependencies * will disappear. */ error = 0; top: LIST_FOREACH(wk, &bp->b_dep, wk_list) { switch (wk->wk_type) { case D_ALLOCDIRECT: case D_ALLOCINDIR: newblk = WK_NEWBLK(wk); if (newblk->nb_jnewblk != NULL) { if (waitfor == MNT_NOWAIT) { error = EBUSY; goto out_unlock; } jwait(&newblk->nb_jnewblk->jn_list, waitfor); goto top; } if (newblk->nb_state & DEPCOMPLETE || waitfor == MNT_NOWAIT) continue; nbp = newblk->nb_bmsafemap->sm_buf; nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor); if (nbp == NULL) goto top; FREE_LOCK(ump); if ((error = bwrite(nbp)) != 0) goto out; ACQUIRE_LOCK(ump); continue; case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); if (waitfor == MNT_NOWAIT) { if (!TAILQ_EMPTY(&indirdep->ir_trunc) || !LIST_EMPTY(&indirdep->ir_deplisthd)) { error = EBUSY; goto out_unlock; } } if (!TAILQ_EMPTY(&indirdep->ir_trunc)) panic("softdep_sync_buf: truncation pending."); restart: LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) { newblk = (struct newblk *)aip; if (newblk->nb_jnewblk != NULL) { jwait(&newblk->nb_jnewblk->jn_list, waitfor); goto restart; } if (newblk->nb_state & DEPCOMPLETE) continue; nbp = newblk->nb_bmsafemap->sm_buf; nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor); if (nbp == NULL) goto restart; FREE_LOCK(ump); if ((error = bwrite(nbp)) != 0) goto out; ACQUIRE_LOCK(ump); goto restart; } continue; case D_PAGEDEP: /* * Only flush directory entries in synchronous passes. */ if (waitfor != MNT_WAIT) { error = EBUSY; goto out_unlock; } /* * While syncing snapshots, we must allow recursive * lookups. */ BUF_AREC(bp); /* * We are trying to sync a directory that may * have dependencies on both its own metadata * and/or dependencies on the inodes of any * recently allocated files. We walk its diradd * lists pushing out the associated inode. */ pagedep = WK_PAGEDEP(wk); for (i = 0; i < DAHASHSZ; i++) { if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0) continue; if ((error = flush_pagedep_deps(vp, wk->wk_mp, &pagedep->pd_diraddhd[i]))) { BUF_NOREC(bp); goto out_unlock; } } BUF_NOREC(bp); continue; case D_FREEWORK: case D_FREEDEP: case D_JSEGDEP: case D_JNEWBLK: continue; default: panic("softdep_sync_buf: Unknown type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } out_unlock: FREE_LOCK(ump); out: return (error); } /* * Flush the dependencies associated with an inodedep. */ static int flush_inodedep_deps(vp, mp, ino) struct vnode *vp; struct mount *mp; ino_t ino; { struct inodedep *inodedep; struct inoref *inoref; struct ufsmount *ump; int error, waitfor; /* * This work is done in two passes. The first pass grabs most * of the buffers and begins asynchronously writing them. The * only way to wait for these asynchronous writes is to sleep * on the filesystem vnode which may stay busy for a long time * if the filesystem is active. So, instead, we make a second * pass over the dependencies blocking on each write. In the * usual case we will be blocking against a write that we * initiated, so when it is done the dependency will have been * resolved. Thus the second pass is expected to end quickly. * We give a brief window at the top of the loop to allow * any pending I/O to complete. */ ump = VFSTOUFS(mp); LOCK_OWNED(ump); for (error = 0, waitfor = MNT_NOWAIT; ; ) { if (error) return (error); FREE_LOCK(ump); ACQUIRE_LOCK(ump); restart: if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) return (0); TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) == DEPCOMPLETE) { jwait(&inoref->if_list, MNT_WAIT); goto restart; } } if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) || flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) || flush_deplist(&inodedep->id_extupdt, waitfor, &error) || flush_deplist(&inodedep->id_newextupdt, waitfor, &error)) continue; /* * If pass2, we are done, otherwise do pass 2. */ if (waitfor == MNT_WAIT) break; waitfor = MNT_WAIT; } /* * Try freeing inodedep in case all dependencies have been removed. */ if (inodedep_lookup(mp, ino, 0, &inodedep) != 0) (void) free_inodedep(inodedep); return (0); } /* * Flush an inode dependency list. */ static int flush_deplist(listhead, waitfor, errorp) struct allocdirectlst *listhead; int waitfor; int *errorp; { struct allocdirect *adp; struct newblk *newblk; struct ufsmount *ump; struct buf *bp; if ((adp = TAILQ_FIRST(listhead)) == NULL) return (0); ump = VFSTOUFS(adp->ad_list.wk_mp); LOCK_OWNED(ump); TAILQ_FOREACH(adp, listhead, ad_next) { newblk = (struct newblk *)adp; if (newblk->nb_jnewblk != NULL) { jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); return (1); } if (newblk->nb_state & DEPCOMPLETE) continue; bp = newblk->nb_bmsafemap->sm_buf; bp = getdirtybuf(bp, LOCK_PTR(ump), waitfor); if (bp == NULL) { if (waitfor == MNT_NOWAIT) continue; return (1); } FREE_LOCK(ump); if (waitfor == MNT_NOWAIT) bawrite(bp); else *errorp = bwrite(bp); ACQUIRE_LOCK(ump); return (1); } return (0); } /* * Flush dependencies associated with an allocdirect block. */ static int flush_newblk_dep(vp, mp, lbn) struct vnode *vp; struct mount *mp; ufs_lbn_t lbn; { struct newblk *newblk; struct ufsmount *ump; struct bufobj *bo; struct inode *ip; struct buf *bp; ufs2_daddr_t blkno; int error; error = 0; bo = &vp->v_bufobj; ip = VTOI(vp); blkno = DIP(ip, i_db[lbn]); if (blkno == 0) panic("flush_newblk_dep: Missing block"); ump = VFSTOUFS(mp); ACQUIRE_LOCK(ump); /* * Loop until all dependencies related to this block are satisfied. * We must be careful to restart after each sleep in case a write * completes some part of this process for us. */ for (;;) { if (newblk_lookup(mp, blkno, 0, &newblk) == 0) { FREE_LOCK(ump); break; } if (newblk->nb_list.wk_type != D_ALLOCDIRECT) panic("flush_newblk_dep: Bad newblk %p", newblk); /* * Flush the journal. */ if (newblk->nb_jnewblk != NULL) { jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT); continue; } /* * Write the bitmap dependency. */ if ((newblk->nb_state & DEPCOMPLETE) == 0) { bp = newblk->nb_bmsafemap->sm_buf; bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT); if (bp == NULL) continue; FREE_LOCK(ump); error = bwrite(bp); if (error) break; ACQUIRE_LOCK(ump); continue; } /* * Write the buffer. */ FREE_LOCK(ump); BO_LOCK(bo); bp = gbincore(bo, lbn); if (bp != NULL) { error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo)); if (error == ENOLCK) { ACQUIRE_LOCK(ump); error = 0; continue; /* Slept, retry */ } if (error != 0) break; /* Failed */ if (bp->b_flags & B_DELWRI) { bremfree(bp); error = bwrite(bp); if (error) break; } else BUF_UNLOCK(bp); } else BO_UNLOCK(bo); /* * We have to wait for the direct pointers to * point at the newdirblk before the dependency * will go away. */ error = ffs_update(vp, 1); if (error) break; ACQUIRE_LOCK(ump); } return (error); } /* * Eliminate a pagedep dependency by flushing out all its diradd dependencies. */ static int flush_pagedep_deps(pvp, mp, diraddhdp) struct vnode *pvp; struct mount *mp; struct diraddhd *diraddhdp; { struct inodedep *inodedep; struct inoref *inoref; struct ufsmount *ump; struct diradd *dap; struct vnode *vp; int error = 0; struct buf *bp; ino_t inum; struct diraddhd unfinished; LIST_INIT(&unfinished); ump = VFSTOUFS(mp); LOCK_OWNED(ump); restart: while ((dap = LIST_FIRST(diraddhdp)) != NULL) { /* * Flush ourselves if this directory entry * has a MKDIR_PARENT dependency. */ if (dap->da_state & MKDIR_PARENT) { FREE_LOCK(ump); if ((error = ffs_update(pvp, 1)) != 0) break; ACQUIRE_LOCK(ump); /* * If that cleared dependencies, go on to next. */ if (dap != LIST_FIRST(diraddhdp)) continue; /* * All MKDIR_PARENT dependencies and all the * NEWBLOCK pagedeps that are contained in direct * blocks were resolved by doing above ffs_update. * Pagedeps contained in indirect blocks may * require a complete sync'ing of the directory. * We are in the midst of doing a complete sync, * so if they are not resolved in this pass we * defer them for now as they will be sync'ed by * our caller shortly. */ LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(&unfinished, dap, da_pdlist); continue; } /* * A newly allocated directory must have its "." and * ".." entries written out before its name can be * committed in its parent. */ inum = dap->da_newinum; if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) panic("flush_pagedep_deps: lost inode1"); /* * Wait for any pending journal adds to complete so we don't * cause rollbacks while syncing. */ TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) { if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY)) == DEPCOMPLETE) { jwait(&inoref->if_list, MNT_WAIT); goto restart; } } if (dap->da_state & MKDIR_BODY) { FREE_LOCK(ump); if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ))) break; MPASS(VTOI(vp)->i_mode != 0); error = flush_newblk_dep(vp, mp, 0); /* * If we still have the dependency we might need to * update the vnode to sync the new link count to * disk. */ if (error == 0 && dap == LIST_FIRST(diraddhdp)) error = ffs_update(vp, 1); vput(vp); if (error != 0) break; ACQUIRE_LOCK(ump); /* * If that cleared dependencies, go on to next. */ if (dap != LIST_FIRST(diraddhdp)) continue; if (dap->da_state & MKDIR_BODY) { inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep); panic("flush_pagedep_deps: MKDIR_BODY " "inodedep %p dap %p vp %p", inodedep, dap, vp); } } /* * Flush the inode on which the directory entry depends. * Having accounted for MKDIR_PARENT and MKDIR_BODY above, * the only remaining dependency is that the updated inode * count must get pushed to disk. The inode has already * been pushed into its inode buffer (via VOP_UPDATE) at * the time of the reference count change. So we need only * locate that buffer, ensure that there will be no rollback * caused by a bitmap dependency, then write the inode buffer. */ retry: if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0) panic("flush_pagedep_deps: lost inode"); /* * If the inode still has bitmap dependencies, * push them to disk. */ if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) { bp = inodedep->id_bmsafemap->sm_buf; bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT); if (bp == NULL) goto retry; FREE_LOCK(ump); if ((error = bwrite(bp)) != 0) break; ACQUIRE_LOCK(ump); if (dap != LIST_FIRST(diraddhdp)) continue; } /* * If the inode is still sitting in a buffer waiting * to be written or waiting for the link count to be * adjusted update it here to flush it to disk. */ if (dap == LIST_FIRST(diraddhdp)) { FREE_LOCK(ump); if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ))) break; MPASS(VTOI(vp)->i_mode != 0); error = ffs_update(vp, 1); vput(vp); if (error) break; ACQUIRE_LOCK(ump); } /* * If we have failed to get rid of all the dependencies * then something is seriously wrong. */ if (dap == LIST_FIRST(diraddhdp)) { inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep); panic("flush_pagedep_deps: failed to flush " "inodedep %p ino %ju dap %p", inodedep, (uintmax_t)inum, dap); } } if (error) ACQUIRE_LOCK(ump); while ((dap = LIST_FIRST(&unfinished)) != NULL) { LIST_REMOVE(dap, da_pdlist); LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist); } return (error); } /* * A large burst of file addition or deletion activity can drive the * memory load excessively high. First attempt to slow things down * using the techniques below. If that fails, this routine requests * the offending operations to fall back to running synchronously * until the memory load returns to a reasonable level. */ int softdep_slowdown(vp) struct vnode *vp; { struct ufsmount *ump; int jlow; int max_softdeps_hard; KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0, ("softdep_slowdown called on non-softdep filesystem")); ump = VFSTOUFS(vp->v_mount); ACQUIRE_LOCK(ump); jlow = 0; /* * Check for journal space if needed. */ if (DOINGSUJ(vp)) { if (journal_space(ump, 0) == 0) jlow = 1; } /* * If the system is under its limits and our filesystem is * not responsible for more than our share of the usage and * we are not low on journal space, then no need to slow down. */ max_softdeps_hard = max_softdeps * 11 / 10; if (dep_current[D_DIRREM] < max_softdeps_hard / 2 && dep_current[D_INODEDEP] < max_softdeps_hard && dep_current[D_INDIRDEP] < max_softdeps_hard / 1000 && dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0 && ump->softdep_curdeps[D_DIRREM] < (max_softdeps_hard / 2) / stat_flush_threads && ump->softdep_curdeps[D_INODEDEP] < max_softdeps_hard / stat_flush_threads && ump->softdep_curdeps[D_INDIRDEP] < (max_softdeps_hard / 1000) / stat_flush_threads && ump->softdep_curdeps[D_FREEBLKS] < max_softdeps_hard / stat_flush_threads) { FREE_LOCK(ump); return (0); } /* * If the journal is low or our filesystem is over its limit * then speedup the cleanup. */ if (ump->softdep_curdeps[D_INDIRDEP] < (max_softdeps_hard / 1000) / stat_flush_threads || jlow) softdep_speedup(ump); stat_sync_limit_hit += 1; FREE_LOCK(ump); /* * We only slow down the rate at which new dependencies are * generated if we are not using journaling. With journaling, * the cleanup should always be sufficient to keep things * under control. */ if (DOINGSUJ(vp)) return (0); return (1); } /* * Called by the allocation routines when they are about to fail * in the hope that we can free up the requested resource (inodes * or disk space). * * First check to see if the work list has anything on it. If it has, * clean up entries until we successfully free the requested resource. * Because this process holds inodes locked, we cannot handle any remove * requests that might block on a locked inode as that could lead to * deadlock. If the worklist yields none of the requested resource, * start syncing out vnodes to free up the needed space. */ int softdep_request_cleanup(fs, vp, cred, resource) struct fs *fs; struct vnode *vp; struct ucred *cred; int resource; { struct ufsmount *ump; struct mount *mp; long starttime; ufs2_daddr_t needed; int error, failed_vnode; /* * If we are being called because of a process doing a * copy-on-write, then it is not safe to process any * worklist items as we will recurse into the copyonwrite * routine. This will result in an incoherent snapshot. * If the vnode that we hold is a snapshot, we must avoid * handling other resources that could cause deadlock. */ if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp))) return (0); if (resource == FLUSH_BLOCKS_WAIT) stat_cleanup_blkrequests += 1; else stat_cleanup_inorequests += 1; mp = vp->v_mount; ump = VFSTOUFS(mp); mtx_assert(UFS_MTX(ump), MA_OWNED); UFS_UNLOCK(ump); error = ffs_update(vp, 1); if (error != 0 || MOUNTEDSOFTDEP(mp) == 0) { UFS_LOCK(ump); return (0); } /* * If we are in need of resources, start by cleaning up * any block removals associated with our inode. */ ACQUIRE_LOCK(ump); process_removes(vp); process_truncates(vp); FREE_LOCK(ump); /* * Now clean up at least as many resources as we will need. * * When requested to clean up inodes, the number that are needed * is set by the number of simultaneous writers (mnt_writeopcount) * plus a bit of slop (2) in case some more writers show up while * we are cleaning. * * When requested to free up space, the amount of space that * we need is enough blocks to allocate a full-sized segment * (fs_contigsumsize). The number of such segments that will * be needed is set by the number of simultaneous writers * (mnt_writeopcount) plus a bit of slop (2) in case some more * writers show up while we are cleaning. * * Additionally, if we are unpriviledged and allocating space, * we need to ensure that we clean up enough blocks to get the * needed number of blocks over the threshold of the minimum * number of blocks required to be kept free by the filesystem * (fs_minfree). */ if (resource == FLUSH_INODES_WAIT) { needed = vfs_mount_fetch_counter(vp->v_mount, MNT_COUNT_WRITEOPCOUNT) + 2; } else if (resource == FLUSH_BLOCKS_WAIT) { needed = (vfs_mount_fetch_counter(vp->v_mount, MNT_COUNT_WRITEOPCOUNT) + 2) * fs->fs_contigsumsize; if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE)) needed += fragstoblks(fs, roundup((fs->fs_dsize * fs->fs_minfree / 100) - fs->fs_cstotal.cs_nffree, fs->fs_frag)); } else { printf("softdep_request_cleanup: Unknown resource type %d\n", resource); UFS_LOCK(ump); return (0); } starttime = time_second; retry: if (resource == FLUSH_BLOCKS_WAIT && fs->fs_cstotal.cs_nbfree <= needed) softdep_send_speedup(ump, needed * fs->fs_bsize, BIO_SPEEDUP_TRIM); if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 && fs->fs_cstotal.cs_nbfree <= needed) || (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 && fs->fs_cstotal.cs_nifree <= needed)) { ACQUIRE_LOCK(ump); if (ump->softdep_on_worklist > 0 && process_worklist_item(UFSTOVFS(ump), ump->softdep_on_worklist, LK_NOWAIT) != 0) stat_worklist_push += 1; FREE_LOCK(ump); } /* * If we still need resources and there are no more worklist * entries to process to obtain them, we have to start flushing * the dirty vnodes to force the release of additional requests * to the worklist that we can then process to reap addition * resources. We walk the vnodes associated with the mount point * until we get the needed worklist requests that we can reap. * * If there are several threads all needing to clean the same * mount point, only one is allowed to walk the mount list. * When several threads all try to walk the same mount list, * they end up competing with each other and often end up in * livelock. This approach ensures that forward progress is * made at the cost of occational ENOSPC errors being returned * that might otherwise have been avoided. */ error = 1; if ((resource == FLUSH_BLOCKS_WAIT && fs->fs_cstotal.cs_nbfree <= needed) || (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 && fs->fs_cstotal.cs_nifree <= needed)) { ACQUIRE_LOCK(ump); if ((ump->um_softdep->sd_flags & FLUSH_RC_ACTIVE) == 0) { ump->um_softdep->sd_flags |= FLUSH_RC_ACTIVE; FREE_LOCK(ump); failed_vnode = softdep_request_cleanup_flush(mp, ump); ACQUIRE_LOCK(ump); ump->um_softdep->sd_flags &= ~FLUSH_RC_ACTIVE; FREE_LOCK(ump); if (ump->softdep_on_worklist > 0) { stat_cleanup_retries += 1; if (!failed_vnode) goto retry; } } else { FREE_LOCK(ump); error = 0; } stat_cleanup_failures += 1; } if (time_second - starttime > stat_cleanup_high_delay) stat_cleanup_high_delay = time_second - starttime; UFS_LOCK(ump); return (error); } /* * Scan the vnodes for the specified mount point flushing out any * vnodes that can be locked without waiting. Finally, try to flush * the device associated with the mount point if it can be locked * without waiting. * * We return 0 if we were able to lock every vnode in our scan. * If we had to skip one or more vnodes, we return 1. */ static int softdep_request_cleanup_flush(mp, ump) struct mount *mp; struct ufsmount *ump; { struct thread *td; struct vnode *lvp, *mvp; int failed_vnode; failed_vnode = 0; td = curthread; MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) { if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) { VI_UNLOCK(lvp); continue; } - if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT, - td) != 0) { + if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT) != 0) { failed_vnode = 1; continue; } if (lvp->v_vflag & VV_NOSYNC) { /* unlinked */ vput(lvp); continue; } (void) ffs_syncvnode(lvp, MNT_NOWAIT, 0); vput(lvp); } lvp = ump->um_devvp; if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) { VOP_FSYNC(lvp, MNT_NOWAIT, td); VOP_UNLOCK(lvp); } return (failed_vnode); } static bool softdep_excess_items(struct ufsmount *ump, int item) { KASSERT(item >= 0 && item < D_LAST, ("item %d", item)); return (dep_current[item] > max_softdeps && ump->softdep_curdeps[item] > max_softdeps / stat_flush_threads); } static void schedule_cleanup(struct mount *mp) { struct ufsmount *ump; struct thread *td; ump = VFSTOUFS(mp); LOCK_OWNED(ump); FREE_LOCK(ump); td = curthread; if ((td->td_pflags & TDP_KTHREAD) != 0 && (td->td_proc->p_flag2 & P2_AST_SU) == 0) { /* * No ast is delivered to kernel threads, so nobody * would deref the mp. Some kernel threads * explicitely check for AST, e.g. NFS daemon does * this in the serving loop. */ return; } if (td->td_su != NULL) vfs_rel(td->td_su); vfs_ref(mp); td->td_su = mp; thread_lock(td); td->td_flags |= TDF_ASTPENDING; thread_unlock(td); } static void softdep_ast_cleanup_proc(struct thread *td) { struct mount *mp; struct ufsmount *ump; int error; bool req; while ((mp = td->td_su) != NULL) { td->td_su = NULL; error = vfs_busy(mp, MBF_NOWAIT); vfs_rel(mp); if (error != 0) return; if (ffs_own_mount(mp) && MOUNTEDSOFTDEP(mp)) { ump = VFSTOUFS(mp); for (;;) { req = false; ACQUIRE_LOCK(ump); if (softdep_excess_items(ump, D_INODEDEP)) { req = true; request_cleanup(mp, FLUSH_INODES); } if (softdep_excess_items(ump, D_DIRREM)) { req = true; request_cleanup(mp, FLUSH_BLOCKS); } FREE_LOCK(ump); if (softdep_excess_items(ump, D_NEWBLK) || softdep_excess_items(ump, D_ALLOCDIRECT) || softdep_excess_items(ump, D_ALLOCINDIR)) { error = vn_start_write(NULL, &mp, V_WAIT); if (error == 0) { req = true; VFS_SYNC(mp, MNT_WAIT); vn_finished_write(mp); } } if ((td->td_pflags & TDP_KTHREAD) != 0 || !req) break; } } vfs_unbusy(mp); } if ((mp = td->td_su) != NULL) { td->td_su = NULL; vfs_rel(mp); } } /* * If memory utilization has gotten too high, deliberately slow things * down and speed up the I/O processing. */ static int request_cleanup(mp, resource) struct mount *mp; int resource; { struct thread *td = curthread; struct ufsmount *ump; ump = VFSTOUFS(mp); LOCK_OWNED(ump); /* * We never hold up the filesystem syncer or buf daemon. */ if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF)) return (0); /* * First check to see if the work list has gotten backlogged. * If it has, co-opt this process to help clean up two entries. * Because this process may hold inodes locked, we cannot * handle any remove requests that might block on a locked * inode as that could lead to deadlock. We set TDP_SOFTDEP * to avoid recursively processing the worklist. */ if (ump->softdep_on_worklist > max_softdeps / 10) { td->td_pflags |= TDP_SOFTDEP; process_worklist_item(mp, 2, LK_NOWAIT); td->td_pflags &= ~TDP_SOFTDEP; stat_worklist_push += 2; return(1); } /* * Next, we attempt to speed up the syncer process. If that * is successful, then we allow the process to continue. */ if (softdep_speedup(ump) && resource != FLUSH_BLOCKS_WAIT && resource != FLUSH_INODES_WAIT) return(0); /* * If we are resource constrained on inode dependencies, try * flushing some dirty inodes. Otherwise, we are constrained * by file deletions, so try accelerating flushes of directories * with removal dependencies. We would like to do the cleanup * here, but we probably hold an inode locked at this point and * that might deadlock against one that we try to clean. So, * the best that we can do is request the syncer daemon to do * the cleanup for us. */ switch (resource) { case FLUSH_INODES: case FLUSH_INODES_WAIT: ACQUIRE_GBLLOCK(&lk); stat_ino_limit_push += 1; req_clear_inodedeps += 1; FREE_GBLLOCK(&lk); stat_countp = &stat_ino_limit_hit; break; case FLUSH_BLOCKS: case FLUSH_BLOCKS_WAIT: ACQUIRE_GBLLOCK(&lk); stat_blk_limit_push += 1; req_clear_remove += 1; FREE_GBLLOCK(&lk); stat_countp = &stat_blk_limit_hit; break; default: panic("request_cleanup: unknown type"); } /* * Hopefully the syncer daemon will catch up and awaken us. * We wait at most tickdelay before proceeding in any case. */ ACQUIRE_GBLLOCK(&lk); FREE_LOCK(ump); proc_waiting += 1; if (callout_pending(&softdep_callout) == FALSE) callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2, pause_timer, 0); if ((td->td_pflags & TDP_KTHREAD) == 0) msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0); proc_waiting -= 1; FREE_GBLLOCK(&lk); ACQUIRE_LOCK(ump); return (1); } /* * Awaken processes pausing in request_cleanup and clear proc_waiting * to indicate that there is no longer a timer running. Pause_timer * will be called with the global softdep mutex (&lk) locked. */ static void pause_timer(arg) void *arg; { GBLLOCK_OWNED(&lk); /* * The callout_ API has acquired mtx and will hold it around this * function call. */ *stat_countp += proc_waiting; wakeup(&proc_waiting); } /* * If requested, try removing inode or removal dependencies. */ static void check_clear_deps(mp) struct mount *mp; { struct ufsmount *ump; bool suj_susp; /* * Tell the lower layers that any TRIM or WRITE transactions that have * been delayed for performance reasons should proceed to help alleviate * the shortage faster. The race between checking req_* and the softdep * mutex (lk) is fine since this is an advisory operation that at most * causes deferred work to be done sooner. */ ump = VFSTOUFS(mp); suj_susp = MOUNTEDSUJ(mp) && ump->softdep_jblocks->jb_suspended; if (req_clear_remove || req_clear_inodedeps || suj_susp) { FREE_LOCK(ump); softdep_send_speedup(ump, 0, BIO_SPEEDUP_TRIM | BIO_SPEEDUP_WRITE); ACQUIRE_LOCK(ump); } /* * If we are suspended, it may be because of our using * too many inodedeps, so help clear them out. */ if (suj_susp) clear_inodedeps(mp); /* * General requests for cleanup of backed up dependencies */ ACQUIRE_GBLLOCK(&lk); if (req_clear_inodedeps) { req_clear_inodedeps -= 1; FREE_GBLLOCK(&lk); clear_inodedeps(mp); ACQUIRE_GBLLOCK(&lk); wakeup(&proc_waiting); } if (req_clear_remove) { req_clear_remove -= 1; FREE_GBLLOCK(&lk); clear_remove(mp); ACQUIRE_GBLLOCK(&lk); wakeup(&proc_waiting); } FREE_GBLLOCK(&lk); } /* * Flush out a directory with at least one removal dependency in an effort to * reduce the number of dirrem, freefile, and freeblks dependency structures. */ static void clear_remove(mp) struct mount *mp; { struct pagedep_hashhead *pagedephd; struct pagedep *pagedep; struct ufsmount *ump; struct vnode *vp; struct bufobj *bo; int error, cnt; ino_t ino; ump = VFSTOUFS(mp); LOCK_OWNED(ump); for (cnt = 0; cnt <= ump->pagedep_hash_size; cnt++) { pagedephd = &ump->pagedep_hashtbl[ump->pagedep_nextclean++]; if (ump->pagedep_nextclean > ump->pagedep_hash_size) ump->pagedep_nextclean = 0; LIST_FOREACH(pagedep, pagedephd, pd_hash) { if (LIST_EMPTY(&pagedep->pd_dirremhd)) continue; ino = pagedep->pd_ino; if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) continue; FREE_LOCK(ump); /* * Let unmount clear deps */ error = vfs_busy(mp, MBF_NOWAIT); if (error != 0) goto finish_write; error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ); vfs_unbusy(mp); if (error != 0) { softdep_error("clear_remove: vget", error); goto finish_write; } MPASS(VTOI(vp)->i_mode != 0); if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0))) softdep_error("clear_remove: fsync", error); bo = &vp->v_bufobj; BO_LOCK(bo); drain_output(vp); BO_UNLOCK(bo); vput(vp); finish_write: vn_finished_write(mp); ACQUIRE_LOCK(ump); return; } } } /* * Clear out a block of dirty inodes in an effort to reduce * the number of inodedep dependency structures. */ static void clear_inodedeps(mp) struct mount *mp; { struct inodedep_hashhead *inodedephd; struct inodedep *inodedep; struct ufsmount *ump; struct vnode *vp; struct fs *fs; int error, cnt; ino_t firstino, lastino, ino; ump = VFSTOUFS(mp); fs = ump->um_fs; LOCK_OWNED(ump); /* * Pick a random inode dependency to be cleared. * We will then gather up all the inodes in its block * that have dependencies and flush them out. */ for (cnt = 0; cnt <= ump->inodedep_hash_size; cnt++) { inodedephd = &ump->inodedep_hashtbl[ump->inodedep_nextclean++]; if (ump->inodedep_nextclean > ump->inodedep_hash_size) ump->inodedep_nextclean = 0; if ((inodedep = LIST_FIRST(inodedephd)) != NULL) break; } if (inodedep == NULL) return; /* * Find the last inode in the block with dependencies. */ firstino = rounddown2(inodedep->id_ino, INOPB(fs)); for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--) if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0) break; /* * Asynchronously push all but the last inode with dependencies. * Synchronously push the last inode with dependencies to ensure * that the inode block gets written to free up the inodedeps. */ for (ino = firstino; ino <= lastino; ino++) { if (inodedep_lookup(mp, ino, 0, &inodedep) == 0) continue; if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) continue; FREE_LOCK(ump); error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */ if (error != 0) { vn_finished_write(mp); ACQUIRE_LOCK(ump); return; } if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp, FFSV_FORCEINSMQ)) != 0) { softdep_error("clear_inodedeps: vget", error); vfs_unbusy(mp); vn_finished_write(mp); ACQUIRE_LOCK(ump); return; } vfs_unbusy(mp); if (VTOI(vp)->i_mode == 0) { vgone(vp); } else if (ino == lastino) { if ((error = ffs_syncvnode(vp, MNT_WAIT, 0))) softdep_error("clear_inodedeps: fsync1", error); } else { if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0))) softdep_error("clear_inodedeps: fsync2", error); BO_LOCK(&vp->v_bufobj); drain_output(vp); BO_UNLOCK(&vp->v_bufobj); } vput(vp); vn_finished_write(mp); ACQUIRE_LOCK(ump); } } void softdep_buf_append(bp, wkhd) struct buf *bp; struct workhead *wkhd; { struct worklist *wk; struct ufsmount *ump; if ((wk = LIST_FIRST(wkhd)) == NULL) return; KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0, ("softdep_buf_append called on non-softdep filesystem")); ump = VFSTOUFS(wk->wk_mp); ACQUIRE_LOCK(ump); while ((wk = LIST_FIRST(wkhd)) != NULL) { WORKLIST_REMOVE(wk); WORKLIST_INSERT(&bp->b_dep, wk); } FREE_LOCK(ump); } void softdep_inode_append(ip, cred, wkhd) struct inode *ip; struct ucred *cred; struct workhead *wkhd; { struct buf *bp; struct fs *fs; struct ufsmount *ump; int error; ump = ITOUMP(ip); KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0, ("softdep_inode_append called on non-softdep filesystem")); fs = ump->um_fs; error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, cred, &bp); if (error) { bqrelse(bp); softdep_freework(wkhd); return; } softdep_buf_append(bp, wkhd); bqrelse(bp); } void softdep_freework(wkhd) struct workhead *wkhd; { struct worklist *wk; struct ufsmount *ump; if ((wk = LIST_FIRST(wkhd)) == NULL) return; KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0, ("softdep_freework called on non-softdep filesystem")); ump = VFSTOUFS(wk->wk_mp); ACQUIRE_LOCK(ump); handle_jwork(wkhd); FREE_LOCK(ump); } static struct ufsmount * softdep_bp_to_mp(bp) struct buf *bp; { struct mount *mp; struct vnode *vp; if (LIST_EMPTY(&bp->b_dep)) return (NULL); vp = bp->b_vp; KASSERT(vp != NULL, ("%s, buffer with dependencies lacks vnode", __func__)); /* * The ump mount point is stable after we get a correct * pointer, since bp is locked and this prevents unmount from * proceeding. But to get to it, we cannot dereference bp->b_dep * head wk_mp, because we do not yet own SU ump lock and * workitem might be freed while dereferenced. */ retry: switch (vp->v_type) { case VCHR: VI_LOCK(vp); mp = vp->v_type == VCHR ? vp->v_rdev->si_mountpt : NULL; VI_UNLOCK(vp); if (mp == NULL) goto retry; break; case VREG: case VDIR: case VLNK: case VFIFO: case VSOCK: mp = vp->v_mount; break; case VBLK: vn_printf(vp, "softdep_bp_to_mp: unexpected block device\n"); /* FALLTHROUGH */ case VNON: case VBAD: case VMARKER: mp = NULL; break; default: vn_printf(vp, "unknown vnode type"); mp = NULL; break; } return (VFSTOUFS(mp)); } /* * Function to determine if the buffer has outstanding dependencies * that will cause a roll-back if the buffer is written. If wantcount * is set, return number of dependencies, otherwise just yes or no. */ static int softdep_count_dependencies(bp, wantcount) struct buf *bp; int wantcount; { struct worklist *wk; struct ufsmount *ump; struct bmsafemap *bmsafemap; struct freework *freework; struct inodedep *inodedep; struct indirdep *indirdep; struct freeblks *freeblks; struct allocindir *aip; struct pagedep *pagedep; struct dirrem *dirrem; struct newblk *newblk; struct mkdir *mkdir; struct diradd *dap; int i, retval; ump = softdep_bp_to_mp(bp); if (ump == NULL) return (0); retval = 0; ACQUIRE_LOCK(ump); LIST_FOREACH(wk, &bp->b_dep, wk_list) { switch (wk->wk_type) { case D_INODEDEP: inodedep = WK_INODEDEP(wk); if ((inodedep->id_state & DEPCOMPLETE) == 0) { /* bitmap allocation dependency */ retval += 1; if (!wantcount) goto out; } if (TAILQ_FIRST(&inodedep->id_inoupdt)) { /* direct block pointer dependency */ retval += 1; if (!wantcount) goto out; } if (TAILQ_FIRST(&inodedep->id_extupdt)) { /* direct block pointer dependency */ retval += 1; if (!wantcount) goto out; } if (TAILQ_FIRST(&inodedep->id_inoreflst)) { /* Add reference dependency. */ retval += 1; if (!wantcount) goto out; } continue; case D_INDIRDEP: indirdep = WK_INDIRDEP(wk); TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) { /* indirect truncation dependency */ retval += 1; if (!wantcount) goto out; } LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) { /* indirect block pointer dependency */ retval += 1; if (!wantcount) goto out; } continue; case D_PAGEDEP: pagedep = WK_PAGEDEP(wk); LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) { if (LIST_FIRST(&dirrem->dm_jremrefhd)) { /* Journal remove ref dependency. */ retval += 1; if (!wantcount) goto out; } } for (i = 0; i < DAHASHSZ; i++) { LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) { /* directory entry dependency */ retval += 1; if (!wantcount) goto out; } } continue; case D_BMSAFEMAP: bmsafemap = WK_BMSAFEMAP(wk); if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) { /* Add reference dependency. */ retval += 1; if (!wantcount) goto out; } if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) { /* Allocate block dependency. */ retval += 1; if (!wantcount) goto out; } continue; case D_FREEBLKS: freeblks = WK_FREEBLKS(wk); if (LIST_FIRST(&freeblks->fb_jblkdephd)) { /* Freeblk journal dependency. */ retval += 1; if (!wantcount) goto out; } continue; case D_ALLOCDIRECT: case D_ALLOCINDIR: newblk = WK_NEWBLK(wk); if (newblk->nb_jnewblk) { /* Journal allocate dependency. */ retval += 1; if (!wantcount) goto out; } continue; case D_MKDIR: mkdir = WK_MKDIR(wk); if (mkdir->md_jaddref) { /* Journal reference dependency. */ retval += 1; if (!wantcount) goto out; } continue; case D_FREEWORK: case D_FREEDEP: case D_JSEGDEP: case D_JSEG: case D_SBDEP: /* never a dependency on these blocks */ continue; default: panic("softdep_count_dependencies: Unexpected type %s", TYPENAME(wk->wk_type)); /* NOTREACHED */ } } out: FREE_LOCK(ump); return (retval); } /* * Acquire exclusive access to a buffer. * Must be called with a locked mtx parameter. * Return acquired buffer or NULL on failure. */ static struct buf * getdirtybuf(bp, lock, waitfor) struct buf *bp; struct rwlock *lock; int waitfor; { int error; if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) { if (waitfor != MNT_WAIT) return (NULL); error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, lock); /* * Even if we successfully acquire bp here, we have dropped * lock, which may violates our guarantee. */ if (error == 0) BUF_UNLOCK(bp); else if (error != ENOLCK) panic("getdirtybuf: inconsistent lock: %d", error); rw_wlock(lock); return (NULL); } if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { if (lock != BO_LOCKPTR(bp->b_bufobj) && waitfor == MNT_WAIT) { rw_wunlock(lock); BO_LOCK(bp->b_bufobj); BUF_UNLOCK(bp); if ((bp->b_vflags & BV_BKGRDINPROG) != 0) { bp->b_vflags |= BV_BKGRDWAIT; msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj), PRIBIO | PDROP, "getbuf", 0); } else BO_UNLOCK(bp->b_bufobj); rw_wlock(lock); return (NULL); } BUF_UNLOCK(bp); if (waitfor != MNT_WAIT) return (NULL); #ifdef DEBUG_VFS_LOCKS if (bp->b_vp->v_type != VCHR) ASSERT_BO_WLOCKED(bp->b_bufobj); #endif bp->b_vflags |= BV_BKGRDWAIT; rw_sleep(&bp->b_xflags, lock, PRIBIO, "getbuf", 0); return (NULL); } if ((bp->b_flags & B_DELWRI) == 0) { BUF_UNLOCK(bp); return (NULL); } bremfree(bp); return (bp); } /* * Check if it is safe to suspend the file system now. On entry, * the vnode interlock for devvp should be held. Return 0 with * the mount interlock held if the file system can be suspended now, * otherwise return EAGAIN with the mount interlock held. */ int softdep_check_suspend(struct mount *mp, struct vnode *devvp, int softdep_depcnt, int softdep_accdepcnt, int secondary_writes, int secondary_accwrites) { struct bufobj *bo; struct ufsmount *ump; struct inodedep *inodedep; int error, unlinked; bo = &devvp->v_bufobj; ASSERT_BO_WLOCKED(bo); /* * If we are not running with soft updates, then we need only * deal with secondary writes as we try to suspend. */ if (MOUNTEDSOFTDEP(mp) == 0) { MNT_ILOCK(mp); while (mp->mnt_secondary_writes != 0) { BO_UNLOCK(bo); msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), (PUSER - 1) | PDROP, "secwr", 0); BO_LOCK(bo); MNT_ILOCK(mp); } /* * Reasons for needing more work before suspend: * - Dirty buffers on devvp. * - Secondary writes occurred after start of vnode sync loop */ error = 0; if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0 || secondary_writes != 0 || mp->mnt_secondary_writes != 0 || secondary_accwrites != mp->mnt_secondary_accwrites) error = EAGAIN; BO_UNLOCK(bo); return (error); } /* * If we are running with soft updates, then we need to coordinate * with them as we try to suspend. */ ump = VFSTOUFS(mp); for (;;) { if (!TRY_ACQUIRE_LOCK(ump)) { BO_UNLOCK(bo); ACQUIRE_LOCK(ump); FREE_LOCK(ump); BO_LOCK(bo); continue; } MNT_ILOCK(mp); if (mp->mnt_secondary_writes != 0) { FREE_LOCK(ump); BO_UNLOCK(bo); msleep(&mp->mnt_secondary_writes, MNT_MTX(mp), (PUSER - 1) | PDROP, "secwr", 0); BO_LOCK(bo); continue; } break; } unlinked = 0; if (MOUNTEDSUJ(mp)) { for (inodedep = TAILQ_FIRST(&ump->softdep_unlinked); inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) { if ((inodedep->id_state & (UNLINKED | UNLINKLINKS | UNLINKONLIST)) != (UNLINKED | UNLINKLINKS | UNLINKONLIST) || !check_inodedep_free(inodedep)) continue; unlinked++; } } /* * Reasons for needing more work before suspend: * - Dirty buffers on devvp. * - Softdep activity occurred after start of vnode sync loop * - Secondary writes occurred after start of vnode sync loop */ error = 0; if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0 || softdep_depcnt != unlinked || ump->softdep_deps != unlinked || softdep_accdepcnt != ump->softdep_accdeps || secondary_writes != 0 || mp->mnt_secondary_writes != 0 || secondary_accwrites != mp->mnt_secondary_accwrites) error = EAGAIN; FREE_LOCK(ump); BO_UNLOCK(bo); return (error); } /* * Get the number of dependency structures for the file system, both * the current number and the total number allocated. These will * later be used to detect that softdep processing has occurred. */ void softdep_get_depcounts(struct mount *mp, int *softdep_depsp, int *softdep_accdepsp) { struct ufsmount *ump; if (MOUNTEDSOFTDEP(mp) == 0) { *softdep_depsp = 0; *softdep_accdepsp = 0; return; } ump = VFSTOUFS(mp); ACQUIRE_LOCK(ump); *softdep_depsp = ump->softdep_deps; *softdep_accdepsp = ump->softdep_accdeps; FREE_LOCK(ump); } /* * Wait for pending output on a vnode to complete. */ static void drain_output(vp) struct vnode *vp; { ASSERT_VOP_LOCKED(vp, "drain_output"); (void)bufobj_wwait(&vp->v_bufobj, 0, 0); } /* * Called whenever a buffer that is being invalidated or reallocated * contains dependencies. This should only happen if an I/O error has * occurred. The routine is called with the buffer locked. */ static void softdep_deallocate_dependencies(bp) struct buf *bp; { if ((bp->b_ioflags & BIO_ERROR) == 0) panic("softdep_deallocate_dependencies: dangling deps"); if (bp->b_vp != NULL && bp->b_vp->v_mount != NULL) softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error); else printf("softdep_deallocate_dependencies: " "got error %d while accessing filesystem\n", bp->b_error); if (bp->b_error != ENXIO) panic("softdep_deallocate_dependencies: unrecovered I/O error"); } /* * Function to handle asynchronous write errors in the filesystem. */ static void softdep_error(func, error) char *func; int error; { /* XXX should do something better! */ printf("%s: got error %d while accessing filesystem\n", func, error); } #ifdef DDB /* exported to ffs_vfsops.c */ extern void db_print_ffs(struct ufsmount *ump); void db_print_ffs(struct ufsmount *ump) { db_printf("mp %p (%s) devvp %p\n", ump->um_mountp, ump->um_mountp->mnt_stat.f_mntonname, ump->um_devvp); db_printf(" fs %p su_wl %d su_deps %d su_req %d\n", ump->um_fs, ump->softdep_on_worklist, ump->softdep_deps, ump->softdep_req); } static void worklist_print(struct worklist *wk, int verbose) { if (!verbose) { db_printf("%s: %p state 0x%b\n", TYPENAME(wk->wk_type), wk, (u_int)wk->wk_state, PRINT_SOFTDEP_FLAGS); return; } db_printf("worklist: %p type %s state 0x%b next %p\n ", wk, TYPENAME(wk->wk_type), (u_int)wk->wk_state, PRINT_SOFTDEP_FLAGS, LIST_NEXT(wk, wk_list)); db_print_ffs(VFSTOUFS(wk->wk_mp)); } static void inodedep_print(struct inodedep *inodedep, int verbose) { worklist_print(&inodedep->id_list, 0); db_printf(" fs %p ino %jd inoblk %jd delta %jd nlink %jd\n", inodedep->id_fs, (intmax_t)inodedep->id_ino, (intmax_t)fsbtodb(inodedep->id_fs, ino_to_fsba(inodedep->id_fs, inodedep->id_ino)), (intmax_t)inodedep->id_nlinkdelta, (intmax_t)inodedep->id_savednlink); if (verbose == 0) return; db_printf(" bmsafemap %p, mkdiradd %p, inoreflst %p\n", inodedep->id_bmsafemap, inodedep->id_mkdiradd, TAILQ_FIRST(&inodedep->id_inoreflst)); db_printf(" dirremhd %p, pendinghd %p, bufwait %p\n", LIST_FIRST(&inodedep->id_dirremhd), LIST_FIRST(&inodedep->id_pendinghd), LIST_FIRST(&inodedep->id_bufwait)); db_printf(" inowait %p, inoupdt %p, newinoupdt %p\n", LIST_FIRST(&inodedep->id_inowait), TAILQ_FIRST(&inodedep->id_inoupdt), TAILQ_FIRST(&inodedep->id_newinoupdt)); db_printf(" extupdt %p, newextupdt %p, freeblklst %p\n", TAILQ_FIRST(&inodedep->id_extupdt), TAILQ_FIRST(&inodedep->id_newextupdt), TAILQ_FIRST(&inodedep->id_freeblklst)); db_printf(" saveino %p, savedsize %jd, savedextsize %jd\n", inodedep->id_savedino1, (intmax_t)inodedep->id_savedsize, (intmax_t)inodedep->id_savedextsize); } static void newblk_print(struct newblk *nbp) { worklist_print(&nbp->nb_list, 0); db_printf(" newblkno %jd\n", (intmax_t)nbp->nb_newblkno); db_printf(" jnewblk %p, bmsafemap %p, freefrag %p\n", &nbp->nb_jnewblk, &nbp->nb_bmsafemap, &nbp->nb_freefrag); db_printf(" indirdeps %p, newdirblk %p, jwork %p\n", LIST_FIRST(&nbp->nb_indirdeps), LIST_FIRST(&nbp->nb_newdirblk), LIST_FIRST(&nbp->nb_jwork)); } static void allocdirect_print(struct allocdirect *adp) { newblk_print(&adp->ad_block); db_printf(" oldblkno %jd, oldsize %ld, newsize %ld\n", adp->ad_oldblkno, adp->ad_oldsize, adp->ad_newsize); db_printf(" offset %d, inodedep %p\n", adp->ad_offset, adp->ad_inodedep); } static void allocindir_print(struct allocindir *aip) { newblk_print(&aip->ai_block); db_printf(" oldblkno %jd, lbn %jd\n", (intmax_t)aip->ai_oldblkno, (intmax_t)aip->ai_lbn); db_printf(" offset %d, indirdep %p\n", aip->ai_offset, aip->ai_indirdep); } static void mkdir_print(struct mkdir *mkdir) { worklist_print(&mkdir->md_list, 0); db_printf(" diradd %p, jaddref %p, buf %p\n", mkdir->md_diradd, mkdir->md_jaddref, mkdir->md_buf); } DB_SHOW_COMMAND(sd_inodedep, db_show_sd_inodedep) { if (have_addr == 0) { db_printf("inodedep address required\n"); return; } inodedep_print((struct inodedep*)addr, 1); } DB_SHOW_COMMAND(sd_allinodedeps, db_show_sd_allinodedeps) { struct inodedep_hashhead *inodedephd; struct inodedep *inodedep; struct ufsmount *ump; int cnt; if (have_addr == 0) { db_printf("ufsmount address required\n"); return; } ump = (struct ufsmount *)addr; for (cnt = 0; cnt < ump->inodedep_hash_size; cnt++) { inodedephd = &ump->inodedep_hashtbl[cnt]; LIST_FOREACH(inodedep, inodedephd, id_hash) { inodedep_print(inodedep, 0); } } } DB_SHOW_COMMAND(sd_worklist, db_show_sd_worklist) { if (have_addr == 0) { db_printf("worklist address required\n"); return; } worklist_print((struct worklist *)addr, 1); } DB_SHOW_COMMAND(sd_workhead, db_show_sd_workhead) { struct worklist *wk; struct workhead *wkhd; if (have_addr == 0) { db_printf("worklist address required " "(for example value in bp->b_dep)\n"); return; } /* * We often do not have the address of the worklist head but * instead a pointer to its first entry (e.g., we have the * contents of bp->b_dep rather than &bp->b_dep). But the back * pointer of bp->b_dep will point at the head of the list, so * we cheat and use that instead. If we are in the middle of * a list we will still get the same result, so nothing * unexpected will result. */ wk = (struct worklist *)addr; if (wk == NULL) return; wkhd = (struct workhead *)wk->wk_list.le_prev; LIST_FOREACH(wk, wkhd, wk_list) { switch(wk->wk_type) { case D_INODEDEP: inodedep_print(WK_INODEDEP(wk), 0); continue; case D_ALLOCDIRECT: allocdirect_print(WK_ALLOCDIRECT(wk)); continue; case D_ALLOCINDIR: allocindir_print(WK_ALLOCINDIR(wk)); continue; case D_MKDIR: mkdir_print(WK_MKDIR(wk)); continue; default: worklist_print(wk, 0); continue; } } } DB_SHOW_COMMAND(sd_mkdir, db_show_sd_mkdir) { if (have_addr == 0) { db_printf("mkdir address required\n"); return; } mkdir_print((struct mkdir *)addr); } DB_SHOW_COMMAND(sd_mkdir_list, db_show_sd_mkdir_list) { struct mkdirlist *mkdirlisthd; struct mkdir *mkdir; if (have_addr == 0) { db_printf("mkdir listhead address required\n"); return; } mkdirlisthd = (struct mkdirlist *)addr; LIST_FOREACH(mkdir, mkdirlisthd, md_mkdirs) { mkdir_print(mkdir); if (mkdir->md_diradd != NULL) { db_printf(" "); worklist_print(&mkdir->md_diradd->da_list, 0); } if (mkdir->md_jaddref != NULL) { db_printf(" "); worklist_print(&mkdir->md_jaddref->ja_list, 0); } } } DB_SHOW_COMMAND(sd_allocdirect, db_show_sd_allocdirect) { if (have_addr == 0) { db_printf("allocdirect address required\n"); return; } allocdirect_print((struct allocdirect *)addr); } DB_SHOW_COMMAND(sd_allocindir, db_show_sd_allocindir) { if (have_addr == 0) { db_printf("allocindir address required\n"); return; } allocindir_print((struct allocindir *)addr); } #endif /* DDB */ #endif /* SOFTUPDATES */ Index: projects/clang1100-import/sys/ufs/ffs/ffs_vfsops.c =================================================================== --- projects/clang1100-import/sys/ufs/ffs/ffs_vfsops.c (revision 364278) +++ projects/clang1100-import/sys/ufs/ffs/ffs_vfsops.c (revision 364279) @@ -1,2684 +1,2683 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1989, 1991, 1993, 1994 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_vfsops.c 8.31 (Berkeley) 5/20/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_quota.h" #include "opt_ufs.h" #include "opt_ffs.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static uma_zone_t uma_inode, uma_ufs1, uma_ufs2; VFS_SMR_DECLARE; static int ffs_mountfs(struct vnode *, struct mount *, struct thread *); static void ffs_oldfscompat_read(struct fs *, struct ufsmount *, ufs2_daddr_t); static void ffs_ifree(struct ufsmount *ump, struct inode *ip); static int ffs_sync_lazy(struct mount *mp); static int ffs_use_bread(void *devfd, off_t loc, void **bufp, int size); static int ffs_use_bwrite(void *devfd, off_t loc, void *buf, int size); static vfs_init_t ffs_init; static vfs_uninit_t ffs_uninit; static vfs_extattrctl_t ffs_extattrctl; static vfs_cmount_t ffs_cmount; static vfs_unmount_t ffs_unmount; static vfs_mount_t ffs_mount; static vfs_statfs_t ffs_statfs; static vfs_fhtovp_t ffs_fhtovp; static vfs_sync_t ffs_sync; static struct vfsops ufs_vfsops = { .vfs_extattrctl = ffs_extattrctl, .vfs_fhtovp = ffs_fhtovp, .vfs_init = ffs_init, .vfs_mount = ffs_mount, .vfs_cmount = ffs_cmount, .vfs_quotactl = ufs_quotactl, .vfs_root = vfs_cache_root, .vfs_cachedroot = ufs_root, .vfs_statfs = ffs_statfs, .vfs_sync = ffs_sync, .vfs_uninit = ffs_uninit, .vfs_unmount = ffs_unmount, .vfs_vget = ffs_vget, .vfs_susp_clean = process_deferred_inactive, }; VFS_SET(ufs_vfsops, ufs, 0); MODULE_VERSION(ufs, 1); static b_strategy_t ffs_geom_strategy; static b_write_t ffs_bufwrite; static struct buf_ops ffs_ops = { .bop_name = "FFS", .bop_write = ffs_bufwrite, .bop_strategy = ffs_geom_strategy, .bop_sync = bufsync, #ifdef NO_FFS_SNAPSHOT .bop_bdflush = bufbdflush, #else .bop_bdflush = ffs_bdflush, #endif }; /* * Note that userquota and groupquota options are not currently used * by UFS/FFS code and generally mount(8) does not pass those options * from userland, but they can be passed by loader(8) via * vfs.root.mountfrom.options. */ static const char *ffs_opts[] = { "acls", "async", "noatime", "noclusterr", "noclusterw", "noexec", "export", "force", "from", "groupquota", "multilabel", "nfsv4acls", "fsckpid", "snapshot", "nosuid", "suiddir", "nosymfollow", "sync", "union", "userquota", "untrusted", NULL }; static int ffs_enxio_enable = 1; SYSCTL_DECL(_vfs_ffs); SYSCTL_INT(_vfs_ffs, OID_AUTO, enxio_enable, CTLFLAG_RWTUN, &ffs_enxio_enable, 0, "enable mapping of other disk I/O errors to ENXIO"); /* * Return buffer with the contents of block "offset" from the beginning of * directory "ip". If "res" is non-zero, fill it in with a pointer to the * remaining space in the directory. */ static int ffs_blkatoff(struct vnode *vp, off_t offset, char **res, struct buf **bpp) { struct inode *ip; struct fs *fs; struct buf *bp; ufs_lbn_t lbn; int bsize, error; ip = VTOI(vp); fs = ITOFS(ip); lbn = lblkno(fs, offset); bsize = blksize(fs, ip, lbn); *bpp = NULL; error = bread(vp, lbn, bsize, NOCRED, &bp); if (error) { return (error); } if (res) *res = (char *)bp->b_data + blkoff(fs, offset); *bpp = bp; return (0); } /* * Load up the contents of an inode and copy the appropriate pieces * to the incore copy. */ static int ffs_load_inode(struct buf *bp, struct inode *ip, struct fs *fs, ino_t ino) { struct ufs1_dinode *dip1; struct ufs2_dinode *dip2; int error; if (I_IS_UFS1(ip)) { dip1 = ip->i_din1; *dip1 = *((struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ino)); ip->i_mode = dip1->di_mode; ip->i_nlink = dip1->di_nlink; ip->i_effnlink = dip1->di_nlink; ip->i_size = dip1->di_size; ip->i_flags = dip1->di_flags; ip->i_gen = dip1->di_gen; ip->i_uid = dip1->di_uid; ip->i_gid = dip1->di_gid; return (0); } dip2 = ((struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ino)); if ((error = ffs_verify_dinode_ckhash(fs, dip2)) != 0 && !ffs_fsfail_cleanup(ITOUMP(ip), error)) { printf("%s: inode %jd: check-hash failed\n", fs->fs_fsmnt, (intmax_t)ino); return (error); } *ip->i_din2 = *dip2; dip2 = ip->i_din2; ip->i_mode = dip2->di_mode; ip->i_nlink = dip2->di_nlink; ip->i_effnlink = dip2->di_nlink; ip->i_size = dip2->di_size; ip->i_flags = dip2->di_flags; ip->i_gen = dip2->di_gen; ip->i_uid = dip2->di_uid; ip->i_gid = dip2->di_gid; return (0); } /* * Verify that a filesystem block number is a valid data block. * This routine is only called on untrusted filesystems. */ static int ffs_check_blkno(struct mount *mp, ino_t inum, ufs2_daddr_t daddr, int blksize) { struct fs *fs; struct ufsmount *ump; ufs2_daddr_t end_daddr; int cg, havemtx; KASSERT((mp->mnt_flag & MNT_UNTRUSTED) != 0, ("ffs_check_blkno called on a trusted file system")); ump = VFSTOUFS(mp); fs = ump->um_fs; cg = dtog(fs, daddr); end_daddr = daddr + numfrags(fs, blksize); /* * Verify that the block number is a valid data block. Also check * that it does not point to an inode block or a superblock. Accept * blocks that are unalloacted (0) or part of snapshot metadata * (BLK_NOCOPY or BLK_SNAP). * * Thus, the block must be in a valid range for the filesystem and * either in the space before a backup superblock (except the first * cylinder group where that space is used by the bootstrap code) or * after the inode blocks and before the end of the cylinder group. */ if ((uint64_t)daddr <= BLK_SNAP || ((uint64_t)end_daddr <= fs->fs_size && ((cg > 0 && end_daddr <= cgsblock(fs, cg)) || (daddr >= cgdmin(fs, cg) && end_daddr <= cgbase(fs, cg) + fs->fs_fpg)))) return (0); if ((havemtx = mtx_owned(UFS_MTX(ump))) == 0) UFS_LOCK(ump); if (ppsratecheck(&ump->um_last_integritymsg, &ump->um_secs_integritymsg, 1)) { UFS_UNLOCK(ump); uprintf("\n%s: inode %jd, out-of-range indirect block " "number %jd\n", mp->mnt_stat.f_mntonname, inum, daddr); if (havemtx) UFS_LOCK(ump); } else if (!havemtx) UFS_UNLOCK(ump); return (EINTEGRITY); } /* * Initiate a forcible unmount. * Used to unmount filesystems whose underlying media has gone away. */ static void ffs_fsfail_unmount(void *v, int pending) { struct fsfail_task *etp; struct mount *mp; etp = v; /* * Find our mount and get a ref on it, then try to unmount. */ mp = vfs_getvfs(&etp->fsid); if (mp != NULL) dounmount(mp, MNT_FORCE, curthread); free(etp, M_UFSMNT); } /* * On first ENXIO error, start a task that forcibly unmounts the filesystem. * * Return true if a cleanup is in progress. */ int ffs_fsfail_cleanup(struct ufsmount *ump, int error) { int retval; UFS_LOCK(ump); retval = ffs_fsfail_cleanup_locked(ump, error); UFS_UNLOCK(ump); return (retval); } int ffs_fsfail_cleanup_locked(struct ufsmount *ump, int error) { struct fsfail_task *etp; struct task *tp; mtx_assert(UFS_MTX(ump), MA_OWNED); if (error == ENXIO && (ump->um_flags & UM_FSFAIL_CLEANUP) == 0) { ump->um_flags |= UM_FSFAIL_CLEANUP; /* * Queue an async forced unmount. */ etp = ump->um_fsfail_task; ump->um_fsfail_task = NULL; if (etp != NULL) { tp = &etp->task; TASK_INIT(tp, 0, ffs_fsfail_unmount, etp); taskqueue_enqueue(taskqueue_thread, tp); printf("UFS: forcibly unmounting %s from %s\n", ump->um_mountp->mnt_stat.f_mntfromname, ump->um_mountp->mnt_stat.f_mntonname); } } return ((ump->um_flags & UM_FSFAIL_CLEANUP) != 0); } /* * Wrapper used during ENXIO cleanup to allocate empty buffers when * the kernel is unable to read the real one. They are needed so that * the soft updates code can use them to unwind its dependencies. */ int ffs_breadz(struct ufsmount *ump, struct vnode *vp, daddr_t lblkno, daddr_t dblkno, int size, daddr_t *rablkno, int *rabsize, int cnt, struct ucred *cred, int flags, void (*ckhashfunc)(struct buf *), struct buf **bpp) { int error; flags |= GB_CVTENXIO; error = breadn_flags(vp, lblkno, dblkno, size, rablkno, rabsize, cnt, cred, flags, ckhashfunc, bpp); if (error != 0 && ffs_fsfail_cleanup(ump, error)) { error = getblkx(vp, lblkno, dblkno, size, 0, 0, flags, bpp); KASSERT(error == 0, ("getblkx failed")); vfs_bio_bzero_buf(*bpp, 0, size); } return (error); } static int ffs_mount(struct mount *mp) { struct vnode *devvp, *odevvp; struct thread *td; struct ufsmount *ump = NULL; struct fs *fs; pid_t fsckpid = 0; int error, error1, flags; uint64_t mntorflags, saved_mnt_flag; accmode_t accmode; struct nameidata ndp; char *fspec; td = curthread; if (vfs_filteropt(mp->mnt_optnew, ffs_opts)) return (EINVAL); if (uma_inode == NULL) { uma_inode = uma_zcreate("FFS inode", sizeof(struct inode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs1 = uma_zcreate("FFS1 dinode", sizeof(struct ufs1_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_ufs2 = uma_zcreate("FFS2 dinode", sizeof(struct ufs2_dinode), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); VFS_SMR_ZONE_SET(uma_inode); } vfs_deleteopt(mp->mnt_optnew, "groupquota"); vfs_deleteopt(mp->mnt_optnew, "userquota"); fspec = vfs_getopts(mp->mnt_optnew, "from", &error); if (error) return (error); mntorflags = 0; if (vfs_getopt(mp->mnt_optnew, "untrusted", NULL, NULL) == 0) mntorflags |= MNT_UNTRUSTED; if (vfs_getopt(mp->mnt_optnew, "acls", NULL, NULL) == 0) mntorflags |= MNT_ACLS; if (vfs_getopt(mp->mnt_optnew, "snapshot", NULL, NULL) == 0) { mntorflags |= MNT_SNAPSHOT; /* * Once we have set the MNT_SNAPSHOT flag, do not * persist "snapshot" in the options list. */ vfs_deleteopt(mp->mnt_optnew, "snapshot"); vfs_deleteopt(mp->mnt_opt, "snapshot"); } if (vfs_getopt(mp->mnt_optnew, "fsckpid", NULL, NULL) == 0 && vfs_scanopt(mp->mnt_optnew, "fsckpid", "%d", &fsckpid) == 1) { /* * Once we have set the restricted PID, do not * persist "fsckpid" in the options list. */ vfs_deleteopt(mp->mnt_optnew, "fsckpid"); vfs_deleteopt(mp->mnt_opt, "fsckpid"); if (mp->mnt_flag & MNT_UPDATE) { if (VFSTOUFS(mp)->um_fs->fs_ronly == 0 && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) == 0) { vfs_mount_error(mp, "Checker enable: Must be read-only"); return (EINVAL); } } else if (vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) == 0) { vfs_mount_error(mp, "Checker enable: Must be read-only"); return (EINVAL); } /* Set to -1 if we are done */ if (fsckpid == 0) fsckpid = -1; } if (vfs_getopt(mp->mnt_optnew, "nfsv4acls", NULL, NULL) == 0) { if (mntorflags & MNT_ACLS) { vfs_mount_error(mp, "\"acls\" and \"nfsv4acls\" options " "are mutually exclusive"); return (EINVAL); } mntorflags |= MNT_NFS4ACLS; } MNT_ILOCK(mp); mp->mnt_kern_flag &= ~MNTK_FPLOOKUP; mp->mnt_flag |= mntorflags; MNT_IUNLOCK(mp); /* * If updating, check whether changing from read-only to * read/write; if there is no device name, that's all we do. */ if (mp->mnt_flag & MNT_UPDATE) { ump = VFSTOUFS(mp); fs = ump->um_fs; odevvp = ump->um_odevvp; devvp = ump->um_devvp; if (fsckpid == -1 && ump->um_fsckpid > 0) { if ((error = ffs_flushfiles(mp, WRITECLOSE, td)) != 0 || (error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) return (error); g_topology_lock(); /* * Return to normal read-only mode. */ error = g_access(ump->um_cp, 0, -1, 0); g_topology_unlock(); ump->um_fsckpid = 0; } if (fs->fs_ronly == 0 && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * Flush any dirty data and suspend filesystem. */ if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); error = vfs_write_suspend_umnt(mp); if (error != 0) return (error); /* * Check for and optionally get rid of files open * for writing. */ flags = WRITECLOSE; if (mp->mnt_flag & MNT_FORCE) flags |= FORCECLOSE; if (MOUNTEDSOFTDEP(mp)) { error = softdep_flushfiles(mp, flags, td); } else { error = ffs_flushfiles(mp, flags, td); } if (error) { vfs_write_resume(mp, 0); return (error); } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: %s Update error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) fs->fs_clean = 1; if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { fs->fs_ronly = 0; fs->fs_clean = 0; vfs_write_resume(mp, 0); return (error); } if (MOUNTEDSOFTDEP(mp)) softdep_unmount(mp); g_topology_lock(); /* * Drop our write and exclusive access. */ g_access(ump->um_cp, 0, -1, -1); g_topology_unlock(); fs->fs_ronly = 1; MNT_ILOCK(mp); mp->mnt_flag |= MNT_RDONLY; MNT_IUNLOCK(mp); /* * Allow the writers to note that filesystem * is ro now. */ vfs_write_resume(mp, 0); } if ((mp->mnt_flag & MNT_RELOAD) && (error = ffs_reload(mp, td, 0)) != 0) return (error); if (fs->fs_ronly && !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* * If we are running a checker, do not allow upgrade. */ if (ump->um_fsckpid > 0) { vfs_mount_error(mp, "Active checker, cannot upgrade to write"); return (EINVAL); } /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ vn_lock(odevvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_ACCESS(odevvp, VREAD | VWRITE, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); VOP_UNLOCK(odevvp); if (error) { return (error); } fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if ((mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf("WARNING: %s was not properly " "dismounted\n", fs->fs_fsmnt); } else { vfs_mount_error(mp, "R/W mount of %s denied. %s.%s", fs->fs_fsmnt, "Filesystem is not clean - run fsck", (fs->fs_flags & FS_SUJ) == 0 ? "" : " Forced mount will invalidate" " journal contents"); return (EPERM); } } g_topology_lock(); /* * Request exclusive write access. */ error = g_access(ump->um_cp, 0, 1, 1); g_topology_unlock(); if (error) return (error); if ((error = vn_start_write(NULL, &mp, V_WAIT)) != 0) return (error); error = vfs_write_suspend_umnt(mp); if (error != 0) return (error); fs->fs_ronly = 0; MNT_ILOCK(mp); saved_mnt_flag = MNT_RDONLY; if (MOUNTEDSOFTDEP(mp) && (mp->mnt_flag & MNT_ASYNC) != 0) saved_mnt_flag |= MNT_ASYNC; mp->mnt_flag &= ~saved_mnt_flag; MNT_IUNLOCK(mp); fs->fs_mtime = time_second; /* check to see if we need to start softdep */ if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, td->td_ucred))){ fs->fs_ronly = 1; MNT_ILOCK(mp); mp->mnt_flag |= saved_mnt_flag; MNT_IUNLOCK(mp); vfs_write_resume(mp, 0); return (error); } fs->fs_clean = 0; if ((error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) { fs->fs_ronly = 1; MNT_ILOCK(mp); mp->mnt_flag |= saved_mnt_flag; MNT_IUNLOCK(mp); vfs_write_resume(mp, 0); return (error); } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); vfs_write_resume(mp, 0); } /* * Soft updates is incompatible with "async", * so if we are doing softupdates stop the user * from setting the async flag in an update. * Softdep_mount() clears it in an initial mount * or ro->rw remount. */ if (MOUNTEDSOFTDEP(mp)) { /* XXX: Reset too late ? */ MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_ASYNC; MNT_IUNLOCK(mp); } /* * Keep MNT_ACLS flag if it is stored in superblock. */ if ((fs->fs_flags & FS_ACLS) != 0) { /* XXX: Set too late ? */ MNT_ILOCK(mp); mp->mnt_flag |= MNT_ACLS; MNT_IUNLOCK(mp); } if ((fs->fs_flags & FS_NFS4ACLS) != 0) { /* XXX: Set too late ? */ MNT_ILOCK(mp); mp->mnt_flag |= MNT_NFS4ACLS; MNT_IUNLOCK(mp); } /* * If this is a request from fsck to clean up the filesystem, * then allow the specified pid to proceed. */ if (fsckpid > 0) { if (ump->um_fsckpid != 0) { vfs_mount_error(mp, "Active checker already running on %s", fs->fs_fsmnt); return (EINVAL); } KASSERT(MOUNTEDSOFTDEP(mp) == 0, ("soft updates enabled on read-only file system")); g_topology_lock(); /* * Request write access. */ error = g_access(ump->um_cp, 0, 1, 0); g_topology_unlock(); if (error) { vfs_mount_error(mp, "Checker activation failed on %s", fs->fs_fsmnt); return (error); } ump->um_fsckpid = fsckpid; if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_mtime = time_second; fs->fs_fmod = 1; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT, 0); } /* * If this is a snapshot request, take the snapshot. */ if (mp->mnt_flag & MNT_SNAPSHOT) return (ffs_snapshot(mp, fspec)); /* * Must not call namei() while owning busy ref. */ vfs_unbusy(mp); } /* * Not an update, or updating the name: look up the name * and verify that it refers to a sensible disk device. */ NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspec, td); error = namei(&ndp); if ((mp->mnt_flag & MNT_UPDATE) != 0) { /* * Unmount does not start if MNT_UPDATE is set. Mount * update busies mp before setting MNT_UPDATE. We * must be able to retain our busy ref succesfully, * without sleep. */ error1 = vfs_busy(mp, MBF_NOWAIT); MPASS(error1 == 0); } if (error != 0) return (error); NDFREE(&ndp, NDF_ONLY_PNBUF); devvp = ndp.ni_vp; if (!vn_isdisk(devvp, &error)) { vput(devvp); return (error); } /* * If mount by non-root, then verify that user has necessary * permissions on the device. */ accmode = VREAD; if ((mp->mnt_flag & MNT_RDONLY) == 0) accmode |= VWRITE; error = VOP_ACCESS(devvp, accmode, td->td_ucred, td); if (error) error = priv_check(td, PRIV_VFS_MOUNT_PERM); if (error) { vput(devvp); return (error); } if (mp->mnt_flag & MNT_UPDATE) { /* * Update only * * If it's not the same vnode, or at least the same device * then it's not correct. */ if (devvp->v_rdev != ump->um_devvp->v_rdev) error = EINVAL; /* needs translation */ vput(devvp); if (error) return (error); } else { /* * New mount * * We need the name for the mount point (also used for * "last mounted on") copied in. If an error occurs, * the mount point is discarded by the upper level code. * Note that vfs_mount_alloc() populates f_mntonname for us. */ if ((error = ffs_mountfs(devvp, mp, td)) != 0) { vrele(devvp); return (error); } if (fsckpid > 0) { KASSERT(MOUNTEDSOFTDEP(mp) == 0, ("soft updates enabled on read-only file system")); ump = VFSTOUFS(mp); fs = ump->um_fs; g_topology_lock(); /* * Request write access. */ error = g_access(ump->um_cp, 0, 1, 0); g_topology_unlock(); if (error) { printf("WARNING: %s: Checker activation " "failed\n", fs->fs_fsmnt); } else { ump->um_fsckpid = fsckpid; if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_mtime = time_second; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT, 0); } } } MNT_ILOCK(mp); /* * This is racy versus lookup, see ufs_fplookup_vexec for details. */ if ((mp->mnt_kern_flag & MNTK_FPLOOKUP) != 0) panic("MNTK_FPLOOKUP set on mount %p when it should not be", mp); if ((mp->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS | MNT_UNION)) == 0) mp->mnt_kern_flag |= MNTK_FPLOOKUP; MNT_IUNLOCK(mp); vfs_mountedfrom(mp, fspec); return (0); } /* * Compatibility with old mount system call. */ static int ffs_cmount(struct mntarg *ma, void *data, uint64_t flags) { struct ufs_args args; int error; if (data == NULL) return (EINVAL); error = copyin(data, &args, sizeof args); if (error) return (error); ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN); ma = mount_arg(ma, "export", &args.export, sizeof(args.export)); error = kernel_mount(ma, flags); return (error); } /* * Reload all incore data for a filesystem (used after running fsck on * the root filesystem and finding things to fix). If the 'force' flag * is 0, the filesystem must be mounted read-only. * * Things to do to update the mount: * 1) invalidate all cached meta-data. * 2) re-read superblock from disk. * 3) re-read summary information from disk. * 4) invalidate all inactive vnodes. * 5) clear MNTK_SUSPEND2 and MNTK_SUSPENDED flags, allowing secondary * writers, if requested. * 6) invalidate all cached file data. * 7) re-read inode data for all active vnodes. */ int ffs_reload(struct mount *mp, struct thread *td, int flags) { struct vnode *vp, *mvp, *devvp; struct inode *ip; void *space; struct buf *bp; struct fs *fs, *newfs; struct ufsmount *ump; ufs2_daddr_t sblockloc; int i, blks, error; u_long size; int32_t *lp; ump = VFSTOUFS(mp); MNT_ILOCK(mp); if ((mp->mnt_flag & MNT_RDONLY) == 0 && (flags & FFSR_FORCE) == 0) { MNT_IUNLOCK(mp); return (EINVAL); } MNT_IUNLOCK(mp); /* * Step 1: invalidate all cached meta-data. */ devvp = VFSTOUFS(mp)->um_devvp; vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); if (vinvalbuf(devvp, 0, 0, 0) != 0) panic("ffs_reload: dirty1"); VOP_UNLOCK(devvp); /* * Step 2: re-read superblock from disk. */ fs = VFSTOUFS(mp)->um_fs; if ((error = bread(devvp, btodb(fs->fs_sblockloc), fs->fs_sbsize, NOCRED, &bp)) != 0) return (error); newfs = (struct fs *)bp->b_data; if ((newfs->fs_magic != FS_UFS1_MAGIC && newfs->fs_magic != FS_UFS2_MAGIC) || newfs->fs_bsize > MAXBSIZE || newfs->fs_bsize < sizeof(struct fs)) { brelse(bp); return (EIO); /* XXX needs translation */ } /* * Preserve the summary information, read-only status, and * superblock location by copying these fields into our new * superblock before using it to update the existing superblock. */ newfs->fs_si = fs->fs_si; newfs->fs_ronly = fs->fs_ronly; sblockloc = fs->fs_sblockloc; bcopy(newfs, fs, (u_int)fs->fs_sbsize); brelse(bp); mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; ffs_oldfscompat_read(fs, VFSTOUFS(mp), sblockloc); UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: %s: reload pending error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); /* * Step 3: re-read summary information from disk. */ size = fs->fs_cssize; blks = howmany(size, fs->fs_fsize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); size += fs->fs_ncg * sizeof(u_int8_t); free(fs->fs_csp, M_UFSMNT); space = malloc(size, M_UFSMNT, M_WAITOK); fs->fs_csp = space; for (i = 0; i < blks; i += fs->fs_frag) { size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + i), size, NOCRED, &bp); if (error) return (error); bcopy(bp->b_data, space, (u_int)size); space = (char *)space + size; brelse(bp); } /* * We no longer know anything about clusters per cylinder group. */ if (fs->fs_contigsumsize > 0) { fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; space = lp; } size = fs->fs_ncg * sizeof(u_int8_t); fs->fs_contigdirs = (u_int8_t *)space; bzero(fs->fs_contigdirs, size); if ((flags & FFSR_UNSUSPEND) != 0) { MNT_ILOCK(mp); mp->mnt_kern_flag &= ~(MNTK_SUSPENDED | MNTK_SUSPEND2); wakeup(&mp->mnt_flag); MNT_IUNLOCK(mp); } loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* * Skip syncer vnode. */ if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } /* * Step 4: invalidate all cached file data. */ - if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { + if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK)) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } if (vinvalbuf(vp, 0, 0, 0)) panic("ffs_reload: dirty2"); /* * Step 5: re-read inode data for all active vnodes. */ ip = VTOI(vp); error = bread(devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp); if (error) { vput(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } if ((error = ffs_load_inode(bp, ip, fs, ip->i_number)) != 0) { brelse(bp); vput(vp); MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); return (error); } ip->i_effnlink = ip->i_nlink; brelse(bp); vput(vp); } return (0); } /* * Common code for mount and mountroot */ static int ffs_mountfs(odevvp, mp, td) struct vnode *odevvp; struct mount *mp; struct thread *td; { struct ufsmount *ump; struct fs *fs; struct cdev *dev; int error, i, len, ronly; struct ucred *cred; struct g_consumer *cp; struct mount *nmp; struct vnode *devvp; struct fsfail_task *etp; int candelete, canspeedup; off_t loc; fs = NULL; ump = NULL; cred = td ? td->td_ucred : NOCRED; ronly = (mp->mnt_flag & MNT_RDONLY) != 0; devvp = mntfs_allocvp(mp, odevvp); VOP_UNLOCK(odevvp); KASSERT(devvp->v_type == VCHR, ("reclaimed devvp")); dev = devvp->v_rdev; if (atomic_cmpset_acq_ptr((uintptr_t *)&dev->si_mountpt, 0, (uintptr_t)mp) == 0) { mntfs_freevp(devvp); return (EBUSY); } g_topology_lock(); error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1); g_topology_unlock(); if (error != 0) { atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); mntfs_freevp(devvp); return (error); } dev_ref(dev); devvp->v_bufobj.bo_ops = &ffs_ops; BO_LOCK(&odevvp->v_bufobj); odevvp->v_bufobj.bo_flag |= BO_NOBUFS; BO_UNLOCK(&odevvp->v_bufobj); if (dev->si_iosize_max != 0) mp->mnt_iosize_max = dev->si_iosize_max; if (mp->mnt_iosize_max > MAXPHYS) mp->mnt_iosize_max = MAXPHYS; if ((SBLOCKSIZE % cp->provider->sectorsize) != 0) { error = EINVAL; vfs_mount_error(mp, "Invalid sectorsize %d for superblock size %d", cp->provider->sectorsize, SBLOCKSIZE); goto out; } /* fetch the superblock and summary information */ loc = STDSB; if ((mp->mnt_flag & MNT_ROOTFS) != 0) loc = STDSB_NOHASHFAIL; if ((error = ffs_sbget(devvp, &fs, loc, M_UFSMNT, ffs_use_bread)) != 0) goto out; /* none of these types of check-hashes are maintained by this kernel */ fs->fs_metackhash &= ~(CK_INDIR | CK_DIR); /* no support for any undefined flags */ fs->fs_flags &= FS_SUPPORTED; fs->fs_flags &= ~FS_UNCLEAN; if (fs->fs_clean == 0) { fs->fs_flags |= FS_UNCLEAN; if (ronly || (mp->mnt_flag & MNT_FORCE) || ((fs->fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == 0 && (fs->fs_flags & FS_DOSOFTDEP))) { printf("WARNING: %s was not properly dismounted\n", fs->fs_fsmnt); } else { vfs_mount_error(mp, "R/W mount of %s denied. %s%s", fs->fs_fsmnt, "Filesystem is not clean - run fsck.", (fs->fs_flags & FS_SUJ) == 0 ? "" : " Forced mount will invalidate journal contents"); error = EPERM; goto out; } if ((fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) && (mp->mnt_flag & MNT_FORCE)) { printf("WARNING: %s: lost blocks %jd files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } } if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: %s: mount pending error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } if ((fs->fs_flags & FS_GJOURNAL) != 0) { #ifdef UFS_GJOURNAL /* * Get journal provider name. */ len = 1024; mp->mnt_gjprovider = malloc((u_long)len, M_UFSMNT, M_WAITOK); if (g_io_getattr("GJOURNAL::provider", cp, &len, mp->mnt_gjprovider) == 0) { mp->mnt_gjprovider = realloc(mp->mnt_gjprovider, len, M_UFSMNT, M_WAITOK); MNT_ILOCK(mp); mp->mnt_flag |= MNT_GJOURNAL; MNT_IUNLOCK(mp); } else { printf("WARNING: %s: GJOURNAL flag on fs " "but no gjournal provider below\n", mp->mnt_stat.f_mntonname); free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } #else printf("WARNING: %s: GJOURNAL flag on fs but no " "UFS_GJOURNAL support\n", mp->mnt_stat.f_mntonname); #endif } else { mp->mnt_gjprovider = NULL; } ump = malloc(sizeof *ump, M_UFSMNT, M_WAITOK | M_ZERO); ump->um_cp = cp; ump->um_bo = &devvp->v_bufobj; ump->um_fs = fs; if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_fstype = UFS1; ump->um_balloc = ffs_balloc_ufs1; } else { ump->um_fstype = UFS2; ump->um_balloc = ffs_balloc_ufs2; } ump->um_blkatoff = ffs_blkatoff; ump->um_truncate = ffs_truncate; ump->um_update = ffs_update; ump->um_valloc = ffs_valloc; ump->um_vfree = ffs_vfree; ump->um_ifree = ffs_ifree; ump->um_rdonly = ffs_rdonly; ump->um_snapgone = ffs_snapgone; if ((mp->mnt_flag & MNT_UNTRUSTED) != 0) ump->um_check_blkno = ffs_check_blkno; else ump->um_check_blkno = NULL; mtx_init(UFS_MTX(ump), "FFS", "FFS Lock", MTX_DEF); ffs_oldfscompat_read(fs, ump, fs->fs_sblockloc); fs->fs_ronly = ronly; fs->fs_active = NULL; mp->mnt_data = ump; mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0]; mp->mnt_stat.f_fsid.val[1] = fs->fs_id[1]; nmp = NULL; if (fs->fs_id[0] == 0 || fs->fs_id[1] == 0 || (nmp = vfs_getvfs(&mp->mnt_stat.f_fsid))) { if (nmp) vfs_rel(nmp); vfs_getnewfsid(mp); } mp->mnt_maxsymlinklen = fs->fs_maxsymlinklen; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); if ((fs->fs_flags & FS_MULTILABEL) != 0) { #ifdef MAC MNT_ILOCK(mp); mp->mnt_flag |= MNT_MULTILABEL; MNT_IUNLOCK(mp); #else printf("WARNING: %s: multilabel flag on fs but " "no MAC support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_ACLS) != 0) { #ifdef UFS_ACL MNT_ILOCK(mp); if (mp->mnt_flag & MNT_NFS4ACLS) printf("WARNING: %s: ACLs flag on fs conflicts with " "\"nfsv4acls\" mount option; option ignored\n", mp->mnt_stat.f_mntonname); mp->mnt_flag &= ~MNT_NFS4ACLS; mp->mnt_flag |= MNT_ACLS; MNT_IUNLOCK(mp); #else printf("WARNING: %s: ACLs flag on fs but no ACLs support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_NFS4ACLS) != 0) { #ifdef UFS_ACL MNT_ILOCK(mp); if (mp->mnt_flag & MNT_ACLS) printf("WARNING: %s: NFSv4 ACLs flag on fs conflicts " "with \"acls\" mount option; option ignored\n", mp->mnt_stat.f_mntonname); mp->mnt_flag &= ~MNT_ACLS; mp->mnt_flag |= MNT_NFS4ACLS; MNT_IUNLOCK(mp); #else printf("WARNING: %s: NFSv4 ACLs flag on fs but no " "ACLs support\n", mp->mnt_stat.f_mntonname); #endif } if ((fs->fs_flags & FS_TRIM) != 0) { len = sizeof(int); if (g_io_getattr("GEOM::candelete", cp, &len, &candelete) == 0) { if (candelete) ump->um_flags |= UM_CANDELETE; else printf("WARNING: %s: TRIM flag on fs but disk " "does not support TRIM\n", mp->mnt_stat.f_mntonname); } else { printf("WARNING: %s: TRIM flag on fs but disk does " "not confirm that it supports TRIM\n", mp->mnt_stat.f_mntonname); } if (((ump->um_flags) & UM_CANDELETE) != 0) { ump->um_trim_tq = taskqueue_create("trim", M_WAITOK, taskqueue_thread_enqueue, &ump->um_trim_tq); taskqueue_start_threads(&ump->um_trim_tq, 1, PVFS, "%s trim", mp->mnt_stat.f_mntonname); ump->um_trimhash = hashinit(MAXTRIMIO, M_TRIM, &ump->um_trimlisthashsize); } } len = sizeof(int); if (g_io_getattr("GEOM::canspeedup", cp, &len, &canspeedup) == 0) { if (canspeedup) ump->um_flags |= UM_CANSPEEDUP; } ump->um_mountp = mp; ump->um_dev = dev; ump->um_devvp = devvp; ump->um_odevvp = odevvp; ump->um_nindir = fs->fs_nindir; ump->um_bptrtodb = fs->fs_fsbtodb; ump->um_seqinc = fs->fs_frag; for (i = 0; i < MAXQUOTAS; i++) ump->um_quotas[i] = NULLVP; #ifdef UFS_EXTATTR ufs_extattr_uepm_init(&ump->um_extattr); #endif /* * Set FS local "last mounted on" information (NULL pad) */ bzero(fs->fs_fsmnt, MAXMNTLEN); strlcpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname, MAXMNTLEN); mp->mnt_stat.f_iosize = fs->fs_bsize; if (mp->mnt_flag & MNT_ROOTFS) { /* * Root mount; update timestamp in mount structure. * this will be used by the common root mount code * to update the system clock. */ mp->mnt_time = fs->fs_time; } if (ronly == 0) { fs->fs_mtime = time_second; if ((fs->fs_flags & FS_DOSOFTDEP) && (error = softdep_mount(devvp, mp, fs, cred)) != 0) { ffs_flushfiles(mp, FORCECLOSE, td); goto out; } if (fs->fs_snapinum[0] != 0) ffs_snapshot_mount(mp); fs->fs_fmod = 1; fs->fs_clean = 0; (void) ffs_sbupdate(ump, MNT_WAIT, 0); } /* * Initialize filesystem state information in mount struct. */ MNT_ILOCK(mp); mp->mnt_kern_flag |= MNTK_LOOKUP_SHARED | MNTK_EXTENDED_SHARED | MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS | MNTK_USES_BCACHE; MNT_IUNLOCK(mp); #ifdef UFS_EXTATTR #ifdef UFS_EXTATTR_AUTOSTART /* * * Auto-starting does the following: * - check for /.attribute in the fs, and extattr_start if so * - for each file in .attribute, enable that file with * an attribute of the same name. * Not clear how to report errors -- probably eat them. * This would all happen while the filesystem was busy/not * available, so would effectively be "atomic". */ (void) ufs_extattr_autostart(mp, td); #endif /* !UFS_EXTATTR_AUTOSTART */ #endif /* !UFS_EXTATTR */ etp = malloc(sizeof *ump->um_fsfail_task, M_UFSMNT, M_WAITOK | M_ZERO); etp->fsid = mp->mnt_stat.f_fsid; ump->um_fsfail_task = etp; return (0); out: if (fs != NULL) { free(fs->fs_csp, M_UFSMNT); free(fs->fs_si, M_UFSMNT); free(fs, M_UFSMNT); } if (cp != NULL) { g_topology_lock(); g_vfs_close(cp); g_topology_unlock(); } if (ump) { mtx_destroy(UFS_MTX(ump)); if (mp->mnt_gjprovider != NULL) { free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } free(ump, M_UFSMNT); mp->mnt_data = NULL; } BO_LOCK(&odevvp->v_bufobj); odevvp->v_bufobj.bo_flag &= ~BO_NOBUFS; BO_UNLOCK(&odevvp->v_bufobj); atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0); mntfs_freevp(devvp); dev_rel(dev); return (error); } /* * A read function for use by filesystem-layer routines. */ static int ffs_use_bread(void *devfd, off_t loc, void **bufp, int size) { struct buf *bp; int error; KASSERT(*bufp == NULL, ("ffs_use_bread: non-NULL *bufp %p\n", *bufp)); *bufp = malloc(size, M_UFSMNT, M_WAITOK); if ((error = bread((struct vnode *)devfd, btodb(loc), size, NOCRED, &bp)) != 0) return (error); bcopy(bp->b_data, *bufp, size); bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); return (0); } static int bigcgs = 0; SYSCTL_INT(_debug, OID_AUTO, bigcgs, CTLFLAG_RW, &bigcgs, 0, ""); /* * Sanity checks for loading old filesystem superblocks. * See ffs_oldfscompat_write below for unwound actions. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ static void ffs_oldfscompat_read(fs, ump, sblockloc) struct fs *fs; struct ufsmount *ump; ufs2_daddr_t sblockloc; { off_t maxfilesize; /* * If not yet done, update fs_flags location and value of fs_sblockloc. */ if ((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { fs->fs_flags = fs->fs_old_flags; fs->fs_old_flags |= FS_FLAGS_UPDATED; fs->fs_sblockloc = sblockloc; } /* * If not yet done, update UFS1 superblock with new wider fields. */ if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_maxbsize != fs->fs_bsize) { fs->fs_maxbsize = fs->fs_bsize; fs->fs_time = fs->fs_old_time; fs->fs_size = fs->fs_old_size; fs->fs_dsize = fs->fs_old_dsize; fs->fs_csaddr = fs->fs_old_csaddr; fs->fs_cstotal.cs_ndir = fs->fs_old_cstotal.cs_ndir; fs->fs_cstotal.cs_nbfree = fs->fs_old_cstotal.cs_nbfree; fs->fs_cstotal.cs_nifree = fs->fs_old_cstotal.cs_nifree; fs->fs_cstotal.cs_nffree = fs->fs_old_cstotal.cs_nffree; } if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_old_inodefmt < FS_44INODEFMT) { fs->fs_maxfilesize = ((uint64_t)1 << 31) - 1; fs->fs_qbmask = ~fs->fs_bmask; fs->fs_qfmask = ~fs->fs_fmask; } if (fs->fs_magic == FS_UFS1_MAGIC) { ump->um_savedmaxfilesize = fs->fs_maxfilesize; maxfilesize = (uint64_t)0x80000000 * fs->fs_bsize - 1; if (fs->fs_maxfilesize > maxfilesize) fs->fs_maxfilesize = maxfilesize; } /* Compatibility for old filesystems */ if (fs->fs_avgfilesize <= 0) fs->fs_avgfilesize = AVFILESIZ; if (fs->fs_avgfpdir <= 0) fs->fs_avgfpdir = AFPDIR; if (bigcgs) { fs->fs_save_cgsize = fs->fs_cgsize; fs->fs_cgsize = fs->fs_bsize; } } /* * Unwinding superblock updates for old filesystems. * See ffs_oldfscompat_read above for details. * * XXX - Parts get retired eventually. * Unfortunately new bits get added. */ void ffs_oldfscompat_write(fs, ump) struct fs *fs; struct ufsmount *ump; { /* * Copy back UFS2 updated fields that UFS1 inspects. */ if (fs->fs_magic == FS_UFS1_MAGIC) { fs->fs_old_time = fs->fs_time; fs->fs_old_cstotal.cs_ndir = fs->fs_cstotal.cs_ndir; fs->fs_old_cstotal.cs_nbfree = fs->fs_cstotal.cs_nbfree; fs->fs_old_cstotal.cs_nifree = fs->fs_cstotal.cs_nifree; fs->fs_old_cstotal.cs_nffree = fs->fs_cstotal.cs_nffree; fs->fs_maxfilesize = ump->um_savedmaxfilesize; } if (bigcgs) { fs->fs_cgsize = fs->fs_save_cgsize; fs->fs_save_cgsize = 0; } } /* * unmount system call */ static int ffs_unmount(mp, mntflags) struct mount *mp; int mntflags; { struct thread *td; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, flags, susp; #ifdef UFS_EXTATTR int e_restart; #endif flags = 0; td = curthread; fs = ump->um_fs; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; susp = fs->fs_ronly == 0; #ifdef UFS_EXTATTR if ((error = ufs_extattr_stop(mp, td))) { if (error != EOPNOTSUPP) printf("WARNING: unmount %s: ufs_extattr_stop " "returned errno %d\n", mp->mnt_stat.f_mntonname, error); e_restart = 0; } else { ufs_extattr_uepm_destroy(&ump->um_extattr); e_restart = 1; } #endif if (susp) { error = vfs_write_suspend_umnt(mp); if (error != 0) goto fail1; } if (MOUNTEDSOFTDEP(mp)) error = softdep_flushfiles(mp, flags, td); else error = ffs_flushfiles(mp, flags, td); if (error != 0 && !ffs_fsfail_cleanup(ump, error)) goto fail; UFS_LOCK(ump); if (fs->fs_pendingblocks != 0 || fs->fs_pendinginodes != 0) { printf("WARNING: unmount %s: pending error: blocks %jd " "files %d\n", fs->fs_fsmnt, (intmax_t)fs->fs_pendingblocks, fs->fs_pendinginodes); fs->fs_pendingblocks = 0; fs->fs_pendinginodes = 0; } UFS_UNLOCK(ump); if (MOUNTEDSOFTDEP(mp)) softdep_unmount(mp); if (fs->fs_ronly == 0 || ump->um_fsckpid > 0) { fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1; error = ffs_sbupdate(ump, MNT_WAIT, 0); if (ffs_fsfail_cleanup(ump, error)) error = 0; if (error != 0 && !ffs_fsfail_cleanup(ump, error)) { fs->fs_clean = 0; goto fail; } } if (susp) vfs_write_resume(mp, VR_START_WRITE); if (ump->um_trim_tq != NULL) { while (ump->um_trim_inflight != 0) pause("ufsutr", hz); taskqueue_drain_all(ump->um_trim_tq); taskqueue_free(ump->um_trim_tq); free (ump->um_trimhash, M_TRIM); } g_topology_lock(); if (ump->um_fsckpid > 0) { /* * Return to normal read-only mode. */ error = g_access(ump->um_cp, 0, -1, 0); ump->um_fsckpid = 0; } g_vfs_close(ump->um_cp); g_topology_unlock(); BO_LOCK(&ump->um_odevvp->v_bufobj); ump->um_odevvp->v_bufobj.bo_flag &= ~BO_NOBUFS; BO_UNLOCK(&ump->um_odevvp->v_bufobj); atomic_store_rel_ptr((uintptr_t *)&ump->um_dev->si_mountpt, 0); mntfs_freevp(ump->um_devvp); vrele(ump->um_odevvp); dev_rel(ump->um_dev); mtx_destroy(UFS_MTX(ump)); if (mp->mnt_gjprovider != NULL) { free(mp->mnt_gjprovider, M_UFSMNT); mp->mnt_gjprovider = NULL; } free(fs->fs_csp, M_UFSMNT); free(fs->fs_si, M_UFSMNT); free(fs, M_UFSMNT); if (ump->um_fsfail_task != NULL) free(ump->um_fsfail_task, M_UFSMNT); free(ump, M_UFSMNT); mp->mnt_data = NULL; MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_LOCAL; MNT_IUNLOCK(mp); if (td->td_su == mp) { td->td_su = NULL; vfs_rel(mp); } return (error); fail: if (susp) vfs_write_resume(mp, VR_START_WRITE); fail1: #ifdef UFS_EXTATTR if (e_restart) { ufs_extattr_uepm_init(&ump->um_extattr); #ifdef UFS_EXTATTR_AUTOSTART (void) ufs_extattr_autostart(mp, td); #endif } #endif return (error); } /* * Flush out all the files in a filesystem. */ int ffs_flushfiles(mp, flags, td) struct mount *mp; int flags; struct thread *td; { struct ufsmount *ump; int qerror, error; ump = VFSTOUFS(mp); qerror = 0; #ifdef QUOTA if (mp->mnt_flag & MNT_QUOTA) { int i; error = vflush(mp, 0, SKIPSYSTEM|flags, td); if (error) return (error); for (i = 0; i < MAXQUOTAS; i++) { error = quotaoff(td, mp, i); if (error != 0) { if ((flags & EARLYFLUSH) == 0) return (error); else qerror = error; } } /* * Here we fall through to vflush again to ensure that * we have gotten rid of all the system vnodes, unless * quotas must not be closed. */ } #endif ASSERT_VOP_LOCKED(ump->um_devvp, "ffs_flushfiles"); if (ump->um_devvp->v_vflag & VV_COPYONWRITE) { if ((error = vflush(mp, 0, SKIPSYSTEM | flags, td)) != 0) return (error); ffs_snapshot_unmount(mp); flags |= FORCECLOSE; /* * Here we fall through to vflush again to ensure * that we have gotten rid of all the system vnodes. */ } /* * Do not close system files if quotas were not closed, to be * able to sync the remaining dquots. The freeblks softupdate * workitems might hold a reference on a dquot, preventing * quotaoff() from completing. Next round of * softdep_flushworklist() iteration should process the * blockers, allowing the next run of quotaoff() to finally * flush held dquots. * * Otherwise, flush all the files. */ if (qerror == 0 && (error = vflush(mp, 0, flags, td)) != 0) return (error); /* * Flush filesystem metadata. */ vn_lock(ump->um_devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(ump->um_devvp, MNT_WAIT, td); VOP_UNLOCK(ump->um_devvp); return (error); } /* * Get filesystem statistics. */ static int ffs_statfs(mp, sbp) struct mount *mp; struct statfs *sbp; { struct ufsmount *ump; struct fs *fs; ump = VFSTOUFS(mp); fs = ump->um_fs; if (fs->fs_magic != FS_UFS1_MAGIC && fs->fs_magic != FS_UFS2_MAGIC) panic("ffs_statfs"); sbp->f_version = STATFS_VERSION; sbp->f_bsize = fs->fs_fsize; sbp->f_iosize = fs->fs_bsize; sbp->f_blocks = fs->fs_dsize; UFS_LOCK(ump); sbp->f_bfree = fs->fs_cstotal.cs_nbfree * fs->fs_frag + fs->fs_cstotal.cs_nffree + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_bavail = freespace(fs, fs->fs_minfree) + dbtofsb(fs, fs->fs_pendingblocks); sbp->f_files = fs->fs_ncg * fs->fs_ipg - UFS_ROOTINO; sbp->f_ffree = fs->fs_cstotal.cs_nifree + fs->fs_pendinginodes; UFS_UNLOCK(ump); sbp->f_namemax = UFS_MAXNAMLEN; return (0); } static bool sync_doupdate(struct inode *ip) { return ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) != 0); } static int ffs_sync_lazy_filter(struct vnode *vp, void *arg __unused) { struct inode *ip; /* * Flags are safe to access because ->v_data invalidation * is held off by listmtx. */ if (vp->v_type == VNON) return (false); ip = VTOI(vp); if (!sync_doupdate(ip) && (vp->v_iflag & VI_OWEINACT) == 0) return (false); return (true); } /* * For a lazy sync, we only care about access times, quotas and the * superblock. Other filesystem changes are already converted to * cylinder group blocks or inode blocks updates and are written to * disk by syncer. */ static int ffs_sync_lazy(mp) struct mount *mp; { struct vnode *mvp, *vp; struct inode *ip; struct thread *td; int allerror, error; allerror = 0; td = curthread; if ((mp->mnt_flag & MNT_NOATIME) != 0) { #ifdef QUOTA qsync(mp); #endif goto sbupdate; } MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, ffs_sync_lazy_filter, NULL) { if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); /* * The IN_ACCESS flag is converted to IN_MODIFIED by * ufs_close() and ufs_getattr() by the calls to * ufs_itimes_locked(), without subsequent UFS_UPDATE(). * Test also all the other timestamp flags too, to pick up * any other cases that could be missed. */ if (!sync_doupdate(ip) && (vp->v_iflag & VI_OWEINACT) == 0) { VI_UNLOCK(vp); continue; } - if ((error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, - td)) != 0) + if ((error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK)) != 0) continue; #ifdef QUOTA qsyncvp(vp); #endif if (sync_doupdate(ip)) error = ffs_update(vp, 0); if (error != 0) allerror = error; vput(vp); } sbupdate: if (VFSTOUFS(mp)->um_fs->fs_fmod != 0 && (error = ffs_sbupdate(VFSTOUFS(mp), MNT_LAZY, 0)) != 0) allerror = error; return (allerror); } /* * Go through the disk queues to initiate sandbagged IO; * go through the inodes to write those that have been modified; * initiate the writing of the super block if it has been modified. * * Note: we are always called with the filesystem marked busy using * vfs_busy(). */ static int ffs_sync(mp, waitfor) struct mount *mp; int waitfor; { struct vnode *mvp, *vp, *devvp; struct thread *td; struct inode *ip; struct ufsmount *ump = VFSTOUFS(mp); struct fs *fs; int error, count, lockreq, allerror = 0; int suspend; int suspended; int secondary_writes; int secondary_accwrites; int softdep_deps; int softdep_accdeps; struct bufobj *bo; suspend = 0; suspended = 0; td = curthread; fs = ump->um_fs; if (fs->fs_fmod != 0 && fs->fs_ronly != 0 && ump->um_fsckpid == 0) panic("%s: ffs_sync: modification on read-only filesystem", fs->fs_fsmnt); if (waitfor == MNT_LAZY) { if (!rebooting) return (ffs_sync_lazy(mp)); waitfor = MNT_NOWAIT; } /* * Write back each (modified) inode. */ lockreq = LK_EXCLUSIVE | LK_NOWAIT; if (waitfor == MNT_SUSPEND) { suspend = 1; waitfor = MNT_WAIT; } if (waitfor == MNT_WAIT) lockreq = LK_EXCLUSIVE; lockreq |= LK_INTERLOCK | LK_SLEEPFAIL; loop: /* Grab snapshot of secondary write counts */ MNT_ILOCK(mp); secondary_writes = mp->mnt_secondary_writes; secondary_accwrites = mp->mnt_secondary_accwrites; MNT_IUNLOCK(mp); /* Grab snapshot of softdep dependency counts */ softdep_get_depcounts(mp, &softdep_deps, &softdep_accdeps); MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* * Depend on the vnode interlock to keep things stable enough * for a quick test. Since there might be hundreds of * thousands of vnodes, we cannot afford even a subroutine * call unless there's a good chance that we have work to do. */ if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } ip = VTOI(vp); if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_MODIFIED | IN_UPDATE)) == 0 && vp->v_bufobj.bo_dirty.bv_cnt == 0) { VI_UNLOCK(vp); continue; } - if ((error = vget(vp, lockreq, td)) != 0) { + if ((error = vget(vp, lockreq)) != 0) { if (error == ENOENT || error == ENOLCK) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } continue; } #ifdef QUOTA qsyncvp(vp); #endif if ((error = ffs_syncvnode(vp, waitfor, 0)) != 0) allerror = error; vput(vp); } /* * Force stale filesystem control information to be flushed. */ if (waitfor == MNT_WAIT || rebooting) { if ((error = softdep_flushworklist(ump->um_mountp, &count, td))) allerror = error; if (ffs_fsfail_cleanup(ump, allerror)) allerror = 0; /* Flushed work items may create new vnodes to clean */ if (allerror == 0 && count) goto loop; } devvp = ump->um_devvp; bo = &devvp->v_bufobj; BO_LOCK(bo); if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(devvp, waitfor, td); VOP_UNLOCK(devvp); if (MOUNTEDSOFTDEP(mp) && (error == 0 || error == EAGAIN)) error = ffs_sbupdate(ump, waitfor, 0); if (error != 0) allerror = error; if (ffs_fsfail_cleanup(ump, allerror)) allerror = 0; if (allerror == 0 && waitfor == MNT_WAIT) goto loop; } else if (suspend != 0) { if (softdep_check_suspend(mp, devvp, softdep_deps, softdep_accdeps, secondary_writes, secondary_accwrites) != 0) { MNT_IUNLOCK(mp); goto loop; /* More work needed */ } mtx_assert(MNT_MTX(mp), MA_OWNED); mp->mnt_kern_flag |= MNTK_SUSPEND2 | MNTK_SUSPENDED; MNT_IUNLOCK(mp); suspended = 1; } else BO_UNLOCK(bo); /* * Write back modified superblock. */ if (fs->fs_fmod != 0 && (error = ffs_sbupdate(ump, waitfor, suspended)) != 0) allerror = error; if (ffs_fsfail_cleanup(ump, allerror)) allerror = 0; return (allerror); } int ffs_vget(mp, ino, flags, vpp) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; { return (ffs_vgetf(mp, ino, flags, vpp, 0)); } int ffs_vgetf(mp, ino, flags, vpp, ffs_flags) struct mount *mp; ino_t ino; int flags; struct vnode **vpp; int ffs_flags; { struct fs *fs; struct inode *ip; struct ufsmount *ump; struct buf *bp; struct vnode *vp; daddr_t dbn; int error; MPASS((ffs_flags & FFSV_REPLACE) == 0 || (flags & LK_EXCLUSIVE) != 0); error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL); if (error != 0) return (error); if (*vpp != NULL) { if ((ffs_flags & FFSV_REPLACE) == 0) return (0); vgone(*vpp); vput(*vpp); } /* * We must promote to an exclusive lock for vnode creation. This * can happen if lookup is passed LOCKSHARED. */ if ((flags & LK_TYPE_MASK) == LK_SHARED) { flags &= ~LK_TYPE_MASK; flags |= LK_EXCLUSIVE; } /* * We do not lock vnode creation as it is believed to be too * expensive for such rare case as simultaneous creation of vnode * for same ino by different processes. We just allow them to race * and check later to decide who wins. Let the race begin! */ ump = VFSTOUFS(mp); fs = ump->um_fs; ip = uma_zalloc_smr(uma_inode, M_WAITOK | M_ZERO); /* Allocate a new vnode/inode. */ error = getnewvnode("ufs", mp, fs->fs_magic == FS_UFS1_MAGIC ? &ffs_vnodeops1 : &ffs_vnodeops2, &vp); if (error) { *vpp = NULL; uma_zfree_smr(uma_inode, ip); return (error); } /* * FFS supports recursive locking. */ lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL); VN_LOCK_AREC(vp); vp->v_data = ip; vp->v_bufobj.bo_bsize = fs->fs_bsize; ip->i_vnode = vp; ip->i_ump = ump; ip->i_number = ino; ip->i_ea_refs = 0; ip->i_nextclustercg = -1; ip->i_flag = fs->fs_magic == FS_UFS1_MAGIC ? 0 : IN_UFS2; ip->i_mode = 0; /* ensure error cases below throw away vnode */ #ifdef QUOTA { int i; for (i = 0; i < MAXQUOTAS; i++) ip->i_dquot[i] = NODQUOT; } #endif if (ffs_flags & FFSV_FORCEINSMQ) vp->v_vflag |= VV_FORCEINSMQ; error = insmntque(vp, mp); if (error != 0) { uma_zfree_smr(uma_inode, ip); *vpp = NULL; return (error); } vp->v_vflag &= ~VV_FORCEINSMQ; error = vfs_hash_insert(vp, ino, flags, curthread, vpp, NULL, NULL); if (error != 0) return (error); if (*vpp != NULL) { /* * Calls from ffs_valloc() (i.e. FFSV_REPLACE set) * operate on empty inode, which must not be found by * other threads until fully filled. Vnode for empty * inode must be not re-inserted on the hash by other * thread, after removal by us at the beginning. */ MPASS((ffs_flags & FFSV_REPLACE) == 0); return (0); } /* Read in the disk contents for the inode, copy into the inode. */ dbn = fsbtodb(fs, ino_to_fsba(fs, ino)); error = ffs_breadz(ump, ump->um_devvp, dbn, dbn, (int)fs->fs_bsize, NULL, NULL, 0, NOCRED, 0, NULL, &bp); if (error != 0) { /* * The inode does not contain anything useful, so it would * be misleading to leave it on its hash chain. With mode * still zero, it will be unlinked and returned to the free * list by vput(). */ vgone(vp); vput(vp); *vpp = NULL; return (error); } if (I_IS_UFS1(ip)) ip->i_din1 = uma_zalloc(uma_ufs1, M_WAITOK); else ip->i_din2 = uma_zalloc(uma_ufs2, M_WAITOK); if ((error = ffs_load_inode(bp, ip, fs, ino)) != 0) { bqrelse(bp); vgone(vp); vput(vp); *vpp = NULL; return (error); } if (DOINGSOFTDEP(vp)) softdep_load_inodeblock(ip); else ip->i_effnlink = ip->i_nlink; bqrelse(bp); /* * Initialize the vnode from the inode, check for aliases. * Note that the underlying vnode may have changed. */ error = ufs_vinit(mp, I_IS_UFS1(ip) ? &ffs_fifoops1 : &ffs_fifoops2, &vp); if (error) { vgone(vp); vput(vp); *vpp = NULL; return (error); } /* * Finish inode initialization. */ if (vp->v_type != VFIFO) { /* FFS supports shared locking for all files except fifos. */ VN_LOCK_ASHARE(vp); } /* * Set up a generation number for this inode if it does not * already have one. This should only happen on old filesystems. */ if (ip->i_gen == 0) { while (ip->i_gen == 0) ip->i_gen = arc4random(); if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) { UFS_INODE_SET_FLAG(ip, IN_MODIFIED); DIP_SET(ip, i_gen, ip->i_gen); } } #ifdef MAC if ((mp->mnt_flag & MNT_MULTILABEL) && ip->i_mode) { /* * If this vnode is already allocated, and we're running * multi-label, attempt to perform a label association * from the extended attributes on the inode. */ error = mac_vnode_associate_extattr(mp, vp); if (error) { /* ufs_inactive will release ip->i_devvp ref. */ vgone(vp); vput(vp); *vpp = NULL; return (error); } } #endif *vpp = vp; return (0); } /* * File handle to vnode * * Have to be really careful about stale file handles: * - check that the inode number is valid * - for UFS2 check that the inode number is initialized * - call ffs_vget() to get the locked inode * - check for an unallocated inode (i_mode == 0) * - check that the given client host has export rights and return * those rights via. exflagsp and credanonp */ static int ffs_fhtovp(mp, fhp, flags, vpp) struct mount *mp; struct fid *fhp; int flags; struct vnode **vpp; { struct ufid *ufhp; struct ufsmount *ump; struct fs *fs; struct cg *cgp; struct buf *bp; ino_t ino; u_int cg; int error; ufhp = (struct ufid *)fhp; ino = ufhp->ufid_ino; ump = VFSTOUFS(mp); fs = ump->um_fs; if (ino < UFS_ROOTINO || ino >= fs->fs_ncg * fs->fs_ipg) return (ESTALE); /* * Need to check if inode is initialized because UFS2 does lazy * initialization and nfs_fhtovp can offer arbitrary inode numbers. */ if (fs->fs_magic != FS_UFS2_MAGIC) return (ufs_fhtovp(mp, ufhp, flags, vpp)); cg = ino_to_cg(fs, ino); if ((error = ffs_getcg(fs, ump->um_devvp, cg, 0, &bp, &cgp)) != 0) return (error); if (ino >= cg * fs->fs_ipg + cgp->cg_initediblk) { brelse(bp); return (ESTALE); } brelse(bp); return (ufs_fhtovp(mp, ufhp, flags, vpp)); } /* * Initialize the filesystem. */ static int ffs_init(vfsp) struct vfsconf *vfsp; { ffs_susp_initialize(); softdep_initialize(); return (ufs_init(vfsp)); } /* * Undo the work of ffs_init(). */ static int ffs_uninit(vfsp) struct vfsconf *vfsp; { int ret; ret = ufs_uninit(vfsp); softdep_uninitialize(); ffs_susp_uninitialize(); taskqueue_drain_all(taskqueue_thread); return (ret); } /* * Structure used to pass information from ffs_sbupdate to its * helper routine ffs_use_bwrite. */ struct devfd { struct ufsmount *ump; struct buf *sbbp; int waitfor; int suspended; int error; }; /* * Write a superblock and associated information back to disk. */ int ffs_sbupdate(ump, waitfor, suspended) struct ufsmount *ump; int waitfor; int suspended; { struct fs *fs; struct buf *sbbp; struct devfd devfd; fs = ump->um_fs; if (fs->fs_ronly == 1 && (ump->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) != (MNT_RDONLY | MNT_UPDATE) && ump->um_fsckpid == 0) panic("ffs_sbupdate: write read-only filesystem"); /* * We use the superblock's buf to serialize calls to ffs_sbupdate(). */ sbbp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize, 0, 0, 0); /* * Initialize info needed for write function. */ devfd.ump = ump; devfd.sbbp = sbbp; devfd.waitfor = waitfor; devfd.suspended = suspended; devfd.error = 0; return (ffs_sbput(&devfd, fs, fs->fs_sblockloc, ffs_use_bwrite)); } /* * Write function for use by filesystem-layer routines. */ static int ffs_use_bwrite(void *devfd, off_t loc, void *buf, int size) { struct devfd *devfdp; struct ufsmount *ump; struct buf *bp; struct fs *fs; int error; devfdp = devfd; ump = devfdp->ump; fs = ump->um_fs; /* * Writing the superblock summary information. */ if (loc != fs->fs_sblockloc) { bp = getblk(ump->um_devvp, btodb(loc), size, 0, 0, 0); bcopy(buf, bp->b_data, (u_int)size); if (devfdp->suspended) bp->b_flags |= B_VALIDSUSPWRT; if (devfdp->waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) devfdp->error = error; return (0); } /* * Writing the superblock itself. We need to do special checks for it. */ bp = devfdp->sbbp; if (ffs_fsfail_cleanup(ump, devfdp->error)) devfdp->error = 0; if (devfdp->error != 0) { brelse(bp); return (devfdp->error); } if (fs->fs_magic == FS_UFS1_MAGIC && fs->fs_sblockloc != SBLOCK_UFS1 && (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS1); fs->fs_sblockloc = SBLOCK_UFS1; } if (fs->fs_magic == FS_UFS2_MAGIC && fs->fs_sblockloc != SBLOCK_UFS2 && (fs->fs_old_flags & FS_FLAGS_UPDATED) == 0) { printf("WARNING: %s: correcting fs_sblockloc from %jd to %d\n", fs->fs_fsmnt, fs->fs_sblockloc, SBLOCK_UFS2); fs->fs_sblockloc = SBLOCK_UFS2; } if (MOUNTEDSOFTDEP(ump->um_mountp)) softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, bp); bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); fs = (struct fs *)bp->b_data; ffs_oldfscompat_write(fs, ump); fs->fs_si = NULL; /* Recalculate the superblock hash */ fs->fs_ckhash = ffs_calc_sbhash(fs); if (devfdp->suspended) bp->b_flags |= B_VALIDSUSPWRT; if (devfdp->waitfor != MNT_WAIT) bawrite(bp); else if ((error = bwrite(bp)) != 0) devfdp->error = error; return (devfdp->error); } static int ffs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp, int attrnamespace, const char *attrname) { #ifdef UFS_EXTATTR return (ufs_extattrctl(mp, cmd, filename_vp, attrnamespace, attrname)); #else return (vfs_stdextattrctl(mp, cmd, filename_vp, attrnamespace, attrname)); #endif } static void ffs_ifree(struct ufsmount *ump, struct inode *ip) { if (ump->um_fstype == UFS1 && ip->i_din1 != NULL) uma_zfree(uma_ufs1, ip->i_din1); else if (ip->i_din2 != NULL) uma_zfree(uma_ufs2, ip->i_din2); uma_zfree_smr(uma_inode, ip); } static int dobkgrdwrite = 1; SYSCTL_INT(_debug, OID_AUTO, dobkgrdwrite, CTLFLAG_RW, &dobkgrdwrite, 0, "Do background writes (honoring the BV_BKGRDWRITE flag)?"); /* * Complete a background write started from bwrite. */ static void ffs_backgroundwritedone(struct buf *bp) { struct bufobj *bufobj; struct buf *origbp; #ifdef SOFTUPDATES if (!LIST_EMPTY(&bp->b_dep) && (bp->b_ioflags & BIO_ERROR) != 0) softdep_handle_error(bp); #endif /* * Find the original buffer that we are writing. */ bufobj = bp->b_bufobj; BO_LOCK(bufobj); if ((origbp = gbincore(bp->b_bufobj, bp->b_lblkno)) == NULL) panic("backgroundwritedone: lost buffer"); /* * We should mark the cylinder group buffer origbp as * dirty, to not lose the failed write. */ if ((bp->b_ioflags & BIO_ERROR) != 0) origbp->b_vflags |= BV_BKGRDERR; BO_UNLOCK(bufobj); /* * Process dependencies then return any unfinished ones. */ if (!LIST_EMPTY(&bp->b_dep) && (bp->b_ioflags & BIO_ERROR) == 0) buf_complete(bp); #ifdef SOFTUPDATES if (!LIST_EMPTY(&bp->b_dep)) softdep_move_dependencies(bp, origbp); #endif /* * This buffer is marked B_NOCACHE so when it is released * by biodone it will be tossed. */ bp->b_flags |= B_NOCACHE; bp->b_flags &= ~B_CACHE; pbrelvp(bp); /* * Prevent brelse() from trying to keep and re-dirtying bp on * errors. It causes b_bufobj dereference in * bdirty()/reassignbuf(), and b_bufobj was cleared in * pbrelvp() above. */ if ((bp->b_ioflags & BIO_ERROR) != 0) bp->b_flags |= B_INVAL; bufdone(bp); BO_LOCK(bufobj); /* * Clear the BV_BKGRDINPROG flag in the original buffer * and awaken it if it is waiting for the write to complete. * If BV_BKGRDINPROG is not set in the original buffer it must * have been released and re-instantiated - which is not legal. */ KASSERT((origbp->b_vflags & BV_BKGRDINPROG), ("backgroundwritedone: lost buffer2")); origbp->b_vflags &= ~BV_BKGRDINPROG; if (origbp->b_vflags & BV_BKGRDWAIT) { origbp->b_vflags &= ~BV_BKGRDWAIT; wakeup(&origbp->b_xflags); } BO_UNLOCK(bufobj); } /* * Write, release buffer on completion. (Done by iodone * if async). Do not bother writing anything if the buffer * is invalid. * * Note that we set B_CACHE here, indicating that buffer is * fully valid and thus cacheable. This is true even of NFS * now so we set it generally. This could be set either here * or in biodone() since the I/O is synchronous. We put it * here. */ static int ffs_bufwrite(struct buf *bp) { struct buf *newbp; struct cg *cgp; CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); if (bp->b_flags & B_INVAL) { brelse(bp); return (0); } if (!BUF_ISLOCKED(bp)) panic("bufwrite: buffer is not busy???"); /* * If a background write is already in progress, delay * writing this block if it is asynchronous. Otherwise * wait for the background write to complete. */ BO_LOCK(bp->b_bufobj); if (bp->b_vflags & BV_BKGRDINPROG) { if (bp->b_flags & B_ASYNC) { BO_UNLOCK(bp->b_bufobj); bdwrite(bp); return (0); } bp->b_vflags |= BV_BKGRDWAIT; msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj), PRIBIO, "bwrbg", 0); if (bp->b_vflags & BV_BKGRDINPROG) panic("bufwrite: still writing"); } bp->b_vflags &= ~BV_BKGRDERR; BO_UNLOCK(bp->b_bufobj); /* * If this buffer is marked for background writing and we * do not have to wait for it, make a copy and write the * copy so as to leave this buffer ready for further use. * * This optimization eats a lot of memory. If we have a page * or buffer shortfall we can't do it. */ if (dobkgrdwrite && (bp->b_xflags & BX_BKGRDWRITE) && (bp->b_flags & B_ASYNC) && !vm_page_count_severe() && !buf_dirty_count_severe()) { KASSERT(bp->b_iodone == NULL, ("bufwrite: needs chained iodone (%p)", bp->b_iodone)); /* get a new block */ newbp = geteblk(bp->b_bufsize, GB_NOWAIT_BD); if (newbp == NULL) goto normal_write; KASSERT(buf_mapped(bp), ("Unmapped cg")); memcpy(newbp->b_data, bp->b_data, bp->b_bufsize); BO_LOCK(bp->b_bufobj); bp->b_vflags |= BV_BKGRDINPROG; BO_UNLOCK(bp->b_bufobj); newbp->b_xflags |= (bp->b_xflags & BX_FSPRIV) | BX_BKGRDMARKER; newbp->b_lblkno = bp->b_lblkno; newbp->b_blkno = bp->b_blkno; newbp->b_offset = bp->b_offset; newbp->b_iodone = ffs_backgroundwritedone; newbp->b_flags |= B_ASYNC; newbp->b_flags &= ~B_INVAL; pbgetvp(bp->b_vp, newbp); #ifdef SOFTUPDATES /* * Move over the dependencies. If there are rollbacks, * leave the parent buffer dirtied as it will need to * be written again. */ if (LIST_EMPTY(&bp->b_dep) || softdep_move_dependencies(bp, newbp) == 0) bundirty(bp); #else bundirty(bp); #endif /* * Initiate write on the copy, release the original. The * BKGRDINPROG flag prevents it from going away until * the background write completes. We have to recalculate * its check hash in case the buffer gets freed and then * reconstituted from the buffer cache during a later read. */ if ((bp->b_xflags & BX_CYLGRP) != 0) { cgp = (struct cg *)bp->b_data; cgp->cg_ckhash = 0; cgp->cg_ckhash = calculate_crc32c(~0L, bp->b_data, bp->b_bcount); } bqrelse(bp); bp = newbp; } else /* Mark the buffer clean */ bundirty(bp); /* Let the normal bufwrite do the rest for us */ normal_write: /* * If we are writing a cylinder group, update its time. */ if ((bp->b_xflags & BX_CYLGRP) != 0) { cgp = (struct cg *)bp->b_data; cgp->cg_old_time = cgp->cg_time = time_second; } return (bufwrite(bp)); } static void ffs_geom_strategy(struct bufobj *bo, struct buf *bp) { struct vnode *vp; struct buf *tbp; int error, nocopy; /* * This is the bufobj strategy for the private VCHR vnodes * used by FFS to access the underlying storage device. * We override the default bufobj strategy and thus bypass * VOP_STRATEGY() for these vnodes. */ vp = bo2vnode(bo); KASSERT(bp->b_vp == NULL || bp->b_vp->v_type != VCHR || bp->b_vp->v_rdev == NULL || bp->b_vp->v_rdev->si_mountpt == NULL || VFSTOUFS(bp->b_vp->v_rdev->si_mountpt) == NULL || vp == VFSTOUFS(bp->b_vp->v_rdev->si_mountpt)->um_devvp, ("ffs_geom_strategy() with wrong vp")); if (bp->b_iocmd == BIO_WRITE) { if ((bp->b_flags & B_VALIDSUSPWRT) == 0 && bp->b_vp != NULL && bp->b_vp->v_mount != NULL && (bp->b_vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED) != 0) panic("ffs_geom_strategy: bad I/O"); nocopy = bp->b_flags & B_NOCOPY; bp->b_flags &= ~(B_VALIDSUSPWRT | B_NOCOPY); if ((vp->v_vflag & VV_COPYONWRITE) && nocopy == 0 && vp->v_rdev->si_snapdata != NULL) { if ((bp->b_flags & B_CLUSTER) != 0) { runningbufwakeup(bp); TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head, b_cluster.cluster_entry) { error = ffs_copyonwrite(vp, tbp); if (error != 0 && error != EOPNOTSUPP) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } } bp->b_runningbufspace = bp->b_bufsize; atomic_add_long(&runningbufspace, bp->b_runningbufspace); } else { error = ffs_copyonwrite(vp, bp); if (error != 0 && error != EOPNOTSUPP) { bp->b_error = error; bp->b_ioflags |= BIO_ERROR; bufdone(bp); return; } } } #ifdef SOFTUPDATES if ((bp->b_flags & B_CLUSTER) != 0) { TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head, b_cluster.cluster_entry) { if (!LIST_EMPTY(&tbp->b_dep)) buf_start(tbp); } } else { if (!LIST_EMPTY(&bp->b_dep)) buf_start(bp); } #endif /* * Check for metadata that needs check-hashes and update them. */ switch (bp->b_xflags & BX_FSPRIV) { case BX_CYLGRP: ((struct cg *)bp->b_data)->cg_ckhash = 0; ((struct cg *)bp->b_data)->cg_ckhash = calculate_crc32c(~0L, bp->b_data, bp->b_bcount); break; case BX_SUPERBLOCK: case BX_INODE: case BX_INDIR: case BX_DIR: printf("Check-hash write is unimplemented!!!\n"); break; case 0: break; default: printf("multiple buffer types 0x%b\n", (u_int)(bp->b_xflags & BX_FSPRIV), PRINT_UFS_BUF_XFLAGS); break; } } if (bp->b_iocmd != BIO_READ && ffs_enxio_enable) bp->b_xflags |= BX_CVTENXIO; g_vfs_strategy(bo, bp); } int ffs_own_mount(const struct mount *mp) { if (mp->mnt_op == &ufs_vfsops) return (1); return (0); } #ifdef DDB #ifdef SOFTUPDATES /* defined in ffs_softdep.c */ extern void db_print_ffs(struct ufsmount *ump); DB_SHOW_COMMAND(ffs, db_show_ffs) { struct mount *mp; struct ufsmount *ump; if (have_addr) { ump = VFSTOUFS((struct mount *)addr); db_print_ffs(ump); return; } TAILQ_FOREACH(mp, &mountlist, mnt_list) { if (!strcmp(mp->mnt_stat.f_fstypename, ufs_vfsconf.vfc_name)) db_print_ffs(VFSTOUFS(mp)); } } #endif /* SOFTUPDATES */ #endif /* DDB */ Index: projects/clang1100-import/sys/ufs/ufs/ufs_quota.c =================================================================== --- projects/clang1100-import/sys/ufs/ufs/ufs_quota.c (revision 364278) +++ projects/clang1100-import/sys/ufs/ufs/ufs_quota.c (revision 364279) @@ -1,1886 +1,1885 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Robert Elz at The University of Melbourne. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ufs_quota.c 8.5 (Berkeley) 5/20/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ffs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include CTASSERT(sizeof(struct dqblk64) == sizeof(struct dqhdr64)); static int unprivileged_get_quota = 0; SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_get_quota, CTLFLAG_RW, &unprivileged_get_quota, 0, "Unprivileged processes may retrieve quotas for other uids and gids"); static MALLOC_DEFINE(M_DQUOT, "ufs_quota", "UFS quota entries"); /* * Quota name to error message mapping. */ static char *quotatypes[] = INITQFNAMES; static int chkdqchg(struct inode *, ufs2_daddr_t, struct ucred *, int, int *); static int chkiqchg(struct inode *, int, struct ucred *, int, int *); static int dqopen(struct vnode *, struct ufsmount *, int); static int dqget(struct vnode *, u_long, struct ufsmount *, int, struct dquot **); static int dqsync(struct vnode *, struct dquot *); static int dqflush(struct vnode *); static int quotaoff1(struct thread *td, struct mount *mp, int type); static int quotaoff_inchange(struct thread *td, struct mount *mp, int type); /* conversion functions - from_to() */ static void dqb32_dq(const struct dqblk32 *, struct dquot *); static void dqb64_dq(const struct dqblk64 *, struct dquot *); static void dq_dqb32(const struct dquot *, struct dqblk32 *); static void dq_dqb64(const struct dquot *, struct dqblk64 *); static void dqb32_dqb64(const struct dqblk32 *, struct dqblk64 *); static void dqb64_dqb32(const struct dqblk64 *, struct dqblk32 *); #ifdef DIAGNOSTIC static void dqref(struct dquot *); static void chkdquot(struct inode *); #endif /* * Set up the quotas for an inode. * * This routine completely defines the semantics of quotas. * If other criterion want to be used to establish quotas, the * MAXQUOTAS value in quota.h should be increased, and the * additional dquots set up here. */ int getinoquota(struct inode *ip) { struct ufsmount *ump; struct vnode *vp; int error; vp = ITOV(ip); /* * Disk quotas must be turned off for system files. Currently * snapshot and quota files. */ if ((vp->v_vflag & VV_SYSTEM) != 0) return (0); /* * XXX: Turn off quotas for files with a negative UID or GID. * This prevents the creation of 100GB+ quota files. */ if ((int)ip->i_uid < 0 || (int)ip->i_gid < 0) return (0); ump = VFSTOUFS(vp->v_mount); /* * Set up the user quota based on file uid. * EINVAL means that quotas are not enabled. */ if ((error = dqget(vp, ip->i_uid, ump, USRQUOTA, &ip->i_dquot[USRQUOTA])) && error != EINVAL) return (error); /* * Set up the group quota based on file gid. * EINVAL means that quotas are not enabled. */ if ((error = dqget(vp, ip->i_gid, ump, GRPQUOTA, &ip->i_dquot[GRPQUOTA])) && error != EINVAL) return (error); return (0); } /* * Update disk usage, and take corrective action. */ int chkdq(struct inode *ip, ufs2_daddr_t change, struct ucred *cred, int flags) { struct dquot *dq; ufs2_daddr_t ncurblocks; struct vnode *vp = ITOV(ip); int i, error, warn, do_check; MPASS(cred != NOCRED || (flags & FORCE) != 0); /* * Disk quotas must be turned off for system files. Currently * snapshot and quota files. */ if ((vp->v_vflag & VV_SYSTEM) != 0) return (0); /* * XXX: Turn off quotas for files with a negative UID or GID. * This prevents the creation of 100GB+ quota files. */ if ((int)ip->i_uid < 0 || (int)ip->i_gid < 0) return (0); #ifdef DIAGNOSTIC if ((flags & CHOWN) == 0) chkdquot(ip); #endif if (change == 0) return (0); if (change < 0) { for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; DQI_LOCK(dq); DQI_WAIT(dq, PINOD+1, "chkdq1"); ncurblocks = dq->dq_curblocks + change; if (ncurblocks >= 0) dq->dq_curblocks = ncurblocks; else dq->dq_curblocks = 0; dq->dq_flags &= ~DQ_BLKS; dq->dq_flags |= DQ_MOD; DQI_UNLOCK(dq); } return (0); } if ((flags & FORCE) == 0 && priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA)) do_check = 1; else do_check = 0; for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; warn = 0; DQI_LOCK(dq); DQI_WAIT(dq, PINOD+1, "chkdq2"); if (do_check) { error = chkdqchg(ip, change, cred, i, &warn); if (error) { /* * Roll back user quota changes when * group quota failed. */ while (i > 0) { --i; dq = ip->i_dquot[i]; if (dq == NODQUOT) continue; DQI_LOCK(dq); DQI_WAIT(dq, PINOD+1, "chkdq3"); ncurblocks = dq->dq_curblocks - change; if (ncurblocks >= 0) dq->dq_curblocks = ncurblocks; else dq->dq_curblocks = 0; dq->dq_flags &= ~DQ_BLKS; dq->dq_flags |= DQ_MOD; DQI_UNLOCK(dq); } return (error); } } /* Reset timer when crossing soft limit */ if (dq->dq_curblocks + change >= dq->dq_bsoftlimit && dq->dq_curblocks < dq->dq_bsoftlimit) dq->dq_btime = time_second + ITOUMP(ip)->um_btime[i]; dq->dq_curblocks += change; dq->dq_flags |= DQ_MOD; DQI_UNLOCK(dq); if (warn) uprintf("\n%s: warning, %s disk quota exceeded\n", ITOVFS(ip)->mnt_stat.f_mntonname, quotatypes[i]); } return (0); } /* * Check for a valid change to a users allocation. * Issue an error message if appropriate. */ static int chkdqchg(struct inode *ip, ufs2_daddr_t change, struct ucred *cred, int type, int *warn) { struct dquot *dq = ip->i_dquot[type]; ufs2_daddr_t ncurblocks = dq->dq_curblocks + change; /* * If user would exceed their hard limit, disallow space allocation. */ if (ncurblocks >= dq->dq_bhardlimit && dq->dq_bhardlimit) { if ((dq->dq_flags & DQ_BLKS) == 0 && ip->i_uid == cred->cr_uid) { dq->dq_flags |= DQ_BLKS; DQI_UNLOCK(dq); uprintf("\n%s: write failed, %s disk limit reached\n", ITOVFS(ip)->mnt_stat.f_mntonname, quotatypes[type]); return (EDQUOT); } DQI_UNLOCK(dq); return (EDQUOT); } /* * If user is over their soft limit for too long, disallow space * allocation. Reset time limit as they cross their soft limit. */ if (ncurblocks >= dq->dq_bsoftlimit && dq->dq_bsoftlimit) { if (dq->dq_curblocks < dq->dq_bsoftlimit) { dq->dq_btime = time_second + ITOUMP(ip)->um_btime[type]; if (ip->i_uid == cred->cr_uid) *warn = 1; return (0); } if (time_second > dq->dq_btime) { if ((dq->dq_flags & DQ_BLKS) == 0 && ip->i_uid == cred->cr_uid) { dq->dq_flags |= DQ_BLKS; DQI_UNLOCK(dq); uprintf("\n%s: write failed, %s " "disk quota exceeded for too long\n", ITOVFS(ip)->mnt_stat.f_mntonname, quotatypes[type]); return (EDQUOT); } DQI_UNLOCK(dq); return (EDQUOT); } } return (0); } /* * Check the inode limit, applying corrective action. */ int chkiq(struct inode *ip, int change, struct ucred *cred, int flags) { struct dquot *dq; int i, error, warn, do_check; MPASS(cred != NOCRED || (flags & FORCE) != 0); #ifdef DIAGNOSTIC if ((flags & CHOWN) == 0) chkdquot(ip); #endif if (change == 0) return (0); if (change < 0) { for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; DQI_LOCK(dq); DQI_WAIT(dq, PINOD+1, "chkiq1"); if (dq->dq_curinodes >= -change) dq->dq_curinodes += change; else dq->dq_curinodes = 0; dq->dq_flags &= ~DQ_INODS; dq->dq_flags |= DQ_MOD; DQI_UNLOCK(dq); } return (0); } if ((flags & FORCE) == 0 && priv_check_cred(cred, PRIV_VFS_EXCEEDQUOTA)) do_check = 1; else do_check = 0; for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; warn = 0; DQI_LOCK(dq); DQI_WAIT(dq, PINOD+1, "chkiq2"); if (do_check) { error = chkiqchg(ip, change, cred, i, &warn); if (error) { /* * Roll back user quota changes when * group quota failed. */ while (i > 0) { --i; dq = ip->i_dquot[i]; if (dq == NODQUOT) continue; DQI_LOCK(dq); DQI_WAIT(dq, PINOD+1, "chkiq3"); if (dq->dq_curinodes >= change) dq->dq_curinodes -= change; else dq->dq_curinodes = 0; dq->dq_flags &= ~DQ_INODS; dq->dq_flags |= DQ_MOD; DQI_UNLOCK(dq); } return (error); } } /* Reset timer when crossing soft limit */ if (dq->dq_curinodes + change >= dq->dq_isoftlimit && dq->dq_curinodes < dq->dq_isoftlimit) dq->dq_itime = time_second + ITOUMP(ip)->um_itime[i]; dq->dq_curinodes += change; dq->dq_flags |= DQ_MOD; DQI_UNLOCK(dq); if (warn) uprintf("\n%s: warning, %s inode quota exceeded\n", ITOVFS(ip)->mnt_stat.f_mntonname, quotatypes[i]); } return (0); } /* * Check for a valid change to a users allocation. * Issue an error message if appropriate. */ static int chkiqchg(struct inode *ip, int change, struct ucred *cred, int type, int *warn) { struct dquot *dq = ip->i_dquot[type]; ino_t ncurinodes = dq->dq_curinodes + change; /* * If user would exceed their hard limit, disallow inode allocation. */ if (ncurinodes >= dq->dq_ihardlimit && dq->dq_ihardlimit) { if ((dq->dq_flags & DQ_INODS) == 0 && ip->i_uid == cred->cr_uid) { dq->dq_flags |= DQ_INODS; DQI_UNLOCK(dq); uprintf("\n%s: write failed, %s inode limit reached\n", ITOVFS(ip)->mnt_stat.f_mntonname, quotatypes[type]); return (EDQUOT); } DQI_UNLOCK(dq); return (EDQUOT); } /* * If user is over their soft limit for too long, disallow inode * allocation. Reset time limit as they cross their soft limit. */ if (ncurinodes >= dq->dq_isoftlimit && dq->dq_isoftlimit) { if (dq->dq_curinodes < dq->dq_isoftlimit) { dq->dq_itime = time_second + ITOUMP(ip)->um_itime[type]; if (ip->i_uid == cred->cr_uid) *warn = 1; return (0); } if (time_second > dq->dq_itime) { if ((dq->dq_flags & DQ_INODS) == 0 && ip->i_uid == cred->cr_uid) { dq->dq_flags |= DQ_INODS; DQI_UNLOCK(dq); uprintf("\n%s: write failed, %s " "inode quota exceeded for too long\n", ITOVFS(ip)->mnt_stat.f_mntonname, quotatypes[type]); return (EDQUOT); } DQI_UNLOCK(dq); return (EDQUOT); } } return (0); } #ifdef DIAGNOSTIC /* * On filesystems with quotas enabled, it is an error for a file to change * size and not to have a dquot structure associated with it. */ static void chkdquot(struct inode *ip) { struct ufsmount *ump; struct vnode *vp; int i; ump = ITOUMP(ip); vp = ITOV(ip); /* * Disk quotas must be turned off for system files. Currently * these are snapshots and quota files. */ if ((vp->v_vflag & VV_SYSTEM) != 0) return; /* * XXX: Turn off quotas for files with a negative UID or GID. * This prevents the creation of 100GB+ quota files. */ if ((int)ip->i_uid < 0 || (int)ip->i_gid < 0) return; UFS_LOCK(ump); for (i = 0; i < MAXQUOTAS; i++) { if (ump->um_quotas[i] == NULLVP || (ump->um_qflags[i] & (QTF_OPENING|QTF_CLOSING))) continue; if (ip->i_dquot[i] == NODQUOT) { UFS_UNLOCK(ump); vn_printf(ITOV(ip), "chkdquot: missing dquot "); panic("chkdquot: missing dquot"); } } UFS_UNLOCK(ump); } #endif /* * Code to process quotactl commands. */ /* * Q_QUOTAON - set up a quota file for a particular filesystem. */ int quotaon(struct thread *td, struct mount *mp, int type, void *fname) { struct ufsmount *ump; struct vnode *vp, **vpp; struct vnode *mvp; struct dquot *dq; int error, flags; struct nameidata nd; error = priv_check(td, PRIV_UFS_QUOTAON); if (error != 0) { vfs_unbusy(mp); return (error); } if ((mp->mnt_flag & MNT_RDONLY) != 0) { vfs_unbusy(mp); return (EROFS); } ump = VFSTOUFS(mp); dq = NODQUOT; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, fname, td); flags = FREAD | FWRITE; vfs_ref(mp); vfs_unbusy(mp); error = vn_open(&nd, &flags, 0, NULL); if (error != 0) { vfs_rel(mp); return (error); } NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; error = vfs_busy(mp, MBF_NOWAIT); vfs_rel(mp); if (error == 0) { if (vp->v_type != VREG) { error = EACCES; vfs_unbusy(mp); } } if (error != 0) { VOP_UNLOCK(vp); (void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td); return (error); } UFS_LOCK(ump); if ((ump->um_qflags[type] & (QTF_OPENING|QTF_CLOSING)) != 0) { UFS_UNLOCK(ump); VOP_UNLOCK(vp); (void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td); vfs_unbusy(mp); return (EALREADY); } ump->um_qflags[type] |= QTF_OPENING|QTF_CLOSING; UFS_UNLOCK(ump); if ((error = dqopen(vp, ump, type)) != 0) { VOP_UNLOCK(vp); UFS_LOCK(ump); ump->um_qflags[type] &= ~(QTF_OPENING|QTF_CLOSING); UFS_UNLOCK(ump); (void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td); vfs_unbusy(mp); return (error); } VOP_UNLOCK(vp); MNT_ILOCK(mp); mp->mnt_flag |= MNT_QUOTA; MNT_IUNLOCK(mp); vpp = &ump->um_quotas[type]; if (*vpp != vp) quotaoff1(td, mp, type); /* * When the directory vnode containing the quota file is * inactivated, due to the shared lookup of the quota file * vput()ing the dvp, the qsyncvp() call for the containing * directory would try to acquire the quota lock exclusive. * At the same time, lookup already locked the quota vnode * shared. Mark the quota vnode lock as allowing recursion * and automatically converting shared locks to exclusive. * * Also mark quota vnode as system. */ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vp->v_vflag |= VV_SYSTEM; VN_LOCK_AREC(vp); VN_LOCK_DSHARE(vp); VOP_UNLOCK(vp); *vpp = vp; /* * Save the credential of the process that turned on quotas. * Set up the time limits for this quota. */ ump->um_cred[type] = crhold(td->td_ucred); ump->um_btime[type] = MAX_DQ_TIME; ump->um_itime[type] = MAX_IQ_TIME; if (dqget(NULLVP, 0, ump, type, &dq) == 0) { if (dq->dq_btime > 0) ump->um_btime[type] = dq->dq_btime; if (dq->dq_itime > 0) ump->um_itime[type] = dq->dq_itime; dqrele(NULLVP, dq); } /* * Allow the getdq from getinoquota below to read the quota * from file. */ UFS_LOCK(ump); ump->um_qflags[type] &= ~QTF_CLOSING; UFS_UNLOCK(ump); /* * Search vnodes associated with this mount point, * adding references to quota file being opened. * NB: only need to add dquot's for inodes being modified. */ again: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { - if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { + if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK)) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto again; } if (vp->v_type == VNON || vp->v_writecount <= 0) { VOP_UNLOCK(vp); vrele(vp); continue; } error = getinoquota(VTOI(vp)); VOP_UNLOCK(vp); vrele(vp); if (error) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); break; } } if (error) quotaoff_inchange(td, mp, type); UFS_LOCK(ump); ump->um_qflags[type] &= ~QTF_OPENING; KASSERT((ump->um_qflags[type] & QTF_CLOSING) == 0, ("quotaon: leaking flags")); UFS_UNLOCK(ump); vfs_unbusy(mp); return (error); } /* * Main code to turn off disk quotas for a filesystem. Does not change * flags. */ static int quotaoff1(struct thread *td, struct mount *mp, int type) { struct vnode *vp; struct vnode *qvp, *mvp; struct ufsmount *ump; struct dquot *dq; struct inode *ip; struct ucred *cr; int error; ump = VFSTOUFS(mp); UFS_LOCK(ump); KASSERT((ump->um_qflags[type] & QTF_CLOSING) != 0, ("quotaoff1: flags are invalid")); if ((qvp = ump->um_quotas[type]) == NULLVP) { UFS_UNLOCK(ump); return (0); } cr = ump->um_cred[type]; UFS_UNLOCK(ump); /* * Search vnodes associated with this mount point, * deleting any references to quota file being closed. */ again: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } - if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td)) { + if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK)) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto again; } ip = VTOI(vp); dq = ip->i_dquot[type]; ip->i_dquot[type] = NODQUOT; dqrele(vp, dq); VOP_UNLOCK(vp); vrele(vp); } error = dqflush(qvp); if (error != 0) return (error); /* * Clear um_quotas before closing the quota vnode to prevent * access to the closed vnode from dqget/dqsync */ UFS_LOCK(ump); ump->um_quotas[type] = NULLVP; ump->um_cred[type] = NOCRED; UFS_UNLOCK(ump); vn_lock(qvp, LK_EXCLUSIVE | LK_RETRY); qvp->v_vflag &= ~VV_SYSTEM; VOP_UNLOCK(qvp); error = vn_close(qvp, FREAD|FWRITE, td->td_ucred, td); crfree(cr); return (error); } static int quotaoff_inchange1(struct thread *td, struct mount *mp, int type) { int error; bool need_resume; /* * mp is already suspended on unmount. If not, suspend it, to * avoid the situation where quotaoff operation eventually * failing due to SU structures still keeping references on * dquots, but vnode's references are already clean. This * would cause quota accounting leak and asserts otherwise. * Note that the thread has already called vn_start_write(). */ if (mp->mnt_susp_owner == td) { need_resume = false; } else { error = vfs_write_suspend_umnt(mp); if (error != 0) return (error); need_resume = true; } error = quotaoff1(td, mp, type); if (need_resume) vfs_write_resume(mp, VR_START_WRITE); return (error); } /* * Turns off quotas, assumes that ump->um_qflags are already checked * and QTF_CLOSING is set to indicate operation in progress. Fixes * ump->um_qflags and mp->mnt_flag after. */ int quotaoff_inchange(struct thread *td, struct mount *mp, int type) { struct ufsmount *ump; int error, i; error = quotaoff_inchange1(td, mp, type); ump = VFSTOUFS(mp); UFS_LOCK(ump); ump->um_qflags[type] &= ~QTF_CLOSING; for (i = 0; i < MAXQUOTAS; i++) if (ump->um_quotas[i] != NULLVP) break; if (i == MAXQUOTAS) { MNT_ILOCK(mp); mp->mnt_flag &= ~MNT_QUOTA; MNT_IUNLOCK(mp); } UFS_UNLOCK(ump); return (error); } /* * Q_QUOTAOFF - turn off disk quotas for a filesystem. */ int quotaoff(struct thread *td, struct mount *mp, int type) { struct ufsmount *ump; int error; error = priv_check(td, PRIV_UFS_QUOTAOFF); if (error) return (error); ump = VFSTOUFS(mp); UFS_LOCK(ump); if ((ump->um_qflags[type] & (QTF_OPENING|QTF_CLOSING)) != 0) { UFS_UNLOCK(ump); return (EALREADY); } ump->um_qflags[type] |= QTF_CLOSING; UFS_UNLOCK(ump); return (quotaoff_inchange(td, mp, type)); } /* * Q_GETQUOTA - return current values in a dqblk structure. */ static int _getquota(struct thread *td, struct mount *mp, u_long id, int type, struct dqblk64 *dqb) { struct dquot *dq; int error; switch (type) { case USRQUOTA: if ((td->td_ucred->cr_uid != id) && !unprivileged_get_quota) { error = priv_check(td, PRIV_VFS_GETQUOTA); if (error) return (error); } break; case GRPQUOTA: if (!groupmember(id, td->td_ucred) && !unprivileged_get_quota) { error = priv_check(td, PRIV_VFS_GETQUOTA); if (error) return (error); } break; default: return (EINVAL); } dq = NODQUOT; error = dqget(NULLVP, id, VFSTOUFS(mp), type, &dq); if (error) return (error); *dqb = dq->dq_dqb; dqrele(NULLVP, dq); return (error); } /* * Q_SETQUOTA - assign an entire dqblk structure. */ static int _setquota(struct thread *td, struct mount *mp, u_long id, int type, struct dqblk64 *dqb) { struct dquot *dq; struct dquot *ndq; struct ufsmount *ump; struct dqblk64 newlim; int error; error = priv_check(td, PRIV_VFS_SETQUOTA); if (error) return (error); newlim = *dqb; ndq = NODQUOT; ump = VFSTOUFS(mp); error = dqget(NULLVP, id, ump, type, &ndq); if (error) return (error); dq = ndq; DQI_LOCK(dq); DQI_WAIT(dq, PINOD+1, "setqta"); /* * Copy all but the current values. * Reset time limit if previously had no soft limit or were * under it, but now have a soft limit and are over it. */ newlim.dqb_curblocks = dq->dq_curblocks; newlim.dqb_curinodes = dq->dq_curinodes; if (dq->dq_id != 0) { newlim.dqb_btime = dq->dq_btime; newlim.dqb_itime = dq->dq_itime; } if (newlim.dqb_bsoftlimit && dq->dq_curblocks >= newlim.dqb_bsoftlimit && (dq->dq_bsoftlimit == 0 || dq->dq_curblocks < dq->dq_bsoftlimit)) newlim.dqb_btime = time_second + ump->um_btime[type]; if (newlim.dqb_isoftlimit && dq->dq_curinodes >= newlim.dqb_isoftlimit && (dq->dq_isoftlimit == 0 || dq->dq_curinodes < dq->dq_isoftlimit)) newlim.dqb_itime = time_second + ump->um_itime[type]; dq->dq_dqb = newlim; if (dq->dq_curblocks < dq->dq_bsoftlimit) dq->dq_flags &= ~DQ_BLKS; if (dq->dq_curinodes < dq->dq_isoftlimit) dq->dq_flags &= ~DQ_INODS; if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) dq->dq_flags |= DQ_FAKE; else dq->dq_flags &= ~DQ_FAKE; dq->dq_flags |= DQ_MOD; DQI_UNLOCK(dq); dqrele(NULLVP, dq); return (0); } /* * Q_SETUSE - set current inode and block usage. */ static int _setuse(struct thread *td, struct mount *mp, u_long id, int type, struct dqblk64 *dqb) { struct dquot *dq; struct ufsmount *ump; struct dquot *ndq; struct dqblk64 usage; int error; error = priv_check(td, PRIV_UFS_SETUSE); if (error) return (error); usage = *dqb; ump = VFSTOUFS(mp); ndq = NODQUOT; error = dqget(NULLVP, id, ump, type, &ndq); if (error) return (error); dq = ndq; DQI_LOCK(dq); DQI_WAIT(dq, PINOD+1, "setuse"); /* * Reset time limit if have a soft limit and were * previously under it, but are now over it. */ if (dq->dq_bsoftlimit && dq->dq_curblocks < dq->dq_bsoftlimit && usage.dqb_curblocks >= dq->dq_bsoftlimit) dq->dq_btime = time_second + ump->um_btime[type]; if (dq->dq_isoftlimit && dq->dq_curinodes < dq->dq_isoftlimit && usage.dqb_curinodes >= dq->dq_isoftlimit) dq->dq_itime = time_second + ump->um_itime[type]; dq->dq_curblocks = usage.dqb_curblocks; dq->dq_curinodes = usage.dqb_curinodes; if (dq->dq_curblocks < dq->dq_bsoftlimit) dq->dq_flags &= ~DQ_BLKS; if (dq->dq_curinodes < dq->dq_isoftlimit) dq->dq_flags &= ~DQ_INODS; dq->dq_flags |= DQ_MOD; DQI_UNLOCK(dq); dqrele(NULLVP, dq); return (0); } int getquota32(struct thread *td, struct mount *mp, u_long id, int type, void *addr) { struct dqblk32 dqb32; struct dqblk64 dqb64; int error; error = _getquota(td, mp, id, type, &dqb64); if (error) return (error); dqb64_dqb32(&dqb64, &dqb32); error = copyout(&dqb32, addr, sizeof(dqb32)); return (error); } int setquota32(struct thread *td, struct mount *mp, u_long id, int type, void *addr) { struct dqblk32 dqb32; struct dqblk64 dqb64; int error; error = copyin(addr, &dqb32, sizeof(dqb32)); if (error) return (error); dqb32_dqb64(&dqb32, &dqb64); error = _setquota(td, mp, id, type, &dqb64); return (error); } int setuse32(struct thread *td, struct mount *mp, u_long id, int type, void *addr) { struct dqblk32 dqb32; struct dqblk64 dqb64; int error; error = copyin(addr, &dqb32, sizeof(dqb32)); if (error) return (error); dqb32_dqb64(&dqb32, &dqb64); error = _setuse(td, mp, id, type, &dqb64); return (error); } int getquota(struct thread *td, struct mount *mp, u_long id, int type, void *addr) { struct dqblk64 dqb64; int error; error = _getquota(td, mp, id, type, &dqb64); if (error) return (error); error = copyout(&dqb64, addr, sizeof(dqb64)); return (error); } int setquota(struct thread *td, struct mount *mp, u_long id, int type, void *addr) { struct dqblk64 dqb64; int error; error = copyin(addr, &dqb64, sizeof(dqb64)); if (error) return (error); error = _setquota(td, mp, id, type, &dqb64); return (error); } int setuse(struct thread *td, struct mount *mp, u_long id, int type, void *addr) { struct dqblk64 dqb64; int error; error = copyin(addr, &dqb64, sizeof(dqb64)); if (error) return (error); error = _setuse(td, mp, id, type, &dqb64); return (error); } /* * Q_GETQUOTASIZE - get bit-size of quota file fields */ int getquotasize(struct thread *td, struct mount *mp, u_long id, int type, void *sizep) { struct ufsmount *ump = VFSTOUFS(mp); int bitsize; UFS_LOCK(ump); if (ump->um_quotas[type] == NULLVP || (ump->um_qflags[type] & QTF_CLOSING)) { UFS_UNLOCK(ump); return (EINVAL); } if ((ump->um_qflags[type] & QTF_64BIT) != 0) bitsize = 64; else bitsize = 32; UFS_UNLOCK(ump); return (copyout(&bitsize, sizep, sizeof(int))); } /* * Q_SYNC - sync quota files to disk. */ int qsync(struct mount *mp) { struct ufsmount *ump = VFSTOUFS(mp); - struct thread *td = curthread; /* XXX */ struct vnode *vp, *mvp; struct dquot *dq; int i, error; /* * Check if the mount point has any quotas. * If not, simply return. */ for (i = 0; i < MAXQUOTAS; i++) if (ump->um_quotas[i] != NULLVP) break; if (i == MAXQUOTAS) return (0); /* * Search vnodes associated with this mount point, * synchronizing any modified dquot structures. */ again: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { if (vp->v_type == VNON) { VI_UNLOCK(vp); continue; } - error = vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, td); + error = vget(vp, LK_EXCLUSIVE | LK_INTERLOCK); if (error) { if (error == ENOENT) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto again; } continue; } for (i = 0; i < MAXQUOTAS; i++) { dq = VTOI(vp)->i_dquot[i]; if (dq != NODQUOT) dqsync(vp, dq); } vput(vp); } return (0); } /* * Sync quota file for given vnode to disk. */ int qsyncvp(struct vnode *vp) { struct ufsmount *ump = VFSTOUFS(vp->v_mount); struct dquot *dq; int i; /* * Check if the mount point has any quotas. * If not, simply return. */ for (i = 0; i < MAXQUOTAS; i++) if (ump->um_quotas[i] != NULLVP) break; if (i == MAXQUOTAS) return (0); /* * Search quotas associated with this vnode * synchronizing any modified dquot structures. */ for (i = 0; i < MAXQUOTAS; i++) { dq = VTOI(vp)->i_dquot[i]; if (dq != NODQUOT) dqsync(vp, dq); } return (0); } /* * Code pertaining to management of the in-core dquot data structures. */ #define DQHASH(dqvp, id) \ (&dqhashtbl[((((intptr_t)(dqvp)) >> 8) + id) & dqhash]) static LIST_HEAD(dqhash, dquot) *dqhashtbl; static u_long dqhash; /* * Dquot free list. */ #define DQUOTINC 5 /* minimum free dquots desired */ static TAILQ_HEAD(dqfreelist, dquot) dqfreelist; static long numdquot, desireddquot = DQUOTINC; /* * Lock to protect quota hash, dq free list and dq_cnt ref counters of * _all_ dqs. */ struct mtx dqhlock; #define DQH_LOCK() mtx_lock(&dqhlock) #define DQH_UNLOCK() mtx_unlock(&dqhlock) static struct dquot *dqhashfind(struct dqhash *dqh, u_long id, struct vnode *dqvp); /* * Initialize the quota system. */ void dqinit(void) { mtx_init(&dqhlock, "dqhlock", NULL, MTX_DEF); dqhashtbl = hashinit(desiredvnodes, M_DQUOT, &dqhash); TAILQ_INIT(&dqfreelist); } /* * Shut down the quota system. */ void dquninit(void) { struct dquot *dq; hashdestroy(dqhashtbl, M_DQUOT, dqhash); while ((dq = TAILQ_FIRST(&dqfreelist)) != NULL) { TAILQ_REMOVE(&dqfreelist, dq, dq_freelist); mtx_destroy(&dq->dq_lock); free(dq, M_DQUOT); } mtx_destroy(&dqhlock); } static struct dquot * dqhashfind(struct dqhash *dqh, u_long id, struct vnode *dqvp) { struct dquot *dq; mtx_assert(&dqhlock, MA_OWNED); LIST_FOREACH(dq, dqh, dq_hash) { if (dq->dq_id != id || dq->dq_ump->um_quotas[dq->dq_type] != dqvp) continue; /* * Cache hit with no references. Take * the structure off the free list. */ if (dq->dq_cnt == 0) TAILQ_REMOVE(&dqfreelist, dq, dq_freelist); DQREF(dq); return (dq); } return (NODQUOT); } /* * Determine the quota file type. * * A 32-bit quota file is simply an array of struct dqblk32. * * A 64-bit quota file is a struct dqhdr64 followed by an array of struct * dqblk64. The header contains various magic bits which allow us to be * reasonably confident that it is indeeda 64-bit quota file and not just * a 32-bit quota file that just happens to "look right". * */ static int dqopen(struct vnode *vp, struct ufsmount *ump, int type) { struct dqhdr64 dqh; struct iovec aiov; struct uio auio; int error; ASSERT_VOP_LOCKED(vp, "dqopen"); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = &dqh; aiov.iov_len = sizeof(dqh); auio.uio_resid = sizeof(dqh); auio.uio_offset = 0; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_td = (struct thread *)0; error = VOP_READ(vp, &auio, 0, ump->um_cred[type]); if (error != 0) return (error); if (auio.uio_resid > 0) { /* assume 32 bits */ return (0); } UFS_LOCK(ump); if (strcmp(dqh.dqh_magic, Q_DQHDR64_MAGIC) == 0 && be32toh(dqh.dqh_version) == Q_DQHDR64_VERSION && be32toh(dqh.dqh_hdrlen) == (uint32_t)sizeof(struct dqhdr64) && be32toh(dqh.dqh_reclen) == (uint32_t)sizeof(struct dqblk64)) { /* XXX: what if the magic matches, but the sizes are wrong? */ ump->um_qflags[type] |= QTF_64BIT; } else { ump->um_qflags[type] &= ~QTF_64BIT; } UFS_UNLOCK(ump); return (0); } /* * Obtain a dquot structure for the specified identifier and quota file * reading the information from the file if necessary. */ static int dqget(struct vnode *vp, u_long id, struct ufsmount *ump, int type, struct dquot **dqp) { uint8_t buf[sizeof(struct dqblk64)]; off_t base, recsize; struct dquot *dq, *dq1; struct dqhash *dqh; struct vnode *dqvp; struct iovec aiov; struct uio auio; int dqvplocked, error; #ifdef DEBUG_VFS_LOCKS if (vp != NULLVP) ASSERT_VOP_ELOCKED(vp, "dqget"); #endif if (vp != NULLVP && *dqp != NODQUOT) { return (0); } /* XXX: Disallow negative id values to prevent the * creation of 100GB+ quota data files. */ if ((int)id < 0) return (EINVAL); UFS_LOCK(ump); dqvp = ump->um_quotas[type]; if (dqvp == NULLVP || (ump->um_qflags[type] & QTF_CLOSING)) { *dqp = NODQUOT; UFS_UNLOCK(ump); return (EINVAL); } vref(dqvp); UFS_UNLOCK(ump); error = 0; dqvplocked = 0; /* * Check the cache first. */ dqh = DQHASH(dqvp, id); DQH_LOCK(); dq = dqhashfind(dqh, id, dqvp); if (dq != NULL) { DQH_UNLOCK(); hfound: DQI_LOCK(dq); DQI_WAIT(dq, PINOD+1, "dqget"); DQI_UNLOCK(dq); if (dq->dq_ump == NULL) { dqrele(vp, dq); dq = NODQUOT; error = EIO; } *dqp = dq; if (dqvplocked) vput(dqvp); else vrele(dqvp); return (error); } /* * Quota vnode lock is before DQ_LOCK. Acquire dqvp lock there * since new dq will appear on the hash chain DQ_LOCKed. */ if (vp != dqvp) { DQH_UNLOCK(); vn_lock(dqvp, LK_SHARED | LK_RETRY); dqvplocked = 1; DQH_LOCK(); /* * Recheck the cache after sleep for quota vnode lock. */ dq = dqhashfind(dqh, id, dqvp); if (dq != NULL) { DQH_UNLOCK(); goto hfound; } } /* * Not in cache, allocate a new one or take it from the * free list. */ if (TAILQ_FIRST(&dqfreelist) == NODQUOT && numdquot < MAXQUOTAS * desiredvnodes) desireddquot += DQUOTINC; if (numdquot < desireddquot) { numdquot++; DQH_UNLOCK(); dq1 = malloc(sizeof *dq1, M_DQUOT, M_WAITOK | M_ZERO); mtx_init(&dq1->dq_lock, "dqlock", NULL, MTX_DEF); DQH_LOCK(); /* * Recheck the cache after sleep for memory. */ dq = dqhashfind(dqh, id, dqvp); if (dq != NULL) { numdquot--; DQH_UNLOCK(); mtx_destroy(&dq1->dq_lock); free(dq1, M_DQUOT); goto hfound; } dq = dq1; } else { if ((dq = TAILQ_FIRST(&dqfreelist)) == NULL) { DQH_UNLOCK(); tablefull("dquot"); *dqp = NODQUOT; if (dqvplocked) vput(dqvp); else vrele(dqvp); return (EUSERS); } if (dq->dq_cnt || (dq->dq_flags & DQ_MOD)) panic("dqget: free dquot isn't %p", dq); TAILQ_REMOVE(&dqfreelist, dq, dq_freelist); if (dq->dq_ump != NULL) LIST_REMOVE(dq, dq_hash); } /* * Dq is put into hash already locked to prevent parallel * usage while it is being read from file. */ dq->dq_flags = DQ_LOCK; dq->dq_id = id; dq->dq_type = type; dq->dq_ump = ump; LIST_INSERT_HEAD(dqh, dq, dq_hash); DQREF(dq); DQH_UNLOCK(); /* * Read the requested quota record from the quota file, performing * any necessary conversions. */ if (ump->um_qflags[type] & QTF_64BIT) { recsize = sizeof(struct dqblk64); base = sizeof(struct dqhdr64); } else { recsize = sizeof(struct dqblk32); base = 0; } auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = buf; aiov.iov_len = recsize; auio.uio_resid = recsize; auio.uio_offset = base + id * recsize; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_td = (struct thread *)0; error = VOP_READ(dqvp, &auio, 0, ump->um_cred[type]); if (auio.uio_resid == recsize && error == 0) { bzero(&dq->dq_dqb, sizeof(dq->dq_dqb)); } else { if (ump->um_qflags[type] & QTF_64BIT) dqb64_dq((struct dqblk64 *)buf, dq); else dqb32_dq((struct dqblk32 *)buf, dq); } if (dqvplocked) vput(dqvp); else vrele(dqvp); /* * I/O error in reading quota file, release * quota structure and reflect problem to caller. */ if (error) { DQH_LOCK(); dq->dq_ump = NULL; LIST_REMOVE(dq, dq_hash); DQH_UNLOCK(); DQI_LOCK(dq); if (dq->dq_flags & DQ_WANT) wakeup(dq); dq->dq_flags = 0; DQI_UNLOCK(dq); dqrele(vp, dq); *dqp = NODQUOT; return (error); } DQI_LOCK(dq); /* * Check for no limit to enforce. * Initialize time values if necessary. */ if (dq->dq_isoftlimit == 0 && dq->dq_bsoftlimit == 0 && dq->dq_ihardlimit == 0 && dq->dq_bhardlimit == 0) dq->dq_flags |= DQ_FAKE; if (dq->dq_id != 0) { if (dq->dq_btime == 0) { dq->dq_btime = time_second + ump->um_btime[type]; if (dq->dq_bsoftlimit && dq->dq_curblocks >= dq->dq_bsoftlimit) dq->dq_flags |= DQ_MOD; } if (dq->dq_itime == 0) { dq->dq_itime = time_second + ump->um_itime[type]; if (dq->dq_isoftlimit && dq->dq_curinodes >= dq->dq_isoftlimit) dq->dq_flags |= DQ_MOD; } } DQI_WAKEUP(dq); DQI_UNLOCK(dq); *dqp = dq; return (0); } #ifdef DIAGNOSTIC /* * Obtain a reference to a dquot. */ static void dqref(struct dquot *dq) { dq->dq_cnt++; } #endif /* * Release a reference to a dquot. */ void dqrele(struct vnode *vp, struct dquot *dq) { if (dq == NODQUOT) return; DQH_LOCK(); KASSERT(dq->dq_cnt > 0, ("Lost dq %p reference 1", dq)); if (dq->dq_cnt > 1) { dq->dq_cnt--; DQH_UNLOCK(); return; } DQH_UNLOCK(); sync: (void) dqsync(vp, dq); DQH_LOCK(); KASSERT(dq->dq_cnt > 0, ("Lost dq %p reference 2", dq)); if (--dq->dq_cnt > 0) { DQH_UNLOCK(); return; } /* * The dq may become dirty after it is synced but before it is * put to the free list. Checking the DQ_MOD there without * locking dq should be safe since no other references to the * dq exist. */ if ((dq->dq_flags & DQ_MOD) != 0) { dq->dq_cnt++; DQH_UNLOCK(); goto sync; } TAILQ_INSERT_TAIL(&dqfreelist, dq, dq_freelist); DQH_UNLOCK(); } /* * Update the disk quota in the quota file. */ static int dqsync(struct vnode *vp, struct dquot *dq) { uint8_t buf[sizeof(struct dqblk64)]; off_t base, recsize; struct vnode *dqvp; struct iovec aiov; struct uio auio; int error; struct mount *mp; struct ufsmount *ump; #ifdef DEBUG_VFS_LOCKS if (vp != NULL) ASSERT_VOP_ELOCKED(vp, "dqsync"); #endif mp = NULL; error = 0; if (dq == NODQUOT) panic("dqsync: dquot"); if ((ump = dq->dq_ump) == NULL) return (0); UFS_LOCK(ump); if ((dqvp = ump->um_quotas[dq->dq_type]) == NULLVP) { if (vp == NULL) { UFS_UNLOCK(ump); return (0); } else panic("dqsync: file"); } vref(dqvp); UFS_UNLOCK(ump); DQI_LOCK(dq); if ((dq->dq_flags & DQ_MOD) == 0) { DQI_UNLOCK(dq); vrele(dqvp); return (0); } DQI_UNLOCK(dq); (void) vn_start_secondary_write(dqvp, &mp, V_WAIT); if (vp != dqvp) vn_lock(dqvp, LK_EXCLUSIVE | LK_RETRY); DQI_LOCK(dq); DQI_WAIT(dq, PINOD+2, "dqsync"); if ((dq->dq_flags & DQ_MOD) == 0) goto out; dq->dq_flags |= DQ_LOCK; DQI_UNLOCK(dq); /* * Write the quota record to the quota file, performing any * necessary conversions. See dqget() for additional details. */ if (ump->um_qflags[dq->dq_type] & QTF_64BIT) { dq_dqb64(dq, (struct dqblk64 *)buf); recsize = sizeof(struct dqblk64); base = sizeof(struct dqhdr64); } else { dq_dqb32(dq, (struct dqblk32 *)buf); recsize = sizeof(struct dqblk32); base = 0; } auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = buf; aiov.iov_len = recsize; auio.uio_resid = recsize; auio.uio_offset = base + dq->dq_id * recsize; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = (struct thread *)0; error = VOP_WRITE(dqvp, &auio, 0, dq->dq_ump->um_cred[dq->dq_type]); if (auio.uio_resid && error == 0) error = EIO; DQI_LOCK(dq); DQI_WAKEUP(dq); dq->dq_flags &= ~DQ_MOD; out: DQI_UNLOCK(dq); if (vp != dqvp) vput(dqvp); else vrele(dqvp); vn_finished_secondary_write(mp); return (error); } /* * Flush all entries from the cache for a particular vnode. */ static int dqflush(struct vnode *vp) { struct dquot *dq, *nextdq; struct dqhash *dqh; int error; /* * Move all dquot's that used to refer to this quota * file off their hash chains (they will eventually * fall off the head of the free list and be re-used). */ error = 0; DQH_LOCK(); for (dqh = &dqhashtbl[dqhash]; dqh >= dqhashtbl; dqh--) { for (dq = LIST_FIRST(dqh); dq; dq = nextdq) { nextdq = LIST_NEXT(dq, dq_hash); if (dq->dq_ump->um_quotas[dq->dq_type] != vp) continue; if (dq->dq_cnt) error = EBUSY; else { LIST_REMOVE(dq, dq_hash); dq->dq_ump = NULL; } } } DQH_UNLOCK(); return (error); } /* * The following three functions are provided for the adjustment of * quotas by the soft updates code. */ #ifdef SOFTUPDATES /* * Acquire a reference to the quota structures associated with a vnode. * Return count of number of quota structures found. */ int quotaref(vp, qrp) struct vnode *vp; struct dquot **qrp; { struct inode *ip; struct dquot *dq; int i, found; for (i = 0; i < MAXQUOTAS; i++) qrp[i] = NODQUOT; /* * Disk quotas must be turned off for system files. Currently * snapshot and quota files. */ if ((vp->v_vflag & VV_SYSTEM) != 0) return (0); /* * Iterate through and copy active quotas. */ found = 0; ip = VTOI(vp); mtx_lock(&dqhlock); for (i = 0; i < MAXQUOTAS; i++) { if ((dq = ip->i_dquot[i]) == NODQUOT) continue; DQREF(dq); qrp[i] = dq; found++; } mtx_unlock(&dqhlock); return (found); } /* * Release a set of quota structures obtained from a vnode. */ void quotarele(qrp) struct dquot **qrp; { struct dquot *dq; int i; for (i = 0; i < MAXQUOTAS; i++) { if ((dq = qrp[i]) == NODQUOT) continue; dqrele(NULL, dq); } } /* * Adjust the number of blocks associated with a quota. * Positive numbers when adding blocks; negative numbers when freeing blocks. */ void quotaadj(qrp, ump, blkcount) struct dquot **qrp; struct ufsmount *ump; int64_t blkcount; { struct dquot *dq; ufs2_daddr_t ncurblocks; int i; if (blkcount == 0) return; for (i = 0; i < MAXQUOTAS; i++) { if ((dq = qrp[i]) == NODQUOT) continue; DQI_LOCK(dq); DQI_WAIT(dq, PINOD+1, "adjqta"); ncurblocks = dq->dq_curblocks + blkcount; if (ncurblocks >= 0) dq->dq_curblocks = ncurblocks; else dq->dq_curblocks = 0; if (blkcount < 0) dq->dq_flags &= ~DQ_BLKS; else if (dq->dq_curblocks + blkcount >= dq->dq_bsoftlimit && dq->dq_curblocks < dq->dq_bsoftlimit) dq->dq_btime = time_second + ump->um_btime[i]; dq->dq_flags |= DQ_MOD; DQI_UNLOCK(dq); } } #endif /* SOFTUPDATES */ /* * 32-bit / 64-bit conversion functions. * * 32-bit quota records are stored in native byte order. Attention must * be paid to overflow issues. * * 64-bit quota records are stored in network byte order. */ #define CLIP32(u64) (u64 > UINT32_MAX ? UINT32_MAX : (uint32_t)u64) /* * Convert 32-bit host-order structure to dquot. */ static void dqb32_dq(const struct dqblk32 *dqb32, struct dquot *dq) { dq->dq_bhardlimit = dqb32->dqb_bhardlimit; dq->dq_bsoftlimit = dqb32->dqb_bsoftlimit; dq->dq_curblocks = dqb32->dqb_curblocks; dq->dq_ihardlimit = dqb32->dqb_ihardlimit; dq->dq_isoftlimit = dqb32->dqb_isoftlimit; dq->dq_curinodes = dqb32->dqb_curinodes; dq->dq_btime = dqb32->dqb_btime; dq->dq_itime = dqb32->dqb_itime; } /* * Convert 64-bit network-order structure to dquot. */ static void dqb64_dq(const struct dqblk64 *dqb64, struct dquot *dq) { dq->dq_bhardlimit = be64toh(dqb64->dqb_bhardlimit); dq->dq_bsoftlimit = be64toh(dqb64->dqb_bsoftlimit); dq->dq_curblocks = be64toh(dqb64->dqb_curblocks); dq->dq_ihardlimit = be64toh(dqb64->dqb_ihardlimit); dq->dq_isoftlimit = be64toh(dqb64->dqb_isoftlimit); dq->dq_curinodes = be64toh(dqb64->dqb_curinodes); dq->dq_btime = be64toh(dqb64->dqb_btime); dq->dq_itime = be64toh(dqb64->dqb_itime); } /* * Convert dquot to 32-bit host-order structure. */ static void dq_dqb32(const struct dquot *dq, struct dqblk32 *dqb32) { dqb32->dqb_bhardlimit = CLIP32(dq->dq_bhardlimit); dqb32->dqb_bsoftlimit = CLIP32(dq->dq_bsoftlimit); dqb32->dqb_curblocks = CLIP32(dq->dq_curblocks); dqb32->dqb_ihardlimit = CLIP32(dq->dq_ihardlimit); dqb32->dqb_isoftlimit = CLIP32(dq->dq_isoftlimit); dqb32->dqb_curinodes = CLIP32(dq->dq_curinodes); dqb32->dqb_btime = CLIP32(dq->dq_btime); dqb32->dqb_itime = CLIP32(dq->dq_itime); } /* * Convert dquot to 64-bit network-order structure. */ static void dq_dqb64(const struct dquot *dq, struct dqblk64 *dqb64) { dqb64->dqb_bhardlimit = htobe64(dq->dq_bhardlimit); dqb64->dqb_bsoftlimit = htobe64(dq->dq_bsoftlimit); dqb64->dqb_curblocks = htobe64(dq->dq_curblocks); dqb64->dqb_ihardlimit = htobe64(dq->dq_ihardlimit); dqb64->dqb_isoftlimit = htobe64(dq->dq_isoftlimit); dqb64->dqb_curinodes = htobe64(dq->dq_curinodes); dqb64->dqb_btime = htobe64(dq->dq_btime); dqb64->dqb_itime = htobe64(dq->dq_itime); } /* * Convert 64-bit host-order structure to 32-bit host-order structure. */ static void dqb64_dqb32(const struct dqblk64 *dqb64, struct dqblk32 *dqb32) { dqb32->dqb_bhardlimit = CLIP32(dqb64->dqb_bhardlimit); dqb32->dqb_bsoftlimit = CLIP32(dqb64->dqb_bsoftlimit); dqb32->dqb_curblocks = CLIP32(dqb64->dqb_curblocks); dqb32->dqb_ihardlimit = CLIP32(dqb64->dqb_ihardlimit); dqb32->dqb_isoftlimit = CLIP32(dqb64->dqb_isoftlimit); dqb32->dqb_curinodes = CLIP32(dqb64->dqb_curinodes); dqb32->dqb_btime = CLIP32(dqb64->dqb_btime); dqb32->dqb_itime = CLIP32(dqb64->dqb_itime); } /* * Convert 32-bit host-order structure to 64-bit host-order structure. */ static void dqb32_dqb64(const struct dqblk32 *dqb32, struct dqblk64 *dqb64) { dqb64->dqb_bhardlimit = dqb32->dqb_bhardlimit; dqb64->dqb_bsoftlimit = dqb32->dqb_bsoftlimit; dqb64->dqb_curblocks = dqb32->dqb_curblocks; dqb64->dqb_ihardlimit = dqb32->dqb_ihardlimit; dqb64->dqb_isoftlimit = dqb32->dqb_isoftlimit; dqb64->dqb_curinodes = dqb32->dqb_curinodes; dqb64->dqb_btime = dqb32->dqb_btime; dqb64->dqb_itime = dqb32->dqb_itime; } Index: projects/clang1100-import/sys/vm/vm_fault.c =================================================================== --- projects/clang1100-import/sys/vm/vm_fault.c (revision 364278) +++ projects/clang1100-import/sys/vm/vm_fault.c (revision 364279) @@ -1,2007 +1,2007 @@ /*- * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU) * * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_fault.c 8.4 (Berkeley) 1/12/94 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Page fault handling module. */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include #include #include #include #include #include #define PFBAK 4 #define PFFOR 4 #define VM_FAULT_READ_DEFAULT (1 + VM_FAULT_READ_AHEAD_INIT) #define VM_FAULT_READ_MAX (1 + VM_FAULT_READ_AHEAD_MAX) #define VM_FAULT_DONTNEED_MIN 1048576 struct faultstate { /* Fault parameters. */ vm_offset_t vaddr; vm_page_t *m_hold; vm_prot_t fault_type; vm_prot_t prot; int fault_flags; int oom; boolean_t wired; /* Page reference for cow. */ vm_page_t m_cow; /* Current object. */ vm_object_t object; vm_pindex_t pindex; vm_page_t m; /* Top-level map object. */ vm_object_t first_object; vm_pindex_t first_pindex; vm_page_t first_m; /* Map state. */ vm_map_t map; vm_map_entry_t entry; int map_generation; bool lookup_still_valid; /* Vnode if locked. */ struct vnode *vp; }; static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead); static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra, int backward, int forward, bool obj_locked); static int vm_pfault_oom_attempts = 3; SYSCTL_INT(_vm, OID_AUTO, pfault_oom_attempts, CTLFLAG_RWTUN, &vm_pfault_oom_attempts, 0, "Number of page allocation attempts in page fault handler before it " "triggers OOM handling"); static int vm_pfault_oom_wait = 10; SYSCTL_INT(_vm, OID_AUTO, pfault_oom_wait, CTLFLAG_RWTUN, &vm_pfault_oom_wait, 0, "Number of seconds to wait for free pages before retrying " "the page fault handler"); static inline void fault_page_release(vm_page_t *mp) { vm_page_t m; m = *mp; if (m != NULL) { /* * We are likely to loop around again and attempt to busy * this page. Deactivating it leaves it available for * pageout while optimizing fault restarts. */ vm_page_deactivate(m); vm_page_xunbusy(m); *mp = NULL; } } static inline void fault_page_free(vm_page_t *mp) { vm_page_t m; m = *mp; if (m != NULL) { VM_OBJECT_ASSERT_WLOCKED(m->object); if (!vm_page_wired(m)) vm_page_free(m); else vm_page_xunbusy(m); *mp = NULL; } } static inline void unlock_map(struct faultstate *fs) { if (fs->lookup_still_valid) { vm_map_lookup_done(fs->map, fs->entry); fs->lookup_still_valid = false; } } static void unlock_vp(struct faultstate *fs) { if (fs->vp != NULL) { vput(fs->vp); fs->vp = NULL; } } static void fault_deallocate(struct faultstate *fs) { fault_page_release(&fs->m_cow); fault_page_release(&fs->m); vm_object_pip_wakeup(fs->object); if (fs->object != fs->first_object) { VM_OBJECT_WLOCK(fs->first_object); fault_page_free(&fs->first_m); VM_OBJECT_WUNLOCK(fs->first_object); vm_object_pip_wakeup(fs->first_object); } vm_object_deallocate(fs->first_object); unlock_map(fs); unlock_vp(fs); } static void unlock_and_deallocate(struct faultstate *fs) { VM_OBJECT_WUNLOCK(fs->object); fault_deallocate(fs); } static void vm_fault_dirty(struct faultstate *fs, vm_page_t m) { bool need_dirty; if (((fs->prot & VM_PROT_WRITE) == 0 && (fs->fault_flags & VM_FAULT_DIRTY) == 0) || (m->oflags & VPO_UNMANAGED) != 0) return; VM_PAGE_OBJECT_BUSY_ASSERT(m); need_dirty = ((fs->fault_type & VM_PROT_WRITE) != 0 && (fs->fault_flags & VM_FAULT_WIRE) == 0) || (fs->fault_flags & VM_FAULT_DIRTY) != 0; vm_object_set_writeable_dirty(m->object); /* * If the fault is a write, we know that this page is being * written NOW so dirty it explicitly to save on * pmap_is_modified() calls later. * * Also, since the page is now dirty, we can possibly tell * the pager to release any swap backing the page. */ if (need_dirty && vm_page_set_dirty(m) == 0) { /* * If this is a NOSYNC mmap we do not want to set PGA_NOSYNC * if the page is already dirty to prevent data written with * the expectation of being synced from not being synced. * Likewise if this entry does not request NOSYNC then make * sure the page isn't marked NOSYNC. Applications sharing * data should use the same flags to avoid ping ponging. */ if ((fs->entry->eflags & MAP_ENTRY_NOSYNC) != 0) vm_page_aflag_set(m, PGA_NOSYNC); else vm_page_aflag_clear(m, PGA_NOSYNC); } } /* * Unlocks fs.first_object and fs.map on success. */ static int vm_fault_soft_fast(struct faultstate *fs) { vm_page_t m, m_map; #if VM_NRESERVLEVEL > 0 vm_page_t m_super; int flags; #endif int psind, rv; vm_offset_t vaddr; MPASS(fs->vp == NULL); vaddr = fs->vaddr; vm_object_busy(fs->first_object); m = vm_page_lookup(fs->first_object, fs->first_pindex); /* A busy page can be mapped for read|execute access. */ if (m == NULL || ((fs->prot & VM_PROT_WRITE) != 0 && vm_page_busied(m)) || !vm_page_all_valid(m)) { rv = KERN_FAILURE; goto out; } m_map = m; psind = 0; #if VM_NRESERVLEVEL > 0 if ((m->flags & PG_FICTITIOUS) == 0 && (m_super = vm_reserv_to_superpage(m)) != NULL && rounddown2(vaddr, pagesizes[m_super->psind]) >= fs->entry->start && roundup2(vaddr + 1, pagesizes[m_super->psind]) <= fs->entry->end && (vaddr & (pagesizes[m_super->psind] - 1)) == (VM_PAGE_TO_PHYS(m) & (pagesizes[m_super->psind] - 1)) && !fs->wired && pmap_ps_enabled(fs->map->pmap)) { flags = PS_ALL_VALID; if ((fs->prot & VM_PROT_WRITE) != 0) { /* * Create a superpage mapping allowing write access * only if none of the constituent pages are busy and * all of them are already dirty (except possibly for * the page that was faulted on). */ flags |= PS_NONE_BUSY; if ((fs->first_object->flags & OBJ_UNMANAGED) == 0) flags |= PS_ALL_DIRTY; } if (vm_page_ps_test(m_super, flags, m)) { m_map = m_super; psind = m_super->psind; vaddr = rounddown2(vaddr, pagesizes[psind]); /* Preset the modified bit for dirty superpages. */ if ((flags & PS_ALL_DIRTY) != 0) fs->fault_type |= VM_PROT_WRITE; } } #endif rv = pmap_enter(fs->map->pmap, vaddr, m_map, fs->prot, fs->fault_type | PMAP_ENTER_NOSLEEP | (fs->wired ? PMAP_ENTER_WIRED : 0), psind); if (rv != KERN_SUCCESS) goto out; if (fs->m_hold != NULL) { (*fs->m_hold) = m; vm_page_wire(m); } if (psind == 0 && !fs->wired) vm_fault_prefault(fs, vaddr, PFBAK, PFFOR, true); VM_OBJECT_RUNLOCK(fs->first_object); vm_fault_dirty(fs, m); vm_map_lookup_done(fs->map, fs->entry); curthread->td_ru.ru_minflt++; out: vm_object_unbusy(fs->first_object); return (rv); } static void vm_fault_restore_map_lock(struct faultstate *fs) { VM_OBJECT_ASSERT_WLOCKED(fs->first_object); MPASS(blockcount_read(&fs->first_object->paging_in_progress) > 0); if (!vm_map_trylock_read(fs->map)) { VM_OBJECT_WUNLOCK(fs->first_object); vm_map_lock_read(fs->map); VM_OBJECT_WLOCK(fs->first_object); } fs->lookup_still_valid = true; } static void vm_fault_populate_check_page(vm_page_t m) { /* * Check each page to ensure that the pager is obeying the * interface: the page must be installed in the object, fully * valid, and exclusively busied. */ MPASS(m != NULL); MPASS(vm_page_all_valid(m)); MPASS(vm_page_xbusied(m)); } static void vm_fault_populate_cleanup(vm_object_t object, vm_pindex_t first, vm_pindex_t last) { vm_page_t m; vm_pindex_t pidx; VM_OBJECT_ASSERT_WLOCKED(object); MPASS(first <= last); for (pidx = first, m = vm_page_lookup(object, pidx); pidx <= last; pidx++, m = vm_page_next(m)) { vm_fault_populate_check_page(m); vm_page_deactivate(m); vm_page_xunbusy(m); } } static int vm_fault_populate(struct faultstate *fs) { vm_offset_t vaddr; vm_page_t m; vm_pindex_t map_first, map_last, pager_first, pager_last, pidx; int i, npages, psind, rv; MPASS(fs->object == fs->first_object); VM_OBJECT_ASSERT_WLOCKED(fs->first_object); MPASS(blockcount_read(&fs->first_object->paging_in_progress) > 0); MPASS(fs->first_object->backing_object == NULL); MPASS(fs->lookup_still_valid); pager_first = OFF_TO_IDX(fs->entry->offset); pager_last = pager_first + atop(fs->entry->end - fs->entry->start) - 1; unlock_map(fs); unlock_vp(fs); /* * Call the pager (driver) populate() method. * * There is no guarantee that the method will be called again * if the current fault is for read, and a future fault is * for write. Report the entry's maximum allowed protection * to the driver. */ rv = vm_pager_populate(fs->first_object, fs->first_pindex, fs->fault_type, fs->entry->max_protection, &pager_first, &pager_last); VM_OBJECT_ASSERT_WLOCKED(fs->first_object); if (rv == VM_PAGER_BAD) { /* * VM_PAGER_BAD is the backdoor for a pager to request * normal fault handling. */ vm_fault_restore_map_lock(fs); if (fs->map->timestamp != fs->map_generation) return (KERN_RESTART); return (KERN_NOT_RECEIVER); } if (rv != VM_PAGER_OK) return (KERN_FAILURE); /* AKA SIGSEGV */ /* Ensure that the driver is obeying the interface. */ MPASS(pager_first <= pager_last); MPASS(fs->first_pindex <= pager_last); MPASS(fs->first_pindex >= pager_first); MPASS(pager_last < fs->first_object->size); vm_fault_restore_map_lock(fs); if (fs->map->timestamp != fs->map_generation) { vm_fault_populate_cleanup(fs->first_object, pager_first, pager_last); return (KERN_RESTART); } /* * The map is unchanged after our last unlock. Process the fault. * * The range [pager_first, pager_last] that is given to the * pager is only a hint. The pager may populate any range * within the object that includes the requested page index. * In case the pager expanded the range, clip it to fit into * the map entry. */ map_first = OFF_TO_IDX(fs->entry->offset); if (map_first > pager_first) { vm_fault_populate_cleanup(fs->first_object, pager_first, map_first - 1); pager_first = map_first; } map_last = map_first + atop(fs->entry->end - fs->entry->start) - 1; if (map_last < pager_last) { vm_fault_populate_cleanup(fs->first_object, map_last + 1, pager_last); pager_last = map_last; } for (pidx = pager_first, m = vm_page_lookup(fs->first_object, pidx); pidx <= pager_last; pidx += npages, m = vm_page_next(&m[npages - 1])) { vaddr = fs->entry->start + IDX_TO_OFF(pidx) - fs->entry->offset; #if defined(__aarch64__) || defined(__amd64__) || (defined(__arm__) && \ __ARM_ARCH >= 6) || defined(__i386__) || defined(__riscv) psind = m->psind; if (psind > 0 && ((vaddr & (pagesizes[psind] - 1)) != 0 || pidx + OFF_TO_IDX(pagesizes[psind]) - 1 > pager_last || !pmap_ps_enabled(fs->map->pmap) || fs->wired)) psind = 0; #else psind = 0; #endif npages = atop(pagesizes[psind]); for (i = 0; i < npages; i++) { vm_fault_populate_check_page(&m[i]); vm_fault_dirty(fs, &m[i]); } VM_OBJECT_WUNLOCK(fs->first_object); rv = pmap_enter(fs->map->pmap, vaddr, m, fs->prot, fs->fault_type | (fs->wired ? PMAP_ENTER_WIRED : 0), psind); #if defined(__amd64__) if (psind > 0 && rv == KERN_FAILURE) { for (i = 0; i < npages; i++) { rv = pmap_enter(fs->map->pmap, vaddr + ptoa(i), &m[i], fs->prot, fs->fault_type | (fs->wired ? PMAP_ENTER_WIRED : 0), 0); MPASS(rv == KERN_SUCCESS); } } #else MPASS(rv == KERN_SUCCESS); #endif VM_OBJECT_WLOCK(fs->first_object); for (i = 0; i < npages; i++) { if ((fs->fault_flags & VM_FAULT_WIRE) != 0) vm_page_wire(&m[i]); else vm_page_activate(&m[i]); if (fs->m_hold != NULL && m[i].pindex == fs->first_pindex) { (*fs->m_hold) = &m[i]; vm_page_wire(&m[i]); } vm_page_xunbusy(&m[i]); } } curthread->td_ru.ru_majflt++; return (KERN_SUCCESS); } static int prot_fault_translation; SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RWTUN, &prot_fault_translation, 0, "Control signal to deliver on protection fault"); /* compat definition to keep common code for signal translation */ #define UCODE_PAGEFLT 12 #ifdef T_PAGEFLT _Static_assert(UCODE_PAGEFLT == T_PAGEFLT, "T_PAGEFLT"); #endif /* * vm_fault_trap: * * Handle a page fault occurring at the given address, * requiring the given permissions, in the map specified. * If successful, the page is inserted into the * associated physical map. * * NOTE: the given address should be truncated to the * proper page address. * * KERN_SUCCESS is returned if the page fault is handled; otherwise, * a standard error specifying why the fault is fatal is returned. * * The map in question must be referenced, and remains so. * Caller may hold no locks. */ int vm_fault_trap(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags, int *signo, int *ucode) { int result; MPASS(signo == NULL || ucode != NULL); #ifdef KTRACE if (map != kernel_map && KTRPOINT(curthread, KTR_FAULT)) ktrfault(vaddr, fault_type); #endif result = vm_fault(map, trunc_page(vaddr), fault_type, fault_flags, NULL); KASSERT(result == KERN_SUCCESS || result == KERN_FAILURE || result == KERN_INVALID_ADDRESS || result == KERN_RESOURCE_SHORTAGE || result == KERN_PROTECTION_FAILURE || result == KERN_OUT_OF_BOUNDS, ("Unexpected Mach error %d from vm_fault()", result)); #ifdef KTRACE if (map != kernel_map && KTRPOINT(curthread, KTR_FAULTEND)) ktrfaultend(result); #endif if (result != KERN_SUCCESS && signo != NULL) { switch (result) { case KERN_FAILURE: case KERN_INVALID_ADDRESS: *signo = SIGSEGV; *ucode = SEGV_MAPERR; break; case KERN_RESOURCE_SHORTAGE: *signo = SIGBUS; *ucode = BUS_OOMERR; break; case KERN_OUT_OF_BOUNDS: *signo = SIGBUS; *ucode = BUS_OBJERR; break; case KERN_PROTECTION_FAILURE: if (prot_fault_translation == 0) { /* * Autodetect. This check also covers * the images without the ABI-tag ELF * note. */ if (SV_CURPROC_ABI() == SV_ABI_FREEBSD && curproc->p_osrel >= P_OSREL_SIGSEGV) { *signo = SIGSEGV; *ucode = SEGV_ACCERR; } else { *signo = SIGBUS; *ucode = UCODE_PAGEFLT; } } else if (prot_fault_translation == 1) { /* Always compat mode. */ *signo = SIGBUS; *ucode = UCODE_PAGEFLT; } else { /* Always SIGSEGV mode. */ *signo = SIGSEGV; *ucode = SEGV_ACCERR; } break; default: KASSERT(0, ("Unexpected Mach error %d from vm_fault()", result)); break; } } return (result); } static int vm_fault_lock_vnode(struct faultstate *fs, bool objlocked) { struct vnode *vp; int error, locked; if (fs->object->type != OBJT_VNODE) return (KERN_SUCCESS); vp = fs->object->handle; if (vp == fs->vp) { ASSERT_VOP_LOCKED(vp, "saved vnode is not locked"); return (KERN_SUCCESS); } /* * Perform an unlock in case the desired vnode changed while * the map was unlocked during a retry. */ unlock_vp(fs); locked = VOP_ISLOCKED(vp); if (locked != LK_EXCLUSIVE) locked = LK_SHARED; /* * We must not sleep acquiring the vnode lock while we have * the page exclusive busied or the object's * paging-in-progress count incremented. Otherwise, we could * deadlock. */ - error = vget(vp, locked | LK_CANRECURSE | LK_NOWAIT, curthread); + error = vget(vp, locked | LK_CANRECURSE | LK_NOWAIT); if (error == 0) { fs->vp = vp; return (KERN_SUCCESS); } vhold(vp); if (objlocked) unlock_and_deallocate(fs); else fault_deallocate(fs); - error = vget(vp, locked | LK_RETRY | LK_CANRECURSE, curthread); + error = vget(vp, locked | LK_RETRY | LK_CANRECURSE); vdrop(vp); fs->vp = vp; KASSERT(error == 0, ("vm_fault: vget failed %d", error)); return (KERN_RESOURCE_SHORTAGE); } /* * Calculate the desired readahead. Handle drop-behind. * * Returns the number of readahead blocks to pass to the pager. */ static int vm_fault_readahead(struct faultstate *fs) { int era, nera; u_char behavior; KASSERT(fs->lookup_still_valid, ("map unlocked")); era = fs->entry->read_ahead; behavior = vm_map_entry_behavior(fs->entry); if (behavior == MAP_ENTRY_BEHAV_RANDOM) { nera = 0; } else if (behavior == MAP_ENTRY_BEHAV_SEQUENTIAL) { nera = VM_FAULT_READ_AHEAD_MAX; if (fs->vaddr == fs->entry->next_read) vm_fault_dontneed(fs, fs->vaddr, nera); } else if (fs->vaddr == fs->entry->next_read) { /* * This is a sequential fault. Arithmetically * increase the requested number of pages in * the read-ahead window. The requested * number of pages is "# of sequential faults * x (read ahead min + 1) + read ahead min" */ nera = VM_FAULT_READ_AHEAD_MIN; if (era > 0) { nera += era + 1; if (nera > VM_FAULT_READ_AHEAD_MAX) nera = VM_FAULT_READ_AHEAD_MAX; } if (era == VM_FAULT_READ_AHEAD_MAX) vm_fault_dontneed(fs, fs->vaddr, nera); } else { /* * This is a non-sequential fault. */ nera = 0; } if (era != nera) { /* * A read lock on the map suffices to update * the read ahead count safely. */ fs->entry->read_ahead = nera; } return (nera); } static int vm_fault_lookup(struct faultstate *fs) { int result; KASSERT(!fs->lookup_still_valid, ("vm_fault_lookup: Map already locked.")); result = vm_map_lookup(&fs->map, fs->vaddr, fs->fault_type | VM_PROT_FAULT_LOOKUP, &fs->entry, &fs->first_object, &fs->first_pindex, &fs->prot, &fs->wired); if (result != KERN_SUCCESS) { unlock_vp(fs); return (result); } fs->map_generation = fs->map->timestamp; if (fs->entry->eflags & MAP_ENTRY_NOFAULT) { panic("%s: fault on nofault entry, addr: %#lx", __func__, (u_long)fs->vaddr); } if (fs->entry->eflags & MAP_ENTRY_IN_TRANSITION && fs->entry->wiring_thread != curthread) { vm_map_unlock_read(fs->map); vm_map_lock(fs->map); if (vm_map_lookup_entry(fs->map, fs->vaddr, &fs->entry) && (fs->entry->eflags & MAP_ENTRY_IN_TRANSITION)) { unlock_vp(fs); fs->entry->eflags |= MAP_ENTRY_NEEDS_WAKEUP; vm_map_unlock_and_wait(fs->map, 0); } else vm_map_unlock(fs->map); return (KERN_RESOURCE_SHORTAGE); } MPASS((fs->entry->eflags & MAP_ENTRY_GUARD) == 0); if (fs->wired) fs->fault_type = fs->prot | (fs->fault_type & VM_PROT_COPY); else KASSERT((fs->fault_flags & VM_FAULT_WIRE) == 0, ("!fs->wired && VM_FAULT_WIRE")); fs->lookup_still_valid = true; return (KERN_SUCCESS); } static int vm_fault_relookup(struct faultstate *fs) { vm_object_t retry_object; vm_pindex_t retry_pindex; vm_prot_t retry_prot; int result; if (!vm_map_trylock_read(fs->map)) return (KERN_RESTART); fs->lookup_still_valid = true; if (fs->map->timestamp == fs->map_generation) return (KERN_SUCCESS); result = vm_map_lookup_locked(&fs->map, fs->vaddr, fs->fault_type, &fs->entry, &retry_object, &retry_pindex, &retry_prot, &fs->wired); if (result != KERN_SUCCESS) { /* * If retry of map lookup would have blocked then * retry fault from start. */ if (result == KERN_FAILURE) return (KERN_RESTART); return (result); } if (retry_object != fs->first_object || retry_pindex != fs->first_pindex) return (KERN_RESTART); /* * Check whether the protection has changed or the object has * been copied while we left the map unlocked. Changing from * read to write permission is OK - we leave the page * write-protected, and catch the write fault. Changing from * write to read permission means that we can't mark the page * write-enabled after all. */ fs->prot &= retry_prot; fs->fault_type &= retry_prot; if (fs->prot == 0) return (KERN_RESTART); /* Reassert because wired may have changed. */ KASSERT(fs->wired || (fs->fault_flags & VM_FAULT_WIRE) == 0, ("!wired && VM_FAULT_WIRE")); return (KERN_SUCCESS); } static void vm_fault_cow(struct faultstate *fs) { bool is_first_object_locked; /* * This allows pages to be virtually copied from a backing_object * into the first_object, where the backing object has no other * refs to it, and cannot gain any more refs. Instead of a bcopy, * we just move the page from the backing object to the first * object. Note that we must mark the page dirty in the first * object so that it will go out to swap when needed. */ is_first_object_locked = false; if ( /* * Only one shadow object and no other refs. */ fs->object->shadow_count == 1 && fs->object->ref_count == 1 && /* * No other ways to look the object up */ fs->object->handle == NULL && (fs->object->flags & OBJ_ANON) != 0 && /* * We don't chase down the shadow chain and we can acquire locks. */ (is_first_object_locked = VM_OBJECT_TRYWLOCK(fs->first_object)) && fs->object == fs->first_object->backing_object && VM_OBJECT_TRYWLOCK(fs->object)) { /* * Remove but keep xbusy for replace. fs->m is moved into * fs->first_object and left busy while fs->first_m is * conditionally freed. */ vm_page_remove_xbusy(fs->m); vm_page_replace(fs->m, fs->first_object, fs->first_pindex, fs->first_m); vm_page_dirty(fs->m); #if VM_NRESERVLEVEL > 0 /* * Rename the reservation. */ vm_reserv_rename(fs->m, fs->first_object, fs->object, OFF_TO_IDX(fs->first_object->backing_object_offset)); #endif VM_OBJECT_WUNLOCK(fs->object); VM_OBJECT_WUNLOCK(fs->first_object); fs->first_m = fs->m; fs->m = NULL; VM_CNT_INC(v_cow_optim); } else { if (is_first_object_locked) VM_OBJECT_WUNLOCK(fs->first_object); /* * Oh, well, lets copy it. */ pmap_copy_page(fs->m, fs->first_m); vm_page_valid(fs->first_m); if (fs->wired && (fs->fault_flags & VM_FAULT_WIRE) == 0) { vm_page_wire(fs->first_m); vm_page_unwire(fs->m, PQ_INACTIVE); } /* * Save the cow page to be released after * pmap_enter is complete. */ fs->m_cow = fs->m; fs->m = NULL; } /* * fs->object != fs->first_object due to above * conditional */ vm_object_pip_wakeup(fs->object); /* * Only use the new page below... */ fs->object = fs->first_object; fs->pindex = fs->first_pindex; fs->m = fs->first_m; VM_CNT_INC(v_cow_faults); curthread->td_cow++; } static bool vm_fault_next(struct faultstate *fs) { vm_object_t next_object; /* * The requested page does not exist at this object/ * offset. Remove the invalid page from the object, * waking up anyone waiting for it, and continue on to * the next object. However, if this is the top-level * object, we must leave the busy page in place to * prevent another process from rushing past us, and * inserting the page in that object at the same time * that we are. */ if (fs->object == fs->first_object) { fs->first_m = fs->m; fs->m = NULL; } else fault_page_free(&fs->m); /* * Move on to the next object. Lock the next object before * unlocking the current one. */ VM_OBJECT_ASSERT_WLOCKED(fs->object); next_object = fs->object->backing_object; if (next_object == NULL) return (false); MPASS(fs->first_m != NULL); KASSERT(fs->object != next_object, ("object loop %p", next_object)); VM_OBJECT_WLOCK(next_object); vm_object_pip_add(next_object, 1); if (fs->object != fs->first_object) vm_object_pip_wakeup(fs->object); fs->pindex += OFF_TO_IDX(fs->object->backing_object_offset); VM_OBJECT_WUNLOCK(fs->object); fs->object = next_object; return (true); } static void vm_fault_zerofill(struct faultstate *fs) { /* * If there's no object left, fill the page in the top * object with zeros. */ if (fs->object != fs->first_object) { vm_object_pip_wakeup(fs->object); fs->object = fs->first_object; fs->pindex = fs->first_pindex; } MPASS(fs->first_m != NULL); MPASS(fs->m == NULL); fs->m = fs->first_m; fs->first_m = NULL; /* * Zero the page if necessary and mark it valid. */ if ((fs->m->flags & PG_ZERO) == 0) { pmap_zero_page(fs->m); } else { VM_CNT_INC(v_ozfod); } VM_CNT_INC(v_zfod); vm_page_valid(fs->m); } /* * Allocate a page directly or via the object populate method. */ static int vm_fault_allocate(struct faultstate *fs) { struct domainset *dset; int alloc_req; int rv; if ((fs->object->flags & OBJ_SIZEVNLOCK) != 0) { rv = vm_fault_lock_vnode(fs, true); MPASS(rv == KERN_SUCCESS || rv == KERN_RESOURCE_SHORTAGE); if (rv == KERN_RESOURCE_SHORTAGE) return (rv); } if (fs->pindex >= fs->object->size) return (KERN_OUT_OF_BOUNDS); if (fs->object == fs->first_object && (fs->first_object->flags & OBJ_POPULATE) != 0 && fs->first_object->shadow_count == 0) { rv = vm_fault_populate(fs); switch (rv) { case KERN_SUCCESS: case KERN_FAILURE: case KERN_RESTART: return (rv); case KERN_NOT_RECEIVER: /* * Pager's populate() method * returned VM_PAGER_BAD. */ break; default: panic("inconsistent return codes"); } } /* * Allocate a new page for this object/offset pair. * * Unlocked read of the p_flag is harmless. At worst, the P_KILLED * might be not observed there, and allocation can fail, causing * restart and new reading of the p_flag. */ dset = fs->object->domain.dr_policy; if (dset == NULL) dset = curthread->td_domain.dr_policy; if (!vm_page_count_severe_set(&dset->ds_mask) || P_KILLED(curproc)) { #if VM_NRESERVLEVEL > 0 vm_object_color(fs->object, atop(fs->vaddr) - fs->pindex); #endif alloc_req = P_KILLED(curproc) ? VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL; if (fs->object->type != OBJT_VNODE && fs->object->backing_object == NULL) alloc_req |= VM_ALLOC_ZERO; fs->m = vm_page_alloc(fs->object, fs->pindex, alloc_req); } if (fs->m == NULL) { unlock_and_deallocate(fs); if (vm_pfault_oom_attempts < 0 || fs->oom < vm_pfault_oom_attempts) { fs->oom++; vm_waitpfault(dset, vm_pfault_oom_wait * hz); } else { if (bootverbose) printf( "proc %d (%s) failed to alloc page on fault, starting OOM\n", curproc->p_pid, curproc->p_comm); vm_pageout_oom(VM_OOM_MEM_PF); fs->oom = 0; } return (KERN_RESOURCE_SHORTAGE); } fs->oom = 0; return (KERN_NOT_RECEIVER); } /* * Call the pager to retrieve the page if there is a chance * that the pager has it, and potentially retrieve additional * pages at the same time. */ static int vm_fault_getpages(struct faultstate *fs, int nera, int *behindp, int *aheadp) { vm_offset_t e_end, e_start; int ahead, behind, cluster_offset, rv; u_char behavior; /* * Prepare for unlocking the map. Save the map * entry's start and end addresses, which are used to * optimize the size of the pager operation below. * Even if the map entry's addresses change after * unlocking the map, using the saved addresses is * safe. */ e_start = fs->entry->start; e_end = fs->entry->end; behavior = vm_map_entry_behavior(fs->entry); /* * Release the map lock before locking the vnode or * sleeping in the pager. (If the current object has * a shadow, then an earlier iteration of this loop * may have already unlocked the map.) */ unlock_map(fs); rv = vm_fault_lock_vnode(fs, false); MPASS(rv == KERN_SUCCESS || rv == KERN_RESOURCE_SHORTAGE); if (rv == KERN_RESOURCE_SHORTAGE) return (rv); KASSERT(fs->vp == NULL || !fs->map->system_map, ("vm_fault: vnode-backed object mapped by system map")); /* * Page in the requested page and hint the pager, * that it may bring up surrounding pages. */ if (nera == -1 || behavior == MAP_ENTRY_BEHAV_RANDOM || P_KILLED(curproc)) { behind = 0; ahead = 0; } else { /* Is this a sequential fault? */ if (nera > 0) { behind = 0; ahead = nera; } else { /* * Request a cluster of pages that is * aligned to a VM_FAULT_READ_DEFAULT * page offset boundary within the * object. Alignment to a page offset * boundary is more likely to coincide * with the underlying file system * block than alignment to a virtual * address boundary. */ cluster_offset = fs->pindex % VM_FAULT_READ_DEFAULT; behind = ulmin(cluster_offset, atop(fs->vaddr - e_start)); ahead = VM_FAULT_READ_DEFAULT - 1 - cluster_offset; } ahead = ulmin(ahead, atop(e_end - fs->vaddr) - 1); } *behindp = behind; *aheadp = ahead; rv = vm_pager_get_pages(fs->object, &fs->m, 1, behindp, aheadp); if (rv == VM_PAGER_OK) return (KERN_SUCCESS); if (rv == VM_PAGER_ERROR) printf("vm_fault: pager read error, pid %d (%s)\n", curproc->p_pid, curproc->p_comm); /* * If an I/O error occurred or the requested page was * outside the range of the pager, clean up and return * an error. */ if (rv == VM_PAGER_ERROR || rv == VM_PAGER_BAD) return (KERN_OUT_OF_BOUNDS); return (KERN_NOT_RECEIVER); } /* * Wait/Retry if the page is busy. We have to do this if the page is * either exclusive or shared busy because the vm_pager may be using * read busy for pageouts (and even pageins if it is the vnode pager), * and we could end up trying to pagein and pageout the same page * simultaneously. * * We can theoretically allow the busy case on a read fault if the page * is marked valid, but since such pages are typically already pmap'd, * putting that special case in might be more effort then it is worth. * We cannot under any circumstances mess around with a shared busied * page except, perhaps, to pmap it. */ static void vm_fault_busy_sleep(struct faultstate *fs) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_aflag_set(fs->m, PGA_REFERENCED); if (fs->object != fs->first_object) { fault_page_release(&fs->first_m); vm_object_pip_wakeup(fs->first_object); } vm_object_pip_wakeup(fs->object); unlock_map(fs); if (fs->m == vm_page_lookup(fs->object, fs->pindex)) vm_page_busy_sleep(fs->m, "vmpfw", false); else VM_OBJECT_WUNLOCK(fs->object); VM_CNT_INC(v_intrans); vm_object_deallocate(fs->first_object); } int vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags, vm_page_t *m_hold) { struct faultstate fs; int ahead, behind, faultcount; int nera, result, rv; bool dead, hardfault; VM_CNT_INC(v_vm_faults); if ((curthread->td_pflags & TDP_NOFAULTING) != 0) return (KERN_PROTECTION_FAILURE); fs.vp = NULL; fs.vaddr = vaddr; fs.m_hold = m_hold; fs.fault_flags = fault_flags; fs.map = map; fs.lookup_still_valid = false; fs.oom = 0; faultcount = 0; nera = -1; hardfault = false; RetryFault: fs.fault_type = fault_type; /* * Find the backing store object and offset into it to begin the * search. */ result = vm_fault_lookup(&fs); if (result != KERN_SUCCESS) { if (result == KERN_RESOURCE_SHORTAGE) goto RetryFault; return (result); } /* * Try to avoid lock contention on the top-level object through * special-case handling of some types of page faults, specifically, * those that are mapping an existing page from the top-level object. * Under this condition, a read lock on the object suffices, allowing * multiple page faults of a similar type to run in parallel. */ if (fs.vp == NULL /* avoid locked vnode leak */ && (fs.fault_flags & (VM_FAULT_WIRE | VM_FAULT_DIRTY)) == 0) { VM_OBJECT_RLOCK(fs.first_object); rv = vm_fault_soft_fast(&fs); if (rv == KERN_SUCCESS) return (rv); if (!VM_OBJECT_TRYUPGRADE(fs.first_object)) { VM_OBJECT_RUNLOCK(fs.first_object); VM_OBJECT_WLOCK(fs.first_object); } } else { VM_OBJECT_WLOCK(fs.first_object); } /* * Make a reference to this object to prevent its disposal while we * are messing with it. Once we have the reference, the map is free * to be diddled. Since objects reference their shadows (and copies), * they will stay around as well. * * Bump the paging-in-progress count to prevent size changes (e.g. * truncation operations) during I/O. */ vm_object_reference_locked(fs.first_object); vm_object_pip_add(fs.first_object, 1); fs.m_cow = fs.m = fs.first_m = NULL; /* * Search for the page at object/offset. */ fs.object = fs.first_object; fs.pindex = fs.first_pindex; while (TRUE) { KASSERT(fs.m == NULL, ("page still set %p at loop start", fs.m)); /* * If the object is marked for imminent termination, * we retry here, since the collapse pass has raced * with us. Otherwise, if we see terminally dead * object, return fail. */ if ((fs.object->flags & OBJ_DEAD) != 0) { dead = fs.object->type == OBJT_DEAD; unlock_and_deallocate(&fs); if (dead) return (KERN_PROTECTION_FAILURE); pause("vmf_de", 1); goto RetryFault; } /* * See if page is resident */ fs.m = vm_page_lookup(fs.object, fs.pindex); if (fs.m != NULL) { if (vm_page_tryxbusy(fs.m) == 0) { vm_fault_busy_sleep(&fs); goto RetryFault; } /* * The page is marked busy for other processes and the * pagedaemon. If it still is completely valid we * are done. */ if (vm_page_all_valid(fs.m)) { VM_OBJECT_WUNLOCK(fs.object); break; /* break to PAGE HAS BEEN FOUND. */ } } VM_OBJECT_ASSERT_WLOCKED(fs.object); /* * Page is not resident. If the pager might contain the page * or this is the beginning of the search, allocate a new * page. (Default objects are zero-fill, so there is no real * pager for them.) */ if (fs.m == NULL && (fs.object->type != OBJT_DEFAULT || fs.object == fs.first_object)) { rv = vm_fault_allocate(&fs); switch (rv) { case KERN_RESTART: unlock_and_deallocate(&fs); /* FALLTHROUGH */ case KERN_RESOURCE_SHORTAGE: goto RetryFault; case KERN_SUCCESS: case KERN_FAILURE: case KERN_OUT_OF_BOUNDS: unlock_and_deallocate(&fs); return (rv); case KERN_NOT_RECEIVER: break; default: panic("vm_fault: Unhandled rv %d", rv); } } /* * Default objects have no pager so no exclusive busy exists * to protect this page in the chain. Skip to the next * object without dropping the lock to preserve atomicity of * shadow faults. */ if (fs.object->type != OBJT_DEFAULT) { /* * At this point, we have either allocated a new page * or found an existing page that is only partially * valid. * * We hold a reference on the current object and the * page is exclusive busied. The exclusive busy * prevents simultaneous faults and collapses while * the object lock is dropped. */ VM_OBJECT_WUNLOCK(fs.object); /* * If the pager for the current object might have * the page, then determine the number of additional * pages to read and potentially reprioritize * previously read pages for earlier reclamation. * These operations should only be performed once per * page fault. Even if the current pager doesn't * have the page, the number of additional pages to * read will apply to subsequent objects in the * shadow chain. */ if (nera == -1 && !P_KILLED(curproc)) nera = vm_fault_readahead(&fs); rv = vm_fault_getpages(&fs, nera, &behind, &ahead); if (rv == KERN_SUCCESS) { faultcount = behind + 1 + ahead; hardfault = true; break; /* break to PAGE HAS BEEN FOUND. */ } if (rv == KERN_RESOURCE_SHORTAGE) goto RetryFault; VM_OBJECT_WLOCK(fs.object); if (rv == KERN_OUT_OF_BOUNDS) { fault_page_free(&fs.m); unlock_and_deallocate(&fs); return (rv); } } /* * The page was not found in the current object. Try to * traverse into a backing object or zero fill if none is * found. */ if (vm_fault_next(&fs)) continue; VM_OBJECT_WUNLOCK(fs.object); vm_fault_zerofill(&fs); /* Don't try to prefault neighboring pages. */ faultcount = 1; break; /* break to PAGE HAS BEEN FOUND. */ } /* * PAGE HAS BEEN FOUND. A valid page has been found and exclusively * busied. The object lock must no longer be held. */ vm_page_assert_xbusied(fs.m); VM_OBJECT_ASSERT_UNLOCKED(fs.object); /* * If the page is being written, but isn't already owned by the * top-level object, we have to copy it into a new page owned by the * top-level object. */ if (fs.object != fs.first_object) { /* * We only really need to copy if we want to write it. */ if ((fs.fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { vm_fault_cow(&fs); /* * We only try to prefault read-only mappings to the * neighboring pages when this copy-on-write fault is * a hard fault. In other cases, trying to prefault * is typically wasted effort. */ if (faultcount == 0) faultcount = 1; } else { fs.prot &= ~VM_PROT_WRITE; } } /* * We must verify that the maps have not changed since our last * lookup. */ if (!fs.lookup_still_valid) { result = vm_fault_relookup(&fs); if (result != KERN_SUCCESS) { fault_deallocate(&fs); if (result == KERN_RESTART) goto RetryFault; return (result); } } VM_OBJECT_ASSERT_UNLOCKED(fs.object); /* * If the page was filled by a pager, save the virtual address that * should be faulted on next under a sequential access pattern to the * map entry. A read lock on the map suffices to update this address * safely. */ if (hardfault) fs.entry->next_read = vaddr + ptoa(ahead) + PAGE_SIZE; /* * Page must be completely valid or it is not fit to * map into user space. vm_pager_get_pages() ensures this. */ vm_page_assert_xbusied(fs.m); KASSERT(vm_page_all_valid(fs.m), ("vm_fault: page %p partially invalid", fs.m)); vm_fault_dirty(&fs, fs.m); /* * Put this page into the physical map. We had to do the unlock above * because pmap_enter() may sleep. We don't put the page * back on the active queue until later so that the pageout daemon * won't find it (yet). */ pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, fs.fault_type | (fs.wired ? PMAP_ENTER_WIRED : 0), 0); if (faultcount != 1 && (fs.fault_flags & VM_FAULT_WIRE) == 0 && fs.wired == 0) vm_fault_prefault(&fs, vaddr, faultcount > 0 ? behind : PFBAK, faultcount > 0 ? ahead : PFFOR, false); /* * If the page is not wired down, then put it where the pageout daemon * can find it. */ if ((fs.fault_flags & VM_FAULT_WIRE) != 0) vm_page_wire(fs.m); else vm_page_activate(fs.m); if (fs.m_hold != NULL) { (*fs.m_hold) = fs.m; vm_page_wire(fs.m); } vm_page_xunbusy(fs.m); fs.m = NULL; /* * Unlock everything, and return */ fault_deallocate(&fs); if (hardfault) { VM_CNT_INC(v_io_faults); curthread->td_ru.ru_majflt++; #ifdef RACCT if (racct_enable && fs.object->type == OBJT_VNODE) { PROC_LOCK(curproc); if ((fs.fault_type & (VM_PROT_COPY | VM_PROT_WRITE)) != 0) { racct_add_force(curproc, RACCT_WRITEBPS, PAGE_SIZE + behind * PAGE_SIZE); racct_add_force(curproc, RACCT_WRITEIOPS, 1); } else { racct_add_force(curproc, RACCT_READBPS, PAGE_SIZE + ahead * PAGE_SIZE); racct_add_force(curproc, RACCT_READIOPS, 1); } PROC_UNLOCK(curproc); } #endif } else curthread->td_ru.ru_minflt++; return (KERN_SUCCESS); } /* * Speed up the reclamation of pages that precede the faulting pindex within * the first object of the shadow chain. Essentially, perform the equivalent * to madvise(..., MADV_DONTNEED) on a large cluster of pages that precedes * the faulting pindex by the cluster size when the pages read by vm_fault() * cross a cluster-size boundary. The cluster size is the greater of the * smallest superpage size and VM_FAULT_DONTNEED_MIN. * * When "fs->first_object" is a shadow object, the pages in the backing object * that precede the faulting pindex are deactivated by vm_fault(). So, this * function must only be concerned with pages in the first object. */ static void vm_fault_dontneed(const struct faultstate *fs, vm_offset_t vaddr, int ahead) { vm_map_entry_t entry; vm_object_t first_object, object; vm_offset_t end, start; vm_page_t m, m_next; vm_pindex_t pend, pstart; vm_size_t size; object = fs->object; VM_OBJECT_ASSERT_UNLOCKED(object); first_object = fs->first_object; /* Neither fictitious nor unmanaged pages can be reclaimed. */ if ((first_object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0) { VM_OBJECT_RLOCK(first_object); size = VM_FAULT_DONTNEED_MIN; if (MAXPAGESIZES > 1 && size < pagesizes[1]) size = pagesizes[1]; end = rounddown2(vaddr, size); if (vaddr - end >= size - PAGE_SIZE - ptoa(ahead) && (entry = fs->entry)->start < end) { if (end - entry->start < size) start = entry->start; else start = end - size; pmap_advise(fs->map->pmap, start, end, MADV_DONTNEED); pstart = OFF_TO_IDX(entry->offset) + atop(start - entry->start); m_next = vm_page_find_least(first_object, pstart); pend = OFF_TO_IDX(entry->offset) + atop(end - entry->start); while ((m = m_next) != NULL && m->pindex < pend) { m_next = TAILQ_NEXT(m, listq); if (!vm_page_all_valid(m) || vm_page_busied(m)) continue; /* * Don't clear PGA_REFERENCED, since it would * likely represent a reference by a different * process. * * Typically, at this point, prefetched pages * are still in the inactive queue. Only * pages that triggered page faults are in the * active queue. The test for whether the page * is in the inactive queue is racy; in the * worst case we will requeue the page * unnecessarily. */ if (!vm_page_inactive(m)) vm_page_deactivate(m); } } VM_OBJECT_RUNLOCK(first_object); } } /* * vm_fault_prefault provides a quick way of clustering * pagefaults into a processes address space. It is a "cousin" * of vm_map_pmap_enter, except it runs at page fault time instead * of mmap time. */ static void vm_fault_prefault(const struct faultstate *fs, vm_offset_t addra, int backward, int forward, bool obj_locked) { pmap_t pmap; vm_map_entry_t entry; vm_object_t backing_object, lobject; vm_offset_t addr, starta; vm_pindex_t pindex; vm_page_t m; int i; pmap = fs->map->pmap; if (pmap != vmspace_pmap(curthread->td_proc->p_vmspace)) return; entry = fs->entry; if (addra < backward * PAGE_SIZE) { starta = entry->start; } else { starta = addra - backward * PAGE_SIZE; if (starta < entry->start) starta = entry->start; } /* * Generate the sequence of virtual addresses that are candidates for * prefaulting in an outward spiral from the faulting virtual address, * "addra". Specifically, the sequence is "addra - PAGE_SIZE", "addra * + PAGE_SIZE", "addra - 2 * PAGE_SIZE", "addra + 2 * PAGE_SIZE", ... * If the candidate address doesn't have a backing physical page, then * the loop immediately terminates. */ for (i = 0; i < 2 * imax(backward, forward); i++) { addr = addra + ((i >> 1) + 1) * ((i & 1) == 0 ? -PAGE_SIZE : PAGE_SIZE); if (addr > addra + forward * PAGE_SIZE) addr = 0; if (addr < starta || addr >= entry->end) continue; if (!pmap_is_prefaultable(pmap, addr)) continue; pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT; lobject = entry->object.vm_object; if (!obj_locked) VM_OBJECT_RLOCK(lobject); while ((m = vm_page_lookup(lobject, pindex)) == NULL && lobject->type == OBJT_DEFAULT && (backing_object = lobject->backing_object) != NULL) { KASSERT((lobject->backing_object_offset & PAGE_MASK) == 0, ("vm_fault_prefault: unaligned object offset")); pindex += lobject->backing_object_offset >> PAGE_SHIFT; VM_OBJECT_RLOCK(backing_object); if (!obj_locked || lobject != entry->object.vm_object) VM_OBJECT_RUNLOCK(lobject); lobject = backing_object; } if (m == NULL) { if (!obj_locked || lobject != entry->object.vm_object) VM_OBJECT_RUNLOCK(lobject); break; } if (vm_page_all_valid(m) && (m->flags & PG_FICTITIOUS) == 0) pmap_enter_quick(pmap, addr, m, entry->protection); if (!obj_locked || lobject != entry->object.vm_object) VM_OBJECT_RUNLOCK(lobject); } } /* * Hold each of the physical pages that are mapped by the specified range of * virtual addresses, ["addr", "addr" + "len"), if those mappings are valid * and allow the specified types of access, "prot". If all of the implied * pages are successfully held, then the number of held pages is returned * together with pointers to those pages in the array "ma". However, if any * of the pages cannot be held, -1 is returned. */ int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len, vm_prot_t prot, vm_page_t *ma, int max_count) { vm_offset_t end, va; vm_page_t *mp; int count; boolean_t pmap_failed; if (len == 0) return (0); end = round_page(addr + len); addr = trunc_page(addr); if (!vm_map_range_valid(map, addr, end)) return (-1); if (atop(end - addr) > max_count) panic("vm_fault_quick_hold_pages: count > max_count"); count = atop(end - addr); /* * Most likely, the physical pages are resident in the pmap, so it is * faster to try pmap_extract_and_hold() first. */ pmap_failed = FALSE; for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) { *mp = pmap_extract_and_hold(map->pmap, va, prot); if (*mp == NULL) pmap_failed = TRUE; else if ((prot & VM_PROT_WRITE) != 0 && (*mp)->dirty != VM_PAGE_BITS_ALL) { /* * Explicitly dirty the physical page. Otherwise, the * caller's changes may go unnoticed because they are * performed through an unmanaged mapping or by a DMA * operation. * * The object lock is not held here. * See vm_page_clear_dirty_mask(). */ vm_page_dirty(*mp); } } if (pmap_failed) { /* * One or more pages could not be held by the pmap. Either no * page was mapped at the specified virtual address or that * mapping had insufficient permissions. Attempt to fault in * and hold these pages. * * If vm_fault_disable_pagefaults() was called, * i.e., TDP_NOFAULTING is set, we must not sleep nor * acquire MD VM locks, which means we must not call * vm_fault(). Some (out of tree) callers mark * too wide a code area with vm_fault_disable_pagefaults() * already, use the VM_PROT_QUICK_NOFAULT flag to request * the proper behaviour explicitly. */ if ((prot & VM_PROT_QUICK_NOFAULT) != 0 && (curthread->td_pflags & TDP_NOFAULTING) != 0) goto error; for (mp = ma, va = addr; va < end; mp++, va += PAGE_SIZE) if (*mp == NULL && vm_fault(map, va, prot, VM_FAULT_NORMAL, mp) != KERN_SUCCESS) goto error; } return (count); error: for (mp = ma; mp < ma + count; mp++) if (*mp != NULL) vm_page_unwire(*mp, PQ_INACTIVE); return (-1); } /* * Routine: * vm_fault_copy_entry * Function: * Create new shadow object backing dst_entry with private copy of * all underlying pages. When src_entry is equal to dst_entry, * function implements COW for wired-down map entry. Otherwise, * it forks wired entry into dst_map. * * In/out conditions: * The source and destination maps must be locked for write. * The source map entry must be wired down (or be a sharing map * entry corresponding to a main map entry that is wired down). */ void vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, vm_map_entry_t dst_entry, vm_map_entry_t src_entry, vm_ooffset_t *fork_charge) { vm_object_t backing_object, dst_object, object, src_object; vm_pindex_t dst_pindex, pindex, src_pindex; vm_prot_t access, prot; vm_offset_t vaddr; vm_page_t dst_m; vm_page_t src_m; boolean_t upgrade; #ifdef lint src_map++; #endif /* lint */ upgrade = src_entry == dst_entry; access = prot = dst_entry->protection; src_object = src_entry->object.vm_object; src_pindex = OFF_TO_IDX(src_entry->offset); if (upgrade && (dst_entry->eflags & MAP_ENTRY_NEEDS_COPY) == 0) { dst_object = src_object; vm_object_reference(dst_object); } else { /* * Create the top-level object for the destination entry. * Doesn't actually shadow anything - we copy the pages * directly. */ dst_object = vm_object_allocate_anon(atop(dst_entry->end - dst_entry->start), NULL, NULL, 0); #if VM_NRESERVLEVEL > 0 dst_object->flags |= OBJ_COLORED; dst_object->pg_color = atop(dst_entry->start); #endif dst_object->domain = src_object->domain; dst_object->charge = dst_entry->end - dst_entry->start; } VM_OBJECT_WLOCK(dst_object); KASSERT(upgrade || dst_entry->object.vm_object == NULL, ("vm_fault_copy_entry: vm_object not NULL")); if (src_object != dst_object) { dst_entry->object.vm_object = dst_object; dst_entry->offset = 0; dst_entry->eflags &= ~MAP_ENTRY_VN_EXEC; } if (fork_charge != NULL) { KASSERT(dst_entry->cred == NULL, ("vm_fault_copy_entry: leaked swp charge")); dst_object->cred = curthread->td_ucred; crhold(dst_object->cred); *fork_charge += dst_object->charge; } else if ((dst_object->type == OBJT_DEFAULT || dst_object->type == OBJT_SWAP) && dst_object->cred == NULL) { KASSERT(dst_entry->cred != NULL, ("no cred for entry %p", dst_entry)); dst_object->cred = dst_entry->cred; dst_entry->cred = NULL; } /* * If not an upgrade, then enter the mappings in the pmap as * read and/or execute accesses. Otherwise, enter them as * write accesses. * * A writeable large page mapping is only created if all of * the constituent small page mappings are modified. Marking * PTEs as modified on inception allows promotion to happen * without taking potentially large number of soft faults. */ if (!upgrade) access &= ~VM_PROT_WRITE; /* * Loop through all of the virtual pages within the entry's * range, copying each page from the source object to the * destination object. Since the source is wired, those pages * must exist. In contrast, the destination is pageable. * Since the destination object doesn't share any backing storage * with the source object, all of its pages must be dirtied, * regardless of whether they can be written. */ for (vaddr = dst_entry->start, dst_pindex = 0; vaddr < dst_entry->end; vaddr += PAGE_SIZE, dst_pindex++) { again: /* * Find the page in the source object, and copy it in. * Because the source is wired down, the page will be * in memory. */ if (src_object != dst_object) VM_OBJECT_RLOCK(src_object); object = src_object; pindex = src_pindex + dst_pindex; while ((src_m = vm_page_lookup(object, pindex)) == NULL && (backing_object = object->backing_object) != NULL) { /* * Unless the source mapping is read-only or * it is presently being upgraded from * read-only, the first object in the shadow * chain should provide all of the pages. In * other words, this loop body should never be * executed when the source mapping is already * read/write. */ KASSERT((src_entry->protection & VM_PROT_WRITE) == 0 || upgrade, ("vm_fault_copy_entry: main object missing page")); VM_OBJECT_RLOCK(backing_object); pindex += OFF_TO_IDX(object->backing_object_offset); if (object != dst_object) VM_OBJECT_RUNLOCK(object); object = backing_object; } KASSERT(src_m != NULL, ("vm_fault_copy_entry: page missing")); if (object != dst_object) { /* * Allocate a page in the destination object. */ dst_m = vm_page_alloc(dst_object, (src_object == dst_object ? src_pindex : 0) + dst_pindex, VM_ALLOC_NORMAL); if (dst_m == NULL) { VM_OBJECT_WUNLOCK(dst_object); VM_OBJECT_RUNLOCK(object); vm_wait(dst_object); VM_OBJECT_WLOCK(dst_object); goto again; } pmap_copy_page(src_m, dst_m); VM_OBJECT_RUNLOCK(object); dst_m->dirty = dst_m->valid = src_m->valid; } else { dst_m = src_m; if (vm_page_busy_acquire(dst_m, VM_ALLOC_WAITFAIL) == 0) goto again; if (dst_m->pindex >= dst_object->size) { /* * We are upgrading. Index can occur * out of bounds if the object type is * vnode and the file was truncated. */ vm_page_xunbusy(dst_m); break; } } VM_OBJECT_WUNLOCK(dst_object); /* * Enter it in the pmap. If a wired, copy-on-write * mapping is being replaced by a write-enabled * mapping, then wire that new mapping. * * The page can be invalid if the user called * msync(MS_INVALIDATE) or truncated the backing vnode * or shared memory object. In this case, do not * insert it into pmap, but still do the copy so that * all copies of the wired map entry have similar * backing pages. */ if (vm_page_all_valid(dst_m)) { pmap_enter(dst_map->pmap, vaddr, dst_m, prot, access | (upgrade ? PMAP_ENTER_WIRED : 0), 0); } /* * Mark it no longer busy, and put it on the active list. */ VM_OBJECT_WLOCK(dst_object); if (upgrade) { if (src_m != dst_m) { vm_page_unwire(src_m, PQ_INACTIVE); vm_page_wire(dst_m); } else { KASSERT(vm_page_wired(dst_m), ("dst_m %p is not wired", dst_m)); } } else { vm_page_activate(dst_m); } vm_page_xunbusy(dst_m); } VM_OBJECT_WUNLOCK(dst_object); if (upgrade) { dst_entry->eflags &= ~(MAP_ENTRY_COW | MAP_ENTRY_NEEDS_COPY); vm_object_deallocate(src_object); } } /* * Block entry into the machine-independent layer's page fault handler by * the calling thread. Subsequent calls to vm_fault() by that thread will * return KERN_PROTECTION_FAILURE. Enable machine-dependent handling of * spurious page faults. */ int vm_fault_disable_pagefaults(void) { return (curthread_pflags_set(TDP_NOFAULTING | TDP_RESETSPUR)); } void vm_fault_enable_pagefaults(int save) { curthread_pflags_restore(save); } Index: projects/clang1100-import/sys/vm/vm_mmap.c =================================================================== --- projects/clang1100-import/sys/vm/vm_mmap.c (revision 364278) +++ projects/clang1100-import/sys/vm/vm_mmap.c (revision 364279) @@ -1,1671 +1,1671 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1988 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ * * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 */ /* * Mapped file (mmap) interface to VM */ #include __FBSDID("$FreeBSD$"); #include "opt_hwpmc_hooks.h" #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__amd64__) || defined(__i386__) /* for i386_read_exec */ #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HWPMC_HOOKS #include #endif int old_mlock = 0; SYSCTL_INT(_vm, OID_AUTO, old_mlock, CTLFLAG_RWTUN, &old_mlock, 0, "Do not apply RLIMIT_MEMLOCK on mlockall"); static int mincore_mapped = 1; SYSCTL_INT(_vm, OID_AUTO, mincore_mapped, CTLFLAG_RWTUN, &mincore_mapped, 0, "mincore reports mappings, not residency"); static int imply_prot_max = 0; SYSCTL_INT(_vm, OID_AUTO, imply_prot_max, CTLFLAG_RWTUN, &imply_prot_max, 0, "Imply maximum page protections in mmap() when none are specified"); #ifdef MAP_32BIT #define MAP_32BIT_MAX_ADDR ((vm_offset_t)1 << 31) #endif #ifndef _SYS_SYSPROTO_H_ struct sbrk_args { int incr; }; #endif int sys_sbrk(struct thread *td, struct sbrk_args *uap) { /* Not yet implemented */ return (EOPNOTSUPP); } #ifndef _SYS_SYSPROTO_H_ struct sstk_args { int incr; }; #endif int sys_sstk(struct thread *td, struct sstk_args *uap) { /* Not yet implemented */ return (EOPNOTSUPP); } #if defined(COMPAT_43) int ogetpagesize(struct thread *td, struct ogetpagesize_args *uap) { td->td_retval[0] = PAGE_SIZE; return (0); } #endif /* COMPAT_43 */ /* * Memory Map (mmap) system call. Note that the file offset * and address are allowed to be NOT page aligned, though if * the MAP_FIXED flag it set, both must have the same remainder * modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not * page-aligned, the actual mapping starts at trunc_page(addr) * and the return value is adjusted up by the page offset. * * Generally speaking, only character devices which are themselves * memory-based, such as a video framebuffer, can be mmap'd. Otherwise * there would be no cache coherency between a descriptor and a VM mapping * both to the same character device. */ #ifndef _SYS_SYSPROTO_H_ struct mmap_args { void *addr; size_t len; int prot; int flags; int fd; long pad; off_t pos; }; #endif int sys_mmap(struct thread *td, struct mmap_args *uap) { return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot, uap->flags, uap->fd, uap->pos)); } int kern_mmap_maxprot(struct proc *p, int prot) { if ((p->p_flag2 & P2_PROTMAX_DISABLE) != 0 || (p->p_fctl0 & NT_FREEBSD_FCTL_PROTMAX_DISABLE) != 0) return (_PROT_ALL); if (((p->p_flag2 & P2_PROTMAX_ENABLE) != 0 || imply_prot_max) && prot != PROT_NONE) return (prot); return (_PROT_ALL); } int kern_mmap(struct thread *td, uintptr_t addr0, size_t len, int prot, int flags, int fd, off_t pos) { struct mmap_req mr = { .mr_hint = addr0, .mr_len = len, .mr_prot = prot, .mr_flags = flags, .mr_fd = fd, .mr_pos = pos }; return (kern_mmap_req(td, &mr)); } int kern_mmap_req(struct thread *td, const struct mmap_req *mrp) { struct vmspace *vms; struct file *fp; struct proc *p; off_t pos; vm_offset_t addr; vm_size_t len, pageoff, size; vm_prot_t cap_maxprot; int align, error, fd, flags, max_prot, prot; cap_rights_t rights; mmap_check_fp_fn check_fp_fn; addr = mrp->mr_hint; len = mrp->mr_len; prot = mrp->mr_prot; flags = mrp->mr_flags; fd = mrp->mr_fd; pos = mrp->mr_pos; check_fp_fn = mrp->mr_check_fp_fn; if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0) return (EINVAL); max_prot = PROT_MAX_EXTRACT(prot); prot = PROT_EXTRACT(prot); if (max_prot != 0 && (max_prot & prot) != prot) return (ENOTSUP); p = td->td_proc; /* * Always honor PROT_MAX if set. If not, default to all * permissions unless we're implying maximum permissions. */ if (max_prot == 0) max_prot = kern_mmap_maxprot(p, prot); vms = p->p_vmspace; fp = NULL; AUDIT_ARG_FD(fd); /* * Ignore old flags that used to be defined but did not do anything. */ flags &= ~(MAP_RESERVED0020 | MAP_RESERVED0040); /* * Enforce the constraints. * Mapping of length 0 is only allowed for old binaries. * Anonymous mapping shall specify -1 as filedescriptor and * zero position for new code. Be nice to ancient a.out * binaries and correct pos for anonymous mapping, since old * ld.so sometimes issues anonymous map requests with non-zero * pos. */ if (!SV_CURPROC_FLAG(SV_AOUT)) { if ((len == 0 && p->p_osrel >= P_OSREL_MAP_ANON) || ((flags & MAP_ANON) != 0 && (fd != -1 || pos != 0))) return (EINVAL); } else { if ((flags & MAP_ANON) != 0) pos = 0; } if (flags & MAP_STACK) { if ((fd != -1) || ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE))) return (EINVAL); flags |= MAP_ANON; pos = 0; } if ((flags & ~(MAP_SHARED | MAP_PRIVATE | MAP_FIXED | MAP_HASSEMAPHORE | MAP_STACK | MAP_NOSYNC | MAP_ANON | MAP_EXCL | MAP_NOCORE | MAP_PREFAULT_READ | MAP_GUARD | #ifdef MAP_32BIT MAP_32BIT | #endif MAP_ALIGNMENT_MASK)) != 0) return (EINVAL); if ((flags & (MAP_EXCL | MAP_FIXED)) == MAP_EXCL) return (EINVAL); if ((flags & (MAP_SHARED | MAP_PRIVATE)) == (MAP_SHARED | MAP_PRIVATE)) return (EINVAL); if (prot != PROT_NONE && (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) != 0) return (EINVAL); if ((flags & MAP_GUARD) != 0 && (prot != PROT_NONE || fd != -1 || pos != 0 || (flags & ~(MAP_FIXED | MAP_GUARD | MAP_EXCL | #ifdef MAP_32BIT MAP_32BIT | #endif MAP_ALIGNMENT_MASK)) != 0)) return (EINVAL); /* * Align the file position to a page boundary, * and save its page offset component. */ pageoff = (pos & PAGE_MASK); pos -= pageoff; /* Compute size from len by rounding (on both ends). */ size = len + pageoff; /* low end... */ size = round_page(size); /* hi end */ /* Check for rounding up to zero. */ if (len > size) return (ENOMEM); /* Ensure alignment is at least a page and fits in a pointer. */ align = flags & MAP_ALIGNMENT_MASK; if (align != 0 && align != MAP_ALIGNED_SUPER && (align >> MAP_ALIGNMENT_SHIFT >= sizeof(void *) * NBBY || align >> MAP_ALIGNMENT_SHIFT < PAGE_SHIFT)) return (EINVAL); /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ if (flags & MAP_FIXED) { /* * The specified address must have the same remainder * as the file offset taken modulo PAGE_SIZE, so it * should be aligned after adjustment by pageoff. */ addr -= pageoff; if (addr & PAGE_MASK) return (EINVAL); /* Address range must be all in user VM space. */ if (!vm_map_range_valid(&vms->vm_map, addr, addr + size)) return (EINVAL); #ifdef MAP_32BIT if (flags & MAP_32BIT && addr + size > MAP_32BIT_MAX_ADDR) return (EINVAL); } else if (flags & MAP_32BIT) { /* * For MAP_32BIT, override the hint if it is too high and * do not bother moving the mapping past the heap (since * the heap is usually above 2GB). */ if (addr + size > MAP_32BIT_MAX_ADDR) addr = 0; #endif } else { /* * XXX for non-fixed mappings where no hint is provided or * the hint would fall in the potential heap space, * place it after the end of the largest possible heap. * * There should really be a pmap call to determine a reasonable * location. */ if (addr == 0 || (addr >= round_page((vm_offset_t)vms->vm_taddr) && addr < round_page((vm_offset_t)vms->vm_daddr + lim_max(td, RLIMIT_DATA)))) addr = round_page((vm_offset_t)vms->vm_daddr + lim_max(td, RLIMIT_DATA)); } if (len == 0) { /* * Return success without mapping anything for old * binaries that request a page-aligned mapping of * length 0. For modern binaries, this function * returns an error earlier. */ error = 0; } else if ((flags & MAP_GUARD) != 0) { error = vm_mmap_object(&vms->vm_map, &addr, size, VM_PROT_NONE, VM_PROT_NONE, flags, NULL, pos, FALSE, td); } else if ((flags & MAP_ANON) != 0) { /* * Mapping blank space is trivial. * * This relies on VM_PROT_* matching PROT_*. */ error = vm_mmap_object(&vms->vm_map, &addr, size, prot, max_prot, flags, NULL, pos, FALSE, td); } else { /* * Mapping file, get fp for validation and don't let the * descriptor disappear on us if we block. Check capability * rights, but also return the maximum rights to be combined * with maxprot later. */ cap_rights_init_one(&rights, CAP_MMAP); if (prot & PROT_READ) cap_rights_set_one(&rights, CAP_MMAP_R); if ((flags & MAP_SHARED) != 0) { if (prot & PROT_WRITE) cap_rights_set_one(&rights, CAP_MMAP_W); } if (prot & PROT_EXEC) cap_rights_set_one(&rights, CAP_MMAP_X); error = fget_mmap(td, fd, &rights, &cap_maxprot, &fp); if (error != 0) goto done; if ((flags & (MAP_SHARED | MAP_PRIVATE)) == 0 && p->p_osrel >= P_OSREL_MAP_FSTRICT) { error = EINVAL; goto done; } if (check_fp_fn != NULL) { error = check_fp_fn(fp, prot, max_prot & cap_maxprot, flags); if (error != 0) goto done; } /* This relies on VM_PROT_* matching PROT_*. */ error = fo_mmap(fp, &vms->vm_map, &addr, size, prot, max_prot & cap_maxprot, flags, pos, td); } if (error == 0) td->td_retval[0] = (register_t) (addr + pageoff); done: if (fp) fdrop(fp, td); return (error); } #if defined(COMPAT_FREEBSD6) int freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap) { return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, uap->prot, uap->flags, uap->fd, uap->pos)); } #endif #ifdef COMPAT_43 #ifndef _SYS_SYSPROTO_H_ struct ommap_args { caddr_t addr; int len; int prot; int flags; int fd; long pos; }; #endif int ommap(struct thread *td, struct ommap_args *uap) { static const char cvtbsdprot[8] = { 0, PROT_EXEC, PROT_WRITE, PROT_EXEC | PROT_WRITE, PROT_READ, PROT_EXEC | PROT_READ, PROT_WRITE | PROT_READ, PROT_EXEC | PROT_WRITE | PROT_READ, }; int flags, prot; #define OMAP_ANON 0x0002 #define OMAP_COPY 0x0020 #define OMAP_SHARED 0x0010 #define OMAP_FIXED 0x0100 prot = cvtbsdprot[uap->prot & 0x7]; #if (defined(COMPAT_FREEBSD32) && defined(__amd64__)) || defined(__i386__) if (i386_read_exec && SV_PROC_FLAG(td->td_proc, SV_ILP32) && prot != 0) prot |= PROT_EXEC; #endif flags = 0; if (uap->flags & OMAP_ANON) flags |= MAP_ANON; if (uap->flags & OMAP_COPY) flags |= MAP_COPY; if (uap->flags & OMAP_SHARED) flags |= MAP_SHARED; else flags |= MAP_PRIVATE; if (uap->flags & OMAP_FIXED) flags |= MAP_FIXED; return (kern_mmap(td, (uintptr_t)uap->addr, uap->len, prot, flags, uap->fd, uap->pos)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct msync_args { void *addr; size_t len; int flags; }; #endif int sys_msync(struct thread *td, struct msync_args *uap) { return (kern_msync(td, (uintptr_t)uap->addr, uap->len, uap->flags)); } int kern_msync(struct thread *td, uintptr_t addr0, size_t size, int flags) { vm_offset_t addr; vm_size_t pageoff; vm_map_t map; int rv; addr = addr0; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return (EINVAL); if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) return (EINVAL); map = &td->td_proc->p_vmspace->vm_map; /* * Clean the pages and interpret the return value. */ rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0, (flags & MS_INVALIDATE) != 0); switch (rv) { case KERN_SUCCESS: return (0); case KERN_INVALID_ADDRESS: return (ENOMEM); case KERN_INVALID_ARGUMENT: return (EBUSY); case KERN_FAILURE: return (EIO); default: return (EINVAL); } } #ifndef _SYS_SYSPROTO_H_ struct munmap_args { void *addr; size_t len; }; #endif int sys_munmap(struct thread *td, struct munmap_args *uap) { return (kern_munmap(td, (uintptr_t)uap->addr, uap->len)); } int kern_munmap(struct thread *td, uintptr_t addr0, size_t size) { #ifdef HWPMC_HOOKS struct pmckern_map_out pkm; vm_map_entry_t entry; bool pmc_handled; #endif vm_offset_t addr, end; vm_size_t pageoff; vm_map_t map; if (size == 0) return (EINVAL); addr = addr0; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); end = addr + size; map = &td->td_proc->p_vmspace->vm_map; if (!vm_map_range_valid(map, addr, end)) return (EINVAL); vm_map_lock(map); #ifdef HWPMC_HOOKS pmc_handled = false; if (PMC_HOOK_INSTALLED(PMC_FN_MUNMAP)) { pmc_handled = true; /* * Inform hwpmc if the address range being unmapped contains * an executable region. */ pkm.pm_address = (uintptr_t) NULL; if (vm_map_lookup_entry(map, addr, &entry)) { for (; entry->start < end; entry = vm_map_entry_succ(entry)) { if (vm_map_check_protection(map, entry->start, entry->end, VM_PROT_EXECUTE) == TRUE) { pkm.pm_address = (uintptr_t) addr; pkm.pm_size = (size_t) size; break; } } } } #endif vm_map_delete(map, addr, end); #ifdef HWPMC_HOOKS if (__predict_false(pmc_handled)) { /* downgrade the lock to prevent a LOR with the pmc-sx lock */ vm_map_lock_downgrade(map); if (pkm.pm_address != (uintptr_t) NULL) PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm); vm_map_unlock_read(map); } else #endif vm_map_unlock(map); /* vm_map_delete returns nothing but KERN_SUCCESS anyway */ return (0); } #ifndef _SYS_SYSPROTO_H_ struct mprotect_args { const void *addr; size_t len; int prot; }; #endif int sys_mprotect(struct thread *td, struct mprotect_args *uap) { return (kern_mprotect(td, (uintptr_t)uap->addr, uap->len, uap->prot)); } int kern_mprotect(struct thread *td, uintptr_t addr0, size_t size, int prot) { vm_offset_t addr; vm_size_t pageoff; int vm_error, max_prot; addr = addr0; if ((prot & ~(_PROT_ALL | PROT_MAX(_PROT_ALL))) != 0) return (EINVAL); max_prot = PROT_MAX_EXTRACT(prot); prot = PROT_EXTRACT(prot); pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { if (((addr + size) & 0xffffffff) < addr) return (EINVAL); } else #endif if (addr + size < addr) return (EINVAL); vm_error = KERN_SUCCESS; if (max_prot != 0) { if ((max_prot & prot) != prot) return (ENOTSUP); vm_error = vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr, addr + size, max_prot, TRUE); } if (vm_error == KERN_SUCCESS) vm_error = vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr, addr + size, prot, FALSE); switch (vm_error) { case KERN_SUCCESS: return (0); case KERN_PROTECTION_FAILURE: return (EACCES); case KERN_RESOURCE_SHORTAGE: return (ENOMEM); } return (EINVAL); } #ifndef _SYS_SYSPROTO_H_ struct minherit_args { void *addr; size_t len; int inherit; }; #endif int sys_minherit(struct thread *td, struct minherit_args *uap) { return (kern_minherit(td, (uintptr_t)uap->addr, uap->len, uap->inherit)); } int kern_minherit(struct thread *td, uintptr_t addr0, size_t len, int inherit0) { vm_offset_t addr; vm_size_t size, pageoff; vm_inherit_t inherit; addr = (vm_offset_t)addr0; size = len; inherit = inherit0; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return (EINVAL); switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr, addr + size, inherit)) { case KERN_SUCCESS: return (0); case KERN_PROTECTION_FAILURE: return (EACCES); } return (EINVAL); } #ifndef _SYS_SYSPROTO_H_ struct madvise_args { void *addr; size_t len; int behav; }; #endif int sys_madvise(struct thread *td, struct madvise_args *uap) { return (kern_madvise(td, (uintptr_t)uap->addr, uap->len, uap->behav)); } int kern_madvise(struct thread *td, uintptr_t addr0, size_t len, int behav) { vm_map_t map; vm_offset_t addr, end, start; int flags; /* * Check for our special case, advising the swap pager we are * "immortal." */ if (behav == MADV_PROTECT) { flags = PPROT_SET; return (kern_procctl(td, P_PID, td->td_proc->p_pid, PROC_SPROTECT, &flags)); } /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ map = &td->td_proc->p_vmspace->vm_map; addr = addr0; if (!vm_map_range_valid(map, addr, addr + len)) return (EINVAL); /* * Since this routine is only advisory, we default to conservative * behavior. */ start = trunc_page(addr); end = round_page(addr + len); /* * vm_map_madvise() checks for illegal values of behav. */ return (vm_map_madvise(map, start, end, behav)); } #ifndef _SYS_SYSPROTO_H_ struct mincore_args { const void *addr; size_t len; char *vec; }; #endif int sys_mincore(struct thread *td, struct mincore_args *uap) { return (kern_mincore(td, (uintptr_t)uap->addr, uap->len, uap->vec)); } int kern_mincore(struct thread *td, uintptr_t addr0, size_t len, char *vec) { pmap_t pmap; vm_map_t map; vm_map_entry_t current, entry; vm_object_t object; vm_offset_t addr, cend, end, first_addr; vm_paddr_t pa; vm_page_t m; vm_pindex_t pindex; int error, lastvecindex, mincoreinfo, vecindex; unsigned int timestamp; /* * Make sure that the addresses presented are valid for user * mode. */ first_addr = addr = trunc_page(addr0); end = round_page(addr0 + len); map = &td->td_proc->p_vmspace->vm_map; if (end > vm_map_max(map) || end < addr) return (ENOMEM); pmap = vmspace_pmap(td->td_proc->p_vmspace); vm_map_lock_read(map); RestartScan: timestamp = map->timestamp; if (!vm_map_lookup_entry(map, addr, &entry)) { vm_map_unlock_read(map); return (ENOMEM); } /* * Do this on a map entry basis so that if the pages are not * in the current processes address space, we can easily look * up the pages elsewhere. */ lastvecindex = -1; while (entry->start < end) { /* * check for contiguity */ current = entry; entry = vm_map_entry_succ(current); if (current->end < end && entry->start > current->end) { vm_map_unlock_read(map); return (ENOMEM); } /* * ignore submaps (for now) or null objects */ if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) || current->object.vm_object == NULL) continue; /* * limit this scan to the current map entry and the * limits for the mincore call */ if (addr < current->start) addr = current->start; cend = current->end; if (cend > end) cend = end; for (; addr < cend; addr += PAGE_SIZE) { /* * Check pmap first, it is likely faster, also * it can provide info as to whether we are the * one referencing or modifying the page. */ m = NULL; object = NULL; retry: pa = 0; mincoreinfo = pmap_mincore(pmap, addr, &pa); if (mincore_mapped) { /* * We only care about this pmap's * mapping of the page, if any. */ ; } else if (pa != 0) { /* * The page is mapped by this process but not * both accessed and modified. It is also * managed. Acquire the object lock so that * other mappings might be examined. The page's * identity may change at any point before its * object lock is acquired, so re-validate if * necessary. */ m = PHYS_TO_VM_PAGE(pa); while (object == NULL || m->object != object) { if (object != NULL) VM_OBJECT_WUNLOCK(object); object = atomic_load_ptr(&m->object); if (object == NULL) goto retry; VM_OBJECT_WLOCK(object); } if (pa != pmap_extract(pmap, addr)) goto retry; KASSERT(vm_page_all_valid(m), ("mincore: page %p is mapped but invalid", m)); } else if (mincoreinfo == 0) { /* * The page is not mapped by this process. If * the object implements managed pages, then * determine if the page is resident so that * the mappings might be examined. */ if (current->object.vm_object != object) { if (object != NULL) VM_OBJECT_WUNLOCK(object); object = current->object.vm_object; VM_OBJECT_WLOCK(object); } if (object->type == OBJT_DEFAULT || object->type == OBJT_SWAP || object->type == OBJT_VNODE) { pindex = OFF_TO_IDX(current->offset + (addr - current->start)); m = vm_page_lookup(object, pindex); if (m != NULL && vm_page_none_valid(m)) m = NULL; if (m != NULL) mincoreinfo = MINCORE_INCORE; } } if (m != NULL) { VM_OBJECT_ASSERT_WLOCKED(m->object); /* Examine other mappings of the page. */ if (m->dirty == 0 && pmap_is_modified(m)) vm_page_dirty(m); if (m->dirty != 0) mincoreinfo |= MINCORE_MODIFIED_OTHER; /* * The first test for PGA_REFERENCED is an * optimization. The second test is * required because a concurrent pmap * operation could clear the last reference * and set PGA_REFERENCED before the call to * pmap_is_referenced(). */ if ((m->a.flags & PGA_REFERENCED) != 0 || pmap_is_referenced(m) || (m->a.flags & PGA_REFERENCED) != 0) mincoreinfo |= MINCORE_REFERENCED_OTHER; } if (object != NULL) VM_OBJECT_WUNLOCK(object); /* * subyte may page fault. In case it needs to modify * the map, we release the lock. */ vm_map_unlock_read(map); /* * calculate index into user supplied byte vector */ vecindex = atop(addr - first_addr); /* * If we have skipped map entries, we need to make sure that * the byte vector is zeroed for those skipped entries. */ while ((lastvecindex + 1) < vecindex) { ++lastvecindex; error = subyte(vec + lastvecindex, 0); if (error) { error = EFAULT; goto done2; } } /* * Pass the page information to the user */ error = subyte(vec + vecindex, mincoreinfo); if (error) { error = EFAULT; goto done2; } /* * If the map has changed, due to the subyte, the previous * output may be invalid. */ vm_map_lock_read(map); if (timestamp != map->timestamp) goto RestartScan; lastvecindex = vecindex; } } /* * subyte may page fault. In case it needs to modify * the map, we release the lock. */ vm_map_unlock_read(map); /* * Zero the last entries in the byte vector. */ vecindex = atop(end - first_addr); while ((lastvecindex + 1) < vecindex) { ++lastvecindex; error = subyte(vec + lastvecindex, 0); if (error) { error = EFAULT; goto done2; } } /* * If the map has changed, due to the subyte, the previous * output may be invalid. */ vm_map_lock_read(map); if (timestamp != map->timestamp) goto RestartScan; vm_map_unlock_read(map); done2: return (error); } #ifndef _SYS_SYSPROTO_H_ struct mlock_args { const void *addr; size_t len; }; #endif int sys_mlock(struct thread *td, struct mlock_args *uap) { return (kern_mlock(td->td_proc, td->td_ucred, __DECONST(uintptr_t, uap->addr), uap->len)); } int kern_mlock(struct proc *proc, struct ucred *cred, uintptr_t addr0, size_t len) { vm_offset_t addr, end, last, start; vm_size_t npages, size; vm_map_t map; unsigned long nsize; int error; error = priv_check_cred(cred, PRIV_VM_MLOCK); if (error) return (error); addr = addr0; size = len; last = addr + size; start = trunc_page(addr); end = round_page(last); if (last < addr || end < addr) return (EINVAL); npages = atop(end - start); if (npages > vm_page_max_user_wired) return (ENOMEM); map = &proc->p_vmspace->vm_map; PROC_LOCK(proc); nsize = ptoa(npages + pmap_wired_count(map->pmap)); if (nsize > lim_cur_proc(proc, RLIMIT_MEMLOCK)) { PROC_UNLOCK(proc); return (ENOMEM); } PROC_UNLOCK(proc); #ifdef RACCT if (racct_enable) { PROC_LOCK(proc); error = racct_set(proc, RACCT_MEMLOCK, nsize); PROC_UNLOCK(proc); if (error != 0) return (ENOMEM); } #endif error = vm_map_wire(map, start, end, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); #ifdef RACCT if (racct_enable && error != KERN_SUCCESS) { PROC_LOCK(proc); racct_set(proc, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap))); PROC_UNLOCK(proc); } #endif return (error == KERN_SUCCESS ? 0 : ENOMEM); } #ifndef _SYS_SYSPROTO_H_ struct mlockall_args { int how; }; #endif int sys_mlockall(struct thread *td, struct mlockall_args *uap) { vm_map_t map; int error; map = &td->td_proc->p_vmspace->vm_map; error = priv_check(td, PRIV_VM_MLOCK); if (error) return (error); if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0)) return (EINVAL); /* * If wiring all pages in the process would cause it to exceed * a hard resource limit, return ENOMEM. */ if (!old_mlock && uap->how & MCL_CURRENT) { if (map->size > lim_cur(td, RLIMIT_MEMLOCK)) return (ENOMEM); } #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size); PROC_UNLOCK(td->td_proc); if (error != 0) return (ENOMEM); } #endif if (uap->how & MCL_FUTURE) { vm_map_lock(map); vm_map_modflags(map, MAP_WIREFUTURE, 0); vm_map_unlock(map); error = 0; } if (uap->how & MCL_CURRENT) { /* * P1003.1-2001 mandates that all currently mapped pages * will be memory resident and locked (wired) upon return * from mlockall(). vm_map_wire() will wire pages, by * calling vm_fault_wire() for each page in the region. */ error = vm_map_wire(map, vm_map_min(map), vm_map_max(map), VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); if (error == KERN_SUCCESS) error = 0; else if (error == KERN_RESOURCE_SHORTAGE) error = ENOMEM; else error = EAGAIN; } #ifdef RACCT if (racct_enable && error != KERN_SUCCESS) { PROC_LOCK(td->td_proc); racct_set(td->td_proc, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap))); PROC_UNLOCK(td->td_proc); } #endif return (error); } #ifndef _SYS_SYSPROTO_H_ struct munlockall_args { register_t dummy; }; #endif int sys_munlockall(struct thread *td, struct munlockall_args *uap) { vm_map_t map; int error; map = &td->td_proc->p_vmspace->vm_map; error = priv_check(td, PRIV_VM_MUNLOCK); if (error) return (error); /* Clear the MAP_WIREFUTURE flag from this vm_map. */ vm_map_lock(map); vm_map_modflags(map, 0, MAP_WIREFUTURE); vm_map_unlock(map); /* Forcibly unwire all pages. */ error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map), VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK); #ifdef RACCT if (racct_enable && error == KERN_SUCCESS) { PROC_LOCK(td->td_proc); racct_set(td->td_proc, RACCT_MEMLOCK, 0); PROC_UNLOCK(td->td_proc); } #endif return (error); } #ifndef _SYS_SYSPROTO_H_ struct munlock_args { const void *addr; size_t len; }; #endif int sys_munlock(struct thread *td, struct munlock_args *uap) { return (kern_munlock(td, (uintptr_t)uap->addr, uap->len)); } int kern_munlock(struct thread *td, uintptr_t addr0, size_t size) { vm_offset_t addr, end, last, start; #ifdef RACCT vm_map_t map; #endif int error; error = priv_check(td, PRIV_VM_MUNLOCK); if (error) return (error); addr = addr0; last = addr + size; start = trunc_page(addr); end = round_page(last); if (last < addr || end < addr) return (EINVAL); error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end, VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); #ifdef RACCT if (racct_enable && error == KERN_SUCCESS) { PROC_LOCK(td->td_proc); map = &td->td_proc->p_vmspace->vm_map; racct_set(td->td_proc, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap))); PROC_UNLOCK(td->td_proc); } #endif return (error == KERN_SUCCESS ? 0 : ENOMEM); } /* * vm_mmap_vnode() * * Helper function for vm_mmap. Perform sanity check specific for mmap * operations on vnodes. */ int vm_mmap_vnode(struct thread *td, vm_size_t objsize, vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp, boolean_t *writecounted) { struct vattr va; vm_object_t obj; vm_ooffset_t foff; struct ucred *cred; int error, flags; bool writex; cred = td->td_ucred; writex = (*maxprotp & VM_PROT_WRITE) != 0 && (*flagsp & MAP_SHARED) != 0; - if ((error = vget(vp, LK_SHARED, td)) != 0) + if ((error = vget(vp, LK_SHARED)) != 0) return (error); AUDIT_ARG_VNODE1(vp); foff = *foffp; flags = *flagsp; obj = vp->v_object; if (vp->v_type == VREG) { /* * Get the proper underlying object */ if (obj == NULL) { error = EINVAL; goto done; } if (obj->type == OBJT_VNODE && obj->handle != vp) { vput(vp); vp = (struct vnode *)obj->handle; /* * Bypass filesystems obey the mpsafety of the * underlying fs. Tmpfs never bypasses. */ - error = vget(vp, LK_SHARED, td); + error = vget(vp, LK_SHARED); if (error != 0) return (error); } if (writex) { *writecounted = TRUE; vm_pager_update_writecount(obj, 0, objsize); } } else { error = EINVAL; goto done; } if ((error = VOP_GETATTR(vp, &va, cred))) goto done; #ifdef MAC /* This relies on VM_PROT_* matching PROT_*. */ error = mac_vnode_check_mmap(cred, vp, (int)prot, flags); if (error != 0) goto done; #endif if ((flags & MAP_SHARED) != 0) { if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) { if (prot & VM_PROT_WRITE) { error = EPERM; goto done; } *maxprotp &= ~VM_PROT_WRITE; } } /* * If it is a regular file without any references * we do not need to sync it. * Adjust object size to be the size of actual file. */ objsize = round_page(va.va_size); if (va.va_nlink == 0) flags |= MAP_NOSYNC; if (obj->type == OBJT_VNODE) { obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff, cred); if (obj == NULL) { error = ENOMEM; goto done; } } else { KASSERT(obj->type == OBJT_DEFAULT || obj->type == OBJT_SWAP, ("wrong object type")); vm_object_reference(obj); #if VM_NRESERVLEVEL > 0 if ((obj->flags & OBJ_COLORED) == 0) { VM_OBJECT_WLOCK(obj); vm_object_color(obj, 0); VM_OBJECT_WUNLOCK(obj); } #endif } *objp = obj; *flagsp = flags; VOP_MMAPPED(vp); done: if (error != 0 && *writecounted) { *writecounted = FALSE; vm_pager_update_writecount(obj, objsize, 0); } vput(vp); return (error); } /* * vm_mmap_cdev() * * Helper function for vm_mmap. Perform sanity check specific for mmap * operations on cdevs. */ int vm_mmap_cdev(struct thread *td, vm_size_t objsize, vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, struct cdev *cdev, struct cdevsw *dsw, vm_ooffset_t *foff, vm_object_t *objp) { vm_object_t obj; int error, flags; flags = *flagsp; if (dsw->d_flags & D_MMAP_ANON) { *objp = NULL; *foff = 0; *maxprotp = VM_PROT_ALL; *flagsp |= MAP_ANON; return (0); } /* * cdevs do not provide private mappings of any kind. */ if ((*maxprotp & VM_PROT_WRITE) == 0 && (prot & VM_PROT_WRITE) != 0) return (EACCES); if (flags & (MAP_PRIVATE|MAP_COPY)) return (EINVAL); /* * Force device mappings to be shared. */ flags |= MAP_SHARED; #ifdef MAC_XXX error = mac_cdev_check_mmap(td->td_ucred, cdev, (int)prot); if (error != 0) return (error); #endif /* * First, try d_mmap_single(). If that is not implemented * (returns ENODEV), fall back to using the device pager. * Note that d_mmap_single() must return a reference to the * object (it needs to bump the reference count of the object * it returns somehow). * * XXX assumes VM_PROT_* == PROT_* */ error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot); if (error != ENODEV) return (error); obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff, td->td_ucred); if (obj == NULL) return (EINVAL); *objp = obj; *flagsp = flags; return (0); } /* * vm_mmap() * * Internal version of mmap used by exec, sys5 shared memory, and * various device drivers. Handle is either a vnode pointer, a * character device, or NULL for MAP_ANON. */ int vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t maxprot, int flags, objtype_t handle_type, void *handle, vm_ooffset_t foff) { vm_object_t object; struct thread *td = curthread; int error; boolean_t writecounted; if (size == 0) return (EINVAL); size = round_page(size); object = NULL; writecounted = FALSE; /* * Lookup/allocate object. */ switch (handle_type) { case OBJT_DEVICE: { struct cdevsw *dsw; struct cdev *cdev; int ref; cdev = handle; dsw = dev_refthread(cdev, &ref); if (dsw == NULL) return (ENXIO); error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, cdev, dsw, &foff, &object); dev_relthread(cdev, ref); break; } case OBJT_VNODE: error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, handle, &foff, &object, &writecounted); break; case OBJT_DEFAULT: if (handle == NULL) { error = 0; break; } /* FALLTHROUGH */ default: error = EINVAL; break; } if (error) return (error); error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, foff, writecounted, td); if (error != 0 && object != NULL) { /* * If this mapping was accounted for in the vnode's * writecount, then undo that now. */ if (writecounted) vm_pager_release_writecount(object, 0, size); vm_object_deallocate(object); } return (error); } /* * Internal version of mmap that maps a specific VM object into an * map. Called by mmap for MAP_ANON, vm_mmap, shm_mmap, and vn_mmap. */ int vm_mmap_object(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t maxprot, int flags, vm_object_t object, vm_ooffset_t foff, boolean_t writecounted, struct thread *td) { boolean_t curmap, fitit; vm_offset_t max_addr; int docow, error, findspace, rv; curmap = map == &td->td_proc->p_vmspace->vm_map; if (curmap) { RACCT_PROC_LOCK(td->td_proc); if (map->size + size > lim_cur(td, RLIMIT_VMEM)) { RACCT_PROC_UNLOCK(td->td_proc); return (ENOMEM); } if (racct_set(td->td_proc, RACCT_VMEM, map->size + size)) { RACCT_PROC_UNLOCK(td->td_proc); return (ENOMEM); } if (!old_mlock && map->flags & MAP_WIREFUTURE) { if (ptoa(pmap_wired_count(map->pmap)) + size > lim_cur(td, RLIMIT_MEMLOCK)) { racct_set_force(td->td_proc, RACCT_VMEM, map->size); RACCT_PROC_UNLOCK(td->td_proc); return (ENOMEM); } error = racct_set(td->td_proc, RACCT_MEMLOCK, ptoa(pmap_wired_count(map->pmap)) + size); if (error != 0) { racct_set_force(td->td_proc, RACCT_VMEM, map->size); RACCT_PROC_UNLOCK(td->td_proc); return (error); } } RACCT_PROC_UNLOCK(td->td_proc); } /* * We currently can only deal with page aligned file offsets. * The mmap() system call already enforces this by subtracting * the page offset from the file offset, but checking here * catches errors in device drivers (e.g. d_single_mmap() * callbacks) and other internal mapping requests (such as in * exec). */ if (foff & PAGE_MASK) return (EINVAL); if ((flags & MAP_FIXED) == 0) { fitit = TRUE; *addr = round_page(*addr); } else { if (*addr != trunc_page(*addr)) return (EINVAL); fitit = FALSE; } if (flags & MAP_ANON) { if (object != NULL || foff != 0) return (EINVAL); docow = 0; } else if (flags & MAP_PREFAULT_READ) docow = MAP_PREFAULT; else docow = MAP_PREFAULT_PARTIAL; if ((flags & (MAP_ANON|MAP_SHARED)) == 0) docow |= MAP_COPY_ON_WRITE; if (flags & MAP_NOSYNC) docow |= MAP_DISABLE_SYNCER; if (flags & MAP_NOCORE) docow |= MAP_DISABLE_COREDUMP; /* Shared memory is also shared with children. */ if (flags & MAP_SHARED) docow |= MAP_INHERIT_SHARE; if (writecounted) docow |= MAP_WRITECOUNT; if (flags & MAP_STACK) { if (object != NULL) return (EINVAL); docow |= MAP_STACK_GROWS_DOWN; } if ((flags & MAP_EXCL) != 0) docow |= MAP_CHECK_EXCL; if ((flags & MAP_GUARD) != 0) docow |= MAP_CREATE_GUARD; if (fitit) { if ((flags & MAP_ALIGNMENT_MASK) == MAP_ALIGNED_SUPER) findspace = VMFS_SUPER_SPACE; else if ((flags & MAP_ALIGNMENT_MASK) != 0) findspace = VMFS_ALIGNED_SPACE(flags >> MAP_ALIGNMENT_SHIFT); else findspace = VMFS_OPTIMAL_SPACE; max_addr = 0; #ifdef MAP_32BIT if ((flags & MAP_32BIT) != 0) max_addr = MAP_32BIT_MAX_ADDR; #endif if (curmap) { rv = vm_map_find_min(map, object, foff, addr, size, round_page((vm_offset_t)td->td_proc->p_vmspace-> vm_daddr + lim_max(td, RLIMIT_DATA)), max_addr, findspace, prot, maxprot, docow); } else { rv = vm_map_find(map, object, foff, addr, size, max_addr, findspace, prot, maxprot, docow); } } else { rv = vm_map_fixed(map, object, foff, *addr, size, prot, maxprot, docow); } if (rv == KERN_SUCCESS) { /* * If the process has requested that all future mappings * be wired, then heed this. */ if ((map->flags & MAP_WIREFUTURE) != 0) { vm_map_lock(map); if ((map->flags & MAP_WIREFUTURE) != 0) (void)vm_map_wire_locked(map, *addr, *addr + size, VM_MAP_WIRE_USER | ((flags & MAP_STACK) ? VM_MAP_WIRE_HOLESOK : VM_MAP_WIRE_NOHOLES)); vm_map_unlock(map); } } return (vm_mmap_to_errno(rv)); } /* * Translate a Mach VM return code to zero on success or the appropriate errno * on failure. */ int vm_mmap_to_errno(int rv) { switch (rv) { case KERN_SUCCESS: return (0); case KERN_INVALID_ADDRESS: case KERN_NO_SPACE: return (ENOMEM); case KERN_PROTECTION_FAILURE: return (EACCES); default: return (EINVAL); } } Index: projects/clang1100-import/sys/vm/vm_pageout.c =================================================================== --- projects/clang1100-import/sys/vm/vm_pageout.c (revision 364278) +++ projects/clang1100-import/sys/vm/vm_pageout.c (revision 364279) @@ -1,2406 +1,2406 @@ /*- * SPDX-License-Identifier: (BSD-4-Clause AND MIT-CMU) * * Copyright (c) 1991 Regents of the University of California. * All rights reserved. * Copyright (c) 1994 John S. Dyson * All rights reserved. * Copyright (c) 1994 David Greenman * All rights reserved. * Copyright (c) 2005 Yahoo! Technologies Norway AS * All rights reserved. * * This code is derived from software contributed to Berkeley by * The Mach Operating System project at Carnegie-Mellon University. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 * * * Copyright (c) 1987, 1990 Carnegie-Mellon University. * All rights reserved. * * Authors: Avadis Tevanian, Jr., Michael Wayne Young * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * The proverbial page-out daemon. */ #include __FBSDID("$FreeBSD$"); #include "opt_vm.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * System initialization */ /* the kernel process "vm_pageout"*/ static void vm_pageout(void); static void vm_pageout_init(void); static int vm_pageout_clean(vm_page_t m, int *numpagedout); static int vm_pageout_cluster(vm_page_t m); static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, int starting_page_shortage); SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init, NULL); struct proc *pageproc; static struct kproc_desc page_kp = { "pagedaemon", vm_pageout, &pageproc }; SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &page_kp); SDT_PROVIDER_DEFINE(vm); SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan); /* Pagedaemon activity rates, in subdivisions of one second. */ #define VM_LAUNDER_RATE 10 #define VM_INACT_SCAN_RATE 10 static int vm_pageout_oom_seq = 12; static int vm_pageout_update_period; static int disable_swap_pageouts; static int lowmem_period = 10; static int swapdev_enabled; static int vm_panic_on_oom = 0; SYSCTL_INT(_vm, OID_AUTO, panic_on_oom, CTLFLAG_RWTUN, &vm_panic_on_oom, 0, "Panic on the given number of out-of-memory errors instead of killing the largest process"); SYSCTL_INT(_vm, OID_AUTO, pageout_update_period, CTLFLAG_RWTUN, &vm_pageout_update_period, 0, "Maximum active LRU update period"); /* Access with get_pageout_threads_per_domain(). */ static int pageout_threads_per_domain = 1; SYSCTL_INT(_vm, OID_AUTO, pageout_threads_per_domain, CTLFLAG_RDTUN, &pageout_threads_per_domain, 0, "Number of worker threads comprising each per-domain pagedaemon"); SYSCTL_INT(_vm, OID_AUTO, lowmem_period, CTLFLAG_RWTUN, &lowmem_period, 0, "Low memory callback period"); SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, CTLFLAG_RWTUN, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); static int pageout_lock_miss; SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq, CTLFLAG_RWTUN, &vm_pageout_oom_seq, 0, "back-to-back calls to oom detector to start OOM"); static int act_scan_laundry_weight = 3; SYSCTL_INT(_vm, OID_AUTO, act_scan_laundry_weight, CTLFLAG_RWTUN, &act_scan_laundry_weight, 0, "weight given to clean vs. dirty pages in active queue scans"); static u_int vm_background_launder_rate = 4096; SYSCTL_UINT(_vm, OID_AUTO, background_launder_rate, CTLFLAG_RWTUN, &vm_background_launder_rate, 0, "background laundering rate, in kilobytes per second"); static u_int vm_background_launder_max = 20 * 1024; SYSCTL_UINT(_vm, OID_AUTO, background_launder_max, CTLFLAG_RWTUN, &vm_background_launder_max, 0, "background laundering cap, in kilobytes"); int vm_pageout_page_count = 32; u_long vm_page_max_user_wired; SYSCTL_ULONG(_vm, OID_AUTO, max_user_wired, CTLFLAG_RW, &vm_page_max_user_wired, 0, "system-wide limit to user-wired page count"); static u_int isqrt(u_int num); static int vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall); static void vm_pageout_laundry_worker(void *arg); struct scan_state { struct vm_batchqueue bq; struct vm_pagequeue *pq; vm_page_t marker; int maxscan; int scanned; }; static void vm_pageout_init_scan(struct scan_state *ss, struct vm_pagequeue *pq, vm_page_t marker, vm_page_t after, int maxscan) { vm_pagequeue_assert_locked(pq); KASSERT((marker->a.flags & PGA_ENQUEUED) == 0, ("marker %p already enqueued", marker)); if (after == NULL) TAILQ_INSERT_HEAD(&pq->pq_pl, marker, plinks.q); else TAILQ_INSERT_AFTER(&pq->pq_pl, after, marker, plinks.q); vm_page_aflag_set(marker, PGA_ENQUEUED); vm_batchqueue_init(&ss->bq); ss->pq = pq; ss->marker = marker; ss->maxscan = maxscan; ss->scanned = 0; vm_pagequeue_unlock(pq); } static void vm_pageout_end_scan(struct scan_state *ss) { struct vm_pagequeue *pq; pq = ss->pq; vm_pagequeue_assert_locked(pq); KASSERT((ss->marker->a.flags & PGA_ENQUEUED) != 0, ("marker %p not enqueued", ss->marker)); TAILQ_REMOVE(&pq->pq_pl, ss->marker, plinks.q); vm_page_aflag_clear(ss->marker, PGA_ENQUEUED); pq->pq_pdpages += ss->scanned; } /* * Add a small number of queued pages to a batch queue for later processing * without the corresponding queue lock held. The caller must have enqueued a * marker page at the desired start point for the scan. Pages will be * physically dequeued if the caller so requests. Otherwise, the returned * batch may contain marker pages, and it is up to the caller to handle them. * * When processing the batch queue, vm_pageout_defer() must be used to * determine whether the page has been logically dequeued since the batch was * collected. */ static __always_inline void vm_pageout_collect_batch(struct scan_state *ss, const bool dequeue) { struct vm_pagequeue *pq; vm_page_t m, marker, n; marker = ss->marker; pq = ss->pq; KASSERT((marker->a.flags & PGA_ENQUEUED) != 0, ("marker %p not enqueued", ss->marker)); vm_pagequeue_lock(pq); for (m = TAILQ_NEXT(marker, plinks.q); m != NULL && ss->scanned < ss->maxscan && ss->bq.bq_cnt < VM_BATCHQUEUE_SIZE; m = n, ss->scanned++) { n = TAILQ_NEXT(m, plinks.q); if ((m->flags & PG_MARKER) == 0) { KASSERT((m->a.flags & PGA_ENQUEUED) != 0, ("page %p not enqueued", m)); KASSERT((m->flags & PG_FICTITIOUS) == 0, ("Fictitious page %p cannot be in page queue", m)); KASSERT((m->oflags & VPO_UNMANAGED) == 0, ("Unmanaged page %p cannot be in page queue", m)); } else if (dequeue) continue; (void)vm_batchqueue_insert(&ss->bq, m); if (dequeue) { TAILQ_REMOVE(&pq->pq_pl, m, plinks.q); vm_page_aflag_clear(m, PGA_ENQUEUED); } } TAILQ_REMOVE(&pq->pq_pl, marker, plinks.q); if (__predict_true(m != NULL)) TAILQ_INSERT_BEFORE(m, marker, plinks.q); else TAILQ_INSERT_TAIL(&pq->pq_pl, marker, plinks.q); if (dequeue) vm_pagequeue_cnt_add(pq, -ss->bq.bq_cnt); vm_pagequeue_unlock(pq); } /* * Return the next page to be scanned, or NULL if the scan is complete. */ static __always_inline vm_page_t vm_pageout_next(struct scan_state *ss, const bool dequeue) { if (ss->bq.bq_cnt == 0) vm_pageout_collect_batch(ss, dequeue); return (vm_batchqueue_pop(&ss->bq)); } /* * Determine whether processing of a page should be deferred and ensure that any * outstanding queue operations are processed. */ static __always_inline bool vm_pageout_defer(vm_page_t m, const uint8_t queue, const bool enqueued) { vm_page_astate_t as; as = vm_page_astate_load(m); if (__predict_false(as.queue != queue || ((as.flags & PGA_ENQUEUED) != 0) != enqueued)) return (true); if ((as.flags & PGA_QUEUE_OP_MASK) != 0) { vm_page_pqbatch_submit(m, queue); return (true); } return (false); } /* * Scan for pages at adjacent offsets within the given page's object that are * eligible for laundering, form a cluster of these pages and the given page, * and launder that cluster. */ static int vm_pageout_cluster(vm_page_t m) { vm_object_t object; vm_page_t mc[2 * vm_pageout_page_count], p, pb, ps; vm_pindex_t pindex; int ib, is, page_base, pageout_count; object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); pindex = m->pindex; vm_page_assert_xbusied(m); mc[vm_pageout_page_count] = pb = ps = m; pageout_count = 1; page_base = vm_pageout_page_count; ib = 1; is = 1; /* * We can cluster only if the page is not clean, busy, or held, and * the page is in the laundry queue. * * During heavy mmap/modification loads the pageout * daemon can really fragment the underlying file * due to flushing pages out of order and not trying to * align the clusters (which leaves sporadic out-of-order * holes). To solve this problem we do the reverse scan * first and attempt to align our cluster, then do a * forward scan if room remains. */ more: while (ib != 0 && pageout_count < vm_pageout_page_count) { if (ib > pindex) { ib = 0; break; } if ((p = vm_page_prev(pb)) == NULL || vm_page_tryxbusy(p) == 0) { ib = 0; break; } if (vm_page_wired(p)) { ib = 0; vm_page_xunbusy(p); break; } vm_page_test_dirty(p); if (p->dirty == 0) { ib = 0; vm_page_xunbusy(p); break; } if (!vm_page_in_laundry(p) || !vm_page_try_remove_write(p)) { vm_page_xunbusy(p); ib = 0; break; } mc[--page_base] = pb = p; ++pageout_count; ++ib; /* * We are at an alignment boundary. Stop here, and switch * directions. Do not clear ib. */ if ((pindex - (ib - 1)) % vm_pageout_page_count == 0) break; } while (pageout_count < vm_pageout_page_count && pindex + is < object->size) { if ((p = vm_page_next(ps)) == NULL || vm_page_tryxbusy(p) == 0) break; if (vm_page_wired(p)) { vm_page_xunbusy(p); break; } vm_page_test_dirty(p); if (p->dirty == 0) { vm_page_xunbusy(p); break; } if (!vm_page_in_laundry(p) || !vm_page_try_remove_write(p)) { vm_page_xunbusy(p); break; } mc[page_base + pageout_count] = ps = p; ++pageout_count; ++is; } /* * If we exhausted our forward scan, continue with the reverse scan * when possible, even past an alignment boundary. This catches * boundary conditions. */ if (ib != 0 && pageout_count < vm_pageout_page_count) goto more; return (vm_pageout_flush(&mc[page_base], pageout_count, VM_PAGER_PUT_NOREUSE, 0, NULL, NULL)); } /* * vm_pageout_flush() - launder the given pages * * The given pages are laundered. Note that we setup for the start of * I/O ( i.e. busy the page ), mark it read-only, and bump the object * reference count all in here rather then in the parent. If we want * the parent to do more sophisticated things we may have to change * the ordering. * * Returned runlen is the count of pages between mreq and first * page after mreq with status VM_PAGER_AGAIN. * *eio is set to TRUE if pager returned VM_PAGER_ERROR or VM_PAGER_FAIL * for any page in runlen set. */ int vm_pageout_flush(vm_page_t *mc, int count, int flags, int mreq, int *prunlen, boolean_t *eio) { vm_object_t object = mc[0]->object; int pageout_status[count]; int numpagedout = 0; int i, runlen; VM_OBJECT_ASSERT_WLOCKED(object); /* * Initiate I/O. Mark the pages shared busy and verify that they're * valid and read-only. * * We do not have to fixup the clean/dirty bits here... we can * allow the pager to do it after the I/O completes. * * NOTE! mc[i]->dirty may be partial or fragmented due to an * edge case with file fragments. */ for (i = 0; i < count; i++) { KASSERT(vm_page_all_valid(mc[i]), ("vm_pageout_flush: partially invalid page %p index %d/%d", mc[i], i, count)); KASSERT((mc[i]->a.flags & PGA_WRITEABLE) == 0, ("vm_pageout_flush: writeable page %p", mc[i])); vm_page_busy_downgrade(mc[i]); } vm_object_pip_add(object, count); vm_pager_put_pages(object, mc, count, flags, pageout_status); runlen = count - mreq; if (eio != NULL) *eio = FALSE; for (i = 0; i < count; i++) { vm_page_t mt = mc[i]; KASSERT(pageout_status[i] == VM_PAGER_PEND || !pmap_page_is_write_mapped(mt), ("vm_pageout_flush: page %p is not write protected", mt)); switch (pageout_status[i]) { case VM_PAGER_OK: /* * The page may have moved since laundering started, in * which case it should be left alone. */ if (vm_page_in_laundry(mt)) vm_page_deactivate_noreuse(mt); /* FALLTHROUGH */ case VM_PAGER_PEND: numpagedout++; break; case VM_PAGER_BAD: /* * The page is outside the object's range. We pretend * that the page out worked and clean the page, so the * changes will be lost if the page is reclaimed by * the page daemon. */ vm_page_undirty(mt); if (vm_page_in_laundry(mt)) vm_page_deactivate_noreuse(mt); break; case VM_PAGER_ERROR: case VM_PAGER_FAIL: /* * If the page couldn't be paged out to swap because the * pager wasn't able to find space, place the page in * the PQ_UNSWAPPABLE holding queue. This is an * optimization that prevents the page daemon from * wasting CPU cycles on pages that cannot be reclaimed * becase no swap device is configured. * * Otherwise, reactivate the page so that it doesn't * clog the laundry and inactive queues. (We will try * paging it out again later.) */ if (object->type == OBJT_SWAP && pageout_status[i] == VM_PAGER_FAIL) { vm_page_unswappable(mt); numpagedout++; } else vm_page_activate(mt); if (eio != NULL && i >= mreq && i - mreq < runlen) *eio = TRUE; break; case VM_PAGER_AGAIN: if (i >= mreq && i - mreq < runlen) runlen = i - mreq; break; } /* * If the operation is still going, leave the page busy to * block all other accesses. Also, leave the paging in * progress indicator set so that we don't attempt an object * collapse. */ if (pageout_status[i] != VM_PAGER_PEND) { vm_object_pip_wakeup(object); vm_page_sunbusy(mt); } } if (prunlen != NULL) *prunlen = runlen; return (numpagedout); } static void vm_pageout_swapon(void *arg __unused, struct swdevt *sp __unused) { atomic_store_rel_int(&swapdev_enabled, 1); } static void vm_pageout_swapoff(void *arg __unused, struct swdevt *sp __unused) { if (swap_pager_nswapdev() == 1) atomic_store_rel_int(&swapdev_enabled, 0); } /* * Attempt to acquire all of the necessary locks to launder a page and * then call through the clustering layer to PUTPAGES. Wait a short * time for a vnode lock. * * Requires the page and object lock on entry, releases both before return. * Returns 0 on success and an errno otherwise. */ static int vm_pageout_clean(vm_page_t m, int *numpagedout) { struct vnode *vp; struct mount *mp; vm_object_t object; vm_pindex_t pindex; int error, lockmode; object = m->object; VM_OBJECT_ASSERT_WLOCKED(object); error = 0; vp = NULL; mp = NULL; /* * The object is already known NOT to be dead. It * is possible for the vget() to block the whole * pageout daemon, but the new low-memory handling * code should prevent it. * * We can't wait forever for the vnode lock, we might * deadlock due to a vn_read() getting stuck in * vm_wait while holding this vnode. We skip the * vnode if we can't get it in a reasonable amount * of time. */ if (object->type == OBJT_VNODE) { vm_page_xunbusy(m); vp = object->handle; if (vp->v_type == VREG && vn_start_write(vp, &mp, V_NOWAIT) != 0) { mp = NULL; error = EDEADLK; goto unlock_all; } KASSERT(mp != NULL, ("vp %p with NULL v_mount", vp)); vm_object_reference_locked(object); pindex = m->pindex; VM_OBJECT_WUNLOCK(object); lockmode = MNT_SHARED_WRITES(vp->v_mount) ? LK_SHARED : LK_EXCLUSIVE; - if (vget(vp, lockmode | LK_TIMELOCK, curthread)) { + if (vget(vp, lockmode | LK_TIMELOCK)) { vp = NULL; error = EDEADLK; goto unlock_mp; } VM_OBJECT_WLOCK(object); /* * Ensure that the object and vnode were not disassociated * while locks were dropped. */ if (vp->v_object != object) { error = ENOENT; goto unlock_all; } /* * While the object was unlocked, the page may have been: * (1) moved to a different queue, * (2) reallocated to a different object, * (3) reallocated to a different offset, or * (4) cleaned. */ if (!vm_page_in_laundry(m) || m->object != object || m->pindex != pindex || m->dirty == 0) { error = ENXIO; goto unlock_all; } /* * The page may have been busied while the object lock was * released. */ if (vm_page_tryxbusy(m) == 0) { error = EBUSY; goto unlock_all; } } /* * Remove all writeable mappings, failing if the page is wired. */ if (!vm_page_try_remove_write(m)) { vm_page_xunbusy(m); error = EBUSY; goto unlock_all; } /* * If a page is dirty, then it is either being washed * (but not yet cleaned) or it is still in the * laundry. If it is still in the laundry, then we * start the cleaning operation. */ if ((*numpagedout = vm_pageout_cluster(m)) == 0) error = EIO; unlock_all: VM_OBJECT_WUNLOCK(object); unlock_mp: if (mp != NULL) { if (vp != NULL) vput(vp); vm_object_deallocate(object); vn_finished_write(mp); } return (error); } /* * Attempt to launder the specified number of pages. * * Returns the number of pages successfully laundered. */ static int vm_pageout_launder(struct vm_domain *vmd, int launder, bool in_shortfall) { struct scan_state ss; struct vm_pagequeue *pq; vm_object_t object; vm_page_t m, marker; vm_page_astate_t new, old; int act_delta, error, numpagedout, queue, refs, starting_target; int vnodes_skipped; bool pageout_ok; object = NULL; starting_target = launder; vnodes_skipped = 0; /* * Scan the laundry queues for pages eligible to be laundered. We stop * once the target number of dirty pages have been laundered, or once * we've reached the end of the queue. A single iteration of this loop * may cause more than one page to be laundered because of clustering. * * As an optimization, we avoid laundering from PQ_UNSWAPPABLE when no * swap devices are configured. */ if (atomic_load_acq_int(&swapdev_enabled)) queue = PQ_UNSWAPPABLE; else queue = PQ_LAUNDRY; scan: marker = &vmd->vmd_markers[queue]; pq = &vmd->vmd_pagequeues[queue]; vm_pagequeue_lock(pq); vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt); while (launder > 0 && (m = vm_pageout_next(&ss, false)) != NULL) { if (__predict_false((m->flags & PG_MARKER) != 0)) continue; /* * Don't touch a page that was removed from the queue after the * page queue lock was released. Otherwise, ensure that any * pending queue operations, such as dequeues for wired pages, * are handled. */ if (vm_pageout_defer(m, queue, true)) continue; /* * Lock the page's object. */ if (object == NULL || object != m->object) { if (object != NULL) VM_OBJECT_WUNLOCK(object); object = atomic_load_ptr(&m->object); if (__predict_false(object == NULL)) /* The page is being freed by another thread. */ continue; /* Depends on type-stability. */ VM_OBJECT_WLOCK(object); if (__predict_false(m->object != object)) { VM_OBJECT_WUNLOCK(object); object = NULL; continue; } } if (vm_page_tryxbusy(m) == 0) continue; /* * Check for wirings now that we hold the object lock and have * exclusively busied the page. If the page is mapped, it may * still be wired by pmap lookups. The call to * vm_page_try_remove_all() below atomically checks for such * wirings and removes mappings. If the page is unmapped, the * wire count is guaranteed not to increase after this check. */ if (__predict_false(vm_page_wired(m))) goto skip_page; /* * Invalid pages can be easily freed. They cannot be * mapped; vm_page_free() asserts this. */ if (vm_page_none_valid(m)) goto free_page; refs = object->ref_count != 0 ? pmap_ts_referenced(m) : 0; for (old = vm_page_astate_load(m);;) { /* * Check to see if the page has been removed from the * queue since the first such check. Leave it alone if * so, discarding any references collected by * pmap_ts_referenced(). */ if (__predict_false(_vm_page_queue(old) == PQ_NONE)) goto skip_page; new = old; act_delta = refs; if ((old.flags & PGA_REFERENCED) != 0) { new.flags &= ~PGA_REFERENCED; act_delta++; } if (act_delta == 0) { ; } else if (object->ref_count != 0) { /* * Increase the activation count if the page was * referenced while in the laundry queue. This * makes it less likely that the page will be * returned prematurely to the laundry queue. */ new.act_count += ACT_ADVANCE + act_delta; if (new.act_count > ACT_MAX) new.act_count = ACT_MAX; new.flags &= ~PGA_QUEUE_OP_MASK; new.flags |= PGA_REQUEUE; new.queue = PQ_ACTIVE; if (!vm_page_pqstate_commit(m, &old, new)) continue; /* * If this was a background laundering, count * activated pages towards our target. The * purpose of background laundering is to ensure * that pages are eventually cycled through the * laundry queue, and an activation is a valid * way out. */ if (!in_shortfall) launder--; VM_CNT_INC(v_reactivated); goto skip_page; } else if ((object->flags & OBJ_DEAD) == 0) { new.flags |= PGA_REQUEUE; if (!vm_page_pqstate_commit(m, &old, new)) continue; goto skip_page; } break; } /* * If the page appears to be clean at the machine-independent * layer, then remove all of its mappings from the pmap in * anticipation of freeing it. If, however, any of the page's * mappings allow write access, then the page may still be * modified until the last of those mappings are removed. */ if (object->ref_count != 0) { vm_page_test_dirty(m); if (m->dirty == 0 && !vm_page_try_remove_all(m)) goto skip_page; } /* * Clean pages are freed, and dirty pages are paged out unless * they belong to a dead object. Requeueing dirty pages from * dead objects is pointless, as they are being paged out and * freed by the thread that destroyed the object. */ if (m->dirty == 0) { free_page: /* * Now we are guaranteed that no other threads are * manipulating the page, check for a last-second * reference. */ if (vm_pageout_defer(m, queue, true)) goto skip_page; vm_page_free(m); VM_CNT_INC(v_dfree); } else if ((object->flags & OBJ_DEAD) == 0) { if (object->type != OBJT_SWAP && object->type != OBJT_DEFAULT) pageout_ok = true; else if (disable_swap_pageouts) pageout_ok = false; else pageout_ok = true; if (!pageout_ok) { vm_page_launder(m); goto skip_page; } /* * Form a cluster with adjacent, dirty pages from the * same object, and page out that entire cluster. * * The adjacent, dirty pages must also be in the * laundry. However, their mappings are not checked * for new references. Consequently, a recently * referenced page may be paged out. However, that * page will not be prematurely reclaimed. After page * out, the page will be placed in the inactive queue, * where any new references will be detected and the * page reactivated. */ error = vm_pageout_clean(m, &numpagedout); if (error == 0) { launder -= numpagedout; ss.scanned += numpagedout; } else if (error == EDEADLK) { pageout_lock_miss++; vnodes_skipped++; } object = NULL; } else { skip_page: vm_page_xunbusy(m); } } if (object != NULL) { VM_OBJECT_WUNLOCK(object); object = NULL; } vm_pagequeue_lock(pq); vm_pageout_end_scan(&ss); vm_pagequeue_unlock(pq); if (launder > 0 && queue == PQ_UNSWAPPABLE) { queue = PQ_LAUNDRY; goto scan; } /* * Wakeup the sync daemon if we skipped a vnode in a writeable object * and we didn't launder enough pages. */ if (vnodes_skipped > 0 && launder > 0) (void)speedup_syncer(); return (starting_target - launder); } /* * Compute the integer square root. */ static u_int isqrt(u_int num) { u_int bit, root, tmp; bit = num != 0 ? (1u << ((fls(num) - 1) & ~1)) : 0; root = 0; while (bit != 0) { tmp = root + bit; root >>= 1; if (num >= tmp) { num -= tmp; root += bit; } bit >>= 2; } return (root); } /* * Perform the work of the laundry thread: periodically wake up and determine * whether any pages need to be laundered. If so, determine the number of pages * that need to be laundered, and launder them. */ static void vm_pageout_laundry_worker(void *arg) { struct vm_domain *vmd; struct vm_pagequeue *pq; uint64_t nclean, ndirty, nfreed; int domain, last_target, launder, shortfall, shortfall_cycle, target; bool in_shortfall; domain = (uintptr_t)arg; vmd = VM_DOMAIN(domain); pq = &vmd->vmd_pagequeues[PQ_LAUNDRY]; KASSERT(vmd->vmd_segs != 0, ("domain without segments")); shortfall = 0; in_shortfall = false; shortfall_cycle = 0; last_target = target = 0; nfreed = 0; /* * Calls to these handlers are serialized by the swap syscall lock. */ (void)EVENTHANDLER_REGISTER(swapon, vm_pageout_swapon, vmd, EVENTHANDLER_PRI_ANY); (void)EVENTHANDLER_REGISTER(swapoff, vm_pageout_swapoff, vmd, EVENTHANDLER_PRI_ANY); /* * The pageout laundry worker is never done, so loop forever. */ for (;;) { KASSERT(target >= 0, ("negative target %d", target)); KASSERT(shortfall_cycle >= 0, ("negative cycle %d", shortfall_cycle)); launder = 0; /* * First determine whether we need to launder pages to meet a * shortage of free pages. */ if (shortfall > 0) { in_shortfall = true; shortfall_cycle = VM_LAUNDER_RATE / VM_INACT_SCAN_RATE; target = shortfall; } else if (!in_shortfall) goto trybackground; else if (shortfall_cycle == 0 || vm_laundry_target(vmd) <= 0) { /* * We recently entered shortfall and began laundering * pages. If we have completed that laundering run * (and we are no longer in shortfall) or we have met * our laundry target through other activity, then we * can stop laundering pages. */ in_shortfall = false; target = 0; goto trybackground; } launder = target / shortfall_cycle--; goto dolaundry; /* * There's no immediate need to launder any pages; see if we * meet the conditions to perform background laundering: * * 1. The ratio of dirty to clean inactive pages exceeds the * background laundering threshold, or * 2. we haven't yet reached the target of the current * background laundering run. * * The background laundering threshold is not a constant. * Instead, it is a slowly growing function of the number of * clean pages freed by the page daemon since the last * background laundering. Thus, as the ratio of dirty to * clean inactive pages grows, the amount of memory pressure * required to trigger laundering decreases. We ensure * that the threshold is non-zero after an inactive queue * scan, even if that scan failed to free a single clean page. */ trybackground: nclean = vmd->vmd_free_count + vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt; ndirty = vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt; if (target == 0 && ndirty * isqrt(howmany(nfreed + 1, vmd->vmd_free_target - vmd->vmd_free_min)) >= nclean) { target = vmd->vmd_background_launder_target; } /* * We have a non-zero background laundering target. If we've * laundered up to our maximum without observing a page daemon * request, just stop. This is a safety belt that ensures we * don't launder an excessive amount if memory pressure is low * and the ratio of dirty to clean pages is large. Otherwise, * proceed at the background laundering rate. */ if (target > 0) { if (nfreed > 0) { nfreed = 0; last_target = target; } else if (last_target - target >= vm_background_launder_max * PAGE_SIZE / 1024) { target = 0; } launder = vm_background_launder_rate * PAGE_SIZE / 1024; launder /= VM_LAUNDER_RATE; if (launder > target) launder = target; } dolaundry: if (launder > 0) { /* * Because of I/O clustering, the number of laundered * pages could exceed "target" by the maximum size of * a cluster minus one. */ target -= min(vm_pageout_launder(vmd, launder, in_shortfall), target); pause("laundp", hz / VM_LAUNDER_RATE); } /* * If we're not currently laundering pages and the page daemon * hasn't posted a new request, sleep until the page daemon * kicks us. */ vm_pagequeue_lock(pq); if (target == 0 && vmd->vmd_laundry_request == VM_LAUNDRY_IDLE) (void)mtx_sleep(&vmd->vmd_laundry_request, vm_pagequeue_lockptr(pq), PVM, "launds", 0); /* * If the pagedaemon has indicated that it's in shortfall, start * a shortfall laundering unless we're already in the middle of * one. This may preempt a background laundering. */ if (vmd->vmd_laundry_request == VM_LAUNDRY_SHORTFALL && (!in_shortfall || shortfall_cycle == 0)) { shortfall = vm_laundry_target(vmd) + vmd->vmd_pageout_deficit; target = 0; } else shortfall = 0; if (target == 0) vmd->vmd_laundry_request = VM_LAUNDRY_IDLE; nfreed += vmd->vmd_clean_pages_freed; vmd->vmd_clean_pages_freed = 0; vm_pagequeue_unlock(pq); } } /* * Compute the number of pages we want to try to move from the * active queue to either the inactive or laundry queue. * * When scanning active pages during a shortage, we make clean pages * count more heavily towards the page shortage than dirty pages. * This is because dirty pages must be laundered before they can be * reused and thus have less utility when attempting to quickly * alleviate a free page shortage. However, this weighting also * causes the scan to deactivate dirty pages more aggressively, * improving the effectiveness of clustering. */ static int vm_pageout_active_target(struct vm_domain *vmd) { int shortage; shortage = vmd->vmd_inactive_target + vm_paging_target(vmd) - (vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt + vmd->vmd_pagequeues[PQ_LAUNDRY].pq_cnt / act_scan_laundry_weight); shortage *= act_scan_laundry_weight; return (shortage); } /* * Scan the active queue. If there is no shortage of inactive pages, scan a * small portion of the queue in order to maintain quasi-LRU. */ static void vm_pageout_scan_active(struct vm_domain *vmd, int page_shortage) { struct scan_state ss; vm_object_t object; vm_page_t m, marker; struct vm_pagequeue *pq; vm_page_astate_t old, new; long min_scan; int act_delta, max_scan, ps_delta, refs, scan_tick; uint8_t nqueue; marker = &vmd->vmd_markers[PQ_ACTIVE]; pq = &vmd->vmd_pagequeues[PQ_ACTIVE]; vm_pagequeue_lock(pq); /* * If we're just idle polling attempt to visit every * active page within 'update_period' seconds. */ scan_tick = ticks; if (vm_pageout_update_period != 0) { min_scan = pq->pq_cnt; min_scan *= scan_tick - vmd->vmd_last_active_scan; min_scan /= hz * vm_pageout_update_period; } else min_scan = 0; if (min_scan > 0 || (page_shortage > 0 && pq->pq_cnt > 0)) vmd->vmd_last_active_scan = scan_tick; /* * Scan the active queue for pages that can be deactivated. Update * the per-page activity counter and use it to identify deactivation * candidates. Held pages may be deactivated. * * To avoid requeuing each page that remains in the active queue, we * implement the CLOCK algorithm. To keep the implementation of the * enqueue operation consistent for all page queues, we use two hands, * represented by marker pages. Scans begin at the first hand, which * precedes the second hand in the queue. When the two hands meet, * they are moved back to the head and tail of the queue, respectively, * and scanning resumes. */ max_scan = page_shortage > 0 ? pq->pq_cnt : min_scan; act_scan: vm_pageout_init_scan(&ss, pq, marker, &vmd->vmd_clock[0], max_scan); while ((m = vm_pageout_next(&ss, false)) != NULL) { if (__predict_false(m == &vmd->vmd_clock[1])) { vm_pagequeue_lock(pq); TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q); TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[1], plinks.q); TAILQ_INSERT_HEAD(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q); TAILQ_INSERT_TAIL(&pq->pq_pl, &vmd->vmd_clock[1], plinks.q); max_scan -= ss.scanned; vm_pageout_end_scan(&ss); goto act_scan; } if (__predict_false((m->flags & PG_MARKER) != 0)) continue; /* * Don't touch a page that was removed from the queue after the * page queue lock was released. Otherwise, ensure that any * pending queue operations, such as dequeues for wired pages, * are handled. */ if (vm_pageout_defer(m, PQ_ACTIVE, true)) continue; /* * A page's object pointer may be set to NULL before * the object lock is acquired. */ object = atomic_load_ptr(&m->object); if (__predict_false(object == NULL)) /* * The page has been removed from its object. */ continue; /* Deferred free of swap space. */ if ((m->a.flags & PGA_SWAP_FREE) != 0 && VM_OBJECT_TRYWLOCK(object)) { if (m->object == object) vm_pager_page_unswapped(m); VM_OBJECT_WUNLOCK(object); } /* * Check to see "how much" the page has been used. * * Test PGA_REFERENCED after calling pmap_ts_referenced() so * that a reference from a concurrently destroyed mapping is * observed here and now. * * Perform an unsynchronized object ref count check. While * the page lock ensures that the page is not reallocated to * another object, in particular, one with unmanaged mappings * that cannot support pmap_ts_referenced(), two races are, * nonetheless, possible: * 1) The count was transitioning to zero, but we saw a non- * zero value. pmap_ts_referenced() will return zero * because the page is not mapped. * 2) The count was transitioning to one, but we saw zero. * This race delays the detection of a new reference. At * worst, we will deactivate and reactivate the page. */ refs = object->ref_count != 0 ? pmap_ts_referenced(m) : 0; old = vm_page_astate_load(m); do { /* * Check to see if the page has been removed from the * queue since the first such check. Leave it alone if * so, discarding any references collected by * pmap_ts_referenced(). */ if (__predict_false(_vm_page_queue(old) == PQ_NONE)) break; /* * Advance or decay the act_count based on recent usage. */ new = old; act_delta = refs; if ((old.flags & PGA_REFERENCED) != 0) { new.flags &= ~PGA_REFERENCED; act_delta++; } if (act_delta != 0) { new.act_count += ACT_ADVANCE + act_delta; if (new.act_count > ACT_MAX) new.act_count = ACT_MAX; } else { new.act_count -= min(new.act_count, ACT_DECLINE); } if (new.act_count > 0) { /* * Adjust the activation count and keep the page * in the active queue. The count might be left * unchanged if it is saturated. The page may * have been moved to a different queue since we * started the scan, in which case we move it * back. */ ps_delta = 0; if (old.queue != PQ_ACTIVE) { new.flags &= ~PGA_QUEUE_OP_MASK; new.flags |= PGA_REQUEUE; new.queue = PQ_ACTIVE; } } else { /* * When not short for inactive pages, let dirty * pages go through the inactive queue before * moving to the laundry queue. This gives them * some extra time to be reactivated, * potentially avoiding an expensive pageout. * However, during a page shortage, the inactive * queue is necessarily small, and so dirty * pages would only spend a trivial amount of * time in the inactive queue. Therefore, we * might as well place them directly in the * laundry queue to reduce queuing overhead. * * Calling vm_page_test_dirty() here would * require acquisition of the object's write * lock. However, during a page shortage, * directing dirty pages into the laundry queue * is only an optimization and not a * requirement. Therefore, we simply rely on * the opportunistic updates to the page's dirty * field by the pmap. */ if (page_shortage <= 0) { nqueue = PQ_INACTIVE; ps_delta = 0; } else if (m->dirty == 0) { nqueue = PQ_INACTIVE; ps_delta = act_scan_laundry_weight; } else { nqueue = PQ_LAUNDRY; ps_delta = 1; } new.flags &= ~PGA_QUEUE_OP_MASK; new.flags |= PGA_REQUEUE; new.queue = nqueue; } } while (!vm_page_pqstate_commit(m, &old, new)); page_shortage -= ps_delta; } vm_pagequeue_lock(pq); TAILQ_REMOVE(&pq->pq_pl, &vmd->vmd_clock[0], plinks.q); TAILQ_INSERT_AFTER(&pq->pq_pl, marker, &vmd->vmd_clock[0], plinks.q); vm_pageout_end_scan(&ss); vm_pagequeue_unlock(pq); } static int vm_pageout_reinsert_inactive_page(struct vm_pagequeue *pq, vm_page_t marker, vm_page_t m) { vm_page_astate_t as; vm_pagequeue_assert_locked(pq); as = vm_page_astate_load(m); if (as.queue != PQ_INACTIVE || (as.flags & PGA_ENQUEUED) != 0) return (0); vm_page_aflag_set(m, PGA_ENQUEUED); TAILQ_INSERT_BEFORE(marker, m, plinks.q); return (1); } /* * Re-add stuck pages to the inactive queue. We will examine them again * during the next scan. If the queue state of a page has changed since * it was physically removed from the page queue in * vm_pageout_collect_batch(), don't do anything with that page. */ static void vm_pageout_reinsert_inactive(struct scan_state *ss, struct vm_batchqueue *bq, vm_page_t m) { struct vm_pagequeue *pq; vm_page_t marker; int delta; delta = 0; marker = ss->marker; pq = ss->pq; if (m != NULL) { if (vm_batchqueue_insert(bq, m)) return; vm_pagequeue_lock(pq); delta += vm_pageout_reinsert_inactive_page(pq, marker, m); } else vm_pagequeue_lock(pq); while ((m = vm_batchqueue_pop(bq)) != NULL) delta += vm_pageout_reinsert_inactive_page(pq, marker, m); vm_pagequeue_cnt_add(pq, delta); vm_pagequeue_unlock(pq); vm_batchqueue_init(bq); } static void vm_pageout_scan_inactive(struct vm_domain *vmd, int page_shortage) { struct timeval start, end; struct scan_state ss; struct vm_batchqueue rq; struct vm_page marker_page; vm_page_t m, marker; struct vm_pagequeue *pq; vm_object_t object; vm_page_astate_t old, new; int act_delta, addl_page_shortage, starting_page_shortage, refs; object = NULL; vm_batchqueue_init(&rq); getmicrouptime(&start); /* * The addl_page_shortage is an estimate of the number of temporarily * stuck pages in the inactive queue. In other words, the * number of pages from the inactive count that should be * discounted in setting the target for the active queue scan. */ addl_page_shortage = 0; /* * Start scanning the inactive queue for pages that we can free. The * scan will stop when we reach the target or we have scanned the * entire queue. (Note that m->a.act_count is not used to make * decisions for the inactive queue, only for the active queue.) */ starting_page_shortage = page_shortage; marker = &marker_page; vm_page_init_marker(marker, PQ_INACTIVE, 0); pq = &vmd->vmd_pagequeues[PQ_INACTIVE]; vm_pagequeue_lock(pq); vm_pageout_init_scan(&ss, pq, marker, NULL, pq->pq_cnt); while (page_shortage > 0 && (m = vm_pageout_next(&ss, true)) != NULL) { KASSERT((m->flags & PG_MARKER) == 0, ("marker page %p was dequeued", m)); /* * Don't touch a page that was removed from the queue after the * page queue lock was released. Otherwise, ensure that any * pending queue operations, such as dequeues for wired pages, * are handled. */ if (vm_pageout_defer(m, PQ_INACTIVE, false)) continue; /* * Lock the page's object. */ if (object == NULL || object != m->object) { if (object != NULL) VM_OBJECT_WUNLOCK(object); object = atomic_load_ptr(&m->object); if (__predict_false(object == NULL)) /* The page is being freed by another thread. */ continue; /* Depends on type-stability. */ VM_OBJECT_WLOCK(object); if (__predict_false(m->object != object)) { VM_OBJECT_WUNLOCK(object); object = NULL; goto reinsert; } } if (vm_page_tryxbusy(m) == 0) { /* * Don't mess with busy pages. Leave them at * the front of the queue. Most likely, they * are being paged out and will leave the * queue shortly after the scan finishes. So, * they ought to be discounted from the * inactive count. */ addl_page_shortage++; goto reinsert; } /* Deferred free of swap space. */ if ((m->a.flags & PGA_SWAP_FREE) != 0) vm_pager_page_unswapped(m); /* * Check for wirings now that we hold the object lock and have * exclusively busied the page. If the page is mapped, it may * still be wired by pmap lookups. The call to * vm_page_try_remove_all() below atomically checks for such * wirings and removes mappings. If the page is unmapped, the * wire count is guaranteed not to increase after this check. */ if (__predict_false(vm_page_wired(m))) goto skip_page; /* * Invalid pages can be easily freed. They cannot be * mapped, vm_page_free() asserts this. */ if (vm_page_none_valid(m)) goto free_page; refs = object->ref_count != 0 ? pmap_ts_referenced(m) : 0; for (old = vm_page_astate_load(m);;) { /* * Check to see if the page has been removed from the * queue since the first such check. Leave it alone if * so, discarding any references collected by * pmap_ts_referenced(). */ if (__predict_false(_vm_page_queue(old) == PQ_NONE)) goto skip_page; new = old; act_delta = refs; if ((old.flags & PGA_REFERENCED) != 0) { new.flags &= ~PGA_REFERENCED; act_delta++; } if (act_delta == 0) { ; } else if (object->ref_count != 0) { /* * Increase the activation count if the * page was referenced while in the * inactive queue. This makes it less * likely that the page will be returned * prematurely to the inactive queue. */ new.act_count += ACT_ADVANCE + act_delta; if (new.act_count > ACT_MAX) new.act_count = ACT_MAX; new.flags &= ~PGA_QUEUE_OP_MASK; new.flags |= PGA_REQUEUE; new.queue = PQ_ACTIVE; if (!vm_page_pqstate_commit(m, &old, new)) continue; VM_CNT_INC(v_reactivated); goto skip_page; } else if ((object->flags & OBJ_DEAD) == 0) { new.queue = PQ_INACTIVE; new.flags |= PGA_REQUEUE; if (!vm_page_pqstate_commit(m, &old, new)) continue; goto skip_page; } break; } /* * If the page appears to be clean at the machine-independent * layer, then remove all of its mappings from the pmap in * anticipation of freeing it. If, however, any of the page's * mappings allow write access, then the page may still be * modified until the last of those mappings are removed. */ if (object->ref_count != 0) { vm_page_test_dirty(m); if (m->dirty == 0 && !vm_page_try_remove_all(m)) goto skip_page; } /* * Clean pages can be freed, but dirty pages must be sent back * to the laundry, unless they belong to a dead object. * Requeueing dirty pages from dead objects is pointless, as * they are being paged out and freed by the thread that * destroyed the object. */ if (m->dirty == 0) { free_page: /* * Now we are guaranteed that no other threads are * manipulating the page, check for a last-second * reference that would save it from doom. */ if (vm_pageout_defer(m, PQ_INACTIVE, false)) goto skip_page; /* * Because we dequeued the page and have already checked * for pending dequeue and enqueue requests, we can * safely disassociate the page from the inactive queue * without holding the queue lock. */ m->a.queue = PQ_NONE; vm_page_free(m); page_shortage--; continue; } if ((object->flags & OBJ_DEAD) == 0) vm_page_launder(m); skip_page: vm_page_xunbusy(m); continue; reinsert: vm_pageout_reinsert_inactive(&ss, &rq, m); } if (object != NULL) VM_OBJECT_WUNLOCK(object); vm_pageout_reinsert_inactive(&ss, &rq, NULL); vm_pageout_reinsert_inactive(&ss, &ss.bq, NULL); vm_pagequeue_lock(pq); vm_pageout_end_scan(&ss); vm_pagequeue_unlock(pq); /* * Record the remaining shortage and the progress and rate it was made. */ atomic_add_int(&vmd->vmd_addl_shortage, addl_page_shortage); getmicrouptime(&end); timevalsub(&end, &start); atomic_add_int(&vmd->vmd_inactive_us, end.tv_sec * 1000000 + end.tv_usec); atomic_add_int(&vmd->vmd_inactive_freed, starting_page_shortage - page_shortage); } /* * Dispatch a number of inactive threads according to load and collect the * results to prevent a coherent (CEM: incoherent?) view of paging activity on * this domain. */ static int vm_pageout_inactive_dispatch(struct vm_domain *vmd, int shortage) { u_int freed, pps, threads, us; vmd->vmd_inactive_shortage = shortage; /* * If we have more work than we can do in a quarter of our interval, we * fire off multiple threads to process it. */ if (vmd->vmd_inactive_threads > 1 && vmd->vmd_inactive_pps != 0 && shortage > vmd->vmd_inactive_pps / VM_INACT_SCAN_RATE / 4) { threads = vmd->vmd_inactive_threads; vm_domain_pageout_lock(vmd); vmd->vmd_inactive_shortage /= threads; blockcount_acquire(&vmd->vmd_inactive_starting, threads - 1); blockcount_acquire(&vmd->vmd_inactive_running, threads - 1); wakeup(&vmd->vmd_inactive_shortage); vm_domain_pageout_unlock(vmd); } /* Run the local thread scan. */ vm_pageout_scan_inactive(vmd, vmd->vmd_inactive_shortage); /* * Block until helper threads report results and then accumulate * totals. */ blockcount_wait(&vmd->vmd_inactive_running, NULL, "vmpoid", PVM); freed = atomic_readandclear_int(&vmd->vmd_inactive_freed); VM_CNT_ADD(v_dfree, freed); /* * Calculate the per-thread paging rate with an exponential decay of * prior results. Careful to avoid integer rounding errors with large * us values. */ us = max(atomic_readandclear_int(&vmd->vmd_inactive_us), 1); if (us > 1000000) /* Keep rounding to tenths */ pps = (freed * 10) / ((us * 10) / 1000000); else pps = (1000000 / us) * freed; vmd->vmd_inactive_pps = (vmd->vmd_inactive_pps / 2) + (pps / 2); return (shortage - freed); } /* * Attempt to reclaim the requested number of pages from the inactive queue. * Returns true if the shortage was addressed. */ static int vm_pageout_inactive(struct vm_domain *vmd, int shortage, int *addl_shortage) { struct vm_pagequeue *pq; u_int addl_page_shortage, deficit, page_shortage; u_int starting_page_shortage; /* * vmd_pageout_deficit counts the number of pages requested in * allocations that failed because of a free page shortage. We assume * that the allocations will be reattempted and thus include the deficit * in our scan target. */ deficit = atomic_readandclear_int(&vmd->vmd_pageout_deficit); starting_page_shortage = shortage + deficit; /* * Run the inactive scan on as many threads as is necessary. */ page_shortage = vm_pageout_inactive_dispatch(vmd, starting_page_shortage); addl_page_shortage = atomic_readandclear_int(&vmd->vmd_addl_shortage); /* * Wake up the laundry thread so that it can perform any needed * laundering. If we didn't meet our target, we're in shortfall and * need to launder more aggressively. If PQ_LAUNDRY is empty and no * swap devices are configured, the laundry thread has no work to do, so * don't bother waking it up. * * The laundry thread uses the number of inactive queue scans elapsed * since the last laundering to determine whether to launder again, so * keep count. */ if (starting_page_shortage > 0) { pq = &vmd->vmd_pagequeues[PQ_LAUNDRY]; vm_pagequeue_lock(pq); if (vmd->vmd_laundry_request == VM_LAUNDRY_IDLE && (pq->pq_cnt > 0 || atomic_load_acq_int(&swapdev_enabled))) { if (page_shortage > 0) { vmd->vmd_laundry_request = VM_LAUNDRY_SHORTFALL; VM_CNT_INC(v_pdshortfalls); } else if (vmd->vmd_laundry_request != VM_LAUNDRY_SHORTFALL) vmd->vmd_laundry_request = VM_LAUNDRY_BACKGROUND; wakeup(&vmd->vmd_laundry_request); } vmd->vmd_clean_pages_freed += starting_page_shortage - page_shortage; vm_pagequeue_unlock(pq); } /* * Wakeup the swapout daemon if we didn't free the targeted number of * pages. */ if (page_shortage > 0) vm_swapout_run(); /* * If the inactive queue scan fails repeatedly to meet its * target, kill the largest process. */ vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage); /* * Reclaim pages by swapping out idle processes, if configured to do so. */ vm_swapout_run_idle(); /* * See the description of addl_page_shortage above. */ *addl_shortage = addl_page_shortage + deficit; return (page_shortage <= 0); } static int vm_pageout_oom_vote; /* * The pagedaemon threads randlomly select one to perform the * OOM. Trying to kill processes before all pagedaemons * failed to reach free target is premature. */ static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage, int starting_page_shortage) { int old_vote; if (starting_page_shortage <= 0 || starting_page_shortage != page_shortage) vmd->vmd_oom_seq = 0; else vmd->vmd_oom_seq++; if (vmd->vmd_oom_seq < vm_pageout_oom_seq) { if (vmd->vmd_oom) { vmd->vmd_oom = FALSE; atomic_subtract_int(&vm_pageout_oom_vote, 1); } return; } /* * Do not follow the call sequence until OOM condition is * cleared. */ vmd->vmd_oom_seq = 0; if (vmd->vmd_oom) return; vmd->vmd_oom = TRUE; old_vote = atomic_fetchadd_int(&vm_pageout_oom_vote, 1); if (old_vote != vm_ndomains - 1) return; /* * The current pagedaemon thread is the last in the quorum to * start OOM. Initiate the selection and signaling of the * victim. */ vm_pageout_oom(VM_OOM_MEM); /* * After one round of OOM terror, recall our vote. On the * next pass, current pagedaemon would vote again if the low * memory condition is still there, due to vmd_oom being * false. */ vmd->vmd_oom = FALSE; atomic_subtract_int(&vm_pageout_oom_vote, 1); } /* * The OOM killer is the page daemon's action of last resort when * memory allocation requests have been stalled for a prolonged period * of time because it cannot reclaim memory. This function computes * the approximate number of physical pages that could be reclaimed if * the specified address space is destroyed. * * Private, anonymous memory owned by the address space is the * principal resource that we expect to recover after an OOM kill. * Since the physical pages mapped by the address space's COW entries * are typically shared pages, they are unlikely to be released and so * they are not counted. * * To get to the point where the page daemon runs the OOM killer, its * efforts to write-back vnode-backed pages may have stalled. This * could be caused by a memory allocation deadlock in the write path * that might be resolved by an OOM kill. Therefore, physical pages * belonging to vnode-backed objects are counted, because they might * be freed without being written out first if the address space holds * the last reference to an unlinked vnode. * * Similarly, physical pages belonging to OBJT_PHYS objects are * counted because the address space might hold the last reference to * the object. */ static long vm_pageout_oom_pagecount(struct vmspace *vmspace) { vm_map_t map; vm_map_entry_t entry; vm_object_t obj; long res; map = &vmspace->vm_map; KASSERT(!map->system_map, ("system map")); sx_assert(&map->lock, SA_LOCKED); res = 0; VM_MAP_ENTRY_FOREACH(entry, map) { if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) continue; obj = entry->object.vm_object; if (obj == NULL) continue; if ((entry->eflags & MAP_ENTRY_NEEDS_COPY) != 0 && obj->ref_count != 1) continue; switch (obj->type) { case OBJT_DEFAULT: case OBJT_SWAP: case OBJT_PHYS: case OBJT_VNODE: res += obj->resident_page_count; break; } } return (res); } static int vm_oom_ratelim_last; static int vm_oom_pf_secs = 10; SYSCTL_INT(_vm, OID_AUTO, oom_pf_secs, CTLFLAG_RWTUN, &vm_oom_pf_secs, 0, ""); static struct mtx vm_oom_ratelim_mtx; void vm_pageout_oom(int shortage) { struct proc *p, *bigproc; vm_offset_t size, bigsize; struct thread *td; struct vmspace *vm; int now; bool breakout; /* * For OOM requests originating from vm_fault(), there is a high * chance that a single large process faults simultaneously in * several threads. Also, on an active system running many * processes of middle-size, like buildworld, all of them * could fault almost simultaneously as well. * * To avoid killing too many processes, rate-limit OOMs * initiated by vm_fault() time-outs on the waits for free * pages. */ mtx_lock(&vm_oom_ratelim_mtx); now = ticks; if (shortage == VM_OOM_MEM_PF && (u_int)(now - vm_oom_ratelim_last) < hz * vm_oom_pf_secs) { mtx_unlock(&vm_oom_ratelim_mtx); return; } vm_oom_ratelim_last = now; mtx_unlock(&vm_oom_ratelim_mtx); /* * We keep the process bigproc locked once we find it to keep anyone * from messing with it; however, there is a possibility of * deadlock if process B is bigproc and one of its child processes * attempts to propagate a signal to B while we are waiting for A's * lock while walking this list. To avoid this, we don't block on * the process lock but just skip a process if it is already locked. */ bigproc = NULL; bigsize = 0; sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { PROC_LOCK(p); /* * If this is a system, protected or killed process, skip it. */ if (p->p_state != PRS_NORMAL || (p->p_flag & (P_INEXEC | P_PROTECTED | P_SYSTEM | P_WEXIT)) != 0 || p->p_pid == 1 || P_KILLED(p) || (p->p_pid < 48 && swap_pager_avail != 0)) { PROC_UNLOCK(p); continue; } /* * If the process is in a non-running type state, * don't touch it. Check all the threads individually. */ breakout = false; FOREACH_THREAD_IN_PROC(p, td) { thread_lock(td); if (!TD_ON_RUNQ(td) && !TD_IS_RUNNING(td) && !TD_IS_SLEEPING(td) && !TD_IS_SUSPENDED(td) && !TD_IS_SWAPPED(td)) { thread_unlock(td); breakout = true; break; } thread_unlock(td); } if (breakout) { PROC_UNLOCK(p); continue; } /* * get the process size */ vm = vmspace_acquire_ref(p); if (vm == NULL) { PROC_UNLOCK(p); continue; } _PHOLD_LITE(p); PROC_UNLOCK(p); sx_sunlock(&allproc_lock); if (!vm_map_trylock_read(&vm->vm_map)) { vmspace_free(vm); sx_slock(&allproc_lock); PRELE(p); continue; } size = vmspace_swap_count(vm); if (shortage == VM_OOM_MEM || shortage == VM_OOM_MEM_PF) size += vm_pageout_oom_pagecount(vm); vm_map_unlock_read(&vm->vm_map); vmspace_free(vm); sx_slock(&allproc_lock); /* * If this process is bigger than the biggest one, * remember it. */ if (size > bigsize) { if (bigproc != NULL) PRELE(bigproc); bigproc = p; bigsize = size; } else { PRELE(p); } } sx_sunlock(&allproc_lock); if (bigproc != NULL) { if (vm_panic_on_oom != 0 && --vm_panic_on_oom == 0) panic("out of swap space"); PROC_LOCK(bigproc); killproc(bigproc, "out of swap space"); sched_nice(bigproc, PRIO_MIN); _PRELE(bigproc); PROC_UNLOCK(bigproc); } } /* * Signal a free page shortage to subsystems that have registered an event * handler. Reclaim memory from UMA in the event of a severe shortage. * Return true if the free page count should be re-evaluated. */ static bool vm_pageout_lowmem(void) { static int lowmem_ticks = 0; int last; bool ret; ret = false; last = atomic_load_int(&lowmem_ticks); while ((u_int)(ticks - last) / hz >= lowmem_period) { if (atomic_fcmpset_int(&lowmem_ticks, &last, ticks) == 0) continue; /* * Decrease registered cache sizes. */ SDT_PROBE0(vm, , , vm__lowmem_scan); EVENTHANDLER_INVOKE(vm_lowmem, VM_LOW_PAGES); /* * We do this explicitly after the caches have been * drained above. */ uma_reclaim(UMA_RECLAIM_TRIM); ret = true; break; } /* * Kick off an asynchronous reclaim of cached memory if one of the * page daemons is failing to keep up with demand. Use the "severe" * threshold instead of "min" to ensure that we do not blow away the * caches if a subset of the NUMA domains are depleted by kernel memory * allocations; the domainset iterators automatically skip domains * below the "min" threshold on the first pass. * * UMA reclaim worker has its own rate-limiting mechanism, so don't * worry about kicking it too often. */ if (vm_page_count_severe()) uma_reclaim_wakeup(); return (ret); } static void vm_pageout_worker(void *arg) { struct vm_domain *vmd; u_int ofree; int addl_shortage, domain, shortage; bool target_met; domain = (uintptr_t)arg; vmd = VM_DOMAIN(domain); shortage = 0; target_met = true; /* * XXXKIB It could be useful to bind pageout daemon threads to * the cores belonging to the domain, from which vm_page_array * is allocated. */ KASSERT(vmd->vmd_segs != 0, ("domain without segments")); vmd->vmd_last_active_scan = ticks; /* * The pageout daemon worker is never done, so loop forever. */ while (TRUE) { vm_domain_pageout_lock(vmd); /* * We need to clear wanted before we check the limits. This * prevents races with wakers who will check wanted after they * reach the limit. */ atomic_store_int(&vmd->vmd_pageout_wanted, 0); /* * Might the page daemon need to run again? */ if (vm_paging_needed(vmd, vmd->vmd_free_count)) { /* * Yes. If the scan failed to produce enough free * pages, sleep uninterruptibly for some time in the * hope that the laundry thread will clean some pages. */ vm_domain_pageout_unlock(vmd); if (!target_met) pause("pwait", hz / VM_INACT_SCAN_RATE); } else { /* * No, sleep until the next wakeup or until pages * need to have their reference stats updated. */ if (mtx_sleep(&vmd->vmd_pageout_wanted, vm_domain_pageout_lockptr(vmd), PDROP | PVM, "psleep", hz / VM_INACT_SCAN_RATE) == 0) VM_CNT_INC(v_pdwakeups); } /* Prevent spurious wakeups by ensuring that wanted is set. */ atomic_store_int(&vmd->vmd_pageout_wanted, 1); /* * Use the controller to calculate how many pages to free in * this interval, and scan the inactive queue. If the lowmem * handlers appear to have freed up some pages, subtract the * difference from the inactive queue scan target. */ shortage = pidctrl_daemon(&vmd->vmd_pid, vmd->vmd_free_count); if (shortage > 0) { ofree = vmd->vmd_free_count; if (vm_pageout_lowmem() && vmd->vmd_free_count > ofree) shortage -= min(vmd->vmd_free_count - ofree, (u_int)shortage); target_met = vm_pageout_inactive(vmd, shortage, &addl_shortage); } else addl_shortage = 0; /* * Scan the active queue. A positive value for shortage * indicates that we must aggressively deactivate pages to avoid * a shortfall. */ shortage = vm_pageout_active_target(vmd) + addl_shortage; vm_pageout_scan_active(vmd, shortage); } } /* * vm_pageout_helper runs additional pageout daemons in times of high paging * activity. */ static void vm_pageout_helper(void *arg) { struct vm_domain *vmd; int domain; domain = (uintptr_t)arg; vmd = VM_DOMAIN(domain); vm_domain_pageout_lock(vmd); for (;;) { msleep(&vmd->vmd_inactive_shortage, vm_domain_pageout_lockptr(vmd), PVM, "psleep", 0); blockcount_release(&vmd->vmd_inactive_starting, 1); vm_domain_pageout_unlock(vmd); vm_pageout_scan_inactive(vmd, vmd->vmd_inactive_shortage); vm_domain_pageout_lock(vmd); /* * Release the running count while the pageout lock is held to * prevent wakeup races. */ blockcount_release(&vmd->vmd_inactive_running, 1); } } static int get_pageout_threads_per_domain(void) { static bool resolved = false; int half_cpus_per_dom; /* * This is serialized externally by the sorted autoconfig portion of * boot. */ if (__predict_true(resolved)) return (pageout_threads_per_domain); /* * Semi-arbitrarily constrain pagedaemon threads to less than half the * total number of threads in the system as an insane upper limit. */ half_cpus_per_dom = howmany(mp_ncpus / vm_ndomains, 2); if (pageout_threads_per_domain < 1) { printf("Invalid tuneable vm.pageout_threads_per_domain value: " "%d out of valid range: [1-%d]; clamping to 1\n", pageout_threads_per_domain, half_cpus_per_dom); pageout_threads_per_domain = 1; } else if (pageout_threads_per_domain > half_cpus_per_dom) { printf("Invalid tuneable vm.pageout_threads_per_domain value: " "%d out of valid range: [1-%d]; clamping to %d\n", pageout_threads_per_domain, half_cpus_per_dom, half_cpus_per_dom); pageout_threads_per_domain = half_cpus_per_dom; } resolved = true; return (pageout_threads_per_domain); } /* * Initialize basic pageout daemon settings. See the comment above the * definition of vm_domain for some explanation of how these thresholds are * used. */ static void vm_pageout_init_domain(int domain) { struct vm_domain *vmd; struct sysctl_oid *oid; vmd = VM_DOMAIN(domain); vmd->vmd_interrupt_free_min = 2; /* * v_free_reserved needs to include enough for the largest * swap pager structures plus enough for any pv_entry structs * when paging. */ vmd->vmd_pageout_free_min = 2 * MAXBSIZE / PAGE_SIZE + vmd->vmd_interrupt_free_min; vmd->vmd_free_reserved = vm_pageout_page_count + vmd->vmd_pageout_free_min + vmd->vmd_page_count / 768; vmd->vmd_free_min = vmd->vmd_page_count / 200; vmd->vmd_free_severe = vmd->vmd_free_min / 2; vmd->vmd_free_target = 4 * vmd->vmd_free_min + vmd->vmd_free_reserved; vmd->vmd_free_min += vmd->vmd_free_reserved; vmd->vmd_free_severe += vmd->vmd_free_reserved; vmd->vmd_inactive_target = (3 * vmd->vmd_free_target) / 2; if (vmd->vmd_inactive_target > vmd->vmd_free_count / 3) vmd->vmd_inactive_target = vmd->vmd_free_count / 3; /* * Set the default wakeup threshold to be 10% below the paging * target. This keeps the steady state out of shortfall. */ vmd->vmd_pageout_wakeup_thresh = (vmd->vmd_free_target / 10) * 9; /* * Target amount of memory to move out of the laundry queue during a * background laundering. This is proportional to the amount of system * memory. */ vmd->vmd_background_launder_target = (vmd->vmd_free_target - vmd->vmd_free_min) / 10; /* Initialize the pageout daemon pid controller. */ pidctrl_init(&vmd->vmd_pid, hz / VM_INACT_SCAN_RATE, vmd->vmd_free_target, PIDCTRL_BOUND, PIDCTRL_KPD, PIDCTRL_KID, PIDCTRL_KDD); oid = SYSCTL_ADD_NODE(NULL, SYSCTL_CHILDREN(vmd->vmd_oid), OID_AUTO, "pidctrl", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, ""); pidctrl_init_sysctl(&vmd->vmd_pid, SYSCTL_CHILDREN(oid)); vmd->vmd_inactive_threads = get_pageout_threads_per_domain(); } static void vm_pageout_init(void) { u_int freecount; int i; /* * Initialize some paging parameters. */ if (vm_cnt.v_page_count < 2000) vm_pageout_page_count = 8; freecount = 0; for (i = 0; i < vm_ndomains; i++) { struct vm_domain *vmd; vm_pageout_init_domain(i); vmd = VM_DOMAIN(i); vm_cnt.v_free_reserved += vmd->vmd_free_reserved; vm_cnt.v_free_target += vmd->vmd_free_target; vm_cnt.v_free_min += vmd->vmd_free_min; vm_cnt.v_inactive_target += vmd->vmd_inactive_target; vm_cnt.v_pageout_free_min += vmd->vmd_pageout_free_min; vm_cnt.v_interrupt_free_min += vmd->vmd_interrupt_free_min; vm_cnt.v_free_severe += vmd->vmd_free_severe; freecount += vmd->vmd_free_count; } /* * Set interval in seconds for active scan. We want to visit each * page at least once every ten minutes. This is to prevent worst * case paging behaviors with stale active LRU. */ if (vm_pageout_update_period == 0) vm_pageout_update_period = 600; if (vm_page_max_user_wired == 0) vm_page_max_user_wired = freecount / 3; } /* * vm_pageout is the high level pageout daemon. */ static void vm_pageout(void) { struct proc *p; struct thread *td; int error, first, i, j, pageout_threads; p = curproc; td = curthread; pageout_threads = get_pageout_threads_per_domain(); mtx_init(&vm_oom_ratelim_mtx, "vmoomr", NULL, MTX_DEF); swap_pager_swap_init(); for (first = -1, i = 0; i < vm_ndomains; i++) { if (VM_DOMAIN_EMPTY(i)) { if (bootverbose) printf("domain %d empty; skipping pageout\n", i); continue; } if (first == -1) first = i; else { error = kthread_add(vm_pageout_worker, (void *)(uintptr_t)i, p, NULL, 0, 0, "dom%d", i); if (error != 0) panic("starting pageout for domain %d: %d\n", i, error); } for (j = 0; j < pageout_threads - 1; j++) { error = kthread_add(vm_pageout_helper, (void *)(uintptr_t)i, p, NULL, 0, 0, "dom%d helper%d", i, j); if (error != 0) panic("starting pageout helper %d for domain " "%d: %d\n", j, i, error); } error = kthread_add(vm_pageout_laundry_worker, (void *)(uintptr_t)i, p, NULL, 0, 0, "laundry: dom%d", i); if (error != 0) panic("starting laundry for domain %d: %d", i, error); } error = kthread_add(uma_reclaim_worker, NULL, p, NULL, 0, 0, "uma"); if (error != 0) panic("starting uma_reclaim helper, error %d\n", error); snprintf(td->td_name, sizeof(td->td_name), "dom%d", first); vm_pageout_worker((void *)(uintptr_t)first); } /* * Perform an advisory wakeup of the page daemon. */ void pagedaemon_wakeup(int domain) { struct vm_domain *vmd; vmd = VM_DOMAIN(domain); vm_domain_pageout_assert_unlocked(vmd); if (curproc == pageproc) return; if (atomic_fetchadd_int(&vmd->vmd_pageout_wanted, 1) == 0) { vm_domain_pageout_lock(vmd); atomic_store_int(&vmd->vmd_pageout_wanted, 1); wakeup(&vmd->vmd_pageout_wanted); vm_domain_pageout_unlock(vmd); } } Index: projects/clang1100-import =================================================================== --- projects/clang1100-import (revision 364278) +++ projects/clang1100-import (revision 364279) Property changes on: projects/clang1100-import ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r364264-364278