Index: head/lib/libnetbsd/sys/time.h =================================================================== --- head/lib/libnetbsd/sys/time.h (revision 336913) +++ head/lib/libnetbsd/sys/time.h (nonexistent) @@ -1,65 +0,0 @@ -/* $FreeBSD$ */ - -/* - * Copyright (c) 1982, 1986, 1993 - * The Regents of the University of California. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)time.h 8.5 (Berkeley) 5/4/95 - */ - -#ifndef _LIBNETBSD_SYS_TIME_H_ -#define _LIBNETBSD_SYS_TIME_H_ - -#include_next - -/* Operations on timespecs. */ -#define timespecclear(tsp) (tsp)->tv_sec = (time_t)((tsp)->tv_nsec = 0L) -#define timespecisset(tsp) ((tsp)->tv_sec || (tsp)->tv_nsec) -#define timespeccmp(tsp, usp, cmp) \ - (((tsp)->tv_sec == (usp)->tv_sec) ? \ - ((tsp)->tv_nsec cmp (usp)->tv_nsec) : \ - ((tsp)->tv_sec cmp (usp)->tv_sec)) -#define timespecadd(tsp, usp, vsp) \ - do { \ - (vsp)->tv_sec = (tsp)->tv_sec + (usp)->tv_sec; \ - (vsp)->tv_nsec = (tsp)->tv_nsec + (usp)->tv_nsec; \ - if ((vsp)->tv_nsec >= 1000000000L) { \ - (vsp)->tv_sec++; \ - (vsp)->tv_nsec -= 1000000000L; \ - } \ - } while (/* CONSTCOND */ 0) -#define timespecsub(tsp, usp, vsp) \ - do { \ - (vsp)->tv_sec = (tsp)->tv_sec - (usp)->tv_sec; \ - (vsp)->tv_nsec = (tsp)->tv_nsec - (usp)->tv_nsec; \ - if ((vsp)->tv_nsec < 0) { \ - (vsp)->tv_sec--; \ - (vsp)->tv_nsec += 1000000000L; \ - } \ - } while (/* CONSTCOND */ 0) - -#endif Property changes on: head/lib/libnetbsd/sys/time.h ___________________________________________________________________ Deleted: svn:eol-style ## -1 +0,0 ## -native \ No newline at end of property Deleted: svn:keywords ## -1 +0,0 ## -FreeBSD=%H \ No newline at end of property Deleted: svn:mime-type ## -1 +0,0 ## -text/plain \ No newline at end of property Index: head/sbin/fsck_ffs/fsutil.c =================================================================== --- head/sbin/fsck_ffs/fsutil.c (revision 336913) +++ head/sbin/fsck_ffs/fsutil.c (revision 336914) @@ -1,1059 +1,1038 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #if 0 #ifndef lint static const char sccsid[] = "@(#)utilities.c 8.6 (Berkeley) 5/19/95"; #endif /* not lint */ #endif #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fsck.h" static void slowio_start(void); static void slowio_end(void); static void printIOstats(void); static long diskreads, totaldiskreads, totalreads; /* Disk cache statistics */ static struct timespec startpass, finishpass; struct timeval slowio_starttime; int slowio_delay_usec = 10000; /* Initial IO delay for background fsck */ int slowio_pollcnt; static struct bufarea cgblk; /* backup buffer for cylinder group blocks */ static TAILQ_HEAD(buflist, bufarea) bufhead; /* head of buffer cache list */ static int numbufs; /* size of buffer cache */ static char *buftype[BT_NUMBUFTYPES] = BT_NAMES; static struct bufarea *cgbufs; /* header for cylinder group cache */ static int flushtries; /* number of tries to reclaim memory */ void fsutilinit(void) { diskreads = totaldiskreads = totalreads = 0; bzero(&startpass, sizeof(struct timespec)); bzero(&finishpass, sizeof(struct timespec)); bzero(&slowio_starttime, sizeof(struct timeval)); slowio_delay_usec = 10000; slowio_pollcnt = 0; bzero(&cgblk, sizeof(struct bufarea)); TAILQ_INIT(&bufhead); numbufs = 0; /* buftype ? */ cgbufs = NULL; flushtries = 0; } int ftypeok(union dinode *dp) { switch (DIP(dp, di_mode) & IFMT) { case IFDIR: case IFREG: case IFBLK: case IFCHR: case IFLNK: case IFSOCK: case IFIFO: return (1); default: if (debug) printf("bad file type 0%o\n", DIP(dp, di_mode)); return (0); } } int reply(const char *question) { int persevere; char c; if (preen) pfatal("INTERNAL ERROR: GOT TO reply()"); persevere = !strcmp(question, "CONTINUE"); printf("\n"); if (!persevere && (nflag || (fswritefd < 0 && bkgrdflag == 0))) { printf("%s? no\n\n", question); resolved = 0; return (0); } if (yflag || (persevere && nflag)) { printf("%s? yes\n\n", question); return (1); } do { printf("%s? [yn] ", question); (void) fflush(stdout); c = getc(stdin); while (c != '\n' && getc(stdin) != '\n') { if (feof(stdin)) { resolved = 0; return (0); } } } while (c != 'y' && c != 'Y' && c != 'n' && c != 'N'); printf("\n"); if (c == 'y' || c == 'Y') return (1); resolved = 0; return (0); } /* * Look up state information for an inode. */ struct inostat * inoinfo(ino_t inum) { static struct inostat unallocated = { USTATE, 0, 0 }; struct inostatlist *ilp; int iloff; if (inum > maxino) errx(EEXIT, "inoinfo: inumber %ju out of range", (uintmax_t)inum); ilp = &inostathead[inum / sblock.fs_ipg]; iloff = inum % sblock.fs_ipg; if (iloff >= ilp->il_numalloced) return (&unallocated); return (&ilp->il_stat[iloff]); } /* * Malloc buffers and set up cache. */ void bufinit(void) { struct bufarea *bp; long bufcnt, i; char *bufp; pbp = pdirbp = (struct bufarea *)0; bufp = Malloc((unsigned int)sblock.fs_bsize); if (bufp == NULL) errx(EEXIT, "cannot allocate buffer pool"); cgblk.b_un.b_buf = bufp; initbarea(&cgblk, BT_CYLGRP); TAILQ_INIT(&bufhead); bufcnt = MAXBUFS; if (bufcnt < MINBUFS) bufcnt = MINBUFS; for (i = 0; i < bufcnt; i++) { bp = (struct bufarea *)Malloc(sizeof(struct bufarea)); bufp = Malloc((unsigned int)sblock.fs_bsize); if (bp == NULL || bufp == NULL) { if (i >= MINBUFS) break; errx(EEXIT, "cannot allocate buffer pool"); } bp->b_un.b_buf = bufp; TAILQ_INSERT_HEAD(&bufhead, bp, b_list); initbarea(bp, BT_UNKNOWN); } numbufs = i; /* save number of buffers */ for (i = 0; i < BT_NUMBUFTYPES; i++) { readtime[i].tv_sec = totalreadtime[i].tv_sec = 0; readtime[i].tv_nsec = totalreadtime[i].tv_nsec = 0; readcnt[i] = totalreadcnt[i] = 0; } } /* * Manage cylinder group buffers. */ static struct bufarea *cgbufs; /* header for cylinder group cache */ static int flushtries; /* number of tries to reclaim memory */ struct bufarea * cglookup(int cg) { struct bufarea *cgbp; struct cg *cgp; if (cgbufs == NULL) { cgbufs = calloc(sblock.fs_ncg, sizeof(struct bufarea)); if (cgbufs == NULL) errx(EEXIT, "cannot allocate cylinder group buffers"); } cgbp = &cgbufs[cg]; if (cgbp->b_un.b_cg != NULL) return (cgbp); cgp = NULL; if (flushtries == 0) cgp = malloc((unsigned int)sblock.fs_cgsize); if (cgp == NULL) { getblk(&cgblk, cgtod(&sblock, cg), sblock.fs_cgsize); return (&cgblk); } cgbp->b_un.b_cg = cgp; initbarea(cgbp, BT_CYLGRP); getblk(cgbp, cgtod(&sblock, cg), sblock.fs_cgsize); return (cgbp); } /* * Attempt to flush a cylinder group cache entry. * Return whether the flush was successful. */ int flushentry(void) { struct bufarea *cgbp; if (flushtries == sblock.fs_ncg || cgbufs == NULL) return (0); cgbp = &cgbufs[flushtries++]; if (cgbp->b_un.b_cg == NULL) return (0); flush(fswritefd, cgbp); free(cgbp->b_un.b_buf); cgbp->b_un.b_buf = NULL; return (1); } /* * Manage a cache of directory blocks. */ struct bufarea * getdatablk(ufs2_daddr_t blkno, long size, int type) { struct bufarea *bp; TAILQ_FOREACH(bp, &bufhead, b_list) if (bp->b_bno == fsbtodb(&sblock, blkno)) goto foundit; TAILQ_FOREACH_REVERSE(bp, &bufhead, buflist, b_list) if ((bp->b_flags & B_INUSE) == 0) break; if (bp == NULL) errx(EEXIT, "deadlocked buffer pool"); bp->b_type = type; getblk(bp, blkno, size); /* fall through */ foundit: if (debug && bp->b_type != type) printf("Buffer type changed from %s to %s\n", buftype[bp->b_type], buftype[type]); TAILQ_REMOVE(&bufhead, bp, b_list); TAILQ_INSERT_HEAD(&bufhead, bp, b_list); bp->b_flags |= B_INUSE; return (bp); } -/* - * Timespec operations (from ). - */ -#define timespecsub(vvp, uvp) \ - do { \ - (vvp)->tv_sec -= (uvp)->tv_sec; \ - (vvp)->tv_nsec -= (uvp)->tv_nsec; \ - if ((vvp)->tv_nsec < 0) { \ - (vvp)->tv_sec--; \ - (vvp)->tv_nsec += 1000000000; \ - } \ - } while (0) -#define timespecadd(vvp, uvp) \ - do { \ - (vvp)->tv_sec += (uvp)->tv_sec; \ - (vvp)->tv_nsec += (uvp)->tv_nsec; \ - if ((vvp)->tv_nsec >= 1000000000) { \ - (vvp)->tv_sec++; \ - (vvp)->tv_nsec -= 1000000000; \ - } \ - } while (0) - void getblk(struct bufarea *bp, ufs2_daddr_t blk, long size) { ufs2_daddr_t dblk; struct timespec start, finish; dblk = fsbtodb(&sblock, blk); if (bp->b_bno == dblk) { totalreads++; } else { flush(fswritefd, bp); if (debug) { readcnt[bp->b_type]++; clock_gettime(CLOCK_REALTIME_PRECISE, &start); } bp->b_errs = blread(fsreadfd, bp->b_un.b_buf, dblk, size); if (debug) { clock_gettime(CLOCK_REALTIME_PRECISE, &finish); - timespecsub(&finish, &start); - timespecadd(&readtime[bp->b_type], &finish); + timespecsub(&finish, &start, &finish); + timespecadd(&readtime[bp->b_type], &finish, + &readtime[bp->b_type]); } bp->b_bno = dblk; bp->b_size = size; } } void flush(int fd, struct bufarea *bp) { if (!bp->b_dirty) return; bp->b_dirty = 0; if (fswritefd < 0) { pfatal("WRITING IN READ_ONLY MODE.\n"); return; } if (bp->b_errs != 0) pfatal("WRITING %sZERO'ED BLOCK %lld TO DISK\n", (bp->b_errs == bp->b_size / dev_bsize) ? "" : "PARTIALLY ", (long long)bp->b_bno); bp->b_errs = 0; /* * Write using the appropriate function. */ switch (bp->b_type) { case BT_SUPERBLK: if (bp != &sblk) pfatal("BUFFER %p DOES NOT MATCH SBLK %p\n", bp, &sblk); if (sbput(fd, (struct fs *)bp->b_un.b_buf, 0) == 0) fsmodified = 1; break; case BT_CYLGRP: if (cgput(&disk, (struct cg *)bp->b_un.b_buf) == 0) fsmodified = 1; break; default: blwrite(fd, bp->b_un.b_buf, bp->b_bno, bp->b_size); break; } } void rwerror(const char *mesg, ufs2_daddr_t blk) { if (bkgrdcheck) exit(EEXIT); if (preen == 0) printf("\n"); pfatal("CANNOT %s: %ld", mesg, (long)blk); if (reply("CONTINUE") == 0) exit(EEXIT); } void ckfini(int markclean) { struct bufarea *bp, *nbp; int ofsmodified, cnt; if (bkgrdflag) { unlink(snapname); if ((!(sblock.fs_flags & FS_UNCLEAN)) != markclean) { cmd.value = FS_UNCLEAN; cmd.size = markclean ? -1 : 1; if (sysctlbyname("vfs.ffs.setflags", 0, 0, &cmd, sizeof cmd) == -1) rwerror("SET FILE SYSTEM FLAGS", FS_UNCLEAN); if (!preen) { printf("\n***** FILE SYSTEM MARKED %s *****\n", markclean ? "CLEAN" : "DIRTY"); if (!markclean) rerun = 1; } } else if (!preen && !markclean) { printf("\n***** FILE SYSTEM STILL DIRTY *****\n"); rerun = 1; } } if (debug && totalreads > 0) printf("cache with %d buffers missed %ld of %ld (%d%%)\n", numbufs, totaldiskreads, totalreads, (int)(totaldiskreads * 100 / totalreads)); if (fswritefd < 0) { (void)close(fsreadfd); return; } flush(fswritefd, &sblk); if (havesb && cursnapshot == 0 && sblock.fs_magic == FS_UFS2_MAGIC && sblk.b_bno != sblock.fs_sblockloc / dev_bsize && !preen && reply("UPDATE STANDARD SUPERBLOCK")) { /* Change the write destination to standard superblock */ sblock.fs_sblockactualloc = sblock.fs_sblockloc; sblk.b_bno = sblock.fs_sblockloc / dev_bsize; sbdirty(); flush(fswritefd, &sblk); } flush(fswritefd, &cgblk); free(cgblk.b_un.b_buf); cnt = 0; TAILQ_FOREACH_REVERSE_SAFE(bp, &bufhead, buflist, b_list, nbp) { TAILQ_REMOVE(&bufhead, bp, b_list); cnt++; flush(fswritefd, bp); free(bp->b_un.b_buf); free((char *)bp); } if (numbufs != cnt) errx(EEXIT, "panic: lost %d buffers", numbufs - cnt); if (cgbufs != NULL) { for (cnt = 0; cnt < sblock.fs_ncg; cnt++) { if (cgbufs[cnt].b_un.b_cg == NULL) continue; flush(fswritefd, &cgbufs[cnt]); free(cgbufs[cnt].b_un.b_cg); } free(cgbufs); } pbp = pdirbp = (struct bufarea *)0; if (cursnapshot == 0 && sblock.fs_clean != markclean) { if ((sblock.fs_clean = markclean) != 0) { sblock.fs_flags &= ~(FS_UNCLEAN | FS_NEEDSFSCK); sblock.fs_pendingblocks = 0; sblock.fs_pendinginodes = 0; } sbdirty(); ofsmodified = fsmodified; flush(fswritefd, &sblk); fsmodified = ofsmodified; if (!preen) { printf("\n***** FILE SYSTEM MARKED %s *****\n", markclean ? "CLEAN" : "DIRTY"); if (!markclean) rerun = 1; } } else if (!preen) { if (markclean) { printf("\n***** FILE SYSTEM IS CLEAN *****\n"); } else { printf("\n***** FILE SYSTEM STILL DIRTY *****\n"); rerun = 1; } } (void)close(fsreadfd); (void)close(fswritefd); } /* * Print out I/O statistics. */ void IOstats(char *what) { int i; if (debug == 0) return; if (diskreads == 0) { printf("%s: no I/O\n\n", what); return; } if (startpass.tv_sec == 0) startpass = startprog; printf("%s: I/O statistics\n", what); printIOstats(); totaldiskreads += diskreads; diskreads = 0; for (i = 0; i < BT_NUMBUFTYPES; i++) { - timespecadd(&totalreadtime[i], &readtime[i]); + timespecadd(&totalreadtime[i], &readtime[i], &totalreadtime[i]); totalreadcnt[i] += readcnt[i]; readtime[i].tv_sec = readtime[i].tv_nsec = 0; readcnt[i] = 0; } clock_gettime(CLOCK_REALTIME_PRECISE, &startpass); } void finalIOstats(void) { int i; if (debug == 0) return; printf("Final I/O statistics\n"); totaldiskreads += diskreads; diskreads = totaldiskreads; startpass = startprog; for (i = 0; i < BT_NUMBUFTYPES; i++) { - timespecadd(&totalreadtime[i], &readtime[i]); + timespecadd(&totalreadtime[i], &readtime[i], &totalreadtime[i]); totalreadcnt[i] += readcnt[i]; readtime[i] = totalreadtime[i]; readcnt[i] = totalreadcnt[i]; } printIOstats(); } static void printIOstats(void) { long long msec, totalmsec; int i; clock_gettime(CLOCK_REALTIME_PRECISE, &finishpass); - timespecsub(&finishpass, &startpass); + timespecsub(&finishpass, &startpass, &finishpass); printf("Running time: %jd.%03ld sec\n", (intmax_t)finishpass.tv_sec, finishpass.tv_nsec / 1000000); printf("buffer reads by type:\n"); for (totalmsec = 0, i = 0; i < BT_NUMBUFTYPES; i++) totalmsec += readtime[i].tv_sec * 1000 + readtime[i].tv_nsec / 1000000; if (totalmsec == 0) totalmsec = 1; for (i = 0; i < BT_NUMBUFTYPES; i++) { if (readcnt[i] == 0) continue; msec = readtime[i].tv_sec * 1000 + readtime[i].tv_nsec / 1000000; printf("%21s:%8ld %2ld.%ld%% %4jd.%03ld sec %2lld.%lld%%\n", buftype[i], readcnt[i], readcnt[i] * 100 / diskreads, (readcnt[i] * 1000 / diskreads) % 10, (intmax_t)readtime[i].tv_sec, readtime[i].tv_nsec / 1000000, msec * 100 / totalmsec, (msec * 1000 / totalmsec) % 10); } printf("\n"); } int blread(int fd, char *buf, ufs2_daddr_t blk, long size) { char *cp; int i, errs; off_t offset; offset = blk; offset *= dev_bsize; if (bkgrdflag) slowio_start(); totalreads++; diskreads++; if (lseek(fd, offset, 0) < 0) rwerror("SEEK BLK", blk); else if (read(fd, buf, (int)size) == size) { if (bkgrdflag) slowio_end(); return (0); } /* * This is handled specially here instead of in rwerror because * rwerror is used for all sorts of errors, not just true read/write * errors. It should be refactored and fixed. */ if (surrender) { pfatal("CANNOT READ_BLK: %ld", (long)blk); errx(EEXIT, "ABORTING DUE TO READ ERRORS"); } else rwerror("READ BLK", blk); if (lseek(fd, offset, 0) < 0) rwerror("SEEK BLK", blk); errs = 0; memset(buf, 0, (size_t)size); printf("THE FOLLOWING DISK SECTORS COULD NOT BE READ:"); for (cp = buf, i = 0; i < size; i += secsize, cp += secsize) { if (read(fd, cp, (int)secsize) != secsize) { (void)lseek(fd, offset + i + secsize, 0); if (secsize != dev_bsize && dev_bsize != 1) printf(" %jd (%jd),", (intmax_t)(blk * dev_bsize + i) / secsize, (intmax_t)blk + i / dev_bsize); else printf(" %jd,", (intmax_t)blk + i / dev_bsize); errs++; } } printf("\n"); if (errs) resolved = 0; return (errs); } void blwrite(int fd, char *buf, ufs2_daddr_t blk, ssize_t size) { int i; char *cp; off_t offset; if (fd < 0) return; offset = blk; offset *= dev_bsize; if (lseek(fd, offset, 0) < 0) rwerror("SEEK BLK", blk); else if (write(fd, buf, size) == size) { fsmodified = 1; return; } resolved = 0; rwerror("WRITE BLK", blk); if (lseek(fd, offset, 0) < 0) rwerror("SEEK BLK", blk); printf("THE FOLLOWING SECTORS COULD NOT BE WRITTEN:"); for (cp = buf, i = 0; i < size; i += dev_bsize, cp += dev_bsize) if (write(fd, cp, dev_bsize) != dev_bsize) { (void)lseek(fd, offset + i + dev_bsize, 0); printf(" %jd,", (intmax_t)blk + i / dev_bsize); } printf("\n"); return; } void blerase(int fd, ufs2_daddr_t blk, long size) { off_t ioarg[2]; if (fd < 0) return; ioarg[0] = blk * dev_bsize; ioarg[1] = size; ioctl(fd, DIOCGDELETE, ioarg); /* we don't really care if we succeed or not */ return; } /* * Fill a contiguous region with all-zeroes. Note ZEROBUFSIZE is by * definition a multiple of dev_bsize. */ void blzero(int fd, ufs2_daddr_t blk, long size) { static char *zero; off_t offset, len; if (fd < 0) return; if (zero == NULL) { zero = calloc(ZEROBUFSIZE, 1); if (zero == NULL) errx(EEXIT, "cannot allocate buffer pool"); } offset = blk * dev_bsize; if (lseek(fd, offset, 0) < 0) rwerror("SEEK BLK", blk); while (size > 0) { len = MIN(ZEROBUFSIZE, size); if (write(fd, zero, len) != len) rwerror("WRITE BLK", blk); blk += len / dev_bsize; size -= len; } } /* * Verify cylinder group's magic number and other parameters. If the * test fails, offer an option to rebuild the whole cylinder group. */ int check_cgmagic(int cg, struct bufarea *cgbp) { struct cg *cgp = cgbp->b_un.b_cg; /* * Extended cylinder group checks. */ if (cg_chkmagic(cgp) && ((sblock.fs_magic == FS_UFS1_MAGIC && cgp->cg_old_niblk == sblock.fs_ipg && cgp->cg_ndblk <= sblock.fs_fpg && cgp->cg_old_ncyl <= sblock.fs_old_cpg) || (sblock.fs_magic == FS_UFS2_MAGIC && cgp->cg_niblk == sblock.fs_ipg && cgp->cg_ndblk <= sblock.fs_fpg && cgp->cg_initediblk <= sblock.fs_ipg))) { return (1); } pfatal("CYLINDER GROUP %d: BAD MAGIC NUMBER", cg); if (!reply("REBUILD CYLINDER GROUP")) { printf("YOU WILL NEED TO RERUN FSCK.\n"); rerun = 1; return (1); } /* * Zero out the cylinder group and then initialize critical fields. * Bit maps and summaries will be recalculated by later passes. */ memset(cgp, 0, (size_t)sblock.fs_cgsize); cgp->cg_magic = CG_MAGIC; cgp->cg_cgx = cg; cgp->cg_niblk = sblock.fs_ipg; cgp->cg_initediblk = MIN(sblock.fs_ipg, 2 * INOPB(&sblock)); if (cgbase(&sblock, cg) + sblock.fs_fpg < sblock.fs_size) cgp->cg_ndblk = sblock.fs_fpg; else cgp->cg_ndblk = sblock.fs_size - cgbase(&sblock, cg); cgp->cg_iusedoff = &cgp->cg_space[0] - (u_char *)(&cgp->cg_firstfield); if (sblock.fs_magic == FS_UFS1_MAGIC) { cgp->cg_niblk = 0; cgp->cg_initediblk = 0; cgp->cg_old_ncyl = sblock.fs_old_cpg; cgp->cg_old_niblk = sblock.fs_ipg; cgp->cg_old_btotoff = cgp->cg_iusedoff; cgp->cg_old_boff = cgp->cg_old_btotoff + sblock.fs_old_cpg * sizeof(int32_t); cgp->cg_iusedoff = cgp->cg_old_boff + sblock.fs_old_cpg * sizeof(u_int16_t); } cgp->cg_freeoff = cgp->cg_iusedoff + howmany(sblock.fs_ipg, CHAR_BIT); cgp->cg_nextfreeoff = cgp->cg_freeoff + howmany(sblock.fs_fpg,CHAR_BIT); if (sblock.fs_contigsumsize > 0) { cgp->cg_nclusterblks = cgp->cg_ndblk / sblock.fs_frag; cgp->cg_clustersumoff = roundup(cgp->cg_nextfreeoff, sizeof(u_int32_t)); cgp->cg_clustersumoff -= sizeof(u_int32_t); cgp->cg_clusteroff = cgp->cg_clustersumoff + (sblock.fs_contigsumsize + 1) * sizeof(u_int32_t); cgp->cg_nextfreeoff = cgp->cg_clusteroff + howmany(fragstoblks(&sblock, sblock.fs_fpg), CHAR_BIT); } dirty(cgbp); return (0); } /* * allocate a data block with the specified number of fragments */ ufs2_daddr_t allocblk(long frags) { int i, j, k, cg, baseblk; struct bufarea *cgbp; struct cg *cgp; if (frags <= 0 || frags > sblock.fs_frag) return (0); for (i = 0; i < maxfsblock - sblock.fs_frag; i += sblock.fs_frag) { for (j = 0; j <= sblock.fs_frag - frags; j++) { if (testbmap(i + j)) continue; for (k = 1; k < frags; k++) if (testbmap(i + j + k)) break; if (k < frags) { j += k; continue; } cg = dtog(&sblock, i + j); cgbp = cglookup(cg); cgp = cgbp->b_un.b_cg; if (!check_cgmagic(cg, cgbp)) return (0); baseblk = dtogd(&sblock, i + j); for (k = 0; k < frags; k++) { setbmap(i + j + k); clrbit(cg_blksfree(cgp), baseblk + k); } n_blks += frags; if (frags == sblock.fs_frag) cgp->cg_cs.cs_nbfree--; else cgp->cg_cs.cs_nffree -= frags; dirty(cgbp); return (i + j); } } return (0); } /* * Free a previously allocated block */ void freeblk(ufs2_daddr_t blkno, long frags) { struct inodesc idesc; idesc.id_blkno = blkno; idesc.id_numfrags = frags; (void)pass4check(&idesc); } /* Slow down IO so as to leave some disk bandwidth for other processes */ void slowio_start() { /* Delay one in every 8 operations */ slowio_pollcnt = (slowio_pollcnt + 1) & 7; if (slowio_pollcnt == 0) { gettimeofday(&slowio_starttime, NULL); } } void slowio_end() { struct timeval tv; int delay_usec; if (slowio_pollcnt != 0) return; /* Update the slowdown interval. */ gettimeofday(&tv, NULL); delay_usec = (tv.tv_sec - slowio_starttime.tv_sec) * 1000000 + (tv.tv_usec - slowio_starttime.tv_usec); if (delay_usec < 64) delay_usec = 64; if (delay_usec > 2500000) delay_usec = 2500000; slowio_delay_usec = (slowio_delay_usec * 63 + delay_usec) >> 6; /* delay by 8 times the average IO delay */ if (slowio_delay_usec > 64) usleep(slowio_delay_usec * 8); } /* * Find a pathname */ void getpathname(char *namebuf, ino_t curdir, ino_t ino) { int len; char *cp; struct inodesc idesc; static int busy = 0; if (curdir == ino && ino == UFS_ROOTINO) { (void)strcpy(namebuf, "/"); return; } if (busy || !INO_IS_DVALID(curdir)) { (void)strcpy(namebuf, "?"); return; } busy = 1; memset(&idesc, 0, sizeof(struct inodesc)); idesc.id_type = DATA; idesc.id_fix = IGNORE; cp = &namebuf[MAXPATHLEN - 1]; *cp = '\0'; if (curdir != ino) { idesc.id_parent = curdir; goto namelookup; } while (ino != UFS_ROOTINO) { idesc.id_number = ino; idesc.id_func = findino; idesc.id_name = strdup(".."); if ((ckinode(ginode(ino), &idesc) & FOUND) == 0) break; namelookup: idesc.id_number = idesc.id_parent; idesc.id_parent = ino; idesc.id_func = findname; idesc.id_name = namebuf; if ((ckinode(ginode(idesc.id_number), &idesc)&FOUND) == 0) break; len = strlen(namebuf); cp -= len; memmove(cp, namebuf, (size_t)len); *--cp = '/'; if (cp < &namebuf[UFS_MAXNAMLEN]) break; ino = idesc.id_number; } busy = 0; if (ino != UFS_ROOTINO) *--cp = '?'; memmove(namebuf, cp, (size_t)(&namebuf[MAXPATHLEN] - cp)); } void catch(int sig __unused) { ckfini(0); exit(12); } /* * When preening, allow a single quit to signal * a special exit after file system checks complete * so that reboot sequence may be interrupted. */ void catchquit(int sig __unused) { printf("returning to single-user after file system check\n"); returntosingle = 1; (void)signal(SIGQUIT, SIG_DFL); } /* * determine whether an inode should be fixed. */ int dofix(struct inodesc *idesc, const char *msg) { switch (idesc->id_fix) { case DONTKNOW: if (idesc->id_type == DATA) direrror(idesc->id_number, msg); else pwarn("%s", msg); if (preen) { printf(" (SALVAGED)\n"); idesc->id_fix = FIX; return (ALTERED); } if (reply("SALVAGE") == 0) { idesc->id_fix = NOFIX; return (0); } idesc->id_fix = FIX; return (ALTERED); case FIX: return (ALTERED); case NOFIX: case IGNORE: return (0); default: errx(EEXIT, "UNKNOWN INODESC FIX MODE %d", idesc->id_fix); } /* NOTREACHED */ return (0); } #include /* * An unexpected inconsistency occurred. * Die if preening or file system is running with soft dependency protocol, * otherwise just print message and continue. */ void pfatal(const char *fmt, ...) { va_list ap; va_start(ap, fmt); if (!preen) { (void)vfprintf(stdout, fmt, ap); va_end(ap); if (usedsoftdep) (void)fprintf(stdout, "\nUNEXPECTED SOFT UPDATE INCONSISTENCY\n"); /* * Force foreground fsck to clean up inconsistency. */ if (bkgrdflag) { cmd.value = FS_NEEDSFSCK; cmd.size = 1; if (sysctlbyname("vfs.ffs.setflags", 0, 0, &cmd, sizeof cmd) == -1) pwarn("CANNOT SET FS_NEEDSFSCK FLAG\n"); fprintf(stdout, "CANNOT RUN IN BACKGROUND\n"); ckfini(0); exit(EEXIT); } return; } if (cdevname == NULL) cdevname = strdup("fsck"); (void)fprintf(stdout, "%s: ", cdevname); (void)vfprintf(stdout, fmt, ap); (void)fprintf(stdout, "\n%s: UNEXPECTED%sINCONSISTENCY; RUN fsck MANUALLY.\n", cdevname, usedsoftdep ? " SOFT UPDATE " : " "); /* * Force foreground fsck to clean up inconsistency. */ if (bkgrdflag) { cmd.value = FS_NEEDSFSCK; cmd.size = 1; if (sysctlbyname("vfs.ffs.setflags", 0, 0, &cmd, sizeof cmd) == -1) pwarn("CANNOT SET FS_NEEDSFSCK FLAG\n"); } ckfini(0); exit(EEXIT); } /* * Pwarn just prints a message when not preening or running soft dependency * protocol, or a warning (preceded by filename) when preening. */ void pwarn(const char *fmt, ...) { va_list ap; va_start(ap, fmt); if (preen) (void)fprintf(stdout, "%s: ", cdevname); (void)vfprintf(stdout, fmt, ap); va_end(ap); } /* * Stub for routines from kernel. */ void panic(const char *fmt, ...) { va_list ap; va_start(ap, fmt); pfatal("INTERNAL INCONSISTENCY:"); (void)vfprintf(stdout, fmt, ap); va_end(ap); exit(EEXIT); } Index: head/share/man/man3/Makefile =================================================================== --- head/share/man/man3/Makefile (revision 336913) +++ head/share/man/man3/Makefile (revision 336914) @@ -1,341 +1,346 @@ # @(#)Makefile 8.2 (Berkeley) 12/13/93 # $FreeBSD$ .include PACKAGE=runtime-manuals MAN= assert.3 \ ATOMIC_VAR_INIT.3 \ bitstring.3 \ end.3 \ fpgetround.3 \ intro.3 \ makedev.3 \ offsetof.3 \ ${PTHREAD_MAN} \ queue.3 \ sigevent.3 \ siginfo.3 \ stdarg.3 \ sysexits.3 \ tgmath.3 \ timeradd.3 \ tree.3 MLINKS= ATOMIC_VAR_INIT.3 atomic_compare_exchange_strong.3 \ ATOMIC_VAR_INIT.3 atomic_compare_exchange_strong_explicit.3 \ ATOMIC_VAR_INIT.3 atomic_compare_exchange_weak.3 \ ATOMIC_VAR_INIT.3 atomic_compare_exchange_weak_explicit.3 \ ATOMIC_VAR_INIT.3 atomic_exchange.3 \ ATOMIC_VAR_INIT.3 atomic_exchange_explicit.3 \ ATOMIC_VAR_INIT.3 atomic_fetch_add.3 \ ATOMIC_VAR_INIT.3 atomic_fetch_add_explicit.3 \ ATOMIC_VAR_INIT.3 atomic_fetch_and.3 \ ATOMIC_VAR_INIT.3 atomic_fetch_and_explicit.3 \ ATOMIC_VAR_INIT.3 atomic_fetch_or.3 \ ATOMIC_VAR_INIT.3 atomic_fetch_or_explicit.3 \ ATOMIC_VAR_INIT.3 atomic_fetch_sub.3 \ ATOMIC_VAR_INIT.3 atomic_fetch_sub_explicit.3 \ ATOMIC_VAR_INIT.3 atomic_fetch_xor.3 \ ATOMIC_VAR_INIT.3 atomic_fetch_xor_explicit.3 \ ATOMIC_VAR_INIT.3 atomic_init.3 \ ATOMIC_VAR_INIT.3 atomic_is_lock_free.3 \ ATOMIC_VAR_INIT.3 atomic_load.3 \ ATOMIC_VAR_INIT.3 atomic_load_explicit.3 \ ATOMIC_VAR_INIT.3 atomic_store.3 \ ATOMIC_VAR_INIT.3 atomic_store_explicit.3 MLINKS+= bitstring.3 bit_alloc.3 \ bitstring.3 bit_clear.3 \ bitstring.3 bit_decl.3 \ bitstring.3 bit_ffc.3 \ bitstring.3 bit_ffc_at.3 \ bitstring.3 bit_ffs.3 \ bitstring.3 bit_ffs_at.3 \ bitstring.3 bit_nclear.3 \ bitstring.3 bit_nset.3 \ bitstring.3 bit_set.3 \ bitstring.3 bitstr_size.3 \ bitstring.3 bit_test.3 MLINKS+= end.3 edata.3 \ end.3 etext.3 MLINKS+= fpgetround.3 fpgetmask.3 \ fpgetround.3 fpgetprec.3 \ fpgetround.3 fpgetsticky.3 \ fpgetround.3 fpresetsticky.3 \ fpgetround.3 fpsetmask.3 \ fpgetround.3 fpsetprec.3 \ fpgetround.3 fpsetround.3 MLINKS+= makedev.3 major.3 \ makedev.3 minor.3 MLINKS+= ${PTHREAD_MLINKS} MLINKS+= queue.3 LIST_CLASS_ENTRY.3 \ queue.3 LIST_CLASS_HEAD.3 \ queue.3 LIST_EMPTY.3 \ queue.3 LIST_ENTRY.3 \ queue.3 LIST_FIRST.3 \ queue.3 LIST_FOREACH.3 \ queue.3 LIST_FOREACH_FROM.3 \ queue.3 LIST_FOREACH_FROM_SAFE.3 \ queue.3 LIST_FOREACH_SAFE.3 \ queue.3 LIST_HEAD.3 \ queue.3 LIST_HEAD_INITIALIZER.3 \ queue.3 LIST_INIT.3 \ queue.3 LIST_INSERT_AFTER.3 \ queue.3 LIST_INSERT_BEFORE.3 \ queue.3 LIST_INSERT_HEAD.3 \ queue.3 LIST_NEXT.3 \ queue.3 LIST_PREV.3 \ queue.3 LIST_REMOVE.3 \ queue.3 LIST_SWAP.3 \ queue.3 SLIST_CLASS_ENTRY.3 \ queue.3 SLIST_CLASS_HEAD.3 \ queue.3 SLIST_EMPTY.3 \ queue.3 SLIST_ENTRY.3 \ queue.3 SLIST_FIRST.3 \ queue.3 SLIST_FOREACH.3 \ queue.3 SLIST_FOREACH_FROM.3 \ queue.3 SLIST_FOREACH_FROM_SAFE.3 \ queue.3 SLIST_FOREACH_SAFE.3 \ queue.3 SLIST_HEAD.3 \ queue.3 SLIST_HEAD_INITIALIZER.3 \ queue.3 SLIST_INIT.3 \ queue.3 SLIST_INSERT_AFTER.3 \ queue.3 SLIST_INSERT_HEAD.3 \ queue.3 SLIST_NEXT.3 \ queue.3 SLIST_REMOVE.3 \ queue.3 SLIST_REMOVE_AFTER.3 \ queue.3 SLIST_REMOVE_HEAD.3 \ queue.3 SLIST_REMOVE_PREVPTR.3 \ queue.3 SLIST_SWAP.3 \ queue.3 STAILQ_CLASS_ENTRY.3 \ queue.3 STAILQ_CLASS_HEAD.3 \ queue.3 STAILQ_CONCAT.3 \ queue.3 STAILQ_EMPTY.3 \ queue.3 STAILQ_ENTRY.3 \ queue.3 STAILQ_FIRST.3 \ queue.3 STAILQ_FOREACH.3 \ queue.3 STAILQ_FOREACH_FROM.3 \ queue.3 STAILQ_FOREACH_FROM_SAFE.3 \ queue.3 STAILQ_FOREACH_SAFE.3 \ queue.3 STAILQ_HEAD.3 \ queue.3 STAILQ_HEAD_INITIALIZER.3 \ queue.3 STAILQ_INIT.3 \ queue.3 STAILQ_INSERT_AFTER.3 \ queue.3 STAILQ_INSERT_HEAD.3 \ queue.3 STAILQ_INSERT_TAIL.3 \ queue.3 STAILQ_LAST.3 \ queue.3 STAILQ_NEXT.3 \ queue.3 STAILQ_REMOVE.3 \ queue.3 STAILQ_REMOVE_AFTER.3 \ queue.3 STAILQ_REMOVE_HEAD.3 \ queue.3 STAILQ_SWAP.3 \ queue.3 TAILQ_CLASS_ENTRY.3 \ queue.3 TAILQ_CLASS_HEAD.3 \ queue.3 TAILQ_CONCAT.3 \ queue.3 TAILQ_EMPTY.3 \ queue.3 TAILQ_ENTRY.3 \ queue.3 TAILQ_FIRST.3 \ queue.3 TAILQ_FOREACH.3 \ queue.3 TAILQ_FOREACH_FROM.3 \ queue.3 TAILQ_FOREACH_FROM_SAFE.3 \ queue.3 TAILQ_FOREACH_REVERSE.3 \ queue.3 TAILQ_FOREACH_REVERSE_FROM.3 \ queue.3 TAILQ_FOREACH_REVERSE_FROM_SAFE.3 \ queue.3 TAILQ_FOREACH_REVERSE_SAFE.3 \ queue.3 TAILQ_FOREACH_SAFE.3 \ queue.3 TAILQ_HEAD.3 \ queue.3 TAILQ_HEAD_INITIALIZER.3 \ queue.3 TAILQ_INIT.3 \ queue.3 TAILQ_INSERT_AFTER.3 \ queue.3 TAILQ_INSERT_BEFORE.3 \ queue.3 TAILQ_INSERT_HEAD.3 \ queue.3 TAILQ_INSERT_TAIL.3 \ queue.3 TAILQ_LAST.3 \ queue.3 TAILQ_NEXT.3 \ queue.3 TAILQ_PREV.3 \ queue.3 TAILQ_REMOVE.3 \ queue.3 TAILQ_SWAP.3 MLINKS+= stdarg.3 va_arg.3 \ stdarg.3 va_copy.3 \ stdarg.3 va_end.3 \ stdarg.3 varargs.3 \ stdarg.3 va_start.3 MLINKS+= timeradd.3 timerclear.3 \ timeradd.3 timercmp.3 \ timeradd.3 timerisset.3 \ - timeradd.3 timersub.3 + timeradd.3 timersub.3 \ + timeradd.3 timespecadd.3 \ + timeradd.3 timespecsub.3 \ + timeradd.3 timespecclear.3 \ + timeradd.3 timespecisset.3 \ + timeradd.3 timespeccmp.3 MLINKS+= tree.3 RB_EMPTY.3 \ tree.3 RB_ENTRY.3 \ tree.3 RB_FIND.3 \ tree.3 RB_FOREACH.3 \ tree.3 RB_FOREACH_REVERSE.3 \ tree.3 RB_GENERATE.3 \ tree.3 RB_GENERATE_STATIC.3 \ tree.3 RB_HEAD.3 \ tree.3 RB_INIT.3 \ tree.3 RB_INITIALIZER.3 \ tree.3 RB_INSERT.3 \ tree.3 RB_LEFT.3 \ tree.3 RB_MAX.3 \ tree.3 RB_MIN.3 \ tree.3 RB_NEXT.3 \ tree.3 RB_NFIND.3 \ tree.3 RB_PARENT.3 \ tree.3 RB_PREV.3 \ tree.3 RB_PROTOTYPE.3 \ tree.3 RB_PROTOTYPE_STATIC.3 \ tree.3 RB_REMOVE.3 \ tree.3 RB_RIGHT.3 \ tree.3 RB_ROOT.3 \ tree.3 SPLAY_EMPTY.3 \ tree.3 SPLAY_ENTRY.3 \ tree.3 SPLAY_FIND.3 \ tree.3 SPLAY_FOREACH.3 \ tree.3 SPLAY_GENERATE.3 \ tree.3 SPLAY_HEAD.3 \ tree.3 SPLAY_INIT.3 \ tree.3 SPLAY_INITIALIZER.3 \ tree.3 SPLAY_INSERT.3 \ tree.3 SPLAY_LEFT.3 \ tree.3 SPLAY_MAX.3 \ tree.3 SPLAY_MIN.3 \ tree.3 SPLAY_NEXT.3 \ tree.3 SPLAY_PROTOTYPE.3 \ tree.3 SPLAY_REMOVE.3 \ tree.3 SPLAY_RIGHT.3 \ tree.3 SPLAY_ROOT.3 .if ${MK_LIBTHR} != "no" PTHREAD_MAN= pthread.3 \ pthread_affinity_np.3 \ pthread_atfork.3 \ pthread_attr.3 \ pthread_attr_affinity_np.3 \ pthread_attr_get_np.3 \ pthread_attr_setcreatesuspend_np.3 \ pthread_barrierattr.3 \ pthread_barrier_destroy.3 \ pthread_cancel.3 \ pthread_cleanup_pop.3 \ pthread_cleanup_push.3 \ pthread_condattr.3 \ pthread_cond_broadcast.3 \ pthread_cond_destroy.3 \ pthread_cond_init.3 \ pthread_cond_signal.3 \ pthread_cond_timedwait.3 \ pthread_cond_wait.3 \ pthread_create.3 \ pthread_detach.3 \ pthread_equal.3 \ pthread_exit.3 \ pthread_getconcurrency.3 \ pthread_getcpuclockid.3 \ pthread_getspecific.3 \ pthread_getthreadid_np.3 \ pthread_join.3 \ pthread_key_create.3 \ pthread_key_delete.3 \ pthread_kill.3 \ pthread_main_np.3 \ pthread_multi_np.3 \ pthread_mutexattr.3 \ pthread_mutexattr_getkind_np.3 \ pthread_mutex_consistent.3 \ pthread_mutex_destroy.3 \ pthread_mutex_init.3 \ pthread_mutex_lock.3 \ pthread_mutex_timedlock.3 \ pthread_mutex_trylock.3 \ pthread_mutex_unlock.3 \ pthread_once.3 \ pthread_resume_all_np.3 \ pthread_resume_np.3 \ pthread_rwlockattr_destroy.3 \ pthread_rwlockattr_getpshared.3 \ pthread_rwlockattr_init.3 \ pthread_rwlockattr_setpshared.3 \ pthread_rwlock_destroy.3 \ pthread_rwlock_init.3 \ pthread_rwlock_rdlock.3 \ pthread_rwlock_timedrdlock.3 \ pthread_rwlock_timedwrlock.3 \ pthread_rwlock_unlock.3 \ pthread_rwlock_wrlock.3 \ pthread_schedparam.3 \ pthread_self.3 \ pthread_set_name_np.3 \ pthread_setspecific.3 \ pthread_sigmask.3 \ pthread_spin_init.3 \ pthread_spin_lock.3 \ pthread_suspend_all_np.3 \ pthread_suspend_np.3 \ pthread_switch_add_np.3 \ pthread_testcancel.3 \ pthread_yield.3 PTHREAD_MLINKS= pthread_affinity_np.3 pthread_getaffinity_np.3 \ pthread_affinity_np.3 pthread_setaffinity_np.3 PTHREAD_MLINKS+=pthread_attr.3 pthread_attr_destroy.3 \ pthread_attr.3 pthread_attr_getdetachstate.3 \ pthread_attr.3 pthread_attr_getguardsize.3 \ pthread_attr.3 pthread_attr_getinheritsched.3 \ pthread_attr.3 pthread_attr_getschedparam.3 \ pthread_attr.3 pthread_attr_getschedpolicy.3 \ pthread_attr.3 pthread_attr_getscope.3 \ pthread_attr.3 pthread_attr_getstack.3 \ pthread_attr.3 pthread_attr_getstackaddr.3 \ pthread_attr.3 pthread_attr_getstacksize.3 \ pthread_attr.3 pthread_attr_init.3 \ pthread_attr.3 pthread_attr_setdetachstate.3 \ pthread_attr.3 pthread_attr_setguardsize.3 \ pthread_attr.3 pthread_attr_setinheritsched.3 \ pthread_attr.3 pthread_attr_setschedparam.3 \ pthread_attr.3 pthread_attr_setschedpolicy.3 \ pthread_attr.3 pthread_attr_setscope.3 \ pthread_attr.3 pthread_attr_setstack.3 \ pthread_attr.3 pthread_attr_setstackaddr.3 \ pthread_attr.3 pthread_attr_setstacksize.3 PTHREAD_MLINKS+=pthread_attr_affinity_np.3 pthread_attr_getaffinity_np.3 \ pthread_attr_affinity_np.3 pthread_attr_setaffinity_np.3 PTHREAD_MLINKS+=pthread_barrierattr.3 pthread_barrierattr_destroy.3 \ pthread_barrierattr.3 pthread_barrierattr_getpshared.3 \ pthread_barrierattr.3 pthread_barrierattr_init.3 \ pthread_barrierattr.3 pthread_barrierattr_setpshared.3 PTHREAD_MLINKS+=pthread_barrier_destroy.3 pthread_barrier_init.3 \ pthread_barrier_destroy.3 pthread_barrier_wait.3 PTHREAD_MLINKS+=pthread_condattr.3 pthread_condattr_destroy.3 \ pthread_condattr.3 pthread_condattr_init.3 \ pthread_condattr.3 pthread_condattr_getclock.3 \ pthread_condattr.3 pthread_condattr_setclock.3 \ pthread_condattr.3 pthread_condattr_getpshared.3 \ pthread_condattr.3 pthread_condattr_setpshared.3 PTHREAD_MLINKS+=pthread_getconcurrency.3 pthread_setconcurrency.3 PTHREAD_MLINKS+=pthread_multi_np.3 pthread_single_np.3 PTHREAD_MLINKS+=pthread_mutexattr.3 pthread_mutexattr_destroy.3 \ pthread_mutexattr.3 pthread_mutexattr_getprioceiling.3 \ pthread_mutexattr.3 pthread_mutexattr_getprotocol.3 \ pthread_mutexattr.3 pthread_mutexattr_getrobust.3 \ pthread_mutexattr.3 pthread_mutexattr_gettype.3 \ pthread_mutexattr.3 pthread_mutexattr_init.3 \ pthread_mutexattr.3 pthread_mutexattr_setprioceiling.3 \ pthread_mutexattr.3 pthread_mutexattr_setprotocol.3 \ pthread_mutexattr.3 pthread_mutexattr_setrobust.3 \ pthread_mutexattr.3 pthread_mutexattr_settype.3 PTHREAD_MLINKS+=pthread_mutexattr_getkind_np.3 pthread_mutexattr_setkind_np.3 PTHREAD_MLINKS+=pthread_rwlock_rdlock.3 pthread_rwlock_tryrdlock.3 PTHREAD_MLINKS+=pthread_rwlock_wrlock.3 pthread_rwlock_trywrlock.3 PTHREAD_MLINKS+=pthread_schedparam.3 pthread_getschedparam.3 \ pthread_schedparam.3 pthread_setschedparam.3 PTHREAD_MLINKS+=pthread_spin_init.3 pthread_spin_destroy.3 \ pthread_spin_lock.3 pthread_spin_trylock.3 \ pthread_spin_lock.3 pthread_spin_unlock.3 PTHREAD_MLINKS+=pthread_switch_add_np.3 pthread_switch_delete_np.3 PTHREAD_MLINKS+=pthread_testcancel.3 pthread_setcancelstate.3 \ pthread_testcancel.3 pthread_setcanceltype.3 PTHREAD_MLINKS+=pthread_join.3 pthread_timedjoin_np.3 .endif .include Index: head/share/man/man3/timeradd.3 =================================================================== --- head/share/man/man3/timeradd.3 (revision 336913) +++ head/share/man/man3/timeradd.3 (revision 336914) @@ -1,119 +1,165 @@ .\" Copyright (c) 1999 Kelly Yancey .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 3. Neither the name of the author nor the names of any co-contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY JOHN BIRRELL AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" .\" $FreeBSD$ .\" -.Dd August 11, 1999 +.Dd July 30, 2018 .Dt TIMERADD 3 .Os .Sh NAME .Nm timeradd , .Nm timersub , .Nm timerclear , .Nm timerisset , -.Nm timercmp -.Nd operations on timevals +.Nm timercmp , +.Nm timespecadd , +.Nm timespecsub , +.Nm timespecclear , +.Nm timespecisset , +.Nm timespeccmp +.Nd operations on timevals and timespecs .Sh SYNOPSIS .In sys/time.h .Ft void .Fn timeradd "struct timeval *a" "struct timeval *b" "struct timeval *res" .Ft void .Fn timersub "struct timeval *a" "struct timeval *b" "struct timeval *res" .Ft void .Fn timerclear "struct timeval *tvp" .Ft int .Fn timerisset "struct timeval *tvp" .Ft int .Fn timercmp "struct timeval *a" "struct timeval *b" CMP +.Ft void +.Fn timespecadd "struct timespec *a" "struct timespec *b" "struct timespec *res" +.Ft void +.Fn timespecsub "struct timespec *a" "struct timespec *b" "struct timespec *res" +.Ft void +.Fn timespecclear "struct timespec *ts" +.Ft int +.Fn timespecisset "struct timespec *ts" +.Ft int +.Fn timespeccmp "struct timespec *a" "struct timespec *b" CMP .Sh DESCRIPTION These macros are provided for manipulating .Fa timeval +and +.Fa timespec structures for use with the +.Xr clock_gettime 2 , +.Xr clock_settime 2 , .Xr gettimeofday 2 and .Xr settimeofday 2 calls. -The structure is defined in +The +.Fa timeval +structure is defined in .In sys/time.h as: .Bd -literal struct timeval { long tv_sec; /* seconds since Jan. 1, 1970 */ long tv_usec; /* and microseconds */ }; .Ed +And the +.Fa timespec +structure is defined in +.In time.h +as: +.Bd -literal +struct timespec { + time_t tv_nsec; /* seconds */ + long tv_nsec; /* and nanoseconds */ +}; +.Ed .Pp .Fn timeradd -adds the time information stored in +and +.Fn timespecadd +add the time information stored in .Fa a to .Fa b -and stores the resulting -.Vt timeval -in +and store the result in .Fa res . The results are simplified such that the value of .Fa res->tv_usec -is always less than 1,000,000 (1 second). +or +.Fa res->tv_nsec +is always less than 1 second. .Pp .Fn timersub -subtracts the time information stored in +and +.Fn timespecsub +subtract the time information stored in .Fa b from .Fa a -and stores the resulting -.Vt timeval +and store the result in .Fa res . .Pp .Fn timerclear -initializes -.Fa tvp -to midnight (0 hour) January 1st, 1970 (the Epoch). +and +.Fn timespecclear +initialize their argument to midnight (0 hour) January 1st, 1970 (the Epoch). .Pp .Fn timerisset -returns true if -.Fa tvp -is set to any time value other than the Epoch. +and +.Fn timespecisset +return true if their argument is set to any time value other than the Epoch. .Pp .Fn timercmp -compares +and +.Fn timespeccmp +compare .Fa a to .Fa b using the comparison operator given in .Fa CMP , -and returns the result of that comparison. +and return the result of that comparison. .Sh SEE ALSO -.Xr gettimeofday 2 +.Xr gettimeofday 2 , +.Xr clock_gettime 2 .Sh HISTORY The .Fn timeradd family of macros were imported from .Nx 1.1 , and appeared in .Fx 2.2.6 . +The +.Fn timespecadd +family of macros were imported from +.Nx 1.3 +into +.Fx 3.0 , +though they were not exposed to userland until +.Fx 12.0 . Index: head/sys/compat/linux/linux_event.c =================================================================== --- head/sys/compat/linux/linux_event.c (revision 336913) +++ head/sys/compat/linux/linux_event.c (revision 336914) @@ -1,1328 +1,1328 @@ /*- * Copyright (c) 2007 Roman Divacky * Copyright (c) 2014 Dmitry Chagin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include #include #include /* * epoll defines 'struct epoll_event' with the field 'data' as 64 bits * on all architectures. But on 32 bit architectures BSD 'struct kevent' only * has 32 bit opaque pointer as 'udata' field. So we can't pass epoll supplied * data verbatuim. Therefore we allocate 64-bit memory block to pass * user supplied data for every file descriptor. */ typedef uint64_t epoll_udata_t; struct epoll_emuldata { uint32_t fdc; /* epoll udata max index */ epoll_udata_t udata[1]; /* epoll user data vector */ }; #define EPOLL_DEF_SZ 16 #define EPOLL_SIZE(fdn) \ (sizeof(struct epoll_emuldata)+(fdn) * sizeof(epoll_udata_t)) struct epoll_event { uint32_t events; epoll_udata_t data; } #if defined(__amd64__) __attribute__((packed)) #endif ; #define LINUX_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) static void epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata); static int epoll_to_kevent(struct thread *td, struct file *epfp, int fd, struct epoll_event *l_event, int *kev_flags, struct kevent *kevent, int *nkevents); static void kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event); static int epoll_kev_copyout(void *arg, struct kevent *kevp, int count); static int epoll_kev_copyin(void *arg, struct kevent *kevp, int count); static int epoll_delete_event(struct thread *td, struct file *epfp, int fd, int filter); static int epoll_delete_all_events(struct thread *td, struct file *epfp, int fd); struct epoll_copyin_args { struct kevent *changelist; }; struct epoll_copyout_args { struct epoll_event *leventlist; struct proc *p; uint32_t count; int error; }; /* eventfd */ typedef uint64_t eventfd_t; static fo_rdwr_t eventfd_read; static fo_rdwr_t eventfd_write; static fo_ioctl_t eventfd_ioctl; static fo_poll_t eventfd_poll; static fo_kqfilter_t eventfd_kqfilter; static fo_stat_t eventfd_stat; static fo_close_t eventfd_close; static fo_fill_kinfo_t eventfd_fill_kinfo; static struct fileops eventfdops = { .fo_read = eventfd_read, .fo_write = eventfd_write, .fo_truncate = invfo_truncate, .fo_ioctl = eventfd_ioctl, .fo_poll = eventfd_poll, .fo_kqfilter = eventfd_kqfilter, .fo_stat = eventfd_stat, .fo_close = eventfd_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = eventfd_fill_kinfo, .fo_flags = DFLAG_PASSABLE }; static void filt_eventfddetach(struct knote *kn); static int filt_eventfdread(struct knote *kn, long hint); static int filt_eventfdwrite(struct knote *kn, long hint); static struct filterops eventfd_rfiltops = { .f_isfd = 1, .f_detach = filt_eventfddetach, .f_event = filt_eventfdread }; static struct filterops eventfd_wfiltops = { .f_isfd = 1, .f_detach = filt_eventfddetach, .f_event = filt_eventfdwrite }; /* timerfd */ typedef uint64_t timerfd_t; static fo_rdwr_t timerfd_read; static fo_poll_t timerfd_poll; static fo_kqfilter_t timerfd_kqfilter; static fo_stat_t timerfd_stat; static fo_close_t timerfd_close; static fo_fill_kinfo_t timerfd_fill_kinfo; static struct fileops timerfdops = { .fo_read = timerfd_read, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = eventfd_ioctl, .fo_poll = timerfd_poll, .fo_kqfilter = timerfd_kqfilter, .fo_stat = timerfd_stat, .fo_close = timerfd_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = timerfd_fill_kinfo, .fo_flags = DFLAG_PASSABLE }; static void filt_timerfddetach(struct knote *kn); static int filt_timerfdread(struct knote *kn, long hint); static struct filterops timerfd_rfiltops = { .f_isfd = 1, .f_detach = filt_timerfddetach, .f_event = filt_timerfdread }; struct eventfd { eventfd_t efd_count; uint32_t efd_flags; struct selinfo efd_sel; struct mtx efd_lock; }; struct timerfd { clockid_t tfd_clockid; struct itimerspec tfd_time; struct callout tfd_callout; timerfd_t tfd_count; bool tfd_canceled; struct selinfo tfd_sel; struct mtx tfd_lock; }; static int eventfd_create(struct thread *td, uint32_t initval, int flags); static void linux_timerfd_expire(void *); static void linux_timerfd_curval(struct timerfd *, struct itimerspec *); static void epoll_fd_install(struct thread *td, int fd, epoll_udata_t udata) { struct linux_pemuldata *pem; struct epoll_emuldata *emd; struct proc *p; p = td->td_proc; pem = pem_find(p); KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); LINUX_PEM_XLOCK(pem); if (pem->epoll == NULL) { emd = malloc(EPOLL_SIZE(fd), M_EPOLL, M_WAITOK); emd->fdc = fd; pem->epoll = emd; } else { emd = pem->epoll; if (fd > emd->fdc) { emd = realloc(emd, EPOLL_SIZE(fd), M_EPOLL, M_WAITOK); emd->fdc = fd; pem->epoll = emd; } } emd->udata[fd] = udata; LINUX_PEM_XUNLOCK(pem); } static int epoll_create_common(struct thread *td, int flags) { int error; error = kern_kqueue(td, flags, NULL); if (error != 0) return (error); epoll_fd_install(td, EPOLL_DEF_SZ, 0); return (0); } #ifdef LINUX_LEGACY_SYSCALLS int linux_epoll_create(struct thread *td, struct linux_epoll_create_args *args) { /* * args->size is unused. Linux just tests it * and then forgets it as well. */ if (args->size <= 0) return (EINVAL); return (epoll_create_common(td, 0)); } #endif int linux_epoll_create1(struct thread *td, struct linux_epoll_create1_args *args) { int flags; if ((args->flags & ~(LINUX_O_CLOEXEC)) != 0) return (EINVAL); flags = 0; if ((args->flags & LINUX_O_CLOEXEC) != 0) flags |= O_CLOEXEC; return (epoll_create_common(td, flags)); } /* Structure converting function from epoll to kevent. */ static int epoll_to_kevent(struct thread *td, struct file *epfp, int fd, struct epoll_event *l_event, int *kev_flags, struct kevent *kevent, int *nkevents) { uint32_t levents = l_event->events; struct linux_pemuldata *pem; struct proc *p; /* flags related to how event is registered */ if ((levents & LINUX_EPOLLONESHOT) != 0) *kev_flags |= EV_ONESHOT; if ((levents & LINUX_EPOLLET) != 0) *kev_flags |= EV_CLEAR; if ((levents & LINUX_EPOLLERR) != 0) *kev_flags |= EV_ERROR; if ((levents & LINUX_EPOLLRDHUP) != 0) *kev_flags |= EV_EOF; /* flags related to what event is registered */ if ((levents & LINUX_EPOLL_EVRD) != 0) { EV_SET(kevent++, fd, EVFILT_READ, *kev_flags, 0, 0, 0); ++(*nkevents); } if ((levents & LINUX_EPOLL_EVWR) != 0) { EV_SET(kevent++, fd, EVFILT_WRITE, *kev_flags, 0, 0, 0); ++(*nkevents); } if ((levents & ~(LINUX_EPOLL_EVSUP)) != 0) { p = td->td_proc; pem = pem_find(p); KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); KASSERT(pem->epoll != NULL, ("epoll proc epolldata not found.\n")); LINUX_PEM_XLOCK(pem); if ((pem->flags & LINUX_XUNSUP_EPOLL) == 0) { pem->flags |= LINUX_XUNSUP_EPOLL; LINUX_PEM_XUNLOCK(pem); linux_msg(td, "epoll_ctl unsupported flags: 0x%x\n", levents); } else LINUX_PEM_XUNLOCK(pem); return (EINVAL); } return (0); } /* * Structure converting function from kevent to epoll. In a case * this is called on error in registration we store the error in * event->data and pick it up later in linux_epoll_ctl(). */ static void kevent_to_epoll(struct kevent *kevent, struct epoll_event *l_event) { if ((kevent->flags & EV_ERROR) != 0) { l_event->events = LINUX_EPOLLERR; return; } /* XXX EPOLLPRI, EPOLLHUP */ switch (kevent->filter) { case EVFILT_READ: l_event->events = LINUX_EPOLLIN; if ((kevent->flags & EV_EOF) != 0) l_event->events |= LINUX_EPOLLRDHUP; break; case EVFILT_WRITE: l_event->events = LINUX_EPOLLOUT; break; } } /* * Copyout callback used by kevent. This converts kevent * events to epoll events and copies them back to the * userspace. This is also called on error on registering * of the filter. */ static int epoll_kev_copyout(void *arg, struct kevent *kevp, int count) { struct epoll_copyout_args *args; struct linux_pemuldata *pem; struct epoll_emuldata *emd; struct epoll_event *eep; int error, fd, i; args = (struct epoll_copyout_args*) arg; eep = malloc(sizeof(*eep) * count, M_EPOLL, M_WAITOK | M_ZERO); pem = pem_find(args->p); KASSERT(pem != NULL, ("epoll proc emuldata not found.\n")); LINUX_PEM_SLOCK(pem); emd = pem->epoll; KASSERT(emd != NULL, ("epoll proc epolldata not found.\n")); for (i = 0; i < count; i++) { kevent_to_epoll(&kevp[i], &eep[i]); fd = kevp[i].ident; KASSERT(fd <= emd->fdc, ("epoll user data vector" " is too small.\n")); eep[i].data = emd->udata[fd]; } LINUX_PEM_SUNLOCK(pem); error = copyout(eep, args->leventlist, count * sizeof(*eep)); if (error == 0) { args->leventlist += count; args->count += count; } else if (args->error == 0) args->error = error; free(eep, M_EPOLL); return (error); } /* * Copyin callback used by kevent. This copies already * converted filters from kernel memory to the kevent * internal kernel memory. Hence the memcpy instead of * copyin. */ static int epoll_kev_copyin(void *arg, struct kevent *kevp, int count) { struct epoll_copyin_args *args; args = (struct epoll_copyin_args*) arg; memcpy(kevp, args->changelist, count * sizeof(*kevp)); args->changelist += count; return (0); } /* * Load epoll filter, convert it to kevent filter * and load it into kevent subsystem. */ int linux_epoll_ctl(struct thread *td, struct linux_epoll_ctl_args *args) { struct file *epfp, *fp; struct epoll_copyin_args ciargs; struct kevent kev[2]; struct kevent_copyops k_ops = { &ciargs, NULL, epoll_kev_copyin}; struct epoll_event le; cap_rights_t rights; int kev_flags; int nchanges = 0; int error; if (args->op != LINUX_EPOLL_CTL_DEL) { error = copyin(args->event, &le, sizeof(le)); if (error != 0) return (error); } error = fget(td, args->epfd, cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &epfp); if (error != 0) return (error); if (epfp->f_type != DTYPE_KQUEUE) { error = EINVAL; goto leave1; } /* Protect user data vector from incorrectly supplied fd. */ error = fget(td, args->fd, cap_rights_init(&rights, CAP_POLL_EVENT), &fp); if (error != 0) goto leave1; /* Linux disallows spying on himself */ if (epfp == fp) { error = EINVAL; goto leave0; } ciargs.changelist = kev; if (args->op != LINUX_EPOLL_CTL_DEL) { kev_flags = EV_ADD | EV_ENABLE; error = epoll_to_kevent(td, epfp, args->fd, &le, &kev_flags, kev, &nchanges); if (error != 0) goto leave0; } switch (args->op) { case LINUX_EPOLL_CTL_MOD: error = epoll_delete_all_events(td, epfp, args->fd); if (error != 0) goto leave0; break; case LINUX_EPOLL_CTL_ADD: /* * kqueue_register() return ENOENT if event does not exists * and the EV_ADD flag is not set. */ kev[0].flags &= ~EV_ADD; error = kqfd_register(args->epfd, &kev[0], td, 1); if (error != ENOENT) { error = EEXIST; goto leave0; } error = 0; kev[0].flags |= EV_ADD; break; case LINUX_EPOLL_CTL_DEL: /* CTL_DEL means unregister this fd with this epoll */ error = epoll_delete_all_events(td, epfp, args->fd); goto leave0; default: error = EINVAL; goto leave0; } epoll_fd_install(td, args->fd, le.data); error = kern_kevent_fp(td, epfp, nchanges, 0, &k_ops, NULL); leave0: fdrop(fp, td); leave1: fdrop(epfp, td); return (error); } /* * Wait for a filter to be triggered on the epoll file descriptor. */ static int linux_epoll_wait_common(struct thread *td, int epfd, struct epoll_event *events, int maxevents, int timeout, sigset_t *uset) { struct epoll_copyout_args coargs; struct kevent_copyops k_ops = { &coargs, epoll_kev_copyout, NULL}; struct timespec ts, *tsp; cap_rights_t rights; struct file *epfp; sigset_t omask; int error; if (maxevents <= 0 || maxevents > LINUX_MAX_EVENTS) return (EINVAL); error = fget(td, epfd, cap_rights_init(&rights, CAP_KQUEUE_EVENT), &epfp); if (error != 0) return (error); if (epfp->f_type != DTYPE_KQUEUE) { error = EINVAL; goto leave1; } if (uset != NULL) { error = kern_sigprocmask(td, SIG_SETMASK, uset, &omask, 0); if (error != 0) goto leave1; td->td_pflags |= TDP_OLDMASK; /* * Make sure that ast() is called on return to * usermode and TDP_OLDMASK is cleared, restoring old * sigmask. */ thread_lock(td); td->td_flags |= TDF_ASTPENDING; thread_unlock(td); } coargs.leventlist = events; coargs.p = td->td_proc; coargs.count = 0; coargs.error = 0; if (timeout != -1) { if (timeout < 0) { error = EINVAL; goto leave0; } /* Convert from milliseconds to timespec. */ ts.tv_sec = timeout / 1000; ts.tv_nsec = (timeout % 1000) * 1000000; tsp = &ts; } else { tsp = NULL; } error = kern_kevent_fp(td, epfp, 0, maxevents, &k_ops, tsp); if (error == 0 && coargs.error != 0) error = coargs.error; /* * kern_kevent might return ENOMEM which is not expected from epoll_wait. * Maybe we should translate that but I don't think it matters at all. */ if (error == 0) td->td_retval[0] = coargs.count; leave0: if (uset != NULL) error = kern_sigprocmask(td, SIG_SETMASK, &omask, NULL, 0); leave1: fdrop(epfp, td); return (error); } #ifdef LINUX_LEGACY_SYSCALLS int linux_epoll_wait(struct thread *td, struct linux_epoll_wait_args *args) { return (linux_epoll_wait_common(td, args->epfd, args->events, args->maxevents, args->timeout, NULL)); } #endif int linux_epoll_pwait(struct thread *td, struct linux_epoll_pwait_args *args) { sigset_t mask, *pmask; l_sigset_t lmask; int error; if (args->mask != NULL) { if (args->sigsetsize != sizeof(l_sigset_t)) return (EINVAL); error = copyin(args->mask, &lmask, sizeof(l_sigset_t)); if (error != 0) return (error); linux_to_bsd_sigset(&lmask, &mask); pmask = &mask; } else pmask = NULL; return (linux_epoll_wait_common(td, args->epfd, args->events, args->maxevents, args->timeout, pmask)); } static int epoll_delete_event(struct thread *td, struct file *epfp, int fd, int filter) { struct epoll_copyin_args ciargs; struct kevent kev; struct kevent_copyops k_ops = { &ciargs, NULL, epoll_kev_copyin}; ciargs.changelist = &kev; EV_SET(&kev, fd, filter, EV_DELETE | EV_DISABLE, 0, 0, 0); return (kern_kevent_fp(td, epfp, 1, 0, &k_ops, NULL)); } static int epoll_delete_all_events(struct thread *td, struct file *epfp, int fd) { int error1, error2; error1 = epoll_delete_event(td, epfp, fd, EVFILT_READ); error2 = epoll_delete_event(td, epfp, fd, EVFILT_WRITE); /* return 0 if at least one result positive */ return (error1 == 0 ? 0 : error2); } static int eventfd_create(struct thread *td, uint32_t initval, int flags) { struct filedesc *fdp; struct eventfd *efd; struct file *fp; int fflags, fd, error; fflags = 0; if ((flags & LINUX_O_CLOEXEC) != 0) fflags |= O_CLOEXEC; fdp = td->td_proc->p_fd; error = falloc(td, &fp, &fd, fflags); if (error != 0) return (error); efd = malloc(sizeof(*efd), M_EPOLL, M_WAITOK | M_ZERO); efd->efd_flags = flags; efd->efd_count = initval; mtx_init(&efd->efd_lock, "eventfd", NULL, MTX_DEF); knlist_init_mtx(&efd->efd_sel.si_note, &efd->efd_lock); fflags = FREAD | FWRITE; if ((flags & LINUX_O_NONBLOCK) != 0) fflags |= FNONBLOCK; finit(fp, fflags, DTYPE_LINUXEFD, efd, &eventfdops); fdrop(fp, td); td->td_retval[0] = fd; return (error); } #ifdef LINUX_LEGACY_SYSCALLS int linux_eventfd(struct thread *td, struct linux_eventfd_args *args) { return (eventfd_create(td, args->initval, 0)); } #endif int linux_eventfd2(struct thread *td, struct linux_eventfd2_args *args) { if ((args->flags & ~(LINUX_O_CLOEXEC|LINUX_O_NONBLOCK|LINUX_EFD_SEMAPHORE)) != 0) return (EINVAL); return (eventfd_create(td, args->initval, args->flags)); } static int eventfd_close(struct file *fp, struct thread *td) { struct eventfd *efd; efd = fp->f_data; if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) return (EINVAL); seldrain(&efd->efd_sel); knlist_destroy(&efd->efd_sel.si_note); fp->f_ops = &badfileops; mtx_destroy(&efd->efd_lock); free(efd, M_EPOLL); return (0); } static int eventfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct eventfd *efd; eventfd_t count; int error; efd = fp->f_data; if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) return (EINVAL); if (uio->uio_resid < sizeof(eventfd_t)) return (EINVAL); error = 0; mtx_lock(&efd->efd_lock); retry: if (efd->efd_count == 0) { if ((fp->f_flag & FNONBLOCK) != 0) { mtx_unlock(&efd->efd_lock); return (EAGAIN); } error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "lefdrd", 0); if (error == 0) goto retry; } if (error == 0) { if ((efd->efd_flags & LINUX_EFD_SEMAPHORE) != 0) { count = 1; --efd->efd_count; } else { count = efd->efd_count; efd->efd_count = 0; } KNOTE_LOCKED(&efd->efd_sel.si_note, 0); selwakeup(&efd->efd_sel); wakeup(&efd->efd_count); mtx_unlock(&efd->efd_lock); error = uiomove(&count, sizeof(eventfd_t), uio); } else mtx_unlock(&efd->efd_lock); return (error); } static int eventfd_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct eventfd *efd; eventfd_t count; int error; efd = fp->f_data; if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) return (EINVAL); if (uio->uio_resid < sizeof(eventfd_t)) return (EINVAL); error = uiomove(&count, sizeof(eventfd_t), uio); if (error != 0) return (error); if (count == UINT64_MAX) return (EINVAL); mtx_lock(&efd->efd_lock); retry: if (UINT64_MAX - efd->efd_count <= count) { if ((fp->f_flag & FNONBLOCK) != 0) { mtx_unlock(&efd->efd_lock); /* Do not not return the number of bytes written */ uio->uio_resid += sizeof(eventfd_t); return (EAGAIN); } error = mtx_sleep(&efd->efd_count, &efd->efd_lock, PCATCH, "lefdwr", 0); if (error == 0) goto retry; } if (error == 0) { efd->efd_count += count; KNOTE_LOCKED(&efd->efd_sel.si_note, 0); selwakeup(&efd->efd_sel); wakeup(&efd->efd_count); } mtx_unlock(&efd->efd_lock); return (error); } static int eventfd_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct eventfd *efd; int revents = 0; efd = fp->f_data; if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) return (POLLERR); mtx_lock(&efd->efd_lock); if ((events & (POLLIN|POLLRDNORM)) && efd->efd_count > 0) revents |= events & (POLLIN|POLLRDNORM); if ((events & (POLLOUT|POLLWRNORM)) && UINT64_MAX - 1 > efd->efd_count) revents |= events & (POLLOUT|POLLWRNORM); if (revents == 0) selrecord(td, &efd->efd_sel); mtx_unlock(&efd->efd_lock); return (revents); } /*ARGSUSED*/ static int eventfd_kqfilter(struct file *fp, struct knote *kn) { struct eventfd *efd; efd = fp->f_data; if (fp->f_type != DTYPE_LINUXEFD || efd == NULL) return (EINVAL); mtx_lock(&efd->efd_lock); switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &eventfd_rfiltops; break; case EVFILT_WRITE: kn->kn_fop = &eventfd_wfiltops; break; default: mtx_unlock(&efd->efd_lock); return (EINVAL); } kn->kn_hook = efd; knlist_add(&efd->efd_sel.si_note, kn, 1); mtx_unlock(&efd->efd_lock); return (0); } static void filt_eventfddetach(struct knote *kn) { struct eventfd *efd = kn->kn_hook; mtx_lock(&efd->efd_lock); knlist_remove(&efd->efd_sel.si_note, kn, 1); mtx_unlock(&efd->efd_lock); } /*ARGSUSED*/ static int filt_eventfdread(struct knote *kn, long hint) { struct eventfd *efd = kn->kn_hook; int ret; mtx_assert(&efd->efd_lock, MA_OWNED); ret = (efd->efd_count > 0); return (ret); } /*ARGSUSED*/ static int filt_eventfdwrite(struct knote *kn, long hint) { struct eventfd *efd = kn->kn_hook; int ret; mtx_assert(&efd->efd_lock, MA_OWNED); ret = (UINT64_MAX - 1 > efd->efd_count); return (ret); } /*ARGSUSED*/ static int eventfd_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { if (fp->f_data == NULL || (fp->f_type != DTYPE_LINUXEFD && fp->f_type != DTYPE_LINUXTFD)) return (EINVAL); switch (cmd) { case FIONBIO: if ((*(int *)data)) atomic_set_int(&fp->f_flag, FNONBLOCK); else atomic_clear_int(&fp->f_flag, FNONBLOCK); case FIOASYNC: return (0); default: return (ENXIO); } } /*ARGSUSED*/ static int eventfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred, struct thread *td) { return (ENXIO); } /*ARGSUSED*/ static int eventfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { kif->kf_type = KF_TYPE_UNKNOWN; return (0); } int linux_timerfd_create(struct thread *td, struct linux_timerfd_create_args *args) { struct filedesc *fdp; struct timerfd *tfd; struct file *fp; clockid_t clockid; int fflags, fd, error; if ((args->flags & ~LINUX_TFD_CREATE_FLAGS) != 0) return (EINVAL); error = linux_to_native_clockid(&clockid, args->clockid); if (error != 0) return (error); if (clockid != CLOCK_REALTIME && clockid != CLOCK_MONOTONIC) return (EINVAL); fflags = 0; if ((args->flags & LINUX_TFD_CLOEXEC) != 0) fflags |= O_CLOEXEC; fdp = td->td_proc->p_fd; error = falloc(td, &fp, &fd, fflags); if (error != 0) return (error); tfd = malloc(sizeof(*tfd), M_EPOLL, M_WAITOK | M_ZERO); tfd->tfd_clockid = clockid; mtx_init(&tfd->tfd_lock, "timerfd", NULL, MTX_DEF); callout_init_mtx(&tfd->tfd_callout, &tfd->tfd_lock, 0); knlist_init_mtx(&tfd->tfd_sel.si_note, &tfd->tfd_lock); fflags = FREAD; if ((args->flags & LINUX_O_NONBLOCK) != 0) fflags |= FNONBLOCK; finit(fp, fflags, DTYPE_LINUXTFD, tfd, &timerfdops); fdrop(fp, td); td->td_retval[0] = fd; return (error); } static int timerfd_close(struct file *fp, struct thread *td) { struct timerfd *tfd; tfd = fp->f_data; if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) return (EINVAL); timespecclear(&tfd->tfd_time.it_value); timespecclear(&tfd->tfd_time.it_interval); mtx_lock(&tfd->tfd_lock); callout_drain(&tfd->tfd_callout); mtx_unlock(&tfd->tfd_lock); seldrain(&tfd->tfd_sel); knlist_destroy(&tfd->tfd_sel.si_note); fp->f_ops = &badfileops; mtx_destroy(&tfd->tfd_lock); free(tfd, M_EPOLL); return (0); } static int timerfd_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct timerfd *tfd; timerfd_t count; int error; tfd = fp->f_data; if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) return (EINVAL); if (uio->uio_resid < sizeof(timerfd_t)) return (EINVAL); error = 0; mtx_lock(&tfd->tfd_lock); retry: if (tfd->tfd_canceled) { tfd->tfd_count = 0; mtx_unlock(&tfd->tfd_lock); return (ECANCELED); } if (tfd->tfd_count == 0) { if ((fp->f_flag & FNONBLOCK) != 0) { mtx_unlock(&tfd->tfd_lock); return (EAGAIN); } error = mtx_sleep(&tfd->tfd_count, &tfd->tfd_lock, PCATCH, "ltfdrd", 0); if (error == 0) goto retry; } if (error == 0) { count = tfd->tfd_count; tfd->tfd_count = 0; mtx_unlock(&tfd->tfd_lock); error = uiomove(&count, sizeof(timerfd_t), uio); } else mtx_unlock(&tfd->tfd_lock); return (error); } static int timerfd_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct timerfd *tfd; int revents = 0; tfd = fp->f_data; if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) return (POLLERR); mtx_lock(&tfd->tfd_lock); if ((events & (POLLIN|POLLRDNORM)) && tfd->tfd_count > 0) revents |= events & (POLLIN|POLLRDNORM); if (revents == 0) selrecord(td, &tfd->tfd_sel); mtx_unlock(&tfd->tfd_lock); return (revents); } /*ARGSUSED*/ static int timerfd_kqfilter(struct file *fp, struct knote *kn) { struct timerfd *tfd; tfd = fp->f_data; if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) return (EINVAL); if (kn->kn_filter == EVFILT_READ) kn->kn_fop = &timerfd_rfiltops; else return (EINVAL); kn->kn_hook = tfd; knlist_add(&tfd->tfd_sel.si_note, kn, 0); return (0); } static void filt_timerfddetach(struct knote *kn) { struct timerfd *tfd = kn->kn_hook; mtx_lock(&tfd->tfd_lock); knlist_remove(&tfd->tfd_sel.si_note, kn, 1); mtx_unlock(&tfd->tfd_lock); } /*ARGSUSED*/ static int filt_timerfdread(struct knote *kn, long hint) { struct timerfd *tfd = kn->kn_hook; return (tfd->tfd_count > 0); } /*ARGSUSED*/ static int timerfd_stat(struct file *fp, struct stat *st, struct ucred *active_cred, struct thread *td) { return (ENXIO); } /*ARGSUSED*/ static int timerfd_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { kif->kf_type = KF_TYPE_UNKNOWN; return (0); } static void linux_timerfd_clocktime(struct timerfd *tfd, struct timespec *ts) { if (tfd->tfd_clockid == CLOCK_REALTIME) getnanotime(ts); else /* CLOCK_MONOTONIC */ getnanouptime(ts); } static void linux_timerfd_curval(struct timerfd *tfd, struct itimerspec *ots) { struct timespec cts; linux_timerfd_clocktime(tfd, &cts); *ots = tfd->tfd_time; if (ots->it_value.tv_sec != 0 || ots->it_value.tv_nsec != 0) { - timespecsub(&ots->it_value, &cts); + timespecsub(&ots->it_value, &cts, &ots->it_value); if (ots->it_value.tv_sec < 0 || (ots->it_value.tv_sec == 0 && ots->it_value.tv_nsec == 0)) { ots->it_value.tv_sec = 0; ots->it_value.tv_nsec = 1; } } } int linux_timerfd_gettime(struct thread *td, struct linux_timerfd_gettime_args *args) { struct l_itimerspec lots; struct itimerspec ots; struct timerfd *tfd; struct file *fp; int error; error = fget(td, args->fd, &cap_read_rights, &fp); if (error != 0) return (error); tfd = fp->f_data; if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) { error = EINVAL; goto out; } mtx_lock(&tfd->tfd_lock); linux_timerfd_curval(tfd, &ots); mtx_unlock(&tfd->tfd_lock); error = native_to_linux_itimerspec(&lots, &ots); if (error == 0) error = copyout(&lots, args->old_value, sizeof(lots)); out: fdrop(fp, td); return (error); } int linux_timerfd_settime(struct thread *td, struct linux_timerfd_settime_args *args) { struct l_itimerspec lots; struct itimerspec nts, ots; struct timespec cts, ts; struct timerfd *tfd; struct timeval tv; struct file *fp; int error; if ((args->flags & ~LINUX_TFD_SETTIME_FLAGS) != 0) return (EINVAL); error = copyin(args->new_value, &lots, sizeof(lots)); if (error != 0) return (error); error = linux_to_native_itimerspec(&nts, &lots); if (error != 0) return (error); error = fget(td, args->fd, &cap_write_rights, &fp); if (error != 0) return (error); tfd = fp->f_data; if (fp->f_type != DTYPE_LINUXTFD || tfd == NULL) { error = EINVAL; goto out; } mtx_lock(&tfd->tfd_lock); if (!timespecisset(&nts.it_value)) timespecclear(&nts.it_interval); if (args->old_value != NULL) linux_timerfd_curval(tfd, &ots); tfd->tfd_time = nts; if (timespecisset(&nts.it_value)) { linux_timerfd_clocktime(tfd, &cts); ts = nts.it_value; if ((args->flags & LINUX_TFD_TIMER_ABSTIME) == 0) { - timespecadd(&tfd->tfd_time.it_value, &cts); + timespecadd(&tfd->tfd_time.it_value, &cts, + &tfd->tfd_time.it_value); } else { - timespecsub(&ts, &cts); + timespecsub(&ts, &cts, &ts); } TIMESPEC_TO_TIMEVAL(&tv, &ts); callout_reset(&tfd->tfd_callout, tvtohz(&tv), linux_timerfd_expire, tfd); tfd->tfd_canceled = false; } else { tfd->tfd_canceled = true; callout_stop(&tfd->tfd_callout); } mtx_unlock(&tfd->tfd_lock); if (args->old_value != NULL) { error = native_to_linux_itimerspec(&lots, &ots); if (error == 0) error = copyout(&lots, args->old_value, sizeof(lots)); } out: fdrop(fp, td); return (error); } static void linux_timerfd_expire(void *arg) { struct timespec cts, ts; struct timeval tv; struct timerfd *tfd; tfd = (struct timerfd *)arg; linux_timerfd_clocktime(tfd, &cts); if (timespeccmp(&cts, &tfd->tfd_time.it_value, >=)) { if (timespecisset(&tfd->tfd_time.it_interval)) timespecadd(&tfd->tfd_time.it_value, - &tfd->tfd_time.it_interval); + &tfd->tfd_time.it_interval, + &tfd->tfd_time.it_value); else /* single shot timer */ timespecclear(&tfd->tfd_time.it_value); if (timespecisset(&tfd->tfd_time.it_value)) { - ts = tfd->tfd_time.it_value; - timespecsub(&ts, &cts); + timespecsub(&tfd->tfd_time.it_value, &cts, &ts); TIMESPEC_TO_TIMEVAL(&tv, &ts); callout_reset(&tfd->tfd_callout, tvtohz(&tv), linux_timerfd_expire, tfd); } tfd->tfd_count++; KNOTE_LOCKED(&tfd->tfd_sel.si_note, 0); selwakeup(&tfd->tfd_sel); wakeup(&tfd->tfd_count); } else if (timespecisset(&tfd->tfd_time.it_value)) { - ts = tfd->tfd_time.it_value; - timespecsub(&ts, &cts); + timespecsub(&tfd->tfd_time.it_value, &cts, &ts); TIMESPEC_TO_TIMEVAL(&tv, &ts); callout_reset(&tfd->tfd_callout, tvtohz(&tv), linux_timerfd_expire, tfd); } } Index: head/sys/compat/linux/linux_futex.c =================================================================== --- head/sys/compat/linux/linux_futex.c (revision 336913) +++ head/sys/compat/linux/linux_futex.c (revision 336914) @@ -1,1335 +1,1335 @@ /* $NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $ */ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright (c) 2009-2016 Dmitry Chagin * Copyright (c) 2005 Emmanuel Dreyfus * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Emmanuel Dreyfus * 4. The name of the author may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE THE AUTHOR AND CONTRIBUTORS ``AS IS'' * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #if 0 __KERNEL_RCSID(1, "$NetBSD: linux_futex.c,v 1.7 2006/07/24 19:01:49 manu Exp $"); #endif #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include #include #include /* DTrace init */ LIN_SDT_PROVIDER_DECLARE(LINUX_DTRACE); /** * Futex part for the special DTrace module "locks". */ LIN_SDT_PROBE_DEFINE1(locks, futex_mtx, locked, "struct mtx *"); LIN_SDT_PROBE_DEFINE1(locks, futex_mtx, unlock, "struct mtx *"); /** * Per futex probes. */ LIN_SDT_PROBE_DEFINE1(futex, futex, create, "struct sx *"); LIN_SDT_PROBE_DEFINE1(futex, futex, destroy, "struct sx *"); /** * DTrace probes in this module. */ LIN_SDT_PROBE_DEFINE2(futex, futex_put, entry, "struct futex *", "struct waiting_proc *"); LIN_SDT_PROBE_DEFINE3(futex, futex_put, destroy, "uint32_t *", "uint32_t", "int"); LIN_SDT_PROBE_DEFINE3(futex, futex_put, unlock, "uint32_t *", "uint32_t", "int"); LIN_SDT_PROBE_DEFINE0(futex, futex_put, return); LIN_SDT_PROBE_DEFINE3(futex, futex_get0, entry, "uint32_t *", "struct futex **", "uint32_t"); LIN_SDT_PROBE_DEFINE1(futex, futex_get0, umtx_key_get_error, "int"); LIN_SDT_PROBE_DEFINE3(futex, futex_get0, shared, "uint32_t *", "uint32_t", "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_get0, null, "uint32_t *"); LIN_SDT_PROBE_DEFINE3(futex, futex_get0, new, "uint32_t *", "uint32_t", "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_get0, return, "int"); LIN_SDT_PROBE_DEFINE3(futex, futex_get, entry, "uint32_t *", "struct waiting_proc **", "struct futex **"); LIN_SDT_PROBE_DEFINE0(futex, futex_get, error); LIN_SDT_PROBE_DEFINE1(futex, futex_get, return, "int"); LIN_SDT_PROBE_DEFINE3(futex, futex_sleep, entry, "struct futex *", "struct waiting_proc **", "struct timespec *"); LIN_SDT_PROBE_DEFINE5(futex, futex_sleep, requeue_error, "int", "uint32_t *", "struct waiting_proc *", "uint32_t *", "uint32_t"); LIN_SDT_PROBE_DEFINE3(futex, futex_sleep, sleep_error, "int", "uint32_t *", "struct waiting_proc *"); LIN_SDT_PROBE_DEFINE1(futex, futex_sleep, return, "int"); LIN_SDT_PROBE_DEFINE3(futex, futex_wake, entry, "struct futex *", "int", "uint32_t"); LIN_SDT_PROBE_DEFINE3(futex, futex_wake, iterate, "uint32_t", "struct waiting_proc *", "uint32_t"); LIN_SDT_PROBE_DEFINE1(futex, futex_wake, wakeup, "struct waiting_proc *"); LIN_SDT_PROBE_DEFINE1(futex, futex_wake, return, "int"); LIN_SDT_PROBE_DEFINE4(futex, futex_requeue, entry, "struct futex *", "int", "struct futex *", "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_requeue, wakeup, "struct waiting_proc *"); LIN_SDT_PROBE_DEFINE3(futex, futex_requeue, requeue, "uint32_t *", "struct waiting_proc *", "uint32_t"); LIN_SDT_PROBE_DEFINE1(futex, futex_requeue, return, "int"); LIN_SDT_PROBE_DEFINE4(futex, futex_wait, entry, "struct futex *", "struct waiting_proc **", "struct timespec *", "uint32_t"); LIN_SDT_PROBE_DEFINE1(futex, futex_wait, sleep_error, "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_wait, return, "int"); LIN_SDT_PROBE_DEFINE3(futex, futex_atomic_op, entry, "struct thread *", "int", "uint32_t"); LIN_SDT_PROBE_DEFINE4(futex, futex_atomic_op, decoded_op, "int", "int", "int", "int"); LIN_SDT_PROBE_DEFINE0(futex, futex_atomic_op, missing_access_check); LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, unimplemented_op, "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, unimplemented_cmp, "int"); LIN_SDT_PROBE_DEFINE1(futex, futex_atomic_op, return, "int"); LIN_SDT_PROBE_DEFINE2(futex, linux_sys_futex, entry, "struct thread *", "struct linux_sys_futex_args *"); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_clockswitch); LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, copyin_error, "int"); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, invalid_cmp_requeue_use); LIN_SDT_PROBE_DEFINE3(futex, linux_sys_futex, debug_wait, "uint32_t *", "uint32_t", "uint32_t"); LIN_SDT_PROBE_DEFINE4(futex, linux_sys_futex, debug_wait_value_neq, "uint32_t *", "uint32_t", "int", "uint32_t"); LIN_SDT_PROBE_DEFINE3(futex, linux_sys_futex, debug_wake, "uint32_t *", "uint32_t", "uint32_t"); LIN_SDT_PROBE_DEFINE5(futex, linux_sys_futex, debug_cmp_requeue, "uint32_t *", "uint32_t", "uint32_t", "uint32_t *", "struct l_timespec *"); LIN_SDT_PROBE_DEFINE2(futex, linux_sys_futex, debug_cmp_requeue_value_neq, "uint32_t", "int"); LIN_SDT_PROBE_DEFINE5(futex, linux_sys_futex, debug_wake_op, "uint32_t *", "int", "uint32_t", "uint32_t *", "uint32_t"); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unhandled_efault); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_lock_pi); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_unlock_pi); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_trylock_pi); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, deprecated_requeue); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_wait_requeue_pi); LIN_SDT_PROBE_DEFINE0(futex, linux_sys_futex, unimplemented_cmp_requeue_pi); LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, unknown_operation, "int"); LIN_SDT_PROBE_DEFINE1(futex, linux_sys_futex, return, "int"); LIN_SDT_PROBE_DEFINE2(futex, linux_set_robust_list, entry, "struct thread *", "struct linux_set_robust_list_args *"); LIN_SDT_PROBE_DEFINE0(futex, linux_set_robust_list, size_error); LIN_SDT_PROBE_DEFINE1(futex, linux_set_robust_list, return, "int"); LIN_SDT_PROBE_DEFINE2(futex, linux_get_robust_list, entry, "struct thread *", "struct linux_get_robust_list_args *"); LIN_SDT_PROBE_DEFINE1(futex, linux_get_robust_list, copyout_error, "int"); LIN_SDT_PROBE_DEFINE1(futex, linux_get_robust_list, return, "int"); LIN_SDT_PROBE_DEFINE3(futex, handle_futex_death, entry, "struct linux_emuldata *", "uint32_t *", "unsigned int"); LIN_SDT_PROBE_DEFINE1(futex, handle_futex_death, copyin_error, "int"); LIN_SDT_PROBE_DEFINE1(futex, handle_futex_death, return, "int"); LIN_SDT_PROBE_DEFINE3(futex, fetch_robust_entry, entry, "struct linux_robust_list **", "struct linux_robust_list **", "unsigned int *"); LIN_SDT_PROBE_DEFINE1(futex, fetch_robust_entry, copyin_error, "int"); LIN_SDT_PROBE_DEFINE1(futex, fetch_robust_entry, return, "int"); LIN_SDT_PROBE_DEFINE2(futex, release_futexes, entry, "struct thread *", "struct linux_emuldata *"); LIN_SDT_PROBE_DEFINE1(futex, release_futexes, copyin_error, "int"); LIN_SDT_PROBE_DEFINE0(futex, release_futexes, return); struct futex; struct waiting_proc { uint32_t wp_flags; struct futex *wp_futex; TAILQ_ENTRY(waiting_proc) wp_list; }; struct futex { struct mtx f_lck; uint32_t *f_uaddr; /* user-supplied value, for debug */ struct umtx_key f_key; uint32_t f_refcount; uint32_t f_bitset; LIST_ENTRY(futex) f_list; TAILQ_HEAD(lf_waiting_proc, waiting_proc) f_waiting_proc; }; struct futex_list futex_list; #define FUTEX_LOCK(f) mtx_lock(&(f)->f_lck) #define FUTEX_LOCKED(f) mtx_owned(&(f)->f_lck) #define FUTEX_UNLOCK(f) mtx_unlock(&(f)->f_lck) #define FUTEX_INIT(f) do { \ mtx_init(&(f)->f_lck, "ftlk", NULL, \ MTX_DUPOK); \ LIN_SDT_PROBE1(futex, futex, create, \ &(f)->f_lck); \ } while (0) #define FUTEX_DESTROY(f) do { \ LIN_SDT_PROBE1(futex, futex, destroy, \ &(f)->f_lck); \ mtx_destroy(&(f)->f_lck); \ } while (0) #define FUTEX_ASSERT_LOCKED(f) mtx_assert(&(f)->f_lck, MA_OWNED) #define FUTEX_ASSERT_UNLOCKED(f) mtx_assert(&(f)->f_lck, MA_NOTOWNED) struct mtx futex_mtx; /* protects the futex list */ #define FUTEXES_LOCK do { \ mtx_lock(&futex_mtx); \ LIN_SDT_PROBE1(locks, futex_mtx, \ locked, &futex_mtx); \ } while (0) #define FUTEXES_UNLOCK do { \ LIN_SDT_PROBE1(locks, futex_mtx, \ unlock, &futex_mtx); \ mtx_unlock(&futex_mtx); \ } while (0) /* flags for futex_get() */ #define FUTEX_CREATE_WP 0x1 /* create waiting_proc */ #define FUTEX_DONTCREATE 0x2 /* don't create futex if not exists */ #define FUTEX_DONTEXISTS 0x4 /* return EINVAL if futex exists */ #define FUTEX_SHARED 0x8 /* shared futex */ #define FUTEX_DONTLOCK 0x10 /* don't lock futex */ /* wp_flags */ #define FUTEX_WP_REQUEUED 0x1 /* wp requeued - wp moved from wp_list * of futex where thread sleep to wp_list * of another futex. */ #define FUTEX_WP_REMOVED 0x2 /* wp is woken up and removed from futex * wp_list to prevent double wakeup. */ static void futex_put(struct futex *, struct waiting_proc *); static int futex_get0(uint32_t *, struct futex **f, uint32_t); static int futex_get(uint32_t *, struct waiting_proc **, struct futex **, uint32_t); static int futex_sleep(struct futex *, struct waiting_proc *, struct timespec *); static int futex_wake(struct futex *, int, uint32_t); static int futex_requeue(struct futex *, int, struct futex *, int); static int futex_copyin_timeout(int, struct l_timespec *, int, struct timespec *); static int futex_wait(struct futex *, struct waiting_proc *, struct timespec *, uint32_t); static void futex_lock(struct futex *); static void futex_unlock(struct futex *); static int futex_atomic_op(struct thread *, int, uint32_t *); static int handle_futex_death(struct linux_emuldata *, uint32_t *, unsigned int); static int fetch_robust_entry(struct linux_robust_list **, struct linux_robust_list **, unsigned int *); static int futex_copyin_timeout(int op, struct l_timespec *luts, int clockrt, struct timespec *ts) { struct l_timespec lts; struct timespec kts; int error; error = copyin(luts, <s, sizeof(lts)); if (error) return (error); error = linux_to_native_timespec(ts, <s); if (error) return (error); if (clockrt) { nanotime(&kts); - timespecsub(ts, &kts); + timespecsub(ts, &kts, ts); } else if (op == LINUX_FUTEX_WAIT_BITSET) { nanouptime(&kts); - timespecsub(ts, &kts); + timespecsub(ts, &kts, ts); } return (error); } static void futex_put(struct futex *f, struct waiting_proc *wp) { LIN_SDT_PROBE2(futex, futex_put, entry, f, wp); if (wp != NULL) { if ((wp->wp_flags & FUTEX_WP_REMOVED) == 0) TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list); free(wp, M_FUTEX_WP); } FUTEXES_LOCK; if (--f->f_refcount == 0) { LIST_REMOVE(f, f_list); FUTEXES_UNLOCK; if (FUTEX_LOCKED(f)) futex_unlock(f); LIN_SDT_PROBE3(futex, futex_put, destroy, f->f_uaddr, f->f_refcount, f->f_key.shared); LINUX_CTR3(sys_futex, "futex_put destroy uaddr %p ref %d " "shared %d", f->f_uaddr, f->f_refcount, f->f_key.shared); umtx_key_release(&f->f_key); FUTEX_DESTROY(f); free(f, M_FUTEX); LIN_SDT_PROBE0(futex, futex_put, return); return; } LIN_SDT_PROBE3(futex, futex_put, unlock, f->f_uaddr, f->f_refcount, f->f_key.shared); LINUX_CTR3(sys_futex, "futex_put uaddr %p ref %d shared %d", f->f_uaddr, f->f_refcount, f->f_key.shared); FUTEXES_UNLOCK; if (FUTEX_LOCKED(f)) futex_unlock(f); LIN_SDT_PROBE0(futex, futex_put, return); } static int futex_get0(uint32_t *uaddr, struct futex **newf, uint32_t flags) { struct futex *f, *tmpf; struct umtx_key key; int error; LIN_SDT_PROBE3(futex, futex_get0, entry, uaddr, newf, flags); *newf = tmpf = NULL; error = umtx_key_get(uaddr, TYPE_FUTEX, (flags & FUTEX_SHARED) ? AUTO_SHARE : THREAD_SHARE, &key); if (error) { LIN_SDT_PROBE1(futex, futex_get0, umtx_key_get_error, error); LIN_SDT_PROBE1(futex, futex_get0, return, error); return (error); } retry: FUTEXES_LOCK; LIST_FOREACH(f, &futex_list, f_list) { if (umtx_key_match(&f->f_key, &key)) { if (tmpf != NULL) { if (FUTEX_LOCKED(tmpf)) futex_unlock(tmpf); FUTEX_DESTROY(tmpf); free(tmpf, M_FUTEX); } if (flags & FUTEX_DONTEXISTS) { FUTEXES_UNLOCK; umtx_key_release(&key); LIN_SDT_PROBE1(futex, futex_get0, return, EINVAL); return (EINVAL); } /* * Increment refcount of the found futex to * prevent it from deallocation before FUTEX_LOCK() */ ++f->f_refcount; FUTEXES_UNLOCK; umtx_key_release(&key); if ((flags & FUTEX_DONTLOCK) == 0) futex_lock(f); *newf = f; LIN_SDT_PROBE3(futex, futex_get0, shared, uaddr, f->f_refcount, f->f_key.shared); LINUX_CTR3(sys_futex, "futex_get uaddr %p ref %d shared %d", uaddr, f->f_refcount, f->f_key.shared); LIN_SDT_PROBE1(futex, futex_get0, return, 0); return (0); } } if (flags & FUTEX_DONTCREATE) { FUTEXES_UNLOCK; umtx_key_release(&key); LIN_SDT_PROBE1(futex, futex_get0, null, uaddr); LINUX_CTR1(sys_futex, "futex_get uaddr %p null", uaddr); LIN_SDT_PROBE1(futex, futex_get0, return, 0); return (0); } if (tmpf == NULL) { FUTEXES_UNLOCK; tmpf = malloc(sizeof(*tmpf), M_FUTEX, M_WAITOK | M_ZERO); tmpf->f_uaddr = uaddr; tmpf->f_key = key; tmpf->f_refcount = 1; tmpf->f_bitset = FUTEX_BITSET_MATCH_ANY; FUTEX_INIT(tmpf); TAILQ_INIT(&tmpf->f_waiting_proc); /* * Lock the new futex before an insert into the futex_list * to prevent futex usage by other. */ if ((flags & FUTEX_DONTLOCK) == 0) futex_lock(tmpf); goto retry; } LIST_INSERT_HEAD(&futex_list, tmpf, f_list); FUTEXES_UNLOCK; LIN_SDT_PROBE3(futex, futex_get0, new, uaddr, tmpf->f_refcount, tmpf->f_key.shared); LINUX_CTR3(sys_futex, "futex_get uaddr %p ref %d shared %d new", uaddr, tmpf->f_refcount, tmpf->f_key.shared); *newf = tmpf; LIN_SDT_PROBE1(futex, futex_get0, return, 0); return (0); } static int futex_get(uint32_t *uaddr, struct waiting_proc **wp, struct futex **f, uint32_t flags) { int error; LIN_SDT_PROBE3(futex, futex_get, entry, uaddr, wp, f); if (flags & FUTEX_CREATE_WP) { *wp = malloc(sizeof(struct waiting_proc), M_FUTEX_WP, M_WAITOK); (*wp)->wp_flags = 0; } error = futex_get0(uaddr, f, flags); if (error) { LIN_SDT_PROBE0(futex, futex_get, error); if (flags & FUTEX_CREATE_WP) free(*wp, M_FUTEX_WP); LIN_SDT_PROBE1(futex, futex_get, return, error); return (error); } if (flags & FUTEX_CREATE_WP) { TAILQ_INSERT_HEAD(&(*f)->f_waiting_proc, *wp, wp_list); (*wp)->wp_futex = *f; } LIN_SDT_PROBE1(futex, futex_get, return, error); return (error); } static inline void futex_lock(struct futex *f) { LINUX_CTR3(sys_futex, "futex_lock uaddr %p ref %d shared %d", f->f_uaddr, f->f_refcount, f->f_key.shared); FUTEX_ASSERT_UNLOCKED(f); FUTEX_LOCK(f); } static inline void futex_unlock(struct futex *f) { LINUX_CTR3(sys_futex, "futex_unlock uaddr %p ref %d shared %d", f->f_uaddr, f->f_refcount, f->f_key.shared); FUTEX_ASSERT_LOCKED(f); FUTEX_UNLOCK(f); } static int futex_sleep(struct futex *f, struct waiting_proc *wp, struct timespec *ts) { struct timespec uts; sbintime_t sbt, prec, tmp; time_t over; int error; FUTEX_ASSERT_LOCKED(f); if (ts != NULL) { uts = *ts; if (uts.tv_sec > INT32_MAX / 2) { over = uts.tv_sec - INT32_MAX / 2; uts.tv_sec -= over; } tmp = tstosbt(uts); if (TIMESEL(&sbt, tmp)) sbt += tc_tick_sbt; sbt += tmp; prec = tmp; prec >>= tc_precexp; } else { sbt = 0; prec = 0; } LIN_SDT_PROBE3(futex, futex_sleep, entry, f, wp, sbt); LINUX_CTR4(sys_futex, "futex_sleep enter uaddr %p wp %p timo %ld ref %d", f->f_uaddr, wp, sbt, f->f_refcount); error = msleep_sbt(wp, &f->f_lck, PCATCH, "futex", sbt, prec, C_ABSOLUTE); if (wp->wp_flags & FUTEX_WP_REQUEUED) { KASSERT(f != wp->wp_futex, ("futex != wp_futex")); if (error) { LIN_SDT_PROBE5(futex, futex_sleep, requeue_error, error, f->f_uaddr, wp, wp->wp_futex->f_uaddr, wp->wp_futex->f_refcount); } LINUX_CTR5(sys_futex, "futex_sleep out error %d uaddr %p wp" " %p requeued uaddr %p ref %d", error, f->f_uaddr, wp, wp->wp_futex->f_uaddr, wp->wp_futex->f_refcount); futex_put(f, NULL); f = wp->wp_futex; futex_lock(f); } else { if (error) { LIN_SDT_PROBE3(futex, futex_sleep, sleep_error, error, f->f_uaddr, wp); } LINUX_CTR3(sys_futex, "futex_sleep out error %d uaddr %p wp %p", error, f->f_uaddr, wp); } futex_put(f, wp); LIN_SDT_PROBE1(futex, futex_sleep, return, error); return (error); } static int futex_wake(struct futex *f, int n, uint32_t bitset) { struct waiting_proc *wp, *wpt; int count = 0; LIN_SDT_PROBE3(futex, futex_wake, entry, f, n, bitset); if (bitset == 0) { LIN_SDT_PROBE1(futex, futex_wake, return, EINVAL); return (EINVAL); } FUTEX_ASSERT_LOCKED(f); TAILQ_FOREACH_SAFE(wp, &f->f_waiting_proc, wp_list, wpt) { LIN_SDT_PROBE3(futex, futex_wake, iterate, f->f_uaddr, wp, f->f_refcount); LINUX_CTR3(sys_futex, "futex_wake uaddr %p wp %p ref %d", f->f_uaddr, wp, f->f_refcount); /* * Unless we find a matching bit in * the bitset, continue searching. */ if (!(wp->wp_futex->f_bitset & bitset)) continue; wp->wp_flags |= FUTEX_WP_REMOVED; TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list); LIN_SDT_PROBE1(futex, futex_wake, wakeup, wp); wakeup_one(wp); if (++count == n) break; } LIN_SDT_PROBE1(futex, futex_wake, return, count); return (count); } static int futex_requeue(struct futex *f, int n, struct futex *f2, int n2) { struct waiting_proc *wp, *wpt; int count = 0; LIN_SDT_PROBE4(futex, futex_requeue, entry, f, n, f2, n2); FUTEX_ASSERT_LOCKED(f); FUTEX_ASSERT_LOCKED(f2); TAILQ_FOREACH_SAFE(wp, &f->f_waiting_proc, wp_list, wpt) { if (++count <= n) { LINUX_CTR2(sys_futex, "futex_req_wake uaddr %p wp %p", f->f_uaddr, wp); wp->wp_flags |= FUTEX_WP_REMOVED; TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list); LIN_SDT_PROBE1(futex, futex_requeue, wakeup, wp); wakeup_one(wp); } else { LIN_SDT_PROBE3(futex, futex_requeue, requeue, f->f_uaddr, wp, f2->f_uaddr); LINUX_CTR3(sys_futex, "futex_requeue uaddr %p wp %p to %p", f->f_uaddr, wp, f2->f_uaddr); wp->wp_flags |= FUTEX_WP_REQUEUED; /* Move wp to wp_list of f2 futex */ TAILQ_REMOVE(&f->f_waiting_proc, wp, wp_list); TAILQ_INSERT_HEAD(&f2->f_waiting_proc, wp, wp_list); /* * Thread which sleeps on wp after waking should * acquire f2 lock, so increment refcount of f2 to * prevent it from premature deallocation. */ wp->wp_futex = f2; FUTEXES_LOCK; ++f2->f_refcount; FUTEXES_UNLOCK; if (count - n >= n2) break; } } LIN_SDT_PROBE1(futex, futex_requeue, return, count); return (count); } static int futex_wait(struct futex *f, struct waiting_proc *wp, struct timespec *ts, uint32_t bitset) { int error; LIN_SDT_PROBE4(futex, futex_wait, entry, f, wp, ts, bitset); if (bitset == 0) { LIN_SDT_PROBE1(futex, futex_wait, return, EINVAL); return (EINVAL); } f->f_bitset = bitset; error = futex_sleep(f, wp, ts); if (error) LIN_SDT_PROBE1(futex, futex_wait, sleep_error, error); if (error == EWOULDBLOCK) error = ETIMEDOUT; LIN_SDT_PROBE1(futex, futex_wait, return, error); return (error); } static int futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr) { int op = (encoded_op >> 28) & 7; int cmp = (encoded_op >> 24) & 15; int oparg = (encoded_op << 8) >> 20; int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; LIN_SDT_PROBE3(futex, futex_atomic_op, entry, td, encoded_op, uaddr); if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) oparg = 1 << oparg; LIN_SDT_PROBE4(futex, futex_atomic_op, decoded_op, op, cmp, oparg, cmparg); /* XXX: Linux verifies access here and returns EFAULT */ LIN_SDT_PROBE0(futex, futex_atomic_op, missing_access_check); switch (op) { case FUTEX_OP_SET: ret = futex_xchgl(oparg, uaddr, &oldval); break; case FUTEX_OP_ADD: ret = futex_addl(oparg, uaddr, &oldval); break; case FUTEX_OP_OR: ret = futex_orl(oparg, uaddr, &oldval); break; case FUTEX_OP_ANDN: ret = futex_andl(~oparg, uaddr, &oldval); break; case FUTEX_OP_XOR: ret = futex_xorl(oparg, uaddr, &oldval); break; default: LIN_SDT_PROBE1(futex, futex_atomic_op, unimplemented_op, op); ret = -ENOSYS; break; } if (ret) { LIN_SDT_PROBE1(futex, futex_atomic_op, return, ret); return (ret); } switch (cmp) { case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; default: LIN_SDT_PROBE1(futex, futex_atomic_op, unimplemented_cmp, cmp); ret = -ENOSYS; } LIN_SDT_PROBE1(futex, futex_atomic_op, return, ret); return (ret); } int linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args) { int clockrt, nrwake, op_ret, ret; struct linux_pemuldata *pem; struct waiting_proc *wp; struct futex *f, *f2; struct timespec uts, *ts; int error, save; uint32_t flags, val; LIN_SDT_PROBE2(futex, linux_sys_futex, entry, td, args); if (args->op & LINUX_FUTEX_PRIVATE_FLAG) { flags = 0; args->op &= ~LINUX_FUTEX_PRIVATE_FLAG; } else flags = FUTEX_SHARED; /* * Currently support for switching between CLOCK_MONOTONIC and * CLOCK_REALTIME is not present. However Linux forbids the use of * FUTEX_CLOCK_REALTIME with any op except FUTEX_WAIT_BITSET and * FUTEX_WAIT_REQUEUE_PI. */ clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME; args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME; if (clockrt && args->op != LINUX_FUTEX_WAIT_BITSET && args->op != LINUX_FUTEX_WAIT_REQUEUE_PI) { LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_clockswitch); LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); } error = 0; f = f2 = NULL; switch (args->op) { case LINUX_FUTEX_WAIT: args->val3 = FUTEX_BITSET_MATCH_ANY; /* FALLTHROUGH */ case LINUX_FUTEX_WAIT_BITSET: LIN_SDT_PROBE3(futex, linux_sys_futex, debug_wait, args->uaddr, args->val, args->val3); LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x", args->uaddr, args->val, args->val3); if (args->timeout != NULL) { error = futex_copyin_timeout(args->op, args->timeout, clockrt, &uts); if (error) { LIN_SDT_PROBE1(futex, linux_sys_futex, copyin_error, error); LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } ts = &uts; } else ts = NULL; retry0: error = futex_get(args->uaddr, &wp, &f, flags | FUTEX_CREATE_WP); if (error) { LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } error = copyin_nofault(args->uaddr, &val, sizeof(val)); if (error) { futex_put(f, wp); error = copyin(args->uaddr, &val, sizeof(val)); if (error == 0) goto retry0; LIN_SDT_PROBE1(futex, linux_sys_futex, copyin_error, error); LINUX_CTR1(sys_futex, "WAIT copyin failed %d", error); LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } if (val != args->val) { LIN_SDT_PROBE4(futex, linux_sys_futex, debug_wait_value_neq, args->uaddr, args->val, val, args->val3); LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x != uval 0x%x", args->uaddr, args->val, val); futex_put(f, wp); LIN_SDT_PROBE1(futex, linux_sys_futex, return, EWOULDBLOCK); return (EWOULDBLOCK); } error = futex_wait(f, wp, ts, args->val3); break; case LINUX_FUTEX_WAKE: args->val3 = FUTEX_BITSET_MATCH_ANY; /* FALLTHROUGH */ case LINUX_FUTEX_WAKE_BITSET: LIN_SDT_PROBE3(futex, linux_sys_futex, debug_wake, args->uaddr, args->val, args->val3); LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x", args->uaddr, args->val, args->val3); error = futex_get(args->uaddr, NULL, &f, flags | FUTEX_DONTCREATE); if (error) { LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } if (f == NULL) { td->td_retval[0] = 0; LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } td->td_retval[0] = futex_wake(f, args->val, args->val3); futex_put(f, NULL); break; case LINUX_FUTEX_CMP_REQUEUE: LIN_SDT_PROBE5(futex, linux_sys_futex, debug_cmp_requeue, args->uaddr, args->val, args->val3, args->uaddr2, args->timeout); LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p " "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x", args->uaddr, args->val, args->val3, args->uaddr2, args->timeout); /* * Linux allows this, we would not, it is an incorrect * usage of declared ABI, so return EINVAL. */ if (args->uaddr == args->uaddr2) { LIN_SDT_PROBE0(futex, linux_sys_futex, invalid_cmp_requeue_use); LIN_SDT_PROBE1(futex, linux_sys_futex, return, EINVAL); return (EINVAL); } retry1: error = futex_get(args->uaddr, NULL, &f, flags | FUTEX_DONTLOCK); if (error) { LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } /* * To avoid deadlocks return EINVAL if second futex * exists at this time. * * Glibc fall back to FUTEX_WAKE in case of any error * returned by FUTEX_CMP_REQUEUE. */ error = futex_get(args->uaddr2, NULL, &f2, flags | FUTEX_DONTEXISTS | FUTEX_DONTLOCK); if (error) { futex_put(f, NULL); LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } futex_lock(f); futex_lock(f2); error = copyin_nofault(args->uaddr, &val, sizeof(val)); if (error) { futex_put(f2, NULL); futex_put(f, NULL); error = copyin(args->uaddr, &val, sizeof(val)); if (error == 0) goto retry1; LIN_SDT_PROBE1(futex, linux_sys_futex, copyin_error, error); LINUX_CTR1(sys_futex, "CMP_REQUEUE copyin failed %d", error); LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } if (val != args->val3) { LIN_SDT_PROBE2(futex, linux_sys_futex, debug_cmp_requeue_value_neq, args->val, val); LINUX_CTR2(sys_futex, "CMP_REQUEUE val 0x%x != uval 0x%x", args->val, val); futex_put(f2, NULL); futex_put(f, NULL); LIN_SDT_PROBE1(futex, linux_sys_futex, return, EAGAIN); return (EAGAIN); } nrwake = (int)(unsigned long)args->timeout; td->td_retval[0] = futex_requeue(f, args->val, f2, nrwake); futex_put(f2, NULL); futex_put(f, NULL); break; case LINUX_FUTEX_WAKE_OP: LIN_SDT_PROBE5(futex, linux_sys_futex, debug_wake_op, args->uaddr, args->op, args->val, args->uaddr2, args->val3); LINUX_CTR5(sys_futex, "WAKE_OP " "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x", args->uaddr, args->val, args->uaddr2, args->val3, args->timeout); if (args->uaddr == args->uaddr2) { LIN_SDT_PROBE1(futex, linux_sys_futex, return, EINVAL); return (EINVAL); } retry2: error = futex_get(args->uaddr, NULL, &f, flags | FUTEX_DONTLOCK); if (error) { LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } error = futex_get(args->uaddr2, NULL, &f2, flags | FUTEX_DONTLOCK); if (error) { futex_put(f, NULL); LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } futex_lock(f); futex_lock(f2); /* * This function returns positive number as results and * negative as errors */ save = vm_fault_disable_pagefaults(); op_ret = futex_atomic_op(td, args->val3, args->uaddr2); vm_fault_enable_pagefaults(save); LINUX_CTR2(sys_futex, "WAKE_OP atomic_op uaddr %p ret 0x%x", args->uaddr, op_ret); if (op_ret < 0) { if (f2 != NULL) futex_put(f2, NULL); futex_put(f, NULL); error = copyin(args->uaddr2, &val, sizeof(val)); if (error == 0) goto retry2; LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } ret = futex_wake(f, args->val, args->val3); if (op_ret > 0) { op_ret = 0; nrwake = (int)(unsigned long)args->timeout; if (f2 != NULL) op_ret += futex_wake(f2, nrwake, args->val3); else op_ret += futex_wake(f, nrwake, args->val3); ret += op_ret; } if (f2 != NULL) futex_put(f2, NULL); futex_put(f, NULL); td->td_retval[0] = ret; break; case LINUX_FUTEX_LOCK_PI: /* not yet implemented */ pem = pem_find(td->td_proc); if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { linux_msg(td, "linux_sys_futex: " "unsupported futex_pi op\n"); pem->flags |= LINUX_XUNSUP_FUTEXPIOP; LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_lock_pi); } LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); case LINUX_FUTEX_UNLOCK_PI: /* not yet implemented */ pem = pem_find(td->td_proc); if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { linux_msg(td, "linux_sys_futex: " "unsupported futex_pi op\n"); pem->flags |= LINUX_XUNSUP_FUTEXPIOP; LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_unlock_pi); } LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); case LINUX_FUTEX_TRYLOCK_PI: /* not yet implemented */ pem = pem_find(td->td_proc); if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { linux_msg(td, "linux_sys_futex: " "unsupported futex_pi op\n"); pem->flags |= LINUX_XUNSUP_FUTEXPIOP; LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_trylock_pi); } LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); case LINUX_FUTEX_REQUEUE: /* * Glibc does not use this operation since version 2.3.3, * as it is racy and replaced by FUTEX_CMP_REQUEUE operation. * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when * FUTEX_REQUEUE returned EINVAL. */ pem = pem_find(td->td_proc); if ((pem->flags & LINUX_XDEPR_REQUEUEOP) == 0) { linux_msg(td, "linux_sys_futex: " "unsupported futex_requeue op\n"); pem->flags |= LINUX_XDEPR_REQUEUEOP; LIN_SDT_PROBE0(futex, linux_sys_futex, deprecated_requeue); } LIN_SDT_PROBE1(futex, linux_sys_futex, return, EINVAL); return (EINVAL); case LINUX_FUTEX_WAIT_REQUEUE_PI: /* not yet implemented */ pem = pem_find(td->td_proc); if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { linux_msg(td, "linux_sys_futex: " "unsupported futex_pi op\n"); pem->flags |= LINUX_XUNSUP_FUTEXPIOP; LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_wait_requeue_pi); } LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); case LINUX_FUTEX_CMP_REQUEUE_PI: /* not yet implemented */ pem = pem_find(td->td_proc); if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) { linux_msg(td, "linux_sys_futex: " "unsupported futex_pi op\n"); pem->flags |= LINUX_XUNSUP_FUTEXPIOP; LIN_SDT_PROBE0(futex, linux_sys_futex, unimplemented_cmp_requeue_pi); } LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); default: linux_msg(td, "linux_sys_futex: unknown op %d\n", args->op); LIN_SDT_PROBE1(futex, linux_sys_futex, unknown_operation, args->op); LIN_SDT_PROBE1(futex, linux_sys_futex, return, ENOSYS); return (ENOSYS); } LIN_SDT_PROBE1(futex, linux_sys_futex, return, error); return (error); } int linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args) { struct linux_emuldata *em; LIN_SDT_PROBE2(futex, linux_set_robust_list, entry, td, args); if (args->len != sizeof(struct linux_robust_list_head)) { LIN_SDT_PROBE0(futex, linux_set_robust_list, size_error); LIN_SDT_PROBE1(futex, linux_set_robust_list, return, EINVAL); return (EINVAL); } em = em_find(td); em->robust_futexes = args->head; LIN_SDT_PROBE1(futex, linux_set_robust_list, return, 0); return (0); } int linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args) { struct linux_emuldata *em; struct linux_robust_list_head *head; l_size_t len = sizeof(struct linux_robust_list_head); struct thread *td2; int error = 0; LIN_SDT_PROBE2(futex, linux_get_robust_list, entry, td, args); if (!args->pid) { em = em_find(td); KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n")); head = em->robust_futexes; } else { td2 = tdfind(args->pid, -1); if (td2 == NULL) { LIN_SDT_PROBE1(futex, linux_get_robust_list, return, ESRCH); return (ESRCH); } if (SV_PROC_ABI(td2->td_proc) != SV_ABI_LINUX) { LIN_SDT_PROBE1(futex, linux_get_robust_list, return, EPERM); PROC_UNLOCK(td2->td_proc); return (EPERM); } em = em_find(td2); KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n")); /* XXX: ptrace? */ if (priv_check(td, PRIV_CRED_SETUID) || priv_check(td, PRIV_CRED_SETEUID) || p_candebug(td, td2->td_proc)) { PROC_UNLOCK(td2->td_proc); LIN_SDT_PROBE1(futex, linux_get_robust_list, return, EPERM); return (EPERM); } head = em->robust_futexes; PROC_UNLOCK(td2->td_proc); } error = copyout(&len, args->len, sizeof(l_size_t)); if (error) { LIN_SDT_PROBE1(futex, linux_get_robust_list, copyout_error, error); LIN_SDT_PROBE1(futex, linux_get_robust_list, return, EFAULT); return (EFAULT); } error = copyout(&head, args->head, sizeof(head)); if (error) { LIN_SDT_PROBE1(futex, linux_get_robust_list, copyout_error, error); } LIN_SDT_PROBE1(futex, linux_get_robust_list, return, error); return (error); } static int handle_futex_death(struct linux_emuldata *em, uint32_t *uaddr, unsigned int pi) { uint32_t uval, nval, mval; struct futex *f; int error; LIN_SDT_PROBE3(futex, handle_futex_death, entry, em, uaddr, pi); retry: error = copyin(uaddr, &uval, 4); if (error) { LIN_SDT_PROBE1(futex, handle_futex_death, copyin_error, error); LIN_SDT_PROBE1(futex, handle_futex_death, return, EFAULT); return (EFAULT); } if ((uval & FUTEX_TID_MASK) == em->em_tid) { mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; nval = casuword32(uaddr, uval, mval); if (nval == -1) { LIN_SDT_PROBE1(futex, handle_futex_death, return, EFAULT); return (EFAULT); } if (nval != uval) goto retry; if (!pi && (uval & FUTEX_WAITERS)) { error = futex_get(uaddr, NULL, &f, FUTEX_DONTCREATE | FUTEX_SHARED); if (error) { LIN_SDT_PROBE1(futex, handle_futex_death, return, error); return (error); } if (f != NULL) { futex_wake(f, 1, FUTEX_BITSET_MATCH_ANY); futex_put(f, NULL); } } } LIN_SDT_PROBE1(futex, handle_futex_death, return, 0); return (0); } static int fetch_robust_entry(struct linux_robust_list **entry, struct linux_robust_list **head, unsigned int *pi) { l_ulong uentry; int error; LIN_SDT_PROBE3(futex, fetch_robust_entry, entry, entry, head, pi); error = copyin((const void *)head, &uentry, sizeof(l_ulong)); if (error) { LIN_SDT_PROBE1(futex, fetch_robust_entry, copyin_error, error); LIN_SDT_PROBE1(futex, fetch_robust_entry, return, EFAULT); return (EFAULT); } *entry = (void *)(uentry & ~1UL); *pi = uentry & 1; LIN_SDT_PROBE1(futex, fetch_robust_entry, return, 0); return (0); } /* This walks the list of robust futexes releasing them. */ void release_futexes(struct thread *td, struct linux_emuldata *em) { struct linux_robust_list_head *head = NULL; struct linux_robust_list *entry, *next_entry, *pending; unsigned int limit = 2048, pi, next_pi, pip; l_long futex_offset; int rc, error; LIN_SDT_PROBE2(futex, release_futexes, entry, td, em); head = em->robust_futexes; if (head == NULL) { LIN_SDT_PROBE0(futex, release_futexes, return); return; } if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi)) { LIN_SDT_PROBE0(futex, release_futexes, return); return; } error = copyin(&head->futex_offset, &futex_offset, sizeof(futex_offset)); if (error) { LIN_SDT_PROBE1(futex, release_futexes, copyin_error, error); LIN_SDT_PROBE0(futex, release_futexes, return); return; } if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip)) { LIN_SDT_PROBE0(futex, release_futexes, return); return; } while (entry != &head->list) { rc = fetch_robust_entry(&next_entry, PTRIN(&entry->next), &next_pi); if (entry != pending) if (handle_futex_death(em, (uint32_t *)((caddr_t)entry + futex_offset), pi)) { LIN_SDT_PROBE0(futex, release_futexes, return); return; } if (rc) { LIN_SDT_PROBE0(futex, release_futexes, return); return; } entry = next_entry; pi = next_pi; if (!--limit) break; sched_relinquish(curthread); } if (pending) handle_futex_death(em, (uint32_t *)((caddr_t)pending + futex_offset), pip); LIN_SDT_PROBE0(futex, release_futexes, return); } Index: head/sys/compat/linux/linux_misc.c =================================================================== --- head/sys/compat/linux/linux_misc.c (revision 336913) +++ head/sys/compat/linux/linux_misc.c (revision 336914) @@ -1,2593 +1,2593 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2002 Doug Rabson * Copyright (c) 1994-1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include #include #include #if defined(__i386__) #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include #include #include #include #include #include #include /** * Special DTrace provider for the linuxulator. * * In this file we define the provider for the entire linuxulator. All * modules (= files of the linuxulator) use it. * * We define a different name depending on the emulated bitsize, see * ../..//linux{,32}/linux.h, e.g.: * native bitsize = linuxulator * amd64, 32bit emulation = linuxulator32 */ LIN_SDT_PROVIDER_DEFINE(LINUX_DTRACE); int stclohz; /* Statistics clock frequency */ static unsigned int linux_to_bsd_resource[LINUX_RLIM_NLIMITS] = { RLIMIT_CPU, RLIMIT_FSIZE, RLIMIT_DATA, RLIMIT_STACK, RLIMIT_CORE, RLIMIT_RSS, RLIMIT_NPROC, RLIMIT_NOFILE, RLIMIT_MEMLOCK, RLIMIT_AS }; struct l_sysinfo { l_long uptime; /* Seconds since boot */ l_ulong loads[3]; /* 1, 5, and 15 minute load averages */ #define LINUX_SYSINFO_LOADS_SCALE 65536 l_ulong totalram; /* Total usable main memory size */ l_ulong freeram; /* Available memory size */ l_ulong sharedram; /* Amount of shared memory */ l_ulong bufferram; /* Memory used by buffers */ l_ulong totalswap; /* Total swap space size */ l_ulong freeswap; /* swap space still available */ l_ushort procs; /* Number of current processes */ l_ushort pads; l_ulong totalbig; l_ulong freebig; l_uint mem_unit; char _f[20-2*sizeof(l_long)-sizeof(l_int)]; /* padding */ }; struct l_pselect6arg { l_uintptr_t ss; l_size_t ss_len; }; static int linux_utimensat_nsec_valid(l_long); int linux_sysinfo(struct thread *td, struct linux_sysinfo_args *args) { struct l_sysinfo sysinfo; vm_object_t object; int i, j; struct timespec ts; bzero(&sysinfo, sizeof(sysinfo)); getnanouptime(&ts); if (ts.tv_nsec != 0) ts.tv_sec++; sysinfo.uptime = ts.tv_sec; /* Use the information from the mib to get our load averages */ for (i = 0; i < 3; i++) sysinfo.loads[i] = averunnable.ldavg[i] * LINUX_SYSINFO_LOADS_SCALE / averunnable.fscale; sysinfo.totalram = physmem * PAGE_SIZE; sysinfo.freeram = sysinfo.totalram - vm_wire_count() * PAGE_SIZE; sysinfo.sharedram = 0; mtx_lock(&vm_object_list_mtx); TAILQ_FOREACH(object, &vm_object_list, object_list) if (object->shadow_count > 1) sysinfo.sharedram += object->resident_page_count; mtx_unlock(&vm_object_list_mtx); sysinfo.sharedram *= PAGE_SIZE; sysinfo.bufferram = 0; swap_pager_status(&i, &j); sysinfo.totalswap = i * PAGE_SIZE; sysinfo.freeswap = (i - j) * PAGE_SIZE; sysinfo.procs = nprocs; /* The following are only present in newer Linux kernels. */ sysinfo.totalbig = 0; sysinfo.freebig = 0; sysinfo.mem_unit = 1; return (copyout(&sysinfo, args->info, sizeof(sysinfo))); } #ifdef LINUX_LEGACY_SYSCALLS int linux_alarm(struct thread *td, struct linux_alarm_args *args) { struct itimerval it, old_it; u_int secs; int error; #ifdef DEBUG if (ldebug(alarm)) printf(ARGS(alarm, "%u"), args->secs); #endif secs = args->secs; /* * Linux alarm() is always successful. Limit secs to INT32_MAX / 2 * to match kern_setitimer()'s limit to avoid error from it. * * XXX. Linux limit secs to INT_MAX on 32 and does not limit on 64-bit * platforms. */ if (secs > INT32_MAX / 2) secs = INT32_MAX / 2; it.it_value.tv_sec = secs; it.it_value.tv_usec = 0; timevalclear(&it.it_interval); error = kern_setitimer(td, ITIMER_REAL, &it, &old_it); KASSERT(error == 0, ("kern_setitimer returns %d", error)); if ((old_it.it_value.tv_sec == 0 && old_it.it_value.tv_usec > 0) || old_it.it_value.tv_usec >= 500000) old_it.it_value.tv_sec++; td->td_retval[0] = old_it.it_value.tv_sec; return (0); } #endif int linux_brk(struct thread *td, struct linux_brk_args *args) { struct vmspace *vm = td->td_proc->p_vmspace; uintptr_t new, old; #ifdef DEBUG if (ldebug(brk)) printf(ARGS(brk, "%p"), (void *)(uintptr_t)args->dsend); #endif old = (uintptr_t)vm->vm_daddr + ctob(vm->vm_dsize); new = (uintptr_t)args->dsend; if ((caddr_t)new > vm->vm_daddr && !kern_break(td, &new)) td->td_retval[0] = (register_t)new; else td->td_retval[0] = (register_t)old; return (0); } #if defined(__i386__) /* XXX: what about amd64/linux32? */ int linux_uselib(struct thread *td, struct linux_uselib_args *args) { struct nameidata ni; struct vnode *vp; struct exec *a_out; struct vattr attr; vm_offset_t vmaddr; unsigned long file_offset; unsigned long bss_size; char *library; ssize_t aresid; int error, locked, writecount; LCONVPATHEXIST(td, args->library, &library); #ifdef DEBUG if (ldebug(uselib)) printf(ARGS(uselib, "%s"), library); #endif a_out = NULL; locked = 0; vp = NULL; NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE, library, td); error = namei(&ni); LFREEPATH(library); if (error) goto cleanup; vp = ni.ni_vp; NDFREE(&ni, NDF_ONLY_PNBUF); /* * From here on down, we have a locked vnode that must be unlocked. * XXX: The code below largely duplicates exec_check_permissions(). */ locked = 1; /* Writable? */ error = VOP_GET_WRITECOUNT(vp, &writecount); if (error != 0) goto cleanup; if (writecount != 0) { error = ETXTBSY; goto cleanup; } /* Executable? */ error = VOP_GETATTR(vp, &attr, td->td_ucred); if (error) goto cleanup; if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || ((attr.va_mode & 0111) == 0) || (attr.va_type != VREG)) { /* EACCESS is what exec(2) returns. */ error = ENOEXEC; goto cleanup; } /* Sensible size? */ if (attr.va_size == 0) { error = ENOEXEC; goto cleanup; } /* Can we access it? */ error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td); if (error) goto cleanup; /* * XXX: This should use vn_open() so that it is properly authorized, * and to reduce code redundancy all over the place here. * XXX: Not really, it duplicates far more of exec_check_permissions() * than vn_open(). */ #ifdef MAC error = mac_vnode_check_open(td->td_ucred, vp, VREAD); if (error) goto cleanup; #endif error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL); if (error) goto cleanup; /* Pull in executable header into exec_map */ error = vm_mmap(exec_map, (vm_offset_t *)&a_out, PAGE_SIZE, VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp, 0); if (error) goto cleanup; /* Is it a Linux binary ? */ if (((a_out->a_magic >> 16) & 0xff) != 0x64) { error = ENOEXEC; goto cleanup; } /* * While we are here, we should REALLY do some more checks */ /* Set file/virtual offset based on a.out variant. */ switch ((int)(a_out->a_magic & 0xffff)) { case 0413: /* ZMAGIC */ file_offset = 1024; break; case 0314: /* QMAGIC */ file_offset = 0; break; default: error = ENOEXEC; goto cleanup; } bss_size = round_page(a_out->a_bss); /* Check various fields in header for validity/bounds. */ if (a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) { error = ENOEXEC; goto cleanup; } /* text + data can't exceed file size */ if (a_out->a_data + a_out->a_text > attr.va_size) { error = EFAULT; goto cleanup; } /* * text/data/bss must not exceed limits * XXX - this is not complete. it should check current usage PLUS * the resources needed by this library. */ PROC_LOCK(td->td_proc); if (a_out->a_text > maxtsiz || a_out->a_data + bss_size > lim_cur_proc(td->td_proc, RLIMIT_DATA) || racct_set(td->td_proc, RACCT_DATA, a_out->a_data + bss_size) != 0) { PROC_UNLOCK(td->td_proc); error = ENOMEM; goto cleanup; } PROC_UNLOCK(td->td_proc); /* * Prevent more writers. * XXX: Note that if any of the VM operations fail below we don't * clear this flag. */ VOP_SET_TEXT(vp); /* * Lock no longer needed */ locked = 0; VOP_UNLOCK(vp, 0); /* * Check if file_offset page aligned. Currently we cannot handle * misalinged file offsets, and so we read in the entire image * (what a waste). */ if (file_offset & PAGE_MASK) { #ifdef DEBUG printf("uselib: Non page aligned binary %lu\n", file_offset); #endif /* Map text+data read/write/execute */ /* a_entry is the load address and is page aligned */ vmaddr = trunc_page(a_out->a_entry); /* get anon user mapping, read+write+execute */ error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0, &vmaddr, a_out->a_text + a_out->a_data, 0, VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) goto cleanup; error = vn_rdwr(UIO_READ, vp, (void *)vmaddr, file_offset, a_out->a_text + a_out->a_data, UIO_USERSPACE, 0, td->td_ucred, NOCRED, &aresid, td); if (error != 0) goto cleanup; if (aresid != 0) { error = ENOEXEC; goto cleanup; } } else { #ifdef DEBUG printf("uselib: Page aligned binary %lu\n", file_offset); #endif /* * for QMAGIC, a_entry is 20 bytes beyond the load address * to skip the executable header */ vmaddr = trunc_page(a_out->a_entry); /* * Map it all into the process's space as a single * copy-on-write "data" segment. */ error = vm_mmap(&td->td_proc->p_vmspace->vm_map, &vmaddr, a_out->a_text + a_out->a_data, VM_PROT_ALL, VM_PROT_ALL, MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, vp, file_offset); if (error) goto cleanup; } #ifdef DEBUG printf("mem=%08lx = %08lx %08lx\n", (long)vmaddr, ((long *)vmaddr)[0], ((long *)vmaddr)[1]); #endif if (bss_size != 0) { /* Calculate BSS start address */ vmaddr = trunc_page(a_out->a_entry) + a_out->a_text + a_out->a_data; /* allocate some 'anon' space */ error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0, &vmaddr, bss_size, 0, VMFS_NO_SPACE, VM_PROT_ALL, VM_PROT_ALL, 0); if (error) goto cleanup; } cleanup: /* Unlock vnode if needed */ if (locked) VOP_UNLOCK(vp, 0); /* Release the temporary mapping. */ if (a_out) kmap_free_wakeup(exec_map, (vm_offset_t)a_out, PAGE_SIZE); return (error); } #endif /* __i386__ */ #ifdef LINUX_LEGACY_SYSCALLS int linux_select(struct thread *td, struct linux_select_args *args) { l_timeval ltv; struct timeval tv0, tv1, utv, *tvp; int error; #ifdef DEBUG if (ldebug(select)) printf(ARGS(select, "%d, %p, %p, %p, %p"), args->nfds, (void *)args->readfds, (void *)args->writefds, (void *)args->exceptfds, (void *)args->timeout); #endif /* * Store current time for computation of the amount of * time left. */ if (args->timeout) { if ((error = copyin(args->timeout, <v, sizeof(ltv)))) goto select_out; utv.tv_sec = ltv.tv_sec; utv.tv_usec = ltv.tv_usec; #ifdef DEBUG if (ldebug(select)) printf(LMSG("incoming timeout (%jd/%ld)"), (intmax_t)utv.tv_sec, utv.tv_usec); #endif if (itimerfix(&utv)) { /* * The timeval was invalid. Convert it to something * valid that will act as it does under Linux. */ utv.tv_sec += utv.tv_usec / 1000000; utv.tv_usec %= 1000000; if (utv.tv_usec < 0) { utv.tv_sec -= 1; utv.tv_usec += 1000000; } if (utv.tv_sec < 0) timevalclear(&utv); } microtime(&tv0); tvp = &utv; } else tvp = NULL; error = kern_select(td, args->nfds, args->readfds, args->writefds, args->exceptfds, tvp, LINUX_NFDBITS); #ifdef DEBUG if (ldebug(select)) printf(LMSG("real select returns %d"), error); #endif if (error) goto select_out; if (args->timeout) { if (td->td_retval[0]) { /* * Compute how much time was left of the timeout, * by subtracting the current time and the time * before we started the call, and subtracting * that result from the user-supplied value. */ microtime(&tv1); timevalsub(&tv1, &tv0); timevalsub(&utv, &tv1); if (utv.tv_sec < 0) timevalclear(&utv); } else timevalclear(&utv); #ifdef DEBUG if (ldebug(select)) printf(LMSG("outgoing timeout (%jd/%ld)"), (intmax_t)utv.tv_sec, utv.tv_usec); #endif ltv.tv_sec = utv.tv_sec; ltv.tv_usec = utv.tv_usec; if ((error = copyout(<v, args->timeout, sizeof(ltv)))) goto select_out; } select_out: #ifdef DEBUG if (ldebug(select)) printf(LMSG("select_out -> %d"), error); #endif return (error); } #endif int linux_mremap(struct thread *td, struct linux_mremap_args *args) { uintptr_t addr; size_t len; int error = 0; #ifdef DEBUG if (ldebug(mremap)) printf(ARGS(mremap, "%p, %08lx, %08lx, %08lx"), (void *)(uintptr_t)args->addr, (unsigned long)args->old_len, (unsigned long)args->new_len, (unsigned long)args->flags); #endif if (args->flags & ~(LINUX_MREMAP_FIXED | LINUX_MREMAP_MAYMOVE)) { td->td_retval[0] = 0; return (EINVAL); } /* * Check for the page alignment. * Linux defines PAGE_MASK to be FreeBSD ~PAGE_MASK. */ if (args->addr & PAGE_MASK) { td->td_retval[0] = 0; return (EINVAL); } args->new_len = round_page(args->new_len); args->old_len = round_page(args->old_len); if (args->new_len > args->old_len) { td->td_retval[0] = 0; return (ENOMEM); } if (args->new_len < args->old_len) { addr = args->addr + args->new_len; len = args->old_len - args->new_len; error = kern_munmap(td, addr, len); } td->td_retval[0] = error ? 0 : (uintptr_t)args->addr; return (error); } #define LINUX_MS_ASYNC 0x0001 #define LINUX_MS_INVALIDATE 0x0002 #define LINUX_MS_SYNC 0x0004 int linux_msync(struct thread *td, struct linux_msync_args *args) { return (kern_msync(td, args->addr, args->len, args->fl & ~LINUX_MS_SYNC)); } #ifdef LINUX_LEGACY_SYSCALLS int linux_time(struct thread *td, struct linux_time_args *args) { struct timeval tv; l_time_t tm; int error; #ifdef DEBUG if (ldebug(time)) printf(ARGS(time, "*")); #endif microtime(&tv); tm = tv.tv_sec; if (args->tm && (error = copyout(&tm, args->tm, sizeof(tm)))) return (error); td->td_retval[0] = tm; return (0); } #endif struct l_times_argv { l_clock_t tms_utime; l_clock_t tms_stime; l_clock_t tms_cutime; l_clock_t tms_cstime; }; /* * Glibc versions prior to 2.2.1 always use hard-coded CLK_TCK value. * Since 2.2.1 Glibc uses value exported from kernel via AT_CLKTCK * auxiliary vector entry. */ #define CLK_TCK 100 #define CONVOTCK(r) (r.tv_sec * CLK_TCK + r.tv_usec / (1000000 / CLK_TCK)) #define CONVNTCK(r) (r.tv_sec * stclohz + r.tv_usec / (1000000 / stclohz)) #define CONVTCK(r) (linux_kernver(td) >= LINUX_KERNVER_2004000 ? \ CONVNTCK(r) : CONVOTCK(r)) int linux_times(struct thread *td, struct linux_times_args *args) { struct timeval tv, utime, stime, cutime, cstime; struct l_times_argv tms; struct proc *p; int error; #ifdef DEBUG if (ldebug(times)) printf(ARGS(times, "*")); #endif if (args->buf != NULL) { p = td->td_proc; PROC_LOCK(p); PROC_STATLOCK(p); calcru(p, &utime, &stime); PROC_STATUNLOCK(p); calccru(p, &cutime, &cstime); PROC_UNLOCK(p); tms.tms_utime = CONVTCK(utime); tms.tms_stime = CONVTCK(stime); tms.tms_cutime = CONVTCK(cutime); tms.tms_cstime = CONVTCK(cstime); if ((error = copyout(&tms, args->buf, sizeof(tms)))) return (error); } microuptime(&tv); td->td_retval[0] = (int)CONVTCK(tv); return (0); } int linux_newuname(struct thread *td, struct linux_newuname_args *args) { struct l_new_utsname utsname; char osname[LINUX_MAX_UTSNAME]; char osrelease[LINUX_MAX_UTSNAME]; char *p; #ifdef DEBUG if (ldebug(newuname)) printf(ARGS(newuname, "*")); #endif linux_get_osname(td, osname); linux_get_osrelease(td, osrelease); bzero(&utsname, sizeof(utsname)); strlcpy(utsname.sysname, osname, LINUX_MAX_UTSNAME); getcredhostname(td->td_ucred, utsname.nodename, LINUX_MAX_UTSNAME); getcreddomainname(td->td_ucred, utsname.domainname, LINUX_MAX_UTSNAME); strlcpy(utsname.release, osrelease, LINUX_MAX_UTSNAME); strlcpy(utsname.version, version, LINUX_MAX_UTSNAME); for (p = utsname.version; *p != '\0'; ++p) if (*p == '\n') { *p = '\0'; break; } strlcpy(utsname.machine, linux_kplatform, LINUX_MAX_UTSNAME); return (copyout(&utsname, args->buf, sizeof(utsname))); } struct l_utimbuf { l_time_t l_actime; l_time_t l_modtime; }; #ifdef LINUX_LEGACY_SYSCALLS int linux_utime(struct thread *td, struct linux_utime_args *args) { struct timeval tv[2], *tvp; struct l_utimbuf lut; char *fname; int error; LCONVPATHEXIST(td, args->fname, &fname); #ifdef DEBUG if (ldebug(utime)) printf(ARGS(utime, "%s, *"), fname); #endif if (args->times) { if ((error = copyin(args->times, &lut, sizeof lut))) { LFREEPATH(fname); return (error); } tv[0].tv_sec = lut.l_actime; tv[0].tv_usec = 0; tv[1].tv_sec = lut.l_modtime; tv[1].tv_usec = 0; tvp = tv; } else tvp = NULL; error = kern_utimesat(td, AT_FDCWD, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE); LFREEPATH(fname); return (error); } #endif #ifdef LINUX_LEGACY_SYSCALLS int linux_utimes(struct thread *td, struct linux_utimes_args *args) { l_timeval ltv[2]; struct timeval tv[2], *tvp = NULL; char *fname; int error; LCONVPATHEXIST(td, args->fname, &fname); #ifdef DEBUG if (ldebug(utimes)) printf(ARGS(utimes, "%s, *"), fname); #endif if (args->tptr != NULL) { if ((error = copyin(args->tptr, ltv, sizeof ltv))) { LFREEPATH(fname); return (error); } tv[0].tv_sec = ltv[0].tv_sec; tv[0].tv_usec = ltv[0].tv_usec; tv[1].tv_sec = ltv[1].tv_sec; tv[1].tv_usec = ltv[1].tv_usec; tvp = tv; } error = kern_utimesat(td, AT_FDCWD, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE); LFREEPATH(fname); return (error); } #endif static int linux_utimensat_nsec_valid(l_long nsec) { if (nsec == LINUX_UTIME_OMIT || nsec == LINUX_UTIME_NOW) return (0); if (nsec >= 0 && nsec <= 999999999) return (0); return (1); } int linux_utimensat(struct thread *td, struct linux_utimensat_args *args) { struct l_timespec l_times[2]; struct timespec times[2], *timesp = NULL; char *path = NULL; int error, dfd, flags = 0; dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; #ifdef DEBUG if (ldebug(utimensat)) printf(ARGS(utimensat, "%d, *"), dfd); #endif if (args->flags & ~LINUX_AT_SYMLINK_NOFOLLOW) return (EINVAL); if (args->times != NULL) { error = copyin(args->times, l_times, sizeof(l_times)); if (error != 0) return (error); if (linux_utimensat_nsec_valid(l_times[0].tv_nsec) != 0 || linux_utimensat_nsec_valid(l_times[1].tv_nsec) != 0) return (EINVAL); times[0].tv_sec = l_times[0].tv_sec; switch (l_times[0].tv_nsec) { case LINUX_UTIME_OMIT: times[0].tv_nsec = UTIME_OMIT; break; case LINUX_UTIME_NOW: times[0].tv_nsec = UTIME_NOW; break; default: times[0].tv_nsec = l_times[0].tv_nsec; } times[1].tv_sec = l_times[1].tv_sec; switch (l_times[1].tv_nsec) { case LINUX_UTIME_OMIT: times[1].tv_nsec = UTIME_OMIT; break; case LINUX_UTIME_NOW: times[1].tv_nsec = UTIME_NOW; break; default: times[1].tv_nsec = l_times[1].tv_nsec; break; } timesp = times; /* This breaks POSIX, but is what the Linux kernel does * _on purpose_ (documented in the man page for utimensat(2)), * so we must follow that behaviour. */ if (times[0].tv_nsec == UTIME_OMIT && times[1].tv_nsec == UTIME_OMIT) return (0); } if (args->pathname != NULL) LCONVPATHEXIST_AT(td, args->pathname, &path, dfd); else if (args->flags != 0) return (EINVAL); if (args->flags & LINUX_AT_SYMLINK_NOFOLLOW) flags |= AT_SYMLINK_NOFOLLOW; if (path == NULL) error = kern_futimens(td, dfd, timesp, UIO_SYSSPACE); else { error = kern_utimensat(td, dfd, path, UIO_SYSSPACE, timesp, UIO_SYSSPACE, flags); LFREEPATH(path); } return (error); } #ifdef LINUX_LEGACY_SYSCALLS int linux_futimesat(struct thread *td, struct linux_futimesat_args *args) { l_timeval ltv[2]; struct timeval tv[2], *tvp = NULL; char *fname; int error, dfd; dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; LCONVPATHEXIST_AT(td, args->filename, &fname, dfd); #ifdef DEBUG if (ldebug(futimesat)) printf(ARGS(futimesat, "%s, *"), fname); #endif if (args->utimes != NULL) { if ((error = copyin(args->utimes, ltv, sizeof ltv))) { LFREEPATH(fname); return (error); } tv[0].tv_sec = ltv[0].tv_sec; tv[0].tv_usec = ltv[0].tv_usec; tv[1].tv_sec = ltv[1].tv_sec; tv[1].tv_usec = ltv[1].tv_usec; tvp = tv; } error = kern_utimesat(td, dfd, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE); LFREEPATH(fname); return (error); } #endif int linux_common_wait(struct thread *td, int pid, int *status, int options, struct rusage *ru) { int error, tmpstat; error = kern_wait(td, pid, &tmpstat, options, ru); if (error) return (error); if (status) { tmpstat &= 0xffff; if (WIFSIGNALED(tmpstat)) tmpstat = (tmpstat & 0xffffff80) | bsd_to_linux_signal(WTERMSIG(tmpstat)); else if (WIFSTOPPED(tmpstat)) tmpstat = (tmpstat & 0xffff00ff) | (bsd_to_linux_signal(WSTOPSIG(tmpstat)) << 8); else if (WIFCONTINUED(tmpstat)) tmpstat = 0xffff; error = copyout(&tmpstat, status, sizeof(int)); } return (error); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_waitpid(struct thread *td, struct linux_waitpid_args *args) { struct linux_wait4_args wait4_args; #ifdef DEBUG if (ldebug(waitpid)) printf(ARGS(waitpid, "%d, %p, %d"), args->pid, (void *)args->status, args->options); #endif wait4_args.pid = args->pid; wait4_args.status = args->status; wait4_args.options = args->options; wait4_args.rusage = NULL; return (linux_wait4(td, &wait4_args)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_wait4(struct thread *td, struct linux_wait4_args *args) { int error, options; struct rusage ru, *rup; #ifdef DEBUG if (ldebug(wait4)) printf(ARGS(wait4, "%d, %p, %d, %p"), args->pid, (void *)args->status, args->options, (void *)args->rusage); #endif if (args->options & ~(LINUX_WUNTRACED | LINUX_WNOHANG | LINUX_WCONTINUED | __WCLONE | __WNOTHREAD | __WALL)) return (EINVAL); options = WEXITED; linux_to_bsd_waitopts(args->options, &options); if (args->rusage != NULL) rup = &ru; else rup = NULL; error = linux_common_wait(td, args->pid, args->status, options, rup); if (error != 0) return (error); if (args->rusage != NULL) error = linux_copyout_rusage(&ru, args->rusage); return (error); } int linux_waitid(struct thread *td, struct linux_waitid_args *args) { int status, options, sig; struct __wrusage wru; siginfo_t siginfo; l_siginfo_t lsi; idtype_t idtype; struct proc *p; int error; options = 0; linux_to_bsd_waitopts(args->options, &options); if (options & ~(WNOHANG | WNOWAIT | WEXITED | WUNTRACED | WCONTINUED)) return (EINVAL); if (!(options & (WEXITED | WUNTRACED | WCONTINUED))) return (EINVAL); switch (args->idtype) { case LINUX_P_ALL: idtype = P_ALL; break; case LINUX_P_PID: if (args->id <= 0) return (EINVAL); idtype = P_PID; break; case LINUX_P_PGID: if (args->id <= 0) return (EINVAL); idtype = P_PGID; break; default: return (EINVAL); } error = kern_wait6(td, idtype, args->id, &status, options, &wru, &siginfo); if (error != 0) return (error); if (args->rusage != NULL) { error = linux_copyout_rusage(&wru.wru_children, args->rusage); if (error != 0) return (error); } if (args->info != NULL) { p = td->td_proc; if (td->td_retval[0] == 0) bzero(&lsi, sizeof(lsi)); else { sig = bsd_to_linux_signal(siginfo.si_signo); siginfo_to_lsiginfo(&siginfo, &lsi, sig); } error = copyout(&lsi, args->info, sizeof(lsi)); } td->td_retval[0] = 0; return (error); } #ifdef LINUX_LEGACY_SYSCALLS int linux_mknod(struct thread *td, struct linux_mknod_args *args) { char *path; int error; LCONVPATHCREAT(td, args->path, &path); #ifdef DEBUG if (ldebug(mknod)) printf(ARGS(mknod, "%s, %d, %ju"), path, args->mode, (uintmax_t)args->dev); #endif switch (args->mode & S_IFMT) { case S_IFIFO: case S_IFSOCK: error = kern_mkfifoat(td, AT_FDCWD, path, UIO_SYSSPACE, args->mode); break; case S_IFCHR: case S_IFBLK: error = kern_mknodat(td, AT_FDCWD, path, UIO_SYSSPACE, args->mode, args->dev); break; case S_IFDIR: error = EPERM; break; case 0: args->mode |= S_IFREG; /* FALLTHROUGH */ case S_IFREG: error = kern_openat(td, AT_FDCWD, path, UIO_SYSSPACE, O_WRONLY | O_CREAT | O_TRUNC, args->mode); if (error == 0) kern_close(td, td->td_retval[0]); break; default: error = EINVAL; break; } LFREEPATH(path); return (error); } #endif int linux_mknodat(struct thread *td, struct linux_mknodat_args *args) { char *path; int error, dfd; dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd; LCONVPATHCREAT_AT(td, args->filename, &path, dfd); #ifdef DEBUG if (ldebug(mknodat)) printf(ARGS(mknodat, "%s, %d, %d"), path, args->mode, args->dev); #endif switch (args->mode & S_IFMT) { case S_IFIFO: case S_IFSOCK: error = kern_mkfifoat(td, dfd, path, UIO_SYSSPACE, args->mode); break; case S_IFCHR: case S_IFBLK: error = kern_mknodat(td, dfd, path, UIO_SYSSPACE, args->mode, args->dev); break; case S_IFDIR: error = EPERM; break; case 0: args->mode |= S_IFREG; /* FALLTHROUGH */ case S_IFREG: error = kern_openat(td, dfd, path, UIO_SYSSPACE, O_WRONLY | O_CREAT | O_TRUNC, args->mode); if (error == 0) kern_close(td, td->td_retval[0]); break; default: error = EINVAL; break; } LFREEPATH(path); return (error); } /* * UGH! This is just about the dumbest idea I've ever heard!! */ int linux_personality(struct thread *td, struct linux_personality_args *args) { struct linux_pemuldata *pem; struct proc *p = td->td_proc; uint32_t old; #ifdef DEBUG if (ldebug(personality)) printf(ARGS(personality, "%u"), args->per); #endif PROC_LOCK(p); pem = pem_find(p); old = pem->persona; if (args->per != 0xffffffff) pem->persona = args->per; PROC_UNLOCK(p); td->td_retval[0] = old; return (0); } struct l_itimerval { l_timeval it_interval; l_timeval it_value; }; #define B2L_ITIMERVAL(bip, lip) \ (bip)->it_interval.tv_sec = (lip)->it_interval.tv_sec; \ (bip)->it_interval.tv_usec = (lip)->it_interval.tv_usec; \ (bip)->it_value.tv_sec = (lip)->it_value.tv_sec; \ (bip)->it_value.tv_usec = (lip)->it_value.tv_usec; int linux_setitimer(struct thread *td, struct linux_setitimer_args *uap) { int error; struct l_itimerval ls; struct itimerval aitv, oitv; #ifdef DEBUG if (ldebug(setitimer)) printf(ARGS(setitimer, "%p, %p"), (void *)uap->itv, (void *)uap->oitv); #endif if (uap->itv == NULL) { uap->itv = uap->oitv; return (linux_getitimer(td, (struct linux_getitimer_args *)uap)); } error = copyin(uap->itv, &ls, sizeof(ls)); if (error != 0) return (error); B2L_ITIMERVAL(&aitv, &ls); #ifdef DEBUG if (ldebug(setitimer)) { printf("setitimer: value: sec: %jd, usec: %ld\n", (intmax_t)aitv.it_value.tv_sec, aitv.it_value.tv_usec); printf("setitimer: interval: sec: %jd, usec: %ld\n", (intmax_t)aitv.it_interval.tv_sec, aitv.it_interval.tv_usec); } #endif error = kern_setitimer(td, uap->which, &aitv, &oitv); if (error != 0 || uap->oitv == NULL) return (error); B2L_ITIMERVAL(&ls, &oitv); return (copyout(&ls, uap->oitv, sizeof(ls))); } int linux_getitimer(struct thread *td, struct linux_getitimer_args *uap) { int error; struct l_itimerval ls; struct itimerval aitv; #ifdef DEBUG if (ldebug(getitimer)) printf(ARGS(getitimer, "%p"), (void *)uap->itv); #endif error = kern_getitimer(td, uap->which, &aitv); if (error != 0) return (error); B2L_ITIMERVAL(&ls, &aitv); return (copyout(&ls, uap->itv, sizeof(ls))); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_nice(struct thread *td, struct linux_nice_args *args) { struct setpriority_args bsd_args; bsd_args.which = PRIO_PROCESS; bsd_args.who = 0; /* current process */ bsd_args.prio = args->inc; return (sys_setpriority(td, &bsd_args)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_setgroups(struct thread *td, struct linux_setgroups_args *args) { struct ucred *newcred, *oldcred; l_gid_t *linux_gidset; gid_t *bsd_gidset; int ngrp, error; struct proc *p; ngrp = args->gidsetsize; if (ngrp < 0 || ngrp >= ngroups_max + 1) return (EINVAL); linux_gidset = malloc(ngrp * sizeof(*linux_gidset), M_LINUX, M_WAITOK); error = copyin(args->grouplist, linux_gidset, ngrp * sizeof(l_gid_t)); if (error) goto out; newcred = crget(); crextend(newcred, ngrp + 1); p = td->td_proc; PROC_LOCK(p); oldcred = p->p_ucred; crcopy(newcred, oldcred); /* * cr_groups[0] holds egid. Setting the whole set from * the supplied set will cause egid to be changed too. * Keep cr_groups[0] unchanged to prevent that. */ if ((error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS, 0)) != 0) { PROC_UNLOCK(p); crfree(newcred); goto out; } if (ngrp > 0) { newcred->cr_ngroups = ngrp + 1; bsd_gidset = newcred->cr_groups; ngrp--; while (ngrp >= 0) { bsd_gidset[ngrp + 1] = linux_gidset[ngrp]; ngrp--; } } else newcred->cr_ngroups = 1; setsugid(p); proc_set_cred(p, newcred); PROC_UNLOCK(p); crfree(oldcred); error = 0; out: free(linux_gidset, M_LINUX); return (error); } int linux_getgroups(struct thread *td, struct linux_getgroups_args *args) { struct ucred *cred; l_gid_t *linux_gidset; gid_t *bsd_gidset; int bsd_gidsetsz, ngrp, error; cred = td->td_ucred; bsd_gidset = cred->cr_groups; bsd_gidsetsz = cred->cr_ngroups - 1; /* * cr_groups[0] holds egid. Returning the whole set * here will cause a duplicate. Exclude cr_groups[0] * to prevent that. */ if ((ngrp = args->gidsetsize) == 0) { td->td_retval[0] = bsd_gidsetsz; return (0); } if (ngrp < bsd_gidsetsz) return (EINVAL); ngrp = 0; linux_gidset = malloc(bsd_gidsetsz * sizeof(*linux_gidset), M_LINUX, M_WAITOK); while (ngrp < bsd_gidsetsz) { linux_gidset[ngrp] = bsd_gidset[ngrp + 1]; ngrp++; } error = copyout(linux_gidset, args->grouplist, ngrp * sizeof(l_gid_t)); free(linux_gidset, M_LINUX); if (error) return (error); td->td_retval[0] = ngrp; return (0); } int linux_setrlimit(struct thread *td, struct linux_setrlimit_args *args) { struct rlimit bsd_rlim; struct l_rlimit rlim; u_int which; int error; #ifdef DEBUG if (ldebug(setrlimit)) printf(ARGS(setrlimit, "%d, %p"), args->resource, (void *)args->rlim); #endif if (args->resource >= LINUX_RLIM_NLIMITS) return (EINVAL); which = linux_to_bsd_resource[args->resource]; if (which == -1) return (EINVAL); error = copyin(args->rlim, &rlim, sizeof(rlim)); if (error) return (error); bsd_rlim.rlim_cur = (rlim_t)rlim.rlim_cur; bsd_rlim.rlim_max = (rlim_t)rlim.rlim_max; return (kern_setrlimit(td, which, &bsd_rlim)); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) int linux_old_getrlimit(struct thread *td, struct linux_old_getrlimit_args *args) { struct l_rlimit rlim; struct rlimit bsd_rlim; u_int which; #ifdef DEBUG if (ldebug(old_getrlimit)) printf(ARGS(old_getrlimit, "%d, %p"), args->resource, (void *)args->rlim); #endif if (args->resource >= LINUX_RLIM_NLIMITS) return (EINVAL); which = linux_to_bsd_resource[args->resource]; if (which == -1) return (EINVAL); lim_rlimit(td, which, &bsd_rlim); #ifdef COMPAT_LINUX32 rlim.rlim_cur = (unsigned int)bsd_rlim.rlim_cur; if (rlim.rlim_cur == UINT_MAX) rlim.rlim_cur = INT_MAX; rlim.rlim_max = (unsigned int)bsd_rlim.rlim_max; if (rlim.rlim_max == UINT_MAX) rlim.rlim_max = INT_MAX; #else rlim.rlim_cur = (unsigned long)bsd_rlim.rlim_cur; if (rlim.rlim_cur == ULONG_MAX) rlim.rlim_cur = LONG_MAX; rlim.rlim_max = (unsigned long)bsd_rlim.rlim_max; if (rlim.rlim_max == ULONG_MAX) rlim.rlim_max = LONG_MAX; #endif return (copyout(&rlim, args->rlim, sizeof(rlim))); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_getrlimit(struct thread *td, struct linux_getrlimit_args *args) { struct l_rlimit rlim; struct rlimit bsd_rlim; u_int which; #ifdef DEBUG if (ldebug(getrlimit)) printf(ARGS(getrlimit, "%d, %p"), args->resource, (void *)args->rlim); #endif if (args->resource >= LINUX_RLIM_NLIMITS) return (EINVAL); which = linux_to_bsd_resource[args->resource]; if (which == -1) return (EINVAL); lim_rlimit(td, which, &bsd_rlim); rlim.rlim_cur = (l_ulong)bsd_rlim.rlim_cur; rlim.rlim_max = (l_ulong)bsd_rlim.rlim_max; return (copyout(&rlim, args->rlim, sizeof(rlim))); } int linux_sched_setscheduler(struct thread *td, struct linux_sched_setscheduler_args *args) { struct sched_param sched_param; struct thread *tdt; int error, policy; #ifdef DEBUG if (ldebug(sched_setscheduler)) printf(ARGS(sched_setscheduler, "%d, %d, %p"), args->pid, args->policy, (const void *)args->param); #endif switch (args->policy) { case LINUX_SCHED_OTHER: policy = SCHED_OTHER; break; case LINUX_SCHED_FIFO: policy = SCHED_FIFO; break; case LINUX_SCHED_RR: policy = SCHED_RR; break; default: return (EINVAL); } error = copyin(args->param, &sched_param, sizeof(sched_param)); if (error) return (error); tdt = linux_tdfind(td, args->pid, -1); if (tdt == NULL) return (ESRCH); error = kern_sched_setscheduler(td, tdt, policy, &sched_param); PROC_UNLOCK(tdt->td_proc); return (error); } int linux_sched_getscheduler(struct thread *td, struct linux_sched_getscheduler_args *args) { struct thread *tdt; int error, policy; #ifdef DEBUG if (ldebug(sched_getscheduler)) printf(ARGS(sched_getscheduler, "%d"), args->pid); #endif tdt = linux_tdfind(td, args->pid, -1); if (tdt == NULL) return (ESRCH); error = kern_sched_getscheduler(td, tdt, &policy); PROC_UNLOCK(tdt->td_proc); switch (policy) { case SCHED_OTHER: td->td_retval[0] = LINUX_SCHED_OTHER; break; case SCHED_FIFO: td->td_retval[0] = LINUX_SCHED_FIFO; break; case SCHED_RR: td->td_retval[0] = LINUX_SCHED_RR; break; } return (error); } int linux_sched_get_priority_max(struct thread *td, struct linux_sched_get_priority_max_args *args) { struct sched_get_priority_max_args bsd; #ifdef DEBUG if (ldebug(sched_get_priority_max)) printf(ARGS(sched_get_priority_max, "%d"), args->policy); #endif switch (args->policy) { case LINUX_SCHED_OTHER: bsd.policy = SCHED_OTHER; break; case LINUX_SCHED_FIFO: bsd.policy = SCHED_FIFO; break; case LINUX_SCHED_RR: bsd.policy = SCHED_RR; break; default: return (EINVAL); } return (sys_sched_get_priority_max(td, &bsd)); } int linux_sched_get_priority_min(struct thread *td, struct linux_sched_get_priority_min_args *args) { struct sched_get_priority_min_args bsd; #ifdef DEBUG if (ldebug(sched_get_priority_min)) printf(ARGS(sched_get_priority_min, "%d"), args->policy); #endif switch (args->policy) { case LINUX_SCHED_OTHER: bsd.policy = SCHED_OTHER; break; case LINUX_SCHED_FIFO: bsd.policy = SCHED_FIFO; break; case LINUX_SCHED_RR: bsd.policy = SCHED_RR; break; default: return (EINVAL); } return (sys_sched_get_priority_min(td, &bsd)); } #define REBOOT_CAD_ON 0x89abcdef #define REBOOT_CAD_OFF 0 #define REBOOT_HALT 0xcdef0123 #define REBOOT_RESTART 0x01234567 #define REBOOT_RESTART2 0xA1B2C3D4 #define REBOOT_POWEROFF 0x4321FEDC #define REBOOT_MAGIC1 0xfee1dead #define REBOOT_MAGIC2 0x28121969 #define REBOOT_MAGIC2A 0x05121996 #define REBOOT_MAGIC2B 0x16041998 int linux_reboot(struct thread *td, struct linux_reboot_args *args) { struct reboot_args bsd_args; #ifdef DEBUG if (ldebug(reboot)) printf(ARGS(reboot, "0x%x"), args->cmd); #endif if (args->magic1 != REBOOT_MAGIC1) return (EINVAL); switch (args->magic2) { case REBOOT_MAGIC2: case REBOOT_MAGIC2A: case REBOOT_MAGIC2B: break; default: return (EINVAL); } switch (args->cmd) { case REBOOT_CAD_ON: case REBOOT_CAD_OFF: return (priv_check(td, PRIV_REBOOT)); case REBOOT_HALT: bsd_args.opt = RB_HALT; break; case REBOOT_RESTART: case REBOOT_RESTART2: bsd_args.opt = 0; break; case REBOOT_POWEROFF: bsd_args.opt = RB_POWEROFF; break; default: return (EINVAL); } return (sys_reboot(td, &bsd_args)); } /* * The FreeBSD native getpid(2), getgid(2) and getuid(2) also modify * td->td_retval[1] when COMPAT_43 is defined. This clobbers registers that * are assumed to be preserved. The following lightweight syscalls fixes * this. See also linux_getgid16() and linux_getuid16() in linux_uid16.c * * linux_getpid() - MP SAFE * linux_getgid() - MP SAFE * linux_getuid() - MP SAFE */ int linux_getpid(struct thread *td, struct linux_getpid_args *args) { #ifdef DEBUG if (ldebug(getpid)) printf(ARGS(getpid, "")); #endif td->td_retval[0] = td->td_proc->p_pid; return (0); } int linux_gettid(struct thread *td, struct linux_gettid_args *args) { struct linux_emuldata *em; #ifdef DEBUG if (ldebug(gettid)) printf(ARGS(gettid, "")); #endif em = em_find(td); KASSERT(em != NULL, ("gettid: emuldata not found.\n")); td->td_retval[0] = em->em_tid; return (0); } int linux_getppid(struct thread *td, struct linux_getppid_args *args) { #ifdef DEBUG if (ldebug(getppid)) printf(ARGS(getppid, "")); #endif td->td_retval[0] = kern_getppid(td); return (0); } int linux_getgid(struct thread *td, struct linux_getgid_args *args) { #ifdef DEBUG if (ldebug(getgid)) printf(ARGS(getgid, "")); #endif td->td_retval[0] = td->td_ucred->cr_rgid; return (0); } int linux_getuid(struct thread *td, struct linux_getuid_args *args) { #ifdef DEBUG if (ldebug(getuid)) printf(ARGS(getuid, "")); #endif td->td_retval[0] = td->td_ucred->cr_ruid; return (0); } int linux_getsid(struct thread *td, struct linux_getsid_args *args) { struct getsid_args bsd; #ifdef DEBUG if (ldebug(getsid)) printf(ARGS(getsid, "%i"), args->pid); #endif bsd.pid = args->pid; return (sys_getsid(td, &bsd)); } int linux_nosys(struct thread *td, struct nosys_args *ignore) { return (ENOSYS); } int linux_getpriority(struct thread *td, struct linux_getpriority_args *args) { struct getpriority_args bsd_args; int error; #ifdef DEBUG if (ldebug(getpriority)) printf(ARGS(getpriority, "%i, %i"), args->which, args->who); #endif bsd_args.which = args->which; bsd_args.who = args->who; error = sys_getpriority(td, &bsd_args); td->td_retval[0] = 20 - td->td_retval[0]; return (error); } int linux_sethostname(struct thread *td, struct linux_sethostname_args *args) { int name[2]; #ifdef DEBUG if (ldebug(sethostname)) printf(ARGS(sethostname, "*, %i"), args->len); #endif name[0] = CTL_KERN; name[1] = KERN_HOSTNAME; return (userland_sysctl(td, name, 2, 0, 0, 0, args->hostname, args->len, 0, 0)); } int linux_setdomainname(struct thread *td, struct linux_setdomainname_args *args) { int name[2]; #ifdef DEBUG if (ldebug(setdomainname)) printf(ARGS(setdomainname, "*, %i"), args->len); #endif name[0] = CTL_KERN; name[1] = KERN_NISDOMAINNAME; return (userland_sysctl(td, name, 2, 0, 0, 0, args->name, args->len, 0, 0)); } int linux_exit_group(struct thread *td, struct linux_exit_group_args *args) { #ifdef DEBUG if (ldebug(exit_group)) printf(ARGS(exit_group, "%i"), args->error_code); #endif LINUX_CTR2(exit_group, "thread(%d) (%d)", td->td_tid, args->error_code); /* * XXX: we should send a signal to the parent if * SIGNAL_EXIT_GROUP is set. We ignore that (temporarily?) * as it doesnt occur often. */ exit1(td, args->error_code, 0); /* NOTREACHED */ } #define _LINUX_CAPABILITY_VERSION_1 0x19980330 #define _LINUX_CAPABILITY_VERSION_2 0x20071026 #define _LINUX_CAPABILITY_VERSION_3 0x20080522 struct l_user_cap_header { l_int version; l_int pid; }; struct l_user_cap_data { l_int effective; l_int permitted; l_int inheritable; }; int linux_capget(struct thread *td, struct linux_capget_args *uap) { struct l_user_cap_header luch; struct l_user_cap_data lucd[2]; int error, u32s; if (uap->hdrp == NULL) return (EFAULT); error = copyin(uap->hdrp, &luch, sizeof(luch)); if (error != 0) return (error); switch (luch.version) { case _LINUX_CAPABILITY_VERSION_1: u32s = 1; break; case _LINUX_CAPABILITY_VERSION_2: case _LINUX_CAPABILITY_VERSION_3: u32s = 2; break; default: #ifdef DEBUG if (ldebug(capget)) printf(LMSG("invalid capget capability version 0x%x"), luch.version); #endif luch.version = _LINUX_CAPABILITY_VERSION_1; error = copyout(&luch, uap->hdrp, sizeof(luch)); if (error) return (error); return (EINVAL); } if (luch.pid) return (EPERM); if (uap->datap) { /* * The current implementation doesn't support setting * a capability (it's essentially a stub) so indicate * that no capabilities are currently set or available * to request. */ memset(&lucd, 0, u32s * sizeof(lucd[0])); error = copyout(&lucd, uap->datap, u32s * sizeof(lucd[0])); } return (error); } int linux_capset(struct thread *td, struct linux_capset_args *uap) { struct l_user_cap_header luch; struct l_user_cap_data lucd[2]; int error, i, u32s; if (uap->hdrp == NULL || uap->datap == NULL) return (EFAULT); error = copyin(uap->hdrp, &luch, sizeof(luch)); if (error != 0) return (error); switch (luch.version) { case _LINUX_CAPABILITY_VERSION_1: u32s = 1; break; case _LINUX_CAPABILITY_VERSION_2: case _LINUX_CAPABILITY_VERSION_3: u32s = 2; break; default: #ifdef DEBUG if (ldebug(capset)) printf(LMSG("invalid capset capability version 0x%x"), luch.version); #endif luch.version = _LINUX_CAPABILITY_VERSION_1; error = copyout(&luch, uap->hdrp, sizeof(luch)); if (error) return (error); return (EINVAL); } if (luch.pid) return (EPERM); error = copyin(uap->datap, &lucd, u32s * sizeof(lucd[0])); if (error != 0) return (error); /* We currently don't support setting any capabilities. */ for (i = 0; i < u32s; i++) { if (lucd[i].effective || lucd[i].permitted || lucd[i].inheritable) { linux_msg(td, "capset[%d] effective=0x%x, permitted=0x%x, " "inheritable=0x%x is not implemented", i, (int)lucd[i].effective, (int)lucd[i].permitted, (int)lucd[i].inheritable); return (EPERM); } } return (0); } int linux_prctl(struct thread *td, struct linux_prctl_args *args) { int error = 0, max_size; struct proc *p = td->td_proc; char comm[LINUX_MAX_COMM_LEN]; struct linux_emuldata *em; int pdeath_signal; #ifdef DEBUG if (ldebug(prctl)) printf(ARGS(prctl, "%d, %ju, %ju, %ju, %ju"), args->option, (uintmax_t)args->arg2, (uintmax_t)args->arg3, (uintmax_t)args->arg4, (uintmax_t)args->arg5); #endif switch (args->option) { case LINUX_PR_SET_PDEATHSIG: if (!LINUX_SIG_VALID(args->arg2)) return (EINVAL); em = em_find(td); KASSERT(em != NULL, ("prctl: emuldata not found.\n")); em->pdeath_signal = args->arg2; break; case LINUX_PR_GET_PDEATHSIG: em = em_find(td); KASSERT(em != NULL, ("prctl: emuldata not found.\n")); pdeath_signal = em->pdeath_signal; error = copyout(&pdeath_signal, (void *)(register_t)args->arg2, sizeof(pdeath_signal)); break; case LINUX_PR_GET_KEEPCAPS: /* * Indicate that we always clear the effective and * permitted capability sets when the user id becomes * non-zero (actually the capability sets are simply * always zero in the current implementation). */ td->td_retval[0] = 0; break; case LINUX_PR_SET_KEEPCAPS: /* * Ignore requests to keep the effective and permitted * capability sets when the user id becomes non-zero. */ break; case LINUX_PR_SET_NAME: /* * To be on the safe side we need to make sure to not * overflow the size a Linux program expects. We already * do this here in the copyin, so that we don't need to * check on copyout. */ max_size = MIN(sizeof(comm), sizeof(p->p_comm)); error = copyinstr((void *)(register_t)args->arg2, comm, max_size, NULL); /* Linux silently truncates the name if it is too long. */ if (error == ENAMETOOLONG) { /* * XXX: copyinstr() isn't documented to populate the * array completely, so do a copyin() to be on the * safe side. This should be changed in case * copyinstr() is changed to guarantee this. */ error = copyin((void *)(register_t)args->arg2, comm, max_size - 1); comm[max_size - 1] = '\0'; } if (error) return (error); PROC_LOCK(p); strlcpy(p->p_comm, comm, sizeof(p->p_comm)); PROC_UNLOCK(p); break; case LINUX_PR_GET_NAME: PROC_LOCK(p); strlcpy(comm, p->p_comm, sizeof(comm)); PROC_UNLOCK(p); error = copyout(comm, (void *)(register_t)args->arg2, strlen(comm) + 1); break; default: error = EINVAL; break; } return (error); } int linux_sched_setparam(struct thread *td, struct linux_sched_setparam_args *uap) { struct sched_param sched_param; struct thread *tdt; int error; #ifdef DEBUG if (ldebug(sched_setparam)) printf(ARGS(sched_setparam, "%d, *"), uap->pid); #endif error = copyin(uap->param, &sched_param, sizeof(sched_param)); if (error) return (error); tdt = linux_tdfind(td, uap->pid, -1); if (tdt == NULL) return (ESRCH); error = kern_sched_setparam(td, tdt, &sched_param); PROC_UNLOCK(tdt->td_proc); return (error); } int linux_sched_getparam(struct thread *td, struct linux_sched_getparam_args *uap) { struct sched_param sched_param; struct thread *tdt; int error; #ifdef DEBUG if (ldebug(sched_getparam)) printf(ARGS(sched_getparam, "%d, *"), uap->pid); #endif tdt = linux_tdfind(td, uap->pid, -1); if (tdt == NULL) return (ESRCH); error = kern_sched_getparam(td, tdt, &sched_param); PROC_UNLOCK(tdt->td_proc); if (error == 0) error = copyout(&sched_param, uap->param, sizeof(sched_param)); return (error); } /* * Get affinity of a process. */ int linux_sched_getaffinity(struct thread *td, struct linux_sched_getaffinity_args *args) { int error; struct thread *tdt; #ifdef DEBUG if (ldebug(sched_getaffinity)) printf(ARGS(sched_getaffinity, "%d, %d, *"), args->pid, args->len); #endif if (args->len < sizeof(cpuset_t)) return (EINVAL); tdt = linux_tdfind(td, args->pid, -1); if (tdt == NULL) return (ESRCH); PROC_UNLOCK(tdt->td_proc); error = kern_cpuset_getaffinity(td, CPU_LEVEL_WHICH, CPU_WHICH_TID, tdt->td_tid, sizeof(cpuset_t), (cpuset_t *)args->user_mask_ptr); if (error == 0) td->td_retval[0] = sizeof(cpuset_t); return (error); } /* * Set affinity of a process. */ int linux_sched_setaffinity(struct thread *td, struct linux_sched_setaffinity_args *args) { struct thread *tdt; #ifdef DEBUG if (ldebug(sched_setaffinity)) printf(ARGS(sched_setaffinity, "%d, %d, *"), args->pid, args->len); #endif if (args->len < sizeof(cpuset_t)) return (EINVAL); tdt = linux_tdfind(td, args->pid, -1); if (tdt == NULL) return (ESRCH); PROC_UNLOCK(tdt->td_proc); return (kern_cpuset_setaffinity(td, CPU_LEVEL_WHICH, CPU_WHICH_TID, tdt->td_tid, sizeof(cpuset_t), (cpuset_t *) args->user_mask_ptr)); } struct linux_rlimit64 { uint64_t rlim_cur; uint64_t rlim_max; }; int linux_prlimit64(struct thread *td, struct linux_prlimit64_args *args) { struct rlimit rlim, nrlim; struct linux_rlimit64 lrlim; struct proc *p; u_int which; int flags; int error; #ifdef DEBUG if (ldebug(prlimit64)) printf(ARGS(prlimit64, "%d, %d, %p, %p"), args->pid, args->resource, (void *)args->new, (void *)args->old); #endif if (args->resource >= LINUX_RLIM_NLIMITS) return (EINVAL); which = linux_to_bsd_resource[args->resource]; if (which == -1) return (EINVAL); if (args->new != NULL) { /* * Note. Unlike FreeBSD where rlim is signed 64-bit Linux * rlim is unsigned 64-bit. FreeBSD treats negative limits * as INFINITY so we do not need a conversion even. */ error = copyin(args->new, &nrlim, sizeof(nrlim)); if (error != 0) return (error); } flags = PGET_HOLD | PGET_NOTWEXIT; if (args->new != NULL) flags |= PGET_CANDEBUG; else flags |= PGET_CANSEE; error = pget(args->pid, flags, &p); if (error != 0) return (error); if (args->old != NULL) { PROC_LOCK(p); lim_rlimit_proc(p, which, &rlim); PROC_UNLOCK(p); if (rlim.rlim_cur == RLIM_INFINITY) lrlim.rlim_cur = LINUX_RLIM_INFINITY; else lrlim.rlim_cur = rlim.rlim_cur; if (rlim.rlim_max == RLIM_INFINITY) lrlim.rlim_max = LINUX_RLIM_INFINITY; else lrlim.rlim_max = rlim.rlim_max; error = copyout(&lrlim, args->old, sizeof(lrlim)); if (error != 0) goto out; } if (args->new != NULL) error = kern_proc_setrlimit(td, p, which, &nrlim); out: PRELE(p); return (error); } int linux_pselect6(struct thread *td, struct linux_pselect6_args *args) { struct timeval utv, tv0, tv1, *tvp; struct l_pselect6arg lpse6; struct l_timespec lts; struct timespec uts; l_sigset_t l_ss; sigset_t *ssp; sigset_t ss; int error; ssp = NULL; if (args->sig != NULL) { error = copyin(args->sig, &lpse6, sizeof(lpse6)); if (error != 0) return (error); if (lpse6.ss_len != sizeof(l_ss)) return (EINVAL); if (lpse6.ss != 0) { error = copyin(PTRIN(lpse6.ss), &l_ss, sizeof(l_ss)); if (error != 0) return (error); linux_to_bsd_sigset(&l_ss, &ss); ssp = &ss; } } /* * Currently glibc changes nanosecond number to microsecond. * This mean losing precision but for now it is hardly seen. */ if (args->tsp != NULL) { error = copyin(args->tsp, <s, sizeof(lts)); if (error != 0) return (error); error = linux_to_native_timespec(&uts, <s); if (error != 0) return (error); TIMESPEC_TO_TIMEVAL(&utv, &uts); if (itimerfix(&utv)) return (EINVAL); microtime(&tv0); tvp = &utv; } else tvp = NULL; error = kern_pselect(td, args->nfds, args->readfds, args->writefds, args->exceptfds, tvp, ssp, LINUX_NFDBITS); if (error == 0 && args->tsp != NULL) { if (td->td_retval[0] != 0) { /* * Compute how much time was left of the timeout, * by subtracting the current time and the time * before we started the call, and subtracting * that result from the user-supplied value. */ microtime(&tv1); timevalsub(&tv1, &tv0); timevalsub(&utv, &tv1); if (utv.tv_sec < 0) timevalclear(&utv); } else timevalclear(&utv); TIMEVAL_TO_TIMESPEC(&utv, &uts); error = native_to_linux_timespec(<s, &uts); if (error == 0) error = copyout(<s, args->tsp, sizeof(lts)); } return (error); } int linux_ppoll(struct thread *td, struct linux_ppoll_args *args) { struct timespec ts0, ts1; struct l_timespec lts; struct timespec uts, *tsp; l_sigset_t l_ss; sigset_t *ssp; sigset_t ss; int error; if (args->sset != NULL) { if (args->ssize != sizeof(l_ss)) return (EINVAL); error = copyin(args->sset, &l_ss, sizeof(l_ss)); if (error) return (error); linux_to_bsd_sigset(&l_ss, &ss); ssp = &ss; } else ssp = NULL; if (args->tsp != NULL) { error = copyin(args->tsp, <s, sizeof(lts)); if (error) return (error); error = linux_to_native_timespec(&uts, <s); if (error != 0) return (error); nanotime(&ts0); tsp = &uts; } else tsp = NULL; error = kern_poll(td, args->fds, args->nfds, tsp, ssp); if (error == 0 && args->tsp != NULL) { if (td->td_retval[0]) { nanotime(&ts1); - timespecsub(&ts1, &ts0); - timespecsub(&uts, &ts1); + timespecsub(&ts1, &ts0, &ts1); + timespecsub(&uts, &ts1, &uts); if (uts.tv_sec < 0) timespecclear(&uts); } else timespecclear(&uts); error = native_to_linux_timespec(<s, &uts); if (error == 0) error = copyout(<s, args->tsp, sizeof(lts)); } return (error); } #if defined(DEBUG) || defined(KTR) /* XXX: can be removed when every ldebug(...) and KTR stuff are removed. */ #ifdef COMPAT_LINUX32 #define L_MAXSYSCALL LINUX32_SYS_MAXSYSCALL #else #define L_MAXSYSCALL LINUX_SYS_MAXSYSCALL #endif u_char linux_debug_map[howmany(L_MAXSYSCALL, sizeof(u_char))]; static int linux_debug(int syscall, int toggle, int global) { if (global) { char c = toggle ? 0 : 0xff; memset(linux_debug_map, c, sizeof(linux_debug_map)); return (0); } if (syscall < 0 || syscall >= L_MAXSYSCALL) return (EINVAL); if (toggle) clrbit(linux_debug_map, syscall); else setbit(linux_debug_map, syscall); return (0); } #undef L_MAXSYSCALL /* * Usage: sysctl linux.debug=.<0/1> * * E.g.: sysctl linux.debug=21.0 * * As a special case, syscall "all" will apply to all syscalls globally. */ #define LINUX_MAX_DEBUGSTR 16 int linux_sysctl_debug(SYSCTL_HANDLER_ARGS) { char value[LINUX_MAX_DEBUGSTR], *p; int error, sysc, toggle; int global = 0; value[0] = '\0'; error = sysctl_handle_string(oidp, value, LINUX_MAX_DEBUGSTR, req); if (error || req->newptr == NULL) return (error); for (p = value; *p != '\0' && *p != '.'; p++); if (*p == '\0') return (EINVAL); *p++ = '\0'; sysc = strtol(value, NULL, 0); toggle = strtol(p, NULL, 0); if (strcmp(value, "all") == 0) global = 1; error = linux_debug(sysc, toggle, global); return (error); } #endif /* DEBUG || KTR */ int linux_sched_rr_get_interval(struct thread *td, struct linux_sched_rr_get_interval_args *uap) { struct timespec ts; struct l_timespec lts; struct thread *tdt; int error; /* * According to man in case the invalid pid specified * EINVAL should be returned. */ if (uap->pid < 0) return (EINVAL); tdt = linux_tdfind(td, uap->pid, -1); if (tdt == NULL) return (ESRCH); error = kern_sched_rr_get_interval_td(td, tdt, &ts); PROC_UNLOCK(tdt->td_proc); if (error != 0) return (error); error = native_to_linux_timespec(<s, &ts); if (error != 0) return (error); return (copyout(<s, uap->interval, sizeof(lts))); } /* * In case when the Linux thread is the initial thread in * the thread group thread id is equal to the process id. * Glibc depends on this magic (assert in pthread_getattr_np.c). */ struct thread * linux_tdfind(struct thread *td, lwpid_t tid, pid_t pid) { struct linux_emuldata *em; struct thread *tdt; struct proc *p; tdt = NULL; if (tid == 0 || tid == td->td_tid) { tdt = td; PROC_LOCK(tdt->td_proc); } else if (tid > PID_MAX) tdt = tdfind(tid, pid); else { /* * Initial thread where the tid equal to the pid. */ p = pfind(tid); if (p != NULL) { if (SV_PROC_ABI(p) != SV_ABI_LINUX) { /* * p is not a Linuxulator process. */ PROC_UNLOCK(p); return (NULL); } FOREACH_THREAD_IN_PROC(p, tdt) { em = em_find(tdt); if (tid == em->em_tid) return (tdt); } PROC_UNLOCK(p); } return (NULL); } return (tdt); } void linux_to_bsd_waitopts(int options, int *bsdopts) { if (options & LINUX_WNOHANG) *bsdopts |= WNOHANG; if (options & LINUX_WUNTRACED) *bsdopts |= WUNTRACED; if (options & LINUX_WEXITED) *bsdopts |= WEXITED; if (options & LINUX_WCONTINUED) *bsdopts |= WCONTINUED; if (options & LINUX_WNOWAIT) *bsdopts |= WNOWAIT; if (options & __WCLONE) *bsdopts |= WLINUXCLONE; } int linux_getrandom(struct thread *td, struct linux_getrandom_args *args) { struct uio uio; struct iovec iov; int error; if (args->flags & ~(LINUX_GRND_NONBLOCK|LINUX_GRND_RANDOM)) return (EINVAL); if (args->count > INT_MAX) args->count = INT_MAX; iov.iov_base = args->buf; iov.iov_len = args->count; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_resid = iov.iov_len; uio.uio_segflg = UIO_USERSPACE; uio.uio_rw = UIO_READ; uio.uio_td = td; error = read_random_uio(&uio, args->flags & LINUX_GRND_NONBLOCK); if (error == 0) td->td_retval[0] = args->count - uio.uio_resid; return (error); } int linux_mincore(struct thread *td, struct linux_mincore_args *args) { /* Needs to be page-aligned */ if (args->start & PAGE_MASK) return (EINVAL); return (kern_mincore(td, args->start, args->len, args->vec)); } Index: head/sys/compat/linux/linux_socket.c =================================================================== --- head/sys/compat/linux/linux_socket.c (revision 336913) +++ head/sys/compat/linux/linux_socket.c (revision 336914) @@ -1,1764 +1,1764 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* XXX we use functions that might not exist. */ #include "opt_compat.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #endif #ifdef COMPAT_LINUX32 #include #include #else #include #include #endif #include #include #include #include static int linux_to_bsd_domain(int); static int linux_sendmsg_common(struct thread *, l_int, struct l_msghdr *, l_uint); static int linux_recvmsg_common(struct thread *, l_int, struct l_msghdr *, l_uint, struct msghdr *); static int linux_set_socket_flags(int, int *); /* * Reads a Linux sockaddr and does any necessary translation. * Linux sockaddrs don't have a length field, only a family. * Copy the osockaddr structure pointed to by osa to kernel, adjust * family and convert to sockaddr. */ static int linux_getsockaddr(struct sockaddr **sap, const struct osockaddr *osa, int salen) { struct sockaddr *sa; struct osockaddr *kosa; #ifdef INET6 struct sockaddr_in6 *sin6; int oldv6size; #endif char *name; int bdom, error, hdrlen, namelen; if (salen < 2 || salen > UCHAR_MAX || !osa) return (EINVAL); #ifdef INET6 oldv6size = 0; /* * Check for old (pre-RFC2553) sockaddr_in6. We may accept it * if it's a v4-mapped address, so reserve the proper space * for it. */ if (salen == sizeof(struct sockaddr_in6) - sizeof(uint32_t)) { salen += sizeof(uint32_t); oldv6size = 1; } #endif kosa = malloc(salen, M_SONAME, M_WAITOK); if ((error = copyin(osa, kosa, salen))) goto out; bdom = linux_to_bsd_domain(kosa->sa_family); if (bdom == -1) { error = EAFNOSUPPORT; goto out; } #ifdef INET6 /* * Older Linux IPv6 code uses obsolete RFC2133 struct sockaddr_in6, * which lacks the scope id compared with RFC2553 one. If we detect * the situation, reject the address and write a message to system log. * * Still accept addresses for which the scope id is not used. */ if (oldv6size) { if (bdom == AF_INET6) { sin6 = (struct sockaddr_in6 *)kosa; if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) || (!IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr) && !IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr) && !IN6_IS_ADDR_V4COMPAT(&sin6->sin6_addr) && !IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) && !IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))) { sin6->sin6_scope_id = 0; } else { log(LOG_DEBUG, "obsolete pre-RFC2553 sockaddr_in6 rejected\n"); error = EINVAL; goto out; } } else salen -= sizeof(uint32_t); } #endif if (bdom == AF_INET) { if (salen < sizeof(struct sockaddr_in)) { error = EINVAL; goto out; } salen = sizeof(struct sockaddr_in); } if (bdom == AF_LOCAL && salen > sizeof(struct sockaddr_un)) { hdrlen = offsetof(struct sockaddr_un, sun_path); name = ((struct sockaddr_un *)kosa)->sun_path; if (*name == '\0') { /* * Linux abstract namespace starts with a NULL byte. * XXX We do not support abstract namespace yet. */ namelen = strnlen(name + 1, salen - hdrlen - 1) + 1; } else namelen = strnlen(name, salen - hdrlen); salen = hdrlen + namelen; if (salen > sizeof(struct sockaddr_un)) { error = ENAMETOOLONG; goto out; } } sa = (struct sockaddr *)kosa; sa->sa_family = bdom; sa->sa_len = salen; *sap = sa; return (0); out: free(kosa, M_SONAME); return (error); } static int linux_to_bsd_domain(int domain) { switch (domain) { case LINUX_AF_UNSPEC: return (AF_UNSPEC); case LINUX_AF_UNIX: return (AF_LOCAL); case LINUX_AF_INET: return (AF_INET); case LINUX_AF_INET6: return (AF_INET6); case LINUX_AF_AX25: return (AF_CCITT); case LINUX_AF_IPX: return (AF_IPX); case LINUX_AF_APPLETALK: return (AF_APPLETALK); } return (-1); } static int bsd_to_linux_domain(int domain) { switch (domain) { case AF_UNSPEC: return (LINUX_AF_UNSPEC); case AF_LOCAL: return (LINUX_AF_UNIX); case AF_INET: return (LINUX_AF_INET); case AF_INET6: return (LINUX_AF_INET6); case AF_CCITT: return (LINUX_AF_AX25); case AF_IPX: return (LINUX_AF_IPX); case AF_APPLETALK: return (LINUX_AF_APPLETALK); } return (-1); } static int linux_to_bsd_sockopt_level(int level) { switch (level) { case LINUX_SOL_SOCKET: return (SOL_SOCKET); } return (level); } static int bsd_to_linux_sockopt_level(int level) { switch (level) { case SOL_SOCKET: return (LINUX_SOL_SOCKET); } return (level); } static int linux_to_bsd_ip_sockopt(int opt) { switch (opt) { case LINUX_IP_TOS: return (IP_TOS); case LINUX_IP_TTL: return (IP_TTL); case LINUX_IP_OPTIONS: return (IP_OPTIONS); case LINUX_IP_MULTICAST_IF: return (IP_MULTICAST_IF); case LINUX_IP_MULTICAST_TTL: return (IP_MULTICAST_TTL); case LINUX_IP_MULTICAST_LOOP: return (IP_MULTICAST_LOOP); case LINUX_IP_ADD_MEMBERSHIP: return (IP_ADD_MEMBERSHIP); case LINUX_IP_DROP_MEMBERSHIP: return (IP_DROP_MEMBERSHIP); case LINUX_IP_HDRINCL: return (IP_HDRINCL); } return (-1); } static int linux_to_bsd_ip6_sockopt(int opt) { switch (opt) { case LINUX_IPV6_NEXTHOP: return (IPV6_NEXTHOP); case LINUX_IPV6_UNICAST_HOPS: return (IPV6_UNICAST_HOPS); case LINUX_IPV6_MULTICAST_IF: return (IPV6_MULTICAST_IF); case LINUX_IPV6_MULTICAST_HOPS: return (IPV6_MULTICAST_HOPS); case LINUX_IPV6_MULTICAST_LOOP: return (IPV6_MULTICAST_LOOP); case LINUX_IPV6_ADD_MEMBERSHIP: return (IPV6_JOIN_GROUP); case LINUX_IPV6_DROP_MEMBERSHIP: return (IPV6_LEAVE_GROUP); case LINUX_IPV6_V6ONLY: return (IPV6_V6ONLY); case LINUX_IPV6_DONTFRAG: return (IPV6_DONTFRAG); #if 0 case LINUX_IPV6_CHECKSUM: return (IPV6_CHECKSUM); case LINUX_IPV6_RECVPKTINFO: return (IPV6_RECVPKTINFO); case LINUX_IPV6_PKTINFO: return (IPV6_PKTINFO); case LINUX_IPV6_RECVHOPLIMIT: return (IPV6_RECVHOPLIMIT); case LINUX_IPV6_HOPLIMIT: return (IPV6_HOPLIMIT); case LINUX_IPV6_RECVHOPOPTS: return (IPV6_RECVHOPOPTS); case LINUX_IPV6_HOPOPTS: return (IPV6_HOPOPTS); case LINUX_IPV6_RTHDRDSTOPTS: return (IPV6_RTHDRDSTOPTS); case LINUX_IPV6_RECVRTHDR: return (IPV6_RECVRTHDR); case LINUX_IPV6_RTHDR: return (IPV6_RTHDR); case LINUX_IPV6_RECVDSTOPTS: return (IPV6_RECVDSTOPTS); case LINUX_IPV6_DSTOPTS: return (IPV6_DSTOPTS); case LINUX_IPV6_RECVPATHMTU: return (IPV6_RECVPATHMTU); case LINUX_IPV6_PATHMTU: return (IPV6_PATHMTU); #endif } return (-1); } static int linux_to_bsd_so_sockopt(int opt) { switch (opt) { case LINUX_SO_DEBUG: return (SO_DEBUG); case LINUX_SO_REUSEADDR: return (SO_REUSEADDR); case LINUX_SO_TYPE: return (SO_TYPE); case LINUX_SO_ERROR: return (SO_ERROR); case LINUX_SO_DONTROUTE: return (SO_DONTROUTE); case LINUX_SO_BROADCAST: return (SO_BROADCAST); case LINUX_SO_SNDBUF: return (SO_SNDBUF); case LINUX_SO_RCVBUF: return (SO_RCVBUF); case LINUX_SO_KEEPALIVE: return (SO_KEEPALIVE); case LINUX_SO_OOBINLINE: return (SO_OOBINLINE); case LINUX_SO_LINGER: return (SO_LINGER); case LINUX_SO_PEERCRED: return (LOCAL_PEERCRED); case LINUX_SO_RCVLOWAT: return (SO_RCVLOWAT); case LINUX_SO_SNDLOWAT: return (SO_SNDLOWAT); case LINUX_SO_RCVTIMEO: return (SO_RCVTIMEO); case LINUX_SO_SNDTIMEO: return (SO_SNDTIMEO); case LINUX_SO_TIMESTAMP: return (SO_TIMESTAMP); case LINUX_SO_ACCEPTCONN: return (SO_ACCEPTCONN); } return (-1); } static int linux_to_bsd_tcp_sockopt(int opt) { switch (opt) { case LINUX_TCP_NODELAY: return (TCP_NODELAY); case LINUX_TCP_MAXSEG: return (TCP_MAXSEG); case LINUX_TCP_KEEPIDLE: return (TCP_KEEPIDLE); case LINUX_TCP_KEEPINTVL: return (TCP_KEEPINTVL); case LINUX_TCP_KEEPCNT: return (TCP_KEEPCNT); case LINUX_TCP_MD5SIG: return (TCP_MD5SIG); } return (-1); } static int linux_to_bsd_msg_flags(int flags) { int ret_flags = 0; if (flags & LINUX_MSG_OOB) ret_flags |= MSG_OOB; if (flags & LINUX_MSG_PEEK) ret_flags |= MSG_PEEK; if (flags & LINUX_MSG_DONTROUTE) ret_flags |= MSG_DONTROUTE; if (flags & LINUX_MSG_CTRUNC) ret_flags |= MSG_CTRUNC; if (flags & LINUX_MSG_TRUNC) ret_flags |= MSG_TRUNC; if (flags & LINUX_MSG_DONTWAIT) ret_flags |= MSG_DONTWAIT; if (flags & LINUX_MSG_EOR) ret_flags |= MSG_EOR; if (flags & LINUX_MSG_WAITALL) ret_flags |= MSG_WAITALL; if (flags & LINUX_MSG_NOSIGNAL) ret_flags |= MSG_NOSIGNAL; #if 0 /* not handled */ if (flags & LINUX_MSG_PROXY) ; if (flags & LINUX_MSG_FIN) ; if (flags & LINUX_MSG_SYN) ; if (flags & LINUX_MSG_CONFIRM) ; if (flags & LINUX_MSG_RST) ; if (flags & LINUX_MSG_ERRQUEUE) ; #endif return (ret_flags); } /* * If bsd_to_linux_sockaddr() or linux_to_bsd_sockaddr() faults, then the * native syscall will fault. Thus, we don't really need to check the * return values for these functions. */ static int bsd_to_linux_sockaddr(struct sockaddr *arg) { struct sockaddr sa; size_t sa_len = sizeof(struct sockaddr); int error, bdom; if ((error = copyin(arg, &sa, sa_len))) return (error); bdom = bsd_to_linux_domain(sa.sa_family); if (bdom == -1) return (EAFNOSUPPORT); *(u_short *)&sa = bdom; return (copyout(&sa, arg, sa_len)); } static int linux_to_bsd_sockaddr(struct sockaddr *arg, int len) { struct sockaddr sa; size_t sa_len = sizeof(struct sockaddr); int error, bdom; if ((error = copyin(arg, &sa, sa_len))) return (error); bdom = linux_to_bsd_domain(*(sa_family_t *)&sa); if (bdom == -1) return (EAFNOSUPPORT); sa.sa_family = bdom; sa.sa_len = len; return (copyout(&sa, arg, sa_len)); } static int linux_sa_put(struct osockaddr *osa) { struct osockaddr sa; int error, bdom; /* * Only read/write the osockaddr family part, the rest is * not changed. */ error = copyin(osa, &sa, sizeof(sa.sa_family)); if (error != 0) return (error); bdom = bsd_to_linux_domain(sa.sa_family); if (bdom == -1) return (EINVAL); sa.sa_family = bdom; return (copyout(&sa, osa, sizeof(sa.sa_family))); } static int linux_to_bsd_cmsg_type(int cmsg_type) { switch (cmsg_type) { case LINUX_SCM_RIGHTS: return (SCM_RIGHTS); case LINUX_SCM_CREDENTIALS: return (SCM_CREDS); } return (-1); } static int bsd_to_linux_cmsg_type(int cmsg_type) { switch (cmsg_type) { case SCM_RIGHTS: return (LINUX_SCM_RIGHTS); case SCM_CREDS: return (LINUX_SCM_CREDENTIALS); case SCM_TIMESTAMP: return (LINUX_SCM_TIMESTAMP); } return (-1); } static int linux_to_bsd_msghdr(struct msghdr *bhdr, const struct l_msghdr *lhdr) { if (lhdr->msg_controllen > INT_MAX) return (ENOBUFS); bhdr->msg_name = PTRIN(lhdr->msg_name); bhdr->msg_namelen = lhdr->msg_namelen; bhdr->msg_iov = PTRIN(lhdr->msg_iov); bhdr->msg_iovlen = lhdr->msg_iovlen; bhdr->msg_control = PTRIN(lhdr->msg_control); /* * msg_controllen is skipped since BSD and LINUX control messages * are potentially different sizes (e.g. the cred structure used * by SCM_CREDS is different between the two operating system). * * The caller can set it (if necessary) after converting all the * control messages. */ bhdr->msg_flags = linux_to_bsd_msg_flags(lhdr->msg_flags); return (0); } static int bsd_to_linux_msghdr(const struct msghdr *bhdr, struct l_msghdr *lhdr) { lhdr->msg_name = PTROUT(bhdr->msg_name); lhdr->msg_namelen = bhdr->msg_namelen; lhdr->msg_iov = PTROUT(bhdr->msg_iov); lhdr->msg_iovlen = bhdr->msg_iovlen; lhdr->msg_control = PTROUT(bhdr->msg_control); /* * msg_controllen is skipped since BSD and LINUX control messages * are potentially different sizes (e.g. the cred structure used * by SCM_CREDS is different between the two operating system). * * The caller can set it (if necessary) after converting all the * control messages. */ /* msg_flags skipped */ return (0); } static int linux_set_socket_flags(int lflags, int *flags) { if (lflags & ~(LINUX_SOCK_CLOEXEC | LINUX_SOCK_NONBLOCK)) return (EINVAL); if (lflags & LINUX_SOCK_NONBLOCK) *flags |= SOCK_NONBLOCK; if (lflags & LINUX_SOCK_CLOEXEC) *flags |= SOCK_CLOEXEC; return (0); } static int linux_sendit(struct thread *td, int s, struct msghdr *mp, int flags, struct mbuf *control, enum uio_seg segflg) { struct sockaddr *to; int error; if (mp->msg_name != NULL) { error = linux_getsockaddr(&to, mp->msg_name, mp->msg_namelen); if (error != 0) return (error); mp->msg_name = to; } else to = NULL; error = kern_sendit(td, s, mp, linux_to_bsd_msg_flags(flags), control, segflg); if (to) free(to, M_SONAME); return (error); } /* Return 0 if IP_HDRINCL is set for the given socket. */ static int linux_check_hdrincl(struct thread *td, int s) { int error, optval; socklen_t size_val; size_val = sizeof(optval); error = kern_getsockopt(td, s, IPPROTO_IP, IP_HDRINCL, &optval, UIO_SYSSPACE, &size_val); if (error != 0) return (error); return (optval == 0); } /* * Updated sendto() when IP_HDRINCL is set: * tweak endian-dependent fields in the IP packet. */ static int linux_sendto_hdrincl(struct thread *td, struct linux_sendto_args *linux_args) { /* * linux_ip_copysize defines how many bytes we should copy * from the beginning of the IP packet before we customize it for BSD. * It should include all the fields we modify (ip_len and ip_off). */ #define linux_ip_copysize 8 struct ip *packet; struct msghdr msg; struct iovec aiov[1]; int error; /* Check that the packet isn't too big or too small. */ if (linux_args->len < linux_ip_copysize || linux_args->len > IP_MAXPACKET) return (EINVAL); packet = (struct ip *)malloc(linux_args->len, M_LINUX, M_WAITOK); /* Make kernel copy of the packet to be sent */ if ((error = copyin(PTRIN(linux_args->msg), packet, linux_args->len))) goto goout; /* Convert fields from Linux to BSD raw IP socket format */ packet->ip_len = linux_args->len; packet->ip_off = ntohs(packet->ip_off); /* Prepare the msghdr and iovec structures describing the new packet */ msg.msg_name = PTRIN(linux_args->to); msg.msg_namelen = linux_args->tolen; msg.msg_iov = aiov; msg.msg_iovlen = 1; msg.msg_control = NULL; msg.msg_flags = 0; aiov[0].iov_base = (char *)packet; aiov[0].iov_len = linux_args->len; error = linux_sendit(td, linux_args->s, &msg, linux_args->flags, NULL, UIO_SYSSPACE); goout: free(packet, M_LINUX); return (error); } int linux_socket(struct thread *td, struct linux_socket_args *args) { int domain, retval_socket, type; type = args->type & LINUX_SOCK_TYPE_MASK; if (type < 0 || type > LINUX_SOCK_MAX) return (EINVAL); retval_socket = linux_set_socket_flags(args->type & ~LINUX_SOCK_TYPE_MASK, &type); if (retval_socket != 0) return (retval_socket); domain = linux_to_bsd_domain(args->domain); if (domain == -1) return (EAFNOSUPPORT); retval_socket = kern_socket(td, domain, type, args->protocol); if (retval_socket) return (retval_socket); if (type == SOCK_RAW && (args->protocol == IPPROTO_RAW || args->protocol == 0) && domain == PF_INET) { /* It's a raw IP socket: set the IP_HDRINCL option. */ int hdrincl; hdrincl = 1; /* We ignore any error returned by kern_setsockopt() */ kern_setsockopt(td, td->td_retval[0], IPPROTO_IP, IP_HDRINCL, &hdrincl, UIO_SYSSPACE, sizeof(hdrincl)); } #ifdef INET6 /* * Linux AF_INET6 socket has IPV6_V6ONLY setsockopt set to 0 by default * and some apps depend on this. So, set V6ONLY to 0 for Linux apps. * For simplicity we do this unconditionally of the net.inet6.ip6.v6only * sysctl value. */ if (domain == PF_INET6) { int v6only; v6only = 0; /* We ignore any error returned by setsockopt() */ kern_setsockopt(td, td->td_retval[0], IPPROTO_IPV6, IPV6_V6ONLY, &v6only, UIO_SYSSPACE, sizeof(v6only)); } #endif return (retval_socket); } int linux_bind(struct thread *td, struct linux_bind_args *args) { struct sockaddr *sa; int error; error = linux_getsockaddr(&sa, PTRIN(args->name), args->namelen); if (error != 0) return (error); error = kern_bindat(td, AT_FDCWD, args->s, sa); free(sa, M_SONAME); if (error == EADDRNOTAVAIL && args->namelen != sizeof(struct sockaddr_in)) return (EINVAL); return (error); } int linux_connect(struct thread *td, struct linux_connect_args *args) { struct socket *so; struct sockaddr *sa; struct file *fp; u_int fflag; int error; error = linux_getsockaddr(&sa, (struct osockaddr *)PTRIN(args->name), args->namelen); if (error != 0) return (error); error = kern_connectat(td, AT_FDCWD, args->s, sa); free(sa, M_SONAME); if (error != EISCONN) return (error); /* * Linux doesn't return EISCONN the first time it occurs, * when on a non-blocking socket. Instead it returns the * error getsockopt(SOL_SOCKET, SO_ERROR) would return on BSD. */ error = getsock_cap(td, args->s, &cap_connect_rights, &fp, &fflag, NULL); if (error != 0) return (error); error = EISCONN; so = fp->f_data; if (fflag & FNONBLOCK) { SOCK_LOCK(so); if (so->so_emuldata == 0) error = so->so_error; so->so_emuldata = (void *)1; SOCK_UNLOCK(so); } fdrop(fp, td); return (error); } int linux_listen(struct thread *td, struct linux_listen_args *args) { return (kern_listen(td, args->s, args->backlog)); } static int linux_accept_common(struct thread *td, int s, l_uintptr_t addr, l_uintptr_t namelen, int flags) { struct accept4_args /* { int s; struct sockaddr * __restrict name; socklen_t * __restrict anamelen; int flags; } */ bsd_args; struct socket *so; struct file *fp; int error, error1; bsd_args.s = s; bsd_args.name = (struct sockaddr * __restrict)PTRIN(addr); bsd_args.anamelen = PTRIN(namelen); bsd_args.flags = 0; error = linux_set_socket_flags(flags, &bsd_args.flags); if (error != 0) return (error); error = sys_accept4(td, &bsd_args); bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.name); if (error != 0) { if (error == EFAULT && namelen != sizeof(struct sockaddr_in)) return (EINVAL); if (error == EINVAL) { error1 = getsock_cap(td, s, &cap_accept_rights, &fp, NULL, NULL); if (error1 != 0) return (error1); so = fp->f_data; if (so->so_type == SOCK_DGRAM) { fdrop(fp, td); return (EOPNOTSUPP); } fdrop(fp, td); } return (error); } if (addr) error = linux_sa_put(PTRIN(addr)); if (error != 0) { (void)kern_close(td, td->td_retval[0]); td->td_retval[0] = 0; } return (error); } int linux_accept(struct thread *td, struct linux_accept_args *args) { return (linux_accept_common(td, args->s, args->addr, args->namelen, 0)); } int linux_accept4(struct thread *td, struct linux_accept4_args *args) { return (linux_accept_common(td, args->s, args->addr, args->namelen, args->flags)); } int linux_getsockname(struct thread *td, struct linux_getsockname_args *args) { struct getsockname_args /* { int fdes; struct sockaddr * __restrict asa; socklen_t * __restrict alen; } */ bsd_args; int error; bsd_args.fdes = args->s; bsd_args.asa = (struct sockaddr * __restrict)PTRIN(args->addr); bsd_args.alen = PTRIN(args->namelen); error = sys_getsockname(td, &bsd_args); bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.asa); if (error != 0) return (error); return (linux_sa_put(PTRIN(args->addr))); } int linux_getpeername(struct thread *td, struct linux_getpeername_args *args) { struct getpeername_args /* { int fdes; caddr_t asa; int *alen; } */ bsd_args; int error; bsd_args.fdes = args->s; bsd_args.asa = (struct sockaddr *)PTRIN(args->addr); bsd_args.alen = (socklen_t *)PTRIN(args->namelen); error = sys_getpeername(td, &bsd_args); bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.asa); if (error != 0) return (error); return (linux_sa_put(PTRIN(args->addr))); } int linux_socketpair(struct thread *td, struct linux_socketpair_args *args) { struct socketpair_args /* { int domain; int type; int protocol; int *rsv; } */ bsd_args; int error; bsd_args.domain = linux_to_bsd_domain(args->domain); if (bsd_args.domain != PF_LOCAL) return (EAFNOSUPPORT); bsd_args.type = args->type & LINUX_SOCK_TYPE_MASK; if (bsd_args.type < 0 || bsd_args.type > LINUX_SOCK_MAX) return (EINVAL); error = linux_set_socket_flags(args->type & ~LINUX_SOCK_TYPE_MASK, &bsd_args.type); if (error != 0) return (error); if (args->protocol != 0 && args->protocol != PF_UNIX) /* * Use of PF_UNIX as protocol argument is not right, * but Linux does it. * Do not map PF_UNIX as its Linux value is identical * to FreeBSD one. */ return (EPROTONOSUPPORT); else bsd_args.protocol = 0; bsd_args.rsv = (int *)PTRIN(args->rsv); return (sys_socketpair(td, &bsd_args)); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) struct linux_send_args { register_t s; register_t msg; register_t len; register_t flags; }; static int linux_send(struct thread *td, struct linux_send_args *args) { struct sendto_args /* { int s; caddr_t buf; int len; int flags; caddr_t to; int tolen; } */ bsd_args; bsd_args.s = args->s; bsd_args.buf = (caddr_t)PTRIN(args->msg); bsd_args.len = args->len; bsd_args.flags = args->flags; bsd_args.to = NULL; bsd_args.tolen = 0; return (sys_sendto(td, &bsd_args)); } struct linux_recv_args { register_t s; register_t msg; register_t len; register_t flags; }; static int linux_recv(struct thread *td, struct linux_recv_args *args) { struct recvfrom_args /* { int s; caddr_t buf; int len; int flags; struct sockaddr *from; socklen_t fromlenaddr; } */ bsd_args; bsd_args.s = args->s; bsd_args.buf = (caddr_t)PTRIN(args->msg); bsd_args.len = args->len; bsd_args.flags = linux_to_bsd_msg_flags(args->flags); bsd_args.from = NULL; bsd_args.fromlenaddr = 0; return (sys_recvfrom(td, &bsd_args)); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ int linux_sendto(struct thread *td, struct linux_sendto_args *args) { struct msghdr msg; struct iovec aiov; if (linux_check_hdrincl(td, args->s) == 0) /* IP_HDRINCL set, tweak the packet before sending */ return (linux_sendto_hdrincl(td, args)); msg.msg_name = PTRIN(args->to); msg.msg_namelen = args->tolen; msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = NULL; msg.msg_flags = 0; aiov.iov_base = PTRIN(args->msg); aiov.iov_len = args->len; return (linux_sendit(td, args->s, &msg, args->flags, NULL, UIO_USERSPACE)); } int linux_recvfrom(struct thread *td, struct linux_recvfrom_args *args) { struct msghdr msg; struct iovec aiov; int error, fromlen; if (PTRIN(args->fromlen) != NULL) { error = copyin(PTRIN(args->fromlen), &fromlen, sizeof(fromlen)); if (error != 0) return (error); if (fromlen < 0) return (EINVAL); msg.msg_namelen = fromlen; } else msg.msg_namelen = 0; msg.msg_name = (struct sockaddr * __restrict)PTRIN(args->from); msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = PTRIN(args->buf); aiov.iov_len = args->len; msg.msg_control = 0; msg.msg_flags = linux_to_bsd_msg_flags(args->flags); error = kern_recvit(td, args->s, &msg, UIO_USERSPACE, NULL); if (error != 0) return (error); if (PTRIN(args->from) != NULL) { error = bsd_to_linux_sockaddr((struct sockaddr *) PTRIN(args->from)); if (error != 0) return (error); error = linux_sa_put((struct osockaddr *) PTRIN(args->from)); } if (PTRIN(args->fromlen) != NULL) error = copyout(&msg.msg_namelen, PTRIN(args->fromlen), sizeof(msg.msg_namelen)); return (error); } static int linux_sendmsg_common(struct thread *td, l_int s, struct l_msghdr *msghdr, l_uint flags) { struct cmsghdr *cmsg; struct cmsgcred cmcred; struct mbuf *control; struct msghdr msg; struct l_cmsghdr linux_cmsg; struct l_cmsghdr *ptr_cmsg; struct l_msghdr linux_msg; struct iovec *iov; socklen_t datalen; struct sockaddr *sa; sa_family_t sa_family; void *data; int error; error = copyin(msghdr, &linux_msg, sizeof(linux_msg)); if (error != 0) return (error); /* * Some Linux applications (ping) define a non-NULL control data * pointer, but a msg_controllen of 0, which is not allowed in the * FreeBSD system call interface. NULL the msg_control pointer in * order to handle this case. This should be checked, but allows the * Linux ping to work. */ if (PTRIN(linux_msg.msg_control) != NULL && linux_msg.msg_controllen == 0) linux_msg.msg_control = PTROUT(NULL); error = linux_to_bsd_msghdr(&msg, &linux_msg); if (error != 0) return (error); #ifdef COMPAT_LINUX32 error = linux32_copyiniov(PTRIN(msg.msg_iov), msg.msg_iovlen, &iov, EMSGSIZE); #else error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE); #endif if (error != 0) return (error); control = NULL; cmsg = NULL; if ((ptr_cmsg = LINUX_CMSG_FIRSTHDR(&linux_msg)) != NULL) { error = kern_getsockname(td, s, &sa, &datalen); if (error != 0) goto bad; sa_family = sa->sa_family; free(sa, M_SONAME); error = ENOBUFS; cmsg = malloc(CMSG_HDRSZ, M_LINUX, M_WAITOK|M_ZERO); control = m_get(M_WAITOK, MT_CONTROL); do { error = copyin(ptr_cmsg, &linux_cmsg, sizeof(struct l_cmsghdr)); if (error != 0) goto bad; error = EINVAL; if (linux_cmsg.cmsg_len < sizeof(struct l_cmsghdr)) goto bad; /* * Now we support only SCM_RIGHTS and SCM_CRED, * so return EINVAL in any other cmsg_type */ cmsg->cmsg_type = linux_to_bsd_cmsg_type(linux_cmsg.cmsg_type); cmsg->cmsg_level = linux_to_bsd_sockopt_level(linux_cmsg.cmsg_level); if (cmsg->cmsg_type == -1 || cmsg->cmsg_level != SOL_SOCKET) goto bad; /* * Some applications (e.g. pulseaudio) attempt to * send ancillary data even if the underlying protocol * doesn't support it which is not allowed in the * FreeBSD system call interface. */ if (sa_family != AF_UNIX) continue; data = LINUX_CMSG_DATA(ptr_cmsg); datalen = linux_cmsg.cmsg_len - L_CMSG_HDRSZ; switch (cmsg->cmsg_type) { case SCM_RIGHTS: break; case SCM_CREDS: data = &cmcred; datalen = sizeof(cmcred); /* * The lower levels will fill in the structure */ bzero(data, datalen); break; } cmsg->cmsg_len = CMSG_LEN(datalen); error = ENOBUFS; if (!m_append(control, CMSG_HDRSZ, (c_caddr_t)cmsg)) goto bad; if (!m_append(control, datalen, (c_caddr_t)data)) goto bad; } while ((ptr_cmsg = LINUX_CMSG_NXTHDR(&linux_msg, ptr_cmsg))); if (m_length(control, NULL) == 0) { m_freem(control); control = NULL; } } msg.msg_iov = iov; msg.msg_flags = 0; error = linux_sendit(td, s, &msg, flags, control, UIO_USERSPACE); control = NULL; bad: m_freem(control); free(iov, M_IOV); if (cmsg) free(cmsg, M_LINUX); return (error); } int linux_sendmsg(struct thread *td, struct linux_sendmsg_args *args) { return (linux_sendmsg_common(td, args->s, PTRIN(args->msg), args->flags)); } int linux_sendmmsg(struct thread *td, struct linux_sendmmsg_args *args) { struct l_mmsghdr *msg; l_uint retval; int error, datagrams; if (args->vlen > UIO_MAXIOV) args->vlen = UIO_MAXIOV; msg = PTRIN(args->msg); datagrams = 0; while (datagrams < args->vlen) { error = linux_sendmsg_common(td, args->s, &msg->msg_hdr, args->flags); if (error != 0) break; retval = td->td_retval[0]; error = copyout(&retval, &msg->msg_len, sizeof(msg->msg_len)); if (error != 0) break; ++msg; ++datagrams; } if (error == 0) td->td_retval[0] = datagrams; return (error); } static int linux_recvmsg_common(struct thread *td, l_int s, struct l_msghdr *msghdr, l_uint flags, struct msghdr *msg) { struct cmsghdr *cm; struct cmsgcred *cmcred; struct l_cmsghdr *linux_cmsg = NULL; struct l_ucred linux_ucred; socklen_t datalen, outlen; struct l_msghdr linux_msg; struct iovec *iov, *uiov; struct mbuf *control = NULL; struct mbuf **controlp; struct timeval *ftmvl; l_timeval ltmvl; caddr_t outbuf; void *data; int error, i, fd, fds, *fdp; error = copyin(msghdr, &linux_msg, sizeof(linux_msg)); if (error != 0) return (error); error = linux_to_bsd_msghdr(msg, &linux_msg); if (error != 0) return (error); #ifdef COMPAT_LINUX32 error = linux32_copyiniov(PTRIN(msg->msg_iov), msg->msg_iovlen, &iov, EMSGSIZE); #else error = copyiniov(msg->msg_iov, msg->msg_iovlen, &iov, EMSGSIZE); #endif if (error != 0) return (error); if (msg->msg_name) { error = linux_to_bsd_sockaddr((struct sockaddr *)msg->msg_name, msg->msg_namelen); if (error != 0) goto bad; } uiov = msg->msg_iov; msg->msg_iov = iov; controlp = (msg->msg_control != NULL) ? &control : NULL; error = kern_recvit(td, s, msg, UIO_USERSPACE, controlp); msg->msg_iov = uiov; if (error != 0) goto bad; error = bsd_to_linux_msghdr(msg, &linux_msg); if (error != 0) goto bad; if (linux_msg.msg_name) { error = bsd_to_linux_sockaddr((struct sockaddr *) PTRIN(linux_msg.msg_name)); if (error != 0) goto bad; } if (linux_msg.msg_name && linux_msg.msg_namelen > 2) { error = linux_sa_put(PTRIN(linux_msg.msg_name)); if (error != 0) goto bad; } outbuf = PTRIN(linux_msg.msg_control); outlen = 0; if (control) { linux_cmsg = malloc(L_CMSG_HDRSZ, M_LINUX, M_WAITOK | M_ZERO); msg->msg_control = mtod(control, struct cmsghdr *); msg->msg_controllen = control->m_len; cm = CMSG_FIRSTHDR(msg); while (cm != NULL) { linux_cmsg->cmsg_type = bsd_to_linux_cmsg_type(cm->cmsg_type); linux_cmsg->cmsg_level = bsd_to_linux_sockopt_level(cm->cmsg_level); if (linux_cmsg->cmsg_type == -1 || cm->cmsg_level != SOL_SOCKET) { error = EINVAL; goto bad; } data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; switch (cm->cmsg_type) { case SCM_RIGHTS: if (flags & LINUX_MSG_CMSG_CLOEXEC) { fds = datalen / sizeof(int); fdp = data; for (i = 0; i < fds; i++) { fd = *fdp++; (void)kern_fcntl(td, fd, F_SETFD, FD_CLOEXEC); } } break; case SCM_CREDS: /* * Currently LOCAL_CREDS is never in * effect for Linux so no need to worry * about sockcred */ if (datalen != sizeof(*cmcred)) { error = EMSGSIZE; goto bad; } cmcred = (struct cmsgcred *)data; bzero(&linux_ucred, sizeof(linux_ucred)); linux_ucred.pid = cmcred->cmcred_pid; linux_ucred.uid = cmcred->cmcred_uid; linux_ucred.gid = cmcred->cmcred_gid; data = &linux_ucred; datalen = sizeof(linux_ucred); break; case SCM_TIMESTAMP: if (datalen != sizeof(struct timeval)) { error = EMSGSIZE; goto bad; } ftmvl = (struct timeval *)data; ltmvl.tv_sec = ftmvl->tv_sec; ltmvl.tv_usec = ftmvl->tv_usec; data = <mvl; datalen = sizeof(ltmvl); break; } if (outlen + LINUX_CMSG_LEN(datalen) > linux_msg.msg_controllen) { if (outlen == 0) { error = EMSGSIZE; goto bad; } else { linux_msg.msg_flags |= LINUX_MSG_CTRUNC; goto out; } } linux_cmsg->cmsg_len = LINUX_CMSG_LEN(datalen); error = copyout(linux_cmsg, outbuf, L_CMSG_HDRSZ); if (error != 0) goto bad; outbuf += L_CMSG_HDRSZ; error = copyout(data, outbuf, datalen); if (error != 0) goto bad; outbuf += LINUX_CMSG_ALIGN(datalen); outlen += LINUX_CMSG_LEN(datalen); cm = CMSG_NXTHDR(msg, cm); } } out: linux_msg.msg_controllen = outlen; error = copyout(&linux_msg, msghdr, sizeof(linux_msg)); bad: free(iov, M_IOV); m_freem(control); free(linux_cmsg, M_LINUX); return (error); } int linux_recvmsg(struct thread *td, struct linux_recvmsg_args *args) { struct msghdr bsd_msg; return (linux_recvmsg_common(td, args->s, PTRIN(args->msg), args->flags, &bsd_msg)); } int linux_recvmmsg(struct thread *td, struct linux_recvmmsg_args *args) { struct l_mmsghdr *msg; struct msghdr bsd_msg; struct l_timespec lts; struct timespec ts, tts; l_uint retval; int error, datagrams; if (args->timeout) { error = copyin(args->timeout, <s, sizeof(struct l_timespec)); if (error != 0) return (error); error = linux_to_native_timespec(&ts, <s); if (error != 0) return (error); getnanotime(&tts); - timespecadd(&tts, &ts); + timespecadd(&tts, &ts, &tts); } msg = PTRIN(args->msg); datagrams = 0; while (datagrams < args->vlen) { error = linux_recvmsg_common(td, args->s, &msg->msg_hdr, args->flags & ~LINUX_MSG_WAITFORONE, &bsd_msg); if (error != 0) break; retval = td->td_retval[0]; error = copyout(&retval, &msg->msg_len, sizeof(msg->msg_len)); if (error != 0) break; ++msg; ++datagrams; /* * MSG_WAITFORONE turns on MSG_DONTWAIT after one packet. */ if (args->flags & LINUX_MSG_WAITFORONE) args->flags |= LINUX_MSG_DONTWAIT; /* * See BUGS section of recvmmsg(2). */ if (args->timeout) { getnanotime(&ts); - timespecsub(&ts, &tts); + timespecsub(&ts, &tts, &ts); if (!timespecisset(&ts) || ts.tv_sec > 0) break; } /* Out of band data, return right away. */ if (bsd_msg.msg_flags & MSG_OOB) break; } if (error == 0) td->td_retval[0] = datagrams; return (error); } int linux_shutdown(struct thread *td, struct linux_shutdown_args *args) { return (kern_shutdown(td, args->s, args->how)); } int linux_setsockopt(struct thread *td, struct linux_setsockopt_args *args) { struct setsockopt_args /* { int s; int level; int name; caddr_t val; int valsize; } */ bsd_args; l_timeval linux_tv; struct timeval tv; int error, name; bsd_args.s = args->s; bsd_args.level = linux_to_bsd_sockopt_level(args->level); switch (bsd_args.level) { case SOL_SOCKET: name = linux_to_bsd_so_sockopt(args->optname); switch (name) { case SO_RCVTIMEO: /* FALLTHROUGH */ case SO_SNDTIMEO: error = copyin(PTRIN(args->optval), &linux_tv, sizeof(linux_tv)); if (error != 0) return (error); tv.tv_sec = linux_tv.tv_sec; tv.tv_usec = linux_tv.tv_usec; return (kern_setsockopt(td, args->s, bsd_args.level, name, &tv, UIO_SYSSPACE, sizeof(tv))); /* NOTREACHED */ default: break; } break; case IPPROTO_IP: name = linux_to_bsd_ip_sockopt(args->optname); break; case IPPROTO_IPV6: name = linux_to_bsd_ip6_sockopt(args->optname); break; case IPPROTO_TCP: name = linux_to_bsd_tcp_sockopt(args->optname); break; default: name = -1; break; } if (name == -1) return (ENOPROTOOPT); bsd_args.name = name; bsd_args.val = PTRIN(args->optval); bsd_args.valsize = args->optlen; if (name == IPV6_NEXTHOP) { linux_to_bsd_sockaddr((struct sockaddr *)bsd_args.val, bsd_args.valsize); error = sys_setsockopt(td, &bsd_args); bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.val); } else error = sys_setsockopt(td, &bsd_args); return (error); } int linux_getsockopt(struct thread *td, struct linux_getsockopt_args *args) { struct getsockopt_args /* { int s; int level; int name; caddr_t val; int *avalsize; } */ bsd_args; l_timeval linux_tv; struct timeval tv; socklen_t tv_len, xulen, len; struct xucred xu; struct l_ucred lxu; int error, name, newval; bsd_args.s = args->s; bsd_args.level = linux_to_bsd_sockopt_level(args->level); switch (bsd_args.level) { case SOL_SOCKET: name = linux_to_bsd_so_sockopt(args->optname); switch (name) { case SO_RCVTIMEO: /* FALLTHROUGH */ case SO_SNDTIMEO: tv_len = sizeof(tv); error = kern_getsockopt(td, args->s, bsd_args.level, name, &tv, UIO_SYSSPACE, &tv_len); if (error != 0) return (error); linux_tv.tv_sec = tv.tv_sec; linux_tv.tv_usec = tv.tv_usec; return (copyout(&linux_tv, PTRIN(args->optval), sizeof(linux_tv))); /* NOTREACHED */ case LOCAL_PEERCRED: if (args->optlen < sizeof(lxu)) return (EINVAL); xulen = sizeof(xu); error = kern_getsockopt(td, args->s, bsd_args.level, name, &xu, UIO_SYSSPACE, &xulen); if (error != 0) return (error); /* * XXX Use 0 for pid as the FreeBSD does not cache peer pid. */ lxu.pid = 0; lxu.uid = xu.cr_uid; lxu.gid = xu.cr_gid; return (copyout(&lxu, PTRIN(args->optval), sizeof(lxu))); /* NOTREACHED */ case SO_ERROR: len = sizeof(newval); error = kern_getsockopt(td, args->s, bsd_args.level, name, &newval, UIO_SYSSPACE, &len); if (error != 0) return (error); newval = -SV_ABI_ERRNO(td->td_proc, newval); return (copyout(&newval, PTRIN(args->optval), len)); /* NOTREACHED */ default: break; } break; case IPPROTO_IP: name = linux_to_bsd_ip_sockopt(args->optname); break; case IPPROTO_IPV6: name = linux_to_bsd_ip6_sockopt(args->optname); break; case IPPROTO_TCP: name = linux_to_bsd_tcp_sockopt(args->optname); break; default: name = -1; break; } if (name == -1) return (EINVAL); bsd_args.name = name; bsd_args.val = PTRIN(args->optval); bsd_args.avalsize = PTRIN(args->optlen); if (name == IPV6_NEXTHOP) { error = sys_getsockopt(td, &bsd_args); bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.val); } else error = sys_getsockopt(td, &bsd_args); return (error); } #if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32)) /* Argument list sizes for linux_socketcall */ static const unsigned char lxs_args_cnt[] = { 0 /* unused*/, 3 /* socket */, 3 /* bind */, 3 /* connect */, 2 /* listen */, 3 /* accept */, 3 /* getsockname */, 3 /* getpeername */, 4 /* socketpair */, 4 /* send */, 4 /* recv */, 6 /* sendto */, 6 /* recvfrom */, 2 /* shutdown */, 5 /* setsockopt */, 5 /* getsockopt */, 3 /* sendmsg */, 3 /* recvmsg */, 4 /* accept4 */, 5 /* recvmmsg */, 4 /* sendmmsg */ }; #define LINUX_ARGS_CNT (nitems(lxs_args_cnt) - 1) #define LINUX_ARG_SIZE(x) (lxs_args_cnt[x] * sizeof(l_ulong)) int linux_socketcall(struct thread *td, struct linux_socketcall_args *args) { l_ulong a[6]; #if defined(__amd64__) && defined(COMPAT_LINUX32) register_t l_args[6]; #endif void *arg; int error; if (args->what < LINUX_SOCKET || args->what > LINUX_ARGS_CNT) return (EINVAL); error = copyin(PTRIN(args->args), a, LINUX_ARG_SIZE(args->what)); if (error != 0) return (error); #if defined(__amd64__) && defined(COMPAT_LINUX32) for (int i = 0; i < lxs_args_cnt[args->what]; ++i) l_args[i] = a[i]; arg = l_args; #else arg = a; #endif switch (args->what) { case LINUX_SOCKET: return (linux_socket(td, arg)); case LINUX_BIND: return (linux_bind(td, arg)); case LINUX_CONNECT: return (linux_connect(td, arg)); case LINUX_LISTEN: return (linux_listen(td, arg)); case LINUX_ACCEPT: return (linux_accept(td, arg)); case LINUX_GETSOCKNAME: return (linux_getsockname(td, arg)); case LINUX_GETPEERNAME: return (linux_getpeername(td, arg)); case LINUX_SOCKETPAIR: return (linux_socketpair(td, arg)); case LINUX_SEND: return (linux_send(td, arg)); case LINUX_RECV: return (linux_recv(td, arg)); case LINUX_SENDTO: return (linux_sendto(td, arg)); case LINUX_RECVFROM: return (linux_recvfrom(td, arg)); case LINUX_SHUTDOWN: return (linux_shutdown(td, arg)); case LINUX_SETSOCKOPT: return (linux_setsockopt(td, arg)); case LINUX_GETSOCKOPT: return (linux_getsockopt(td, arg)); case LINUX_SENDMSG: return (linux_sendmsg(td, arg)); case LINUX_RECVMSG: return (linux_recvmsg(td, arg)); case LINUX_ACCEPT4: return (linux_accept4(td, arg)); case LINUX_RECVMMSG: return (linux_recvmmsg(td, arg)); case LINUX_SENDMMSG: return (linux_sendmmsg(td, arg)); } uprintf("LINUX: 'socket' typ=%d not implemented\n", args->what); return (ENOSYS); } #endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */ Index: head/sys/compat/linuxkpi/common/include/linux/time.h =================================================================== --- head/sys/compat/linuxkpi/common/include/linux/time.h (revision 336913) +++ head/sys/compat/linuxkpi/common/include/linux/time.h (revision 336914) @@ -1,137 +1,135 @@ /*- * Copyright (c) 2014-2015 François Tigeot * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _LINUX_TIME_H_ #define _LINUX_TIME_H_ #define NSEC_PER_USEC 1000L #define NSEC_PER_MSEC 1000000L #define NSEC_PER_SEC 1000000000L #define USEC_PER_MSEC 1000L #define USEC_PER_SEC 1000000L #define timespec64 timespec #include #include static inline struct timeval ns_to_timeval(const int64_t nsec) { struct timeval tv; long rem; if (nsec == 0) { tv.tv_sec = 0; tv.tv_usec = 0; return (tv); } tv.tv_sec = nsec / NSEC_PER_SEC; rem = nsec % NSEC_PER_SEC; if (rem < 0) { tv.tv_sec--; rem += NSEC_PER_SEC; } tv.tv_usec = rem / 1000; return (tv); } static inline int64_t timeval_to_ns(const struct timeval *tv) { return ((int64_t)tv->tv_sec * NSEC_PER_SEC) + tv->tv_usec * NSEC_PER_USEC; } #define getrawmonotonic(ts) nanouptime(ts) static inline struct timespec timespec_sub(struct timespec lhs, struct timespec rhs) { struct timespec ts; - ts.tv_sec = lhs.tv_sec; - ts.tv_nsec = lhs.tv_nsec; - timespecsub(&ts, &rhs); + timespecsub(&lhs, &rhs, &ts); return ts; } static inline void set_normalized_timespec(struct timespec *ts, time_t sec, int64_t nsec) { /* XXX: this doesn't actually normalize anything */ ts->tv_sec = sec; ts->tv_nsec = nsec; } static inline int64_t timespec_to_ns(const struct timespec *ts) { return ((ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec); } static inline struct timespec ns_to_timespec(const int64_t nsec) { struct timespec ts; int32_t rem; if (nsec == 0) { ts.tv_sec = 0; ts.tv_nsec = 0; return (ts); } ts.tv_sec = nsec / NSEC_PER_SEC; rem = nsec % NSEC_PER_SEC; if (rem < 0) { ts.tv_sec--; rem += NSEC_PER_SEC; } ts.tv_nsec = rem; return (ts); } static inline int timespec_valid(const struct timespec *ts) { if (ts->tv_sec < 0 || ts->tv_sec > 100000000 || ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000) return (0); return (1); } static inline unsigned long get_seconds(void) { return time_uptime; } #endif /* _LINUX_TIME_H_ */ Index: head/sys/dev/acpica/acpi_cmbat.c =================================================================== --- head/sys/dev/acpica/acpi_cmbat.c (revision 336913) +++ head/sys/dev/acpica/acpi_cmbat.c (revision 336914) @@ -1,507 +1,507 @@ /*- * Copyright (c) 2005 Nate Lawson * Copyright (c) 2000 Munehiro Matsuda * Copyright (c) 2000 Takanori Watanabe * Copyright (c) 2000 Mitsuru IWASAKI * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_acpi.h" #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_ACPICMBAT, "acpicmbat", "ACPI control method battery data"); /* Number of times to retry initialization before giving up. */ #define ACPI_CMBAT_RETRY_MAX 6 /* Check the battery once a minute. */ #define CMBAT_POLLRATE (60 * hz) /* Hooks for the ACPI CA debugging infrastructure */ #define _COMPONENT ACPI_BATTERY ACPI_MODULE_NAME("BATTERY") #define ACPI_BATTERY_BST_CHANGE 0x80 #define ACPI_BATTERY_BIF_CHANGE 0x81 struct acpi_cmbat_softc { device_t dev; int flags; struct acpi_bif bif; struct acpi_bst bst; struct timespec bst_lastupdated; }; ACPI_SERIAL_DECL(cmbat, "ACPI cmbat"); static int acpi_cmbat_probe(device_t dev); static int acpi_cmbat_attach(device_t dev); static int acpi_cmbat_detach(device_t dev); static int acpi_cmbat_resume(device_t dev); static void acpi_cmbat_notify_handler(ACPI_HANDLE h, UINT32 notify, void *context); static int acpi_cmbat_info_expired(struct timespec *lastupdated); static void acpi_cmbat_info_updated(struct timespec *lastupdated); static void acpi_cmbat_get_bst(void *arg); static void acpi_cmbat_get_bif_task(void *arg); static void acpi_cmbat_get_bif(void *arg); static int acpi_cmbat_bst(device_t dev, struct acpi_bst *bstp); static int acpi_cmbat_bif(device_t dev, struct acpi_bif *bifp); static void acpi_cmbat_init_battery(void *arg); static device_method_t acpi_cmbat_methods[] = { /* Device interface */ DEVMETHOD(device_probe, acpi_cmbat_probe), DEVMETHOD(device_attach, acpi_cmbat_attach), DEVMETHOD(device_detach, acpi_cmbat_detach), DEVMETHOD(device_resume, acpi_cmbat_resume), /* ACPI battery interface */ DEVMETHOD(acpi_batt_get_info, acpi_cmbat_bif), DEVMETHOD(acpi_batt_get_status, acpi_cmbat_bst), DEVMETHOD_END }; static driver_t acpi_cmbat_driver = { "battery", acpi_cmbat_methods, sizeof(struct acpi_cmbat_softc), }; static devclass_t acpi_cmbat_devclass; DRIVER_MODULE(acpi_cmbat, acpi, acpi_cmbat_driver, acpi_cmbat_devclass, 0, 0); MODULE_DEPEND(acpi_cmbat, acpi, 1, 1, 1); static int acpi_cmbat_probe(device_t dev) { static char *cmbat_ids[] = { "PNP0C0A", NULL }; if (acpi_disabled("cmbat") || ACPI_ID_PROBE(device_get_parent(dev), dev, cmbat_ids) == NULL) return (ENXIO); device_set_desc(dev, "ACPI Control Method Battery"); return (0); } static int acpi_cmbat_attach(device_t dev) { int error; ACPI_HANDLE handle; struct acpi_cmbat_softc *sc; sc = device_get_softc(dev); handle = acpi_get_handle(dev); sc->dev = dev; timespecclear(&sc->bst_lastupdated); error = acpi_battery_register(dev); if (error != 0) { device_printf(dev, "registering battery failed\n"); return (error); } /* * Install a system notify handler in addition to the device notify. * Toshiba notebook uses this alternate notify for its battery. */ AcpiInstallNotifyHandler(handle, ACPI_ALL_NOTIFY, acpi_cmbat_notify_handler, dev); AcpiOsExecute(OSL_NOTIFY_HANDLER, acpi_cmbat_init_battery, dev); return (0); } static int acpi_cmbat_detach(device_t dev) { ACPI_HANDLE handle; handle = acpi_get_handle(dev); AcpiRemoveNotifyHandler(handle, ACPI_ALL_NOTIFY, acpi_cmbat_notify_handler); acpi_battery_remove(dev); /* * Force any pending notification handler calls to complete by * requesting cmbat serialisation while freeing and clearing the * softc pointer: */ ACPI_SERIAL_BEGIN(cmbat); device_set_softc(dev, NULL); ACPI_SERIAL_END(cmbat); return (0); } static int acpi_cmbat_resume(device_t dev) { AcpiOsExecute(OSL_NOTIFY_HANDLER, acpi_cmbat_init_battery, dev); return (0); } static void acpi_cmbat_notify_handler(ACPI_HANDLE h, UINT32 notify, void *context) { struct acpi_cmbat_softc *sc; device_t dev; dev = (device_t)context; sc = device_get_softc(dev); switch (notify) { case ACPI_NOTIFY_DEVICE_CHECK: case ACPI_BATTERY_BST_CHANGE: /* * Clear the last updated time. The next call to retrieve the * battery status will get the new value for us. */ timespecclear(&sc->bst_lastupdated); break; case ACPI_NOTIFY_BUS_CHECK: case ACPI_BATTERY_BIF_CHANGE: /* * Queue a callback to get the current battery info from thread * context. It's not safe to block in a notify handler. */ AcpiOsExecute(OSL_NOTIFY_HANDLER, acpi_cmbat_get_bif_task, dev); break; } acpi_UserNotify("CMBAT", h, notify); } static int acpi_cmbat_info_expired(struct timespec *lastupdated) { struct timespec curtime; ACPI_SERIAL_ASSERT(cmbat); if (lastupdated == NULL) return (TRUE); if (!timespecisset(lastupdated)) return (TRUE); getnanotime(&curtime); - timespecsub(&curtime, lastupdated); + timespecsub(&curtime, lastupdated, &curtime); return (curtime.tv_sec < 0 || curtime.tv_sec > acpi_battery_get_info_expire()); } static void acpi_cmbat_info_updated(struct timespec *lastupdated) { ACPI_SERIAL_ASSERT(cmbat); if (lastupdated != NULL) getnanotime(lastupdated); } static void acpi_cmbat_get_bst(void *arg) { struct acpi_cmbat_softc *sc; ACPI_STATUS as; ACPI_OBJECT *res; ACPI_HANDLE h; ACPI_BUFFER bst_buffer; device_t dev; ACPI_SERIAL_ASSERT(cmbat); dev = arg; sc = device_get_softc(dev); h = acpi_get_handle(dev); bst_buffer.Pointer = NULL; bst_buffer.Length = ACPI_ALLOCATE_BUFFER; if (!acpi_cmbat_info_expired(&sc->bst_lastupdated)) goto end; as = AcpiEvaluateObject(h, "_BST", NULL, &bst_buffer); if (ACPI_FAILURE(as)) { ACPI_VPRINT(dev, acpi_device_get_parent_softc(dev), "error fetching current battery status -- %s\n", AcpiFormatException(as)); goto end; } res = (ACPI_OBJECT *)bst_buffer.Pointer; if (!ACPI_PKG_VALID(res, 4)) { ACPI_VPRINT(dev, acpi_device_get_parent_softc(dev), "battery status corrupted\n"); goto end; } if (acpi_PkgInt32(res, 0, &sc->bst.state) != 0) goto end; if (acpi_PkgInt32(res, 1, &sc->bst.rate) != 0) goto end; if (acpi_PkgInt32(res, 2, &sc->bst.cap) != 0) goto end; if (acpi_PkgInt32(res, 3, &sc->bst.volt) != 0) goto end; acpi_cmbat_info_updated(&sc->bst_lastupdated); /* Clear out undefined/extended bits that might be set by hardware. */ sc->bst.state &= ACPI_BATT_STAT_BST_MASK; if ((sc->bst.state & ACPI_BATT_STAT_INVALID) == ACPI_BATT_STAT_INVALID) ACPI_VPRINT(dev, acpi_device_get_parent_softc(dev), "battery reports simultaneous charging and discharging\n"); /* XXX If all batteries are critical, perhaps we should suspend. */ if (sc->bst.state & ACPI_BATT_STAT_CRITICAL) { if ((sc->flags & ACPI_BATT_STAT_CRITICAL) == 0) { sc->flags |= ACPI_BATT_STAT_CRITICAL; device_printf(dev, "critically low charge!\n"); } } else sc->flags &= ~ACPI_BATT_STAT_CRITICAL; end: if (bst_buffer.Pointer != NULL) AcpiOsFree(bst_buffer.Pointer); } /* XXX There should be a cleaner way to do this locking. */ static void acpi_cmbat_get_bif_task(void *arg) { ACPI_SERIAL_BEGIN(cmbat); acpi_cmbat_get_bif(arg); ACPI_SERIAL_END(cmbat); } static void acpi_cmbat_get_bif(void *arg) { struct acpi_cmbat_softc *sc; ACPI_STATUS as; ACPI_OBJECT *res; ACPI_HANDLE h; ACPI_BUFFER bif_buffer; device_t dev; ACPI_SERIAL_ASSERT(cmbat); dev = arg; sc = device_get_softc(dev); h = acpi_get_handle(dev); bif_buffer.Pointer = NULL; bif_buffer.Length = ACPI_ALLOCATE_BUFFER; as = AcpiEvaluateObject(h, "_BIF", NULL, &bif_buffer); if (ACPI_FAILURE(as)) { ACPI_VPRINT(dev, acpi_device_get_parent_softc(dev), "error fetching current battery info -- %s\n", AcpiFormatException(as)); goto end; } res = (ACPI_OBJECT *)bif_buffer.Pointer; if (!ACPI_PKG_VALID(res, 13)) { ACPI_VPRINT(dev, acpi_device_get_parent_softc(dev), "battery info corrupted\n"); goto end; } if (acpi_PkgInt32(res, 0, &sc->bif.units) != 0) goto end; if (acpi_PkgInt32(res, 1, &sc->bif.dcap) != 0) goto end; if (acpi_PkgInt32(res, 2, &sc->bif.lfcap) != 0) goto end; if (acpi_PkgInt32(res, 3, &sc->bif.btech) != 0) goto end; if (acpi_PkgInt32(res, 4, &sc->bif.dvol) != 0) goto end; if (acpi_PkgInt32(res, 5, &sc->bif.wcap) != 0) goto end; if (acpi_PkgInt32(res, 6, &sc->bif.lcap) != 0) goto end; if (acpi_PkgInt32(res, 7, &sc->bif.gra1) != 0) goto end; if (acpi_PkgInt32(res, 8, &sc->bif.gra2) != 0) goto end; if (acpi_PkgStr(res, 9, sc->bif.model, ACPI_CMBAT_MAXSTRLEN) != 0) goto end; if (acpi_PkgStr(res, 10, sc->bif.serial, ACPI_CMBAT_MAXSTRLEN) != 0) goto end; if (acpi_PkgStr(res, 11, sc->bif.type, ACPI_CMBAT_MAXSTRLEN) != 0) goto end; if (acpi_PkgStr(res, 12, sc->bif.oeminfo, ACPI_CMBAT_MAXSTRLEN) != 0) goto end; end: if (bif_buffer.Pointer != NULL) AcpiOsFree(bif_buffer.Pointer); } static int acpi_cmbat_bif(device_t dev, struct acpi_bif *bifp) { struct acpi_cmbat_softc *sc; sc = device_get_softc(dev); /* * Just copy the data. The only value that should change is the * last-full capacity, so we only update when we get a notify that says * the info has changed. Many systems apparently take a long time to * process a _BIF call so we avoid it if possible. */ ACPI_SERIAL_BEGIN(cmbat); bifp->units = sc->bif.units; bifp->dcap = sc->bif.dcap; bifp->lfcap = sc->bif.lfcap; bifp->btech = sc->bif.btech; bifp->dvol = sc->bif.dvol; bifp->wcap = sc->bif.wcap; bifp->lcap = sc->bif.lcap; bifp->gra1 = sc->bif.gra1; bifp->gra2 = sc->bif.gra2; strncpy(bifp->model, sc->bif.model, sizeof(sc->bif.model)); strncpy(bifp->serial, sc->bif.serial, sizeof(sc->bif.serial)); strncpy(bifp->type, sc->bif.type, sizeof(sc->bif.type)); strncpy(bifp->oeminfo, sc->bif.oeminfo, sizeof(sc->bif.oeminfo)); ACPI_SERIAL_END(cmbat); return (0); } static int acpi_cmbat_bst(device_t dev, struct acpi_bst *bstp) { struct acpi_cmbat_softc *sc; sc = device_get_softc(dev); ACPI_SERIAL_BEGIN(cmbat); if (acpi_BatteryIsPresent(dev)) { acpi_cmbat_get_bst(dev); bstp->state = sc->bst.state; bstp->rate = sc->bst.rate; bstp->cap = sc->bst.cap; bstp->volt = sc->bst.volt; } else bstp->state = ACPI_BATT_STAT_NOT_PRESENT; ACPI_SERIAL_END(cmbat); return (0); } static void acpi_cmbat_init_battery(void *arg) { struct acpi_cmbat_softc *sc; int retry, valid; device_t dev; dev = (device_t)arg; ACPI_VPRINT(dev, acpi_device_get_parent_softc(dev), "battery initialization start\n"); /* * Try repeatedly to get valid data from the battery. Since the * embedded controller isn't always ready just after boot, we may have * to wait a while. */ for (retry = 0; retry < ACPI_CMBAT_RETRY_MAX; retry++, AcpiOsSleep(10000)) { /* * Batteries on DOCK can be ejected w/ DOCK during retrying. * * If there is a valid softc pointer the device may be in * attaching, attached or detaching state. If the state is * different from attached retry getting the device state * until it becomes stable. This solves a race if the ACPI * notification handler is called during attach, because * device_is_attached() doesn't return non-zero until after * the attach code has been executed. */ ACPI_SERIAL_BEGIN(cmbat); sc = device_get_softc(dev); if (sc == NULL) { ACPI_SERIAL_END(cmbat); return; } if (!acpi_BatteryIsPresent(dev) || !device_is_attached(dev)) { ACPI_SERIAL_END(cmbat); continue; } /* * Only query the battery if this is the first try or the specific * type of info is still invalid. */ if (retry == 0 || !acpi_battery_bst_valid(&sc->bst)) { timespecclear(&sc->bst_lastupdated); acpi_cmbat_get_bst(dev); } if (retry == 0 || !acpi_battery_bif_valid(&sc->bif)) acpi_cmbat_get_bif(dev); valid = acpi_battery_bst_valid(&sc->bst) && acpi_battery_bif_valid(&sc->bif); ACPI_SERIAL_END(cmbat); if (valid) break; } if (retry == ACPI_CMBAT_RETRY_MAX) { ACPI_VPRINT(dev, acpi_device_get_parent_softc(dev), "battery initialization failed, giving up\n"); } else { ACPI_VPRINT(dev, acpi_device_get_parent_softc(dev), "battery initialization done, tried %d times\n", retry + 1); } } Index: head/sys/dev/acpica/acpi_smbat.c =================================================================== --- head/sys/dev/acpica/acpi_smbat.c (revision 336913) +++ head/sys/dev/acpica/acpi_smbat.c (revision 336914) @@ -1,490 +1,490 @@ /*- * Copyright (c) 2005 Hans Petter Selasky * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_acpi.h" #include #include #include #include #include #include #include #include /* Transactions have failed after 500 ms. */ #define SMBUS_TIMEOUT 50 struct acpi_smbat_softc { uint8_t sb_base_addr; device_t ec_dev; struct acpi_bif bif; struct acpi_bst bst; struct timespec bif_lastupdated; struct timespec bst_lastupdated; }; static int acpi_smbat_probe(device_t dev); static int acpi_smbat_attach(device_t dev); static int acpi_smbat_shutdown(device_t dev); static int acpi_smbat_info_expired(struct timespec *lastupdated); static void acpi_smbat_info_updated(struct timespec *lastupdated); static int acpi_smbat_get_bif(device_t dev, struct acpi_bif *bif); static int acpi_smbat_get_bst(device_t dev, struct acpi_bst *bst); ACPI_SERIAL_DECL(smbat, "ACPI Smart Battery"); static SYSCTL_NODE(_debug_acpi, OID_AUTO, batt, CTLFLAG_RD, NULL, "Battery debugging"); /* On some laptops with smart batteries, enabling battery monitoring * software causes keystrokes from atkbd to be lost. This has also been * reported on Linux, and is apparently due to the keyboard and I2C line * for the battery being routed through the same chip. Whether that's * accurate or not, adding extra sleeps to the status checking code * causes the problem to go away. * * If you experience that problem, try a value of 10ms and move up * from there. */ static int batt_sleep_ms; SYSCTL_INT(_debug_acpi_batt, OID_AUTO, batt_sleep_ms, CTLFLAG_RW, &batt_sleep_ms, 0, "Sleep during battery status updates to prevent keystroke loss."); static device_method_t acpi_smbat_methods[] = { /* device interface */ DEVMETHOD(device_probe, acpi_smbat_probe), DEVMETHOD(device_attach, acpi_smbat_attach), DEVMETHOD(device_shutdown, acpi_smbat_shutdown), /* ACPI battery interface */ DEVMETHOD(acpi_batt_get_status, acpi_smbat_get_bst), DEVMETHOD(acpi_batt_get_info, acpi_smbat_get_bif), DEVMETHOD_END }; static driver_t acpi_smbat_driver = { "battery", acpi_smbat_methods, sizeof(struct acpi_smbat_softc), }; static devclass_t acpi_smbat_devclass; DRIVER_MODULE(acpi_smbat, acpi, acpi_smbat_driver, acpi_smbat_devclass, 0, 0); MODULE_DEPEND(acpi_smbat, acpi, 1, 1, 1); static int acpi_smbat_probe(device_t dev) { static char *smbat_ids[] = {"ACPI0001", "ACPI0005", NULL}; ACPI_STATUS status; if (acpi_disabled("smbat") || ACPI_ID_PROBE(device_get_parent(dev), dev, smbat_ids) == NULL) return (ENXIO); status = AcpiEvaluateObject(acpi_get_handle(dev), "_EC", NULL, NULL); if (ACPI_FAILURE(status)) return (ENXIO); device_set_desc(dev, "ACPI Smart Battery"); return (0); } static int acpi_smbat_attach(device_t dev) { struct acpi_smbat_softc *sc; uint32_t base; sc = device_get_softc(dev); if (ACPI_FAILURE(acpi_GetInteger(acpi_get_handle(dev), "_EC", &base))) { device_printf(dev, "cannot get EC base address\n"); return (ENXIO); } sc->sb_base_addr = (base >> 8) & 0xff; /* XXX Only works with one EC, but nearly all systems only have one. */ sc->ec_dev = devclass_get_device(devclass_find("acpi_ec"), 0); if (sc->ec_dev == NULL) { device_printf(dev, "cannot find EC device\n"); return (ENXIO); } timespecclear(&sc->bif_lastupdated); timespecclear(&sc->bst_lastupdated); if (acpi_battery_register(dev) != 0) { device_printf(dev, "cannot register battery\n"); return (ENXIO); } return (0); } static int acpi_smbat_shutdown(device_t dev) { acpi_battery_remove(dev); return (0); } static int acpi_smbat_info_expired(struct timespec *lastupdated) { struct timespec curtime; ACPI_SERIAL_ASSERT(smbat); if (lastupdated == NULL) return (TRUE); if (!timespecisset(lastupdated)) return (TRUE); getnanotime(&curtime); - timespecsub(&curtime, lastupdated); + timespecsub(&curtime, lastupdated, &curtime); return (curtime.tv_sec < 0 || curtime.tv_sec > acpi_battery_get_info_expire()); } static void acpi_smbat_info_updated(struct timespec *lastupdated) { ACPI_SERIAL_ASSERT(smbat); if (lastupdated != NULL) getnanotime(lastupdated); } static int acpi_smbus_read_2(struct acpi_smbat_softc *sc, uint8_t addr, uint8_t cmd, uint16_t *ptr) { int error, to; UINT64 val; ACPI_SERIAL_ASSERT(smbat); if (batt_sleep_ms) AcpiOsSleep(batt_sleep_ms); val = addr; error = ACPI_EC_WRITE(sc->ec_dev, sc->sb_base_addr + SMBUS_ADDR, val, 1); if (error) goto out; val = cmd; error = ACPI_EC_WRITE(sc->ec_dev, sc->sb_base_addr + SMBUS_CMD, val, 1); if (error) goto out; val = 0x09; /* | 0x80 if PEC */ error = ACPI_EC_WRITE(sc->ec_dev, sc->sb_base_addr + SMBUS_PRTCL, val, 1); if (error) goto out; if (batt_sleep_ms) AcpiOsSleep(batt_sleep_ms); for (to = SMBUS_TIMEOUT; to != 0; to--) { error = ACPI_EC_READ(sc->ec_dev, sc->sb_base_addr + SMBUS_PRTCL, &val, 1); if (error) goto out; if (val == 0) break; AcpiOsSleep(10); } if (to == 0) { error = ETIMEDOUT; goto out; } error = ACPI_EC_READ(sc->ec_dev, sc->sb_base_addr + SMBUS_STS, &val, 1); if (error) goto out; if (val & SMBUS_STS_MASK) { printf("%s: AE_ERROR 0x%x\n", __FUNCTION__, (int)(val & SMBUS_STS_MASK)); error = EIO; goto out; } error = ACPI_EC_READ(sc->ec_dev, sc->sb_base_addr + SMBUS_DATA, &val, 2); if (error) goto out; *ptr = val; out: return (error); } static int acpi_smbus_read_multi_1(struct acpi_smbat_softc *sc, uint8_t addr, uint8_t cmd, uint8_t *ptr, uint16_t len) { UINT64 val; uint8_t to; int error; ACPI_SERIAL_ASSERT(smbat); if (batt_sleep_ms) AcpiOsSleep(batt_sleep_ms); val = addr; error = ACPI_EC_WRITE(sc->ec_dev, sc->sb_base_addr + SMBUS_ADDR, val, 1); if (error) goto out; val = cmd; error = ACPI_EC_WRITE(sc->ec_dev, sc->sb_base_addr + SMBUS_CMD, val, 1); if (error) goto out; val = 0x0B /* | 0x80 if PEC */ ; error = ACPI_EC_WRITE(sc->ec_dev, sc->sb_base_addr + SMBUS_PRTCL, val, 1); if (error) goto out; if (batt_sleep_ms) AcpiOsSleep(batt_sleep_ms); for (to = SMBUS_TIMEOUT; to != 0; to--) { error = ACPI_EC_READ(sc->ec_dev, sc->sb_base_addr + SMBUS_PRTCL, &val, 1); if (error) goto out; if (val == 0) break; AcpiOsSleep(10); } if (to == 0) { error = ETIMEDOUT; goto out; } error = ACPI_EC_READ(sc->ec_dev, sc->sb_base_addr + SMBUS_STS, &val, 1); if (error) goto out; if (val & SMBUS_STS_MASK) { printf("%s: AE_ERROR 0x%x\n", __FUNCTION__, (int)(val & SMBUS_STS_MASK)); error = EIO; goto out; } /* get length */ error = ACPI_EC_READ(sc->ec_dev, sc->sb_base_addr + SMBUS_BCNT, &val, 1); if (error) goto out; val = (val & 0x1f) + 1; bzero(ptr, len); if (len > val) len = val; if (batt_sleep_ms) AcpiOsSleep(batt_sleep_ms); while (len--) { error = ACPI_EC_READ(sc->ec_dev, sc->sb_base_addr + SMBUS_DATA + len, &val, 1); if (error) goto out; ptr[len] = val; if (batt_sleep_ms) AcpiOsSleep(batt_sleep_ms); } out: return (error); } static int acpi_smbat_get_bst(device_t dev, struct acpi_bst *bst) { struct acpi_smbat_softc *sc; int error; uint32_t factor; int16_t val; uint8_t addr; ACPI_SERIAL_BEGIN(smbat); addr = SMBATT_ADDRESS; error = ENXIO; sc = device_get_softc(dev); if (!acpi_smbat_info_expired(&sc->bst_lastupdated)) { error = 0; goto out; } if (acpi_smbus_read_2(sc, addr, SMBATT_CMD_BATTERY_MODE, &val)) goto out; if (val & SMBATT_BM_CAPACITY_MODE) factor = 10; else factor = 1; /* get battery status */ if (acpi_smbus_read_2(sc, addr, SMBATT_CMD_BATTERY_STATUS, &val)) goto out; sc->bst.state = 0; if (val & SMBATT_BS_DISCHARGING) sc->bst.state |= ACPI_BATT_STAT_DISCHARG; if (val & SMBATT_BS_REMAINING_CAPACITY_ALARM) sc->bst.state |= ACPI_BATT_STAT_CRITICAL; /* * If the rate is negative, it is discharging. Otherwise, * it is charging. */ if (acpi_smbus_read_2(sc, addr, SMBATT_CMD_CURRENT, &val)) goto out; if (val > 0) { sc->bst.rate = val * factor; sc->bst.state &= ~SMBATT_BS_DISCHARGING; sc->bst.state |= ACPI_BATT_STAT_CHARGING; } else if (val < 0) sc->bst.rate = (-val) * factor; else sc->bst.rate = 0; if (acpi_smbus_read_2(sc, addr, SMBATT_CMD_REMAINING_CAPACITY, &val)) goto out; sc->bst.cap = val * factor; if (acpi_smbus_read_2(sc, addr, SMBATT_CMD_VOLTAGE, &val)) goto out; sc->bst.volt = val; acpi_smbat_info_updated(&sc->bst_lastupdated); error = 0; out: if (error == 0) memcpy(bst, &sc->bst, sizeof(sc->bst)); ACPI_SERIAL_END(smbat); return (error); } static int acpi_smbat_get_bif(device_t dev, struct acpi_bif *bif) { struct acpi_smbat_softc *sc; int error; uint32_t factor; uint16_t val; uint8_t addr; ACPI_SERIAL_BEGIN(smbat); addr = SMBATT_ADDRESS; error = ENXIO; sc = device_get_softc(dev); if (!acpi_smbat_info_expired(&sc->bif_lastupdated)) { error = 0; goto out; } if (acpi_smbus_read_2(sc, addr, SMBATT_CMD_BATTERY_MODE, &val)) goto out; if (val & SMBATT_BM_CAPACITY_MODE) { factor = 10; sc->bif.units = ACPI_BIF_UNITS_MW; } else { factor = 1; sc->bif.units = ACPI_BIF_UNITS_MA; } if (acpi_smbus_read_2(sc, addr, SMBATT_CMD_DESIGN_CAPACITY, &val)) goto out; sc->bif.dcap = val * factor; if (acpi_smbus_read_2(sc, addr, SMBATT_CMD_FULL_CHARGE_CAPACITY, &val)) goto out; sc->bif.lfcap = val * factor; sc->bif.btech = 1; /* secondary (rechargeable) */ if (acpi_smbus_read_2(sc, addr, SMBATT_CMD_DESIGN_VOLTAGE, &val)) goto out; sc->bif.dvol = val; sc->bif.wcap = sc->bif.dcap / 10; sc->bif.lcap = sc->bif.dcap / 10; sc->bif.gra1 = factor; /* not supported */ sc->bif.gra2 = factor; /* not supported */ if (acpi_smbus_read_multi_1(sc, addr, SMBATT_CMD_DEVICE_NAME, sc->bif.model, sizeof(sc->bif.model))) goto out; if (acpi_smbus_read_2(sc, addr, SMBATT_CMD_SERIAL_NUMBER, &val)) goto out; snprintf(sc->bif.serial, sizeof(sc->bif.serial), "0x%04x", val); if (acpi_smbus_read_multi_1(sc, addr, SMBATT_CMD_DEVICE_CHEMISTRY, sc->bif.type, sizeof(sc->bif.type))) goto out; if (acpi_smbus_read_multi_1(sc, addr, SMBATT_CMD_MANUFACTURER_DATA, sc->bif.oeminfo, sizeof(sc->bif.oeminfo))) goto out; /* XXX check if device was replugged during read? */ acpi_smbat_info_updated(&sc->bif_lastupdated); error = 0; out: if (error == 0) memcpy(bif, &sc->bif, sizeof(sc->bif)); ACPI_SERIAL_END(smbat); return (error); } Index: head/sys/dev/acpica/acpi_thermal.c =================================================================== --- head/sys/dev/acpica/acpi_thermal.c (revision 336913) +++ head/sys/dev/acpica/acpi_thermal.c (revision 336914) @@ -1,1225 +1,1225 @@ /*- * Copyright (c) 2000, 2001 Michael Smith * Copyright (c) 2000 BSDi * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_acpi.h" #include #include #include #include #include #include #include #include #include #include #include #include #include "cpufreq_if.h" #include #include #include /* Hooks for the ACPI CA debugging infrastructure */ #define _COMPONENT ACPI_THERMAL ACPI_MODULE_NAME("THERMAL") #define TZ_ZEROC 2731 #define TZ_KELVTOC(x) (((x) - TZ_ZEROC) / 10), abs(((x) - TZ_ZEROC) % 10) #define TZ_NOTIFY_TEMPERATURE 0x80 /* Temperature changed. */ #define TZ_NOTIFY_LEVELS 0x81 /* Cooling levels changed. */ #define TZ_NOTIFY_DEVICES 0x82 /* Device lists changed. */ #define TZ_NOTIFY_CRITICAL 0xcc /* Fake notify that _CRT/_HOT reached. */ /* Check for temperature changes every 10 seconds by default */ #define TZ_POLLRATE 10 /* Make sure the reported temperature is valid for this number of polls. */ #define TZ_VALIDCHECKS 3 /* Notify the user we will be shutting down in one more poll cycle. */ #define TZ_NOTIFYCOUNT (TZ_VALIDCHECKS - 1) /* ACPI spec defines this */ #define TZ_NUMLEVELS 10 struct acpi_tz_zone { int ac[TZ_NUMLEVELS]; ACPI_BUFFER al[TZ_NUMLEVELS]; int crt; int hot; ACPI_BUFFER psl; int psv; int tc1; int tc2; int tsp; int tzp; }; struct acpi_tz_softc { device_t tz_dev; ACPI_HANDLE tz_handle; /*Thermal zone handle*/ int tz_temperature; /*Current temperature*/ int tz_active; /*Current active cooling*/ #define TZ_ACTIVE_NONE -1 #define TZ_ACTIVE_UNKNOWN -2 int tz_requested; /*Minimum active cooling*/ int tz_thflags; /*Current temp-related flags*/ #define TZ_THFLAG_NONE 0 #define TZ_THFLAG_PSV (1<<0) #define TZ_THFLAG_HOT (1<<2) #define TZ_THFLAG_CRT (1<<3) int tz_flags; #define TZ_FLAG_NO_SCP (1<<0) /*No _SCP method*/ #define TZ_FLAG_GETPROFILE (1<<1) /*Get power_profile in timeout*/ #define TZ_FLAG_GETSETTINGS (1<<2) /*Get devs/setpoints*/ struct timespec tz_cooling_started; /*Current cooling starting time*/ struct sysctl_ctx_list tz_sysctl_ctx; struct sysctl_oid *tz_sysctl_tree; eventhandler_tag tz_event; struct acpi_tz_zone tz_zone; /*Thermal zone parameters*/ int tz_validchecks; int tz_insane_tmp_notified; /* passive cooling */ struct proc *tz_cooling_proc; int tz_cooling_proc_running; int tz_cooling_enabled; int tz_cooling_active; int tz_cooling_updated; int tz_cooling_saved_freq; }; #define TZ_ACTIVE_LEVEL(act) ((act) >= 0 ? (act) : TZ_NUMLEVELS) #define CPUFREQ_MAX_LEVELS 64 /* XXX cpufreq should export this */ static int acpi_tz_probe(device_t dev); static int acpi_tz_attach(device_t dev); static int acpi_tz_establish(struct acpi_tz_softc *sc); static void acpi_tz_monitor(void *Context); static void acpi_tz_switch_cooler_off(ACPI_OBJECT *obj, void *arg); static void acpi_tz_switch_cooler_on(ACPI_OBJECT *obj, void *arg); static void acpi_tz_getparam(struct acpi_tz_softc *sc, char *node, int *data); static void acpi_tz_sanity(struct acpi_tz_softc *sc, int *val, char *what); static int acpi_tz_active_sysctl(SYSCTL_HANDLER_ARGS); static int acpi_tz_cooling_sysctl(SYSCTL_HANDLER_ARGS); static int acpi_tz_temp_sysctl(SYSCTL_HANDLER_ARGS); static int acpi_tz_passive_sysctl(SYSCTL_HANDLER_ARGS); static void acpi_tz_notify_handler(ACPI_HANDLE h, UINT32 notify, void *context); static void acpi_tz_signal(struct acpi_tz_softc *sc, int flags); static void acpi_tz_timeout(struct acpi_tz_softc *sc, int flags); static void acpi_tz_power_profile(void *arg); static void acpi_tz_thread(void *arg); static int acpi_tz_cooling_is_available(struct acpi_tz_softc *sc); static int acpi_tz_cooling_thread_start(struct acpi_tz_softc *sc); static device_method_t acpi_tz_methods[] = { /* Device interface */ DEVMETHOD(device_probe, acpi_tz_probe), DEVMETHOD(device_attach, acpi_tz_attach), DEVMETHOD_END }; static driver_t acpi_tz_driver = { "acpi_tz", acpi_tz_methods, sizeof(struct acpi_tz_softc), }; static char *acpi_tz_tmp_name = "_TMP"; static devclass_t acpi_tz_devclass; DRIVER_MODULE(acpi_tz, acpi, acpi_tz_driver, acpi_tz_devclass, 0, 0); MODULE_DEPEND(acpi_tz, acpi, 1, 1, 1); static struct sysctl_ctx_list acpi_tz_sysctl_ctx; static struct sysctl_oid *acpi_tz_sysctl_tree; /* Minimum cooling run time */ static int acpi_tz_min_runtime; static int acpi_tz_polling_rate = TZ_POLLRATE; static int acpi_tz_override; /* Timezone polling thread */ static struct proc *acpi_tz_proc; ACPI_LOCK_DECL(thermal, "ACPI thermal zone"); static int acpi_tz_cooling_unit = -1; static int acpi_tz_probe(device_t dev) { int result; if (acpi_get_type(dev) == ACPI_TYPE_THERMAL && !acpi_disabled("thermal")) { device_set_desc(dev, "Thermal Zone"); result = -10; } else result = ENXIO; return (result); } static int acpi_tz_attach(device_t dev) { struct acpi_tz_softc *sc; struct acpi_softc *acpi_sc; int error; char oidname[8]; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); sc = device_get_softc(dev); sc->tz_dev = dev; sc->tz_handle = acpi_get_handle(dev); sc->tz_requested = TZ_ACTIVE_NONE; sc->tz_active = TZ_ACTIVE_UNKNOWN; sc->tz_thflags = TZ_THFLAG_NONE; sc->tz_cooling_proc = NULL; sc->tz_cooling_proc_running = FALSE; sc->tz_cooling_active = FALSE; sc->tz_cooling_updated = FALSE; sc->tz_cooling_enabled = FALSE; /* * Parse the current state of the thermal zone and build control * structures. We don't need to worry about interference with the * control thread since we haven't fully attached this device yet. */ if ((error = acpi_tz_establish(sc)) != 0) return (error); /* * Register for any Notify events sent to this zone. */ AcpiInstallNotifyHandler(sc->tz_handle, ACPI_DEVICE_NOTIFY, acpi_tz_notify_handler, sc); /* * Create our sysctl nodes. * * XXX we need a mechanism for adding nodes under ACPI. */ if (device_get_unit(dev) == 0) { acpi_sc = acpi_device_get_parent_softc(dev); sysctl_ctx_init(&acpi_tz_sysctl_ctx); acpi_tz_sysctl_tree = SYSCTL_ADD_NODE(&acpi_tz_sysctl_ctx, SYSCTL_CHILDREN(acpi_sc->acpi_sysctl_tree), OID_AUTO, "thermal", CTLFLAG_RD, 0, ""); SYSCTL_ADD_INT(&acpi_tz_sysctl_ctx, SYSCTL_CHILDREN(acpi_tz_sysctl_tree), OID_AUTO, "min_runtime", CTLFLAG_RW, &acpi_tz_min_runtime, 0, "minimum cooling run time in sec"); SYSCTL_ADD_INT(&acpi_tz_sysctl_ctx, SYSCTL_CHILDREN(acpi_tz_sysctl_tree), OID_AUTO, "polling_rate", CTLFLAG_RW, &acpi_tz_polling_rate, 0, "monitor polling interval in seconds"); SYSCTL_ADD_INT(&acpi_tz_sysctl_ctx, SYSCTL_CHILDREN(acpi_tz_sysctl_tree), OID_AUTO, "user_override", CTLFLAG_RW, &acpi_tz_override, 0, "allow override of thermal settings"); } sysctl_ctx_init(&sc->tz_sysctl_ctx); sprintf(oidname, "tz%d", device_get_unit(dev)); sc->tz_sysctl_tree = SYSCTL_ADD_NODE_WITH_LABEL(&sc->tz_sysctl_ctx, SYSCTL_CHILDREN(acpi_tz_sysctl_tree), OID_AUTO, oidname, CTLFLAG_RD, 0, "", "thermal_zone"); SYSCTL_ADD_PROC(&sc->tz_sysctl_ctx, SYSCTL_CHILDREN(sc->tz_sysctl_tree), OID_AUTO, "temperature", CTLTYPE_INT | CTLFLAG_RD, &sc->tz_temperature, 0, sysctl_handle_int, "IK", "current thermal zone temperature"); SYSCTL_ADD_PROC(&sc->tz_sysctl_ctx, SYSCTL_CHILDREN(sc->tz_sysctl_tree), OID_AUTO, "active", CTLTYPE_INT | CTLFLAG_RW, sc, 0, acpi_tz_active_sysctl, "I", "cooling is active"); SYSCTL_ADD_PROC(&sc->tz_sysctl_ctx, SYSCTL_CHILDREN(sc->tz_sysctl_tree), OID_AUTO, "passive_cooling", CTLTYPE_INT | CTLFLAG_RW, sc, 0, acpi_tz_cooling_sysctl, "I", "enable passive (speed reduction) cooling"); SYSCTL_ADD_INT(&sc->tz_sysctl_ctx, SYSCTL_CHILDREN(sc->tz_sysctl_tree), OID_AUTO, "thermal_flags", CTLFLAG_RD, &sc->tz_thflags, 0, "thermal zone flags"); SYSCTL_ADD_PROC(&sc->tz_sysctl_ctx, SYSCTL_CHILDREN(sc->tz_sysctl_tree), OID_AUTO, "_PSV", CTLTYPE_INT | CTLFLAG_RW, sc, offsetof(struct acpi_tz_softc, tz_zone.psv), acpi_tz_temp_sysctl, "IK", "passive cooling temp setpoint"); SYSCTL_ADD_PROC(&sc->tz_sysctl_ctx, SYSCTL_CHILDREN(sc->tz_sysctl_tree), OID_AUTO, "_HOT", CTLTYPE_INT | CTLFLAG_RW, sc, offsetof(struct acpi_tz_softc, tz_zone.hot), acpi_tz_temp_sysctl, "IK", "too hot temp setpoint (suspend now)"); SYSCTL_ADD_PROC(&sc->tz_sysctl_ctx, SYSCTL_CHILDREN(sc->tz_sysctl_tree), OID_AUTO, "_CRT", CTLTYPE_INT | CTLFLAG_RW, sc, offsetof(struct acpi_tz_softc, tz_zone.crt), acpi_tz_temp_sysctl, "IK", "critical temp setpoint (shutdown now)"); SYSCTL_ADD_PROC(&sc->tz_sysctl_ctx, SYSCTL_CHILDREN(sc->tz_sysctl_tree), OID_AUTO, "_ACx", CTLTYPE_INT | CTLFLAG_RD, &sc->tz_zone.ac, sizeof(sc->tz_zone.ac), sysctl_handle_opaque, "IK", ""); SYSCTL_ADD_PROC(&sc->tz_sysctl_ctx, SYSCTL_CHILDREN(sc->tz_sysctl_tree), OID_AUTO, "_TC1", CTLTYPE_INT | CTLFLAG_RW, sc, offsetof(struct acpi_tz_softc, tz_zone.tc1), acpi_tz_passive_sysctl, "I", "thermal constant 1 for passive cooling"); SYSCTL_ADD_PROC(&sc->tz_sysctl_ctx, SYSCTL_CHILDREN(sc->tz_sysctl_tree), OID_AUTO, "_TC2", CTLTYPE_INT | CTLFLAG_RW, sc, offsetof(struct acpi_tz_softc, tz_zone.tc2), acpi_tz_passive_sysctl, "I", "thermal constant 2 for passive cooling"); SYSCTL_ADD_PROC(&sc->tz_sysctl_ctx, SYSCTL_CHILDREN(sc->tz_sysctl_tree), OID_AUTO, "_TSP", CTLTYPE_INT | CTLFLAG_RW, sc, offsetof(struct acpi_tz_softc, tz_zone.tsp), acpi_tz_passive_sysctl, "I", "thermal sampling period for passive cooling"); /* * Register our power profile event handler. */ sc->tz_event = EVENTHANDLER_REGISTER(power_profile_change, acpi_tz_power_profile, sc, 0); /* * Flag the event handler for a manual invocation by our timeout. * We defer it like this so that the rest of the subsystem has time * to come up. Don't bother evaluating/printing the temperature at * this point; on many systems it'll be bogus until the EC is running. */ sc->tz_flags |= TZ_FLAG_GETPROFILE; return_VALUE (0); } static void acpi_tz_startup(void *arg __unused) { struct acpi_tz_softc *sc; device_t *devs; int devcount, error, i; devclass_get_devices(acpi_tz_devclass, &devs, &devcount); if (devcount == 0) { free(devs, M_TEMP); return; } /* * Create thread to service all of the thermal zones. */ error = kproc_create(acpi_tz_thread, NULL, &acpi_tz_proc, RFHIGHPID, 0, "acpi_thermal"); if (error != 0) printf("acpi_tz: could not create thread - %d", error); /* * Create a thread to handle passive cooling for 1st zone which * has _PSV, _TSP, _TC1 and _TC2. Users can enable it for other * zones manually for now. * * XXX We enable only one zone to avoid multiple zones conflict * with each other since cpufreq currently sets all CPUs to the * given frequency whereas it's possible for different thermal * zones to specify independent settings for multiple CPUs. */ for (i = 0; i < devcount; i++) { sc = device_get_softc(devs[i]); if (acpi_tz_cooling_is_available(sc)) { sc->tz_cooling_enabled = TRUE; error = acpi_tz_cooling_thread_start(sc); if (error != 0) { sc->tz_cooling_enabled = FALSE; break; } acpi_tz_cooling_unit = device_get_unit(devs[i]); break; } } free(devs, M_TEMP); } SYSINIT(acpi_tz, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, acpi_tz_startup, NULL); /* * Parse the current state of this thermal zone and set up to use it. * * Note that we may have previous state, which will have to be discarded. */ static int acpi_tz_establish(struct acpi_tz_softc *sc) { ACPI_OBJECT *obj; int i; char nbuf[8]; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); /* Erase any existing state. */ for (i = 0; i < TZ_NUMLEVELS; i++) if (sc->tz_zone.al[i].Pointer != NULL) AcpiOsFree(sc->tz_zone.al[i].Pointer); if (sc->tz_zone.psl.Pointer != NULL) AcpiOsFree(sc->tz_zone.psl.Pointer); /* * XXX: We initialize only ACPI_BUFFER to avoid race condition * with passive cooling thread which refers psv, tc1, tc2 and tsp. */ bzero(sc->tz_zone.ac, sizeof(sc->tz_zone.ac)); bzero(sc->tz_zone.al, sizeof(sc->tz_zone.al)); bzero(&sc->tz_zone.psl, sizeof(sc->tz_zone.psl)); /* Evaluate thermal zone parameters. */ for (i = 0; i < TZ_NUMLEVELS; i++) { sprintf(nbuf, "_AC%d", i); acpi_tz_getparam(sc, nbuf, &sc->tz_zone.ac[i]); sprintf(nbuf, "_AL%d", i); sc->tz_zone.al[i].Length = ACPI_ALLOCATE_BUFFER; sc->tz_zone.al[i].Pointer = NULL; AcpiEvaluateObject(sc->tz_handle, nbuf, NULL, &sc->tz_zone.al[i]); obj = (ACPI_OBJECT *)sc->tz_zone.al[i].Pointer; if (obj != NULL) { /* Should be a package containing a list of power objects */ if (obj->Type != ACPI_TYPE_PACKAGE) { device_printf(sc->tz_dev, "%s has unknown type %d, rejecting\n", nbuf, obj->Type); return_VALUE (ENXIO); } } } acpi_tz_getparam(sc, "_CRT", &sc->tz_zone.crt); acpi_tz_getparam(sc, "_HOT", &sc->tz_zone.hot); sc->tz_zone.psl.Length = ACPI_ALLOCATE_BUFFER; sc->tz_zone.psl.Pointer = NULL; AcpiEvaluateObject(sc->tz_handle, "_PSL", NULL, &sc->tz_zone.psl); acpi_tz_getparam(sc, "_PSV", &sc->tz_zone.psv); acpi_tz_getparam(sc, "_TC1", &sc->tz_zone.tc1); acpi_tz_getparam(sc, "_TC2", &sc->tz_zone.tc2); acpi_tz_getparam(sc, "_TSP", &sc->tz_zone.tsp); acpi_tz_getparam(sc, "_TZP", &sc->tz_zone.tzp); /* * Sanity-check the values we've been given. * * XXX what do we do about systems that give us the same value for * more than one of these setpoints? */ acpi_tz_sanity(sc, &sc->tz_zone.crt, "_CRT"); acpi_tz_sanity(sc, &sc->tz_zone.hot, "_HOT"); acpi_tz_sanity(sc, &sc->tz_zone.psv, "_PSV"); for (i = 0; i < TZ_NUMLEVELS; i++) acpi_tz_sanity(sc, &sc->tz_zone.ac[i], "_ACx"); return_VALUE (0); } static char *aclevel_string[] = { "NONE", "_AC0", "_AC1", "_AC2", "_AC3", "_AC4", "_AC5", "_AC6", "_AC7", "_AC8", "_AC9" }; static __inline const char * acpi_tz_aclevel_string(int active) { if (active < -1 || active >= TZ_NUMLEVELS) return (aclevel_string[0]); return (aclevel_string[active + 1]); } /* * Get the current temperature. */ static int acpi_tz_get_temperature(struct acpi_tz_softc *sc) { int temp; ACPI_STATUS status; ACPI_FUNCTION_NAME ("acpi_tz_get_temperature"); /* Evaluate the thermal zone's _TMP method. */ status = acpi_GetInteger(sc->tz_handle, acpi_tz_tmp_name, &temp); if (ACPI_FAILURE(status)) { ACPI_VPRINT(sc->tz_dev, acpi_device_get_parent_softc(sc->tz_dev), "error fetching current temperature -- %s\n", AcpiFormatException(status)); return (FALSE); } /* Check it for validity. */ acpi_tz_sanity(sc, &temp, acpi_tz_tmp_name); if (temp == -1) return (FALSE); ACPI_DEBUG_PRINT((ACPI_DB_VALUES, "got %d.%dC\n", TZ_KELVTOC(temp))); sc->tz_temperature = temp; return (TRUE); } /* * Evaluate the condition of a thermal zone, take appropriate actions. */ static void acpi_tz_monitor(void *Context) { struct acpi_tz_softc *sc; struct timespec curtime; int temp; int i; int newactive, newflags; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); sc = (struct acpi_tz_softc *)Context; /* Get the current temperature. */ if (!acpi_tz_get_temperature(sc)) { /* XXX disable zone? go to max cooling? */ return_VOID; } temp = sc->tz_temperature; /* * Work out what we ought to be doing right now. * * Note that the _ACx levels sort from hot to cold. */ newactive = TZ_ACTIVE_NONE; for (i = TZ_NUMLEVELS - 1; i >= 0; i--) { if (sc->tz_zone.ac[i] != -1 && temp >= sc->tz_zone.ac[i]) newactive = i; } /* * We are going to get _ACx level down (colder side), but give a guaranteed * minimum cooling run time if requested. */ if (acpi_tz_min_runtime > 0 && sc->tz_active != TZ_ACTIVE_NONE && sc->tz_active != TZ_ACTIVE_UNKNOWN && (newactive == TZ_ACTIVE_NONE || newactive > sc->tz_active)) { getnanotime(&curtime); - timespecsub(&curtime, &sc->tz_cooling_started); + timespecsub(&curtime, &sc->tz_cooling_started, &curtime); if (curtime.tv_sec < acpi_tz_min_runtime) newactive = sc->tz_active; } /* Handle user override of active mode */ if (sc->tz_requested != TZ_ACTIVE_NONE && (newactive == TZ_ACTIVE_NONE || sc->tz_requested < newactive)) newactive = sc->tz_requested; /* update temperature-related flags */ newflags = TZ_THFLAG_NONE; if (sc->tz_zone.psv != -1 && temp >= sc->tz_zone.psv) newflags |= TZ_THFLAG_PSV; if (sc->tz_zone.hot != -1 && temp >= sc->tz_zone.hot) newflags |= TZ_THFLAG_HOT; if (sc->tz_zone.crt != -1 && temp >= sc->tz_zone.crt) newflags |= TZ_THFLAG_CRT; /* If the active cooling state has changed, we have to switch things. */ if (sc->tz_active == TZ_ACTIVE_UNKNOWN) { /* * We don't know which cooling device is on or off, * so stop them all, because we now know which * should be on (if any). */ for (i = 0; i < TZ_NUMLEVELS; i++) { if (sc->tz_zone.al[i].Pointer != NULL) { acpi_ForeachPackageObject( (ACPI_OBJECT *)sc->tz_zone.al[i].Pointer, acpi_tz_switch_cooler_off, sc); } } /* now we know that all devices are off */ sc->tz_active = TZ_ACTIVE_NONE; } if (newactive != sc->tz_active) { /* Turn off unneeded cooling devices that are on, if any are */ for (i = TZ_ACTIVE_LEVEL(sc->tz_active); i < TZ_ACTIVE_LEVEL(newactive); i++) { acpi_ForeachPackageObject( (ACPI_OBJECT *)sc->tz_zone.al[i].Pointer, acpi_tz_switch_cooler_off, sc); } /* Turn on cooling devices that are required, if any are */ for (i = TZ_ACTIVE_LEVEL(sc->tz_active) - 1; i >= TZ_ACTIVE_LEVEL(newactive); i--) { acpi_ForeachPackageObject( (ACPI_OBJECT *)sc->tz_zone.al[i].Pointer, acpi_tz_switch_cooler_on, sc); } ACPI_VPRINT(sc->tz_dev, acpi_device_get_parent_softc(sc->tz_dev), "switched from %s to %s: %d.%dC\n", acpi_tz_aclevel_string(sc->tz_active), acpi_tz_aclevel_string(newactive), TZ_KELVTOC(temp)); sc->tz_active = newactive; getnanotime(&sc->tz_cooling_started); } /* XXX (de)activate any passive cooling that may be required. */ /* * If the temperature is at _HOT or _CRT, increment our event count. * If it has occurred enough times, shutdown the system. This is * needed because some systems will report an invalid high temperature * for one poll cycle. It is suspected this is due to the embedded * controller timing out. A typical value is 138C for one cycle on * a system that is otherwise 65C. * * If we're almost at that threshold, notify the user through devd(8). */ if ((newflags & (TZ_THFLAG_HOT | TZ_THFLAG_CRT)) != 0) { sc->tz_validchecks++; if (sc->tz_validchecks == TZ_VALIDCHECKS) { device_printf(sc->tz_dev, "WARNING - current temperature (%d.%dC) exceeds safe limits\n", TZ_KELVTOC(sc->tz_temperature)); shutdown_nice(RB_POWEROFF); } else if (sc->tz_validchecks == TZ_NOTIFYCOUNT) acpi_UserNotify("Thermal", sc->tz_handle, TZ_NOTIFY_CRITICAL); } else { sc->tz_validchecks = 0; } sc->tz_thflags = newflags; return_VOID; } /* * Given an object, verify that it's a reference to a device of some sort, * and try to switch it off. */ static void acpi_tz_switch_cooler_off(ACPI_OBJECT *obj, void *arg) { ACPI_HANDLE cooler; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); cooler = acpi_GetReference(NULL, obj); if (cooler == NULL) { ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "can't get handle\n")); return_VOID; } ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "called to turn %s off\n", acpi_name(cooler))); acpi_pwr_switch_consumer(cooler, ACPI_STATE_D3); return_VOID; } /* * Given an object, verify that it's a reference to a device of some sort, * and try to switch it on. * * XXX replication of off/on function code is bad. */ static void acpi_tz_switch_cooler_on(ACPI_OBJECT *obj, void *arg) { struct acpi_tz_softc *sc = (struct acpi_tz_softc *)arg; ACPI_HANDLE cooler; ACPI_STATUS status; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); cooler = acpi_GetReference(NULL, obj); if (cooler == NULL) { ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "can't get handle\n")); return_VOID; } ACPI_DEBUG_PRINT((ACPI_DB_OBJECTS, "called to turn %s on\n", acpi_name(cooler))); status = acpi_pwr_switch_consumer(cooler, ACPI_STATE_D0); if (ACPI_FAILURE(status)) { ACPI_VPRINT(sc->tz_dev, acpi_device_get_parent_softc(sc->tz_dev), "failed to activate %s - %s\n", acpi_name(cooler), AcpiFormatException(status)); } return_VOID; } /* * Read/debug-print a parameter, default it to -1. */ static void acpi_tz_getparam(struct acpi_tz_softc *sc, char *node, int *data) { ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); if (ACPI_FAILURE(acpi_GetInteger(sc->tz_handle, node, data))) { *data = -1; } else { ACPI_DEBUG_PRINT((ACPI_DB_VALUES, "%s.%s = %d\n", acpi_name(sc->tz_handle), node, *data)); } return_VOID; } /* * Sanity-check a temperature value. Assume that setpoints * should be between 0C and 200C. */ static void acpi_tz_sanity(struct acpi_tz_softc *sc, int *val, char *what) { if (*val != -1 && (*val < TZ_ZEROC || *val > TZ_ZEROC + 2000)) { /* * If the value we are checking is _TMP, warn the user only * once. This avoids spamming messages if, for instance, the * sensor is broken and always returns an invalid temperature. * * This is only done for _TMP; other values always emit a * warning. */ if (what != acpi_tz_tmp_name || !sc->tz_insane_tmp_notified) { device_printf(sc->tz_dev, "%s value is absurd, ignored (%d.%dC)\n", what, TZ_KELVTOC(*val)); /* Don't warn the user again if the read value doesn't improve. */ if (what == acpi_tz_tmp_name) sc->tz_insane_tmp_notified = 1; } *val = -1; return; } /* This value is correct. Warn if it's incorrect again. */ if (what == acpi_tz_tmp_name) sc->tz_insane_tmp_notified = 0; } /* * Respond to a sysctl on the active state node. */ static int acpi_tz_active_sysctl(SYSCTL_HANDLER_ARGS) { struct acpi_tz_softc *sc; int active; int error; sc = (struct acpi_tz_softc *)oidp->oid_arg1; active = sc->tz_active; error = sysctl_handle_int(oidp, &active, 0, req); /* Error or no new value */ if (error != 0 || req->newptr == NULL) return (error); if (active < -1 || active >= TZ_NUMLEVELS) return (EINVAL); /* Set new preferred level and re-switch */ sc->tz_requested = active; acpi_tz_signal(sc, 0); return (0); } static int acpi_tz_cooling_sysctl(SYSCTL_HANDLER_ARGS) { struct acpi_tz_softc *sc; int enabled, error; sc = (struct acpi_tz_softc *)oidp->oid_arg1; enabled = sc->tz_cooling_enabled; error = sysctl_handle_int(oidp, &enabled, 0, req); /* Error or no new value */ if (error != 0 || req->newptr == NULL) return (error); if (enabled != TRUE && enabled != FALSE) return (EINVAL); if (enabled) { if (acpi_tz_cooling_is_available(sc)) error = acpi_tz_cooling_thread_start(sc); else error = ENODEV; if (error) enabled = FALSE; } sc->tz_cooling_enabled = enabled; return (error); } static int acpi_tz_temp_sysctl(SYSCTL_HANDLER_ARGS) { struct acpi_tz_softc *sc; int temp, *temp_ptr; int error; sc = oidp->oid_arg1; temp_ptr = (int *)(void *)(uintptr_t)((uintptr_t)sc + oidp->oid_arg2); temp = *temp_ptr; error = sysctl_handle_int(oidp, &temp, 0, req); /* Error or no new value */ if (error != 0 || req->newptr == NULL) return (error); /* Only allow changing settings if override is set. */ if (!acpi_tz_override) return (EPERM); /* Check user-supplied value for sanity. */ acpi_tz_sanity(sc, &temp, "user-supplied temp"); if (temp == -1) return (EINVAL); *temp_ptr = temp; return (0); } static int acpi_tz_passive_sysctl(SYSCTL_HANDLER_ARGS) { struct acpi_tz_softc *sc; int val, *val_ptr; int error; sc = oidp->oid_arg1; val_ptr = (int *)(void *)(uintptr_t)((uintptr_t)sc + oidp->oid_arg2); val = *val_ptr; error = sysctl_handle_int(oidp, &val, 0, req); /* Error or no new value */ if (error != 0 || req->newptr == NULL) return (error); /* Only allow changing settings if override is set. */ if (!acpi_tz_override) return (EPERM); *val_ptr = val; return (0); } static void acpi_tz_notify_handler(ACPI_HANDLE h, UINT32 notify, void *context) { struct acpi_tz_softc *sc = (struct acpi_tz_softc *)context; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); switch (notify) { case TZ_NOTIFY_TEMPERATURE: /* Temperature change occurred */ acpi_tz_signal(sc, 0); break; case TZ_NOTIFY_DEVICES: case TZ_NOTIFY_LEVELS: /* Zone devices/setpoints changed */ acpi_tz_signal(sc, TZ_FLAG_GETSETTINGS); break; default: ACPI_VPRINT(sc->tz_dev, acpi_device_get_parent_softc(sc->tz_dev), "unknown Notify event 0x%x\n", notify); break; } acpi_UserNotify("Thermal", h, notify); return_VOID; } static void acpi_tz_signal(struct acpi_tz_softc *sc, int flags) { ACPI_LOCK(thermal); sc->tz_flags |= flags; ACPI_UNLOCK(thermal); wakeup(&acpi_tz_proc); } /* * Notifies can be generated asynchronously but have also been seen to be * triggered by other thermal methods. One system generates a notify of * 0x81 when the fan is turned on or off. Another generates it when _SCP * is called. To handle these situations, we check the zone via * acpi_tz_monitor() before evaluating changes to setpoints or the cooling * policy. */ static void acpi_tz_timeout(struct acpi_tz_softc *sc, int flags) { /* Check the current temperature and take action based on it */ acpi_tz_monitor(sc); /* If requested, get the power profile settings. */ if (flags & TZ_FLAG_GETPROFILE) acpi_tz_power_profile(sc); /* * If requested, check for new devices/setpoints. After finding them, * check if we need to switch fans based on the new values. */ if (flags & TZ_FLAG_GETSETTINGS) { acpi_tz_establish(sc); acpi_tz_monitor(sc); } /* XXX passive cooling actions? */ } /* * System power profile may have changed; fetch and notify the * thermal zone accordingly. * * Since this can be called from an arbitrary eventhandler, it needs * to get the ACPI lock itself. */ static void acpi_tz_power_profile(void *arg) { ACPI_STATUS status; struct acpi_tz_softc *sc = (struct acpi_tz_softc *)arg; int state; state = power_profile_get_state(); if (state != POWER_PROFILE_PERFORMANCE && state != POWER_PROFILE_ECONOMY) return; /* check that we haven't decided there's no _SCP method */ if ((sc->tz_flags & TZ_FLAG_NO_SCP) == 0) { /* Call _SCP to set the new profile */ status = acpi_SetInteger(sc->tz_handle, "_SCP", (state == POWER_PROFILE_PERFORMANCE) ? 0 : 1); if (ACPI_FAILURE(status)) { if (status != AE_NOT_FOUND) ACPI_VPRINT(sc->tz_dev, acpi_device_get_parent_softc(sc->tz_dev), "can't evaluate %s._SCP - %s\n", acpi_name(sc->tz_handle), AcpiFormatException(status)); sc->tz_flags |= TZ_FLAG_NO_SCP; } else { /* We have to re-evaluate the entire zone now */ acpi_tz_signal(sc, TZ_FLAG_GETSETTINGS); } } } /* * Thermal zone monitor thread. */ static void acpi_tz_thread(void *arg) { device_t *devs; int devcount, i; int flags; struct acpi_tz_softc **sc; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); devs = NULL; devcount = 0; sc = NULL; for (;;) { /* If the number of devices has changed, re-evaluate. */ if (devclass_get_count(acpi_tz_devclass) != devcount) { if (devs != NULL) { free(devs, M_TEMP); free(sc, M_TEMP); } devclass_get_devices(acpi_tz_devclass, &devs, &devcount); sc = malloc(sizeof(struct acpi_tz_softc *) * devcount, M_TEMP, M_WAITOK | M_ZERO); for (i = 0; i < devcount; i++) sc[i] = device_get_softc(devs[i]); } /* Check for temperature events and act on them. */ for (i = 0; i < devcount; i++) { ACPI_LOCK(thermal); flags = sc[i]->tz_flags; sc[i]->tz_flags &= TZ_FLAG_NO_SCP; ACPI_UNLOCK(thermal); acpi_tz_timeout(sc[i], flags); } /* If more work to do, don't go to sleep yet. */ ACPI_LOCK(thermal); for (i = 0; i < devcount; i++) { if (sc[i]->tz_flags & ~TZ_FLAG_NO_SCP) break; } /* * If we have no more work, sleep for a while, setting PDROP so that * the mutex will not be reacquired. Otherwise, drop the mutex and * loop to handle more events. */ if (i == devcount) msleep(&acpi_tz_proc, &thermal_mutex, PZERO | PDROP, "tzpoll", hz * acpi_tz_polling_rate); else ACPI_UNLOCK(thermal); } } static int acpi_tz_cpufreq_restore(struct acpi_tz_softc *sc) { device_t dev; int error; if (!sc->tz_cooling_updated) return (0); if ((dev = devclass_get_device(devclass_find("cpufreq"), 0)) == NULL) return (ENXIO); ACPI_VPRINT(sc->tz_dev, acpi_device_get_parent_softc(sc->tz_dev), "temperature %d.%dC: resuming previous clock speed (%d MHz)\n", TZ_KELVTOC(sc->tz_temperature), sc->tz_cooling_saved_freq); error = CPUFREQ_SET(dev, NULL, CPUFREQ_PRIO_KERN); if (error == 0) sc->tz_cooling_updated = FALSE; return (error); } static int acpi_tz_cpufreq_update(struct acpi_tz_softc *sc, int req) { device_t dev; struct cf_level *levels; int num_levels, error, freq, desired_freq, perf, i; levels = malloc(CPUFREQ_MAX_LEVELS * sizeof(*levels), M_TEMP, M_NOWAIT); if (levels == NULL) return (ENOMEM); /* * Find the main device, cpufreq0. We don't yet support independent * CPU frequency control on SMP. */ if ((dev = devclass_get_device(devclass_find("cpufreq"), 0)) == NULL) { error = ENXIO; goto out; } /* Get the current frequency. */ error = CPUFREQ_GET(dev, &levels[0]); if (error) goto out; freq = levels[0].total_set.freq; /* Get the current available frequency levels. */ num_levels = CPUFREQ_MAX_LEVELS; error = CPUFREQ_LEVELS(dev, levels, &num_levels); if (error) { if (error == E2BIG) printf("cpufreq: need to increase CPUFREQ_MAX_LEVELS\n"); goto out; } /* Calculate the desired frequency as a percent of the max frequency. */ perf = 100 * freq / levels[0].total_set.freq - req; if (perf < 0) perf = 0; else if (perf > 100) perf = 100; desired_freq = levels[0].total_set.freq * perf / 100; if (desired_freq < freq) { /* Find the closest available frequency, rounding down. */ for (i = 0; i < num_levels; i++) if (levels[i].total_set.freq <= desired_freq) break; /* If we didn't find a relevant setting, use the lowest. */ if (i == num_levels) i--; } else { /* If we didn't decrease frequency yet, don't increase it. */ if (!sc->tz_cooling_updated) { sc->tz_cooling_active = FALSE; goto out; } /* Use saved cpu frequency as maximum value. */ if (desired_freq > sc->tz_cooling_saved_freq) desired_freq = sc->tz_cooling_saved_freq; /* Find the closest available frequency, rounding up. */ for (i = num_levels - 1; i >= 0; i--) if (levels[i].total_set.freq >= desired_freq) break; /* If we didn't find a relevant setting, use the highest. */ if (i == -1) i++; /* If we're going to the highest frequency, restore the old setting. */ if (i == 0 || desired_freq == sc->tz_cooling_saved_freq) { error = acpi_tz_cpufreq_restore(sc); if (error == 0) sc->tz_cooling_active = FALSE; goto out; } } /* If we are going to a new frequency, activate it. */ if (levels[i].total_set.freq != freq) { ACPI_VPRINT(sc->tz_dev, acpi_device_get_parent_softc(sc->tz_dev), "temperature %d.%dC: %screasing clock speed " "from %d MHz to %d MHz\n", TZ_KELVTOC(sc->tz_temperature), (freq > levels[i].total_set.freq) ? "de" : "in", freq, levels[i].total_set.freq); error = CPUFREQ_SET(dev, &levels[i], CPUFREQ_PRIO_KERN); if (error == 0 && !sc->tz_cooling_updated) { sc->tz_cooling_saved_freq = freq; sc->tz_cooling_updated = TRUE; } } out: if (levels) free(levels, M_TEMP); return (error); } /* * Passive cooling thread; monitors current temperature according to the * cooling interval and calculates whether to scale back CPU frequency. */ static void acpi_tz_cooling_thread(void *arg) { struct acpi_tz_softc *sc; int error, perf, curr_temp, prev_temp; ACPI_FUNCTION_TRACE((char *)(uintptr_t)__func__); sc = (struct acpi_tz_softc *)arg; prev_temp = sc->tz_temperature; while (sc->tz_cooling_enabled) { if (sc->tz_cooling_active) (void)acpi_tz_get_temperature(sc); curr_temp = sc->tz_temperature; if (curr_temp >= sc->tz_zone.psv) sc->tz_cooling_active = TRUE; if (sc->tz_cooling_active) { perf = sc->tz_zone.tc1 * (curr_temp - prev_temp) + sc->tz_zone.tc2 * (curr_temp - sc->tz_zone.psv); perf /= 10; if (perf != 0) { error = acpi_tz_cpufreq_update(sc, perf); /* * If error and not simply a higher priority setting was * active, disable cooling. */ if (error != 0 && error != EPERM) { device_printf(sc->tz_dev, "failed to set new freq, disabling passive cooling\n"); sc->tz_cooling_enabled = FALSE; } } } prev_temp = curr_temp; tsleep(&sc->tz_cooling_proc, PZERO, "cooling", hz * sc->tz_zone.tsp / 10); } if (sc->tz_cooling_active) { acpi_tz_cpufreq_restore(sc); sc->tz_cooling_active = FALSE; } sc->tz_cooling_proc = NULL; ACPI_LOCK(thermal); sc->tz_cooling_proc_running = FALSE; ACPI_UNLOCK(thermal); kproc_exit(0); } /* * TODO: We ignore _PSL (list of cooling devices) since cpufreq enumerates * all CPUs for us. However, it's possible in the future _PSL will * reference non-CPU devices so we may want to support it then. */ static int acpi_tz_cooling_is_available(struct acpi_tz_softc *sc) { return (sc->tz_zone.tc1 != -1 && sc->tz_zone.tc2 != -1 && sc->tz_zone.tsp != -1 && sc->tz_zone.tsp != 0 && sc->tz_zone.psv != -1); } static int acpi_tz_cooling_thread_start(struct acpi_tz_softc *sc) { int error; ACPI_LOCK(thermal); if (sc->tz_cooling_proc_running) { ACPI_UNLOCK(thermal); return (0); } sc->tz_cooling_proc_running = TRUE; ACPI_UNLOCK(thermal); error = 0; if (sc->tz_cooling_proc == NULL) { error = kproc_create(acpi_tz_cooling_thread, sc, &sc->tz_cooling_proc, RFHIGHPID, 0, "acpi_cooling%d", device_get_unit(sc->tz_dev)); if (error != 0) { device_printf(sc->tz_dev, "could not create thread - %d", error); ACPI_LOCK(thermal); sc->tz_cooling_proc_running = FALSE; ACPI_UNLOCK(thermal); } } return (error); } Index: head/sys/dev/drm2/i915/i915_gem.c =================================================================== --- head/sys/dev/drm2/i915/i915_gem.c (revision 336913) +++ head/sys/dev/drm2/i915/i915_gem.c (revision 336914) @@ -1,4767 +1,4767 @@ /* * Copyright © 2008 Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. * * Authors: * Eric Anholt * * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 The FreeBSD Foundation * All rights reserved. * * This software was developed by Konstantin Belousov under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include static void i915_gem_object_flush_gtt_write_domain(struct drm_i915_gem_object *obj); static void i915_gem_object_flush_cpu_write_domain(struct drm_i915_gem_object *obj); static __must_check int i915_gem_object_bind_to_gtt(struct drm_i915_gem_object *obj, unsigned alignment, bool map_and_fenceable, bool nonblocking); static int i915_gem_phys_pwrite(struct drm_device *dev, struct drm_i915_gem_object *obj, struct drm_i915_gem_pwrite *args, struct drm_file *file); static void i915_gem_write_fence(struct drm_device *dev, int reg, struct drm_i915_gem_object *obj); static void i915_gem_object_update_fence(struct drm_i915_gem_object *obj, struct drm_i915_fence_reg *fence, bool enable); static void i915_gem_inactive_shrink(void *); static long i915_gem_purge(struct drm_i915_private *dev_priv, long target); static void i915_gem_shrink_all(struct drm_i915_private *dev_priv); static void i915_gem_object_truncate(struct drm_i915_gem_object *obj); static int i915_gem_object_get_pages_range(struct drm_i915_gem_object *obj, off_t start, off_t end); static vm_page_t i915_gem_wire_page(vm_object_t object, vm_pindex_t pindex, bool *fresh); MALLOC_DEFINE(DRM_I915_GEM, "i915gem", "Allocations from i915 gem"); long i915_gem_wired_pages_cnt; static inline void i915_gem_object_fence_lost(struct drm_i915_gem_object *obj) { if (obj->tiling_mode) i915_gem_release_mmap(obj); /* As we do not have an associated fence register, we will force * a tiling change if we ever need to acquire one. */ obj->fence_dirty = false; obj->fence_reg = I915_FENCE_REG_NONE; } /* some bookkeeping */ static void i915_gem_info_add_obj(struct drm_i915_private *dev_priv, size_t size) { dev_priv->mm.object_count++; dev_priv->mm.object_memory += size; } static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv, size_t size) { dev_priv->mm.object_count--; dev_priv->mm.object_memory -= size; } static int i915_gem_wait_for_error(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; struct completion *x = &dev_priv->error_completion; int ret; if (!atomic_read(&dev_priv->mm.wedged)) return 0; /* * Only wait 10 seconds for the gpu reset to complete to avoid hanging * userspace. If it takes that long something really bad is going on and * we should simply try to bail out and fail as gracefully as possible. */ ret = wait_for_completion_interruptible_timeout(x, 10*HZ); if (ret == 0) { DRM_ERROR("Timed out waiting for the gpu reset to complete\n"); return -EIO; } else if (ret < 0) { return ret; } if (atomic_read(&dev_priv->mm.wedged)) { /* GPU is hung, bump the completion count to account for * the token we just consumed so that we never hit zero and * end up waiting upon a subsequent completion event that * will never happen. */ mtx_lock(&x->lock); x->done++; mtx_unlock(&x->lock); } return 0; } int i915_mutex_lock_interruptible(struct drm_device *dev) { int ret; ret = i915_gem_wait_for_error(dev); if (ret) return ret; /* * interruptible shall it be. might indeed be if dev_lock is * changed to sx */ ret = sx_xlock_sig(&dev->dev_struct_lock); if (ret) return -EINTR; WARN_ON(i915_verify_lists(dev)); return 0; } static inline bool i915_gem_object_is_inactive(struct drm_i915_gem_object *obj) { return obj->gtt_space && !obj->active; } int i915_gem_init_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_gem_init *args = data; if (drm_core_check_feature(dev, DRIVER_MODESET)) return -ENODEV; if (args->gtt_start >= args->gtt_end || (args->gtt_end | args->gtt_start) & (PAGE_SIZE - 1)) return -EINVAL; /* GEM with user mode setting was never supported on ilk and later. */ if (INTEL_INFO(dev)->gen >= 5) return -ENODEV; /* * XXXKIB. The second-time initialization should be guarded * against. */ DRM_LOCK(dev); i915_gem_init_global_gtt(dev, args->gtt_start, args->gtt_end, args->gtt_end); DRM_UNLOCK(dev); return 0; } int i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_private *dev_priv = dev->dev_private; struct drm_i915_gem_get_aperture *args = data; struct drm_i915_gem_object *obj; size_t pinned; pinned = 0; DRM_LOCK(dev); list_for_each_entry(obj, &dev_priv->mm.bound_list, gtt_list) if (obj->pin_count) pinned += obj->gtt_space->size; DRM_UNLOCK(dev); args->aper_size = dev_priv->mm.gtt_total; args->aper_available_size = args->aper_size - pinned; return 0; } static int i915_gem_create(struct drm_file *file, struct drm_device *dev, uint64_t size, uint32_t *handle_p) { struct drm_i915_gem_object *obj; int ret; u32 handle; size = roundup(size, PAGE_SIZE); if (size == 0) return -EINVAL; /* Allocate the new object */ obj = i915_gem_alloc_object(dev, size); if (obj == NULL) return -ENOMEM; ret = drm_gem_handle_create(file, &obj->base, &handle); if (ret) { drm_gem_object_release(&obj->base); i915_gem_info_remove_obj(dev->dev_private, obj->base.size); free(obj, DRM_I915_GEM); return ret; } /* drop reference from allocate - handle holds it now */ drm_gem_object_unreference(&obj->base); CTR2(KTR_DRM, "object_create %p %x", obj, size); *handle_p = handle; return 0; } int i915_gem_dumb_create(struct drm_file *file, struct drm_device *dev, struct drm_mode_create_dumb *args) { /* have to work out size/pitch and return them */ args->pitch = roundup2(args->width * ((args->bpp + 7) / 8), 64); args->size = args->pitch * args->height; return i915_gem_create(file, dev, args->size, &args->handle); } int i915_gem_dumb_destroy(struct drm_file *file, struct drm_device *dev, uint32_t handle) { return drm_gem_handle_delete(file, handle); } /** * Creates a new mm object and returns a handle to it. */ int i915_gem_create_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_gem_create *args = data; return i915_gem_create(file, dev, args->size, &args->handle); } static int i915_gem_object_needs_bit17_swizzle(struct drm_i915_gem_object *obj) { drm_i915_private_t *dev_priv = obj->base.dev->dev_private; return dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_9_10_17 && obj->tiling_mode != I915_TILING_NONE; } static inline int __copy_to_user_swizzled(char __user *cpu_vaddr, const char *gpu_vaddr, int gpu_offset, int length) { int ret, cpu_offset = 0; while (length > 0) { int cacheline_end = roundup2(gpu_offset + 1, 64); int this_length = min(cacheline_end - gpu_offset, length); int swizzled_gpu_offset = gpu_offset ^ 64; ret = __copy_to_user(cpu_vaddr + cpu_offset, gpu_vaddr + swizzled_gpu_offset, this_length); if (ret) return ret + length; cpu_offset += this_length; gpu_offset += this_length; length -= this_length; } return 0; } static inline int __copy_from_user_swizzled(char *gpu_vaddr, int gpu_offset, const char __user *cpu_vaddr, int length) { int ret, cpu_offset = 0; while (length > 0) { int cacheline_end = roundup2(gpu_offset + 1, 64); int this_length = min(cacheline_end - gpu_offset, length); int swizzled_gpu_offset = gpu_offset ^ 64; ret = __copy_from_user(gpu_vaddr + swizzled_gpu_offset, cpu_vaddr + cpu_offset, this_length); if (ret) return ret + length; cpu_offset += this_length; gpu_offset += this_length; length -= this_length; } return 0; } /* Per-page copy function for the shmem pread fastpath. * Flushes invalid cachelines before reading the target if * needs_clflush is set. */ static int shmem_pread_fast(vm_page_t page, int shmem_page_offset, int page_length, char __user *user_data, bool page_do_bit17_swizzling, bool needs_clflush) { char *vaddr; struct sf_buf *sf; int ret; if (unlikely(page_do_bit17_swizzling)) return -EINVAL; sched_pin(); sf = sf_buf_alloc(page, SFB_NOWAIT | SFB_CPUPRIVATE); if (sf == NULL) { sched_unpin(); return (-EFAULT); } vaddr = (char *)sf_buf_kva(sf); if (needs_clflush) drm_clflush_virt_range(vaddr + shmem_page_offset, page_length); ret = __copy_to_user_inatomic(user_data, vaddr + shmem_page_offset, page_length); sf_buf_free(sf); sched_unpin(); return ret ? -EFAULT : 0; } static void shmem_clflush_swizzled_range(char *addr, unsigned long length, bool swizzled) { if (unlikely(swizzled)) { unsigned long start = (unsigned long) addr; unsigned long end = (unsigned long) addr + length; /* For swizzling simply ensure that we always flush both * channels. Lame, but simple and it works. Swizzled * pwrite/pread is far from a hotpath - current userspace * doesn't use it at all. */ start = round_down(start, 128); end = round_up(end, 128); drm_clflush_virt_range((void *)start, end - start); } else { drm_clflush_virt_range(addr, length); } } /* Only difference to the fast-path function is that this can handle bit17 * and uses non-atomic copy and kmap functions. */ static int shmem_pread_slow(vm_page_t page, int shmem_page_offset, int page_length, char __user *user_data, bool page_do_bit17_swizzling, bool needs_clflush) { char *vaddr; struct sf_buf *sf; int ret; sf = sf_buf_alloc(page, 0); vaddr = (char *)sf_buf_kva(sf); if (needs_clflush) shmem_clflush_swizzled_range(vaddr + shmem_page_offset, page_length, page_do_bit17_swizzling); if (page_do_bit17_swizzling) ret = __copy_to_user_swizzled(user_data, vaddr, shmem_page_offset, page_length); else ret = __copy_to_user(user_data, vaddr + shmem_page_offset, page_length); sf_buf_free(sf); return ret ? - EFAULT : 0; } static int i915_gem_shmem_pread(struct drm_device *dev, struct drm_i915_gem_object *obj, struct drm_i915_gem_pread *args, struct drm_file *file) { char __user *user_data; ssize_t remain; off_t offset; int shmem_page_offset, page_length, ret = 0; int obj_do_bit17_swizzling, page_do_bit17_swizzling; int hit_slowpath = 0; int prefaulted = 0; int needs_clflush = 0; user_data = to_user_ptr(args->data_ptr); remain = args->size; obj_do_bit17_swizzling = i915_gem_object_needs_bit17_swizzle(obj); if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU)) { /* If we're not in the cpu read domain, set ourself into the gtt * read domain and manually flush cachelines (if required). This * optimizes for the case when the gpu will dirty the data * anyway again before the next pread happens. */ if (obj->cache_level == I915_CACHE_NONE) needs_clflush = 1; if (obj->gtt_space) { ret = i915_gem_object_set_to_gtt_domain(obj, false); if (ret) return ret; } } ret = i915_gem_object_get_pages(obj); if (ret) return ret; i915_gem_object_pin_pages(obj); offset = args->offset; VM_OBJECT_WLOCK(obj->base.vm_obj); for (vm_page_t page = vm_page_find_least(obj->base.vm_obj, OFF_TO_IDX(offset));; page = vm_page_next(page)) { VM_OBJECT_WUNLOCK(obj->base.vm_obj); if (remain <= 0) break; /* Operation in this page * * shmem_page_offset = offset within page in shmem file * page_length = bytes to copy for this page */ shmem_page_offset = offset_in_page(offset); page_length = remain; if ((shmem_page_offset + page_length) > PAGE_SIZE) page_length = PAGE_SIZE - shmem_page_offset; page_do_bit17_swizzling = obj_do_bit17_swizzling && (page_to_phys(page) & (1 << 17)) != 0; ret = shmem_pread_fast(page, shmem_page_offset, page_length, user_data, page_do_bit17_swizzling, needs_clflush); if (ret == 0) goto next_page; hit_slowpath = 1; DRM_UNLOCK(dev); if (!prefaulted) { ret = fault_in_multipages_writeable(user_data, remain); /* Userspace is tricking us, but we've already clobbered * its pages with the prefault and promised to write the * data up to the first fault. Hence ignore any errors * and just continue. */ (void)ret; prefaulted = 1; } ret = shmem_pread_slow(page, shmem_page_offset, page_length, user_data, page_do_bit17_swizzling, needs_clflush); DRM_LOCK(dev); next_page: vm_page_reference(page); if (ret) goto out; remain -= page_length; user_data += page_length; offset += page_length; VM_OBJECT_WLOCK(obj->base.vm_obj); } out: i915_gem_object_unpin_pages(obj); if (hit_slowpath) { /* Fixup: Kill any reinstated backing storage pages */ if (obj->madv == __I915_MADV_PURGED) i915_gem_object_truncate(obj); } return ret; } /** * Reads data from the object referenced by handle. * * On error, the contents of *data are undefined. */ int i915_gem_pread_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_gem_pread *args = data; struct drm_i915_gem_object *obj; int ret = 0; if (args->size == 0) return 0; if (!useracc(to_user_ptr(args->data_ptr), args->size, VM_PROT_WRITE)) return -EFAULT; ret = i915_mutex_lock_interruptible(dev); if (ret) return ret; obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle)); if (&obj->base == NULL) { ret = -ENOENT; goto unlock; } /* Bounds check source. */ if (args->offset > obj->base.size || args->size > obj->base.size - args->offset) { ret = -EINVAL; goto out; } #ifdef FREEBSD_WIP /* prime objects have no backing filp to GEM pread/pwrite * pages from. */ if (!obj->base.filp) { ret = -EINVAL; goto out; } #endif /* FREEBSD_WIP */ CTR3(KTR_DRM, "pread %p %jx %jx", obj, args->offset, args->size); ret = i915_gem_shmem_pread(dev, obj, args, file); out: drm_gem_object_unreference(&obj->base); unlock: DRM_UNLOCK(dev); return ret; } /* This is the fast write path which cannot handle * page faults in the source data */ static inline int fast_user_write(vm_paddr_t mapping_addr, off_t page_base, int page_offset, char __user *user_data, int length) { void __iomem *vaddr_atomic; void *vaddr; unsigned long unwritten; vaddr_atomic = pmap_mapdev_attr(mapping_addr + page_base, length, PAT_WRITE_COMBINING); /* We can use the cpu mem copy function because this is X86. */ vaddr = (char __force*)vaddr_atomic + page_offset; unwritten = __copy_from_user_inatomic_nocache(vaddr, user_data, length); pmap_unmapdev((vm_offset_t)vaddr_atomic, length); return unwritten; } /** * This is the fast pwrite path, where we copy the data directly from the * user into the GTT, uncached. */ static int i915_gem_gtt_pwrite_fast(struct drm_device *dev, struct drm_i915_gem_object *obj, struct drm_i915_gem_pwrite *args, struct drm_file *file) { drm_i915_private_t *dev_priv = dev->dev_private; ssize_t remain; off_t offset, page_base; char __user *user_data; int page_offset, page_length, ret; ret = i915_gem_object_pin(obj, 0, true, true); if (ret) goto out; ret = i915_gem_object_set_to_gtt_domain(obj, true); if (ret) goto out_unpin; ret = i915_gem_object_put_fence(obj); if (ret) goto out_unpin; user_data = to_user_ptr(args->data_ptr); remain = args->size; offset = obj->gtt_offset + args->offset; while (remain > 0) { /* Operation in this page * * page_base = page offset within aperture * page_offset = offset within page * page_length = bytes to copy for this page */ page_base = offset & ~PAGE_MASK; page_offset = offset_in_page(offset); page_length = remain; if ((page_offset + remain) > PAGE_SIZE) page_length = PAGE_SIZE - page_offset; /* If we get a fault while copying data, then (presumably) our * source page isn't available. Return the error and we'll * retry in the slow path. */ if (fast_user_write(dev_priv->mm.gtt_base_addr, page_base, page_offset, user_data, page_length)) { ret = -EFAULT; goto out_unpin; } remain -= page_length; user_data += page_length; offset += page_length; } out_unpin: i915_gem_object_unpin(obj); out: return ret; } /* Per-page copy function for the shmem pwrite fastpath. * Flushes invalid cachelines before writing to the target if * needs_clflush_before is set and flushes out any written cachelines after * writing if needs_clflush is set. */ static int shmem_pwrite_fast(vm_page_t page, int shmem_page_offset, int page_length, char __user *user_data, bool page_do_bit17_swizzling, bool needs_clflush_before, bool needs_clflush_after) { char *vaddr; struct sf_buf *sf; int ret; if (unlikely(page_do_bit17_swizzling)) return -EINVAL; sched_pin(); sf = sf_buf_alloc(page, SFB_NOWAIT | SFB_CPUPRIVATE); if (sf == NULL) { sched_unpin(); return (-EFAULT); } vaddr = (char *)sf_buf_kva(sf); if (needs_clflush_before) drm_clflush_virt_range(vaddr + shmem_page_offset, page_length); ret = __copy_from_user_inatomic_nocache(vaddr + shmem_page_offset, user_data, page_length); if (needs_clflush_after) drm_clflush_virt_range(vaddr + shmem_page_offset, page_length); sf_buf_free(sf); sched_unpin(); return ret ? -EFAULT : 0; } /* Only difference to the fast-path function is that this can handle bit17 * and uses non-atomic copy and kmap functions. */ static int shmem_pwrite_slow(vm_page_t page, int shmem_page_offset, int page_length, char __user *user_data, bool page_do_bit17_swizzling, bool needs_clflush_before, bool needs_clflush_after) { char *vaddr; struct sf_buf *sf; int ret; sf = sf_buf_alloc(page, 0); vaddr = (char *)sf_buf_kva(sf); if (unlikely(needs_clflush_before || page_do_bit17_swizzling)) shmem_clflush_swizzled_range(vaddr + shmem_page_offset, page_length, page_do_bit17_swizzling); if (page_do_bit17_swizzling) ret = __copy_from_user_swizzled(vaddr, shmem_page_offset, user_data, page_length); else ret = __copy_from_user(vaddr + shmem_page_offset, user_data, page_length); if (needs_clflush_after) shmem_clflush_swizzled_range(vaddr + shmem_page_offset, page_length, page_do_bit17_swizzling); sf_buf_free(sf); return ret ? -EFAULT : 0; } static int i915_gem_shmem_pwrite(struct drm_device *dev, struct drm_i915_gem_object *obj, struct drm_i915_gem_pwrite *args, struct drm_file *file) { ssize_t remain; off_t offset; char __user *user_data; int shmem_page_offset, page_length, ret = 0; int obj_do_bit17_swizzling, page_do_bit17_swizzling; int hit_slowpath = 0; int needs_clflush_after = 0; int needs_clflush_before = 0; user_data = to_user_ptr(args->data_ptr); remain = args->size; obj_do_bit17_swizzling = i915_gem_object_needs_bit17_swizzle(obj); if (obj->base.write_domain != I915_GEM_DOMAIN_CPU) { /* If we're not in the cpu write domain, set ourself into the gtt * write domain and manually flush cachelines (if required). This * optimizes for the case when the gpu will use the data * right away and we therefore have to clflush anyway. */ if (obj->cache_level == I915_CACHE_NONE) needs_clflush_after = 1; if (obj->gtt_space) { ret = i915_gem_object_set_to_gtt_domain(obj, true); if (ret) return ret; } } /* Same trick applies for invalidate partially written cachelines before * writing. */ if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU) && obj->cache_level == I915_CACHE_NONE) needs_clflush_before = 1; ret = i915_gem_object_get_pages(obj); if (ret) return ret; i915_gem_object_pin_pages(obj); offset = args->offset; obj->dirty = 1; VM_OBJECT_WLOCK(obj->base.vm_obj); for (vm_page_t page = vm_page_find_least(obj->base.vm_obj, OFF_TO_IDX(offset));; page = vm_page_next(page)) { VM_OBJECT_WUNLOCK(obj->base.vm_obj); int partial_cacheline_write; if (remain <= 0) break; /* Operation in this page * * shmem_page_offset = offset within page in shmem file * page_length = bytes to copy for this page */ shmem_page_offset = offset_in_page(offset); page_length = remain; if ((shmem_page_offset + page_length) > PAGE_SIZE) page_length = PAGE_SIZE - shmem_page_offset; /* If we don't overwrite a cacheline completely we need to be * careful to have up-to-date data by first clflushing. Don't * overcomplicate things and flush the entire patch. */ partial_cacheline_write = needs_clflush_before && ((shmem_page_offset | page_length) & (cpu_clflush_line_size - 1)); page_do_bit17_swizzling = obj_do_bit17_swizzling && (page_to_phys(page) & (1 << 17)) != 0; ret = shmem_pwrite_fast(page, shmem_page_offset, page_length, user_data, page_do_bit17_swizzling, partial_cacheline_write, needs_clflush_after); if (ret == 0) goto next_page; hit_slowpath = 1; DRM_UNLOCK(dev); ret = shmem_pwrite_slow(page, shmem_page_offset, page_length, user_data, page_do_bit17_swizzling, partial_cacheline_write, needs_clflush_after); DRM_LOCK(dev); next_page: vm_page_dirty(page); vm_page_reference(page); if (ret) goto out; remain -= page_length; user_data += page_length; offset += page_length; VM_OBJECT_WLOCK(obj->base.vm_obj); } out: i915_gem_object_unpin_pages(obj); if (hit_slowpath) { /* Fixup: Kill any reinstated backing storage pages */ if (obj->madv == __I915_MADV_PURGED) i915_gem_object_truncate(obj); /* and flush dirty cachelines in case the object isn't in the cpu write * domain anymore. */ if (obj->base.write_domain != I915_GEM_DOMAIN_CPU) { i915_gem_clflush_object(obj); i915_gem_chipset_flush(dev); } } if (needs_clflush_after) i915_gem_chipset_flush(dev); return ret; } /** * Writes data to the object referenced by handle. * * On error, the contents of the buffer that were to be modified are undefined. */ int i915_gem_pwrite_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_gem_pwrite *args = data; struct drm_i915_gem_object *obj; int ret; if (args->size == 0) return 0; if (!useracc(to_user_ptr(args->data_ptr), args->size, VM_PROT_READ)) return -EFAULT; ret = fault_in_multipages_readable(to_user_ptr(args->data_ptr), args->size); if (ret) return -EFAULT; ret = i915_mutex_lock_interruptible(dev); if (ret) return ret; obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle)); if (&obj->base == NULL) { ret = -ENOENT; goto unlock; } /* Bounds check destination. */ if (args->offset > obj->base.size || args->size > obj->base.size - args->offset) { ret = -EINVAL; goto out; } #ifdef FREEBSD_WIP /* prime objects have no backing filp to GEM pread/pwrite * pages from. */ if (!obj->base.filp) { ret = -EINVAL; goto out; } #endif /* FREEBSD_WIP */ CTR3(KTR_DRM, "pwrite %p %jx %jx", obj, args->offset, args->size); ret = -EFAULT; /* We can only do the GTT pwrite on untiled buffers, as otherwise * it would end up going through the fenced access, and we'll get * different detiling behavior between reading and writing. * pread/pwrite currently are reading and writing from the CPU * perspective, requiring manual detiling by the client. */ if (obj->phys_obj) { ret = i915_gem_phys_pwrite(dev, obj, args, file); goto out; } if (obj->cache_level == I915_CACHE_NONE && obj->tiling_mode == I915_TILING_NONE && obj->base.write_domain != I915_GEM_DOMAIN_CPU) { ret = i915_gem_gtt_pwrite_fast(dev, obj, args, file); /* Note that the gtt paths might fail with non-page-backed user * pointers (e.g. gtt mappings when moving data between * textures). Fallback to the shmem path in that case. */ } if (ret == -EFAULT || ret == -ENOSPC) ret = i915_gem_shmem_pwrite(dev, obj, args, file); out: drm_gem_object_unreference(&obj->base); unlock: DRM_UNLOCK(dev); return ret; } int i915_gem_check_wedge(struct drm_i915_private *dev_priv, bool interruptible) { if (atomic_read(&dev_priv->mm.wedged)) { struct completion *x = &dev_priv->error_completion; bool recovery_complete; /* Give the error handler a chance to run. */ mtx_lock(&x->lock); recovery_complete = x->done > 0; mtx_unlock(&x->lock); /* Non-interruptible callers can't handle -EAGAIN, hence return * -EIO unconditionally for these. */ if (!interruptible) return -EIO; /* Recovery complete, but still wedged means reset failure. */ if (recovery_complete) return -EIO; return -EAGAIN; } return 0; } /* * Compare seqno against outstanding lazy request. Emit a request if they are * equal. */ static int i915_gem_check_olr(struct intel_ring_buffer *ring, u32 seqno) { int ret; DRM_LOCK_ASSERT(ring->dev); ret = 0; if (seqno == ring->outstanding_lazy_request) ret = i915_add_request(ring, NULL, NULL); return ret; } /** * __wait_seqno - wait until execution of seqno has finished * @ring: the ring expected to report seqno * @seqno: duh! * @interruptible: do an interruptible wait (normally yes) * @timeout: in - how long to wait (NULL forever); out - how much time remaining * * Returns 0 if the seqno was found within the alloted time. Else returns the * errno with remaining time filled in timeout argument. */ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno, bool interruptible, struct timespec *timeout) { drm_i915_private_t *dev_priv = ring->dev->dev_private; struct timespec before, now, wait_time={1,0}; sbintime_t timeout_sbt; long end; bool wait_forever = true; int ret, flags; if (i915_seqno_passed(ring->get_seqno(ring, true), seqno)) return 0; CTR2(KTR_DRM, "request_wait_begin %s %d", ring->name, seqno); if (timeout != NULL) { wait_time = *timeout; wait_forever = false; } timeout_sbt = tstosbt(wait_time); if (WARN_ON(!ring->irq_get(ring))) return -ENODEV; /* Record current time in case interrupted by signal, or wedged * */ getrawmonotonic(&before); #define EXIT_COND \ (i915_seqno_passed(ring->get_seqno(ring, false), seqno) || \ atomic_read(&dev_priv->mm.wedged)) flags = interruptible ? PCATCH : 0; mtx_lock(&dev_priv->irq_lock); do { if (EXIT_COND) { end = 1; } else { ret = -msleep_sbt(&ring->irq_queue, &dev_priv->irq_lock, flags, "915gwr", timeout_sbt, 0, 0); /* * NOTE Linux<->FreeBSD: Convert msleep_sbt() return * value to something close to wait_event*_timeout() * functions used on Linux. * * >0 -> condition is true (end = time remaining) * =0 -> sleep timed out * <0 -> error (interrupted) * * We fake the remaining time by returning 1. We * compute a proper value later. */ if (EXIT_COND) /* We fake a remaining time of 1 tick. */ end = 1; else if (ret == -EINTR || ret == -ERESTART) /* Interrupted. */ end = -ERESTARTSYS; else /* Timeout. */ end = 0; } ret = i915_gem_check_wedge(dev_priv, interruptible); if (ret) end = ret; } while (end == 0 && wait_forever); mtx_unlock(&dev_priv->irq_lock); getrawmonotonic(&now); ring->irq_put(ring); CTR3(KTR_DRM, "request_wait_end %s %d %d", ring->name, seqno, end); #undef EXIT_COND if (timeout) { - timespecsub(&now, &before); - timespecsub(timeout, &now); + timespecsub(&now, &before, &now); + timespecsub(timeout, &now, timeout); } switch (end) { case -EIO: case -EAGAIN: /* Wedged */ case -ERESTARTSYS: /* Signal */ case -ETIMEDOUT: /* Timeout */ return (int)end; case 0: /* Timeout */ return -ETIMEDOUT; default: /* Completed */ WARN_ON(end < 0); /* We're not aware of other errors */ return 0; } } /** * Waits for a sequence number to be signaled, and cleans up the * request and object lists appropriately for that event. */ int i915_wait_seqno(struct intel_ring_buffer *ring, uint32_t seqno) { struct drm_device *dev = ring->dev; struct drm_i915_private *dev_priv = dev->dev_private; bool interruptible = dev_priv->mm.interruptible; int ret; DRM_LOCK_ASSERT(dev); BUG_ON(seqno == 0); ret = i915_gem_check_wedge(dev_priv, interruptible); if (ret) return ret; ret = i915_gem_check_olr(ring, seqno); if (ret) return ret; return __wait_seqno(ring, seqno, interruptible, NULL); } /** * Ensures that all rendering to the object has completed and the object is * safe to unbind from the GTT or access from the CPU. */ static __must_check int i915_gem_object_wait_rendering(struct drm_i915_gem_object *obj, bool readonly) { struct intel_ring_buffer *ring = obj->ring; u32 seqno; int ret; seqno = readonly ? obj->last_write_seqno : obj->last_read_seqno; if (seqno == 0) return 0; ret = i915_wait_seqno(ring, seqno); if (ret) return ret; i915_gem_retire_requests_ring(ring); /* Manually manage the write flush as we may have not yet * retired the buffer. */ if (obj->last_write_seqno && i915_seqno_passed(seqno, obj->last_write_seqno)) { obj->last_write_seqno = 0; obj->base.write_domain &= ~I915_GEM_GPU_DOMAINS; } return 0; } /* A nonblocking variant of the above wait. This is a highly dangerous routine * as the object state may change during this call. */ static __must_check int i915_gem_object_wait_rendering__nonblocking(struct drm_i915_gem_object *obj, bool readonly) { struct drm_device *dev = obj->base.dev; struct drm_i915_private *dev_priv = dev->dev_private; struct intel_ring_buffer *ring = obj->ring; u32 seqno; int ret; DRM_LOCK_ASSERT(dev); BUG_ON(!dev_priv->mm.interruptible); seqno = readonly ? obj->last_write_seqno : obj->last_read_seqno; if (seqno == 0) return 0; ret = i915_gem_check_wedge(dev_priv, true); if (ret) return ret; ret = i915_gem_check_olr(ring, seqno); if (ret) return ret; DRM_UNLOCK(dev); ret = __wait_seqno(ring, seqno, true, NULL); DRM_LOCK(dev); i915_gem_retire_requests_ring(ring); /* Manually manage the write flush as we may have not yet * retired the buffer. */ if (ret == 0 && obj->last_write_seqno && i915_seqno_passed(seqno, obj->last_write_seqno)) { obj->last_write_seqno = 0; obj->base.write_domain &= ~I915_GEM_GPU_DOMAINS; } return ret; } /** * Called when user space prepares to use an object with the CPU, either * through the mmap ioctl's mapping or a GTT mapping. */ int i915_gem_set_domain_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_gem_set_domain *args = data; struct drm_i915_gem_object *obj; uint32_t read_domains = args->read_domains; uint32_t write_domain = args->write_domain; int ret; /* Only handle setting domains to types used by the CPU. */ if (write_domain & I915_GEM_GPU_DOMAINS) return -EINVAL; if (read_domains & I915_GEM_GPU_DOMAINS) return -EINVAL; /* Having something in the write domain implies it's in the read * domain, and only that read domain. Enforce that in the request. */ if (write_domain != 0 && read_domains != write_domain) return -EINVAL; ret = i915_mutex_lock_interruptible(dev); if (ret) return ret; obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle)); if (&obj->base == NULL) { ret = -ENOENT; goto unlock; } /* Try to flush the object off the GPU without holding the lock. * We will repeat the flush holding the lock in the normal manner * to catch cases where we are gazumped. */ ret = i915_gem_object_wait_rendering__nonblocking(obj, !write_domain); if (ret) goto unref; if (read_domains & I915_GEM_DOMAIN_GTT) { ret = i915_gem_object_set_to_gtt_domain(obj, write_domain != 0); /* Silently promote "you're not bound, there was nothing to do" * to success, since the client was just asking us to * make sure everything was done. */ if (ret == -EINVAL) ret = 0; } else { ret = i915_gem_object_set_to_cpu_domain(obj, write_domain != 0); } unref: drm_gem_object_unreference(&obj->base); unlock: DRM_UNLOCK(dev); return ret; } /** * Called when user space has done writes to this buffer */ int i915_gem_sw_finish_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_gem_sw_finish *args = data; struct drm_i915_gem_object *obj; int ret = 0; ret = i915_mutex_lock_interruptible(dev); if (ret) return ret; obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle)); if (&obj->base == NULL) { ret = -ENOENT; goto unlock; } /* Pinned buffers may be scanout, so flush the cache */ if (obj->pin_count) i915_gem_object_flush_cpu_write_domain(obj); drm_gem_object_unreference(&obj->base); unlock: DRM_UNLOCK(dev); return ret; } /** * Maps the contents of an object, returning the address it is mapped * into. * * While the mapping holds a reference on the contents of the object, it doesn't * imply a ref on the object itself. */ int i915_gem_mmap_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_gem_mmap *args = data; struct drm_gem_object *obj; struct proc *p; vm_map_t map; vm_offset_t addr; vm_size_t size; int error, rv; obj = drm_gem_object_lookup(dev, file, args->handle); if (obj == NULL) return -ENOENT; #ifdef FREEBSD_WIP /* prime objects have no backing filp to GEM mmap * pages from. */ if (!obj->filp) { drm_gem_object_unreference_unlocked(obj); return -EINVAL; } #endif /* FREEBSD_WIP */ error = 0; if (args->size == 0) goto out; p = curproc; map = &p->p_vmspace->vm_map; size = round_page(args->size); PROC_LOCK(p); if (map->size + size > lim_cur_proc(p, RLIMIT_VMEM)) { PROC_UNLOCK(p); error = -ENOMEM; goto out; } PROC_UNLOCK(p); addr = 0; vm_object_reference(obj->vm_obj); rv = vm_map_find(map, obj->vm_obj, args->offset, &addr, args->size, 0, VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, MAP_INHERIT_SHARE); if (rv != KERN_SUCCESS) { vm_object_deallocate(obj->vm_obj); error = -vm_mmap_to_errno(rv); } else { args->addr_ptr = (uint64_t)addr; } out: drm_gem_object_unreference_unlocked(obj); return (error); } static int i915_gem_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot, vm_ooffset_t foff, struct ucred *cred, u_short *color) { /* * NOTE Linux<->FreeBSD: drm_gem_mmap_single() takes care of * calling drm_gem_object_reference(). That's why we don't * do this here. i915_gem_pager_dtor(), below, will call * drm_gem_object_unreference(). * * On Linux, drm_gem_vm_open() references the object because * it's called the mapping is copied. drm_gem_vm_open() is not * called when the mapping is created. So the possible sequences * are: * 1. drm_gem_mmap(): ref++ * 2. drm_gem_vm_close(): ref-- * * 1. drm_gem_mmap(): ref++ * 2. drm_gem_vm_open(): ref++ (for the copied vma) * 3. drm_gem_vm_close(): ref-- (for the copied vma) * 4. drm_gem_vm_close(): ref-- (for the initial vma) * * On FreeBSD, i915_gem_pager_ctor() is called once during the * creation of the mapping. No callback is called when the * mapping is shared during a fork(). i915_gem_pager_dtor() is * called when the last reference to the mapping is dropped. So * the only sequence is: * 1. drm_gem_mmap_single(): ref++ * 2. i915_gem_pager_ctor(): * 3. i915_gem_pager_dtor(): ref-- */ *color = 0; /* XXXKIB */ return (0); } /** * i915_gem_fault - fault a page into the GTT * vma: VMA in question * vmf: fault info * * The fault handler is set up by drm_gem_mmap() when a object is GTT mapped * from userspace. The fault handler takes care of binding the object to * the GTT (if needed), allocating and programming a fence register (again, * only if needed based on whether the old reg is still valid or the object * is tiled) and inserting a new PTE into the faulting process. * * Note that the faulting process may involve evicting existing objects * from the GTT and/or fence registers to make room. So performance may * suffer if the GTT working set is large or there are few fence registers * left. */ int i915_intr_pf; static int i915_gem_pager_populate(vm_object_t vm_obj, vm_pindex_t pidx, int fault_type, vm_prot_t max_prot, vm_pindex_t *first, vm_pindex_t *last) { struct drm_gem_object *gem_obj = vm_obj->handle; struct drm_i915_gem_object *obj = to_intel_bo(gem_obj); struct drm_device *dev = obj->base.dev; drm_i915_private_t *dev_priv = dev->dev_private; vm_page_t page; int ret = 0; bool write = (max_prot & VM_PROT_WRITE) != 0; bool pinned; VM_OBJECT_WUNLOCK(vm_obj); retry: ret = 0; pinned = 0; page = NULL; if (i915_intr_pf) { ret = i915_mutex_lock_interruptible(dev); if (ret != 0) goto out; } else DRM_LOCK(dev); /* * Since the object lock was dropped, other thread might have * faulted on the same GTT address and instantiated the * mapping for the page. Recheck. */ VM_OBJECT_WLOCK(vm_obj); page = vm_page_lookup(vm_obj, pidx); if (page != NULL) { if (vm_page_busied(page)) { DRM_UNLOCK(dev); vm_page_lock(page); VM_OBJECT_WUNLOCK(vm_obj); vm_page_busy_sleep(page, "915pee", false); goto retry; } goto have_page; } else VM_OBJECT_WUNLOCK(vm_obj); /* Now bind it into the GTT if needed */ ret = i915_gem_object_pin(obj, 0, true, false); if (ret) goto unlock; pinned = 1; ret = i915_gem_object_set_to_gtt_domain(obj, write); if (ret) goto unpin; ret = i915_gem_object_get_fence(obj); if (ret) goto unpin; obj->fault_mappable = true; page = PHYS_TO_VM_PAGE(dev_priv->mm.gtt_base_addr + obj->gtt_offset + IDX_TO_OFF(pidx)); if (page == NULL) { ret = -EFAULT; goto unpin; } KASSERT((page->flags & PG_FICTITIOUS) != 0, ("physical address %#jx not fictitious, page %p", (uintmax_t)(dev_priv->mm.gtt_base_addr + obj->gtt_offset + IDX_TO_OFF(pidx)), page)); KASSERT(page->wire_count == 1, ("wire_count not 1 %p", page)); VM_OBJECT_WLOCK(vm_obj); if (vm_page_busied(page)) { i915_gem_object_unpin(obj); DRM_UNLOCK(dev); vm_page_lock(page); VM_OBJECT_WUNLOCK(vm_obj); vm_page_busy_sleep(page, "915pbs", false); goto retry; } if (vm_page_insert(page, vm_obj, pidx)) { i915_gem_object_unpin(obj); DRM_UNLOCK(dev); VM_OBJECT_WUNLOCK(vm_obj); vm_wait(vm_obj); goto retry; } page->valid = VM_PAGE_BITS_ALL; have_page: vm_page_xbusy(page); CTR4(KTR_DRM, "fault %p %jx %x phys %x", gem_obj, pidx, fault_type, page->phys_addr); if (pinned) { /* * We may have not pinned the object if the page was * found by the call to vm_page_lookup(). */ i915_gem_object_unpin(obj); } DRM_UNLOCK(dev); *first = *last = pidx; return (VM_PAGER_OK); unpin: i915_gem_object_unpin(obj); unlock: DRM_UNLOCK(dev); out: KASSERT(ret != 0, ("i915_gem_pager_fault: wrong return")); CTR4(KTR_DRM, "fault_fail %p %jx %x err %d", gem_obj, pidx, fault_type, -ret); if (ret == -ERESTARTSYS) { /* * NOTE Linux<->FreeBSD: Convert Linux' -ERESTARTSYS to * the more common -EINTR, so the page fault is retried. */ ret = -EINTR; } if (ret == -EAGAIN || ret == -EIO || ret == -EINTR) { kern_yield(PRI_USER); goto retry; } VM_OBJECT_WLOCK(vm_obj); return (VM_PAGER_ERROR); } static void i915_gem_pager_dtor(void *handle) { struct drm_gem_object *obj = handle; struct drm_device *dev = obj->dev; DRM_LOCK(dev); drm_gem_object_unreference(obj); DRM_UNLOCK(dev); } struct cdev_pager_ops i915_gem_pager_ops = { .cdev_pg_populate = i915_gem_pager_populate, .cdev_pg_ctor = i915_gem_pager_ctor, .cdev_pg_dtor = i915_gem_pager_dtor, }; /** * i915_gem_release_mmap - remove physical page mappings * @obj: obj in question * * Preserve the reservation of the mmapping with the DRM core code, but * relinquish ownership of the pages back to the system. * * It is vital that we remove the page mapping if we have mapped a tiled * object through the GTT and then lose the fence register due to * resource pressure. Similarly if the object has been moved out of the * aperture, than pages mapped into userspace must be revoked. Removing the * mapping will then trigger a page fault on the next user access, allowing * fixup by i915_gem_fault(). */ void i915_gem_release_mmap(struct drm_i915_gem_object *obj) { vm_object_t devobj; vm_page_t page; int i, page_count; if (!obj->fault_mappable) return; CTR3(KTR_DRM, "release_mmap %p %x %x", obj, obj->gtt_offset, OFF_TO_IDX(obj->base.size)); devobj = cdev_pager_lookup(obj); if (devobj != NULL) { page_count = OFF_TO_IDX(obj->base.size); VM_OBJECT_WLOCK(devobj); retry: for (i = 0; i < page_count; i++) { page = vm_page_lookup(devobj, i); if (page == NULL) continue; if (vm_page_sleep_if_busy(page, "915unm")) goto retry; cdev_pager_free_page(devobj, page); } VM_OBJECT_WUNLOCK(devobj); vm_object_deallocate(devobj); } obj->fault_mappable = false; } static uint32_t i915_gem_get_gtt_size(struct drm_device *dev, uint32_t size, int tiling_mode) { uint32_t gtt_size; if (INTEL_INFO(dev)->gen >= 4 || tiling_mode == I915_TILING_NONE) return size; /* Previous chips need a power-of-two fence region when tiling */ if (INTEL_INFO(dev)->gen == 3) gtt_size = 1024*1024; else gtt_size = 512*1024; while (gtt_size < size) gtt_size <<= 1; return gtt_size; } /** * i915_gem_get_gtt_alignment - return required GTT alignment for an object * @obj: object to check * * Return the required GTT alignment for an object, taking into account * potential fence register mapping. */ static uint32_t i915_gem_get_gtt_alignment(struct drm_device *dev, uint32_t size, int tiling_mode) { /* * Minimum alignment is 4k (GTT page size), but might be greater * if a fence register is needed for the object. */ if (INTEL_INFO(dev)->gen >= 4 || tiling_mode == I915_TILING_NONE) return 4096; /* * Previous chips need to be aligned to the size of the smallest * fence register that can contain the object. */ return i915_gem_get_gtt_size(dev, size, tiling_mode); } /** * i915_gem_get_unfenced_gtt_alignment - return required GTT alignment for an * unfenced object * @dev: the device * @size: size of the object * @tiling_mode: tiling mode of the object * * Return the required GTT alignment for an object, only taking into account * unfenced tiled surface requirements. */ uint32_t i915_gem_get_unfenced_gtt_alignment(struct drm_device *dev, uint32_t size, int tiling_mode) { /* * Minimum alignment is 4k (GTT page size) for sane hw. */ if (INTEL_INFO(dev)->gen >= 4 || IS_G33(dev) || tiling_mode == I915_TILING_NONE) return 4096; /* Previous hardware however needs to be aligned to a power-of-two * tile height. The simplest method for determining this is to reuse * the power-of-tile object size. */ return i915_gem_get_gtt_size(dev, size, tiling_mode); } static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj) { struct drm_i915_private *dev_priv = obj->base.dev->dev_private; int ret; if (obj->base.on_map) return 0; dev_priv->mm.shrinker_no_lock_stealing = true; ret = drm_gem_create_mmap_offset(&obj->base); if (ret != -ENOSPC) goto out; /* Badly fragmented mmap space? The only way we can recover * space is by destroying unwanted objects. We can't randomly release * mmap_offsets as userspace expects them to be persistent for the * lifetime of the objects. The closest we can is to release the * offsets on purgeable objects by truncating it and marking it purged, * which prevents userspace from ever using that object again. */ i915_gem_purge(dev_priv, obj->base.size >> PAGE_SHIFT); ret = drm_gem_create_mmap_offset(&obj->base); if (ret != -ENOSPC) goto out; i915_gem_shrink_all(dev_priv); ret = drm_gem_create_mmap_offset(&obj->base); out: dev_priv->mm.shrinker_no_lock_stealing = false; return ret; } static void i915_gem_object_free_mmap_offset(struct drm_i915_gem_object *obj) { if (!obj->base.on_map) return; drm_gem_free_mmap_offset(&obj->base); } int i915_gem_mmap_gtt(struct drm_file *file, struct drm_device *dev, uint32_t handle, uint64_t *offset) { struct drm_i915_private *dev_priv = dev->dev_private; struct drm_i915_gem_object *obj; int ret; ret = i915_mutex_lock_interruptible(dev); if (ret) return ret; obj = to_intel_bo(drm_gem_object_lookup(dev, file, handle)); if (&obj->base == NULL) { ret = -ENOENT; goto unlock; } if (obj->base.size > dev_priv->mm.gtt_mappable_end) { ret = -E2BIG; goto out; } if (obj->madv != I915_MADV_WILLNEED) { DRM_ERROR("Attempting to mmap a purgeable buffer\n"); ret = -EINVAL; goto out; } ret = i915_gem_object_create_mmap_offset(obj); if (ret) goto out; *offset = DRM_GEM_MAPPING_OFF(obj->base.map_list.key) | DRM_GEM_MAPPING_KEY; out: drm_gem_object_unreference(&obj->base); unlock: DRM_UNLOCK(dev); return ret; } /** * i915_gem_mmap_gtt_ioctl - prepare an object for GTT mmap'ing * @dev: DRM device * @data: GTT mapping ioctl data * @file: GEM object info * * Simply returns the fake offset to userspace so it can mmap it. * The mmap call will end up in drm_gem_mmap(), which will set things * up so we can get faults in the handler above. * * The fault handler will take care of binding the object into the GTT * (since it may have been evicted to make room for something), allocating * a fence register, and mapping the appropriate aperture address into * userspace. */ int i915_gem_mmap_gtt_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_gem_mmap_gtt *args = data; return i915_gem_mmap_gtt(file, dev, args->handle, &args->offset); } /* Immediately discard the backing storage */ static void i915_gem_object_truncate(struct drm_i915_gem_object *obj) { vm_object_t vm_obj; vm_obj = obj->base.vm_obj; VM_OBJECT_WLOCK(vm_obj); vm_object_page_remove(vm_obj, 0, 0, false); VM_OBJECT_WUNLOCK(vm_obj); i915_gem_object_free_mmap_offset(obj); obj->madv = __I915_MADV_PURGED; } static inline int i915_gem_object_is_purgeable(struct drm_i915_gem_object *obj) { return obj->madv == I915_MADV_DONTNEED; } static void i915_gem_object_put_pages_range_locked(struct drm_i915_gem_object *obj, vm_pindex_t si, vm_pindex_t ei) { vm_object_t vm_obj; vm_page_t page; vm_pindex_t i; vm_obj = obj->base.vm_obj; VM_OBJECT_ASSERT_LOCKED(vm_obj); for (i = si, page = vm_page_lookup(vm_obj, i); i < ei; page = vm_page_next(page), i++) { KASSERT(page->pindex == i, ("pindex %jx %jx", (uintmax_t)page->pindex, (uintmax_t)i)); vm_page_lock(page); if (vm_page_unwire(page, PQ_INACTIVE)) atomic_add_long(&i915_gem_wired_pages_cnt, -1); vm_page_unlock(page); } } #define GEM_PARANOID_CHECK_GTT 0 #if GEM_PARANOID_CHECK_GTT static void i915_gem_assert_pages_not_mapped(struct drm_device *dev, vm_page_t *ma, int page_count) { struct drm_i915_private *dev_priv; vm_paddr_t pa; unsigned long start, end; u_int i; int j; dev_priv = dev->dev_private; start = OFF_TO_IDX(dev_priv->mm.gtt_start); end = OFF_TO_IDX(dev_priv->mm.gtt_end); for (i = start; i < end; i++) { pa = intel_gtt_read_pte_paddr(i); for (j = 0; j < page_count; j++) { if (pa == VM_PAGE_TO_PHYS(ma[j])) { panic("Page %p in GTT pte index %d pte %x", ma[i], i, intel_gtt_read_pte(i)); } } } } #endif static void i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj) { int page_count = obj->base.size / PAGE_SIZE; int ret, i; BUG_ON(obj->madv == __I915_MADV_PURGED); ret = i915_gem_object_set_to_cpu_domain(obj, true); if (ret) { /* In the event of a disaster, abandon all caches and * hope for the best. */ WARN_ON(ret != -EIO); i915_gem_clflush_object(obj); obj->base.read_domains = obj->base.write_domain = I915_GEM_DOMAIN_CPU; } if (i915_gem_object_needs_bit17_swizzle(obj)) i915_gem_object_save_bit_17_swizzle(obj); if (obj->madv == I915_MADV_DONTNEED) obj->dirty = 0; VM_OBJECT_WLOCK(obj->base.vm_obj); #if GEM_PARANOID_CHECK_GTT i915_gem_assert_pages_not_mapped(obj->base.dev, obj->pages, page_count); #endif for (i = 0; i < page_count; i++) { vm_page_t page = obj->pages[i]; if (obj->dirty) vm_page_dirty(page); if (obj->madv == I915_MADV_WILLNEED) vm_page_reference(page); vm_page_lock(page); vm_page_unwire(obj->pages[i], PQ_ACTIVE); vm_page_unlock(page); atomic_add_long(&i915_gem_wired_pages_cnt, -1); } VM_OBJECT_WUNLOCK(obj->base.vm_obj); obj->dirty = 0; free(obj->pages, DRM_I915_GEM); obj->pages = NULL; } static int i915_gem_object_put_pages(struct drm_i915_gem_object *obj) { const struct drm_i915_gem_object_ops *ops = obj->ops; if (obj->pages == NULL) return 0; BUG_ON(obj->gtt_space); if (obj->pages_pin_count) return -EBUSY; /* ->put_pages might need to allocate memory for the bit17 swizzle * array, hence protect them from being reaped by removing them from gtt * lists early. */ list_del(&obj->gtt_list); ops->put_pages(obj); obj->pages = NULL; if (i915_gem_object_is_purgeable(obj)) i915_gem_object_truncate(obj); return 0; } static long __i915_gem_shrink(struct drm_i915_private *dev_priv, long target, bool purgeable_only) { struct drm_i915_gem_object *obj, *next; long count = 0; list_for_each_entry_safe(obj, next, &dev_priv->mm.unbound_list, gtt_list) { if ((i915_gem_object_is_purgeable(obj) || !purgeable_only) && i915_gem_object_put_pages(obj) == 0) { count += obj->base.size >> PAGE_SHIFT; if (target != -1 && count >= target) return count; } } list_for_each_entry_safe(obj, next, &dev_priv->mm.inactive_list, mm_list) { if ((i915_gem_object_is_purgeable(obj) || !purgeable_only) && i915_gem_object_unbind(obj) == 0 && i915_gem_object_put_pages(obj) == 0) { count += obj->base.size >> PAGE_SHIFT; if (target != -1 && count >= target) return count; } } return count; } static long i915_gem_purge(struct drm_i915_private *dev_priv, long target) { return __i915_gem_shrink(dev_priv, target, true); } static void i915_gem_shrink_all(struct drm_i915_private *dev_priv) { struct drm_i915_gem_object *obj, *next; i915_gem_evict_everything(dev_priv->dev); list_for_each_entry_safe(obj, next, &dev_priv->mm.unbound_list, gtt_list) i915_gem_object_put_pages(obj); } static int i915_gem_object_get_pages_range(struct drm_i915_gem_object *obj, off_t start, off_t end) { vm_object_t vm_obj; vm_page_t page; vm_pindex_t si, ei, i; bool need_swizzle, fresh; need_swizzle = i915_gem_object_needs_bit17_swizzle(obj) != 0; vm_obj = obj->base.vm_obj; si = OFF_TO_IDX(trunc_page(start)); ei = OFF_TO_IDX(round_page(end)); VM_OBJECT_WLOCK(vm_obj); for (i = si; i < ei; i++) { page = i915_gem_wire_page(vm_obj, i, &fresh); if (page == NULL) goto failed; if (need_swizzle && fresh) i915_gem_object_do_bit_17_swizzle_page(obj, page); } VM_OBJECT_WUNLOCK(vm_obj); return (0); failed: i915_gem_object_put_pages_range_locked(obj, si, i); VM_OBJECT_WUNLOCK(vm_obj); return (-EIO); } static int i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj) { vm_object_t vm_obj; vm_page_t page; vm_pindex_t i, page_count; int res; /* Assert that the object is not currently in any GPU domain. As it * wasn't in the GTT, there shouldn't be any way it could have been in * a GPU cache */ BUG_ON(obj->base.read_domains & I915_GEM_GPU_DOMAINS); BUG_ON(obj->base.write_domain & I915_GEM_GPU_DOMAINS); KASSERT(obj->pages == NULL, ("Obj already has pages")); page_count = OFF_TO_IDX(obj->base.size); obj->pages = malloc(page_count * sizeof(vm_page_t), DRM_I915_GEM, M_WAITOK); res = i915_gem_object_get_pages_range(obj, 0, obj->base.size); if (res != 0) { free(obj->pages, DRM_I915_GEM); obj->pages = NULL; return (res); } vm_obj = obj->base.vm_obj; VM_OBJECT_WLOCK(vm_obj); for (i = 0, page = vm_page_lookup(vm_obj, 0); i < page_count; i++, page = vm_page_next(page)) { KASSERT(page->pindex == i, ("pindex %jx %jx", (uintmax_t)page->pindex, (uintmax_t)i)); obj->pages[i] = page; } VM_OBJECT_WUNLOCK(vm_obj); return (0); } /* Ensure that the associated pages are gathered from the backing storage * and pinned into our object. i915_gem_object_get_pages() may be called * multiple times before they are released by a single call to * i915_gem_object_put_pages() - once the pages are no longer referenced * either as a result of memory pressure (reaping pages under the shrinker) * or as the object is itself released. */ int i915_gem_object_get_pages(struct drm_i915_gem_object *obj) { struct drm_i915_private *dev_priv = obj->base.dev->dev_private; const struct drm_i915_gem_object_ops *ops = obj->ops; int ret; if (obj->pages) return 0; BUG_ON(obj->pages_pin_count); ret = ops->get_pages(obj); if (ret) return ret; list_add_tail(&obj->gtt_list, &dev_priv->mm.unbound_list); return 0; } void i915_gem_object_move_to_active(struct drm_i915_gem_object *obj, struct intel_ring_buffer *ring) { struct drm_device *dev = obj->base.dev; struct drm_i915_private *dev_priv = dev->dev_private; u32 seqno = intel_ring_get_seqno(ring); BUG_ON(ring == NULL); obj->ring = ring; /* Add a reference if we're newly entering the active list. */ if (!obj->active) { drm_gem_object_reference(&obj->base); obj->active = 1; } /* Move from whatever list we were on to the tail of execution. */ list_move_tail(&obj->mm_list, &dev_priv->mm.active_list); list_move_tail(&obj->ring_list, &ring->active_list); obj->last_read_seqno = seqno; if (obj->fenced_gpu_access) { obj->last_fenced_seqno = seqno; /* Bump MRU to take account of the delayed flush */ if (obj->fence_reg != I915_FENCE_REG_NONE) { struct drm_i915_fence_reg *reg; reg = &dev_priv->fence_regs[obj->fence_reg]; list_move_tail(®->lru_list, &dev_priv->mm.fence_list); } } } static void i915_gem_object_move_to_inactive(struct drm_i915_gem_object *obj) { struct drm_device *dev = obj->base.dev; struct drm_i915_private *dev_priv = dev->dev_private; BUG_ON(obj->base.write_domain & ~I915_GEM_GPU_DOMAINS); BUG_ON(!obj->active); list_move_tail(&obj->mm_list, &dev_priv->mm.inactive_list); list_del_init(&obj->ring_list); obj->ring = NULL; obj->last_read_seqno = 0; obj->last_write_seqno = 0; obj->base.write_domain = 0; obj->last_fenced_seqno = 0; obj->fenced_gpu_access = false; obj->active = 0; drm_gem_object_unreference(&obj->base); WARN_ON(i915_verify_lists(dev)); } static int i915_gem_handle_seqno_wrap(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; struct intel_ring_buffer *ring; int ret, i, j; /* The hardware uses various monotonic 32-bit counters, if we * detect that they will wraparound we need to idle the GPU * and reset those counters. */ ret = 0; for_each_ring(ring, dev_priv, i) { for (j = 0; j < ARRAY_SIZE(ring->sync_seqno); j++) ret |= ring->sync_seqno[j] != 0; } if (ret == 0) return ret; ret = i915_gpu_idle(dev); if (ret) return ret; i915_gem_retire_requests(dev); for_each_ring(ring, dev_priv, i) { for (j = 0; j < ARRAY_SIZE(ring->sync_seqno); j++) ring->sync_seqno[j] = 0; } return 0; } int i915_gem_get_seqno(struct drm_device *dev, u32 *seqno) { struct drm_i915_private *dev_priv = dev->dev_private; /* reserve 0 for non-seqno */ if (dev_priv->next_seqno == 0) { int ret = i915_gem_handle_seqno_wrap(dev); if (ret) return ret; dev_priv->next_seqno = 1; } *seqno = dev_priv->next_seqno++; return 0; } int i915_add_request(struct intel_ring_buffer *ring, struct drm_file *file, u32 *out_seqno) { drm_i915_private_t *dev_priv = ring->dev->dev_private; struct drm_i915_gem_request *request; u32 request_ring_position; int was_empty; int ret; /* * Emit any outstanding flushes - execbuf can fail to emit the flush * after having emitted the batchbuffer command. Hence we need to fix * things up similar to emitting the lazy request. The difference here * is that the flush _must_ happen before the next request, no matter * what. */ ret = intel_ring_flush_all_caches(ring); if (ret) return ret; request = malloc(sizeof(*request), DRM_I915_GEM, M_NOWAIT); if (request == NULL) return -ENOMEM; /* Record the position of the start of the request so that * should we detect the updated seqno part-way through the * GPU processing the request, we never over-estimate the * position of the head. */ request_ring_position = intel_ring_get_tail(ring); ret = ring->add_request(ring); if (ret) { free(request, DRM_I915_GEM); return ret; } request->seqno = intel_ring_get_seqno(ring); request->ring = ring; request->tail = request_ring_position; request->emitted_jiffies = jiffies; was_empty = list_empty(&ring->request_list); list_add_tail(&request->list, &ring->request_list); request->file_priv = NULL; if (file) { struct drm_i915_file_private *file_priv = file->driver_priv; mtx_lock(&file_priv->mm.lock); request->file_priv = file_priv; list_add_tail(&request->client_list, &file_priv->mm.request_list); mtx_unlock(&file_priv->mm.lock); } CTR2(KTR_DRM, "request_add %s %d", ring->name, request->seqno); ring->outstanding_lazy_request = 0; if (!dev_priv->mm.suspended) { if (i915_enable_hangcheck) { callout_schedule(&dev_priv->hangcheck_timer, DRM_I915_HANGCHECK_PERIOD); } if (was_empty) { taskqueue_enqueue_timeout(dev_priv->wq, &dev_priv->mm.retire_work, hz); intel_mark_busy(dev_priv->dev); } } if (out_seqno) *out_seqno = request->seqno; return 0; } static inline void i915_gem_request_remove_from_client(struct drm_i915_gem_request *request) { struct drm_i915_file_private *file_priv = request->file_priv; if (!file_priv) return; mtx_lock(&file_priv->mm.lock); if (request->file_priv) { list_del(&request->client_list); request->file_priv = NULL; } mtx_unlock(&file_priv->mm.lock); } static void i915_gem_reset_ring_lists(struct drm_i915_private *dev_priv, struct intel_ring_buffer *ring) { if (ring->dev != NULL) DRM_LOCK_ASSERT(ring->dev); while (!list_empty(&ring->request_list)) { struct drm_i915_gem_request *request; request = list_first_entry(&ring->request_list, struct drm_i915_gem_request, list); list_del(&request->list); i915_gem_request_remove_from_client(request); free(request, DRM_I915_GEM); } while (!list_empty(&ring->active_list)) { struct drm_i915_gem_object *obj; obj = list_first_entry(&ring->active_list, struct drm_i915_gem_object, ring_list); i915_gem_object_move_to_inactive(obj); } } static void i915_gem_reset_fences(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; int i; for (i = 0; i < dev_priv->num_fence_regs; i++) { struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i]; i915_gem_write_fence(dev, i, NULL); if (reg->obj) i915_gem_object_fence_lost(reg->obj); reg->pin_count = 0; reg->obj = NULL; INIT_LIST_HEAD(®->lru_list); } INIT_LIST_HEAD(&dev_priv->mm.fence_list); } void i915_gem_reset(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; struct drm_i915_gem_object *obj; struct intel_ring_buffer *ring; int i; for_each_ring(ring, dev_priv, i) i915_gem_reset_ring_lists(dev_priv, ring); /* Move everything out of the GPU domains to ensure we do any * necessary invalidation upon reuse. */ list_for_each_entry(obj, &dev_priv->mm.inactive_list, mm_list) { obj->base.read_domains &= ~I915_GEM_GPU_DOMAINS; } /* The fence registers are invalidated so clear them out */ i915_gem_reset_fences(dev); } /** * This function clears the request list as sequence numbers are passed. */ void i915_gem_retire_requests_ring(struct intel_ring_buffer *ring) { uint32_t seqno; if (list_empty(&ring->request_list)) return; WARN_ON(i915_verify_lists(ring->dev)); seqno = ring->get_seqno(ring, true); CTR2(KTR_DRM, "retire_request_ring %s %d", ring->name, seqno); while (!list_empty(&ring->request_list)) { struct drm_i915_gem_request *request; request = list_first_entry(&ring->request_list, struct drm_i915_gem_request, list); if (!i915_seqno_passed(seqno, request->seqno)) break; CTR2(KTR_DRM, "retire_request_seqno_passed %s %d", ring->name, seqno); /* We know the GPU must have read the request to have * sent us the seqno + interrupt, so use the position * of tail of the request to update the last known position * of the GPU head. */ ring->last_retired_head = request->tail; list_del(&request->list); i915_gem_request_remove_from_client(request); free(request, DRM_I915_GEM); } /* Move any buffers on the active list that are no longer referenced * by the ringbuffer to the flushing/inactive lists as appropriate. */ while (!list_empty(&ring->active_list)) { struct drm_i915_gem_object *obj; obj = list_first_entry(&ring->active_list, struct drm_i915_gem_object, ring_list); if (!i915_seqno_passed(seqno, obj->last_read_seqno)) break; i915_gem_object_move_to_inactive(obj); } if (unlikely(ring->trace_irq_seqno && i915_seqno_passed(seqno, ring->trace_irq_seqno))) { ring->irq_put(ring); ring->trace_irq_seqno = 0; } WARN_ON(i915_verify_lists(ring->dev)); } void i915_gem_retire_requests(struct drm_device *dev) { drm_i915_private_t *dev_priv = dev->dev_private; struct intel_ring_buffer *ring; int i; for_each_ring(ring, dev_priv, i) i915_gem_retire_requests_ring(ring); } static void i915_gem_retire_work_handler(void *arg, int pending) { drm_i915_private_t *dev_priv; struct drm_device *dev; struct intel_ring_buffer *ring; bool idle; int i; dev_priv = arg; dev = dev_priv->dev; /* Come back later if the device is busy... */ if (!sx_try_xlock(&dev->dev_struct_lock)) { taskqueue_enqueue_timeout(dev_priv->wq, &dev_priv->mm.retire_work, hz); return; } CTR0(KTR_DRM, "retire_task"); i915_gem_retire_requests(dev); /* Send a periodic flush down the ring so we don't hold onto GEM * objects indefinitely. */ idle = true; for_each_ring(ring, dev_priv, i) { if (ring->gpu_caches_dirty) i915_add_request(ring, NULL, NULL); idle &= list_empty(&ring->request_list); } if (!dev_priv->mm.suspended && !idle) taskqueue_enqueue_timeout(dev_priv->wq, &dev_priv->mm.retire_work, hz); if (idle) intel_mark_idle(dev); DRM_UNLOCK(dev); } /** * Ensures that an object will eventually get non-busy by flushing any required * write domains, emitting any outstanding lazy request and retiring and * completed requests. */ static int i915_gem_object_flush_active(struct drm_i915_gem_object *obj) { int ret; if (obj->active) { ret = i915_gem_check_olr(obj->ring, obj->last_read_seqno); if (ret) return ret; i915_gem_retire_requests_ring(obj->ring); } return 0; } /** * i915_gem_wait_ioctl - implements DRM_IOCTL_I915_GEM_WAIT * @DRM_IOCTL_ARGS: standard ioctl arguments * * Returns 0 if successful, else an error is returned with the remaining time in * the timeout parameter. * -ETIME: object is still busy after timeout * -ERESTARTSYS: signal interrupted the wait * -ENONENT: object doesn't exist * Also possible, but rare: * -EAGAIN: GPU wedged * -ENOMEM: damn * -ENODEV: Internal IRQ fail * -E?: The add request failed * * The wait ioctl with a timeout of 0 reimplements the busy ioctl. With any * non-zero timeout parameter the wait ioctl will wait for the given number of * nanoseconds on an object becoming unbusy. Since the wait itself does so * without holding struct_mutex the object may become re-busied before this * function completes. A similar but shorter * race condition exists in the busy * ioctl */ int i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_gem_wait *args = data; struct drm_i915_gem_object *obj; struct intel_ring_buffer *ring = NULL; struct timespec timeout_stack, *timeout = NULL; u32 seqno = 0; int ret = 0; if (args->timeout_ns >= 0) { timeout_stack.tv_sec = args->timeout_ns / 1000000; timeout_stack.tv_nsec = args->timeout_ns % 1000000; timeout = &timeout_stack; } ret = i915_mutex_lock_interruptible(dev); if (ret) return ret; obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->bo_handle)); if (&obj->base == NULL) { DRM_UNLOCK(dev); return -ENOENT; } /* Need to make sure the object gets inactive eventually. */ ret = i915_gem_object_flush_active(obj); if (ret) goto out; if (obj->active) { seqno = obj->last_read_seqno; ring = obj->ring; } if (seqno == 0) goto out; /* Do this after OLR check to make sure we make forward progress polling * on this IOCTL with a 0 timeout (like busy ioctl) */ if (!args->timeout_ns) { ret = -ETIMEDOUT; goto out; } drm_gem_object_unreference(&obj->base); DRM_UNLOCK(dev); ret = __wait_seqno(ring, seqno, true, timeout); if (timeout) { args->timeout_ns = timeout->tv_sec * 1000000 + timeout->tv_nsec; } return ret; out: drm_gem_object_unreference(&obj->base); DRM_UNLOCK(dev); return ret; } /** * i915_gem_object_sync - sync an object to a ring. * * @obj: object which may be in use on another ring. * @to: ring we wish to use the object on. May be NULL. * * This code is meant to abstract object synchronization with the GPU. * Calling with NULL implies synchronizing the object with the CPU * rather than a particular GPU ring. * * Returns 0 if successful, else propagates up the lower layer error. */ int i915_gem_object_sync(struct drm_i915_gem_object *obj, struct intel_ring_buffer *to) { struct intel_ring_buffer *from = obj->ring; u32 seqno; int ret, idx; if (from == NULL || to == from) return 0; if (to == NULL || !i915_semaphore_is_enabled(obj->base.dev)) return i915_gem_object_wait_rendering(obj, false); idx = intel_ring_sync_index(from, to); seqno = obj->last_read_seqno; if (seqno <= from->sync_seqno[idx]) return 0; ret = i915_gem_check_olr(obj->ring, seqno); if (ret) return ret; ret = to->sync_to(to, from, seqno); if (!ret) /* We use last_read_seqno because sync_to() * might have just caused seqno wrap under * the radar. */ from->sync_seqno[idx] = obj->last_read_seqno; return ret; } static void i915_gem_object_finish_gtt(struct drm_i915_gem_object *obj) { u32 old_write_domain, old_read_domains; /* Act a barrier for all accesses through the GTT */ mb(); /* Force a pagefault for domain tracking on next user access */ i915_gem_release_mmap(obj); if ((obj->base.read_domains & I915_GEM_DOMAIN_GTT) == 0) return; old_read_domains = obj->base.read_domains; old_write_domain = obj->base.write_domain; obj->base.read_domains &= ~I915_GEM_DOMAIN_GTT; obj->base.write_domain &= ~I915_GEM_DOMAIN_GTT; CTR3(KTR_DRM, "object_change_domain finish gtt %p %x %x", obj, old_read_domains, old_write_domain); } /** * Unbinds an object from the GTT aperture. */ int i915_gem_object_unbind(struct drm_i915_gem_object *obj) { drm_i915_private_t *dev_priv = obj->base.dev->dev_private; int ret = 0; if (obj->gtt_space == NULL) return 0; if (obj->pin_count) return -EBUSY; BUG_ON(obj->pages == NULL); ret = i915_gem_object_finish_gpu(obj); if (ret) return ret; /* Continue on if we fail due to EIO, the GPU is hung so we * should be safe and we need to cleanup or else we might * cause memory corruption through use-after-free. */ i915_gem_object_finish_gtt(obj); /* release the fence reg _after_ flushing */ ret = i915_gem_object_put_fence(obj); if (ret) return ret; if (obj->has_global_gtt_mapping) i915_gem_gtt_unbind_object(obj); if (obj->has_aliasing_ppgtt_mapping) { i915_ppgtt_unbind_object(dev_priv->mm.aliasing_ppgtt, obj); obj->has_aliasing_ppgtt_mapping = 0; } i915_gem_gtt_finish_object(obj); list_del(&obj->mm_list); list_move_tail(&obj->gtt_list, &dev_priv->mm.unbound_list); /* Avoid an unnecessary call to unbind on rebind. */ obj->map_and_fenceable = true; drm_mm_put_block(obj->gtt_space); obj->gtt_space = NULL; obj->gtt_offset = 0; return 0; } int i915_gpu_idle(struct drm_device *dev) { drm_i915_private_t *dev_priv = dev->dev_private; struct intel_ring_buffer *ring; int ret, i; /* Flush everything onto the inactive list. */ for_each_ring(ring, dev_priv, i) { ret = i915_switch_context(ring, NULL, DEFAULT_CONTEXT_ID); if (ret) return ret; ret = intel_ring_idle(ring); if (ret) return ret; } return 0; } static void sandybridge_write_fence_reg(struct drm_device *dev, int reg, struct drm_i915_gem_object *obj) { drm_i915_private_t *dev_priv = dev->dev_private; uint64_t val; if (obj) { u32 size = obj->gtt_space->size; val = (uint64_t)((obj->gtt_offset + size - 4096) & 0xfffff000) << 32; val |= obj->gtt_offset & 0xfffff000; val |= (uint64_t)((obj->stride / 128) - 1) << SANDYBRIDGE_FENCE_PITCH_SHIFT; if (obj->tiling_mode == I915_TILING_Y) val |= 1 << I965_FENCE_TILING_Y_SHIFT; val |= I965_FENCE_REG_VALID; } else val = 0; I915_WRITE64(FENCE_REG_SANDYBRIDGE_0 + reg * 8, val); POSTING_READ(FENCE_REG_SANDYBRIDGE_0 + reg * 8); } static void i965_write_fence_reg(struct drm_device *dev, int reg, struct drm_i915_gem_object *obj) { drm_i915_private_t *dev_priv = dev->dev_private; uint64_t val; if (obj) { u32 size = obj->gtt_space->size; val = (uint64_t)((obj->gtt_offset + size - 4096) & 0xfffff000) << 32; val |= obj->gtt_offset & 0xfffff000; val |= ((obj->stride / 128) - 1) << I965_FENCE_PITCH_SHIFT; if (obj->tiling_mode == I915_TILING_Y) val |= 1 << I965_FENCE_TILING_Y_SHIFT; val |= I965_FENCE_REG_VALID; } else val = 0; I915_WRITE64(FENCE_REG_965_0 + reg * 8, val); POSTING_READ(FENCE_REG_965_0 + reg * 8); } static void i915_write_fence_reg(struct drm_device *dev, int reg, struct drm_i915_gem_object *obj) { drm_i915_private_t *dev_priv = dev->dev_private; u32 val; if (obj) { u32 size = obj->gtt_space->size; int pitch_val; int tile_width; WARN((obj->gtt_offset & ~I915_FENCE_START_MASK) || (size & -size) != size || (obj->gtt_offset & (size - 1)), "object 0x%08x [fenceable? %d] not 1M or pot-size (0x%08x) aligned\n", obj->gtt_offset, obj->map_and_fenceable, size); if (obj->tiling_mode == I915_TILING_Y && HAS_128_BYTE_Y_TILING(dev)) tile_width = 128; else tile_width = 512; /* Note: pitch better be a power of two tile widths */ pitch_val = obj->stride / tile_width; pitch_val = ffs(pitch_val) - 1; val = obj->gtt_offset; if (obj->tiling_mode == I915_TILING_Y) val |= 1 << I830_FENCE_TILING_Y_SHIFT; val |= I915_FENCE_SIZE_BITS(size); val |= pitch_val << I830_FENCE_PITCH_SHIFT; val |= I830_FENCE_REG_VALID; } else val = 0; if (reg < 8) reg = FENCE_REG_830_0 + reg * 4; else reg = FENCE_REG_945_8 + (reg - 8) * 4; I915_WRITE(reg, val); POSTING_READ(reg); } static void i830_write_fence_reg(struct drm_device *dev, int reg, struct drm_i915_gem_object *obj) { drm_i915_private_t *dev_priv = dev->dev_private; uint32_t val; if (obj) { u32 size = obj->gtt_space->size; uint32_t pitch_val; WARN((obj->gtt_offset & ~I830_FENCE_START_MASK) || (size & -size) != size || (obj->gtt_offset & (size - 1)), "object 0x%08x not 512K or pot-size 0x%08x aligned\n", obj->gtt_offset, size); pitch_val = obj->stride / 128; pitch_val = ffs(pitch_val) - 1; val = obj->gtt_offset; if (obj->tiling_mode == I915_TILING_Y) val |= 1 << I830_FENCE_TILING_Y_SHIFT; val |= I830_FENCE_SIZE_BITS(size); val |= pitch_val << I830_FENCE_PITCH_SHIFT; val |= I830_FENCE_REG_VALID; } else val = 0; I915_WRITE(FENCE_REG_830_0 + reg * 4, val); POSTING_READ(FENCE_REG_830_0 + reg * 4); } static void i915_gem_write_fence(struct drm_device *dev, int reg, struct drm_i915_gem_object *obj) { switch (INTEL_INFO(dev)->gen) { case 7: case 6: sandybridge_write_fence_reg(dev, reg, obj); break; case 5: case 4: i965_write_fence_reg(dev, reg, obj); break; case 3: i915_write_fence_reg(dev, reg, obj); break; case 2: i830_write_fence_reg(dev, reg, obj); break; default: break; } } static inline int fence_number(struct drm_i915_private *dev_priv, struct drm_i915_fence_reg *fence) { return fence - dev_priv->fence_regs; } static void i915_gem_write_fence__ipi(void *data) { wbinvd(); } static void i915_gem_object_update_fence(struct drm_i915_gem_object *obj, struct drm_i915_fence_reg *fence, bool enable) { struct drm_device *dev = obj->base.dev; struct drm_i915_private *dev_priv = dev->dev_private; int fence_reg = fence_number(dev_priv, fence); /* In order to fully serialize access to the fenced region and * the update to the fence register we need to take extreme * measures on SNB+. In theory, the write to the fence register * flushes all memory transactions before, and coupled with the * mb() placed around the register write we serialise all memory * operations with respect to the changes in the tiler. Yet, on * SNB+ we need to take a step further and emit an explicit wbinvd() * on each processor in order to manually flush all memory * transactions before updating the fence register. */ if (HAS_LLC(obj->base.dev)) on_each_cpu(i915_gem_write_fence__ipi, NULL, 1); i915_gem_write_fence(dev, fence_reg, enable ? obj : NULL); if (enable) { obj->fence_reg = fence_reg; fence->obj = obj; list_move_tail(&fence->lru_list, &dev_priv->mm.fence_list); } else { obj->fence_reg = I915_FENCE_REG_NONE; fence->obj = NULL; list_del_init(&fence->lru_list); } } static int i915_gem_object_flush_fence(struct drm_i915_gem_object *obj) { if (obj->last_fenced_seqno) { int ret = i915_wait_seqno(obj->ring, obj->last_fenced_seqno); if (ret) return ret; obj->last_fenced_seqno = 0; } /* Ensure that all CPU reads are completed before installing a fence * and all writes before removing the fence. */ if (obj->base.read_domains & I915_GEM_DOMAIN_GTT) mb(); obj->fenced_gpu_access = false; return 0; } int i915_gem_object_put_fence(struct drm_i915_gem_object *obj) { struct drm_i915_private *dev_priv = obj->base.dev->dev_private; int ret; ret = i915_gem_object_flush_fence(obj); if (ret) return ret; if (obj->fence_reg == I915_FENCE_REG_NONE) return 0; i915_gem_object_update_fence(obj, &dev_priv->fence_regs[obj->fence_reg], false); i915_gem_object_fence_lost(obj); return 0; } static struct drm_i915_fence_reg * i915_find_fence_reg(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; struct drm_i915_fence_reg *reg, *avail; int i; /* First try to find a free reg */ avail = NULL; for (i = dev_priv->fence_reg_start; i < dev_priv->num_fence_regs; i++) { reg = &dev_priv->fence_regs[i]; if (!reg->obj) return reg; if (!reg->pin_count) avail = reg; } if (avail == NULL) return NULL; /* None available, try to steal one or wait for a user to finish */ list_for_each_entry(reg, &dev_priv->mm.fence_list, lru_list) { if (reg->pin_count) continue; return reg; } return NULL; } /** * i915_gem_object_get_fence - set up fencing for an object * @obj: object to map through a fence reg * * When mapping objects through the GTT, userspace wants to be able to write * to them without having to worry about swizzling if the object is tiled. * This function walks the fence regs looking for a free one for @obj, * stealing one if it can't find any. * * It then sets up the reg based on the object's properties: address, pitch * and tiling format. * * For an untiled surface, this removes any existing fence. */ int i915_gem_object_get_fence(struct drm_i915_gem_object *obj) { struct drm_device *dev = obj->base.dev; struct drm_i915_private *dev_priv = dev->dev_private; bool enable = obj->tiling_mode != I915_TILING_NONE; struct drm_i915_fence_reg *reg; int ret; /* Have we updated the tiling parameters upon the object and so * will need to serialise the write to the associated fence register? */ if (obj->fence_dirty) { ret = i915_gem_object_flush_fence(obj); if (ret) return ret; } /* Just update our place in the LRU if our fence is getting reused. */ if (obj->fence_reg != I915_FENCE_REG_NONE) { reg = &dev_priv->fence_regs[obj->fence_reg]; if (!obj->fence_dirty) { list_move_tail(®->lru_list, &dev_priv->mm.fence_list); return 0; } } else if (enable) { reg = i915_find_fence_reg(dev); if (reg == NULL) return -EDEADLK; if (reg->obj) { struct drm_i915_gem_object *old = reg->obj; ret = i915_gem_object_flush_fence(old); if (ret) return ret; i915_gem_object_fence_lost(old); } } else return 0; i915_gem_object_update_fence(obj, reg, enable); obj->fence_dirty = false; return 0; } static bool i915_gem_valid_gtt_space(struct drm_device *dev, struct drm_mm_node *gtt_space, unsigned long cache_level) { struct drm_mm_node *other; /* On non-LLC machines we have to be careful when putting differing * types of snoopable memory together to avoid the prefetcher * crossing memory domains and dying. */ if (HAS_LLC(dev)) return true; if (gtt_space == NULL) return true; if (list_empty(>t_space->node_list)) return true; other = list_entry(gtt_space->node_list.prev, struct drm_mm_node, node_list); if (other->allocated && !other->hole_follows && other->color != cache_level) return false; other = list_entry(gtt_space->node_list.next, struct drm_mm_node, node_list); if (other->allocated && !gtt_space->hole_follows && other->color != cache_level) return false; return true; } static void i915_gem_verify_gtt(struct drm_device *dev) { #if WATCH_GTT struct drm_i915_private *dev_priv = dev->dev_private; struct drm_i915_gem_object *obj; int err = 0; list_for_each_entry(obj, &dev_priv->mm.gtt_list, gtt_list) { if (obj->gtt_space == NULL) { DRM_ERROR("object found on GTT list with no space reserved\n"); err++; continue; } if (obj->cache_level != obj->gtt_space->color) { DRM_ERROR("object reserved space [%08lx, %08lx] with wrong color, cache_level=%x, color=%lx\n", obj->gtt_space->start, obj->gtt_space->start + obj->gtt_space->size, obj->cache_level, obj->gtt_space->color); err++; continue; } if (!i915_gem_valid_gtt_space(dev, obj->gtt_space, obj->cache_level)) { DRM_ERROR("invalid GTT space found at [%08lx, %08lx] - color=%x\n", obj->gtt_space->start, obj->gtt_space->start + obj->gtt_space->size, obj->cache_level); err++; continue; } } WARN_ON(err); #endif } /** * Finds free space in the GTT aperture and binds the object there. */ static int i915_gem_object_bind_to_gtt(struct drm_i915_gem_object *obj, unsigned alignment, bool map_and_fenceable, bool nonblocking) { struct drm_device *dev = obj->base.dev; drm_i915_private_t *dev_priv = dev->dev_private; struct drm_mm_node *node; u32 size, fence_size, fence_alignment, unfenced_alignment; bool mappable, fenceable; int ret; if (obj->madv != I915_MADV_WILLNEED) { DRM_ERROR("Attempting to bind a purgeable object\n"); return -EINVAL; } fence_size = i915_gem_get_gtt_size(dev, obj->base.size, obj->tiling_mode); fence_alignment = i915_gem_get_gtt_alignment(dev, obj->base.size, obj->tiling_mode); unfenced_alignment = i915_gem_get_unfenced_gtt_alignment(dev, obj->base.size, obj->tiling_mode); if (alignment == 0) alignment = map_and_fenceable ? fence_alignment : unfenced_alignment; if (map_and_fenceable && alignment & (fence_alignment - 1)) { DRM_ERROR("Invalid object alignment requested %u\n", alignment); return -EINVAL; } size = map_and_fenceable ? fence_size : obj->base.size; /* If the object is bigger than the entire aperture, reject it early * before evicting everything in a vain attempt to find space. */ if (obj->base.size > (map_and_fenceable ? dev_priv->mm.gtt_mappable_end : dev_priv->mm.gtt_total)) { DRM_ERROR("Attempting to bind an object larger than the aperture\n"); return -E2BIG; } ret = i915_gem_object_get_pages(obj); if (ret) return ret; i915_gem_object_pin_pages(obj); node = malloc(sizeof(*node), DRM_MEM_MM, M_NOWAIT | M_ZERO); if (node == NULL) { i915_gem_object_unpin_pages(obj); return -ENOMEM; } search_free: if (map_and_fenceable) ret = drm_mm_insert_node_in_range_generic(&dev_priv->mm.gtt_space, node, size, alignment, obj->cache_level, 0, dev_priv->mm.gtt_mappable_end); else ret = drm_mm_insert_node_generic(&dev_priv->mm.gtt_space, node, size, alignment, obj->cache_level); if (ret) { ret = i915_gem_evict_something(dev, size, alignment, obj->cache_level, map_and_fenceable, nonblocking); if (ret == 0) goto search_free; i915_gem_object_unpin_pages(obj); free(node, DRM_MEM_MM); return ret; } if (WARN_ON(!i915_gem_valid_gtt_space(dev, node, obj->cache_level))) { i915_gem_object_unpin_pages(obj); drm_mm_put_block(node); return -EINVAL; } ret = i915_gem_gtt_prepare_object(obj); if (ret) { i915_gem_object_unpin_pages(obj); drm_mm_put_block(node); return ret; } list_move_tail(&obj->gtt_list, &dev_priv->mm.bound_list); list_add_tail(&obj->mm_list, &dev_priv->mm.inactive_list); obj->gtt_space = node; obj->gtt_offset = node->start; fenceable = node->size == fence_size && (node->start & (fence_alignment - 1)) == 0; mappable = obj->gtt_offset + obj->base.size <= dev_priv->mm.gtt_mappable_end; obj->map_and_fenceable = mappable && fenceable; i915_gem_object_unpin_pages(obj); CTR4(KTR_DRM, "object_bind %p %x %x %d", obj, obj->gtt_offset, obj->base.size, map_and_fenceable); i915_gem_verify_gtt(dev); return 0; } void i915_gem_clflush_object(struct drm_i915_gem_object *obj) { /* If we don't have a page list set up, then we're not pinned * to GPU, and we can ignore the cache flush because it'll happen * again at bind time. */ if (obj->pages == NULL) return; /* If the GPU is snooping the contents of the CPU cache, * we do not need to manually clear the CPU cache lines. However, * the caches are only snooped when the render cache is * flushed/invalidated. As we always have to emit invalidations * and flushes when moving into and out of the RENDER domain, correct * snooping behaviour occurs naturally as the result of our domain * tracking. */ if (obj->cache_level != I915_CACHE_NONE) return; CTR1(KTR_DRM, "object_clflush %p", obj); drm_clflush_pages(obj->pages, obj->base.size / PAGE_SIZE); } /** Flushes the GTT write domain for the object if it's dirty. */ static void i915_gem_object_flush_gtt_write_domain(struct drm_i915_gem_object *obj) { uint32_t old_write_domain; if (obj->base.write_domain != I915_GEM_DOMAIN_GTT) return; /* No actual flushing is required for the GTT write domain. Writes * to it immediately go to main memory as far as we know, so there's * no chipset flush. It also doesn't land in render cache. * * However, we do have to enforce the order so that all writes through * the GTT land before any writes to the device, such as updates to * the GATT itself. */ wmb(); old_write_domain = obj->base.write_domain; obj->base.write_domain = 0; CTR3(KTR_DRM, "object_change_domain flush gtt_write %p %x %x", obj, obj->base.read_domains, old_write_domain); } /** Flushes the CPU write domain for the object if it's dirty. */ static void i915_gem_object_flush_cpu_write_domain(struct drm_i915_gem_object *obj) { uint32_t old_write_domain; if (obj->base.write_domain != I915_GEM_DOMAIN_CPU) return; i915_gem_clflush_object(obj); i915_gem_chipset_flush(obj->base.dev); old_write_domain = obj->base.write_domain; obj->base.write_domain = 0; CTR3(KTR_DRM, "object_change_domain flush_cpu_write %p %x %x", obj, obj->base.read_domains, old_write_domain); } /** * Moves a single object to the GTT read, and possibly write domain. * * This function returns when the move is complete, including waiting on * flushes to occur. */ int i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write) { drm_i915_private_t *dev_priv = obj->base.dev->dev_private; uint32_t old_write_domain, old_read_domains; int ret; /* Not valid to be called on unbound objects. */ if (obj->gtt_space == NULL) return -EINVAL; if (obj->base.write_domain == I915_GEM_DOMAIN_GTT) return 0; ret = i915_gem_object_wait_rendering(obj, !write); if (ret) return ret; i915_gem_object_flush_cpu_write_domain(obj); old_write_domain = obj->base.write_domain; old_read_domains = obj->base.read_domains; /* It should now be out of any other write domains, and we can update * the domain values for our changes. */ BUG_ON((obj->base.write_domain & ~I915_GEM_DOMAIN_GTT) != 0); obj->base.read_domains |= I915_GEM_DOMAIN_GTT; if (write) { obj->base.read_domains = I915_GEM_DOMAIN_GTT; obj->base.write_domain = I915_GEM_DOMAIN_GTT; obj->dirty = 1; } CTR3(KTR_DRM, "object_change_domain set_to_gtt %p %x %x", obj, old_read_domains, old_write_domain); /* And bump the LRU for this access */ if (i915_gem_object_is_inactive(obj)) list_move_tail(&obj->mm_list, &dev_priv->mm.inactive_list); return 0; } int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj, enum i915_cache_level cache_level) { struct drm_device *dev = obj->base.dev; drm_i915_private_t *dev_priv = dev->dev_private; int ret; if (obj->cache_level == cache_level) return 0; if (obj->pin_count) { DRM_DEBUG("can not change the cache level of pinned objects\n"); return -EBUSY; } if (!i915_gem_valid_gtt_space(dev, obj->gtt_space, cache_level)) { ret = i915_gem_object_unbind(obj); if (ret) return ret; } if (obj->gtt_space) { ret = i915_gem_object_finish_gpu(obj); if (ret) return ret; i915_gem_object_finish_gtt(obj); /* Before SandyBridge, you could not use tiling or fence * registers with snooped memory, so relinquish any fences * currently pointing to our region in the aperture. */ if (INTEL_INFO(dev)->gen < 6) { ret = i915_gem_object_put_fence(obj); if (ret) return ret; } if (obj->has_global_gtt_mapping) i915_gem_gtt_bind_object(obj, cache_level); if (obj->has_aliasing_ppgtt_mapping) i915_ppgtt_bind_object(dev_priv->mm.aliasing_ppgtt, obj, cache_level); obj->gtt_space->color = cache_level; } if (cache_level == I915_CACHE_NONE) { u32 old_read_domains, old_write_domain; /* If we're coming from LLC cached, then we haven't * actually been tracking whether the data is in the * CPU cache or not, since we only allow one bit set * in obj->write_domain and have been skipping the clflushes. * Just set it to the CPU cache for now. */ WARN_ON(obj->base.write_domain & ~I915_GEM_DOMAIN_CPU); WARN_ON(obj->base.read_domains & ~I915_GEM_DOMAIN_CPU); old_read_domains = obj->base.read_domains; old_write_domain = obj->base.write_domain; obj->base.read_domains = I915_GEM_DOMAIN_CPU; obj->base.write_domain = I915_GEM_DOMAIN_CPU; CTR3(KTR_DRM, "object_change_domain set_cache_level %p %x %x", obj, old_read_domains, old_write_domain); } obj->cache_level = cache_level; i915_gem_verify_gtt(dev); return 0; } int i915_gem_get_caching_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_gem_caching *args = data; struct drm_i915_gem_object *obj; int ret; ret = i915_mutex_lock_interruptible(dev); if (ret) return ret; obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle)); if (&obj->base == NULL) { ret = -ENOENT; goto unlock; } args->caching = obj->cache_level != I915_CACHE_NONE; drm_gem_object_unreference(&obj->base); unlock: DRM_UNLOCK(dev); return ret; } int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_gem_caching *args = data; struct drm_i915_gem_object *obj; enum i915_cache_level level; int ret; switch (args->caching) { case I915_CACHING_NONE: level = I915_CACHE_NONE; break; case I915_CACHING_CACHED: level = I915_CACHE_LLC; break; default: return -EINVAL; } ret = i915_mutex_lock_interruptible(dev); if (ret) return ret; obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle)); if (&obj->base == NULL) { ret = -ENOENT; goto unlock; } ret = i915_gem_object_set_cache_level(obj, level); drm_gem_object_unreference(&obj->base); unlock: DRM_UNLOCK(dev); return ret; } static bool is_pin_display(struct drm_i915_gem_object *obj) { /* There are 3 sources that pin objects: * 1. The display engine (scanouts, sprites, cursors); * 2. Reservations for execbuffer; * 3. The user. * * We can ignore reservations as we hold the struct_mutex and * are only called outside of the reservation path. The user * can only increment pin_count once, and so if after * subtracting the potential reference by the user, any pin_count * remains, it must be due to another use by the display engine. */ return obj->pin_count - !!obj->user_pin_count; } /* * Prepare buffer for display plane (scanout, cursors, etc). * Can be called from an uninterruptible phase (modesetting) and allows * any flushes to be pipelined (for pageflips). */ int i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj, u32 alignment, struct intel_ring_buffer *pipelined) { u32 old_read_domains, old_write_domain; int ret; if (pipelined != obj->ring) { ret = i915_gem_object_sync(obj, pipelined); if (ret) return ret; } /* Mark the pin_display early so that we account for the * display coherency whilst setting up the cache domains. */ obj->pin_display = true; /* The display engine is not coherent with the LLC cache on gen6. As * a result, we make sure that the pinning that is about to occur is * done with uncached PTEs. This is lowest common denominator for all * chipsets. * * However for gen6+, we could do better by using the GFDT bit instead * of uncaching, which would allow us to flush all the LLC-cached data * with that bit in the PTE to main memory with just one PIPE_CONTROL. */ ret = i915_gem_object_set_cache_level(obj, I915_CACHE_NONE); if (ret) goto err_unpin_display; /* As the user may map the buffer once pinned in the display plane * (e.g. libkms for the bootup splash), we have to ensure that we * always use map_and_fenceable for all scanout buffers. */ ret = i915_gem_object_pin(obj, alignment, true, false); if (ret) goto err_unpin_display; i915_gem_object_flush_cpu_write_domain(obj); old_write_domain = obj->base.write_domain; old_read_domains = obj->base.read_domains; /* It should now be out of any other write domains, and we can update * the domain values for our changes. */ obj->base.write_domain = 0; obj->base.read_domains |= I915_GEM_DOMAIN_GTT; CTR3(KTR_DRM, "object_change_domain pin_to_display_plan %p %x %x", obj, old_read_domains, old_write_domain); return 0; err_unpin_display: obj->pin_display = is_pin_display(obj); return ret; } void i915_gem_object_unpin_from_display_plane(struct drm_i915_gem_object *obj) { i915_gem_object_unpin(obj); obj->pin_display = is_pin_display(obj); } int i915_gem_object_finish_gpu(struct drm_i915_gem_object *obj) { int ret; if ((obj->base.read_domains & I915_GEM_GPU_DOMAINS) == 0) return 0; ret = i915_gem_object_wait_rendering(obj, false); if (ret) return ret; /* Ensure that we invalidate the GPU's caches and TLBs. */ obj->base.read_domains &= ~I915_GEM_GPU_DOMAINS; return 0; } /** * Moves a single object to the CPU read, and possibly write domain. * * This function returns when the move is complete, including waiting on * flushes to occur. */ int i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write) { uint32_t old_write_domain, old_read_domains; int ret; if (obj->base.write_domain == I915_GEM_DOMAIN_CPU) return 0; ret = i915_gem_object_wait_rendering(obj, !write); if (ret) return ret; i915_gem_object_flush_gtt_write_domain(obj); old_write_domain = obj->base.write_domain; old_read_domains = obj->base.read_domains; /* Flush the CPU cache if it's still invalid. */ if ((obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0) { i915_gem_clflush_object(obj); obj->base.read_domains |= I915_GEM_DOMAIN_CPU; } /* It should now be out of any other write domains, and we can update * the domain values for our changes. */ BUG_ON((obj->base.write_domain & ~I915_GEM_DOMAIN_CPU) != 0); /* If we're writing through the CPU, then the GPU read domains will * need to be invalidated at next use. */ if (write) { obj->base.read_domains = I915_GEM_DOMAIN_CPU; obj->base.write_domain = I915_GEM_DOMAIN_CPU; } CTR3(KTR_DRM, "object_change_domain set_to_cpu %p %x %x", obj, old_read_domains, old_write_domain); return 0; } /* Throttle our rendering by waiting until the ring has completed our requests * emitted over 20 msec ago. * * Note that if we were to use the current jiffies each time around the loop, * we wouldn't escape the function with any frames outstanding if the time to * render a frame was over 20ms. * * This should get us reasonable parallelism between CPU and GPU but also * relatively low latency when blocking on a particular request to finish. */ static int i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file) { struct drm_i915_private *dev_priv = dev->dev_private; struct drm_i915_file_private *file_priv = file->driver_priv; unsigned long recent_enough = jiffies - msecs_to_jiffies(20); struct drm_i915_gem_request *request; struct intel_ring_buffer *ring = NULL; u32 seqno = 0; int ret; if (atomic_read(&dev_priv->mm.wedged)) return -EIO; mtx_lock(&file_priv->mm.lock); list_for_each_entry(request, &file_priv->mm.request_list, client_list) { if (time_after_eq(request->emitted_jiffies, recent_enough)) break; ring = request->ring; seqno = request->seqno; } mtx_unlock(&file_priv->mm.lock); if (seqno == 0) return 0; ret = __wait_seqno(ring, seqno, true, NULL); if (ret == 0) taskqueue_enqueue_timeout(dev_priv->wq, &dev_priv->mm.retire_work, 0); return ret; } int i915_gem_object_pin(struct drm_i915_gem_object *obj, uint32_t alignment, bool map_and_fenceable, bool nonblocking) { int ret; if (WARN_ON(obj->pin_count == DRM_I915_GEM_OBJECT_MAX_PIN_COUNT)) return -EBUSY; if (obj->gtt_space != NULL) { if ((alignment && obj->gtt_offset & (alignment - 1)) || (map_and_fenceable && !obj->map_and_fenceable)) { WARN(obj->pin_count, "bo is already pinned with incorrect alignment:" " offset=%x, req.alignment=%x, req.map_and_fenceable=%d," " obj->map_and_fenceable=%d\n", obj->gtt_offset, alignment, map_and_fenceable, obj->map_and_fenceable); ret = i915_gem_object_unbind(obj); if (ret) return ret; } } if (obj->gtt_space == NULL) { struct drm_i915_private *dev_priv = obj->base.dev->dev_private; ret = i915_gem_object_bind_to_gtt(obj, alignment, map_and_fenceable, nonblocking); if (ret) return ret; if (!dev_priv->mm.aliasing_ppgtt) i915_gem_gtt_bind_object(obj, obj->cache_level); } if (!obj->has_global_gtt_mapping && map_and_fenceable) i915_gem_gtt_bind_object(obj, obj->cache_level); obj->pin_count++; obj->pin_mappable |= map_and_fenceable; return 0; } void i915_gem_object_unpin(struct drm_i915_gem_object *obj) { BUG_ON(obj->pin_count == 0); BUG_ON(obj->gtt_space == NULL); if (--obj->pin_count == 0) obj->pin_mappable = false; } int i915_gem_pin_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_gem_pin *args = data; struct drm_i915_gem_object *obj; int ret; ret = i915_mutex_lock_interruptible(dev); if (ret) return ret; obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle)); if (&obj->base == NULL) { ret = -ENOENT; goto unlock; } if (obj->madv != I915_MADV_WILLNEED) { DRM_ERROR("Attempting to pin a purgeable buffer\n"); ret = -EINVAL; goto out; } if (obj->pin_filp != NULL && obj->pin_filp != file) { DRM_ERROR("Already pinned in i915_gem_pin_ioctl(): %d\n", args->handle); ret = -EINVAL; goto out; } if (obj->user_pin_count == 0) { ret = i915_gem_object_pin(obj, args->alignment, true, false); if (ret) goto out; } obj->user_pin_count++; obj->pin_filp = file; /* XXX - flush the CPU caches for pinned objects * as the X server doesn't manage domains yet */ i915_gem_object_flush_cpu_write_domain(obj); args->offset = obj->gtt_offset; out: drm_gem_object_unreference(&obj->base); unlock: DRM_UNLOCK(dev); return ret; } int i915_gem_unpin_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_gem_pin *args = data; struct drm_i915_gem_object *obj; int ret; ret = i915_mutex_lock_interruptible(dev); if (ret) return ret; obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle)); if (&obj->base == NULL) { ret = -ENOENT; goto unlock; } if (obj->pin_filp != file) { DRM_ERROR("Not pinned by caller in i915_gem_pin_ioctl(): %d\n", args->handle); ret = -EINVAL; goto out; } obj->user_pin_count--; if (obj->user_pin_count == 0) { obj->pin_filp = NULL; i915_gem_object_unpin(obj); } out: drm_gem_object_unreference(&obj->base); unlock: DRM_UNLOCK(dev); return ret; } int i915_gem_busy_ioctl(struct drm_device *dev, void *data, struct drm_file *file) { struct drm_i915_gem_busy *args = data; struct drm_i915_gem_object *obj; int ret; ret = i915_mutex_lock_interruptible(dev); if (ret) return ret; obj = to_intel_bo(drm_gem_object_lookup(dev, file, args->handle)); if (&obj->base == NULL) { ret = -ENOENT; goto unlock; } /* Count all active objects as busy, even if they are currently not used * by the gpu. Users of this interface expect objects to eventually * become non-busy without any further actions, therefore emit any * necessary flushes here. */ ret = i915_gem_object_flush_active(obj); args->busy = obj->active; if (obj->ring) { BUILD_BUG_ON(I915_NUM_RINGS > 16); args->busy |= intel_ring_flag(obj->ring) << 16; } drm_gem_object_unreference(&obj->base); unlock: DRM_UNLOCK(dev); return ret; } int i915_gem_throttle_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return i915_gem_ring_throttle(dev, file_priv); } int i915_gem_madvise_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_i915_gem_madvise *args = data; struct drm_i915_gem_object *obj; int ret; switch (args->madv) { case I915_MADV_DONTNEED: case I915_MADV_WILLNEED: break; default: return -EINVAL; } ret = i915_mutex_lock_interruptible(dev); if (ret) return ret; obj = to_intel_bo(drm_gem_object_lookup(dev, file_priv, args->handle)); if (&obj->base == NULL) { ret = -ENOENT; goto unlock; } if (obj->pin_count) { ret = -EINVAL; goto out; } if (obj->madv != __I915_MADV_PURGED) obj->madv = args->madv; /* if the object is no longer attached, discard its backing storage */ if (i915_gem_object_is_purgeable(obj) && obj->pages == NULL) i915_gem_object_truncate(obj); args->retained = obj->madv != __I915_MADV_PURGED; out: drm_gem_object_unreference(&obj->base); unlock: DRM_UNLOCK(dev); return ret; } void i915_gem_object_init(struct drm_i915_gem_object *obj, const struct drm_i915_gem_object_ops *ops) { INIT_LIST_HEAD(&obj->mm_list); INIT_LIST_HEAD(&obj->gtt_list); INIT_LIST_HEAD(&obj->ring_list); INIT_LIST_HEAD(&obj->exec_list); obj->ops = ops; obj->fence_reg = I915_FENCE_REG_NONE; obj->madv = I915_MADV_WILLNEED; /* Avoid an unnecessary call to unbind on the first bind. */ obj->map_and_fenceable = true; i915_gem_info_add_obj(obj->base.dev->dev_private, obj->base.size); } static const struct drm_i915_gem_object_ops i915_gem_object_ops = { .get_pages = i915_gem_object_get_pages_gtt, .put_pages = i915_gem_object_put_pages_gtt, }; struct drm_i915_gem_object *i915_gem_alloc_object(struct drm_device *dev, size_t size) { struct drm_i915_gem_object *obj; obj = malloc(sizeof(*obj), DRM_I915_GEM, M_WAITOK | M_ZERO); if (obj == NULL) return NULL; if (drm_gem_object_init(dev, &obj->base, size) != 0) { free(obj, DRM_I915_GEM); return NULL; } #ifdef FREEBSD_WIP mask = GFP_HIGHUSER | __GFP_RECLAIMABLE; if (IS_CRESTLINE(dev) || IS_BROADWATER(dev)) { /* 965gm cannot relocate objects above 4GiB. */ mask &= ~__GFP_HIGHMEM; mask |= __GFP_DMA32; } mapping = obj->base.filp->f_path.dentry->d_inode->i_mapping; mapping_set_gfp_mask(mapping, mask); #endif /* FREEBSD_WIP */ i915_gem_object_init(obj, &i915_gem_object_ops); obj->base.write_domain = I915_GEM_DOMAIN_CPU; obj->base.read_domains = I915_GEM_DOMAIN_CPU; if (HAS_LLC(dev)) { /* On some devices, we can have the GPU use the LLC (the CPU * cache) for about a 10% performance improvement * compared to uncached. Graphics requests other than * display scanout are coherent with the CPU in * accessing this cache. This means in this mode we * don't need to clflush on the CPU side, and on the * GPU side we only need to flush internal caches to * get data visible to the CPU. * * However, we maintain the display planes as UC, and so * need to rebind when first used as such. */ obj->cache_level = I915_CACHE_LLC; } else obj->cache_level = I915_CACHE_NONE; return obj; } int i915_gem_init_object(struct drm_gem_object *obj) { printf("i915_gem_init_object called\n"); return 0; } void i915_gem_free_object(struct drm_gem_object *gem_obj) { struct drm_i915_gem_object *obj = to_intel_bo(gem_obj); struct drm_device *dev = obj->base.dev; drm_i915_private_t *dev_priv = dev->dev_private; CTR1(KTR_DRM, "object_destroy_tail %p", obj); if (obj->phys_obj) i915_gem_detach_phys_object(dev, obj); obj->pin_count = 0; if (WARN_ON(i915_gem_object_unbind(obj) == -ERESTARTSYS)) { bool was_interruptible; was_interruptible = dev_priv->mm.interruptible; dev_priv->mm.interruptible = false; WARN_ON(i915_gem_object_unbind(obj)); dev_priv->mm.interruptible = was_interruptible; } obj->pages_pin_count = 0; i915_gem_object_put_pages(obj); i915_gem_object_free_mmap_offset(obj); BUG_ON(obj->pages); #ifdef FREEBSD_WIP if (obj->base.import_attach) drm_prime_gem_destroy(&obj->base, NULL); #endif /* FREEBSD_WIP */ drm_gem_object_release(&obj->base); i915_gem_info_remove_obj(dev_priv, obj->base.size); free(obj->bit_17, DRM_I915_GEM); free(obj, DRM_I915_GEM); } int i915_gem_idle(struct drm_device *dev) { drm_i915_private_t *dev_priv = dev->dev_private; int ret; DRM_LOCK(dev); if (dev_priv->mm.suspended) { DRM_UNLOCK(dev); return 0; } ret = i915_gpu_idle(dev); if (ret) { DRM_UNLOCK(dev); return ret; } i915_gem_retire_requests(dev); /* Under UMS, be paranoid and evict. */ if (!drm_core_check_feature(dev, DRIVER_MODESET)) i915_gem_evict_everything(dev); i915_gem_reset_fences(dev); /* Hack! Don't let anybody do execbuf while we don't control the chip. * We need to replace this with a semaphore, or something. * And not confound mm.suspended! */ dev_priv->mm.suspended = 1; callout_stop(&dev_priv->hangcheck_timer); i915_kernel_lost_context(dev); i915_gem_cleanup_ringbuffer(dev); DRM_UNLOCK(dev); /* Cancel the retire work handler, which should be idle now. */ taskqueue_cancel_timeout(dev_priv->wq, &dev_priv->mm.retire_work, NULL); return 0; } void i915_gem_l3_remap(struct drm_device *dev) { drm_i915_private_t *dev_priv = dev->dev_private; u32 misccpctl; int i; if (!HAS_L3_GPU_CACHE(dev)) return; if (!dev_priv->l3_parity.remap_info) return; misccpctl = I915_READ(GEN7_MISCCPCTL); I915_WRITE(GEN7_MISCCPCTL, misccpctl & ~GEN7_DOP_CLOCK_GATE_ENABLE); POSTING_READ(GEN7_MISCCPCTL); for (i = 0; i < GEN7_L3LOG_SIZE; i += 4) { u32 remap = I915_READ(GEN7_L3LOG_BASE + i); if (remap && remap != dev_priv->l3_parity.remap_info[i/4]) DRM_DEBUG("0x%x was already programmed to %x\n", GEN7_L3LOG_BASE + i, remap); if (remap && !dev_priv->l3_parity.remap_info[i/4]) DRM_DEBUG_DRIVER("Clearing remapped register\n"); I915_WRITE(GEN7_L3LOG_BASE + i, dev_priv->l3_parity.remap_info[i/4]); } /* Make sure all the writes land before disabling dop clock gating */ POSTING_READ(GEN7_L3LOG_BASE); I915_WRITE(GEN7_MISCCPCTL, misccpctl); } void i915_gem_init_swizzling(struct drm_device *dev) { drm_i915_private_t *dev_priv = dev->dev_private; if (INTEL_INFO(dev)->gen < 5 || dev_priv->mm.bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE) return; I915_WRITE(DISP_ARB_CTL, I915_READ(DISP_ARB_CTL) | DISP_TILE_SURFACE_SWIZZLING); if (IS_GEN5(dev)) return; I915_WRITE(TILECTL, I915_READ(TILECTL) | TILECTL_SWZCTL); if (IS_GEN6(dev)) I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB)); else I915_WRITE(ARB_MODE, _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB)); } static bool intel_enable_blt(struct drm_device *dev) { if (!HAS_BLT(dev)) return false; /* The blitter was dysfunctional on early prototypes */ if (IS_GEN6(dev) && pci_get_revid(dev->dev) < 8) { DRM_INFO("BLT not supported on this pre-production hardware;" " graphics performance will be degraded.\n"); return false; } return true; } int i915_gem_init_hw(struct drm_device *dev) { drm_i915_private_t *dev_priv = dev->dev_private; int ret; #ifdef FREEBSD_WIP if (INTEL_INFO(dev)->gen < 6 && !intel_enable_gtt()) return -EIO; #endif /* FREEBSD_WIP */ if (IS_HASWELL(dev) && (I915_READ(0x120010) == 1)) I915_WRITE(0x9008, I915_READ(0x9008) | 0xf0000); i915_gem_l3_remap(dev); i915_gem_init_swizzling(dev); ret = intel_init_render_ring_buffer(dev); if (ret) return ret; if (HAS_BSD(dev)) { ret = intel_init_bsd_ring_buffer(dev); if (ret) goto cleanup_render_ring; } if (intel_enable_blt(dev)) { ret = intel_init_blt_ring_buffer(dev); if (ret) goto cleanup_bsd_ring; } dev_priv->next_seqno = 1; /* * XXX: There was some w/a described somewhere suggesting loading * contexts before PPGTT. */ i915_gem_context_init(dev); i915_gem_init_ppgtt(dev); return 0; cleanup_bsd_ring: intel_cleanup_ring_buffer(&dev_priv->ring[VCS]); cleanup_render_ring: intel_cleanup_ring_buffer(&dev_priv->ring[RCS]); return ret; } static bool intel_enable_ppgtt(struct drm_device *dev) { if (i915_enable_ppgtt >= 0) return i915_enable_ppgtt; #ifdef CONFIG_INTEL_IOMMU /* Disable ppgtt on SNB if VT-d is on. */ if (INTEL_INFO(dev)->gen == 6 && intel_iommu_gfx_mapped) return false; #endif return true; } int i915_gem_init(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; unsigned long gtt_size, mappable_size; int ret; gtt_size = dev_priv->mm.gtt->gtt_total_entries << PAGE_SHIFT; mappable_size = dev_priv->mm.gtt->gtt_mappable_entries << PAGE_SHIFT; DRM_LOCK(dev); if (intel_enable_ppgtt(dev) && HAS_ALIASING_PPGTT(dev)) { /* PPGTT pdes are stolen from global gtt ptes, so shrink the * aperture accordingly when using aliasing ppgtt. */ gtt_size -= I915_PPGTT_PD_ENTRIES*PAGE_SIZE; i915_gem_init_global_gtt(dev, 0, mappable_size, gtt_size); ret = i915_gem_init_aliasing_ppgtt(dev); if (ret) { DRM_UNLOCK(dev); return ret; } } else { /* Let GEM Manage all of the aperture. * * However, leave one page at the end still bound to the scratch * page. There are a number of places where the hardware * apparently prefetches past the end of the object, and we've * seen multiple hangs with the GPU head pointer stuck in a * batchbuffer bound at the last page of the aperture. One page * should be enough to keep any prefetching inside of the * aperture. */ i915_gem_init_global_gtt(dev, 0, mappable_size, gtt_size); } ret = i915_gem_init_hw(dev); DRM_UNLOCK(dev); if (ret) { i915_gem_cleanup_aliasing_ppgtt(dev); return ret; } /* Allow hardware batchbuffers unless told otherwise, but not for KMS. */ if (!drm_core_check_feature(dev, DRIVER_MODESET)) dev_priv->dri1.allow_batchbuffer = 1; return 0; } void i915_gem_cleanup_ringbuffer(struct drm_device *dev) { drm_i915_private_t *dev_priv = dev->dev_private; struct intel_ring_buffer *ring; int i; for_each_ring(ring, dev_priv, i) intel_cleanup_ring_buffer(ring); } int i915_gem_entervt_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { drm_i915_private_t *dev_priv = dev->dev_private; int ret; if (drm_core_check_feature(dev, DRIVER_MODESET)) return 0; if (atomic_read(&dev_priv->mm.wedged)) { DRM_ERROR("Reenabling wedged hardware, good luck\n"); atomic_set(&dev_priv->mm.wedged, 0); } DRM_LOCK(dev); dev_priv->mm.suspended = 0; ret = i915_gem_init_hw(dev); if (ret != 0) { DRM_UNLOCK(dev); return ret; } BUG_ON(!list_empty(&dev_priv->mm.active_list)); DRM_UNLOCK(dev); ret = drm_irq_install(dev); if (ret) goto cleanup_ringbuffer; return 0; cleanup_ringbuffer: DRM_LOCK(dev); i915_gem_cleanup_ringbuffer(dev); dev_priv->mm.suspended = 1; DRM_UNLOCK(dev); return ret; } int i915_gem_leavevt_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { if (drm_core_check_feature(dev, DRIVER_MODESET)) return 0; drm_irq_uninstall(dev); return i915_gem_idle(dev); } void i915_gem_lastclose(struct drm_device *dev) { int ret; if (drm_core_check_feature(dev, DRIVER_MODESET)) return; ret = i915_gem_idle(dev); if (ret) DRM_ERROR("failed to idle hardware: %d\n", ret); } static void init_ring_lists(struct intel_ring_buffer *ring) { INIT_LIST_HEAD(&ring->active_list); INIT_LIST_HEAD(&ring->request_list); } void i915_gem_load(struct drm_device *dev) { int i; drm_i915_private_t *dev_priv = dev->dev_private; INIT_LIST_HEAD(&dev_priv->mm.active_list); INIT_LIST_HEAD(&dev_priv->mm.inactive_list); INIT_LIST_HEAD(&dev_priv->mm.unbound_list); INIT_LIST_HEAD(&dev_priv->mm.bound_list); INIT_LIST_HEAD(&dev_priv->mm.fence_list); for (i = 0; i < I915_NUM_RINGS; i++) init_ring_lists(&dev_priv->ring[i]); for (i = 0; i < I915_MAX_NUM_FENCES; i++) INIT_LIST_HEAD(&dev_priv->fence_regs[i].lru_list); TIMEOUT_TASK_INIT(dev_priv->wq, &dev_priv->mm.retire_work, 0, i915_gem_retire_work_handler, dev_priv); init_completion(&dev_priv->error_completion); /* On GEN3 we really need to make sure the ARB C3 LP bit is set */ if (IS_GEN3(dev)) { I915_WRITE(MI_ARB_STATE, _MASKED_BIT_ENABLE(MI_ARB_C3_LP_WRITE_ENABLE)); } dev_priv->relative_constants_mode = I915_EXEC_CONSTANTS_REL_GENERAL; /* Old X drivers will take 0-2 for front, back, depth buffers */ if (!drm_core_check_feature(dev, DRIVER_MODESET)) dev_priv->fence_reg_start = 3; if (INTEL_INFO(dev)->gen >= 4 || IS_I945G(dev) || IS_I945GM(dev) || IS_G33(dev)) dev_priv->num_fence_regs = 16; else dev_priv->num_fence_regs = 8; /* Initialize fence registers to zero */ i915_gem_reset_fences(dev); i915_gem_detect_bit_6_swizzle(dev); DRM_INIT_WAITQUEUE(&dev_priv->pending_flip_queue); dev_priv->mm.interruptible = true; dev_priv->mm.inactive_shrinker = EVENTHANDLER_REGISTER(vm_lowmem, i915_gem_inactive_shrink, dev, EVENTHANDLER_PRI_ANY); } /* * Create a physically contiguous memory object for this object * e.g. for cursor + overlay regs */ static int i915_gem_init_phys_object(struct drm_device *dev, int id, int size, int align) { drm_i915_private_t *dev_priv = dev->dev_private; struct drm_i915_gem_phys_object *phys_obj; int ret; if (dev_priv->mm.phys_objs[id - 1] || !size) return 0; phys_obj = malloc(sizeof(struct drm_i915_gem_phys_object), DRM_I915_GEM, M_WAITOK | M_ZERO); if (!phys_obj) return -ENOMEM; phys_obj->id = id; phys_obj->handle = drm_pci_alloc(dev, size, align, BUS_SPACE_MAXADDR); if (!phys_obj->handle) { ret = -ENOMEM; goto kfree_obj; } #ifdef CONFIG_X86 pmap_change_attr((vm_offset_t)phys_obj->handle->vaddr, size / PAGE_SIZE, PAT_WRITE_COMBINING); #endif dev_priv->mm.phys_objs[id - 1] = phys_obj; return 0; kfree_obj: free(phys_obj, DRM_I915_GEM); return ret; } static void i915_gem_free_phys_object(struct drm_device *dev, int id) { drm_i915_private_t *dev_priv = dev->dev_private; struct drm_i915_gem_phys_object *phys_obj; if (!dev_priv->mm.phys_objs[id - 1]) return; phys_obj = dev_priv->mm.phys_objs[id - 1]; if (phys_obj->cur_obj) { i915_gem_detach_phys_object(dev, phys_obj->cur_obj); } #ifdef FREEBSD_WIP #ifdef CONFIG_X86 set_memory_wb((unsigned long)phys_obj->handle->vaddr, phys_obj->handle->size / PAGE_SIZE); #endif #endif /* FREEBSD_WIP */ drm_pci_free(dev, phys_obj->handle); free(phys_obj, DRM_I915_GEM); dev_priv->mm.phys_objs[id - 1] = NULL; } void i915_gem_free_all_phys_object(struct drm_device *dev) { int i; for (i = I915_GEM_PHYS_CURSOR_0; i <= I915_MAX_PHYS_OBJECT; i++) i915_gem_free_phys_object(dev, i); } void i915_gem_detach_phys_object(struct drm_device *dev, struct drm_i915_gem_object *obj) { struct sf_buf *sf; char *vaddr; char *dst; int i; int page_count; if (!obj->phys_obj) return; vaddr = obj->phys_obj->handle->vaddr; page_count = obj->base.size / PAGE_SIZE; VM_OBJECT_WLOCK(obj->base.vm_obj); for (i = 0; i < page_count; i++) { vm_page_t page = i915_gem_wire_page(obj->base.vm_obj, i, NULL); if (page == NULL) continue; /* XXX */ VM_OBJECT_WUNLOCK(obj->base.vm_obj); sf = sf_buf_alloc(page, 0); if (sf != NULL) { dst = (char *)sf_buf_kva(sf); memcpy(dst, vaddr + IDX_TO_OFF(i), PAGE_SIZE); sf_buf_free(sf); } drm_clflush_pages(&page, 1); VM_OBJECT_WLOCK(obj->base.vm_obj); vm_page_reference(page); vm_page_lock(page); vm_page_dirty(page); vm_page_unwire(page, PQ_INACTIVE); vm_page_unlock(page); atomic_add_long(&i915_gem_wired_pages_cnt, -1); } VM_OBJECT_WUNLOCK(obj->base.vm_obj); i915_gem_chipset_flush(dev); obj->phys_obj->cur_obj = NULL; obj->phys_obj = NULL; } int i915_gem_attach_phys_object(struct drm_device *dev, struct drm_i915_gem_object *obj, int id, int align) { drm_i915_private_t *dev_priv = dev->dev_private; struct sf_buf *sf; char *dst, *src; int ret = 0; int page_count; int i; if (id > I915_MAX_PHYS_OBJECT) return -EINVAL; if (obj->phys_obj) { if (obj->phys_obj->id == id) return 0; i915_gem_detach_phys_object(dev, obj); } /* create a new object */ if (!dev_priv->mm.phys_objs[id - 1]) { ret = i915_gem_init_phys_object(dev, id, obj->base.size, align); if (ret) { DRM_ERROR("failed to init phys object %d size: %zu\n", id, obj->base.size); return ret; } } /* bind to the object */ obj->phys_obj = dev_priv->mm.phys_objs[id - 1]; obj->phys_obj->cur_obj = obj; page_count = obj->base.size / PAGE_SIZE; VM_OBJECT_WLOCK(obj->base.vm_obj); for (i = 0; i < page_count; i++) { vm_page_t page = i915_gem_wire_page(obj->base.vm_obj, i, NULL); if (page == NULL) { ret = -EIO; break; } VM_OBJECT_WUNLOCK(obj->base.vm_obj); sf = sf_buf_alloc(page, 0); src = (char *)sf_buf_kva(sf); dst = (char *)obj->phys_obj->handle->vaddr + IDX_TO_OFF(i); memcpy(dst, src, PAGE_SIZE); sf_buf_free(sf); VM_OBJECT_WLOCK(obj->base.vm_obj); vm_page_reference(page); vm_page_lock(page); vm_page_unwire(page, PQ_INACTIVE); vm_page_unlock(page); atomic_add_long(&i915_gem_wired_pages_cnt, -1); } VM_OBJECT_WUNLOCK(obj->base.vm_obj); return ret; } static int i915_gem_phys_pwrite(struct drm_device *dev, struct drm_i915_gem_object *obj, struct drm_i915_gem_pwrite *args, struct drm_file *file_priv) { void *vaddr = (char *)obj->phys_obj->handle->vaddr + args->offset; char __user *user_data = to_user_ptr(args->data_ptr); if (__copy_from_user_inatomic_nocache(vaddr, user_data, args->size)) { unsigned long unwritten; /* The physical object once assigned is fixed for the lifetime * of the obj, so we can safely drop the lock and continue * to access vaddr. */ DRM_UNLOCK(dev); unwritten = copy_from_user(vaddr, user_data, args->size); DRM_LOCK(dev); if (unwritten) return -EFAULT; } i915_gem_chipset_flush(dev); return 0; } void i915_gem_release(struct drm_device *dev, struct drm_file *file) { struct drm_i915_file_private *file_priv = file->driver_priv; /* Clean up our request list when the client is going away, so that * later retire_requests won't dereference our soon-to-be-gone * file_priv. */ mtx_lock(&file_priv->mm.lock); while (!list_empty(&file_priv->mm.request_list)) { struct drm_i915_gem_request *request; request = list_first_entry(&file_priv->mm.request_list, struct drm_i915_gem_request, client_list); list_del(&request->client_list); request->file_priv = NULL; } mtx_unlock(&file_priv->mm.lock); } static void i915_gem_inactive_shrink(void *arg) { struct drm_device *dev = arg; struct drm_i915_private *dev_priv = dev->dev_private; int pass1, pass2; if (!sx_try_xlock(&dev->dev_struct_lock)) { return; } CTR0(KTR_DRM, "gem_lowmem"); pass1 = i915_gem_purge(dev_priv, -1); pass2 = __i915_gem_shrink(dev_priv, -1, false); if (pass2 <= pass1 / 100) i915_gem_shrink_all(dev_priv); DRM_UNLOCK(dev); } static vm_page_t i915_gem_wire_page(vm_object_t object, vm_pindex_t pindex, bool *fresh) { vm_page_t page; int rv; VM_OBJECT_ASSERT_WLOCKED(object); page = vm_page_grab(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY | VM_ALLOC_WIRED); if (page->valid != VM_PAGE_BITS_ALL) { vm_page_xbusy(page); if (vm_pager_has_page(object, pindex, NULL, NULL)) { rv = vm_pager_get_pages(object, &page, 1, NULL, NULL); if (rv != VM_PAGER_OK) { vm_page_lock(page); vm_page_unwire(page, PQ_NONE); vm_page_free(page); vm_page_unlock(page); return (NULL); } if (fresh != NULL) *fresh = true; } else { pmap_zero_page(page); page->valid = VM_PAGE_BITS_ALL; page->dirty = 0; if (fresh != NULL) *fresh = false; } vm_page_xunbusy(page); } else if (fresh != NULL) *fresh = false; atomic_add_long(&i915_gem_wired_pages_cnt, 1); return (page); } Index: head/sys/dev/drm2/i915/intel_pm.c =================================================================== --- head/sys/dev/drm2/i915/intel_pm.c (revision 336913) +++ head/sys/dev/drm2/i915/intel_pm.c (revision 336914) @@ -1,4510 +1,4509 @@ /* * Copyright © 2012 Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. * * Authors: * Eugeni Dodonov * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #define FORCEWAKE_ACK_TIMEOUT_MS 2 /* FBC, or Frame Buffer Compression, is a technique employed to compress the * framebuffer contents in-memory, aiming at reducing the required bandwidth * during in-memory transfers and, therefore, reduce the power packet. * * The benefits of FBC are mostly visible with solid backgrounds and * variation-less patterns. * * FBC-related functionality can be enabled by the means of the * i915.i915_enable_fbc parameter */ static bool intel_crtc_active(struct drm_crtc *crtc) { /* Be paranoid as we can arrive here with only partial * state retrieved from the hardware during setup. */ return to_intel_crtc(crtc)->active && crtc->fb && crtc->mode.clock; } static void i8xx_disable_fbc(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; u32 fbc_ctl; /* Disable compression */ fbc_ctl = I915_READ(FBC_CONTROL); if ((fbc_ctl & FBC_CTL_EN) == 0) return; fbc_ctl &= ~FBC_CTL_EN; I915_WRITE(FBC_CONTROL, fbc_ctl); /* Wait for compressing bit to clear */ if (wait_for((I915_READ(FBC_STATUS) & FBC_STAT_COMPRESSING) == 0, 10)) { DRM_DEBUG_KMS("FBC idle timed out\n"); return; } DRM_DEBUG_KMS("disabled FBC\n"); } static void i8xx_enable_fbc(struct drm_crtc *crtc, unsigned long interval) { struct drm_device *dev = crtc->dev; struct drm_i915_private *dev_priv = dev->dev_private; struct drm_framebuffer *fb = crtc->fb; struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb); struct drm_i915_gem_object *obj = intel_fb->obj; struct intel_crtc *intel_crtc = to_intel_crtc(crtc); int cfb_pitch; int plane, i; u32 fbc_ctl, fbc_ctl2; cfb_pitch = dev_priv->cfb_size / FBC_LL_SIZE; if (fb->pitches[0] < cfb_pitch) cfb_pitch = fb->pitches[0]; /* FBC_CTL wants 64B units */ cfb_pitch = (cfb_pitch / 64) - 1; plane = intel_crtc->plane == 0 ? FBC_CTL_PLANEA : FBC_CTL_PLANEB; /* Clear old tags */ for (i = 0; i < (FBC_LL_SIZE / 32) + 1; i++) I915_WRITE(FBC_TAG + (i * 4), 0); /* Set it up... */ fbc_ctl2 = FBC_CTL_FENCE_DBL | FBC_CTL_IDLE_IMM | FBC_CTL_CPU_FENCE; fbc_ctl2 |= plane; I915_WRITE(FBC_CONTROL2, fbc_ctl2); I915_WRITE(FBC_FENCE_OFF, crtc->y); /* enable it... */ fbc_ctl = FBC_CTL_EN | FBC_CTL_PERIODIC; if (IS_I945GM(dev)) fbc_ctl |= FBC_CTL_C3_IDLE; /* 945 needs special SR handling */ fbc_ctl |= (cfb_pitch & 0xff) << FBC_CTL_STRIDE_SHIFT; fbc_ctl |= (interval & 0x2fff) << FBC_CTL_INTERVAL_SHIFT; fbc_ctl |= obj->fence_reg; I915_WRITE(FBC_CONTROL, fbc_ctl); DRM_DEBUG_KMS("enabled FBC, pitch %d, yoff %d, plane %d, ", cfb_pitch, crtc->y, intel_crtc->plane); } static bool i8xx_fbc_enabled(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; return I915_READ(FBC_CONTROL) & FBC_CTL_EN; } static void g4x_enable_fbc(struct drm_crtc *crtc, unsigned long interval) { struct drm_device *dev = crtc->dev; struct drm_i915_private *dev_priv = dev->dev_private; struct drm_framebuffer *fb = crtc->fb; struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb); struct drm_i915_gem_object *obj = intel_fb->obj; struct intel_crtc *intel_crtc = to_intel_crtc(crtc); int plane = intel_crtc->plane == 0 ? DPFC_CTL_PLANEA : DPFC_CTL_PLANEB; unsigned long stall_watermark = 200; u32 dpfc_ctl; dpfc_ctl = plane | DPFC_SR_EN | DPFC_CTL_LIMIT_1X; dpfc_ctl |= DPFC_CTL_FENCE_EN | obj->fence_reg; I915_WRITE(DPFC_CHICKEN, DPFC_HT_MODIFY); I915_WRITE(DPFC_RECOMP_CTL, DPFC_RECOMP_STALL_EN | (stall_watermark << DPFC_RECOMP_STALL_WM_SHIFT) | (interval << DPFC_RECOMP_TIMER_COUNT_SHIFT)); I915_WRITE(DPFC_FENCE_YOFF, crtc->y); /* enable it... */ I915_WRITE(DPFC_CONTROL, I915_READ(DPFC_CONTROL) | DPFC_CTL_EN); DRM_DEBUG_KMS("enabled fbc on plane %d\n", intel_crtc->plane); } static void g4x_disable_fbc(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; u32 dpfc_ctl; /* Disable compression */ dpfc_ctl = I915_READ(DPFC_CONTROL); if (dpfc_ctl & DPFC_CTL_EN) { dpfc_ctl &= ~DPFC_CTL_EN; I915_WRITE(DPFC_CONTROL, dpfc_ctl); DRM_DEBUG_KMS("disabled FBC\n"); } } static bool g4x_fbc_enabled(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; return I915_READ(DPFC_CONTROL) & DPFC_CTL_EN; } static void sandybridge_blit_fbc_update(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; u32 blt_ecoskpd; /* Make sure blitter notifies FBC of writes */ gen6_gt_force_wake_get(dev_priv); blt_ecoskpd = I915_READ(GEN6_BLITTER_ECOSKPD); blt_ecoskpd |= GEN6_BLITTER_FBC_NOTIFY << GEN6_BLITTER_LOCK_SHIFT; I915_WRITE(GEN6_BLITTER_ECOSKPD, blt_ecoskpd); blt_ecoskpd |= GEN6_BLITTER_FBC_NOTIFY; I915_WRITE(GEN6_BLITTER_ECOSKPD, blt_ecoskpd); blt_ecoskpd &= ~(GEN6_BLITTER_FBC_NOTIFY << GEN6_BLITTER_LOCK_SHIFT); I915_WRITE(GEN6_BLITTER_ECOSKPD, blt_ecoskpd); POSTING_READ(GEN6_BLITTER_ECOSKPD); gen6_gt_force_wake_put(dev_priv); } static void ironlake_enable_fbc(struct drm_crtc *crtc, unsigned long interval) { struct drm_device *dev = crtc->dev; struct drm_i915_private *dev_priv = dev->dev_private; struct drm_framebuffer *fb = crtc->fb; struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb); struct drm_i915_gem_object *obj = intel_fb->obj; struct intel_crtc *intel_crtc = to_intel_crtc(crtc); int plane = intel_crtc->plane == 0 ? DPFC_CTL_PLANEA : DPFC_CTL_PLANEB; unsigned long stall_watermark = 200; u32 dpfc_ctl; dpfc_ctl = I915_READ(ILK_DPFC_CONTROL); dpfc_ctl &= DPFC_RESERVED; dpfc_ctl |= (plane | DPFC_CTL_LIMIT_1X); /* Set persistent mode for front-buffer rendering, ala X. */ dpfc_ctl |= DPFC_CTL_PERSISTENT_MODE; dpfc_ctl |= (DPFC_CTL_FENCE_EN | obj->fence_reg); I915_WRITE(ILK_DPFC_CHICKEN, DPFC_HT_MODIFY); I915_WRITE(ILK_DPFC_RECOMP_CTL, DPFC_RECOMP_STALL_EN | (stall_watermark << DPFC_RECOMP_STALL_WM_SHIFT) | (interval << DPFC_RECOMP_TIMER_COUNT_SHIFT)); I915_WRITE(ILK_DPFC_FENCE_YOFF, crtc->y); I915_WRITE(ILK_FBC_RT_BASE, obj->gtt_offset | ILK_FBC_RT_VALID); /* enable it... */ I915_WRITE(ILK_DPFC_CONTROL, dpfc_ctl | DPFC_CTL_EN); if (IS_GEN6(dev)) { I915_WRITE(SNB_DPFC_CTL_SA, SNB_CPU_FENCE_ENABLE | obj->fence_reg); I915_WRITE(DPFC_CPU_FENCE_OFFSET, crtc->y); sandybridge_blit_fbc_update(dev); } DRM_DEBUG_KMS("enabled fbc on plane %d\n", intel_crtc->plane); } static void ironlake_disable_fbc(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; u32 dpfc_ctl; /* Disable compression */ dpfc_ctl = I915_READ(ILK_DPFC_CONTROL); if (dpfc_ctl & DPFC_CTL_EN) { dpfc_ctl &= ~DPFC_CTL_EN; I915_WRITE(ILK_DPFC_CONTROL, dpfc_ctl); DRM_DEBUG_KMS("disabled FBC\n"); } } static bool ironlake_fbc_enabled(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; return I915_READ(ILK_DPFC_CONTROL) & DPFC_CTL_EN; } bool intel_fbc_enabled(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; if (!dev_priv->display.fbc_enabled) return false; return dev_priv->display.fbc_enabled(dev); } static void intel_fbc_work_fn(void *arg, int pending) { struct intel_fbc_work *work = arg; struct drm_device *dev = work->crtc->dev; struct drm_i915_private *dev_priv = dev->dev_private; DRM_LOCK(dev); if (work == dev_priv->fbc_work) { /* Double check that we haven't switched fb without cancelling * the prior work. */ if (work->crtc->fb == work->fb) { dev_priv->display.enable_fbc(work->crtc, work->interval); dev_priv->cfb_plane = to_intel_crtc(work->crtc)->plane; dev_priv->cfb_fb = work->crtc->fb->base.id; dev_priv->cfb_y = work->crtc->y; } dev_priv->fbc_work = NULL; } DRM_UNLOCK(dev); free(work, DRM_MEM_KMS); } static void intel_cancel_fbc_work(struct drm_i915_private *dev_priv) { if (dev_priv->fbc_work == NULL) return; DRM_DEBUG_KMS("cancelling pending FBC enable\n"); /* Synchronisation is provided by struct_mutex and checking of * dev_priv->fbc_work, so we can perform the cancellation * entirely asynchronously. */ if (taskqueue_cancel_timeout(dev_priv->wq, &dev_priv->fbc_work->work, NULL) == 0) /* tasklet was killed before being run, clean up */ free(dev_priv->fbc_work, DRM_MEM_KMS); /* Mark the work as no longer wanted so that if it does * wake-up (because the work was already running and waiting * for our mutex), it will discover that is no longer * necessary to run. */ dev_priv->fbc_work = NULL; } void intel_enable_fbc(struct drm_crtc *crtc, unsigned long interval) { struct intel_fbc_work *work; struct drm_device *dev = crtc->dev; struct drm_i915_private *dev_priv = dev->dev_private; if (!dev_priv->display.enable_fbc) return; intel_cancel_fbc_work(dev_priv); work = malloc(sizeof *work, DRM_MEM_KMS, M_WAITOK | M_ZERO); if (work == NULL) { dev_priv->display.enable_fbc(crtc, interval); return; } work->crtc = crtc; work->fb = crtc->fb; work->interval = interval; TIMEOUT_TASK_INIT(dev_priv->wq, &work->work, 0, intel_fbc_work_fn, work); dev_priv->fbc_work = work; DRM_DEBUG_KMS("scheduling delayed FBC enable\n"); /* Delay the actual enabling to let pageflipping cease and the * display to settle before starting the compression. Note that * this delay also serves a second purpose: it allows for a * vblank to pass after disabling the FBC before we attempt * to modify the control registers. * * A more complicated solution would involve tracking vblanks * following the termination of the page-flipping sequence * and indeed performing the enable as a co-routine and not * waiting synchronously upon the vblank. */ taskqueue_enqueue_timeout(dev_priv->wq, &work->work, msecs_to_jiffies(50)); } void intel_disable_fbc(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; intel_cancel_fbc_work(dev_priv); if (!dev_priv->display.disable_fbc) return; dev_priv->display.disable_fbc(dev); dev_priv->cfb_plane = -1; } /** * intel_update_fbc - enable/disable FBC as needed * @dev: the drm_device * * Set up the framebuffer compression hardware at mode set time. We * enable it if possible: * - plane A only (on pre-965) * - no pixel mulitply/line duplication * - no alpha buffer discard * - no dual wide * - framebuffer <= 2048 in width, 1536 in height * * We can't assume that any compression will take place (worst case), * so the compressed buffer has to be the same size as the uncompressed * one. It also must reside (along with the line length buffer) in * stolen memory. * * We need to enable/disable FBC on a global basis. */ void intel_update_fbc(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; struct drm_crtc *crtc = NULL, *tmp_crtc; struct intel_crtc *intel_crtc; struct drm_framebuffer *fb; struct intel_framebuffer *intel_fb; struct drm_i915_gem_object *obj; int enable_fbc; if (!i915_powersave) return; if (!I915_HAS_FBC(dev)) return; /* * If FBC is already on, we just have to verify that we can * keep it that way... * Need to disable if: * - more than one pipe is active * - changing FBC params (stride, fence, mode) * - new fb is too large to fit in compressed buffer * - going to an unsupported config (interlace, pixel multiply, etc.) */ list_for_each_entry(tmp_crtc, &dev->mode_config.crtc_list, head) { if (intel_crtc_active(tmp_crtc) && !to_intel_crtc(tmp_crtc)->primary_disabled) { if (crtc) { DRM_DEBUG_KMS("more than one pipe active, disabling compression\n"); dev_priv->no_fbc_reason = FBC_MULTIPLE_PIPES; goto out_disable; } crtc = tmp_crtc; } } if (!crtc || crtc->fb == NULL) { DRM_DEBUG_KMS("no output, disabling\n"); dev_priv->no_fbc_reason = FBC_NO_OUTPUT; goto out_disable; } intel_crtc = to_intel_crtc(crtc); fb = crtc->fb; intel_fb = to_intel_framebuffer(fb); obj = intel_fb->obj; enable_fbc = i915_enable_fbc; if (enable_fbc < 0) { DRM_DEBUG_KMS("fbc set to per-chip default\n"); enable_fbc = 1; if (INTEL_INFO(dev)->gen <= 6) enable_fbc = 0; } if (!enable_fbc) { DRM_DEBUG_KMS("fbc disabled per module param\n"); dev_priv->no_fbc_reason = FBC_MODULE_PARAM; goto out_disable; } if (intel_fb->obj->base.size > dev_priv->cfb_size) { DRM_DEBUG_KMS("framebuffer too large, disabling " "compression\n"); dev_priv->no_fbc_reason = FBC_STOLEN_TOO_SMALL; goto out_disable; } if ((crtc->mode.flags & DRM_MODE_FLAG_INTERLACE) || (crtc->mode.flags & DRM_MODE_FLAG_DBLSCAN)) { DRM_DEBUG_KMS("mode incompatible with compression, " "disabling\n"); dev_priv->no_fbc_reason = FBC_UNSUPPORTED_MODE; goto out_disable; } if ((crtc->mode.hdisplay > 2048) || (crtc->mode.vdisplay > 1536)) { DRM_DEBUG_KMS("mode too large for compression, disabling\n"); dev_priv->no_fbc_reason = FBC_MODE_TOO_LARGE; goto out_disable; } if ((IS_I915GM(dev) || IS_I945GM(dev)) && intel_crtc->plane != 0) { DRM_DEBUG_KMS("plane not 0, disabling compression\n"); dev_priv->no_fbc_reason = FBC_BAD_PLANE; goto out_disable; } /* The use of a CPU fence is mandatory in order to detect writes * by the CPU to the scanout and trigger updates to the FBC. */ if (obj->tiling_mode != I915_TILING_X || obj->fence_reg == I915_FENCE_REG_NONE) { DRM_DEBUG_KMS("framebuffer not tiled or fenced, disabling compression\n"); dev_priv->no_fbc_reason = FBC_NOT_TILED; goto out_disable; } /* If the kernel debugger is active, always disable compression */ if (kdb_active) goto out_disable; /* If the scanout has not changed, don't modify the FBC settings. * Note that we make the fundamental assumption that the fb->obj * cannot be unpinned (and have its GTT offset and fence revoked) * without first being decoupled from the scanout and FBC disabled. */ if (dev_priv->cfb_plane == intel_crtc->plane && dev_priv->cfb_fb == fb->base.id && dev_priv->cfb_y == crtc->y) return; if (intel_fbc_enabled(dev)) { /* We update FBC along two paths, after changing fb/crtc * configuration (modeswitching) and after page-flipping * finishes. For the latter, we know that not only did * we disable the FBC at the start of the page-flip * sequence, but also more than one vblank has passed. * * For the former case of modeswitching, it is possible * to switch between two FBC valid configurations * instantaneously so we do need to disable the FBC * before we can modify its control registers. We also * have to wait for the next vblank for that to take * effect. However, since we delay enabling FBC we can * assume that a vblank has passed since disabling and * that we can safely alter the registers in the deferred * callback. * * In the scenario that we go from a valid to invalid * and then back to valid FBC configuration we have * no strict enforcement that a vblank occurred since * disabling the FBC. However, along all current pipe * disabling paths we do need to wait for a vblank at * some point. And we wait before enabling FBC anyway. */ DRM_DEBUG_KMS("disabling active FBC for update\n"); intel_disable_fbc(dev); } intel_enable_fbc(crtc, 500); return; out_disable: /* Multiple disables should be harmless */ if (intel_fbc_enabled(dev)) { DRM_DEBUG_KMS("unsupported config, disabling FBC\n"); intel_disable_fbc(dev); } } static void i915_pineview_get_mem_freq(struct drm_device *dev) { drm_i915_private_t *dev_priv = dev->dev_private; u32 tmp; tmp = I915_READ(CLKCFG); switch (tmp & CLKCFG_FSB_MASK) { case CLKCFG_FSB_533: dev_priv->fsb_freq = 533; /* 133*4 */ break; case CLKCFG_FSB_800: dev_priv->fsb_freq = 800; /* 200*4 */ break; case CLKCFG_FSB_667: dev_priv->fsb_freq = 667; /* 167*4 */ break; case CLKCFG_FSB_400: dev_priv->fsb_freq = 400; /* 100*4 */ break; } switch (tmp & CLKCFG_MEM_MASK) { case CLKCFG_MEM_533: dev_priv->mem_freq = 533; break; case CLKCFG_MEM_667: dev_priv->mem_freq = 667; break; case CLKCFG_MEM_800: dev_priv->mem_freq = 800; break; } /* detect pineview DDR3 setting */ tmp = I915_READ(CSHRDDR3CTL); dev_priv->is_ddr3 = (tmp & CSHRDDR3CTL_DDR3) ? 1 : 0; } static void i915_ironlake_get_mem_freq(struct drm_device *dev) { drm_i915_private_t *dev_priv = dev->dev_private; u16 ddrpll, csipll; ddrpll = I915_READ16(DDRMPLL1); csipll = I915_READ16(CSIPLL0); switch (ddrpll & 0xff) { case 0xc: dev_priv->mem_freq = 800; break; case 0x10: dev_priv->mem_freq = 1066; break; case 0x14: dev_priv->mem_freq = 1333; break; case 0x18: dev_priv->mem_freq = 1600; break; default: DRM_DEBUG_DRIVER("unknown memory frequency 0x%02x\n", ddrpll & 0xff); dev_priv->mem_freq = 0; break; } dev_priv->ips.r_t = dev_priv->mem_freq; switch (csipll & 0x3ff) { case 0x00c: dev_priv->fsb_freq = 3200; break; case 0x00e: dev_priv->fsb_freq = 3733; break; case 0x010: dev_priv->fsb_freq = 4266; break; case 0x012: dev_priv->fsb_freq = 4800; break; case 0x014: dev_priv->fsb_freq = 5333; break; case 0x016: dev_priv->fsb_freq = 5866; break; case 0x018: dev_priv->fsb_freq = 6400; break; default: DRM_DEBUG_DRIVER("unknown fsb frequency 0x%04x\n", csipll & 0x3ff); dev_priv->fsb_freq = 0; break; } if (dev_priv->fsb_freq == 3200) { dev_priv->ips.c_m = 0; } else if (dev_priv->fsb_freq > 3200 && dev_priv->fsb_freq <= 4800) { dev_priv->ips.c_m = 1; } else { dev_priv->ips.c_m = 2; } } static const struct cxsr_latency cxsr_latency_table[] = { {1, 0, 800, 400, 3382, 33382, 3983, 33983}, /* DDR2-400 SC */ {1, 0, 800, 667, 3354, 33354, 3807, 33807}, /* DDR2-667 SC */ {1, 0, 800, 800, 3347, 33347, 3763, 33763}, /* DDR2-800 SC */ {1, 1, 800, 667, 6420, 36420, 6873, 36873}, /* DDR3-667 SC */ {1, 1, 800, 800, 5902, 35902, 6318, 36318}, /* DDR3-800 SC */ {1, 0, 667, 400, 3400, 33400, 4021, 34021}, /* DDR2-400 SC */ {1, 0, 667, 667, 3372, 33372, 3845, 33845}, /* DDR2-667 SC */ {1, 0, 667, 800, 3386, 33386, 3822, 33822}, /* DDR2-800 SC */ {1, 1, 667, 667, 6438, 36438, 6911, 36911}, /* DDR3-667 SC */ {1, 1, 667, 800, 5941, 35941, 6377, 36377}, /* DDR3-800 SC */ {1, 0, 400, 400, 3472, 33472, 4173, 34173}, /* DDR2-400 SC */ {1, 0, 400, 667, 3443, 33443, 3996, 33996}, /* DDR2-667 SC */ {1, 0, 400, 800, 3430, 33430, 3946, 33946}, /* DDR2-800 SC */ {1, 1, 400, 667, 6509, 36509, 7062, 37062}, /* DDR3-667 SC */ {1, 1, 400, 800, 5985, 35985, 6501, 36501}, /* DDR3-800 SC */ {0, 0, 800, 400, 3438, 33438, 4065, 34065}, /* DDR2-400 SC */ {0, 0, 800, 667, 3410, 33410, 3889, 33889}, /* DDR2-667 SC */ {0, 0, 800, 800, 3403, 33403, 3845, 33845}, /* DDR2-800 SC */ {0, 1, 800, 667, 6476, 36476, 6955, 36955}, /* DDR3-667 SC */ {0, 1, 800, 800, 5958, 35958, 6400, 36400}, /* DDR3-800 SC */ {0, 0, 667, 400, 3456, 33456, 4103, 34106}, /* DDR2-400 SC */ {0, 0, 667, 667, 3428, 33428, 3927, 33927}, /* DDR2-667 SC */ {0, 0, 667, 800, 3443, 33443, 3905, 33905}, /* DDR2-800 SC */ {0, 1, 667, 667, 6494, 36494, 6993, 36993}, /* DDR3-667 SC */ {0, 1, 667, 800, 5998, 35998, 6460, 36460}, /* DDR3-800 SC */ {0, 0, 400, 400, 3528, 33528, 4255, 34255}, /* DDR2-400 SC */ {0, 0, 400, 667, 3500, 33500, 4079, 34079}, /* DDR2-667 SC */ {0, 0, 400, 800, 3487, 33487, 4029, 34029}, /* DDR2-800 SC */ {0, 1, 400, 667, 6566, 36566, 7145, 37145}, /* DDR3-667 SC */ {0, 1, 400, 800, 6042, 36042, 6584, 36584}, /* DDR3-800 SC */ }; static const struct cxsr_latency *intel_get_cxsr_latency(int is_desktop, int is_ddr3, int fsb, int mem) { const struct cxsr_latency *latency; int i; if (fsb == 0 || mem == 0) return NULL; for (i = 0; i < ARRAY_SIZE(cxsr_latency_table); i++) { latency = &cxsr_latency_table[i]; if (is_desktop == latency->is_desktop && is_ddr3 == latency->is_ddr3 && fsb == latency->fsb_freq && mem == latency->mem_freq) return latency; } DRM_DEBUG_KMS("Unknown FSB/MEM found, disable CxSR\n"); return NULL; } static void pineview_disable_cxsr(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; /* deactivate cxsr */ I915_WRITE(DSPFW3, I915_READ(DSPFW3) & ~PINEVIEW_SELF_REFRESH_EN); } /* * Latency for FIFO fetches is dependent on several factors: * - memory configuration (speed, channels) * - chipset * - current MCH state * It can be fairly high in some situations, so here we assume a fairly * pessimal value. It's a tradeoff between extra memory fetches (if we * set this value too high, the FIFO will fetch frequently to stay full) * and power consumption (set it too low to save power and we might see * FIFO underruns and display "flicker"). * * A value of 5us seems to be a good balance; safe for very low end * platforms but not overly aggressive on lower latency configs. */ static const int latency_ns = 5000; static int i9xx_get_fifo_size(struct drm_device *dev, int plane) { struct drm_i915_private *dev_priv = dev->dev_private; uint32_t dsparb = I915_READ(DSPARB); int size; size = dsparb & 0x7f; if (plane) size = ((dsparb >> DSPARB_CSTART_SHIFT) & 0x7f) - size; DRM_DEBUG_KMS("FIFO size - (0x%08x) %s: %d\n", dsparb, plane ? "B" : "A", size); return size; } static int i85x_get_fifo_size(struct drm_device *dev, int plane) { struct drm_i915_private *dev_priv = dev->dev_private; uint32_t dsparb = I915_READ(DSPARB); int size; size = dsparb & 0x1ff; if (plane) size = ((dsparb >> DSPARB_BEND_SHIFT) & 0x1ff) - size; size >>= 1; /* Convert to cachelines */ DRM_DEBUG_KMS("FIFO size - (0x%08x) %s: %d\n", dsparb, plane ? "B" : "A", size); return size; } static int i845_get_fifo_size(struct drm_device *dev, int plane) { struct drm_i915_private *dev_priv = dev->dev_private; uint32_t dsparb = I915_READ(DSPARB); int size; size = dsparb & 0x7f; size >>= 2; /* Convert to cachelines */ DRM_DEBUG_KMS("FIFO size - (0x%08x) %s: %d\n", dsparb, plane ? "B" : "A", size); return size; } static int i830_get_fifo_size(struct drm_device *dev, int plane) { struct drm_i915_private *dev_priv = dev->dev_private; uint32_t dsparb = I915_READ(DSPARB); int size; size = dsparb & 0x7f; size >>= 1; /* Convert to cachelines */ DRM_DEBUG_KMS("FIFO size - (0x%08x) %s: %d\n", dsparb, plane ? "B" : "A", size); return size; } /* Pineview has different values for various configs */ static const struct intel_watermark_params pineview_display_wm = { PINEVIEW_DISPLAY_FIFO, PINEVIEW_MAX_WM, PINEVIEW_DFT_WM, PINEVIEW_GUARD_WM, PINEVIEW_FIFO_LINE_SIZE }; static const struct intel_watermark_params pineview_display_hplloff_wm = { PINEVIEW_DISPLAY_FIFO, PINEVIEW_MAX_WM, PINEVIEW_DFT_HPLLOFF_WM, PINEVIEW_GUARD_WM, PINEVIEW_FIFO_LINE_SIZE }; static const struct intel_watermark_params pineview_cursor_wm = { PINEVIEW_CURSOR_FIFO, PINEVIEW_CURSOR_MAX_WM, PINEVIEW_CURSOR_DFT_WM, PINEVIEW_CURSOR_GUARD_WM, PINEVIEW_FIFO_LINE_SIZE, }; static const struct intel_watermark_params pineview_cursor_hplloff_wm = { PINEVIEW_CURSOR_FIFO, PINEVIEW_CURSOR_MAX_WM, PINEVIEW_CURSOR_DFT_WM, PINEVIEW_CURSOR_GUARD_WM, PINEVIEW_FIFO_LINE_SIZE }; static const struct intel_watermark_params g4x_wm_info = { G4X_FIFO_SIZE, G4X_MAX_WM, G4X_MAX_WM, 2, G4X_FIFO_LINE_SIZE, }; static const struct intel_watermark_params g4x_cursor_wm_info = { I965_CURSOR_FIFO, I965_CURSOR_MAX_WM, I965_CURSOR_DFT_WM, 2, G4X_FIFO_LINE_SIZE, }; static const struct intel_watermark_params valleyview_wm_info = { VALLEYVIEW_FIFO_SIZE, VALLEYVIEW_MAX_WM, VALLEYVIEW_MAX_WM, 2, G4X_FIFO_LINE_SIZE, }; static const struct intel_watermark_params valleyview_cursor_wm_info = { I965_CURSOR_FIFO, VALLEYVIEW_CURSOR_MAX_WM, I965_CURSOR_DFT_WM, 2, G4X_FIFO_LINE_SIZE, }; static const struct intel_watermark_params i965_cursor_wm_info = { I965_CURSOR_FIFO, I965_CURSOR_MAX_WM, I965_CURSOR_DFT_WM, 2, I915_FIFO_LINE_SIZE, }; static const struct intel_watermark_params i945_wm_info = { I945_FIFO_SIZE, I915_MAX_WM, 1, 2, I915_FIFO_LINE_SIZE }; static const struct intel_watermark_params i915_wm_info = { I915_FIFO_SIZE, I915_MAX_WM, 1, 2, I915_FIFO_LINE_SIZE }; static const struct intel_watermark_params i855_wm_info = { I855GM_FIFO_SIZE, I915_MAX_WM, 1, 2, I830_FIFO_LINE_SIZE }; static const struct intel_watermark_params i830_wm_info = { I830_FIFO_SIZE, I915_MAX_WM, 1, 2, I830_FIFO_LINE_SIZE }; static const struct intel_watermark_params ironlake_display_wm_info = { ILK_DISPLAY_FIFO, ILK_DISPLAY_MAXWM, ILK_DISPLAY_DFTWM, 2, ILK_FIFO_LINE_SIZE }; static const struct intel_watermark_params ironlake_cursor_wm_info = { ILK_CURSOR_FIFO, ILK_CURSOR_MAXWM, ILK_CURSOR_DFTWM, 2, ILK_FIFO_LINE_SIZE }; static const struct intel_watermark_params ironlake_display_srwm_info = { ILK_DISPLAY_SR_FIFO, ILK_DISPLAY_MAX_SRWM, ILK_DISPLAY_DFT_SRWM, 2, ILK_FIFO_LINE_SIZE }; static const struct intel_watermark_params ironlake_cursor_srwm_info = { ILK_CURSOR_SR_FIFO, ILK_CURSOR_MAX_SRWM, ILK_CURSOR_DFT_SRWM, 2, ILK_FIFO_LINE_SIZE }; static const struct intel_watermark_params sandybridge_display_wm_info = { SNB_DISPLAY_FIFO, SNB_DISPLAY_MAXWM, SNB_DISPLAY_DFTWM, 2, SNB_FIFO_LINE_SIZE }; static const struct intel_watermark_params sandybridge_cursor_wm_info = { SNB_CURSOR_FIFO, SNB_CURSOR_MAXWM, SNB_CURSOR_DFTWM, 2, SNB_FIFO_LINE_SIZE }; static const struct intel_watermark_params sandybridge_display_srwm_info = { SNB_DISPLAY_SR_FIFO, SNB_DISPLAY_MAX_SRWM, SNB_DISPLAY_DFT_SRWM, 2, SNB_FIFO_LINE_SIZE }; static const struct intel_watermark_params sandybridge_cursor_srwm_info = { SNB_CURSOR_SR_FIFO, SNB_CURSOR_MAX_SRWM, SNB_CURSOR_DFT_SRWM, 2, SNB_FIFO_LINE_SIZE }; /** * intel_calculate_wm - calculate watermark level * @clock_in_khz: pixel clock * @wm: chip FIFO params * @pixel_size: display pixel size * @latency_ns: memory latency for the platform * * Calculate the watermark level (the level at which the display plane will * start fetching from memory again). Each chip has a different display * FIFO size and allocation, so the caller needs to figure that out and pass * in the correct intel_watermark_params structure. * * As the pixel clock runs, the FIFO will be drained at a rate that depends * on the pixel size. When it reaches the watermark level, it'll start * fetching FIFO line sized based chunks from memory until the FIFO fills * past the watermark point. If the FIFO drains completely, a FIFO underrun * will occur, and a display engine hang could result. */ static unsigned long intel_calculate_wm(unsigned long clock_in_khz, const struct intel_watermark_params *wm, int fifo_size, int pixel_size, unsigned long latency_ns) { long entries_required, wm_size; /* * Note: we need to make sure we don't overflow for various clock & * latency values. * clocks go from a few thousand to several hundred thousand. * latency is usually a few thousand */ entries_required = ((clock_in_khz / 1000) * pixel_size * latency_ns) / 1000; entries_required = DIV_ROUND_UP(entries_required, wm->cacheline_size); DRM_DEBUG_KMS("FIFO entries required for mode: %ld\n", entries_required); wm_size = fifo_size - (entries_required + wm->guard_size); DRM_DEBUG_KMS("FIFO watermark level: %ld\n", wm_size); /* Don't promote wm_size to unsigned... */ if (wm_size > (long)wm->max_wm) wm_size = wm->max_wm; if (wm_size <= 0) wm_size = wm->default_wm; return wm_size; } static struct drm_crtc *single_enabled_crtc(struct drm_device *dev) { struct drm_crtc *crtc, *enabled = NULL; list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { if (intel_crtc_active(crtc)) { if (enabled) return NULL; enabled = crtc; } } return enabled; } static void pineview_update_wm(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; struct drm_crtc *crtc; const struct cxsr_latency *latency; u32 reg; unsigned long wm; latency = intel_get_cxsr_latency(IS_PINEVIEW_G(dev), dev_priv->is_ddr3, dev_priv->fsb_freq, dev_priv->mem_freq); if (!latency) { DRM_DEBUG_KMS("Unknown FSB/MEM found, disable CxSR\n"); pineview_disable_cxsr(dev); return; } crtc = single_enabled_crtc(dev); if (crtc) { int clock = crtc->mode.clock; int pixel_size = crtc->fb->bits_per_pixel / 8; /* Display SR */ wm = intel_calculate_wm(clock, &pineview_display_wm, pineview_display_wm.fifo_size, pixel_size, latency->display_sr); reg = I915_READ(DSPFW1); reg &= ~DSPFW_SR_MASK; reg |= wm << DSPFW_SR_SHIFT; I915_WRITE(DSPFW1, reg); DRM_DEBUG_KMS("DSPFW1 register is %x\n", reg); /* cursor SR */ wm = intel_calculate_wm(clock, &pineview_cursor_wm, pineview_display_wm.fifo_size, pixel_size, latency->cursor_sr); reg = I915_READ(DSPFW3); reg &= ~DSPFW_CURSOR_SR_MASK; reg |= (wm & 0x3f) << DSPFW_CURSOR_SR_SHIFT; I915_WRITE(DSPFW3, reg); /* Display HPLL off SR */ wm = intel_calculate_wm(clock, &pineview_display_hplloff_wm, pineview_display_hplloff_wm.fifo_size, pixel_size, latency->display_hpll_disable); reg = I915_READ(DSPFW3); reg &= ~DSPFW_HPLL_SR_MASK; reg |= wm & DSPFW_HPLL_SR_MASK; I915_WRITE(DSPFW3, reg); /* cursor HPLL off SR */ wm = intel_calculate_wm(clock, &pineview_cursor_hplloff_wm, pineview_display_hplloff_wm.fifo_size, pixel_size, latency->cursor_hpll_disable); reg = I915_READ(DSPFW3); reg &= ~DSPFW_HPLL_CURSOR_MASK; reg |= (wm & 0x3f) << DSPFW_HPLL_CURSOR_SHIFT; I915_WRITE(DSPFW3, reg); DRM_DEBUG_KMS("DSPFW3 register is %x\n", reg); /* activate cxsr */ I915_WRITE(DSPFW3, I915_READ(DSPFW3) | PINEVIEW_SELF_REFRESH_EN); DRM_DEBUG_KMS("Self-refresh is enabled\n"); } else { pineview_disable_cxsr(dev); DRM_DEBUG_KMS("Self-refresh is disabled\n"); } } static bool g4x_compute_wm0(struct drm_device *dev, int plane, const struct intel_watermark_params *display, int display_latency_ns, const struct intel_watermark_params *cursor, int cursor_latency_ns, int *plane_wm, int *cursor_wm) { struct drm_crtc *crtc; int htotal, hdisplay, clock, pixel_size; int line_time_us, line_count; int entries, tlb_miss; crtc = intel_get_crtc_for_plane(dev, plane); if (!intel_crtc_active(crtc)) { *cursor_wm = cursor->guard_size; *plane_wm = display->guard_size; return false; } htotal = crtc->mode.htotal; hdisplay = crtc->mode.hdisplay; clock = crtc->mode.clock; pixel_size = crtc->fb->bits_per_pixel / 8; /* Use the small buffer method to calculate plane watermark */ entries = ((clock * pixel_size / 1000) * display_latency_ns) / 1000; tlb_miss = display->fifo_size*display->cacheline_size - hdisplay * 8; if (tlb_miss > 0) entries += tlb_miss; entries = DIV_ROUND_UP(entries, display->cacheline_size); *plane_wm = entries + display->guard_size; if (*plane_wm > (int)display->max_wm) *plane_wm = display->max_wm; /* Use the large buffer method to calculate cursor watermark */ line_time_us = ((htotal * 1000) / clock); line_count = (cursor_latency_ns / line_time_us + 1000) / 1000; entries = line_count * 64 * pixel_size; tlb_miss = cursor->fifo_size*cursor->cacheline_size - hdisplay * 8; if (tlb_miss > 0) entries += tlb_miss; entries = DIV_ROUND_UP(entries, cursor->cacheline_size); *cursor_wm = entries + cursor->guard_size; if (*cursor_wm > (int)cursor->max_wm) *cursor_wm = (int)cursor->max_wm; return true; } /* * Check the wm result. * * If any calculated watermark values is larger than the maximum value that * can be programmed into the associated watermark register, that watermark * must be disabled. */ static bool g4x_check_srwm(struct drm_device *dev, int display_wm, int cursor_wm, const struct intel_watermark_params *display, const struct intel_watermark_params *cursor) { DRM_DEBUG_KMS("SR watermark: display plane %d, cursor %d\n", display_wm, cursor_wm); if (display_wm > display->max_wm) { DRM_DEBUG_KMS("display watermark is too large(%d/%ld), disabling\n", display_wm, display->max_wm); return false; } if (cursor_wm > cursor->max_wm) { DRM_DEBUG_KMS("cursor watermark is too large(%d/%ld), disabling\n", cursor_wm, cursor->max_wm); return false; } if (!(display_wm || cursor_wm)) { DRM_DEBUG_KMS("SR latency is 0, disabling\n"); return false; } return true; } static bool g4x_compute_srwm(struct drm_device *dev, int plane, int latency_ns, const struct intel_watermark_params *display, const struct intel_watermark_params *cursor, int *display_wm, int *cursor_wm) { struct drm_crtc *crtc; int hdisplay, htotal, pixel_size, clock; unsigned long line_time_us; int line_count, line_size; int small, large; int entries; if (!latency_ns) { *display_wm = *cursor_wm = 0; return false; } crtc = intel_get_crtc_for_plane(dev, plane); hdisplay = crtc->mode.hdisplay; htotal = crtc->mode.htotal; clock = crtc->mode.clock; pixel_size = crtc->fb->bits_per_pixel / 8; line_time_us = (htotal * 1000) / clock; line_count = (latency_ns / line_time_us + 1000) / 1000; line_size = hdisplay * pixel_size; /* Use the minimum of the small and large buffer method for primary */ small = ((clock * pixel_size / 1000) * latency_ns) / 1000; large = line_count * line_size; entries = DIV_ROUND_UP(min(small, large), display->cacheline_size); *display_wm = entries + display->guard_size; /* calculate the self-refresh watermark for display cursor */ entries = line_count * pixel_size * 64; entries = DIV_ROUND_UP(entries, cursor->cacheline_size); *cursor_wm = entries + cursor->guard_size; return g4x_check_srwm(dev, *display_wm, *cursor_wm, display, cursor); } static bool vlv_compute_drain_latency(struct drm_device *dev, int plane, int *plane_prec_mult, int *plane_dl, int *cursor_prec_mult, int *cursor_dl) { struct drm_crtc *crtc; int clock, pixel_size; int entries; crtc = intel_get_crtc_for_plane(dev, plane); if (!intel_crtc_active(crtc)) return false; clock = crtc->mode.clock; /* VESA DOT Clock */ pixel_size = crtc->fb->bits_per_pixel / 8; /* BPP */ entries = (clock / 1000) * pixel_size; *plane_prec_mult = (entries > 256) ? DRAIN_LATENCY_PRECISION_32 : DRAIN_LATENCY_PRECISION_16; *plane_dl = (64 * (*plane_prec_mult) * 4) / ((clock / 1000) * pixel_size); entries = (clock / 1000) * 4; /* BPP is always 4 for cursor */ *cursor_prec_mult = (entries > 256) ? DRAIN_LATENCY_PRECISION_32 : DRAIN_LATENCY_PRECISION_16; *cursor_dl = (64 * (*cursor_prec_mult) * 4) / ((clock / 1000) * 4); return true; } /* * Update drain latency registers of memory arbiter * * Valleyview SoC has a new memory arbiter and needs drain latency registers * to be programmed. Each plane has a drain latency multiplier and a drain * latency value. */ static void vlv_update_drain_latency(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; int planea_prec, planea_dl, planeb_prec, planeb_dl; int cursora_prec, cursora_dl, cursorb_prec, cursorb_dl; int plane_prec_mult, cursor_prec_mult; /* Precision multiplier is either 16 or 32 */ /* For plane A, Cursor A */ if (vlv_compute_drain_latency(dev, 0, &plane_prec_mult, &planea_dl, &cursor_prec_mult, &cursora_dl)) { cursora_prec = (cursor_prec_mult == DRAIN_LATENCY_PRECISION_32) ? DDL_CURSORA_PRECISION_32 : DDL_CURSORA_PRECISION_16; planea_prec = (plane_prec_mult == DRAIN_LATENCY_PRECISION_32) ? DDL_PLANEA_PRECISION_32 : DDL_PLANEA_PRECISION_16; I915_WRITE(VLV_DDL1, cursora_prec | (cursora_dl << DDL_CURSORA_SHIFT) | planea_prec | planea_dl); } /* For plane B, Cursor B */ if (vlv_compute_drain_latency(dev, 1, &plane_prec_mult, &planeb_dl, &cursor_prec_mult, &cursorb_dl)) { cursorb_prec = (cursor_prec_mult == DRAIN_LATENCY_PRECISION_32) ? DDL_CURSORB_PRECISION_32 : DDL_CURSORB_PRECISION_16; planeb_prec = (plane_prec_mult == DRAIN_LATENCY_PRECISION_32) ? DDL_PLANEB_PRECISION_32 : DDL_PLANEB_PRECISION_16; I915_WRITE(VLV_DDL2, cursorb_prec | (cursorb_dl << DDL_CURSORB_SHIFT) | planeb_prec | planeb_dl); } } #define single_plane_enabled(mask) ((mask) != 0 && powerof2(mask)) static void valleyview_update_wm(struct drm_device *dev) { static const int sr_latency_ns = 12000; struct drm_i915_private *dev_priv = dev->dev_private; int planea_wm, planeb_wm, cursora_wm, cursorb_wm; int plane_sr, cursor_sr; int ignore_plane_sr, ignore_cursor_sr; unsigned int enabled = 0; vlv_update_drain_latency(dev); if (g4x_compute_wm0(dev, 0, &valleyview_wm_info, latency_ns, &valleyview_cursor_wm_info, latency_ns, &planea_wm, &cursora_wm)) enabled |= 1; if (g4x_compute_wm0(dev, 1, &valleyview_wm_info, latency_ns, &valleyview_cursor_wm_info, latency_ns, &planeb_wm, &cursorb_wm)) enabled |= 2; if (single_plane_enabled(enabled) && g4x_compute_srwm(dev, ffs(enabled) - 1, sr_latency_ns, &valleyview_wm_info, &valleyview_cursor_wm_info, &plane_sr, &ignore_cursor_sr) && g4x_compute_srwm(dev, ffs(enabled) - 1, 2*sr_latency_ns, &valleyview_wm_info, &valleyview_cursor_wm_info, &ignore_plane_sr, &cursor_sr)) { I915_WRITE(FW_BLC_SELF_VLV, FW_CSPWRDWNEN); } else { I915_WRITE(FW_BLC_SELF_VLV, I915_READ(FW_BLC_SELF_VLV) & ~FW_CSPWRDWNEN); plane_sr = cursor_sr = 0; } DRM_DEBUG_KMS("Setting FIFO watermarks - A: plane=%d, cursor=%d, B: plane=%d, cursor=%d, SR: plane=%d, cursor=%d\n", planea_wm, cursora_wm, planeb_wm, cursorb_wm, plane_sr, cursor_sr); I915_WRITE(DSPFW1, (plane_sr << DSPFW_SR_SHIFT) | (cursorb_wm << DSPFW_CURSORB_SHIFT) | (planeb_wm << DSPFW_PLANEB_SHIFT) | planea_wm); I915_WRITE(DSPFW2, (I915_READ(DSPFW2) & ~DSPFW_CURSORA_MASK) | (cursora_wm << DSPFW_CURSORA_SHIFT)); I915_WRITE(DSPFW3, (I915_READ(DSPFW3) & ~DSPFW_CURSOR_SR_MASK) | (cursor_sr << DSPFW_CURSOR_SR_SHIFT)); } static void g4x_update_wm(struct drm_device *dev) { static const int sr_latency_ns = 12000; struct drm_i915_private *dev_priv = dev->dev_private; int planea_wm, planeb_wm, cursora_wm, cursorb_wm; int plane_sr, cursor_sr; unsigned int enabled = 0; if (g4x_compute_wm0(dev, 0, &g4x_wm_info, latency_ns, &g4x_cursor_wm_info, latency_ns, &planea_wm, &cursora_wm)) enabled |= 1; if (g4x_compute_wm0(dev, 1, &g4x_wm_info, latency_ns, &g4x_cursor_wm_info, latency_ns, &planeb_wm, &cursorb_wm)) enabled |= 2; if (single_plane_enabled(enabled) && g4x_compute_srwm(dev, ffs(enabled) - 1, sr_latency_ns, &g4x_wm_info, &g4x_cursor_wm_info, &plane_sr, &cursor_sr)) { I915_WRITE(FW_BLC_SELF, FW_BLC_SELF_EN); } else { I915_WRITE(FW_BLC_SELF, I915_READ(FW_BLC_SELF) & ~FW_BLC_SELF_EN); plane_sr = cursor_sr = 0; } DRM_DEBUG_KMS("Setting FIFO watermarks - A: plane=%d, cursor=%d, B: plane=%d, cursor=%d, SR: plane=%d, cursor=%d\n", planea_wm, cursora_wm, planeb_wm, cursorb_wm, plane_sr, cursor_sr); I915_WRITE(DSPFW1, (plane_sr << DSPFW_SR_SHIFT) | (cursorb_wm << DSPFW_CURSORB_SHIFT) | (planeb_wm << DSPFW_PLANEB_SHIFT) | planea_wm); I915_WRITE(DSPFW2, (I915_READ(DSPFW2) & ~DSPFW_CURSORA_MASK) | (cursora_wm << DSPFW_CURSORA_SHIFT)); /* HPLL off in SR has some issues on G4x... disable it */ I915_WRITE(DSPFW3, (I915_READ(DSPFW3) & ~(DSPFW_HPLL_SR_EN | DSPFW_CURSOR_SR_MASK)) | (cursor_sr << DSPFW_CURSOR_SR_SHIFT)); } static void i965_update_wm(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; struct drm_crtc *crtc; int srwm = 1; int cursor_sr = 16; /* Calc sr entries for one plane configs */ crtc = single_enabled_crtc(dev); if (crtc) { /* self-refresh has much higher latency */ static const int sr_latency_ns = 12000; int clock = crtc->mode.clock; int htotal = crtc->mode.htotal; int hdisplay = crtc->mode.hdisplay; int pixel_size = crtc->fb->bits_per_pixel / 8; unsigned long line_time_us; int entries; line_time_us = ((htotal * 1000) / clock); /* Use ns/us then divide to preserve precision */ entries = (((sr_latency_ns / line_time_us) + 1000) / 1000) * pixel_size * hdisplay; entries = DIV_ROUND_UP(entries, I915_FIFO_LINE_SIZE); srwm = I965_FIFO_SIZE - entries; if (srwm < 0) srwm = 1; srwm &= 0x1ff; DRM_DEBUG_KMS("self-refresh entries: %d, wm: %d\n", entries, srwm); entries = (((sr_latency_ns / line_time_us) + 1000) / 1000) * pixel_size * 64; entries = DIV_ROUND_UP(entries, i965_cursor_wm_info.cacheline_size); cursor_sr = i965_cursor_wm_info.fifo_size - (entries + i965_cursor_wm_info.guard_size); if (cursor_sr > i965_cursor_wm_info.max_wm) cursor_sr = i965_cursor_wm_info.max_wm; DRM_DEBUG_KMS("self-refresh watermark: display plane %d " "cursor %d\n", srwm, cursor_sr); if (IS_CRESTLINE(dev)) I915_WRITE(FW_BLC_SELF, FW_BLC_SELF_EN); } else { /* Turn off self refresh if both pipes are enabled */ if (IS_CRESTLINE(dev)) I915_WRITE(FW_BLC_SELF, I915_READ(FW_BLC_SELF) & ~FW_BLC_SELF_EN); } DRM_DEBUG_KMS("Setting FIFO watermarks - A: 8, B: 8, C: 8, SR %d\n", srwm); /* 965 has limitations... */ I915_WRITE(DSPFW1, (srwm << DSPFW_SR_SHIFT) | (8 << 16) | (8 << 8) | (8 << 0)); I915_WRITE(DSPFW2, (8 << 8) | (8 << 0)); /* update cursor SR watermark */ I915_WRITE(DSPFW3, (cursor_sr << DSPFW_CURSOR_SR_SHIFT)); } static void i9xx_update_wm(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; const struct intel_watermark_params *wm_info; uint32_t fwater_lo; uint32_t fwater_hi; int cwm, srwm = 1; int fifo_size; int planea_wm, planeb_wm; struct drm_crtc *crtc, *enabled = NULL; if (IS_I945GM(dev)) wm_info = &i945_wm_info; else if (!IS_GEN2(dev)) wm_info = &i915_wm_info; else wm_info = &i855_wm_info; fifo_size = dev_priv->display.get_fifo_size(dev, 0); crtc = intel_get_crtc_for_plane(dev, 0); if (intel_crtc_active(crtc)) { int cpp = crtc->fb->bits_per_pixel / 8; if (IS_GEN2(dev)) cpp = 4; planea_wm = intel_calculate_wm(crtc->mode.clock, wm_info, fifo_size, cpp, latency_ns); enabled = crtc; } else planea_wm = fifo_size - wm_info->guard_size; fifo_size = dev_priv->display.get_fifo_size(dev, 1); crtc = intel_get_crtc_for_plane(dev, 1); if (intel_crtc_active(crtc)) { int cpp = crtc->fb->bits_per_pixel / 8; if (IS_GEN2(dev)) cpp = 4; planeb_wm = intel_calculate_wm(crtc->mode.clock, wm_info, fifo_size, cpp, latency_ns); if (enabled == NULL) enabled = crtc; else enabled = NULL; } else planeb_wm = fifo_size - wm_info->guard_size; DRM_DEBUG_KMS("FIFO watermarks - A: %d, B: %d\n", planea_wm, planeb_wm); /* * Overlay gets an aggressive default since video jitter is bad. */ cwm = 2; /* Play safe and disable self-refresh before adjusting watermarks. */ if (IS_I945G(dev) || IS_I945GM(dev)) I915_WRITE(FW_BLC_SELF, FW_BLC_SELF_EN_MASK | 0); else if (IS_I915GM(dev)) I915_WRITE(INSTPM, I915_READ(INSTPM) & ~INSTPM_SELF_EN); /* Calc sr entries for one plane configs */ if (HAS_FW_BLC(dev) && enabled) { /* self-refresh has much higher latency */ static const int sr_latency_ns = 6000; int clock = enabled->mode.clock; int htotal = enabled->mode.htotal; int hdisplay = enabled->mode.hdisplay; int pixel_size = enabled->fb->bits_per_pixel / 8; unsigned long line_time_us; int entries; line_time_us = (htotal * 1000) / clock; /* Use ns/us then divide to preserve precision */ entries = (((sr_latency_ns / line_time_us) + 1000) / 1000) * pixel_size * hdisplay; entries = DIV_ROUND_UP(entries, wm_info->cacheline_size); DRM_DEBUG_KMS("self-refresh entries: %d\n", entries); srwm = wm_info->fifo_size - entries; if (srwm < 0) srwm = 1; if (IS_I945G(dev) || IS_I945GM(dev)) I915_WRITE(FW_BLC_SELF, FW_BLC_SELF_FIFO_MASK | (srwm & 0xff)); else if (IS_I915GM(dev)) I915_WRITE(FW_BLC_SELF, srwm & 0x3f); } DRM_DEBUG_KMS("Setting FIFO watermarks - A: %d, B: %d, C: %d, SR %d\n", planea_wm, planeb_wm, cwm, srwm); fwater_lo = ((planeb_wm & 0x3f) << 16) | (planea_wm & 0x3f); fwater_hi = (cwm & 0x1f); /* Set request length to 8 cachelines per fetch */ fwater_lo = fwater_lo | (1 << 24) | (1 << 8); fwater_hi = fwater_hi | (1 << 8); I915_WRITE(FW_BLC, fwater_lo); I915_WRITE(FW_BLC2, fwater_hi); if (HAS_FW_BLC(dev)) { if (enabled) { if (IS_I945G(dev) || IS_I945GM(dev)) I915_WRITE(FW_BLC_SELF, FW_BLC_SELF_EN_MASK | FW_BLC_SELF_EN); else if (IS_I915GM(dev)) I915_WRITE(INSTPM, I915_READ(INSTPM) | INSTPM_SELF_EN); DRM_DEBUG_KMS("memory self refresh enabled\n"); } else DRM_DEBUG_KMS("memory self refresh disabled\n"); } } static void i830_update_wm(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; struct drm_crtc *crtc; uint32_t fwater_lo; int planea_wm; crtc = single_enabled_crtc(dev); if (crtc == NULL) return; planea_wm = intel_calculate_wm(crtc->mode.clock, &i830_wm_info, dev_priv->display.get_fifo_size(dev, 0), 4, latency_ns); fwater_lo = I915_READ(FW_BLC) & ~0xfff; fwater_lo |= (3<<8) | planea_wm; DRM_DEBUG_KMS("Setting FIFO watermarks - A: %d\n", planea_wm); I915_WRITE(FW_BLC, fwater_lo); } #define ILK_LP0_PLANE_LATENCY 700 #define ILK_LP0_CURSOR_LATENCY 1300 /* * Check the wm result. * * If any calculated watermark values is larger than the maximum value that * can be programmed into the associated watermark register, that watermark * must be disabled. */ static bool ironlake_check_srwm(struct drm_device *dev, int level, int fbc_wm, int display_wm, int cursor_wm, const struct intel_watermark_params *display, const struct intel_watermark_params *cursor) { struct drm_i915_private *dev_priv = dev->dev_private; DRM_DEBUG_KMS("watermark %d: display plane %d, fbc lines %d," " cursor %d\n", level, display_wm, fbc_wm, cursor_wm); if (fbc_wm > SNB_FBC_MAX_SRWM) { DRM_DEBUG_KMS("fbc watermark(%d) is too large(%d), disabling wm%d+\n", fbc_wm, SNB_FBC_MAX_SRWM, level); /* fbc has it's own way to disable FBC WM */ I915_WRITE(DISP_ARB_CTL, I915_READ(DISP_ARB_CTL) | DISP_FBC_WM_DIS); return false; } if (display_wm > display->max_wm) { DRM_DEBUG_KMS("display watermark(%d) is too large(%d), disabling wm%d+\n", display_wm, SNB_DISPLAY_MAX_SRWM, level); return false; } if (cursor_wm > cursor->max_wm) { DRM_DEBUG_KMS("cursor watermark(%d) is too large(%d), disabling wm%d+\n", cursor_wm, SNB_CURSOR_MAX_SRWM, level); return false; } if (!(fbc_wm || display_wm || cursor_wm)) { DRM_DEBUG_KMS("latency %d is 0, disabling wm%d+\n", level, level); return false; } return true; } /* * Compute watermark values of WM[1-3], */ static bool ironlake_compute_srwm(struct drm_device *dev, int level, int plane, int latency_ns, const struct intel_watermark_params *display, const struct intel_watermark_params *cursor, int *fbc_wm, int *display_wm, int *cursor_wm) { struct drm_crtc *crtc; unsigned long line_time_us; int hdisplay, htotal, pixel_size, clock; int line_count, line_size; int small, large; int entries; if (!latency_ns) { *fbc_wm = *display_wm = *cursor_wm = 0; return false; } crtc = intel_get_crtc_for_plane(dev, plane); hdisplay = crtc->mode.hdisplay; htotal = crtc->mode.htotal; clock = crtc->mode.clock; pixel_size = crtc->fb->bits_per_pixel / 8; line_time_us = (htotal * 1000) / clock; line_count = (latency_ns / line_time_us + 1000) / 1000; line_size = hdisplay * pixel_size; /* Use the minimum of the small and large buffer method for primary */ small = ((clock * pixel_size / 1000) * latency_ns) / 1000; large = line_count * line_size; entries = DIV_ROUND_UP(min(small, large), display->cacheline_size); *display_wm = entries + display->guard_size; /* * Spec says: * FBC WM = ((Final Primary WM * 64) / number of bytes per line) + 2 */ *fbc_wm = DIV_ROUND_UP(*display_wm * 64, line_size) + 2; /* calculate the self-refresh watermark for display cursor */ entries = line_count * pixel_size * 64; entries = DIV_ROUND_UP(entries, cursor->cacheline_size); *cursor_wm = entries + cursor->guard_size; return ironlake_check_srwm(dev, level, *fbc_wm, *display_wm, *cursor_wm, display, cursor); } static void ironlake_update_wm(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; int fbc_wm, plane_wm, cursor_wm; unsigned int enabled; enabled = 0; if (g4x_compute_wm0(dev, 0, &ironlake_display_wm_info, ILK_LP0_PLANE_LATENCY, &ironlake_cursor_wm_info, ILK_LP0_CURSOR_LATENCY, &plane_wm, &cursor_wm)) { I915_WRITE(WM0_PIPEA_ILK, (plane_wm << WM0_PIPE_PLANE_SHIFT) | cursor_wm); DRM_DEBUG_KMS("FIFO watermarks For pipe A -" " plane %d, " "cursor: %d\n", plane_wm, cursor_wm); enabled |= 1; } if (g4x_compute_wm0(dev, 1, &ironlake_display_wm_info, ILK_LP0_PLANE_LATENCY, &ironlake_cursor_wm_info, ILK_LP0_CURSOR_LATENCY, &plane_wm, &cursor_wm)) { I915_WRITE(WM0_PIPEB_ILK, (plane_wm << WM0_PIPE_PLANE_SHIFT) | cursor_wm); DRM_DEBUG_KMS("FIFO watermarks For pipe B -" " plane %d, cursor: %d\n", plane_wm, cursor_wm); enabled |= 2; } /* * Calculate and update the self-refresh watermark only when one * display plane is used. */ I915_WRITE(WM3_LP_ILK, 0); I915_WRITE(WM2_LP_ILK, 0); I915_WRITE(WM1_LP_ILK, 0); if (!single_plane_enabled(enabled)) return; enabled = ffs(enabled) - 1; /* WM1 */ if (!ironlake_compute_srwm(dev, 1, enabled, ILK_READ_WM1_LATENCY() * 500, &ironlake_display_srwm_info, &ironlake_cursor_srwm_info, &fbc_wm, &plane_wm, &cursor_wm)) return; I915_WRITE(WM1_LP_ILK, WM1_LP_SR_EN | (ILK_READ_WM1_LATENCY() << WM1_LP_LATENCY_SHIFT) | (fbc_wm << WM1_LP_FBC_SHIFT) | (plane_wm << WM1_LP_SR_SHIFT) | cursor_wm); /* WM2 */ if (!ironlake_compute_srwm(dev, 2, enabled, ILK_READ_WM2_LATENCY() * 500, &ironlake_display_srwm_info, &ironlake_cursor_srwm_info, &fbc_wm, &plane_wm, &cursor_wm)) return; I915_WRITE(WM2_LP_ILK, WM2_LP_EN | (ILK_READ_WM2_LATENCY() << WM1_LP_LATENCY_SHIFT) | (fbc_wm << WM1_LP_FBC_SHIFT) | (plane_wm << WM1_LP_SR_SHIFT) | cursor_wm); /* * WM3 is unsupported on ILK, probably because we don't have latency * data for that power state */ } static void sandybridge_update_wm(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; int latency = SNB_READ_WM0_LATENCY() * 100; /* In unit 0.1us */ u32 val; int fbc_wm, plane_wm, cursor_wm; unsigned int enabled; enabled = 0; if (g4x_compute_wm0(dev, 0, &sandybridge_display_wm_info, latency, &sandybridge_cursor_wm_info, latency, &plane_wm, &cursor_wm)) { val = I915_READ(WM0_PIPEA_ILK); val &= ~(WM0_PIPE_PLANE_MASK | WM0_PIPE_CURSOR_MASK); I915_WRITE(WM0_PIPEA_ILK, val | ((plane_wm << WM0_PIPE_PLANE_SHIFT) | cursor_wm)); DRM_DEBUG_KMS("FIFO watermarks For pipe A -" " plane %d, " "cursor: %d\n", plane_wm, cursor_wm); enabled |= 1; } if (g4x_compute_wm0(dev, 1, &sandybridge_display_wm_info, latency, &sandybridge_cursor_wm_info, latency, &plane_wm, &cursor_wm)) { val = I915_READ(WM0_PIPEB_ILK); val &= ~(WM0_PIPE_PLANE_MASK | WM0_PIPE_CURSOR_MASK); I915_WRITE(WM0_PIPEB_ILK, val | ((plane_wm << WM0_PIPE_PLANE_SHIFT) | cursor_wm)); DRM_DEBUG_KMS("FIFO watermarks For pipe B -" " plane %d, cursor: %d\n", plane_wm, cursor_wm); enabled |= 2; } /* * Calculate and update the self-refresh watermark only when one * display plane is used. * * SNB support 3 levels of watermark. * * WM1/WM2/WM2 watermarks have to be enabled in the ascending order, * and disabled in the descending order * */ I915_WRITE(WM3_LP_ILK, 0); I915_WRITE(WM2_LP_ILK, 0); I915_WRITE(WM1_LP_ILK, 0); if (!single_plane_enabled(enabled) || dev_priv->sprite_scaling_enabled) return; enabled = ffs(enabled) - 1; /* WM1 */ if (!ironlake_compute_srwm(dev, 1, enabled, SNB_READ_WM1_LATENCY() * 500, &sandybridge_display_srwm_info, &sandybridge_cursor_srwm_info, &fbc_wm, &plane_wm, &cursor_wm)) return; I915_WRITE(WM1_LP_ILK, WM1_LP_SR_EN | (SNB_READ_WM1_LATENCY() << WM1_LP_LATENCY_SHIFT) | (fbc_wm << WM1_LP_FBC_SHIFT) | (plane_wm << WM1_LP_SR_SHIFT) | cursor_wm); /* WM2 */ if (!ironlake_compute_srwm(dev, 2, enabled, SNB_READ_WM2_LATENCY() * 500, &sandybridge_display_srwm_info, &sandybridge_cursor_srwm_info, &fbc_wm, &plane_wm, &cursor_wm)) return; I915_WRITE(WM2_LP_ILK, WM2_LP_EN | (SNB_READ_WM2_LATENCY() << WM1_LP_LATENCY_SHIFT) | (fbc_wm << WM1_LP_FBC_SHIFT) | (plane_wm << WM1_LP_SR_SHIFT) | cursor_wm); /* WM3 */ if (!ironlake_compute_srwm(dev, 3, enabled, SNB_READ_WM3_LATENCY() * 500, &sandybridge_display_srwm_info, &sandybridge_cursor_srwm_info, &fbc_wm, &plane_wm, &cursor_wm)) return; I915_WRITE(WM3_LP_ILK, WM3_LP_EN | (SNB_READ_WM3_LATENCY() << WM1_LP_LATENCY_SHIFT) | (fbc_wm << WM1_LP_FBC_SHIFT) | (plane_wm << WM1_LP_SR_SHIFT) | cursor_wm); } static void ivybridge_update_wm(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; int latency = SNB_READ_WM0_LATENCY() * 100; /* In unit 0.1us */ u32 val; int fbc_wm, plane_wm, cursor_wm; int ignore_fbc_wm, ignore_plane_wm, ignore_cursor_wm; unsigned int enabled; enabled = 0; if (g4x_compute_wm0(dev, 0, &sandybridge_display_wm_info, latency, &sandybridge_cursor_wm_info, latency, &plane_wm, &cursor_wm)) { val = I915_READ(WM0_PIPEA_ILK); val &= ~(WM0_PIPE_PLANE_MASK | WM0_PIPE_CURSOR_MASK); I915_WRITE(WM0_PIPEA_ILK, val | ((plane_wm << WM0_PIPE_PLANE_SHIFT) | cursor_wm)); DRM_DEBUG_KMS("FIFO watermarks For pipe A -" " plane %d, " "cursor: %d\n", plane_wm, cursor_wm); enabled |= 1; } if (g4x_compute_wm0(dev, 1, &sandybridge_display_wm_info, latency, &sandybridge_cursor_wm_info, latency, &plane_wm, &cursor_wm)) { val = I915_READ(WM0_PIPEB_ILK); val &= ~(WM0_PIPE_PLANE_MASK | WM0_PIPE_CURSOR_MASK); I915_WRITE(WM0_PIPEB_ILK, val | ((plane_wm << WM0_PIPE_PLANE_SHIFT) | cursor_wm)); DRM_DEBUG_KMS("FIFO watermarks For pipe B -" " plane %d, cursor: %d\n", plane_wm, cursor_wm); enabled |= 2; } if (g4x_compute_wm0(dev, 2, &sandybridge_display_wm_info, latency, &sandybridge_cursor_wm_info, latency, &plane_wm, &cursor_wm)) { val = I915_READ(WM0_PIPEC_IVB); val &= ~(WM0_PIPE_PLANE_MASK | WM0_PIPE_CURSOR_MASK); I915_WRITE(WM0_PIPEC_IVB, val | ((plane_wm << WM0_PIPE_PLANE_SHIFT) | cursor_wm)); DRM_DEBUG_KMS("FIFO watermarks For pipe C -" " plane %d, cursor: %d\n", plane_wm, cursor_wm); enabled |= 3; } /* * Calculate and update the self-refresh watermark only when one * display plane is used. * * SNB support 3 levels of watermark. * * WM1/WM2/WM2 watermarks have to be enabled in the ascending order, * and disabled in the descending order * */ I915_WRITE(WM3_LP_ILK, 0); I915_WRITE(WM2_LP_ILK, 0); I915_WRITE(WM1_LP_ILK, 0); if (!single_plane_enabled(enabled) || dev_priv->sprite_scaling_enabled) return; enabled = ffs(enabled) - 1; /* WM1 */ if (!ironlake_compute_srwm(dev, 1, enabled, SNB_READ_WM1_LATENCY() * 500, &sandybridge_display_srwm_info, &sandybridge_cursor_srwm_info, &fbc_wm, &plane_wm, &cursor_wm)) return; I915_WRITE(WM1_LP_ILK, WM1_LP_SR_EN | (SNB_READ_WM1_LATENCY() << WM1_LP_LATENCY_SHIFT) | (fbc_wm << WM1_LP_FBC_SHIFT) | (plane_wm << WM1_LP_SR_SHIFT) | cursor_wm); /* WM2 */ if (!ironlake_compute_srwm(dev, 2, enabled, SNB_READ_WM2_LATENCY() * 500, &sandybridge_display_srwm_info, &sandybridge_cursor_srwm_info, &fbc_wm, &plane_wm, &cursor_wm)) return; I915_WRITE(WM2_LP_ILK, WM2_LP_EN | (SNB_READ_WM2_LATENCY() << WM1_LP_LATENCY_SHIFT) | (fbc_wm << WM1_LP_FBC_SHIFT) | (plane_wm << WM1_LP_SR_SHIFT) | cursor_wm); /* WM3, note we have to correct the cursor latency */ if (!ironlake_compute_srwm(dev, 3, enabled, SNB_READ_WM3_LATENCY() * 500, &sandybridge_display_srwm_info, &sandybridge_cursor_srwm_info, &fbc_wm, &plane_wm, &ignore_cursor_wm) || !ironlake_compute_srwm(dev, 3, enabled, 2 * SNB_READ_WM3_LATENCY() * 500, &sandybridge_display_srwm_info, &sandybridge_cursor_srwm_info, &ignore_fbc_wm, &ignore_plane_wm, &cursor_wm)) return; I915_WRITE(WM3_LP_ILK, WM3_LP_EN | (SNB_READ_WM3_LATENCY() << WM1_LP_LATENCY_SHIFT) | (fbc_wm << WM1_LP_FBC_SHIFT) | (plane_wm << WM1_LP_SR_SHIFT) | cursor_wm); } static void haswell_update_linetime_wm(struct drm_device *dev, int pipe, struct drm_display_mode *mode) { struct drm_i915_private *dev_priv = dev->dev_private; u32 temp; temp = I915_READ(PIPE_WM_LINETIME(pipe)); temp &= ~PIPE_WM_LINETIME_MASK; /* The WM are computed with base on how long it takes to fill a single * row at the given clock rate, multiplied by 8. * */ temp |= PIPE_WM_LINETIME_TIME( ((mode->crtc_hdisplay * 1000) / mode->clock) * 8); /* IPS watermarks are only used by pipe A, and are ignored by * pipes B and C. They are calculated similarly to the common * linetime values, except that we are using CD clock frequency * in MHz instead of pixel rate for the division. * * This is a placeholder for the IPS watermark calculation code. */ I915_WRITE(PIPE_WM_LINETIME(pipe), temp); } static bool sandybridge_compute_sprite_wm(struct drm_device *dev, int plane, uint32_t sprite_width, int pixel_size, const struct intel_watermark_params *display, int display_latency_ns, int *sprite_wm) { struct drm_crtc *crtc; int clock; int entries, tlb_miss; crtc = intel_get_crtc_for_plane(dev, plane); if (!intel_crtc_active(crtc)) { *sprite_wm = display->guard_size; return false; } clock = crtc->mode.clock; /* Use the small buffer method to calculate the sprite watermark */ entries = ((clock * pixel_size / 1000) * display_latency_ns) / 1000; tlb_miss = display->fifo_size*display->cacheline_size - sprite_width * 8; if (tlb_miss > 0) entries += tlb_miss; entries = DIV_ROUND_UP(entries, display->cacheline_size); *sprite_wm = entries + display->guard_size; if (*sprite_wm > (int)display->max_wm) *sprite_wm = display->max_wm; return true; } static bool sandybridge_compute_sprite_srwm(struct drm_device *dev, int plane, uint32_t sprite_width, int pixel_size, const struct intel_watermark_params *display, int latency_ns, int *sprite_wm) { struct drm_crtc *crtc; unsigned long line_time_us; int clock; int line_count, line_size; int small, large; int entries; if (!latency_ns) { *sprite_wm = 0; return false; } crtc = intel_get_crtc_for_plane(dev, plane); clock = crtc->mode.clock; if (!clock) { *sprite_wm = 0; return false; } line_time_us = (sprite_width * 1000) / clock; if (!line_time_us) { *sprite_wm = 0; return false; } line_count = (latency_ns / line_time_us + 1000) / 1000; line_size = sprite_width * pixel_size; /* Use the minimum of the small and large buffer method for primary */ small = ((clock * pixel_size / 1000) * latency_ns) / 1000; large = line_count * line_size; entries = DIV_ROUND_UP(min(small, large), display->cacheline_size); *sprite_wm = entries + display->guard_size; return *sprite_wm > 0x3ff ? false : true; } static void sandybridge_update_sprite_wm(struct drm_device *dev, int pipe, uint32_t sprite_width, int pixel_size) { struct drm_i915_private *dev_priv = dev->dev_private; int latency = SNB_READ_WM0_LATENCY() * 100; /* In unit 0.1us */ u32 val; int sprite_wm, reg; int ret; switch (pipe) { case 0: reg = WM0_PIPEA_ILK; break; case 1: reg = WM0_PIPEB_ILK; break; case 2: reg = WM0_PIPEC_IVB; break; default: return; /* bad pipe */ } ret = sandybridge_compute_sprite_wm(dev, pipe, sprite_width, pixel_size, &sandybridge_display_wm_info, latency, &sprite_wm); if (!ret) { DRM_DEBUG_KMS("failed to compute sprite wm for pipe %d\n", pipe); return; } val = I915_READ(reg); val &= ~WM0_PIPE_SPRITE_MASK; I915_WRITE(reg, val | (sprite_wm << WM0_PIPE_SPRITE_SHIFT)); DRM_DEBUG_KMS("sprite watermarks For pipe %d - %d\n", pipe, sprite_wm); ret = sandybridge_compute_sprite_srwm(dev, pipe, sprite_width, pixel_size, &sandybridge_display_srwm_info, SNB_READ_WM1_LATENCY() * 500, &sprite_wm); if (!ret) { DRM_DEBUG_KMS("failed to compute sprite lp1 wm on pipe %d\n", pipe); return; } I915_WRITE(WM1S_LP_ILK, sprite_wm); /* Only IVB has two more LP watermarks for sprite */ if (!IS_IVYBRIDGE(dev)) return; ret = sandybridge_compute_sprite_srwm(dev, pipe, sprite_width, pixel_size, &sandybridge_display_srwm_info, SNB_READ_WM2_LATENCY() * 500, &sprite_wm); if (!ret) { DRM_DEBUG_KMS("failed to compute sprite lp2 wm on pipe %d\n", pipe); return; } I915_WRITE(WM2S_LP_IVB, sprite_wm); ret = sandybridge_compute_sprite_srwm(dev, pipe, sprite_width, pixel_size, &sandybridge_display_srwm_info, SNB_READ_WM3_LATENCY() * 500, &sprite_wm); if (!ret) { DRM_DEBUG_KMS("failed to compute sprite lp3 wm on pipe %d\n", pipe); return; } I915_WRITE(WM3S_LP_IVB, sprite_wm); } /** * intel_update_watermarks - update FIFO watermark values based on current modes * * Calculate watermark values for the various WM regs based on current mode * and plane configuration. * * There are several cases to deal with here: * - normal (i.e. non-self-refresh) * - self-refresh (SR) mode * - lines are large relative to FIFO size (buffer can hold up to 2) * - lines are small relative to FIFO size (buffer can hold more than 2 * lines), so need to account for TLB latency * * The normal calculation is: * watermark = dotclock * bytes per pixel * latency * where latency is platform & configuration dependent (we assume pessimal * values here). * * The SR calculation is: * watermark = (trunc(latency/line time)+1) * surface width * * bytes per pixel * where * line time = htotal / dotclock * surface width = hdisplay for normal plane and 64 for cursor * and latency is assumed to be high, as above. * * The final value programmed to the register should always be rounded up, * and include an extra 2 entries to account for clock crossings. * * We don't use the sprite, so we can ignore that. And on Crestline we have * to set the non-SR watermarks to 8. */ void intel_update_watermarks(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; if (dev_priv->display.update_wm) dev_priv->display.update_wm(dev); } void intel_update_linetime_watermarks(struct drm_device *dev, int pipe, struct drm_display_mode *mode) { struct drm_i915_private *dev_priv = dev->dev_private; if (dev_priv->display.update_linetime_wm) dev_priv->display.update_linetime_wm(dev, pipe, mode); } void intel_update_sprite_watermarks(struct drm_device *dev, int pipe, uint32_t sprite_width, int pixel_size) { struct drm_i915_private *dev_priv = dev->dev_private; if (dev_priv->display.update_sprite_wm) dev_priv->display.update_sprite_wm(dev, pipe, sprite_width, pixel_size); } static struct drm_i915_gem_object * intel_alloc_context_page(struct drm_device *dev) { struct drm_i915_gem_object *ctx; int ret; DRM_LOCK_ASSERT(dev); ctx = i915_gem_alloc_object(dev, 4096); if (!ctx) { DRM_DEBUG("failed to alloc power context, RC6 disabled\n"); return NULL; } ret = i915_gem_object_pin(ctx, 4096, true, false); if (ret) { DRM_ERROR("failed to pin power context: %d\n", ret); goto err_unref; } ret = i915_gem_object_set_to_gtt_domain(ctx, 1); if (ret) { DRM_ERROR("failed to set-domain on power context: %d\n", ret); goto err_unpin; } return ctx; err_unpin: i915_gem_object_unpin(ctx); err_unref: drm_gem_object_unreference(&ctx->base); DRM_UNLOCK(dev); return NULL; } /** * Lock protecting IPS related data structures */ struct mtx mchdev_lock; MTX_SYSINIT(mchdev, &mchdev_lock, "mchdev", MTX_DEF); /* Global for IPS driver to get at the current i915 device. Protected by * mchdev_lock. */ static struct drm_i915_private *i915_mch_dev; bool ironlake_set_drps(struct drm_device *dev, u8 val) { struct drm_i915_private *dev_priv = dev->dev_private; u16 rgvswctl; mtx_assert(&mchdev_lock, MA_OWNED); rgvswctl = I915_READ16(MEMSWCTL); if (rgvswctl & MEMCTL_CMD_STS) { DRM_DEBUG("gpu busy, RCS change rejected\n"); return false; /* still busy with another command */ } rgvswctl = (MEMCTL_CMD_CHFREQ << MEMCTL_CMD_SHIFT) | (val << MEMCTL_FREQ_SHIFT) | MEMCTL_SFCAVM; I915_WRITE16(MEMSWCTL, rgvswctl); POSTING_READ16(MEMSWCTL); rgvswctl |= MEMCTL_CMD_STS; I915_WRITE16(MEMSWCTL, rgvswctl); return true; } static void ironlake_enable_drps(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; u32 rgvmodectl = I915_READ(MEMMODECTL); u8 fmax, fmin, fstart, vstart; mtx_lock(&mchdev_lock); /* Enable temp reporting */ I915_WRITE16(PMMISC, I915_READ(PMMISC) | MCPPCE_EN); I915_WRITE16(TSC1, I915_READ(TSC1) | TSE); /* 100ms RC evaluation intervals */ I915_WRITE(RCUPEI, 100000); I915_WRITE(RCDNEI, 100000); /* Set max/min thresholds to 90ms and 80ms respectively */ I915_WRITE(RCBMAXAVG, 90000); I915_WRITE(RCBMINAVG, 80000); I915_WRITE(MEMIHYST, 1); /* Set up min, max, and cur for interrupt handling */ fmax = (rgvmodectl & MEMMODE_FMAX_MASK) >> MEMMODE_FMAX_SHIFT; fmin = (rgvmodectl & MEMMODE_FMIN_MASK); fstart = (rgvmodectl & MEMMODE_FSTART_MASK) >> MEMMODE_FSTART_SHIFT; vstart = (I915_READ(PXVFREQ_BASE + (fstart * 4)) & PXVFREQ_PX_MASK) >> PXVFREQ_PX_SHIFT; dev_priv->ips.fmax = fmax; /* IPS callback will increase this */ dev_priv->ips.fstart = fstart; dev_priv->ips.max_delay = fstart; dev_priv->ips.min_delay = fmin; dev_priv->ips.cur_delay = fstart; DRM_DEBUG_DRIVER("fmax: %d, fmin: %d, fstart: %d\n", fmax, fmin, fstart); I915_WRITE(MEMINTREN, MEMINT_CX_SUPR_EN | MEMINT_EVAL_CHG_EN); /* * Interrupts will be enabled in ironlake_irq_postinstall */ I915_WRITE(VIDSTART, vstart); POSTING_READ(VIDSTART); rgvmodectl |= MEMMODE_SWMODE_EN; I915_WRITE(MEMMODECTL, rgvmodectl); if (wait_for_atomic((I915_READ(MEMSWCTL) & MEMCTL_CMD_STS) == 0, 10)) DRM_ERROR("stuck trying to change perf mode\n"); mdelay(1); ironlake_set_drps(dev, fstart); dev_priv->ips.last_count1 = I915_READ(0x112e4) + I915_READ(0x112e8) + I915_READ(0x112e0); dev_priv->ips.last_time1 = jiffies_to_msecs(jiffies); dev_priv->ips.last_count2 = I915_READ(0x112f4); getrawmonotonic(&dev_priv->ips.last_time2); mtx_unlock(&mchdev_lock); } static void ironlake_disable_drps(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; u16 rgvswctl; mtx_lock(&mchdev_lock); rgvswctl = I915_READ16(MEMSWCTL); /* Ack interrupts, disable EFC interrupt */ I915_WRITE(MEMINTREN, I915_READ(MEMINTREN) & ~MEMINT_EVAL_CHG_EN); I915_WRITE(MEMINTRSTS, MEMINT_EVAL_CHG); I915_WRITE(DEIER, I915_READ(DEIER) & ~DE_PCU_EVENT); I915_WRITE(DEIIR, DE_PCU_EVENT); I915_WRITE(DEIMR, I915_READ(DEIMR) | DE_PCU_EVENT); /* Go back to the starting frequency */ ironlake_set_drps(dev, dev_priv->ips.fstart); mdelay(1); rgvswctl |= MEMCTL_CMD_STS; I915_WRITE(MEMSWCTL, rgvswctl); mdelay(1); mtx_unlock(&mchdev_lock); } /* There's a funny hw issue where the hw returns all 0 when reading from * GEN6_RP_INTERRUPT_LIMITS. Hence we always need to compute the desired value * ourselves, instead of doing a rmw cycle (which might result in us clearing * all limits and the gpu stuck at whatever frequency it is at atm). */ static u32 gen6_rps_limits(struct drm_i915_private *dev_priv, u8 *val) { u32 limits; limits = 0; if (*val >= dev_priv->rps.max_delay) *val = dev_priv->rps.max_delay; limits |= dev_priv->rps.max_delay << 24; /* Only set the down limit when we've reached the lowest level to avoid * getting more interrupts, otherwise leave this clear. This prevents a * race in the hw when coming out of rc6: There's a tiny window where * the hw runs at the minimal clock before selecting the desired * frequency, if the down threshold expires in that window we will not * receive a down interrupt. */ if (*val <= dev_priv->rps.min_delay) { *val = dev_priv->rps.min_delay; limits |= dev_priv->rps.min_delay << 16; } return limits; } void gen6_set_rps(struct drm_device *dev, u8 val) { struct drm_i915_private *dev_priv = dev->dev_private; u32 limits = gen6_rps_limits(dev_priv, &val); sx_assert(&dev_priv->rps.hw_lock, SA_XLOCKED); WARN_ON(val > dev_priv->rps.max_delay); WARN_ON(val < dev_priv->rps.min_delay); if (val == dev_priv->rps.cur_delay) return; I915_WRITE(GEN6_RPNSWREQ, GEN6_FREQUENCY(val) | GEN6_OFFSET(0) | GEN6_AGGRESSIVE_TURBO); /* Make sure we continue to get interrupts * until we hit the minimum or maximum frequencies. */ I915_WRITE(GEN6_RP_INTERRUPT_LIMITS, limits); POSTING_READ(GEN6_RPNSWREQ); dev_priv->rps.cur_delay = val; } static void gen6_disable_rps(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; I915_WRITE(GEN6_RC_CONTROL, 0); I915_WRITE(GEN6_RPNSWREQ, 1 << 31); I915_WRITE(GEN6_PMINTRMSK, 0xffffffff); I915_WRITE(GEN6_PMIER, 0); /* Complete PM interrupt masking here doesn't race with the rps work * item again unmasking PM interrupts because that is using a different * register (PMIMR) to mask PM interrupts. The only risk is in leaving * stale bits in PMIIR and PMIMR which gen6_enable_rps will clean up. */ mtx_lock(&dev_priv->rps.lock); dev_priv->rps.pm_iir = 0; mtx_unlock(&dev_priv->rps.lock); I915_WRITE(GEN6_PMIIR, I915_READ(GEN6_PMIIR)); } int intel_enable_rc6(const struct drm_device *dev) { /* Respect the kernel parameter if it is set */ if (i915_enable_rc6 >= 0) return i915_enable_rc6; /* Disable RC6 on Ironlake */ if (INTEL_INFO(dev)->gen == 5) return 0; if (IS_HASWELL(dev)) { DRM_DEBUG_DRIVER("Haswell: only RC6 available\n"); return INTEL_RC6_ENABLE; } /* snb/ivb have more than one rc6 state. */ if (INTEL_INFO(dev)->gen == 6) { DRM_DEBUG_DRIVER("Sandybridge: deep RC6 disabled\n"); return INTEL_RC6_ENABLE; } DRM_DEBUG_DRIVER("RC6 and deep RC6 enabled\n"); return (INTEL_RC6_ENABLE | INTEL_RC6p_ENABLE); } static void gen6_enable_rps(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; struct intel_ring_buffer *ring; u32 rp_state_cap; u32 gt_perf_status; u32 rc6vids, pcu_mbox, rc6_mask = 0; u32 gtfifodbg; int rc6_mode; int i, ret; sx_assert(&dev_priv->rps.hw_lock, SA_XLOCKED); /* Here begins a magic sequence of register writes to enable * auto-downclocking. * * Perhaps there might be some value in exposing these to * userspace... */ I915_WRITE(GEN6_RC_STATE, 0); /* Clear the DBG now so we don't confuse earlier errors */ if ((gtfifodbg = I915_READ(GTFIFODBG))) { DRM_ERROR("GT fifo had a previous error %x\n", gtfifodbg); I915_WRITE(GTFIFODBG, gtfifodbg); } gen6_gt_force_wake_get(dev_priv); rp_state_cap = I915_READ(GEN6_RP_STATE_CAP); gt_perf_status = I915_READ(GEN6_GT_PERF_STATUS); /* In units of 100MHz */ dev_priv->rps.max_delay = rp_state_cap & 0xff; dev_priv->rps.min_delay = (rp_state_cap & 0xff0000) >> 16; dev_priv->rps.cur_delay = 0; /* disable the counters and set deterministic thresholds */ I915_WRITE(GEN6_RC_CONTROL, 0); I915_WRITE(GEN6_RC1_WAKE_RATE_LIMIT, 1000 << 16); I915_WRITE(GEN6_RC6_WAKE_RATE_LIMIT, 40 << 16 | 30); I915_WRITE(GEN6_RC6pp_WAKE_RATE_LIMIT, 30); I915_WRITE(GEN6_RC_EVALUATION_INTERVAL, 125000); I915_WRITE(GEN6_RC_IDLE_HYSTERSIS, 25); for_each_ring(ring, dev_priv, i) I915_WRITE(RING_MAX_IDLE(ring->mmio_base), 10); I915_WRITE(GEN6_RC_SLEEP, 0); I915_WRITE(GEN6_RC1e_THRESHOLD, 1000); I915_WRITE(GEN6_RC6_THRESHOLD, 50000); I915_WRITE(GEN6_RC6p_THRESHOLD, 150000); I915_WRITE(GEN6_RC6pp_THRESHOLD, 64000); /* unused */ /* Check if we are enabling RC6 */ rc6_mode = intel_enable_rc6(dev_priv->dev); if (rc6_mode & INTEL_RC6_ENABLE) rc6_mask |= GEN6_RC_CTL_RC6_ENABLE; /* We don't use those on Haswell */ if (!IS_HASWELL(dev)) { if (rc6_mode & INTEL_RC6p_ENABLE) rc6_mask |= GEN6_RC_CTL_RC6p_ENABLE; if (rc6_mode & INTEL_RC6pp_ENABLE) rc6_mask |= GEN6_RC_CTL_RC6pp_ENABLE; } DRM_INFO("Enabling RC6 states: RC6 %s, RC6p %s, RC6pp %s\n", (rc6_mask & GEN6_RC_CTL_RC6_ENABLE) ? "on" : "off", (rc6_mask & GEN6_RC_CTL_RC6p_ENABLE) ? "on" : "off", (rc6_mask & GEN6_RC_CTL_RC6pp_ENABLE) ? "on" : "off"); I915_WRITE(GEN6_RC_CONTROL, rc6_mask | GEN6_RC_CTL_EI_MODE(1) | GEN6_RC_CTL_HW_ENABLE); I915_WRITE(GEN6_RPNSWREQ, GEN6_FREQUENCY(10) | GEN6_OFFSET(0) | GEN6_AGGRESSIVE_TURBO); I915_WRITE(GEN6_RC_VIDEO_FREQ, GEN6_FREQUENCY(12)); I915_WRITE(GEN6_RP_DOWN_TIMEOUT, 1000000); I915_WRITE(GEN6_RP_INTERRUPT_LIMITS, dev_priv->rps.max_delay << 24 | dev_priv->rps.min_delay << 16); I915_WRITE(GEN6_RP_UP_THRESHOLD, 59400); I915_WRITE(GEN6_RP_DOWN_THRESHOLD, 245000); I915_WRITE(GEN6_RP_UP_EI, 66000); I915_WRITE(GEN6_RP_DOWN_EI, 350000); I915_WRITE(GEN6_RP_IDLE_HYSTERSIS, 10); I915_WRITE(GEN6_RP_CONTROL, GEN6_RP_MEDIA_TURBO | GEN6_RP_MEDIA_HW_NORMAL_MODE | GEN6_RP_MEDIA_IS_GFX | GEN6_RP_ENABLE | GEN6_RP_UP_BUSY_AVG | (IS_HASWELL(dev) ? GEN7_RP_DOWN_IDLE_AVG : GEN6_RP_DOWN_IDLE_CONT)); ret = sandybridge_pcode_write(dev_priv, GEN6_PCODE_WRITE_MIN_FREQ_TABLE, 0); if (!ret) { pcu_mbox = 0; ret = sandybridge_pcode_read(dev_priv, GEN6_READ_OC_PARAMS, &pcu_mbox); if (ret && pcu_mbox & (1<<31)) { /* OC supported */ dev_priv->rps.max_delay = pcu_mbox & 0xff; DRM_DEBUG_DRIVER("overclocking supported, adjusting frequency max to %dMHz\n", pcu_mbox * 50); } } else { DRM_DEBUG_DRIVER("Failed to set the min frequency\n"); } gen6_set_rps(dev_priv->dev, (gt_perf_status & 0xff00) >> 8); /* requires MSI enabled */ I915_WRITE(GEN6_PMIER, GEN6_PM_DEFERRED_EVENTS); mtx_lock(&dev_priv->rps.lock); WARN_ON(dev_priv->rps.pm_iir != 0); I915_WRITE(GEN6_PMIMR, 0); mtx_unlock(&dev_priv->rps.lock); /* enable all PM interrupts */ I915_WRITE(GEN6_PMINTRMSK, 0); rc6vids = 0; ret = sandybridge_pcode_read(dev_priv, GEN6_PCODE_READ_RC6VIDS, &rc6vids); if (IS_GEN6(dev) && ret) { DRM_DEBUG_DRIVER("Couldn't check for BIOS workaround\n"); } else if (IS_GEN6(dev) && (GEN6_DECODE_RC6_VID(rc6vids & 0xff) < 450)) { DRM_DEBUG_DRIVER("You should update your BIOS. Correcting minimum rc6 voltage (%dmV->%dmV)\n", GEN6_DECODE_RC6_VID(rc6vids & 0xff), 450); rc6vids &= 0xffff00; rc6vids |= GEN6_ENCODE_RC6_VID(450); ret = sandybridge_pcode_write(dev_priv, GEN6_PCODE_WRITE_RC6VIDS, rc6vids); if (ret) DRM_ERROR("Couldn't fix incorrect rc6 voltage\n"); } gen6_gt_force_wake_put(dev_priv); } static void gen6_update_ring_freq(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; int min_freq = 15; int gpu_freq; unsigned int ia_freq, max_ia_freq; int scaling_factor = 180; sx_assert(&dev_priv->rps.hw_lock, SA_XLOCKED); #ifdef FREEBSD_WIP max_ia_freq = cpufreq_quick_get_max(0); /* * Default to measured freq if none found, PCU will ensure we don't go * over */ if (!max_ia_freq) max_ia_freq = tsc_khz; #else uint64_t freq; freq = atomic_load_acq_64(&tsc_freq); max_ia_freq = freq / 1000; #endif /* FREEBSD_WIP */ /* Convert from kHz to MHz */ max_ia_freq /= 1000; /* * For each potential GPU frequency, load a ring frequency we'd like * to use for memory access. We do this by specifying the IA frequency * the PCU should use as a reference to determine the ring frequency. */ for (gpu_freq = dev_priv->rps.max_delay; gpu_freq >= dev_priv->rps.min_delay; gpu_freq--) { int diff = dev_priv->rps.max_delay - gpu_freq; /* * For GPU frequencies less than 750MHz, just use the lowest * ring freq. */ if (gpu_freq < min_freq) ia_freq = 800; else ia_freq = max_ia_freq - ((diff * scaling_factor) / 2); ia_freq = DIV_ROUND_CLOSEST(ia_freq, 100); ia_freq <<= GEN6_PCODE_FREQ_IA_RATIO_SHIFT; sandybridge_pcode_write(dev_priv, GEN6_PCODE_WRITE_MIN_FREQ_TABLE, ia_freq | gpu_freq); } } void ironlake_teardown_rc6(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; if (dev_priv->ips.renderctx) { i915_gem_object_unpin(dev_priv->ips.renderctx); drm_gem_object_unreference(&dev_priv->ips.renderctx->base); dev_priv->ips.renderctx = NULL; } if (dev_priv->ips.pwrctx) { i915_gem_object_unpin(dev_priv->ips.pwrctx); drm_gem_object_unreference(&dev_priv->ips.pwrctx->base); dev_priv->ips.pwrctx = NULL; } } static void ironlake_disable_rc6(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; if (I915_READ(PWRCTXA)) { /* Wake the GPU, prevent RC6, then restore RSTDBYCTL */ I915_WRITE(RSTDBYCTL, I915_READ(RSTDBYCTL) | RCX_SW_EXIT); wait_for(((I915_READ(RSTDBYCTL) & RSX_STATUS_MASK) == RSX_STATUS_ON), 50); I915_WRITE(PWRCTXA, 0); POSTING_READ(PWRCTXA); I915_WRITE(RSTDBYCTL, I915_READ(RSTDBYCTL) & ~RCX_SW_EXIT); POSTING_READ(RSTDBYCTL); } } static int ironlake_setup_rc6(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; if (dev_priv->ips.renderctx == NULL) dev_priv->ips.renderctx = intel_alloc_context_page(dev); if (!dev_priv->ips.renderctx) return -ENOMEM; if (dev_priv->ips.pwrctx == NULL) dev_priv->ips.pwrctx = intel_alloc_context_page(dev); if (!dev_priv->ips.pwrctx) { ironlake_teardown_rc6(dev); return -ENOMEM; } return 0; } static void ironlake_enable_rc6(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; struct intel_ring_buffer *ring = &dev_priv->ring[RCS]; bool was_interruptible; int ret; /* rc6 disabled by default due to repeated reports of hanging during * boot and resume. */ if (!intel_enable_rc6(dev)) return; DRM_LOCK_ASSERT(dev); ret = ironlake_setup_rc6(dev); if (ret) return; was_interruptible = dev_priv->mm.interruptible; dev_priv->mm.interruptible = false; /* * GPU can automatically power down the render unit if given a page * to save state. */ ret = intel_ring_begin(ring, 6); if (ret) { ironlake_teardown_rc6(dev); dev_priv->mm.interruptible = was_interruptible; return; } intel_ring_emit(ring, MI_SUSPEND_FLUSH | MI_SUSPEND_FLUSH_EN); intel_ring_emit(ring, MI_SET_CONTEXT); intel_ring_emit(ring, dev_priv->ips.renderctx->gtt_offset | MI_MM_SPACE_GTT | MI_SAVE_EXT_STATE_EN | MI_RESTORE_EXT_STATE_EN | MI_RESTORE_INHIBIT); intel_ring_emit(ring, MI_SUSPEND_FLUSH); intel_ring_emit(ring, MI_NOOP); intel_ring_emit(ring, MI_FLUSH); intel_ring_advance(ring); /* * Wait for the command parser to advance past MI_SET_CONTEXT. The HW * does an implicit flush, combined with MI_FLUSH above, it should be * safe to assume that renderctx is valid */ ret = intel_ring_idle(ring); dev_priv->mm.interruptible = was_interruptible; if (ret) { DRM_ERROR("failed to enable ironlake power power savings\n"); ironlake_teardown_rc6(dev); return; } I915_WRITE(PWRCTXA, dev_priv->ips.pwrctx->gtt_offset | PWRCTX_EN); I915_WRITE(RSTDBYCTL, I915_READ(RSTDBYCTL) & ~RCX_SW_EXIT); } static unsigned long intel_pxfreq(u32 vidfreq) { unsigned long freq; int div = (vidfreq & 0x3f0000) >> 16; int post = (vidfreq & 0x3000) >> 12; int pre = (vidfreq & 0x7); if (!pre) return 0; freq = ((div * 133333) / ((1<ips.last_time1; /* Prevent division-by-zero if we are asking too fast. * Also, we don't get interesting results if we are polling * faster than once in 10ms, so just return the saved value * in such cases. */ if (diff1 <= 10) return dev_priv->ips.chipset_power; count1 = I915_READ(DMIEC); count2 = I915_READ(DDREC); count3 = I915_READ(CSIEC); total_count = count1 + count2 + count3; /* FIXME: handle per-counter overflow */ if (total_count < dev_priv->ips.last_count1) { diff = ~0UL - dev_priv->ips.last_count1; diff += total_count; } else { diff = total_count - dev_priv->ips.last_count1; } for (i = 0; i < ARRAY_SIZE(cparams); i++) { if (cparams[i].i == dev_priv->ips.c_m && cparams[i].t == dev_priv->ips.r_t) { m = cparams[i].m; c = cparams[i].c; break; } } diff = div_u64(diff, diff1); ret = ((m * diff) + c); ret = div_u64(ret, 10); dev_priv->ips.last_count1 = total_count; dev_priv->ips.last_time1 = now; dev_priv->ips.chipset_power = ret; return ret; } unsigned long i915_chipset_val(struct drm_i915_private *dev_priv) { unsigned long val; if (dev_priv->info->gen != 5) return 0; mtx_lock(&mchdev_lock); val = __i915_chipset_val(dev_priv); mtx_unlock(&mchdev_lock); return val; } unsigned long i915_mch_val(struct drm_i915_private *dev_priv) { unsigned long m, x, b; u32 tsfs; tsfs = I915_READ(TSFS); m = ((tsfs & TSFS_SLOPE_MASK) >> TSFS_SLOPE_SHIFT); x = I915_READ8(I915_TR1); b = tsfs & TSFS_INTR_MASK; return ((m * x) / 127) - b; } static u16 pvid_to_extvid(struct drm_i915_private *dev_priv, u8 pxvid) { static const struct v_table { u16 vd; /* in .1 mil */ u16 vm; /* in .1 mil */ } v_table[] = { { 0, 0, }, { 375, 0, }, { 500, 0, }, { 625, 0, }, { 750, 0, }, { 875, 0, }, { 1000, 0, }, { 1125, 0, }, { 4125, 3000, }, { 4125, 3000, }, { 4125, 3000, }, { 4125, 3000, }, { 4125, 3000, }, { 4125, 3000, }, { 4125, 3000, }, { 4125, 3000, }, { 4125, 3000, }, { 4125, 3000, }, { 4125, 3000, }, { 4125, 3000, }, { 4125, 3000, }, { 4125, 3000, }, { 4125, 3000, }, { 4125, 3000, }, { 4125, 3000, }, { 4125, 3000, }, { 4125, 3000, }, { 4125, 3000, }, { 4125, 3000, }, { 4125, 3000, }, { 4125, 3000, }, { 4125, 3000, }, { 4250, 3125, }, { 4375, 3250, }, { 4500, 3375, }, { 4625, 3500, }, { 4750, 3625, }, { 4875, 3750, }, { 5000, 3875, }, { 5125, 4000, }, { 5250, 4125, }, { 5375, 4250, }, { 5500, 4375, }, { 5625, 4500, }, { 5750, 4625, }, { 5875, 4750, }, { 6000, 4875, }, { 6125, 5000, }, { 6250, 5125, }, { 6375, 5250, }, { 6500, 5375, }, { 6625, 5500, }, { 6750, 5625, }, { 6875, 5750, }, { 7000, 5875, }, { 7125, 6000, }, { 7250, 6125, }, { 7375, 6250, }, { 7500, 6375, }, { 7625, 6500, }, { 7750, 6625, }, { 7875, 6750, }, { 8000, 6875, }, { 8125, 7000, }, { 8250, 7125, }, { 8375, 7250, }, { 8500, 7375, }, { 8625, 7500, }, { 8750, 7625, }, { 8875, 7750, }, { 9000, 7875, }, { 9125, 8000, }, { 9250, 8125, }, { 9375, 8250, }, { 9500, 8375, }, { 9625, 8500, }, { 9750, 8625, }, { 9875, 8750, }, { 10000, 8875, }, { 10125, 9000, }, { 10250, 9125, }, { 10375, 9250, }, { 10500, 9375, }, { 10625, 9500, }, { 10750, 9625, }, { 10875, 9750, }, { 11000, 9875, }, { 11125, 10000, }, { 11250, 10125, }, { 11375, 10250, }, { 11500, 10375, }, { 11625, 10500, }, { 11750, 10625, }, { 11875, 10750, }, { 12000, 10875, }, { 12125, 11000, }, { 12250, 11125, }, { 12375, 11250, }, { 12500, 11375, }, { 12625, 11500, }, { 12750, 11625, }, { 12875, 11750, }, { 13000, 11875, }, { 13125, 12000, }, { 13250, 12125, }, { 13375, 12250, }, { 13500, 12375, }, { 13625, 12500, }, { 13750, 12625, }, { 13875, 12750, }, { 14000, 12875, }, { 14125, 13000, }, { 14250, 13125, }, { 14375, 13250, }, { 14500, 13375, }, { 14625, 13500, }, { 14750, 13625, }, { 14875, 13750, }, { 15000, 13875, }, { 15125, 14000, }, { 15250, 14125, }, { 15375, 14250, }, { 15500, 14375, }, { 15625, 14500, }, { 15750, 14625, }, { 15875, 14750, }, { 16000, 14875, }, { 16125, 15000, }, }; if (dev_priv->info->is_mobile) return v_table[pxvid].vm; else return v_table[pxvid].vd; } static void __i915_update_gfx_val(struct drm_i915_private *dev_priv) { struct timespec now, diff1; u64 diff; unsigned long diffms; u32 count; mtx_assert(&mchdev_lock, MA_OWNED); nanotime(&now); - diff1 = now; - timespecsub(&diff1, &dev_priv->ips.last_time2); + timespecsub(&now, &dev_priv->ips.last_time2, &diff1); /* Don't divide by 0 */ diffms = diff1.tv_sec * 1000 + diff1.tv_nsec / 1000000; if (!diffms) return; count = I915_READ(GFXEC); if (count < dev_priv->ips.last_count2) { diff = ~0UL - dev_priv->ips.last_count2; diff += count; } else { diff = count - dev_priv->ips.last_count2; } dev_priv->ips.last_count2 = count; dev_priv->ips.last_time2 = now; /* More magic constants... */ diff = diff * 1181; diff = div_u64(diff, diffms * 10); dev_priv->ips.gfx_power = diff; } void i915_update_gfx_val(struct drm_i915_private *dev_priv) { if (dev_priv->info->gen != 5) return; mtx_lock(&mchdev_lock); __i915_update_gfx_val(dev_priv); mtx_unlock(&mchdev_lock); } static unsigned long __i915_gfx_val(struct drm_i915_private *dev_priv) { unsigned long t, corr, state1, corr2, state2; u32 pxvid, ext_v; mtx_assert(&mchdev_lock, MA_OWNED); pxvid = I915_READ(PXVFREQ_BASE + (dev_priv->rps.cur_delay * 4)); pxvid = (pxvid >> 24) & 0x7f; ext_v = pvid_to_extvid(dev_priv, pxvid); state1 = ext_v; t = i915_mch_val(dev_priv); /* Revel in the empirically derived constants */ /* Correction factor in 1/100000 units */ if (t > 80) corr = ((t * 2349) + 135940); else if (t >= 50) corr = ((t * 964) + 29317); else /* < 50 */ corr = ((t * 301) + 1004); corr = corr * ((150142 * state1) / 10000 - 78642); corr /= 100000; corr2 = (corr * dev_priv->ips.corr); state2 = (corr2 * state1) / 10000; state2 /= 100; /* convert to mW */ __i915_update_gfx_val(dev_priv); return dev_priv->ips.gfx_power + state2; } unsigned long i915_gfx_val(struct drm_i915_private *dev_priv) { unsigned long val; if (dev_priv->info->gen != 5) return 0; mtx_lock(&mchdev_lock); val = __i915_gfx_val(dev_priv); mtx_unlock(&mchdev_lock); return val; } /** * i915_read_mch_val - return value for IPS use * * Calculate and return a value for the IPS driver to use when deciding whether * we have thermal and power headroom to increase CPU or GPU power budget. */ unsigned long i915_read_mch_val(void) { struct drm_i915_private *dev_priv; unsigned long chipset_val, graphics_val, ret = 0; mtx_lock(&mchdev_lock); if (!i915_mch_dev) goto out_unlock; dev_priv = i915_mch_dev; chipset_val = __i915_chipset_val(dev_priv); graphics_val = __i915_gfx_val(dev_priv); ret = chipset_val + graphics_val; out_unlock: mtx_unlock(&mchdev_lock); return ret; } EXPORT_SYMBOL_GPL(i915_read_mch_val); /** * i915_gpu_raise - raise GPU frequency limit * * Raise the limit; IPS indicates we have thermal headroom. */ bool i915_gpu_raise(void) { struct drm_i915_private *dev_priv; bool ret = true; mtx_lock(&mchdev_lock); if (!i915_mch_dev) { ret = false; goto out_unlock; } dev_priv = i915_mch_dev; if (dev_priv->ips.max_delay > dev_priv->ips.fmax) dev_priv->ips.max_delay--; out_unlock: mtx_unlock(&mchdev_lock); return ret; } EXPORT_SYMBOL_GPL(i915_gpu_raise); /** * i915_gpu_lower - lower GPU frequency limit * * IPS indicates we're close to a thermal limit, so throttle back the GPU * frequency maximum. */ bool i915_gpu_lower(void) { struct drm_i915_private *dev_priv; bool ret = true; mtx_lock(&mchdev_lock); if (!i915_mch_dev) { ret = false; goto out_unlock; } dev_priv = i915_mch_dev; if (dev_priv->ips.max_delay < dev_priv->ips.min_delay) dev_priv->ips.max_delay++; out_unlock: mtx_unlock(&mchdev_lock); return ret; } EXPORT_SYMBOL_GPL(i915_gpu_lower); /** * i915_gpu_busy - indicate GPU business to IPS * * Tell the IPS driver whether or not the GPU is busy. */ bool i915_gpu_busy(void) { struct drm_i915_private *dev_priv; struct intel_ring_buffer *ring; bool ret = false; int i; mtx_lock(&mchdev_lock); if (!i915_mch_dev) goto out_unlock; dev_priv = i915_mch_dev; for_each_ring(ring, dev_priv, i) ret |= !list_empty(&ring->request_list); out_unlock: mtx_unlock(&mchdev_lock); return ret; } EXPORT_SYMBOL_GPL(i915_gpu_busy); /** * i915_gpu_turbo_disable - disable graphics turbo * * Disable graphics turbo by resetting the max frequency and setting the * current frequency to the default. */ bool i915_gpu_turbo_disable(void) { struct drm_i915_private *dev_priv; bool ret = true; mtx_lock(&mchdev_lock); if (!i915_mch_dev) { ret = false; goto out_unlock; } dev_priv = i915_mch_dev; dev_priv->ips.max_delay = dev_priv->ips.fstart; if (!ironlake_set_drps(dev_priv->dev, dev_priv->ips.fstart)) ret = false; out_unlock: mtx_unlock(&mchdev_lock); return ret; } EXPORT_SYMBOL_GPL(i915_gpu_turbo_disable); #ifdef FREEBSD_WIP /** * Tells the intel_ips driver that the i915 driver is now loaded, if * IPS got loaded first. * * This awkward dance is so that neither module has to depend on the * other in order for IPS to do the appropriate communication of * GPU turbo limits to i915. */ static void ips_ping_for_i915_load(void) { void (*link)(void); link = symbol_get(ips_link_to_i915_driver); if (link) { link(); symbol_put(ips_link_to_i915_driver); } } #endif /* FREEBSD_WIP */ void intel_gpu_ips_init(struct drm_i915_private *dev_priv) { /* We only register the i915 ips part with intel-ips once everything is * set up, to avoid intel-ips sneaking in and reading bogus values. */ mtx_lock(&mchdev_lock); i915_mch_dev = dev_priv; mtx_unlock(&mchdev_lock); #ifdef FREEBSD_WIP ips_ping_for_i915_load(); #endif /* FREEBSD_WIP */ } void intel_gpu_ips_teardown(void) { mtx_lock(&mchdev_lock); i915_mch_dev = NULL; mtx_unlock(&mchdev_lock); } static void intel_init_emon(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; u32 lcfuse; u8 pxw[16]; int i; /* Disable to program */ I915_WRITE(ECR, 0); POSTING_READ(ECR); /* Program energy weights for various events */ I915_WRITE(SDEW, 0x15040d00); I915_WRITE(CSIEW0, 0x007f0000); I915_WRITE(CSIEW1, 0x1e220004); I915_WRITE(CSIEW2, 0x04000004); for (i = 0; i < 5; i++) I915_WRITE(PEW + (i * 4), 0); for (i = 0; i < 3; i++) I915_WRITE(DEW + (i * 4), 0); /* Program P-state weights to account for frequency power adjustment */ for (i = 0; i < 16; i++) { u32 pxvidfreq = I915_READ(PXVFREQ_BASE + (i * 4)); unsigned long freq = intel_pxfreq(pxvidfreq); unsigned long vid = (pxvidfreq & PXVFREQ_PX_MASK) >> PXVFREQ_PX_SHIFT; unsigned long val; val = vid * vid; val *= (freq / 1000); val *= 255; val /= (127*127*900); if (val > 0xff) DRM_ERROR("bad pxval: %ld\n", val); pxw[i] = val; } /* Render standby states get 0 weight */ pxw[14] = 0; pxw[15] = 0; for (i = 0; i < 4; i++) { u32 val = (pxw[i*4] << 24) | (pxw[(i*4)+1] << 16) | (pxw[(i*4)+2] << 8) | (pxw[(i*4)+3]); I915_WRITE(PXW + (i * 4), val); } /* Adjust magic regs to magic values (more experimental results) */ I915_WRITE(OGW0, 0); I915_WRITE(OGW1, 0); I915_WRITE(EG0, 0x00007f00); I915_WRITE(EG1, 0x0000000e); I915_WRITE(EG2, 0x000e0000); I915_WRITE(EG3, 0x68000300); I915_WRITE(EG4, 0x42000000); I915_WRITE(EG5, 0x00140031); I915_WRITE(EG6, 0); I915_WRITE(EG7, 0); for (i = 0; i < 8; i++) I915_WRITE(PXWL + (i * 4), 0); /* Enable PMON + select events */ I915_WRITE(ECR, 0x80000019); lcfuse = I915_READ(LCFUSE02); dev_priv->ips.corr = (lcfuse & LCFUSE_HIV_MASK); } void intel_disable_gt_powersave(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; if (IS_IRONLAKE_M(dev)) { ironlake_disable_drps(dev); ironlake_disable_rc6(dev); } else if (INTEL_INFO(dev)->gen >= 6 && !IS_VALLEYVIEW(dev)) { taskqueue_cancel_timeout(dev_priv->wq, &dev_priv->rps.delayed_resume_work, NULL); sx_xlock(&dev_priv->rps.hw_lock); gen6_disable_rps(dev); sx_xunlock(&dev_priv->rps.hw_lock); } } static void intel_gen6_powersave_work(void *arg, int pending) { struct drm_i915_private *dev_priv = arg; struct drm_device *dev = dev_priv->dev; sx_xlock(&dev_priv->rps.hw_lock); gen6_enable_rps(dev); gen6_update_ring_freq(dev); sx_xunlock(&dev_priv->rps.hw_lock); } void intel_enable_gt_powersave(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; if (IS_IRONLAKE_M(dev)) { ironlake_enable_drps(dev); ironlake_enable_rc6(dev); intel_init_emon(dev); } else if ((IS_GEN6(dev) || IS_GEN7(dev)) && !IS_VALLEYVIEW(dev)) { /* * PCU communication is slow and this doesn't need to be * done at any specific time, so do this out of our fast path * to make resume and init faster. */ taskqueue_enqueue_timeout(dev_priv->wq, &dev_priv->rps.delayed_resume_work, round_jiffies_up_relative(HZ)); } } static void ibx_init_clock_gating(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; /* * On Ibex Peak and Cougar Point, we need to disable clock * gating for the panel power sequencer or it will fail to * start up when no ports are active. */ I915_WRITE(SOUTH_DSPCLK_GATE_D, PCH_DPLSUNIT_CLOCK_GATE_DISABLE); } static void ironlake_init_clock_gating(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; uint32_t dspclk_gate = ILK_VRHUNIT_CLOCK_GATE_DISABLE; /* Required for FBC */ dspclk_gate |= ILK_DPFCRUNIT_CLOCK_GATE_DISABLE | ILK_DPFCUNIT_CLOCK_GATE_DISABLE | ILK_DPFDUNIT_CLOCK_GATE_ENABLE; I915_WRITE(PCH_3DCGDIS0, MARIUNIT_CLOCK_GATE_DISABLE | SVSMUNIT_CLOCK_GATE_DISABLE); I915_WRITE(PCH_3DCGDIS1, VFMUNIT_CLOCK_GATE_DISABLE); /* * According to the spec the following bits should be set in * order to enable memory self-refresh * The bit 22/21 of 0x42004 * The bit 5 of 0x42020 * The bit 15 of 0x45000 */ I915_WRITE(ILK_DISPLAY_CHICKEN2, (I915_READ(ILK_DISPLAY_CHICKEN2) | ILK_DPARB_GATE | ILK_VSDPFD_FULL)); dspclk_gate |= ILK_DPARBUNIT_CLOCK_GATE_ENABLE; I915_WRITE(DISP_ARB_CTL, (I915_READ(DISP_ARB_CTL) | DISP_FBC_WM_DIS)); I915_WRITE(WM3_LP_ILK, 0); I915_WRITE(WM2_LP_ILK, 0); I915_WRITE(WM1_LP_ILK, 0); /* * Based on the document from hardware guys the following bits * should be set unconditionally in order to enable FBC. * The bit 22 of 0x42000 * The bit 22 of 0x42004 * The bit 7,8,9 of 0x42020. */ if (IS_IRONLAKE_M(dev)) { I915_WRITE(ILK_DISPLAY_CHICKEN1, I915_READ(ILK_DISPLAY_CHICKEN1) | ILK_FBCQ_DIS); I915_WRITE(ILK_DISPLAY_CHICKEN2, I915_READ(ILK_DISPLAY_CHICKEN2) | ILK_DPARB_GATE); } I915_WRITE(ILK_DSPCLK_GATE_D, dspclk_gate); I915_WRITE(ILK_DISPLAY_CHICKEN2, I915_READ(ILK_DISPLAY_CHICKEN2) | ILK_ELPIN_409_SELECT); I915_WRITE(_3D_CHICKEN2, _3D_CHICKEN2_WM_READ_PIPELINED << 16 | _3D_CHICKEN2_WM_READ_PIPELINED); /* WaDisableRenderCachePipelinedFlush */ I915_WRITE(CACHE_MODE_0, _MASKED_BIT_ENABLE(CM0_PIPELINED_RENDER_FLUSH_DISABLE)); ibx_init_clock_gating(dev); } static void cpt_init_clock_gating(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; int pipe; uint32_t val; /* * On Ibex Peak and Cougar Point, we need to disable clock * gating for the panel power sequencer or it will fail to * start up when no ports are active. */ I915_WRITE(SOUTH_DSPCLK_GATE_D, PCH_DPLSUNIT_CLOCK_GATE_DISABLE); I915_WRITE(SOUTH_CHICKEN2, I915_READ(SOUTH_CHICKEN2) | DPLS_EDP_PPS_FIX_DIS); /* The below fixes the weird display corruption, a few pixels shifted * downward, on (only) LVDS of some HP laptops with IVY. */ for_each_pipe(pipe) { val = TRANS_CHICKEN2_TIMING_OVERRIDE; if (dev_priv->fdi_rx_polarity_inverted) val |= TRANS_CHICKEN2_FDI_POLARITY_REVERSED; I915_WRITE(TRANS_CHICKEN2(pipe), val); } /* WADP0ClockGatingDisable */ for_each_pipe(pipe) { I915_WRITE(TRANS_CHICKEN1(pipe), TRANS_CHICKEN1_DP0UNIT_GC_DISABLE); } } static void gen6_init_clock_gating(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; int pipe; uint32_t dspclk_gate = ILK_VRHUNIT_CLOCK_GATE_DISABLE; I915_WRITE(ILK_DSPCLK_GATE_D, dspclk_gate); I915_WRITE(ILK_DISPLAY_CHICKEN2, I915_READ(ILK_DISPLAY_CHICKEN2) | ILK_ELPIN_409_SELECT); /* WaDisableHiZPlanesWhenMSAAEnabled */ I915_WRITE(_3D_CHICKEN, _MASKED_BIT_ENABLE(_3D_CHICKEN_HIZ_PLANE_DISABLE_MSAA_4X_SNB)); /* WaSetupGtModeTdRowDispatch */ if (IS_SNB_GT1(dev)) I915_WRITE(GEN6_GT_MODE, _MASKED_BIT_ENABLE(GEN6_TD_FOUR_ROW_DISPATCH_DISABLE)); I915_WRITE(WM3_LP_ILK, 0); I915_WRITE(WM2_LP_ILK, 0); I915_WRITE(WM1_LP_ILK, 0); I915_WRITE(CACHE_MODE_0, _MASKED_BIT_DISABLE(CM0_STC_EVICT_DISABLE_LRA_SNB)); I915_WRITE(GEN6_UCGCTL1, I915_READ(GEN6_UCGCTL1) | GEN6_BLBUNIT_CLOCK_GATE_DISABLE | GEN6_CSUNIT_CLOCK_GATE_DISABLE); /* According to the BSpec vol1g, bit 12 (RCPBUNIT) clock * gating disable must be set. Failure to set it results in * flickering pixels due to Z write ordering failures after * some amount of runtime in the Mesa "fire" demo, and Unigine * Sanctuary and Tropics, and apparently anything else with * alpha test or pixel discard. * * According to the spec, bit 11 (RCCUNIT) must also be set, * but we didn't debug actual testcases to find it out. * * Also apply WaDisableVDSUnitClockGating and * WaDisableRCPBUnitClockGating. */ I915_WRITE(GEN6_UCGCTL2, GEN7_VDSUNIT_CLOCK_GATE_DISABLE | GEN6_RCPBUNIT_CLOCK_GATE_DISABLE | GEN6_RCCUNIT_CLOCK_GATE_DISABLE); /* Bspec says we need to always set all mask bits. */ I915_WRITE(_3D_CHICKEN3, (0xFFFF << 16) | _3D_CHICKEN3_SF_DISABLE_FASTCLIP_CULL); /* * According to the spec the following bits should be * set in order to enable memory self-refresh and fbc: * The bit21 and bit22 of 0x42000 * The bit21 and bit22 of 0x42004 * The bit5 and bit7 of 0x42020 * The bit14 of 0x70180 * The bit14 of 0x71180 */ I915_WRITE(ILK_DISPLAY_CHICKEN1, I915_READ(ILK_DISPLAY_CHICKEN1) | ILK_FBCQ_DIS | ILK_PABSTRETCH_DIS); I915_WRITE(ILK_DISPLAY_CHICKEN2, I915_READ(ILK_DISPLAY_CHICKEN2) | ILK_DPARB_GATE | ILK_VSDPFD_FULL); I915_WRITE(ILK_DSPCLK_GATE_D, I915_READ(ILK_DSPCLK_GATE_D) | ILK_DPARBUNIT_CLOCK_GATE_ENABLE | ILK_DPFDUNIT_CLOCK_GATE_ENABLE); #ifdef FREEBSD_WIP /* NOTE Linux<->FreeBSD: Disable GEN6_MBCTL write. * * This arrived in Linux 3.6 in commit * b4ae3f22d238617ca11610b29fde16cf8c0bc6e0 and causes significantly * increased power consumption after kldloading i915kms.ko on FreeBSD * on (some) Sandy Bridge laptops. A Thinkpad X220 reported about 11W * after booting while idle at the vt(4) console and about double that * after loading the driver. * * There were reports in Linux of increased consumption after a suspend * and resume cycle due to that change. * * Linux bug reports: * https://bugs.freedesktop.org/show_bug.cgi?id=54089 * https://bugzilla.kernel.org/show_bug.cgi?id=58971 * * This suspend and resume issue is reportedly fixed in Linux with * commits 7dcd2677ea912573d9ed4bcd629b0023b2d11505 and * 7dcd2677ea912573d9ed4bcd629b0023b2d11505 (Linux 3.11). However, I * found that those changes did not help on FreeBSD, where increased * power consumption is observed after loading i915kms.ko without * suspending and resuming. * * This workaround should be removed after updating to a future Linux * i915 version and verifying normal power consumption on Sandy Bridge. */ /* WaMbcDriverBootEnable */ I915_WRITE(GEN6_MBCTL, I915_READ(GEN6_MBCTL) | GEN6_MBCTL_ENABLE_BOOT_FETCH); #endif /* FREEBSD_WIP */ for_each_pipe(pipe) { I915_WRITE(DSPCNTR(pipe), I915_READ(DSPCNTR(pipe)) | DISPPLANE_TRICKLE_FEED_DISABLE); intel_flush_display_plane(dev_priv, pipe); } /* The default value should be 0x200 according to docs, but the two * platforms I checked have a 0 for this. (Maybe BIOS overrides?) */ I915_WRITE(GEN6_GT_MODE, _MASKED_BIT_DISABLE(0xffff)); I915_WRITE(GEN6_GT_MODE, _MASKED_BIT_ENABLE(GEN6_GT_MODE_HI)); cpt_init_clock_gating(dev); } static void gen7_setup_fixed_func_scheduler(struct drm_i915_private *dev_priv) { uint32_t reg = I915_READ(GEN7_FF_THREAD_MODE); reg &= ~GEN7_FF_SCHED_MASK; reg |= GEN7_FF_TS_SCHED_HW; reg |= GEN7_FF_VS_SCHED_HW; reg |= GEN7_FF_DS_SCHED_HW; I915_WRITE(GEN7_FF_THREAD_MODE, reg); } static void lpt_init_clock_gating(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; /* * TODO: this bit should only be enabled when really needed, then * disabled when not needed anymore in order to save power. */ if (dev_priv->pch_id == INTEL_PCH_LPT_LP_DEVICE_ID_TYPE) I915_WRITE(SOUTH_DSPCLK_GATE_D, I915_READ(SOUTH_DSPCLK_GATE_D) | PCH_LP_PARTITION_LEVEL_DISABLE); } static void haswell_init_clock_gating(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; int pipe; I915_WRITE(WM3_LP_ILK, 0); I915_WRITE(WM2_LP_ILK, 0); I915_WRITE(WM1_LP_ILK, 0); /* According to the spec, bit 13 (RCZUNIT) must be set on IVB. * This implements the WaDisableRCZUnitClockGating workaround. */ I915_WRITE(GEN6_UCGCTL2, GEN6_RCZUNIT_CLOCK_GATE_DISABLE); /* Apply the WaDisableRHWOOptimizationForRenderHang workaround. */ I915_WRITE(GEN7_COMMON_SLICE_CHICKEN1, GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC); /* WaApplyL3ControlAndL3ChickenMode requires those two on Ivy Bridge */ I915_WRITE(GEN7_L3CNTLREG1, GEN7_WA_FOR_GEN7_L3_CONTROL); I915_WRITE(GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE); /* This is required by WaCatErrorRejectionIssue */ I915_WRITE(GEN7_SQ_CHICKEN_MBCUNIT_CONFIG, I915_READ(GEN7_SQ_CHICKEN_MBCUNIT_CONFIG) | GEN7_SQ_CHICKEN_MBCUNIT_SQINTMOB); for_each_pipe(pipe) { I915_WRITE(DSPCNTR(pipe), I915_READ(DSPCNTR(pipe)) | DISPPLANE_TRICKLE_FEED_DISABLE); intel_flush_display_plane(dev_priv, pipe); } gen7_setup_fixed_func_scheduler(dev_priv); /* WaDisable4x2SubspanOptimization */ I915_WRITE(CACHE_MODE_1, _MASKED_BIT_ENABLE(PIXEL_SUBSPAN_COLLECT_OPT_DISABLE)); /* WaMbcDriverBootEnable */ I915_WRITE(GEN6_MBCTL, I915_READ(GEN6_MBCTL) | GEN6_MBCTL_ENABLE_BOOT_FETCH); /* XXX: This is a workaround for early silicon revisions and should be * removed later. */ I915_WRITE(WM_DBG, I915_READ(WM_DBG) | WM_DBG_DISALLOW_MULTIPLE_LP | WM_DBG_DISALLOW_SPRITE | WM_DBG_DISALLOW_MAXFIFO); lpt_init_clock_gating(dev); } static void ivybridge_init_clock_gating(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; int pipe; uint32_t snpcr; I915_WRITE(WM3_LP_ILK, 0); I915_WRITE(WM2_LP_ILK, 0); I915_WRITE(WM1_LP_ILK, 0); I915_WRITE(ILK_DSPCLK_GATE_D, ILK_VRHUNIT_CLOCK_GATE_DISABLE); /* WaDisableEarlyCull */ I915_WRITE(_3D_CHICKEN3, _MASKED_BIT_ENABLE(_3D_CHICKEN_SF_DISABLE_OBJEND_CULL)); /* WaDisableBackToBackFlipFix */ I915_WRITE(IVB_CHICKEN3, CHICKEN3_DGMG_REQ_OUT_FIX_DISABLE | CHICKEN3_DGMG_DONE_FIX_DISABLE); /* WaDisablePSDDualDispatchEnable */ if (IS_IVB_GT1(dev)) I915_WRITE(GEN7_HALF_SLICE_CHICKEN1, _MASKED_BIT_ENABLE(GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE)); else I915_WRITE(GEN7_HALF_SLICE_CHICKEN1_GT2, _MASKED_BIT_ENABLE(GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE)); /* Apply the WaDisableRHWOOptimizationForRenderHang workaround. */ I915_WRITE(GEN7_COMMON_SLICE_CHICKEN1, GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC); /* WaApplyL3ControlAndL3ChickenMode requires those two on Ivy Bridge */ I915_WRITE(GEN7_L3CNTLREG1, GEN7_WA_FOR_GEN7_L3_CONTROL); I915_WRITE(GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE); if (IS_IVB_GT1(dev)) I915_WRITE(GEN7_ROW_CHICKEN2, _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE)); else I915_WRITE(GEN7_ROW_CHICKEN2_GT2, _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE)); /* WaForceL3Serialization */ I915_WRITE(GEN7_L3SQCREG4, I915_READ(GEN7_L3SQCREG4) & ~L3SQ_URB_READ_CAM_MATCH_DISABLE); /* According to the BSpec vol1g, bit 12 (RCPBUNIT) clock * gating disable must be set. Failure to set it results in * flickering pixels due to Z write ordering failures after * some amount of runtime in the Mesa "fire" demo, and Unigine * Sanctuary and Tropics, and apparently anything else with * alpha test or pixel discard. * * According to the spec, bit 11 (RCCUNIT) must also be set, * but we didn't debug actual testcases to find it out. * * According to the spec, bit 13 (RCZUNIT) must be set on IVB. * This implements the WaDisableRCZUnitClockGating workaround. */ I915_WRITE(GEN6_UCGCTL2, GEN6_RCZUNIT_CLOCK_GATE_DISABLE | GEN6_RCCUNIT_CLOCK_GATE_DISABLE); /* This is required by WaCatErrorRejectionIssue */ I915_WRITE(GEN7_SQ_CHICKEN_MBCUNIT_CONFIG, I915_READ(GEN7_SQ_CHICKEN_MBCUNIT_CONFIG) | GEN7_SQ_CHICKEN_MBCUNIT_SQINTMOB); for_each_pipe(pipe) { I915_WRITE(DSPCNTR(pipe), I915_READ(DSPCNTR(pipe)) | DISPPLANE_TRICKLE_FEED_DISABLE); intel_flush_display_plane(dev_priv, pipe); } /* WaMbcDriverBootEnable */ I915_WRITE(GEN6_MBCTL, I915_READ(GEN6_MBCTL) | GEN6_MBCTL_ENABLE_BOOT_FETCH); gen7_setup_fixed_func_scheduler(dev_priv); /* WaDisable4x2SubspanOptimization */ I915_WRITE(CACHE_MODE_1, _MASKED_BIT_ENABLE(PIXEL_SUBSPAN_COLLECT_OPT_DISABLE)); snpcr = I915_READ(GEN6_MBCUNIT_SNPCR); snpcr &= ~GEN6_MBC_SNPCR_MASK; snpcr |= GEN6_MBC_SNPCR_MED; I915_WRITE(GEN6_MBCUNIT_SNPCR, snpcr); cpt_init_clock_gating(dev); } static void valleyview_init_clock_gating(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; int pipe; I915_WRITE(WM3_LP_ILK, 0); I915_WRITE(WM2_LP_ILK, 0); I915_WRITE(WM1_LP_ILK, 0); I915_WRITE(ILK_DSPCLK_GATE_D, ILK_VRHUNIT_CLOCK_GATE_DISABLE); /* WaDisableEarlyCull */ I915_WRITE(_3D_CHICKEN3, _MASKED_BIT_ENABLE(_3D_CHICKEN_SF_DISABLE_OBJEND_CULL)); /* WaDisableBackToBackFlipFix */ I915_WRITE(IVB_CHICKEN3, CHICKEN3_DGMG_REQ_OUT_FIX_DISABLE | CHICKEN3_DGMG_DONE_FIX_DISABLE); I915_WRITE(GEN7_HALF_SLICE_CHICKEN1, _MASKED_BIT_ENABLE(GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE)); /* Apply the WaDisableRHWOOptimizationForRenderHang workaround. */ I915_WRITE(GEN7_COMMON_SLICE_CHICKEN1, GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC); /* WaApplyL3ControlAndL3ChickenMode requires those two on Ivy Bridge */ I915_WRITE(GEN7_L3CNTLREG1, I915_READ(GEN7_L3CNTLREG1) | GEN7_L3AGDIS); I915_WRITE(GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE); /* WaForceL3Serialization */ I915_WRITE(GEN7_L3SQCREG4, I915_READ(GEN7_L3SQCREG4) & ~L3SQ_URB_READ_CAM_MATCH_DISABLE); /* WaDisableDopClockGating */ I915_WRITE(GEN7_ROW_CHICKEN2, _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE)); /* WaForceL3Serialization */ I915_WRITE(GEN7_L3SQCREG4, I915_READ(GEN7_L3SQCREG4) & ~L3SQ_URB_READ_CAM_MATCH_DISABLE); /* This is required by WaCatErrorRejectionIssue */ I915_WRITE(GEN7_SQ_CHICKEN_MBCUNIT_CONFIG, I915_READ(GEN7_SQ_CHICKEN_MBCUNIT_CONFIG) | GEN7_SQ_CHICKEN_MBCUNIT_SQINTMOB); /* WaMbcDriverBootEnable */ I915_WRITE(GEN6_MBCTL, I915_READ(GEN6_MBCTL) | GEN6_MBCTL_ENABLE_BOOT_FETCH); /* According to the BSpec vol1g, bit 12 (RCPBUNIT) clock * gating disable must be set. Failure to set it results in * flickering pixels due to Z write ordering failures after * some amount of runtime in the Mesa "fire" demo, and Unigine * Sanctuary and Tropics, and apparently anything else with * alpha test or pixel discard. * * According to the spec, bit 11 (RCCUNIT) must also be set, * but we didn't debug actual testcases to find it out. * * According to the spec, bit 13 (RCZUNIT) must be set on IVB. * This implements the WaDisableRCZUnitClockGating workaround. * * Also apply WaDisableVDSUnitClockGating and * WaDisableRCPBUnitClockGating. */ I915_WRITE(GEN6_UCGCTL2, GEN7_VDSUNIT_CLOCK_GATE_DISABLE | GEN7_TDLUNIT_CLOCK_GATE_DISABLE | GEN6_RCZUNIT_CLOCK_GATE_DISABLE | GEN6_RCPBUNIT_CLOCK_GATE_DISABLE | GEN6_RCCUNIT_CLOCK_GATE_DISABLE); I915_WRITE(GEN7_UCGCTL4, GEN7_L3BANK2X_CLOCK_GATE_DISABLE); for_each_pipe(pipe) { I915_WRITE(DSPCNTR(pipe), I915_READ(DSPCNTR(pipe)) | DISPPLANE_TRICKLE_FEED_DISABLE); intel_flush_display_plane(dev_priv, pipe); } I915_WRITE(CACHE_MODE_1, _MASKED_BIT_ENABLE(PIXEL_SUBSPAN_COLLECT_OPT_DISABLE)); /* * On ValleyView, the GUnit needs to signal the GT * when flip and other events complete. So enable * all the GUnit->GT interrupts here */ I915_WRITE(VLV_DPFLIPSTAT, PIPEB_LINE_COMPARE_INT_EN | PIPEB_HLINE_INT_EN | PIPEB_VBLANK_INT_EN | SPRITED_FLIPDONE_INT_EN | SPRITEC_FLIPDONE_INT_EN | PLANEB_FLIPDONE_INT_EN | PIPEA_LINE_COMPARE_INT_EN | PIPEA_HLINE_INT_EN | PIPEA_VBLANK_INT_EN | SPRITEB_FLIPDONE_INT_EN | SPRITEA_FLIPDONE_INT_EN | PLANEA_FLIPDONE_INT_EN); /* * WaDisableVLVClockGating_VBIIssue * Disable clock gating on th GCFG unit to prevent a delay * in the reporting of vblank events. */ I915_WRITE(VLV_GUNIT_CLOCK_GATE, GCFG_DIS); } static void g4x_init_clock_gating(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; uint32_t dspclk_gate; I915_WRITE(RENCLK_GATE_D1, 0); I915_WRITE(RENCLK_GATE_D2, VF_UNIT_CLOCK_GATE_DISABLE | GS_UNIT_CLOCK_GATE_DISABLE | CL_UNIT_CLOCK_GATE_DISABLE); I915_WRITE(RAMCLK_GATE_D, 0); dspclk_gate = VRHUNIT_CLOCK_GATE_DISABLE | OVRUNIT_CLOCK_GATE_DISABLE | OVCUNIT_CLOCK_GATE_DISABLE; if (IS_GM45(dev)) dspclk_gate |= DSSUNIT_CLOCK_GATE_DISABLE; I915_WRITE(DSPCLK_GATE_D, dspclk_gate); /* WaDisableRenderCachePipelinedFlush */ I915_WRITE(CACHE_MODE_0, _MASKED_BIT_ENABLE(CM0_PIPELINED_RENDER_FLUSH_DISABLE)); } static void crestline_init_clock_gating(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; I915_WRITE(RENCLK_GATE_D1, I965_RCC_CLOCK_GATE_DISABLE); I915_WRITE(RENCLK_GATE_D2, 0); I915_WRITE(DSPCLK_GATE_D, 0); I915_WRITE(RAMCLK_GATE_D, 0); I915_WRITE16(DEUC, 0); } static void broadwater_init_clock_gating(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; I915_WRITE(RENCLK_GATE_D1, I965_RCZ_CLOCK_GATE_DISABLE | I965_RCC_CLOCK_GATE_DISABLE | I965_RCPB_CLOCK_GATE_DISABLE | I965_ISC_CLOCK_GATE_DISABLE | I965_FBC_CLOCK_GATE_DISABLE); I915_WRITE(RENCLK_GATE_D2, 0); } static void gen3_init_clock_gating(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; u32 dstate = I915_READ(D_STATE); dstate |= DSTATE_PLL_D3_OFF | DSTATE_GFX_CLOCK_GATING | DSTATE_DOT_CLOCK_GATING; I915_WRITE(D_STATE, dstate); if (IS_PINEVIEW(dev)) I915_WRITE(ECOSKPD, _MASKED_BIT_ENABLE(ECO_GATING_CX_ONLY)); /* IIR "flip pending" means done if this bit is set */ I915_WRITE(ECOSKPD, _MASKED_BIT_DISABLE(ECO_FLIP_DONE)); } static void i85x_init_clock_gating(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; I915_WRITE(RENCLK_GATE_D1, SV_CLOCK_GATE_DISABLE); } static void i830_init_clock_gating(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; I915_WRITE(DSPCLK_GATE_D, OVRUNIT_CLOCK_GATE_DISABLE); } void intel_init_clock_gating(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; dev_priv->display.init_clock_gating(dev); } /* Starting with Haswell, we have different power wells for * different parts of the GPU. This attempts to enable them all. */ void intel_init_power_wells(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; unsigned long power_wells[] = { HSW_PWR_WELL_CTL1, HSW_PWR_WELL_CTL2, HSW_PWR_WELL_CTL4 }; int i; if (!IS_HASWELL(dev)) return; DRM_LOCK(dev); for (i = 0; i < ARRAY_SIZE(power_wells); i++) { int well = I915_READ(power_wells[i]); if ((well & HSW_PWR_WELL_STATE) == 0) { I915_WRITE(power_wells[i], well & HSW_PWR_WELL_ENABLE); if (wait_for((I915_READ(power_wells[i]) & HSW_PWR_WELL_STATE), 20)) DRM_ERROR("Error enabling power well %lx\n", power_wells[i]); } } DRM_UNLOCK(dev); } /* Set up chip specific power management-related functions */ void intel_init_pm(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; if (I915_HAS_FBC(dev)) { if (HAS_PCH_SPLIT(dev)) { dev_priv->display.fbc_enabled = ironlake_fbc_enabled; dev_priv->display.enable_fbc = ironlake_enable_fbc; dev_priv->display.disable_fbc = ironlake_disable_fbc; } else if (IS_GM45(dev)) { dev_priv->display.fbc_enabled = g4x_fbc_enabled; dev_priv->display.enable_fbc = g4x_enable_fbc; dev_priv->display.disable_fbc = g4x_disable_fbc; } else if (IS_CRESTLINE(dev)) { dev_priv->display.fbc_enabled = i8xx_fbc_enabled; dev_priv->display.enable_fbc = i8xx_enable_fbc; dev_priv->display.disable_fbc = i8xx_disable_fbc; } /* 855GM needs testing */ } /* For cxsr */ if (IS_PINEVIEW(dev)) i915_pineview_get_mem_freq(dev); else if (IS_GEN5(dev)) i915_ironlake_get_mem_freq(dev); /* For FIFO watermark updates */ if (HAS_PCH_SPLIT(dev)) { if (IS_GEN5(dev)) { if (I915_READ(MLTR_ILK) & ILK_SRLT_MASK) dev_priv->display.update_wm = ironlake_update_wm; else { DRM_DEBUG_KMS("Failed to get proper latency. " "Disable CxSR\n"); dev_priv->display.update_wm = NULL; } dev_priv->display.init_clock_gating = ironlake_init_clock_gating; } else if (IS_GEN6(dev)) { if (SNB_READ_WM0_LATENCY()) { dev_priv->display.update_wm = sandybridge_update_wm; dev_priv->display.update_sprite_wm = sandybridge_update_sprite_wm; } else { DRM_DEBUG_KMS("Failed to read display plane latency. " "Disable CxSR\n"); dev_priv->display.update_wm = NULL; } dev_priv->display.init_clock_gating = gen6_init_clock_gating; } else if (IS_IVYBRIDGE(dev)) { /* FIXME: detect B0+ stepping and use auto training */ if (SNB_READ_WM0_LATENCY()) { dev_priv->display.update_wm = ivybridge_update_wm; dev_priv->display.update_sprite_wm = sandybridge_update_sprite_wm; } else { DRM_DEBUG_KMS("Failed to read display plane latency. " "Disable CxSR\n"); dev_priv->display.update_wm = NULL; } dev_priv->display.init_clock_gating = ivybridge_init_clock_gating; } else if (IS_HASWELL(dev)) { if (SNB_READ_WM0_LATENCY()) { dev_priv->display.update_wm = sandybridge_update_wm; dev_priv->display.update_sprite_wm = sandybridge_update_sprite_wm; dev_priv->display.update_linetime_wm = haswell_update_linetime_wm; } else { DRM_DEBUG_KMS("Failed to read display plane latency. " "Disable CxSR\n"); dev_priv->display.update_wm = NULL; } dev_priv->display.init_clock_gating = haswell_init_clock_gating; } else dev_priv->display.update_wm = NULL; } else if (IS_VALLEYVIEW(dev)) { dev_priv->display.update_wm = valleyview_update_wm; dev_priv->display.init_clock_gating = valleyview_init_clock_gating; } else if (IS_PINEVIEW(dev)) { if (!intel_get_cxsr_latency(IS_PINEVIEW_G(dev), dev_priv->is_ddr3, dev_priv->fsb_freq, dev_priv->mem_freq)) { DRM_INFO("failed to find known CxSR latency " "(found ddr%s fsb freq %d, mem freq %d), " "disabling CxSR\n", (dev_priv->is_ddr3 == 1) ? "3" : "2", dev_priv->fsb_freq, dev_priv->mem_freq); /* Disable CxSR and never update its watermark again */ pineview_disable_cxsr(dev); dev_priv->display.update_wm = NULL; } else dev_priv->display.update_wm = pineview_update_wm; dev_priv->display.init_clock_gating = gen3_init_clock_gating; } else if (IS_G4X(dev)) { dev_priv->display.update_wm = g4x_update_wm; dev_priv->display.init_clock_gating = g4x_init_clock_gating; } else if (IS_GEN4(dev)) { dev_priv->display.update_wm = i965_update_wm; if (IS_CRESTLINE(dev)) dev_priv->display.init_clock_gating = crestline_init_clock_gating; else if (IS_BROADWATER(dev)) dev_priv->display.init_clock_gating = broadwater_init_clock_gating; } else if (IS_GEN3(dev)) { dev_priv->display.update_wm = i9xx_update_wm; dev_priv->display.get_fifo_size = i9xx_get_fifo_size; dev_priv->display.init_clock_gating = gen3_init_clock_gating; } else if (IS_I865G(dev)) { dev_priv->display.update_wm = i830_update_wm; dev_priv->display.init_clock_gating = i85x_init_clock_gating; dev_priv->display.get_fifo_size = i830_get_fifo_size; } else if (IS_I85X(dev)) { dev_priv->display.update_wm = i9xx_update_wm; dev_priv->display.get_fifo_size = i85x_get_fifo_size; dev_priv->display.init_clock_gating = i85x_init_clock_gating; } else { dev_priv->display.update_wm = i830_update_wm; dev_priv->display.init_clock_gating = i830_init_clock_gating; if (IS_845G(dev)) dev_priv->display.get_fifo_size = i845_get_fifo_size; else dev_priv->display.get_fifo_size = i830_get_fifo_size; } } static void __gen6_gt_wait_for_thread_c0(struct drm_i915_private *dev_priv) { u32 gt_thread_status_mask; if (IS_HASWELL(dev_priv->dev)) gt_thread_status_mask = GEN6_GT_THREAD_STATUS_CORE_MASK_HSW; else gt_thread_status_mask = GEN6_GT_THREAD_STATUS_CORE_MASK; /* w/a for a sporadic read returning 0 by waiting for the GT * thread to wake up. */ if (wait_for_atomic_us((I915_READ_NOTRACE(GEN6_GT_THREAD_STATUS_REG) & gt_thread_status_mask) == 0, 500)) DRM_ERROR("GT thread status wait timed out\n"); } static void __gen6_gt_force_wake_reset(struct drm_i915_private *dev_priv) { I915_WRITE_NOTRACE(FORCEWAKE, 0); POSTING_READ(ECOBUS); /* something from same cacheline, but !FORCEWAKE */ } static void __gen6_gt_force_wake_get(struct drm_i915_private *dev_priv) { u32 forcewake_ack; if (IS_HASWELL(dev_priv->dev)) forcewake_ack = FORCEWAKE_ACK_HSW; else forcewake_ack = FORCEWAKE_ACK; if (wait_for_atomic((I915_READ_NOTRACE(forcewake_ack) & 1) == 0, FORCEWAKE_ACK_TIMEOUT_MS)) DRM_ERROR("Timed out waiting for forcewake old ack to clear.\n"); I915_WRITE_NOTRACE(FORCEWAKE, FORCEWAKE_KERNEL); POSTING_READ(ECOBUS); /* something from same cacheline, but !FORCEWAKE */ if (wait_for_atomic((I915_READ_NOTRACE(forcewake_ack) & 1), FORCEWAKE_ACK_TIMEOUT_MS)) DRM_ERROR("Timed out waiting for forcewake to ack request.\n"); __gen6_gt_wait_for_thread_c0(dev_priv); } static void __gen6_gt_force_wake_mt_reset(struct drm_i915_private *dev_priv) { I915_WRITE_NOTRACE(FORCEWAKE_MT, _MASKED_BIT_DISABLE(0xffff)); /* something from same cacheline, but !FORCEWAKE_MT */ POSTING_READ(ECOBUS); } static void __gen6_gt_force_wake_mt_get(struct drm_i915_private *dev_priv) { u32 forcewake_ack; if (IS_HASWELL(dev_priv->dev)) forcewake_ack = FORCEWAKE_ACK_HSW; else forcewake_ack = FORCEWAKE_MT_ACK; if (wait_for_atomic((I915_READ_NOTRACE(forcewake_ack) & 1) == 0, FORCEWAKE_ACK_TIMEOUT_MS)) DRM_ERROR("Timed out waiting for forcewake old ack to clear.\n"); I915_WRITE_NOTRACE(FORCEWAKE_MT, _MASKED_BIT_ENABLE(FORCEWAKE_KERNEL)); /* something from same cacheline, but !FORCEWAKE_MT */ POSTING_READ(ECOBUS); if (wait_for_atomic((I915_READ_NOTRACE(forcewake_ack) & 1), FORCEWAKE_ACK_TIMEOUT_MS)) DRM_ERROR("Timed out waiting for forcewake to ack request.\n"); __gen6_gt_wait_for_thread_c0(dev_priv); } /* * Generally this is called implicitly by the register read function. However, * if some sequence requires the GT to not power down then this function should * be called at the beginning of the sequence followed by a call to * gen6_gt_force_wake_put() at the end of the sequence. */ void gen6_gt_force_wake_get(struct drm_i915_private *dev_priv) { mtx_lock(&dev_priv->gt_lock); if (dev_priv->forcewake_count++ == 0) dev_priv->gt.force_wake_get(dev_priv); mtx_unlock(&dev_priv->gt_lock); } void gen6_gt_check_fifodbg(struct drm_i915_private *dev_priv) { u32 gtfifodbg; gtfifodbg = I915_READ_NOTRACE(GTFIFODBG); if (WARN(gtfifodbg & GT_FIFO_CPU_ERROR_MASK, "MMIO read or write has been dropped %x\n", gtfifodbg)) I915_WRITE_NOTRACE(GTFIFODBG, GT_FIFO_CPU_ERROR_MASK); } static void __gen6_gt_force_wake_put(struct drm_i915_private *dev_priv) { I915_WRITE_NOTRACE(FORCEWAKE, 0); /* something from same cacheline, but !FORCEWAKE */ POSTING_READ(ECOBUS); gen6_gt_check_fifodbg(dev_priv); } static void __gen6_gt_force_wake_mt_put(struct drm_i915_private *dev_priv) { I915_WRITE_NOTRACE(FORCEWAKE_MT, _MASKED_BIT_DISABLE(FORCEWAKE_KERNEL)); /* something from same cacheline, but !FORCEWAKE_MT */ POSTING_READ(ECOBUS); gen6_gt_check_fifodbg(dev_priv); } /* * see gen6_gt_force_wake_get() */ void gen6_gt_force_wake_put(struct drm_i915_private *dev_priv) { mtx_lock(&dev_priv->gt_lock); if (--dev_priv->forcewake_count == 0) dev_priv->gt.force_wake_put(dev_priv); mtx_unlock(&dev_priv->gt_lock); } int __gen6_gt_wait_for_fifo(struct drm_i915_private *dev_priv) { int ret = 0; if (dev_priv->gt_fifo_count < GT_FIFO_NUM_RESERVED_ENTRIES) { int loop = 500; u32 fifo = I915_READ_NOTRACE(GT_FIFO_FREE_ENTRIES); while (fifo <= GT_FIFO_NUM_RESERVED_ENTRIES && loop--) { udelay(10); fifo = I915_READ_NOTRACE(GT_FIFO_FREE_ENTRIES); } if (WARN_ON(loop < 0 && fifo <= GT_FIFO_NUM_RESERVED_ENTRIES)) ++ret; dev_priv->gt_fifo_count = fifo; } dev_priv->gt_fifo_count--; return ret; } static void vlv_force_wake_reset(struct drm_i915_private *dev_priv) { I915_WRITE_NOTRACE(FORCEWAKE_VLV, _MASKED_BIT_DISABLE(0xffff)); /* something from same cacheline, but !FORCEWAKE_VLV */ POSTING_READ(FORCEWAKE_ACK_VLV); } static void vlv_force_wake_get(struct drm_i915_private *dev_priv) { if (wait_for_atomic((I915_READ_NOTRACE(FORCEWAKE_ACK_VLV) & 1) == 0, FORCEWAKE_ACK_TIMEOUT_MS)) DRM_ERROR("Timed out waiting for forcewake old ack to clear.\n"); I915_WRITE_NOTRACE(FORCEWAKE_VLV, _MASKED_BIT_ENABLE(FORCEWAKE_KERNEL)); if (wait_for_atomic((I915_READ_NOTRACE(FORCEWAKE_ACK_VLV) & 1), FORCEWAKE_ACK_TIMEOUT_MS)) DRM_ERROR("Timed out waiting for forcewake to ack request.\n"); __gen6_gt_wait_for_thread_c0(dev_priv); } static void vlv_force_wake_put(struct drm_i915_private *dev_priv) { I915_WRITE_NOTRACE(FORCEWAKE_VLV, _MASKED_BIT_DISABLE(FORCEWAKE_KERNEL)); /* something from same cacheline, but !FORCEWAKE_VLV */ POSTING_READ(FORCEWAKE_ACK_VLV); gen6_gt_check_fifodbg(dev_priv); } void intel_gt_reset(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; if (IS_VALLEYVIEW(dev)) { vlv_force_wake_reset(dev_priv); } else if (INTEL_INFO(dev)->gen >= 6) { __gen6_gt_force_wake_reset(dev_priv); if (IS_IVYBRIDGE(dev) || IS_HASWELL(dev)) __gen6_gt_force_wake_mt_reset(dev_priv); } } void intel_gt_init(struct drm_device *dev) { struct drm_i915_private *dev_priv = dev->dev_private; mtx_init(&dev_priv->gt_lock, "i915_gt_lock", NULL, MTX_DEF); intel_gt_reset(dev); if (IS_VALLEYVIEW(dev)) { dev_priv->gt.force_wake_get = vlv_force_wake_get; dev_priv->gt.force_wake_put = vlv_force_wake_put; } else if (IS_IVYBRIDGE(dev) || IS_HASWELL(dev)) { dev_priv->gt.force_wake_get = __gen6_gt_force_wake_mt_get; dev_priv->gt.force_wake_put = __gen6_gt_force_wake_mt_put; } else if (IS_GEN6(dev)) { dev_priv->gt.force_wake_get = __gen6_gt_force_wake_get; dev_priv->gt.force_wake_put = __gen6_gt_force_wake_put; } TIMEOUT_TASK_INIT(dev_priv->wq, &dev_priv->rps.delayed_resume_work, 0, intel_gen6_powersave_work, dev_priv); } int sandybridge_pcode_read(struct drm_i915_private *dev_priv, u8 mbox, u32 *val) { sx_assert(&dev_priv->rps.hw_lock, SA_XLOCKED); if (I915_READ(GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY) { DRM_DEBUG_DRIVER("warning: pcode (read) mailbox access failed\n"); return -EAGAIN; } I915_WRITE(GEN6_PCODE_DATA, *val); I915_WRITE(GEN6_PCODE_MAILBOX, GEN6_PCODE_READY | mbox); if (wait_for((I915_READ(GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY) == 0, 500)) { DRM_ERROR("timeout waiting for pcode read (%d) to finish\n", mbox); return -ETIMEDOUT; } *val = I915_READ(GEN6_PCODE_DATA); I915_WRITE(GEN6_PCODE_DATA, 0); return 0; } int sandybridge_pcode_write(struct drm_i915_private *dev_priv, u8 mbox, u32 val) { sx_assert(&dev_priv->rps.hw_lock, SA_XLOCKED); if (I915_READ(GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY) { DRM_DEBUG_DRIVER("warning: pcode (write) mailbox access failed\n"); return -EAGAIN; } I915_WRITE(GEN6_PCODE_DATA, val); I915_WRITE(GEN6_PCODE_MAILBOX, GEN6_PCODE_READY | mbox); if (wait_for((I915_READ(GEN6_PCODE_MAILBOX) & GEN6_PCODE_READY) == 0, 500)) { DRM_ERROR("timeout waiting for pcode write (%d) to finish\n", mbox); return -ETIMEDOUT; } I915_WRITE(GEN6_PCODE_DATA, 0); return 0; } Index: head/sys/dev/efidev/efirtc.c =================================================================== --- head/sys/dev/efidev/efirtc.c (revision 336913) +++ head/sys/dev/efidev/efirtc.c (revision 336914) @@ -1,206 +1,206 @@ /*- * Copyright (c) 2017 Andrew Turner * All rights reserved. * * This software was developed by SRI International and the University of * Cambridge Computer Laboratory under DARPA/AFRL contract FA8750-10-C-0237 * ("CTSRD"), as part of the DARPA CRASH research programme. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include "clock_if.h" static bool efirtc_zeroes_subseconds; static struct timespec efirtc_resadj; static const u_int us_per_s = 1000000; static const u_int ns_per_s = 1000000000; static const u_int ns_per_us = 1000; static void efirtc_identify(driver_t *driver, device_t parent) { /* Don't add the driver unless we have working runtime services. */ if (efi_rt_ok() != 0) return; if (device_find_child(parent, "efirtc", -1) != NULL) return; if (BUS_ADD_CHILD(parent, 0, "efirtc", -1) == NULL) device_printf(parent, "add child failed\n"); } static int efirtc_probe(device_t dev) { struct efi_tm tm; int error; /* * Check whether we can read the time. This will stop us from attaching * when there is EFI Runtime support but the gettime function is * unimplemented, e.g. on some builds of U-Boot. */ if ((error = efi_get_time(&tm)) != 0) { if (bootverbose) device_printf(dev, "cannot read EFI realtime clock\n"); return (error); } device_set_desc(dev, "EFI Realtime Clock"); return (BUS_PROBE_DEFAULT); } static int efirtc_attach(device_t dev) { struct efi_tmcap tmcap; long res; int error; bzero(&tmcap, sizeof(tmcap)); if ((error = efi_get_time_capabilities(&tmcap)) != 0) { device_printf(dev, "cannot get EFI time capabilities"); return (error); } /* Translate resolution in Hz to tick length in usec. */ if (tmcap.tc_res == 0) res = us_per_s; /* 0 is insane, assume 1 Hz. */ else if (tmcap.tc_res > us_per_s) res = 1; /* 1us is the best we can represent */ else res = us_per_s / tmcap.tc_res; /* Clock rounding adjustment is 1/2 of resolution, in nsec. */ efirtc_resadj.tv_nsec = (res * ns_per_us) / 2; /* Does the clock zero the subseconds when time is set? */ efirtc_zeroes_subseconds = tmcap.tc_stz; /* * Register. If the clock zeroes out the subseconds when it's set, * schedule the SetTime calls to happen just before top-of-second. */ clock_register_flags(dev, res, CLOCKF_SETTIME_NO_ADJ); if (efirtc_zeroes_subseconds) clock_schedule(dev, ns_per_s - ns_per_us); return (0); } static int efirtc_detach(device_t dev) { clock_unregister(dev); return (0); } static int efirtc_gettime(device_t dev, struct timespec *ts) { struct clocktime ct; struct efi_tm tm; int error; error = efi_get_time(&tm); if (error != 0) return (error); ct.sec = tm.tm_sec; ct.min = tm.tm_min; ct.hour = tm.tm_hour; ct.day = tm.tm_mday; ct.mon = tm.tm_mon; ct.year = tm.tm_year; ct.nsec = tm.tm_nsec; clock_dbgprint_ct(dev, CLOCK_DBG_READ, &ct); return (clock_ct_to_ts(&ct, ts)); } static int efirtc_settime(device_t dev, struct timespec *ts) { struct clocktime ct; struct efi_tm tm; /* * We request a timespec with no resolution-adjustment so that we can * apply it ourselves based on whether or not the clock zeroes the * sub-second part of the time when setting the time. */ ts->tv_sec -= utc_offset(); if (!efirtc_zeroes_subseconds) - timespecadd(ts, &efirtc_resadj); + timespecadd(ts, &efirtc_resadj, ts); clock_ts_to_ct(ts, &ct); clock_dbgprint_ct(dev, CLOCK_DBG_WRITE, &ct); bzero(&tm, sizeof(tm)); tm.tm_sec = ct.sec; tm.tm_min = ct.min; tm.tm_hour = ct.hour; tm.tm_mday = ct.day; tm.tm_mon = ct.mon; tm.tm_year = ct.year; tm.tm_nsec = ct.nsec; return (efi_set_time(&tm)); } static device_method_t efirtc_methods[] = { /* Device interface */ DEVMETHOD(device_identify, efirtc_identify), DEVMETHOD(device_probe, efirtc_probe), DEVMETHOD(device_attach, efirtc_attach), DEVMETHOD(device_detach, efirtc_detach), /* Clock interface */ DEVMETHOD(clock_gettime, efirtc_gettime), DEVMETHOD(clock_settime, efirtc_settime), DEVMETHOD_END }; static devclass_t efirtc_devclass; static driver_t efirtc_driver = { "efirtc", efirtc_methods, 0 }; DRIVER_MODULE(efirtc, nexus, efirtc_driver, efirtc_devclass, 0, 0); MODULE_VERSION(efirtc, 1); MODULE_DEPEND(efirtc, efirt, 1, 1, 1); Index: head/sys/dev/isp/isp_freebsd.c =================================================================== --- head/sys/dev/isp/isp_freebsd.c (revision 336913) +++ head/sys/dev/isp/isp_freebsd.c (revision 336914) @@ -1,4313 +1,4314 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2009-2017 Alexander Motin * Copyright (c) 1997-2009 by Matthew Jacob * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Platform (FreeBSD) dependent common attachment code for Qlogic adapters. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include MODULE_VERSION(isp, 1); MODULE_DEPEND(isp, cam, 1, 1, 1); int isp_announced = 0; int isp_loop_down_limit = 60; /* default loop down limit */ int isp_quickboot_time = 7; /* don't wait more than N secs for loop up */ int isp_gone_device_time = 30; /* grace time before reporting device lost */ static const char prom3[] = "Chan %d [%u] PortID 0x%06x Departed because of %s"; static void isp_freeze_loopdown(ispsoftc_t *, int); static void isp_loop_changed(ispsoftc_t *isp, int chan); static d_ioctl_t ispioctl; static void isp_cam_async(void *, uint32_t, struct cam_path *, void *); static void isp_poll(struct cam_sim *); static timeout_t isp_watchdog; static timeout_t isp_gdt; static task_fn_t isp_gdt_task; static void isp_kthread(void *); static void isp_action(struct cam_sim *, union ccb *); static int isp_timer_count; static void isp_timer(void *); static struct cdevsw isp_cdevsw = { .d_version = D_VERSION, .d_ioctl = ispioctl, .d_name = "isp", }; static int isp_role_sysctl(SYSCTL_HANDLER_ARGS) { ispsoftc_t *isp = (ispsoftc_t *)arg1; int chan = arg2; int error, old, value; value = FCPARAM(isp, chan)->role; error = sysctl_handle_int(oidp, &value, 0, req); if ((error != 0) || (req->newptr == NULL)) return (error); if (value < ISP_ROLE_NONE || value > ISP_ROLE_BOTH) return (EINVAL); ISP_LOCK(isp); old = FCPARAM(isp, chan)->role; /* We don't allow target mode switch from here. */ value = (old & ISP_ROLE_TARGET) | (value & ISP_ROLE_INITIATOR); /* If nothing has changed -- we are done. */ if (value == old) { ISP_UNLOCK(isp); return (0); } /* Actually change the role. */ error = isp_control(isp, ISPCTL_CHANGE_ROLE, chan, value); ISP_UNLOCK(isp); return (error); } static int isp_attach_chan(ispsoftc_t *isp, struct cam_devq *devq, int chan) { struct ccb_setasync csa; struct cam_sim *sim; struct cam_path *path; #ifdef ISP_TARGET_MODE int i; #endif sim = cam_sim_alloc(isp_action, isp_poll, "isp", isp, device_get_unit(isp->isp_dev), &isp->isp_lock, isp->isp_maxcmds, isp->isp_maxcmds, devq); if (sim == NULL) return (ENOMEM); ISP_LOCK(isp); if (xpt_bus_register(sim, isp->isp_dev, chan) != CAM_SUCCESS) { ISP_UNLOCK(isp); cam_sim_free(sim, FALSE); return (EIO); } ISP_UNLOCK(isp); if (xpt_create_path(&path, NULL, cam_sim_path(sim), CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) { ISP_LOCK(isp); xpt_bus_deregister(cam_sim_path(sim)); ISP_UNLOCK(isp); cam_sim_free(sim, FALSE); return (ENXIO); } xpt_setup_ccb(&csa.ccb_h, path, 5); csa.ccb_h.func_code = XPT_SASYNC_CB; csa.event_enable = AC_LOST_DEVICE; csa.callback = isp_cam_async; csa.callback_arg = sim; ISP_LOCK(isp); xpt_action((union ccb *)&csa); ISP_UNLOCK(isp); if (IS_SCSI(isp)) { struct isp_spi *spi = ISP_SPI_PC(isp, chan); spi->sim = sim; spi->path = path; #ifdef ISP_TARGET_MODE TAILQ_INIT(&spi->waitq); STAILQ_INIT(&spi->ntfree); for (i = 0; i < ATPDPSIZE; i++) STAILQ_INSERT_TAIL(&spi->ntfree, &spi->ntpool[i], next); LIST_INIT(&spi->atfree); for (i = ATPDPSIZE-1; i >= 0; i--) LIST_INSERT_HEAD(&spi->atfree, &spi->atpool[i], next); for (i = 0; i < ATPDPHASHSIZE; i++) LIST_INIT(&spi->atused[i]); #endif } else { fcparam *fcp = FCPARAM(isp, chan); struct isp_fc *fc = ISP_FC_PC(isp, chan); struct sysctl_ctx_list *ctx = device_get_sysctl_ctx(isp->isp_osinfo.dev); struct sysctl_oid *tree = device_get_sysctl_tree(isp->isp_osinfo.dev); char name[16]; ISP_LOCK(isp); fc->sim = sim; fc->path = path; fc->isp = isp; fc->ready = 1; fcp->isp_use_gft_id = 1; fcp->isp_use_gff_id = 1; callout_init_mtx(&fc->gdt, &isp->isp_lock, 0); TASK_INIT(&fc->gtask, 1, isp_gdt_task, fc); #ifdef ISP_TARGET_MODE TAILQ_INIT(&fc->waitq); STAILQ_INIT(&fc->ntfree); for (i = 0; i < ATPDPSIZE; i++) STAILQ_INSERT_TAIL(&fc->ntfree, &fc->ntpool[i], next); LIST_INIT(&fc->atfree); for (i = ATPDPSIZE-1; i >= 0; i--) LIST_INSERT_HEAD(&fc->atfree, &fc->atpool[i], next); for (i = 0; i < ATPDPHASHSIZE; i++) LIST_INIT(&fc->atused[i]); #endif isp_loop_changed(isp, chan); ISP_UNLOCK(isp); if (kproc_create(isp_kthread, fc, &fc->kproc, 0, 0, "%s_%d", device_get_nameunit(isp->isp_osinfo.dev), chan)) { xpt_free_path(fc->path); ISP_LOCK(isp); xpt_bus_deregister(cam_sim_path(fc->sim)); ISP_UNLOCK(isp); cam_sim_free(fc->sim, FALSE); return (ENOMEM); } fc->num_threads += 1; if (chan > 0) { snprintf(name, sizeof(name), "chan%d", chan); tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, name, CTLFLAG_RW, 0, "Virtual channel"); } SYSCTL_ADD_QUAD(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "wwnn", CTLFLAG_RD, &fcp->isp_wwnn, "World Wide Node Name"); SYSCTL_ADD_QUAD(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "wwpn", CTLFLAG_RD, &fcp->isp_wwpn, "World Wide Port Name"); SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "loop_down_limit", CTLFLAG_RW, &fc->loop_down_limit, 0, "Loop Down Limit"); SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "gone_device_time", CTLFLAG_RW, &fc->gone_device_time, 0, "Gone Device Time"); #if defined(ISP_TARGET_MODE) && defined(DEBUG) SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "inject_lost_data_frame", CTLFLAG_RW, &fc->inject_lost_data_frame, 0, "Cause a Lost Frame on a Read"); #endif SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "role", CTLTYPE_INT | CTLFLAG_RW, isp, chan, isp_role_sysctl, "I", "Current role"); SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "speed", CTLFLAG_RD, &fcp->isp_gbspeed, 0, "Connection speed in gigabits"); SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "linkstate", CTLFLAG_RD, &fcp->isp_linkstate, 0, "Link state"); SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "fwstate", CTLFLAG_RD, &fcp->isp_fwstate, 0, "Firmware state"); SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "loopstate", CTLFLAG_RD, &fcp->isp_loopstate, 0, "Loop state"); SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "topo", CTLFLAG_RD, &fcp->isp_topo, 0, "Connection topology"); SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "use_gft_id", CTLFLAG_RWTUN, &fcp->isp_use_gft_id, 0, "Use GFT_ID during fabric scan"); SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "use_gff_id", CTLFLAG_RWTUN, &fcp->isp_use_gff_id, 0, "Use GFF_ID during fabric scan"); } return (0); } static void isp_detach_chan(ispsoftc_t *isp, int chan) { struct cam_sim *sim; struct cam_path *path; struct ccb_setasync csa; int *num_threads; ISP_GET_PC(isp, chan, sim, sim); ISP_GET_PC(isp, chan, path, path); ISP_GET_PC_ADDR(isp, chan, num_threads, num_threads); xpt_setup_ccb(&csa.ccb_h, path, 5); csa.ccb_h.func_code = XPT_SASYNC_CB; csa.event_enable = 0; csa.callback = isp_cam_async; csa.callback_arg = sim; xpt_action((union ccb *)&csa); xpt_free_path(path); xpt_bus_deregister(cam_sim_path(sim)); cam_sim_free(sim, FALSE); /* Wait for the channel's spawned threads to exit. */ wakeup(isp->isp_osinfo.pc.ptr); while (*num_threads != 0) mtx_sleep(isp, &isp->isp_lock, PRIBIO, "isp_reap", 100); } int isp_attach(ispsoftc_t *isp) { const char *nu = device_get_nameunit(isp->isp_osinfo.dev); int du = device_get_unit(isp->isp_dev); int chan; /* * Create the device queue for our SIM(s). */ isp->isp_osinfo.devq = cam_simq_alloc(isp->isp_maxcmds); if (isp->isp_osinfo.devq == NULL) { return (EIO); } for (chan = 0; chan < isp->isp_nchan; chan++) { if (isp_attach_chan(isp, isp->isp_osinfo.devq, chan)) { goto unwind; } } callout_init_mtx(&isp->isp_osinfo.tmo, &isp->isp_lock, 0); isp_timer_count = hz >> 2; callout_reset(&isp->isp_osinfo.tmo, isp_timer_count, isp_timer, isp); isp->isp_osinfo.cdev = make_dev(&isp_cdevsw, du, UID_ROOT, GID_OPERATOR, 0600, "%s", nu); if (isp->isp_osinfo.cdev) { isp->isp_osinfo.cdev->si_drv1 = isp; } return (0); unwind: while (--chan >= 0) { struct cam_sim *sim; struct cam_path *path; ISP_GET_PC(isp, chan, sim, sim); ISP_GET_PC(isp, chan, path, path); xpt_free_path(path); ISP_LOCK(isp); xpt_bus_deregister(cam_sim_path(sim)); ISP_UNLOCK(isp); cam_sim_free(sim, FALSE); } cam_simq_free(isp->isp_osinfo.devq); isp->isp_osinfo.devq = NULL; return (-1); } int isp_detach(ispsoftc_t *isp) { int chan; if (isp->isp_osinfo.cdev) { destroy_dev(isp->isp_osinfo.cdev); isp->isp_osinfo.cdev = NULL; } ISP_LOCK(isp); /* Tell spawned threads that we're exiting. */ isp->isp_osinfo.is_exiting = 1; for (chan = isp->isp_nchan - 1; chan >= 0; chan -= 1) isp_detach_chan(isp, chan); ISP_UNLOCK(isp); callout_drain(&isp->isp_osinfo.tmo); cam_simq_free(isp->isp_osinfo.devq); return (0); } static void isp_freeze_loopdown(ispsoftc_t *isp, int chan) { struct isp_fc *fc = ISP_FC_PC(isp, chan); if (fc->sim == NULL) return; if (fc->simqfrozen == 0) { isp_prt(isp, ISP_LOGDEBUG0, "Chan %d Freeze simq (loopdown)", chan); fc->simqfrozen = SIMQFRZ_LOOPDOWN; xpt_hold_boot(); xpt_freeze_simq(fc->sim, 1); } else { isp_prt(isp, ISP_LOGDEBUG0, "Chan %d Mark simq frozen (loopdown)", chan); fc->simqfrozen |= SIMQFRZ_LOOPDOWN; } } static void isp_unfreeze_loopdown(ispsoftc_t *isp, int chan) { struct isp_fc *fc = ISP_FC_PC(isp, chan); if (fc->sim == NULL) return; int wasfrozen = fc->simqfrozen & SIMQFRZ_LOOPDOWN; fc->simqfrozen &= ~SIMQFRZ_LOOPDOWN; if (wasfrozen && fc->simqfrozen == 0) { isp_prt(isp, ISP_LOGDEBUG0, "Chan %d Release simq", chan); xpt_release_simq(fc->sim, 1); xpt_release_boot(); } } static int ispioctl(struct cdev *dev, u_long c, caddr_t addr, int flags, struct thread *td) { ispsoftc_t *isp; int nr, chan, retval = ENOTTY; isp = dev->si_drv1; switch (c) { case ISP_SDBLEV: { int olddblev = isp->isp_dblev; isp->isp_dblev = *(int *)addr; *(int *)addr = olddblev; retval = 0; break; } case ISP_GETROLE: chan = *(int *)addr; if (chan < 0 || chan >= isp->isp_nchan) { retval = -ENXIO; break; } if (IS_FC(isp)) { *(int *)addr = FCPARAM(isp, chan)->role; } else { *(int *)addr = ISP_ROLE_INITIATOR; } retval = 0; break; case ISP_SETROLE: if (IS_SCSI(isp)) break; nr = *(int *)addr; chan = nr >> 8; if (chan < 0 || chan >= isp->isp_nchan) { retval = -ENXIO; break; } nr &= 0xff; if (nr & ~(ISP_ROLE_INITIATOR|ISP_ROLE_TARGET)) { retval = EINVAL; break; } ISP_LOCK(isp); *(int *)addr = FCPARAM(isp, chan)->role; retval = isp_control(isp, ISPCTL_CHANGE_ROLE, chan, nr); ISP_UNLOCK(isp); retval = 0; break; case ISP_RESETHBA: ISP_LOCK(isp); isp_reinit(isp, 0); ISP_UNLOCK(isp); retval = 0; break; case ISP_RESCAN: if (IS_FC(isp)) { chan = *(intptr_t *)addr; if (chan < 0 || chan >= isp->isp_nchan) { retval = -ENXIO; break; } ISP_LOCK(isp); if (isp_fc_runstate(isp, chan, 5 * 1000000) != LOOP_READY) { retval = EIO; } else { retval = 0; } ISP_UNLOCK(isp); } break; case ISP_FC_LIP: if (IS_FC(isp)) { chan = *(intptr_t *)addr; if (chan < 0 || chan >= isp->isp_nchan) { retval = -ENXIO; break; } ISP_LOCK(isp); if (isp_control(isp, ISPCTL_SEND_LIP, chan)) { retval = EIO; } else { retval = 0; } ISP_UNLOCK(isp); } break; case ISP_FC_GETDINFO: { struct isp_fc_device *ifc = (struct isp_fc_device *) addr; fcportdb_t *lp; if (IS_SCSI(isp)) { break; } if (ifc->loopid >= MAX_FC_TARG) { retval = EINVAL; break; } lp = &FCPARAM(isp, ifc->chan)->portdb[ifc->loopid]; if (lp->state != FC_PORTDB_STATE_NIL) { ifc->role = (lp->prli_word3 & SVC3_ROLE_MASK) >> SVC3_ROLE_SHIFT; ifc->loopid = lp->handle; ifc->portid = lp->portid; ifc->node_wwn = lp->node_wwn; ifc->port_wwn = lp->port_wwn; retval = 0; } else { retval = ENODEV; } break; } case ISP_FC_GETHINFO: { struct isp_hba_device *hba = (struct isp_hba_device *) addr; int chan = hba->fc_channel; if (chan < 0 || chan >= isp->isp_nchan) { retval = ENXIO; break; } hba->fc_fw_major = ISP_FW_MAJORX(isp->isp_fwrev); hba->fc_fw_minor = ISP_FW_MINORX(isp->isp_fwrev); hba->fc_fw_micro = ISP_FW_MICROX(isp->isp_fwrev); hba->fc_nchannels = isp->isp_nchan; if (IS_FC(isp)) { hba->fc_nports = MAX_FC_TARG; hba->fc_speed = FCPARAM(isp, hba->fc_channel)->isp_gbspeed; hba->fc_topology = FCPARAM(isp, chan)->isp_topo + 1; hba->fc_loopid = FCPARAM(isp, chan)->isp_loopid; hba->nvram_node_wwn = FCPARAM(isp, chan)->isp_wwnn_nvram; hba->nvram_port_wwn = FCPARAM(isp, chan)->isp_wwpn_nvram; hba->active_node_wwn = FCPARAM(isp, chan)->isp_wwnn; hba->active_port_wwn = FCPARAM(isp, chan)->isp_wwpn; } else { hba->fc_nports = MAX_TARGETS; hba->fc_speed = 0; hba->fc_topology = 0; hba->nvram_node_wwn = 0ull; hba->nvram_port_wwn = 0ull; hba->active_node_wwn = 0ull; hba->active_port_wwn = 0ull; } retval = 0; break; } case ISP_TSK_MGMT: { int needmarker; struct isp_fc_tsk_mgmt *fct = (struct isp_fc_tsk_mgmt *) addr; uint16_t nphdl; mbreg_t mbs; if (IS_SCSI(isp)) { break; } chan = fct->chan; if (chan < 0 || chan >= isp->isp_nchan) { retval = -ENXIO; break; } needmarker = retval = 0; nphdl = fct->loopid; ISP_LOCK(isp); if (IS_24XX(isp)) { void *reqp; uint8_t resp[QENTRY_LEN]; isp24xx_tmf_t tmf; isp24xx_statusreq_t sp; fcparam *fcp = FCPARAM(isp, chan); fcportdb_t *lp; int i; for (i = 0; i < MAX_FC_TARG; i++) { lp = &fcp->portdb[i]; if (lp->handle == nphdl) { break; } } if (i == MAX_FC_TARG) { retval = ENXIO; ISP_UNLOCK(isp); break; } ISP_MEMZERO(&tmf, sizeof(tmf)); tmf.tmf_header.rqs_entry_type = RQSTYPE_TSK_MGMT; tmf.tmf_header.rqs_entry_count = 1; tmf.tmf_nphdl = lp->handle; tmf.tmf_delay = 2; tmf.tmf_timeout = 4; tmf.tmf_tidlo = lp->portid; tmf.tmf_tidhi = lp->portid >> 16; tmf.tmf_vpidx = ISP_GET_VPIDX(isp, chan); tmf.tmf_lun[1] = fct->lun & 0xff; if (fct->lun >= 256) { tmf.tmf_lun[0] = 0x40 | (fct->lun >> 8); } switch (fct->action) { case IPT_CLEAR_ACA: tmf.tmf_flags = ISP24XX_TMF_CLEAR_ACA; break; case IPT_TARGET_RESET: tmf.tmf_flags = ISP24XX_TMF_TARGET_RESET; needmarker = 1; break; case IPT_LUN_RESET: tmf.tmf_flags = ISP24XX_TMF_LUN_RESET; needmarker = 1; break; case IPT_CLEAR_TASK_SET: tmf.tmf_flags = ISP24XX_TMF_CLEAR_TASK_SET; needmarker = 1; break; case IPT_ABORT_TASK_SET: tmf.tmf_flags = ISP24XX_TMF_ABORT_TASK_SET; needmarker = 1; break; default: retval = EINVAL; break; } if (retval) { ISP_UNLOCK(isp); break; } /* Prepare space for response in memory */ memset(resp, 0xff, sizeof(resp)); tmf.tmf_handle = isp_allocate_handle(isp, resp, ISP_HANDLE_CTRL); if (tmf.tmf_handle == 0) { isp_prt(isp, ISP_LOGERR, "%s: TMF of Chan %d out of handles", __func__, chan); ISP_UNLOCK(isp); retval = ENOMEM; break; } /* Send request and wait for response. */ reqp = isp_getrqentry(isp); if (reqp == NULL) { isp_prt(isp, ISP_LOGERR, "%s: TMF of Chan %d out of rqent", __func__, chan); isp_destroy_handle(isp, tmf.tmf_handle); ISP_UNLOCK(isp); retval = EIO; break; } isp_put_24xx_tmf(isp, &tmf, (isp24xx_tmf_t *)reqp); if (isp->isp_dblev & ISP_LOGDEBUG1) isp_print_bytes(isp, "IOCB TMF", QENTRY_LEN, reqp); ISP_SYNC_REQUEST(isp); if (msleep(resp, &isp->isp_lock, 0, "TMF", 5*hz) == EWOULDBLOCK) { isp_prt(isp, ISP_LOGERR, "%s: TMF of Chan %d timed out", __func__, chan); isp_destroy_handle(isp, tmf.tmf_handle); ISP_UNLOCK(isp); retval = EIO; break; } if (isp->isp_dblev & ISP_LOGDEBUG1) isp_print_bytes(isp, "IOCB TMF response", QENTRY_LEN, resp); isp_get_24xx_response(isp, (isp24xx_statusreq_t *)resp, &sp); if (sp.req_completion_status != 0) retval = EIO; else if (needmarker) fcp->sendmarker = 1; } else { MBSINIT(&mbs, 0, MBLOGALL, 0); if (ISP_CAP_2KLOGIN(isp) == 0) { nphdl <<= 8; } switch (fct->action) { case IPT_CLEAR_ACA: mbs.param[0] = MBOX_CLEAR_ACA; mbs.param[1] = nphdl; mbs.param[2] = fct->lun; break; case IPT_TARGET_RESET: mbs.param[0] = MBOX_TARGET_RESET; mbs.param[1] = nphdl; needmarker = 1; break; case IPT_LUN_RESET: mbs.param[0] = MBOX_LUN_RESET; mbs.param[1] = nphdl; mbs.param[2] = fct->lun; needmarker = 1; break; case IPT_CLEAR_TASK_SET: mbs.param[0] = MBOX_CLEAR_TASK_SET; mbs.param[1] = nphdl; mbs.param[2] = fct->lun; needmarker = 1; break; case IPT_ABORT_TASK_SET: mbs.param[0] = MBOX_ABORT_TASK_SET; mbs.param[1] = nphdl; mbs.param[2] = fct->lun; needmarker = 1; break; default: retval = EINVAL; break; } if (retval == 0) { if (needmarker) { FCPARAM(isp, chan)->sendmarker = 1; } retval = isp_control(isp, ISPCTL_RUN_MBOXCMD, &mbs); if (retval) { retval = EIO; } } } ISP_UNLOCK(isp); break; } default: break; } return (retval); } /* * Local Inlines */ static ISP_INLINE int isp_get_pcmd(ispsoftc_t *, union ccb *); static ISP_INLINE void isp_free_pcmd(ispsoftc_t *, union ccb *); static ISP_INLINE int isp_get_pcmd(ispsoftc_t *isp, union ccb *ccb) { ISP_PCMD(ccb) = isp->isp_osinfo.pcmd_free; if (ISP_PCMD(ccb) == NULL) { return (-1); } isp->isp_osinfo.pcmd_free = ((struct isp_pcmd *)ISP_PCMD(ccb))->next; return (0); } static ISP_INLINE void isp_free_pcmd(ispsoftc_t *isp, union ccb *ccb) { if (ISP_PCMD(ccb)) { #ifdef ISP_TARGET_MODE PISP_PCMD(ccb)->datalen = 0; #endif PISP_PCMD(ccb)->next = isp->isp_osinfo.pcmd_free; isp->isp_osinfo.pcmd_free = ISP_PCMD(ccb); ISP_PCMD(ccb) = NULL; } } /* * Put the target mode functions here, because some are inlines */ #ifdef ISP_TARGET_MODE static ISP_INLINE tstate_t *get_lun_statep(ispsoftc_t *, int, lun_id_t); static atio_private_data_t *isp_get_atpd(ispsoftc_t *, int, uint32_t); static atio_private_data_t *isp_find_atpd(ispsoftc_t *, int, uint32_t); static void isp_put_atpd(ispsoftc_t *, int, atio_private_data_t *); static inot_private_data_t *isp_get_ntpd(ispsoftc_t *, int); static inot_private_data_t *isp_find_ntpd(ispsoftc_t *, int, uint32_t, uint32_t); static void isp_put_ntpd(ispsoftc_t *, int, inot_private_data_t *); static cam_status create_lun_state(ispsoftc_t *, int, struct cam_path *, tstate_t **); static void destroy_lun_state(ispsoftc_t *, int, tstate_t *); static void isp_enable_lun(ispsoftc_t *, union ccb *); static void isp_disable_lun(ispsoftc_t *, union ccb *); static timeout_t isp_refire_putback_atio; static timeout_t isp_refire_notify_ack; static void isp_complete_ctio(union ccb *); static void isp_target_putback_atio(union ccb *); enum Start_Ctio_How { FROM_CAM, FROM_TIMER, FROM_SRR, FROM_CTIO_DONE }; static void isp_target_start_ctio(ispsoftc_t *, union ccb *, enum Start_Ctio_How); static void isp_handle_platform_atio2(ispsoftc_t *, at2_entry_t *); static void isp_handle_platform_atio7(ispsoftc_t *, at7_entry_t *); static void isp_handle_platform_ctio(ispsoftc_t *, void *); static int isp_handle_platform_target_notify_ack(ispsoftc_t *, isp_notify_t *, uint32_t rsp); static void isp_handle_platform_target_tmf(ispsoftc_t *, isp_notify_t *); static void isp_target_mark_aborted_early(ispsoftc_t *, int chan, tstate_t *, uint32_t); static ISP_INLINE tstate_t * get_lun_statep(ispsoftc_t *isp, int bus, lun_id_t lun) { tstate_t *tptr = NULL; struct tslist *lhp; if (bus < isp->isp_nchan) { ISP_GET_PC_ADDR(isp, bus, lun_hash[LUN_HASH_FUNC(lun)], lhp); SLIST_FOREACH(tptr, lhp, next) { if (tptr->ts_lun == lun) return (tptr); } } return (NULL); } static int isp_atio_restart(ispsoftc_t *isp, int bus, tstate_t *tptr) { inot_private_data_t *ntp; struct ntpdlist rq; if (STAILQ_EMPTY(&tptr->restart_queue)) return (0); STAILQ_INIT(&rq); STAILQ_CONCAT(&rq, &tptr->restart_queue); while ((ntp = STAILQ_FIRST(&rq)) != NULL) { STAILQ_REMOVE_HEAD(&rq, next); if (IS_24XX(isp)) { isp_prt(isp, ISP_LOGTDEBUG0, "%s: restarting resrc deprived %x", __func__, ((at7_entry_t *)ntp->data)->at_rxid); isp_handle_platform_atio7(isp, (at7_entry_t *) ntp->data); } else { isp_prt(isp, ISP_LOGTDEBUG0, "%s: restarting resrc deprived %x", __func__, ((at2_entry_t *)ntp->data)->at_rxid); isp_handle_platform_atio2(isp, (at2_entry_t *) ntp->data); } isp_put_ntpd(isp, bus, ntp); if (!STAILQ_EMPTY(&tptr->restart_queue)) break; } if (!STAILQ_EMPTY(&rq)) { STAILQ_CONCAT(&rq, &tptr->restart_queue); STAILQ_CONCAT(&tptr->restart_queue, &rq); } return (!STAILQ_EMPTY(&tptr->restart_queue)); } static void isp_tmcmd_restart(ispsoftc_t *isp) { tstate_t *tptr; union ccb *ccb; struct tslist *lhp; struct isp_ccbq *waitq; int bus, i; for (bus = 0; bus < isp->isp_nchan; bus++) { for (i = 0; i < LUN_HASH_SIZE; i++) { ISP_GET_PC_ADDR(isp, bus, lun_hash[i], lhp); SLIST_FOREACH(tptr, lhp, next) isp_atio_restart(isp, bus, tptr); } /* * We only need to do this once per channel. */ ISP_GET_PC_ADDR(isp, bus, waitq, waitq); ccb = (union ccb *)TAILQ_FIRST(waitq); if (ccb != NULL) { TAILQ_REMOVE(waitq, &ccb->ccb_h, sim_links.tqe); isp_target_start_ctio(isp, ccb, FROM_TIMER); } } } static atio_private_data_t * isp_get_atpd(ispsoftc_t *isp, int chan, uint32_t tag) { struct atpdlist *atfree; struct atpdlist *atused; atio_private_data_t *atp; ISP_GET_PC_ADDR(isp, chan, atfree, atfree); atp = LIST_FIRST(atfree); if (atp) { LIST_REMOVE(atp, next); atp->tag = tag; ISP_GET_PC(isp, chan, atused, atused); LIST_INSERT_HEAD(&atused[ATPDPHASH(tag)], atp, next); } return (atp); } static atio_private_data_t * isp_find_atpd(ispsoftc_t *isp, int chan, uint32_t tag) { struct atpdlist *atused; atio_private_data_t *atp; ISP_GET_PC(isp, chan, atused, atused); LIST_FOREACH(atp, &atused[ATPDPHASH(tag)], next) { if (atp->tag == tag) return (atp); } return (NULL); } static void isp_put_atpd(ispsoftc_t *isp, int chan, atio_private_data_t *atp) { struct atpdlist *atfree; if (atp->ests) { isp_put_ecmd(isp, atp->ests); } LIST_REMOVE(atp, next); memset(atp, 0, sizeof (*atp)); ISP_GET_PC_ADDR(isp, chan, atfree, atfree); LIST_INSERT_HEAD(atfree, atp, next); } static void isp_dump_atpd(ispsoftc_t *isp, int chan) { atio_private_data_t *atp, *atpool; const char *states[8] = { "Free", "ATIO", "CAM", "CTIO", "LAST_CTIO", "PDON", "?6", "7" }; ISP_GET_PC(isp, chan, atpool, atpool); for (atp = atpool; atp < &atpool[ATPDPSIZE]; atp++) { if (atp->state == ATPD_STATE_FREE) continue; isp_prt(isp, ISP_LOGALL, "Chan %d ATP [0x%x] origdlen %u bytes_xfrd %u lun %jx nphdl 0x%04x s_id 0x%06x d_id 0x%06x oxid 0x%04x state %s", chan, atp->tag, atp->orig_datalen, atp->bytes_xfered, (uintmax_t)atp->lun, atp->nphdl, atp->sid, atp->did, atp->oxid, states[atp->state & 0x7]); } } static inot_private_data_t * isp_get_ntpd(ispsoftc_t *isp, int chan) { struct ntpdlist *ntfree; inot_private_data_t *ntp; ISP_GET_PC_ADDR(isp, chan, ntfree, ntfree); ntp = STAILQ_FIRST(ntfree); if (ntp) STAILQ_REMOVE_HEAD(ntfree, next); return (ntp); } static inot_private_data_t * isp_find_ntpd(ispsoftc_t *isp, int chan, uint32_t tag_id, uint32_t seq_id) { inot_private_data_t *ntp, *ntp2; ISP_GET_PC(isp, chan, ntpool, ntp); ISP_GET_PC_ADDR(isp, chan, ntpool[ATPDPSIZE], ntp2); for (; ntp < ntp2; ntp++) { if (ntp->tag_id == tag_id && ntp->seq_id == seq_id) return (ntp); } return (NULL); } static void isp_put_ntpd(ispsoftc_t *isp, int chan, inot_private_data_t *ntp) { struct ntpdlist *ntfree; ntp->tag_id = ntp->seq_id = 0; ISP_GET_PC_ADDR(isp, chan, ntfree, ntfree); STAILQ_INSERT_HEAD(ntfree, ntp, next); } static cam_status create_lun_state(ispsoftc_t *isp, int bus, struct cam_path *path, tstate_t **rslt) { lun_id_t lun; struct tslist *lhp; tstate_t *tptr; lun = xpt_path_lun_id(path); if (lun != CAM_LUN_WILDCARD) { if (ISP_MAX_LUNS(isp) > 0 && lun >= ISP_MAX_LUNS(isp)) { return (CAM_LUN_INVALID); } } tptr = malloc(sizeof (tstate_t), M_DEVBUF, M_NOWAIT|M_ZERO); if (tptr == NULL) { return (CAM_RESRC_UNAVAIL); } tptr->ts_lun = lun; SLIST_INIT(&tptr->atios); SLIST_INIT(&tptr->inots); STAILQ_INIT(&tptr->restart_queue); ISP_GET_PC_ADDR(isp, bus, lun_hash[LUN_HASH_FUNC(lun)], lhp); SLIST_INSERT_HEAD(lhp, tptr, next); *rslt = tptr; ISP_PATH_PRT(isp, ISP_LOGTDEBUG0, path, "created tstate\n"); return (CAM_REQ_CMP); } static void destroy_lun_state(ispsoftc_t *isp, int bus, tstate_t *tptr) { union ccb *ccb; struct tslist *lhp; inot_private_data_t *ntp; while ((ccb = (union ccb *)SLIST_FIRST(&tptr->atios)) != NULL) { SLIST_REMOVE_HEAD(&tptr->atios, sim_links.sle); ccb->ccb_h.status = CAM_REQ_ABORTED; xpt_done(ccb); }; while ((ccb = (union ccb *)SLIST_FIRST(&tptr->inots)) != NULL) { SLIST_REMOVE_HEAD(&tptr->inots, sim_links.sle); ccb->ccb_h.status = CAM_REQ_ABORTED; xpt_done(ccb); } while ((ntp = STAILQ_FIRST(&tptr->restart_queue)) != NULL) { isp_endcmd(isp, ntp->data, NIL_HANDLE, bus, SCSI_STATUS_BUSY, 0); STAILQ_REMOVE_HEAD(&tptr->restart_queue, next); isp_put_ntpd(isp, bus, ntp); } ISP_GET_PC_ADDR(isp, bus, lun_hash[LUN_HASH_FUNC(tptr->ts_lun)], lhp); SLIST_REMOVE(lhp, tptr, tstate, next); free(tptr, M_DEVBUF); } static void isp_enable_lun(ispsoftc_t *isp, union ccb *ccb) { tstate_t *tptr; int bus; target_id_t target; lun_id_t lun; if (!IS_FC(isp) || !ISP_CAP_TMODE(isp) || !ISP_CAP_SCCFW(isp)) { xpt_print(ccb->ccb_h.path, "Target mode is not supported\n"); ccb->ccb_h.status = CAM_FUNC_NOTAVAIL; xpt_done(ccb); return; } /* * We only support either target and lun both wildcard * or target and lun both non-wildcard. */ bus = XS_CHANNEL(ccb); target = ccb->ccb_h.target_id; lun = ccb->ccb_h.target_lun; ISP_PATH_PRT(isp, ISP_LOGTDEBUG0|ISP_LOGCONFIG, ccb->ccb_h.path, "enabling lun %jx\n", (uintmax_t)lun); if ((target == CAM_TARGET_WILDCARD) != (lun == CAM_LUN_WILDCARD)) { ccb->ccb_h.status = CAM_LUN_INVALID; xpt_done(ccb); return; } /* Create the state pointer. It should not already exist. */ tptr = get_lun_statep(isp, bus, lun); if (tptr) { ccb->ccb_h.status = CAM_LUN_ALRDY_ENA; xpt_done(ccb); return; } ccb->ccb_h.status = create_lun_state(isp, bus, ccb->ccb_h.path, &tptr); if (ccb->ccb_h.status != CAM_REQ_CMP) { xpt_done(ccb); return; } ccb->ccb_h.status = CAM_REQ_CMP; xpt_done(ccb); } static void isp_disable_lun(ispsoftc_t *isp, union ccb *ccb) { tstate_t *tptr = NULL; int bus; target_id_t target; lun_id_t lun; bus = XS_CHANNEL(ccb); target = ccb->ccb_h.target_id; lun = ccb->ccb_h.target_lun; ISP_PATH_PRT(isp, ISP_LOGTDEBUG0|ISP_LOGCONFIG, ccb->ccb_h.path, "disabling lun %jx\n", (uintmax_t)lun); if ((target == CAM_TARGET_WILDCARD) != (lun == CAM_LUN_WILDCARD)) { ccb->ccb_h.status = CAM_LUN_INVALID; xpt_done(ccb); return; } /* Find the state pointer. */ if ((tptr = get_lun_statep(isp, bus, lun)) == NULL) { ccb->ccb_h.status = CAM_PATH_INVALID; xpt_done(ccb); return; } destroy_lun_state(isp, bus, tptr); ccb->ccb_h.status = CAM_REQ_CMP; xpt_done(ccb); } static void isp_target_start_ctio(ispsoftc_t *isp, union ccb *ccb, enum Start_Ctio_How how) { int fctape, sendstatus, resid; fcparam *fcp; atio_private_data_t *atp; struct ccb_scsiio *cso; struct isp_ccbq *waitq; uint32_t dmaresult, handle, xfrlen, sense_length, tmp; uint8_t local[QENTRY_LEN]; isp_prt(isp, ISP_LOGTDEBUG0, "%s: ENTRY[0x%x] how %u xfrlen %u sendstatus %d sense_len %u", __func__, ccb->csio.tag_id, how, ccb->csio.dxfer_len, (ccb->ccb_h.flags & CAM_SEND_STATUS) != 0, ((ccb->ccb_h.flags & CAM_SEND_SENSE)? ccb->csio.sense_len : 0)); ISP_GET_PC_ADDR(isp, XS_CHANNEL(ccb), waitq, waitq); switch (how) { case FROM_CAM: /* * Insert at the tail of the list, if any, waiting CTIO CCBs */ TAILQ_INSERT_TAIL(waitq, &ccb->ccb_h, sim_links.tqe); break; case FROM_TIMER: case FROM_SRR: case FROM_CTIO_DONE: TAILQ_INSERT_HEAD(waitq, &ccb->ccb_h, sim_links.tqe); break; } while ((ccb = (union ccb *) TAILQ_FIRST(waitq)) != NULL) { TAILQ_REMOVE(waitq, &ccb->ccb_h, sim_links.tqe); cso = &ccb->csio; xfrlen = cso->dxfer_len; if (xfrlen == 0) { if ((ccb->ccb_h.flags & CAM_SEND_STATUS) == 0) { ISP_PATH_PRT(isp, ISP_LOGERR, ccb->ccb_h.path, "a data transfer length of zero but no status to send is wrong\n"); ccb->ccb_h.status = CAM_REQ_INVALID; xpt_done(ccb); continue; } } atp = isp_find_atpd(isp, XS_CHANNEL(ccb), cso->tag_id); if (atp == NULL) { isp_prt(isp, ISP_LOGERR, "%s: [0x%x] cannot find private data adjunct in %s", __func__, cso->tag_id, __func__); isp_dump_atpd(isp, XS_CHANNEL(ccb)); ccb->ccb_h.status = CAM_REQ_CMP_ERR; xpt_done(ccb); continue; } /* * Is this command a dead duck? */ if (atp->dead) { isp_prt(isp, ISP_LOGERR, "%s: [0x%x] not sending a CTIO for a dead command", __func__, cso->tag_id); ccb->ccb_h.status = CAM_REQ_ABORTED; xpt_done(ccb); continue; } /* * Check to make sure we're still in target mode. */ fcp = FCPARAM(isp, XS_CHANNEL(ccb)); if ((fcp->role & ISP_ROLE_TARGET) == 0) { isp_prt(isp, ISP_LOGERR, "%s: [0x%x] stopping sending a CTIO because we're no longer in target mode", __func__, cso->tag_id); ccb->ccb_h.status = CAM_PROVIDE_FAIL; xpt_done(ccb); continue; } /* * We're only handling ATPD_CCB_OUTSTANDING outstanding CCB at a time (one of which * could be split into two CTIOs to split data and status). */ if (atp->ctcnt >= ATPD_CCB_OUTSTANDING) { isp_prt(isp, ISP_LOGTINFO, "[0x%x] handling only %d CCBs at a time (flags for this ccb: 0x%x)", cso->tag_id, ATPD_CCB_OUTSTANDING, ccb->ccb_h.flags); TAILQ_INSERT_HEAD(waitq, &ccb->ccb_h, sim_links.tqe); break; } /* * Does the initiator expect FC-Tape style responses? */ if ((atp->word3 & PRLI_WD3_RETRY) && fcp->fctape_enabled) { fctape = 1; } else { fctape = 0; } /* * If we already did the data xfer portion of a CTIO that sends data * and status, don't do it again and do the status portion now. */ if (atp->sendst) { isp_prt(isp, ISP_LOGTDEBUG0, "[0x%x] now sending synthesized status orig_dl=%u xfered=%u bit=%u", cso->tag_id, atp->orig_datalen, atp->bytes_xfered, atp->bytes_in_transit); xfrlen = 0; /* we already did the data transfer */ atp->sendst = 0; } if (ccb->ccb_h.flags & CAM_SEND_STATUS) { sendstatus = 1; } else { sendstatus = 0; } if (ccb->ccb_h.flags & CAM_SEND_SENSE) { KASSERT((sendstatus != 0), ("how can you have CAM_SEND_SENSE w/o CAM_SEND_STATUS?")); /* * Sense length is not the entire sense data structure size. Periph * drivers don't seem to be setting sense_len to reflect the actual * size. We'll peek inside to get the right amount. */ sense_length = cso->sense_len; /* * This 'cannot' happen */ if (sense_length > (XCMD_SIZE - MIN_FCP_RESPONSE_SIZE)) { sense_length = XCMD_SIZE - MIN_FCP_RESPONSE_SIZE; } } else { sense_length = 0; } memset(local, 0, QENTRY_LEN); /* * Check for overflow */ tmp = atp->bytes_xfered + atp->bytes_in_transit; if (xfrlen > 0 && tmp > atp->orig_datalen) { isp_prt(isp, ISP_LOGERR, "%s: [0x%x] data overflow by %u bytes", __func__, cso->tag_id, tmp + xfrlen - atp->orig_datalen); ccb->ccb_h.status = CAM_DATA_RUN_ERR; xpt_done(ccb); continue; } if (xfrlen > atp->orig_datalen - tmp) { xfrlen = atp->orig_datalen - tmp; if (xfrlen == 0 && !sendstatus) { cso->resid = cso->dxfer_len; ccb->ccb_h.status = CAM_REQ_CMP; xpt_done(ccb); continue; } } if (IS_24XX(isp)) { ct7_entry_t *cto = (ct7_entry_t *) local; cto->ct_header.rqs_entry_type = RQSTYPE_CTIO7; cto->ct_header.rqs_entry_count = 1; cto->ct_header.rqs_seqno |= ATPD_SEQ_NOTIFY_CAM; ATPD_SET_SEQNO(cto, atp); cto->ct_nphdl = atp->nphdl; cto->ct_rxid = atp->tag; cto->ct_iid_lo = atp->sid; cto->ct_iid_hi = atp->sid >> 16; cto->ct_oxid = atp->oxid; cto->ct_vpidx = ISP_GET_VPIDX(isp, XS_CHANNEL(ccb)); cto->ct_timeout = XS_TIME(ccb); cto->ct_flags = atp->tattr << CT7_TASK_ATTR_SHIFT; /* * Mode 1, status, no data. Only possible when we are sending status, have * no data to transfer, and any sense data can fit into a ct7_entry_t. * * Mode 2, status, no data. We have to use this in the case that * the sense data won't fit into a ct7_entry_t. * */ if (sendstatus && xfrlen == 0) { cto->ct_flags |= CT7_SENDSTATUS | CT7_NO_DATA; resid = atp->orig_datalen - atp->bytes_xfered - atp->bytes_in_transit; if (sense_length <= MAXRESPLEN_24XX) { cto->ct_flags |= CT7_FLAG_MODE1; cto->ct_scsi_status = cso->scsi_status; if (resid < 0) { cto->ct_resid = -resid; cto->ct_scsi_status |= (FCP_RESID_OVERFLOW << 8); } else if (resid > 0) { cto->ct_resid = resid; cto->ct_scsi_status |= (FCP_RESID_UNDERFLOW << 8); } if (fctape) { cto->ct_flags |= CT7_CONFIRM|CT7_EXPLCT_CONF; } if (sense_length) { cto->ct_scsi_status |= (FCP_SNSLEN_VALID << 8); cto->rsp.m1.ct_resplen = cto->ct_senselen = sense_length; memcpy(cto->rsp.m1.ct_resp, &cso->sense_data, sense_length); } } else { bus_addr_t addr; char buf[XCMD_SIZE]; fcp_rsp_iu_t *rp; if (atp->ests == NULL) { atp->ests = isp_get_ecmd(isp); if (atp->ests == NULL) { TAILQ_INSERT_HEAD(waitq, &ccb->ccb_h, sim_links.tqe); break; } } memset(buf, 0, sizeof (buf)); rp = (fcp_rsp_iu_t *)buf; if (fctape) { cto->ct_flags |= CT7_CONFIRM|CT7_EXPLCT_CONF; rp->fcp_rsp_bits |= FCP_CONF_REQ; } cto->ct_flags |= CT7_FLAG_MODE2; rp->fcp_rsp_scsi_status = cso->scsi_status; if (resid < 0) { rp->fcp_rsp_resid = -resid; rp->fcp_rsp_bits |= FCP_RESID_OVERFLOW; } else if (resid > 0) { rp->fcp_rsp_resid = resid; rp->fcp_rsp_bits |= FCP_RESID_UNDERFLOW; } if (sense_length) { rp->fcp_rsp_snslen = sense_length; cto->ct_senselen = sense_length; rp->fcp_rsp_bits |= FCP_SNSLEN_VALID; isp_put_fcp_rsp_iu(isp, rp, atp->ests); memcpy(((fcp_rsp_iu_t *)atp->ests)->fcp_rsp_extra, &cso->sense_data, sense_length); } else { isp_put_fcp_rsp_iu(isp, rp, atp->ests); } if (isp->isp_dblev & ISP_LOGTDEBUG1) { isp_print_bytes(isp, "FCP Response Frame After Swizzling", MIN_FCP_RESPONSE_SIZE + sense_length, atp->ests); } addr = isp->isp_osinfo.ecmd_dma; addr += ((((isp_ecmd_t *)atp->ests) - isp->isp_osinfo.ecmd_base) * XCMD_SIZE); isp_prt(isp, ISP_LOGTDEBUG0, "%s: ests base %p vaddr %p ecmd_dma %jx addr %jx len %u", __func__, isp->isp_osinfo.ecmd_base, atp->ests, (uintmax_t) isp->isp_osinfo.ecmd_dma, (uintmax_t)addr, MIN_FCP_RESPONSE_SIZE + sense_length); cto->rsp.m2.ct_datalen = MIN_FCP_RESPONSE_SIZE + sense_length; cto->rsp.m2.ct_fcp_rsp_iudata.ds_base = DMA_LO32(addr); cto->rsp.m2.ct_fcp_rsp_iudata.ds_basehi = DMA_HI32(addr); cto->rsp.m2.ct_fcp_rsp_iudata.ds_count = MIN_FCP_RESPONSE_SIZE + sense_length; } if (sense_length) { isp_prt(isp, ISP_LOGTDEBUG0, "%s: CTIO7[0x%x] seq %u nc %d CDB0=%x sstatus=0x%x flags=0x%x resid=%d slen %u sense: %x %x/%x/%x", __func__, cto->ct_rxid, ATPD_GET_SEQNO(cto), ATPD_GET_NCAM(cto), atp->cdb0, cto->ct_scsi_status, cto->ct_flags, cto->ct_resid, sense_length, cso->sense_data.error_code, cso->sense_data.sense_buf[1], cso->sense_data.sense_buf[11], cso->sense_data.sense_buf[12]); } else { isp_prt(isp, ISP_LOGDEBUG0, "%s: CTIO7[0x%x] seq %u nc %d CDB0=%x sstatus=0x%x flags=0x%x resid=%d", __func__, cto->ct_rxid, ATPD_GET_SEQNO(cto), ATPD_GET_NCAM(cto), atp->cdb0, cto->ct_scsi_status, cto->ct_flags, cto->ct_resid); } atp->state = ATPD_STATE_LAST_CTIO; } /* * Mode 0 data transfers, *possibly* with status. */ if (xfrlen != 0) { cto->ct_flags |= CT7_FLAG_MODE0; if ((cso->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) { cto->ct_flags |= CT7_DATA_IN; } else { cto->ct_flags |= CT7_DATA_OUT; } cto->rsp.m0.reloff = atp->bytes_xfered + atp->bytes_in_transit; cto->rsp.m0.ct_xfrlen = xfrlen; #ifdef DEBUG if (ISP_FC_PC(isp, XS_CHANNEL(ccb))->inject_lost_data_frame && xfrlen > ISP_FC_PC(isp, XS_CHANNEL(ccb))->inject_lost_data_frame) { isp_prt(isp, ISP_LOGWARN, "%s: truncating data frame with xfrlen %d to %d", __func__, xfrlen, xfrlen - (xfrlen >> 2)); ISP_FC_PC(isp, XS_CHANNEL(ccb))->inject_lost_data_frame = 0; cto->rsp.m0.ct_xfrlen -= xfrlen >> 2; } #endif if (sendstatus) { resid = atp->orig_datalen - atp->bytes_xfered - xfrlen; if (cso->scsi_status == SCSI_STATUS_OK && resid == 0 /* && fctape == 0 */) { cto->ct_flags |= CT7_SENDSTATUS; atp->state = ATPD_STATE_LAST_CTIO; if (fctape) { cto->ct_flags |= CT7_CONFIRM|CT7_EXPLCT_CONF; } } else { atp->sendst = 1; /* send status later */ cto->ct_header.rqs_seqno &= ~ATPD_SEQ_NOTIFY_CAM; atp->state = ATPD_STATE_CTIO; } } else { atp->state = ATPD_STATE_CTIO; } isp_prt(isp, ISP_LOGTDEBUG0, "%s: CTIO7[0x%x] seq %u nc %d CDB0=%x sstatus=0x%x flags=0x%x xfrlen=%u off=%u", __func__, cto->ct_rxid, ATPD_GET_SEQNO(cto), ATPD_GET_NCAM(cto), atp->cdb0, cto->ct_scsi_status, cto->ct_flags, xfrlen, atp->bytes_xfered); } } else { ct2_entry_t *cto = (ct2_entry_t *) local; cto->ct_header.rqs_entry_type = RQSTYPE_CTIO2; cto->ct_header.rqs_entry_count = 1; cto->ct_header.rqs_seqno |= ATPD_SEQ_NOTIFY_CAM; ATPD_SET_SEQNO(cto, atp); if (ISP_CAP_2KLOGIN(isp)) { ((ct2e_entry_t *)cto)->ct_iid = atp->nphdl; } else { cto->ct_iid = atp->nphdl; if (ISP_CAP_SCCFW(isp) == 0) { cto->ct_lun = ccb->ccb_h.target_lun; } } cto->ct_timeout = XS_TIME(ccb); cto->ct_rxid = cso->tag_id; /* * Mode 1, status, no data. Only possible when we are sending status, have * no data to transfer, and the sense length can fit in the ct7_entry. * * Mode 2, status, no data. We have to use this in the case the response * length won't fit into a ct2_entry_t. * * We'll fill out this structure with information as if this were a * Mode 1. The hardware layer will create the Mode 2 FCP RSP IU as * needed based upon this. */ if (sendstatus && xfrlen == 0) { cto->ct_flags |= CT2_SENDSTATUS | CT2_NO_DATA; resid = atp->orig_datalen - atp->bytes_xfered - atp->bytes_in_transit; if (sense_length <= MAXRESPLEN) { if (resid < 0) { cto->ct_resid = -resid; } else if (resid > 0) { cto->ct_resid = resid; } cto->ct_flags |= CT2_FLAG_MODE1; cto->rsp.m1.ct_scsi_status = cso->scsi_status; if (resid < 0) { cto->rsp.m1.ct_scsi_status |= CT2_DATA_OVER; } else if (resid > 0) { cto->rsp.m1.ct_scsi_status |= CT2_DATA_UNDER; } if (fctape) { cto->ct_flags |= CT2_CONFIRM; } if (sense_length) { cto->rsp.m1.ct_scsi_status |= CT2_SNSLEN_VALID; cto->rsp.m1.ct_resplen = cto->rsp.m1.ct_senselen = sense_length; memcpy(cto->rsp.m1.ct_resp, &cso->sense_data, sense_length); } } else { bus_addr_t addr; char buf[XCMD_SIZE]; fcp_rsp_iu_t *rp; if (atp->ests == NULL) { atp->ests = isp_get_ecmd(isp); if (atp->ests == NULL) { TAILQ_INSERT_HEAD(waitq, &ccb->ccb_h, sim_links.tqe); break; } } memset(buf, 0, sizeof (buf)); rp = (fcp_rsp_iu_t *)buf; if (fctape) { cto->ct_flags |= CT2_CONFIRM; rp->fcp_rsp_bits |= FCP_CONF_REQ; } cto->ct_flags |= CT2_FLAG_MODE2; rp->fcp_rsp_scsi_status = cso->scsi_status; if (resid < 0) { rp->fcp_rsp_resid = -resid; rp->fcp_rsp_bits |= FCP_RESID_OVERFLOW; } else if (resid > 0) { rp->fcp_rsp_resid = resid; rp->fcp_rsp_bits |= FCP_RESID_UNDERFLOW; } if (sense_length) { rp->fcp_rsp_snslen = sense_length; rp->fcp_rsp_bits |= FCP_SNSLEN_VALID; isp_put_fcp_rsp_iu(isp, rp, atp->ests); memcpy(((fcp_rsp_iu_t *)atp->ests)->fcp_rsp_extra, &cso->sense_data, sense_length); } else { isp_put_fcp_rsp_iu(isp, rp, atp->ests); } if (isp->isp_dblev & ISP_LOGTDEBUG1) { isp_print_bytes(isp, "FCP Response Frame After Swizzling", MIN_FCP_RESPONSE_SIZE + sense_length, atp->ests); } addr = isp->isp_osinfo.ecmd_dma; addr += ((((isp_ecmd_t *)atp->ests) - isp->isp_osinfo.ecmd_base) * XCMD_SIZE); isp_prt(isp, ISP_LOGTDEBUG0, "%s: ests base %p vaddr %p ecmd_dma %jx addr %jx len %u", __func__, isp->isp_osinfo.ecmd_base, atp->ests, (uintmax_t) isp->isp_osinfo.ecmd_dma, (uintmax_t)addr, MIN_FCP_RESPONSE_SIZE + sense_length); cto->rsp.m2.ct_datalen = MIN_FCP_RESPONSE_SIZE + sense_length; cto->rsp.m2.u.ct_fcp_rsp_iudata_32.ds_base = DMA_LO32(addr); cto->rsp.m2.u.ct_fcp_rsp_iudata_32.ds_count = MIN_FCP_RESPONSE_SIZE + sense_length; } if (sense_length) { isp_prt(isp, ISP_LOGTDEBUG0, "%s: CTIO2[0x%x] seq %u nc %d CDB0=%x sstatus=0x%x flags=0x%x resid=%d sense: %x %x/%x/%x", __func__, cto->ct_rxid, ATPD_GET_SEQNO(cto), ATPD_GET_NCAM(cto), atp->cdb0, cso->scsi_status, cto->ct_flags, cto->ct_resid, cso->sense_data.error_code, cso->sense_data.sense_buf[1], cso->sense_data.sense_buf[11], cso->sense_data.sense_buf[12]); } else { isp_prt(isp, ISP_LOGTDEBUG0, "%s: CTIO2[0x%x] seq %u nc %d CDB0=%x sstatus=0x%x flags=0x%x resid=%d", __func__, cto->ct_rxid, ATPD_GET_SEQNO(cto), ATPD_GET_NCAM(cto), atp->cdb0, cso->scsi_status, cto->ct_flags, cto->ct_resid); } atp->state = ATPD_STATE_LAST_CTIO; } if (xfrlen != 0) { cto->ct_flags |= CT2_FLAG_MODE0; if ((cso->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) { cto->ct_flags |= CT2_DATA_IN; } else { cto->ct_flags |= CT2_DATA_OUT; } cto->ct_reloff = atp->bytes_xfered + atp->bytes_in_transit; cto->rsp.m0.ct_xfrlen = xfrlen; if (sendstatus) { resid = atp->orig_datalen - atp->bytes_xfered - xfrlen; if (cso->scsi_status == SCSI_STATUS_OK && resid == 0 /*&& fctape == 0*/) { cto->ct_flags |= CT2_SENDSTATUS; atp->state = ATPD_STATE_LAST_CTIO; if (fctape) { cto->ct_flags |= CT2_CONFIRM; } } else { atp->sendst = 1; /* send status later */ cto->ct_header.rqs_seqno &= ~ATPD_SEQ_NOTIFY_CAM; atp->state = ATPD_STATE_CTIO; } } else { atp->state = ATPD_STATE_CTIO; } } isp_prt(isp, ISP_LOGTDEBUG0, "%s: CTIO2[%x] seq %u nc %d CDB0=%x scsi status %x flags %x resid %d xfrlen %u offset %u", __func__, cto->ct_rxid, ATPD_GET_SEQNO(cto), ATPD_GET_NCAM(cto), atp->cdb0, cso->scsi_status, cto->ct_flags, cto->ct_resid, cso->dxfer_len, atp->bytes_xfered); } if (isp_get_pcmd(isp, ccb)) { ISP_PATH_PRT(isp, ISP_LOGWARN, ccb->ccb_h.path, "out of PCMDs\n"); TAILQ_INSERT_HEAD(waitq, &ccb->ccb_h, sim_links.tqe); break; } handle = isp_allocate_handle(isp, ccb, ISP_HANDLE_TARGET); if (handle == 0) { ISP_PATH_PRT(isp, ISP_LOGWARN, ccb->ccb_h.path, "No XFLIST pointers for %s\n", __func__); TAILQ_INSERT_HEAD(waitq, &ccb->ccb_h, sim_links.tqe); isp_free_pcmd(isp, ccb); break; } atp->bytes_in_transit += xfrlen; PISP_PCMD(ccb)->datalen = xfrlen; /* * Call the dma setup routines for this entry (and any subsequent * CTIOs) if there's data to move, and then tell the f/w it's got * new things to play with. As with isp_start's usage of DMA setup, * any swizzling is done in the machine dependent layer. Because * of this, we put the request onto the queue area first in native * format. */ if (IS_24XX(isp)) { ct7_entry_t *cto = (ct7_entry_t *) local; cto->ct_syshandle = handle; } else { ct2_entry_t *cto = (ct2_entry_t *) local; cto->ct_syshandle = handle; } dmaresult = ISP_DMASETUP(isp, cso, (ispreq_t *) local); if (dmaresult != CMD_QUEUED) { isp_destroy_handle(isp, handle); isp_free_pcmd(isp, ccb); if (dmaresult == CMD_EAGAIN) { TAILQ_INSERT_HEAD(waitq, &ccb->ccb_h, sim_links.tqe); break; } ccb->ccb_h.status = CAM_REQ_CMP_ERR; xpt_done(ccb); continue; } ccb->ccb_h.status = CAM_REQ_INPROG | CAM_SIM_QUEUED; if (xfrlen) { ccb->ccb_h.spriv_field0 = atp->bytes_xfered; } else { ccb->ccb_h.spriv_field0 = ~0; } atp->ctcnt++; atp->seqno++; } } static void isp_refire_putback_atio(void *arg) { union ccb *ccb = arg; ISP_ASSERT_LOCKED((ispsoftc_t *)XS_ISP(ccb)); isp_target_putback_atio(ccb); } static void isp_refire_notify_ack(void *arg) { isp_tna_t *tp = arg; ispsoftc_t *isp = tp->isp; ISP_ASSERT_LOCKED(isp); if (isp_notify_ack(isp, tp->not)) { callout_schedule(&tp->timer, 5); } else { free(tp, M_DEVBUF); } } static void isp_target_putback_atio(union ccb *ccb) { ispsoftc_t *isp = XS_ISP(ccb); struct ccb_scsiio *cso = &ccb->csio; at2_entry_t local, *at = &local; ISP_MEMZERO(at, sizeof (at2_entry_t)); at->at_header.rqs_entry_type = RQSTYPE_ATIO2; at->at_header.rqs_entry_count = 1; if (ISP_CAP_SCCFW(isp)) { at->at_scclun = (uint16_t) ccb->ccb_h.target_lun; } else { at->at_lun = (uint8_t) ccb->ccb_h.target_lun; } at->at_status = CT_OK; at->at_rxid = cso->tag_id; at->at_iid = cso->init_id; if (isp_target_put_entry(isp, at)) { callout_reset(&PISP_PCMD(ccb)->wdog, 10, isp_refire_putback_atio, ccb); } else isp_complete_ctio(ccb); } static void isp_complete_ctio(union ccb *ccb) { if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_INPROG) { ccb->ccb_h.status &= ~CAM_SIM_QUEUED; xpt_done(ccb); } } static void isp_handle_platform_atio2(ispsoftc_t *isp, at2_entry_t *aep) { fcparam *fcp; lun_id_t lun; fcportdb_t *lp; tstate_t *tptr; struct ccb_accept_tio *atiop; uint16_t nphdl; atio_private_data_t *atp; inot_private_data_t *ntp; /* * The firmware status (except for the QLTM_SVALID bit) * indicates why this ATIO was sent to us. * * If QLTM_SVALID is set, the firmware has recommended Sense Data. */ if ((aep->at_status & ~QLTM_SVALID) != AT_CDB) { isp_prt(isp, ISP_LOGWARN, "bogus atio (0x%x) leaked to platform", aep->at_status); isp_endcmd(isp, aep, NIL_HANDLE, 0, SCSI_STATUS_BUSY, 0); return; } fcp = FCPARAM(isp, 0); if (ISP_CAP_SCCFW(isp)) { lun = aep->at_scclun; } else { lun = aep->at_lun; } if (ISP_CAP_2KLOGIN(isp)) { nphdl = ((at2e_entry_t *)aep)->at_iid; } else { nphdl = aep->at_iid; } tptr = get_lun_statep(isp, 0, lun); if (tptr == NULL) { tptr = get_lun_statep(isp, 0, CAM_LUN_WILDCARD); if (tptr == NULL) { isp_prt(isp, ISP_LOGWARN, "%s: [0x%x] no state pointer for lun %jx or wildcard", __func__, aep->at_rxid, (uintmax_t)lun); if (lun == 0) { isp_endcmd(isp, aep, nphdl, 0, SCSI_STATUS_BUSY, 0); } else { isp_endcmd(isp, aep, nphdl, 0, SCSI_STATUS_CHECK_COND | ECMD_SVALID | (0x5 << 12) | (0x25 << 16), 0); } return; } } /* * Start any commands pending resources first. */ if (isp_atio_restart(isp, 0, tptr)) goto noresrc; atiop = (struct ccb_accept_tio *) SLIST_FIRST(&tptr->atios); if (atiop == NULL) { goto noresrc; } atp = isp_get_atpd(isp, 0, aep->at_rxid); if (atp == NULL) { goto noresrc; } atp->state = ATPD_STATE_ATIO; SLIST_REMOVE_HEAD(&tptr->atios, sim_links.sle); ISP_PATH_PRT(isp, ISP_LOGTDEBUG2, atiop->ccb_h.path, "Take FREE ATIO\n"); atiop->ccb_h.target_id = ISP_MAX_TARGETS(isp); atiop->ccb_h.target_lun = lun; /* * We don't get 'suggested' sense data as we do with SCSI cards. */ atiop->sense_len = 0; /* * If we're not in the port database, add ourselves. */ if (IS_2100(isp)) atiop->init_id = nphdl; else { if (isp_find_pdb_by_handle(isp, 0, nphdl, &lp)) { atiop->init_id = FC_PORTDB_TGT(isp, 0, lp); } else { isp_prt(isp, ISP_LOGTINFO, "%s: port %x isn't in PDB", __func__, nphdl); isp_dump_portdb(isp, 0); isp_endcmd(isp, aep, NIL_HANDLE, 0, ECMD_TERMINATE, 0); return; } } atiop->cdb_len = ATIO2_CDBLEN; ISP_MEMCPY(atiop->cdb_io.cdb_bytes, aep->at_cdb, ATIO2_CDBLEN); atiop->ccb_h.status = CAM_CDB_RECVD; atiop->tag_id = atp->tag; switch (aep->at_taskflags & ATIO2_TC_ATTR_MASK) { case ATIO2_TC_ATTR_SIMPLEQ: atiop->ccb_h.flags |= CAM_TAG_ACTION_VALID; atiop->tag_action = MSG_SIMPLE_Q_TAG; break; case ATIO2_TC_ATTR_HEADOFQ: atiop->ccb_h.flags |= CAM_TAG_ACTION_VALID; atiop->tag_action = MSG_HEAD_OF_Q_TAG; break; case ATIO2_TC_ATTR_ORDERED: atiop->ccb_h.flags |= CAM_TAG_ACTION_VALID; atiop->tag_action = MSG_ORDERED_Q_TAG; break; case ATIO2_TC_ATTR_ACAQ: /* ?? */ case ATIO2_TC_ATTR_UNTAGGED: default: atiop->tag_action = 0; break; } atp->orig_datalen = aep->at_datalen; atp->bytes_xfered = 0; atp->lun = lun; atp->nphdl = nphdl; atp->sid = PORT_ANY; atp->oxid = aep->at_oxid; atp->cdb0 = aep->at_cdb[0]; atp->tattr = aep->at_taskflags & ATIO2_TC_ATTR_MASK; atp->state = ATPD_STATE_CAM; xpt_done((union ccb *)atiop); isp_prt(isp, ISP_LOGTDEBUG0, "ATIO2[0x%x] CDB=0x%x lun %jx datalen %u", aep->at_rxid, atp->cdb0, (uintmax_t)lun, atp->orig_datalen); return; noresrc: ntp = isp_get_ntpd(isp, 0); if (ntp == NULL) { isp_endcmd(isp, aep, nphdl, 0, SCSI_STATUS_BUSY, 0); return; } memcpy(ntp->data, aep, QENTRY_LEN); STAILQ_INSERT_TAIL(&tptr->restart_queue, ntp, next); } static void isp_handle_platform_atio7(ispsoftc_t *isp, at7_entry_t *aep) { int cdbxlen; lun_id_t lun; uint16_t chan, nphdl = NIL_HANDLE; uint32_t did, sid; fcportdb_t *lp; tstate_t *tptr; struct ccb_accept_tio *atiop; atio_private_data_t *atp = NULL; atio_private_data_t *oatp; inot_private_data_t *ntp; did = (aep->at_hdr.d_id[0] << 16) | (aep->at_hdr.d_id[1] << 8) | aep->at_hdr.d_id[2]; sid = (aep->at_hdr.s_id[0] << 16) | (aep->at_hdr.s_id[1] << 8) | aep->at_hdr.s_id[2]; lun = CAM_EXTLUN_BYTE_SWIZZLE(be64dec(aep->at_cmnd.fcp_cmnd_lun)); if (ISP_CAP_MULTI_ID(isp) && isp->isp_nchan > 1) { /* Channel has to be derived from D_ID */ isp_find_chan_by_did(isp, did, &chan); if (chan == ISP_NOCHAN) { isp_prt(isp, ISP_LOGWARN, "%s: [RX_ID 0x%x] D_ID %x not found on any channel", __func__, aep->at_rxid, did); isp_endcmd(isp, aep, NIL_HANDLE, ISP_NOCHAN, ECMD_TERMINATE, 0); return; } } else { chan = 0; } /* * Find the PDB entry for this initiator */ if (isp_find_pdb_by_portid(isp, chan, sid, &lp) == 0) { /* * If we're not in the port database terminate the exchange. */ isp_prt(isp, ISP_LOGTINFO, "%s: [RX_ID 0x%x] D_ID 0x%06x found on Chan %d for S_ID 0x%06x wasn't in PDB already", __func__, aep->at_rxid, did, chan, sid); isp_dump_portdb(isp, chan); isp_endcmd(isp, aep, NIL_HANDLE, chan, ECMD_TERMINATE, 0); return; } nphdl = lp->handle; /* * Get the tstate pointer */ tptr = get_lun_statep(isp, chan, lun); if (tptr == NULL) { tptr = get_lun_statep(isp, chan, CAM_LUN_WILDCARD); if (tptr == NULL) { isp_prt(isp, ISP_LOGWARN, "%s: [0x%x] no state pointer for lun %jx or wildcard", __func__, aep->at_rxid, (uintmax_t)lun); if (lun == 0) { isp_endcmd(isp, aep, nphdl, chan, SCSI_STATUS_BUSY, 0); } else { isp_endcmd(isp, aep, nphdl, chan, SCSI_STATUS_CHECK_COND | ECMD_SVALID | (0x5 << 12) | (0x25 << 16), 0); } return; } } /* * Start any commands pending resources first. */ if (isp_atio_restart(isp, chan, tptr)) goto noresrc; /* * If the f/w is out of resources, just send a BUSY status back. */ if (aep->at_rxid == AT7_NORESRC_RXID) { isp_endcmd(isp, aep, nphdl, chan, SCSI_BUSY, 0); return; } /* * If we're out of resources, just send a BUSY status back. */ atiop = (struct ccb_accept_tio *) SLIST_FIRST(&tptr->atios); if (atiop == NULL) { isp_prt(isp, ISP_LOGTDEBUG0, "[0x%x] out of atios", aep->at_rxid); goto noresrc; } oatp = isp_find_atpd(isp, chan, aep->at_rxid); if (oatp) { isp_prt(isp, ISP_LOGTDEBUG0, "[0x%x] tag wraparound in isp_handle_platforms_atio7 (N-Port Handle 0x%04x S_ID 0x%04x OX_ID 0x%04x) oatp state %d", aep->at_rxid, nphdl, sid, aep->at_hdr.ox_id, oatp->state); /* * It's not a "no resource" condition- but we can treat it like one */ goto noresrc; } atp = isp_get_atpd(isp, chan, aep->at_rxid); if (atp == NULL) { isp_prt(isp, ISP_LOGTDEBUG0, "[0x%x] out of atps", aep->at_rxid); goto noresrc; } atp->word3 = lp->prli_word3; atp->state = ATPD_STATE_ATIO; SLIST_REMOVE_HEAD(&tptr->atios, sim_links.sle); ISP_PATH_PRT(isp, ISP_LOGTDEBUG2, atiop->ccb_h.path, "Take FREE ATIO\n"); atiop->init_id = FC_PORTDB_TGT(isp, chan, lp); atiop->ccb_h.target_id = ISP_MAX_TARGETS(isp); atiop->ccb_h.target_lun = lun; atiop->sense_len = 0; cdbxlen = aep->at_cmnd.fcp_cmnd_alen_datadir >> FCP_CMND_ADDTL_CDBLEN_SHIFT; if (cdbxlen) { isp_prt(isp, ISP_LOGWARN, "additional CDBLEN ignored"); } cdbxlen = sizeof (aep->at_cmnd.cdb_dl.sf.fcp_cmnd_cdb); ISP_MEMCPY(atiop->cdb_io.cdb_bytes, aep->at_cmnd.cdb_dl.sf.fcp_cmnd_cdb, cdbxlen); atiop->cdb_len = cdbxlen; atiop->ccb_h.status = CAM_CDB_RECVD; atiop->tag_id = atp->tag; switch (aep->at_cmnd.fcp_cmnd_task_attribute & FCP_CMND_TASK_ATTR_MASK) { case FCP_CMND_TASK_ATTR_SIMPLE: atiop->ccb_h.flags |= CAM_TAG_ACTION_VALID; atiop->tag_action = MSG_SIMPLE_Q_TAG; break; case FCP_CMND_TASK_ATTR_HEAD: atiop->ccb_h.flags |= CAM_TAG_ACTION_VALID; atiop->tag_action = MSG_HEAD_OF_Q_TAG; break; case FCP_CMND_TASK_ATTR_ORDERED: atiop->ccb_h.flags |= CAM_TAG_ACTION_VALID; atiop->tag_action = MSG_ORDERED_Q_TAG; break; default: /* FALLTHROUGH */ case FCP_CMND_TASK_ATTR_ACA: case FCP_CMND_TASK_ATTR_UNTAGGED: atiop->tag_action = 0; break; } atp->orig_datalen = aep->at_cmnd.cdb_dl.sf.fcp_cmnd_dl; atp->bytes_xfered = 0; atp->lun = lun; atp->nphdl = nphdl; atp->sid = sid; atp->did = did; atp->oxid = aep->at_hdr.ox_id; atp->rxid = aep->at_hdr.rx_id; atp->cdb0 = atiop->cdb_io.cdb_bytes[0]; atp->tattr = aep->at_cmnd.fcp_cmnd_task_attribute & FCP_CMND_TASK_ATTR_MASK; atp->state = ATPD_STATE_CAM; isp_prt(isp, ISP_LOGTDEBUG0, "ATIO7[0x%x] CDB=0x%x lun %jx datalen %u", aep->at_rxid, atp->cdb0, (uintmax_t)lun, atp->orig_datalen); xpt_done((union ccb *)atiop); return; noresrc: if (atp) isp_put_atpd(isp, chan, atp); ntp = isp_get_ntpd(isp, chan); if (ntp == NULL) { isp_endcmd(isp, aep, nphdl, chan, SCSI_STATUS_BUSY, 0); return; } memcpy(ntp->data, aep, QENTRY_LEN); STAILQ_INSERT_TAIL(&tptr->restart_queue, ntp, next); } /* * Handle starting an SRR (sequence retransmit request) * We get here when we've gotten the immediate notify * and the return of all outstanding CTIOs for this * transaction. */ static void isp_handle_srr_start(ispsoftc_t *isp, atio_private_data_t *atp) { in_fcentry_24xx_t *inot; uint32_t srr_off, ccb_off, ccb_len, ccb_end; union ccb *ccb; inot = (in_fcentry_24xx_t *)atp->srr; srr_off = inot->in_srr_reloff_lo | (inot->in_srr_reloff_hi << 16); ccb = atp->srr_ccb; atp->srr_ccb = NULL; atp->nsrr++; if (ccb == NULL) { isp_prt(isp, ISP_LOGWARN, "SRR[0x%x] null ccb", atp->tag); goto fail; } ccb_off = ccb->ccb_h.spriv_field0; ccb_len = ccb->csio.dxfer_len; ccb_end = (ccb_off == ~0)? ~0 : ccb_off + ccb_len; switch (inot->in_srr_iu) { case R_CTL_INFO_SOLICITED_DATA: /* * We have to restart a FCP_DATA data out transaction */ atp->sendst = 0; atp->bytes_xfered = srr_off; if (ccb_len == 0) { isp_prt(isp, ISP_LOGWARN, "SRR[0x%x] SRR offset 0x%x but current CCB doesn't transfer data", atp->tag, srr_off); goto mdp; } if (srr_off < ccb_off || ccb_off > srr_off + ccb_len) { isp_prt(isp, ISP_LOGWARN, "SRR[0x%x] SRR offset 0x%x not covered by current CCB data range [0x%x..0x%x]", atp->tag, srr_off, ccb_off, ccb_end); goto mdp; } isp_prt(isp, ISP_LOGWARN, "SRR[0x%x] SRR offset 0x%x covered by current CCB data range [0x%x..0x%x]", atp->tag, srr_off, ccb_off, ccb_end); break; case R_CTL_INFO_COMMAND_STATUS: isp_prt(isp, ISP_LOGTINFO, "SRR[0x%x] Got an FCP RSP SRR- resending status", atp->tag); atp->sendst = 1; /* * We have to restart a FCP_RSP IU transaction */ break; case R_CTL_INFO_DATA_DESCRIPTOR: /* * We have to restart an FCP DATA in transaction */ isp_prt(isp, ISP_LOGWARN, "Got an FCP DATA IN SRR- dropping"); goto fail; default: isp_prt(isp, ISP_LOGWARN, "Got an unknown information (%x) SRR- dropping", inot->in_srr_iu); goto fail; } /* * We can't do anything until this is acked, so we might as well start it now. * We aren't going to do the usual asynchronous ack issue because we need * to make sure this gets on the wire first. */ if (isp_notify_ack(isp, inot)) { isp_prt(isp, ISP_LOGWARN, "could not push positive ack for SRR- you lose"); goto fail; } isp_target_start_ctio(isp, ccb, FROM_SRR); return; fail: inot->in_reserved = 1; isp_async(isp, ISPASYNC_TARGET_NOTIFY_ACK, inot); ccb->ccb_h.status &= ~CAM_STATUS_MASK; ccb->ccb_h.status |= CAM_REQ_CMP_ERR; isp_complete_ctio(ccb); return; mdp: if (isp_notify_ack(isp, inot)) { isp_prt(isp, ISP_LOGWARN, "could not push positive ack for SRR- you lose"); goto fail; } ccb->ccb_h.status &= ~CAM_STATUS_MASK; ccb->ccb_h.status = CAM_MESSAGE_RECV; /* * This is not a strict interpretation of MDP, but it's close */ ccb->csio.msg_ptr = &ccb->csio.sense_data.sense_buf[SSD_FULL_SIZE - 16]; ccb->csio.msg_len = 7; ccb->csio.msg_ptr[0] = MSG_EXTENDED; ccb->csio.msg_ptr[1] = 5; ccb->csio.msg_ptr[2] = 0; /* modify data pointer */ ccb->csio.msg_ptr[3] = srr_off >> 24; ccb->csio.msg_ptr[4] = srr_off >> 16; ccb->csio.msg_ptr[5] = srr_off >> 8; ccb->csio.msg_ptr[6] = srr_off; isp_complete_ctio(ccb); } static void isp_handle_platform_srr(ispsoftc_t *isp, isp_notify_t *notify) { in_fcentry_24xx_t *inot = notify->nt_lreserved; atio_private_data_t *atp; uint32_t tag = notify->nt_tagval & 0xffffffff; atp = isp_find_atpd(isp, notify->nt_channel, tag); if (atp == NULL) { isp_prt(isp, ISP_LOGERR, "%s: cannot find adjunct for %x in SRR Notify", __func__, tag); isp_async(isp, ISPASYNC_TARGET_NOTIFY_ACK, inot); return; } atp->srr_notify_rcvd = 1; memcpy(atp->srr, inot, sizeof (atp->srr)); isp_prt(isp, ISP_LOGTINFO, "SRR[0x%x] flags 0x%x srr_iu %x reloff 0x%x", inot->in_rxid, inot->in_flags, inot->in_srr_iu, ((uint32_t)inot->in_srr_reloff_hi << 16) | inot->in_srr_reloff_lo); if (atp->srr_ccb) isp_handle_srr_start(isp, atp); } static void isp_handle_platform_ctio(ispsoftc_t *isp, void *arg) { union ccb *ccb; int sentstatus = 0, ok = 0, notify_cam = 0, failure = 0; atio_private_data_t *atp = NULL; int bus; uint32_t handle, data_requested, resid; handle = ((ct2_entry_t *)arg)->ct_syshandle; ccb = isp_find_xs(isp, handle); if (ccb == NULL) { isp_print_bytes(isp, "null ccb in isp_handle_platform_ctio", QENTRY_LEN, arg); return; } isp_destroy_handle(isp, handle); resid = data_requested = PISP_PCMD(ccb)->datalen; isp_free_pcmd(isp, ccb); bus = XS_CHANNEL(ccb); if (IS_24XX(isp)) { atp = isp_find_atpd(isp, bus, ((ct7_entry_t *)arg)->ct_rxid); } else { atp = isp_find_atpd(isp, bus, ((ct2_entry_t *)arg)->ct_rxid); } if (atp == NULL) { /* * XXX: isp_clear_commands() generates fake CTIO with zero * ct_rxid value, filling only ct_syshandle. Workaround * that using tag_id from the CCB, pointed by ct_syshandle. */ atp = isp_find_atpd(isp, bus, ccb->csio.tag_id); } if (atp == NULL) { isp_prt(isp, ISP_LOGERR, "%s: cannot find adjunct for %x after I/O", __func__, ccb->csio.tag_id); return; } KASSERT((atp->ctcnt > 0), ("ctio count not greater than zero")); atp->bytes_in_transit -= data_requested; atp->ctcnt -= 1; ccb->ccb_h.status &= ~CAM_STATUS_MASK; if (IS_24XX(isp)) { ct7_entry_t *ct = arg; if (ct->ct_nphdl == CT7_SRR) { atp->srr_ccb = ccb; if (atp->srr_notify_rcvd) isp_handle_srr_start(isp, atp); return; } if (ct->ct_nphdl == CT_HBA_RESET) { sentstatus = (ccb->ccb_h.flags & CAM_SEND_STATUS) && (atp->sendst == 0); failure = CAM_UNREC_HBA_ERROR; } else { sentstatus = ct->ct_flags & CT7_SENDSTATUS; ok = (ct->ct_nphdl == CT7_OK); notify_cam = (ct->ct_header.rqs_seqno & ATPD_SEQ_NOTIFY_CAM) != 0; if ((ct->ct_flags & CT7_DATAMASK) != CT7_NO_DATA) resid = ct->ct_resid; } isp_prt(isp, ok? ISP_LOGTDEBUG0 : ISP_LOGWARN, "%s: CTIO7[%x] seq %u nc %d sts 0x%x flg 0x%x sns %d resid %d %s", __func__, ct->ct_rxid, ATPD_GET_SEQNO(ct), notify_cam, ct->ct_nphdl, ct->ct_flags, (ccb->ccb_h.status & CAM_SENT_SENSE) != 0, resid, sentstatus? "FIN" : "MID"); } else { ct2_entry_t *ct = arg; if (ct->ct_status == CT_SRR) { atp->srr_ccb = ccb; if (atp->srr_notify_rcvd) isp_handle_srr_start(isp, atp); isp_target_putback_atio(ccb); return; } if (ct->ct_status == CT_HBA_RESET) { sentstatus = (ccb->ccb_h.flags & CAM_SEND_STATUS) && (atp->sendst == 0); failure = CAM_UNREC_HBA_ERROR; } else { sentstatus = ct->ct_flags & CT2_SENDSTATUS; ok = (ct->ct_status & ~QLTM_SVALID) == CT_OK; notify_cam = (ct->ct_header.rqs_seqno & ATPD_SEQ_NOTIFY_CAM) != 0; if ((ct->ct_flags & CT2_DATAMASK) != CT2_NO_DATA) resid = ct->ct_resid; } isp_prt(isp, ok? ISP_LOGTDEBUG0 : ISP_LOGWARN, "%s: CTIO2[%x] seq %u nc %d sts 0x%x flg 0x%x sns %d resid %d %s", __func__, ct->ct_rxid, ATPD_GET_SEQNO(ct), notify_cam, ct->ct_status, ct->ct_flags, (ccb->ccb_h.status & CAM_SENT_SENSE) != 0, resid, sentstatus? "FIN" : "MID"); } if (ok) { if (data_requested > 0) { atp->bytes_xfered += data_requested - resid; ccb->csio.resid = ccb->csio.dxfer_len - (data_requested - resid); } if (sentstatus && (ccb->ccb_h.flags & CAM_SEND_SENSE)) ccb->ccb_h.status |= CAM_SENT_SENSE; ccb->ccb_h.status |= CAM_REQ_CMP; } else { notify_cam = 1; if (failure == CAM_UNREC_HBA_ERROR) ccb->ccb_h.status |= CAM_UNREC_HBA_ERROR; else ccb->ccb_h.status |= CAM_REQ_CMP_ERR; } atp->state = ATPD_STATE_PDON; /* * We never *not* notify CAM when there has been any error (ok == 0), * so we never need to do an ATIO putback if we're not notifying CAM. */ isp_prt(isp, ISP_LOGTDEBUG0, "%s CTIO[0x%x] done (ok=%d nc=%d nowsendstatus=%d ccb ss=%d)", (sentstatus)? " FINAL " : "MIDTERM ", atp->tag, ok, notify_cam, atp->sendst, (ccb->ccb_h.flags & CAM_SEND_STATUS) != 0); if (notify_cam == 0) { if (atp->sendst) { isp_target_start_ctio(isp, ccb, FROM_CTIO_DONE); } return; } /* * We are done with this ATIO if we successfully sent status. * In all other cases expect either another CTIO or XPT_ABORT. */ if (ok && sentstatus) isp_put_atpd(isp, bus, atp); /* * We're telling CAM we're done with this CTIO transaction. * * 24XX cards never need an ATIO put back. * * Other cards need one put back only on error. * In the latter case, a timeout will re-fire * and try again in case we didn't have * queue resources to do so at first. In any case, * once the putback is done we do the completion * call. */ if (ok || IS_24XX(isp)) { isp_complete_ctio(ccb); } else { isp_target_putback_atio(ccb); } } static int isp_handle_platform_target_notify_ack(ispsoftc_t *isp, isp_notify_t *mp, uint32_t rsp) { if (isp->isp_state != ISP_RUNSTATE) { isp_prt(isp, ISP_LOGTINFO, "Notify Code 0x%x (qevalid=%d) acked- h/w not ready (dropping)", mp->nt_ncode, mp->nt_lreserved != NULL); return (0); } /* * This case is for a Task Management Function, which shows up as an ATIO7 entry. */ if (IS_24XX(isp) && mp->nt_lreserved && ((isphdr_t *)mp->nt_lreserved)->rqs_entry_type == RQSTYPE_ATIO) { ct7_entry_t local, *cto = &local; at7_entry_t *aep = (at7_entry_t *)mp->nt_lreserved; fcportdb_t *lp; uint32_t sid; uint16_t nphdl; sid = (aep->at_hdr.s_id[0] << 16) | (aep->at_hdr.s_id[1] << 8) | aep->at_hdr.s_id[2]; if (isp_find_pdb_by_portid(isp, mp->nt_channel, sid, &lp)) { nphdl = lp->handle; } else { nphdl = NIL_HANDLE; } ISP_MEMZERO(&local, sizeof (local)); cto->ct_header.rqs_entry_type = RQSTYPE_CTIO7; cto->ct_header.rqs_entry_count = 1; cto->ct_nphdl = nphdl; cto->ct_rxid = aep->at_rxid; cto->ct_vpidx = mp->nt_channel; cto->ct_iid_lo = sid; cto->ct_iid_hi = sid >> 16; cto->ct_oxid = aep->at_hdr.ox_id; cto->ct_flags = CT7_SENDSTATUS|CT7_NOACK|CT7_NO_DATA|CT7_FLAG_MODE1; cto->ct_flags |= (aep->at_ta_len >> 12) << CT7_TASK_ATTR_SHIFT; if (rsp != 0) { cto->ct_scsi_status |= (FCP_RSPLEN_VALID << 8); cto->rsp.m1.ct_resplen = 4; ISP_MEMZERO(cto->rsp.m1.ct_resp, sizeof (cto->rsp.m1.ct_resp)); cto->rsp.m1.ct_resp[0] = rsp & 0xff; cto->rsp.m1.ct_resp[1] = (rsp >> 8) & 0xff; cto->rsp.m1.ct_resp[2] = (rsp >> 16) & 0xff; cto->rsp.m1.ct_resp[3] = (rsp >> 24) & 0xff; } return (isp_target_put_entry(isp, &local)); } /* * This case is for a responding to an ABTS frame */ if (IS_24XX(isp) && mp->nt_lreserved && ((isphdr_t *)mp->nt_lreserved)->rqs_entry_type == RQSTYPE_ABTS_RCVD) { /* * Overload nt_need_ack here to mark whether we've terminated the associated command. */ if (mp->nt_need_ack) { uint8_t storage[QENTRY_LEN]; ct7_entry_t *cto = (ct7_entry_t *) storage; abts_t *abts = (abts_t *)mp->nt_lreserved; ISP_MEMZERO(cto, sizeof (ct7_entry_t)); isp_prt(isp, ISP_LOGTDEBUG0, "%s: [%x] terminating after ABTS received", __func__, abts->abts_rxid_task); cto->ct_header.rqs_entry_type = RQSTYPE_CTIO7; cto->ct_header.rqs_entry_count = 1; cto->ct_nphdl = mp->nt_nphdl; cto->ct_rxid = abts->abts_rxid_task; cto->ct_iid_lo = mp->nt_sid; cto->ct_iid_hi = mp->nt_sid >> 16; cto->ct_oxid = abts->abts_ox_id; cto->ct_vpidx = mp->nt_channel; cto->ct_flags = CT7_NOACK|CT7_TERMINATE; if (isp_target_put_entry(isp, cto)) { return (ENOMEM); } mp->nt_need_ack = 0; } if (isp_acknak_abts(isp, mp->nt_lreserved, 0) == ENOMEM) { return (ENOMEM); } else { return (0); } } /* * Handle logout cases here */ if (mp->nt_ncode == NT_GLOBAL_LOGOUT) { isp_del_all_wwn_entries(isp, mp->nt_channel); } if (mp->nt_ncode == NT_LOGOUT) { if (!IS_2100(isp) && IS_FC(isp)) { isp_del_wwn_entries(isp, mp); } } /* * General purpose acknowledgement */ if (mp->nt_need_ack) { isp_prt(isp, ISP_LOGTINFO, "Notify Code 0x%x (qevalid=%d) being acked", mp->nt_ncode, mp->nt_lreserved != NULL); /* * Don't need to use the guaranteed send because the caller can retry */ return (isp_notify_ack(isp, mp->nt_lreserved)); } return (0); } /* * Handle task management functions. * * We show up here with a notify structure filled out. * * The nt_lreserved tag points to the original queue entry */ static void isp_handle_platform_target_tmf(ispsoftc_t *isp, isp_notify_t *notify) { tstate_t *tptr; fcportdb_t *lp; struct ccb_immediate_notify *inot; inot_private_data_t *ntp = NULL; atio_private_data_t *atp; lun_id_t lun; isp_prt(isp, ISP_LOGTDEBUG0, "%s: code 0x%x sid 0x%x tagval 0x%016llx chan %d lun %jx", __func__, notify->nt_ncode, notify->nt_sid, (unsigned long long) notify->nt_tagval, notify->nt_channel, notify->nt_lun); if (notify->nt_lun == LUN_ANY) { if (notify->nt_tagval == TAG_ANY) { lun = CAM_LUN_WILDCARD; } else { atp = isp_find_atpd(isp, notify->nt_channel, notify->nt_tagval & 0xffffffff); lun = atp ? atp->lun : CAM_LUN_WILDCARD; } } else { lun = notify->nt_lun; } tptr = get_lun_statep(isp, notify->nt_channel, lun); if (tptr == NULL) { tptr = get_lun_statep(isp, notify->nt_channel, CAM_LUN_WILDCARD); if (tptr == NULL) { isp_prt(isp, ISP_LOGWARN, "%s: no state pointer found for chan %d lun %#jx", __func__, notify->nt_channel, (uintmax_t)lun); goto bad; } } inot = (struct ccb_immediate_notify *) SLIST_FIRST(&tptr->inots); if (inot == NULL) { isp_prt(isp, ISP_LOGWARN, "%s: out of immediate notify structures for chan %d lun %#jx", __func__, notify->nt_channel, (uintmax_t)lun); goto bad; } inot->ccb_h.target_id = ISP_MAX_TARGETS(isp); inot->ccb_h.target_lun = lun; if (isp_find_pdb_by_portid(isp, notify->nt_channel, notify->nt_sid, &lp) == 0 && isp_find_pdb_by_handle(isp, notify->nt_channel, notify->nt_nphdl, &lp) == 0) { inot->initiator_id = CAM_TARGET_WILDCARD; } else { inot->initiator_id = FC_PORTDB_TGT(isp, notify->nt_channel, lp); } inot->seq_id = notify->nt_tagval; inot->tag_id = notify->nt_tagval >> 32; switch (notify->nt_ncode) { case NT_ABORT_TASK: isp_target_mark_aborted_early(isp, notify->nt_channel, tptr, inot->tag_id); inot->arg = MSG_ABORT_TASK; break; case NT_ABORT_TASK_SET: isp_target_mark_aborted_early(isp, notify->nt_channel, tptr, TAG_ANY); inot->arg = MSG_ABORT_TASK_SET; break; case NT_CLEAR_ACA: inot->arg = MSG_CLEAR_ACA; break; case NT_CLEAR_TASK_SET: inot->arg = MSG_CLEAR_TASK_SET; break; case NT_LUN_RESET: inot->arg = MSG_LOGICAL_UNIT_RESET; break; case NT_TARGET_RESET: inot->arg = MSG_TARGET_RESET; break; case NT_QUERY_TASK_SET: inot->arg = MSG_QUERY_TASK_SET; break; case NT_QUERY_ASYNC_EVENT: inot->arg = MSG_QUERY_ASYNC_EVENT; break; default: isp_prt(isp, ISP_LOGWARN, "%s: unknown TMF code 0x%x for chan %d lun %#jx", __func__, notify->nt_ncode, notify->nt_channel, (uintmax_t)lun); goto bad; } ntp = isp_get_ntpd(isp, notify->nt_channel); if (ntp == NULL) { isp_prt(isp, ISP_LOGWARN, "%s: out of inotify private structures", __func__); goto bad; } ISP_MEMCPY(&ntp->nt, notify, sizeof (isp_notify_t)); if (notify->nt_lreserved) { ISP_MEMCPY(&ntp->data, notify->nt_lreserved, QENTRY_LEN); ntp->nt.nt_lreserved = &ntp->data; } ntp->seq_id = notify->nt_tagval; ntp->tag_id = notify->nt_tagval >> 32; SLIST_REMOVE_HEAD(&tptr->inots, sim_links.sle); ISP_PATH_PRT(isp, ISP_LOGTDEBUG2, inot->ccb_h.path, "Take FREE INOT\n"); inot->ccb_h.status = CAM_MESSAGE_RECV; xpt_done((union ccb *)inot); return; bad: if (notify->nt_need_ack) { if (((isphdr_t *)notify->nt_lreserved)->rqs_entry_type == RQSTYPE_ABTS_RCVD) { if (isp_acknak_abts(isp, notify->nt_lreserved, ENOMEM)) { isp_prt(isp, ISP_LOGWARN, "you lose- unable to send an ACKNAK"); } } else { isp_async(isp, ISPASYNC_TARGET_NOTIFY_ACK, notify->nt_lreserved); } } } static void isp_target_mark_aborted_early(ispsoftc_t *isp, int chan, tstate_t *tptr, uint32_t tag_id) { atio_private_data_t *atp, *atpool; inot_private_data_t *ntp, *tmp; uint32_t this_tag_id; /* * First, clean any commands pending restart */ STAILQ_FOREACH_SAFE(ntp, &tptr->restart_queue, next, tmp) { if (IS_24XX(isp)) this_tag_id = ((at7_entry_t *)ntp->data)->at_rxid; else this_tag_id = ((at2_entry_t *)ntp->data)->at_rxid; if ((uint64_t)tag_id == TAG_ANY || tag_id == this_tag_id) { isp_endcmd(isp, ntp->data, NIL_HANDLE, chan, ECMD_TERMINATE, 0); isp_put_ntpd(isp, chan, ntp); STAILQ_REMOVE(&tptr->restart_queue, ntp, inot_private_data, next); } } /* * Now mark other ones dead as well. */ ISP_GET_PC(isp, chan, atpool, atpool); for (atp = atpool; atp < &atpool[ATPDPSIZE]; atp++) { if (atp->lun != tptr->ts_lun) continue; if ((uint64_t)tag_id == TAG_ANY || atp->tag == tag_id) atp->dead = 1; } } #endif static void isp_cam_async(void *cbarg, uint32_t code, struct cam_path *path, void *arg) { struct cam_sim *sim; int bus, tgt; ispsoftc_t *isp; sim = (struct cam_sim *)cbarg; isp = (ispsoftc_t *) cam_sim_softc(sim); bus = cam_sim_bus(sim); tgt = xpt_path_target_id(path); switch (code) { case AC_LOST_DEVICE: if (IS_SCSI(isp)) { uint16_t oflags, nflags; sdparam *sdp = SDPARAM(isp, bus); if (tgt >= 0) { nflags = sdp->isp_devparam[tgt].nvrm_flags; nflags &= DPARM_SAFE_DFLT; if (isp->isp_loaded_fw) { nflags |= DPARM_NARROW | DPARM_ASYNC; } oflags = sdp->isp_devparam[tgt].goal_flags; sdp->isp_devparam[tgt].goal_flags = nflags; sdp->isp_devparam[tgt].dev_update = 1; sdp->update = 1; (void) isp_control(isp, ISPCTL_UPDATE_PARAMS, bus); sdp->isp_devparam[tgt].goal_flags = oflags; } } break; default: isp_prt(isp, ISP_LOGWARN, "isp_cam_async: Code 0x%x", code); break; } } static void isp_poll(struct cam_sim *sim) { ispsoftc_t *isp = cam_sim_softc(sim); ISP_RUN_ISR(isp); } static void isp_watchdog(void *arg) { struct ccb_scsiio *xs = arg; ispsoftc_t *isp; uint32_t ohandle = ISP_HANDLE_FREE, handle; isp = XS_ISP(xs); handle = isp_find_handle(isp, xs); /* * Hand crank the interrupt code just to be sure the command isn't stuck somewhere. */ if (handle != ISP_HANDLE_FREE) { ISP_RUN_ISR(isp); ohandle = handle; handle = isp_find_handle(isp, xs); } if (handle != ISP_HANDLE_FREE) { /* * Try and make sure the command is really dead before * we release the handle (and DMA resources) for reuse. * * If we are successful in aborting the command then * we're done here because we'll get the command returned * back separately. */ if (isp_control(isp, ISPCTL_ABORT_CMD, xs) == 0) { return; } /* * Note that after calling the above, the command may in * fact have been completed. */ xs = isp_find_xs(isp, handle); /* * If the command no longer exists, then we won't * be able to find the xs again with this handle. */ if (xs == NULL) { return; } /* * After this point, the command is really dead. */ if (XS_XFRLEN(xs)) { ISP_DMAFREE(isp, xs, handle); } isp_destroy_handle(isp, handle); isp_prt(isp, ISP_LOGERR, "%s: timeout for handle 0x%x", __func__, handle); XS_SETERR(xs, CAM_CMD_TIMEOUT); isp_done(xs); } else { if (ohandle != ISP_HANDLE_FREE) { isp_prt(isp, ISP_LOGWARN, "%s: timeout for handle 0x%x, recovered during interrupt", __func__, ohandle); } else { isp_prt(isp, ISP_LOGWARN, "%s: timeout for handle already free", __func__); } } } static void isp_make_here(ispsoftc_t *isp, fcportdb_t *fcp, int chan, int tgt) { union ccb *ccb; struct isp_fc *fc = ISP_FC_PC(isp, chan); /* * Allocate a CCB, create a wildcard path for this target and schedule a rescan. */ ccb = xpt_alloc_ccb_nowait(); if (ccb == NULL) { isp_prt(isp, ISP_LOGWARN, "Chan %d unable to alloc CCB for rescan", chan); return; } if (xpt_create_path(&ccb->ccb_h.path, NULL, cam_sim_path(fc->sim), tgt, CAM_LUN_WILDCARD) != CAM_REQ_CMP) { isp_prt(isp, ISP_LOGWARN, "unable to create path for rescan"); xpt_free_ccb(ccb); return; } xpt_rescan(ccb); } static void isp_make_gone(ispsoftc_t *isp, fcportdb_t *fcp, int chan, int tgt) { struct cam_path *tp; struct isp_fc *fc = ISP_FC_PC(isp, chan); if (xpt_create_path(&tp, NULL, cam_sim_path(fc->sim), tgt, CAM_LUN_WILDCARD) == CAM_REQ_CMP) { xpt_async(AC_LOST_DEVICE, tp, NULL); xpt_free_path(tp); } } /* * Gone Device Timer Function- when we have decided that a device has gone * away, we wait a specific period of time prior to telling the OS it has * gone away. * * This timer function fires once a second and then scans the port database * for devices that are marked dead but still have a virtual target assigned. * We decrement a counter for that port database entry, and when it hits zero, * we tell the OS the device has gone away. */ static void isp_gdt(void *arg) { struct isp_fc *fc = arg; taskqueue_enqueue(taskqueue_thread, &fc->gtask); } static void isp_gdt_task(void *arg, int pending) { struct isp_fc *fc = arg; ispsoftc_t *isp = fc->isp; int chan = fc - isp->isp_osinfo.pc.fc; fcportdb_t *lp; struct ac_contract ac; struct ac_device_changed *adc; int dbidx, more_to_do = 0; ISP_LOCK(isp); isp_prt(isp, ISP_LOGDEBUG0, "Chan %d GDT timer expired", chan); for (dbidx = 0; dbidx < MAX_FC_TARG; dbidx++) { lp = &FCPARAM(isp, chan)->portdb[dbidx]; if (lp->state != FC_PORTDB_STATE_ZOMBIE) { continue; } if (lp->gone_timer != 0) { lp->gone_timer -= 1; more_to_do++; continue; } isp_prt(isp, ISP_LOGCONFIG, prom3, chan, dbidx, lp->portid, "Gone Device Timeout"); if (lp->is_target) { lp->is_target = 0; isp_make_gone(isp, lp, chan, dbidx); } if (lp->is_initiator) { lp->is_initiator = 0; ac.contract_number = AC_CONTRACT_DEV_CHG; adc = (struct ac_device_changed *) ac.contract_data; adc->wwpn = lp->port_wwn; adc->port = lp->portid; adc->target = dbidx; adc->arrived = 0; xpt_async(AC_CONTRACT, fc->path, &ac); } lp->state = FC_PORTDB_STATE_NIL; } if (fc->ready) { if (more_to_do) { callout_reset(&fc->gdt, hz, isp_gdt, fc); } else { callout_deactivate(&fc->gdt); isp_prt(isp, ISP_LOG_SANCFG, "Chan %d Stopping Gone Device Timer @ %lu", chan, (unsigned long) time_uptime); } } ISP_UNLOCK(isp); } /* * When loop goes down we remember the time and freeze CAM command queue. * During some time period we are trying to reprobe the loop. But if we * fail, we tell the OS that devices have gone away and drop the freeze. * * We don't clear the devices out of our port database because, when loop * come back up, we have to do some actual cleanup with the chip at that * point (implicit PLOGO, e.g., to get the chip's port database state right). */ static void isp_loop_changed(ispsoftc_t *isp, int chan) { fcparam *fcp = FCPARAM(isp, chan); struct isp_fc *fc = ISP_FC_PC(isp, chan); if (fc->loop_down_time) return; isp_prt(isp, ISP_LOG_SANCFG|ISP_LOGDEBUG0, "Chan %d Loop changed", chan); if (fcp->role & ISP_ROLE_INITIATOR) isp_freeze_loopdown(isp, chan); fc->loop_down_time = time_uptime; wakeup(fc); } static void isp_loop_up(ispsoftc_t *isp, int chan) { struct isp_fc *fc = ISP_FC_PC(isp, chan); isp_prt(isp, ISP_LOG_SANCFG|ISP_LOGDEBUG0, "Chan %d Loop is up", chan); fc->loop_seen_once = 1; fc->loop_down_time = 0; isp_unfreeze_loopdown(isp, chan); } static void isp_loop_dead(ispsoftc_t *isp, int chan) { fcparam *fcp = FCPARAM(isp, chan); struct isp_fc *fc = ISP_FC_PC(isp, chan); fcportdb_t *lp; struct ac_contract ac; struct ac_device_changed *adc; int dbidx, i; isp_prt(isp, ISP_LOG_SANCFG|ISP_LOGDEBUG0, "Chan %d Loop is dead", chan); /* * Notify to the OS all targets who we now consider have departed. */ for (dbidx = 0; dbidx < MAX_FC_TARG; dbidx++) { lp = &fcp->portdb[dbidx]; if (lp->state == FC_PORTDB_STATE_NIL) continue; for (i = 0; i < isp->isp_maxcmds; i++) { struct ccb_scsiio *xs; if (ISP_H2HT(isp->isp_xflist[i].handle) != ISP_HANDLE_INITIATOR) { continue; } if ((xs = isp->isp_xflist[i].cmd) == NULL) { continue; } if (dbidx != XS_TGT(xs)) { continue; } isp_prt(isp, ISP_LOGWARN, "command handle 0x%x for %d.%d.%jx orphaned by loop down timeout", isp->isp_xflist[i].handle, chan, XS_TGT(xs), (uintmax_t)XS_LUN(xs)); /* * Just like in isp_watchdog, abort the outstanding * command or immediately free its resources if it is * not active */ if (isp_control(isp, ISPCTL_ABORT_CMD, xs) == 0) { continue; } if (XS_XFRLEN(xs)) { ISP_DMAFREE(isp, xs, isp->isp_xflist[i].handle); } isp_destroy_handle(isp, isp->isp_xflist[i].handle); isp_prt(isp, ISP_LOGWARN, "command handle 0x%x for %d.%d.%jx could not be aborted and was destroyed", isp->isp_xflist[i].handle, chan, XS_TGT(xs), (uintmax_t)XS_LUN(xs)); XS_SETERR(xs, HBA_BUSRESET); isp_done(xs); } isp_prt(isp, ISP_LOGCONFIG, prom3, chan, dbidx, lp->portid, "Loop Down Timeout"); if (lp->is_target) { lp->is_target = 0; isp_make_gone(isp, lp, chan, dbidx); } if (lp->is_initiator) { lp->is_initiator = 0; ac.contract_number = AC_CONTRACT_DEV_CHG; adc = (struct ac_device_changed *) ac.contract_data; adc->wwpn = lp->port_wwn; adc->port = lp->portid; adc->target = dbidx; adc->arrived = 0; xpt_async(AC_CONTRACT, fc->path, &ac); } } isp_unfreeze_loopdown(isp, chan); fc->loop_down_time = 0; } static void isp_kthread(void *arg) { struct isp_fc *fc = arg; ispsoftc_t *isp = fc->isp; int chan = fc - isp->isp_osinfo.pc.fc; int slp = 0, d; int lb, lim; ISP_LOCK(isp); while (isp->isp_osinfo.is_exiting == 0) { isp_prt(isp, ISP_LOG_SANCFG|ISP_LOGDEBUG0, "Chan %d Checking FC state", chan); lb = isp_fc_runstate(isp, chan, 250000); isp_prt(isp, ISP_LOG_SANCFG|ISP_LOGDEBUG0, "Chan %d FC got to %s state", chan, isp_fc_loop_statename(lb)); /* * Our action is different based upon whether we're supporting * Initiator mode or not. If we are, we might freeze the simq * when loop is down and set all sorts of different delays to * check again. * * If not, we simply just wait for loop to come up. */ if (lb == LOOP_READY || lb < 0) { slp = 0; } else { /* * If we've never seen loop up and we've waited longer * than quickboot time, or we've seen loop up but we've * waited longer than loop_down_limit, give up and go * to sleep until loop comes up. */ if (fc->loop_seen_once == 0) lim = isp_quickboot_time; else lim = fc->loop_down_limit; d = time_uptime - fc->loop_down_time; if (d >= lim) slp = 0; else if (d < 10) slp = 1; else if (d < 30) slp = 5; else if (d < 60) slp = 10; else if (d < 120) slp = 20; else slp = 30; } if (slp == 0) { if (lb == LOOP_READY) isp_loop_up(isp, chan); else isp_loop_dead(isp, chan); } isp_prt(isp, ISP_LOG_SANCFG|ISP_LOGDEBUG0, "Chan %d sleep for %d seconds", chan, slp); msleep(fc, &isp->isp_lock, PRIBIO, "ispf", slp * hz); } fc->num_threads -= 1; ISP_UNLOCK(isp); kthread_exit(); } #ifdef ISP_TARGET_MODE static void isp_abort_atio(ispsoftc_t *isp, union ccb *ccb) { atio_private_data_t *atp; union ccb *accb = ccb->cab.abort_ccb; struct ccb_hdr *sccb; tstate_t *tptr; tptr = get_lun_statep(isp, XS_CHANNEL(accb), XS_LUN(accb)); if (tptr != NULL) { /* Search for the ATIO among queueued. */ SLIST_FOREACH(sccb, &tptr->atios, sim_links.sle) { if (sccb != &accb->ccb_h) continue; SLIST_REMOVE(&tptr->atios, sccb, ccb_hdr, sim_links.sle); ISP_PATH_PRT(isp, ISP_LOGTDEBUG2, sccb->path, "Abort FREE ATIO\n"); accb->ccb_h.status = CAM_REQ_ABORTED; xpt_done(accb); ccb->ccb_h.status = CAM_REQ_CMP; return; } } /* Search for the ATIO among running. */ atp = isp_find_atpd(isp, XS_CHANNEL(accb), accb->atio.tag_id); if (atp != NULL) { /* Send TERMINATE to firmware. */ if (!atp->dead && IS_24XX(isp)) { uint8_t storage[QENTRY_LEN]; ct7_entry_t *cto = (ct7_entry_t *) storage; ISP_MEMZERO(cto, sizeof (ct7_entry_t)); cto->ct_header.rqs_entry_type = RQSTYPE_CTIO7; cto->ct_header.rqs_entry_count = 1; cto->ct_nphdl = atp->nphdl; cto->ct_rxid = atp->tag; cto->ct_iid_lo = atp->sid; cto->ct_iid_hi = atp->sid >> 16; cto->ct_oxid = atp->oxid; cto->ct_vpidx = XS_CHANNEL(accb); cto->ct_flags = CT7_NOACK|CT7_TERMINATE; isp_target_put_entry(isp, cto); } isp_put_atpd(isp, XS_CHANNEL(accb), atp); ccb->ccb_h.status = CAM_REQ_CMP; } else { ccb->ccb_h.status = CAM_UA_ABORT; } } static void isp_abort_inot(ispsoftc_t *isp, union ccb *ccb) { inot_private_data_t *ntp; union ccb *accb = ccb->cab.abort_ccb; struct ccb_hdr *sccb; tstate_t *tptr; tptr = get_lun_statep(isp, XS_CHANNEL(accb), XS_LUN(accb)); if (tptr != NULL) { /* Search for the INOT among queueued. */ SLIST_FOREACH(sccb, &tptr->inots, sim_links.sle) { if (sccb != &accb->ccb_h) continue; SLIST_REMOVE(&tptr->inots, sccb, ccb_hdr, sim_links.sle); ISP_PATH_PRT(isp, ISP_LOGTDEBUG2, sccb->path, "Abort FREE INOT\n"); accb->ccb_h.status = CAM_REQ_ABORTED; xpt_done(accb); ccb->ccb_h.status = CAM_REQ_CMP; return; } } /* Search for the INOT among running. */ ntp = isp_find_ntpd(isp, XS_CHANNEL(accb), accb->cin1.tag_id, accb->cin1.seq_id); if (ntp != NULL) { if (ntp->nt.nt_need_ack) { isp_async(isp, ISPASYNC_TARGET_NOTIFY_ACK, ntp->nt.nt_lreserved); } isp_put_ntpd(isp, XS_CHANNEL(accb), ntp); ccb->ccb_h.status = CAM_REQ_CMP; } else { ccb->ccb_h.status = CAM_UA_ABORT; return; } } #endif static void isp_action(struct cam_sim *sim, union ccb *ccb) { int bus, tgt, error; ispsoftc_t *isp; struct ccb_trans_settings *cts; sbintime_t ts; CAM_DEBUG(ccb->ccb_h.path, CAM_DEBUG_TRACE, ("isp_action\n")); isp = (ispsoftc_t *)cam_sim_softc(sim); ISP_ASSERT_LOCKED(isp); bus = cam_sim_bus(sim); isp_prt(isp, ISP_LOGDEBUG2, "isp_action code %x", ccb->ccb_h.func_code); ISP_PCMD(ccb) = NULL; switch (ccb->ccb_h.func_code) { case XPT_SCSI_IO: /* Execute the requested I/O operation */ /* * Do a couple of preliminary checks... */ if ((ccb->ccb_h.flags & CAM_CDB_POINTER) != 0) { if ((ccb->ccb_h.flags & CAM_CDB_PHYS) != 0) { ccb->ccb_h.status = CAM_REQ_INVALID; isp_done((struct ccb_scsiio *) ccb); break; } } ccb->csio.req_map = NULL; #ifdef DIAGNOSTIC if (ccb->ccb_h.target_id >= ISP_MAX_TARGETS(isp)) { xpt_print(ccb->ccb_h.path, "invalid target\n"); ccb->ccb_h.status = CAM_PATH_INVALID; } else if (ISP_MAX_LUNS(isp) > 0 && ccb->ccb_h.target_lun >= ISP_MAX_LUNS(isp)) { xpt_print(ccb->ccb_h.path, "invalid lun\n"); ccb->ccb_h.status = CAM_PATH_INVALID; } if (ccb->ccb_h.status == CAM_PATH_INVALID) { xpt_done(ccb); break; } #endif ccb->csio.scsi_status = SCSI_STATUS_OK; if (isp_get_pcmd(isp, ccb)) { isp_prt(isp, ISP_LOGWARN, "out of PCMDs"); cam_freeze_devq(ccb->ccb_h.path); cam_release_devq(ccb->ccb_h.path, RELSIM_RELEASE_AFTER_TIMEOUT, 0, 250, 0); ccb->ccb_h.status = CAM_REQUEUE_REQ; xpt_done(ccb); break; } error = isp_start((XS_T *) ccb); switch (error) { case CMD_QUEUED: ccb->ccb_h.status |= CAM_SIM_QUEUED; if (ccb->ccb_h.timeout == CAM_TIME_INFINITY) break; /* Give firmware extra 10s to handle timeout. */ ts = SBT_1MS * ccb->ccb_h.timeout + 10 * SBT_1S; callout_reset_sbt(&PISP_PCMD(ccb)->wdog, ts, 0, isp_watchdog, ccb, 0); break; case CMD_RQLATER: isp_prt(isp, ISP_LOGDEBUG0, "%d.%jx retry later", XS_TGT(ccb), (uintmax_t)XS_LUN(ccb)); cam_freeze_devq(ccb->ccb_h.path); cam_release_devq(ccb->ccb_h.path, RELSIM_RELEASE_AFTER_TIMEOUT, 0, 1000, 0); ccb->ccb_h.status = CAM_REQUEUE_REQ; isp_free_pcmd(isp, ccb); xpt_done(ccb); break; case CMD_EAGAIN: isp_free_pcmd(isp, ccb); cam_freeze_devq(ccb->ccb_h.path); cam_release_devq(ccb->ccb_h.path, RELSIM_RELEASE_AFTER_TIMEOUT, 0, 100, 0); ccb->ccb_h.status = CAM_REQUEUE_REQ; xpt_done(ccb); break; case CMD_COMPLETE: isp_done((struct ccb_scsiio *) ccb); break; default: isp_prt(isp, ISP_LOGERR, "What's this? 0x%x at %d in file %s", error, __LINE__, __FILE__); ccb->ccb_h.status = CAM_REQUEUE_REQ; isp_free_pcmd(isp, ccb); xpt_done(ccb); } break; #ifdef ISP_TARGET_MODE case XPT_EN_LUN: /* Enable/Disable LUN as a target */ if (ccb->cel.enable) { isp_enable_lun(isp, ccb); } else { isp_disable_lun(isp, ccb); } break; case XPT_IMMEDIATE_NOTIFY: /* Add Immediate Notify Resource */ case XPT_ACCEPT_TARGET_IO: /* Add Accept Target IO Resource */ { tstate_t *tptr = get_lun_statep(isp, XS_CHANNEL(ccb), ccb->ccb_h.target_lun); if (tptr == NULL) { const char *str; if (ccb->ccb_h.func_code == XPT_IMMEDIATE_NOTIFY) str = "XPT_IMMEDIATE_NOTIFY"; else str = "XPT_ACCEPT_TARGET_IO"; ISP_PATH_PRT(isp, ISP_LOGWARN, ccb->ccb_h.path, "%s: no state pointer found for %s\n", __func__, str); ccb->ccb_h.status = CAM_DEV_NOT_THERE; xpt_done(ccb); break; } ccb->ccb_h.spriv_field0 = 0; ccb->ccb_h.spriv_ptr1 = isp; if (ccb->ccb_h.func_code == XPT_ACCEPT_TARGET_IO) { ccb->atio.tag_id = 0; SLIST_INSERT_HEAD(&tptr->atios, &ccb->ccb_h, sim_links.sle); ISP_PATH_PRT(isp, ISP_LOGTDEBUG2, ccb->ccb_h.path, "Put FREE ATIO\n"); } else if (ccb->ccb_h.func_code == XPT_IMMEDIATE_NOTIFY) { ccb->cin1.seq_id = ccb->cin1.tag_id = 0; SLIST_INSERT_HEAD(&tptr->inots, &ccb->ccb_h, sim_links.sle); ISP_PATH_PRT(isp, ISP_LOGTDEBUG2, ccb->ccb_h.path, "Put FREE INOT\n"); } ccb->ccb_h.status = CAM_REQ_INPROG; break; } case XPT_NOTIFY_ACKNOWLEDGE: /* notify ack */ { inot_private_data_t *ntp; /* * XXX: Because we cannot guarantee that the path information in the notify acknowledge ccb * XXX: matches that for the immediate notify, we have to *search* for the notify structure */ /* * All the relevant path information is in the associated immediate notify */ ISP_PATH_PRT(isp, ISP_LOGTDEBUG0, ccb->ccb_h.path, "%s: [0x%x] NOTIFY ACKNOWLEDGE for 0x%x seen\n", __func__, ccb->cna2.tag_id, ccb->cna2.seq_id); ntp = isp_find_ntpd(isp, XS_CHANNEL(ccb), ccb->cna2.tag_id, ccb->cna2.seq_id); if (ntp == NULL) { ISP_PATH_PRT(isp, ISP_LOGWARN, ccb->ccb_h.path, "%s: [0x%x] XPT_NOTIFY_ACKNOWLEDGE of 0x%x cannot find ntp private data\n", __func__, ccb->cna2.tag_id, ccb->cna2.seq_id); ccb->ccb_h.status = CAM_DEV_NOT_THERE; xpt_done(ccb); break; } if (isp_handle_platform_target_notify_ack(isp, &ntp->nt, (ccb->ccb_h.flags & CAM_SEND_STATUS) ? ccb->cna2.arg : 0)) { cam_freeze_devq(ccb->ccb_h.path); cam_release_devq(ccb->ccb_h.path, RELSIM_RELEASE_AFTER_TIMEOUT, 0, 1000, 0); ccb->ccb_h.status &= ~CAM_STATUS_MASK; ccb->ccb_h.status |= CAM_REQUEUE_REQ; break; } isp_put_ntpd(isp, XS_CHANNEL(ccb), ntp); ccb->ccb_h.status = CAM_REQ_CMP; ISP_PATH_PRT(isp, ISP_LOGTDEBUG0, ccb->ccb_h.path, "%s: [0x%x] calling xpt_done for tag 0x%x\n", __func__, ccb->cna2.tag_id, ccb->cna2.seq_id); xpt_done(ccb); break; } case XPT_CONT_TARGET_IO: isp_target_start_ctio(isp, ccb, FROM_CAM); break; #endif case XPT_RESET_DEV: /* BDR the specified SCSI device */ tgt = ccb->ccb_h.target_id; tgt |= (bus << 16); error = isp_control(isp, ISPCTL_RESET_DEV, bus, tgt); if (error) { ccb->ccb_h.status = CAM_REQ_CMP_ERR; } else { /* * If we have a FC device, reset the Command * Reference Number, because the target will expect * that we re-start the CRN at 1 after a reset. */ if (IS_FC(isp)) isp_fcp_reset_crn(isp, bus, tgt, /*tgt_set*/ 1); ccb->ccb_h.status = CAM_REQ_CMP; } xpt_done(ccb); break; case XPT_ABORT: /* Abort the specified CCB */ { union ccb *accb = ccb->cab.abort_ccb; switch (accb->ccb_h.func_code) { #ifdef ISP_TARGET_MODE case XPT_ACCEPT_TARGET_IO: isp_abort_atio(isp, ccb); break; case XPT_IMMEDIATE_NOTIFY: isp_abort_inot(isp, ccb); break; #endif case XPT_SCSI_IO: error = isp_control(isp, ISPCTL_ABORT_CMD, accb); if (error) { ccb->ccb_h.status = CAM_UA_ABORT; } else { ccb->ccb_h.status = CAM_REQ_CMP; } break; default: ccb->ccb_h.status = CAM_REQ_INVALID; break; } /* * This is not a queued CCB, so the caller expects it to be * complete when control is returned. */ break; } #define IS_CURRENT_SETTINGS(c) (c->type == CTS_TYPE_CURRENT_SETTINGS) case XPT_SET_TRAN_SETTINGS: /* Nexus Settings */ cts = &ccb->cts; if (!IS_CURRENT_SETTINGS(cts)) { ccb->ccb_h.status = CAM_REQ_INVALID; xpt_done(ccb); break; } tgt = cts->ccb_h.target_id; if (IS_SCSI(isp)) { struct ccb_trans_settings_scsi *scsi = &cts->proto_specific.scsi; struct ccb_trans_settings_spi *spi = &cts->xport_specific.spi; sdparam *sdp = SDPARAM(isp, bus); uint16_t *dptr; if (spi->valid == 0 && scsi->valid == 0) { ccb->ccb_h.status = CAM_REQ_CMP; xpt_done(ccb); break; } /* * We always update (internally) from goal_flags * so any request to change settings just gets * vectored to that location. */ dptr = &sdp->isp_devparam[tgt].goal_flags; if ((spi->valid & CTS_SPI_VALID_DISC) != 0) { if ((spi->flags & CTS_SPI_FLAGS_DISC_ENB) != 0) *dptr |= DPARM_DISC; else *dptr &= ~DPARM_DISC; } if ((scsi->valid & CTS_SCSI_VALID_TQ) != 0) { if ((scsi->flags & CTS_SCSI_FLAGS_TAG_ENB) != 0) *dptr |= DPARM_TQING; else *dptr &= ~DPARM_TQING; } if ((spi->valid & CTS_SPI_VALID_BUS_WIDTH) != 0) { if (spi->bus_width == MSG_EXT_WDTR_BUS_16_BIT) *dptr |= DPARM_WIDE; else *dptr &= ~DPARM_WIDE; } /* * XXX: FIX ME */ if ((spi->valid & CTS_SPI_VALID_SYNC_OFFSET) && (spi->valid & CTS_SPI_VALID_SYNC_RATE) && (spi->sync_period && spi->sync_offset)) { *dptr |= DPARM_SYNC; /* * XXX: CHECK FOR LEGALITY */ sdp->isp_devparam[tgt].goal_period = spi->sync_period; sdp->isp_devparam[tgt].goal_offset = spi->sync_offset; } else { *dptr &= ~DPARM_SYNC; } isp_prt(isp, ISP_LOGDEBUG0, "SET (%d.%d.%jx) to flags %x off %x per %x", bus, tgt, (uintmax_t)cts->ccb_h.target_lun, sdp->isp_devparam[tgt].goal_flags, sdp->isp_devparam[tgt].goal_offset, sdp->isp_devparam[tgt].goal_period); sdp->isp_devparam[tgt].dev_update = 1; sdp->update = 1; } ccb->ccb_h.status = CAM_REQ_CMP; xpt_done(ccb); break; case XPT_GET_TRAN_SETTINGS: cts = &ccb->cts; tgt = cts->ccb_h.target_id; if (IS_FC(isp)) { fcparam *fcp = FCPARAM(isp, bus); struct ccb_trans_settings_scsi *scsi = &cts->proto_specific.scsi; struct ccb_trans_settings_fc *fc = &cts->xport_specific.fc; cts->protocol = PROTO_SCSI; cts->protocol_version = SCSI_REV_2; cts->transport = XPORT_FC; cts->transport_version = 0; scsi->valid = CTS_SCSI_VALID_TQ; scsi->flags = CTS_SCSI_FLAGS_TAG_ENB; fc->valid = CTS_FC_VALID_SPEED; fc->bitrate = 100000; fc->bitrate *= fcp->isp_gbspeed; if (tgt < MAX_FC_TARG) { fcportdb_t *lp = &fcp->portdb[tgt]; fc->wwnn = lp->node_wwn; fc->wwpn = lp->port_wwn; fc->port = lp->portid; fc->valid |= CTS_FC_VALID_WWNN | CTS_FC_VALID_WWPN | CTS_FC_VALID_PORT; } } else { struct ccb_trans_settings_scsi *scsi = &cts->proto_specific.scsi; struct ccb_trans_settings_spi *spi = &cts->xport_specific.spi; sdparam *sdp = SDPARAM(isp, bus); uint16_t dval, pval, oval; if (IS_CURRENT_SETTINGS(cts)) { sdp->isp_devparam[tgt].dev_refresh = 1; sdp->update = 1; (void) isp_control(isp, ISPCTL_UPDATE_PARAMS, bus); dval = sdp->isp_devparam[tgt].actv_flags; oval = sdp->isp_devparam[tgt].actv_offset; pval = sdp->isp_devparam[tgt].actv_period; } else { dval = sdp->isp_devparam[tgt].nvrm_flags; oval = sdp->isp_devparam[tgt].nvrm_offset; pval = sdp->isp_devparam[tgt].nvrm_period; } cts->protocol = PROTO_SCSI; cts->protocol_version = SCSI_REV_2; cts->transport = XPORT_SPI; cts->transport_version = 2; spi->valid = 0; scsi->valid = 0; spi->flags = 0; scsi->flags = 0; if (dval & DPARM_DISC) { spi->flags |= CTS_SPI_FLAGS_DISC_ENB; } if ((dval & DPARM_SYNC) && oval && pval) { spi->sync_offset = oval; spi->sync_period = pval; } else { spi->sync_offset = 0; spi->sync_period = 0; } spi->valid |= CTS_SPI_VALID_SYNC_OFFSET; spi->valid |= CTS_SPI_VALID_SYNC_RATE; spi->valid |= CTS_SPI_VALID_BUS_WIDTH; if (dval & DPARM_WIDE) { spi->bus_width = MSG_EXT_WDTR_BUS_16_BIT; } else { spi->bus_width = MSG_EXT_WDTR_BUS_8_BIT; } if (cts->ccb_h.target_lun != CAM_LUN_WILDCARD) { scsi->valid = CTS_SCSI_VALID_TQ; if (dval & DPARM_TQING) { scsi->flags |= CTS_SCSI_FLAGS_TAG_ENB; } spi->valid |= CTS_SPI_VALID_DISC; } isp_prt(isp, ISP_LOGDEBUG0, "GET %s (%d.%d.%jx) to flags %x off %x per %x", IS_CURRENT_SETTINGS(cts)? "ACTIVE" : "NVRAM", bus, tgt, (uintmax_t)cts->ccb_h.target_lun, dval, oval, pval); } ccb->ccb_h.status = CAM_REQ_CMP; xpt_done(ccb); break; case XPT_CALC_GEOMETRY: cam_calc_geometry(&ccb->ccg, 1); xpt_done(ccb); break; case XPT_RESET_BUS: /* Reset the specified bus */ error = isp_control(isp, ISPCTL_RESET_BUS, bus); if (error) { ccb->ccb_h.status = CAM_REQ_CMP_ERR; xpt_done(ccb); break; } if (bootverbose) { xpt_print(ccb->ccb_h.path, "reset bus on channel %d\n", bus); } if (IS_FC(isp)) { xpt_async(AC_BUS_RESET, ISP_FC_PC(isp, bus)->path, 0); } else { xpt_async(AC_BUS_RESET, ISP_SPI_PC(isp, bus)->path, 0); } ccb->ccb_h.status = CAM_REQ_CMP; xpt_done(ccb); break; case XPT_TERM_IO: /* Terminate the I/O process */ ccb->ccb_h.status = CAM_REQ_INVALID; xpt_done(ccb); break; case XPT_SET_SIM_KNOB: /* Set SIM knobs */ { struct ccb_sim_knob *kp = &ccb->knob; fcparam *fcp; if (!IS_FC(isp)) { ccb->ccb_h.status = CAM_REQ_INVALID; xpt_done(ccb); break; } fcp = FCPARAM(isp, bus); if (kp->xport_specific.fc.valid & KNOB_VALID_ADDRESS) { fcp->isp_wwnn = ISP_FC_PC(isp, bus)->def_wwnn = kp->xport_specific.fc.wwnn; fcp->isp_wwpn = ISP_FC_PC(isp, bus)->def_wwpn = kp->xport_specific.fc.wwpn; isp_prt(isp, ISP_LOGALL, "Setting Channel %d wwns to 0x%jx 0x%jx", bus, fcp->isp_wwnn, fcp->isp_wwpn); } ccb->ccb_h.status = CAM_REQ_CMP; if (kp->xport_specific.fc.valid & KNOB_VALID_ROLE) { int rchange = 0; int newrole = 0; switch (kp->xport_specific.fc.role) { case KNOB_ROLE_NONE: if (fcp->role != ISP_ROLE_NONE) { rchange = 1; newrole = ISP_ROLE_NONE; } break; case KNOB_ROLE_TARGET: if (fcp->role != ISP_ROLE_TARGET) { rchange = 1; newrole = ISP_ROLE_TARGET; } break; case KNOB_ROLE_INITIATOR: if (fcp->role != ISP_ROLE_INITIATOR) { rchange = 1; newrole = ISP_ROLE_INITIATOR; } break; case KNOB_ROLE_BOTH: if (fcp->role != ISP_ROLE_BOTH) { rchange = 1; newrole = ISP_ROLE_BOTH; } break; } if (rchange) { ISP_PATH_PRT(isp, ISP_LOGCONFIG, ccb->ccb_h.path, "changing role on from %d to %d\n", fcp->role, newrole); if (isp_control(isp, ISPCTL_CHANGE_ROLE, bus, newrole) != 0) { ccb->ccb_h.status = CAM_REQ_CMP_ERR; xpt_done(ccb); break; } } } xpt_done(ccb); break; } case XPT_GET_SIM_KNOB_OLD: /* Get SIM knobs -- compat value */ case XPT_GET_SIM_KNOB: /* Get SIM knobs */ { struct ccb_sim_knob *kp = &ccb->knob; if (IS_FC(isp)) { fcparam *fcp; fcp = FCPARAM(isp, bus); kp->xport_specific.fc.wwnn = fcp->isp_wwnn; kp->xport_specific.fc.wwpn = fcp->isp_wwpn; switch (fcp->role) { case ISP_ROLE_NONE: kp->xport_specific.fc.role = KNOB_ROLE_NONE; break; case ISP_ROLE_TARGET: kp->xport_specific.fc.role = KNOB_ROLE_TARGET; break; case ISP_ROLE_INITIATOR: kp->xport_specific.fc.role = KNOB_ROLE_INITIATOR; break; case ISP_ROLE_BOTH: kp->xport_specific.fc.role = KNOB_ROLE_BOTH; break; } kp->xport_specific.fc.valid = KNOB_VALID_ADDRESS | KNOB_VALID_ROLE; ccb->ccb_h.status = CAM_REQ_CMP; } else { ccb->ccb_h.status = CAM_REQ_INVALID; } xpt_done(ccb); break; } case XPT_PATH_INQ: /* Path routing inquiry */ { struct ccb_pathinq *cpi = &ccb->cpi; cpi->version_num = 1; #ifdef ISP_TARGET_MODE if (IS_FC(isp) && ISP_CAP_TMODE(isp) && ISP_CAP_SCCFW(isp)) cpi->target_sprt = PIT_PROCESSOR | PIT_DISCONNECT | PIT_TERM_IO; else #endif cpi->target_sprt = 0; cpi->hba_eng_cnt = 0; cpi->max_target = ISP_MAX_TARGETS(isp) - 1; cpi->max_lun = ISP_MAX_LUNS(isp) == 0 ? 255 : ISP_MAX_LUNS(isp) - 1; cpi->bus_id = cam_sim_bus(sim); if (sizeof (bus_size_t) > 4) cpi->maxio = (ISP_NSEG64_MAX - 1) * PAGE_SIZE; else cpi->maxio = (ISP_NSEG_MAX - 1) * PAGE_SIZE; if (IS_FC(isp)) { fcparam *fcp = FCPARAM(isp, bus); cpi->hba_misc = PIM_NOBUSRESET | PIM_UNMAPPED; cpi->hba_misc |= PIM_EXTLUNS | PIM_NOSCAN; /* * Because our loop ID can shift from time to time, * make our initiator ID out of range of our bus. */ cpi->initiator_id = cpi->max_target + 1; /* * Set base transfer capabilities for Fibre Channel, for this HBA. */ if (IS_25XX(isp)) { cpi->base_transfer_speed = 8000000; } else if (IS_24XX(isp)) { cpi->base_transfer_speed = 4000000; } else if (IS_23XX(isp)) { cpi->base_transfer_speed = 2000000; } else { cpi->base_transfer_speed = 1000000; } cpi->hba_inquiry = PI_TAG_ABLE; cpi->transport = XPORT_FC; cpi->transport_version = 0; cpi->xport_specific.fc.wwnn = fcp->isp_wwnn; cpi->xport_specific.fc.wwpn = fcp->isp_wwpn; cpi->xport_specific.fc.port = fcp->isp_portid; cpi->xport_specific.fc.bitrate = fcp->isp_gbspeed * 1000; } else { sdparam *sdp = SDPARAM(isp, bus); cpi->hba_inquiry = PI_SDTR_ABLE|PI_TAG_ABLE|PI_WIDE_16; cpi->hba_misc = PIM_UNMAPPED; cpi->initiator_id = sdp->isp_initiator_id; cpi->base_transfer_speed = 3300; cpi->transport = XPORT_SPI; cpi->transport_version = 2; } cpi->protocol = PROTO_SCSI; cpi->protocol_version = SCSI_REV_2; strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN); strlcpy(cpi->hba_vid, "Qlogic", HBA_IDLEN); strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN); cpi->unit_number = cam_sim_unit(sim); cpi->ccb_h.status = CAM_REQ_CMP; xpt_done(ccb); break; } default: ccb->ccb_h.status = CAM_REQ_INVALID; xpt_done(ccb); break; } } void isp_done(XS_T *sccb) { ispsoftc_t *isp = XS_ISP(sccb); uint32_t status; if (XS_NOERR(sccb)) XS_SETERR(sccb, CAM_REQ_CMP); if ((sccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP && (sccb->scsi_status != SCSI_STATUS_OK)) { sccb->ccb_h.status &= ~CAM_STATUS_MASK; if ((sccb->scsi_status == SCSI_STATUS_CHECK_COND) && (sccb->ccb_h.status & CAM_AUTOSNS_VALID) == 0) { sccb->ccb_h.status |= CAM_AUTOSENSE_FAIL; } else { sccb->ccb_h.status |= CAM_SCSI_STATUS_ERROR; } } sccb->ccb_h.status &= ~CAM_SIM_QUEUED; status = sccb->ccb_h.status & CAM_STATUS_MASK; if (status != CAM_REQ_CMP && (sccb->ccb_h.status & CAM_DEV_QFRZN) == 0) { sccb->ccb_h.status |= CAM_DEV_QFRZN; xpt_freeze_devq(sccb->ccb_h.path, 1); } if (ISP_PCMD(sccb)) { if (callout_active(&PISP_PCMD(sccb)->wdog)) callout_stop(&PISP_PCMD(sccb)->wdog); isp_free_pcmd(isp, (union ccb *) sccb); } xpt_done((union ccb *) sccb); } void isp_async(ispsoftc_t *isp, ispasync_t cmd, ...) { int bus; static const char prom[] = "Chan %d [%d] WWPN 0x%16jx PortID 0x%06x handle 0x%x %s %s"; char buf[64]; char *msg = NULL; target_id_t tgt = 0; fcportdb_t *lp; struct isp_fc *fc; struct cam_path *tmppath; struct ac_contract ac; struct ac_device_changed *adc; va_list ap; switch (cmd) { case ISPASYNC_NEW_TGT_PARAMS: { struct ccb_trans_settings_scsi *scsi; struct ccb_trans_settings_spi *spi; int flags, tgt; sdparam *sdp; struct ccb_trans_settings cts; memset(&cts, 0, sizeof (struct ccb_trans_settings)); va_start(ap, cmd); bus = va_arg(ap, int); tgt = va_arg(ap, int); va_end(ap); sdp = SDPARAM(isp, bus); if (xpt_create_path(&tmppath, NULL, cam_sim_path(ISP_SPI_PC(isp, bus)->sim), tgt, CAM_LUN_WILDCARD) != CAM_REQ_CMP) { isp_prt(isp, ISP_LOGWARN, "isp_async cannot make temp path for %d.%d", tgt, bus); break; } flags = sdp->isp_devparam[tgt].actv_flags; cts.type = CTS_TYPE_CURRENT_SETTINGS; cts.protocol = PROTO_SCSI; cts.transport = XPORT_SPI; scsi = &cts.proto_specific.scsi; spi = &cts.xport_specific.spi; if (flags & DPARM_TQING) { scsi->valid |= CTS_SCSI_VALID_TQ; scsi->flags |= CTS_SCSI_FLAGS_TAG_ENB; } if (flags & DPARM_DISC) { spi->valid |= CTS_SPI_VALID_DISC; spi->flags |= CTS_SPI_FLAGS_DISC_ENB; } spi->flags |= CTS_SPI_VALID_BUS_WIDTH; if (flags & DPARM_WIDE) { spi->bus_width = MSG_EXT_WDTR_BUS_16_BIT; } else { spi->bus_width = MSG_EXT_WDTR_BUS_8_BIT; } if (flags & DPARM_SYNC) { spi->valid |= CTS_SPI_VALID_SYNC_RATE; spi->valid |= CTS_SPI_VALID_SYNC_OFFSET; spi->sync_period = sdp->isp_devparam[tgt].actv_period; spi->sync_offset = sdp->isp_devparam[tgt].actv_offset; } isp_prt(isp, ISP_LOGDEBUG2, "NEW_TGT_PARAMS bus %d tgt %d period %x offset %x flags %x", bus, tgt, sdp->isp_devparam[tgt].actv_period, sdp->isp_devparam[tgt].actv_offset, flags); xpt_setup_ccb(&cts.ccb_h, tmppath, 1); xpt_async(AC_TRANSFER_NEG, tmppath, &cts); xpt_free_path(tmppath); break; } case ISPASYNC_BUS_RESET: { va_start(ap, cmd); bus = va_arg(ap, int); va_end(ap); isp_prt(isp, ISP_LOGINFO, "SCSI bus reset on bus %d detected", bus); if (IS_FC(isp)) { xpt_async(AC_BUS_RESET, ISP_FC_PC(isp, bus)->path, NULL); } else { xpt_async(AC_BUS_RESET, ISP_SPI_PC(isp, bus)->path, NULL); } break; } case ISPASYNC_LOOP_RESET: { uint16_t lipp; fcparam *fcp; va_start(ap, cmd); bus = va_arg(ap, int); va_end(ap); lipp = ISP_READ(isp, OUTMAILBOX1); fcp = FCPARAM(isp, bus); isp_prt(isp, ISP_LOGINFO, "Chan %d LOOP Reset, LIP primitive %x", bus, lipp); /* * Per FCP-4, a Reset LIP should result in a CRN reset. Other * LIPs and loop up/down events should never reset the CRN. For * an as of yet unknown reason, 24xx series cards (and * potentially others) can interrupt with a LIP Reset status * when no LIP reset came down the wire. Additionally, the LIP * primitive accompanying this status would not be a valid LIP * Reset primitive, but some variation of an invalid AL_PA * LIP. As a result, we have to verify the AL_PD in the LIP * addresses our port before blindly resetting. */ if (FCP_IS_DEST_ALPD(fcp, (lipp & 0x00FF))) isp_fcp_reset_crn(isp, bus, /*tgt*/0, /*tgt_set*/ 0); isp_loop_changed(isp, bus); break; } case ISPASYNC_LIP: if (msg == NULL) msg = "LIP Received"; /* FALLTHROUGH */ case ISPASYNC_LOOP_DOWN: if (msg == NULL) msg = "LOOP Down"; /* FALLTHROUGH */ case ISPASYNC_LOOP_UP: if (msg == NULL) msg = "LOOP Up"; va_start(ap, cmd); bus = va_arg(ap, int); va_end(ap); isp_loop_changed(isp, bus); isp_prt(isp, ISP_LOGINFO, "Chan %d %s", bus, msg); break; case ISPASYNC_DEV_ARRIVED: va_start(ap, cmd); bus = va_arg(ap, int); lp = va_arg(ap, fcportdb_t *); va_end(ap); fc = ISP_FC_PC(isp, bus); tgt = FC_PORTDB_TGT(isp, bus, lp); isp_gen_role_str(buf, sizeof (buf), lp->prli_word3); isp_prt(isp, ISP_LOGCONFIG, prom, bus, tgt, lp->port_wwn, lp->portid, lp->handle, buf, "arrived"); if ((FCPARAM(isp, bus)->role & ISP_ROLE_INITIATOR) && (lp->prli_word3 & PRLI_WD3_TARGET_FUNCTION)) { lp->is_target = 1; isp_fcp_reset_crn(isp, bus, tgt, /*tgt_set*/ 1); isp_make_here(isp, lp, bus, tgt); } if ((FCPARAM(isp, bus)->role & ISP_ROLE_TARGET) && (lp->prli_word3 & PRLI_WD3_INITIATOR_FUNCTION)) { lp->is_initiator = 1; ac.contract_number = AC_CONTRACT_DEV_CHG; adc = (struct ac_device_changed *) ac.contract_data; adc->wwpn = lp->port_wwn; adc->port = lp->portid; adc->target = tgt; adc->arrived = 1; xpt_async(AC_CONTRACT, fc->path, &ac); } break; case ISPASYNC_DEV_CHANGED: case ISPASYNC_DEV_STAYED: va_start(ap, cmd); bus = va_arg(ap, int); lp = va_arg(ap, fcportdb_t *); va_end(ap); fc = ISP_FC_PC(isp, bus); tgt = FC_PORTDB_TGT(isp, bus, lp); isp_gen_role_str(buf, sizeof (buf), lp->new_prli_word3); if (cmd == ISPASYNC_DEV_CHANGED) isp_prt(isp, ISP_LOGCONFIG, prom, bus, tgt, lp->port_wwn, lp->new_portid, lp->handle, buf, "changed"); else isp_prt(isp, ISP_LOGCONFIG, prom, bus, tgt, lp->port_wwn, lp->portid, lp->handle, buf, "stayed"); if (lp->is_target != ((FCPARAM(isp, bus)->role & ISP_ROLE_INITIATOR) && (lp->new_prli_word3 & PRLI_WD3_TARGET_FUNCTION))) { lp->is_target = !lp->is_target; if (lp->is_target) { if (cmd == ISPASYNC_DEV_CHANGED) isp_fcp_reset_crn(isp, bus, tgt, /*tgt_set*/ 1); isp_make_here(isp, lp, bus, tgt); } else { isp_make_gone(isp, lp, bus, tgt); if (cmd == ISPASYNC_DEV_CHANGED) isp_fcp_reset_crn(isp, bus, tgt, /*tgt_set*/ 1); } } if (lp->is_initiator != ((FCPARAM(isp, bus)->role & ISP_ROLE_TARGET) && (lp->new_prli_word3 & PRLI_WD3_INITIATOR_FUNCTION))) { lp->is_initiator = !lp->is_initiator; ac.contract_number = AC_CONTRACT_DEV_CHG; adc = (struct ac_device_changed *) ac.contract_data; adc->wwpn = lp->port_wwn; adc->port = lp->portid; adc->target = tgt; adc->arrived = lp->is_initiator; xpt_async(AC_CONTRACT, fc->path, &ac); } break; case ISPASYNC_DEV_GONE: va_start(ap, cmd); bus = va_arg(ap, int); lp = va_arg(ap, fcportdb_t *); va_end(ap); fc = ISP_FC_PC(isp, bus); tgt = FC_PORTDB_TGT(isp, bus, lp); /* * If this has a virtual target or initiator set the isp_gdt * timer running on it to delay its departure. */ isp_gen_role_str(buf, sizeof (buf), lp->prli_word3); if (lp->is_target || lp->is_initiator) { lp->state = FC_PORTDB_STATE_ZOMBIE; lp->gone_timer = fc->gone_device_time; isp_prt(isp, ISP_LOGCONFIG, prom, bus, tgt, lp->port_wwn, lp->portid, lp->handle, buf, "gone zombie"); if (fc->ready && !callout_active(&fc->gdt)) { isp_prt(isp, ISP_LOG_SANCFG|ISP_LOGDEBUG0, "Chan %d Starting Gone Device Timer with %u seconds time now %lu", bus, lp->gone_timer, (unsigned long)time_uptime); callout_reset(&fc->gdt, hz, isp_gdt, fc); } break; } isp_prt(isp, ISP_LOGCONFIG, prom, bus, tgt, lp->port_wwn, lp->portid, lp->handle, buf, "gone"); break; case ISPASYNC_CHANGE_NOTIFY: { char *msg; int evt, nphdl, nlstate, portid, reason; va_start(ap, cmd); bus = va_arg(ap, int); evt = va_arg(ap, int); if (evt == ISPASYNC_CHANGE_PDB) { nphdl = va_arg(ap, int); nlstate = va_arg(ap, int); reason = va_arg(ap, int); } else if (evt == ISPASYNC_CHANGE_SNS) { portid = va_arg(ap, int); } else { nphdl = NIL_HANDLE; nlstate = reason = 0; } va_end(ap); if (evt == ISPASYNC_CHANGE_PDB) { int tgt_set = 0; msg = "Port Database Changed"; isp_prt(isp, ISP_LOGINFO, "Chan %d %s (nphdl 0x%x state 0x%x reason 0x%x)", bus, msg, nphdl, nlstate, reason); /* * Port database syncs are not sufficient for * determining that logins or logouts are done on the * loop, but this information is directly available from * the reason code from the incoming mbox. We must reset * the fcp crn on these events according to FCP-4 */ switch (reason) { case PDB24XX_AE_IMPL_LOGO_1: case PDB24XX_AE_IMPL_LOGO_2: case PDB24XX_AE_IMPL_LOGO_3: case PDB24XX_AE_PLOGI_RCVD: case PDB24XX_AE_PRLI_RCVD: case PDB24XX_AE_PRLO_RCVD: case PDB24XX_AE_LOGO_RCVD: case PDB24XX_AE_PLOGI_DONE: case PDB24XX_AE_PRLI_DONE: /* * If the event is not global, twiddle tgt and * tgt_set to nominate only the target * associated with the nphdl. */ if (nphdl != PDB24XX_AE_GLOBAL) { /* Break if we don't yet have the pdb */ if (!isp_find_pdb_by_handle(isp, bus, nphdl, &lp)) break; tgt = FC_PORTDB_TGT(isp, bus, lp); tgt_set = 1; } isp_fcp_reset_crn(isp, bus, tgt, tgt_set); break; default: break; /* NOP */ } } else if (evt == ISPASYNC_CHANGE_SNS) { msg = "Name Server Database Changed"; isp_prt(isp, ISP_LOGINFO, "Chan %d %s (PortID 0x%06x)", bus, msg, portid); } else { msg = "Other Change Notify"; isp_prt(isp, ISP_LOGINFO, "Chan %d %s", bus, msg); } isp_loop_changed(isp, bus); break; } #ifdef ISP_TARGET_MODE case ISPASYNC_TARGET_NOTIFY: { isp_notify_t *notify; va_start(ap, cmd); notify = va_arg(ap, isp_notify_t *); va_end(ap); switch (notify->nt_ncode) { case NT_ABORT_TASK: case NT_ABORT_TASK_SET: case NT_CLEAR_ACA: case NT_CLEAR_TASK_SET: case NT_LUN_RESET: case NT_TARGET_RESET: case NT_QUERY_TASK_SET: case NT_QUERY_ASYNC_EVENT: /* * These are task management functions. */ isp_handle_platform_target_tmf(isp, notify); break; case NT_BUS_RESET: case NT_LIP_RESET: case NT_LINK_UP: case NT_LINK_DOWN: case NT_HBA_RESET: /* * No action need be taken here. */ break; case NT_GLOBAL_LOGOUT: case NT_LOGOUT: /* * This is device arrival/departure notification */ isp_handle_platform_target_notify_ack(isp, notify, 0); break; case NT_SRR: isp_handle_platform_srr(isp, notify); break; default: isp_prt(isp, ISP_LOGALL, "target notify code 0x%x", notify->nt_ncode); isp_handle_platform_target_notify_ack(isp, notify, 0); break; } break; } case ISPASYNC_TARGET_NOTIFY_ACK: { void *inot; va_start(ap, cmd); inot = va_arg(ap, void *); va_end(ap); if (isp_notify_ack(isp, inot)) { isp_tna_t *tp = malloc(sizeof (*tp), M_DEVBUF, M_NOWAIT); if (tp) { tp->isp = isp; memcpy(tp->data, inot, sizeof (tp->data)); tp->not = tp->data; callout_init_mtx(&tp->timer, &isp->isp_lock, 0); callout_reset(&tp->timer, 5, isp_refire_notify_ack, tp); } else { isp_prt(isp, ISP_LOGERR, "you lose- cannot allocate a notify refire"); } } break; } case ISPASYNC_TARGET_ACTION: { isphdr_t *hp; va_start(ap, cmd); hp = va_arg(ap, isphdr_t *); va_end(ap); switch (hp->rqs_entry_type) { case RQSTYPE_ATIO: isp_handle_platform_atio7(isp, (at7_entry_t *) hp); break; case RQSTYPE_ATIO2: isp_handle_platform_atio2(isp, (at2_entry_t *) hp); break; case RQSTYPE_CTIO7: case RQSTYPE_CTIO3: case RQSTYPE_CTIO2: case RQSTYPE_CTIO: isp_handle_platform_ctio(isp, hp); break; default: isp_prt(isp, ISP_LOGWARN, "%s: unhandled target action 0x%x", __func__, hp->rqs_entry_type); break; } break; } #endif case ISPASYNC_FW_CRASH: { uint16_t mbox1, mbox6; mbox1 = ISP_READ(isp, OUTMAILBOX1); if (IS_DUALBUS(isp)) { mbox6 = ISP_READ(isp, OUTMAILBOX6); } else { mbox6 = 0; } isp_prt(isp, ISP_LOGERR, "Internal Firmware Error on bus %d @ RISC Address 0x%x", mbox6, mbox1); #if 0 mbox1 = isp->isp_osinfo.mbox_sleep_ok; isp->isp_osinfo.mbox_sleep_ok = 0; isp_reinit(isp, 1); isp->isp_osinfo.mbox_sleep_ok = mbox1; isp_async(isp, ISPASYNC_FW_RESTARTED, NULL); #endif break; } default: isp_prt(isp, ISP_LOGERR, "unknown isp_async event %d", cmd); break; } } uint64_t isp_default_wwn(ispsoftc_t * isp, int chan, int isactive, int iswwnn) { uint64_t seed; struct isp_fc *fc = ISP_FC_PC(isp, chan); /* First try to use explicitly configured WWNs. */ seed = iswwnn ? fc->def_wwnn : fc->def_wwpn; if (seed) return (seed); /* Otherwise try to use WWNs from NVRAM. */ if (isactive) { seed = iswwnn ? FCPARAM(isp, chan)->isp_wwnn_nvram : FCPARAM(isp, chan)->isp_wwpn_nvram; if (seed) return (seed); } /* If still no WWNs, try to steal them from the first channel. */ if (chan > 0) { seed = iswwnn ? ISP_FC_PC(isp, 0)->def_wwnn : ISP_FC_PC(isp, 0)->def_wwpn; if (seed == 0) { seed = iswwnn ? FCPARAM(isp, 0)->isp_wwnn_nvram : FCPARAM(isp, 0)->isp_wwpn_nvram; } } /* If still nothing -- improvise. */ if (seed == 0) { seed = 0x400000007F000000ull + device_get_unit(isp->isp_dev); if (!iswwnn) seed ^= 0x0100000000000000ULL; } /* For additional channels we have to improvise even more. */ if (!iswwnn && chan > 0) { /* * We'll stick our channel number plus one first into bits * 57..59 and thence into bits 52..55 which allows for 8 bits * of channel which is enough for our maximum of 255 channels. */ seed ^= 0x0100000000000000ULL; seed ^= ((uint64_t) (chan + 1) & 0xf) << 56; seed ^= ((uint64_t) ((chan + 1) >> 4) & 0xf) << 52; } return (seed); } void isp_prt(ispsoftc_t *isp, int level, const char *fmt, ...) { int loc; char lbuf[200]; va_list ap; if (level != ISP_LOGALL && (level & isp->isp_dblev) == 0) { return; } snprintf(lbuf, sizeof (lbuf), "%s: ", device_get_nameunit(isp->isp_dev)); loc = strlen(lbuf); va_start(ap, fmt); vsnprintf(&lbuf[loc], sizeof (lbuf) - loc - 1, fmt, ap); va_end(ap); printf("%s\n", lbuf); } void isp_xs_prt(ispsoftc_t *isp, XS_T *xs, int level, const char *fmt, ...) { va_list ap; if (level != ISP_LOGALL && (level & isp->isp_dblev) == 0) { return; } xpt_print_path(xs->ccb_h.path); va_start(ap, fmt); vprintf(fmt, ap); va_end(ap); printf("\n"); } uint64_t isp_nanotime_sub(struct timespec *b, struct timespec *a) { uint64_t elapsed; - struct timespec x = *b; - timespecsub(&x, a); + struct timespec x; + + timespecsub(b, a, &x); elapsed = GET_NANOSEC(&x); if (elapsed == 0) elapsed++; return (elapsed); } int isp_mbox_acquire(ispsoftc_t *isp) { if (isp->isp_osinfo.mboxbsy) { return (1); } else { isp->isp_osinfo.mboxcmd_done = 0; isp->isp_osinfo.mboxbsy = 1; return (0); } } void isp_mbox_wait_complete(ispsoftc_t *isp, mbreg_t *mbp) { u_int t, to; to = (mbp->timeout == 0) ? MBCMD_DEFAULT_TIMEOUT : mbp->timeout; if (isp->isp_osinfo.mbox_sleep_ok) { isp->isp_osinfo.mbox_sleep_ok = 0; isp->isp_osinfo.mbox_sleeping = 1; msleep_sbt(&isp->isp_osinfo.mboxcmd_done, &isp->isp_lock, PRIBIO, "ispmbx_sleep", to * SBT_1US, 0, 0); isp->isp_osinfo.mbox_sleep_ok = 1; isp->isp_osinfo.mbox_sleeping = 0; } else { for (t = 0; t < to; t += 100) { if (isp->isp_osinfo.mboxcmd_done) break; ISP_RUN_ISR(isp); if (isp->isp_osinfo.mboxcmd_done) break; ISP_DELAY(100); } } if (isp->isp_osinfo.mboxcmd_done == 0) { isp_prt(isp, ISP_LOGWARN, "%s Mailbox Command (0x%x) Timeout (%uus) (%s:%d)", isp->isp_osinfo.mbox_sleep_ok? "Interrupting" : "Polled", isp->isp_lastmbxcmd, to, mbp->func, mbp->lineno); mbp->param[0] = MBOX_TIMEOUT; isp->isp_osinfo.mboxcmd_done = 1; } } void isp_mbox_notify_done(ispsoftc_t *isp) { isp->isp_osinfo.mboxcmd_done = 1; if (isp->isp_osinfo.mbox_sleeping) wakeup(&isp->isp_osinfo.mboxcmd_done); } void isp_mbox_release(ispsoftc_t *isp) { isp->isp_osinfo.mboxbsy = 0; } int isp_fc_scratch_acquire(ispsoftc_t *isp, int chan) { int ret = 0; if (isp->isp_osinfo.pc.fc[chan].fcbsy) { ret = -1; } else { isp->isp_osinfo.pc.fc[chan].fcbsy = 1; } return (ret); } void isp_platform_intr(void *arg) { ispsoftc_t *isp = arg; ISP_LOCK(isp); ISP_RUN_ISR(isp); ISP_UNLOCK(isp); } void isp_platform_intr_resp(void *arg) { ispsoftc_t *isp = arg; ISP_LOCK(isp); isp_intr_respq(isp); ISP_UNLOCK(isp); /* We have handshake enabled, so explicitly complete interrupt */ ISP_WRITE(isp, BIU2400_HCCR, HCCR_2400_CMD_CLEAR_RISC_INT); } void isp_platform_intr_atio(void *arg) { ispsoftc_t *isp = arg; ISP_LOCK(isp); #ifdef ISP_TARGET_MODE isp_intr_atioq(isp); #endif ISP_UNLOCK(isp); /* We have handshake enabled, so explicitly complete interrupt */ ISP_WRITE(isp, BIU2400_HCCR, HCCR_2400_CMD_CLEAR_RISC_INT); } void isp_common_dmateardown(ispsoftc_t *isp, struct ccb_scsiio *csio, uint32_t hdl) { if ((csio->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_IN) { bus_dmamap_sync(isp->isp_osinfo.dmat, PISP_PCMD(csio)->dmap, BUS_DMASYNC_POSTREAD); } else { bus_dmamap_sync(isp->isp_osinfo.dmat, PISP_PCMD(csio)->dmap, BUS_DMASYNC_POSTWRITE); } bus_dmamap_unload(isp->isp_osinfo.dmat, PISP_PCMD(csio)->dmap); } /* * Reset the command reference number for all LUNs on a specific target * (needed when a target arrives again) or for all targets on a port * (needed for events like a LIP). */ void isp_fcp_reset_crn(ispsoftc_t *isp, int chan, uint32_t tgt, int tgt_set) { struct isp_fc *fc = ISP_FC_PC(isp, chan); struct isp_nexus *nxp; int i; if (tgt_set == 0) isp_prt(isp, ISP_LOGDEBUG0, "Chan %d resetting CRN on all targets", chan); else isp_prt(isp, ISP_LOGDEBUG0, "Chan %d resetting CRN on target %u", chan, tgt); for (i = 0; i < NEXUS_HASH_WIDTH; i++) { for (nxp = fc->nexus_hash[i]; nxp != NULL; nxp = nxp->next) { if (tgt_set == 0 || tgt == nxp->tgt) nxp->crnseed = 0; } } } int isp_fcp_next_crn(ispsoftc_t *isp, uint8_t *crnp, XS_T *cmd) { lun_id_t lun; uint32_t chan, tgt; struct isp_fc *fc; struct isp_nexus *nxp; int idx; if (IS_2100(isp)) return (0); chan = XS_CHANNEL(cmd); tgt = XS_TGT(cmd); lun = XS_LUN(cmd); fc = &isp->isp_osinfo.pc.fc[chan]; idx = NEXUS_HASH(tgt, lun); nxp = fc->nexus_hash[idx]; while (nxp) { if (nxp->tgt == tgt && nxp->lun == lun) break; nxp = nxp->next; } if (nxp == NULL) { nxp = fc->nexus_free_list; if (nxp == NULL) { nxp = malloc(sizeof (struct isp_nexus), M_DEVBUF, M_ZERO|M_NOWAIT); if (nxp == NULL) { return (-1); } } else { fc->nexus_free_list = nxp->next; } nxp->tgt = tgt; nxp->lun = lun; nxp->next = fc->nexus_hash[idx]; fc->nexus_hash[idx] = nxp; } if (nxp->crnseed == 0) nxp->crnseed = 1; *crnp = nxp->crnseed++; return (0); } /* * We enter with the lock held */ void isp_timer(void *arg) { ispsoftc_t *isp = arg; #ifdef ISP_TARGET_MODE isp_tmcmd_restart(isp); #endif callout_reset(&isp->isp_osinfo.tmo, isp_timer_count, isp_timer, isp); } isp_ecmd_t * isp_get_ecmd(ispsoftc_t *isp) { isp_ecmd_t *ecmd = isp->isp_osinfo.ecmd_free; if (ecmd) { isp->isp_osinfo.ecmd_free = ecmd->next; } return (ecmd); } void isp_put_ecmd(ispsoftc_t *isp, isp_ecmd_t *ecmd) { ecmd->next = isp->isp_osinfo.ecmd_free; isp->isp_osinfo.ecmd_free = ecmd; } Index: head/sys/dev/joy/joy.c =================================================================== --- head/sys/dev/joy/joy.c (revision 336913) +++ head/sys/dev/joy/joy.c (revision 336914) @@ -1,251 +1,251 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1995 Jean-Marc Zucconi * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include /* The game port can manage 4 buttons and 4 variable resistors (usually 2 * joysticks, each with 2 buttons and 2 pots.) via the port at address 0x201. * Getting the state of the buttons is done by reading the game port: * buttons 1-4 correspond to bits 4-7 and resistors 1-4 (X1, Y1, X2, Y2) * to bits 0-3. * if button 1 (resp 2, 3, 4) is pressed, the bit 4 (resp 5, 6, 7) is set to 0 * to get the value of a resistor, write the value 0xff at port and * wait until the corresponding bit returns to 0. */ #define joypart(d) (dev2unit(d)&1) #ifndef JOY_TIMEOUT #define JOY_TIMEOUT 2000 /* 2 milliseconds */ #endif static d_open_t joyopen; static d_close_t joyclose; static d_read_t joyread; static d_ioctl_t joyioctl; static struct cdevsw joy_cdevsw = { .d_version = D_VERSION, .d_flags = D_NEEDGIANT, .d_open = joyopen, .d_close = joyclose, .d_read = joyread, .d_ioctl = joyioctl, .d_name = "joy", }; devclass_t joy_devclass; int joy_probe(device_t dev) { #ifdef WANT_JOYSTICK_CONNECTED #ifdef notyet outb(dev->id_iobase, 0xff); DELAY(10000); /* 10 ms delay */ return (inb(dev->id_iobase) & 0x0f) != 0x0f; #else return (0); #endif #else return (0); #endif } int joy_attach(device_t dev) { int unit = device_get_unit(dev); struct joy_softc *joy = device_get_softc(dev); joy->rid = 0; joy->res = bus_alloc_resource_any(dev, SYS_RES_IOPORT, &joy->rid, RF_ACTIVE|RF_SHAREABLE); if (joy->res == NULL) return ENXIO; joy->bt = rman_get_bustag(joy->res); joy->port = rman_get_bushandle(joy->res); joy->timeout[0] = joy->timeout[1] = 0; joy->d = make_dev(&joy_cdevsw, unit, 0, 0, 0600, "joy%d", unit); joy->d->si_drv1 = joy; gone_in_dev(dev, 12, "joy(4) driver"); return (0); } int joy_detach(device_t dev) { struct joy_softc *joy = device_get_softc(dev); if (joy->res != NULL) bus_release_resource(dev, SYS_RES_IOPORT, joy->rid, joy->res); if (joy->d) destroy_dev(joy->d); return (0); } static int joyopen(struct cdev *dev, int flags, int fmt, struct thread *td) { int i = joypart (dev); struct joy_softc *joy = dev->si_drv1; if (joy->timeout[i]) return (EBUSY); joy->x_off[i] = joy->y_off[i] = 0; joy->timeout[i] = JOY_TIMEOUT; return (0); } static int joyclose(struct cdev *dev, int flags, int fmt, struct thread *td) { int i = joypart (dev); struct joy_softc *joy = dev->si_drv1; joy->timeout[i] = 0; return (0); } static int joyread(struct cdev *dev, struct uio *uio, int flag) { struct joy_softc *joy = dev->si_drv1; bus_space_handle_t port = joy->port; bus_space_tag_t bt = joy->bt; struct timespec t, start, end; int state = 0; struct timespec x, y; struct joystick c; #ifndef __i386__ int s; s = splhigh(); #else disable_intr (); #endif nanotime(&t); end.tv_sec = 0; end.tv_nsec = joy->timeout[joypart(dev)] * 1000; - timespecadd(&end, &t); + timespecadd(&end, &t, &end); for (; timespeccmp(&t, &end, <) && (bus_space_read_1(bt, port, 0) & 0x0f); nanotime(&t)) ; /* nothing */ bus_space_write_1 (bt, port, 0, 0xff); nanotime(&start); end.tv_sec = 0; end.tv_nsec = joy->timeout[joypart(dev)] * 1000; - timespecadd(&end, &start); + timespecadd(&end, &start, &end); t = start; timespecclear(&x); timespecclear(&y); while (timespeccmp(&t, &end, <)) { state = bus_space_read_1 (bt, port, 0); if (joypart(dev) == 1) state >>= 2; nanotime(&t); if (!timespecisset(&x) && !(state & 0x01)) x = t; if (!timespecisset(&y) && !(state & 0x02)) y = t; if (timespecisset(&x) && timespecisset(&y)) break; } #ifndef __i386__ splx(s); #else enable_intr (); #endif if (timespecisset(&x)) { - timespecsub(&x, &start); + timespecsub(&x, &start, &x); c.x = joy->x_off[joypart(dev)] + x.tv_nsec / 1000; } else c.x = 0x80000000; if (timespecisset(&y)) { - timespecsub(&y, &start); + timespecsub(&y, &start, &y); c.y = joy->y_off[joypart(dev)] + y.tv_nsec / 1000; } else c.y = 0x80000000; state >>= 4; c.b1 = ~state & 1; c.b2 = ~(state >> 1) & 1; return (uiomove((caddr_t)&c, sizeof(struct joystick), uio)); } static int joyioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) { struct joy_softc *joy = dev->si_drv1; int i = joypart (dev); int x; switch (cmd) { case JOY_SETTIMEOUT: x = *(int *) data; if (x < 1 || x > 10000) /* 10ms maximum! */ return EINVAL; joy->timeout[i] = x; break; case JOY_GETTIMEOUT: *(int *) data = joy->timeout[i]; break; case JOY_SET_X_OFFSET: joy->x_off[i] = *(int *) data; break; case JOY_SET_Y_OFFSET: joy->y_off[i] = *(int *) data; break; case JOY_GET_X_OFFSET: *(int *) data = joy->x_off[i]; break; case JOY_GET_Y_OFFSET: *(int *) data = joy->y_off[i]; break; default: return (ENOTTY); } return (0); } Index: head/sys/dev/xen/timer/timer.c =================================================================== --- head/sys/dev/xen/timer/timer.c (revision 336913) +++ head/sys/dev/xen/timer/timer.c (revision 336914) @@ -1,559 +1,559 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2009 Adrian Chadd * Copyright (c) 2012 Spectra Logic Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /** * \file dev/xen/timer/timer.c * \brief A timer driver for the Xen hypervisor's PV clock. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "clock_if.h" static devclass_t xentimer_devclass; #define NSEC_IN_SEC 1000000000ULL #define NSEC_IN_USEC 1000ULL /* 18446744073 = int(2^64 / NSEC_IN_SC) = 1 ns in 64-bit fractions */ #define FRAC_IN_NSEC 18446744073LL /* Xen timers may fire up to 100us off */ #define XENTIMER_MIN_PERIOD_IN_NSEC 100*NSEC_IN_USEC /* * The real resolution of the PV clock is 1ns, but the highest * resolution that FreeBSD supports is 1us, so just use that. */ #define XENCLOCK_RESOLUTION 1 #define XENTIMER_QUALITY 950 struct xentimer_pcpu_data { uint64_t timer; uint64_t last_processed; void *irq_handle; }; DPCPU_DEFINE(struct xentimer_pcpu_data, xentimer_pcpu); DPCPU_DECLARE(struct vcpu_info *, vcpu_info); struct xentimer_softc { device_t dev; struct timecounter tc; struct eventtimer et; }; static void xentimer_identify(driver_t *driver, device_t parent) { if (!xen_domain()) return; /* Handle all Xen PV timers in one device instance. */ if (devclass_get_device(xentimer_devclass, 0)) return; BUS_ADD_CHILD(parent, 0, "xen_et", 0); } static int xentimer_probe(device_t dev) { KASSERT((xen_domain()), ("Trying to use Xen timer on bare metal")); /* * In order to attach, this driver requires the following: * - Vector callback support by the hypervisor, in order to deliver * timer interrupts to the correct CPU for CPUs other than 0. * - Access to the hypervisor shared info page, in order to look up * each VCPU's timer information and the Xen wallclock time. * - The hypervisor must say its PV clock is "safe" to use. * - The hypervisor must support VCPUOP hypercalls. * - The maximum number of CPUs supported by FreeBSD must not exceed * the number of VCPUs supported by the hypervisor. */ #define XTREQUIRES(condition, reason...) \ if (!(condition)) { \ device_printf(dev, ## reason); \ device_detach(dev); \ return (ENXIO); \ } if (xen_hvm_domain()) { XTREQUIRES(xen_vector_callback_enabled, "vector callbacks unavailable\n"); XTREQUIRES(xen_feature(XENFEAT_hvm_safe_pvclock), "HVM safe pvclock unavailable\n"); } XTREQUIRES(HYPERVISOR_shared_info != NULL, "shared info page unavailable\n"); XTREQUIRES(HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, 0, NULL) == 0, "VCPUOPs interface unavailable\n"); #undef XTREQUIRES device_set_desc(dev, "Xen PV Clock"); return (BUS_PROBE_NOWILDCARD); } /** * \brief Get the current time, in nanoseconds, since the hypervisor booted. * * \param vcpu vcpu_info structure to fetch the time from. * */ static uint64_t xen_fetch_vcpu_time(struct vcpu_info *vcpu) { struct pvclock_vcpu_time_info *time; time = (struct pvclock_vcpu_time_info *) &vcpu->time; return (pvclock_get_timecount(time)); } static uint32_t xentimer_get_timecount(struct timecounter *tc) { uint64_t vcpu_time; /* * We don't disable preemption here because the worst that can * happen is reading the vcpu_info area of a different CPU than * the one we are currently running on, but that would also * return a valid tc (and we avoid the overhead of * critical_{enter/exit} calls). */ vcpu_time = xen_fetch_vcpu_time(DPCPU_GET(vcpu_info)); return (vcpu_time & UINT32_MAX); } /** * \brief Fetch the hypervisor boot time, known as the "Xen wallclock". * * \param ts Timespec to store the current stable value. * \param version Pointer to store the corresponding wallclock version. * * \note This value is updated when Domain-0 shifts its clock to follow * clock drift, e.g. as detected by NTP. */ static void xen_fetch_wallclock(struct timespec *ts) { shared_info_t *src = HYPERVISOR_shared_info; struct pvclock_wall_clock *wc; wc = (struct pvclock_wall_clock *) &src->wc_version; pvclock_get_wallclock(wc, ts); } static void xen_fetch_uptime(struct timespec *ts) { uint64_t uptime; uptime = xen_fetch_vcpu_time(DPCPU_GET(vcpu_info)); ts->tv_sec = uptime / NSEC_IN_SEC; ts->tv_nsec = uptime % NSEC_IN_SEC; } static int xentimer_settime(device_t dev __unused, struct timespec *ts) { struct xen_platform_op settime; int ret; /* * Don't return EINVAL here; just silently fail if the domain isn't * privileged enough to set the TOD. */ if (!xen_initial_domain()) return (0); settime.cmd = XENPF_settime64; settime.u.settime64.mbz = 0; settime.u.settime64.secs = ts->tv_sec; settime.u.settime64.nsecs = ts->tv_nsec; settime.u.settime64.system_time = xen_fetch_vcpu_time(DPCPU_GET(vcpu_info)); ret = HYPERVISOR_platform_op(&settime); ret = ret != 0 ? xen_translate_error(ret) : 0; if (ret != 0 && bootverbose) device_printf(dev, "failed to set Xen PV clock: %d\n", ret); return (ret); } /** * \brief Return current time according to the Xen Hypervisor wallclock. * * \param dev Xentimer device. * \param ts Pointer to store the wallclock time. * * \note The Xen time structures document the hypervisor start time and the * uptime-since-hypervisor-start (in nsec.) They need to be combined * in order to calculate a TOD clock. */ static int xentimer_gettime(device_t dev, struct timespec *ts) { struct timespec u_ts; timespecclear(ts); xen_fetch_wallclock(ts); xen_fetch_uptime(&u_ts); - timespecadd(ts, &u_ts); + timespecadd(ts, &u_ts, ts); return (0); } /** * \brief Handle a timer interrupt for the Xen PV timer driver. * * \param arg Xen timer driver softc that is expecting the interrupt. */ static int xentimer_intr(void *arg) { struct xentimer_softc *sc = (struct xentimer_softc *)arg; struct xentimer_pcpu_data *pcpu = DPCPU_PTR(xentimer_pcpu); pcpu->last_processed = xen_fetch_vcpu_time(DPCPU_GET(vcpu_info)); if (pcpu->timer != 0 && sc->et.et_active) sc->et.et_event_cb(&sc->et, sc->et.et_arg); return (FILTER_HANDLED); } static int xentimer_vcpu_start_timer(int vcpu, uint64_t next_time) { struct vcpu_set_singleshot_timer single; single.timeout_abs_ns = next_time; /* Get an event anyway, even if the timeout is already expired */ single.flags = 0; return (HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, vcpu, &single)); } static int xentimer_vcpu_stop_timer(int vcpu) { return (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, vcpu, NULL)); } /** * \brief Set the next oneshot time for the current CPU. * * \param et Xen timer driver event timer to schedule on. * \param first Delta to the next time to schedule the interrupt for. * \param period Not used. * * \note See eventtimers(9) for more information. * \note * * \returns 0 */ static int xentimer_et_start(struct eventtimer *et, sbintime_t first, sbintime_t period) { int error; struct xentimer_softc *sc = et->et_priv; int cpu = PCPU_GET(vcpu_id); struct xentimer_pcpu_data *pcpu = DPCPU_PTR(xentimer_pcpu); struct vcpu_info *vcpu = DPCPU_GET(vcpu_info); uint64_t first_in_ns, next_time; #ifdef INVARIANTS struct thread *td = curthread; #endif KASSERT(td->td_critnest != 0, ("xentimer_et_start called without preemption disabled")); /* See sbttots() for this formula. */ first_in_ns = (((first >> 32) * NSEC_IN_SEC) + (((uint64_t)NSEC_IN_SEC * (uint32_t)first) >> 32)); next_time = xen_fetch_vcpu_time(vcpu) + first_in_ns; error = xentimer_vcpu_start_timer(cpu, next_time); if (error) panic("%s: Error %d setting singleshot timer to %"PRIu64"\n", device_get_nameunit(sc->dev), error, next_time); pcpu->timer = next_time; return (error); } /** * \brief Cancel the event timer's currently running timer, if any. */ static int xentimer_et_stop(struct eventtimer *et) { int cpu = PCPU_GET(vcpu_id); struct xentimer_pcpu_data *pcpu = DPCPU_PTR(xentimer_pcpu); pcpu->timer = 0; return (xentimer_vcpu_stop_timer(cpu)); } /** * \brief Attach a Xen PV timer driver instance. * * \param dev Bus device object to attach. * * \note * \returns EINVAL */ static int xentimer_attach(device_t dev) { struct xentimer_softc *sc = device_get_softc(dev); int error, i; sc->dev = dev; /* Bind an event channel to a VIRQ on each VCPU. */ CPU_FOREACH(i) { struct xentimer_pcpu_data *pcpu; pcpu = DPCPU_ID_PTR(i, xentimer_pcpu); error = HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, i, NULL); if (error) { device_printf(dev, "Error disabling Xen periodic timer " "on CPU %d\n", i); return (error); } error = xen_intr_bind_virq(dev, VIRQ_TIMER, i, xentimer_intr, NULL, sc, INTR_TYPE_CLK, &pcpu->irq_handle); if (error) { device_printf(dev, "Error %d binding VIRQ_TIMER " "to VCPU %d\n", error, i); return (error); } xen_intr_describe(pcpu->irq_handle, "c%d", i); } /* Register the event timer. */ sc->et.et_name = "XENTIMER"; sc->et.et_quality = XENTIMER_QUALITY; sc->et.et_flags = ET_FLAGS_ONESHOT | ET_FLAGS_PERCPU; sc->et.et_frequency = NSEC_IN_SEC; /* See tstosbt() for this formula */ sc->et.et_min_period = (XENTIMER_MIN_PERIOD_IN_NSEC * (((uint64_t)1 << 63) / 500000000) >> 32); sc->et.et_max_period = ((sbintime_t)4 << 32); sc->et.et_start = xentimer_et_start; sc->et.et_stop = xentimer_et_stop; sc->et.et_priv = sc; et_register(&sc->et); /* Register the timecounter. */ sc->tc.tc_name = "XENTIMER"; sc->tc.tc_quality = XENTIMER_QUALITY; /* * FIXME: due to the lack of ordering during resume, FreeBSD cannot * guarantee that the Xen PV timer is resumed before any other device * attempts to make use of it, so mark it as not safe for suspension * (ie: remove the TC_FLAGS_SUSPEND_SAFE flag). * * NB: This was not a problem in previous FreeBSD versions because the * timer was directly attached to the nexus, but it is an issue now * that the timer is attached to the xenpv bus, and thus resumed * later. * * sc->tc.tc_flags = TC_FLAGS_SUSPEND_SAFE; */ /* * The underlying resolution is in nanoseconds, since the timer info * scales TSC frequencies using a fraction that represents time in * terms of nanoseconds. */ sc->tc.tc_frequency = NSEC_IN_SEC; sc->tc.tc_counter_mask = ~0u; sc->tc.tc_get_timecount = xentimer_get_timecount; sc->tc.tc_priv = sc; tc_init(&sc->tc); /* Register the Hypervisor wall clock */ clock_register(dev, XENCLOCK_RESOLUTION); return (0); } static int xentimer_detach(device_t dev) { /* Implement Xen PV clock teardown - XXX see hpet_detach ? */ /* If possible: * 1. need to deregister timecounter * 2. need to deregister event timer * 3. need to deregister virtual IRQ event channels */ return (EBUSY); } static void xentimer_percpu_resume(void *arg) { device_t dev = (device_t) arg; struct xentimer_softc *sc = device_get_softc(dev); xentimer_et_start(&sc->et, sc->et.et_min_period, 0); } static int xentimer_resume(device_t dev) { int error; int i; /* Disable the periodic timer */ CPU_FOREACH(i) { error = HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, i, NULL); if (error != 0) { device_printf(dev, "Error disabling Xen periodic timer on CPU %d\n", i); return (error); } } /* Reset the last uptime value */ pvclock_resume(); /* Reset the RTC clock */ inittodr(time_second); /* Kick the timers on all CPUs */ smp_rendezvous(NULL, xentimer_percpu_resume, NULL, dev); if (bootverbose) device_printf(dev, "resumed operation after suspension\n"); return (0); } static int xentimer_suspend(device_t dev) { return (0); } /* * Xen early clock init */ void xen_clock_init(void) { } /* * Xen PV DELAY function * * When running on PVH mode we don't have an emulated i8524, so * make use of the Xen time info in order to code a simple DELAY * function that can be used during early boot. */ void xen_delay(int n) { struct vcpu_info *vcpu = &HYPERVISOR_shared_info->vcpu_info[0]; uint64_t end_ns; uint64_t current; end_ns = xen_fetch_vcpu_time(vcpu); end_ns += n * NSEC_IN_USEC; for (;;) { current = xen_fetch_vcpu_time(vcpu); if (current >= end_ns) break; } } static device_method_t xentimer_methods[] = { DEVMETHOD(device_identify, xentimer_identify), DEVMETHOD(device_probe, xentimer_probe), DEVMETHOD(device_attach, xentimer_attach), DEVMETHOD(device_detach, xentimer_detach), DEVMETHOD(device_suspend, xentimer_suspend), DEVMETHOD(device_resume, xentimer_resume), /* clock interface */ DEVMETHOD(clock_gettime, xentimer_gettime), DEVMETHOD(clock_settime, xentimer_settime), DEVMETHOD_END }; static driver_t xentimer_driver = { "xen_et", xentimer_methods, sizeof(struct xentimer_softc), }; DRIVER_MODULE(xentimer, xenpv, xentimer_driver, xentimer_devclass, 0, 0); MODULE_DEPEND(xentimer, xenpv, 1, 1, 1); Index: head/sys/kern/kern_sig.c =================================================================== --- head/sys/kern/kern_sig.c (revision 336913) +++ head/sys/kern/kern_sig.c (revision 336914) @@ -1,3816 +1,3814 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define ONSIG 32 /* NSIG for osig* syscalls. XXX. */ SDT_PROVIDER_DECLARE(proc); SDT_PROBE_DEFINE3(proc, , , signal__send, "struct thread *", "struct proc *", "int"); SDT_PROBE_DEFINE2(proc, , , signal__clear, "int", "ksiginfo_t *"); SDT_PROBE_DEFINE3(proc, , , signal__discard, "struct thread *", "struct proc *", "int"); static int coredump(struct thread *); static int killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi); static int issignal(struct thread *td); static int sigprop(int sig); static void tdsigwakeup(struct thread *, int, sig_t, int); static int sig_suspend_threads(struct thread *, struct proc *, int); static int filt_sigattach(struct knote *kn); static void filt_sigdetach(struct knote *kn); static int filt_signal(struct knote *kn, long hint); static struct thread *sigtd(struct proc *p, int sig, int prop); static void sigqueue_start(void); static uma_zone_t ksiginfo_zone = NULL; struct filterops sig_filtops = { .f_isfd = 0, .f_attach = filt_sigattach, .f_detach = filt_sigdetach, .f_event = filt_signal, }; static int kern_logsigexit = 1; SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, &kern_logsigexit, 0, "Log processes quitting on abnormal signals to syslog(3)"); static int kern_forcesigexit = 1; SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW, &kern_forcesigexit, 0, "Force trap signal to be handled"); static SYSCTL_NODE(_kern, OID_AUTO, sigqueue, CTLFLAG_RW, 0, "POSIX real time signal"); static int max_pending_per_proc = 128; SYSCTL_INT(_kern_sigqueue, OID_AUTO, max_pending_per_proc, CTLFLAG_RW, &max_pending_per_proc, 0, "Max pending signals per proc"); static int preallocate_siginfo = 1024; SYSCTL_INT(_kern_sigqueue, OID_AUTO, preallocate, CTLFLAG_RDTUN, &preallocate_siginfo, 0, "Preallocated signal memory size"); static int signal_overflow = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, overflow, CTLFLAG_RD, &signal_overflow, 0, "Number of signals overflew"); static int signal_alloc_fail = 0; SYSCTL_INT(_kern_sigqueue, OID_AUTO, alloc_fail, CTLFLAG_RD, &signal_alloc_fail, 0, "signals failed to be allocated"); static int kern_lognosys = 0; SYSCTL_INT(_kern, OID_AUTO, lognosys, CTLFLAG_RWTUN, &kern_lognosys, 0, "Log invalid syscalls"); SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL); /* * Policy -- Can ucred cr1 send SIGIO to process cr2? * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG * in the right situations. */ #define CANSIGIO(cr1, cr2) \ ((cr1)->cr_uid == 0 || \ (cr1)->cr_ruid == (cr2)->cr_ruid || \ (cr1)->cr_uid == (cr2)->cr_ruid || \ (cr1)->cr_ruid == (cr2)->cr_uid || \ (cr1)->cr_uid == (cr2)->cr_uid) static int sugid_coredump; SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RWTUN, &sugid_coredump, 0, "Allow setuid and setgid processes to dump core"); static int capmode_coredump; SYSCTL_INT(_kern, OID_AUTO, capmode_coredump, CTLFLAG_RWTUN, &capmode_coredump, 0, "Allow processes in capability mode to dump core"); static int do_coredump = 1; SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW, &do_coredump, 0, "Enable/Disable coredumps"); static int set_core_nodump_flag = 0; SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag, 0, "Enable setting the NODUMP flag on coredump files"); static int coredump_devctl = 0; SYSCTL_INT(_kern, OID_AUTO, coredump_devctl, CTLFLAG_RW, &coredump_devctl, 0, "Generate a devctl notification when processes coredump"); /* * Signal properties and actions. * The array below categorizes the signals and their default actions * according to the following properties: */ #define SIGPROP_KILL 0x01 /* terminates process by default */ #define SIGPROP_CORE 0x02 /* ditto and coredumps */ #define SIGPROP_STOP 0x04 /* suspend process */ #define SIGPROP_TTYSTOP 0x08 /* ditto, from tty */ #define SIGPROP_IGNORE 0x10 /* ignore by default */ #define SIGPROP_CONT 0x20 /* continue if suspended */ #define SIGPROP_CANTMASK 0x40 /* non-maskable, catchable */ static int sigproptbl[NSIG] = { [SIGHUP] = SIGPROP_KILL, [SIGINT] = SIGPROP_KILL, [SIGQUIT] = SIGPROP_KILL | SIGPROP_CORE, [SIGILL] = SIGPROP_KILL | SIGPROP_CORE, [SIGTRAP] = SIGPROP_KILL | SIGPROP_CORE, [SIGABRT] = SIGPROP_KILL | SIGPROP_CORE, [SIGEMT] = SIGPROP_KILL | SIGPROP_CORE, [SIGFPE] = SIGPROP_KILL | SIGPROP_CORE, [SIGKILL] = SIGPROP_KILL, [SIGBUS] = SIGPROP_KILL | SIGPROP_CORE, [SIGSEGV] = SIGPROP_KILL | SIGPROP_CORE, [SIGSYS] = SIGPROP_KILL | SIGPROP_CORE, [SIGPIPE] = SIGPROP_KILL, [SIGALRM] = SIGPROP_KILL, [SIGTERM] = SIGPROP_KILL, [SIGURG] = SIGPROP_IGNORE, [SIGSTOP] = SIGPROP_STOP, [SIGTSTP] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGCONT] = SIGPROP_IGNORE | SIGPROP_CONT, [SIGCHLD] = SIGPROP_IGNORE, [SIGTTIN] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGTTOU] = SIGPROP_STOP | SIGPROP_TTYSTOP, [SIGIO] = SIGPROP_IGNORE, [SIGXCPU] = SIGPROP_KILL, [SIGXFSZ] = SIGPROP_KILL, [SIGVTALRM] = SIGPROP_KILL, [SIGPROF] = SIGPROP_KILL, [SIGWINCH] = SIGPROP_IGNORE, [SIGINFO] = SIGPROP_IGNORE, [SIGUSR1] = SIGPROP_KILL, [SIGUSR2] = SIGPROP_KILL, }; static void reschedule_signals(struct proc *p, sigset_t block, int flags); static void sigqueue_start(void) { ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_prealloc(ksiginfo_zone, preallocate_siginfo); p31b_setcfg(CTL_P1003_1B_REALTIME_SIGNALS, _POSIX_REALTIME_SIGNALS); p31b_setcfg(CTL_P1003_1B_RTSIG_MAX, SIGRTMAX - SIGRTMIN + 1); p31b_setcfg(CTL_P1003_1B_SIGQUEUE_MAX, max_pending_per_proc); } ksiginfo_t * ksiginfo_alloc(int wait) { int flags; flags = M_ZERO; if (! wait) flags |= M_NOWAIT; if (ksiginfo_zone != NULL) return ((ksiginfo_t *)uma_zalloc(ksiginfo_zone, flags)); return (NULL); } void ksiginfo_free(ksiginfo_t *ksi) { uma_zfree(ksiginfo_zone, ksi); } static __inline int ksiginfo_tryfree(ksiginfo_t *ksi) { if (!(ksi->ksi_flags & KSI_EXT)) { uma_zfree(ksiginfo_zone, ksi); return (1); } return (0); } void sigqueue_init(sigqueue_t *list, struct proc *p) { SIGEMPTYSET(list->sq_signals); SIGEMPTYSET(list->sq_kill); SIGEMPTYSET(list->sq_ptrace); TAILQ_INIT(&list->sq_list); list->sq_proc = p; list->sq_flags = SQ_INIT; } /* * Get a signal's ksiginfo. * Return: * 0 - signal not found * others - signal number */ static int sigqueue_get(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi, *next; int count = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (!SIGISMEMBER(sq->sq_signals, signo)) return (0); if (SIGISMEMBER(sq->sq_ptrace, signo)) { count++; SIGDELSET(sq->sq_ptrace, signo); si->ksi_flags |= KSI_PTRACE; } if (SIGISMEMBER(sq->sq_kill, signo)) { count++; if (count == 1) SIGDELSET(sq->sq_kill, signo); } TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (ksi->ksi_signo == signo) { if (count == 0) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; ksiginfo_copy(ksi, si); if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } if (++count > 1) break; } } if (count <= 1) SIGDELSET(sq->sq_signals, signo); si->ksi_signo = signo; return (signo); } void sigqueue_take(ksiginfo_t *ksi) { struct ksiginfo *kp; struct proc *p; sigqueue_t *sq; if (ksi == NULL || (sq = ksi->ksi_sigq) == NULL) return; p = sq->sq_proc; TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (!(ksi->ksi_flags & KSI_EXT) && p != NULL) p->p_pendingcnt--; for (kp = TAILQ_FIRST(&sq->sq_list); kp != NULL; kp = TAILQ_NEXT(kp, ksi_link)) { if (kp->ksi_signo == ksi->ksi_signo) break; } if (kp == NULL && !SIGISMEMBER(sq->sq_kill, ksi->ksi_signo) && !SIGISMEMBER(sq->sq_ptrace, ksi->ksi_signo)) SIGDELSET(sq->sq_signals, ksi->ksi_signo); } static int sigqueue_add(sigqueue_t *sq, int signo, ksiginfo_t *si) { struct proc *p = sq->sq_proc; struct ksiginfo *ksi; int ret = 0; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); /* * SIGKILL/SIGSTOP cannot be caught or masked, so take the fast path * for these signals. */ if (signo == SIGKILL || signo == SIGSTOP || si == NULL) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } /* directly insert the ksi, don't copy it */ if (si->ksi_flags & KSI_INS) { if (si->ksi_flags & KSI_HEAD) TAILQ_INSERT_HEAD(&sq->sq_list, si, ksi_link); else TAILQ_INSERT_TAIL(&sq->sq_list, si, ksi_link); si->ksi_sigq = sq; goto out_set_bit; } if (__predict_false(ksiginfo_zone == NULL)) { SIGADDSET(sq->sq_kill, signo); goto out_set_bit; } if (p != NULL && p->p_pendingcnt >= max_pending_per_proc) { signal_overflow++; ret = EAGAIN; } else if ((ksi = ksiginfo_alloc(0)) == NULL) { signal_alloc_fail++; ret = EAGAIN; } else { if (p != NULL) p->p_pendingcnt++; ksiginfo_copy(si, ksi); ksi->ksi_signo = signo; if (si->ksi_flags & KSI_HEAD) TAILQ_INSERT_HEAD(&sq->sq_list, ksi, ksi_link); else TAILQ_INSERT_TAIL(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = sq; } if (ret != 0) { if ((si->ksi_flags & KSI_PTRACE) != 0) { SIGADDSET(sq->sq_ptrace, signo); ret = 0; goto out_set_bit; } else if ((si->ksi_flags & KSI_TRAP) != 0 || (si->ksi_flags & KSI_SIGQ) == 0) { SIGADDSET(sq->sq_kill, signo); ret = 0; goto out_set_bit; } return (ret); } out_set_bit: SIGADDSET(sq->sq_signals, signo); return (ret); } void sigqueue_flush(sigqueue_t *sq) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi; KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited")); if (p != NULL) PROC_LOCK_ASSERT(p, MA_OWNED); while ((ksi = TAILQ_FIRST(&sq->sq_list)) != NULL) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } SIGEMPTYSET(sq->sq_signals); SIGEMPTYSET(sq->sq_kill); SIGEMPTYSET(sq->sq_ptrace); } static void sigqueue_move_set(sigqueue_t *src, sigqueue_t *dst, const sigset_t *set) { sigset_t tmp; struct proc *p1, *p2; ksiginfo_t *ksi, *next; KASSERT(src->sq_flags & SQ_INIT, ("src sigqueue not inited")); KASSERT(dst->sq_flags & SQ_INIT, ("dst sigqueue not inited")); p1 = src->sq_proc; p2 = dst->sq_proc; /* Move siginfo to target list */ TAILQ_FOREACH_SAFE(ksi, &src->sq_list, ksi_link, next) { if (SIGISMEMBER(*set, ksi->ksi_signo)) { TAILQ_REMOVE(&src->sq_list, ksi, ksi_link); if (p1 != NULL) p1->p_pendingcnt--; TAILQ_INSERT_TAIL(&dst->sq_list, ksi, ksi_link); ksi->ksi_sigq = dst; if (p2 != NULL) p2->p_pendingcnt++; } } /* Move pending bits to target list */ tmp = src->sq_kill; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_kill, tmp); SIGSETNAND(src->sq_kill, tmp); tmp = src->sq_ptrace; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_ptrace, tmp); SIGSETNAND(src->sq_ptrace, tmp); tmp = src->sq_signals; SIGSETAND(tmp, *set); SIGSETOR(dst->sq_signals, tmp); SIGSETNAND(src->sq_signals, tmp); } #if 0 static void sigqueue_move(sigqueue_t *src, sigqueue_t *dst, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_move_set(src, dst, &set); } #endif static void sigqueue_delete_set(sigqueue_t *sq, const sigset_t *set) { struct proc *p = sq->sq_proc; ksiginfo_t *ksi, *next; KASSERT(sq->sq_flags & SQ_INIT, ("src sigqueue not inited")); /* Remove siginfo queue */ TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) { if (SIGISMEMBER(*set, ksi->ksi_signo)) { TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link); ksi->ksi_sigq = NULL; if (ksiginfo_tryfree(ksi) && p != NULL) p->p_pendingcnt--; } } SIGSETNAND(sq->sq_kill, *set); SIGSETNAND(sq->sq_ptrace, *set); SIGSETNAND(sq->sq_signals, *set); } void sigqueue_delete(sigqueue_t *sq, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set(sq, &set); } /* Remove a set of signals for a process */ static void sigqueue_delete_set_proc(struct proc *p, const sigset_t *set) { sigqueue_t worklist; struct thread *td0; PROC_LOCK_ASSERT(p, MA_OWNED); sigqueue_init(&worklist, NULL); sigqueue_move_set(&p->p_sigqueue, &worklist, set); FOREACH_THREAD_IN_PROC(p, td0) sigqueue_move_set(&td0->td_sigqueue, &worklist, set); sigqueue_flush(&worklist); } void sigqueue_delete_proc(struct proc *p, int signo) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, signo); sigqueue_delete_set_proc(p, &set); } static void sigqueue_delete_stopmask_proc(struct proc *p) { sigset_t set; SIGEMPTYSET(set); SIGADDSET(set, SIGSTOP); SIGADDSET(set, SIGTSTP); SIGADDSET(set, SIGTTIN); SIGADDSET(set, SIGTTOU); sigqueue_delete_set_proc(p, &set); } /* * Determine signal that should be delivered to thread td, the current * thread, 0 if none. If there is a pending stop signal with default * action, the process stops in issignal(). */ int cursig(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED); THREAD_LOCK_ASSERT(td, MA_NOTOWNED); return (SIGPENDING(td) ? issignal(td) : 0); } /* * Arrange for ast() to handle unmasked pending signals on return to user * mode. This must be called whenever a signal is added to td_sigqueue or * unmasked in td_sigmask. */ void signotify(struct thread *td) { PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); if (SIGPENDING(td)) { thread_lock(td); td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING; thread_unlock(td); } } int sigonstack(size_t sp) { struct thread *td = curthread; return ((td->td_pflags & TDP_ALTSTACK) ? #if defined(COMPAT_43) ((td->td_sigstk.ss_size == 0) ? (td->td_sigstk.ss_flags & SS_ONSTACK) : ((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size)) #else ((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size) #endif : 0); } static __inline int sigprop(int sig) { if (sig > 0 && sig < nitems(sigproptbl)) return (sigproptbl[sig]); return (0); } int sig_ffs(sigset_t *set) { int i; for (i = 0; i < _SIG_WORDS; i++) if (set->__bits[i]) return (ffs(set->__bits[i]) + (i * 32)); return (0); } static bool sigact_flag_test(const struct sigaction *act, int flag) { /* * SA_SIGINFO is reset when signal disposition is set to * ignore or default. Other flags are kept according to user * settings. */ return ((act->sa_flags & flag) != 0 && (flag != SA_SIGINFO || ((__sighandler_t *)act->sa_sigaction != SIG_IGN && (__sighandler_t *)act->sa_sigaction != SIG_DFL))); } /* * kern_sigaction * sigaction * freebsd4_sigaction * osigaction */ int kern_sigaction(struct thread *td, int sig, const struct sigaction *act, struct sigaction *oact, int flags) { struct sigacts *ps; struct proc *p = td->td_proc; if (!_SIG_VALID(sig)) return (EINVAL); if (act != NULL && act->sa_handler != SIG_DFL && act->sa_handler != SIG_IGN && (act->sa_flags & ~(SA_ONSTACK | SA_RESTART | SA_RESETHAND | SA_NOCLDSTOP | SA_NODEFER | SA_NOCLDWAIT | SA_SIGINFO)) != 0) return (EINVAL); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if (oact) { memset(oact, 0, sizeof(*oact)); oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)]; if (SIGISMEMBER(ps->ps_sigonstack, sig)) oact->sa_flags |= SA_ONSTACK; if (!SIGISMEMBER(ps->ps_sigintr, sig)) oact->sa_flags |= SA_RESTART; if (SIGISMEMBER(ps->ps_sigreset, sig)) oact->sa_flags |= SA_RESETHAND; if (SIGISMEMBER(ps->ps_signodefer, sig)) oact->sa_flags |= SA_NODEFER; if (SIGISMEMBER(ps->ps_siginfo, sig)) { oact->sa_flags |= SA_SIGINFO; oact->sa_sigaction = (__siginfohandler_t *)ps->ps_sigact[_SIG_IDX(sig)]; } else oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)]; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDSTOP) oact->sa_flags |= SA_NOCLDSTOP; if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDWAIT) oact->sa_flags |= SA_NOCLDWAIT; } if (act) { if ((sig == SIGKILL || sig == SIGSTOP) && act->sa_handler != SIG_DFL) { mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (EINVAL); } /* * Change setting atomically. */ ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask; SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]); if (sigact_flag_test(act, SA_SIGINFO)) { ps->ps_sigact[_SIG_IDX(sig)] = (__sighandler_t *)act->sa_sigaction; SIGADDSET(ps->ps_siginfo, sig); } else { ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler; SIGDELSET(ps->ps_siginfo, sig); } if (!sigact_flag_test(act, SA_RESTART)) SIGADDSET(ps->ps_sigintr, sig); else SIGDELSET(ps->ps_sigintr, sig); if (sigact_flag_test(act, SA_ONSTACK)) SIGADDSET(ps->ps_sigonstack, sig); else SIGDELSET(ps->ps_sigonstack, sig); if (sigact_flag_test(act, SA_RESETHAND)) SIGADDSET(ps->ps_sigreset, sig); else SIGDELSET(ps->ps_sigreset, sig); if (sigact_flag_test(act, SA_NODEFER)) SIGADDSET(ps->ps_signodefer, sig); else SIGDELSET(ps->ps_signodefer, sig); if (sig == SIGCHLD) { if (act->sa_flags & SA_NOCLDSTOP) ps->ps_flag |= PS_NOCLDSTOP; else ps->ps_flag &= ~PS_NOCLDSTOP; if (act->sa_flags & SA_NOCLDWAIT) { /* * Paranoia: since SA_NOCLDWAIT is implemented * by reparenting the dying child to PID 1 (and * trust it to reap the zombie), PID 1 itself * is forbidden to set SA_NOCLDWAIT. */ if (p->p_pid == 1) ps->ps_flag &= ~PS_NOCLDWAIT; else ps->ps_flag |= PS_NOCLDWAIT; } else ps->ps_flag &= ~PS_NOCLDWAIT; if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_flag |= PS_CLDSIGIGN; else ps->ps_flag &= ~PS_CLDSIGIGN; } /* * Set bit in ps_sigignore for signals that are set to SIG_IGN, * and for signals set to SIG_DFL where the default is to * ignore. However, don't put SIGCONT in ps_sigignore, as we * have to restart the process. */ if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || (sigprop(sig) & SIGPROP_IGNORE && ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) { /* never to be seen again */ sigqueue_delete_proc(p, sig); if (sig != SIGCONT) /* easier in psignal */ SIGADDSET(ps->ps_sigignore, sig); SIGDELSET(ps->ps_sigcatch, sig); } else { SIGDELSET(ps->ps_sigignore, sig); if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL) SIGDELSET(ps->ps_sigcatch, sig); else SIGADDSET(ps->ps_sigcatch, sig); } #ifdef COMPAT_FREEBSD4 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_FREEBSD4) == 0) SIGDELSET(ps->ps_freebsd4, sig); else SIGADDSET(ps->ps_freebsd4, sig); #endif #ifdef COMPAT_43 if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN || ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL || (flags & KSA_OSIGSET) == 0) SIGDELSET(ps->ps_osigset, sig); else SIGADDSET(ps->ps_osigset, sig); #endif } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); return (0); } #ifndef _SYS_SYSPROTO_H_ struct sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int sys_sigaction(struct thread *td, struct sigaction_args *uap) { struct sigaction act, oact; struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, 0); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #ifdef COMPAT_FREEBSD4 #ifndef _SYS_SYSPROTO_H_ struct freebsd4_sigaction_args { int sig; struct sigaction *act; struct sigaction *oact; }; #endif int freebsd4_sigaction(struct thread *td, struct freebsd4_sigaction_args *uap) { struct sigaction act, oact; struct sigaction *actp, *oactp; int error; actp = (uap->act != NULL) ? &act : NULL; oactp = (uap->oact != NULL) ? &oact : NULL; if (actp) { error = copyin(uap->act, actp, sizeof(act)); if (error) return (error); } error = kern_sigaction(td, uap->sig, actp, oactp, KSA_FREEBSD4); if (oactp && !error) error = copyout(oactp, uap->oact, sizeof(oact)); return (error); } #endif /* COMAPT_FREEBSD4 */ #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigaction_args { int signum; struct osigaction *nsa; struct osigaction *osa; }; #endif int osigaction(struct thread *td, struct osigaction_args *uap) { struct osigaction sa; struct sigaction nsa, osa; struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsa != NULL) ? &nsa : NULL; osap = (uap->osa != NULL) ? &osa : NULL; if (nsap) { error = copyin(uap->nsa, &sa, sizeof(sa)); if (error) return (error); nsap->sa_handler = sa.sa_handler; nsap->sa_flags = sa.sa_flags; OSIG2SIG(sa.sa_mask, nsap->sa_mask); } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { sa.sa_handler = osap->sa_handler; sa.sa_flags = osap->sa_flags; SIG2OSIG(osap->sa_mask, sa.sa_mask); error = copyout(&sa, uap->osa, sizeof(sa)); } return (error); } #if !defined(__i386__) /* Avoid replicating the same stub everywhere */ int osigreturn(struct thread *td, struct osigreturn_args *uap) { return (nosys(td, (struct nosys_args *)uap)); } #endif #endif /* COMPAT_43 */ /* * Initialize signal state for process 0; * set to ignore signals that are ignored by default. */ void siginit(struct proc *p) { int i; struct sigacts *ps; PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); for (i = 1; i <= NSIG; i++) { if (sigprop(i) & SIGPROP_IGNORE && i != SIGCONT) { SIGADDSET(ps->ps_sigignore, i); } } mtx_unlock(&ps->ps_mtx); PROC_UNLOCK(p); } /* * Reset specified signal to the default disposition. */ static void sigdflt(struct sigacts *ps, int sig) { mtx_assert(&ps->ps_mtx, MA_OWNED); SIGDELSET(ps->ps_sigcatch, sig); if ((sigprop(sig) & SIGPROP_IGNORE) != 0 && sig != SIGCONT) SIGADDSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; SIGDELSET(ps->ps_siginfo, sig); } /* * Reset signals for an exec of the specified process. */ void execsigs(struct proc *p) { sigset_t osigignore; struct sigacts *ps; int sig; struct thread *td; /* * Reset caught signals. Held signals remain held * through td_sigmask (unless they were caught, * and are now ignored by default). */ PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); while (SIGNOTEMPTY(ps->ps_sigcatch)) { sig = sig_ffs(&ps->ps_sigcatch); sigdflt(ps, sig); if ((sigprop(sig) & SIGPROP_IGNORE) != 0) sigqueue_delete_proc(p, sig); } /* * As CloudABI processes cannot modify signal handlers, fully * reset all signals to their default behavior. Do ignore * SIGPIPE, as it would otherwise be impossible to recover from * writes to broken pipes and sockets. */ if (SV_PROC_ABI(p) == SV_ABI_CLOUDABI) { osigignore = ps->ps_sigignore; while (SIGNOTEMPTY(osigignore)) { sig = sig_ffs(&osigignore); SIGDELSET(osigignore, sig); if (sig != SIGPIPE) sigdflt(ps, sig); } SIGADDSET(ps->ps_sigignore, SIGPIPE); } /* * Reset stack state to the user stack. * Clear set of signals caught on the signal stack. */ td = curthread; MPASS(td->td_proc == p); td->td_sigstk.ss_flags = SS_DISABLE; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_sp = 0; td->td_pflags &= ~TDP_ALTSTACK; /* * Reset no zombies if child dies flag as Solaris does. */ ps->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN); if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN) ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL; mtx_unlock(&ps->ps_mtx); } /* * kern_sigprocmask() * * Manipulate signal mask. */ int kern_sigprocmask(struct thread *td, int how, sigset_t *set, sigset_t *oset, int flags) { sigset_t new_block, oset1; struct proc *p; int error; p = td->td_proc; if ((flags & SIGPROCMASK_PROC_LOCKED) != 0) PROC_LOCK_ASSERT(p, MA_OWNED); else PROC_LOCK(p); mtx_assert(&p->p_sigacts->ps_mtx, (flags & SIGPROCMASK_PS_LOCKED) != 0 ? MA_OWNED : MA_NOTOWNED); if (oset != NULL) *oset = td->td_sigmask; error = 0; if (set != NULL) { switch (how) { case SIG_BLOCK: SIG_CANTMASK(*set); oset1 = td->td_sigmask; SIGSETOR(td->td_sigmask, *set); new_block = td->td_sigmask; SIGSETNAND(new_block, oset1); break; case SIG_UNBLOCK: SIGSETNAND(td->td_sigmask, *set); signotify(td); goto out; case SIG_SETMASK: SIG_CANTMASK(*set); oset1 = td->td_sigmask; if (flags & SIGPROCMASK_OLD) SIGSETLO(td->td_sigmask, *set); else td->td_sigmask = *set; new_block = td->td_sigmask; SIGSETNAND(new_block, oset1); signotify(td); break; default: error = EINVAL; goto out; } /* * The new_block set contains signals that were not previously * blocked, but are blocked now. * * In case we block any signal that was not previously blocked * for td, and process has the signal pending, try to schedule * signal delivery to some thread that does not block the * signal, possibly waking it up. */ if (p->p_numthreads != 1) reschedule_signals(p, new_block, flags); } out: if (!(flags & SIGPROCMASK_PROC_LOCKED)) PROC_UNLOCK(p); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigprocmask_args { int how; const sigset_t *set; sigset_t *oset; }; #endif int sys_sigprocmask(struct thread *td, struct sigprocmask_args *uap) { sigset_t set, oset; sigset_t *setp, *osetp; int error; setp = (uap->set != NULL) ? &set : NULL; osetp = (uap->oset != NULL) ? &oset : NULL; if (setp) { error = copyin(uap->set, setp, sizeof(set)); if (error) return (error); } error = kern_sigprocmask(td, uap->how, setp, osetp, 0); if (osetp && !error) { error = copyout(osetp, uap->oset, sizeof(oset)); } return (error); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigprocmask_args { int how; osigset_t mask; }; #endif int osigprocmask(struct thread *td, struct osigprocmask_args *uap) { sigset_t set, oset; int error; OSIG2SIG(uap->mask, set); error = kern_sigprocmask(td, uap->how, &set, &oset, 1); SIG2OSIG(oset, td->td_retval[0]); return (error); } #endif /* COMPAT_43 */ int sys_sigwait(struct thread *td, struct sigwait_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) { td->td_retval[0] = error; return (0); } error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) { if (error == EINTR && td->td_proc->p_osrel < P_OSREL_SIGWAIT) error = ERESTART; if (error == ERESTART) return (error); td->td_retval[0] = error; return (0); } error = copyout(&ksi.ksi_signo, uap->sig, sizeof(ksi.ksi_signo)); td->td_retval[0] = error; return (0); } int sys_sigtimedwait(struct thread *td, struct sigtimedwait_args *uap) { struct timespec ts; struct timespec *timeout; sigset_t set; ksiginfo_t ksi; int error; if (uap->timeout) { error = copyin(uap->timeout, &ts, sizeof(ts)); if (error) return (error); timeout = &ts; } else timeout = NULL; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, timeout); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } int sys_sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap) { ksiginfo_t ksi; sigset_t set; int error; error = copyin(uap->set, &set, sizeof(set)); if (error) return (error); error = kern_sigtimedwait(td, set, &ksi, NULL); if (error) return (error); if (uap->info) error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t)); if (error == 0) td->td_retval[0] = ksi.ksi_signo; return (error); } static void proc_td_siginfo_capture(struct thread *td, siginfo_t *si) { struct thread *thr; FOREACH_THREAD_IN_PROC(td->td_proc, thr) { if (thr == td) thr->td_si = *si; else thr->td_si.si_signo = 0; } } int kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi, struct timespec *timeout) { struct sigacts *ps; sigset_t saved_mask, new_block; struct proc *p; int error, sig, timo, timevalid = 0; struct timespec rts, ets, ts; struct timeval tv; p = td->td_proc; error = 0; ets.tv_sec = 0; ets.tv_nsec = 0; if (timeout != NULL) { if (timeout->tv_nsec >= 0 && timeout->tv_nsec < 1000000000) { timevalid = 1; getnanouptime(&rts); - ets = rts; - timespecadd(&ets, timeout); + timespecadd(&rts, timeout, &ets); } } ksiginfo_init(ksi); /* Some signals can not be waited for. */ SIG_CANTMASK(waitset); ps = p->p_sigacts; PROC_LOCK(p); saved_mask = td->td_sigmask; SIGSETNAND(td->td_sigmask, waitset); for (;;) { mtx_lock(&ps->ps_mtx); sig = cursig(td); mtx_unlock(&ps->ps_mtx); KASSERT(sig >= 0, ("sig %d", sig)); if (sig != 0 && SIGISMEMBER(waitset, sig)) { if (sigqueue_get(&td->td_sigqueue, sig, ksi) != 0 || sigqueue_get(&p->p_sigqueue, sig, ksi) != 0) { error = 0; break; } } if (error != 0) break; /* * POSIX says this must be checked after looking for pending * signals. */ if (timeout != NULL) { if (!timevalid) { error = EINVAL; break; } getnanouptime(&rts); if (timespeccmp(&rts, &ets, >=)) { error = EAGAIN; break; } - ts = ets; - timespecsub(&ts, &rts); + timespecsub(&ets, &rts, &ts); TIMESPEC_TO_TIMEVAL(&tv, &ts); timo = tvtohz(&tv); } else { timo = 0; } error = msleep(ps, &p->p_mtx, PPAUSE|PCATCH, "sigwait", timo); if (timeout != NULL) { if (error == ERESTART) { /* Timeout can not be restarted. */ error = EINTR; } else if (error == EAGAIN) { /* We will calculate timeout by ourself. */ error = 0; } } } new_block = saved_mask; SIGSETNAND(new_block, td->td_sigmask); td->td_sigmask = saved_mask; /* * Fewer signals can be delivered to us, reschedule signal * notification. */ if (p->p_numthreads != 1) reschedule_signals(p, new_block, 0); if (error == 0) { SDT_PROBE2(proc, , , signal__clear, sig, ksi); if (ksi->ksi_code == SI_TIMER) itimer_accept(p, ksi->ksi_timerid, ksi); #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) { sig_t action; mtx_lock(&ps->ps_mtx); action = ps->ps_sigact[_SIG_IDX(sig)]; mtx_unlock(&ps->ps_mtx); ktrpsig(sig, action, &td->td_sigmask, ksi->ksi_code); } #endif if (sig == SIGKILL) { proc_td_siginfo_capture(td, &ksi->ksi_info); sigexit(td, sig); } } PROC_UNLOCK(p); return (error); } #ifndef _SYS_SYSPROTO_H_ struct sigpending_args { sigset_t *set; }; #endif int sys_sigpending(struct thread *td, struct sigpending_args *uap) { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); return (copyout(&pending, uap->set, sizeof(sigset_t))); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ #ifndef _SYS_SYSPROTO_H_ struct osigpending_args { int dummy; }; #endif int osigpending(struct thread *td, struct osigpending_args *uap) { struct proc *p = td->td_proc; sigset_t pending; PROC_LOCK(p); pending = p->p_sigqueue.sq_signals; SIGSETOR(pending, td->td_sigqueue.sq_signals); PROC_UNLOCK(p); SIG2OSIG(pending, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) /* * Generalized interface signal handler, 4.3-compatible. */ #ifndef _SYS_SYSPROTO_H_ struct osigvec_args { int signum; struct sigvec *nsv; struct sigvec *osv; }; #endif /* ARGSUSED */ int osigvec(struct thread *td, struct osigvec_args *uap) { struct sigvec vec; struct sigaction nsa, osa; struct sigaction *nsap, *osap; int error; if (uap->signum <= 0 || uap->signum >= ONSIG) return (EINVAL); nsap = (uap->nsv != NULL) ? &nsa : NULL; osap = (uap->osv != NULL) ? &osa : NULL; if (nsap) { error = copyin(uap->nsv, &vec, sizeof(vec)); if (error) return (error); nsap->sa_handler = vec.sv_handler; OSIG2SIG(vec.sv_mask, nsap->sa_mask); nsap->sa_flags = vec.sv_flags; nsap->sa_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */ } error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET); if (osap && !error) { vec.sv_handler = osap->sa_handler; SIG2OSIG(osap->sa_mask, vec.sv_mask); vec.sv_flags = osap->sa_flags; vec.sv_flags &= ~SA_NOCLDWAIT; vec.sv_flags ^= SA_RESTART; error = copyout(&vec, uap->osv, sizeof(vec)); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct osigblock_args { int mask; }; #endif int osigblock(struct thread *td, struct osigblock_args *uap) { sigset_t set, oset; OSIG2SIG(uap->mask, set); kern_sigprocmask(td, SIG_BLOCK, &set, &oset, 0); SIG2OSIG(oset, td->td_retval[0]); return (0); } #ifndef _SYS_SYSPROTO_H_ struct osigsetmask_args { int mask; }; #endif int osigsetmask(struct thread *td, struct osigsetmask_args *uap) { sigset_t set, oset; OSIG2SIG(uap->mask, set); kern_sigprocmask(td, SIG_SETMASK, &set, &oset, 0); SIG2OSIG(oset, td->td_retval[0]); return (0); } #endif /* COMPAT_43 */ /* * Suspend calling thread until signal, providing mask to be set in the * meantime. */ #ifndef _SYS_SYSPROTO_H_ struct sigsuspend_args { const sigset_t *sigmask; }; #endif /* ARGSUSED */ int sys_sigsuspend(struct thread *td, struct sigsuspend_args *uap) { sigset_t mask; int error; error = copyin(uap->sigmask, &mask, sizeof(mask)); if (error) return (error); return (kern_sigsuspend(td, mask)); } int kern_sigsuspend(struct thread *td, sigset_t mask) { struct proc *p = td->td_proc; int has_sig, sig; /* * When returning from sigsuspend, we want * the old mask to be restored after the * signal handler has finished. Thus, we * save it here and mark the sigacts structure * to indicate this. */ PROC_LOCK(p); kern_sigprocmask(td, SIG_SETMASK, &mask, &td->td_oldsigmask, SIGPROCMASK_PROC_LOCKED); td->td_pflags |= TDP_OLDMASK; /* * Process signals now. Otherwise, we can get spurious wakeup * due to signal entered process queue, but delivered to other * thread. But sigsuspend should return only on signal * delivery. */ (p->p_sysent->sv_set_syscall_retval)(td, EINTR); for (has_sig = 0; !has_sig;) { while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "pause", 0) == 0) /* void */; thread_suspend_check(0); mtx_lock(&p->p_sigacts->ps_mtx); while ((sig = cursig(td)) != 0) { KASSERT(sig >= 0, ("sig %d", sig)); has_sig += postsig(sig); } mtx_unlock(&p->p_sigacts->ps_mtx); } PROC_UNLOCK(p); td->td_errno = EINTR; td->td_pflags |= TDP_NERRNO; return (EJUSTRETURN); } #ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */ /* * Compatibility sigsuspend call for old binaries. Note nonstandard calling * convention: libc stub passes mask, not pointer, to save a copyin. */ #ifndef _SYS_SYSPROTO_H_ struct osigsuspend_args { osigset_t mask; }; #endif /* ARGSUSED */ int osigsuspend(struct thread *td, struct osigsuspend_args *uap) { sigset_t mask; OSIG2SIG(uap->mask, mask); return (kern_sigsuspend(td, mask)); } #endif /* COMPAT_43 */ #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct osigstack_args { struct sigstack *nss; struct sigstack *oss; }; #endif /* ARGSUSED */ int osigstack(struct thread *td, struct osigstack_args *uap) { struct sigstack nss, oss; int error = 0; if (uap->nss != NULL) { error = copyin(uap->nss, &nss, sizeof(nss)); if (error) return (error); } oss.ss_sp = td->td_sigstk.ss_sp; oss.ss_onstack = sigonstack(cpu_getstack(td)); if (uap->nss != NULL) { td->td_sigstk.ss_sp = nss.ss_sp; td->td_sigstk.ss_size = 0; td->td_sigstk.ss_flags |= nss.ss_onstack & SS_ONSTACK; td->td_pflags |= TDP_ALTSTACK; } if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(oss)); return (error); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigaltstack_args { stack_t *ss; stack_t *oss; }; #endif /* ARGSUSED */ int sys_sigaltstack(struct thread *td, struct sigaltstack_args *uap) { stack_t ss, oss; int error; if (uap->ss != NULL) { error = copyin(uap->ss, &ss, sizeof(ss)); if (error) return (error); } error = kern_sigaltstack(td, (uap->ss != NULL) ? &ss : NULL, (uap->oss != NULL) ? &oss : NULL); if (error) return (error); if (uap->oss != NULL) error = copyout(&oss, uap->oss, sizeof(stack_t)); return (error); } int kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss) { struct proc *p = td->td_proc; int oonstack; oonstack = sigonstack(cpu_getstack(td)); if (oss != NULL) { *oss = td->td_sigstk; oss->ss_flags = (td->td_pflags & TDP_ALTSTACK) ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE; } if (ss != NULL) { if (oonstack) return (EPERM); if ((ss->ss_flags & ~SS_DISABLE) != 0) return (EINVAL); if (!(ss->ss_flags & SS_DISABLE)) { if (ss->ss_size < p->p_sysent->sv_minsigstksz) return (ENOMEM); td->td_sigstk = *ss; td->td_pflags |= TDP_ALTSTACK; } else { td->td_pflags &= ~TDP_ALTSTACK; } } return (0); } /* * Common code for kill process group/broadcast kill. * cp is calling process. */ static int killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi) { struct proc *p; struct pgrp *pgrp; int err; int ret; ret = ESRCH; if (all) { /* * broadcast */ sx_slock(&allproc_lock); FOREACH_PROC_IN_SYSTEM(p) { if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || p == td->td_proc || p->p_state == PRS_NEW) { continue; } PROC_LOCK(p); err = p_cansignal(td, p, sig); if (err == 0) { if (sig) pksignal(p, sig, ksi); ret = err; } else if (ret == ESRCH) ret = err; PROC_UNLOCK(p); } sx_sunlock(&allproc_lock); } else { sx_slock(&proctree_lock); if (pgid == 0) { /* * zero pgid means send to my process group. */ pgrp = td->td_proc->p_pgrp; PGRP_LOCK(pgrp); } else { pgrp = pgfind(pgid); if (pgrp == NULL) { sx_sunlock(&proctree_lock); return (ESRCH); } } sx_sunlock(&proctree_lock); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || p->p_state == PRS_NEW) { PROC_UNLOCK(p); continue; } err = p_cansignal(td, p, sig); if (err == 0) { if (sig) pksignal(p, sig, ksi); ret = err; } else if (ret == ESRCH) ret = err; PROC_UNLOCK(p); } PGRP_UNLOCK(pgrp); } return (ret); } #ifndef _SYS_SYSPROTO_H_ struct kill_args { int pid; int signum; }; #endif /* ARGSUSED */ int sys_kill(struct thread *td, struct kill_args *uap) { ksiginfo_t ksi; struct proc *p; int error; /* * A process in capability mode can send signals only to himself. * The main rationale behind this is that abort(3) is implemented as * kill(getpid(), SIGABRT). */ if (IN_CAPABILITY_MODE(td) && uap->pid != td->td_proc->p_pid) return (ECAPMODE); AUDIT_ARG_SIGNUM(uap->signum); AUDIT_ARG_PID(uap->pid); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); ksiginfo_init(&ksi); ksi.ksi_signo = uap->signum; ksi.ksi_code = SI_USER; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; if (uap->pid > 0) { /* kill single process */ if ((p = pfind_any(uap->pid)) == NULL) return (ESRCH); AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, uap->signum); if (error == 0 && uap->signum) pksignal(p, uap->signum, &ksi); PROC_UNLOCK(p); return (error); } switch (uap->pid) { case -1: /* broadcast signal */ return (killpg1(td, uap->signum, 0, 1, &ksi)); case 0: /* signal own process group */ return (killpg1(td, uap->signum, 0, 0, &ksi)); default: /* negative explicit process group */ return (killpg1(td, uap->signum, -uap->pid, 0, &ksi)); } /* NOTREACHED */ } int sys_pdkill(struct thread *td, struct pdkill_args *uap) { struct proc *p; int error; AUDIT_ARG_SIGNUM(uap->signum); AUDIT_ARG_FD(uap->fd); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); error = procdesc_find(td, uap->fd, &cap_pdkill_rights, &p); if (error) return (error); AUDIT_ARG_PROCESS(p); error = p_cansignal(td, p, uap->signum); if (error == 0 && uap->signum) kern_psignal(p, uap->signum); PROC_UNLOCK(p); return (error); } #if defined(COMPAT_43) #ifndef _SYS_SYSPROTO_H_ struct okillpg_args { int pgid; int signum; }; #endif /* ARGSUSED */ int okillpg(struct thread *td, struct okillpg_args *uap) { ksiginfo_t ksi; AUDIT_ARG_SIGNUM(uap->signum); AUDIT_ARG_PID(uap->pgid); if ((u_int)uap->signum > _SIG_MAXSIG) return (EINVAL); ksiginfo_init(&ksi); ksi.ksi_signo = uap->signum; ksi.ksi_code = SI_USER; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; return (killpg1(td, uap->signum, uap->pgid, 0, &ksi)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct sigqueue_args { pid_t pid; int signum; /* union sigval */ void *value; }; #endif int sys_sigqueue(struct thread *td, struct sigqueue_args *uap) { union sigval sv; sv.sival_ptr = uap->value; return (kern_sigqueue(td, uap->pid, uap->signum, &sv)); } int kern_sigqueue(struct thread *td, pid_t pid, int signum, union sigval *value) { ksiginfo_t ksi; struct proc *p; int error; if ((u_int)signum > _SIG_MAXSIG) return (EINVAL); /* * Specification says sigqueue can only send signal to * single process. */ if (pid <= 0) return (EINVAL); if ((p = pfind_any(pid)) == NULL) return (ESRCH); error = p_cansignal(td, p, signum); if (error == 0 && signum != 0) { ksiginfo_init(&ksi); ksi.ksi_flags = KSI_SIGQ; ksi.ksi_signo = signum; ksi.ksi_code = SI_QUEUE; ksi.ksi_pid = td->td_proc->p_pid; ksi.ksi_uid = td->td_ucred->cr_ruid; ksi.ksi_value = *value; error = pksignal(p, ksi.ksi_signo, &ksi); } PROC_UNLOCK(p); return (error); } /* * Send a signal to a process group. */ void gsignal(int pgid, int sig, ksiginfo_t *ksi) { struct pgrp *pgrp; if (pgid != 0) { sx_slock(&proctree_lock); pgrp = pgfind(pgid); sx_sunlock(&proctree_lock); if (pgrp != NULL) { pgsignal(pgrp, sig, 0, ksi); PGRP_UNLOCK(pgrp); } } } /* * Send a signal to a process group. If checktty is 1, * limit to members which have a controlling terminal. */ void pgsignal(struct pgrp *pgrp, int sig, int checkctty, ksiginfo_t *ksi) { struct proc *p; if (pgrp) { PGRP_LOCK_ASSERT(pgrp, MA_OWNED); LIST_FOREACH(p, &pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && (checkctty == 0 || p->p_flag & P_CONTROLT)) pksignal(p, sig, ksi); PROC_UNLOCK(p); } } } /* * Recalculate the signal mask and reset the signal disposition after * usermode frame for delivery is formed. Should be called after * mach-specific routine, because sysent->sv_sendsig() needs correct * ps_siginfo and signal mask. */ static void postsig_done(int sig, struct thread *td, struct sigacts *ps) { sigset_t mask; mtx_assert(&ps->ps_mtx, MA_OWNED); td->td_ru.ru_nsignals++; mask = ps->ps_catchmask[_SIG_IDX(sig)]; if (!SIGISMEMBER(ps->ps_signodefer, sig)) SIGADDSET(mask, sig); kern_sigprocmask(td, SIG_BLOCK, &mask, NULL, SIGPROCMASK_PROC_LOCKED | SIGPROCMASK_PS_LOCKED); if (SIGISMEMBER(ps->ps_sigreset, sig)) sigdflt(ps, sig); } /* * Send a signal caused by a trap to the current thread. If it will be * caught immediately, deliver it with correct code. Otherwise, post it * normally. */ void trapsignal(struct thread *td, ksiginfo_t *ksi) { struct sigacts *ps; struct proc *p; int sig; int code; p = td->td_proc; sig = ksi->ksi_signo; code = ksi->ksi_code; KASSERT(_SIG_VALID(sig), ("invalid signal")); PROC_LOCK(p); ps = p->p_sigacts; mtx_lock(&ps->ps_mtx); if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) && !SIGISMEMBER(td->td_sigmask, sig)) { #ifdef KTRACE if (KTRPOINT(curthread, KTR_PSIG)) ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)], &td->td_sigmask, code); #endif (*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], ksi, &td->td_sigmask); postsig_done(sig, td, ps); mtx_unlock(&ps->ps_mtx); } else { /* * Avoid a possible infinite loop if the thread * masking the signal or process is ignoring the * signal. */ if (kern_forcesigexit && (SIGISMEMBER(td->td_sigmask, sig) || ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN)) { SIGDELSET(td->td_sigmask, sig); SIGDELSET(ps->ps_sigcatch, sig); SIGDELSET(ps->ps_sigignore, sig); ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL; } mtx_unlock(&ps->ps_mtx); p->p_code = code; /* XXX for core dump/debugger */ p->p_sig = sig; /* XXX to verify code */ tdsendsignal(p, td, sig, ksi); } PROC_UNLOCK(p); } static struct thread * sigtd(struct proc *p, int sig, int prop) { struct thread *td, *signal_td; PROC_LOCK_ASSERT(p, MA_OWNED); /* * Check if current thread can handle the signal without * switching context to another thread. */ if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig)) return (curthread); signal_td = NULL; FOREACH_THREAD_IN_PROC(p, td) { if (!SIGISMEMBER(td->td_sigmask, sig)) { signal_td = td; break; } } if (signal_td == NULL) signal_td = FIRST_THREAD_IN_PROC(p); return (signal_td); } /* * Send the signal to the process. If the signal has an action, the action * is usually performed by the target process rather than the caller; we add * the signal to the set of pending signals for the process. * * Exceptions: * o When a stop signal is sent to a sleeping process that takes the * default action, the process is stopped without awakening it. * o SIGCONT restarts stopped processes (or puts them back to sleep) * regardless of the signal action (eg, blocked or ignored). * * Other ignored signals are discarded immediately. * * NB: This function may be entered from the debugger via the "kill" DDB * command. There is little that can be done to mitigate the possibly messy * side effects of this unwise possibility. */ void kern_psignal(struct proc *p, int sig) { ksiginfo_t ksi; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; (void) tdsendsignal(p, NULL, sig, &ksi); } int pksignal(struct proc *p, int sig, ksiginfo_t *ksi) { return (tdsendsignal(p, NULL, sig, ksi)); } /* Utility function for finding a thread to send signal event to. */ int sigev_findtd(struct proc *p ,struct sigevent *sigev, struct thread **ttd) { struct thread *td; if (sigev->sigev_notify == SIGEV_THREAD_ID) { td = tdfind(sigev->sigev_notify_thread_id, p->p_pid); if (td == NULL) return (ESRCH); *ttd = td; } else { *ttd = NULL; PROC_LOCK(p); } return (0); } void tdsignal(struct thread *td, int sig) { ksiginfo_t ksi; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; (void) tdsendsignal(td->td_proc, td, sig, &ksi); } void tdksignal(struct thread *td, int sig, ksiginfo_t *ksi) { (void) tdsendsignal(td->td_proc, td, sig, ksi); } int tdsendsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi) { sig_t action; sigqueue_t *sigqueue; int prop; struct sigacts *ps; int intrval; int ret = 0; int wakeup_swapper; MPASS(td == NULL || p == td->td_proc); PROC_LOCK_ASSERT(p, MA_OWNED); if (!_SIG_VALID(sig)) panic("%s(): invalid signal %d", __func__, sig); KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("%s: ksi on queue", __func__)); /* * IEEE Std 1003.1-2001: return success when killing a zombie. */ if (p->p_state == PRS_ZOMBIE) { if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } ps = p->p_sigacts; KNOTE_LOCKED(p->p_klist, NOTE_SIGNAL | sig); prop = sigprop(sig); if (td == NULL) { td = sigtd(p, sig, prop); sigqueue = &p->p_sigqueue; } else sigqueue = &td->td_sigqueue; SDT_PROBE3(proc, , , signal__send, td, p, sig); /* * If the signal is being ignored, * then we forget about it immediately. * (Note: we don't set SIGCONT in ps_sigignore, * and if it is set to SIG_IGN, * action will be SIG_DFL here.) */ mtx_lock(&ps->ps_mtx); if (SIGISMEMBER(ps->ps_sigignore, sig)) { SDT_PROBE3(proc, , , signal__discard, td, p, sig); mtx_unlock(&ps->ps_mtx); if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } if (SIGISMEMBER(td->td_sigmask, sig)) action = SIG_HOLD; else if (SIGISMEMBER(ps->ps_sigcatch, sig)) action = SIG_CATCH; else action = SIG_DFL; if (SIGISMEMBER(ps->ps_sigintr, sig)) intrval = EINTR; else intrval = ERESTART; mtx_unlock(&ps->ps_mtx); if (prop & SIGPROP_CONT) sigqueue_delete_stopmask_proc(p); else if (prop & SIGPROP_STOP) { /* * If sending a tty stop signal to a member of an orphaned * process group, discard the signal here if the action * is default; don't stop the process below if sleeping, * and don't clear any pending SIGCONT. */ if ((prop & SIGPROP_TTYSTOP) && (p->p_pgrp->pg_jobc == 0) && (action == SIG_DFL)) { if (ksi && (ksi->ksi_flags & KSI_INS)) ksiginfo_tryfree(ksi); return (ret); } sigqueue_delete_proc(p, SIGCONT); if (p->p_flag & P_CONTINUED) { p->p_flag &= ~P_CONTINUED; PROC_LOCK(p->p_pptr); sigqueue_take(p->p_ksi); PROC_UNLOCK(p->p_pptr); } } ret = sigqueue_add(sigqueue, sig, ksi); if (ret != 0) return (ret); signotify(td); /* * Defer further processing for signals which are held, * except that stopped processes must be continued by SIGCONT. */ if (action == SIG_HOLD && !((prop & SIGPROP_CONT) && (p->p_flag & P_STOPPED_SIG))) return (ret); /* SIGKILL: Remove procfs STOPEVENTs. */ if (sig == SIGKILL) { /* from procfs_ioctl.c: PIOCBIC */ p->p_stops = 0; /* from procfs_ioctl.c: PIOCCONT */ p->p_step = 0; wakeup(&p->p_step); } /* * Some signals have a process-wide effect and a per-thread * component. Most processing occurs when the process next * tries to cross the user boundary, however there are some * times when processing needs to be done immediately, such as * waking up threads so that they can cross the user boundary. * We try to do the per-process part here. */ if (P_SHOULDSTOP(p)) { KASSERT(!(p->p_flag & P_WEXIT), ("signal to stopped but exiting process")); if (sig == SIGKILL) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * SIGKILL sets process running. * It will die elsewhere. * All threads must be restarted. */ p->p_flag &= ~P_STOPPED_SIG; goto runfast; } if (prop & SIGPROP_CONT) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * If SIGCONT is default (or ignored), we continue the * process but don't leave the signal in sigqueue as * it has no further action. If SIGCONT is held, we * continue the process and leave the signal in * sigqueue. If the process catches SIGCONT, let it * handle the signal itself. If it isn't waiting on * an event, it goes back to run state. * Otherwise, process goes back to sleep state. */ p->p_flag &= ~P_STOPPED_SIG; PROC_SLOCK(p); if (p->p_numthreads == p->p_suspcount) { PROC_SUNLOCK(p); p->p_flag |= P_CONTINUED; p->p_xsig = SIGCONT; PROC_LOCK(p->p_pptr); childproc_continued(p); PROC_UNLOCK(p->p_pptr); PROC_SLOCK(p); } if (action == SIG_DFL) { thread_unsuspend(p); PROC_SUNLOCK(p); sigqueue_delete(sigqueue, sig); goto out; } if (action == SIG_CATCH) { /* * The process wants to catch it so it needs * to run at least one thread, but which one? */ PROC_SUNLOCK(p); goto runfast; } /* * The signal is not ignored or caught. */ thread_unsuspend(p); PROC_SUNLOCK(p); goto out; } if (prop & SIGPROP_STOP) { /* * If traced process is already stopped, * then no further action is necessary. */ if (p->p_flag & P_TRACED) goto out; /* * Already stopped, don't need to stop again * (If we did the shell could get confused). * Just make sure the signal STOP bit set. */ p->p_flag |= P_STOPPED_SIG; sigqueue_delete(sigqueue, sig); goto out; } /* * All other kinds of signals: * If a thread is sleeping interruptibly, simulate a * wakeup so that when it is continued it will be made * runnable and can look at the signal. However, don't make * the PROCESS runnable, leave it stopped. * It may run a bit until it hits a thread_suspend_check(). */ wakeup_swapper = 0; PROC_SLOCK(p); thread_lock(td); if (TD_ON_SLEEPQ(td) && (td->td_flags & TDF_SINTR)) wakeup_swapper = sleepq_abort(td, intrval); thread_unlock(td); PROC_SUNLOCK(p); if (wakeup_swapper) kick_proc0(); goto out; /* * Mutexes are short lived. Threads waiting on them will * hit thread_suspend_check() soon. */ } else if (p->p_state == PRS_NORMAL) { if (p->p_flag & P_TRACED || action == SIG_CATCH) { tdsigwakeup(td, sig, action, intrval); goto out; } MPASS(action == SIG_DFL); if (prop & SIGPROP_STOP) { if (p->p_flag & (P_PPWAIT|P_WEXIT)) goto out; p->p_flag |= P_STOPPED_SIG; p->p_xsig = sig; PROC_SLOCK(p); wakeup_swapper = sig_suspend_threads(td, p, 1); if (p->p_numthreads == p->p_suspcount) { /* * only thread sending signal to another * process can reach here, if thread is sending * signal to its process, because thread does * not suspend itself here, p_numthreads * should never be equal to p_suspcount. */ thread_stopped(p); PROC_SUNLOCK(p); sigqueue_delete_proc(p, p->p_xsig); } else PROC_SUNLOCK(p); if (wakeup_swapper) kick_proc0(); goto out; } } else { /* Not in "NORMAL" state. discard the signal. */ sigqueue_delete(sigqueue, sig); goto out; } /* * The process is not stopped so we need to apply the signal to all the * running threads. */ runfast: tdsigwakeup(td, sig, action, intrval); PROC_SLOCK(p); thread_unsuspend(p); PROC_SUNLOCK(p); out: /* If we jump here, proc slock should not be owned. */ PROC_SLOCK_ASSERT(p, MA_NOTOWNED); return (ret); } /* * The force of a signal has been directed against a single * thread. We need to see what we can do about knocking it * out of any sleep it may be in etc. */ static void tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval) { struct proc *p = td->td_proc; int prop; int wakeup_swapper; wakeup_swapper = 0; PROC_LOCK_ASSERT(p, MA_OWNED); prop = sigprop(sig); PROC_SLOCK(p); thread_lock(td); /* * Bring the priority of a thread up if we want it to get * killed in this lifetime. Be careful to avoid bumping the * priority of the idle thread, since we still allow to signal * kernel processes. */ if (action == SIG_DFL && (prop & SIGPROP_KILL) != 0 && td->td_priority > PUSER && !TD_IS_IDLETHREAD(td)) sched_prio(td, PUSER); if (TD_ON_SLEEPQ(td)) { /* * If thread is sleeping uninterruptibly * we can't interrupt the sleep... the signal will * be noticed when the process returns through * trap() or syscall(). */ if ((td->td_flags & TDF_SINTR) == 0) goto out; /* * If SIGCONT is default (or ignored) and process is * asleep, we are finished; the process should not * be awakened. */ if ((prop & SIGPROP_CONT) && action == SIG_DFL) { thread_unlock(td); PROC_SUNLOCK(p); sigqueue_delete(&p->p_sigqueue, sig); /* * It may be on either list in this state. * Remove from both for now. */ sigqueue_delete(&td->td_sigqueue, sig); return; } /* * Don't awaken a sleeping thread for SIGSTOP if the * STOP signal is deferred. */ if ((prop & SIGPROP_STOP) != 0 && (td->td_flags & (TDF_SBDRY | TDF_SERESTART | TDF_SEINTR)) == TDF_SBDRY) goto out; /* * Give low priority threads a better chance to run. */ if (td->td_priority > PUSER && !TD_IS_IDLETHREAD(td)) sched_prio(td, PUSER); wakeup_swapper = sleepq_abort(td, intrval); } else { /* * Other states do nothing with the signal immediately, * other than kicking ourselves if we are running. * It will either never be noticed, or noticed very soon. */ #ifdef SMP if (TD_IS_RUNNING(td) && td != curthread) forward_signal(td); #endif } out: PROC_SUNLOCK(p); thread_unlock(td); if (wakeup_swapper) kick_proc0(); } static int sig_suspend_threads(struct thread *td, struct proc *p, int sending) { struct thread *td2; int wakeup_swapper; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); MPASS(sending || td == curthread); wakeup_swapper = 0; FOREACH_THREAD_IN_PROC(p, td2) { thread_lock(td2); td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK; if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) && (td2->td_flags & TDF_SINTR)) { if (td2->td_flags & TDF_SBDRY) { /* * Once a thread is asleep with * TDF_SBDRY and without TDF_SERESTART * or TDF_SEINTR set, it should never * become suspended due to this check. */ KASSERT(!TD_IS_SUSPENDED(td2), ("thread with deferred stops suspended")); if (TD_SBDRY_INTR(td2)) wakeup_swapper |= sleepq_abort(td2, TD_SBDRY_ERRNO(td2)); } else if (!TD_IS_SUSPENDED(td2)) { thread_suspend_one(td2); } } else if (!TD_IS_SUSPENDED(td2)) { if (sending || td != td2) td2->td_flags |= TDF_ASTPENDING; #ifdef SMP if (TD_IS_RUNNING(td2) && td2 != td) forward_signal(td2); #endif } thread_unlock(td2); } return (wakeup_swapper); } /* * Stop the process for an event deemed interesting to the debugger. If si is * non-NULL, this is a signal exchange; the new signal requested by the * debugger will be returned for handling. If si is NULL, this is some other * type of interesting event. The debugger may request a signal be delivered in * that case as well, however it will be deferred until it can be handled. */ int ptracestop(struct thread *td, int sig, ksiginfo_t *si) { struct proc *p = td->td_proc; struct thread *td2; ksiginfo_t ksi; int prop; PROC_LOCK_ASSERT(p, MA_OWNED); KASSERT(!(p->p_flag & P_WEXIT), ("Stopping exiting process")); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Stopping for traced signal"); td->td_xsig = sig; if (si == NULL || (si->ksi_flags & KSI_PTRACE) == 0) { td->td_dbgflags |= TDB_XSIG; CTR4(KTR_PTRACE, "ptracestop: tid %d (pid %d) flags %#x sig %d", td->td_tid, p->p_pid, td->td_dbgflags, sig); PROC_SLOCK(p); while ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_XSIG)) { if (P_KILLED(p)) { /* * Ensure that, if we've been PT_KILLed, the * exit status reflects that. Another thread * may also be in ptracestop(), having just * received the SIGKILL, but this thread was * unsuspended first. */ td->td_dbgflags &= ~TDB_XSIG; td->td_xsig = SIGKILL; p->p_ptevents = 0; break; } if (p->p_flag & P_SINGLE_EXIT && !(td->td_dbgflags & TDB_EXIT)) { /* * Ignore ptrace stops except for thread exit * events when the process exits. */ td->td_dbgflags &= ~TDB_XSIG; PROC_SUNLOCK(p); return (0); } /* * Make wait(2) work. Ensure that right after the * attach, the thread which was decided to become the * leader of attach gets reported to the waiter. * Otherwise, just avoid overwriting another thread's * assignment to p_xthread. If another thread has * already set p_xthread, the current thread will get * a chance to report itself upon the next iteration. */ if ((td->td_dbgflags & TDB_FSTP) != 0 || ((p->p_flag2 & P2_PTRACE_FSTP) == 0 && p->p_xthread == NULL)) { p->p_xsig = sig; p->p_xthread = td; td->td_dbgflags &= ~TDB_FSTP; p->p_flag2 &= ~P2_PTRACE_FSTP; p->p_flag |= P_STOPPED_SIG | P_STOPPED_TRACE; sig_suspend_threads(td, p, 0); } if ((td->td_dbgflags & TDB_STOPATFORK) != 0) { td->td_dbgflags &= ~TDB_STOPATFORK; } stopme: thread_suspend_switch(td, p); if (p->p_xthread == td) p->p_xthread = NULL; if (!(p->p_flag & P_TRACED)) break; if (td->td_dbgflags & TDB_SUSPEND) { if (p->p_flag & P_SINGLE_EXIT) break; goto stopme; } } PROC_SUNLOCK(p); } if (si != NULL && sig == td->td_xsig) { /* Parent wants us to take the original signal unchanged. */ si->ksi_flags |= KSI_HEAD; if (sigqueue_add(&td->td_sigqueue, sig, si) != 0) si->ksi_signo = 0; } else if (td->td_xsig != 0) { /* * If parent wants us to take a new signal, then it will leave * it in td->td_xsig; otherwise we just look for signals again. */ ksiginfo_init(&ksi); ksi.ksi_signo = td->td_xsig; ksi.ksi_flags |= KSI_PTRACE; prop = sigprop(td->td_xsig); td2 = sigtd(p, td->td_xsig, prop); tdsendsignal(p, td2, td->td_xsig, &ksi); if (td != td2) return (0); } return (td->td_xsig); } static void reschedule_signals(struct proc *p, sigset_t block, int flags) { struct sigacts *ps; struct thread *td; int sig; PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, (flags & SIGPROCMASK_PS_LOCKED) != 0 ? MA_OWNED : MA_NOTOWNED); if (SIGISEMPTY(p->p_siglist)) return; SIGSETAND(block, p->p_siglist); while ((sig = sig_ffs(&block)) != 0) { SIGDELSET(block, sig); td = sigtd(p, sig, 0); signotify(td); if (!(flags & SIGPROCMASK_PS_LOCKED)) mtx_lock(&ps->ps_mtx); if (p->p_flag & P_TRACED || (SIGISMEMBER(ps->ps_sigcatch, sig) && !SIGISMEMBER(td->td_sigmask, sig))) tdsigwakeup(td, sig, SIG_CATCH, (SIGISMEMBER(ps->ps_sigintr, sig) ? EINTR : ERESTART)); if (!(flags & SIGPROCMASK_PS_LOCKED)) mtx_unlock(&ps->ps_mtx); } } void tdsigcleanup(struct thread *td) { struct proc *p; sigset_t unblocked; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); sigqueue_flush(&td->td_sigqueue); if (p->p_numthreads == 1) return; /* * Since we cannot handle signals, notify signal post code * about this by filling the sigmask. * * Also, if needed, wake up thread(s) that do not block the * same signals as the exiting thread, since the thread might * have been selected for delivery and woken up. */ SIGFILLSET(unblocked); SIGSETNAND(unblocked, td->td_sigmask); SIGFILLSET(td->td_sigmask); reschedule_signals(p, unblocked, 0); } static int sigdeferstop_curr_flags(int cflags) { MPASS((cflags & (TDF_SEINTR | TDF_SERESTART)) == 0 || (cflags & TDF_SBDRY) != 0); return (cflags & (TDF_SBDRY | TDF_SEINTR | TDF_SERESTART)); } /* * Defer the delivery of SIGSTOP for the current thread, according to * the requested mode. Returns previous flags, which must be restored * by sigallowstop(). * * TDF_SBDRY, TDF_SEINTR, and TDF_SERESTART flags are only set and * cleared by the current thread, which allow the lock-less read-only * accesses below. */ int sigdeferstop_impl(int mode) { struct thread *td; int cflags, nflags; td = curthread; cflags = sigdeferstop_curr_flags(td->td_flags); switch (mode) { case SIGDEFERSTOP_NOP: nflags = cflags; break; case SIGDEFERSTOP_OFF: nflags = 0; break; case SIGDEFERSTOP_SILENT: nflags = (cflags | TDF_SBDRY) & ~(TDF_SEINTR | TDF_SERESTART); break; case SIGDEFERSTOP_EINTR: nflags = (cflags | TDF_SBDRY | TDF_SEINTR) & ~TDF_SERESTART; break; case SIGDEFERSTOP_ERESTART: nflags = (cflags | TDF_SBDRY | TDF_SERESTART) & ~TDF_SEINTR; break; default: panic("sigdeferstop: invalid mode %x", mode); break; } if (cflags == nflags) return (SIGDEFERSTOP_VAL_NCHG); thread_lock(td); td->td_flags = (td->td_flags & ~cflags) | nflags; thread_unlock(td); return (cflags); } /* * Restores the STOP handling mode, typically permitting the delivery * of SIGSTOP for the current thread. This does not immediately * suspend if a stop was posted. Instead, the thread will suspend * either via ast() or a subsequent interruptible sleep. */ void sigallowstop_impl(int prev) { struct thread *td; int cflags; KASSERT(prev != SIGDEFERSTOP_VAL_NCHG, ("failed sigallowstop")); KASSERT((prev & ~(TDF_SBDRY | TDF_SEINTR | TDF_SERESTART)) == 0, ("sigallowstop: incorrect previous mode %x", prev)); td = curthread; cflags = sigdeferstop_curr_flags(td->td_flags); if (cflags != prev) { thread_lock(td); td->td_flags = (td->td_flags & ~cflags) | prev; thread_unlock(td); } } /* * If the current process has received a signal (should be caught or cause * termination, should interrupt current syscall), return the signal number. * Stop signals with default action are processed immediately, then cleared; * they aren't returned. This is checked after each entry to the system for * a syscall or trap (though this can usually be done without calling issignal * by checking the pending signal masks in cursig.) The normal call * sequence is * * while (sig = cursig(curthread)) * postsig(sig); */ static int issignal(struct thread *td) { struct proc *p; struct sigacts *ps; struct sigqueue *queue; sigset_t sigpending; ksiginfo_t ksi; int prop, sig, traced; p = td->td_proc; ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); PROC_LOCK_ASSERT(p, MA_OWNED); for (;;) { traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG); sigpending = td->td_sigqueue.sq_signals; SIGSETOR(sigpending, p->p_sigqueue.sq_signals); SIGSETNAND(sigpending, td->td_sigmask); if ((p->p_flag & P_PPWAIT) != 0 || (td->td_flags & (TDF_SBDRY | TDF_SERESTART | TDF_SEINTR)) == TDF_SBDRY) SIG_STOPSIGMASK(sigpending); if (SIGISEMPTY(sigpending)) /* no signal to send */ return (0); if ((p->p_flag & (P_TRACED | P_PPTRACE)) == P_TRACED && (p->p_flag2 & P2_PTRACE_FSTP) != 0 && SIGISMEMBER(sigpending, SIGSTOP)) { /* * If debugger just attached, always consume * SIGSTOP from ptrace(PT_ATTACH) first, to * execute the debugger attach ritual in * order. */ sig = SIGSTOP; td->td_dbgflags |= TDB_FSTP; } else { sig = sig_ffs(&sigpending); } if (p->p_stops & S_SIG) { mtx_unlock(&ps->ps_mtx); stopevent(p, S_SIG, sig); mtx_lock(&ps->ps_mtx); } /* * We should see pending but ignored signals * only if P_TRACED was on when they were posted. */ if (SIGISMEMBER(ps->ps_sigignore, sig) && (traced == 0)) { sigqueue_delete(&td->td_sigqueue, sig); sigqueue_delete(&p->p_sigqueue, sig); continue; } if ((p->p_flag & (P_TRACED | P_PPTRACE)) == P_TRACED) { /* * If traced, always stop. * Remove old signal from queue before the stop. * XXX shrug off debugger, it causes siginfo to * be thrown away. */ queue = &td->td_sigqueue; ksiginfo_init(&ksi); if (sigqueue_get(queue, sig, &ksi) == 0) { queue = &p->p_sigqueue; sigqueue_get(queue, sig, &ksi); } td->td_si = ksi.ksi_info; mtx_unlock(&ps->ps_mtx); sig = ptracestop(td, sig, &ksi); mtx_lock(&ps->ps_mtx); /* * Keep looking if the debugger discarded or * replaced the signal. */ if (sig == 0) continue; /* * If the signal became masked, re-queue it. */ if (SIGISMEMBER(td->td_sigmask, sig)) { ksi.ksi_flags |= KSI_HEAD; sigqueue_add(&p->p_sigqueue, sig, &ksi); continue; } /* * If the traced bit got turned off, requeue * the signal and go back up to the top to * rescan signals. This ensures that p_sig* * and p_sigact are consistent. */ if ((p->p_flag & P_TRACED) == 0) { ksi.ksi_flags |= KSI_HEAD; sigqueue_add(queue, sig, &ksi); continue; } } prop = sigprop(sig); /* * Decide whether the signal should be returned. * Return the signal's number, or fall through * to clear it from the pending mask. */ switch ((intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) { case (intptr_t)SIG_DFL: /* * Don't take default actions on system processes. */ if (p->p_pid <= 1) { #ifdef DIAGNOSTIC /* * Are you sure you want to ignore SIGSEGV * in init? XXX */ printf("Process (pid %lu) got signal %d\n", (u_long)p->p_pid, sig); #endif break; /* == ignore */ } /* * If there is a pending stop signal to process with * default action, stop here, then clear the signal. * Traced or exiting processes should ignore stops. * Additionally, a member of an orphaned process group * should ignore tty stops. */ if (prop & SIGPROP_STOP) { if (p->p_flag & (P_TRACED | P_WEXIT | P_SINGLE_EXIT) || (p->p_pgrp->pg_jobc == 0 && prop & SIGPROP_TTYSTOP)) break; /* == ignore */ if (TD_SBDRY_INTR(td)) { KASSERT((td->td_flags & TDF_SBDRY) != 0, ("lost TDF_SBDRY")); return (-1); } mtx_unlock(&ps->ps_mtx); WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &p->p_mtx.lock_object, "Catching SIGSTOP"); sigqueue_delete(&td->td_sigqueue, sig); sigqueue_delete(&p->p_sigqueue, sig); p->p_flag |= P_STOPPED_SIG; p->p_xsig = sig; PROC_SLOCK(p); sig_suspend_threads(td, p, 0); thread_suspend_switch(td, p); PROC_SUNLOCK(p); mtx_lock(&ps->ps_mtx); goto next; } else if (prop & SIGPROP_IGNORE) { /* * Except for SIGCONT, shouldn't get here. * Default action is to ignore; drop it. */ break; /* == ignore */ } else return (sig); /*NOTREACHED*/ case (intptr_t)SIG_IGN: /* * Masking above should prevent us ever trying * to take action on an ignored signal other * than SIGCONT, unless process is traced. */ if ((prop & SIGPROP_CONT) == 0 && (p->p_flag & P_TRACED) == 0) printf("issignal\n"); break; /* == ignore */ default: /* * This signal has an action, let * postsig() process it. */ return (sig); } sigqueue_delete(&td->td_sigqueue, sig); /* take the signal! */ sigqueue_delete(&p->p_sigqueue, sig); next:; } /* NOTREACHED */ } void thread_stopped(struct proc *p) { int n; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_SLOCK_ASSERT(p, MA_OWNED); n = p->p_suspcount; if (p == curproc) n++; if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) { PROC_SUNLOCK(p); p->p_flag &= ~P_WAITED; PROC_LOCK(p->p_pptr); childproc_stopped(p, (p->p_flag & P_TRACED) ? CLD_TRAPPED : CLD_STOPPED); PROC_UNLOCK(p->p_pptr); PROC_SLOCK(p); } } /* * Take the action for the specified signal * from the current set of pending signals. */ int postsig(int sig) { struct thread *td; struct proc *p; struct sigacts *ps; sig_t action; ksiginfo_t ksi; sigset_t returnmask; KASSERT(sig != 0, ("postsig")); td = curthread; p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); ps = p->p_sigacts; mtx_assert(&ps->ps_mtx, MA_OWNED); ksiginfo_init(&ksi); if (sigqueue_get(&td->td_sigqueue, sig, &ksi) == 0 && sigqueue_get(&p->p_sigqueue, sig, &ksi) == 0) return (0); ksi.ksi_signo = sig; if (ksi.ksi_code == SI_TIMER) itimer_accept(p, ksi.ksi_timerid, &ksi); action = ps->ps_sigact[_SIG_IDX(sig)]; #ifdef KTRACE if (KTRPOINT(td, KTR_PSIG)) ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ? &td->td_oldsigmask : &td->td_sigmask, ksi.ksi_code); #endif if ((p->p_stops & S_SIG) != 0) { mtx_unlock(&ps->ps_mtx); stopevent(p, S_SIG, sig); mtx_lock(&ps->ps_mtx); } if (action == SIG_DFL) { /* * Default action, where the default is to kill * the process. (Other cases were ignored above.) */ mtx_unlock(&ps->ps_mtx); proc_td_siginfo_capture(td, &ksi.ksi_info); sigexit(td, sig); /* NOTREACHED */ } else { /* * If we get here, the signal must be caught. */ KASSERT(action != SIG_IGN, ("postsig action %p", action)); KASSERT(!SIGISMEMBER(td->td_sigmask, sig), ("postsig action: blocked sig %d", sig)); /* * Set the new mask value and also defer further * occurrences of this signal. * * Special case: user has done a sigsuspend. Here the * current mask is not of interest, but rather the * mask from before the sigsuspend is what we want * restored after the signal processing is completed. */ if (td->td_pflags & TDP_OLDMASK) { returnmask = td->td_oldsigmask; td->td_pflags &= ~TDP_OLDMASK; } else returnmask = td->td_sigmask; if (p->p_sig == sig) { p->p_code = 0; p->p_sig = 0; } (*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask); postsig_done(sig, td, ps); } return (1); } /* * Kill the current process for stated reason. */ void killproc(struct proc *p, char *why) { PROC_LOCK_ASSERT(p, MA_OWNED); CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)", p, p->p_pid, p->p_comm); log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm, p->p_ucred ? p->p_ucred->cr_uid : -1, why); p->p_flag |= P_WKILLED; kern_psignal(p, SIGKILL); } /* * Force the current process to exit with the specified signal, dumping core * if appropriate. We bypass the normal tests for masked and caught signals, * allowing unrecoverable failures to terminate the process without changing * signal state. Mark the accounting record with the signal termination. * If dumping core, save the signal number for the debugger. Calls exit and * does not return. */ void sigexit(struct thread *td, int sig) { struct proc *p = td->td_proc; PROC_LOCK_ASSERT(p, MA_OWNED); p->p_acflag |= AXSIG; /* * We must be single-threading to generate a core dump. This * ensures that the registers in the core file are up-to-date. * Also, the ELF dump handler assumes that the thread list doesn't * change out from under it. * * XXX If another thread attempts to single-thread before us * (e.g. via fork()), we won't get a dump at all. */ if ((sigprop(sig) & SIGPROP_CORE) && thread_single(p, SINGLE_NO_EXIT) == 0) { p->p_sig = sig; /* * Log signals which would cause core dumps * (Log as LOG_INFO to appease those who don't want * these messages.) * XXX : Todo, as well as euid, write out ruid too * Note that coredump() drops proc lock. */ if (coredump(td) == 0) sig |= WCOREFLAG; if (kern_logsigexit) log(LOG_INFO, "pid %d (%s), uid %d: exited on signal %d%s\n", p->p_pid, p->p_comm, td->td_ucred ? td->td_ucred->cr_uid : -1, sig &~ WCOREFLAG, sig & WCOREFLAG ? " (core dumped)" : ""); } else PROC_UNLOCK(p); exit1(td, 0, sig); /* NOTREACHED */ } /* * Send queued SIGCHLD to parent when child process's state * is changed. */ static void sigparent(struct proc *p, int reason, int status) { PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); if (p->p_ksi != NULL) { p->p_ksi->ksi_signo = SIGCHLD; p->p_ksi->ksi_code = reason; p->p_ksi->ksi_status = status; p->p_ksi->ksi_pid = p->p_pid; p->p_ksi->ksi_uid = p->p_ucred->cr_ruid; if (KSI_ONQ(p->p_ksi)) return; } pksignal(p->p_pptr, SIGCHLD, p->p_ksi); } static void childproc_jobstate(struct proc *p, int reason, int sig) { struct sigacts *ps; PROC_LOCK_ASSERT(p, MA_OWNED); PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED); /* * Wake up parent sleeping in kern_wait(), also send * SIGCHLD to parent, but SIGCHLD does not guarantee * that parent will awake, because parent may masked * the signal. */ p->p_pptr->p_flag |= P_STATCHILD; wakeup(p->p_pptr); ps = p->p_pptr->p_sigacts; mtx_lock(&ps->ps_mtx); if ((ps->ps_flag & PS_NOCLDSTOP) == 0) { mtx_unlock(&ps->ps_mtx); sigparent(p, reason, sig); } else mtx_unlock(&ps->ps_mtx); } void childproc_stopped(struct proc *p, int reason) { childproc_jobstate(p, reason, p->p_xsig); } void childproc_continued(struct proc *p) { childproc_jobstate(p, CLD_CONTINUED, SIGCONT); } void childproc_exited(struct proc *p) { int reason, status; if (WCOREDUMP(p->p_xsig)) { reason = CLD_DUMPED; status = WTERMSIG(p->p_xsig); } else if (WIFSIGNALED(p->p_xsig)) { reason = CLD_KILLED; status = WTERMSIG(p->p_xsig); } else { reason = CLD_EXITED; status = p->p_xexit; } /* * XXX avoid calling wakeup(p->p_pptr), the work is * done in exit1(). */ sigparent(p, reason, status); } #define MAX_NUM_CORE_FILES 100000 #ifndef NUM_CORE_FILES #define NUM_CORE_FILES 5 #endif CTASSERT(NUM_CORE_FILES >= 0 && NUM_CORE_FILES <= MAX_NUM_CORE_FILES); static int num_cores = NUM_CORE_FILES; static int sysctl_debug_num_cores_check (SYSCTL_HANDLER_ARGS) { int error; int new_val; new_val = num_cores; error = sysctl_handle_int(oidp, &new_val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (new_val > MAX_NUM_CORE_FILES) new_val = MAX_NUM_CORE_FILES; if (new_val < 0) new_val = 0; num_cores = new_val; return (0); } SYSCTL_PROC(_debug, OID_AUTO, ncores, CTLTYPE_INT|CTLFLAG_RW, 0, sizeof(int), sysctl_debug_num_cores_check, "I", "Maximum number of generated process corefiles while using index format"); #define GZIP_SUFFIX ".gz" #define ZSTD_SUFFIX ".zst" int compress_user_cores = 0; static int sysctl_compress_user_cores(SYSCTL_HANDLER_ARGS) { int error, val; val = compress_user_cores; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (val != 0 && !compressor_avail(val)) return (EINVAL); compress_user_cores = val; return (error); } SYSCTL_PROC(_kern, OID_AUTO, compress_user_cores, CTLTYPE_INT | CTLFLAG_RWTUN, 0, sizeof(int), sysctl_compress_user_cores, "I", "Enable compression of user corefiles (" __XSTRING(COMPRESS_GZIP) " = gzip, " __XSTRING(COMPRESS_ZSTD) " = zstd)"); int compress_user_cores_level = 6; SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_level, CTLFLAG_RWTUN, &compress_user_cores_level, 0, "Corefile compression level"); /* * Protect the access to corefilename[] by allproc_lock. */ #define corefilename_lock allproc_lock static char corefilename[MAXPATHLEN] = {"%N.core"}; TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename)); static int sysctl_kern_corefile(SYSCTL_HANDLER_ARGS) { int error; sx_xlock(&corefilename_lock); error = sysctl_handle_string(oidp, corefilename, sizeof(corefilename), req); sx_xunlock(&corefilename_lock); return (error); } SYSCTL_PROC(_kern, OID_AUTO, corefile, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_corefile, "A", "Process corefile name format string"); static void vnode_close_locked(struct thread *td, struct vnode *vp) { VOP_UNLOCK(vp, 0); vn_close(vp, FWRITE, td->td_ucred, td); } /* * If the core format has a %I in it, then we need to check * for existing corefiles before defining a name. * To do this we iterate over 0..ncores to find a * non-existing core file name to use. If all core files are * already used we choose the oldest one. */ static int corefile_open_last(struct thread *td, char *name, int indexpos, int indexlen, int ncores, struct vnode **vpp) { struct vnode *oldvp, *nextvp, *vp; struct vattr vattr; struct nameidata nd; int error, i, flags, oflags, cmode; char ch; struct timespec lasttime; nextvp = oldvp = NULL; cmode = S_IRUSR | S_IWUSR; oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE | (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0); for (i = 0; i < ncores; i++) { flags = O_CREAT | FWRITE | O_NOFOLLOW; ch = name[indexpos + indexlen]; (void)snprintf(name + indexpos, indexlen + 1, "%.*u", indexlen, i); name[indexpos + indexlen] = ch; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td); error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, NULL); if (error != 0) break; vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if ((flags & O_CREAT) == O_CREAT) { nextvp = vp; break; } error = VOP_GETATTR(vp, &vattr, td->td_ucred); if (error != 0) { vnode_close_locked(td, vp); break; } if (oldvp == NULL || lasttime.tv_sec > vattr.va_mtime.tv_sec || (lasttime.tv_sec == vattr.va_mtime.tv_sec && lasttime.tv_nsec >= vattr.va_mtime.tv_nsec)) { if (oldvp != NULL) vnode_close_locked(td, oldvp); oldvp = vp; lasttime = vattr.va_mtime; } else { vnode_close_locked(td, vp); } } if (oldvp != NULL) { if (nextvp == NULL) nextvp = oldvp; else vnode_close_locked(td, oldvp); } if (error != 0) { if (nextvp != NULL) vnode_close_locked(td, oldvp); } else { *vpp = nextvp; } return (error); } /* * corefile_open(comm, uid, pid, td, compress, vpp, namep) * Expand the name described in corefilename, using name, uid, and pid * and open/create core file. * corefilename is a printf-like string, with three format specifiers: * %N name of process ("name") * %P process id (pid) * %U user id (uid) * For example, "%N.core" is the default; they can be disabled completely * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P". * This is controlled by the sysctl variable kern.corefile (see above). */ static int corefile_open(const char *comm, uid_t uid, pid_t pid, struct thread *td, int compress, struct vnode **vpp, char **namep) { struct sbuf sb; struct nameidata nd; const char *format; char *hostname, *name; int cmode, error, flags, i, indexpos, indexlen, oflags, ncores; hostname = NULL; format = corefilename; name = malloc(MAXPATHLEN, M_TEMP, M_WAITOK | M_ZERO); indexlen = 0; indexpos = -1; ncores = num_cores; (void)sbuf_new(&sb, name, MAXPATHLEN, SBUF_FIXEDLEN); sx_slock(&corefilename_lock); for (i = 0; format[i] != '\0'; i++) { switch (format[i]) { case '%': /* Format character */ i++; switch (format[i]) { case '%': sbuf_putc(&sb, '%'); break; case 'H': /* hostname */ if (hostname == NULL) { hostname = malloc(MAXHOSTNAMELEN, M_TEMP, M_WAITOK); } getcredhostname(td->td_ucred, hostname, MAXHOSTNAMELEN); sbuf_printf(&sb, "%s", hostname); break; case 'I': /* autoincrementing index */ if (indexpos != -1) { sbuf_printf(&sb, "%%I"); break; } indexpos = sbuf_len(&sb); sbuf_printf(&sb, "%u", ncores - 1); indexlen = sbuf_len(&sb) - indexpos; break; case 'N': /* process name */ sbuf_printf(&sb, "%s", comm); break; case 'P': /* process id */ sbuf_printf(&sb, "%u", pid); break; case 'U': /* user id */ sbuf_printf(&sb, "%u", uid); break; default: log(LOG_ERR, "Unknown format character %c in " "corename `%s'\n", format[i], format); break; } break; default: sbuf_putc(&sb, format[i]); break; } } sx_sunlock(&corefilename_lock); free(hostname, M_TEMP); if (compress == COMPRESS_GZIP) sbuf_printf(&sb, GZIP_SUFFIX); else if (compress == COMPRESS_ZSTD) sbuf_printf(&sb, ZSTD_SUFFIX); if (sbuf_error(&sb) != 0) { log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too " "long\n", (long)pid, comm, (u_long)uid); sbuf_delete(&sb); free(name, M_TEMP); return (ENOMEM); } sbuf_finish(&sb); sbuf_delete(&sb); if (indexpos != -1) { error = corefile_open_last(td, name, indexpos, indexlen, ncores, vpp); if (error != 0) { log(LOG_ERR, "pid %d (%s), uid (%u): Path `%s' failed " "on initial open test, error = %d\n", pid, comm, uid, name, error); } } else { cmode = S_IRUSR | S_IWUSR; oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE | (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0); flags = O_CREAT | FWRITE | O_NOFOLLOW; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td); error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred, NULL); if (error == 0) { *vpp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); } } if (error != 0) { #ifdef AUDIT audit_proc_coredump(td, name, error); #endif free(name, M_TEMP); return (error); } *namep = name; return (0); } /* * Dump a process' core. The main routine does some * policy checking, and creates the name of the coredump; * then it passes on a vnode and a size limit to the process-specific * coredump routine if there is one; if there _is not_ one, it returns * ENOSYS; otherwise it returns the error from the process-specific routine. */ static int coredump(struct thread *td) { struct proc *p = td->td_proc; struct ucred *cred = td->td_ucred; struct vnode *vp; struct flock lf; struct vattr vattr; int error, error1, locked; char *name; /* name of corefile */ void *rl_cookie; off_t limit; char *fullpath, *freepath = NULL; struct sbuf *sb; PROC_LOCK_ASSERT(p, MA_OWNED); MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td); _STOPEVENT(p, S_CORE, 0); if (!do_coredump || (!sugid_coredump && (p->p_flag & P_SUGID) != 0) || (p->p_flag2 & P2_NOTRACE) != 0) { PROC_UNLOCK(p); return (EFAULT); } /* * Note that the bulk of limit checking is done after * the corefile is created. The exception is if the limit * for corefiles is 0, in which case we don't bother * creating the corefile at all. This layout means that * a corefile is truncated instead of not being created, * if it is larger than the limit. */ limit = (off_t)lim_cur(td, RLIMIT_CORE); if (limit == 0 || racct_get_available(p, RACCT_CORE) == 0) { PROC_UNLOCK(p); return (EFBIG); } PROC_UNLOCK(p); error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td, compress_user_cores, &vp, &name); if (error != 0) return (error); /* * Don't dump to non-regular files or files with links. * Do not dump into system files. */ if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred) != 0 || vattr.va_nlink != 1 || (vp->v_vflag & VV_SYSTEM) != 0) { VOP_UNLOCK(vp, 0); error = EFAULT; goto out; } VOP_UNLOCK(vp, 0); /* Postpone other writers, including core dumps of other processes. */ rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX); lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_WRLCK; locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0); VATTR_NULL(&vattr); vattr.va_size = 0; if (set_core_nodump_flag) vattr.va_flags = UF_NODUMP; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); VOP_SETATTR(vp, &vattr, cred); VOP_UNLOCK(vp, 0); PROC_LOCK(p); p->p_acflag |= ACORE; PROC_UNLOCK(p); if (p->p_sysent->sv_coredump != NULL) { error = p->p_sysent->sv_coredump(td, vp, limit, 0); } else { error = ENOSYS; } if (locked) { lf.l_type = F_UNLCK; VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK); } vn_rangelock_unlock(vp, rl_cookie); /* * Notify the userland helper that a process triggered a core dump. * This allows the helper to run an automated debugging session. */ if (error != 0 || coredump_devctl == 0) goto out; sb = sbuf_new_auto(); if (vn_fullpath_global(td, p->p_textvp, &fullpath, &freepath) != 0) goto out2; sbuf_printf(sb, "comm=\""); devctl_safe_quote_sb(sb, fullpath); free(freepath, M_TEMP); sbuf_printf(sb, "\" core=\""); /* * We can't lookup core file vp directly. When we're replacing a core, and * other random times, we flush the name cache, so it will fail. Instead, * if the path of the core is relative, add the current dir in front if it. */ if (name[0] != '/') { fullpath = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); if (kern___getcwd(td, fullpath, UIO_SYSSPACE, MAXPATHLEN, MAXPATHLEN) != 0) { free(fullpath, M_TEMP); goto out2; } devctl_safe_quote_sb(sb, fullpath); free(fullpath, M_TEMP); sbuf_putc(sb, '/'); } devctl_safe_quote_sb(sb, name); sbuf_printf(sb, "\""); if (sbuf_finish(sb) == 0) devctl_notify("kernel", "signal", "coredump", sbuf_data(sb)); out2: sbuf_delete(sb); out: error1 = vn_close(vp, FWRITE, cred, td); if (error == 0) error = error1; #ifdef AUDIT audit_proc_coredump(td, name, error); #endif free(name, M_TEMP); return (error); } /* * Nonexistent system call-- signal process (may want to handle it). Flag * error in case process won't see signal immediately (blocked or ignored). */ #ifndef _SYS_SYSPROTO_H_ struct nosys_args { int dummy; }; #endif /* ARGSUSED */ int nosys(struct thread *td, struct nosys_args *args) { struct proc *p; p = td->td_proc; PROC_LOCK(p); tdsignal(td, SIGSYS); PROC_UNLOCK(p); if (kern_lognosys == 1 || kern_lognosys == 3) { uprintf("pid %d comm %s: nosys %d\n", p->p_pid, p->p_comm, td->td_sa.code); } if (kern_lognosys == 2 || kern_lognosys == 3) { printf("pid %d comm %s: nosys %d\n", p->p_pid, p->p_comm, td->td_sa.code); } return (ENOSYS); } /* * Send a SIGIO or SIGURG signal to a process or process group using stored * credentials rather than those of the current process. */ void pgsigio(struct sigio **sigiop, int sig, int checkctty) { ksiginfo_t ksi; struct sigio *sigio; ksiginfo_init(&ksi); ksi.ksi_signo = sig; ksi.ksi_code = SI_KERNEL; SIGIO_LOCK(); sigio = *sigiop; if (sigio == NULL) { SIGIO_UNLOCK(); return; } if (sigio->sio_pgid > 0) { PROC_LOCK(sigio->sio_proc); if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred)) kern_psignal(sigio->sio_proc, sig); PROC_UNLOCK(sigio->sio_proc); } else if (sigio->sio_pgid < 0) { struct proc *p; PGRP_LOCK(sigio->sio_pgrp); LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) { PROC_LOCK(p); if (p->p_state == PRS_NORMAL && CANSIGIO(sigio->sio_ucred, p->p_ucred) && (checkctty == 0 || (p->p_flag & P_CONTROLT))) kern_psignal(p, sig); PROC_UNLOCK(p); } PGRP_UNLOCK(sigio->sio_pgrp); } SIGIO_UNLOCK(); } static int filt_sigattach(struct knote *kn) { struct proc *p = curproc; kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ knlist_add(p->p_klist, kn, 0); return (0); } static void filt_sigdetach(struct knote *kn) { struct proc *p = kn->kn_ptr.p_proc; knlist_remove(p->p_klist, kn, 0); } /* * signal knotes are shared with proc knotes, so we apply a mask to * the hint in order to differentiate them from process hints. This * could be avoided by using a signal-specific knote list, but probably * isn't worth the trouble. */ static int filt_signal(struct knote *kn, long hint) { if (hint & NOTE_SIGNAL) { hint &= ~NOTE_SIGNAL; if (kn->kn_id == hint) kn->kn_data++; } return (kn->kn_data != 0); } struct sigacts * sigacts_alloc(void) { struct sigacts *ps; ps = malloc(sizeof(struct sigacts), M_SUBPROC, M_WAITOK | M_ZERO); refcount_init(&ps->ps_refcnt, 1); mtx_init(&ps->ps_mtx, "sigacts", NULL, MTX_DEF); return (ps); } void sigacts_free(struct sigacts *ps) { if (refcount_release(&ps->ps_refcnt) == 0) return; mtx_destroy(&ps->ps_mtx); free(ps, M_SUBPROC); } struct sigacts * sigacts_hold(struct sigacts *ps) { refcount_acquire(&ps->ps_refcnt); return (ps); } void sigacts_copy(struct sigacts *dest, struct sigacts *src) { KASSERT(dest->ps_refcnt == 1, ("sigacts_copy to shared dest")); mtx_lock(&src->ps_mtx); bcopy(src, dest, offsetof(struct sigacts, ps_refcnt)); mtx_unlock(&src->ps_mtx); } int sigacts_shared(struct sigacts *ps) { return (ps->ps_refcnt > 1); } Index: head/sys/kern/kern_tc.c =================================================================== --- head/sys/kern/kern_tc.c (revision 336913) +++ head/sys/kern/kern_tc.c (revision 336914) @@ -1,2202 +1,2202 @@ /*- * SPDX-License-Identifier: Beerware * * ---------------------------------------------------------------------------- * "THE BEER-WARE LICENSE" (Revision 42): * wrote this file. As long as you retain this notice you * can do whatever you want with this stuff. If we meet some day, and you think * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp * ---------------------------------------------------------------------------- * * Copyright (c) 2011, 2015, 2016 The FreeBSD Foundation * All rights reserved. * * Portions of this software were developed by Julien Ridoux at the University * of Melbourne under sponsorship from the FreeBSD Foundation. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. */ #include __FBSDID("$FreeBSD$"); #include "opt_ntp.h" #include "opt_ffclock.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * A large step happens on boot. This constant detects such steps. * It is relatively small so that ntp_update_second gets called enough * in the typical 'missed a couple of seconds' case, but doesn't loop * forever when the time step is large. */ #define LARGE_STEP 200 /* * Implement a dummy timecounter which we can use until we get a real one * in the air. This allows the console and other early stuff to use * time services. */ static u_int dummy_get_timecount(struct timecounter *tc) { static u_int now; return (++now); } static struct timecounter dummy_timecounter = { dummy_get_timecount, 0, ~0u, 1000000, "dummy", -1000000 }; struct timehands { /* These fields must be initialized by the driver. */ struct timecounter *th_counter; int64_t th_adjustment; uint64_t th_scale; u_int th_offset_count; struct bintime th_offset; struct bintime th_bintime; struct timeval th_microtime; struct timespec th_nanotime; struct bintime th_boottime; /* Fields not to be copied in tc_windup start with th_generation. */ u_int th_generation; struct timehands *th_next; }; static struct timehands th0; static struct timehands th1 = { .th_next = &th0 }; static struct timehands th0 = { .th_counter = &dummy_timecounter, .th_scale = (uint64_t)-1 / 1000000, .th_offset = { .sec = 1 }, .th_generation = 1, .th_next = &th1 }; static struct timehands *volatile timehands = &th0; struct timecounter *timecounter = &dummy_timecounter; static struct timecounter *timecounters = &dummy_timecounter; int tc_min_ticktock_freq = 1; volatile time_t time_second = 1; volatile time_t time_uptime = 1; static int sysctl_kern_boottime(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_kern, KERN_BOOTTIME, boottime, CTLTYPE_STRUCT|CTLFLAG_RD, NULL, 0, sysctl_kern_boottime, "S,timeval", "System boottime"); SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW, 0, ""); static SYSCTL_NODE(_kern_timecounter, OID_AUTO, tc, CTLFLAG_RW, 0, ""); static int timestepwarnings; SYSCTL_INT(_kern_timecounter, OID_AUTO, stepwarnings, CTLFLAG_RW, ×tepwarnings, 0, "Log time steps"); struct bintime bt_timethreshold; struct bintime bt_tickthreshold; sbintime_t sbt_timethreshold; sbintime_t sbt_tickthreshold; struct bintime tc_tick_bt; sbintime_t tc_tick_sbt; int tc_precexp; int tc_timepercentage = TC_DEFAULTPERC; static int sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_kern_timecounter, OID_AUTO, alloweddeviation, CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0, sysctl_kern_timecounter_adjprecision, "I", "Allowed time interval deviation in percents"); volatile int rtc_generation = 1; static int tc_chosen; /* Non-zero if a specific tc was chosen via sysctl. */ static void tc_windup(struct bintime *new_boottimebin); static void cpu_tick_calibrate(int); void dtrace_getnanotime(struct timespec *tsp); static int sysctl_kern_boottime(SYSCTL_HANDLER_ARGS) { struct timeval boottime; getboottime(&boottime); #ifndef __mips__ #ifdef SCTL_MASK32 int tv[2]; if (req->flags & SCTL_MASK32) { tv[0] = boottime.tv_sec; tv[1] = boottime.tv_usec; return (SYSCTL_OUT(req, tv, sizeof(tv))); } #endif #endif return (SYSCTL_OUT(req, &boottime, sizeof(boottime))); } static int sysctl_kern_timecounter_get(SYSCTL_HANDLER_ARGS) { u_int ncount; struct timecounter *tc = arg1; ncount = tc->tc_get_timecount(tc); return (sysctl_handle_int(oidp, &ncount, 0, req)); } static int sysctl_kern_timecounter_freq(SYSCTL_HANDLER_ARGS) { uint64_t freq; struct timecounter *tc = arg1; freq = tc->tc_frequency; return (sysctl_handle_64(oidp, &freq, 0, req)); } /* * Return the difference between the timehands' counter value now and what * was when we copied it to the timehands' offset_count. */ static __inline u_int tc_delta(struct timehands *th) { struct timecounter *tc; tc = th->th_counter; return ((tc->tc_get_timecount(tc) - th->th_offset_count) & tc->tc_counter_mask); } /* * Functions for reading the time. We have to loop until we are sure that * the timehands that we operated on was not updated under our feet. See * the comment in for a description of these 12 functions. */ #ifdef FFCLOCK void fbclock_binuptime(struct bintime *bt) { struct timehands *th; unsigned int gen; do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); *bt = th->th_offset; bintime_addx(bt, th->th_scale * tc_delta(th)); atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); } void fbclock_nanouptime(struct timespec *tsp) { struct bintime bt; fbclock_binuptime(&bt); bintime2timespec(&bt, tsp); } void fbclock_microuptime(struct timeval *tvp) { struct bintime bt; fbclock_binuptime(&bt); bintime2timeval(&bt, tvp); } void fbclock_bintime(struct bintime *bt) { struct timehands *th; unsigned int gen; do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); *bt = th->th_bintime; bintime_addx(bt, th->th_scale * tc_delta(th)); atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); } void fbclock_nanotime(struct timespec *tsp) { struct bintime bt; fbclock_bintime(&bt); bintime2timespec(&bt, tsp); } void fbclock_microtime(struct timeval *tvp) { struct bintime bt; fbclock_bintime(&bt); bintime2timeval(&bt, tvp); } void fbclock_getbinuptime(struct bintime *bt) { struct timehands *th; unsigned int gen; do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); *bt = th->th_offset; atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); } void fbclock_getnanouptime(struct timespec *tsp) { struct timehands *th; unsigned int gen; do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); bintime2timespec(&th->th_offset, tsp); atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); } void fbclock_getmicrouptime(struct timeval *tvp) { struct timehands *th; unsigned int gen; do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); bintime2timeval(&th->th_offset, tvp); atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); } void fbclock_getbintime(struct bintime *bt) { struct timehands *th; unsigned int gen; do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); *bt = th->th_bintime; atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); } void fbclock_getnanotime(struct timespec *tsp) { struct timehands *th; unsigned int gen; do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); *tsp = th->th_nanotime; atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); } void fbclock_getmicrotime(struct timeval *tvp) { struct timehands *th; unsigned int gen; do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); *tvp = th->th_microtime; atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); } #else /* !FFCLOCK */ void binuptime(struct bintime *bt) { struct timehands *th; u_int gen; do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); *bt = th->th_offset; bintime_addx(bt, th->th_scale * tc_delta(th)); atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); } void nanouptime(struct timespec *tsp) { struct bintime bt; binuptime(&bt); bintime2timespec(&bt, tsp); } void microuptime(struct timeval *tvp) { struct bintime bt; binuptime(&bt); bintime2timeval(&bt, tvp); } void bintime(struct bintime *bt) { struct timehands *th; u_int gen; do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); *bt = th->th_bintime; bintime_addx(bt, th->th_scale * tc_delta(th)); atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); } void nanotime(struct timespec *tsp) { struct bintime bt; bintime(&bt); bintime2timespec(&bt, tsp); } void microtime(struct timeval *tvp) { struct bintime bt; bintime(&bt); bintime2timeval(&bt, tvp); } void getbinuptime(struct bintime *bt) { struct timehands *th; u_int gen; do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); *bt = th->th_offset; atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); } void getnanouptime(struct timespec *tsp) { struct timehands *th; u_int gen; do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); bintime2timespec(&th->th_offset, tsp); atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); } void getmicrouptime(struct timeval *tvp) { struct timehands *th; u_int gen; do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); bintime2timeval(&th->th_offset, tvp); atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); } void getbintime(struct bintime *bt) { struct timehands *th; u_int gen; do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); *bt = th->th_bintime; atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); } void getnanotime(struct timespec *tsp) { struct timehands *th; u_int gen; do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); *tsp = th->th_nanotime; atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); } void getmicrotime(struct timeval *tvp) { struct timehands *th; u_int gen; do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); *tvp = th->th_microtime; atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); } #endif /* FFCLOCK */ void getboottime(struct timeval *boottime) { struct bintime boottimebin; getboottimebin(&boottimebin); bintime2timeval(&boottimebin, boottime); } void getboottimebin(struct bintime *boottimebin) { struct timehands *th; u_int gen; do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); *boottimebin = th->th_boottime; atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); } #ifdef FFCLOCK /* * Support for feed-forward synchronization algorithms. This is heavily inspired * by the timehands mechanism but kept independent from it. *_windup() functions * have some connection to avoid accessing the timecounter hardware more than * necessary. */ /* Feed-forward clock estimates kept updated by the synchronization daemon. */ struct ffclock_estimate ffclock_estimate; struct bintime ffclock_boottime; /* Feed-forward boot time estimate. */ uint32_t ffclock_status; /* Feed-forward clock status. */ int8_t ffclock_updated; /* New estimates are available. */ struct mtx ffclock_mtx; /* Mutex on ffclock_estimate. */ struct fftimehands { struct ffclock_estimate cest; struct bintime tick_time; struct bintime tick_time_lerp; ffcounter tick_ffcount; uint64_t period_lerp; volatile uint8_t gen; struct fftimehands *next; }; #define NUM_ELEMENTS(x) (sizeof(x) / sizeof(*x)) static struct fftimehands ffth[10]; static struct fftimehands *volatile fftimehands = ffth; static void ffclock_init(void) { struct fftimehands *cur; struct fftimehands *last; memset(ffth, 0, sizeof(ffth)); last = ffth + NUM_ELEMENTS(ffth) - 1; for (cur = ffth; cur < last; cur++) cur->next = cur + 1; last->next = ffth; ffclock_updated = 0; ffclock_status = FFCLOCK_STA_UNSYNC; mtx_init(&ffclock_mtx, "ffclock lock", NULL, MTX_DEF); } /* * Reset the feed-forward clock estimates. Called from inittodr() to get things * kick started and uses the timecounter nominal frequency as a first period * estimate. Note: this function may be called several time just after boot. * Note: this is the only function that sets the value of boot time for the * monotonic (i.e. uptime) version of the feed-forward clock. */ void ffclock_reset_clock(struct timespec *ts) { struct timecounter *tc; struct ffclock_estimate cest; tc = timehands->th_counter; memset(&cest, 0, sizeof(struct ffclock_estimate)); timespec2bintime(ts, &ffclock_boottime); timespec2bintime(ts, &(cest.update_time)); ffclock_read_counter(&cest.update_ffcount); cest.leapsec_next = 0; cest.period = ((1ULL << 63) / tc->tc_frequency) << 1; cest.errb_abs = 0; cest.errb_rate = 0; cest.status = FFCLOCK_STA_UNSYNC; cest.leapsec_total = 0; cest.leapsec = 0; mtx_lock(&ffclock_mtx); bcopy(&cest, &ffclock_estimate, sizeof(struct ffclock_estimate)); ffclock_updated = INT8_MAX; mtx_unlock(&ffclock_mtx); printf("ffclock reset: %s (%llu Hz), time = %ld.%09lu\n", tc->tc_name, (unsigned long long)tc->tc_frequency, (long)ts->tv_sec, (unsigned long)ts->tv_nsec); } /* * Sub-routine to convert a time interval measured in RAW counter units to time * in seconds stored in bintime format. * NOTE: bintime_mul requires u_int, but the value of the ffcounter may be * larger than the max value of u_int (on 32 bit architecture). Loop to consume * extra cycles. */ static void ffclock_convert_delta(ffcounter ffdelta, uint64_t period, struct bintime *bt) { struct bintime bt2; ffcounter delta, delta_max; delta_max = (1ULL << (8 * sizeof(unsigned int))) - 1; bintime_clear(bt); do { if (ffdelta > delta_max) delta = delta_max; else delta = ffdelta; bt2.sec = 0; bt2.frac = period; bintime_mul(&bt2, (unsigned int)delta); bintime_add(bt, &bt2); ffdelta -= delta; } while (ffdelta > 0); } /* * Update the fftimehands. * Push the tick ffcount and time(s) forward based on current clock estimate. * The conversion from ffcounter to bintime relies on the difference clock * principle, whose accuracy relies on computing small time intervals. If a new * clock estimate has been passed by the synchronisation daemon, make it * current, and compute the linear interpolation for monotonic time if needed. */ static void ffclock_windup(unsigned int delta) { struct ffclock_estimate *cest; struct fftimehands *ffth; struct bintime bt, gap_lerp; ffcounter ffdelta; uint64_t frac; unsigned int polling; uint8_t forward_jump, ogen; /* * Pick the next timehand, copy current ffclock estimates and move tick * times and counter forward. */ forward_jump = 0; ffth = fftimehands->next; ogen = ffth->gen; ffth->gen = 0; cest = &ffth->cest; bcopy(&fftimehands->cest, cest, sizeof(struct ffclock_estimate)); ffdelta = (ffcounter)delta; ffth->period_lerp = fftimehands->period_lerp; ffth->tick_time = fftimehands->tick_time; ffclock_convert_delta(ffdelta, cest->period, &bt); bintime_add(&ffth->tick_time, &bt); ffth->tick_time_lerp = fftimehands->tick_time_lerp; ffclock_convert_delta(ffdelta, ffth->period_lerp, &bt); bintime_add(&ffth->tick_time_lerp, &bt); ffth->tick_ffcount = fftimehands->tick_ffcount + ffdelta; /* * Assess the status of the clock, if the last update is too old, it is * likely the synchronisation daemon is dead and the clock is free * running. */ if (ffclock_updated == 0) { ffdelta = ffth->tick_ffcount - cest->update_ffcount; ffclock_convert_delta(ffdelta, cest->period, &bt); if (bt.sec > 2 * FFCLOCK_SKM_SCALE) ffclock_status |= FFCLOCK_STA_UNSYNC; } /* * If available, grab updated clock estimates and make them current. * Recompute time at this tick using the updated estimates. The clock * estimates passed the feed-forward synchronisation daemon may result * in time conversion that is not monotonically increasing (just after * the update). time_lerp is a particular linear interpolation over the * synchronisation algo polling period that ensures monotonicity for the * clock ids requesting it. */ if (ffclock_updated > 0) { bcopy(&ffclock_estimate, cest, sizeof(struct ffclock_estimate)); ffdelta = ffth->tick_ffcount - cest->update_ffcount; ffth->tick_time = cest->update_time; ffclock_convert_delta(ffdelta, cest->period, &bt); bintime_add(&ffth->tick_time, &bt); /* ffclock_reset sets ffclock_updated to INT8_MAX */ if (ffclock_updated == INT8_MAX) ffth->tick_time_lerp = ffth->tick_time; if (bintime_cmp(&ffth->tick_time, &ffth->tick_time_lerp, >)) forward_jump = 1; else forward_jump = 0; bintime_clear(&gap_lerp); if (forward_jump) { gap_lerp = ffth->tick_time; bintime_sub(&gap_lerp, &ffth->tick_time_lerp); } else { gap_lerp = ffth->tick_time_lerp; bintime_sub(&gap_lerp, &ffth->tick_time); } /* * The reset from the RTC clock may be far from accurate, and * reducing the gap between real time and interpolated time * could take a very long time if the interpolated clock insists * on strict monotonicity. The clock is reset under very strict * conditions (kernel time is known to be wrong and * synchronization daemon has been restarted recently. * ffclock_boottime absorbs the jump to ensure boot time is * correct and uptime functions stay consistent. */ if (((ffclock_status & FFCLOCK_STA_UNSYNC) == FFCLOCK_STA_UNSYNC) && ((cest->status & FFCLOCK_STA_UNSYNC) == 0) && ((cest->status & FFCLOCK_STA_WARMUP) == FFCLOCK_STA_WARMUP)) { if (forward_jump) bintime_add(&ffclock_boottime, &gap_lerp); else bintime_sub(&ffclock_boottime, &gap_lerp); ffth->tick_time_lerp = ffth->tick_time; bintime_clear(&gap_lerp); } ffclock_status = cest->status; ffth->period_lerp = cest->period; /* * Compute corrected period used for the linear interpolation of * time. The rate of linear interpolation is capped to 5000PPM * (5ms/s). */ if (bintime_isset(&gap_lerp)) { ffdelta = cest->update_ffcount; ffdelta -= fftimehands->cest.update_ffcount; ffclock_convert_delta(ffdelta, cest->period, &bt); polling = bt.sec; bt.sec = 0; bt.frac = 5000000 * (uint64_t)18446744073LL; bintime_mul(&bt, polling); if (bintime_cmp(&gap_lerp, &bt, >)) gap_lerp = bt; /* Approximate 1 sec by 1-(1/2^64) to ease arithmetic */ frac = 0; if (gap_lerp.sec > 0) { frac -= 1; frac /= ffdelta / gap_lerp.sec; } frac += gap_lerp.frac / ffdelta; if (forward_jump) ffth->period_lerp += frac; else ffth->period_lerp -= frac; } ffclock_updated = 0; } if (++ogen == 0) ogen = 1; ffth->gen = ogen; fftimehands = ffth; } /* * Adjust the fftimehands when the timecounter is changed. Stating the obvious, * the old and new hardware counter cannot be read simultaneously. tc_windup() * does read the two counters 'back to back', but a few cycles are effectively * lost, and not accumulated in tick_ffcount. This is a fairly radical * operation for a feed-forward synchronization daemon, and it is its job to not * pushing irrelevant data to the kernel. Because there is no locking here, * simply force to ignore pending or next update to give daemon a chance to * realize the counter has changed. */ static void ffclock_change_tc(struct timehands *th) { struct fftimehands *ffth; struct ffclock_estimate *cest; struct timecounter *tc; uint8_t ogen; tc = th->th_counter; ffth = fftimehands->next; ogen = ffth->gen; ffth->gen = 0; cest = &ffth->cest; bcopy(&(fftimehands->cest), cest, sizeof(struct ffclock_estimate)); cest->period = ((1ULL << 63) / tc->tc_frequency ) << 1; cest->errb_abs = 0; cest->errb_rate = 0; cest->status |= FFCLOCK_STA_UNSYNC; ffth->tick_ffcount = fftimehands->tick_ffcount; ffth->tick_time_lerp = fftimehands->tick_time_lerp; ffth->tick_time = fftimehands->tick_time; ffth->period_lerp = cest->period; /* Do not lock but ignore next update from synchronization daemon. */ ffclock_updated--; if (++ogen == 0) ogen = 1; ffth->gen = ogen; fftimehands = ffth; } /* * Retrieve feed-forward counter and time of last kernel tick. */ void ffclock_last_tick(ffcounter *ffcount, struct bintime *bt, uint32_t flags) { struct fftimehands *ffth; uint8_t gen; /* * No locking but check generation has not changed. Also need to make * sure ffdelta is positive, i.e. ffcount > tick_ffcount. */ do { ffth = fftimehands; gen = ffth->gen; if ((flags & FFCLOCK_LERP) == FFCLOCK_LERP) *bt = ffth->tick_time_lerp; else *bt = ffth->tick_time; *ffcount = ffth->tick_ffcount; } while (gen == 0 || gen != ffth->gen); } /* * Absolute clock conversion. Low level function to convert ffcounter to * bintime. The ffcounter is converted using the current ffclock period estimate * or the "interpolated period" to ensure monotonicity. * NOTE: this conversion may have been deferred, and the clock updated since the * hardware counter has been read. */ void ffclock_convert_abs(ffcounter ffcount, struct bintime *bt, uint32_t flags) { struct fftimehands *ffth; struct bintime bt2; ffcounter ffdelta; uint8_t gen; /* * No locking but check generation has not changed. Also need to make * sure ffdelta is positive, i.e. ffcount > tick_ffcount. */ do { ffth = fftimehands; gen = ffth->gen; if (ffcount > ffth->tick_ffcount) ffdelta = ffcount - ffth->tick_ffcount; else ffdelta = ffth->tick_ffcount - ffcount; if ((flags & FFCLOCK_LERP) == FFCLOCK_LERP) { *bt = ffth->tick_time_lerp; ffclock_convert_delta(ffdelta, ffth->period_lerp, &bt2); } else { *bt = ffth->tick_time; ffclock_convert_delta(ffdelta, ffth->cest.period, &bt2); } if (ffcount > ffth->tick_ffcount) bintime_add(bt, &bt2); else bintime_sub(bt, &bt2); } while (gen == 0 || gen != ffth->gen); } /* * Difference clock conversion. * Low level function to Convert a time interval measured in RAW counter units * into bintime. The difference clock allows measuring small intervals much more * reliably than the absolute clock. */ void ffclock_convert_diff(ffcounter ffdelta, struct bintime *bt) { struct fftimehands *ffth; uint8_t gen; /* No locking but check generation has not changed. */ do { ffth = fftimehands; gen = ffth->gen; ffclock_convert_delta(ffdelta, ffth->cest.period, bt); } while (gen == 0 || gen != ffth->gen); } /* * Access to current ffcounter value. */ void ffclock_read_counter(ffcounter *ffcount) { struct timehands *th; struct fftimehands *ffth; unsigned int gen, delta; /* * ffclock_windup() called from tc_windup(), safe to rely on * th->th_generation only, for correct delta and ffcounter. */ do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); ffth = fftimehands; delta = tc_delta(th); *ffcount = ffth->tick_ffcount; atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); *ffcount += delta; } void binuptime(struct bintime *bt) { binuptime_fromclock(bt, sysclock_active); } void nanouptime(struct timespec *tsp) { nanouptime_fromclock(tsp, sysclock_active); } void microuptime(struct timeval *tvp) { microuptime_fromclock(tvp, sysclock_active); } void bintime(struct bintime *bt) { bintime_fromclock(bt, sysclock_active); } void nanotime(struct timespec *tsp) { nanotime_fromclock(tsp, sysclock_active); } void microtime(struct timeval *tvp) { microtime_fromclock(tvp, sysclock_active); } void getbinuptime(struct bintime *bt) { getbinuptime_fromclock(bt, sysclock_active); } void getnanouptime(struct timespec *tsp) { getnanouptime_fromclock(tsp, sysclock_active); } void getmicrouptime(struct timeval *tvp) { getmicrouptime_fromclock(tvp, sysclock_active); } void getbintime(struct bintime *bt) { getbintime_fromclock(bt, sysclock_active); } void getnanotime(struct timespec *tsp) { getnanotime_fromclock(tsp, sysclock_active); } void getmicrotime(struct timeval *tvp) { getmicrouptime_fromclock(tvp, sysclock_active); } #endif /* FFCLOCK */ /* * This is a clone of getnanotime and used for walltimestamps. * The dtrace_ prefix prevents fbt from creating probes for * it so walltimestamp can be safely used in all fbt probes. */ void dtrace_getnanotime(struct timespec *tsp) { struct timehands *th; u_int gen; do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); *tsp = th->th_nanotime; atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); } /* * System clock currently providing time to the system. Modifiable via sysctl * when the FFCLOCK option is defined. */ int sysclock_active = SYSCLOCK_FBCK; /* Internal NTP status and error estimates. */ extern int time_status; extern long time_esterror; /* * Take a snapshot of sysclock data which can be used to compare system clocks * and generate timestamps after the fact. */ void sysclock_getsnapshot(struct sysclock_snap *clock_snap, int fast) { struct fbclock_info *fbi; struct timehands *th; struct bintime bt; unsigned int delta, gen; #ifdef FFCLOCK ffcounter ffcount; struct fftimehands *ffth; struct ffclock_info *ffi; struct ffclock_estimate cest; ffi = &clock_snap->ff_info; #endif fbi = &clock_snap->fb_info; delta = 0; do { th = timehands; gen = atomic_load_acq_int(&th->th_generation); fbi->th_scale = th->th_scale; fbi->tick_time = th->th_offset; #ifdef FFCLOCK ffth = fftimehands; ffi->tick_time = ffth->tick_time_lerp; ffi->tick_time_lerp = ffth->tick_time_lerp; ffi->period = ffth->cest.period; ffi->period_lerp = ffth->period_lerp; clock_snap->ffcount = ffth->tick_ffcount; cest = ffth->cest; #endif if (!fast) delta = tc_delta(th); atomic_thread_fence_acq(); } while (gen == 0 || gen != th->th_generation); clock_snap->delta = delta; clock_snap->sysclock_active = sysclock_active; /* Record feedback clock status and error. */ clock_snap->fb_info.status = time_status; /* XXX: Very crude estimate of feedback clock error. */ bt.sec = time_esterror / 1000000; bt.frac = ((time_esterror - bt.sec) * 1000000) * (uint64_t)18446744073709ULL; clock_snap->fb_info.error = bt; #ifdef FFCLOCK if (!fast) clock_snap->ffcount += delta; /* Record feed-forward clock leap second adjustment. */ ffi->leapsec_adjustment = cest.leapsec_total; if (clock_snap->ffcount > cest.leapsec_next) ffi->leapsec_adjustment -= cest.leapsec; /* Record feed-forward clock status and error. */ clock_snap->ff_info.status = cest.status; ffcount = clock_snap->ffcount - cest.update_ffcount; ffclock_convert_delta(ffcount, cest.period, &bt); /* 18446744073709 = int(2^64/1e12), err_bound_rate in [ps/s]. */ bintime_mul(&bt, cest.errb_rate * (uint64_t)18446744073709ULL); /* 18446744073 = int(2^64 / 1e9), since err_abs in [ns]. */ bintime_addx(&bt, cest.errb_abs * (uint64_t)18446744073ULL); clock_snap->ff_info.error = bt; #endif } /* * Convert a sysclock snapshot into a struct bintime based on the specified * clock source and flags. */ int sysclock_snap2bintime(struct sysclock_snap *cs, struct bintime *bt, int whichclock, uint32_t flags) { struct bintime boottimebin; #ifdef FFCLOCK struct bintime bt2; uint64_t period; #endif switch (whichclock) { case SYSCLOCK_FBCK: *bt = cs->fb_info.tick_time; /* If snapshot was created with !fast, delta will be >0. */ if (cs->delta > 0) bintime_addx(bt, cs->fb_info.th_scale * cs->delta); if ((flags & FBCLOCK_UPTIME) == 0) { getboottimebin(&boottimebin); bintime_add(bt, &boottimebin); } break; #ifdef FFCLOCK case SYSCLOCK_FFWD: if (flags & FFCLOCK_LERP) { *bt = cs->ff_info.tick_time_lerp; period = cs->ff_info.period_lerp; } else { *bt = cs->ff_info.tick_time; period = cs->ff_info.period; } /* If snapshot was created with !fast, delta will be >0. */ if (cs->delta > 0) { ffclock_convert_delta(cs->delta, period, &bt2); bintime_add(bt, &bt2); } /* Leap second adjustment. */ if (flags & FFCLOCK_LEAPSEC) bt->sec -= cs->ff_info.leapsec_adjustment; /* Boot time adjustment, for uptime/monotonic clocks. */ if (flags & FFCLOCK_UPTIME) bintime_sub(bt, &ffclock_boottime); break; #endif default: return (EINVAL); break; } return (0); } /* * Initialize a new timecounter and possibly use it. */ void tc_init(struct timecounter *tc) { u_int u; struct sysctl_oid *tc_root; u = tc->tc_frequency / tc->tc_counter_mask; /* XXX: We need some margin here, 10% is a guess */ u *= 11; u /= 10; if (u > hz && tc->tc_quality >= 0) { tc->tc_quality = -2000; if (bootverbose) { printf("Timecounter \"%s\" frequency %ju Hz", tc->tc_name, (uintmax_t)tc->tc_frequency); printf(" -- Insufficient hz, needs at least %u\n", u); } } else if (tc->tc_quality >= 0 || bootverbose) { printf("Timecounter \"%s\" frequency %ju Hz quality %d\n", tc->tc_name, (uintmax_t)tc->tc_frequency, tc->tc_quality); } tc->tc_next = timecounters; timecounters = tc; /* * Set up sysctl tree for this counter. */ tc_root = SYSCTL_ADD_NODE_WITH_LABEL(NULL, SYSCTL_STATIC_CHILDREN(_kern_timecounter_tc), OID_AUTO, tc->tc_name, CTLFLAG_RW, 0, "timecounter description", "timecounter"); SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO, "mask", CTLFLAG_RD, &(tc->tc_counter_mask), 0, "mask for implemented bits"); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO, "counter", CTLTYPE_UINT | CTLFLAG_RD, tc, sizeof(*tc), sysctl_kern_timecounter_get, "IU", "current timecounter value"); SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO, "frequency", CTLTYPE_U64 | CTLFLAG_RD, tc, sizeof(*tc), sysctl_kern_timecounter_freq, "QU", "timecounter frequency"); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO, "quality", CTLFLAG_RD, &(tc->tc_quality), 0, "goodness of time counter"); /* * Do not automatically switch if the current tc was specifically * chosen. Never automatically use a timecounter with negative quality. * Even though we run on the dummy counter, switching here may be * worse since this timecounter may not be monotonic. */ if (tc_chosen) return; if (tc->tc_quality < 0) return; if (tc->tc_quality < timecounter->tc_quality) return; if (tc->tc_quality == timecounter->tc_quality && tc->tc_frequency < timecounter->tc_frequency) return; (void)tc->tc_get_timecount(tc); (void)tc->tc_get_timecount(tc); timecounter = tc; } /* Report the frequency of the current timecounter. */ uint64_t tc_getfrequency(void) { return (timehands->th_counter->tc_frequency); } static bool sleeping_on_old_rtc(struct thread *td) { /* * td_rtcgen is modified by curthread when it is running, * and by other threads in this function. By finding the thread * on a sleepqueue and holding the lock on the sleepqueue * chain, we guarantee that the thread is not running and that * modifying td_rtcgen is safe. Setting td_rtcgen to zero informs * the thread that it was woken due to a real-time clock adjustment. * (The declaration of td_rtcgen refers to this comment.) */ if (td->td_rtcgen != 0 && td->td_rtcgen != rtc_generation) { td->td_rtcgen = 0; return (true); } return (false); } static struct mtx tc_setclock_mtx; MTX_SYSINIT(tc_setclock_init, &tc_setclock_mtx, "tcsetc", MTX_SPIN); /* * Step our concept of UTC. This is done by modifying our estimate of * when we booted. */ void tc_setclock(struct timespec *ts) { struct timespec tbef, taft; struct bintime bt, bt2; timespec2bintime(ts, &bt); nanotime(&tbef); mtx_lock_spin(&tc_setclock_mtx); cpu_tick_calibrate(1); binuptime(&bt2); bintime_sub(&bt, &bt2); /* XXX fiddle all the little crinkly bits around the fiords... */ tc_windup(&bt); mtx_unlock_spin(&tc_setclock_mtx); /* Avoid rtc_generation == 0, since td_rtcgen == 0 is special. */ atomic_add_rel_int(&rtc_generation, 2); sleepq_chains_remove_matching(sleeping_on_old_rtc); if (timestepwarnings) { nanotime(&taft); log(LOG_INFO, "Time stepped from %jd.%09ld to %jd.%09ld (%jd.%09ld)\n", (intmax_t)tbef.tv_sec, tbef.tv_nsec, (intmax_t)taft.tv_sec, taft.tv_nsec, (intmax_t)ts->tv_sec, ts->tv_nsec); } } /* * Initialize the next struct timehands in the ring and make * it the active timehands. Along the way we might switch to a different * timecounter and/or do seconds processing in NTP. Slightly magic. */ static void tc_windup(struct bintime *new_boottimebin) { struct bintime bt; struct timehands *th, *tho; uint64_t scale; u_int delta, ncount, ogen; int i; time_t t; /* * Make the next timehands a copy of the current one, but do * not overwrite the generation or next pointer. While we * update the contents, the generation must be zero. We need * to ensure that the zero generation is visible before the * data updates become visible, which requires release fence. * For similar reasons, re-reading of the generation after the * data is read should use acquire fence. */ tho = timehands; th = tho->th_next; ogen = th->th_generation; th->th_generation = 0; atomic_thread_fence_rel(); memcpy(th, tho, offsetof(struct timehands, th_generation)); if (new_boottimebin != NULL) th->th_boottime = *new_boottimebin; /* * Capture a timecounter delta on the current timecounter and if * changing timecounters, a counter value from the new timecounter. * Update the offset fields accordingly. */ delta = tc_delta(th); if (th->th_counter != timecounter) ncount = timecounter->tc_get_timecount(timecounter); else ncount = 0; #ifdef FFCLOCK ffclock_windup(delta); #endif th->th_offset_count += delta; th->th_offset_count &= th->th_counter->tc_counter_mask; while (delta > th->th_counter->tc_frequency) { /* Eat complete unadjusted seconds. */ delta -= th->th_counter->tc_frequency; th->th_offset.sec++; } if ((delta > th->th_counter->tc_frequency / 2) && (th->th_scale * delta < ((uint64_t)1 << 63))) { /* The product th_scale * delta just barely overflows. */ th->th_offset.sec++; } bintime_addx(&th->th_offset, th->th_scale * delta); /* * Hardware latching timecounters may not generate interrupts on * PPS events, so instead we poll them. There is a finite risk that * the hardware might capture a count which is later than the one we * got above, and therefore possibly in the next NTP second which might * have a different rate than the current NTP second. It doesn't * matter in practice. */ if (tho->th_counter->tc_poll_pps) tho->th_counter->tc_poll_pps(tho->th_counter); /* * Deal with NTP second processing. The for loop normally * iterates at most once, but in extreme situations it might * keep NTP sane if timeouts are not run for several seconds. * At boot, the time step can be large when the TOD hardware * has been read, so on really large steps, we call * ntp_update_second only twice. We need to call it twice in * case we missed a leap second. */ bt = th->th_offset; bintime_add(&bt, &th->th_boottime); i = bt.sec - tho->th_microtime.tv_sec; if (i > LARGE_STEP) i = 2; for (; i > 0; i--) { t = bt.sec; ntp_update_second(&th->th_adjustment, &bt.sec); if (bt.sec != t) th->th_boottime.sec += bt.sec - t; } /* Update the UTC timestamps used by the get*() functions. */ th->th_bintime = bt; bintime2timeval(&bt, &th->th_microtime); bintime2timespec(&bt, &th->th_nanotime); /* Now is a good time to change timecounters. */ if (th->th_counter != timecounter) { #ifndef __arm__ if ((timecounter->tc_flags & TC_FLAGS_C2STOP) != 0) cpu_disable_c2_sleep++; if ((th->th_counter->tc_flags & TC_FLAGS_C2STOP) != 0) cpu_disable_c2_sleep--; #endif th->th_counter = timecounter; th->th_offset_count = ncount; tc_min_ticktock_freq = max(1, timecounter->tc_frequency / (((uint64_t)timecounter->tc_counter_mask + 1) / 3)); #ifdef FFCLOCK ffclock_change_tc(th); #endif } /*- * Recalculate the scaling factor. We want the number of 1/2^64 * fractions of a second per period of the hardware counter, taking * into account the th_adjustment factor which the NTP PLL/adjtime(2) * processing provides us with. * * The th_adjustment is nanoseconds per second with 32 bit binary * fraction and we want 64 bit binary fraction of second: * * x = a * 2^32 / 10^9 = a * 4.294967296 * * The range of th_adjustment is +/- 5000PPM so inside a 64bit int * we can only multiply by about 850 without overflowing, that * leaves no suitably precise fractions for multiply before divide. * * Divide before multiply with a fraction of 2199/512 results in a * systematic undercompensation of 10PPM of th_adjustment. On a * 5000PPM adjustment this is a 0.05PPM error. This is acceptable. * * We happily sacrifice the lowest of the 64 bits of our result * to the goddess of code clarity. * */ scale = (uint64_t)1 << 63; scale += (th->th_adjustment / 1024) * 2199; scale /= th->th_counter->tc_frequency; th->th_scale = scale * 2; /* * Now that the struct timehands is again consistent, set the new * generation number, making sure to not make it zero. */ if (++ogen == 0) ogen = 1; atomic_store_rel_int(&th->th_generation, ogen); /* Go live with the new struct timehands. */ #ifdef FFCLOCK switch (sysclock_active) { case SYSCLOCK_FBCK: #endif time_second = th->th_microtime.tv_sec; time_uptime = th->th_offset.sec; #ifdef FFCLOCK break; case SYSCLOCK_FFWD: time_second = fftimehands->tick_time_lerp.sec; time_uptime = fftimehands->tick_time_lerp.sec - ffclock_boottime.sec; break; } #endif timehands = th; timekeep_push_vdso(); } /* Report or change the active timecounter hardware. */ static int sysctl_kern_timecounter_hardware(SYSCTL_HANDLER_ARGS) { char newname[32]; struct timecounter *newtc, *tc; int error; tc = timecounter; strlcpy(newname, tc->tc_name, sizeof(newname)); error = sysctl_handle_string(oidp, &newname[0], sizeof(newname), req); if (error != 0 || req->newptr == NULL) return (error); /* Record that the tc in use now was specifically chosen. */ tc_chosen = 1; if (strcmp(newname, tc->tc_name) == 0) return (0); for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) { if (strcmp(newname, newtc->tc_name) != 0) continue; /* Warm up new timecounter. */ (void)newtc->tc_get_timecount(newtc); (void)newtc->tc_get_timecount(newtc); timecounter = newtc; /* * The vdso timehands update is deferred until the next * 'tc_windup()'. * * This is prudent given that 'timekeep_push_vdso()' does not * use any locking and that it can be called in hard interrupt * context via 'tc_windup()'. */ return (0); } return (EINVAL); } SYSCTL_PROC(_kern_timecounter, OID_AUTO, hardware, CTLTYPE_STRING | CTLFLAG_RW, 0, 0, sysctl_kern_timecounter_hardware, "A", "Timecounter hardware selected"); /* Report the available timecounter hardware. */ static int sysctl_kern_timecounter_choice(SYSCTL_HANDLER_ARGS) { struct sbuf sb; struct timecounter *tc; int error; sbuf_new_for_sysctl(&sb, NULL, 0, req); for (tc = timecounters; tc != NULL; tc = tc->tc_next) { if (tc != timecounters) sbuf_putc(&sb, ' '); sbuf_printf(&sb, "%s(%d)", tc->tc_name, tc->tc_quality); } error = sbuf_finish(&sb); sbuf_delete(&sb); return (error); } SYSCTL_PROC(_kern_timecounter, OID_AUTO, choice, CTLTYPE_STRING | CTLFLAG_RD, 0, 0, sysctl_kern_timecounter_choice, "A", "Timecounter hardware detected"); /* * RFC 2783 PPS-API implementation. */ /* * Return true if the driver is aware of the abi version extensions in the * pps_state structure, and it supports at least the given abi version number. */ static inline int abi_aware(struct pps_state *pps, int vers) { return ((pps->kcmode & KCMODE_ABIFLAG) && pps->driver_abi >= vers); } static int pps_fetch(struct pps_fetch_args *fapi, struct pps_state *pps) { int err, timo; pps_seq_t aseq, cseq; struct timeval tv; if (fapi->tsformat && fapi->tsformat != PPS_TSFMT_TSPEC) return (EINVAL); /* * If no timeout is requested, immediately return whatever values were * most recently captured. If timeout seconds is -1, that's a request * to block without a timeout. WITNESS won't let us sleep forever * without a lock (we really don't need a lock), so just repeatedly * sleep a long time. */ if (fapi->timeout.tv_sec || fapi->timeout.tv_nsec) { if (fapi->timeout.tv_sec == -1) timo = 0x7fffffff; else { tv.tv_sec = fapi->timeout.tv_sec; tv.tv_usec = fapi->timeout.tv_nsec / 1000; timo = tvtohz(&tv); } aseq = atomic_load_int(&pps->ppsinfo.assert_sequence); cseq = atomic_load_int(&pps->ppsinfo.clear_sequence); while (aseq == atomic_load_int(&pps->ppsinfo.assert_sequence) && cseq == atomic_load_int(&pps->ppsinfo.clear_sequence)) { if (abi_aware(pps, 1) && pps->driver_mtx != NULL) { if (pps->flags & PPSFLAG_MTX_SPIN) { err = msleep_spin(pps, pps->driver_mtx, "ppsfch", timo); } else { err = msleep(pps, pps->driver_mtx, PCATCH, "ppsfch", timo); } } else { err = tsleep(pps, PCATCH, "ppsfch", timo); } if (err == EWOULDBLOCK) { if (fapi->timeout.tv_sec == -1) { continue; } else { return (ETIMEDOUT); } } else if (err != 0) { return (err); } } } pps->ppsinfo.current_mode = pps->ppsparam.mode; fapi->pps_info_buf = pps->ppsinfo; return (0); } int pps_ioctl(u_long cmd, caddr_t data, struct pps_state *pps) { pps_params_t *app; struct pps_fetch_args *fapi; #ifdef FFCLOCK struct pps_fetch_ffc_args *fapi_ffc; #endif #ifdef PPS_SYNC struct pps_kcbind_args *kapi; #endif KASSERT(pps != NULL, ("NULL pps pointer in pps_ioctl")); switch (cmd) { case PPS_IOC_CREATE: return (0); case PPS_IOC_DESTROY: return (0); case PPS_IOC_SETPARAMS: app = (pps_params_t *)data; if (app->mode & ~pps->ppscap) return (EINVAL); #ifdef FFCLOCK /* Ensure only a single clock is selected for ffc timestamp. */ if ((app->mode & PPS_TSCLK_MASK) == PPS_TSCLK_MASK) return (EINVAL); #endif pps->ppsparam = *app; return (0); case PPS_IOC_GETPARAMS: app = (pps_params_t *)data; *app = pps->ppsparam; app->api_version = PPS_API_VERS_1; return (0); case PPS_IOC_GETCAP: *(int*)data = pps->ppscap; return (0); case PPS_IOC_FETCH: fapi = (struct pps_fetch_args *)data; return (pps_fetch(fapi, pps)); #ifdef FFCLOCK case PPS_IOC_FETCH_FFCOUNTER: fapi_ffc = (struct pps_fetch_ffc_args *)data; if (fapi_ffc->tsformat && fapi_ffc->tsformat != PPS_TSFMT_TSPEC) return (EINVAL); if (fapi_ffc->timeout.tv_sec || fapi_ffc->timeout.tv_nsec) return (EOPNOTSUPP); pps->ppsinfo_ffc.current_mode = pps->ppsparam.mode; fapi_ffc->pps_info_buf_ffc = pps->ppsinfo_ffc; /* Overwrite timestamps if feedback clock selected. */ switch (pps->ppsparam.mode & PPS_TSCLK_MASK) { case PPS_TSCLK_FBCK: fapi_ffc->pps_info_buf_ffc.assert_timestamp = pps->ppsinfo.assert_timestamp; fapi_ffc->pps_info_buf_ffc.clear_timestamp = pps->ppsinfo.clear_timestamp; break; case PPS_TSCLK_FFWD: break; default: break; } return (0); #endif /* FFCLOCK */ case PPS_IOC_KCBIND: #ifdef PPS_SYNC kapi = (struct pps_kcbind_args *)data; /* XXX Only root should be able to do this */ if (kapi->tsformat && kapi->tsformat != PPS_TSFMT_TSPEC) return (EINVAL); if (kapi->kernel_consumer != PPS_KC_HARDPPS) return (EINVAL); if (kapi->edge & ~pps->ppscap) return (EINVAL); pps->kcmode = (kapi->edge & KCMODE_EDGEMASK) | (pps->kcmode & KCMODE_ABIFLAG); return (0); #else return (EOPNOTSUPP); #endif default: return (ENOIOCTL); } } void pps_init(struct pps_state *pps) { pps->ppscap |= PPS_TSFMT_TSPEC | PPS_CANWAIT; if (pps->ppscap & PPS_CAPTUREASSERT) pps->ppscap |= PPS_OFFSETASSERT; if (pps->ppscap & PPS_CAPTURECLEAR) pps->ppscap |= PPS_OFFSETCLEAR; #ifdef FFCLOCK pps->ppscap |= PPS_TSCLK_MASK; #endif pps->kcmode &= ~KCMODE_ABIFLAG; } void pps_init_abi(struct pps_state *pps) { pps_init(pps); if (pps->driver_abi > 0) { pps->kcmode |= KCMODE_ABIFLAG; pps->kernel_abi = PPS_ABI_VERSION; } } void pps_capture(struct pps_state *pps) { struct timehands *th; KASSERT(pps != NULL, ("NULL pps pointer in pps_capture")); th = timehands; pps->capgen = atomic_load_acq_int(&th->th_generation); pps->capth = th; #ifdef FFCLOCK pps->capffth = fftimehands; #endif pps->capcount = th->th_counter->tc_get_timecount(th->th_counter); atomic_thread_fence_acq(); if (pps->capgen != th->th_generation) pps->capgen = 0; } void pps_event(struct pps_state *pps, int event) { struct bintime bt; struct timespec ts, *tsp, *osp; u_int tcount, *pcount; int foff; pps_seq_t *pseq; #ifdef FFCLOCK struct timespec *tsp_ffc; pps_seq_t *pseq_ffc; ffcounter *ffcount; #endif #ifdef PPS_SYNC int fhard; #endif KASSERT(pps != NULL, ("NULL pps pointer in pps_event")); /* Nothing to do if not currently set to capture this event type. */ if ((event & pps->ppsparam.mode) == 0) return; /* If the timecounter was wound up underneath us, bail out. */ if (pps->capgen == 0 || pps->capgen != atomic_load_acq_int(&pps->capth->th_generation)) return; /* Things would be easier with arrays. */ if (event == PPS_CAPTUREASSERT) { tsp = &pps->ppsinfo.assert_timestamp; osp = &pps->ppsparam.assert_offset; foff = pps->ppsparam.mode & PPS_OFFSETASSERT; #ifdef PPS_SYNC fhard = pps->kcmode & PPS_CAPTUREASSERT; #endif pcount = &pps->ppscount[0]; pseq = &pps->ppsinfo.assert_sequence; #ifdef FFCLOCK ffcount = &pps->ppsinfo_ffc.assert_ffcount; tsp_ffc = &pps->ppsinfo_ffc.assert_timestamp; pseq_ffc = &pps->ppsinfo_ffc.assert_sequence; #endif } else { tsp = &pps->ppsinfo.clear_timestamp; osp = &pps->ppsparam.clear_offset; foff = pps->ppsparam.mode & PPS_OFFSETCLEAR; #ifdef PPS_SYNC fhard = pps->kcmode & PPS_CAPTURECLEAR; #endif pcount = &pps->ppscount[1]; pseq = &pps->ppsinfo.clear_sequence; #ifdef FFCLOCK ffcount = &pps->ppsinfo_ffc.clear_ffcount; tsp_ffc = &pps->ppsinfo_ffc.clear_timestamp; pseq_ffc = &pps->ppsinfo_ffc.clear_sequence; #endif } /* * If the timecounter changed, we cannot compare the count values, so * we have to drop the rest of the PPS-stuff until the next event. */ if (pps->ppstc != pps->capth->th_counter) { pps->ppstc = pps->capth->th_counter; *pcount = pps->capcount; pps->ppscount[2] = pps->capcount; return; } /* Convert the count to a timespec. */ tcount = pps->capcount - pps->capth->th_offset_count; tcount &= pps->capth->th_counter->tc_counter_mask; bt = pps->capth->th_bintime; bintime_addx(&bt, pps->capth->th_scale * tcount); bintime2timespec(&bt, &ts); /* If the timecounter was wound up underneath us, bail out. */ atomic_thread_fence_acq(); if (pps->capgen != pps->capth->th_generation) return; *pcount = pps->capcount; (*pseq)++; *tsp = ts; if (foff) { - timespecadd(tsp, osp); + timespecadd(tsp, osp, tsp); if (tsp->tv_nsec < 0) { tsp->tv_nsec += 1000000000; tsp->tv_sec -= 1; } } #ifdef FFCLOCK *ffcount = pps->capffth->tick_ffcount + tcount; bt = pps->capffth->tick_time; ffclock_convert_delta(tcount, pps->capffth->cest.period, &bt); bintime_add(&bt, &pps->capffth->tick_time); bintime2timespec(&bt, &ts); (*pseq_ffc)++; *tsp_ffc = ts; #endif #ifdef PPS_SYNC if (fhard) { uint64_t scale; /* * Feed the NTP PLL/FLL. * The FLL wants to know how many (hardware) nanoseconds * elapsed since the previous event. */ tcount = pps->capcount - pps->ppscount[2]; pps->ppscount[2] = pps->capcount; tcount &= pps->capth->th_counter->tc_counter_mask; scale = (uint64_t)1 << 63; scale /= pps->capth->th_counter->tc_frequency; scale *= 2; bt.sec = 0; bt.frac = 0; bintime_addx(&bt, scale * tcount); bintime2timespec(&bt, &ts); hardpps(tsp, ts.tv_nsec + 1000000000 * ts.tv_sec); } #endif /* Wakeup anyone sleeping in pps_fetch(). */ wakeup(pps); } /* * Timecounters need to be updated every so often to prevent the hardware * counter from overflowing. Updating also recalculates the cached values * used by the get*() family of functions, so their precision depends on * the update frequency. */ static int tc_tick; SYSCTL_INT(_kern_timecounter, OID_AUTO, tick, CTLFLAG_RD, &tc_tick, 0, "Approximate number of hardclock ticks in a millisecond"); void tc_ticktock(int cnt) { static int count; if (mtx_trylock_spin(&tc_setclock_mtx)) { count += cnt; if (count >= tc_tick) { count = 0; tc_windup(NULL); } mtx_unlock_spin(&tc_setclock_mtx); } } static void __inline tc_adjprecision(void) { int t; if (tc_timepercentage > 0) { t = (99 + tc_timepercentage) / tc_timepercentage; tc_precexp = fls(t + (t >> 1)) - 1; FREQ2BT(hz / tc_tick, &bt_timethreshold); FREQ2BT(hz, &bt_tickthreshold); bintime_shift(&bt_timethreshold, tc_precexp); bintime_shift(&bt_tickthreshold, tc_precexp); } else { tc_precexp = 31; bt_timethreshold.sec = INT_MAX; bt_timethreshold.frac = ~(uint64_t)0; bt_tickthreshold = bt_timethreshold; } sbt_timethreshold = bttosbt(bt_timethreshold); sbt_tickthreshold = bttosbt(bt_tickthreshold); } static int sysctl_kern_timecounter_adjprecision(SYSCTL_HANDLER_ARGS) { int error, val; val = tc_timepercentage; error = sysctl_handle_int(oidp, &val, 0, req); if (error != 0 || req->newptr == NULL) return (error); tc_timepercentage = val; if (cold) goto done; tc_adjprecision(); done: return (0); } static void inittimecounter(void *dummy) { u_int p; int tick_rate; /* * Set the initial timeout to * max(1, ). * People should probably not use the sysctl to set the timeout * to smaller than its initial value, since that value is the * smallest reasonable one. If they want better timestamps they * should use the non-"get"* functions. */ if (hz > 1000) tc_tick = (hz + 500) / 1000; else tc_tick = 1; tc_adjprecision(); FREQ2BT(hz, &tick_bt); tick_sbt = bttosbt(tick_bt); tick_rate = hz / tc_tick; FREQ2BT(tick_rate, &tc_tick_bt); tc_tick_sbt = bttosbt(tc_tick_bt); p = (tc_tick * 1000000) / hz; printf("Timecounters tick every %d.%03u msec\n", p / 1000, p % 1000); #ifdef FFCLOCK ffclock_init(); #endif /* warm up new timecounter (again) and get rolling. */ (void)timecounter->tc_get_timecount(timecounter); (void)timecounter->tc_get_timecount(timecounter); mtx_lock_spin(&tc_setclock_mtx); tc_windup(NULL); mtx_unlock_spin(&tc_setclock_mtx); } SYSINIT(timecounter, SI_SUB_CLOCKS, SI_ORDER_SECOND, inittimecounter, NULL); /* Cpu tick handling -------------------------------------------------*/ static int cpu_tick_variable; static uint64_t cpu_tick_frequency; DPCPU_DEFINE_STATIC(uint64_t, tc_cpu_ticks_base); DPCPU_DEFINE_STATIC(unsigned, tc_cpu_ticks_last); static uint64_t tc_cpu_ticks(void) { struct timecounter *tc; uint64_t res, *base; unsigned u, *last; critical_enter(); base = DPCPU_PTR(tc_cpu_ticks_base); last = DPCPU_PTR(tc_cpu_ticks_last); tc = timehands->th_counter; u = tc->tc_get_timecount(tc) & tc->tc_counter_mask; if (u < *last) *base += (uint64_t)tc->tc_counter_mask + 1; *last = u; res = u + *base; critical_exit(); return (res); } void cpu_tick_calibration(void) { static time_t last_calib; if (time_uptime != last_calib && !(time_uptime & 0xf)) { cpu_tick_calibrate(0); last_calib = time_uptime; } } /* * This function gets called every 16 seconds on only one designated * CPU in the system from hardclock() via cpu_tick_calibration()(). * * Whenever the real time clock is stepped we get called with reset=1 * to make sure we handle suspend/resume and similar events correctly. */ static void cpu_tick_calibrate(int reset) { static uint64_t c_last; uint64_t c_this, c_delta; static struct bintime t_last; struct bintime t_this, t_delta; uint32_t divi; if (reset) { /* The clock was stepped, abort & reset */ t_last.sec = 0; return; } /* we don't calibrate fixed rate cputicks */ if (!cpu_tick_variable) return; getbinuptime(&t_this); c_this = cpu_ticks(); if (t_last.sec != 0) { c_delta = c_this - c_last; t_delta = t_this; bintime_sub(&t_delta, &t_last); /* * Headroom: * 2^(64-20) / 16[s] = * 2^(44) / 16[s] = * 17.592.186.044.416 / 16 = * 1.099.511.627.776 [Hz] */ divi = t_delta.sec << 20; divi |= t_delta.frac >> (64 - 20); c_delta <<= 20; c_delta /= divi; if (c_delta > cpu_tick_frequency) { if (0 && bootverbose) printf("cpu_tick increased to %ju Hz\n", c_delta); cpu_tick_frequency = c_delta; } } c_last = c_this; t_last = t_this; } void set_cputicker(cpu_tick_f *func, uint64_t freq, unsigned var) { if (func == NULL) { cpu_ticks = tc_cpu_ticks; } else { cpu_tick_frequency = freq; cpu_tick_variable = var; cpu_ticks = func; } } uint64_t cpu_tickrate(void) { if (cpu_ticks == tc_cpu_ticks) return (tc_getfrequency()); return (cpu_tick_frequency); } /* * We need to be slightly careful converting cputicks to microseconds. * There is plenty of margin in 64 bits of microseconds (half a million * years) and in 64 bits at 4 GHz (146 years), but if we do a multiply * before divide conversion (to retain precision) we find that the * margin shrinks to 1.5 hours (one millionth of 146y). * With a three prong approach we never lose significant bits, no * matter what the cputick rate and length of timeinterval is. */ uint64_t cputick2usec(uint64_t tick) { if (tick > 18446744073709551LL) /* floor(2^64 / 1000) */ return (tick / (cpu_tickrate() / 1000000LL)); else if (tick > 18446744073709LL) /* floor(2^64 / 1000000) */ return ((tick * 1000LL) / (cpu_tickrate() / 1000LL)); else return ((tick * 1000000LL) / cpu_tickrate()); } cpu_tick_f *cpu_ticks = tc_cpu_ticks; static int vdso_th_enable = 1; static int sysctl_fast_gettime(SYSCTL_HANDLER_ARGS) { int old_vdso_th_enable, error; old_vdso_th_enable = vdso_th_enable; error = sysctl_handle_int(oidp, &old_vdso_th_enable, 0, req); if (error != 0) return (error); vdso_th_enable = old_vdso_th_enable; return (0); } SYSCTL_PROC(_kern_timecounter, OID_AUTO, fast_gettime, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_fast_gettime, "I", "Enable fast time of day"); uint32_t tc_fill_vdso_timehands(struct vdso_timehands *vdso_th) { struct timehands *th; uint32_t enabled; th = timehands; vdso_th->th_scale = th->th_scale; vdso_th->th_offset_count = th->th_offset_count; vdso_th->th_counter_mask = th->th_counter->tc_counter_mask; vdso_th->th_offset = th->th_offset; vdso_th->th_boottime = th->th_boottime; if (th->th_counter->tc_fill_vdso_timehands != NULL) { enabled = th->th_counter->tc_fill_vdso_timehands(vdso_th, th->th_counter); } else enabled = 0; if (!vdso_th_enable) enabled = 0; return (enabled); } #ifdef COMPAT_FREEBSD32 uint32_t tc_fill_vdso_timehands32(struct vdso_timehands32 *vdso_th32) { struct timehands *th; uint32_t enabled; th = timehands; *(uint64_t *)&vdso_th32->th_scale[0] = th->th_scale; vdso_th32->th_offset_count = th->th_offset_count; vdso_th32->th_counter_mask = th->th_counter->tc_counter_mask; vdso_th32->th_offset.sec = th->th_offset.sec; *(uint64_t *)&vdso_th32->th_offset.frac[0] = th->th_offset.frac; vdso_th32->th_boottime.sec = th->th_boottime.sec; *(uint64_t *)&vdso_th32->th_boottime.frac[0] = th->th_boottime.frac; if (th->th_counter->tc_fill_vdso_timehands32 != NULL) { enabled = th->th_counter->tc_fill_vdso_timehands32(vdso_th32, th->th_counter); } else enabled = 0; if (!vdso_th_enable) enabled = 0; return (enabled); } #endif Index: head/sys/kern/kern_time.c =================================================================== --- head/sys/kern/kern_time.c (revision 336913) +++ head/sys/kern/kern_time.c (revision 336914) @@ -1,1763 +1,1765 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)kern_time.c 8.1 (Berkeley) 6/10/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #define MAX_CLOCKS (CLOCK_MONOTONIC+1) #define CPUCLOCK_BIT 0x80000000 #define CPUCLOCK_PROCESS_BIT 0x40000000 #define CPUCLOCK_ID_MASK (~(CPUCLOCK_BIT|CPUCLOCK_PROCESS_BIT)) #define MAKE_THREAD_CPUCLOCK(tid) (CPUCLOCK_BIT|(tid)) #define MAKE_PROCESS_CPUCLOCK(pid) \ (CPUCLOCK_BIT|CPUCLOCK_PROCESS_BIT|(pid)) static struct kclock posix_clocks[MAX_CLOCKS]; static uma_zone_t itimer_zone = NULL; /* * Time of day and interval timer support. * * These routines provide the kernel entry points to get and set * the time-of-day and per-process interval timers. Subroutines * here provide support for adding and subtracting timeval structures * and decrementing interval timers, optionally reloading the interval * timers when they expire. */ static int settime(struct thread *, struct timeval *); static void timevalfix(struct timeval *); static int user_clock_nanosleep(struct thread *td, clockid_t clock_id, int flags, const struct timespec *ua_rqtp, struct timespec *ua_rmtp); static void itimer_start(void); static int itimer_init(void *, int, int); static void itimer_fini(void *, int); static void itimer_enter(struct itimer *); static void itimer_leave(struct itimer *); static struct itimer *itimer_find(struct proc *, int); static void itimers_alloc(struct proc *); static void itimers_event_hook_exec(void *arg, struct proc *p, struct image_params *imgp); static void itimers_event_hook_exit(void *arg, struct proc *p); static int realtimer_create(struct itimer *); static int realtimer_gettime(struct itimer *, struct itimerspec *); static int realtimer_settime(struct itimer *, int, struct itimerspec *, struct itimerspec *); static int realtimer_delete(struct itimer *); static void realtimer_clocktime(clockid_t, struct timespec *); static void realtimer_expire(void *); int register_posix_clock(int, struct kclock *); void itimer_fire(struct itimer *it); int itimespecfix(struct timespec *ts); #define CLOCK_CALL(clock, call, arglist) \ ((*posix_clocks[clock].call) arglist) SYSINIT(posix_timer, SI_SUB_P1003_1B, SI_ORDER_FIRST+4, itimer_start, NULL); static int settime(struct thread *td, struct timeval *tv) { struct timeval delta, tv1, tv2; static struct timeval maxtime, laststep; struct timespec ts; microtime(&tv1); delta = *tv; timevalsub(&delta, &tv1); /* * If the system is secure, we do not allow the time to be * set to a value earlier than 1 second less than the highest * time we have yet seen. The worst a miscreant can do in * this circumstance is "freeze" time. He couldn't go * back to the past. * * We similarly do not allow the clock to be stepped more * than one second, nor more than once per second. This allows * a miscreant to make the clock march double-time, but no worse. */ if (securelevel_gt(td->td_ucred, 1) != 0) { if (delta.tv_sec < 0 || delta.tv_usec < 0) { /* * Update maxtime to latest time we've seen. */ if (tv1.tv_sec > maxtime.tv_sec) maxtime = tv1; tv2 = *tv; timevalsub(&tv2, &maxtime); if (tv2.tv_sec < -1) { tv->tv_sec = maxtime.tv_sec - 1; printf("Time adjustment clamped to -1 second\n"); } } else { if (tv1.tv_sec == laststep.tv_sec) return (EPERM); if (delta.tv_sec > 1) { tv->tv_sec = tv1.tv_sec + 1; printf("Time adjustment clamped to +1 second\n"); } laststep = *tv; } } ts.tv_sec = tv->tv_sec; ts.tv_nsec = tv->tv_usec * 1000; tc_setclock(&ts); resettodr(); return (0); } #ifndef _SYS_SYSPROTO_H_ struct clock_getcpuclockid2_args { id_t id; int which, clockid_t *clock_id; }; #endif /* ARGSUSED */ int sys_clock_getcpuclockid2(struct thread *td, struct clock_getcpuclockid2_args *uap) { clockid_t clk_id; int error; error = kern_clock_getcpuclockid2(td, uap->id, uap->which, &clk_id); if (error == 0) error = copyout(&clk_id, uap->clock_id, sizeof(clockid_t)); return (error); } int kern_clock_getcpuclockid2(struct thread *td, id_t id, int which, clockid_t *clk_id) { struct proc *p; pid_t pid; lwpid_t tid; int error; switch (which) { case CPUCLOCK_WHICH_PID: if (id != 0) { error = pget(id, PGET_CANSEE | PGET_NOTID, &p); if (error != 0) return (error); PROC_UNLOCK(p); pid = id; } else { pid = td->td_proc->p_pid; } *clk_id = MAKE_PROCESS_CPUCLOCK(pid); return (0); case CPUCLOCK_WHICH_TID: tid = id == 0 ? td->td_tid : id; *clk_id = MAKE_THREAD_CPUCLOCK(tid); return (0); default: return (EINVAL); } } #ifndef _SYS_SYSPROTO_H_ struct clock_gettime_args { clockid_t clock_id; struct timespec *tp; }; #endif /* ARGSUSED */ int sys_clock_gettime(struct thread *td, struct clock_gettime_args *uap) { struct timespec ats; int error; error = kern_clock_gettime(td, uap->clock_id, &ats); if (error == 0) error = copyout(&ats, uap->tp, sizeof(ats)); return (error); } static inline void cputick2timespec(uint64_t runtime, struct timespec *ats) { runtime = cputick2usec(runtime); ats->tv_sec = runtime / 1000000; ats->tv_nsec = runtime % 1000000 * 1000; } static void get_thread_cputime(struct thread *targettd, struct timespec *ats) { uint64_t runtime, curtime, switchtime; if (targettd == NULL) { /* current thread */ critical_enter(); switchtime = PCPU_GET(switchtime); curtime = cpu_ticks(); runtime = curthread->td_runtime; critical_exit(); runtime += curtime - switchtime; } else { thread_lock(targettd); runtime = targettd->td_runtime; thread_unlock(targettd); } cputick2timespec(runtime, ats); } static void get_process_cputime(struct proc *targetp, struct timespec *ats) { uint64_t runtime; struct rusage ru; PROC_STATLOCK(targetp); rufetch(targetp, &ru); runtime = targetp->p_rux.rux_runtime; if (curthread->td_proc == targetp) runtime += cpu_ticks() - PCPU_GET(switchtime); PROC_STATUNLOCK(targetp); cputick2timespec(runtime, ats); } static int get_cputime(struct thread *td, clockid_t clock_id, struct timespec *ats) { struct proc *p, *p2; struct thread *td2; lwpid_t tid; pid_t pid; int error; p = td->td_proc; if ((clock_id & CPUCLOCK_PROCESS_BIT) == 0) { tid = clock_id & CPUCLOCK_ID_MASK; td2 = tdfind(tid, p->p_pid); if (td2 == NULL) return (EINVAL); get_thread_cputime(td2, ats); PROC_UNLOCK(td2->td_proc); } else { pid = clock_id & CPUCLOCK_ID_MASK; error = pget(pid, PGET_CANSEE, &p2); if (error != 0) return (EINVAL); get_process_cputime(p2, ats); PROC_UNLOCK(p2); } return (0); } int kern_clock_gettime(struct thread *td, clockid_t clock_id, struct timespec *ats) { struct timeval sys, user; struct proc *p; p = td->td_proc; switch (clock_id) { case CLOCK_REALTIME: /* Default to precise. */ case CLOCK_REALTIME_PRECISE: nanotime(ats); break; case CLOCK_REALTIME_FAST: getnanotime(ats); break; case CLOCK_VIRTUAL: PROC_LOCK(p); PROC_STATLOCK(p); calcru(p, &user, &sys); PROC_STATUNLOCK(p); PROC_UNLOCK(p); TIMEVAL_TO_TIMESPEC(&user, ats); break; case CLOCK_PROF: PROC_LOCK(p); PROC_STATLOCK(p); calcru(p, &user, &sys); PROC_STATUNLOCK(p); PROC_UNLOCK(p); timevaladd(&user, &sys); TIMEVAL_TO_TIMESPEC(&user, ats); break; case CLOCK_MONOTONIC: /* Default to precise. */ case CLOCK_MONOTONIC_PRECISE: case CLOCK_UPTIME: case CLOCK_UPTIME_PRECISE: nanouptime(ats); break; case CLOCK_UPTIME_FAST: case CLOCK_MONOTONIC_FAST: getnanouptime(ats); break; case CLOCK_SECOND: ats->tv_sec = time_second; ats->tv_nsec = 0; break; case CLOCK_THREAD_CPUTIME_ID: get_thread_cputime(NULL, ats); break; case CLOCK_PROCESS_CPUTIME_ID: PROC_LOCK(p); get_process_cputime(p, ats); PROC_UNLOCK(p); break; default: if ((int)clock_id >= 0) return (EINVAL); return (get_cputime(td, clock_id, ats)); } return (0); } #ifndef _SYS_SYSPROTO_H_ struct clock_settime_args { clockid_t clock_id; const struct timespec *tp; }; #endif /* ARGSUSED */ int sys_clock_settime(struct thread *td, struct clock_settime_args *uap) { struct timespec ats; int error; if ((error = copyin(uap->tp, &ats, sizeof(ats))) != 0) return (error); return (kern_clock_settime(td, uap->clock_id, &ats)); } static int allow_insane_settime = 0; SYSCTL_INT(_debug, OID_AUTO, allow_insane_settime, CTLFLAG_RWTUN, &allow_insane_settime, 0, "do not perform possibly restrictive checks on settime(2) args"); int kern_clock_settime(struct thread *td, clockid_t clock_id, struct timespec *ats) { struct timeval atv; int error; if ((error = priv_check(td, PRIV_CLOCK_SETTIME)) != 0) return (error); if (clock_id != CLOCK_REALTIME) return (EINVAL); if (ats->tv_nsec < 0 || ats->tv_nsec >= 1000000000 || ats->tv_sec < 0) return (EINVAL); if (!allow_insane_settime && ats->tv_sec > 8000ULL * 365 * 24 * 60 * 60) return (EINVAL); /* XXX Don't convert nsec->usec and back */ TIMESPEC_TO_TIMEVAL(&atv, ats); error = settime(td, &atv); return (error); } #ifndef _SYS_SYSPROTO_H_ struct clock_getres_args { clockid_t clock_id; struct timespec *tp; }; #endif int sys_clock_getres(struct thread *td, struct clock_getres_args *uap) { struct timespec ts; int error; if (uap->tp == NULL) return (0); error = kern_clock_getres(td, uap->clock_id, &ts); if (error == 0) error = copyout(&ts, uap->tp, sizeof(ts)); return (error); } int kern_clock_getres(struct thread *td, clockid_t clock_id, struct timespec *ts) { ts->tv_sec = 0; switch (clock_id) { case CLOCK_REALTIME: case CLOCK_REALTIME_FAST: case CLOCK_REALTIME_PRECISE: case CLOCK_MONOTONIC: case CLOCK_MONOTONIC_FAST: case CLOCK_MONOTONIC_PRECISE: case CLOCK_UPTIME: case CLOCK_UPTIME_FAST: case CLOCK_UPTIME_PRECISE: /* * Round up the result of the division cheaply by adding 1. * Rounding up is especially important if rounding down * would give 0. Perfect rounding is unimportant. */ ts->tv_nsec = 1000000000 / tc_getfrequency() + 1; break; case CLOCK_VIRTUAL: case CLOCK_PROF: /* Accurately round up here because we can do so cheaply. */ ts->tv_nsec = howmany(1000000000, hz); break; case CLOCK_SECOND: ts->tv_sec = 1; ts->tv_nsec = 0; break; case CLOCK_THREAD_CPUTIME_ID: case CLOCK_PROCESS_CPUTIME_ID: cputime: /* sync with cputick2usec */ ts->tv_nsec = 1000000 / cpu_tickrate(); if (ts->tv_nsec == 0) ts->tv_nsec = 1000; break; default: if ((int)clock_id < 0) goto cputime; return (EINVAL); } return (0); } int kern_nanosleep(struct thread *td, struct timespec *rqt, struct timespec *rmt) { return (kern_clock_nanosleep(td, CLOCK_REALTIME, TIMER_RELTIME, rqt, rmt)); } static uint8_t nanowait[MAXCPU]; int kern_clock_nanosleep(struct thread *td, clockid_t clock_id, int flags, const struct timespec *rqt, struct timespec *rmt) { struct timespec ts, now; sbintime_t sbt, sbtt, prec, tmp; time_t over; int error; bool is_abs_real; if (rqt->tv_nsec < 0 || rqt->tv_nsec >= 1000000000) return (EINVAL); if ((flags & ~TIMER_ABSTIME) != 0) return (EINVAL); switch (clock_id) { case CLOCK_REALTIME: case CLOCK_REALTIME_PRECISE: case CLOCK_REALTIME_FAST: case CLOCK_SECOND: is_abs_real = (flags & TIMER_ABSTIME) != 0; break; case CLOCK_MONOTONIC: case CLOCK_MONOTONIC_PRECISE: case CLOCK_MONOTONIC_FAST: case CLOCK_UPTIME: case CLOCK_UPTIME_PRECISE: case CLOCK_UPTIME_FAST: is_abs_real = false; break; case CLOCK_VIRTUAL: case CLOCK_PROF: case CLOCK_PROCESS_CPUTIME_ID: return (ENOTSUP); case CLOCK_THREAD_CPUTIME_ID: default: return (EINVAL); } do { ts = *rqt; if ((flags & TIMER_ABSTIME) != 0) { if (is_abs_real) td->td_rtcgen = atomic_load_acq_int(&rtc_generation); error = kern_clock_gettime(td, clock_id, &now); KASSERT(error == 0, ("kern_clock_gettime: %d", error)); - timespecsub(&ts, &now); + timespecsub(&ts, &now, &ts); } if (ts.tv_sec < 0 || (ts.tv_sec == 0 && ts.tv_nsec == 0)) { error = EWOULDBLOCK; break; } if (ts.tv_sec > INT32_MAX / 2) { over = ts.tv_sec - INT32_MAX / 2; ts.tv_sec -= over; } else over = 0; tmp = tstosbt(ts); prec = tmp; prec >>= tc_precexp; if (TIMESEL(&sbt, tmp)) sbt += tc_tick_sbt; sbt += tmp; error = tsleep_sbt(&nanowait[curcpu], PWAIT | PCATCH, "nanslp", sbt, prec, C_ABSOLUTE); } while (error == 0 && is_abs_real && td->td_rtcgen == 0); td->td_rtcgen = 0; if (error != EWOULDBLOCK) { if (TIMESEL(&sbtt, tmp)) sbtt += tc_tick_sbt; if (sbtt >= sbt) return (0); if (error == ERESTART) error = EINTR; if ((flags & TIMER_ABSTIME) == 0 && rmt != NULL) { ts = sbttots(sbt - sbtt); ts.tv_sec += over; if (ts.tv_sec < 0) timespecclear(&ts); *rmt = ts; } return (error); } return (0); } #ifndef _SYS_SYSPROTO_H_ struct nanosleep_args { struct timespec *rqtp; struct timespec *rmtp; }; #endif /* ARGSUSED */ int sys_nanosleep(struct thread *td, struct nanosleep_args *uap) { return (user_clock_nanosleep(td, CLOCK_REALTIME, TIMER_RELTIME, uap->rqtp, uap->rmtp)); } #ifndef _SYS_SYSPROTO_H_ struct clock_nanosleep_args { clockid_t clock_id; int flags; struct timespec *rqtp; struct timespec *rmtp; }; #endif /* ARGSUSED */ int sys_clock_nanosleep(struct thread *td, struct clock_nanosleep_args *uap) { int error; error = user_clock_nanosleep(td, uap->clock_id, uap->flags, uap->rqtp, uap->rmtp); return (kern_posix_error(td, error)); } static int user_clock_nanosleep(struct thread *td, clockid_t clock_id, int flags, const struct timespec *ua_rqtp, struct timespec *ua_rmtp) { struct timespec rmt, rqt; int error; error = copyin(ua_rqtp, &rqt, sizeof(rqt)); if (error) return (error); if (ua_rmtp != NULL && (flags & TIMER_ABSTIME) == 0 && !useracc(ua_rmtp, sizeof(rmt), VM_PROT_WRITE)) return (EFAULT); error = kern_clock_nanosleep(td, clock_id, flags, &rqt, &rmt); if (error == EINTR && ua_rmtp != NULL && (flags & TIMER_ABSTIME) == 0) { int error2; error2 = copyout(&rmt, ua_rmtp, sizeof(rmt)); if (error2) error = error2; } return (error); } #ifndef _SYS_SYSPROTO_H_ struct gettimeofday_args { struct timeval *tp; struct timezone *tzp; }; #endif /* ARGSUSED */ int sys_gettimeofday(struct thread *td, struct gettimeofday_args *uap) { struct timeval atv; struct timezone rtz; int error = 0; if (uap->tp) { microtime(&atv); error = copyout(&atv, uap->tp, sizeof (atv)); } if (error == 0 && uap->tzp != NULL) { rtz.tz_minuteswest = tz_minuteswest; rtz.tz_dsttime = tz_dsttime; error = copyout(&rtz, uap->tzp, sizeof (rtz)); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct settimeofday_args { struct timeval *tv; struct timezone *tzp; }; #endif /* ARGSUSED */ int sys_settimeofday(struct thread *td, struct settimeofday_args *uap) { struct timeval atv, *tvp; struct timezone atz, *tzp; int error; if (uap->tv) { error = copyin(uap->tv, &atv, sizeof(atv)); if (error) return (error); tvp = &atv; } else tvp = NULL; if (uap->tzp) { error = copyin(uap->tzp, &atz, sizeof(atz)); if (error) return (error); tzp = &atz; } else tzp = NULL; return (kern_settimeofday(td, tvp, tzp)); } int kern_settimeofday(struct thread *td, struct timeval *tv, struct timezone *tzp) { int error; error = priv_check(td, PRIV_SETTIMEOFDAY); if (error) return (error); /* Verify all parameters before changing time. */ if (tv) { if (tv->tv_usec < 0 || tv->tv_usec >= 1000000 || tv->tv_sec < 0) return (EINVAL); error = settime(td, tv); } if (tzp && error == 0) { tz_minuteswest = tzp->tz_minuteswest; tz_dsttime = tzp->tz_dsttime; } return (error); } /* * Get value of an interval timer. The process virtual and profiling virtual * time timers are kept in the p_stats area, since they can be swapped out. * These are kept internally in the way they are specified externally: in * time until they expire. * * The real time interval timer is kept in the process table slot for the * process, and its value (it_value) is kept as an absolute time rather than * as a delta, so that it is easy to keep periodic real-time signals from * drifting. * * Virtual time timers are processed in the hardclock() routine of * kern_clock.c. The real time timer is processed by a timeout routine, * called from the softclock() routine. Since a callout may be delayed in * real time due to interrupt processing in the system, it is possible for * the real time timeout routine (realitexpire, given below), to be delayed * in real time past when it is supposed to occur. It does not suffice, * therefore, to reload the real timer .it_value from the real time timers * .it_interval. Rather, we compute the next time in absolute time the timer * should go off. */ #ifndef _SYS_SYSPROTO_H_ struct getitimer_args { u_int which; struct itimerval *itv; }; #endif int sys_getitimer(struct thread *td, struct getitimer_args *uap) { struct itimerval aitv; int error; error = kern_getitimer(td, uap->which, &aitv); if (error != 0) return (error); return (copyout(&aitv, uap->itv, sizeof (struct itimerval))); } int kern_getitimer(struct thread *td, u_int which, struct itimerval *aitv) { struct proc *p = td->td_proc; struct timeval ctv; if (which > ITIMER_PROF) return (EINVAL); if (which == ITIMER_REAL) { /* * Convert from absolute to relative time in .it_value * part of real time timer. If time for real time timer * has passed return 0, else return difference between * current time and time for the timer to go off. */ PROC_LOCK(p); *aitv = p->p_realtimer; PROC_UNLOCK(p); if (timevalisset(&aitv->it_value)) { microuptime(&ctv); if (timevalcmp(&aitv->it_value, &ctv, <)) timevalclear(&aitv->it_value); else timevalsub(&aitv->it_value, &ctv); } } else { PROC_ITIMLOCK(p); *aitv = p->p_stats->p_timer[which]; PROC_ITIMUNLOCK(p); } #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktritimerval(aitv); #endif return (0); } #ifndef _SYS_SYSPROTO_H_ struct setitimer_args { u_int which; struct itimerval *itv, *oitv; }; #endif int sys_setitimer(struct thread *td, struct setitimer_args *uap) { struct itimerval aitv, oitv; int error; if (uap->itv == NULL) { uap->itv = uap->oitv; return (sys_getitimer(td, (struct getitimer_args *)uap)); } if ((error = copyin(uap->itv, &aitv, sizeof(struct itimerval)))) return (error); error = kern_setitimer(td, uap->which, &aitv, &oitv); if (error != 0 || uap->oitv == NULL) return (error); return (copyout(&oitv, uap->oitv, sizeof(struct itimerval))); } int kern_setitimer(struct thread *td, u_int which, struct itimerval *aitv, struct itimerval *oitv) { struct proc *p = td->td_proc; struct timeval ctv; sbintime_t sbt, pr; if (aitv == NULL) return (kern_getitimer(td, which, oitv)); if (which > ITIMER_PROF) return (EINVAL); #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktritimerval(aitv); #endif if (itimerfix(&aitv->it_value) || aitv->it_value.tv_sec > INT32_MAX / 2) return (EINVAL); if (!timevalisset(&aitv->it_value)) timevalclear(&aitv->it_interval); else if (itimerfix(&aitv->it_interval) || aitv->it_interval.tv_sec > INT32_MAX / 2) return (EINVAL); if (which == ITIMER_REAL) { PROC_LOCK(p); if (timevalisset(&p->p_realtimer.it_value)) callout_stop(&p->p_itcallout); microuptime(&ctv); if (timevalisset(&aitv->it_value)) { pr = tvtosbt(aitv->it_value) >> tc_precexp; timevaladd(&aitv->it_value, &ctv); sbt = tvtosbt(aitv->it_value); callout_reset_sbt(&p->p_itcallout, sbt, pr, realitexpire, p, C_ABSOLUTE); } *oitv = p->p_realtimer; p->p_realtimer = *aitv; PROC_UNLOCK(p); if (timevalisset(&oitv->it_value)) { if (timevalcmp(&oitv->it_value, &ctv, <)) timevalclear(&oitv->it_value); else timevalsub(&oitv->it_value, &ctv); } } else { if (aitv->it_interval.tv_sec == 0 && aitv->it_interval.tv_usec != 0 && aitv->it_interval.tv_usec < tick) aitv->it_interval.tv_usec = tick; if (aitv->it_value.tv_sec == 0 && aitv->it_value.tv_usec != 0 && aitv->it_value.tv_usec < tick) aitv->it_value.tv_usec = tick; PROC_ITIMLOCK(p); *oitv = p->p_stats->p_timer[which]; p->p_stats->p_timer[which] = *aitv; PROC_ITIMUNLOCK(p); } #ifdef KTRACE if (KTRPOINT(td, KTR_STRUCT)) ktritimerval(oitv); #endif return (0); } /* * Real interval timer expired: * send process whose timer expired an alarm signal. * If time is not set up to reload, then just return. * Else compute next time timer should go off which is > current time. * This is where delay in processing this timeout causes multiple * SIGALRM calls to be compressed into one. * tvtohz() always adds 1 to allow for the time until the next clock * interrupt being strictly less than 1 clock tick, but we don't want * that here since we want to appear to be in sync with the clock * interrupt even when we're delayed. */ void realitexpire(void *arg) { struct proc *p; struct timeval ctv; sbintime_t isbt; p = (struct proc *)arg; kern_psignal(p, SIGALRM); if (!timevalisset(&p->p_realtimer.it_interval)) { timevalclear(&p->p_realtimer.it_value); if (p->p_flag & P_WEXIT) wakeup(&p->p_itcallout); return; } isbt = tvtosbt(p->p_realtimer.it_interval); if (isbt >= sbt_timethreshold) getmicrouptime(&ctv); else microuptime(&ctv); do { timevaladd(&p->p_realtimer.it_value, &p->p_realtimer.it_interval); } while (timevalcmp(&p->p_realtimer.it_value, &ctv, <=)); callout_reset_sbt(&p->p_itcallout, tvtosbt(p->p_realtimer.it_value), isbt >> tc_precexp, realitexpire, p, C_ABSOLUTE); } /* * Check that a proposed value to load into the .it_value or * .it_interval part of an interval timer is acceptable, and * fix it to have at least minimal value (i.e. if it is less * than the resolution of the clock, round it up.) */ int itimerfix(struct timeval *tv) { if (tv->tv_sec < 0 || tv->tv_usec < 0 || tv->tv_usec >= 1000000) return (EINVAL); if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < (u_int)tick / 16) tv->tv_usec = (u_int)tick / 16; return (0); } /* * Decrement an interval timer by a specified number * of microseconds, which must be less than a second, * i.e. < 1000000. If the timer expires, then reload * it. In this case, carry over (usec - old value) to * reduce the value reloaded into the timer so that * the timer does not drift. This routine assumes * that it is called in a context where the timers * on which it is operating cannot change in value. */ int itimerdecr(struct itimerval *itp, int usec) { if (itp->it_value.tv_usec < usec) { if (itp->it_value.tv_sec == 0) { /* expired, and already in next interval */ usec -= itp->it_value.tv_usec; goto expire; } itp->it_value.tv_usec += 1000000; itp->it_value.tv_sec--; } itp->it_value.tv_usec -= usec; usec = 0; if (timevalisset(&itp->it_value)) return (1); /* expired, exactly at end of interval */ expire: if (timevalisset(&itp->it_interval)) { itp->it_value = itp->it_interval; itp->it_value.tv_usec -= usec; if (itp->it_value.tv_usec < 0) { itp->it_value.tv_usec += 1000000; itp->it_value.tv_sec--; } } else itp->it_value.tv_usec = 0; /* sec is already 0 */ return (0); } /* * Add and subtract routines for timevals. * N.B.: subtract routine doesn't deal with * results which are before the beginning, * it just gets very confused in this case. * Caveat emptor. */ void timevaladd(struct timeval *t1, const struct timeval *t2) { t1->tv_sec += t2->tv_sec; t1->tv_usec += t2->tv_usec; timevalfix(t1); } void timevalsub(struct timeval *t1, const struct timeval *t2) { t1->tv_sec -= t2->tv_sec; t1->tv_usec -= t2->tv_usec; timevalfix(t1); } static void timevalfix(struct timeval *t1) { if (t1->tv_usec < 0) { t1->tv_sec--; t1->tv_usec += 1000000; } if (t1->tv_usec >= 1000000) { t1->tv_sec++; t1->tv_usec -= 1000000; } } /* * ratecheck(): simple time-based rate-limit checking. */ int ratecheck(struct timeval *lasttime, const struct timeval *mininterval) { struct timeval tv, delta; int rv = 0; getmicrouptime(&tv); /* NB: 10ms precision */ delta = tv; timevalsub(&delta, lasttime); /* * check for 0,0 is so that the message will be seen at least once, * even if interval is huge. */ if (timevalcmp(&delta, mininterval, >=) || (lasttime->tv_sec == 0 && lasttime->tv_usec == 0)) { *lasttime = tv; rv = 1; } return (rv); } /* * ppsratecheck(): packets (or events) per second limitation. * * Return 0 if the limit is to be enforced (e.g. the caller * should drop a packet because of the rate limitation). * * maxpps of 0 always causes zero to be returned. maxpps of -1 * always causes 1 to be returned; this effectively defeats rate * limiting. * * Note that we maintain the struct timeval for compatibility * with other bsd systems. We reuse the storage and just monitor * clock ticks for minimal overhead. */ int ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps) { int now; /* * Reset the last time and counter if this is the first call * or more than a second has passed since the last update of * lasttime. */ now = ticks; if (lasttime->tv_sec == 0 || (u_int)(now - lasttime->tv_sec) >= hz) { lasttime->tv_sec = now; *curpps = 1; return (maxpps != 0); } else { (*curpps)++; /* NB: ignore potential overflow */ return (maxpps < 0 || *curpps <= maxpps); } } static void itimer_start(void) { struct kclock rt_clock = { .timer_create = realtimer_create, .timer_delete = realtimer_delete, .timer_settime = realtimer_settime, .timer_gettime = realtimer_gettime, .event_hook = NULL }; itimer_zone = uma_zcreate("itimer", sizeof(struct itimer), NULL, NULL, itimer_init, itimer_fini, UMA_ALIGN_PTR, 0); register_posix_clock(CLOCK_REALTIME, &rt_clock); register_posix_clock(CLOCK_MONOTONIC, &rt_clock); p31b_setcfg(CTL_P1003_1B_TIMERS, 200112L); p31b_setcfg(CTL_P1003_1B_DELAYTIMER_MAX, INT_MAX); p31b_setcfg(CTL_P1003_1B_TIMER_MAX, TIMER_MAX); EVENTHANDLER_REGISTER(process_exit, itimers_event_hook_exit, (void *)ITIMER_EV_EXIT, EVENTHANDLER_PRI_ANY); EVENTHANDLER_REGISTER(process_exec, itimers_event_hook_exec, (void *)ITIMER_EV_EXEC, EVENTHANDLER_PRI_ANY); } int register_posix_clock(int clockid, struct kclock *clk) { if ((unsigned)clockid >= MAX_CLOCKS) { printf("%s: invalid clockid\n", __func__); return (0); } posix_clocks[clockid] = *clk; return (1); } static int itimer_init(void *mem, int size, int flags) { struct itimer *it; it = (struct itimer *)mem; mtx_init(&it->it_mtx, "itimer lock", NULL, MTX_DEF); return (0); } static void itimer_fini(void *mem, int size) { struct itimer *it; it = (struct itimer *)mem; mtx_destroy(&it->it_mtx); } static void itimer_enter(struct itimer *it) { mtx_assert(&it->it_mtx, MA_OWNED); it->it_usecount++; } static void itimer_leave(struct itimer *it) { mtx_assert(&it->it_mtx, MA_OWNED); KASSERT(it->it_usecount > 0, ("invalid it_usecount")); if (--it->it_usecount == 0 && (it->it_flags & ITF_WANTED) != 0) wakeup(it); } #ifndef _SYS_SYSPROTO_H_ struct ktimer_create_args { clockid_t clock_id; struct sigevent * evp; int * timerid; }; #endif int sys_ktimer_create(struct thread *td, struct ktimer_create_args *uap) { struct sigevent *evp, ev; int id; int error; if (uap->evp == NULL) { evp = NULL; } else { error = copyin(uap->evp, &ev, sizeof(ev)); if (error != 0) return (error); evp = &ev; } error = kern_ktimer_create(td, uap->clock_id, evp, &id, -1); if (error == 0) { error = copyout(&id, uap->timerid, sizeof(int)); if (error != 0) kern_ktimer_delete(td, id); } return (error); } int kern_ktimer_create(struct thread *td, clockid_t clock_id, struct sigevent *evp, int *timerid, int preset_id) { struct proc *p = td->td_proc; struct itimer *it; int id; int error; if (clock_id < 0 || clock_id >= MAX_CLOCKS) return (EINVAL); if (posix_clocks[clock_id].timer_create == NULL) return (EINVAL); if (evp != NULL) { if (evp->sigev_notify != SIGEV_NONE && evp->sigev_notify != SIGEV_SIGNAL && evp->sigev_notify != SIGEV_THREAD_ID) return (EINVAL); if ((evp->sigev_notify == SIGEV_SIGNAL || evp->sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(evp->sigev_signo)) return (EINVAL); } if (p->p_itimers == NULL) itimers_alloc(p); it = uma_zalloc(itimer_zone, M_WAITOK); it->it_flags = 0; it->it_usecount = 0; it->it_active = 0; timespecclear(&it->it_time.it_value); timespecclear(&it->it_time.it_interval); it->it_overrun = 0; it->it_overrun_last = 0; it->it_clockid = clock_id; it->it_timerid = -1; it->it_proc = p; ksiginfo_init(&it->it_ksi); it->it_ksi.ksi_flags |= KSI_INS | KSI_EXT; error = CLOCK_CALL(clock_id, timer_create, (it)); if (error != 0) goto out; PROC_LOCK(p); if (preset_id != -1) { KASSERT(preset_id >= 0 && preset_id < 3, ("invalid preset_id")); id = preset_id; if (p->p_itimers->its_timers[id] != NULL) { PROC_UNLOCK(p); error = 0; goto out; } } else { /* * Find a free timer slot, skipping those reserved * for setitimer(). */ for (id = 3; id < TIMER_MAX; id++) if (p->p_itimers->its_timers[id] == NULL) break; if (id == TIMER_MAX) { PROC_UNLOCK(p); error = EAGAIN; goto out; } } it->it_timerid = id; p->p_itimers->its_timers[id] = it; if (evp != NULL) it->it_sigev = *evp; else { it->it_sigev.sigev_notify = SIGEV_SIGNAL; switch (clock_id) { default: case CLOCK_REALTIME: it->it_sigev.sigev_signo = SIGALRM; break; case CLOCK_VIRTUAL: it->it_sigev.sigev_signo = SIGVTALRM; break; case CLOCK_PROF: it->it_sigev.sigev_signo = SIGPROF; break; } it->it_sigev.sigev_value.sival_int = id; } if (it->it_sigev.sigev_notify == SIGEV_SIGNAL || it->it_sigev.sigev_notify == SIGEV_THREAD_ID) { it->it_ksi.ksi_signo = it->it_sigev.sigev_signo; it->it_ksi.ksi_code = SI_TIMER; it->it_ksi.ksi_value = it->it_sigev.sigev_value; it->it_ksi.ksi_timerid = id; } PROC_UNLOCK(p); *timerid = id; return (0); out: ITIMER_LOCK(it); CLOCK_CALL(it->it_clockid, timer_delete, (it)); ITIMER_UNLOCK(it); uma_zfree(itimer_zone, it); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ktimer_delete_args { int timerid; }; #endif int sys_ktimer_delete(struct thread *td, struct ktimer_delete_args *uap) { return (kern_ktimer_delete(td, uap->timerid)); } static struct itimer * itimer_find(struct proc *p, int timerid) { struct itimer *it; PROC_LOCK_ASSERT(p, MA_OWNED); if ((p->p_itimers == NULL) || (timerid < 0) || (timerid >= TIMER_MAX) || (it = p->p_itimers->its_timers[timerid]) == NULL) { return (NULL); } ITIMER_LOCK(it); if ((it->it_flags & ITF_DELETING) != 0) { ITIMER_UNLOCK(it); it = NULL; } return (it); } int kern_ktimer_delete(struct thread *td, int timerid) { struct proc *p = td->td_proc; struct itimer *it; PROC_LOCK(p); it = itimer_find(p, timerid); if (it == NULL) { PROC_UNLOCK(p); return (EINVAL); } PROC_UNLOCK(p); it->it_flags |= ITF_DELETING; while (it->it_usecount > 0) { it->it_flags |= ITF_WANTED; msleep(it, &it->it_mtx, PPAUSE, "itimer", 0); } it->it_flags &= ~ITF_WANTED; CLOCK_CALL(it->it_clockid, timer_delete, (it)); ITIMER_UNLOCK(it); PROC_LOCK(p); if (KSI_ONQ(&it->it_ksi)) sigqueue_take(&it->it_ksi); p->p_itimers->its_timers[timerid] = NULL; PROC_UNLOCK(p); uma_zfree(itimer_zone, it); return (0); } #ifndef _SYS_SYSPROTO_H_ struct ktimer_settime_args { int timerid; int flags; const struct itimerspec * value; struct itimerspec * ovalue; }; #endif int sys_ktimer_settime(struct thread *td, struct ktimer_settime_args *uap) { struct itimerspec val, oval, *ovalp; int error; error = copyin(uap->value, &val, sizeof(val)); if (error != 0) return (error); ovalp = uap->ovalue != NULL ? &oval : NULL; error = kern_ktimer_settime(td, uap->timerid, uap->flags, &val, ovalp); if (error == 0 && uap->ovalue != NULL) error = copyout(ovalp, uap->ovalue, sizeof(*ovalp)); return (error); } int kern_ktimer_settime(struct thread *td, int timer_id, int flags, struct itimerspec *val, struct itimerspec *oval) { struct proc *p; struct itimer *it; int error; p = td->td_proc; PROC_LOCK(p); if (timer_id < 3 || (it = itimer_find(p, timer_id)) == NULL) { PROC_UNLOCK(p); error = EINVAL; } else { PROC_UNLOCK(p); itimer_enter(it); error = CLOCK_CALL(it->it_clockid, timer_settime, (it, flags, val, oval)); itimer_leave(it); ITIMER_UNLOCK(it); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct ktimer_gettime_args { int timerid; struct itimerspec * value; }; #endif int sys_ktimer_gettime(struct thread *td, struct ktimer_gettime_args *uap) { struct itimerspec val; int error; error = kern_ktimer_gettime(td, uap->timerid, &val); if (error == 0) error = copyout(&val, uap->value, sizeof(val)); return (error); } int kern_ktimer_gettime(struct thread *td, int timer_id, struct itimerspec *val) { struct proc *p; struct itimer *it; int error; p = td->td_proc; PROC_LOCK(p); if (timer_id < 3 || (it = itimer_find(p, timer_id)) == NULL) { PROC_UNLOCK(p); error = EINVAL; } else { PROC_UNLOCK(p); itimer_enter(it); error = CLOCK_CALL(it->it_clockid, timer_gettime, (it, val)); itimer_leave(it); ITIMER_UNLOCK(it); } return (error); } #ifndef _SYS_SYSPROTO_H_ struct timer_getoverrun_args { int timerid; }; #endif int sys_ktimer_getoverrun(struct thread *td, struct ktimer_getoverrun_args *uap) { return (kern_ktimer_getoverrun(td, uap->timerid)); } int kern_ktimer_getoverrun(struct thread *td, int timer_id) { struct proc *p = td->td_proc; struct itimer *it; int error ; PROC_LOCK(p); if (timer_id < 3 || (it = itimer_find(p, timer_id)) == NULL) { PROC_UNLOCK(p); error = EINVAL; } else { td->td_retval[0] = it->it_overrun_last; ITIMER_UNLOCK(it); PROC_UNLOCK(p); error = 0; } return (error); } static int realtimer_create(struct itimer *it) { callout_init_mtx(&it->it_callout, &it->it_mtx, 0); return (0); } static int realtimer_delete(struct itimer *it) { mtx_assert(&it->it_mtx, MA_OWNED); /* * clear timer's value and interval to tell realtimer_expire * to not rearm the timer. */ timespecclear(&it->it_time.it_value); timespecclear(&it->it_time.it_interval); ITIMER_UNLOCK(it); callout_drain(&it->it_callout); ITIMER_LOCK(it); return (0); } static int realtimer_gettime(struct itimer *it, struct itimerspec *ovalue) { struct timespec cts; mtx_assert(&it->it_mtx, MA_OWNED); realtimer_clocktime(it->it_clockid, &cts); *ovalue = it->it_time; if (ovalue->it_value.tv_sec != 0 || ovalue->it_value.tv_nsec != 0) { - timespecsub(&ovalue->it_value, &cts); + timespecsub(&ovalue->it_value, &cts, &ovalue->it_value); if (ovalue->it_value.tv_sec < 0 || (ovalue->it_value.tv_sec == 0 && ovalue->it_value.tv_nsec == 0)) { ovalue->it_value.tv_sec = 0; ovalue->it_value.tv_nsec = 1; } } return (0); } static int realtimer_settime(struct itimer *it, int flags, struct itimerspec *value, struct itimerspec *ovalue) { struct timespec cts, ts; struct timeval tv; struct itimerspec val; mtx_assert(&it->it_mtx, MA_OWNED); val = *value; if (itimespecfix(&val.it_value)) return (EINVAL); if (timespecisset(&val.it_value)) { if (itimespecfix(&val.it_interval)) return (EINVAL); } else { timespecclear(&val.it_interval); } if (ovalue != NULL) realtimer_gettime(it, ovalue); it->it_time = val; if (timespecisset(&val.it_value)) { realtimer_clocktime(it->it_clockid, &cts); ts = val.it_value; if ((flags & TIMER_ABSTIME) == 0) { /* Convert to absolute time. */ - timespecadd(&it->it_time.it_value, &cts); + timespecadd(&it->it_time.it_value, &cts, + &it->it_time.it_value); } else { - timespecsub(&ts, &cts); + timespecsub(&ts, &cts, &ts); /* * We don't care if ts is negative, tztohz will * fix it. */ } TIMESPEC_TO_TIMEVAL(&tv, &ts); callout_reset(&it->it_callout, tvtohz(&tv), realtimer_expire, it); } else { callout_stop(&it->it_callout); } return (0); } static void realtimer_clocktime(clockid_t id, struct timespec *ts) { if (id == CLOCK_REALTIME) getnanotime(ts); else /* CLOCK_MONOTONIC */ getnanouptime(ts); } int itimer_accept(struct proc *p, int timerid, ksiginfo_t *ksi) { struct itimer *it; PROC_LOCK_ASSERT(p, MA_OWNED); it = itimer_find(p, timerid); if (it != NULL) { ksi->ksi_overrun = it->it_overrun; it->it_overrun_last = it->it_overrun; it->it_overrun = 0; ITIMER_UNLOCK(it); return (0); } return (EINVAL); } int itimespecfix(struct timespec *ts) { if (ts->tv_sec < 0 || ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000) return (EINVAL); if (ts->tv_sec == 0 && ts->tv_nsec != 0 && ts->tv_nsec < tick * 1000) ts->tv_nsec = tick * 1000; return (0); } /* Timeout callback for realtime timer */ static void realtimer_expire(void *arg) { struct timespec cts, ts; struct timeval tv; struct itimer *it; it = (struct itimer *)arg; realtimer_clocktime(it->it_clockid, &cts); /* Only fire if time is reached. */ if (timespeccmp(&cts, &it->it_time.it_value, >=)) { if (timespecisset(&it->it_time.it_interval)) { timespecadd(&it->it_time.it_value, - &it->it_time.it_interval); + &it->it_time.it_interval, + &it->it_time.it_value); while (timespeccmp(&cts, &it->it_time.it_value, >=)) { if (it->it_overrun < INT_MAX) it->it_overrun++; else it->it_ksi.ksi_errno = ERANGE; timespecadd(&it->it_time.it_value, - &it->it_time.it_interval); + &it->it_time.it_interval, + &it->it_time.it_value); } } else { /* single shot timer ? */ timespecclear(&it->it_time.it_value); } if (timespecisset(&it->it_time.it_value)) { - ts = it->it_time.it_value; - timespecsub(&ts, &cts); + timespecsub(&it->it_time.it_value, &cts, &ts); TIMESPEC_TO_TIMEVAL(&tv, &ts); callout_reset(&it->it_callout, tvtohz(&tv), realtimer_expire, it); } itimer_enter(it); ITIMER_UNLOCK(it); itimer_fire(it); ITIMER_LOCK(it); itimer_leave(it); } else if (timespecisset(&it->it_time.it_value)) { ts = it->it_time.it_value; - timespecsub(&ts, &cts); + timespecsub(&ts, &cts, &ts); TIMESPEC_TO_TIMEVAL(&tv, &ts); callout_reset(&it->it_callout, tvtohz(&tv), realtimer_expire, it); } } void itimer_fire(struct itimer *it) { struct proc *p = it->it_proc; struct thread *td; if (it->it_sigev.sigev_notify == SIGEV_SIGNAL || it->it_sigev.sigev_notify == SIGEV_THREAD_ID) { if (sigev_findtd(p, &it->it_sigev, &td) != 0) { ITIMER_LOCK(it); timespecclear(&it->it_time.it_value); timespecclear(&it->it_time.it_interval); callout_stop(&it->it_callout); ITIMER_UNLOCK(it); return; } if (!KSI_ONQ(&it->it_ksi)) { it->it_ksi.ksi_errno = 0; ksiginfo_set_sigev(&it->it_ksi, &it->it_sigev); tdsendsignal(p, td, it->it_ksi.ksi_signo, &it->it_ksi); } else { if (it->it_overrun < INT_MAX) it->it_overrun++; else it->it_ksi.ksi_errno = ERANGE; } PROC_UNLOCK(p); } } static void itimers_alloc(struct proc *p) { struct itimers *its; int i; its = malloc(sizeof (struct itimers), M_SUBPROC, M_WAITOK | M_ZERO); LIST_INIT(&its->its_virtual); LIST_INIT(&its->its_prof); TAILQ_INIT(&its->its_worklist); for (i = 0; i < TIMER_MAX; i++) its->its_timers[i] = NULL; PROC_LOCK(p); if (p->p_itimers == NULL) { p->p_itimers = its; PROC_UNLOCK(p); } else { PROC_UNLOCK(p); free(its, M_SUBPROC); } } static void itimers_event_hook_exec(void *arg, struct proc *p, struct image_params *imgp __unused) { itimers_event_hook_exit(arg, p); } /* Clean up timers when some process events are being triggered. */ static void itimers_event_hook_exit(void *arg, struct proc *p) { struct itimers *its; struct itimer *it; int event = (int)(intptr_t)arg; int i; if (p->p_itimers != NULL) { its = p->p_itimers; for (i = 0; i < MAX_CLOCKS; ++i) { if (posix_clocks[i].event_hook != NULL) CLOCK_CALL(i, event_hook, (p, i, event)); } /* * According to susv3, XSI interval timers should be inherited * by new image. */ if (event == ITIMER_EV_EXEC) i = 3; else if (event == ITIMER_EV_EXIT) i = 0; else panic("unhandled event"); for (; i < TIMER_MAX; ++i) { if ((it = its->its_timers[i]) != NULL) kern_ktimer_delete(curthread, i); } if (its->its_timers[0] == NULL && its->its_timers[1] == NULL && its->its_timers[2] == NULL) { free(its, M_SUBPROC); p->p_itimers = NULL; } } } Index: head/sys/kern/kern_umtx.c =================================================================== --- head/sys/kern/kern_umtx.c (revision 336913) +++ head/sys/kern/kern_umtx.c (revision 336914) @@ -1,4567 +1,4565 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2015, 2016 The FreeBSD Foundation * Copyright (c) 2004, David Xu * Copyright (c) 2002, Jeffrey Roberson * All rights reserved. * * Portions of this software were developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_umtx_profiling.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef COMPAT_FREEBSD32 #include #endif #define _UMUTEX_TRY 1 #define _UMUTEX_WAIT 2 #ifdef UMTX_PROFILING #define UPROF_PERC_BIGGER(w, f, sw, sf) \ (((w) > (sw)) || ((w) == (sw) && (f) > (sf))) #endif /* Priority inheritance mutex info. */ struct umtx_pi { /* Owner thread */ struct thread *pi_owner; /* Reference count */ int pi_refcount; /* List entry to link umtx holding by thread */ TAILQ_ENTRY(umtx_pi) pi_link; /* List entry in hash */ TAILQ_ENTRY(umtx_pi) pi_hashlink; /* List for waiters */ TAILQ_HEAD(,umtx_q) pi_blocked; /* Identify a userland lock object */ struct umtx_key pi_key; }; /* A userland synchronous object user. */ struct umtx_q { /* Linked list for the hash. */ TAILQ_ENTRY(umtx_q) uq_link; /* Umtx key. */ struct umtx_key uq_key; /* Umtx flags. */ int uq_flags; #define UQF_UMTXQ 0x0001 /* The thread waits on. */ struct thread *uq_thread; /* * Blocked on PI mutex. read can use chain lock * or umtx_lock, write must have both chain lock and * umtx_lock being hold. */ struct umtx_pi *uq_pi_blocked; /* On blocked list */ TAILQ_ENTRY(umtx_q) uq_lockq; /* Thread contending with us */ TAILQ_HEAD(,umtx_pi) uq_pi_contested; /* Inherited priority from PP mutex */ u_char uq_inherited_pri; /* Spare queue ready to be reused */ struct umtxq_queue *uq_spare_queue; /* The queue we on */ struct umtxq_queue *uq_cur_queue; }; TAILQ_HEAD(umtxq_head, umtx_q); /* Per-key wait-queue */ struct umtxq_queue { struct umtxq_head head; struct umtx_key key; LIST_ENTRY(umtxq_queue) link; int length; }; LIST_HEAD(umtxq_list, umtxq_queue); /* Userland lock object's wait-queue chain */ struct umtxq_chain { /* Lock for this chain. */ struct mtx uc_lock; /* List of sleep queues. */ struct umtxq_list uc_queue[2]; #define UMTX_SHARED_QUEUE 0 #define UMTX_EXCLUSIVE_QUEUE 1 LIST_HEAD(, umtxq_queue) uc_spare_queue; /* Busy flag */ char uc_busy; /* Chain lock waiters */ int uc_waiters; /* All PI in the list */ TAILQ_HEAD(,umtx_pi) uc_pi_list; #ifdef UMTX_PROFILING u_int length; u_int max_length; #endif }; #define UMTXQ_LOCKED_ASSERT(uc) mtx_assert(&(uc)->uc_lock, MA_OWNED) /* * Don't propagate time-sharing priority, there is a security reason, * a user can simply introduce PI-mutex, let thread A lock the mutex, * and let another thread B block on the mutex, because B is * sleeping, its priority will be boosted, this causes A's priority to * be boosted via priority propagating too and will never be lowered even * if it is using 100%CPU, this is unfair to other processes. */ #define UPRI(td) (((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\ (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\ PRI_MAX_TIMESHARE : (td)->td_user_pri) #define GOLDEN_RATIO_PRIME 2654404609U #ifndef UMTX_CHAINS #define UMTX_CHAINS 512 #endif #define UMTX_SHIFTS (__WORD_BIT - 9) #define GET_SHARE(flags) \ (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE) #define BUSY_SPINS 200 struct abs_timeout { int clockid; bool is_abs_real; /* TIMER_ABSTIME && CLOCK_REALTIME* */ struct timespec cur; struct timespec end; }; #ifdef COMPAT_FREEBSD32 struct umutex32 { volatile __lwpid_t m_owner; /* Owner of the mutex */ __uint32_t m_flags; /* Flags of the mutex */ __uint32_t m_ceilings[2]; /* Priority protect ceiling */ __uint32_t m_rb_lnk; /* Robust linkage */ __uint32_t m_pad; __uint32_t m_spare[2]; }; _Static_assert(sizeof(struct umutex) == sizeof(struct umutex32), "umutex32"); _Static_assert(__offsetof(struct umutex, m_spare[0]) == __offsetof(struct umutex32, m_spare[0]), "m_spare32"); #endif int umtx_shm_vnobj_persistent = 0; SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_vnode_persistent, CTLFLAG_RWTUN, &umtx_shm_vnobj_persistent, 0, "False forces destruction of umtx attached to file, on last close"); static int umtx_max_rb = 1000; SYSCTL_INT(_kern_ipc, OID_AUTO, umtx_max_robust, CTLFLAG_RWTUN, &umtx_max_rb, 0, ""); static uma_zone_t umtx_pi_zone; static struct umtxq_chain umtxq_chains[2][UMTX_CHAINS]; static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory"); static int umtx_pi_allocated; static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug"); SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD, &umtx_pi_allocated, 0, "Allocated umtx_pi"); static int umtx_verbose_rb = 1; SYSCTL_INT(_debug_umtx, OID_AUTO, robust_faults_verbose, CTLFLAG_RWTUN, &umtx_verbose_rb, 0, ""); #ifdef UMTX_PROFILING static long max_length; SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length"); static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats"); #endif static void abs_timeout_update(struct abs_timeout *timo); static void umtx_shm_init(void); static void umtxq_sysinit(void *); static void umtxq_hash(struct umtx_key *key); static struct umtxq_chain *umtxq_getchain(struct umtx_key *key); static void umtxq_lock(struct umtx_key *key); static void umtxq_unlock(struct umtx_key *key); static void umtxq_busy(struct umtx_key *key); static void umtxq_unbusy(struct umtx_key *key); static void umtxq_insert_queue(struct umtx_q *uq, int q); static void umtxq_remove_queue(struct umtx_q *uq, int q); static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *); static int umtxq_count(struct umtx_key *key); static struct umtx_pi *umtx_pi_alloc(int); static void umtx_pi_free(struct umtx_pi *pi); static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags, bool rb); static void umtx_thread_cleanup(struct thread *td); static void umtx_exec_hook(void *arg __unused, struct proc *p __unused, struct image_params *imgp __unused); SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL); #define umtxq_signal(key, nwake) umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE) #define umtxq_insert(uq) umtxq_insert_queue((uq), UMTX_SHARED_QUEUE) #define umtxq_remove(uq) umtxq_remove_queue((uq), UMTX_SHARED_QUEUE) static struct mtx umtx_lock; #ifdef UMTX_PROFILING static void umtx_init_profiling(void) { struct sysctl_oid *chain_oid; char chain_name[10]; int i; for (i = 0; i < UMTX_CHAINS; ++i) { snprintf(chain_name, sizeof(chain_name), "%d", i); chain_oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO, chain_name, CTLFLAG_RD, NULL, "umtx hash stats"); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL); SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO, "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL); } } static int sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS) { char buf[512]; struct sbuf sb; struct umtxq_chain *uc; u_int fract, i, j, tot, whole; u_int sf0, sf1, sf2, sf3, sf4; u_int si0, si1, si2, si3, si4; u_int sw0, sw1, sw2, sw3, sw4; sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); for (i = 0; i < 2; i++) { tot = 0; for (j = 0; j < UMTX_CHAINS; ++j) { uc = &umtxq_chains[i][j]; mtx_lock(&uc->uc_lock); tot += uc->max_length; mtx_unlock(&uc->uc_lock); } if (tot == 0) sbuf_printf(&sb, "%u) Empty ", i); else { sf0 = sf1 = sf2 = sf3 = sf4 = 0; si0 = si1 = si2 = si3 = si4 = 0; sw0 = sw1 = sw2 = sw3 = sw4 = 0; for (j = 0; j < UMTX_CHAINS; j++) { uc = &umtxq_chains[i][j]; mtx_lock(&uc->uc_lock); whole = uc->max_length * 100; mtx_unlock(&uc->uc_lock); fract = (whole % tot) * 100; if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) { sf0 = fract; si0 = j; sw0 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw1, sf1)) { sf1 = fract; si1 = j; sw1 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw2, sf2)) { sf2 = fract; si2 = j; sw2 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw3, sf3)) { sf3 = fract; si3 = j; sw3 = whole; } else if (UPROF_PERC_BIGGER(whole, fract, sw4, sf4)) { sf4 = fract; si4 = j; sw4 = whole; } } sbuf_printf(&sb, "queue %u:\n", i); sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot, sf0 / tot, si0); sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot, sf1 / tot, si1); sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot, sf2 / tot, si2); sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot, sf3 / tot, si3); sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot, sf4 / tot, si4); } } sbuf_trim(&sb); sbuf_finish(&sb); sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req); sbuf_delete(&sb); return (0); } static int sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS) { struct umtxq_chain *uc; u_int i, j; int clear, error; clear = 0; error = sysctl_handle_int(oidp, &clear, 0, req); if (error != 0 || req->newptr == NULL) return (error); if (clear != 0) { for (i = 0; i < 2; ++i) { for (j = 0; j < UMTX_CHAINS; ++j) { uc = &umtxq_chains[i][j]; mtx_lock(&uc->uc_lock); uc->length = 0; uc->max_length = 0; mtx_unlock(&uc->uc_lock); } } } return (0); } SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0, sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics"); SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0, sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length"); #endif static void umtxq_sysinit(void *arg __unused) { int i, j; umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); for (i = 0; i < 2; ++i) { for (j = 0; j < UMTX_CHAINS; ++j) { mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL, MTX_DEF | MTX_DUPOK); LIST_INIT(&umtxq_chains[i][j].uc_queue[0]); LIST_INIT(&umtxq_chains[i][j].uc_queue[1]); LIST_INIT(&umtxq_chains[i][j].uc_spare_queue); TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list); umtxq_chains[i][j].uc_busy = 0; umtxq_chains[i][j].uc_waiters = 0; #ifdef UMTX_PROFILING umtxq_chains[i][j].length = 0; umtxq_chains[i][j].max_length = 0; #endif } } #ifdef UMTX_PROFILING umtx_init_profiling(); #endif mtx_init(&umtx_lock, "umtx lock", NULL, MTX_DEF); EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL, EVENTHANDLER_PRI_ANY); umtx_shm_init(); } struct umtx_q * umtxq_alloc(void) { struct umtx_q *uq; uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO); uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO); TAILQ_INIT(&uq->uq_spare_queue->head); TAILQ_INIT(&uq->uq_pi_contested); uq->uq_inherited_pri = PRI_MAX; return (uq); } void umtxq_free(struct umtx_q *uq) { MPASS(uq->uq_spare_queue != NULL); free(uq->uq_spare_queue, M_UMTX); free(uq, M_UMTX); } static inline void umtxq_hash(struct umtx_key *key) { unsigned n; n = (uintptr_t)key->info.both.a + key->info.both.b; key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS; } static inline struct umtxq_chain * umtxq_getchain(struct umtx_key *key) { if (key->type <= TYPE_SEM) return (&umtxq_chains[1][key->hash]); return (&umtxq_chains[0][key->hash]); } /* * Lock a chain. */ static inline void umtxq_lock(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_lock(&uc->uc_lock); } /* * Unlock a chain. */ static inline void umtxq_unlock(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_unlock(&uc->uc_lock); } /* * Set chain to busy state when following operation * may be blocked (kernel mutex can not be used). */ static inline void umtxq_busy(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_assert(&uc->uc_lock, MA_OWNED); if (uc->uc_busy) { #ifdef SMP if (smp_cpus > 1) { int count = BUSY_SPINS; if (count > 0) { umtxq_unlock(key); while (uc->uc_busy && --count > 0) cpu_spinwait(); umtxq_lock(key); } } #endif while (uc->uc_busy) { uc->uc_waiters++; msleep(uc, &uc->uc_lock, 0, "umtxqb", 0); uc->uc_waiters--; } } uc->uc_busy = 1; } /* * Unbusy a chain. */ static inline void umtxq_unbusy(struct umtx_key *key) { struct umtxq_chain *uc; uc = umtxq_getchain(key); mtx_assert(&uc->uc_lock, MA_OWNED); KASSERT(uc->uc_busy != 0, ("not busy")); uc->uc_busy = 0; if (uc->uc_waiters) wakeup_one(uc); } static inline void umtxq_unbusy_unlocked(struct umtx_key *key) { umtxq_lock(key); umtxq_unbusy(key); umtxq_unlock(key); } static struct umtxq_queue * umtxq_queue_lookup(struct umtx_key *key, int q) { struct umtxq_queue *uh; struct umtxq_chain *uc; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); LIST_FOREACH(uh, &uc->uc_queue[q], link) { if (umtx_key_match(&uh->key, key)) return (uh); } return (NULL); } static inline void umtxq_insert_queue(struct umtx_q *uq, int q) { struct umtxq_queue *uh; struct umtxq_chain *uc; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue")); uh = umtxq_queue_lookup(&uq->uq_key, q); if (uh != NULL) { LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link); } else { uh = uq->uq_spare_queue; uh->key = uq->uq_key; LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link); #ifdef UMTX_PROFILING uc->length++; if (uc->length > uc->max_length) { uc->max_length = uc->length; if (uc->max_length > max_length) max_length = uc->max_length; } #endif } uq->uq_spare_queue = NULL; TAILQ_INSERT_TAIL(&uh->head, uq, uq_link); uh->length++; uq->uq_flags |= UQF_UMTXQ; uq->uq_cur_queue = uh; return; } static inline void umtxq_remove_queue(struct umtx_q *uq, int q) { struct umtxq_chain *uc; struct umtxq_queue *uh; uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); if (uq->uq_flags & UQF_UMTXQ) { uh = uq->uq_cur_queue; TAILQ_REMOVE(&uh->head, uq, uq_link); uh->length--; uq->uq_flags &= ~UQF_UMTXQ; if (TAILQ_EMPTY(&uh->head)) { KASSERT(uh->length == 0, ("inconsistent umtxq_queue length")); #ifdef UMTX_PROFILING uc->length--; #endif LIST_REMOVE(uh, link); } else { uh = LIST_FIRST(&uc->uc_spare_queue); KASSERT(uh != NULL, ("uc_spare_queue is empty")); LIST_REMOVE(uh, link); } uq->uq_spare_queue = uh; uq->uq_cur_queue = NULL; } } /* * Check if there are multiple waiters */ static int umtxq_count(struct umtx_key *key) { struct umtxq_queue *uh; UMTXQ_LOCKED_ASSERT(umtxq_getchain(key)); uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE); if (uh != NULL) return (uh->length); return (0); } /* * Check if there are multiple PI waiters and returns first * waiter. */ static int umtxq_count_pi(struct umtx_key *key, struct umtx_q **first) { struct umtxq_queue *uh; *first = NULL; UMTXQ_LOCKED_ASSERT(umtxq_getchain(key)); uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE); if (uh != NULL) { *first = TAILQ_FIRST(&uh->head); return (uh->length); } return (0); } static int umtxq_check_susp(struct thread *td) { struct proc *p; int error; /* * The check for TDF_NEEDSUSPCHK is racy, but it is enough to * eventually break the lockstep loop. */ if ((td->td_flags & TDF_NEEDSUSPCHK) == 0) return (0); error = 0; p = td->td_proc; PROC_LOCK(p); if (P_SHOULDSTOP(p) || ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) { if (p->p_flag & P_SINGLE_EXIT) error = EINTR; else error = ERESTART; } PROC_UNLOCK(p); return (error); } /* * Wake up threads waiting on an userland object. */ static int umtxq_signal_queue(struct umtx_key *key, int n_wake, int q) { struct umtxq_queue *uh; struct umtx_q *uq; int ret; ret = 0; UMTXQ_LOCKED_ASSERT(umtxq_getchain(key)); uh = umtxq_queue_lookup(key, q); if (uh != NULL) { while ((uq = TAILQ_FIRST(&uh->head)) != NULL) { umtxq_remove_queue(uq, q); wakeup(uq); if (++ret >= n_wake) return (ret); } } return (ret); } /* * Wake up specified thread. */ static inline void umtxq_signal_thread(struct umtx_q *uq) { UMTXQ_LOCKED_ASSERT(umtxq_getchain(&uq->uq_key)); umtxq_remove(uq); wakeup(uq); } static inline int tstohz(const struct timespec *tsp) { struct timeval tv; TIMESPEC_TO_TIMEVAL(&tv, tsp); return tvtohz(&tv); } static void abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute, const struct timespec *timeout) { timo->clockid = clockid; if (!absolute) { timo->is_abs_real = false; abs_timeout_update(timo); - timo->end = timo->cur; - timespecadd(&timo->end, timeout); + timespecadd(&timo->cur, timeout, &timo->end); } else { timo->end = *timeout; timo->is_abs_real = clockid == CLOCK_REALTIME || clockid == CLOCK_REALTIME_FAST || clockid == CLOCK_REALTIME_PRECISE; /* * If is_abs_real, umtxq_sleep will read the clock * after setting td_rtcgen; otherwise, read it here. */ if (!timo->is_abs_real) { abs_timeout_update(timo); } } } static void abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime) { abs_timeout_init(timo, umtxtime->_clockid, (umtxtime->_flags & UMTX_ABSTIME) != 0, &umtxtime->_timeout); } static inline void abs_timeout_update(struct abs_timeout *timo) { kern_clock_gettime(curthread, timo->clockid, &timo->cur); } static int abs_timeout_gethz(struct abs_timeout *timo) { struct timespec tts; if (timespeccmp(&timo->end, &timo->cur, <=)) return (-1); - tts = timo->end; - timespecsub(&tts, &timo->cur); + timespecsub(&timo->end, &timo->cur, &tts); return (tstohz(&tts)); } static uint32_t umtx_unlock_val(uint32_t flags, bool rb) { if (rb) return (UMUTEX_RB_OWNERDEAD); else if ((flags & UMUTEX_NONCONSISTENT) != 0) return (UMUTEX_RB_NOTRECOV); else return (UMUTEX_UNOWNED); } /* * Put thread into sleep state, before sleeping, check if * thread was removed from umtx queue. */ static inline int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime) { struct umtxq_chain *uc; int error, timo; if (abstime != NULL && abstime->is_abs_real) { curthread->td_rtcgen = atomic_load_acq_int(&rtc_generation); abs_timeout_update(abstime); } uc = umtxq_getchain(&uq->uq_key); UMTXQ_LOCKED_ASSERT(uc); for (;;) { if (!(uq->uq_flags & UQF_UMTXQ)) { error = 0; break; } if (abstime != NULL) { timo = abs_timeout_gethz(abstime); if (timo < 0) { error = ETIMEDOUT; break; } } else timo = 0; error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo); if (error == EINTR || error == ERESTART) { umtxq_lock(&uq->uq_key); break; } if (abstime != NULL) { if (abstime->is_abs_real) curthread->td_rtcgen = atomic_load_acq_int(&rtc_generation); abs_timeout_update(abstime); } umtxq_lock(&uq->uq_key); } curthread->td_rtcgen = 0; return (error); } /* * Convert userspace address into unique logical address. */ int umtx_key_get(const void *addr, int type, int share, struct umtx_key *key) { struct thread *td = curthread; vm_map_t map; vm_map_entry_t entry; vm_pindex_t pindex; vm_prot_t prot; boolean_t wired; key->type = type; if (share == THREAD_SHARE) { key->shared = 0; key->info.private.vs = td->td_proc->p_vmspace; key->info.private.addr = (uintptr_t)addr; } else { MPASS(share == PROCESS_SHARE || share == AUTO_SHARE); map = &td->td_proc->p_vmspace->vm_map; if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE, &entry, &key->info.shared.object, &pindex, &prot, &wired) != KERN_SUCCESS) { return (EFAULT); } if ((share == PROCESS_SHARE) || (share == AUTO_SHARE && VM_INHERIT_SHARE == entry->inheritance)) { key->shared = 1; key->info.shared.offset = (vm_offset_t)addr - entry->start + entry->offset; vm_object_reference(key->info.shared.object); } else { key->shared = 0; key->info.private.vs = td->td_proc->p_vmspace; key->info.private.addr = (uintptr_t)addr; } vm_map_lookup_done(map, entry); } umtxq_hash(key); return (0); } /* * Release key. */ void umtx_key_release(struct umtx_key *key) { if (key->shared) vm_object_deallocate(key->info.shared.object); } /* * Fetch and compare value, sleep on the address if value is not changed. */ static int do_wait(struct thread *td, void *addr, u_long id, struct _umtx_time *timeout, int compat32, int is_private) { struct abs_timeout timo; struct umtx_q *uq; u_long tmp; uint32_t tmp32; int error = 0; uq = td->td_umtxq; if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT, is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); if (compat32 == 0) { error = fueword(addr, &tmp); if (error != 0) error = EFAULT; } else { error = fueword32(addr, &tmp32); if (error == 0) tmp = tmp32; else error = EFAULT; } umtxq_lock(&uq->uq_key); if (error == 0) { if (tmp == id) error = umtxq_sleep(uq, "uwait", timeout == NULL ? NULL : &timo); if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else umtxq_remove(uq); } else if ((uq->uq_flags & UQF_UMTXQ) != 0) { umtxq_remove(uq); } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == ERESTART) error = EINTR; return (error); } /* * Wake up threads sleeping on the specified address. */ int kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private) { struct umtx_key key; int ret; if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0) return (ret); umtxq_lock(&key); umtxq_signal(&key, n_wake); umtxq_unlock(&key); umtx_key_release(&key); return (0); } /* * Lock PTHREAD_PRIO_NONE protocol POSIX mutex. */ static int do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, struct _umtx_time *timeout, int mode) { struct abs_timeout timo; struct umtx_q *uq; uint32_t owner, old, id; int error, rv; id = td->td_tid; uq = td->td_umtxq; error = 0; if (timeout != NULL) abs_timeout_init2(&timo, timeout); /* * Care must be exercised when dealing with umtx structure. It * can fault on any access. */ for (;;) { rv = fueword32(&m->m_owner, &owner); if (rv == -1) return (EFAULT); if (mode == _UMUTEX_WAIT) { if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED || owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV) return (0); } else { /* * Robust mutex terminated. Kernel duty is to * return EOWNERDEAD to the userspace. The * umutex.m_flags UMUTEX_NONCONSISTENT is set * by the common userspace code. */ if (owner == UMUTEX_RB_OWNERDEAD) { rv = casueword32(&m->m_owner, UMUTEX_RB_OWNERDEAD, &owner, id | UMUTEX_CONTESTED); if (rv == -1) return (EFAULT); if (owner == UMUTEX_RB_OWNERDEAD) return (EOWNERDEAD); /* success */ rv = umtxq_check_susp(td); if (rv != 0) return (rv); continue; } if (owner == UMUTEX_RB_NOTRECOV) return (ENOTRECOVERABLE); /* * Try the uncontested case. This should be * done in userland. */ rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id); /* The address was invalid. */ if (rv == -1) return (EFAULT); /* The acquire succeeded. */ if (owner == UMUTEX_UNOWNED) return (0); /* * If no one owns it but it is contested try * to acquire it. */ if (owner == UMUTEX_CONTESTED) { rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) return (EFAULT); if (owner == UMUTEX_CONTESTED) return (0); rv = umtxq_check_susp(td); if (rv != 0) return (rv); /* * If this failed the lock has * changed, restart. */ continue; } } if (mode == _UMUTEX_TRY) return (EBUSY); /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) return (error); if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); /* * Set the contested bit so that a release in user space * knows to use the system call for unlock. If this fails * either some one else has acquired the lock or it has been * released. */ rv = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { umtxq_lock(&uq->uq_key); umtxq_remove(uq); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (EFAULT); } /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. */ umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); if (old == owner) error = umtxq_sleep(uq, "umtxn", timeout == NULL ? NULL : &timo); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == 0) error = umtxq_check_susp(td); } return (0); } /* * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex. */ static int do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags, bool rb) { struct umtx_key key; uint32_t owner, old, id, newlock; int error, count; id = td->td_tid; /* * Make sure we own this mtx. */ error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); newlock = umtx_unlock_val(flags, rb); if ((owner & UMUTEX_CONTESTED) == 0) { error = casueword32(&m->m_owner, owner, &old, newlock); if (error == -1) return (EFAULT); if (old == owner) return (0); owner = old; } /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); /* * When unlocking the umtx, it must be marked as unowned if * there is zero or one thread only waiting for it. * Otherwise, it must be marked as contested. */ if (count > 1) newlock |= UMUTEX_CONTESTED; error = casueword32(&m->m_owner, owner, &old, newlock); umtxq_lock(&key); umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); if (error == -1) return (EFAULT); if (old != owner) return (EINVAL); return (0); } /* * Check if the mutex is available and wake up a waiter, * only for simple mutex. */ static int do_wake_umutex(struct thread *td, struct umutex *m) { struct umtx_key key; uint32_t owner; uint32_t flags; int error; int count; error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != 0 && owner != UMUTEX_RB_OWNERDEAD && owner != UMUTEX_RB_NOTRECOV) return (0); error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); if (count <= 1 && owner != UMUTEX_RB_OWNERDEAD && owner != UMUTEX_RB_NOTRECOV) { error = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, UMUTEX_UNOWNED); if (error == -1) error = EFAULT; } umtxq_lock(&key); if (error == 0 && count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 || owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV)) umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } /* * Check if the mutex has waiters and tries to fix contention bit. */ static int do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags) { struct umtx_key key; uint32_t owner, old; int type; int error; int count; switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT | UMUTEX_ROBUST)) { case 0: case UMUTEX_ROBUST: type = TYPE_NORMAL_UMUTEX; break; case UMUTEX_PRIO_INHERIT: type = TYPE_PI_UMUTEX; break; case (UMUTEX_PRIO_INHERIT | UMUTEX_ROBUST): type = TYPE_PI_ROBUST_UMUTEX; break; case UMUTEX_PRIO_PROTECT: type = TYPE_PP_UMUTEX; break; case (UMUTEX_PRIO_PROTECT | UMUTEX_ROBUST): type = TYPE_PP_ROBUST_UMUTEX; break; default: return (EINVAL); } if ((error = umtx_key_get(m, type, GET_SHARE(flags), &key)) != 0) return (error); owner = 0; umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count(&key); umtxq_unlock(&key); /* * Only repair contention bit if there is a waiter, this means the mutex * is still being referenced by userland code, otherwise don't update * any memory. */ if (count > 1) { error = fueword32(&m->m_owner, &owner); if (error == -1) error = EFAULT; while (error == 0 && (owner & UMUTEX_CONTESTED) == 0) { error = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); if (error == -1) { error = EFAULT; break; } if (old == owner) break; owner = old; error = umtxq_check_susp(td); if (error != 0) break; } } else if (count == 1) { error = fueword32(&m->m_owner, &owner); if (error == -1) error = EFAULT; while (error == 0 && (owner & ~UMUTEX_CONTESTED) != 0 && (owner & UMUTEX_CONTESTED) == 0) { error = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); if (error == -1) { error = EFAULT; break; } if (old == owner) break; owner = old; error = umtxq_check_susp(td); if (error != 0) break; } } umtxq_lock(&key); if (error == EFAULT) { umtxq_signal(&key, INT_MAX); } else if (count != 0 && ((owner & ~UMUTEX_CONTESTED) == 0 || owner == UMUTEX_RB_OWNERDEAD || owner == UMUTEX_RB_NOTRECOV)) umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } static inline struct umtx_pi * umtx_pi_alloc(int flags) { struct umtx_pi *pi; pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags); TAILQ_INIT(&pi->pi_blocked); atomic_add_int(&umtx_pi_allocated, 1); return (pi); } static inline void umtx_pi_free(struct umtx_pi *pi) { uma_zfree(umtx_pi_zone, pi); atomic_add_int(&umtx_pi_allocated, -1); } /* * Adjust the thread's position on a pi_state after its priority has been * changed. */ static int umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td) { struct umtx_q *uq, *uq1, *uq2; struct thread *td1; mtx_assert(&umtx_lock, MA_OWNED); if (pi == NULL) return (0); uq = td->td_umtxq; /* * Check if the thread needs to be moved on the blocked chain. * It needs to be moved if either its priority is lower than * the previous thread or higher than the next thread. */ uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq); uq2 = TAILQ_NEXT(uq, uq_lockq); if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) || (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) { /* * Remove thread from blocked chain and determine where * it should be moved to. */ TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq); TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) { td1 = uq1->uq_thread; MPASS(td1->td_proc->p_magic == P_MAGIC); if (UPRI(td1) > UPRI(td)) break; } if (uq1 == NULL) TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq); else TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq); } return (1); } static struct umtx_pi * umtx_pi_next(struct umtx_pi *pi) { struct umtx_q *uq_owner; if (pi->pi_owner == NULL) return (NULL); uq_owner = pi->pi_owner->td_umtxq; if (uq_owner == NULL) return (NULL); return (uq_owner->uq_pi_blocked); } /* * Floyd's Cycle-Finding Algorithm. */ static bool umtx_pi_check_loop(struct umtx_pi *pi) { struct umtx_pi *pi1; /* fast iterator */ mtx_assert(&umtx_lock, MA_OWNED); if (pi == NULL) return (false); pi1 = pi; for (;;) { pi = umtx_pi_next(pi); if (pi == NULL) break; pi1 = umtx_pi_next(pi1); if (pi1 == NULL) break; pi1 = umtx_pi_next(pi1); if (pi1 == NULL) break; if (pi == pi1) return (true); } return (false); } /* * Propagate priority when a thread is blocked on POSIX * PI mutex. */ static void umtx_propagate_priority(struct thread *td) { struct umtx_q *uq; struct umtx_pi *pi; int pri; mtx_assert(&umtx_lock, MA_OWNED); pri = UPRI(td); uq = td->td_umtxq; pi = uq->uq_pi_blocked; if (pi == NULL) return; if (umtx_pi_check_loop(pi)) return; for (;;) { td = pi->pi_owner; if (td == NULL || td == curthread) return; MPASS(td->td_proc != NULL); MPASS(td->td_proc->p_magic == P_MAGIC); thread_lock(td); if (td->td_lend_user_pri > pri) sched_lend_user_prio(td, pri); else { thread_unlock(td); break; } thread_unlock(td); /* * Pick up the lock that td is blocked on. */ uq = td->td_umtxq; pi = uq->uq_pi_blocked; if (pi == NULL) break; /* Resort td on the list if needed. */ umtx_pi_adjust_thread(pi, td); } } /* * Unpropagate priority for a PI mutex when a thread blocked on * it is interrupted by signal or resumed by others. */ static void umtx_repropagate_priority(struct umtx_pi *pi) { struct umtx_q *uq, *uq_owner; struct umtx_pi *pi2; int pri; mtx_assert(&umtx_lock, MA_OWNED); if (umtx_pi_check_loop(pi)) return; while (pi != NULL && pi->pi_owner != NULL) { pri = PRI_MAX; uq_owner = pi->pi_owner->td_umtxq; TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) { uq = TAILQ_FIRST(&pi2->pi_blocked); if (uq != NULL) { if (pri > UPRI(uq->uq_thread)) pri = UPRI(uq->uq_thread); } } if (pri > uq_owner->uq_inherited_pri) pri = uq_owner->uq_inherited_pri; thread_lock(pi->pi_owner); sched_lend_user_prio(pi->pi_owner, pri); thread_unlock(pi->pi_owner); if ((pi = uq_owner->uq_pi_blocked) != NULL) umtx_pi_adjust_thread(pi, uq_owner->uq_thread); } } /* * Insert a PI mutex into owned list. */ static void umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner) { struct umtx_q *uq_owner; uq_owner = owner->td_umtxq; mtx_assert(&umtx_lock, MA_OWNED); MPASS(pi->pi_owner == NULL); pi->pi_owner = owner; TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link); } /* * Disown a PI mutex, and remove it from the owned list. */ static void umtx_pi_disown(struct umtx_pi *pi) { mtx_assert(&umtx_lock, MA_OWNED); TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested, pi, pi_link); pi->pi_owner = NULL; } /* * Claim ownership of a PI mutex. */ static int umtx_pi_claim(struct umtx_pi *pi, struct thread *owner) { struct umtx_q *uq; int pri; mtx_lock(&umtx_lock); if (pi->pi_owner == owner) { mtx_unlock(&umtx_lock); return (0); } if (pi->pi_owner != NULL) { /* * userland may have already messed the mutex, sigh. */ mtx_unlock(&umtx_lock); return (EPERM); } umtx_pi_setowner(pi, owner); uq = TAILQ_FIRST(&pi->pi_blocked); if (uq != NULL) { pri = UPRI(uq->uq_thread); thread_lock(owner); if (pri < UPRI(owner)) sched_lend_user_prio(owner, pri); thread_unlock(owner); } mtx_unlock(&umtx_lock); return (0); } /* * Adjust a thread's order position in its blocked PI mutex, * this may result new priority propagating process. */ void umtx_pi_adjust(struct thread *td, u_char oldpri) { struct umtx_q *uq; struct umtx_pi *pi; uq = td->td_umtxq; mtx_lock(&umtx_lock); /* * Pick up the lock that td is blocked on. */ pi = uq->uq_pi_blocked; if (pi != NULL) { umtx_pi_adjust_thread(pi, td); umtx_repropagate_priority(pi); } mtx_unlock(&umtx_lock); } /* * Sleep on a PI mutex. */ static int umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi, uint32_t owner, const char *wmesg, struct abs_timeout *timo, bool shared) { struct thread *td, *td1; struct umtx_q *uq1; int error, pri; #ifdef INVARIANTS struct umtxq_chain *uc; uc = umtxq_getchain(&pi->pi_key); #endif error = 0; td = uq->uq_thread; KASSERT(td == curthread, ("inconsistent uq_thread")); UMTXQ_LOCKED_ASSERT(umtxq_getchain(&uq->uq_key)); KASSERT(uc->uc_busy != 0, ("umtx chain is not busy")); umtxq_insert(uq); mtx_lock(&umtx_lock); if (pi->pi_owner == NULL) { mtx_unlock(&umtx_lock); td1 = tdfind(owner, shared ? -1 : td->td_proc->p_pid); mtx_lock(&umtx_lock); if (td1 != NULL) { if (pi->pi_owner == NULL) umtx_pi_setowner(pi, td1); PROC_UNLOCK(td1->td_proc); } } TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) { pri = UPRI(uq1->uq_thread); if (pri > UPRI(td)) break; } if (uq1 != NULL) TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq); else TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq); uq->uq_pi_blocked = pi; thread_lock(td); td->td_flags |= TDF_UPIBLOCKED; thread_unlock(td); umtx_propagate_priority(td); mtx_unlock(&umtx_lock); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, wmesg, timo); umtxq_remove(uq); mtx_lock(&umtx_lock); uq->uq_pi_blocked = NULL; thread_lock(td); td->td_flags &= ~TDF_UPIBLOCKED; thread_unlock(td); TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq); umtx_repropagate_priority(pi); mtx_unlock(&umtx_lock); umtxq_unlock(&uq->uq_key); return (error); } /* * Add reference count for a PI mutex. */ static void umtx_pi_ref(struct umtx_pi *pi) { UMTXQ_LOCKED_ASSERT(umtxq_getchain(&pi->pi_key)); pi->pi_refcount++; } /* * Decrease reference count for a PI mutex, if the counter * is decreased to zero, its memory space is freed. */ static void umtx_pi_unref(struct umtx_pi *pi) { struct umtxq_chain *uc; uc = umtxq_getchain(&pi->pi_key); UMTXQ_LOCKED_ASSERT(uc); KASSERT(pi->pi_refcount > 0, ("invalid reference count")); if (--pi->pi_refcount == 0) { mtx_lock(&umtx_lock); if (pi->pi_owner != NULL) umtx_pi_disown(pi); KASSERT(TAILQ_EMPTY(&pi->pi_blocked), ("blocked queue not empty")); mtx_unlock(&umtx_lock); TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink); umtx_pi_free(pi); } } /* * Find a PI mutex in hash table. */ static struct umtx_pi * umtx_pi_lookup(struct umtx_key *key) { struct umtxq_chain *uc; struct umtx_pi *pi; uc = umtxq_getchain(key); UMTXQ_LOCKED_ASSERT(uc); TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) { if (umtx_key_match(&pi->pi_key, key)) { return (pi); } } return (NULL); } /* * Insert a PI mutex into hash table. */ static inline void umtx_pi_insert(struct umtx_pi *pi) { struct umtxq_chain *uc; uc = umtxq_getchain(&pi->pi_key); UMTXQ_LOCKED_ASSERT(uc); TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink); } /* * Lock a PI mutex. */ static int do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, struct _umtx_time *timeout, int try) { struct abs_timeout timo; struct umtx_q *uq; struct umtx_pi *pi, *new_pi; uint32_t id, old_owner, owner, old; int error, rv; id = td->td_tid; uq = td->td_umtxq; if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); umtxq_lock(&uq->uq_key); pi = umtx_pi_lookup(&uq->uq_key); if (pi == NULL) { new_pi = umtx_pi_alloc(M_NOWAIT); if (new_pi == NULL) { umtxq_unlock(&uq->uq_key); new_pi = umtx_pi_alloc(M_WAITOK); umtxq_lock(&uq->uq_key); pi = umtx_pi_lookup(&uq->uq_key); if (pi != NULL) { umtx_pi_free(new_pi); new_pi = NULL; } } if (new_pi != NULL) { new_pi->pi_key = uq->uq_key; umtx_pi_insert(new_pi); pi = new_pi; } } umtx_pi_ref(pi); umtxq_unlock(&uq->uq_key); /* * Care must be exercised when dealing with umtx structure. It * can fault on any access. */ for (;;) { /* * Try the uncontested case. This should be done in userland. */ rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id); /* The address was invalid. */ if (rv == -1) { error = EFAULT; break; } /* The acquire succeeded. */ if (owner == UMUTEX_UNOWNED) { error = 0; break; } if (owner == UMUTEX_RB_NOTRECOV) { error = ENOTRECOVERABLE; break; } /* If no one owns it but it is contested try to acquire it. */ if (owner == UMUTEX_CONTESTED || owner == UMUTEX_RB_OWNERDEAD) { old_owner = owner; rv = casueword32(&m->m_owner, owner, &owner, id | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { error = EFAULT; break; } if (owner == old_owner) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); error = umtx_pi_claim(pi, td); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); if (error != 0) { /* * Since we're going to return an * error, restore the m_owner to its * previous, unowned state to avoid * compounding the problem. */ (void)casuword32(&m->m_owner, id | UMUTEX_CONTESTED, old_owner); } if (error == 0 && old_owner == UMUTEX_RB_OWNERDEAD) error = EOWNERDEAD; break; } error = umtxq_check_susp(td); if (error != 0) break; /* If this failed the lock has changed, restart. */ continue; } if ((owner & ~UMUTEX_CONTESTED) == id) { error = EDEADLK; break; } if (try != 0) { error = EBUSY; break; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * Set the contested bit so that a release in user space * knows to use the system call for unlock. If this fails * either some one else has acquired the lock or it has been * released. */ rv = casueword32(&m->m_owner, owner, &old, owner | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } umtxq_lock(&uq->uq_key); /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. Note that the UMUTEX_RB_OWNERDEAD * value for owner is impossible there. */ if (old == owner) { error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED, "umtxpi", timeout == NULL ? NULL : &timo, (flags & USYNC_PROCESS_SHARED) != 0); if (error != 0) continue; } else { umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); } error = umtxq_check_susp(td); if (error != 0) break; } umtxq_lock(&uq->uq_key); umtx_pi_unref(pi); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Unlock a PI mutex. */ static int do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags, bool rb) { struct umtx_key key; struct umtx_q *uq_first, *uq_first2, *uq_me; struct umtx_pi *pi, *pi2; uint32_t id, new_owner, old, owner; int count, error, pri; id = td->td_tid; /* * Make sure we own this mtx. */ error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); new_owner = umtx_unlock_val(flags, rb); /* This should be done in userland */ if ((owner & UMUTEX_CONTESTED) == 0) { error = casueword32(&m->m_owner, owner, &old, new_owner); if (error == -1) return (EFAULT); if (old == owner) return (0); owner = old; } /* We should only ever be in here for contested locks */ if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? TYPE_PI_ROBUST_UMUTEX : TYPE_PI_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); count = umtxq_count_pi(&key, &uq_first); if (uq_first != NULL) { mtx_lock(&umtx_lock); pi = uq_first->uq_pi_blocked; KASSERT(pi != NULL, ("pi == NULL?")); if (pi->pi_owner != td && !(rb && pi->pi_owner == NULL)) { mtx_unlock(&umtx_lock); umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); /* userland messed the mutex */ return (EPERM); } uq_me = td->td_umtxq; if (pi->pi_owner == td) umtx_pi_disown(pi); /* get highest priority thread which is still sleeping. */ uq_first = TAILQ_FIRST(&pi->pi_blocked); while (uq_first != NULL && (uq_first->uq_flags & UQF_UMTXQ) == 0) { uq_first = TAILQ_NEXT(uq_first, uq_lockq); } pri = PRI_MAX; TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) { uq_first2 = TAILQ_FIRST(&pi2->pi_blocked); if (uq_first2 != NULL) { if (pri > UPRI(uq_first2->uq_thread)) pri = UPRI(uq_first2->uq_thread); } } thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); if (uq_first) umtxq_signal_thread(uq_first); } else { pi = umtx_pi_lookup(&key); /* * A umtx_pi can exist if a signal or timeout removed the * last waiter from the umtxq, but there is still * a thread in do_lock_pi() holding the umtx_pi. */ if (pi != NULL) { /* * The umtx_pi can be unowned, such as when a thread * has just entered do_lock_pi(), allocated the * umtx_pi, and unlocked the umtxq. * If the current thread owns it, it must disown it. */ mtx_lock(&umtx_lock); if (pi->pi_owner == td) umtx_pi_disown(pi); mtx_unlock(&umtx_lock); } } umtxq_unlock(&key); /* * When unlocking the umtx, it must be marked as unowned if * there is zero or one thread only waiting for it. * Otherwise, it must be marked as contested. */ if (count > 1) new_owner |= UMUTEX_CONTESTED; error = casueword32(&m->m_owner, owner, &old, new_owner); umtxq_unbusy_unlocked(&key); umtx_key_release(&key); if (error == -1) return (EFAULT); if (old != owner) return (EINVAL); return (0); } /* * Lock a PP mutex. */ static int do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, struct _umtx_time *timeout, int try) { struct abs_timeout timo; struct umtx_q *uq, *uq2; struct umtx_pi *pi; uint32_t ceiling; uint32_t owner, id; int error, pri, old_inherited_pri, su, rv; id = td->td_tid; uq = td->td_umtxq; if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0); for (;;) { old_inherited_pri = uq->uq_inherited_pri; umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); rv = fueword32(&m->m_ceilings[0], &ceiling); if (rv == -1) { error = EFAULT; goto out; } ceiling = RTP_PRIO_MAX - ceiling; if (ceiling > RTP_PRIO_MAX) { error = EINVAL; goto out; } mtx_lock(&umtx_lock); if (UPRI(td) < PRI_MIN_REALTIME + ceiling) { mtx_unlock(&umtx_lock); error = EINVAL; goto out; } if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) { uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling; thread_lock(td); if (uq->uq_inherited_pri < UPRI(td)) sched_lend_user_prio(td, uq->uq_inherited_pri); thread_unlock(td); } mtx_unlock(&umtx_lock); rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED); /* The address was invalid. */ if (rv == -1) { error = EFAULT; break; } if (owner == UMUTEX_CONTESTED) { error = 0; break; } else if (owner == UMUTEX_RB_OWNERDEAD) { rv = casueword32(&m->m_owner, UMUTEX_RB_OWNERDEAD, &owner, id | UMUTEX_CONTESTED); if (rv == -1) { error = EFAULT; break; } if (owner == UMUTEX_RB_OWNERDEAD) { error = EOWNERDEAD; /* success */ break; } error = 0; } else if (owner == UMUTEX_RB_NOTRECOV) { error = ENOTRECOVERABLE; break; } if (try != 0) { error = EBUSY; break; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "umtxpp", timeout == NULL ? NULL : &timo); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); mtx_lock(&umtx_lock); uq->uq_inherited_pri = old_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { uq2 = TAILQ_FIRST(&pi->pi_blocked); if (uq2 != NULL) { if (pri > UPRI(uq2->uq_thread)) pri = UPRI(uq2->uq_thread); } } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); } if (error != 0 && error != EOWNERDEAD) { mtx_lock(&umtx_lock); uq->uq_inherited_pri = old_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { uq2 = TAILQ_FIRST(&pi->pi_blocked); if (uq2 != NULL) { if (pri > UPRI(uq2->uq_thread)) pri = UPRI(uq2->uq_thread); } } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); } out: umtxq_unbusy_unlocked(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Unlock a PP mutex. */ static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags, bool rb) { struct umtx_key key; struct umtx_q *uq, *uq2; struct umtx_pi *pi; uint32_t id, owner, rceiling; int error, pri, new_inherited_pri, su; id = td->td_tid; uq = td->td_umtxq; su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0); /* * Make sure we own this mtx. */ error = fueword32(&m->m_owner, &owner); if (error == -1) return (EFAULT); if ((owner & ~UMUTEX_CONTESTED) != id) return (EPERM); error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t)); if (error != 0) return (error); if (rceiling == -1) new_inherited_pri = PRI_MAX; else { rceiling = RTP_PRIO_MAX - rceiling; if (rceiling > RTP_PRIO_MAX) return (EINVAL); new_inherited_pri = PRI_MIN_REALTIME + rceiling; } if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); umtxq_unlock(&key); /* * For priority protected mutex, always set unlocked state * to UMUTEX_CONTESTED, so that userland always enters kernel * to lock the mutex, it is necessary because thread priority * has to be adjusted for such mutex. */ error = suword32(&m->m_owner, umtx_unlock_val(flags, rb) | UMUTEX_CONTESTED); umtxq_lock(&key); if (error == 0) umtxq_signal(&key, 1); umtxq_unbusy(&key); umtxq_unlock(&key); if (error == -1) error = EFAULT; else { mtx_lock(&umtx_lock); if (su != 0) uq->uq_inherited_pri = new_inherited_pri; pri = PRI_MAX; TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) { uq2 = TAILQ_FIRST(&pi->pi_blocked); if (uq2 != NULL) { if (pri > UPRI(uq2->uq_thread)) pri = UPRI(uq2->uq_thread); } } if (pri > uq->uq_inherited_pri) pri = uq->uq_inherited_pri; thread_lock(td); sched_lend_user_prio(td, pri); thread_unlock(td); mtx_unlock(&umtx_lock); } umtx_key_release(&key); return (error); } static int do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling, uint32_t *old_ceiling) { struct umtx_q *uq; uint32_t flags, id, owner, save_ceiling; int error, rv, rv1; error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); if ((flags & UMUTEX_PRIO_PROTECT) == 0) return (EINVAL); if (ceiling > RTP_PRIO_MAX) return (EINVAL); id = td->td_tid; uq = td->td_umtxq; if ((error = umtx_key_get(m, (flags & UMUTEX_ROBUST) != 0 ? TYPE_PP_ROBUST_UMUTEX : TYPE_PP_UMUTEX, GET_SHARE(flags), &uq->uq_key)) != 0) return (error); for (;;) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); rv = fueword32(&m->m_ceilings[0], &save_ceiling); if (rv == -1) { error = EFAULT; break; } rv = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED); if (rv == -1) { error = EFAULT; break; } if (owner == UMUTEX_CONTESTED) { rv = suword32(&m->m_ceilings[0], ceiling); rv1 = suword32(&m->m_owner, UMUTEX_CONTESTED); error = (rv == 0 && rv1 == 0) ? 0: EFAULT; break; } if ((owner & ~UMUTEX_CONTESTED) == id) { rv = suword32(&m->m_ceilings[0], ceiling); error = rv == 0 ? 0 : EFAULT; break; } if (owner == UMUTEX_RB_OWNERDEAD) { error = EOWNERDEAD; break; } else if (owner == UMUTEX_RB_NOTRECOV) { error = ENOTRECOVERABLE; break; } /* * If we caught a signal, we have retried and now * exit immediately. */ if (error != 0) break; /* * We set the contested bit, sleep. Otherwise the lock changed * and we need to retry or we lost a race to the thread * unlocking the umtx. */ umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "umtxpp", NULL); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); } umtxq_lock(&uq->uq_key); if (error == 0) umtxq_signal(&uq->uq_key, INT_MAX); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); if (error == 0 && old_ceiling != NULL) { rv = suword32(old_ceiling, save_ceiling); error = rv == 0 ? 0 : EFAULT; } return (error); } /* * Lock a userland POSIX mutex. */ static int do_lock_umutex(struct thread *td, struct umutex *m, struct _umtx_time *timeout, int mode) { uint32_t flags; int error; error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) { case 0: error = do_lock_normal(td, m, flags, timeout, mode); break; case UMUTEX_PRIO_INHERIT: error = do_lock_pi(td, m, flags, timeout, mode); break; case UMUTEX_PRIO_PROTECT: error = do_lock_pp(td, m, flags, timeout, mode); break; default: return (EINVAL); } if (timeout == NULL) { if (error == EINTR && mode != _UMUTEX_WAIT) error = ERESTART; } else { /* Timed-locking is not restarted. */ if (error == ERESTART) error = EINTR; } return (error); } /* * Unlock a userland POSIX mutex. */ static int do_unlock_umutex(struct thread *td, struct umutex *m, bool rb) { uint32_t flags; int error; error = fueword32(&m->m_flags, &flags); if (error == -1) return (EFAULT); switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) { case 0: return (do_unlock_normal(td, m, flags, rb)); case UMUTEX_PRIO_INHERIT: return (do_unlock_pi(td, m, flags, rb)); case UMUTEX_PRIO_PROTECT: return (do_unlock_pp(td, m, flags, rb)); } return (EINVAL); } static int do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m, struct timespec *timeout, u_long wflags) { struct abs_timeout timo; struct umtx_q *uq; uint32_t flags, clockid, hasw; int error; uq = td->td_umtxq; error = fueword32(&cv->c_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if ((wflags & CVWAIT_CLOCKID) != 0) { error = fueword32(&cv->c_clockid, &clockid); if (error == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } if (clockid < CLOCK_REALTIME || clockid >= CLOCK_THREAD_CPUTIME_ID) { /* hmm, only HW clock id will work. */ umtx_key_release(&uq->uq_key); return (EINVAL); } } else { clockid = CLOCK_REALTIME; } umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); /* * Set c_has_waiters to 1 before releasing user mutex, also * don't modify cache line when unnecessary. */ error = fueword32(&cv->c_has_waiters, &hasw); if (error == 0 && hasw == 0) suword32(&cv->c_has_waiters, 1); umtxq_unbusy_unlocked(&uq->uq_key); error = do_unlock_umutex(td, m, false); if (timeout != NULL) abs_timeout_init(&timo, clockid, (wflags & CVWAIT_ABSTIME) != 0, timeout); umtxq_lock(&uq->uq_key); if (error == 0) { error = umtxq_sleep(uq, "ucond", timeout == NULL ? NULL : &timo); } if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else { /* * This must be timeout,interrupted by signal or * surprious wakeup, clear c_has_waiter flag when * necessary. */ umtxq_busy(&uq->uq_key); if ((uq->uq_flags & UQF_UMTXQ) != 0) { int oldlen = uq->uq_cur_queue->length; umtxq_remove(uq); if (oldlen == 1) { umtxq_unlock(&uq->uq_key); suword32(&cv->c_has_waiters, 0); umtxq_lock(&uq->uq_key); } } umtxq_unbusy(&uq->uq_key); if (error == ERESTART) error = EINTR; } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Signal a userland condition variable. */ static int do_cv_signal(struct thread *td, struct ucond *cv) { struct umtx_key key; int error, cnt, nwake; uint32_t flags; error = fueword32(&cv->c_flags, &flags); if (error == -1) return (EFAULT); if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); cnt = umtxq_count(&key); nwake = umtxq_signal(&key, 1); if (cnt <= nwake) { umtxq_unlock(&key); error = suword32(&cv->c_has_waiters, 0); if (error == -1) error = EFAULT; umtxq_lock(&key); } umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } static int do_cv_broadcast(struct thread *td, struct ucond *cv) { struct umtx_key key; int error; uint32_t flags; error = fueword32(&cv->c_flags, &flags); if (error == -1) return (EFAULT); if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); umtxq_signal(&key, INT_MAX); umtxq_unlock(&key); error = suword32(&cv->c_has_waiters, 0); if (error == -1) error = EFAULT; umtxq_unbusy_unlocked(&key); umtx_key_release(&key); return (error); } static int do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout) { struct abs_timeout timo; struct umtx_q *uq; uint32_t flags, wrflags; int32_t state, oldstate; int32_t blocked_readers; int error, error1, rv; uq = td->td_umtxq; error = fueword32(&rwlock->rw_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); wrflags = URWLOCK_WRITE_OWNER; if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER)) wrflags |= URWLOCK_WRITE_WAITERS; for (;;) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } /* try to lock it */ while (!(state & wrflags)) { if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) { umtx_key_release(&uq->uq_key); return (EAGAIN); } rv = casueword32(&rwlock->rw_state, state, &oldstate, state + 1); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } if (oldstate == state) { umtx_key_release(&uq->uq_key); return (0); } error = umtxq_check_susp(td); if (error != 0) break; state = oldstate; } if (error) break; /* grab monitor lock */ umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * re-read the state, in case it changed between the try-lock above * and the check below */ rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) error = EFAULT; /* set read contention bit */ while (error == 0 && (state & wrflags) && !(state & URWLOCK_READ_WAITERS)) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state | URWLOCK_READ_WAITERS); if (rv == -1) { error = EFAULT; break; } if (oldstate == state) goto sleep; state = oldstate; error = umtxq_check_susp(td); if (error != 0) break; } if (error != 0) { umtxq_unbusy_unlocked(&uq->uq_key); break; } /* state is changed while setting flags, restart */ if (!(state & wrflags)) { umtxq_unbusy_unlocked(&uq->uq_key); error = umtxq_check_susp(td); if (error != 0) break; continue; } sleep: /* contention bit is set, before sleeping, increase read waiter count */ rv = fueword32(&rwlock->rw_blocked_readers, &blocked_readers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_readers, blocked_readers+1); while (state & wrflags) { umtxq_lock(&uq->uq_key); umtxq_insert(uq); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "urdlck", timeout == NULL ? NULL : &timo); umtxq_busy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); if (error) break; rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { error = EFAULT; break; } } /* decrease read waiter count, and may clear read contention bit */ rv = fueword32(&rwlock->rw_blocked_readers, &blocked_readers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_readers, blocked_readers-1); if (blocked_readers == 1) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state & ~URWLOCK_READ_WAITERS); if (rv == -1) { error = EFAULT; break; } if (oldstate == state) break; state = oldstate; error1 = umtxq_check_susp(td); if (error1 != 0) { if (error == 0) error = error1; break; } } } umtxq_unbusy_unlocked(&uq->uq_key); if (error != 0) break; } umtx_key_release(&uq->uq_key); if (error == ERESTART) error = EINTR; return (error); } static int do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout) { struct abs_timeout timo; struct umtx_q *uq; uint32_t flags; int32_t state, oldstate; int32_t blocked_writers; int32_t blocked_readers; int error, error1, rv; uq = td->td_umtxq; error = fueword32(&rwlock->rw_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); blocked_readers = 0; for (;;) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state | URWLOCK_WRITE_OWNER); if (rv == -1) { umtx_key_release(&uq->uq_key); return (EFAULT); } if (oldstate == state) { umtx_key_release(&uq->uq_key); return (0); } state = oldstate; error = umtxq_check_susp(td); if (error != 0) break; } if (error) { if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) && blocked_readers != 0) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); } break; } /* grab monitor lock */ umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_unlock(&uq->uq_key); /* * re-read the state, in case it changed between the try-lock above * and the check below */ rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) error = EFAULT; while (error == 0 && ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) && (state & URWLOCK_WRITE_WAITERS) == 0) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state | URWLOCK_WRITE_WAITERS); if (rv == -1) { error = EFAULT; break; } if (oldstate == state) goto sleep; state = oldstate; error = umtxq_check_susp(td); if (error != 0) break; } if (error != 0) { umtxq_unbusy_unlocked(&uq->uq_key); break; } if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) { umtxq_unbusy_unlocked(&uq->uq_key); error = umtxq_check_susp(td); if (error != 0) break; continue; } sleep: rv = fueword32(&rwlock->rw_blocked_writers, &blocked_writers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_writers, blocked_writers+1); while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) { umtxq_lock(&uq->uq_key); umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "uwrlck", timeout == NULL ? NULL : &timo); umtxq_busy(&uq->uq_key); umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE); umtxq_unlock(&uq->uq_key); if (error) break; rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { error = EFAULT; break; } } rv = fueword32(&rwlock->rw_blocked_writers, &blocked_writers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } suword32(&rwlock->rw_blocked_writers, blocked_writers-1); if (blocked_writers == 1) { rv = fueword32(&rwlock->rw_state, &state); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state & ~URWLOCK_WRITE_WAITERS); if (rv == -1) { error = EFAULT; break; } if (oldstate == state) break; state = oldstate; error1 = umtxq_check_susp(td); /* * We are leaving the URWLOCK_WRITE_WAITERS * behind, but this should not harm the * correctness. */ if (error1 != 0) { if (error == 0) error = error1; break; } } rv = fueword32(&rwlock->rw_blocked_readers, &blocked_readers); if (rv == -1) { umtxq_unbusy_unlocked(&uq->uq_key); error = EFAULT; break; } } else blocked_readers = 0; umtxq_unbusy_unlocked(&uq->uq_key); } umtx_key_release(&uq->uq_key); if (error == ERESTART) error = EINTR; return (error); } static int do_rw_unlock(struct thread *td, struct urwlock *rwlock) { struct umtx_q *uq; uint32_t flags; int32_t state, oldstate; int error, rv, q, count; uq = td->td_umtxq; error = fueword32(&rwlock->rw_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); error = fueword32(&rwlock->rw_state, &state); if (error == -1) { error = EFAULT; goto out; } if (state & URWLOCK_WRITE_OWNER) { for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state & ~URWLOCK_WRITE_OWNER); if (rv == -1) { error = EFAULT; goto out; } if (oldstate != state) { state = oldstate; if (!(oldstate & URWLOCK_WRITE_OWNER)) { error = EPERM; goto out; } error = umtxq_check_susp(td); if (error != 0) goto out; } else break; } } else if (URWLOCK_READER_COUNT(state) != 0) { for (;;) { rv = casueword32(&rwlock->rw_state, state, &oldstate, state - 1); if (rv == -1) { error = EFAULT; goto out; } if (oldstate != state) { state = oldstate; if (URWLOCK_READER_COUNT(oldstate) == 0) { error = EPERM; goto out; } error = umtxq_check_susp(td); if (error != 0) goto out; } else break; } } else { error = EPERM; goto out; } count = 0; if (!(flags & URWLOCK_PREFER_READER)) { if (state & URWLOCK_WRITE_WAITERS) { count = 1; q = UMTX_EXCLUSIVE_QUEUE; } else if (state & URWLOCK_READ_WAITERS) { count = INT_MAX; q = UMTX_SHARED_QUEUE; } } else { if (state & URWLOCK_READ_WAITERS) { count = INT_MAX; q = UMTX_SHARED_QUEUE; } else if (state & URWLOCK_WRITE_WAITERS) { count = 1; q = UMTX_EXCLUSIVE_QUEUE; } } if (count) { umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_signal_queue(&uq->uq_key, count, q); umtxq_unbusy(&uq->uq_key); umtxq_unlock(&uq->uq_key); } out: umtx_key_release(&uq->uq_key); return (error); } #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) static int do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout) { struct abs_timeout timo; struct umtx_q *uq; uint32_t flags, count, count1; int error, rv; uq = td->td_umtxq; error = fueword32(&sem->_flags, &flags); if (error == -1) return (EFAULT); error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); rv = casueword32(&sem->_has_waiters, 0, &count1, 1); if (rv == 0) rv = fueword32(&sem->_count, &count); if (rv == -1 || count != 0) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (rv == -1 ? EFAULT : 0); } umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo); if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else { umtxq_remove(uq); /* A relative timeout cannot be restarted. */ if (error == ERESTART && timeout != NULL && (timeout->_flags & UMTX_ABSTIME) == 0) error = EINTR; } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Signal a userland semaphore. */ static int do_sem_wake(struct thread *td, struct _usem *sem) { struct umtx_key key; int error, cnt; uint32_t flags; error = fueword32(&sem->_flags, &flags); if (error == -1) return (EFAULT); if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); cnt = umtxq_count(&key); if (cnt > 0) { /* * Check if count is greater than 0, this means the memory is * still being referenced by user code, so we can safely * update _has_waiters flag. */ if (cnt == 1) { umtxq_unlock(&key); error = suword32(&sem->_has_waiters, 0); umtxq_lock(&key); if (error == -1) error = EFAULT; } umtxq_signal(&key, 1); } umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } #endif static int do_sem2_wait(struct thread *td, struct _usem2 *sem, struct _umtx_time *timeout) { struct abs_timeout timo; struct umtx_q *uq; uint32_t count, flags; int error, rv; uq = td->td_umtxq; flags = fuword32(&sem->_flags); error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key); if (error != 0) return (error); if (timeout != NULL) abs_timeout_init2(&timo, timeout); umtxq_lock(&uq->uq_key); umtxq_busy(&uq->uq_key); umtxq_insert(uq); umtxq_unlock(&uq->uq_key); rv = fueword32(&sem->_count, &count); if (rv == -1) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (EFAULT); } for (;;) { if (USEM_COUNT(count) != 0) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (0); } if (count == USEM_HAS_WAITERS) break; rv = casueword32(&sem->_count, 0, &count, USEM_HAS_WAITERS); if (rv == -1) { umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); umtxq_remove(uq); umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (EFAULT); } if (count == 0) break; } umtxq_lock(&uq->uq_key); umtxq_unbusy(&uq->uq_key); error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo); if ((uq->uq_flags & UQF_UMTXQ) == 0) error = 0; else { umtxq_remove(uq); if (timeout != NULL && (timeout->_flags & UMTX_ABSTIME) == 0) { /* A relative timeout cannot be restarted. */ if (error == ERESTART) error = EINTR; if (error == EINTR) { abs_timeout_update(&timo); - timeout->_timeout = timo.end; - timespecsub(&timeout->_timeout, &timo.cur); + timespecsub(&timo.end, &timo.cur, + &timeout->_timeout); } } } umtxq_unlock(&uq->uq_key); umtx_key_release(&uq->uq_key); return (error); } /* * Signal a userland semaphore. */ static int do_sem2_wake(struct thread *td, struct _usem2 *sem) { struct umtx_key key; int error, cnt, rv; uint32_t count, flags; rv = fueword32(&sem->_flags, &flags); if (rv == -1) return (EFAULT); if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0) return (error); umtxq_lock(&key); umtxq_busy(&key); cnt = umtxq_count(&key); if (cnt > 0) { /* * If this was the last sleeping thread, clear the waiters * flag in _count. */ if (cnt == 1) { umtxq_unlock(&key); rv = fueword32(&sem->_count, &count); while (rv != -1 && count & USEM_HAS_WAITERS) rv = casueword32(&sem->_count, count, &count, count & ~USEM_HAS_WAITERS); if (rv == -1) error = EFAULT; umtxq_lock(&key); } umtxq_signal(&key, 1); } umtxq_unbusy(&key); umtxq_unlock(&key); umtx_key_release(&key); return (error); } inline int umtx_copyin_timeout(const void *addr, struct timespec *tsp) { int error; error = copyin(addr, tsp, sizeof(struct timespec)); if (error == 0) { if (tsp->tv_sec < 0 || tsp->tv_nsec >= 1000000000 || tsp->tv_nsec < 0) error = EINVAL; } return (error); } static inline int umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp) { int error; if (size <= sizeof(struct timespec)) { tp->_clockid = CLOCK_REALTIME; tp->_flags = 0; error = copyin(addr, &tp->_timeout, sizeof(struct timespec)); } else error = copyin(addr, tp, sizeof(struct _umtx_time)); if (error != 0) return (error); if (tp->_timeout.tv_sec < 0 || tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0) return (EINVAL); return (0); } static int __umtx_op_unimpl(struct thread *td, struct _umtx_op_args *uap) { return (EOPNOTSUPP); } static int __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout, *tm_p; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_wait(td, uap->obj, uap->val, tm_p, 0, 0)); } static int __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout, *tm_p; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0)); } static int __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1)); } static int __umtx_op_wake(struct thread *td, struct _umtx_op_args *uap) { return (kern_umtx_wake(td, uap->obj, uap->val, 0)); } #define BATCH_SIZE 128 static int __umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap) { char *uaddrs[BATCH_SIZE], **upp; int count, error, i, pos, tocopy; upp = (char **)uap->obj; error = 0; for (count = uap->val, pos = 0; count > 0; count -= tocopy, pos += tocopy) { tocopy = MIN(count, BATCH_SIZE); error = copyin(upp + pos, uaddrs, tocopy * sizeof(char *)); if (error != 0) break; for (i = 0; i < tocopy; ++i) kern_umtx_wake(td, uaddrs[i], INT_MAX, 1); maybe_yield(); } return (error); } static int __umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap) { return (kern_umtx_wake(td, uap->obj, uap->val, 1)); } static int __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_lock_umutex(td, uap->obj, tm_p, 0)); } static int __umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap) { return (do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY)); } static int __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT)); } static int __umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap) { return (do_wake_umutex(td, uap->obj)); } static int __umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap) { return (do_unlock_umutex(td, uap->obj, false)); } static int __umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap) { return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1)); } static int __umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap) { struct timespec *ts, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) ts = NULL; else { error = umtx_copyin_timeout(uap->uaddr2, &timeout); if (error != 0) return (error); ts = &timeout; } return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val)); } static int __umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap) { return (do_cv_signal(td, uap->obj)); } static int __umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap) { return (do_cv_broadcast(td, uap->obj)); } static int __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { error = do_rw_rdlock(td, uap->obj, uap->val, 0); } else { error = umtx_copyin_umtx_time(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); error = do_rw_rdlock(td, uap->obj, uap->val, &timeout); } return (error); } static int __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { error = do_rw_wrlock(td, uap->obj, 0); } else { error = umtx_copyin_umtx_time(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); error = do_rw_wrlock(td, uap->obj, &timeout); } return (error); } static int __umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap) { return (do_rw_unlock(td, uap->obj)); } #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) static int __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time( uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_sem_wait(td, uap->obj, tm_p)); } static int __umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap) { return (do_sem_wake(td, uap->obj)); } #endif static int __umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap) { return (do_wake2_umutex(td, uap->obj, uap->val)); } static int __umtx_op_sem2_wait(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; size_t uasize; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { uasize = 0; tm_p = NULL; } else { uasize = (size_t)uap->uaddr1; error = umtx_copyin_umtx_time(uap->uaddr2, uasize, &timeout); if (error != 0) return (error); tm_p = &timeout; } error = do_sem2_wait(td, uap->obj, tm_p); if (error == EINTR && uap->uaddr2 != NULL && (timeout._flags & UMTX_ABSTIME) == 0 && uasize >= sizeof(struct _umtx_time) + sizeof(struct timespec)) { error = copyout(&timeout._timeout, (struct _umtx_time *)uap->uaddr2 + 1, sizeof(struct timespec)); if (error == 0) { error = EINTR; } } return (error); } static int __umtx_op_sem2_wake(struct thread *td, struct _umtx_op_args *uap) { return (do_sem2_wake(td, uap->obj)); } #define USHM_OBJ_UMTX(o) \ ((struct umtx_shm_obj_list *)(&(o)->umtx_data)) #define USHMF_REG_LINKED 0x0001 #define USHMF_OBJ_LINKED 0x0002 struct umtx_shm_reg { TAILQ_ENTRY(umtx_shm_reg) ushm_reg_link; LIST_ENTRY(umtx_shm_reg) ushm_obj_link; struct umtx_key ushm_key; struct ucred *ushm_cred; struct shmfd *ushm_obj; u_int ushm_refcnt; u_int ushm_flags; }; LIST_HEAD(umtx_shm_obj_list, umtx_shm_reg); TAILQ_HEAD(umtx_shm_reg_head, umtx_shm_reg); static uma_zone_t umtx_shm_reg_zone; static struct umtx_shm_reg_head umtx_shm_registry[UMTX_CHAINS]; static struct mtx umtx_shm_lock; static struct umtx_shm_reg_head umtx_shm_reg_delfree = TAILQ_HEAD_INITIALIZER(umtx_shm_reg_delfree); static void umtx_shm_free_reg(struct umtx_shm_reg *reg); static void umtx_shm_reg_delfree_tq(void *context __unused, int pending __unused) { struct umtx_shm_reg_head d; struct umtx_shm_reg *reg, *reg1; TAILQ_INIT(&d); mtx_lock(&umtx_shm_lock); TAILQ_CONCAT(&d, &umtx_shm_reg_delfree, ushm_reg_link); mtx_unlock(&umtx_shm_lock); TAILQ_FOREACH_SAFE(reg, &d, ushm_reg_link, reg1) { TAILQ_REMOVE(&d, reg, ushm_reg_link); umtx_shm_free_reg(reg); } } static struct task umtx_shm_reg_delfree_task = TASK_INITIALIZER(0, umtx_shm_reg_delfree_tq, NULL); static struct umtx_shm_reg * umtx_shm_find_reg_locked(const struct umtx_key *key) { struct umtx_shm_reg *reg; struct umtx_shm_reg_head *reg_head; KASSERT(key->shared, ("umtx_p_find_rg: private key")); mtx_assert(&umtx_shm_lock, MA_OWNED); reg_head = &umtx_shm_registry[key->hash]; TAILQ_FOREACH(reg, reg_head, ushm_reg_link) { KASSERT(reg->ushm_key.shared, ("non-shared key on reg %p %d", reg, reg->ushm_key.shared)); if (reg->ushm_key.info.shared.object == key->info.shared.object && reg->ushm_key.info.shared.offset == key->info.shared.offset) { KASSERT(reg->ushm_key.type == TYPE_SHM, ("TYPE_USHM")); KASSERT(reg->ushm_refcnt > 0, ("reg %p refcnt 0 onlist", reg)); KASSERT((reg->ushm_flags & USHMF_REG_LINKED) != 0, ("reg %p not linked", reg)); reg->ushm_refcnt++; return (reg); } } return (NULL); } static struct umtx_shm_reg * umtx_shm_find_reg(const struct umtx_key *key) { struct umtx_shm_reg *reg; mtx_lock(&umtx_shm_lock); reg = umtx_shm_find_reg_locked(key); mtx_unlock(&umtx_shm_lock); return (reg); } static void umtx_shm_free_reg(struct umtx_shm_reg *reg) { chgumtxcnt(reg->ushm_cred->cr_ruidinfo, -1, 0); crfree(reg->ushm_cred); shm_drop(reg->ushm_obj); uma_zfree(umtx_shm_reg_zone, reg); } static bool umtx_shm_unref_reg_locked(struct umtx_shm_reg *reg, bool force) { bool res; mtx_assert(&umtx_shm_lock, MA_OWNED); KASSERT(reg->ushm_refcnt > 0, ("ushm_reg %p refcnt 0", reg)); reg->ushm_refcnt--; res = reg->ushm_refcnt == 0; if (res || force) { if ((reg->ushm_flags & USHMF_REG_LINKED) != 0) { TAILQ_REMOVE(&umtx_shm_registry[reg->ushm_key.hash], reg, ushm_reg_link); reg->ushm_flags &= ~USHMF_REG_LINKED; } if ((reg->ushm_flags & USHMF_OBJ_LINKED) != 0) { LIST_REMOVE(reg, ushm_obj_link); reg->ushm_flags &= ~USHMF_OBJ_LINKED; } } return (res); } static void umtx_shm_unref_reg(struct umtx_shm_reg *reg, bool force) { vm_object_t object; bool dofree; if (force) { object = reg->ushm_obj->shm_object; VM_OBJECT_WLOCK(object); object->flags |= OBJ_UMTXDEAD; VM_OBJECT_WUNLOCK(object); } mtx_lock(&umtx_shm_lock); dofree = umtx_shm_unref_reg_locked(reg, force); mtx_unlock(&umtx_shm_lock); if (dofree) umtx_shm_free_reg(reg); } void umtx_shm_object_init(vm_object_t object) { LIST_INIT(USHM_OBJ_UMTX(object)); } void umtx_shm_object_terminated(vm_object_t object) { struct umtx_shm_reg *reg, *reg1; bool dofree; dofree = false; mtx_lock(&umtx_shm_lock); LIST_FOREACH_SAFE(reg, USHM_OBJ_UMTX(object), ushm_obj_link, reg1) { if (umtx_shm_unref_reg_locked(reg, true)) { TAILQ_INSERT_TAIL(&umtx_shm_reg_delfree, reg, ushm_reg_link); dofree = true; } } mtx_unlock(&umtx_shm_lock); if (dofree) taskqueue_enqueue(taskqueue_thread, &umtx_shm_reg_delfree_task); } static int umtx_shm_create_reg(struct thread *td, const struct umtx_key *key, struct umtx_shm_reg **res) { struct umtx_shm_reg *reg, *reg1; struct ucred *cred; int error; reg = umtx_shm_find_reg(key); if (reg != NULL) { *res = reg; return (0); } cred = td->td_ucred; if (!chgumtxcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_UMTXP))) return (ENOMEM); reg = uma_zalloc(umtx_shm_reg_zone, M_WAITOK | M_ZERO); reg->ushm_refcnt = 1; bcopy(key, ®->ushm_key, sizeof(*key)); reg->ushm_obj = shm_alloc(td->td_ucred, O_RDWR); reg->ushm_cred = crhold(cred); error = shm_dotruncate(reg->ushm_obj, PAGE_SIZE); if (error != 0) { umtx_shm_free_reg(reg); return (error); } mtx_lock(&umtx_shm_lock); reg1 = umtx_shm_find_reg_locked(key); if (reg1 != NULL) { mtx_unlock(&umtx_shm_lock); umtx_shm_free_reg(reg); *res = reg1; return (0); } reg->ushm_refcnt++; TAILQ_INSERT_TAIL(&umtx_shm_registry[key->hash], reg, ushm_reg_link); LIST_INSERT_HEAD(USHM_OBJ_UMTX(key->info.shared.object), reg, ushm_obj_link); reg->ushm_flags = USHMF_REG_LINKED | USHMF_OBJ_LINKED; mtx_unlock(&umtx_shm_lock); *res = reg; return (0); } static int umtx_shm_alive(struct thread *td, void *addr) { vm_map_t map; vm_map_entry_t entry; vm_object_t object; vm_pindex_t pindex; vm_prot_t prot; int res, ret; boolean_t wired; map = &td->td_proc->p_vmspace->vm_map; res = vm_map_lookup(&map, (uintptr_t)addr, VM_PROT_READ, &entry, &object, &pindex, &prot, &wired); if (res != KERN_SUCCESS) return (EFAULT); if (object == NULL) ret = EINVAL; else ret = (object->flags & OBJ_UMTXDEAD) != 0 ? ENOTTY : 0; vm_map_lookup_done(map, entry); return (ret); } static void umtx_shm_init(void) { int i; umtx_shm_reg_zone = uma_zcreate("umtx_shm", sizeof(struct umtx_shm_reg), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mtx_init(&umtx_shm_lock, "umtxshm", NULL, MTX_DEF); for (i = 0; i < nitems(umtx_shm_registry); i++) TAILQ_INIT(&umtx_shm_registry[i]); } static int umtx_shm(struct thread *td, void *addr, u_int flags) { struct umtx_key key; struct umtx_shm_reg *reg; struct file *fp; int error, fd; if (__bitcount(flags & (UMTX_SHM_CREAT | UMTX_SHM_LOOKUP | UMTX_SHM_DESTROY| UMTX_SHM_ALIVE)) != 1) return (EINVAL); if ((flags & UMTX_SHM_ALIVE) != 0) return (umtx_shm_alive(td, addr)); error = umtx_key_get(addr, TYPE_SHM, PROCESS_SHARE, &key); if (error != 0) return (error); KASSERT(key.shared == 1, ("non-shared key")); if ((flags & UMTX_SHM_CREAT) != 0) { error = umtx_shm_create_reg(td, &key, ®); } else { reg = umtx_shm_find_reg(&key); if (reg == NULL) error = ESRCH; } umtx_key_release(&key); if (error != 0) return (error); KASSERT(reg != NULL, ("no reg")); if ((flags & UMTX_SHM_DESTROY) != 0) { umtx_shm_unref_reg(reg, true); } else { #if 0 #ifdef MAC error = mac_posixshm_check_open(td->td_ucred, reg->ushm_obj, FFLAGS(O_RDWR)); if (error == 0) #endif error = shm_access(reg->ushm_obj, td->td_ucred, FFLAGS(O_RDWR)); if (error == 0) #endif error = falloc_caps(td, &fp, &fd, O_CLOEXEC, NULL); if (error == 0) { shm_hold(reg->ushm_obj); finit(fp, FFLAGS(O_RDWR), DTYPE_SHM, reg->ushm_obj, &shm_ops); td->td_retval[0] = fd; fdrop(fp, td); } } umtx_shm_unref_reg(reg, false); return (error); } static int __umtx_op_shm(struct thread *td, struct _umtx_op_args *uap) { return (umtx_shm(td, uap->uaddr1, uap->val)); } static int umtx_robust_lists(struct thread *td, struct umtx_robust_lists_params *rbp) { td->td_rb_list = rbp->robust_list_offset; td->td_rbp_list = rbp->robust_priv_list_offset; td->td_rb_inact = rbp->robust_inact_offset; return (0); } static int __umtx_op_robust_lists(struct thread *td, struct _umtx_op_args *uap) { struct umtx_robust_lists_params rb; int error; if (uap->val > sizeof(rb)) return (EINVAL); bzero(&rb, sizeof(rb)); error = copyin(uap->uaddr1, &rb, uap->val); if (error != 0) return (error); return (umtx_robust_lists(td, &rb)); } typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap); static const _umtx_op_func op_table[] = { [UMTX_OP_RESERVED0] = __umtx_op_unimpl, [UMTX_OP_RESERVED1] = __umtx_op_unimpl, [UMTX_OP_WAIT] = __umtx_op_wait, [UMTX_OP_WAKE] = __umtx_op_wake, [UMTX_OP_MUTEX_TRYLOCK] = __umtx_op_trylock_umutex, [UMTX_OP_MUTEX_LOCK] = __umtx_op_lock_umutex, [UMTX_OP_MUTEX_UNLOCK] = __umtx_op_unlock_umutex, [UMTX_OP_SET_CEILING] = __umtx_op_set_ceiling, [UMTX_OP_CV_WAIT] = __umtx_op_cv_wait, [UMTX_OP_CV_SIGNAL] = __umtx_op_cv_signal, [UMTX_OP_CV_BROADCAST] = __umtx_op_cv_broadcast, [UMTX_OP_WAIT_UINT] = __umtx_op_wait_uint, [UMTX_OP_RW_RDLOCK] = __umtx_op_rw_rdlock, [UMTX_OP_RW_WRLOCK] = __umtx_op_rw_wrlock, [UMTX_OP_RW_UNLOCK] = __umtx_op_rw_unlock, [UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private, [UMTX_OP_WAKE_PRIVATE] = __umtx_op_wake_private, [UMTX_OP_MUTEX_WAIT] = __umtx_op_wait_umutex, [UMTX_OP_MUTEX_WAKE] = __umtx_op_wake_umutex, #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) [UMTX_OP_SEM_WAIT] = __umtx_op_sem_wait, [UMTX_OP_SEM_WAKE] = __umtx_op_sem_wake, #else [UMTX_OP_SEM_WAIT] = __umtx_op_unimpl, [UMTX_OP_SEM_WAKE] = __umtx_op_unimpl, #endif [UMTX_OP_NWAKE_PRIVATE] = __umtx_op_nwake_private, [UMTX_OP_MUTEX_WAKE2] = __umtx_op_wake2_umutex, [UMTX_OP_SEM2_WAIT] = __umtx_op_sem2_wait, [UMTX_OP_SEM2_WAKE] = __umtx_op_sem2_wake, [UMTX_OP_SHM] = __umtx_op_shm, [UMTX_OP_ROBUST_LISTS] = __umtx_op_robust_lists, }; int sys__umtx_op(struct thread *td, struct _umtx_op_args *uap) { if ((unsigned)uap->op < nitems(op_table)) return (*op_table[uap->op])(td, uap); return (EINVAL); } #ifdef COMPAT_FREEBSD32 struct timespec32 { int32_t tv_sec; int32_t tv_nsec; }; struct umtx_time32 { struct timespec32 timeout; uint32_t flags; uint32_t clockid; }; static inline int umtx_copyin_timeout32(void *addr, struct timespec *tsp) { struct timespec32 ts32; int error; error = copyin(addr, &ts32, sizeof(struct timespec32)); if (error == 0) { if (ts32.tv_sec < 0 || ts32.tv_nsec >= 1000000000 || ts32.tv_nsec < 0) error = EINVAL; else { tsp->tv_sec = ts32.tv_sec; tsp->tv_nsec = ts32.tv_nsec; } } return (error); } static inline int umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp) { struct umtx_time32 t32; int error; t32.clockid = CLOCK_REALTIME; t32.flags = 0; if (size <= sizeof(struct timespec32)) error = copyin(addr, &t32.timeout, sizeof(struct timespec32)); else error = copyin(addr, &t32, sizeof(struct umtx_time32)); if (error != 0) return (error); if (t32.timeout.tv_sec < 0 || t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0) return (EINVAL); tp->_timeout.tv_sec = t32.timeout.tv_sec; tp->_timeout.tv_nsec = t32.timeout.tv_nsec; tp->_flags = t32.flags; tp->_clockid = t32.clockid; return (0); } static int __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_wait(td, uap->obj, uap->val, tm_p, 1, 0)); } static int __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_lock_umutex(td, uap->obj, tm_p, 0)); } static int __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT)); } static int __umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap) { struct timespec *ts, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) ts = NULL; else { error = umtx_copyin_timeout32(uap->uaddr2, &timeout); if (error != 0) return (error); ts = &timeout; } return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val)); } static int __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { error = do_rw_rdlock(td, uap->obj, uap->val, 0); } else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); error = do_rw_rdlock(td, uap->obj, uap->val, &timeout); } return (error); } static int __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { error = do_rw_wrlock(td, uap->obj, 0); } else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); error = do_rw_wrlock(td, uap->obj, &timeout); } return (error); } static int __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time32( uap->uaddr2, (size_t)uap->uaddr1,&timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_wait(td, uap->obj, uap->val, tm_p, 1, 1)); } #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) static int __umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) tm_p = NULL; else { error = umtx_copyin_umtx_time32(uap->uaddr2, (size_t)uap->uaddr1, &timeout); if (error != 0) return (error); tm_p = &timeout; } return (do_sem_wait(td, uap->obj, tm_p)); } #endif static int __umtx_op_sem2_wait_compat32(struct thread *td, struct _umtx_op_args *uap) { struct _umtx_time *tm_p, timeout; size_t uasize; int error; /* Allow a null timespec (wait forever). */ if (uap->uaddr2 == NULL) { uasize = 0; tm_p = NULL; } else { uasize = (size_t)uap->uaddr1; error = umtx_copyin_umtx_time32(uap->uaddr2, uasize, &timeout); if (error != 0) return (error); tm_p = &timeout; } error = do_sem2_wait(td, uap->obj, tm_p); if (error == EINTR && uap->uaddr2 != NULL && (timeout._flags & UMTX_ABSTIME) == 0 && uasize >= sizeof(struct umtx_time32) + sizeof(struct timespec32)) { struct timespec32 remain32 = { .tv_sec = timeout._timeout.tv_sec, .tv_nsec = timeout._timeout.tv_nsec }; error = copyout(&remain32, (struct umtx_time32 *)uap->uaddr2 + 1, sizeof(struct timespec32)); if (error == 0) { error = EINTR; } } return (error); } static int __umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap) { uint32_t uaddrs[BATCH_SIZE], **upp; int count, error, i, pos, tocopy; upp = (uint32_t **)uap->obj; error = 0; for (count = uap->val, pos = 0; count > 0; count -= tocopy, pos += tocopy) { tocopy = MIN(count, BATCH_SIZE); error = copyin(upp + pos, uaddrs, tocopy * sizeof(uint32_t)); if (error != 0) break; for (i = 0; i < tocopy; ++i) kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i], INT_MAX, 1); maybe_yield(); } return (error); } struct umtx_robust_lists_params_compat32 { uint32_t robust_list_offset; uint32_t robust_priv_list_offset; uint32_t robust_inact_offset; }; static int __umtx_op_robust_lists_compat32(struct thread *td, struct _umtx_op_args *uap) { struct umtx_robust_lists_params rb; struct umtx_robust_lists_params_compat32 rb32; int error; if (uap->val > sizeof(rb32)) return (EINVAL); bzero(&rb, sizeof(rb)); bzero(&rb32, sizeof(rb32)); error = copyin(uap->uaddr1, &rb32, uap->val); if (error != 0) return (error); rb.robust_list_offset = rb32.robust_list_offset; rb.robust_priv_list_offset = rb32.robust_priv_list_offset; rb.robust_inact_offset = rb32.robust_inact_offset; return (umtx_robust_lists(td, &rb)); } static const _umtx_op_func op_table_compat32[] = { [UMTX_OP_RESERVED0] = __umtx_op_unimpl, [UMTX_OP_RESERVED1] = __umtx_op_unimpl, [UMTX_OP_WAIT] = __umtx_op_wait_compat32, [UMTX_OP_WAKE] = __umtx_op_wake, [UMTX_OP_MUTEX_TRYLOCK] = __umtx_op_trylock_umutex, [UMTX_OP_MUTEX_LOCK] = __umtx_op_lock_umutex_compat32, [UMTX_OP_MUTEX_UNLOCK] = __umtx_op_unlock_umutex, [UMTX_OP_SET_CEILING] = __umtx_op_set_ceiling, [UMTX_OP_CV_WAIT] = __umtx_op_cv_wait_compat32, [UMTX_OP_CV_SIGNAL] = __umtx_op_cv_signal, [UMTX_OP_CV_BROADCAST] = __umtx_op_cv_broadcast, [UMTX_OP_WAIT_UINT] = __umtx_op_wait_compat32, [UMTX_OP_RW_RDLOCK] = __umtx_op_rw_rdlock_compat32, [UMTX_OP_RW_WRLOCK] = __umtx_op_rw_wrlock_compat32, [UMTX_OP_RW_UNLOCK] = __umtx_op_rw_unlock, [UMTX_OP_WAIT_UINT_PRIVATE] = __umtx_op_wait_uint_private_compat32, [UMTX_OP_WAKE_PRIVATE] = __umtx_op_wake_private, [UMTX_OP_MUTEX_WAIT] = __umtx_op_wait_umutex_compat32, [UMTX_OP_MUTEX_WAKE] = __umtx_op_wake_umutex, #if defined(COMPAT_FREEBSD9) || defined(COMPAT_FREEBSD10) [UMTX_OP_SEM_WAIT] = __umtx_op_sem_wait_compat32, [UMTX_OP_SEM_WAKE] = __umtx_op_sem_wake, #else [UMTX_OP_SEM_WAIT] = __umtx_op_unimpl, [UMTX_OP_SEM_WAKE] = __umtx_op_unimpl, #endif [UMTX_OP_NWAKE_PRIVATE] = __umtx_op_nwake_private32, [UMTX_OP_MUTEX_WAKE2] = __umtx_op_wake2_umutex, [UMTX_OP_SEM2_WAIT] = __umtx_op_sem2_wait_compat32, [UMTX_OP_SEM2_WAKE] = __umtx_op_sem2_wake, [UMTX_OP_SHM] = __umtx_op_shm, [UMTX_OP_ROBUST_LISTS] = __umtx_op_robust_lists_compat32, }; int freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap) { if ((unsigned)uap->op < nitems(op_table_compat32)) { return (*op_table_compat32[uap->op])(td, (struct _umtx_op_args *)uap); } return (EINVAL); } #endif void umtx_thread_init(struct thread *td) { td->td_umtxq = umtxq_alloc(); td->td_umtxq->uq_thread = td; } void umtx_thread_fini(struct thread *td) { umtxq_free(td->td_umtxq); } /* * It will be called when new thread is created, e.g fork(). */ void umtx_thread_alloc(struct thread *td) { struct umtx_q *uq; uq = td->td_umtxq; uq->uq_inherited_pri = PRI_MAX; KASSERT(uq->uq_flags == 0, ("uq_flags != 0")); KASSERT(uq->uq_thread == td, ("uq_thread != td")); KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL")); KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty")); } /* * exec() hook. * * Clear robust lists for all process' threads, not delaying the * cleanup to thread_exit hook, since the relevant address space is * destroyed right now. */ static void umtx_exec_hook(void *arg __unused, struct proc *p, struct image_params *imgp __unused) { struct thread *td; KASSERT(p == curproc, ("need curproc")); PROC_LOCK(p); KASSERT((p->p_flag & P_HADTHREADS) == 0 || (p->p_flag & P_STOPPED_SINGLE) != 0, ("curproc must be single-threaded")); FOREACH_THREAD_IN_PROC(p, td) { KASSERT(td == curthread || ((td->td_flags & TDF_BOUNDARY) != 0 && TD_IS_SUSPENDED(td)), ("running thread %p %p", p, td)); PROC_UNLOCK(p); umtx_thread_cleanup(td); PROC_LOCK(p); td->td_rb_list = td->td_rbp_list = td->td_rb_inact = 0; } PROC_UNLOCK(p); } /* * thread_exit() hook. */ void umtx_thread_exit(struct thread *td) { umtx_thread_cleanup(td); } static int umtx_read_uptr(struct thread *td, uintptr_t ptr, uintptr_t *res) { u_long res1; #ifdef COMPAT_FREEBSD32 uint32_t res32; #endif int error; #ifdef COMPAT_FREEBSD32 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { error = fueword32((void *)ptr, &res32); if (error == 0) res1 = res32; } else #endif { error = fueword((void *)ptr, &res1); } if (error == 0) *res = res1; else error = EFAULT; return (error); } static void umtx_read_rb_list(struct thread *td, struct umutex *m, uintptr_t *rb_list) { #ifdef COMPAT_FREEBSD32 struct umutex32 m32; if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) { memcpy(&m32, m, sizeof(m32)); *rb_list = m32.m_rb_lnk; } else #endif *rb_list = m->m_rb_lnk; } static int umtx_handle_rb(struct thread *td, uintptr_t rbp, uintptr_t *rb_list, bool inact) { struct umutex m; int error; KASSERT(td->td_proc == curproc, ("need current vmspace")); error = copyin((void *)rbp, &m, sizeof(m)); if (error != 0) return (error); if (rb_list != NULL) umtx_read_rb_list(td, &m, rb_list); if ((m.m_flags & UMUTEX_ROBUST) == 0) return (EINVAL); if ((m.m_owner & ~UMUTEX_CONTESTED) != td->td_tid) /* inact is cleared after unlock, allow the inconsistency */ return (inact ? 0 : EINVAL); return (do_unlock_umutex(td, (struct umutex *)rbp, true)); } static void umtx_cleanup_rb_list(struct thread *td, uintptr_t rb_list, uintptr_t *rb_inact, const char *name) { int error, i; uintptr_t rbp; bool inact; if (rb_list == 0) return; error = umtx_read_uptr(td, rb_list, &rbp); for (i = 0; error == 0 && rbp != 0 && i < umtx_max_rb; i++) { if (rbp == *rb_inact) { inact = true; *rb_inact = 0; } else inact = false; error = umtx_handle_rb(td, rbp, &rbp, inact); } if (i == umtx_max_rb && umtx_verbose_rb) { uprintf("comm %s pid %d: reached umtx %smax rb %d\n", td->td_proc->p_comm, td->td_proc->p_pid, name, umtx_max_rb); } if (error != 0 && umtx_verbose_rb) { uprintf("comm %s pid %d: handling %srb error %d\n", td->td_proc->p_comm, td->td_proc->p_pid, name, error); } } /* * Clean up umtx data. */ static void umtx_thread_cleanup(struct thread *td) { struct umtx_q *uq; struct umtx_pi *pi; uintptr_t rb_inact; /* * Disown pi mutexes. */ uq = td->td_umtxq; if (uq != NULL) { mtx_lock(&umtx_lock); uq->uq_inherited_pri = PRI_MAX; while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) { pi->pi_owner = NULL; TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link); } mtx_unlock(&umtx_lock); thread_lock(td); sched_lend_user_prio(td, PRI_MAX); thread_unlock(td); } /* * Handle terminated robust mutexes. Must be done after * robust pi disown, otherwise unlock could see unowned * entries. */ rb_inact = td->td_rb_inact; if (rb_inact != 0) (void)umtx_read_uptr(td, rb_inact, &rb_inact); umtx_cleanup_rb_list(td, td->td_rb_list, &rb_inact, ""); umtx_cleanup_rb_list(td, td->td_rbp_list, &rb_inact, "priv "); if (rb_inact != 0) (void)umtx_handle_rb(td, rb_inact, NULL, true); } Index: head/sys/kern/subr_rtc.c =================================================================== --- head/sys/kern/subr_rtc.c (revision 336913) +++ head/sys/kern/subr_rtc.c (revision 336914) @@ -1,424 +1,424 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1988 University of Utah. * Copyright (c) 1982, 1990, 1993 * The Regents of the University of California. * Copyright (c) 2011 The FreeBSD Foundation * All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Portions of this software were developed by Julien Ridoux at the University * of Melbourne under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * from: Utah $Hdr: clock.c 1.18 91/01/21$ * from: @(#)clock.c 8.2 (Berkeley) 1/12/94 * from: NetBSD: clock_subr.c,v 1.6 2001/07/07 17:04:02 thorpej Exp * and * from: src/sys/i386/isa/clock.c,v 1.176 2001/09/04 */ /* * Helpers for time-of-day clocks. This is useful for architectures that need * support multiple models of such clocks, and generally serves to make the * code more machine-independent. * If the clock in question can also be used as a time counter, the driver * needs to initiate this. * This code is not yet used by all architectures. */ #include __FBSDID("$FreeBSD$"); #include "opt_ffclock.h" #include #include #include #include #include #include #include #include #include #include #ifdef FFCLOCK #include #endif #include #include "clock_if.h" static int show_io; SYSCTL_INT(_debug, OID_AUTO, clock_show_io, CTLFLAG_RWTUN, &show_io, 0, "Enable debug printing of RTC clock I/O; 1=reads, 2=writes, 3=both."); static int sysctl_clock_do_io(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_debug, OID_AUTO, clock_do_io, CTLTYPE_INT | CTLFLAG_RW, 0, 0, sysctl_clock_do_io, "I", "Trigger one-time IO on RTC clocks; 1=read (and discard), 2=write"); /* XXX: should be kern. now, it's no longer machdep. */ static int disable_rtc_set; SYSCTL_INT(_machdep, OID_AUTO, disable_rtc_set, CTLFLAG_RW, &disable_rtc_set, 0, "Disallow adjusting time-of-day clock"); /* * An instance of a realtime clock. A list of these tracks all the registered * clocks in the system. * * The resadj member is used to apply a "resolution adjustment" equal to half * the clock's resolution, which is useful mainly on clocks with a whole-second * resolution. Because the clock truncates the fractional part, adding half the * resolution performs 4/5 rounding. The same adjustment is applied to the * times returned from clock_gettime(), because the fraction returned will * always be zero, but on average the actual fraction at the time of the call * should be about .5. */ struct rtc_instance { device_t clockdev; int resolution; int flags; u_int schedns; struct timespec resadj; struct timeout_task stask; LIST_ENTRY(rtc_instance) rtc_entries; }; /* * Clocks are updated using a task running on taskqueue_thread. */ static void settime_task_func(void *arg, int pending); /* * Registered clocks are kept in a list which is sorted by resolution; the more * accurate clocks get the first shot at providing the time. */ LIST_HEAD(rtc_listhead, rtc_instance); static struct rtc_listhead rtc_list = LIST_HEAD_INITIALIZER(rtc_list); static struct sx rtc_list_lock; SX_SYSINIT(rtc_list_lock_init, &rtc_list_lock, "rtc list"); /* * On the task thread, invoke the clock_settime() method of the clock. Do so * holding no locks, so that clock drivers are free to do whatever kind of * locking or sleeping they need to. */ static void settime_task_func(void *arg, int pending) { struct timespec ts; struct rtc_instance *rtc; rtc = arg; if (!(rtc->flags & CLOCKF_SETTIME_NO_TS)) { getnanotime(&ts); if (!(rtc->flags & CLOCKF_SETTIME_NO_ADJ)) { ts.tv_sec -= utc_offset(); - timespecadd(&ts, &rtc->resadj); + timespecadd(&ts, &rtc->resadj, &ts); } } else { ts.tv_sec = 0; ts.tv_nsec = 0; } CLOCK_SETTIME(rtc->clockdev, &ts); } static void clock_dbgprint_hdr(device_t dev, int rw) { struct timespec now; getnanotime(&now); device_printf(dev, "%s at ", (rw & CLOCK_DBG_READ) ? "read " : "write"); clock_print_ts(&now, 9); printf(": "); } void clock_dbgprint_bcd(device_t dev, int rw, const struct bcd_clocktime *bct) { if (show_io & rw) { clock_dbgprint_hdr(dev, rw); clock_print_bcd(bct, 9); printf("\n"); } } void clock_dbgprint_ct(device_t dev, int rw, const struct clocktime *ct) { if (show_io & rw) { clock_dbgprint_hdr(dev, rw); clock_print_ct(ct, 9); printf("\n"); } } void clock_dbgprint_err(device_t dev, int rw, int err) { if (show_io & rw) { clock_dbgprint_hdr(dev, rw); printf("error = %d\n", err); } } void clock_dbgprint_ts(device_t dev, int rw, const struct timespec *ts) { if (show_io & rw) { clock_dbgprint_hdr(dev, rw); clock_print_ts(ts, 9); printf("\n"); } } void clock_register_flags(device_t clockdev, long resolution, int flags) { struct rtc_instance *rtc, *newrtc; newrtc = malloc(sizeof(*newrtc), M_DEVBUF, M_WAITOK); newrtc->clockdev = clockdev; newrtc->resolution = (int)resolution; newrtc->flags = flags; newrtc->schedns = 0; newrtc->resadj.tv_sec = newrtc->resolution / 2 / 1000000; newrtc->resadj.tv_nsec = newrtc->resolution / 2 % 1000000 * 1000; TIMEOUT_TASK_INIT(taskqueue_thread, &newrtc->stask, 0, settime_task_func, newrtc); sx_xlock(&rtc_list_lock); if (LIST_EMPTY(&rtc_list)) { LIST_INSERT_HEAD(&rtc_list, newrtc, rtc_entries); } else { LIST_FOREACH(rtc, &rtc_list, rtc_entries) { if (rtc->resolution > newrtc->resolution) { LIST_INSERT_BEFORE(rtc, newrtc, rtc_entries); break; } else if (LIST_NEXT(rtc, rtc_entries) == NULL) { LIST_INSERT_AFTER(rtc, newrtc, rtc_entries); break; } } } sx_xunlock(&rtc_list_lock); device_printf(clockdev, "registered as a time-of-day clock, resolution %d.%6.6ds\n", newrtc->resolution / 1000000, newrtc->resolution % 1000000); } void clock_register(device_t dev, long res) { clock_register_flags(dev, res, 0); } void clock_unregister(device_t clockdev) { struct rtc_instance *rtc, *tmp; sx_xlock(&rtc_list_lock); LIST_FOREACH_SAFE(rtc, &rtc_list, rtc_entries, tmp) { if (rtc->clockdev == clockdev) { LIST_REMOVE(rtc, rtc_entries); break; } } sx_xunlock(&rtc_list_lock); if (rtc != NULL) { taskqueue_cancel_timeout(taskqueue_thread, &rtc->stask, NULL); taskqueue_drain_timeout(taskqueue_thread, &rtc->stask); free(rtc, M_DEVBUF); } } void clock_schedule(device_t clockdev, u_int offsetns) { struct rtc_instance *rtc; sx_xlock(&rtc_list_lock); LIST_FOREACH(rtc, &rtc_list, rtc_entries) { if (rtc->clockdev == clockdev) { rtc->schedns = offsetns; break; } } sx_xunlock(&rtc_list_lock); } static int read_clocks(struct timespec *ts, bool debug_read) { struct rtc_instance *rtc; int error; error = ENXIO; sx_xlock(&rtc_list_lock); LIST_FOREACH(rtc, &rtc_list, rtc_entries) { if ((error = CLOCK_GETTIME(rtc->clockdev, ts)) != 0) continue; if (ts->tv_sec < 0 || ts->tv_nsec < 0) { error = EINVAL; continue; } if (!(rtc->flags & CLOCKF_GETTIME_NO_ADJ)) { - timespecadd(ts, &rtc->resadj); + timespecadd(ts, &rtc->resadj, ts); ts->tv_sec += utc_offset(); } if (!debug_read) { if (bootverbose) device_printf(rtc->clockdev, "providing initial system time\n"); break; } } sx_xunlock(&rtc_list_lock); return (error); } /* * Initialize the system time. Must be called from a context which does not * restrict any locking or sleeping that clock drivers may need to do. * * First attempt to get the time from a registered realtime clock. The clocks * are queried in order of resolution until one provides the time. If no clock * can provide the current time, use the 'base' time provided by the caller, if * non-zero. The 'base' time is potentially highly inaccurate, such as the last * known good value of the system clock, or even a filesystem last-updated * timestamp. It is used to prevent system time from appearing to move * backwards in logs. */ void inittodr(time_t base) { struct timespec ts; int error; error = read_clocks(&ts, false); /* * Do not report errors from each clock; it is expected that some clocks * cannot provide results in some situations. Only report problems when * no clocks could provide the time. */ if (error != 0) { switch (error) { case ENXIO: printf("Warning: no time-of-day clock registered, "); break; case EINVAL: printf("Warning: bad time from time-of-day clock, "); break; default: printf("Error reading time-of-day clock (%d), ", error); break; } printf("system time will not be set accurately\n"); ts.tv_sec = (base > 0) ? base : -1; ts.tv_nsec = 0; } if (ts.tv_sec >= 0) { tc_setclock(&ts); #ifdef FFCLOCK ffclock_reset_clock(&ts); #endif } } /* * Write system time back to all registered clocks, unless disabled by admin. * This can be called from a context that restricts locking and/or sleeping; the * actual updating is done asynchronously on a task thread. */ void resettodr(void) { struct timespec now; struct rtc_instance *rtc; sbintime_t sbt; long waitns; if (disable_rtc_set) return; sx_xlock(&rtc_list_lock); LIST_FOREACH(rtc, &rtc_list, rtc_entries) { if (rtc->schedns != 0) { getnanotime(&now); waitns = rtc->schedns - now.tv_nsec; if (waitns < 0) waitns += 1000000000; sbt = nstosbt(waitns); } else sbt = 0; taskqueue_enqueue_timeout_sbt(taskqueue_thread, &rtc->stask, -sbt, 0, C_PREL(31)); } sx_xunlock(&rtc_list_lock); } static int sysctl_clock_do_io(SYSCTL_HANDLER_ARGS) { struct timespec ts_discard; int error, value; value = 0; error = sysctl_handle_int(oidp, &value, 0, req); if (error != 0 || req->newptr == NULL) return (error); switch (value) { case CLOCK_DBG_READ: if (read_clocks(&ts_discard, true) == ENXIO) printf("No registered RTC clocks\n"); break; case CLOCK_DBG_WRITE: resettodr(); break; default: return (EINVAL); } return (0); } Index: head/sys/kern/uipc_mqueue.c =================================================================== --- head/sys/kern/uipc_mqueue.c (revision 336913) +++ head/sys/kern/uipc_mqueue.c (revision 336914) @@ -1,2933 +1,2931 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2005 David Xu * Copyright (c) 2016-2017 Robert N. M. Watson * All rights reserved. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * POSIX message queue implementation. * * 1) A mqueue filesystem can be mounted, each message queue appears * in mounted directory, user can change queue's permission and * ownership, or remove a queue. Manually creating a file in the * directory causes a message queue to be created in the kernel with * default message queue attributes applied and same name used, this * method is not advocated since mq_open syscall allows user to specify * different attributes. Also the file system can be mounted multiple * times at different mount points but shows same contents. * * 2) Standard POSIX message queue API. The syscalls do not use vfs layer, * but directly operate on internal data structure, this allows user to * use the IPC facility without having to mount mqueue file system. */ #include __FBSDID("$FreeBSD$"); #include "opt_capsicum.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(p1003_1b_mqueue, "POSIX P1003.1B message queues support"); /* * Limits and constants */ #define MQFS_NAMELEN NAME_MAX #define MQFS_DELEN (8 + MQFS_NAMELEN) /* node types */ typedef enum { mqfstype_none = 0, mqfstype_root, mqfstype_dir, mqfstype_this, mqfstype_parent, mqfstype_file, mqfstype_symlink, } mqfs_type_t; struct mqfs_node; /* * mqfs_info: describes a mqfs instance */ struct mqfs_info { struct sx mi_lock; struct mqfs_node *mi_root; struct unrhdr *mi_unrhdr; }; struct mqfs_vdata { LIST_ENTRY(mqfs_vdata) mv_link; struct mqfs_node *mv_node; struct vnode *mv_vnode; struct task mv_task; }; /* * mqfs_node: describes a node (file or directory) within a mqfs */ struct mqfs_node { char mn_name[MQFS_NAMELEN+1]; struct mqfs_info *mn_info; struct mqfs_node *mn_parent; LIST_HEAD(,mqfs_node) mn_children; LIST_ENTRY(mqfs_node) mn_sibling; LIST_HEAD(,mqfs_vdata) mn_vnodes; const void *mn_pr_root; int mn_refcount; mqfs_type_t mn_type; int mn_deleted; uint32_t mn_fileno; void *mn_data; struct timespec mn_birth; struct timespec mn_ctime; struct timespec mn_atime; struct timespec mn_mtime; uid_t mn_uid; gid_t mn_gid; int mn_mode; }; #define VTON(vp) (((struct mqfs_vdata *)((vp)->v_data))->mv_node) #define VTOMQ(vp) ((struct mqueue *)(VTON(vp)->mn_data)) #define VFSTOMQFS(m) ((struct mqfs_info *)((m)->mnt_data)) #define FPTOMQ(fp) ((struct mqueue *)(((struct mqfs_node *) \ (fp)->f_data)->mn_data)) TAILQ_HEAD(msgq, mqueue_msg); struct mqueue; struct mqueue_notifier { LIST_ENTRY(mqueue_notifier) nt_link; struct sigevent nt_sigev; ksiginfo_t nt_ksi; struct proc *nt_proc; }; struct mqueue { struct mtx mq_mutex; int mq_flags; long mq_maxmsg; long mq_msgsize; long mq_curmsgs; long mq_totalbytes; struct msgq mq_msgq; int mq_receivers; int mq_senders; struct selinfo mq_rsel; struct selinfo mq_wsel; struct mqueue_notifier *mq_notifier; }; #define MQ_RSEL 0x01 #define MQ_WSEL 0x02 struct mqueue_msg { TAILQ_ENTRY(mqueue_msg) msg_link; unsigned int msg_prio; unsigned int msg_size; /* following real data... */ }; static SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW, 0, "POSIX real time message queue"); static int default_maxmsg = 10; static int default_msgsize = 1024; static int maxmsg = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW, &maxmsg, 0, "Default maximum messages in queue"); static int maxmsgsize = 16384; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW, &maxmsgsize, 0, "Default maximum message size"); static int maxmq = 100; SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW, &maxmq, 0, "maximum message queues"); static int curmq = 0; SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW, &curmq, 0, "current message queue number"); static int unloadable = 0; static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data"); static eventhandler_tag exit_tag; /* Only one instance per-system */ static struct mqfs_info mqfs_data; static uma_zone_t mqnode_zone; static uma_zone_t mqueue_zone; static uma_zone_t mvdata_zone; static uma_zone_t mqnoti_zone; static struct vop_vector mqfs_vnodeops; static struct fileops mqueueops; static unsigned mqfs_osd_jail_slot; /* * Directory structure construction and manipulation */ #ifdef notyet static struct mqfs_node *mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static struct mqfs_node *mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); #endif static struct mqfs_node *mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode); static int mqfs_destroy(struct mqfs_node *mn); static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn); static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn); static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn); static int mqfs_prison_remove(void *obj, void *data); /* * Message queue construction and maniplation */ static struct mqueue *mqueue_alloc(const struct mq_attr *attr); static void mqueue_free(struct mqueue *mq); static int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout); static int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout); static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo); static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo); static void mqueue_send_notification(struct mqueue *mq); static void mqueue_fdclose(struct thread *td, int fd, struct file *fp); static void mq_proc_exit(void *arg, struct proc *p); /* * kqueue filters */ static void filt_mqdetach(struct knote *kn); static int filt_mqread(struct knote *kn, long hint); static int filt_mqwrite(struct knote *kn, long hint); struct filterops mq_rfiltops = { .f_isfd = 1, .f_detach = filt_mqdetach, .f_event = filt_mqread, }; struct filterops mq_wfiltops = { .f_isfd = 1, .f_detach = filt_mqdetach, .f_event = filt_mqwrite, }; /* * Initialize fileno bitmap */ static void mqfs_fileno_init(struct mqfs_info *mi) { struct unrhdr *up; up = new_unrhdr(1, INT_MAX, NULL); mi->mi_unrhdr = up; } /* * Tear down fileno bitmap */ static void mqfs_fileno_uninit(struct mqfs_info *mi) { struct unrhdr *up; up = mi->mi_unrhdr; mi->mi_unrhdr = NULL; delete_unrhdr(up); } /* * Allocate a file number */ static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn) { /* make sure our parent has a file number */ if (mn->mn_parent && !mn->mn_parent->mn_fileno) mqfs_fileno_alloc(mi, mn->mn_parent); switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: mn->mn_fileno = alloc_unr(mi->mi_unrhdr); break; case mqfstype_this: KASSERT(mn->mn_parent != NULL, ("mqfstype_this node has no parent")); mn->mn_fileno = mn->mn_parent->mn_fileno; break; case mqfstype_parent: KASSERT(mn->mn_parent != NULL, ("mqfstype_parent node has no parent")); if (mn->mn_parent == mi->mi_root) { mn->mn_fileno = mn->mn_parent->mn_fileno; break; } KASSERT(mn->mn_parent->mn_parent != NULL, ("mqfstype_parent node has no grandparent")); mn->mn_fileno = mn->mn_parent->mn_parent->mn_fileno; break; default: KASSERT(0, ("mqfs_fileno_alloc() called for unknown type node: %d", mn->mn_type)); break; } } /* * Release a file number */ static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn) { switch (mn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_file: case mqfstype_symlink: free_unr(mi->mi_unrhdr, mn->mn_fileno); break; case mqfstype_this: case mqfstype_parent: /* ignore these, as they don't "own" their file number */ break; default: KASSERT(0, ("mqfs_fileno_free() called for unknown type node: %d", mn->mn_type)); break; } } static __inline struct mqfs_node * mqnode_alloc(void) { return uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO); } static __inline void mqnode_free(struct mqfs_node *node) { uma_zfree(mqnode_zone, node); } static __inline void mqnode_addref(struct mqfs_node *node) { atomic_fetchadd_int(&node->mn_refcount, 1); } static __inline void mqnode_release(struct mqfs_node *node) { struct mqfs_info *mqfs; int old, exp; mqfs = node->mn_info; old = atomic_fetchadd_int(&node->mn_refcount, -1); if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) exp = 3; /* include . and .. */ else exp = 1; if (old == exp) { int locked = sx_xlocked(&mqfs->mi_lock); if (!locked) sx_xlock(&mqfs->mi_lock); mqfs_destroy(node); if (!locked) sx_xunlock(&mqfs->mi_lock); } } /* * Add a node to a directory */ static int mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node) { KASSERT(parent != NULL, ("%s(): parent is NULL", __func__)); KASSERT(parent->mn_info != NULL, ("%s(): parent has no mn_info", __func__)); KASSERT(parent->mn_type == mqfstype_dir || parent->mn_type == mqfstype_root, ("%s(): parent is not a directory", __func__)); node->mn_info = parent->mn_info; node->mn_parent = parent; LIST_INIT(&node->mn_children); LIST_INIT(&node->mn_vnodes); LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling); mqnode_addref(parent); return (0); } static struct mqfs_node * mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode, int nodetype) { struct mqfs_node *node; node = mqnode_alloc(); strncpy(node->mn_name, name, namelen); node->mn_pr_root = cred->cr_prison->pr_root; node->mn_type = nodetype; node->mn_refcount = 1; vfs_timestamp(&node->mn_birth); node->mn_ctime = node->mn_atime = node->mn_mtime = node->mn_birth; node->mn_uid = cred->cr_uid; node->mn_gid = cred->cr_gid; node->mn_mode = mode; return (node); } /* * Create a file */ static struct mqfs_node * mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_file); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } /* * Add . and .. to a directory */ static int mqfs_fixup_dir(struct mqfs_node *parent) { struct mqfs_node *dir; dir = mqnode_alloc(); dir->mn_name[0] = '.'; dir->mn_type = mqfstype_this; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } dir = mqnode_alloc(); dir->mn_name[0] = dir->mn_name[1] = '.'; dir->mn_type = mqfstype_parent; dir->mn_refcount = 1; if (mqfs_add_node(parent, dir) != 0) { mqnode_free(dir); return (-1); } return (0); } #ifdef notyet /* * Create a directory */ static struct mqfs_node * mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } if (mqfs_fixup_dir(node) != 0) { mqfs_destroy(node); return (NULL); } return (node); } /* * Create a symlink */ static struct mqfs_node * mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen, struct ucred *cred, int mode) { struct mqfs_node *node; node = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink); if (mqfs_add_node(parent, node) != 0) { mqnode_free(node); return (NULL); } return (node); } #endif /* * Destroy a node or a tree of nodes */ static int mqfs_destroy(struct mqfs_node *node) { struct mqfs_node *parent; KASSERT(node != NULL, ("%s(): node is NULL", __func__)); KASSERT(node->mn_info != NULL, ("%s(): node has no mn_info", __func__)); /* destroy children */ if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root) while (! LIST_EMPTY(&node->mn_children)) mqfs_destroy(LIST_FIRST(&node->mn_children)); /* unlink from parent */ if ((parent = node->mn_parent) != NULL) { KASSERT(parent->mn_info == node->mn_info, ("%s(): parent has different mn_info", __func__)); LIST_REMOVE(node, mn_sibling); } if (node->mn_fileno != 0) mqfs_fileno_free(node->mn_info, node); if (node->mn_data != NULL) mqueue_free(node->mn_data); mqnode_free(node); return (0); } /* * Mount a mqfs instance */ static int mqfs_mount(struct mount *mp) { struct statfs *sbp; if (mp->mnt_flag & MNT_UPDATE) return (EOPNOTSUPP); mp->mnt_data = &mqfs_data; MNT_ILOCK(mp); mp->mnt_flag |= MNT_LOCAL; MNT_IUNLOCK(mp); vfs_getnewfsid(mp); sbp = &mp->mnt_stat; vfs_mountedfrom(mp, "mqueue"); sbp->f_bsize = PAGE_SIZE; sbp->f_iosize = PAGE_SIZE; sbp->f_blocks = 1; sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_files = 1; sbp->f_ffree = 0; return (0); } /* * Unmount a mqfs instance */ static int mqfs_unmount(struct mount *mp, int mntflags) { int error; error = vflush(mp, 0, (mntflags & MNT_FORCE) ? FORCECLOSE : 0, curthread); return (error); } /* * Return a root vnode */ static int mqfs_root(struct mount *mp, int flags, struct vnode **vpp) { struct mqfs_info *mqfs; int ret; mqfs = VFSTOMQFS(mp); ret = mqfs_allocv(mp, vpp, mqfs->mi_root); return (ret); } /* * Return filesystem stats */ static int mqfs_statfs(struct mount *mp, struct statfs *sbp) { /* XXX update statistics */ return (0); } /* * Initialize a mqfs instance */ static int mqfs_init(struct vfsconf *vfc) { struct mqfs_node *root; struct mqfs_info *mi; osd_method_t methods[PR_MAXMETHOD] = { [PR_METHOD_REMOVE] = mqfs_prison_remove, }; mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mvdata_zone = uma_zcreate("mvdata", sizeof(struct mqfs_vdata), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); mi = &mqfs_data; sx_init(&mi->mi_lock, "mqfs lock"); /* set up the root diretory */ root = mqfs_create_node("/", 1, curthread->td_ucred, 01777, mqfstype_root); root->mn_info = mi; LIST_INIT(&root->mn_children); LIST_INIT(&root->mn_vnodes); mi->mi_root = root; mqfs_fileno_init(mi); mqfs_fileno_alloc(mi, root); mqfs_fixup_dir(root); exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL, EVENTHANDLER_PRI_ANY); mq_fdclose = mqueue_fdclose; p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING); mqfs_osd_jail_slot = osd_jail_register(NULL, methods); return (0); } /* * Destroy a mqfs instance */ static int mqfs_uninit(struct vfsconf *vfc) { struct mqfs_info *mi; if (!unloadable) return (EOPNOTSUPP); osd_jail_deregister(mqfs_osd_jail_slot); EVENTHANDLER_DEREGISTER(process_exit, exit_tag); mi = &mqfs_data; mqfs_destroy(mi->mi_root); mi->mi_root = NULL; mqfs_fileno_uninit(mi); sx_destroy(&mi->mi_lock); uma_zdestroy(mqnode_zone); uma_zdestroy(mqueue_zone); uma_zdestroy(mvdata_zone); uma_zdestroy(mqnoti_zone); return (0); } /* * task routine */ static void do_recycle(void *context, int pending __unused) { struct vnode *vp = (struct vnode *)context; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); vrecycle(vp); VOP_UNLOCK(vp, 0); vdrop(vp); } /* * Allocate a vnode */ static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn) { struct mqfs_vdata *vd; struct mqfs_info *mqfs; struct vnode *newvpp; int error; mqfs = pn->mn_info; *vpp = NULL; sx_xlock(&mqfs->mi_lock); LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { if (vd->mv_vnode->v_mount == mp) { vhold(vd->mv_vnode); break; } } if (vd != NULL) { found: *vpp = vd->mv_vnode; sx_xunlock(&mqfs->mi_lock); error = vget(*vpp, LK_RETRY | LK_EXCLUSIVE, curthread); vdrop(*vpp); return (error); } sx_xunlock(&mqfs->mi_lock); error = getnewvnode("mqueue", mp, &mqfs_vnodeops, &newvpp); if (error) return (error); vn_lock(newvpp, LK_EXCLUSIVE | LK_RETRY); error = insmntque(newvpp, mp); if (error != 0) return (error); sx_xlock(&mqfs->mi_lock); /* * Check if it has already been allocated * while we were blocked. */ LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { if (vd->mv_vnode->v_mount == mp) { vhold(vd->mv_vnode); sx_xunlock(&mqfs->mi_lock); vgone(newvpp); vput(newvpp); goto found; } } *vpp = newvpp; vd = uma_zalloc(mvdata_zone, M_WAITOK); (*vpp)->v_data = vd; vd->mv_vnode = *vpp; vd->mv_node = pn; TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp); LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link); mqnode_addref(pn); switch (pn->mn_type) { case mqfstype_root: (*vpp)->v_vflag = VV_ROOT; /* fall through */ case mqfstype_dir: case mqfstype_this: case mqfstype_parent: (*vpp)->v_type = VDIR; break; case mqfstype_file: (*vpp)->v_type = VREG; break; case mqfstype_symlink: (*vpp)->v_type = VLNK; break; case mqfstype_none: KASSERT(0, ("mqfs_allocf called for null node\n")); default: panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type); } sx_xunlock(&mqfs->mi_lock); return (0); } /* * Search a directory entry */ static struct mqfs_node * mqfs_search(struct mqfs_node *pd, const char *name, int len, struct ucred *cred) { struct mqfs_node *pn; const void *pr_root; sx_assert(&pd->mn_info->mi_lock, SX_LOCKED); pr_root = cred->cr_prison->pr_root; LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { /* Only match names within the same prison root directory */ if ((pn->mn_pr_root == NULL || pn->mn_pr_root == pr_root) && strncmp(pn->mn_name, name, len) == 0 && pn->mn_name[len] == '\0') return (pn); } return (NULL); } /* * Look up a file or directory. */ static int mqfs_lookupx(struct vop_cachedlookup_args *ap) { struct componentname *cnp; struct vnode *dvp, **vpp; struct mqfs_node *pd; struct mqfs_node *pn; struct mqfs_info *mqfs; int nameiop, flags, error, namelen; char *pname; struct thread *td; cnp = ap->a_cnp; vpp = ap->a_vpp; dvp = ap->a_dvp; pname = cnp->cn_nameptr; namelen = cnp->cn_namelen; td = cnp->cn_thread; flags = cnp->cn_flags; nameiop = cnp->cn_nameiop; pd = VTON(dvp); pn = NULL; mqfs = pd->mn_info; *vpp = NULLVP; if (dvp->v_type != VDIR) return (ENOTDIR); error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, cnp->cn_thread); if (error) return (error); /* shortcut: check if the name is too long */ if (cnp->cn_namelen >= MQFS_NAMELEN) return (ENOENT); /* self */ if (namelen == 1 && pname[0] == '.') { if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); pn = pd; *vpp = dvp; VREF(dvp); return (0); } /* parent */ if (cnp->cn_flags & ISDOTDOT) { if (dvp->v_vflag & VV_ROOT) return (EIO); if ((flags & ISLASTCN) && nameiop != LOOKUP) return (EINVAL); VOP_UNLOCK(dvp, 0); KASSERT(pd->mn_parent, ("non-root directory has no parent")); pn = pd->mn_parent; error = mqfs_allocv(dvp->v_mount, vpp, pn); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY); return (error); } /* named node */ sx_xlock(&mqfs->mi_lock); pn = mqfs_search(pd, pname, namelen, cnp->cn_cred); if (pn != NULL) mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); /* found */ if (pn != NULL) { /* DELETE */ if (nameiop == DELETE && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) { mqnode_release(pn); return (error); } if (*vpp == dvp) { VREF(dvp); *vpp = dvp; mqnode_release(pn); return (0); } } /* allocate vnode */ error = mqfs_allocv(dvp->v_mount, vpp, pn); mqnode_release(pn); if (error == 0 && cnp->cn_flags & MAKEENTRY) cache_enter(dvp, *vpp, cnp); return (error); } /* not found */ /* will create a new entry in the directory ? */ if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT) && (flags & ISLASTCN)) { error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td); if (error) return (error); cnp->cn_flags |= SAVENAME; return (EJUSTRETURN); } return (ENOENT); } #if 0 struct vop_lookup_args { struct vop_generic_args a_gen; struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; #endif /* * vnode lookup operation */ static int mqfs_lookup(struct vop_cachedlookup_args *ap) { int rc; rc = mqfs_lookupx(ap); return (rc); } #if 0 struct vop_create_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; #endif /* * vnode creation operation */ static int mqfs_create(struct vop_create_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd; struct mqfs_node *pn; struct mqueue *mq; int error; pd = VTON(ap->a_dvp); if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); mq = mqueue_alloc(NULL); if (mq == NULL) return (EAGAIN); sx_xlock(&mqfs->mi_lock); if ((cnp->cn_flags & HASBUF) == 0) panic("%s: no name", __func__); pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_cred, ap->a_vap->va_mode); if (pn == NULL) { sx_xunlock(&mqfs->mi_lock); error = ENOSPC; } else { mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); mqnode_release(pn); if (error) mqfs_destroy(pn); else pn->mn_data = mq; } if (error) mqueue_free(mq); return (error); } /* * Remove an entry */ static int do_unlink(struct mqfs_node *pn, struct ucred *ucred) { struct mqfs_node *parent; struct mqfs_vdata *vd; int error = 0; sx_assert(&pn->mn_info->mi_lock, SX_LOCKED); if (ucred->cr_uid != pn->mn_uid && (error = priv_check_cred(ucred, PRIV_MQ_ADMIN, 0)) != 0) error = EACCES; else if (!pn->mn_deleted) { parent = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) { cache_purge(vd->mv_vnode); vhold(vd->mv_vnode); taskqueue_enqueue(taskqueue_thread, &vd->mv_task); } mqnode_release(pn); mqnode_release(parent); } else error = ENOENT; return (error); } #if 0 struct vop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * vnode removal operation */ static int mqfs_remove(struct vop_remove_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn; int error; if (ap->a_vp->v_type == VDIR) return (EPERM); pn = VTON(ap->a_vp); sx_xlock(&mqfs->mi_lock); error = do_unlink(pn, ap->a_cnp->cn_cred); sx_xunlock(&mqfs->mi_lock); return (error); } #if 0 struct vop_inactive_args { struct vnode *a_vp; struct thread *a_td; }; #endif static int mqfs_inactive(struct vop_inactive_args *ap) { struct mqfs_node *pn = VTON(ap->a_vp); if (pn->mn_deleted) vrecycle(ap->a_vp); return (0); } #if 0 struct vop_reclaim_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct thread *a_td; }; #endif static int mqfs_reclaim(struct vop_reclaim_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_vp->v_mount); struct vnode *vp = ap->a_vp; struct mqfs_node *pn; struct mqfs_vdata *vd; vd = vp->v_data; pn = vd->mv_node; sx_xlock(&mqfs->mi_lock); vp->v_data = NULL; LIST_REMOVE(vd, mv_link); uma_zfree(mvdata_zone, vd); mqnode_release(pn); sx_xunlock(&mqfs->mi_lock); return (0); } #if 0 struct vop_open_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; struct file *a_fp; }; #endif static int mqfs_open(struct vop_open_args *ap) { return (0); } #if 0 struct vop_close_args { struct vop_generic_args a_gen; struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; #endif static int mqfs_close(struct vop_close_args *ap) { return (0); } #if 0 struct vop_access_args { struct vop_generic_args a_gen; struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; }; #endif /* * Verify permissions */ static int mqfs_access(struct vop_access_args *ap) { struct vnode *vp = ap->a_vp; struct vattr vattr; int error; error = VOP_GETATTR(vp, &vattr, ap->a_cred); if (error) return (error); error = vaccess(vp->v_type, vattr.va_mode, vattr.va_uid, vattr.va_gid, ap->a_accmode, ap->a_cred, NULL); return (error); } #if 0 struct vop_getattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif /* * Get file attributes */ static int mqfs_getattr(struct vop_getattr_args *ap) { struct vnode *vp = ap->a_vp; struct mqfs_node *pn = VTON(vp); struct vattr *vap = ap->a_vap; int error = 0; vap->va_type = vp->v_type; vap->va_mode = pn->mn_mode; vap->va_nlink = 1; vap->va_uid = pn->mn_uid; vap->va_gid = pn->mn_gid; vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; vap->va_fileid = pn->mn_fileno; vap->va_size = 0; vap->va_blocksize = PAGE_SIZE; vap->va_bytes = vap->va_size = 0; vap->va_atime = pn->mn_atime; vap->va_mtime = pn->mn_mtime; vap->va_ctime = pn->mn_ctime; vap->va_birthtime = pn->mn_birth; vap->va_gen = 0; vap->va_flags = 0; vap->va_rdev = NODEV; vap->va_bytes = 0; vap->va_filerev = 0; return (error); } #if 0 struct vop_setattr_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif /* * Set attributes */ static int mqfs_setattr(struct vop_setattr_args *ap) { struct mqfs_node *pn; struct vattr *vap; struct vnode *vp; struct thread *td; int c, error; uid_t uid; gid_t gid; td = curthread; vap = ap->a_vap; vp = ap->a_vp; if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) || (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) || (vap->va_blocksize != VNOVAL) || (vap->va_flags != VNOVAL && vap->va_flags != 0) || (vap->va_rdev != VNOVAL) || ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) { return (EINVAL); } pn = VTON(vp); error = c = 0; if (vap->va_uid == (uid_t)VNOVAL) uid = pn->mn_uid; else uid = vap->va_uid; if (vap->va_gid == (gid_t)VNOVAL) gid = pn->mn_gid; else gid = vap->va_gid; if (uid != pn->mn_uid || gid != pn->mn_gid) { /* * To modify the ownership of a file, must possess VADMIN * for that file. */ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td))) return (error); /* * XXXRW: Why is there a privilege check here: shouldn't the * check in VOP_ACCESS() be enough? Also, are the group bits * below definitely right? */ if (((ap->a_cred->cr_uid != pn->mn_uid) || uid != pn->mn_uid || (gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) && (error = priv_check(td, PRIV_MQ_ADMIN)) != 0) return (error); pn->mn_uid = uid; pn->mn_gid = gid; c = 1; } if (vap->va_mode != (mode_t)VNOVAL) { if ((ap->a_cred->cr_uid != pn->mn_uid) && (error = priv_check(td, PRIV_MQ_ADMIN))) return (error); pn->mn_mode = vap->va_mode; c = 1; } if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) { /* See the comment in ufs_vnops::ufs_setattr(). */ if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)) && ((vap->va_vaflags & VA_UTIMES_NULL) == 0 || (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, td)))) return (error); if (vap->va_atime.tv_sec != VNOVAL) { pn->mn_atime = vap->va_atime; } if (vap->va_mtime.tv_sec != VNOVAL) { pn->mn_mtime = vap->va_mtime; } c = 1; } if (c) { vfs_timestamp(&pn->mn_ctime); } return (0); } #if 0 struct vop_read_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; #endif /* * Read from a file */ static int mqfs_read(struct vop_read_args *ap) { char buf[80]; struct vnode *vp = ap->a_vp; struct uio *uio = ap->a_uio; struct mqueue *mq; int len, error; if (vp->v_type != VREG) return (EINVAL); mq = VTOMQ(vp); snprintf(buf, sizeof(buf), "QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n", mq->mq_totalbytes, mq->mq_maxmsg, mq->mq_curmsgs, mq->mq_msgsize); buf[sizeof(buf)-1] = '\0'; len = strlen(buf); error = uiomove_frombuf(buf, len, uio); return (error); } #if 0 struct vop_readdir_args { struct vop_generic_args a_gen; struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; u_long **a_cookies; }; #endif /* * Return directory entries. */ static int mqfs_readdir(struct vop_readdir_args *ap) { struct vnode *vp; struct mqfs_info *mi; struct mqfs_node *pd; struct mqfs_node *pn; struct dirent entry; struct uio *uio; const void *pr_root; int *tmp_ncookies = NULL; off_t offset; int error, i; vp = ap->a_vp; mi = VFSTOMQFS(vp->v_mount); pd = VTON(vp); uio = ap->a_uio; if (vp->v_type != VDIR) return (ENOTDIR); if (uio->uio_offset < 0) return (EINVAL); if (ap->a_ncookies != NULL) { tmp_ncookies = ap->a_ncookies; *ap->a_ncookies = 0; ap->a_ncookies = NULL; } error = 0; offset = 0; pr_root = ap->a_cred->cr_prison->pr_root; sx_xlock(&mi->mi_lock); LIST_FOREACH(pn, &pd->mn_children, mn_sibling) { entry.d_reclen = sizeof(entry); /* * Only show names within the same prison root directory * (or not associated with a prison, e.g. "." and ".."). */ if (pn->mn_pr_root != NULL && pn->mn_pr_root != pr_root) continue; if (!pn->mn_fileno) mqfs_fileno_alloc(mi, pn); entry.d_fileno = pn->mn_fileno; for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i) entry.d_name[i] = pn->mn_name[i]; entry.d_name[i] = 0; entry.d_namlen = i; switch (pn->mn_type) { case mqfstype_root: case mqfstype_dir: case mqfstype_this: case mqfstype_parent: entry.d_type = DT_DIR; break; case mqfstype_file: entry.d_type = DT_REG; break; case mqfstype_symlink: entry.d_type = DT_LNK; break; default: panic("%s has unexpected node type: %d", pn->mn_name, pn->mn_type); } if (entry.d_reclen > uio->uio_resid) break; if (offset >= uio->uio_offset) { error = vfs_read_dirent(ap, &entry, offset); if (error) break; } offset += entry.d_reclen; } sx_xunlock(&mi->mi_lock); uio->uio_offset = offset; if (tmp_ncookies != NULL) ap->a_ncookies = tmp_ncookies; return (error); } #ifdef notyet #if 0 struct vop_mkdir_args { struct vnode *a_dvp; struvt vnode **a_vpp; struvt componentname *a_cnp; struct vattr *a_vap; }; #endif /* * Create a directory. */ static int mqfs_mkdir(struct vop_mkdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct componentname *cnp = ap->a_cnp; struct mqfs_node *pd = VTON(ap->a_dvp); struct mqfs_node *pn; int error; if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); if ((cnp->cn_flags & HASBUF) == 0) panic("%s: no name", __func__); pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen, ap->a_vap->cn_cred, ap->a_vap->va_mode); if (pn != NULL) mqnode_addref(pn); sx_xunlock(&mqfs->mi_lock); if (pn == NULL) { error = ENOSPC; } else { error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn); mqnode_release(pn); } return (error); } #if 0 struct vop_rmdir_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif /* * Remove a directory. */ static int mqfs_rmdir(struct vop_rmdir_args *ap) { struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount); struct mqfs_node *pn = VTON(ap->a_vp); struct mqfs_node *pt; if (pn->mn_type != mqfstype_dir) return (ENOTDIR); sx_xlock(&mqfs->mi_lock); if (pn->mn_deleted) { sx_xunlock(&mqfs->mi_lock); return (ENOENT); } pt = LIST_FIRST(&pn->mn_children); pt = LIST_NEXT(pt, mn_sibling); pt = LIST_NEXT(pt, mn_sibling); if (pt != NULL) { sx_xunlock(&mqfs->mi_lock); return (ENOTEMPTY); } pt = pn->mn_parent; pn->mn_parent = NULL; pn->mn_deleted = 1; LIST_REMOVE(pn, mn_sibling); mqnode_release(pn); mqnode_release(pt); sx_xunlock(&mqfs->mi_lock); cache_purge(ap->a_vp); return (0); } #endif /* notyet */ /* * See if this prison root is obsolete, and clean up associated queues if it is. */ static int mqfs_prison_remove(void *obj, void *data __unused) { const struct prison *pr = obj; const struct prison *tpr; struct mqfs_node *pn, *tpn; int found; found = 0; TAILQ_FOREACH(tpr, &allprison, pr_list) { if (tpr->pr_root == pr->pr_root && tpr != pr && tpr->pr_ref > 0) found = 1; } if (!found) { /* * No jails are rooted in this directory anymore, * so no queues should be either. */ sx_xlock(&mqfs_data.mi_lock); LIST_FOREACH_SAFE(pn, &mqfs_data.mi_root->mn_children, mn_sibling, tpn) { if (pn->mn_pr_root == pr->pr_root) (void)do_unlink(pn, curthread->td_ucred); } sx_xunlock(&mqfs_data.mi_lock); } return (0); } /* * Allocate a message queue */ static struct mqueue * mqueue_alloc(const struct mq_attr *attr) { struct mqueue *mq; if (curmq >= maxmq) return (NULL); mq = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO); TAILQ_INIT(&mq->mq_msgq); if (attr != NULL) { mq->mq_maxmsg = attr->mq_maxmsg; mq->mq_msgsize = attr->mq_msgsize; } else { mq->mq_maxmsg = default_maxmsg; mq->mq_msgsize = default_msgsize; } mtx_init(&mq->mq_mutex, "mqueue lock", NULL, MTX_DEF); knlist_init_mtx(&mq->mq_rsel.si_note, &mq->mq_mutex); knlist_init_mtx(&mq->mq_wsel.si_note, &mq->mq_mutex); atomic_add_int(&curmq, 1); return (mq); } /* * Destroy a message queue */ static void mqueue_free(struct mqueue *mq) { struct mqueue_msg *msg; while ((msg = TAILQ_FIRST(&mq->mq_msgq)) != NULL) { TAILQ_REMOVE(&mq->mq_msgq, msg, msg_link); free(msg, M_MQUEUEDATA); } mtx_destroy(&mq->mq_mutex); seldrain(&mq->mq_rsel); seldrain(&mq->mq_wsel); knlist_destroy(&mq->mq_rsel.si_note); knlist_destroy(&mq->mq_wsel.si_note); uma_zfree(mqueue_zone, mq); atomic_add_int(&curmq, -1); } /* * Load a message from user space */ static struct mqueue_msg * mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio) { struct mqueue_msg *msg; size_t len; int error; len = sizeof(struct mqueue_msg) + msg_size; msg = malloc(len, M_MQUEUEDATA, M_WAITOK); error = copyin(msg_ptr, ((char *)msg) + sizeof(struct mqueue_msg), msg_size); if (error) { free(msg, M_MQUEUEDATA); msg = NULL; } else { msg->msg_size = msg_size; msg->msg_prio = msg_prio; } return (msg); } /* * Save a message to user space */ static int mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio) { int error; error = copyout(((char *)msg) + sizeof(*msg), msg_ptr, msg->msg_size); if (error == 0 && msg_prio != NULL) error = copyout(&msg->msg_prio, msg_prio, sizeof(int)); return (error); } /* * Free a message's memory */ static __inline void mqueue_freemsg(struct mqueue_msg *msg) { free(msg, M_MQUEUEDATA); } /* * Send a message. if waitok is false, thread will not be * blocked if there is no data in queue, otherwise, absolute * time will be checked. */ int mqueue_send(struct mqueue *mq, const char *msg_ptr, size_t msg_len, unsigned msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ts, ts2; struct timeval tv; int error; if (msg_prio >= MQ_PRIO_MAX) return (EINVAL); if (msg_len > mq->mq_msgsize) return (EMSGSIZE); msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio); if (msg == NULL) return (EFAULT); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_send(mq, msg, -1); if (error) goto bad; return (0); } /* we allow a null timeout (wait forever) */ if (abs_timeout == NULL) { error = _mqueue_send(mq, msg, 0); if (error) goto bad; return (0); } /* send it before checking time */ error = _mqueue_send(mq, msg, -1); if (error == 0) return (0); if (error != EAGAIN) goto bad; if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) { error = EINVAL; goto bad; } for (;;) { - ts2 = *abs_timeout; getnanotime(&ts); - timespecsub(&ts2, &ts); + timespecsub(abs_timeout, &ts, &ts2); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; break; } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_send(mq, msg, tvtohz(&tv)); if (error != ETIMEDOUT) break; } if (error == 0) return (0); bad: mqueue_freemsg(msg); return (error); } /* * Common routine to send a message */ static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo) { struct mqueue_msg *msg2; int error = 0; mtx_lock(&mq->mq_mutex); while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_senders++; error = msleep(&mq->mq_senders, &mq->mq_mutex, PCATCH, "mqsend", timo); mq->mq_senders--; if (error == EAGAIN) error = ETIMEDOUT; } if (mq->mq_curmsgs >= mq->mq_maxmsg) { mtx_unlock(&mq->mq_mutex); return (error); } error = 0; if (TAILQ_EMPTY(&mq->mq_msgq)) { TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link); } else { if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) { TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link); } else { TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) { if (msg2->msg_prio < msg->msg_prio) break; } TAILQ_INSERT_BEFORE(msg2, msg, msg_link); } } mq->mq_curmsgs++; mq->mq_totalbytes += msg->msg_size; if (mq->mq_receivers) wakeup_one(&mq->mq_receivers); else if (mq->mq_notifier != NULL) mqueue_send_notification(mq); if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } KNOTE_LOCKED(&mq->mq_rsel.si_note, 0); mtx_unlock(&mq->mq_mutex); return (0); } /* * Send realtime a signal to process which registered itself * successfully by mq_notify. */ static void mqueue_send_notification(struct mqueue *mq) { struct mqueue_notifier *nt; struct thread *td; struct proc *p; int error; mtx_assert(&mq->mq_mutex, MA_OWNED); nt = mq->mq_notifier; if (nt->nt_sigev.sigev_notify != SIGEV_NONE) { p = nt->nt_proc; error = sigev_findtd(p, &nt->nt_sigev, &td); if (error) { mq->mq_notifier = NULL; return; } if (!KSI_ONQ(&nt->nt_ksi)) { ksiginfo_set_sigev(&nt->nt_ksi, &nt->nt_sigev); tdsendsignal(p, td, nt->nt_ksi.ksi_signo, &nt->nt_ksi); } PROC_UNLOCK(p); } mq->mq_notifier = NULL; } /* * Get a message. if waitok is false, thread will not be * blocked if there is no data in queue, otherwise, absolute * time will be checked. */ int mqueue_receive(struct mqueue *mq, char *msg_ptr, size_t msg_len, unsigned *msg_prio, int waitok, const struct timespec *abs_timeout) { struct mqueue_msg *msg; struct timespec ts, ts2; struct timeval tv; int error; if (msg_len < mq->mq_msgsize) return (EMSGSIZE); /* O_NONBLOCK case */ if (!waitok) { error = _mqueue_recv(mq, &msg, -1); if (error) return (error); goto received; } /* we allow a null timeout (wait forever). */ if (abs_timeout == NULL) { error = _mqueue_recv(mq, &msg, 0); if (error) return (error); goto received; } /* try to get a message before checking time */ error = _mqueue_recv(mq, &msg, -1); if (error == 0) goto received; if (error != EAGAIN) return (error); if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) { error = EINVAL; return (error); } for (;;) { - ts2 = *abs_timeout; getnanotime(&ts); - timespecsub(&ts2, &ts); + timespecsub(abs_timeout, &ts, &ts2); if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) { error = ETIMEDOUT; return (error); } TIMESPEC_TO_TIMEVAL(&tv, &ts2); error = _mqueue_recv(mq, &msg, tvtohz(&tv)); if (error == 0) break; if (error != ETIMEDOUT) return (error); } received: error = mqueue_savemsg(msg, msg_ptr, msg_prio); if (error == 0) { curthread->td_retval[0] = msg->msg_size; curthread->td_retval[1] = 0; } mqueue_freemsg(msg); return (error); } /* * Common routine to receive a message */ static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo) { int error = 0; mtx_lock(&mq->mq_mutex); while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) { if (timo < 0) { mtx_unlock(&mq->mq_mutex); return (EAGAIN); } mq->mq_receivers++; error = msleep(&mq->mq_receivers, &mq->mq_mutex, PCATCH, "mqrecv", timo); mq->mq_receivers--; if (error == EAGAIN) error = ETIMEDOUT; } if (*msg != NULL) { error = 0; TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link); mq->mq_curmsgs--; mq->mq_totalbytes -= (*msg)->msg_size; if (mq->mq_senders) wakeup_one(&mq->mq_senders); if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } KNOTE_LOCKED(&mq->mq_wsel.si_note, 0); } if (mq->mq_notifier != NULL && mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) { mqueue_send_notification(mq); } mtx_unlock(&mq->mq_mutex); return (error); } static __inline struct mqueue_notifier * notifier_alloc(void) { return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO)); } static __inline void notifier_free(struct mqueue_notifier *p) { uma_zfree(mqnoti_zone, p); } static struct mqueue_notifier * notifier_search(struct proc *p, int fd) { struct mqueue_notifier *nt; LIST_FOREACH(nt, &p->p_mqnotifier, nt_link) { if (nt->nt_ksi.ksi_mqd == fd) break; } return (nt); } static __inline void notifier_insert(struct proc *p, struct mqueue_notifier *nt) { LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link); } static __inline void notifier_delete(struct proc *p, struct mqueue_notifier *nt) { LIST_REMOVE(nt, nt_link); notifier_free(nt); } static void notifier_remove(struct proc *p, struct mqueue *mq, int fd) { struct mqueue_notifier *nt; mtx_assert(&mq->mq_mutex, MA_OWNED); PROC_LOCK(p); nt = notifier_search(p, fd); if (nt != NULL) { if (mq->mq_notifier == nt) mq->mq_notifier = NULL; sigqueue_take(&nt->nt_ksi); notifier_delete(p, nt); } PROC_UNLOCK(p); } static int kern_kmq_open(struct thread *td, const char *upath, int flags, mode_t mode, const struct mq_attr *attr) { char path[MQFS_NAMELEN + 1]; struct mqfs_node *pn; struct filedesc *fdp; struct file *fp; struct mqueue *mq; int fd, error, len, cmode; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); fdp = td->td_proc->p_fd; cmode = (((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT); mq = NULL; if ((flags & O_CREAT) != 0 && attr != NULL) { if (attr->mq_maxmsg <= 0 || attr->mq_maxmsg > maxmsg) return (EINVAL); if (attr->mq_msgsize <= 0 || attr->mq_msgsize > maxmsgsize) return (EINVAL); } error = copyinstr(upath, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); /* * The first character of name must be a slash (/) character * and the remaining characters of name cannot include any slash * characters. */ len = strlen(path); if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL) return (EINVAL); AUDIT_ARG_UPATH1_CANON(path); error = falloc(td, &fp, &fd, O_CLOEXEC); if (error) return (error); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred); if (pn == NULL) { if (!(flags & O_CREAT)) { error = ENOENT; } else { mq = mqueue_alloc(attr); if (mq == NULL) { error = ENFILE; } else { pn = mqfs_create_file(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred, cmode); if (pn == NULL) { error = ENOSPC; mqueue_free(mq); } } } if (error == 0) { pn->mn_data = mq; } } else { if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) { error = EEXIST; } else { accmode_t accmode = 0; if (flags & FREAD) accmode |= VREAD; if (flags & FWRITE) accmode |= VWRITE; error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, accmode, td->td_ucred, NULL); } } if (error) { sx_xunlock(&mqfs_data.mi_lock); fdclose(td, fp, fd); fdrop(fp, td); return (error); } mqnode_addref(pn); sx_xunlock(&mqfs_data.mi_lock); finit(fp, flags & (FREAD | FWRITE | O_NONBLOCK), DTYPE_MQUEUE, pn, &mqueueops); td->td_retval[0] = fd; fdrop(fp, td); return (0); } /* * Syscall to open a message queue. */ int sys_kmq_open(struct thread *td, struct kmq_open_args *uap) { struct mq_attr attr; int flags, error; if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC) return (EINVAL); flags = FFLAGS(uap->flags); if ((flags & O_CREAT) != 0 && uap->attr != NULL) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error) return (error); } return (kern_kmq_open(td, uap->path, flags, uap->mode, uap->attr != NULL ? &attr : NULL)); } /* * Syscall to unlink a message queue. */ int sys_kmq_unlink(struct thread *td, struct kmq_unlink_args *uap) { char path[MQFS_NAMELEN+1]; struct mqfs_node *pn; int error, len; error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL); if (error) return (error); len = strlen(path); if (len < 2 || path[0] != '/' || strchr(path + 1, '/') != NULL) return (EINVAL); AUDIT_ARG_UPATH1_CANON(path); sx_xlock(&mqfs_data.mi_lock); pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1, td->td_ucred); if (pn != NULL) error = do_unlink(pn, td->td_ucred); else error = ENOENT; sx_xunlock(&mqfs_data.mi_lock); return (error); } typedef int (*_fgetf)(struct thread *, int, cap_rights_t *, struct file **); /* * Get message queue by giving file slot */ static int _getmq(struct thread *td, int fd, cap_rights_t *rightsp, _fgetf func, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { struct mqfs_node *pn; int error; error = func(td, fd, rightsp, fpp); if (error) return (error); if (&mqueueops != (*fpp)->f_ops) { fdrop(*fpp, td); return (EBADF); } pn = (*fpp)->f_data; if (ppn) *ppn = pn; if (pmq) *pmq = pn->mn_data; return (0); } static __inline int getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, &cap_event_rights, fget, fpp, ppn, pmq); } static __inline int getmq_read(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, &cap_read_rights, fget_read, fpp, ppn, pmq); } static __inline int getmq_write(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq) { return _getmq(td, fd, &cap_write_rights, fget_write, fpp, ppn, pmq); } static int kern_kmq_setattr(struct thread *td, int mqd, const struct mq_attr *attr, struct mq_attr *oattr) { struct mqueue *mq; struct file *fp; u_int oflag, flag; int error; AUDIT_ARG_FD(mqd); if (attr != NULL && (attr->mq_flags & ~O_NONBLOCK) != 0) return (EINVAL); error = getmq(td, mqd, &fp, NULL, &mq); if (error) return (error); oattr->mq_maxmsg = mq->mq_maxmsg; oattr->mq_msgsize = mq->mq_msgsize; oattr->mq_curmsgs = mq->mq_curmsgs; if (attr != NULL) { do { oflag = flag = fp->f_flag; flag &= ~O_NONBLOCK; flag |= (attr->mq_flags & O_NONBLOCK); } while (atomic_cmpset_int(&fp->f_flag, oflag, flag) == 0); } else oflag = fp->f_flag; oattr->mq_flags = (O_NONBLOCK & oflag); fdrop(fp, td); return (error); } int sys_kmq_setattr(struct thread *td, struct kmq_setattr_args *uap) { struct mq_attr attr, oattr; int error; if (uap->attr != NULL) { error = copyin(uap->attr, &attr, sizeof(attr)); if (error != 0) return (error); } error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL, &oattr); if (error == 0 && uap->oattr != NULL) { bzero(oattr.__reserved, sizeof(oattr.__reserved)); error = copyout(&oattr, uap->oattr, sizeof(oattr)); } return (error); } int sys_kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap) { struct mqueue *mq; struct file *fp; struct timespec *abs_timeout, ets; int error; int waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_read(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets, sizeof(ets)); if (error != 0) return (error); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); fdrop(fp, td); return (error); } int sys_kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap) { struct mqueue *mq; struct file *fp; struct timespec *abs_timeout, ets; int error, waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_write(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets, sizeof(ets)); if (error != 0) return (error); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_send(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); fdrop(fp, td); return (error); } static int kern_kmq_notify(struct thread *td, int mqd, struct sigevent *sigev) { struct filedesc *fdp; struct proc *p; struct mqueue *mq; struct file *fp, *fp2; struct mqueue_notifier *nt, *newnt = NULL; int error; AUDIT_ARG_FD(mqd); if (sigev != NULL) { if (sigev->sigev_notify != SIGEV_SIGNAL && sigev->sigev_notify != SIGEV_THREAD_ID && sigev->sigev_notify != SIGEV_NONE) return (EINVAL); if ((sigev->sigev_notify == SIGEV_SIGNAL || sigev->sigev_notify == SIGEV_THREAD_ID) && !_SIG_VALID(sigev->sigev_signo)) return (EINVAL); } p = td->td_proc; fdp = td->td_proc->p_fd; error = getmq(td, mqd, &fp, NULL, &mq); if (error) return (error); again: FILEDESC_SLOCK(fdp); fp2 = fget_locked(fdp, mqd); if (fp2 == NULL) { FILEDESC_SUNLOCK(fdp); error = EBADF; goto out; } #ifdef CAPABILITIES error = cap_check(cap_rights(fdp, mqd), &cap_event_rights); if (error) { FILEDESC_SUNLOCK(fdp); goto out; } #endif if (fp2 != fp) { FILEDESC_SUNLOCK(fdp); error = EBADF; goto out; } mtx_lock(&mq->mq_mutex); FILEDESC_SUNLOCK(fdp); if (sigev != NULL) { if (mq->mq_notifier != NULL) { error = EBUSY; } else { PROC_LOCK(p); nt = notifier_search(p, mqd); if (nt == NULL) { if (newnt == NULL) { PROC_UNLOCK(p); mtx_unlock(&mq->mq_mutex); newnt = notifier_alloc(); goto again; } } if (nt != NULL) { sigqueue_take(&nt->nt_ksi); if (newnt != NULL) { notifier_free(newnt); newnt = NULL; } } else { nt = newnt; newnt = NULL; ksiginfo_init(&nt->nt_ksi); nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT; nt->nt_ksi.ksi_code = SI_MESGQ; nt->nt_proc = p; nt->nt_ksi.ksi_mqd = mqd; notifier_insert(p, nt); } nt->nt_sigev = *sigev; mq->mq_notifier = nt; PROC_UNLOCK(p); /* * if there is no receivers and message queue * is not empty, we should send notification * as soon as possible. */ if (mq->mq_receivers == 0 && !TAILQ_EMPTY(&mq->mq_msgq)) mqueue_send_notification(mq); } } else { notifier_remove(p, mq, mqd); } mtx_unlock(&mq->mq_mutex); out: fdrop(fp, td); if (newnt != NULL) notifier_free(newnt); return (error); } int sys_kmq_notify(struct thread *td, struct kmq_notify_args *uap) { struct sigevent ev, *evp; int error; if (uap->sigev == NULL) { evp = NULL; } else { error = copyin(uap->sigev, &ev, sizeof(ev)); if (error != 0) return (error); evp = &ev; } return (kern_kmq_notify(td, uap->mqd, evp)); } static void mqueue_fdclose(struct thread *td, int fd, struct file *fp) { struct mqueue *mq; #ifdef INVARIANTS struct filedesc *fdp; fdp = td->td_proc->p_fd; FILEDESC_LOCK_ASSERT(fdp); #endif if (fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(td->td_proc, mq, fd); /* have to wakeup thread in same process */ if (mq->mq_flags & MQ_RSEL) { mq->mq_flags &= ~MQ_RSEL; selwakeup(&mq->mq_rsel); } if (mq->mq_flags & MQ_WSEL) { mq->mq_flags &= ~MQ_WSEL; selwakeup(&mq->mq_wsel); } mtx_unlock(&mq->mq_mutex); } } static void mq_proc_exit(void *arg __unused, struct proc *p) { struct filedesc *fdp; struct file *fp; struct mqueue *mq; int i; fdp = p->p_fd; FILEDESC_SLOCK(fdp); for (i = 0; i < fdp->fd_nfiles; ++i) { fp = fget_locked(fdp, i); if (fp != NULL && fp->f_ops == &mqueueops) { mq = FPTOMQ(fp); mtx_lock(&mq->mq_mutex); notifier_remove(p, FPTOMQ(fp), i); mtx_unlock(&mq->mq_mutex); } } FILEDESC_SUNLOCK(fdp); KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left")); } static int mqf_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct mqueue *mq = FPTOMQ(fp); int revents = 0; mtx_lock(&mq->mq_mutex); if (events & (POLLIN | POLLRDNORM)) { if (mq->mq_curmsgs) { revents |= events & (POLLIN | POLLRDNORM); } else { mq->mq_flags |= MQ_RSEL; selrecord(td, &mq->mq_rsel); } } if (events & POLLOUT) { if (mq->mq_curmsgs < mq->mq_maxmsg) revents |= POLLOUT; else { mq->mq_flags |= MQ_WSEL; selrecord(td, &mq->mq_wsel); } } mtx_unlock(&mq->mq_mutex); return (revents); } static int mqf_close(struct file *fp, struct thread *td) { struct mqfs_node *pn; fp->f_ops = &badfileops; pn = fp->f_data; fp->f_data = NULL; sx_xlock(&mqfs_data.mi_lock); mqnode_release(pn); sx_xunlock(&mqfs_data.mi_lock); return (0); } static int mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn = fp->f_data; bzero(st, sizeof *st); sx_xlock(&mqfs_data.mi_lock); st->st_atim = pn->mn_atime; st->st_mtim = pn->mn_mtime; st->st_ctim = pn->mn_ctime; st->st_birthtim = pn->mn_birth; st->st_uid = pn->mn_uid; st->st_gid = pn->mn_gid; st->st_mode = S_IFIFO | pn->mn_mode; sx_xunlock(&mqfs_data.mi_lock); return (0); } static int mqf_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn; int error; error = 0; pn = fp->f_data; sx_xlock(&mqfs_data.mi_lock); error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, VADMIN, active_cred, NULL); if (error != 0) goto out; pn->mn_mode = mode & ACCESSPERMS; out: sx_xunlock(&mqfs_data.mi_lock); return (error); } static int mqf_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct mqfs_node *pn; int error; error = 0; pn = fp->f_data; sx_xlock(&mqfs_data.mi_lock); if (uid == (uid_t)-1) uid = pn->mn_uid; if (gid == (gid_t)-1) gid = pn->mn_gid; if (((uid != pn->mn_uid && uid != active_cred->cr_uid) || (gid != pn->mn_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0))) goto out; pn->mn_uid = uid; pn->mn_gid = gid; out: sx_xunlock(&mqfs_data.mi_lock); return (error); } static int mqf_kqfilter(struct file *fp, struct knote *kn) { struct mqueue *mq = FPTOMQ(fp); int error = 0; if (kn->kn_filter == EVFILT_READ) { kn->kn_fop = &mq_rfiltops; knlist_add(&mq->mq_rsel.si_note, kn, 0); } else if (kn->kn_filter == EVFILT_WRITE) { kn->kn_fop = &mq_wfiltops; knlist_add(&mq->mq_wsel.si_note, kn, 0); } else error = EINVAL; return (error); } static void filt_mqdetach(struct knote *kn) { struct mqueue *mq = FPTOMQ(kn->kn_fp); if (kn->kn_filter == EVFILT_READ) knlist_remove(&mq->mq_rsel.si_note, kn, 0); else if (kn->kn_filter == EVFILT_WRITE) knlist_remove(&mq->mq_wsel.si_note, kn, 0); else panic("filt_mqdetach"); } static int filt_mqread(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs != 0); } static int filt_mqwrite(struct knote *kn, long hint) { struct mqueue *mq = FPTOMQ(kn->kn_fp); mtx_assert(&mq->mq_mutex, MA_OWNED); return (mq->mq_curmsgs < mq->mq_maxmsg); } static int mqf_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { kif->kf_type = KF_TYPE_MQUEUE; return (0); } static struct fileops mqueueops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = mqf_poll, .fo_kqfilter = mqf_kqfilter, .fo_stat = mqf_stat, .fo_close = mqf_close, .fo_chmod = mqf_chmod, .fo_chown = mqf_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = mqf_fill_kinfo, }; static struct vop_vector mqfs_vnodeops = { .vop_default = &default_vnodeops, .vop_access = mqfs_access, .vop_cachedlookup = mqfs_lookup, .vop_lookup = vfs_cache_lookup, .vop_reclaim = mqfs_reclaim, .vop_create = mqfs_create, .vop_remove = mqfs_remove, .vop_inactive = mqfs_inactive, .vop_open = mqfs_open, .vop_close = mqfs_close, .vop_getattr = mqfs_getattr, .vop_setattr = mqfs_setattr, .vop_read = mqfs_read, .vop_write = VOP_EOPNOTSUPP, .vop_readdir = mqfs_readdir, .vop_mkdir = VOP_EOPNOTSUPP, .vop_rmdir = VOP_EOPNOTSUPP }; static struct vfsops mqfs_vfsops = { .vfs_init = mqfs_init, .vfs_uninit = mqfs_uninit, .vfs_mount = mqfs_mount, .vfs_unmount = mqfs_unmount, .vfs_root = mqfs_root, .vfs_statfs = mqfs_statfs, }; static struct vfsconf mqueuefs_vfsconf = { .vfc_version = VFS_VERSION, .vfc_name = "mqueuefs", .vfc_vfsops = &mqfs_vfsops, .vfc_typenum = -1, .vfc_flags = VFCF_SYNTHETIC }; static struct syscall_helper_data mq_syscalls[] = { SYSCALL_INIT_HELPER(kmq_open), SYSCALL_INIT_HELPER_F(kmq_setattr, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_timedsend, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_timedreceive, SYF_CAPENABLED), SYSCALL_INIT_HELPER_F(kmq_notify, SYF_CAPENABLED), SYSCALL_INIT_HELPER(kmq_unlink), SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include static void mq_attr_from32(const struct mq_attr32 *from, struct mq_attr *to) { to->mq_flags = from->mq_flags; to->mq_maxmsg = from->mq_maxmsg; to->mq_msgsize = from->mq_msgsize; to->mq_curmsgs = from->mq_curmsgs; } static void mq_attr_to32(const struct mq_attr *from, struct mq_attr32 *to) { to->mq_flags = from->mq_flags; to->mq_maxmsg = from->mq_maxmsg; to->mq_msgsize = from->mq_msgsize; to->mq_curmsgs = from->mq_curmsgs; } int freebsd32_kmq_open(struct thread *td, struct freebsd32_kmq_open_args *uap) { struct mq_attr attr; struct mq_attr32 attr32; int flags, error; if ((uap->flags & O_ACCMODE) == O_ACCMODE || uap->flags & O_EXEC) return (EINVAL); flags = FFLAGS(uap->flags); if ((flags & O_CREAT) != 0 && uap->attr != NULL) { error = copyin(uap->attr, &attr32, sizeof(attr32)); if (error) return (error); mq_attr_from32(&attr32, &attr); } return (kern_kmq_open(td, uap->path, flags, uap->mode, uap->attr != NULL ? &attr : NULL)); } int freebsd32_kmq_setattr(struct thread *td, struct freebsd32_kmq_setattr_args *uap) { struct mq_attr attr, oattr; struct mq_attr32 attr32, oattr32; int error; if (uap->attr != NULL) { error = copyin(uap->attr, &attr32, sizeof(attr32)); if (error != 0) return (error); mq_attr_from32(&attr32, &attr); } error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL, &oattr); if (error == 0 && uap->oattr != NULL) { mq_attr_to32(&oattr, &oattr32); bzero(oattr32.__reserved, sizeof(oattr32.__reserved)); error = copyout(&oattr32, uap->oattr, sizeof(oattr32)); } return (error); } int freebsd32_kmq_timedsend(struct thread *td, struct freebsd32_kmq_timedsend_args *uap) { struct mqueue *mq; struct file *fp; struct timespec32 ets32; struct timespec *abs_timeout, ets; int error; int waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_write(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets32, sizeof(ets32)); if (error != 0) return (error); CP(ets32, ets, tv_sec); CP(ets32, ets, tv_nsec); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_send(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); fdrop(fp, td); return (error); } int freebsd32_kmq_timedreceive(struct thread *td, struct freebsd32_kmq_timedreceive_args *uap) { struct mqueue *mq; struct file *fp; struct timespec32 ets32; struct timespec *abs_timeout, ets; int error, waitok; AUDIT_ARG_FD(uap->mqd); error = getmq_read(td, uap->mqd, &fp, NULL, &mq); if (error) return (error); if (uap->abs_timeout != NULL) { error = copyin(uap->abs_timeout, &ets32, sizeof(ets32)); if (error != 0) return (error); CP(ets32, ets, tv_sec); CP(ets32, ets, tv_nsec); abs_timeout = &ets; } else abs_timeout = NULL; waitok = !(fp->f_flag & O_NONBLOCK); error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len, uap->msg_prio, waitok, abs_timeout); fdrop(fp, td); return (error); } int freebsd32_kmq_notify(struct thread *td, struct freebsd32_kmq_notify_args *uap) { struct sigevent ev, *evp; struct sigevent32 ev32; int error; if (uap->sigev == NULL) { evp = NULL; } else { error = copyin(uap->sigev, &ev32, sizeof(ev32)); if (error != 0) return (error); error = convert_sigevent32(&ev32, &ev); if (error != 0) return (error); evp = &ev; } return (kern_kmq_notify(td, uap->mqd, evp)); } static struct syscall_helper_data mq32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_kmq_open), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_setattr, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedsend, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_timedreceive, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_F(freebsd32_kmq_notify, SYF_CAPENABLED), SYSCALL32_INIT_HELPER_COMPAT(kmq_unlink), SYSCALL_INIT_LAST }; #endif static int mqinit(void) { int error; error = syscall_helper_register(mq_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(mq32_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #endif return (0); } static int mqunload(void) { #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(mq32_syscalls); #endif syscall_helper_unregister(mq_syscalls); return (0); } static int mq_modload(struct module *module, int cmd, void *arg) { int error = 0; error = vfs_modevent(module, cmd, arg); if (error != 0) return (error); switch (cmd) { case MOD_LOAD: error = mqinit(); if (error != 0) mqunload(); break; case MOD_UNLOAD: error = mqunload(); break; default: break; } return (error); } static moduledata_t mqueuefs_mod = { "mqueuefs", mq_modload, &mqueuefs_vfsconf }; DECLARE_MODULE(mqueuefs, mqueuefs_mod, SI_SUB_VFS, SI_ORDER_MIDDLE); MODULE_VERSION(mqueuefs, 1); Index: head/sys/kern/uipc_sem.c =================================================================== --- head/sys/kern/uipc_sem.c (revision 336913) +++ head/sys/kern/uipc_sem.c (revision 336914) @@ -1,1110 +1,1110 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2002 Alfred Perlstein * Copyright (c) 2003-2005 SPARTA, Inc. * Copyright (c) 2005, 2016-2017 Robert N. M. Watson * All rights reserved. * * This software was developed for the FreeBSD Project in part by Network * Associates Laboratories, the Security Research Division of Network * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), * as part of the DARPA CHATS research program. * * Portions of this software were developed by BAE Systems, the University of * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent * Computing (TC) research program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_posix.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(p1003_1b_semaphores, "POSIX P1003.1B semaphores support"); /* * TODO * * - Resource limits? * - Replace global sem_lock with mtx_pool locks? * - Add a MAC check_create() hook for creating new named semaphores. */ #ifndef SEM_MAX #define SEM_MAX 30 #endif #ifdef SEM_DEBUG #define DP(x) printf x #else #define DP(x) #endif struct ksem_mapping { char *km_path; Fnv32_t km_fnv; struct ksem *km_ksem; LIST_ENTRY(ksem_mapping) km_link; }; static MALLOC_DEFINE(M_KSEM, "ksem", "semaphore file descriptor"); static LIST_HEAD(, ksem_mapping) *ksem_dictionary; static struct sx ksem_dict_lock; static struct mtx ksem_count_lock; static struct mtx sem_lock; static u_long ksem_hash; static int ksem_dead; #define KSEM_HASH(fnv) (&ksem_dictionary[(fnv) & ksem_hash]) static int nsems = 0; SYSCTL_DECL(_p1003_1b); SYSCTL_INT(_p1003_1b, OID_AUTO, nsems, CTLFLAG_RD, &nsems, 0, "Number of active kernel POSIX semaphores"); static int kern_sem_wait(struct thread *td, semid_t id, int tryflag, struct timespec *abstime); static int ksem_access(struct ksem *ks, struct ucred *ucred); static struct ksem *ksem_alloc(struct ucred *ucred, mode_t mode, unsigned int value); static int ksem_create(struct thread *td, const char *path, semid_t *semidp, mode_t mode, unsigned int value, int flags, int compat32); static void ksem_drop(struct ksem *ks); static int ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp, struct file **fpp); static struct ksem *ksem_hold(struct ksem *ks); static void ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks); static struct ksem *ksem_lookup(char *path, Fnv32_t fnv); static void ksem_module_destroy(void); static int ksem_module_init(void); static int ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred); static int sem_modload(struct module *module, int cmd, void *arg); static fo_stat_t ksem_stat; static fo_close_t ksem_closef; static fo_chmod_t ksem_chmod; static fo_chown_t ksem_chown; static fo_fill_kinfo_t ksem_fill_kinfo; /* File descriptor operations. */ static struct fileops ksem_ops = { .fo_read = invfo_rdwr, .fo_write = invfo_rdwr, .fo_truncate = invfo_truncate, .fo_ioctl = invfo_ioctl, .fo_poll = invfo_poll, .fo_kqfilter = invfo_kqfilter, .fo_stat = ksem_stat, .fo_close = ksem_closef, .fo_chmod = ksem_chmod, .fo_chown = ksem_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = ksem_fill_kinfo, .fo_flags = DFLAG_PASSABLE }; FEATURE(posix_sem, "POSIX semaphores"); static int ksem_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td) { struct ksem *ks; #ifdef MAC int error; #endif ks = fp->f_data; #ifdef MAC error = mac_posixsem_check_stat(active_cred, fp->f_cred, ks); if (error) return (error); #endif /* * Attempt to return sanish values for fstat() on a semaphore * file descriptor. */ bzero(sb, sizeof(*sb)); mtx_lock(&sem_lock); sb->st_atim = ks->ks_atime; sb->st_ctim = ks->ks_ctime; sb->st_mtim = ks->ks_mtime; sb->st_birthtim = ks->ks_birthtime; sb->st_uid = ks->ks_uid; sb->st_gid = ks->ks_gid; sb->st_mode = S_IFREG | ks->ks_mode; /* XXX */ mtx_unlock(&sem_lock); return (0); } static int ksem_chmod(struct file *fp, mode_t mode, struct ucred *active_cred, struct thread *td) { struct ksem *ks; int error; error = 0; ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_setmode(active_cred, ks, mode); if (error != 0) goto out; #endif error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid, VADMIN, active_cred, NULL); if (error != 0) goto out; ks->ks_mode = mode & ACCESSPERMS; out: mtx_unlock(&sem_lock); return (error); } static int ksem_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred, struct thread *td) { struct ksem *ks; int error; error = 0; ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_setowner(active_cred, ks, uid, gid); if (error != 0) goto out; #endif if (uid == (uid_t)-1) uid = ks->ks_uid; if (gid == (gid_t)-1) gid = ks->ks_gid; if (((uid != ks->ks_uid && uid != active_cred->cr_uid) || (gid != ks->ks_gid && !groupmember(gid, active_cred))) && (error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0))) goto out; ks->ks_uid = uid; ks->ks_gid = gid; out: mtx_unlock(&sem_lock); return (error); } static int ksem_closef(struct file *fp, struct thread *td) { struct ksem *ks; ks = fp->f_data; fp->f_data = NULL; ksem_drop(ks); return (0); } static int ksem_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { const char *path, *pr_path; struct ksem *ks; size_t pr_pathlen; kif->kf_type = KF_TYPE_SEM; ks = fp->f_data; mtx_lock(&sem_lock); kif->kf_un.kf_sem.kf_sem_value = ks->ks_value; kif->kf_un.kf_sem.kf_sem_mode = S_IFREG | ks->ks_mode; /* XXX */ mtx_unlock(&sem_lock); if (ks->ks_path != NULL) { sx_slock(&ksem_dict_lock); if (ks->ks_path != NULL) { path = ks->ks_path; pr_path = curthread->td_ucred->cr_prison->pr_path; if (strcmp(pr_path, "/") != 0) { /* Return the jail-rooted pathname. */ pr_pathlen = strlen(pr_path); if (strncmp(path, pr_path, pr_pathlen) == 0 && path[pr_pathlen] == '/') path += pr_pathlen; } strlcpy(kif->kf_path, path, sizeof(kif->kf_path)); } sx_sunlock(&ksem_dict_lock); } return (0); } /* * ksem object management including creation and reference counting * routines. */ static struct ksem * ksem_alloc(struct ucred *ucred, mode_t mode, unsigned int value) { struct ksem *ks; mtx_lock(&ksem_count_lock); if (nsems == p31b_getcfg(CTL_P1003_1B_SEM_NSEMS_MAX) || ksem_dead) { mtx_unlock(&ksem_count_lock); return (NULL); } nsems++; mtx_unlock(&ksem_count_lock); ks = malloc(sizeof(*ks), M_KSEM, M_WAITOK | M_ZERO); ks->ks_uid = ucred->cr_uid; ks->ks_gid = ucred->cr_gid; ks->ks_mode = mode; ks->ks_value = value; cv_init(&ks->ks_cv, "ksem"); vfs_timestamp(&ks->ks_birthtime); ks->ks_atime = ks->ks_mtime = ks->ks_ctime = ks->ks_birthtime; refcount_init(&ks->ks_ref, 1); #ifdef MAC mac_posixsem_init(ks); mac_posixsem_create(ucred, ks); #endif return (ks); } static struct ksem * ksem_hold(struct ksem *ks) { refcount_acquire(&ks->ks_ref); return (ks); } static void ksem_drop(struct ksem *ks) { if (refcount_release(&ks->ks_ref)) { #ifdef MAC mac_posixsem_destroy(ks); #endif cv_destroy(&ks->ks_cv); free(ks, M_KSEM); mtx_lock(&ksem_count_lock); nsems--; mtx_unlock(&ksem_count_lock); } } /* * Determine if the credentials have sufficient permissions for read * and write access. */ static int ksem_access(struct ksem *ks, struct ucred *ucred) { int error; error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid, VREAD | VWRITE, ucred, NULL); if (error) error = priv_check_cred(ucred, PRIV_SEM_WRITE, 0); return (error); } /* * Dictionary management. We maintain an in-kernel dictionary to map * paths to semaphore objects. We use the FNV hash on the path to * store the mappings in a hash table. */ static struct ksem * ksem_lookup(char *path, Fnv32_t fnv) { struct ksem_mapping *map; LIST_FOREACH(map, KSEM_HASH(fnv), km_link) { if (map->km_fnv != fnv) continue; if (strcmp(map->km_path, path) == 0) return (map->km_ksem); } return (NULL); } static void ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks) { struct ksem_mapping *map; map = malloc(sizeof(struct ksem_mapping), M_KSEM, M_WAITOK); map->km_path = path; map->km_fnv = fnv; map->km_ksem = ksem_hold(ks); ks->ks_path = path; LIST_INSERT_HEAD(KSEM_HASH(fnv), map, km_link); } static int ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred) { struct ksem_mapping *map; int error; LIST_FOREACH(map, KSEM_HASH(fnv), km_link) { if (map->km_fnv != fnv) continue; if (strcmp(map->km_path, path) == 0) { #ifdef MAC error = mac_posixsem_check_unlink(ucred, map->km_ksem); if (error) return (error); #endif error = ksem_access(map->km_ksem, ucred); if (error) return (error); map->km_ksem->ks_path = NULL; LIST_REMOVE(map, km_link); ksem_drop(map->km_ksem); free(map->km_path, M_KSEM); free(map, M_KSEM); return (0); } } return (ENOENT); } static int ksem_create_copyout_semid(struct thread *td, semid_t *semidp, int fd, int compat32) { semid_t semid; #ifdef COMPAT_FREEBSD32 int32_t semid32; #endif void *ptr; size_t ptrs; #ifdef COMPAT_FREEBSD32 if (compat32) { semid32 = fd; ptr = &semid32; ptrs = sizeof(semid32); } else { #endif semid = fd; ptr = &semid; ptrs = sizeof(semid); compat32 = 0; /* silence gcc */ #ifdef COMPAT_FREEBSD32 } #endif return (copyout(ptr, semidp, ptrs)); } /* Other helper routines. */ static int ksem_create(struct thread *td, const char *name, semid_t *semidp, mode_t mode, unsigned int value, int flags, int compat32) { struct filedesc *fdp; struct ksem *ks; struct file *fp; char *path; const char *pr_path; size_t pr_pathlen; Fnv32_t fnv; int error, fd; AUDIT_ARG_FFLAGS(flags); AUDIT_ARG_MODE(mode); AUDIT_ARG_VALUE(value); if (value > SEM_VALUE_MAX) return (EINVAL); fdp = td->td_proc->p_fd; mode = (mode & ~fdp->fd_cmask) & ACCESSPERMS; error = falloc(td, &fp, &fd, O_CLOEXEC); if (error) { if (name == NULL) error = ENOSPC; return (error); } /* * Go ahead and copyout the file descriptor now. This is a bit * premature, but it is a lot easier to handle errors as opposed * to later when we've possibly created a new semaphore, etc. */ error = ksem_create_copyout_semid(td, semidp, fd, compat32); if (error) { fdclose(td, fp, fd); fdrop(fp, td); return (error); } if (name == NULL) { /* Create an anonymous semaphore. */ ks = ksem_alloc(td->td_ucred, mode, value); if (ks == NULL) error = ENOSPC; else ks->ks_flags |= KS_ANONYMOUS; } else { path = malloc(MAXPATHLEN, M_KSEM, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; /* Construct a full pathname for jailed callers. */ pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(name, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); /* Require paths to start with a '/' character. */ if (error == 0 && path[pr_pathlen] != '/') error = EINVAL; if (error) { fdclose(td, fp, fd); fdrop(fp, td); free(path, M_KSEM); return (error); } AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&ksem_dict_lock); ks = ksem_lookup(path, fnv); if (ks == NULL) { /* Object does not exist, create it if requested. */ if (flags & O_CREAT) { ks = ksem_alloc(td->td_ucred, mode, value); if (ks == NULL) error = ENFILE; else { ksem_insert(path, fnv, ks); path = NULL; } } else error = ENOENT; } else { /* * Object already exists, obtain a new * reference if requested and permitted. */ if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) error = EEXIST; else { #ifdef MAC error = mac_posixsem_check_open(td->td_ucred, ks); if (error == 0) #endif error = ksem_access(ks, td->td_ucred); } if (error == 0) ksem_hold(ks); #ifdef INVARIANTS else ks = NULL; #endif } sx_xunlock(&ksem_dict_lock); if (path) free(path, M_KSEM); } if (error) { KASSERT(ks == NULL, ("ksem_create error with a ksem")); fdclose(td, fp, fd); fdrop(fp, td); return (error); } KASSERT(ks != NULL, ("ksem_create w/o a ksem")); finit(fp, FREAD | FWRITE, DTYPE_SEM, ks, &ksem_ops); fdrop(fp, td); return (0); } static int ksem_get(struct thread *td, semid_t id, cap_rights_t *rightsp, struct file **fpp) { struct ksem *ks; struct file *fp; int error; error = fget(td, id, rightsp, &fp); if (error) return (EINVAL); if (fp->f_type != DTYPE_SEM) { fdrop(fp, td); return (EINVAL); } ks = fp->f_data; if (ks->ks_flags & KS_DEAD) { fdrop(fp, td); return (EINVAL); } *fpp = fp; return (0); } /* System calls. */ #ifndef _SYS_SYSPROTO_H_ struct ksem_init_args { unsigned int value; semid_t *idp; }; #endif int sys_ksem_init(struct thread *td, struct ksem_init_args *uap) { return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value, 0, 0)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_open_args { char *name; int oflag; mode_t mode; unsigned int value; semid_t *idp; }; #endif int sys_ksem_open(struct thread *td, struct ksem_open_args *uap) { DP((">>> ksem_open start, pid=%d\n", (int)td->td_proc->p_pid)); if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0) return (EINVAL); return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value, uap->oflag, 0)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_unlink_args { char *name; }; #endif int sys_ksem_unlink(struct thread *td, struct ksem_unlink_args *uap) { char *path; const char *pr_path; size_t pr_pathlen; Fnv32_t fnv; int error; path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); pr_path = td->td_ucred->cr_prison->pr_path; pr_pathlen = strcmp(pr_path, "/") == 0 ? 0 : strlcpy(path, pr_path, MAXPATHLEN); error = copyinstr(uap->name, path + pr_pathlen, MAXPATHLEN - pr_pathlen, NULL); if (error) { free(path, M_TEMP); return (error); } AUDIT_ARG_UPATH1_CANON(path); fnv = fnv_32_str(path, FNV1_32_INIT); sx_xlock(&ksem_dict_lock); error = ksem_remove(path, fnv, td->td_ucred); sx_xunlock(&ksem_dict_lock); free(path, M_TEMP); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_close_args { semid_t id; }; #endif int sys_ksem_close(struct thread *td, struct ksem_close_args *uap) { struct ksem *ks; struct file *fp; int error; /* No capability rights required to close a semaphore. */ AUDIT_ARG_FD(uap->id); error = ksem_get(td, uap->id, &cap_no_rights, &fp); if (error) return (error); ks = fp->f_data; if (ks->ks_flags & KS_ANONYMOUS) { fdrop(fp, td); return (EINVAL); } error = kern_close(td, uap->id); fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_post_args { semid_t id; }; #endif int sys_ksem_post(struct thread *td, struct ksem_post_args *uap) { cap_rights_t rights; struct file *fp; struct ksem *ks; int error; AUDIT_ARG_FD(uap->id); error = ksem_get(td, uap->id, cap_rights_init(&rights, CAP_SEM_POST), &fp); if (error) return (error); ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_post(td->td_ucred, fp->f_cred, ks); if (error) goto err; #endif if (ks->ks_value == SEM_VALUE_MAX) { error = EOVERFLOW; goto err; } ++ks->ks_value; if (ks->ks_waiters > 0) cv_signal(&ks->ks_cv); error = 0; vfs_timestamp(&ks->ks_ctime); err: mtx_unlock(&sem_lock); fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_wait_args { semid_t id; }; #endif int sys_ksem_wait(struct thread *td, struct ksem_wait_args *uap) { return (kern_sem_wait(td, uap->id, 0, NULL)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_timedwait_args { semid_t id; const struct timespec *abstime; }; #endif int sys_ksem_timedwait(struct thread *td, struct ksem_timedwait_args *uap) { struct timespec abstime; struct timespec *ts; int error; /* * We allow a null timespec (wait forever). */ if (uap->abstime == NULL) ts = NULL; else { error = copyin(uap->abstime, &abstime, sizeof(abstime)); if (error != 0) return (error); if (abstime.tv_nsec >= 1000000000 || abstime.tv_nsec < 0) return (EINVAL); ts = &abstime; } return (kern_sem_wait(td, uap->id, 0, ts)); } #ifndef _SYS_SYSPROTO_H_ struct ksem_trywait_args { semid_t id; }; #endif int sys_ksem_trywait(struct thread *td, struct ksem_trywait_args *uap) { return (kern_sem_wait(td, uap->id, 1, NULL)); } static int kern_sem_wait(struct thread *td, semid_t id, int tryflag, struct timespec *abstime) { struct timespec ts1, ts2; struct timeval tv; cap_rights_t rights; struct file *fp; struct ksem *ks; int error; DP((">>> kern_sem_wait entered! pid=%d\n", (int)td->td_proc->p_pid)); AUDIT_ARG_FD(id); error = ksem_get(td, id, cap_rights_init(&rights, CAP_SEM_WAIT), &fp); if (error) return (error); ks = fp->f_data; mtx_lock(&sem_lock); DP((">>> kern_sem_wait critical section entered! pid=%d\n", (int)td->td_proc->p_pid)); #ifdef MAC error = mac_posixsem_check_wait(td->td_ucred, fp->f_cred, ks); if (error) { DP(("kern_sem_wait mac failed\n")); goto err; } #endif DP(("kern_sem_wait value = %d, tryflag %d\n", ks->ks_value, tryflag)); vfs_timestamp(&ks->ks_atime); while (ks->ks_value == 0) { ks->ks_waiters++; if (tryflag != 0) error = EAGAIN; else if (abstime == NULL) error = cv_wait_sig(&ks->ks_cv, &sem_lock); else { for (;;) { ts1 = *abstime; getnanotime(&ts2); - timespecsub(&ts1, &ts2); + timespecsub(&ts1, &ts2, &ts1); TIMESPEC_TO_TIMEVAL(&tv, &ts1); if (tv.tv_sec < 0) { error = ETIMEDOUT; break; } error = cv_timedwait_sig(&ks->ks_cv, &sem_lock, tvtohz(&tv)); if (error != EWOULDBLOCK) break; } } ks->ks_waiters--; if (error) goto err; } ks->ks_value--; DP(("kern_sem_wait value post-decrement = %d\n", ks->ks_value)); error = 0; err: mtx_unlock(&sem_lock); fdrop(fp, td); DP(("<<< kern_sem_wait leaving, pid=%d, error = %d\n", (int)td->td_proc->p_pid, error)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_getvalue_args { semid_t id; int *val; }; #endif int sys_ksem_getvalue(struct thread *td, struct ksem_getvalue_args *uap) { cap_rights_t rights; struct file *fp; struct ksem *ks; int error, val; AUDIT_ARG_FD(uap->id); error = ksem_get(td, uap->id, cap_rights_init(&rights, CAP_SEM_GETVALUE), &fp); if (error) return (error); ks = fp->f_data; mtx_lock(&sem_lock); #ifdef MAC error = mac_posixsem_check_getvalue(td->td_ucred, fp->f_cred, ks); if (error) { mtx_unlock(&sem_lock); fdrop(fp, td); return (error); } #endif val = ks->ks_value; vfs_timestamp(&ks->ks_atime); mtx_unlock(&sem_lock); fdrop(fp, td); error = copyout(&val, uap->val, sizeof(val)); return (error); } #ifndef _SYS_SYSPROTO_H_ struct ksem_destroy_args { semid_t id; }; #endif int sys_ksem_destroy(struct thread *td, struct ksem_destroy_args *uap) { struct file *fp; struct ksem *ks; int error; /* No capability rights required to close a semaphore. */ AUDIT_ARG_FD(uap->id); error = ksem_get(td, uap->id, &cap_no_rights, &fp); if (error) return (error); ks = fp->f_data; if (!(ks->ks_flags & KS_ANONYMOUS)) { fdrop(fp, td); return (EINVAL); } mtx_lock(&sem_lock); if (ks->ks_waiters != 0) { mtx_unlock(&sem_lock); error = EBUSY; goto err; } ks->ks_flags |= KS_DEAD; mtx_unlock(&sem_lock); error = kern_close(td, uap->id); err: fdrop(fp, td); return (error); } static struct syscall_helper_data ksem_syscalls[] = { SYSCALL_INIT_HELPER(ksem_init), SYSCALL_INIT_HELPER(ksem_open), SYSCALL_INIT_HELPER(ksem_unlink), SYSCALL_INIT_HELPER(ksem_close), SYSCALL_INIT_HELPER(ksem_post), SYSCALL_INIT_HELPER(ksem_wait), SYSCALL_INIT_HELPER(ksem_timedwait), SYSCALL_INIT_HELPER(ksem_trywait), SYSCALL_INIT_HELPER(ksem_getvalue), SYSCALL_INIT_HELPER(ksem_destroy), SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include int freebsd32_ksem_init(struct thread *td, struct freebsd32_ksem_init_args *uap) { return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value, 0, 1)); } int freebsd32_ksem_open(struct thread *td, struct freebsd32_ksem_open_args *uap) { if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0) return (EINVAL); return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value, uap->oflag, 1)); } int freebsd32_ksem_timedwait(struct thread *td, struct freebsd32_ksem_timedwait_args *uap) { struct timespec32 abstime32; struct timespec *ts, abstime; int error; /* * We allow a null timespec (wait forever). */ if (uap->abstime == NULL) ts = NULL; else { error = copyin(uap->abstime, &abstime32, sizeof(abstime32)); if (error != 0) return (error); CP(abstime32, abstime, tv_sec); CP(abstime32, abstime, tv_nsec); if (abstime.tv_nsec >= 1000000000 || abstime.tv_nsec < 0) return (EINVAL); ts = &abstime; } return (kern_sem_wait(td, uap->id, 0, ts)); } static struct syscall_helper_data ksem32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_ksem_init), SYSCALL32_INIT_HELPER(freebsd32_ksem_open), SYSCALL32_INIT_HELPER_COMPAT(ksem_unlink), SYSCALL32_INIT_HELPER_COMPAT(ksem_close), SYSCALL32_INIT_HELPER_COMPAT(ksem_post), SYSCALL32_INIT_HELPER_COMPAT(ksem_wait), SYSCALL32_INIT_HELPER(freebsd32_ksem_timedwait), SYSCALL32_INIT_HELPER_COMPAT(ksem_trywait), SYSCALL32_INIT_HELPER_COMPAT(ksem_getvalue), SYSCALL32_INIT_HELPER_COMPAT(ksem_destroy), SYSCALL_INIT_LAST }; #endif static int ksem_module_init(void) { int error; mtx_init(&sem_lock, "sem", NULL, MTX_DEF); mtx_init(&ksem_count_lock, "ksem count", NULL, MTX_DEF); sx_init(&ksem_dict_lock, "ksem dictionary"); ksem_dictionary = hashinit(1024, M_KSEM, &ksem_hash); p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 200112L); p31b_setcfg(CTL_P1003_1B_SEM_NSEMS_MAX, SEM_MAX); p31b_setcfg(CTL_P1003_1B_SEM_VALUE_MAX, SEM_VALUE_MAX); error = syscall_helper_register(ksem_syscalls, SY_THR_STATIC_KLD); if (error) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(ksem32_syscalls, SY_THR_STATIC_KLD); if (error) return (error); #endif return (0); } static void ksem_module_destroy(void) { #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(ksem32_syscalls); #endif syscall_helper_unregister(ksem_syscalls); p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 0); hashdestroy(ksem_dictionary, M_KSEM, ksem_hash); sx_destroy(&ksem_dict_lock); mtx_destroy(&ksem_count_lock); mtx_destroy(&sem_lock); p31b_unsetcfg(CTL_P1003_1B_SEM_VALUE_MAX); p31b_unsetcfg(CTL_P1003_1B_SEM_NSEMS_MAX); } static int sem_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: error = ksem_module_init(); if (error) ksem_module_destroy(); break; case MOD_UNLOAD: mtx_lock(&ksem_count_lock); if (nsems != 0) { error = EOPNOTSUPP; mtx_unlock(&ksem_count_lock); break; } ksem_dead = 1; mtx_unlock(&ksem_count_lock); ksem_module_destroy(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sem_mod = { "sem", &sem_modload, NULL }; DECLARE_MODULE(sem, sem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST); MODULE_VERSION(sem, 1); Index: head/sys/mips/ingenic/jz4780_smb.c =================================================================== --- head/sys/mips/ingenic/jz4780_smb.c (revision 336913) +++ head/sys/mips/ingenic/jz4780_smb.c (revision 336914) @@ -1,480 +1,480 @@ /*- * Copyright (c) 2016 Jared McNeill * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Ingenic JZ4780 SMB Controller */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "iicbus_if.h" #define JZSMB_TIMEOUT ((300UL * hz) / 1000) #define JZSMB_SPEED_STANDARD 100000 #define JZSMB_SETUP_TIME_STANDARD 300 #define JZSMB_HOLD_TIME_STANDARD 400 #define JZSMB_PERIOD_MIN_STANDARD 4000 #define JZSMB_PERIOD_MAX_STANDARD 4700 #define JZSMB_SPEED_FAST 400000 #define JZSMB_SETUP_TIME_FAST 450 #define JZSMB_HOLD_TIME_FAST 450 #define JZSMB_PERIOD_MIN_FAST 600 #define JZSMB_PERIOD_MAX_FAST 1300 #define JZSMB_HCNT_BASE 8 #define JZSMB_HCNT_MIN 6 #define JZSMB_LCNT_BASE 1 #define JZSMB_LCNT_MIN 8 static inline int tstohz(const struct timespec *tsp) { struct timeval tv; TIMESPEC_TO_TIMEVAL(&tv, tsp); return (tvtohz(&tv)); } static struct ofw_compat_data compat_data[] = { { "ingenic,jz4780-i2c", 1 }, { NULL, 0 } }; static struct resource_spec jzsmb_spec[] = { { SYS_RES_MEMORY, 0, RF_ACTIVE }, { -1, 0 } }; struct jzsmb_softc { struct resource *res; struct mtx mtx; clk_t clk; device_t iicbus; int busy; uint32_t i2c_freq; uint64_t bus_freq; uint32_t status; struct iic_msg *msg; }; #define SMB_LOCK(sc) mtx_lock(&(sc)->mtx) #define SMB_UNLOCK(sc) mtx_unlock(&(sc)->mtx) #define SMB_ASSERT_LOCKED(sc) mtx_assert(&(sc)->mtx, MA_OWNED) #define SMB_READ(sc, reg) bus_read_2((sc)->res, (reg)) #define SMB_WRITE(sc, reg, val) bus_write_2((sc)->res, (reg), (val)) static phandle_t jzsmb_get_node(device_t bus, device_t dev) { return (ofw_bus_get_node(bus)); } static int jzsmb_enable(struct jzsmb_softc *sc, int enable) { SMB_ASSERT_LOCKED(sc); if (enable) { SMB_WRITE(sc, SMBENB, SMBENB_SMBENB); while ((SMB_READ(sc, SMBENBST) & SMBENBST_SMBEN) == 0) ; } else { SMB_WRITE(sc, SMBENB, 0); while ((SMB_READ(sc, SMBENBST) & SMBENBST_SMBEN) != 0) ; } return (0); } static int jzsmb_reset_locked(device_t dev, u_char addr) { struct jzsmb_softc *sc; uint16_t con; uint32_t period; int hcnt, lcnt, setup_time, hold_time; sc = device_get_softc(dev); SMB_ASSERT_LOCKED(sc); /* Setup master mode operation */ /* Disable SMB */ jzsmb_enable(sc, 0); /* Disable interrupts */ SMB_WRITE(sc, SMBINTM, 0); /* Set supported speed mode and expected SCL frequency */ period = sc->bus_freq / sc->i2c_freq; con = SMBCON_REST | SMBCON_SLVDIS | SMBCON_MD; switch (sc->i2c_freq) { case JZSMB_SPEED_STANDARD: con |= SMBCON_SPD_STANDARD; setup_time = JZSMB_SETUP_TIME_STANDARD; hold_time = JZSMB_HOLD_TIME_STANDARD; hcnt = (period * JZSMB_PERIOD_MIN_STANDARD) / (JZSMB_PERIOD_MAX_STANDARD + JZSMB_PERIOD_MIN_STANDARD); lcnt = period - hcnt; hcnt = MAX(hcnt - JZSMB_HCNT_BASE, JZSMB_HCNT_MIN); lcnt = MAX(lcnt - JZSMB_LCNT_BASE, JZSMB_LCNT_MIN); SMB_WRITE(sc, SMBCON, con); SMB_WRITE(sc, SMBSHCNT, hcnt); SMB_WRITE(sc, SMBSLCNT, lcnt); break; case JZSMB_SPEED_FAST: con |= SMBCON_SPD_FAST; setup_time = JZSMB_SETUP_TIME_FAST; hold_time = JZSMB_HOLD_TIME_FAST; hcnt = (period * JZSMB_PERIOD_MIN_FAST) / (JZSMB_PERIOD_MAX_FAST + JZSMB_PERIOD_MIN_FAST); lcnt = period - hcnt; hcnt = MAX(hcnt - JZSMB_HCNT_BASE, JZSMB_HCNT_MIN); lcnt = MAX(lcnt - JZSMB_LCNT_BASE, JZSMB_LCNT_MIN); SMB_WRITE(sc, SMBCON, con); SMB_WRITE(sc, SMBFHCNT, hcnt); SMB_WRITE(sc, SMBFLCNT, lcnt); break; default: return (EINVAL); } setup_time = ((setup_time * sc->bus_freq / 1000) / 1000000) + 1; setup_time = MIN(1, MAX(255, setup_time)); SMB_WRITE(sc, SMBSDASU, setup_time); hold_time = ((hold_time * sc->bus_freq / 1000) / 1000000) - 1; hold_time = MAX(255, hold_time); if (hold_time >= 0) SMB_WRITE(sc, SMBSDAHD, hold_time | SMBSDAHD_HDENB); else SMB_WRITE(sc, SMBSDAHD, 0); SMB_WRITE(sc, SMBTAR, addr >> 1); if (addr != 0) { /* Enable SMB */ jzsmb_enable(sc, 1); } return (0); } static int jzsmb_reset(device_t dev, u_char speed, u_char addr, u_char *oldaddr) { struct jzsmb_softc *sc; int error; sc = device_get_softc(dev); SMB_LOCK(sc); error = jzsmb_reset_locked(dev, addr); SMB_UNLOCK(sc); return (error); } static int jzsmb_transfer_read(device_t dev, struct iic_msg *msg) { struct jzsmb_softc *sc; struct timespec start, diff; uint16_t con, resid; int timeo; sc = device_get_softc(dev); timeo = JZSMB_TIMEOUT * msg->len; SMB_ASSERT_LOCKED(sc); con = SMB_READ(sc, SMBCON); con |= SMBCON_STPHLD; SMB_WRITE(sc, SMBCON, con); getnanouptime(&start); for (resid = msg->len; resid > 0; resid--) { for (int i = 0; i < min(resid, 8); i++) SMB_WRITE(sc, SMBDC, SMBDC_CMD); for (;;) { getnanouptime(&diff); - timespecsub(&diff, &start); + timespecsub(&diff, &start, &diff); if ((SMB_READ(sc, SMBST) & SMBST_RFNE) != 0) { msg->buf[msg->len - resid] = SMB_READ(sc, SMBDC) & SMBDC_DAT; break; } else DELAY(1000); if (tstohz(&diff) >= timeo) { device_printf(dev, "read timeout (status=0x%02x)\n", SMB_READ(sc, SMBST)); return (EIO); } } } con = SMB_READ(sc, SMBCON); con &= ~SMBCON_STPHLD; SMB_WRITE(sc, SMBCON, con); return (0); } static int jzsmb_transfer_write(device_t dev, struct iic_msg *msg, int stop_hold) { struct jzsmb_softc *sc; struct timespec start, diff; uint16_t con, resid; int timeo; sc = device_get_softc(dev); timeo = JZSMB_TIMEOUT * msg->len; SMB_ASSERT_LOCKED(sc); con = SMB_READ(sc, SMBCON); con |= SMBCON_STPHLD; SMB_WRITE(sc, SMBCON, con); getnanouptime(&start); for (resid = msg->len; resid > 0; resid--) { for (;;) { getnanouptime(&diff); - timespecsub(&diff, &start); + timespecsub(&diff, &start, &diff); if ((SMB_READ(sc, SMBST) & SMBST_TFNF) != 0) { SMB_WRITE(sc, SMBDC, msg->buf[msg->len - resid]); break; } else DELAY((1000 * hz) / JZSMB_TIMEOUT); if (tstohz(&diff) >= timeo) { device_printf(dev, "write timeout (status=0x%02x)\n", SMB_READ(sc, SMBST)); return (EIO); } } } if (!stop_hold) { con = SMB_READ(sc, SMBCON); con &= ~SMBCON_STPHLD; SMB_WRITE(sc, SMBCON, con); } return (0); } static int jzsmb_transfer(device_t dev, struct iic_msg *msgs, uint32_t nmsgs) { struct jzsmb_softc *sc; uint32_t n; uint16_t con; int error; sc = device_get_softc(dev); SMB_LOCK(sc); while (sc->busy) mtx_sleep(sc, &sc->mtx, 0, "i2cbuswait", 0); sc->busy = 1; sc->status = 0; for (n = 0; n < nmsgs; n++) { /* Set target address */ if (n == 0 || msgs[n].slave != msgs[n - 1].slave) jzsmb_reset_locked(dev, msgs[n].slave); /* Set read or write */ if ((msgs[n].flags & IIC_M_RD) != 0) error = jzsmb_transfer_read(dev, &msgs[n]); else error = jzsmb_transfer_write(dev, &msgs[n], n < nmsgs - 1); if (error != 0) goto done; } done: /* Send stop if necessary */ con = SMB_READ(sc, SMBCON); con &= ~SMBCON_STPHLD; SMB_WRITE(sc, SMBCON, con); /* Disable SMB */ jzsmb_enable(sc, 0); sc->msg = NULL; sc->busy = 0; wakeup(sc); SMB_UNLOCK(sc); return (error); } static int jzsmb_probe(device_t dev) { if (!ofw_bus_status_okay(dev)) return (ENXIO); if (ofw_bus_search_compatible(dev, compat_data)->ocd_data == 0) return (ENXIO); device_set_desc(dev, "Ingenic JZ4780 SMB Controller"); return (BUS_PROBE_DEFAULT); } static int jzsmb_attach(device_t dev) { struct jzsmb_softc *sc; phandle_t node; int error; sc = device_get_softc(dev); node = ofw_bus_get_node(dev); mtx_init(&sc->mtx, device_get_nameunit(dev), "jzsmb", MTX_DEF); error = clk_get_by_ofw_index(dev, 0, 0, &sc->clk); if (error != 0) { device_printf(dev, "cannot get clock\n"); goto fail; } error = clk_enable(sc->clk); if (error != 0) { device_printf(dev, "cannot enable clock\n"); goto fail; } error = clk_get_freq(sc->clk, &sc->bus_freq); if (error != 0 || sc->bus_freq == 0) { device_printf(dev, "cannot get bus frequency\n"); return (error); } if (bus_alloc_resources(dev, jzsmb_spec, &sc->res) != 0) { device_printf(dev, "cannot allocate resources for device\n"); error = ENXIO; goto fail; } if (OF_getencprop(node, "clock-frequency", &sc->i2c_freq, sizeof(sc->i2c_freq)) != 0 || sc->i2c_freq == 0) sc->i2c_freq = 100000; /* Default to standard mode */ sc->iicbus = device_add_child(dev, "iicbus", -1); if (sc->iicbus == NULL) { device_printf(dev, "cannot add iicbus child device\n"); error = ENXIO; goto fail; } bus_generic_attach(dev); return (0); fail: bus_release_resources(dev, jzsmb_spec, &sc->res); if (sc->clk != NULL) clk_release(sc->clk); mtx_destroy(&sc->mtx); return (error); } static device_method_t jzsmb_methods[] = { /* Device interface */ DEVMETHOD(device_probe, jzsmb_probe), DEVMETHOD(device_attach, jzsmb_attach), /* Bus interface */ DEVMETHOD(bus_setup_intr, bus_generic_setup_intr), DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr), DEVMETHOD(bus_alloc_resource, bus_generic_alloc_resource), DEVMETHOD(bus_release_resource, bus_generic_release_resource), DEVMETHOD(bus_activate_resource, bus_generic_activate_resource), DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), DEVMETHOD(bus_adjust_resource, bus_generic_adjust_resource), DEVMETHOD(bus_set_resource, bus_generic_rl_set_resource), DEVMETHOD(bus_get_resource, bus_generic_rl_get_resource), /* OFW methods */ DEVMETHOD(ofw_bus_get_node, jzsmb_get_node), /* iicbus interface */ DEVMETHOD(iicbus_callback, iicbus_null_callback), DEVMETHOD(iicbus_reset, jzsmb_reset), DEVMETHOD(iicbus_transfer, jzsmb_transfer), DEVMETHOD_END }; static driver_t jzsmb_driver = { "iichb", jzsmb_methods, sizeof(struct jzsmb_softc), }; static devclass_t jzsmb_devclass; EARLY_DRIVER_MODULE(iicbus, jzsmb, iicbus_driver, iicbus_devclass, 0, 0, BUS_PASS_RESOURCE + BUS_PASS_ORDER_MIDDLE); EARLY_DRIVER_MODULE(jzsmb, simplebus, jzsmb_driver, jzsmb_devclass, 0, 0, BUS_PASS_RESOURCE + BUS_PASS_ORDER_MIDDLE); MODULE_VERSION(jzsmb, 1); Index: head/sys/netinet/ip_input.c =================================================================== --- head/sys/netinet/ip_input.c (revision 336913) +++ head/sys/netinet/ip_input.c (revision 336914) @@ -1,1427 +1,1427 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_bootp.h" #include "opt_ipstealth.h" #include "opt_ipsec.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef CTASSERT CTASSERT(sizeof(struct ip) == 20); #endif /* IP reassembly functions are defined in ip_reass.c. */ extern void ipreass_init(void); extern void ipreass_drain(void); extern void ipreass_slowtimo(void); #ifdef VIMAGE extern void ipreass_destroy(void); #endif struct rmlock in_ifaddr_lock; RM_SYSINIT(in_ifaddr_lock, &in_ifaddr_lock, "in_ifaddr_lock"); VNET_DEFINE(int, rsvp_on); VNET_DEFINE(int, ipforwarding); SYSCTL_INT(_net_inet_ip, IPCTL_FORWARDING, forwarding, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipforwarding), 0, "Enable IP forwarding between interfaces"); VNET_DEFINE_STATIC(int, ipsendredirects) = 1; /* XXX */ #define V_ipsendredirects VNET(ipsendredirects) SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipsendredirects), 0, "Enable sending IP redirects"); /* * XXX - Setting ip_checkinterface mostly implements the receive side of * the Strong ES model described in RFC 1122, but since the routing table * and transmit implementation do not implement the Strong ES model, * setting this to 1 results in an odd hybrid. * * XXX - ip_checkinterface currently must be disabled if you use ipnat * to translate the destination address to another local interface. * * XXX - ip_checkinterface must be disabled if you add IP aliases * to the loopback interface instead of the interface where the * packets for those addresses are received. */ VNET_DEFINE_STATIC(int, ip_checkinterface); #define V_ip_checkinterface VNET(ip_checkinterface) SYSCTL_INT(_net_inet_ip, OID_AUTO, check_interface, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip_checkinterface), 0, "Verify packet arrives on correct interface"); VNET_DEFINE(struct pfil_head, inet_pfil_hook); /* Packet filter hooks */ static struct netisr_handler ip_nh = { .nh_name = "ip", .nh_handler = ip_input, .nh_proto = NETISR_IP, #ifdef RSS .nh_m2cpuid = rss_soft_m2cpuid_v4, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, #else .nh_policy = NETISR_POLICY_FLOW, #endif }; #ifdef RSS /* * Directly dispatched frames are currently assumed * to have a flowid already calculated. * * It should likely have something that assert it * actually has valid flow details. */ static struct netisr_handler ip_direct_nh = { .nh_name = "ip_direct", .nh_handler = ip_direct_input, .nh_proto = NETISR_IP_DIRECT, .nh_m2cpuid = rss_soft_m2cpuid_v4, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, }; #endif extern struct domain inetdomain; extern struct protosw inetsw[]; u_char ip_protox[IPPROTO_MAX]; VNET_DEFINE(struct in_ifaddrhead, in_ifaddrhead); /* first inet address */ VNET_DEFINE(struct in_ifaddrhashhead *, in_ifaddrhashtbl); /* inet addr hash table */ VNET_DEFINE(u_long, in_ifaddrhmask); /* mask for hash table */ #ifdef IPCTL_DEFMTU SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW, &ip_mtu, 0, "Default MTU"); #endif #ifdef IPSTEALTH VNET_DEFINE(int, ipstealth); SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipstealth), 0, "IP stealth mode, no TTL decrementation on forwarding"); #endif /* * IP statistics are stored in the "array" of counter(9)s. */ VNET_PCPUSTAT_DEFINE(struct ipstat, ipstat); VNET_PCPUSTAT_SYSINIT(ipstat); SYSCTL_VNET_PCPUSTAT(_net_inet_ip, IPCTL_STATS, stats, struct ipstat, ipstat, "IP statistics (struct ipstat, netinet/ip_var.h)"); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ipstat); #endif /* VIMAGE */ /* * Kernel module interface for updating ipstat. The argument is an index * into ipstat treated as an array. */ void kmod_ipstat_inc(int statnum) { counter_u64_add(VNET(ipstat)[statnum], 1); } void kmod_ipstat_dec(int statnum) { counter_u64_add(VNET(ipstat)[statnum], -1); } static int sysctl_netinet_intr_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip_nh, qlimit)); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_queue_maxlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_queue_maxlen, "I", "Maximum size of the IP input queue"); static int sysctl_netinet_intr_queue_drops(SYSCTL_HANDLER_ARGS) { u_int64_t qdrops_long; int error, qdrops; netisr_getqdrops(&ip_nh, &qdrops_long); qdrops = qdrops_long; error = sysctl_handle_int(oidp, &qdrops, 0, req); if (error || !req->newptr) return (error); if (qdrops != 0) return (EINVAL); netisr_clearqdrops(&ip_nh); return (0); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_queue_drops, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_queue_drops, "I", "Number of packets dropped from the IP input queue"); #ifdef RSS static int sysctl_netinet_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip_direct_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip_direct_nh, qlimit)); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRDQMAXLEN, intr_direct_queue_maxlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_direct_queue_maxlen, "I", "Maximum size of the IP direct input queue"); static int sysctl_netinet_intr_direct_queue_drops(SYSCTL_HANDLER_ARGS) { u_int64_t qdrops_long; int error, qdrops; netisr_getqdrops(&ip_direct_nh, &qdrops_long); qdrops = qdrops_long; error = sysctl_handle_int(oidp, &qdrops, 0, req); if (error || !req->newptr) return (error); if (qdrops != 0) return (EINVAL); netisr_clearqdrops(&ip_direct_nh); return (0); } SYSCTL_PROC(_net_inet_ip, IPCTL_INTRDQDROPS, intr_direct_queue_drops, CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_direct_queue_drops, "I", "Number of packets dropped from the IP direct input queue"); #endif /* RSS */ /* * IP initialization: fill in IP protocol switch table. * All protocols not implemented in kernel go to raw IP protocol handler. */ void ip_init(void) { struct protosw *pr; int i; CK_STAILQ_INIT(&V_in_ifaddrhead); V_in_ifaddrhashtbl = hashinit(INADDR_NHASH, M_IFADDR, &V_in_ifaddrhmask); /* Initialize IP reassembly queue. */ ipreass_init(); /* Initialize packet filter hooks. */ V_inet_pfil_hook.ph_type = PFIL_TYPE_AF; V_inet_pfil_hook.ph_af = AF_INET; if ((i = pfil_head_register(&V_inet_pfil_hook)) != 0) printf("%s: WARNING: unable to register pfil hook, " "error %d\n", __func__, i); if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET, &V_ipsec_hhh_in[HHOOK_IPSEC_INET], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register input helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_IPSEC_OUT, AF_INET, &V_ipsec_hhh_out[HHOOK_IPSEC_INET], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register output helper hook\n", __func__); /* Skip initialization of globals for non-default instances. */ #ifdef VIMAGE if (!IS_DEFAULT_VNET(curvnet)) { netisr_register_vnet(&ip_nh); #ifdef RSS netisr_register_vnet(&ip_direct_nh); #endif return; } #endif pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) panic("ip_init: PF_INET not found"); /* Initialize the entire ip_protox[] array to IPPROTO_RAW. */ for (i = 0; i < IPPROTO_MAX; i++) ip_protox[i] = pr - inetsw; /* * Cycle through IP protocols and put them into the appropriate place * in ip_protox[]. */ for (pr = inetdomain.dom_protosw; pr < inetdomain.dom_protoswNPROTOSW; pr++) if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) { /* Be careful to only index valid IP protocols. */ if (pr->pr_protocol < IPPROTO_MAX) ip_protox[pr->pr_protocol] = pr - inetsw; } netisr_register(&ip_nh); #ifdef RSS netisr_register(&ip_direct_nh); #endif } #ifdef VIMAGE static void ip_destroy(void *unused __unused) { struct ifnet *ifp; int error; #ifdef RSS netisr_unregister_vnet(&ip_direct_nh); #endif netisr_unregister_vnet(&ip_nh); if ((error = pfil_head_unregister(&V_inet_pfil_hook)) != 0) printf("%s: WARNING: unable to unregister pfil hook, " "error %d\n", __func__, error); error = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET]); if (error != 0) { printf("%s: WARNING: unable to deregister input helper hook " "type HHOOK_TYPE_IPSEC_IN, id HHOOK_IPSEC_INET: " "error %d returned\n", __func__, error); } error = hhook_head_deregister(V_ipsec_hhh_out[HHOOK_IPSEC_INET]); if (error != 0) { printf("%s: WARNING: unable to deregister output helper hook " "type HHOOK_TYPE_IPSEC_OUT, id HHOOK_IPSEC_INET: " "error %d returned\n", __func__, error); } /* Remove the IPv4 addresses from all interfaces. */ in_ifscrub_all(); /* Make sure the IPv4 routes are gone as well. */ IFNET_RLOCK(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) rt_flushifroutes_af(ifp, AF_INET); IFNET_RUNLOCK(); /* Destroy IP reassembly queue. */ ipreass_destroy(); /* Cleanup in_ifaddr hash table; should be empty. */ hashdestroy(V_in_ifaddrhashtbl, M_IFADDR, V_in_ifaddrhmask); } VNET_SYSUNINIT(ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip_destroy, NULL); #endif #ifdef RSS /* * IP direct input routine. * * This is called when reinjecting completed fragments where * all of the previous checking and book-keeping has been done. */ void ip_direct_input(struct mbuf *m) { struct ip *ip; int hlen; ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv4)) { if (IPSEC_INPUT(ipv4, m, hlen, ip->ip_p) != 0) return; } #endif /* IPSEC */ IPSTAT_INC(ips_delivered); (*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p); return; } #endif /* * Ip input routine. Checksum and byte swap header. If fragmented * try to reassemble. Process options. Pass to next level. */ void ip_input(struct mbuf *m) { struct ip *ip = NULL; struct in_ifaddr *ia = NULL; struct ifaddr *ifa; struct ifnet *ifp; int checkif, hlen = 0; uint16_t sum, ip_len; int dchg = 0; /* dest changed after fw */ struct in_addr odst; /* original dst address */ M_ASSERTPKTHDR(m); if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; /* Set up some basics that will be used later. */ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; ip_len = ntohs(ip->ip_len); goto ours; } IPSTAT_INC(ips_total); if (m->m_pkthdr.len < sizeof(struct ip)) goto tooshort; if (m->m_len < sizeof (struct ip) && (m = m_pullup(m, sizeof (struct ip))) == NULL) { IPSTAT_INC(ips_toosmall); return; } ip = mtod(m, struct ip *); if (ip->ip_v != IPVERSION) { IPSTAT_INC(ips_badvers); goto bad; } hlen = ip->ip_hl << 2; if (hlen < sizeof(struct ip)) { /* minimum header length */ IPSTAT_INC(ips_badhlen); goto bad; } if (hlen > m->m_len) { if ((m = m_pullup(m, hlen)) == NULL) { IPSTAT_INC(ips_badhlen); return; } ip = mtod(m, struct ip *); } IP_PROBE(receive, NULL, NULL, ip, m->m_pkthdr.rcvif, ip, NULL); /* 127/8 must not appear on wire - RFC1122 */ ifp = m->m_pkthdr.rcvif; if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) { if ((ifp->if_flags & IFF_LOOPBACK) == 0) { IPSTAT_INC(ips_badaddr); goto bad; } } if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) { sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID); } else { if (hlen == sizeof(struct ip)) { sum = in_cksum_hdr(ip); } else { sum = in_cksum(m, hlen); } } if (sum) { IPSTAT_INC(ips_badsum); goto bad; } #ifdef ALTQ if (altq_input != NULL && (*altq_input)(m, AF_INET) == 0) /* packet is dropped by traffic conditioner */ return; #endif ip_len = ntohs(ip->ip_len); if (ip_len < hlen) { IPSTAT_INC(ips_badlen); goto bad; } /* * Check that the amount of data in the buffers * is as at least much as the IP header would have us expect. * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len < ip_len) { tooshort: IPSTAT_INC(ips_tooshort); goto bad; } if (m->m_pkthdr.len > ip_len) { if (m->m_len == m->m_pkthdr.len) { m->m_len = ip_len; m->m_pkthdr.len = ip_len; } else m_adj(m, ip_len - m->m_pkthdr.len); } /* * Try to forward the packet, but if we fail continue. * ip_tryforward() does inbound and outbound packet firewall * processing. If firewall has decided that destination becomes * our local address, it sets M_FASTFWD_OURS flag. In this * case skip another inbound firewall processing and update * ip pointer. */ if (V_ipforwarding != 0 #if defined(IPSEC) || defined(IPSEC_SUPPORT) && (!IPSEC_ENABLED(ipv4) || IPSEC_CAPS(ipv4, m, IPSEC_CAP_OPERABLE) == 0) #endif ) { if ((m = ip_tryforward(m)) == NULL) return; if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; ip = mtod(m, struct ip *); goto ours; } } #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Bypass packet filtering for packets previously handled by IPsec. */ if (IPSEC_ENABLED(ipv4) && IPSEC_CAPS(ipv4, m, IPSEC_CAP_BYPASS_FILTER) != 0) goto passin; #endif /* * Run through list of hooks for input packets. * * NB: Beware of the destination address changing (e.g. * by NAT rewriting). When this happens, tell * ip_forward to do the right thing. */ /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&V_inet_pfil_hook)) goto passin; odst = ip->ip_dst; if (pfil_run_hooks(&V_inet_pfil_hook, &m, ifp, PFIL_IN, 0, NULL) != 0) return; if (m == NULL) /* consumed by filter */ return; ip = mtod(m, struct ip *); dchg = (odst.s_addr != ip->ip_dst.s_addr); ifp = m->m_pkthdr.rcvif; if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; goto ours; } if (m->m_flags & M_IP_NEXTHOP) { if (m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL) { /* * Directly ship the packet on. This allows * forwarding packets originally destined to us * to some other directly connected host. */ ip_forward(m, 1); return; } } passin: /* * Process options and, if not destined for us, * ship it on. ip_dooptions returns 1 when an * error was detected (causing an icmp message * to be sent and the original packet to be freed). */ if (hlen > sizeof (struct ip) && ip_dooptions(m, 0)) return; /* greedy RSVP, snatches any PATH packet of the RSVP protocol and no * matter if it is destined to another node, or whether it is * a multicast one, RSVP wants it! and prevents it from being forwarded * anywhere else. Also checks if the rsvp daemon is running before * grabbing the packet. */ if (V_rsvp_on && ip->ip_p==IPPROTO_RSVP) goto ours; /* * Check our list of addresses, to see if the packet is for us. * If we don't have any addresses, assume any unicast packet * we receive might be for us (and let the upper layers deal * with it). */ if (CK_STAILQ_EMPTY(&V_in_ifaddrhead) && (m->m_flags & (M_MCAST|M_BCAST)) == 0) goto ours; /* * Enable a consistency check between the destination address * and the arrival interface for a unicast packet (the RFC 1122 * strong ES model) if IP forwarding is disabled and the packet * is not locally generated and the packet is not subject to * 'ipfw fwd'. * * XXX - Checking also should be disabled if the destination * address is ipnat'ed to a different interface. * * XXX - Checking is incompatible with IP aliases added * to the loopback interface instead of the interface where * the packets are received. * * XXX - This is the case for carp vhost IPs as well so we * insert a workaround. If the packet got here, we already * checked with carp_iamatch() and carp_forus(). */ checkif = V_ip_checkinterface && (V_ipforwarding == 0) && ifp != NULL && ((ifp->if_flags & IFF_LOOPBACK) == 0) && ifp->if_carp == NULL && (dchg == 0); /* * Check for exact addresses in the hash bucket. */ /* IN_IFADDR_RLOCK(); */ LIST_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) { /* * If the address matches, verify that the packet * arrived via the correct interface if checking is * enabled. */ if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr && (!checkif || ia->ia_ifp == ifp)) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); /* IN_IFADDR_RUNLOCK(); */ goto ours; } } /* IN_IFADDR_RUNLOCK(); */ /* * Check for broadcast addresses. * * Only accept broadcast packets that arrive via the matching * interface. Reception of forwarded directed broadcasts would * be handled via ip_forward() and ether_output() with the loopback * into the stack for SIMPLEX interfaces handled by ether_output(). */ if (ifp != NULL && ifp->if_flags & IFF_BROADCAST) { IF_ADDR_RLOCK(ifp); CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family != AF_INET) continue; ia = ifatoia(ifa); if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == ip->ip_dst.s_addr) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); IF_ADDR_RUNLOCK(ifp); goto ours; } #ifdef BOOTP_COMPAT if (IA_SIN(ia)->sin_addr.s_addr == INADDR_ANY) { counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); IF_ADDR_RUNLOCK(ifp); goto ours; } #endif } IF_ADDR_RUNLOCK(ifp); ia = NULL; } /* RFC 3927 2.7: Do not forward datagrams for 169.254.0.0/16. */ if (IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { if (V_ip_mrouter) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. */ if (ip_mforward && ip_mforward(ip, ifp, m, 0) != 0) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } /* * The process-level routing daemon needs to receive * all multicast IGMP packets, whether or not this * host belongs to their destination groups. */ if (ip->ip_p == IPPROTO_IGMP) goto ours; IPSTAT_INC(ips_forward); } /* * Assume the packet is for us, to avoid prematurely taking * a lock on the in_multi hash. Protocols must perform * their own filtering and update statistics accordingly. */ goto ours; } if (ip->ip_dst.s_addr == (u_long)INADDR_BROADCAST) goto ours; if (ip->ip_dst.s_addr == INADDR_ANY) goto ours; /* * Not for us; forward if possible and desirable. */ if (V_ipforwarding == 0) { IPSTAT_INC(ips_cantforward); m_freem(m); } else { ip_forward(m, dchg); } return; ours: #ifdef IPSTEALTH /* * IPSTEALTH: Process non-routing options only * if the packet is destined for us. */ if (V_ipstealth && hlen > sizeof (struct ip) && ip_dooptions(m, 1)) return; #endif /* IPSTEALTH */ /* * Attempt reassembly; if it succeeds, proceed. * ip_reass() will return a different mbuf. */ if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) { /* XXXGL: shouldn't we save & set m_flags? */ m = ip_reass(m); if (m == NULL) return; ip = mtod(m, struct ip *); /* Get the header length of the reassembled packet */ hlen = ip->ip_hl << 2; } #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv4)) { if (IPSEC_INPUT(ipv4, m, hlen, ip->ip_p) != 0) return; } #endif /* IPSEC */ /* * Switch out to protocol's input routine. */ IPSTAT_INC(ips_delivered); (*inetsw[ip_protox[ip->ip_p]].pr_input)(&m, &hlen, ip->ip_p); return; bad: m_freem(m); } /* * IP timer processing; * if a timer expires on a reassembly * queue, discard it. */ void ip_slowtimo(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); ipreass_slowtimo(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } void ip_drain(void) { VNET_ITERATOR_DECL(vnet_iter); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); ipreass_drain(); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); } /* * The protocol to be inserted into ip_protox[] must be already registered * in inetsw[], either statically or through pf_proto_register(). */ int ipproto_register(short ipproto) { struct protosw *pr; /* Sanity checks. */ if (ipproto <= 0 || ipproto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* * The protocol slot must not be occupied by another protocol * already. An index pointing to IPPROTO_RAW is unused. */ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip_protox[ipproto] != pr - inetsw) /* IPPROTO_RAW */ return (EEXIST); /* Find the protocol position in inetsw[] and set the index. */ for (pr = inetdomain.dom_protosw; pr < inetdomain.dom_protoswNPROTOSW; pr++) { if (pr->pr_domain->dom_family == PF_INET && pr->pr_protocol && pr->pr_protocol == ipproto) { ip_protox[pr->pr_protocol] = pr - inetsw; return (0); } } return (EPROTONOSUPPORT); } int ipproto_unregister(short ipproto) { struct protosw *pr; /* Sanity checks. */ if (ipproto <= 0 || ipproto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* Check if the protocol was indeed registered. */ pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip_protox[ipproto] == pr - inetsw) /* IPPROTO_RAW */ return (ENOENT); /* Reset the protocol slot to IPPROTO_RAW. */ ip_protox[ipproto] = pr - inetsw; return (0); } u_char inetctlerrmap[PRC_NCMDS] = { 0, 0, 0, 0, 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, EMSGSIZE, EHOSTUNREACH, 0, 0, 0, 0, EHOSTUNREACH, 0, ENOPROTOOPT, ECONNREFUSED }; /* * Forward a packet. If some error occurs return the sender * an icmp packet. Note we can't always generate a meaningful * icmp message because icmp doesn't have a large enough repertoire * of codes and types. * * If not forwarding, just drop the packet. This could be confusing * if ipforwarding was zero but some routing protocol was advancing * us as a gateway to somewhere. However, we must let the routing * protocol deal with that. * * The srcrt parameter indicates whether the packet is being forwarded * via a source route. */ void ip_forward(struct mbuf *m, int srcrt) { struct ip *ip = mtod(m, struct ip *); struct in_ifaddr *ia; struct mbuf *mcopy; struct sockaddr_in *sin; struct in_addr dest; struct route ro; int error, type = 0, code = 0, mtu = 0; if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) { IPSTAT_INC(ips_cantforward); m_freem(m); return; } if ( #ifdef IPSTEALTH V_ipstealth == 0 && #endif ip->ip_ttl <= IPTTLDEC) { icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0); return; } bzero(&ro, sizeof(ro)); sin = (struct sockaddr_in *)&ro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = ip->ip_dst; #ifdef RADIX_MPATH rtalloc_mpath_fib(&ro, ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr), M_GETFIB(m)); #else in_rtalloc_ign(&ro, 0, M_GETFIB(m)); #endif NET_EPOCH_ENTER(); if (ro.ro_rt != NULL) { ia = ifatoia(ro.ro_rt->rt_ifa); } else ia = NULL; /* * Save the IP header and at most 8 bytes of the payload, * in case we need to generate an ICMP message to the src. * * XXX this can be optimized a lot by saving the data in a local * buffer on the stack (72 bytes at most), and only allocating the * mbuf if really necessary. The vast majority of the packets * are forwarded without having to send an ICMP back (either * because unnecessary, or because rate limited), so we are * really we are wasting a lot of work here. * * We don't use m_copym() because it might return a reference * to a shared cluster. Both this function and ip_output() * assume exclusive access to the IP header in `m', so any * data in a cluster may change before we reach icmp_error(). */ mcopy = m_gethdr(M_NOWAIT, m->m_type); if (mcopy != NULL && !m_dup_pkthdr(mcopy, m, M_NOWAIT)) { /* * It's probably ok if the pkthdr dup fails (because * the deep copy of the tag chain failed), but for now * be conservative and just discard the copy since * code below may some day want the tags. */ m_free(mcopy); mcopy = NULL; } if (mcopy != NULL) { mcopy->m_len = min(ntohs(ip->ip_len), M_TRAILINGSPACE(mcopy)); mcopy->m_pkthdr.len = mcopy->m_len; m_copydata(m, 0, mcopy->m_len, mtod(mcopy, caddr_t)); } #ifdef IPSTEALTH if (V_ipstealth == 0) #endif ip->ip_ttl -= IPTTLDEC; #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv4)) { if ((error = IPSEC_FORWARD(ipv4, m)) != 0) { /* mbuf consumed by IPsec */ m_freem(mcopy); if (error != EINPROGRESS) IPSTAT_INC(ips_cantforward); goto out; } /* No IPsec processing required */ } #endif /* IPSEC */ /* * If forwarding packet using same interface that it came in on, * perhaps should send a redirect to sender to shortcut a hop. * Only send redirect if source is sending directly to us, * and if packet was not source routed (or has any options). * Also, don't send redirect if forwarding using a default route * or a route modified by a redirect. */ dest.s_addr = 0; if (!srcrt && V_ipsendredirects && ia != NULL && ia->ia_ifp == m->m_pkthdr.rcvif) { struct rtentry *rt; rt = ro.ro_rt; if (rt && (rt->rt_flags & (RTF_DYNAMIC|RTF_MODIFIED)) == 0 && satosin(rt_key(rt))->sin_addr.s_addr != 0) { #define RTA(rt) ((struct in_ifaddr *)(rt->rt_ifa)) u_long src = ntohl(ip->ip_src.s_addr); if (RTA(rt) && (src & RTA(rt)->ia_subnetmask) == RTA(rt)->ia_subnet) { if (rt->rt_flags & RTF_GATEWAY) dest.s_addr = satosin(rt->rt_gateway)->sin_addr.s_addr; else dest.s_addr = ip->ip_dst.s_addr; /* Router requirements says to only send host redirects */ type = ICMP_REDIRECT; code = ICMP_REDIRECT_HOST; } } } error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL); if (error == EMSGSIZE && ro.ro_rt) mtu = ro.ro_rt->rt_mtu; RO_RTFREE(&ro); if (error) IPSTAT_INC(ips_cantforward); else { IPSTAT_INC(ips_forward); if (type) IPSTAT_INC(ips_redirectsent); else { if (mcopy) m_freem(mcopy); goto out; } } if (mcopy == NULL) goto out; switch (error) { case 0: /* forwarded, but need redirect */ /* type, code set above */ break; case ENETUNREACH: case EHOSTUNREACH: case ENETDOWN: case EHOSTDOWN: default: type = ICMP_UNREACH; code = ICMP_UNREACH_HOST; break; case EMSGSIZE: type = ICMP_UNREACH; code = ICMP_UNREACH_NEEDFRAG; /* * If the MTU was set before make sure we are below the * interface MTU. * If the MTU wasn't set before use the interface mtu or * fall back to the next smaller mtu step compared to the * current packet size. */ if (mtu != 0) { if (ia != NULL) mtu = min(mtu, ia->ia_ifp->if_mtu); } else { if (ia != NULL) mtu = ia->ia_ifp->if_mtu; else mtu = ip_next_mtu(ntohs(ip->ip_len), 0); } IPSTAT_INC(ips_cantfrag); break; case ENOBUFS: case EACCES: /* ipfw denied packet */ m_freem(mcopy); goto out; } icmp_error(mcopy, type, code, dest.s_addr, mtu); out: NET_EPOCH_EXIT(); } #define CHECK_SO_CT(sp, ct) \ (((sp->so_options & SO_TIMESTAMP) && (sp->so_ts_clock == ct)) ? 1 : 0) void ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip, struct mbuf *m) { bool stamped; stamped = false; if ((inp->inp_socket->so_options & SO_BINTIME) || CHECK_SO_CT(inp->inp_socket, SO_TS_BINTIME)) { struct bintime boottimebin, bt; struct timespec ts1; if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &ts1); timespec2bintime(&ts1, &bt); getboottimebin(&boottimebin); bintime_add(&bt, &boottimebin); } else { bintime(&bt); } *mp = sbcreatecontrol((caddr_t)&bt, sizeof(bt), SCM_BINTIME, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } } if (CHECK_SO_CT(inp->inp_socket, SO_TS_REALTIME_MICRO)) { struct bintime boottimebin, bt1; struct timespec ts1;; struct timeval tv; if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &ts1); timespec2bintime(&ts1, &bt1); getboottimebin(&boottimebin); bintime_add(&bt1, &boottimebin); bintime2timeval(&bt1, &tv); } else { microtime(&tv); } *mp = sbcreatecontrol((caddr_t)&tv, sizeof(tv), SCM_TIMESTAMP, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } } else if (CHECK_SO_CT(inp->inp_socket, SO_TS_REALTIME)) { struct bintime boottimebin; struct timespec ts, ts1; if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &ts); getboottimebin(&boottimebin); bintime2timespec(&boottimebin, &ts1); - timespecadd(&ts, &ts1); + timespecadd(&ts, &ts1, &ts); } else { nanotime(&ts); } *mp = sbcreatecontrol((caddr_t)&ts, sizeof(ts), SCM_REALTIME, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } } else if (CHECK_SO_CT(inp->inp_socket, SO_TS_MONOTONIC)) { struct timespec ts; if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) mbuf_tstmp2timespec(m, &ts); else nanouptime(&ts); *mp = sbcreatecontrol((caddr_t)&ts, sizeof(ts), SCM_MONOTONIC, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } } if (stamped && (m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { struct sock_timestamp_info sti; bzero(&sti, sizeof(sti)); sti.st_info_flags = ST_INFO_HW; if ((m->m_flags & M_TSTMP_HPREC) != 0) sti.st_info_flags |= ST_INFO_HW_HPREC; *mp = sbcreatecontrol((caddr_t)&sti, sizeof(sti), SCM_TIME_INFO, SOL_SOCKET); if (*mp != NULL) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVDSTADDR) { *mp = sbcreatecontrol((caddr_t)&ip->ip_dst, sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVTTL) { *mp = sbcreatecontrol((caddr_t)&ip->ip_ttl, sizeof(u_char), IP_RECVTTL, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #ifdef notyet /* XXX * Moving these out of udp_input() made them even more broken * than they already were. */ /* options were tossed already */ if (inp->inp_flags & INP_RECVOPTS) { *mp = sbcreatecontrol((caddr_t)opts_deleted_above, sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } /* ip_srcroute doesn't do what we want here, need to fix */ if (inp->inp_flags & INP_RECVRETOPTS) { *mp = sbcreatecontrol((caddr_t)ip_srcroute(m), sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #endif if (inp->inp_flags & INP_RECVIF) { struct ifnet *ifp; struct sdlbuf { struct sockaddr_dl sdl; u_char pad[32]; } sdlbuf; struct sockaddr_dl *sdp; struct sockaddr_dl *sdl2 = &sdlbuf.sdl; if ((ifp = m->m_pkthdr.rcvif) && ifp->if_index && ifp->if_index <= V_if_index) { sdp = (struct sockaddr_dl *)ifp->if_addr->ifa_addr; /* * Change our mind and don't try copy. */ if (sdp->sdl_family != AF_LINK || sdp->sdl_len > sizeof(sdlbuf)) { goto makedummy; } bcopy(sdp, sdl2, sdp->sdl_len); } else { makedummy: sdl2->sdl_len = offsetof(struct sockaddr_dl, sdl_data[0]); sdl2->sdl_family = AF_LINK; sdl2->sdl_index = 0; sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = 0; } *mp = sbcreatecontrol((caddr_t)sdl2, sdl2->sdl_len, IP_RECVIF, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags & INP_RECVTOS) { *mp = sbcreatecontrol((caddr_t)&ip->ip_tos, sizeof(u_char), IP_RECVTOS, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } if (inp->inp_flags2 & INP_RECVFLOWID) { uint32_t flowid, flow_type; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); /* * XXX should handle the failure of one or the * other - don't populate both? */ *mp = sbcreatecontrol((caddr_t) &flowid, sizeof(uint32_t), IP_FLOWID, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; *mp = sbcreatecontrol((caddr_t) &flow_type, sizeof(uint32_t), IP_FLOWTYPE, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } #ifdef RSS if (inp->inp_flags2 & INP_RECVRSSBUCKETID) { uint32_t flowid, flow_type; uint32_t rss_bucketid; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) { *mp = sbcreatecontrol((caddr_t) &rss_bucketid, sizeof(uint32_t), IP_RSSBUCKETID, IPPROTO_IP); if (*mp) mp = &(*mp)->m_next; } } #endif } /* * XXXRW: Multicast routing code in ip_mroute.c is generally MPSAFE, but the * ip_rsvp and ip_rsvp_on variables need to be interlocked with rsvp_on * locking. This code remains in ip_input.c as ip_mroute.c is optionally * compiled. */ VNET_DEFINE_STATIC(int, ip_rsvp_on); VNET_DEFINE(struct socket *, ip_rsvpd); #define V_ip_rsvp_on VNET(ip_rsvp_on) int ip_rsvp_init(struct socket *so) { if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_RSVP) return EOPNOTSUPP; if (V_ip_rsvpd != NULL) return EADDRINUSE; V_ip_rsvpd = so; /* * This may seem silly, but we need to be sure we don't over-increment * the RSVP counter, in case something slips up. */ if (!V_ip_rsvp_on) { V_ip_rsvp_on = 1; V_rsvp_on++; } return 0; } int ip_rsvp_done(void) { V_ip_rsvpd = NULL; /* * This may seem silly, but we need to be sure we don't over-decrement * the RSVP counter, in case something slips up. */ if (V_ip_rsvp_on) { V_ip_rsvp_on = 0; V_rsvp_on--; } return 0; } int rsvp_input(struct mbuf **mp, int *offp, int proto) { struct mbuf *m; m = *mp; *mp = NULL; if (rsvp_input_p) { /* call the real one if loaded */ *mp = m; rsvp_input_p(mp, offp, proto); return (IPPROTO_DONE); } /* Can still get packets with rsvp_on = 0 if there is a local member * of the group to which the RSVP packet is addressed. But in this * case we want to throw the packet away. */ if (!V_rsvp_on) { m_freem(m); return (IPPROTO_DONE); } if (V_ip_rsvpd != NULL) { *mp = m; rip_input(mp, offp, proto); return (IPPROTO_DONE); } /* Drop the packet */ m_freem(m); return (IPPROTO_DONE); } Index: head/sys/netinet6/ip6_input.c =================================================================== --- head/sys/netinet6/ip6_input.c (revision 336913) +++ head/sys/netinet6/ip6_input.c (revision 336914) @@ -1,1862 +1,1862 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: ip6_input.c,v 1.259 2002/01/21 04:58:09 jinmei Exp $ */ /*- * Copyright (c) 1982, 1986, 1988, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ip_input.c 8.2 (Berkeley) 1/4/94 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET #include #include #endif /* INET */ #include #include #include #include #include #include #include #include #include #include #include #include extern struct domain inet6domain; u_char ip6_protox[IPPROTO_MAX]; VNET_DEFINE(struct in6_ifaddrhead, in6_ifaddrhead); VNET_DEFINE(struct in6_ifaddrlisthead *, in6_ifaddrhashtbl); VNET_DEFINE(u_long, in6_ifaddrhmask); static struct netisr_handler ip6_nh = { .nh_name = "ip6", .nh_handler = ip6_input, .nh_proto = NETISR_IPV6, #ifdef RSS .nh_m2cpuid = rss_soft_m2cpuid_v6, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, #else .nh_policy = NETISR_POLICY_FLOW, #endif }; static int sysctl_netinet6_intr_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip6_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip6_nh, qlimit)); } SYSCTL_DECL(_net_inet6_ip6); SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_INTRQMAXLEN, intr_queue_maxlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet6_intr_queue_maxlen, "I", "Maximum size of the IPv6 input queue"); #ifdef RSS static struct netisr_handler ip6_direct_nh = { .nh_name = "ip6_direct", .nh_handler = ip6_direct_input, .nh_proto = NETISR_IPV6_DIRECT, .nh_m2cpuid = rss_soft_m2cpuid_v6, .nh_policy = NETISR_POLICY_CPU, .nh_dispatch = NETISR_DISPATCH_HYBRID, }; static int sysctl_netinet6_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS) { int error, qlimit; netisr_getqlimit(&ip6_direct_nh, &qlimit); error = sysctl_handle_int(oidp, &qlimit, 0, req); if (error || !req->newptr) return (error); if (qlimit < 1) return (EINVAL); return (netisr_setqlimit(&ip6_direct_nh, qlimit)); } SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_INTRDQMAXLEN, intr_direct_queue_maxlen, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet6_intr_direct_queue_maxlen, "I", "Maximum size of the IPv6 direct input queue"); #endif VNET_DEFINE(struct pfil_head, inet6_pfil_hook); VNET_PCPUSTAT_DEFINE(struct ip6stat, ip6stat); VNET_PCPUSTAT_SYSINIT(ip6stat); #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(ip6stat); #endif /* VIMAGE */ struct rmlock in6_ifaddr_lock; RM_SYSINIT(in6_ifaddr_lock, &in6_ifaddr_lock, "in6_ifaddr_lock"); static int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *); #ifdef PULLDOWN_TEST static struct mbuf *ip6_pullexthdr(struct mbuf *, size_t, int); #endif /* * IP6 initialization: fill in IP6 protocol switch table. * All protocols not implemented in kernel go to raw IP6 protocol handler. */ void ip6_init(void) { struct protosw *pr; int i; TUNABLE_INT_FETCH("net.inet6.ip6.auto_linklocal", &V_ip6_auto_linklocal); TUNABLE_INT_FETCH("net.inet6.ip6.accept_rtadv", &V_ip6_accept_rtadv); TUNABLE_INT_FETCH("net.inet6.ip6.no_radr", &V_ip6_no_radr); CK_STAILQ_INIT(&V_in6_ifaddrhead); V_in6_ifaddrhashtbl = hashinit(IN6ADDR_NHASH, M_IFADDR, &V_in6_ifaddrhmask); /* Initialize packet filter hooks. */ V_inet6_pfil_hook.ph_type = PFIL_TYPE_AF; V_inet6_pfil_hook.ph_af = AF_INET6; if ((i = pfil_head_register(&V_inet6_pfil_hook)) != 0) printf("%s: WARNING: unable to register pfil hook, " "error %d\n", __func__, i); if (hhook_head_register(HHOOK_TYPE_IPSEC_IN, AF_INET6, &V_ipsec_hhh_in[HHOOK_IPSEC_INET6], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register input helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_IPSEC_OUT, AF_INET6, &V_ipsec_hhh_out[HHOOK_IPSEC_INET6], HHOOK_WAITOK | HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register output helper hook\n", __func__); scope6_init(); addrsel_policy_init(); nd6_init(); frag6_init(); V_ip6_desync_factor = arc4random() % MAX_TEMP_DESYNC_FACTOR; /* Skip global initialization stuff for non-default instances. */ #ifdef VIMAGE if (!IS_DEFAULT_VNET(curvnet)) { netisr_register_vnet(&ip6_nh); #ifdef RSS netisr_register_vnet(&ip6_direct_nh); #endif return; } #endif pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) panic("ip6_init"); /* Initialize the entire ip6_protox[] array to IPPROTO_RAW. */ for (i = 0; i < IPPROTO_MAX; i++) ip6_protox[i] = pr - inet6sw; /* * Cycle through IP protocols and put them into the appropriate place * in ip6_protox[]. */ for (pr = inet6domain.dom_protosw; pr < inet6domain.dom_protoswNPROTOSW; pr++) if (pr->pr_domain->dom_family == PF_INET6 && pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW) { /* Be careful to only index valid IP protocols. */ if (pr->pr_protocol < IPPROTO_MAX) ip6_protox[pr->pr_protocol] = pr - inet6sw; } netisr_register(&ip6_nh); #ifdef RSS netisr_register(&ip6_direct_nh); #endif } /* * The protocol to be inserted into ip6_protox[] must be already registered * in inet6sw[], either statically or through pf_proto_register(). */ int ip6proto_register(short ip6proto) { struct protosw *pr; /* Sanity checks. */ if (ip6proto <= 0 || ip6proto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* * The protocol slot must not be occupied by another protocol * already. An index pointing to IPPROTO_RAW is unused. */ pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip6_protox[ip6proto] != pr - inet6sw) /* IPPROTO_RAW */ return (EEXIST); /* * Find the protocol position in inet6sw[] and set the index. */ for (pr = inet6domain.dom_protosw; pr < inet6domain.dom_protoswNPROTOSW; pr++) { if (pr->pr_domain->dom_family == PF_INET6 && pr->pr_protocol && pr->pr_protocol == ip6proto) { ip6_protox[pr->pr_protocol] = pr - inet6sw; return (0); } } return (EPROTONOSUPPORT); } int ip6proto_unregister(short ip6proto) { struct protosw *pr; /* Sanity checks. */ if (ip6proto <= 0 || ip6proto >= IPPROTO_MAX) return (EPROTONOSUPPORT); /* Check if the protocol was indeed registered. */ pr = pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW); if (pr == NULL) return (EPFNOSUPPORT); if (ip6_protox[ip6proto] == pr - inet6sw) /* IPPROTO_RAW */ return (ENOENT); /* Reset the protocol slot to IPPROTO_RAW. */ ip6_protox[ip6proto] = pr - inet6sw; return (0); } #ifdef VIMAGE static void ip6_destroy(void *unused __unused) { struct ifaddr *ifa, *nifa; struct ifnet *ifp; int error; #ifdef RSS netisr_unregister_vnet(&ip6_direct_nh); #endif netisr_unregister_vnet(&ip6_nh); if ((error = pfil_head_unregister(&V_inet6_pfil_hook)) != 0) printf("%s: WARNING: unable to unregister pfil hook, " "error %d\n", __func__, error); error = hhook_head_deregister(V_ipsec_hhh_in[HHOOK_IPSEC_INET6]); if (error != 0) { printf("%s: WARNING: unable to deregister input helper hook " "type HHOOK_TYPE_IPSEC_IN, id HHOOK_IPSEC_INET6: " "error %d returned\n", __func__, error); } error = hhook_head_deregister(V_ipsec_hhh_out[HHOOK_IPSEC_INET6]); if (error != 0) { printf("%s: WARNING: unable to deregister output helper hook " "type HHOOK_TYPE_IPSEC_OUT, id HHOOK_IPSEC_INET6: " "error %d returned\n", __func__, error); } /* Cleanup addresses. */ IFNET_RLOCK(); CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) { /* Cannot lock here - lock recursion. */ /* IF_ADDR_LOCK(ifp); */ CK_STAILQ_FOREACH_SAFE(ifa, &ifp->if_addrhead, ifa_link, nifa) { if (ifa->ifa_addr->sa_family != AF_INET6) continue; in6_purgeaddr(ifa); } /* IF_ADDR_UNLOCK(ifp); */ in6_ifdetach_destroy(ifp); mld_domifdetach(ifp); /* Make sure any routes are gone as well. */ rt_flushifroutes_af(ifp, AF_INET6); } IFNET_RUNLOCK(); nd6_destroy(); in6_ifattach_destroy(); hashdestroy(V_in6_ifaddrhashtbl, M_IFADDR, V_in6_ifaddrhmask); } VNET_SYSUNINIT(inet6, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, ip6_destroy, NULL); #endif static int ip6_input_hbh(struct mbuf *m, uint32_t *plen, uint32_t *rtalert, int *off, int *nxt, int *ours) { struct ip6_hdr *ip6; struct ip6_hbh *hbh; if (ip6_hopopts_input(plen, rtalert, &m, off)) { #if 0 /*touches NULL pointer*/ in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); #endif goto out; /* m have already been freed */ } /* adjust pointer */ ip6 = mtod(m, struct ip6_hdr *); /* * if the payload length field is 0 and the next header field * indicates Hop-by-Hop Options header, then a Jumbo Payload * option MUST be included. */ if (ip6->ip6_plen == 0 && *plen == 0) { /* * Note that if a valid jumbo payload option is * contained, ip6_hopopts_input() must set a valid * (non-zero) payload length to the variable plen. */ IP6STAT_INC(ip6s_badoptions); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_hdrerr); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, (caddr_t)&ip6->ip6_plen - (caddr_t)ip6); goto out; } #ifndef PULLDOWN_TEST /* ip6_hopopts_input() ensures that mbuf is contiguous */ hbh = (struct ip6_hbh *)(ip6 + 1); #else IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), sizeof(struct ip6_hbh)); if (hbh == NULL) { IP6STAT_INC(ip6s_tooshort); goto out; } #endif *nxt = hbh->ip6h_nxt; /* * If we are acting as a router and the packet contains a * router alert option, see if we know the option value. * Currently, we only support the option value for MLD, in which * case we should pass the packet to the multicast routing * daemon. */ if (*rtalert != ~0) { switch (*rtalert) { case IP6OPT_RTALERT_MLD: if (V_ip6_forwarding) *ours = 1; break; default: /* * RFC2711 requires unrecognized values must be * silently ignored. */ break; } } return (0); out: return (1); } #ifdef RSS /* * IPv6 direct input routine. * * This is called when reinjecting completed fragments where * all of the previous checking and book-keeping has been done. */ void ip6_direct_input(struct mbuf *m) { int off, nxt; int nest; struct m_tag *mtag; struct ip6_direct_ctx *ip6dc; mtag = m_tag_locate(m, MTAG_ABI_IPV6, IPV6_TAG_DIRECT, NULL); KASSERT(mtag != NULL, ("Reinjected packet w/o direct ctx tag!")); ip6dc = (struct ip6_direct_ctx *)(mtag + 1); nxt = ip6dc->ip6dc_nxt; off = ip6dc->ip6dc_off; nest = 0; m_tag_delete(m, mtag); while (nxt != IPPROTO_DONE) { if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) { IP6STAT_INC(ip6s_toomanyhdr); goto bad; } /* * protection against faulty packet - there should be * more sanity checks in header chain processing. */ if (m->m_pkthdr.len < off) { IP6STAT_INC(ip6s_tooshort); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_truncated); goto bad; } #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv6)) { if (IPSEC_INPUT(ipv6, m, off, nxt) != 0) return; } #endif /* IPSEC */ nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt); } return; bad: m_freem(m); } #endif void ip6_input(struct mbuf *m) { struct in6_addr odst; struct ip6_hdr *ip6; struct in6_ifaddr *ia; struct ifnet *rcvif; u_int32_t plen; u_int32_t rtalert = ~0; int off = sizeof(struct ip6_hdr), nest; int nxt, ours = 0; int srcrt = 0; /* * Drop the packet if IPv6 operation is disabled on the interface. */ rcvif = m->m_pkthdr.rcvif; if ((ND_IFINFO(rcvif)->flags & ND6_IFF_IFDISABLED)) goto bad; #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * should the inner packet be considered authentic? * see comment in ah4_input(). * NB: m cannot be NULL when passed to the input routine */ m->m_flags &= ~M_AUTHIPHDR; m->m_flags &= ~M_AUTHIPDGM; #endif /* IPSEC */ if (m->m_flags & M_FASTFWD_OURS) { /* * Firewall changed destination to local. */ ip6 = mtod(m, struct ip6_hdr *); goto passin; } /* * mbuf statistics */ if (m->m_flags & M_EXT) { if (m->m_next) IP6STAT_INC(ip6s_mext2m); else IP6STAT_INC(ip6s_mext1); } else { if (m->m_next) { if (m->m_flags & M_LOOP) { IP6STAT_INC(ip6s_m2m[V_loif->if_index]); } else if (rcvif->if_index < IP6S_M2MMAX) IP6STAT_INC(ip6s_m2m[rcvif->if_index]); else IP6STAT_INC(ip6s_m2m[0]); } else IP6STAT_INC(ip6s_m1); } in6_ifstat_inc(rcvif, ifs6_in_receive); IP6STAT_INC(ip6s_total); #ifndef PULLDOWN_TEST /* * L2 bridge code and some other code can return mbuf chain * that does not conform to KAME requirement. too bad. * XXX: fails to join if interface MTU > MCLBYTES. jumbogram? */ if (m && m->m_next != NULL && m->m_pkthdr.len < MCLBYTES) { struct mbuf *n; if (m->m_pkthdr.len > MHLEN) n = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); else n = m_gethdr(M_NOWAIT, MT_DATA); if (n == NULL) goto bad; m_move_pkthdr(n, m); m_copydata(m, 0, n->m_pkthdr.len, mtod(n, caddr_t)); n->m_len = n->m_pkthdr.len; m_freem(m); m = n; } IP6_EXTHDR_CHECK(m, 0, sizeof(struct ip6_hdr), /* nothing */); #endif if (m->m_len < sizeof(struct ip6_hdr)) { if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) { IP6STAT_INC(ip6s_toosmall); in6_ifstat_inc(rcvif, ifs6_in_hdrerr); goto bad; } } ip6 = mtod(m, struct ip6_hdr *); if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { IP6STAT_INC(ip6s_badvers); in6_ifstat_inc(rcvif, ifs6_in_hdrerr); goto bad; } IP6STAT_INC(ip6s_nxthist[ip6->ip6_nxt]); IP_PROBE(receive, NULL, NULL, ip6, rcvif, NULL, ip6); /* * Check against address spoofing/corruption. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src) || IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst)) { /* * XXX: "badscope" is not very suitable for a multicast source. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(rcvif, ifs6_in_addrerr); goto bad; } if (IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst) && !(m->m_flags & M_LOOP)) { /* * In this case, the packet should come from the loopback * interface. However, we cannot just check the if_flags, * because ip6_mloopback() passes the "actual" interface * as the outgoing/incoming interface. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(rcvif, ifs6_in_addrerr); goto bad; } if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && IPV6_ADDR_MC_SCOPE(&ip6->ip6_dst) == 0) { /* * RFC4291 2.7: * Nodes must not originate a packet to a multicast address * whose scop field contains the reserved value 0; if such * a packet is received, it must be silently dropped. */ IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(rcvif, ifs6_in_addrerr); goto bad; } #ifdef ALTQ if (altq_input != NULL && (*altq_input)(m, AF_INET6) == 0) { /* packet is dropped by traffic conditioner */ return; } #endif /* * The following check is not documented in specs. A malicious * party may be able to use IPv4 mapped addr to confuse tcp/udp stack * and bypass security checks (act as if it was from 127.0.0.1 by using * IPv6 src ::ffff:127.0.0.1). Be cautious. * * This check chokes if we are in an SIIT cloud. As none of BSDs * support IPv4-less kernel compilation, we cannot support SIIT * environment at all. So, it makes more sense for us to reject any * malicious packets for non-SIIT environment, than try to do a * partial support for SIIT environment. */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(rcvif, ifs6_in_addrerr); goto bad; } #if 0 /* * Reject packets with IPv4 compatible addresses (auto tunnel). * * The code forbids auto tunnel relay case in RFC1933 (the check is * stronger than RFC1933). We may want to re-enable it if mech-xx * is revised to forbid relaying case. */ if (IN6_IS_ADDR_V4COMPAT(&ip6->ip6_src) || IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_addrerr); goto bad; } #endif /* * Try to forward the packet, but if we fail continue. * ip6_tryforward() does inbound and outbound packet firewall * processing. If firewall has decided that destination becomes * our local address, it sets M_FASTFWD_OURS flag. In this * case skip another inbound firewall processing and update * ip6 pointer. */ if (V_ip6_forwarding != 0 #if defined(IPSEC) || defined(IPSEC_SUPPORT) && (!IPSEC_ENABLED(ipv6) || IPSEC_CAPS(ipv6, m, IPSEC_CAP_OPERABLE) == 0) #endif ) { if ((m = ip6_tryforward(m)) == NULL) return; if (m->m_flags & M_FASTFWD_OURS) { ip6 = mtod(m, struct ip6_hdr *); goto passin; } } #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Bypass packet filtering for packets previously handled by IPsec. */ if (IPSEC_ENABLED(ipv6) && IPSEC_CAPS(ipv6, m, IPSEC_CAP_BYPASS_FILTER) != 0) goto passin; #endif /* * Run through list of hooks for input packets. * * NB: Beware of the destination address changing * (e.g. by NAT rewriting). When this happens, * tell ip6_forward to do the right thing. */ /* Jump over all PFIL processing if hooks are not active. */ if (!PFIL_HOOKED(&V_inet6_pfil_hook)) goto passin; odst = ip6->ip6_dst; if (pfil_run_hooks(&V_inet6_pfil_hook, &m, m->m_pkthdr.rcvif, PFIL_IN, 0, NULL)) return; if (m == NULL) /* consumed by filter */ return; ip6 = mtod(m, struct ip6_hdr *); srcrt = !IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst); if ((m->m_flags & (M_IP6_NEXTHOP | M_FASTFWD_OURS)) == M_IP6_NEXTHOP && m_tag_find(m, PACKET_TAG_IPFORWARD, NULL) != NULL) { /* * Directly ship the packet on. This allows forwarding * packets originally destined to us to some other directly * connected host. */ ip6_forward(m, 1); return; } passin: /* * Disambiguate address scope zones (if there is ambiguity). * We first make sure that the original source or destination address * is not in our internal form for scoped addresses. Such addresses * are not necessarily invalid spec-wise, but we cannot accept them due * to the usage conflict. * in6_setscope() then also checks and rejects the cases where src or * dst are the loopback address and the receiving interface * is not loopback. */ if (in6_clearscope(&ip6->ip6_src) || in6_clearscope(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); /* XXX */ goto bad; } if (in6_setscope(&ip6->ip6_src, rcvif, NULL) || in6_setscope(&ip6->ip6_dst, rcvif, NULL)) { IP6STAT_INC(ip6s_badscope); goto bad; } if (m->m_flags & M_FASTFWD_OURS) { m->m_flags &= ~M_FASTFWD_OURS; ours = 1; goto hbhcheck; } /* * Multicast check. Assume packet is for us to avoid * prematurely taking locks. */ if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { ours = 1; in6_ifstat_inc(rcvif, ifs6_in_mcast); goto hbhcheck; } /* * Unicast check * XXX: For now we keep link-local IPv6 addresses with embedded * scope zone id, therefore we use zero zoneid here. */ ia = in6ifa_ifwithaddr(&ip6->ip6_dst, 0 /* XXX */); if (ia != NULL) { if (ia->ia6_flags & IN6_IFF_NOTREADY) { char ip6bufs[INET6_ADDRSTRLEN]; char ip6bufd[INET6_ADDRSTRLEN]; /* address is not ready, so discard the packet. */ nd6log((LOG_INFO, "ip6_input: packet to an unready address %s->%s\n", ip6_sprintf(ip6bufs, &ip6->ip6_src), ip6_sprintf(ip6bufd, &ip6->ip6_dst))); ifa_free(&ia->ia_ifa); goto bad; } /* Count the packet in the ip address stats */ counter_u64_add(ia->ia_ifa.ifa_ipackets, 1); counter_u64_add(ia->ia_ifa.ifa_ibytes, m->m_pkthdr.len); ifa_free(&ia->ia_ifa); ours = 1; goto hbhcheck; } /* * Now there is no reason to process the packet if it's not our own * and we're not a router. */ if (!V_ip6_forwarding) { IP6STAT_INC(ip6s_cantforward); goto bad; } hbhcheck: /* * Process Hop-by-Hop options header if it's contained. * m may be modified in ip6_hopopts_input(). * If a JumboPayload option is included, plen will also be modified. */ plen = (u_int32_t)ntohs(ip6->ip6_plen); if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { if (ip6_input_hbh(m, &plen, &rtalert, &off, &nxt, &ours) != 0) return; } else nxt = ip6->ip6_nxt; /* * Use mbuf flags to propagate Router Alert option to * ICMPv6 layer, as hop-by-hop options have been stripped. */ if (rtalert != ~0) m->m_flags |= M_RTALERT_MLD; /* * Check that the amount of data in the buffers * is as at least much as the IPv6 header would have us expect. * Trim mbufs if longer than we expect. * Drop packet if shorter than we expect. */ if (m->m_pkthdr.len - sizeof(struct ip6_hdr) < plen) { IP6STAT_INC(ip6s_tooshort); in6_ifstat_inc(rcvif, ifs6_in_truncated); goto bad; } if (m->m_pkthdr.len > sizeof(struct ip6_hdr) + plen) { if (m->m_len == m->m_pkthdr.len) { m->m_len = sizeof(struct ip6_hdr) + plen; m->m_pkthdr.len = sizeof(struct ip6_hdr) + plen; } else m_adj(m, sizeof(struct ip6_hdr) + plen - m->m_pkthdr.len); } /* * Forward if desirable. */ if (V_ip6_mrouter && IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { /* * If we are acting as a multicast router, all * incoming multicast packets are passed to the * kernel-level multicast forwarding function. * The packet is returned (relatively) intact; if * ip6_mforward() returns a non-zero value, the packet * must be discarded, else it may be accepted below. * * XXX TODO: Check hlim and multicast scope here to avoid * unnecessarily calling into ip6_mforward(). */ if (ip6_mforward && ip6_mforward(ip6, rcvif, m)) { IP6STAT_INC(ip6s_cantforward); goto bad; } } else if (!ours) { ip6_forward(m, srcrt); return; } ip6 = mtod(m, struct ip6_hdr *); /* * Malicious party may be able to use IPv4 mapped addr to confuse * tcp/udp stack and bypass security checks (act as if it was from * 127.0.0.1 by using IPv6 src ::ffff:127.0.0.1). Be cautious. * * For SIIT end node behavior, you may want to disable the check. * However, you will become vulnerable to attacks using IPv4 mapped * source. */ if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) || IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) { IP6STAT_INC(ip6s_badscope); in6_ifstat_inc(rcvif, ifs6_in_addrerr); goto bad; } /* * Tell launch routine the next header */ IP6STAT_INC(ip6s_delivered); in6_ifstat_inc(rcvif, ifs6_in_deliver); nest = 0; while (nxt != IPPROTO_DONE) { if (V_ip6_hdrnestlimit && (++nest > V_ip6_hdrnestlimit)) { IP6STAT_INC(ip6s_toomanyhdr); goto bad; } /* * protection against faulty packet - there should be * more sanity checks in header chain processing. */ if (m->m_pkthdr.len < off) { IP6STAT_INC(ip6s_tooshort); in6_ifstat_inc(rcvif, ifs6_in_truncated); goto bad; } #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (IPSEC_ENABLED(ipv6)) { if (IPSEC_INPUT(ipv6, m, off, nxt) != 0) return; } #endif /* IPSEC */ nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt); } return; bad: in6_ifstat_inc(rcvif, ifs6_in_discard); if (m != NULL) m_freem(m); } /* * Hop-by-Hop options header processing. If a valid jumbo payload option is * included, the real payload length will be stored in plenp. * * rtalertp - XXX: should be stored more smart way */ static int ip6_hopopts_input(u_int32_t *plenp, u_int32_t *rtalertp, struct mbuf **mp, int *offp) { struct mbuf *m = *mp; int off = *offp, hbhlen; struct ip6_hbh *hbh; /* validation of the length of the header */ #ifndef PULLDOWN_TEST IP6_EXTHDR_CHECK(m, off, sizeof(*hbh), -1); hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); hbhlen = (hbh->ip6h_len + 1) << 3; IP6_EXTHDR_CHECK(m, off, hbhlen, -1); hbh = (struct ip6_hbh *)(mtod(m, caddr_t) + off); #else IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), sizeof(struct ip6_hbh)); if (hbh == NULL) { IP6STAT_INC(ip6s_tooshort); return -1; } hbhlen = (hbh->ip6h_len + 1) << 3; IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr), hbhlen); if (hbh == NULL) { IP6STAT_INC(ip6s_tooshort); return -1; } #endif off += hbhlen; hbhlen -= sizeof(struct ip6_hbh); if (ip6_process_hopopts(m, (u_int8_t *)hbh + sizeof(struct ip6_hbh), hbhlen, rtalertp, plenp) < 0) return (-1); *offp = off; *mp = m; return (0); } /* * Search header for all Hop-by-hop options and process each option. * This function is separate from ip6_hopopts_input() in order to * handle a case where the sending node itself process its hop-by-hop * options header. In such a case, the function is called from ip6_output(). * * The function assumes that hbh header is located right after the IPv6 header * (RFC2460 p7), opthead is pointer into data content in m, and opthead to * opthead + hbhlen is located in contiguous memory region. */ int ip6_process_hopopts(struct mbuf *m, u_int8_t *opthead, int hbhlen, u_int32_t *rtalertp, u_int32_t *plenp) { struct ip6_hdr *ip6; int optlen = 0; u_int8_t *opt = opthead; u_int16_t rtalert_val; u_int32_t jumboplen; const int erroff = sizeof(struct ip6_hdr) + sizeof(struct ip6_hbh); for (; hbhlen > 0; hbhlen -= optlen, opt += optlen) { switch (*opt) { case IP6OPT_PAD1: optlen = 1; break; case IP6OPT_PADN: if (hbhlen < IP6OPT_MINLEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } optlen = *(opt + 1) + 2; break; case IP6OPT_ROUTER_ALERT: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_RTALERT_LEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } if (*(opt + 1) != IP6OPT_RTALERT_LEN - 2) { /* XXX stat */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return (-1); } optlen = IP6OPT_RTALERT_LEN; bcopy((caddr_t)(opt + 2), (caddr_t)&rtalert_val, 2); *rtalertp = ntohs(rtalert_val); break; case IP6OPT_JUMBO: /* XXX may need check for alignment */ if (hbhlen < IP6OPT_JUMBO_LEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } if (*(opt + 1) != IP6OPT_JUMBO_LEN - 2) { /* XXX stat */ icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 1 - opthead); return (-1); } optlen = IP6OPT_JUMBO_LEN; /* * IPv6 packets that have non 0 payload length * must not contain a jumbo payload option. */ ip6 = mtod(m, struct ip6_hdr *); if (ip6->ip6_plen) { IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt - opthead); return (-1); } /* * We may see jumbolen in unaligned location, so * we'd need to perform bcopy(). */ bcopy(opt + 2, &jumboplen, sizeof(jumboplen)); jumboplen = (u_int32_t)htonl(jumboplen); #if 1 /* * if there are multiple jumbo payload options, * *plenp will be non-zero and the packet will be * rejected. * the behavior may need some debate in ipngwg - * multiple options does not make sense, however, * there's no explicit mention in specification. */ if (*plenp != 0) { IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); return (-1); } #endif /* * jumbo payload length must be larger than 65535. */ if (jumboplen <= IPV6_MAXPACKET) { IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER, erroff + opt + 2 - opthead); return (-1); } *plenp = jumboplen; break; default: /* unknown option */ if (hbhlen < IP6OPT_MINLEN) { IP6STAT_INC(ip6s_toosmall); goto bad; } optlen = ip6_unknown_opt(opt, m, erroff + opt - opthead); if (optlen == -1) return (-1); optlen += 2; break; } } return (0); bad: m_freem(m); return (-1); } /* * Unknown option processing. * The third argument `off' is the offset from the IPv6 header to the option, * which is necessary if the IPv6 header the and option header and IPv6 header * is not contiguous in order to return an ICMPv6 error. */ int ip6_unknown_opt(u_int8_t *optp, struct mbuf *m, int off) { struct ip6_hdr *ip6; switch (IP6OPT_TYPE(*optp)) { case IP6OPT_TYPE_SKIP: /* ignore the option */ return ((int)*(optp + 1)); case IP6OPT_TYPE_DISCARD: /* silently discard */ m_freem(m); return (-1); case IP6OPT_TYPE_FORCEICMP: /* send ICMP even if multicasted */ IP6STAT_INC(ip6s_badoptions); icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); return (-1); case IP6OPT_TYPE_ICMP: /* send ICMP if not multicasted */ IP6STAT_INC(ip6s_badoptions); ip6 = mtod(m, struct ip6_hdr *); if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || (m->m_flags & (M_BCAST|M_MCAST))) m_freem(m); else icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off); return (-1); } m_freem(m); /* XXX: NOTREACHED */ return (-1); } /* * Create the "control" list for this pcb. * These functions will not modify mbuf chain at all. * * With KAME mbuf chain restriction: * The routine will be called from upper layer handlers like tcp6_input(). * Thus the routine assumes that the caller (tcp6_input) have already * called IP6_EXTHDR_CHECK() and all the extension headers are located in the * very first mbuf on the mbuf chain. * * ip6_savecontrol_v4 will handle those options that are possible to be * set on a v4-mapped socket. * ip6_savecontrol will directly call ip6_savecontrol_v4 to handle those * options and handle the v6-only ones itself. */ struct mbuf ** ip6_savecontrol_v4(struct inpcb *inp, struct mbuf *m, struct mbuf **mp, int *v4only) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); #ifdef SO_TIMESTAMP if ((inp->inp_socket->so_options & SO_TIMESTAMP) != 0) { union { struct timeval tv; struct bintime bt; struct timespec ts; } t; struct bintime boottimebin, bt1; struct timespec ts1; bool stamped; stamped = false; switch (inp->inp_socket->so_ts_clock) { case SO_TS_REALTIME_MICRO: if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &ts1); timespec2bintime(&ts1, &bt1); getboottimebin(&boottimebin); bintime_add(&bt1, &boottimebin); bintime2timeval(&bt1, &t.tv); } else { microtime(&t.tv); } *mp = sbcreatecontrol((caddr_t) &t.tv, sizeof(t.tv), SCM_TIMESTAMP, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } break; case SO_TS_BINTIME: if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &ts1); timespec2bintime(&ts1, &t.bt); getboottimebin(&boottimebin); bintime_add(&t.bt, &boottimebin); } else { bintime(&t.bt); } *mp = sbcreatecontrol((caddr_t)&t.bt, sizeof(t.bt), SCM_BINTIME, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } break; case SO_TS_REALTIME: if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { mbuf_tstmp2timespec(m, &t.ts); getboottimebin(&boottimebin); bintime2timespec(&boottimebin, &ts1); - timespecadd(&t.ts, &ts1); + timespecadd(&t.ts, &ts1, &t.ts); } else { nanotime(&t.ts); } *mp = sbcreatecontrol((caddr_t)&t.ts, sizeof(t.ts), SCM_REALTIME, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } break; case SO_TS_MONOTONIC: if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) mbuf_tstmp2timespec(m, &t.ts); else nanouptime(&t.ts); *mp = sbcreatecontrol((caddr_t)&t.ts, sizeof(t.ts), SCM_MONOTONIC, SOL_SOCKET); if (*mp != NULL) { mp = &(*mp)->m_next; stamped = true; } break; default: panic("unknown (corrupted) so_ts_clock"); } if (stamped && (m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) { struct sock_timestamp_info sti; bzero(&sti, sizeof(sti)); sti.st_info_flags = ST_INFO_HW; if ((m->m_flags & M_TSTMP_HPREC) != 0) sti.st_info_flags |= ST_INFO_HW_HPREC; *mp = sbcreatecontrol((caddr_t)&sti, sizeof(sti), SCM_TIME_INFO, SOL_SOCKET); if (*mp != NULL) mp = &(*mp)->m_next; } } #endif #define IS2292(inp, x, y) (((inp)->inp_flags & IN6P_RFC2292) ? (x) : (y)) /* RFC 2292 sec. 5 */ if ((inp->inp_flags & IN6P_PKTINFO) != 0) { struct in6_pktinfo pi6; if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { #ifdef INET struct ip *ip; ip = mtod(m, struct ip *); pi6.ipi6_addr.s6_addr32[0] = 0; pi6.ipi6_addr.s6_addr32[1] = 0; pi6.ipi6_addr.s6_addr32[2] = IPV6_ADDR_INT32_SMP; pi6.ipi6_addr.s6_addr32[3] = ip->ip_dst.s_addr; #else /* We won't hit this code */ bzero(&pi6.ipi6_addr, sizeof(struct in6_addr)); #endif } else { bcopy(&ip6->ip6_dst, &pi6.ipi6_addr, sizeof(struct in6_addr)); in6_clearscope(&pi6.ipi6_addr); /* XXX */ } pi6.ipi6_ifindex = (m && m->m_pkthdr.rcvif) ? m->m_pkthdr.rcvif->if_index : 0; *mp = sbcreatecontrol((caddr_t) &pi6, sizeof(struct in6_pktinfo), IS2292(inp, IPV6_2292PKTINFO, IPV6_PKTINFO), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if ((inp->inp_flags & IN6P_HOPLIMIT) != 0) { int hlim; if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { #ifdef INET struct ip *ip; ip = mtod(m, struct ip *); hlim = ip->ip_ttl; #else /* We won't hit this code */ hlim = 0; #endif } else { hlim = ip6->ip6_hlim & 0xff; } *mp = sbcreatecontrol((caddr_t) &hlim, sizeof(int), IS2292(inp, IPV6_2292HOPLIMIT, IPV6_HOPLIMIT), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if ((inp->inp_flags & IN6P_TCLASS) != 0) { int tclass; if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { #ifdef INET struct ip *ip; ip = mtod(m, struct ip *); tclass = ip->ip_tos; #else /* We won't hit this code */ tclass = 0; #endif } else { u_int32_t flowinfo; flowinfo = (u_int32_t)ntohl(ip6->ip6_flow & IPV6_FLOWINFO_MASK); flowinfo >>= 20; tclass = flowinfo & 0xff; } *mp = sbcreatecontrol((caddr_t) &tclass, sizeof(int), IPV6_TCLASS, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } if (v4only != NULL) { if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) { *v4only = 1; } else { *v4only = 0; } } return (mp); } void ip6_savecontrol(struct inpcb *in6p, struct mbuf *m, struct mbuf **mp) { struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); int v4only = 0; mp = ip6_savecontrol_v4(in6p, m, mp, &v4only); if (v4only) return; /* * IPV6_HOPOPTS socket option. Recall that we required super-user * privilege for the option (see ip6_ctloutput), but it might be too * strict, since there might be some hop-by-hop options which can be * returned to normal user. * See also RFC 2292 section 6 (or RFC 3542 section 8). */ if ((in6p->inp_flags & IN6P_HOPOPTS) != 0) { /* * Check if a hop-by-hop options header is contatined in the * received packet, and if so, store the options as ancillary * data. Note that a hop-by-hop options header must be * just after the IPv6 header, which is assured through the * IPv6 input processing. */ if (ip6->ip6_nxt == IPPROTO_HOPOPTS) { struct ip6_hbh *hbh; int hbhlen = 0; #ifdef PULLDOWN_TEST struct mbuf *ext; #endif #ifndef PULLDOWN_TEST hbh = (struct ip6_hbh *)(ip6 + 1); hbhlen = (hbh->ip6h_len + 1) << 3; #else ext = ip6_pullexthdr(m, sizeof(struct ip6_hdr), ip6->ip6_nxt); if (ext == NULL) { IP6STAT_INC(ip6s_tooshort); return; } hbh = mtod(ext, struct ip6_hbh *); hbhlen = (hbh->ip6h_len + 1) << 3; if (hbhlen != ext->m_len) { m_freem(ext); IP6STAT_INC(ip6s_tooshort); return; } #endif /* * XXX: We copy the whole header even if a * jumbo payload option is included, the option which * is to be removed before returning according to * RFC2292. * Note: this constraint is removed in RFC3542 */ *mp = sbcreatecontrol((caddr_t)hbh, hbhlen, IS2292(in6p, IPV6_2292HOPOPTS, IPV6_HOPOPTS), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; #ifdef PULLDOWN_TEST m_freem(ext); #endif } } if ((in6p->inp_flags & (IN6P_RTHDR | IN6P_DSTOPTS)) != 0) { int nxt = ip6->ip6_nxt, off = sizeof(struct ip6_hdr); /* * Search for destination options headers or routing * header(s) through the header chain, and stores each * header as ancillary data. * Note that the order of the headers remains in * the chain of ancillary data. */ while (1) { /* is explicit loop prevention necessary? */ struct ip6_ext *ip6e = NULL; int elen; #ifdef PULLDOWN_TEST struct mbuf *ext = NULL; #endif /* * if it is not an extension header, don't try to * pull it from the chain. */ switch (nxt) { case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: goto loopend; } #ifndef PULLDOWN_TEST if (off + sizeof(*ip6e) > m->m_len) goto loopend; ip6e = (struct ip6_ext *)(mtod(m, caddr_t) + off); if (nxt == IPPROTO_AH) elen = (ip6e->ip6e_len + 2) << 2; else elen = (ip6e->ip6e_len + 1) << 3; if (off + elen > m->m_len) goto loopend; #else ext = ip6_pullexthdr(m, off, nxt); if (ext == NULL) { IP6STAT_INC(ip6s_tooshort); return; } ip6e = mtod(ext, struct ip6_ext *); if (nxt == IPPROTO_AH) elen = (ip6e->ip6e_len + 2) << 2; else elen = (ip6e->ip6e_len + 1) << 3; if (elen != ext->m_len) { m_freem(ext); IP6STAT_INC(ip6s_tooshort); return; } #endif switch (nxt) { case IPPROTO_DSTOPTS: if (!(in6p->inp_flags & IN6P_DSTOPTS)) break; *mp = sbcreatecontrol((caddr_t)ip6e, elen, IS2292(in6p, IPV6_2292DSTOPTS, IPV6_DSTOPTS), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; break; case IPPROTO_ROUTING: if (!(in6p->inp_flags & IN6P_RTHDR)) break; *mp = sbcreatecontrol((caddr_t)ip6e, elen, IS2292(in6p, IPV6_2292RTHDR, IPV6_RTHDR), IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; break; case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: /* * other cases have been filtered in the above. * none will visit this case. here we supply * the code just in case (nxt overwritten or * other cases). */ #ifdef PULLDOWN_TEST m_freem(ext); #endif goto loopend; } /* proceed with the next header. */ off += elen; nxt = ip6e->ip6e_nxt; ip6e = NULL; #ifdef PULLDOWN_TEST m_freem(ext); ext = NULL; #endif } loopend: ; } if (in6p->inp_flags2 & INP_RECVFLOWID) { uint32_t flowid, flow_type; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); /* * XXX should handle the failure of one or the * other - don't populate both? */ *mp = sbcreatecontrol((caddr_t) &flowid, sizeof(uint32_t), IPV6_FLOWID, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; *mp = sbcreatecontrol((caddr_t) &flow_type, sizeof(uint32_t), IPV6_FLOWTYPE, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } #ifdef RSS if (in6p->inp_flags2 & INP_RECVRSSBUCKETID) { uint32_t flowid, flow_type; uint32_t rss_bucketid; flowid = m->m_pkthdr.flowid; flow_type = M_HASHTYPE_GET(m); if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) { *mp = sbcreatecontrol((caddr_t) &rss_bucketid, sizeof(uint32_t), IPV6_RSSBUCKETID, IPPROTO_IPV6); if (*mp) mp = &(*mp)->m_next; } } #endif } #undef IS2292 void ip6_notify_pmtu(struct inpcb *inp, struct sockaddr_in6 *dst, u_int32_t mtu) { struct socket *so; struct mbuf *m_mtu; struct ip6_mtuinfo mtuctl; KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); /* * Notify the error by sending IPV6_PATHMTU ancillary data if * application wanted to know the MTU value. * NOTE: we notify disconnected sockets, because some udp * applications keep sending sockets disconnected. * NOTE: our implementation doesn't notify connected sockets that has * foreign address that is different than given destination addresses * (this is permitted by RFC 3542). */ if ((inp->inp_flags & IN6P_MTU) == 0 || ( !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &dst->sin6_addr))) return; mtuctl.ip6m_mtu = mtu; mtuctl.ip6m_addr = *dst; if (sa6_recoverscope(&mtuctl.ip6m_addr)) return; if ((m_mtu = sbcreatecontrol((caddr_t)&mtuctl, sizeof(mtuctl), IPV6_PATHMTU, IPPROTO_IPV6)) == NULL) return; so = inp->inp_socket; if (sbappendaddr(&so->so_rcv, (struct sockaddr *)dst, NULL, m_mtu) == 0) { m_freem(m_mtu); /* XXX: should count statistics */ } else sorwakeup(so); } #ifdef PULLDOWN_TEST /* * pull single extension header from mbuf chain. returns single mbuf that * contains the result, or NULL on error. */ static struct mbuf * ip6_pullexthdr(struct mbuf *m, size_t off, int nxt) { struct ip6_ext ip6e; size_t elen; struct mbuf *n; #ifdef DIAGNOSTIC switch (nxt) { case IPPROTO_DSTOPTS: case IPPROTO_ROUTING: case IPPROTO_HOPOPTS: case IPPROTO_AH: /* is it possible? */ break; default: printf("ip6_pullexthdr: invalid nxt=%d\n", nxt); } #endif m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxt == IPPROTO_AH) elen = (ip6e.ip6e_len + 2) << 2; else elen = (ip6e.ip6e_len + 1) << 3; if (elen > MLEN) n = m_getcl(M_NOWAIT, MT_DATA, 0); else n = m_get(M_NOWAIT, MT_DATA); if (n == NULL) return NULL; m_copydata(m, off, elen, mtod(n, caddr_t)); n->m_len = elen; return n; } #endif /* * Get pointer to the previous header followed by the header * currently processed. */ int ip6_get_prevhdr(const struct mbuf *m, int off) { struct ip6_ext ip6e; struct ip6_hdr *ip6; int len, nlen, nxt; if (off == sizeof(struct ip6_hdr)) return (offsetof(struct ip6_hdr, ip6_nxt)); if (off < sizeof(struct ip6_hdr)) panic("%s: off < sizeof(struct ip6_hdr)", __func__); ip6 = mtod(m, struct ip6_hdr *); nxt = ip6->ip6_nxt; len = sizeof(struct ip6_hdr); nlen = 0; while (len < off) { m_copydata(m, len, sizeof(ip6e), (caddr_t)&ip6e); switch (nxt) { case IPPROTO_FRAGMENT: nlen = sizeof(struct ip6_frag); break; case IPPROTO_AH: nlen = (ip6e.ip6e_len + 2) << 2; break; default: nlen = (ip6e.ip6e_len + 1) << 3; } len += nlen; nxt = ip6e.ip6e_nxt; } return (len - nlen); } /* * get next header offset. m will be retained. */ int ip6_nexthdr(const struct mbuf *m, int off, int proto, int *nxtp) { struct ip6_hdr ip6; struct ip6_ext ip6e; struct ip6_frag fh; /* just in case */ if (m == NULL) panic("ip6_nexthdr: m == NULL"); if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len < off) return -1; switch (proto) { case IPPROTO_IPV6: if (m->m_pkthdr.len < off + sizeof(ip6)) return -1; m_copydata(m, off, sizeof(ip6), (caddr_t)&ip6); if (nxtp) *nxtp = ip6.ip6_nxt; off += sizeof(ip6); return off; case IPPROTO_FRAGMENT: /* * terminate parsing if it is not the first fragment, * it does not make sense to parse through it. */ if (m->m_pkthdr.len < off + sizeof(fh)) return -1; m_copydata(m, off, sizeof(fh), (caddr_t)&fh); /* IP6F_OFF_MASK = 0xfff8(BigEndian), 0xf8ff(LittleEndian) */ if (fh.ip6f_offlg & IP6F_OFF_MASK) return -1; if (nxtp) *nxtp = fh.ip6f_nxt; off += sizeof(struct ip6_frag); return off; case IPPROTO_AH: if (m->m_pkthdr.len < off + sizeof(ip6e)) return -1; m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxtp) *nxtp = ip6e.ip6e_nxt; off += (ip6e.ip6e_len + 2) << 2; return off; case IPPROTO_HOPOPTS: case IPPROTO_ROUTING: case IPPROTO_DSTOPTS: if (m->m_pkthdr.len < off + sizeof(ip6e)) return -1; m_copydata(m, off, sizeof(ip6e), (caddr_t)&ip6e); if (nxtp) *nxtp = ip6e.ip6e_nxt; off += (ip6e.ip6e_len + 1) << 3; return off; case IPPROTO_NONE: case IPPROTO_ESP: case IPPROTO_IPCOMP: /* give up */ return -1; default: return -1; } /* NOTREACHED */ } /* * get offset for the last header in the chain. m will be kept untainted. */ int ip6_lasthdr(const struct mbuf *m, int off, int proto, int *nxtp) { int newoff; int nxt; if (!nxtp) { nxt = -1; nxtp = &nxt; } while (1) { newoff = ip6_nexthdr(m, off, proto, nxtp); if (newoff < 0) return off; else if (newoff < off) return -1; /* invalid */ else if (newoff == off) return newoff; off = newoff; proto = *nxtp; } } /* * System control for IP6 */ u_char inet6ctlerrmap[PRC_NCMDS] = { 0, 0, 0, 0, 0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH, EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED, EMSGSIZE, EHOSTUNREACH, 0, 0, 0, 0, EHOSTUNREACH, 0, ENOPROTOOPT, ECONNREFUSED }; Index: head/sys/netsmb/smb_iod.c =================================================================== --- head/sys/netsmb/smb_iod.c (revision 336913) +++ head/sys/netsmb/smb_iod.c (revision 336914) @@ -1,722 +1,722 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2000-2001 Boris Popov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define SMBIOD_SLEEP_TIMO 2 #define SMBIOD_PING_TIMO 60 /* seconds */ #define SMB_IOD_EVLOCKPTR(iod) (&((iod)->iod_evlock)) #define SMB_IOD_EVLOCK(iod) smb_sl_lock(&((iod)->iod_evlock)) #define SMB_IOD_EVUNLOCK(iod) smb_sl_unlock(&((iod)->iod_evlock)) #define SMB_IOD_RQLOCKPTR(iod) (&((iod)->iod_rqlock)) #define SMB_IOD_RQLOCK(iod) smb_sl_lock(&((iod)->iod_rqlock)) #define SMB_IOD_RQUNLOCK(iod) smb_sl_unlock(&((iod)->iod_rqlock)) #define smb_iod_wakeup(iod) wakeup(&(iod)->iod_flags) static MALLOC_DEFINE(M_SMBIOD, "SMBIOD", "SMB network io daemon"); static int smb_iod_next; static int smb_iod_sendall(struct smbiod *iod); static int smb_iod_disconnect(struct smbiod *iod); static void smb_iod_thread(void *); static __inline void smb_iod_rqprocessed(struct smb_rq *rqp, int error) { SMBRQ_SLOCK(rqp); rqp->sr_lerror = error; rqp->sr_rpgen++; rqp->sr_state = SMBRQ_NOTIFIED; wakeup(&rqp->sr_state); SMBRQ_SUNLOCK(rqp); } static void smb_iod_invrq(struct smbiod *iod) { struct smb_rq *rqp; /* * Invalidate all outstanding requests for this connection */ SMB_IOD_RQLOCK(iod); TAILQ_FOREACH(rqp, &iod->iod_rqlist, sr_link) { rqp->sr_flags |= SMBR_RESTART; smb_iod_rqprocessed(rqp, ENOTCONN); } SMB_IOD_RQUNLOCK(iod); } static void smb_iod_closetran(struct smbiod *iod) { struct smb_vc *vcp = iod->iod_vc; struct thread *td = iod->iod_td; if (vcp->vc_tdata == NULL) return; SMB_TRAN_DISCONNECT(vcp, td); SMB_TRAN_DONE(vcp, td); vcp->vc_tdata = NULL; } static void smb_iod_dead(struct smbiod *iod) { iod->iod_state = SMBIOD_ST_DEAD; smb_iod_closetran(iod); smb_iod_invrq(iod); } static int smb_iod_connect(struct smbiod *iod) { struct smb_vc *vcp = iod->iod_vc; struct thread *td = iod->iod_td; int error; SMBIODEBUG("%d\n", iod->iod_state); switch(iod->iod_state) { case SMBIOD_ST_VCACTIVE: SMBERROR("called for already opened connection\n"); return EISCONN; case SMBIOD_ST_DEAD: return ENOTCONN; /* XXX: last error code ? */ default: break; } vcp->vc_genid++; error = 0; error = (int)SMB_TRAN_CREATE(vcp, td); if (error) goto fail; SMBIODEBUG("tcreate\n"); if (vcp->vc_laddr) { error = (int)SMB_TRAN_BIND(vcp, vcp->vc_laddr, td); if (error) goto fail; } SMBIODEBUG("tbind\n"); error = (int)SMB_TRAN_CONNECT(vcp, vcp->vc_paddr, td); if (error) goto fail; SMB_TRAN_SETPARAM(vcp, SMBTP_SELECTID, &iod->iod_flags); iod->iod_state = SMBIOD_ST_TRANACTIVE; SMBIODEBUG("tconnect\n"); /* vcp->vc_mid = 0;*/ error = (int)smb_smb_negotiate(vcp, &iod->iod_scred); if (error) goto fail; SMBIODEBUG("snegotiate\n"); error = (int)smb_smb_ssnsetup(vcp, &iod->iod_scred); if (error) goto fail; iod->iod_state = SMBIOD_ST_VCACTIVE; SMBIODEBUG("completed\n"); smb_iod_invrq(iod); return (0); fail: smb_iod_dead(iod); return (error); } static int smb_iod_disconnect(struct smbiod *iod) { struct smb_vc *vcp = iod->iod_vc; SMBIODEBUG("\n"); if (iod->iod_state == SMBIOD_ST_VCACTIVE) { smb_smb_ssnclose(vcp, &iod->iod_scred); iod->iod_state = SMBIOD_ST_TRANACTIVE; } vcp->vc_smbuid = SMB_UID_UNKNOWN; smb_iod_closetran(iod); iod->iod_state = SMBIOD_ST_NOTCONN; return 0; } static int smb_iod_treeconnect(struct smbiod *iod, struct smb_share *ssp) { int error; if (iod->iod_state != SMBIOD_ST_VCACTIVE) { if (iod->iod_state != SMBIOD_ST_DEAD) return ENOTCONN; iod->iod_state = SMBIOD_ST_RECONNECT; error = smb_iod_connect(iod); if (error) return error; } SMBIODEBUG("tree reconnect\n"); SMBS_ST_LOCK(ssp); ssp->ss_flags |= SMBS_RECONNECTING; SMBS_ST_UNLOCK(ssp); error = smb_smb_treeconnect(ssp, &iod->iod_scred); SMBS_ST_LOCK(ssp); ssp->ss_flags &= ~SMBS_RECONNECTING; SMBS_ST_UNLOCK(ssp); wakeup(&ssp->ss_vcgenid); return error; } static int smb_iod_sendrq(struct smbiod *iod, struct smb_rq *rqp) { struct thread *td = iod->iod_td; struct smb_vc *vcp = iod->iod_vc; struct smb_share *ssp = rqp->sr_share; struct mbuf *m; int error; SMBIODEBUG("iod_state = %d\n", iod->iod_state); switch (iod->iod_state) { case SMBIOD_ST_NOTCONN: smb_iod_rqprocessed(rqp, ENOTCONN); return 0; case SMBIOD_ST_DEAD: iod->iod_state = SMBIOD_ST_RECONNECT; return 0; case SMBIOD_ST_RECONNECT: return 0; default: break; } if (rqp->sr_sendcnt == 0) { #ifdef movedtoanotherplace if (vcp->vc_maxmux != 0 && iod->iod_muxcnt >= vcp->vc_maxmux) return 0; #endif le16enc(rqp->sr_rqtid, ssp ? ssp->ss_tid : SMB_TID_UNKNOWN); le16enc(rqp->sr_rquid, vcp ? vcp->vc_smbuid : 0); mb_fixhdr(&rqp->sr_rq); if (vcp->vc_hflags2 & SMB_FLAGS2_SECURITY_SIGNATURE) smb_rq_sign(rqp); } if (rqp->sr_sendcnt++ > 5) { rqp->sr_flags |= SMBR_RESTART; smb_iod_rqprocessed(rqp, rqp->sr_lerror); /* * If all attempts to send a request failed, then * something is seriously hosed. */ return ENOTCONN; } SMBSDEBUG("M:%04x, P:%04x, U:%04x, T:%04x\n", rqp->sr_mid, 0, 0, 0); m_dumpm(rqp->sr_rq.mb_top); m = m_copym(rqp->sr_rq.mb_top, 0, M_COPYALL, M_WAITOK); error = rqp->sr_lerror = SMB_TRAN_SEND(vcp, m, td); if (error == 0) { getnanotime(&rqp->sr_timesent); iod->iod_lastrqsent = rqp->sr_timesent; rqp->sr_flags |= SMBR_SENT; rqp->sr_state = SMBRQ_SENT; return 0; } /* * Check for fatal errors */ if (SMB_TRAN_FATAL(vcp, error)) { /* * No further attempts should be made */ return ENOTCONN; } if (smb_rq_intr(rqp)) smb_iod_rqprocessed(rqp, EINTR); return 0; } /* * Process incoming packets */ static int smb_iod_recvall(struct smbiod *iod) { struct smb_vc *vcp = iod->iod_vc; struct thread *td = iod->iod_td; struct smb_rq *rqp; struct mbuf *m; u_char *hp; u_short mid; int error; switch (iod->iod_state) { case SMBIOD_ST_NOTCONN: case SMBIOD_ST_DEAD: case SMBIOD_ST_RECONNECT: return 0; default: break; } for (;;) { m = NULL; error = SMB_TRAN_RECV(vcp, &m, td); if (error == EWOULDBLOCK) break; if (SMB_TRAN_FATAL(vcp, error)) { smb_iod_dead(iod); break; } if (error) break; if (m == NULL) { SMBERROR("tran return NULL without error\n"); error = EPIPE; continue; } m = m_pullup(m, SMB_HDRLEN); if (m == NULL) continue; /* wait for a good packet */ /* * Now we got an entire and possibly invalid SMB packet. * Be careful while parsing it. */ m_dumpm(m); hp = mtod(m, u_char*); if (bcmp(hp, SMB_SIGNATURE, SMB_SIGLEN) != 0) { m_freem(m); continue; } mid = SMB_HDRMID(hp); SMBSDEBUG("mid %04x\n", (u_int)mid); SMB_IOD_RQLOCK(iod); TAILQ_FOREACH(rqp, &iod->iod_rqlist, sr_link) { if (rqp->sr_mid != mid) continue; SMBRQ_SLOCK(rqp); if (rqp->sr_rp.md_top == NULL) { md_initm(&rqp->sr_rp, m); } else { if (rqp->sr_flags & SMBR_MULTIPACKET) { md_append_record(&rqp->sr_rp, m); } else { SMBRQ_SUNLOCK(rqp); SMBERROR("duplicate response %d (ignored)\n", mid); break; } } SMBRQ_SUNLOCK(rqp); smb_iod_rqprocessed(rqp, 0); break; } SMB_IOD_RQUNLOCK(iod); if (rqp == NULL) { SMBERROR("drop resp with mid %d\n", (u_int)mid); /* smb_printrqlist(vcp);*/ m_freem(m); } } /* * check for interrupts */ SMB_IOD_RQLOCK(iod); TAILQ_FOREACH(rqp, &iod->iod_rqlist, sr_link) { if (smb_td_intr(rqp->sr_cred->scr_td)) { smb_iod_rqprocessed(rqp, EINTR); } } SMB_IOD_RQUNLOCK(iod); return 0; } int smb_iod_request(struct smbiod *iod, int event, void *ident) { struct smbiod_event *evp; int error; SMBIODEBUG("\n"); evp = smb_zmalloc(sizeof(*evp), M_SMBIOD, M_WAITOK); evp->ev_type = event; evp->ev_ident = ident; SMB_IOD_EVLOCK(iod); STAILQ_INSERT_TAIL(&iod->iod_evlist, evp, ev_link); if ((event & SMBIOD_EV_SYNC) == 0) { SMB_IOD_EVUNLOCK(iod); smb_iod_wakeup(iod); return 0; } smb_iod_wakeup(iod); msleep(evp, SMB_IOD_EVLOCKPTR(iod), PWAIT | PDROP, "90evw", 0); error = evp->ev_error; free(evp, M_SMBIOD); return error; } /* * Place request in the queue. * Request from smbiod have a high priority. */ int smb_iod_addrq(struct smb_rq *rqp) { struct smb_vc *vcp = rqp->sr_vc; struct smbiod *iod = vcp->vc_iod; int error; SMBIODEBUG("\n"); if (rqp->sr_cred->scr_td != NULL && rqp->sr_cred->scr_td->td_proc == iod->iod_p) { rqp->sr_flags |= SMBR_INTERNAL; SMB_IOD_RQLOCK(iod); TAILQ_INSERT_HEAD(&iod->iod_rqlist, rqp, sr_link); SMB_IOD_RQUNLOCK(iod); for (;;) { if (smb_iod_sendrq(iod, rqp) != 0) { smb_iod_dead(iod); break; } /* * we don't need to lock state field here */ if (rqp->sr_state != SMBRQ_NOTSENT) break; tsleep(&iod->iod_flags, PWAIT, "90sndw", hz); } if (rqp->sr_lerror) smb_iod_removerq(rqp); return rqp->sr_lerror; } switch (iod->iod_state) { case SMBIOD_ST_NOTCONN: return ENOTCONN; case SMBIOD_ST_DEAD: error = smb_iod_request(vcp->vc_iod, SMBIOD_EV_CONNECT | SMBIOD_EV_SYNC, NULL); if (error) return error; return EXDEV; default: break; } SMB_IOD_RQLOCK(iod); for (;;) { if (vcp->vc_maxmux == 0) { SMBERROR("maxmux == 0\n"); break; } if (iod->iod_muxcnt < vcp->vc_maxmux) break; iod->iod_muxwant++; msleep(&iod->iod_muxwant, SMB_IOD_RQLOCKPTR(iod), PWAIT, "90mux", 0); } iod->iod_muxcnt++; TAILQ_INSERT_TAIL(&iod->iod_rqlist, rqp, sr_link); SMB_IOD_RQUNLOCK(iod); smb_iod_wakeup(iod); return 0; } int smb_iod_removerq(struct smb_rq *rqp) { struct smb_vc *vcp = rqp->sr_vc; struct smbiod *iod = vcp->vc_iod; SMBIODEBUG("\n"); if (rqp->sr_flags & SMBR_INTERNAL) { SMB_IOD_RQLOCK(iod); TAILQ_REMOVE(&iod->iod_rqlist, rqp, sr_link); SMB_IOD_RQUNLOCK(iod); return 0; } SMB_IOD_RQLOCK(iod); while (rqp->sr_flags & SMBR_XLOCK) { rqp->sr_flags |= SMBR_XLOCKWANT; msleep(rqp, SMB_IOD_RQLOCKPTR(iod), PWAIT, "90xrm", 0); } TAILQ_REMOVE(&iod->iod_rqlist, rqp, sr_link); iod->iod_muxcnt--; if (iod->iod_muxwant) { iod->iod_muxwant--; wakeup(&iod->iod_muxwant); } SMB_IOD_RQUNLOCK(iod); return 0; } int smb_iod_waitrq(struct smb_rq *rqp) { struct smbiod *iod = rqp->sr_vc->vc_iod; int error; SMBIODEBUG("\n"); if (rqp->sr_flags & SMBR_INTERNAL) { for (;;) { smb_iod_sendall(iod); smb_iod_recvall(iod); if (rqp->sr_rpgen != rqp->sr_rplast) break; tsleep(&iod->iod_flags, PWAIT, "90irq", hz); } smb_iod_removerq(rqp); return rqp->sr_lerror; } SMBRQ_SLOCK(rqp); if (rqp->sr_rpgen == rqp->sr_rplast) msleep(&rqp->sr_state, SMBRQ_SLOCKPTR(rqp), PWAIT, "90wrq", 0); rqp->sr_rplast++; SMBRQ_SUNLOCK(rqp); error = rqp->sr_lerror; if (rqp->sr_flags & SMBR_MULTIPACKET) { /* * If request should stay in the list, then reinsert it * at the end of queue so other waiters have chance to concur */ SMB_IOD_RQLOCK(iod); TAILQ_REMOVE(&iod->iod_rqlist, rqp, sr_link); TAILQ_INSERT_TAIL(&iod->iod_rqlist, rqp, sr_link); SMB_IOD_RQUNLOCK(iod); } else smb_iod_removerq(rqp); return error; } static int smb_iod_sendall(struct smbiod *iod) { struct smb_vc *vcp = iod->iod_vc; struct smb_rq *rqp; struct timespec ts, tstimeout; int herror; herror = 0; /* * Loop through the list of requests and send them if possible */ SMB_IOD_RQLOCK(iod); TAILQ_FOREACH(rqp, &iod->iod_rqlist, sr_link) { switch (rqp->sr_state) { case SMBRQ_NOTSENT: rqp->sr_flags |= SMBR_XLOCK; SMB_IOD_RQUNLOCK(iod); herror = smb_iod_sendrq(iod, rqp); SMB_IOD_RQLOCK(iod); rqp->sr_flags &= ~SMBR_XLOCK; if (rqp->sr_flags & SMBR_XLOCKWANT) { rqp->sr_flags &= ~SMBR_XLOCKWANT; wakeup(rqp); } break; case SMBRQ_SENT: SMB_TRAN_GETPARAM(vcp, SMBTP_TIMEOUT, &tstimeout); - timespecadd(&tstimeout, &tstimeout); + timespecadd(&tstimeout, &tstimeout, &tstimeout); getnanotime(&ts); - timespecsub(&ts, &tstimeout); + timespecsub(&ts, &tstimeout, &ts); if (timespeccmp(&ts, &rqp->sr_timesent, >)) { smb_iod_rqprocessed(rqp, ETIMEDOUT); } break; default: break; } if (herror) break; } SMB_IOD_RQUNLOCK(iod); if (herror == ENOTCONN) smb_iod_dead(iod); return 0; } /* * "main" function for smbiod daemon */ static __inline void smb_iod_main(struct smbiod *iod) { /* struct smb_vc *vcp = iod->iod_vc;*/ struct smbiod_event *evp; /* struct timespec tsnow;*/ int error; SMBIODEBUG("\n"); error = 0; /* * Check all interesting events */ for (;;) { SMB_IOD_EVLOCK(iod); evp = STAILQ_FIRST(&iod->iod_evlist); if (evp == NULL) { SMB_IOD_EVUNLOCK(iod); break; } STAILQ_REMOVE_HEAD(&iod->iod_evlist, ev_link); evp->ev_type |= SMBIOD_EV_PROCESSING; SMB_IOD_EVUNLOCK(iod); switch (evp->ev_type & SMBIOD_EV_MASK) { case SMBIOD_EV_CONNECT: iod->iod_state = SMBIOD_ST_RECONNECT; evp->ev_error = smb_iod_connect(iod); break; case SMBIOD_EV_DISCONNECT: evp->ev_error = smb_iod_disconnect(iod); break; case SMBIOD_EV_TREECONNECT: evp->ev_error = smb_iod_treeconnect(iod, evp->ev_ident); break; case SMBIOD_EV_SHUTDOWN: iod->iod_flags |= SMBIOD_SHUTDOWN; break; case SMBIOD_EV_NEWRQ: break; } if (evp->ev_type & SMBIOD_EV_SYNC) { SMB_IOD_EVLOCK(iod); wakeup(evp); SMB_IOD_EVUNLOCK(iod); } else free(evp, M_SMBIOD); } #if 0 if (iod->iod_state == SMBIOD_ST_VCACTIVE) { getnanotime(&tsnow); - timespecsub(&tsnow, &iod->iod_pingtimo); + timespecsub(&tsnow, &iod->iod_pingtimo, &tsnow); if (timespeccmp(&tsnow, &iod->iod_lastrqsent, >)) { smb_smb_echo(vcp, &iod->iod_scred); } } #endif smb_iod_sendall(iod); smb_iod_recvall(iod); return; } void smb_iod_thread(void *arg) { struct smbiod *iod = arg; mtx_lock(&Giant); /* * Here we assume that the thread structure will be the same * for an entire kthread (kproc, to be more precise) life. */ iod->iod_td = curthread; smb_makescred(&iod->iod_scred, iod->iod_td, NULL); while ((iod->iod_flags & SMBIOD_SHUTDOWN) == 0) { smb_iod_main(iod); SMBIODEBUG("going to sleep for %d ticks\n", iod->iod_sleeptimo); if (iod->iod_flags & SMBIOD_SHUTDOWN) break; tsleep(&iod->iod_flags, PWAIT, "90idle", iod->iod_sleeptimo); } /* We can now safely destroy the mutexes and free the iod structure. */ smb_sl_destroy(&iod->iod_rqlock); smb_sl_destroy(&iod->iod_evlock); free(iod, M_SMBIOD); mtx_unlock(&Giant); kproc_exit(0); } int smb_iod_create(struct smb_vc *vcp) { struct smbiod *iod; int error; iod = smb_zmalloc(sizeof(*iod), M_SMBIOD, M_WAITOK); iod->iod_id = smb_iod_next++; iod->iod_state = SMBIOD_ST_NOTCONN; iod->iod_vc = vcp; iod->iod_sleeptimo = hz * SMBIOD_SLEEP_TIMO; iod->iod_pingtimo.tv_sec = SMBIOD_PING_TIMO; getnanotime(&iod->iod_lastrqsent); vcp->vc_iod = iod; smb_sl_init(&iod->iod_rqlock, "90rql"); TAILQ_INIT(&iod->iod_rqlist); smb_sl_init(&iod->iod_evlock, "90evl"); STAILQ_INIT(&iod->iod_evlist); error = kproc_create(smb_iod_thread, iod, &iod->iod_p, RFNOWAIT, 0, "smbiod%d", iod->iod_id); if (error) { SMBERROR("can't start smbiod: %d", error); vcp->vc_iod = NULL; smb_sl_destroy(&iod->iod_rqlock); smb_sl_destroy(&iod->iod_evlock); free(iod, M_SMBIOD); return error; } return 0; } int smb_iod_destroy(struct smbiod *iod) { smb_iod_request(iod, SMBIOD_EV_SHUTDOWN | SMBIOD_EV_SYNC, NULL); return 0; } int smb_iod_init(void) { return 0; } int smb_iod_done(void) { return 0; } Index: head/sys/netsmb/smb_trantcp.c =================================================================== --- head/sys/netsmb/smb_trantcp.c (revision 336913) +++ head/sys/netsmb/smb_trantcp.c (revision 336914) @@ -1,696 +1,695 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2000-2001 Boris Popov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define M_NBDATA M_PCB static int smb_tcpsndbuf = NB_SNDQ - 1; static int smb_tcprcvbuf = NB_RCVQ - 1; SYSCTL_DECL(_net_smb); SYSCTL_INT(_net_smb, OID_AUTO, tcpsndbuf, CTLFLAG_RW, &smb_tcpsndbuf, 0, ""); SYSCTL_INT(_net_smb, OID_AUTO, tcprcvbuf, CTLFLAG_RW, &smb_tcprcvbuf, 0, ""); #define nb_sosend(so,m,flags,td) sosend(so, NULL, 0, m, 0, flags, td) static int nbssn_recv(struct nbpcb *nbp, struct mbuf **mpp, int *lenp, u_int8_t *rpcodep, struct thread *td); static int smb_nbst_disconnect(struct smb_vc *vcp, struct thread *td); static int nb_setsockopt_int(struct socket *so, int level, int name, int val) { struct sockopt sopt; int error; bzero(&sopt, sizeof(sopt)); sopt.sopt_level = level; sopt.sopt_name = name; sopt.sopt_val = &val; sopt.sopt_valsize = sizeof(val); CURVNET_SET(so->so_vnet); error = sosetopt(so, &sopt); CURVNET_RESTORE(); return error; } static int nb_intr(struct nbpcb *nbp, struct proc *p) { return 0; } static int nb_upcall(struct socket *so, void *arg, int waitflag) { struct nbpcb *nbp = arg; if (arg == NULL || nbp->nbp_selectid == NULL) return (SU_OK); wakeup(nbp->nbp_selectid); return (SU_OK); } static int nb_sethdr(struct mbuf *m, u_int8_t type, u_int32_t len) { u_int32_t *p = mtod(m, u_int32_t *); *p = htonl((len & 0x1FFFF) | (type << 24)); return 0; } static int nb_put_name(struct mbchain *mbp, struct sockaddr_nb *snb) { int error; u_char seglen, *cp; cp = snb->snb_name; if (*cp == 0) return EINVAL; NBDEBUG("[%s]\n", cp); for (;;) { seglen = (*cp) + 1; error = mb_put_mem(mbp, cp, seglen, MB_MSYSTEM); if (error) return error; if (seglen == 1) break; cp += seglen; } return 0; } static int nb_connect_in(struct nbpcb *nbp, struct sockaddr_in *to, struct thread *td) { struct socket *so; int error, s; error = socreate(AF_INET, &so, SOCK_STREAM, IPPROTO_TCP, td->td_ucred, td); if (error) return error; nbp->nbp_tso = so; SOCKBUF_LOCK(&so->so_rcv); soupcall_set(so, SO_RCV, nb_upcall, nbp); SOCKBUF_UNLOCK(&so->so_rcv); so->so_rcv.sb_timeo = (5 * SBT_1S); so->so_snd.sb_timeo = (5 * SBT_1S); error = soreserve(so, nbp->nbp_sndbuf, nbp->nbp_rcvbuf); if (error) goto bad; nb_setsockopt_int(so, SOL_SOCKET, SO_KEEPALIVE, 1); nb_setsockopt_int(so, IPPROTO_TCP, TCP_NODELAY, 1); SOCKBUF_LOCK(&so->so_rcv); so->so_rcv.sb_flags &= ~SB_NOINTR; SOCKBUF_UNLOCK(&so->so_rcv); SOCKBUF_LOCK(&so->so_snd); so->so_snd.sb_flags &= ~SB_NOINTR; SOCKBUF_UNLOCK(&so->so_snd); error = soconnect(so, (struct sockaddr*)to, td); if (error) goto bad; s = splnet(); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { tsleep(&so->so_timeo, PSOCK, "nbcon", 2 * hz); if ((so->so_state & SS_ISCONNECTING) && so->so_error == 0 && (error = nb_intr(nbp, td->td_proc)) != 0) { so->so_state &= ~SS_ISCONNECTING; splx(s); goto bad; } } if (so->so_error) { error = so->so_error; so->so_error = 0; splx(s); goto bad; } splx(s); return 0; bad: smb_nbst_disconnect(nbp->nbp_vc, td); return error; } static int nbssn_rq_request(struct nbpcb *nbp, struct thread *td) { struct mbchain *mbp; struct mdchain *mdp; struct mbuf *m0; struct timeval tv; struct sockaddr_in sin; u_short port; u_int8_t rpcode; int error, rplen; mbp = malloc(sizeof(struct mbchain), M_NBDATA, M_WAITOK); mdp = malloc(sizeof(struct mbchain), M_NBDATA, M_WAITOK); error = mb_init(mbp); if (error) { free(mbp, M_NBDATA); free(mdp, M_NBDATA); return error; } mb_put_uint32le(mbp, 0); nb_put_name(mbp, nbp->nbp_paddr); nb_put_name(mbp, nbp->nbp_laddr); nb_sethdr(mbp->mb_top, NB_SSN_REQUEST, mb_fixhdr(mbp) - 4); error = nb_sosend(nbp->nbp_tso, mbp->mb_top, 0, td); if (!error) { nbp->nbp_state = NBST_RQSENT; } mb_detach(mbp); mb_done(mbp); free(mbp, M_NBDATA); if (error) { free(mdp, M_NBDATA); return error; } TIMESPEC_TO_TIMEVAL(&tv, &nbp->nbp_timo); error = selsocket(nbp->nbp_tso, POLLIN, &tv, td); if (error == EWOULDBLOCK) { /* Timeout */ NBDEBUG("initial request timeout\n"); free(mdp, M_NBDATA); return ETIMEDOUT; } if (error) { /* restart or interrupt */ free(mdp, M_NBDATA); return error; } error = nbssn_recv(nbp, &m0, &rplen, &rpcode, td); if (error) { NBDEBUG("recv() error %d\n", error); free(mdp, M_NBDATA); return error; } /* * Process NETBIOS reply */ if (m0) md_initm(mdp, m0); error = 0; do { if (rpcode == NB_SSN_POSRESP) { nbp->nbp_state = NBST_SESSION; nbp->nbp_flags |= NBF_CONNECTED; break; } if (rpcode != NB_SSN_RTGRESP) { error = ECONNABORTED; break; } if (rplen != 6) { error = ECONNABORTED; break; } md_get_mem(mdp, (caddr_t)&sin.sin_addr, 4, MB_MSYSTEM); md_get_uint16(mdp, &port); sin.sin_port = port; nbp->nbp_state = NBST_RETARGET; smb_nbst_disconnect(nbp->nbp_vc, td); error = nb_connect_in(nbp, &sin, td); if (!error) error = nbssn_rq_request(nbp, td); if (error) { smb_nbst_disconnect(nbp->nbp_vc, td); break; } } while(0); if (m0) md_done(mdp); free(mdp, M_NBDATA); return error; } static int nbssn_recvhdr(struct nbpcb *nbp, int *lenp, u_int8_t *rpcodep, int flags, struct thread *td) { struct socket *so = nbp->nbp_tso; struct uio auio; struct iovec aio; u_int32_t len; int error; aio.iov_base = (caddr_t)&len; aio.iov_len = sizeof(len); auio.uio_iov = &aio; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_offset = 0; auio.uio_resid = sizeof(len); auio.uio_td = td; CURVNET_SET(so->so_vnet); error = soreceive(so, (struct sockaddr **)NULL, &auio, (struct mbuf **)NULL, (struct mbuf **)NULL, &flags); CURVNET_RESTORE(); if (error) return error; if (auio.uio_resid > 0) { SMBSDEBUG("short reply\n"); return EPIPE; } len = ntohl(len); *rpcodep = (len >> 24) & 0xFF; len &= 0x1ffff; if (len > SMB_MAXPKTLEN) { SMBERROR("packet too long (%d)\n", len); return EFBIG; } *lenp = len; return 0; } static int nbssn_recv(struct nbpcb *nbp, struct mbuf **mpp, int *lenp, u_int8_t *rpcodep, struct thread *td) { struct socket *so = nbp->nbp_tso; struct uio auio; struct mbuf *m, *tm, *im; u_int8_t rpcode; int len, resid; int error, rcvflg; if (so == NULL) return ENOTCONN; if (mpp) *mpp = NULL; m = NULL; for(;;) { /* * Poll for a response header. * If we don't have one waiting, return. */ len = 0; rpcode = 0; error = nbssn_recvhdr(nbp, &len, &rpcode, MSG_DONTWAIT, td); if ((so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) || (so->so_rcv.sb_state & SBS_CANTRCVMORE)) { nbp->nbp_state = NBST_CLOSED; NBDEBUG("session closed by peer\n"); return ECONNRESET; } if (error) return error; if (len == 0 && nbp->nbp_state != NBST_SESSION) break; /* no data, try again */ if (rpcode == NB_SSN_KEEPALIVE) continue; /* * Loop, blocking, for data following the response header. * * Note that we can't simply block here with MSG_WAITALL for the * entire response size, as it may be larger than the TCP * slow-start window that the sender employs. This will result * in the sender stalling until the delayed ACK is sent, then * resuming slow-start, resulting in very poor performance. * * Instead, we never request more than NB_SORECEIVE_CHUNK * bytes at a time, resulting in an ack being pushed by * the TCP code at the completion of each call. */ resid = len; while (resid > 0) { tm = NULL; rcvflg = MSG_WAITALL; bzero(&auio, sizeof(auio)); auio.uio_resid = min(resid, NB_SORECEIVE_CHUNK); auio.uio_td = td; resid -= auio.uio_resid; /* * Spin until we have collected everything in * this chunk. */ do { rcvflg = MSG_WAITALL; CURVNET_SET(so->so_vnet); error = soreceive(so, (struct sockaddr **)NULL, &auio, &tm, (struct mbuf **)NULL, &rcvflg); CURVNET_RESTORE(); } while (error == EWOULDBLOCK || error == EINTR || error == ERESTART); if (error) goto out; /* short return guarantees unhappiness */ if (auio.uio_resid > 0) { SMBERROR("packet is shorter than expected\n"); error = EPIPE; goto out; } /* append received chunk to previous chunk(s) */ if (m == NULL) { m = tm; } else { /* * Just glue the new chain on the end. * Consumer will pullup as required. */ for (im = m; im->m_next != NULL; im = im->m_next) ; im->m_next = tm; } } /* got a session/message packet? */ if (nbp->nbp_state == NBST_SESSION && rpcode == NB_SSN_MESSAGE) break; /* drop packet and try for another */ NBDEBUG("non-session packet %x\n", rpcode); if (m) { m_freem(m); m = NULL; } } out: if (error) { if (m) m_freem(m); return error; } if (mpp) *mpp = m; else m_freem(m); *lenp = len; *rpcodep = rpcode; return 0; } /* * SMB transport interface */ static int smb_nbst_create(struct smb_vc *vcp, struct thread *td) { struct nbpcb *nbp; nbp = malloc(sizeof *nbp, M_NBDATA, M_WAITOK); bzero(nbp, sizeof *nbp); nbp->nbp_timo.tv_sec = 15; /* XXX: sysctl ? */ nbp->nbp_state = NBST_CLOSED; nbp->nbp_vc = vcp; nbp->nbp_sndbuf = smb_tcpsndbuf; nbp->nbp_rcvbuf = smb_tcprcvbuf; vcp->vc_tdata = nbp; return 0; } static int smb_nbst_done(struct smb_vc *vcp, struct thread *td) { struct nbpcb *nbp = vcp->vc_tdata; if (nbp == NULL) return ENOTCONN; smb_nbst_disconnect(vcp, td); if (nbp->nbp_laddr) free(nbp->nbp_laddr, M_SONAME); if (nbp->nbp_paddr) free(nbp->nbp_paddr, M_SONAME); free(nbp, M_NBDATA); return 0; } static int smb_nbst_bind(struct smb_vc *vcp, struct sockaddr *sap, struct thread *td) { struct nbpcb *nbp = vcp->vc_tdata; struct sockaddr_nb *snb; int error, slen; NBDEBUG("\n"); error = EINVAL; do { if (nbp->nbp_flags & NBF_LOCADDR) break; /* * It is possible to create NETBIOS name in the kernel, * but nothing prevents us to do it in the user space. */ if (sap == NULL) break; slen = sap->sa_len; if (slen < NB_MINSALEN) break; snb = (struct sockaddr_nb*)sodupsockaddr(sap, M_WAITOK); if (snb == NULL) { error = ENOMEM; break; } nbp->nbp_laddr = snb; nbp->nbp_flags |= NBF_LOCADDR; error = 0; } while(0); return error; } static int smb_nbst_connect(struct smb_vc *vcp, struct sockaddr *sap, struct thread *td) { struct nbpcb *nbp = vcp->vc_tdata; struct sockaddr_in sin; struct sockaddr_nb *snb; struct timespec ts1, ts2; int error, slen; NBDEBUG("\n"); if (nbp->nbp_tso != NULL) return EISCONN; if (nbp->nbp_laddr == NULL) return EINVAL; slen = sap->sa_len; if (slen < NB_MINSALEN) return EINVAL; if (nbp->nbp_paddr) { free(nbp->nbp_paddr, M_SONAME); nbp->nbp_paddr = NULL; } snb = (struct sockaddr_nb*)sodupsockaddr(sap, M_WAITOK); if (snb == NULL) return ENOMEM; nbp->nbp_paddr = snb; sin = snb->snb_addrin; getnanotime(&ts1); error = nb_connect_in(nbp, &sin, td); if (error) return error; getnanotime(&ts2); - timespecsub(&ts2, &ts1); + timespecsub(&ts2, &ts1, &ts2); if (ts2.tv_sec == 0) { ts2.tv_sec = 1; ts2.tv_nsec = 0; } - nbp->nbp_timo = ts2; - timespecadd(&nbp->nbp_timo, &ts2); - timespecadd(&nbp->nbp_timo, &ts2); - timespecadd(&nbp->nbp_timo, &ts2); /* * 4 */ + timespecadd(&ts2, &ts2, &nbp->nbp_timo); + timespecadd(&nbp->nbp_timo, &ts2, &nbp->nbp_timo); + timespecadd(&nbp->nbp_timo, &ts2, &nbp->nbp_timo); /* * 4 */ error = nbssn_rq_request(nbp, td); if (error) smb_nbst_disconnect(vcp, td); return error; } static int smb_nbst_disconnect(struct smb_vc *vcp, struct thread *td) { struct nbpcb *nbp = vcp->vc_tdata; struct socket *so; if (nbp == NULL || nbp->nbp_tso == NULL) return ENOTCONN; if ((so = nbp->nbp_tso) != NULL) { nbp->nbp_flags &= ~NBF_CONNECTED; nbp->nbp_tso = (struct socket *)NULL; soshutdown(so, 2); soclose(so); } if (nbp->nbp_state != NBST_RETARGET) { nbp->nbp_state = NBST_CLOSED; } return 0; } static int smb_nbst_send(struct smb_vc *vcp, struct mbuf *m0, struct thread *td) { struct nbpcb *nbp = vcp->vc_tdata; int error; if (nbp->nbp_state != NBST_SESSION) { error = ENOTCONN; goto abort; } M_PREPEND(m0, 4, M_WAITOK); nb_sethdr(m0, NB_SSN_MESSAGE, m_fixhdr(m0) - 4); error = nb_sosend(nbp->nbp_tso, m0, 0, td); return error; abort: if (m0) m_freem(m0); return error; } static int smb_nbst_recv(struct smb_vc *vcp, struct mbuf **mpp, struct thread *td) { struct nbpcb *nbp = vcp->vc_tdata; u_int8_t rpcode; int error, rplen; nbp->nbp_flags |= NBF_RECVLOCK; error = nbssn_recv(nbp, mpp, &rplen, &rpcode, td); nbp->nbp_flags &= ~NBF_RECVLOCK; return error; } static void smb_nbst_timo(struct smb_vc *vcp) { return; } static void smb_nbst_intr(struct smb_vc *vcp) { struct nbpcb *nbp = vcp->vc_tdata; if (nbp == NULL || nbp->nbp_tso == NULL) return; sorwakeup(nbp->nbp_tso); sowwakeup(nbp->nbp_tso); } static int smb_nbst_getparam(struct smb_vc *vcp, int param, void *data) { struct nbpcb *nbp = vcp->vc_tdata; switch (param) { case SMBTP_SNDSZ: *(int*)data = nbp->nbp_sndbuf; break; case SMBTP_RCVSZ: *(int*)data = nbp->nbp_rcvbuf; break; case SMBTP_TIMEOUT: *(struct timespec*)data = nbp->nbp_timo; break; default: return EINVAL; } return 0; } static int smb_nbst_setparam(struct smb_vc *vcp, int param, void *data) { struct nbpcb *nbp = vcp->vc_tdata; switch (param) { case SMBTP_SELECTID: nbp->nbp_selectid = data; break; default: return EINVAL; } return 0; } /* * Check for fatal errors */ static int smb_nbst_fatal(struct smb_vc *vcp, int error) { switch (error) { case ENOTCONN: case ENETRESET: case ECONNABORTED: return 1; } return 0; } struct smb_tran_desc smb_tran_nbtcp_desc = { SMBT_NBTCP, smb_nbst_create, smb_nbst_done, smb_nbst_bind, smb_nbst_connect, smb_nbst_disconnect, smb_nbst_send, smb_nbst_recv, smb_nbst_timo, smb_nbst_intr, smb_nbst_getparam, smb_nbst_setparam, smb_nbst_fatal }; Index: head/sys/opencrypto/crypto.c =================================================================== --- head/sys/opencrypto/crypto.c (revision 336913) +++ head/sys/opencrypto/crypto.c (revision 336914) @@ -1,1853 +1,1853 @@ /*- * Copyright (c) 2002-2006 Sam Leffler. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Cryptographic Subsystem. * * This code is derived from the Openbsd Cryptographic Framework (OCF) * that has the copyright shown below. Very little of the original * code remains. */ /*- * The author of this code is Angelos D. Keromytis (angelos@cis.upenn.edu) * * This code was written by Angelos D. Keromytis in Athens, Greece, in * February 2000. Network Security Technologies Inc. (NSTI) kindly * supported the development of this code. * * Copyright (c) 2000, 2001 Angelos D. Keromytis * * Permission to use, copy, and modify this software with or without fee * is hereby granted, provided that this entire notice is included in * all source code copies of any software which is or includes a copy or * modification of this software. * * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR * IMPLIED WARRANTY. IN PARTICULAR, NONE OF THE AUTHORS MAKES ANY * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE * MERCHANTABILITY OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR * PURPOSE. */ #define CRYPTO_TIMING /* enable timing support */ #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX for M_XDATA */ #include #include #include "cryptodev_if.h" #if defined(__i386__) || defined(__amd64__) || defined(__aarch64__) #include #endif struct crypto_session { device_t parent; void *softc; uint32_t hid; uint32_t capabilities; }; SDT_PROVIDER_DEFINE(opencrypto); /* * Crypto drivers register themselves by allocating a slot in the * crypto_drivers table with crypto_get_driverid() and then registering * each algorithm they support with crypto_register() and crypto_kregister(). */ static struct mtx crypto_drivers_mtx; /* lock on driver table */ #define CRYPTO_DRIVER_LOCK() mtx_lock(&crypto_drivers_mtx) #define CRYPTO_DRIVER_UNLOCK() mtx_unlock(&crypto_drivers_mtx) #define CRYPTO_DRIVER_ASSERT() mtx_assert(&crypto_drivers_mtx, MA_OWNED) /* * Crypto device/driver capabilities structure. * * Synchronization: * (d) - protected by CRYPTO_DRIVER_LOCK() * (q) - protected by CRYPTO_Q_LOCK() * Not tagged fields are read-only. */ struct cryptocap { device_t cc_dev; /* (d) device/driver */ u_int32_t cc_sessions; /* (d) # of sessions */ u_int32_t cc_koperations; /* (d) # os asym operations */ /* * Largest possible operator length (in bits) for each type of * encryption algorithm. XXX not used */ u_int16_t cc_max_op_len[CRYPTO_ALGORITHM_MAX + 1]; u_int8_t cc_alg[CRYPTO_ALGORITHM_MAX + 1]; u_int8_t cc_kalg[CRK_ALGORITHM_MAX + 1]; int cc_flags; /* (d) flags */ #define CRYPTOCAP_F_CLEANUP 0x80000000 /* needs resource cleanup */ int cc_qblocked; /* (q) symmetric q blocked */ int cc_kqblocked; /* (q) asymmetric q blocked */ size_t cc_session_size; }; static struct cryptocap *crypto_drivers = NULL; static int crypto_drivers_num = 0; /* * There are two queues for crypto requests; one for symmetric (e.g. * cipher) operations and one for asymmetric (e.g. MOD)operations. * A single mutex is used to lock access to both queues. We could * have one per-queue but having one simplifies handling of block/unblock * operations. */ static int crp_sleep = 0; static TAILQ_HEAD(cryptop_q ,cryptop) crp_q; /* request queues */ static TAILQ_HEAD(,cryptkop) crp_kq; static struct mtx crypto_q_mtx; #define CRYPTO_Q_LOCK() mtx_lock(&crypto_q_mtx) #define CRYPTO_Q_UNLOCK() mtx_unlock(&crypto_q_mtx) /* * Taskqueue used to dispatch the crypto requests * that have the CRYPTO_F_ASYNC flag */ static struct taskqueue *crypto_tq; /* * Crypto seq numbers are operated on with modular arithmetic */ #define CRYPTO_SEQ_GT(a,b) ((int)((a)-(b)) > 0) struct crypto_ret_worker { struct mtx crypto_ret_mtx; TAILQ_HEAD(,cryptop) crp_ordered_ret_q; /* ordered callback queue for symetric jobs */ TAILQ_HEAD(,cryptop) crp_ret_q; /* callback queue for symetric jobs */ TAILQ_HEAD(,cryptkop) crp_ret_kq; /* callback queue for asym jobs */ u_int32_t reorder_ops; /* total ordered sym jobs received */ u_int32_t reorder_cur_seq; /* current sym job dispatched */ struct proc *cryptoretproc; }; static struct crypto_ret_worker *crypto_ret_workers = NULL; #define CRYPTO_RETW(i) (&crypto_ret_workers[i]) #define CRYPTO_RETW_ID(w) ((w) - crypto_ret_workers) #define FOREACH_CRYPTO_RETW(w) \ for (w = crypto_ret_workers; w < crypto_ret_workers + crypto_workers_num; ++w) #define CRYPTO_RETW_LOCK(w) mtx_lock(&w->crypto_ret_mtx) #define CRYPTO_RETW_UNLOCK(w) mtx_unlock(&w->crypto_ret_mtx) #define CRYPTO_RETW_EMPTY(w) \ (TAILQ_EMPTY(&w->crp_ret_q) && TAILQ_EMPTY(&w->crp_ret_kq) && TAILQ_EMPTY(&w->crp_ordered_ret_q)) static int crypto_workers_num = 0; SYSCTL_INT(_kern, OID_AUTO, crypto_workers_num, CTLFLAG_RDTUN, &crypto_workers_num, 0, "Number of crypto workers used to dispatch crypto jobs"); static uma_zone_t cryptop_zone; static uma_zone_t cryptodesc_zone; static uma_zone_t cryptoses_zone; int crypto_userasymcrypto = 1; /* userland may do asym crypto reqs */ SYSCTL_INT(_kern, OID_AUTO, userasymcrypto, CTLFLAG_RW, &crypto_userasymcrypto, 0, "Enable/disable user-mode access to asymmetric crypto support"); int crypto_devallowsoft = 0; /* only use hardware crypto */ SYSCTL_INT(_kern, OID_AUTO, cryptodevallowsoft, CTLFLAG_RW, &crypto_devallowsoft, 0, "Enable/disable use of software crypto by /dev/crypto"); MALLOC_DEFINE(M_CRYPTO_DATA, "crypto", "crypto session records"); static void crypto_proc(void); static struct proc *cryptoproc; static void crypto_ret_proc(struct crypto_ret_worker *ret_worker); static void crypto_destroy(void); static int crypto_invoke(struct cryptocap *cap, struct cryptop *crp, int hint); static int crypto_kinvoke(struct cryptkop *krp, int flags); static void crypto_remove(struct cryptocap *cap); static void crypto_task_invoke(void *ctx, int pending); static void crypto_batch_enqueue(struct cryptop *crp); static struct cryptostats cryptostats; SYSCTL_STRUCT(_kern, OID_AUTO, crypto_stats, CTLFLAG_RW, &cryptostats, cryptostats, "Crypto system statistics"); #ifdef CRYPTO_TIMING static int crypto_timing = 0; SYSCTL_INT(_debug, OID_AUTO, crypto_timing, CTLFLAG_RW, &crypto_timing, 0, "Enable/disable crypto timing support"); #endif /* Try to avoid directly exposing the key buffer as a symbol */ static struct keybuf *keybuf; static struct keybuf empty_keybuf = { .kb_nents = 0 }; /* Obtain the key buffer from boot metadata */ static void keybuf_init(void) { caddr_t kmdp; kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); keybuf = (struct keybuf *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_KEYBUF); if (keybuf == NULL) keybuf = &empty_keybuf; } /* It'd be nice if we could store these in some kind of secure memory... */ struct keybuf * get_keybuf(void) { return (keybuf); } static int crypto_init(void) { struct crypto_ret_worker *ret_worker; int error; mtx_init(&crypto_drivers_mtx, "crypto", "crypto driver table", MTX_DEF|MTX_QUIET); TAILQ_INIT(&crp_q); TAILQ_INIT(&crp_kq); mtx_init(&crypto_q_mtx, "crypto", "crypto op queues", MTX_DEF); cryptop_zone = uma_zcreate("cryptop", sizeof (struct cryptop), 0, 0, 0, 0, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); cryptodesc_zone = uma_zcreate("cryptodesc", sizeof (struct cryptodesc), 0, 0, 0, 0, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); cryptoses_zone = uma_zcreate("crypto_session", sizeof(struct crypto_session), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT); if (cryptodesc_zone == NULL || cryptop_zone == NULL || cryptoses_zone == NULL) { printf("crypto_init: cannot setup crypto zones\n"); error = ENOMEM; goto bad; } crypto_drivers_num = CRYPTO_DRIVERS_INITIAL; crypto_drivers = malloc(crypto_drivers_num * sizeof(struct cryptocap), M_CRYPTO_DATA, M_NOWAIT | M_ZERO); if (crypto_drivers == NULL) { printf("crypto_init: cannot setup crypto drivers\n"); error = ENOMEM; goto bad; } if (crypto_workers_num < 1 || crypto_workers_num > mp_ncpus) crypto_workers_num = mp_ncpus; crypto_tq = taskqueue_create("crypto", M_WAITOK|M_ZERO, taskqueue_thread_enqueue, &crypto_tq); if (crypto_tq == NULL) { printf("crypto init: cannot setup crypto taskqueue\n"); error = ENOMEM; goto bad; } taskqueue_start_threads(&crypto_tq, crypto_workers_num, PRI_MIN_KERN, "crypto"); error = kproc_create((void (*)(void *)) crypto_proc, NULL, &cryptoproc, 0, 0, "crypto"); if (error) { printf("crypto_init: cannot start crypto thread; error %d", error); goto bad; } crypto_ret_workers = malloc(crypto_workers_num * sizeof(struct crypto_ret_worker), M_CRYPTO_DATA, M_NOWAIT|M_ZERO); if (crypto_ret_workers == NULL) { error = ENOMEM; printf("crypto_init: cannot allocate ret workers\n"); goto bad; } FOREACH_CRYPTO_RETW(ret_worker) { TAILQ_INIT(&ret_worker->crp_ordered_ret_q); TAILQ_INIT(&ret_worker->crp_ret_q); TAILQ_INIT(&ret_worker->crp_ret_kq); ret_worker->reorder_ops = 0; ret_worker->reorder_cur_seq = 0; mtx_init(&ret_worker->crypto_ret_mtx, "crypto", "crypto return queues", MTX_DEF); error = kproc_create((void (*)(void *)) crypto_ret_proc, ret_worker, &ret_worker->cryptoretproc, 0, 0, "crypto returns %td", CRYPTO_RETW_ID(ret_worker)); if (error) { printf("crypto_init: cannot start cryptoret thread; error %d", error); goto bad; } } keybuf_init(); return 0; bad: crypto_destroy(); return error; } /* * Signal a crypto thread to terminate. We use the driver * table lock to synchronize the sleep/wakeups so that we * are sure the threads have terminated before we release * the data structures they use. See crypto_finis below * for the other half of this song-and-dance. */ static void crypto_terminate(struct proc **pp, void *q) { struct proc *p; mtx_assert(&crypto_drivers_mtx, MA_OWNED); p = *pp; *pp = NULL; if (p) { wakeup_one(q); PROC_LOCK(p); /* NB: insure we don't miss wakeup */ CRYPTO_DRIVER_UNLOCK(); /* let crypto_finis progress */ msleep(p, &p->p_mtx, PWAIT, "crypto_destroy", 0); PROC_UNLOCK(p); CRYPTO_DRIVER_LOCK(); } } static void crypto_destroy(void) { struct crypto_ret_worker *ret_worker; /* * Terminate any crypto threads. */ if (crypto_tq != NULL) taskqueue_drain_all(crypto_tq); CRYPTO_DRIVER_LOCK(); crypto_terminate(&cryptoproc, &crp_q); FOREACH_CRYPTO_RETW(ret_worker) crypto_terminate(&ret_worker->cryptoretproc, &ret_worker->crp_ret_q); CRYPTO_DRIVER_UNLOCK(); /* XXX flush queues??? */ /* * Reclaim dynamically allocated resources. */ if (crypto_drivers != NULL) free(crypto_drivers, M_CRYPTO_DATA); if (cryptoses_zone != NULL) uma_zdestroy(cryptoses_zone); if (cryptodesc_zone != NULL) uma_zdestroy(cryptodesc_zone); if (cryptop_zone != NULL) uma_zdestroy(cryptop_zone); mtx_destroy(&crypto_q_mtx); FOREACH_CRYPTO_RETW(ret_worker) mtx_destroy(&ret_worker->crypto_ret_mtx); free(crypto_ret_workers, M_CRYPTO_DATA); if (crypto_tq != NULL) taskqueue_free(crypto_tq); mtx_destroy(&crypto_drivers_mtx); } uint32_t crypto_ses2hid(crypto_session_t crypto_session) { return (crypto_session->hid); } uint32_t crypto_ses2caps(crypto_session_t crypto_session) { return (crypto_session->capabilities); } void * crypto_get_driver_session(crypto_session_t crypto_session) { return (crypto_session->softc); } static struct cryptocap * crypto_checkdriver(u_int32_t hid) { if (crypto_drivers == NULL) return NULL; return (hid >= crypto_drivers_num ? NULL : &crypto_drivers[hid]); } /* * Compare a driver's list of supported algorithms against another * list; return non-zero if all algorithms are supported. */ static int driver_suitable(const struct cryptocap *cap, const struct cryptoini *cri) { const struct cryptoini *cr; /* See if all the algorithms are supported. */ for (cr = cri; cr; cr = cr->cri_next) if (cap->cc_alg[cr->cri_alg] == 0) return 0; return 1; } /* * Select a driver for a new session that supports the specified * algorithms and, optionally, is constrained according to the flags. * The algorithm we use here is pretty stupid; just use the * first driver that supports all the algorithms we need. If there * are multiple drivers we choose the driver with the fewest active * sessions. We prefer hardware-backed drivers to software ones. * * XXX We need more smarts here (in real life too, but that's * XXX another story altogether). */ static struct cryptocap * crypto_select_driver(const struct cryptoini *cri, int flags) { struct cryptocap *cap, *best; int match, hid; CRYPTO_DRIVER_ASSERT(); /* * Look first for hardware crypto devices if permitted. */ if (flags & CRYPTOCAP_F_HARDWARE) match = CRYPTOCAP_F_HARDWARE; else match = CRYPTOCAP_F_SOFTWARE; best = NULL; again: for (hid = 0; hid < crypto_drivers_num; hid++) { cap = &crypto_drivers[hid]; /* * If it's not initialized, is in the process of * going away, or is not appropriate (hardware * or software based on match), then skip. */ if (cap->cc_dev == NULL || (cap->cc_flags & CRYPTOCAP_F_CLEANUP) || (cap->cc_flags & match) == 0) continue; /* verify all the algorithms are supported. */ if (driver_suitable(cap, cri)) { if (best == NULL || cap->cc_sessions < best->cc_sessions) best = cap; } } if (best == NULL && match == CRYPTOCAP_F_HARDWARE && (flags & CRYPTOCAP_F_SOFTWARE)) { /* sort of an Algol 68-style for loop */ match = CRYPTOCAP_F_SOFTWARE; goto again; } return best; } /* * Create a new session. The crid argument specifies a crypto * driver to use or constraints on a driver to select (hardware * only, software only, either). Whatever driver is selected * must be capable of the requested crypto algorithms. */ int crypto_newsession(crypto_session_t *cses, struct cryptoini *cri, int crid) { crypto_session_t res; void *softc_mem; struct cryptocap *cap; u_int32_t hid; size_t softc_size; int err; restart: res = NULL; softc_mem = NULL; CRYPTO_DRIVER_LOCK(); if ((crid & (CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE)) == 0) { /* * Use specified driver; verify it is capable. */ cap = crypto_checkdriver(crid); if (cap != NULL && !driver_suitable(cap, cri)) cap = NULL; } else { /* * No requested driver; select based on crid flags. */ cap = crypto_select_driver(cri, crid); /* * if NULL then can't do everything in one session. * XXX Fix this. We need to inject a "virtual" session * XXX layer right about here. */ } if (cap == NULL) { CRYPTDEB("no driver"); err = EOPNOTSUPP; goto out; } cap->cc_sessions++; softc_size = cap->cc_session_size; hid = cap - crypto_drivers; cap = NULL; CRYPTO_DRIVER_UNLOCK(); softc_mem = malloc(softc_size, M_CRYPTO_DATA, M_WAITOK | M_ZERO); res = uma_zalloc(cryptoses_zone, M_WAITOK | M_ZERO); res->softc = softc_mem; CRYPTO_DRIVER_LOCK(); cap = crypto_checkdriver(hid); if (cap != NULL && (cap->cc_flags & CRYPTOCAP_F_CLEANUP) != 0) { cap->cc_sessions--; crypto_remove(cap); cap = NULL; } if (cap == NULL) { free(softc_mem, M_CRYPTO_DATA); uma_zfree(cryptoses_zone, res); CRYPTO_DRIVER_UNLOCK(); goto restart; } /* Call the driver initialization routine. */ err = CRYPTODEV_NEWSESSION(cap->cc_dev, res, cri); if (err != 0) { CRYPTDEB("dev newsession failed: %d", err); goto out; } res->capabilities = cap->cc_flags & 0xff000000; res->hid = hid; *cses = res; out: CRYPTO_DRIVER_UNLOCK(); if (err != 0) { free(softc_mem, M_CRYPTO_DATA); if (res != NULL) uma_zfree(cryptoses_zone, res); } return err; } static void crypto_remove(struct cryptocap *cap) { mtx_assert(&crypto_drivers_mtx, MA_OWNED); if (cap->cc_sessions == 0 && cap->cc_koperations == 0) bzero(cap, sizeof(*cap)); } /* * Delete an existing session (or a reserved session on an unregistered * driver). */ void crypto_freesession(crypto_session_t cses) { struct cryptocap *cap; void *ses; size_t ses_size; u_int32_t hid; if (cses == NULL) return; CRYPTO_DRIVER_LOCK(); hid = crypto_ses2hid(cses); KASSERT(hid < crypto_drivers_num, ("bogus crypto_session %p hid %u", cses, hid)); cap = &crypto_drivers[hid]; ses = cses->softc; ses_size = cap->cc_session_size; if (cap->cc_sessions) cap->cc_sessions--; /* Call the driver cleanup routine, if available. */ CRYPTODEV_FREESESSION(cap->cc_dev, cses); explicit_bzero(ses, ses_size); free(ses, M_CRYPTO_DATA); uma_zfree(cryptoses_zone, cses); if (cap->cc_flags & CRYPTOCAP_F_CLEANUP) crypto_remove(cap); CRYPTO_DRIVER_UNLOCK(); } /* * Return an unused driver id. Used by drivers prior to registering * support for the algorithms they handle. */ int32_t crypto_get_driverid(device_t dev, size_t sessionsize, int flags) { struct cryptocap *newdrv; int i; if ((flags & (CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE)) == 0) { printf("%s: no flags specified when registering driver\n", device_get_nameunit(dev)); return -1; } CRYPTO_DRIVER_LOCK(); for (i = 0; i < crypto_drivers_num; i++) { if (crypto_drivers[i].cc_dev == NULL && (crypto_drivers[i].cc_flags & CRYPTOCAP_F_CLEANUP) == 0) { break; } } /* Out of entries, allocate some more. */ if (i == crypto_drivers_num) { /* Be careful about wrap-around. */ if (2 * crypto_drivers_num <= crypto_drivers_num) { CRYPTO_DRIVER_UNLOCK(); printf("crypto: driver count wraparound!\n"); return -1; } newdrv = malloc(2 * crypto_drivers_num * sizeof(struct cryptocap), M_CRYPTO_DATA, M_NOWAIT|M_ZERO); if (newdrv == NULL) { CRYPTO_DRIVER_UNLOCK(); printf("crypto: no space to expand driver table!\n"); return -1; } bcopy(crypto_drivers, newdrv, crypto_drivers_num * sizeof(struct cryptocap)); crypto_drivers_num *= 2; free(crypto_drivers, M_CRYPTO_DATA); crypto_drivers = newdrv; } /* NB: state is zero'd on free */ crypto_drivers[i].cc_sessions = 1; /* Mark */ crypto_drivers[i].cc_dev = dev; crypto_drivers[i].cc_flags = flags; crypto_drivers[i].cc_session_size = sessionsize; if (bootverbose) printf("crypto: assign %s driver id %u, flags 0x%x\n", device_get_nameunit(dev), i, flags); CRYPTO_DRIVER_UNLOCK(); return i; } /* * Lookup a driver by name. We match against the full device * name and unit, and against just the name. The latter gives * us a simple widlcarding by device name. On success return the * driver/hardware identifier; otherwise return -1. */ int crypto_find_driver(const char *match) { int i, len = strlen(match); CRYPTO_DRIVER_LOCK(); for (i = 0; i < crypto_drivers_num; i++) { device_t dev = crypto_drivers[i].cc_dev; if (dev == NULL || (crypto_drivers[i].cc_flags & CRYPTOCAP_F_CLEANUP)) continue; if (strncmp(match, device_get_nameunit(dev), len) == 0 || strncmp(match, device_get_name(dev), len) == 0) break; } CRYPTO_DRIVER_UNLOCK(); return i < crypto_drivers_num ? i : -1; } /* * Return the device_t for the specified driver or NULL * if the driver identifier is invalid. */ device_t crypto_find_device_byhid(int hid) { struct cryptocap *cap = crypto_checkdriver(hid); return cap != NULL ? cap->cc_dev : NULL; } /* * Return the device/driver capabilities. */ int crypto_getcaps(int hid) { struct cryptocap *cap = crypto_checkdriver(hid); return cap != NULL ? cap->cc_flags : 0; } /* * Register support for a key-related algorithm. This routine * is called once for each algorithm supported a driver. */ int crypto_kregister(u_int32_t driverid, int kalg, u_int32_t flags) { struct cryptocap *cap; int err; CRYPTO_DRIVER_LOCK(); cap = crypto_checkdriver(driverid); if (cap != NULL && (CRK_ALGORITM_MIN <= kalg && kalg <= CRK_ALGORITHM_MAX)) { /* * XXX Do some performance testing to determine placing. * XXX We probably need an auxiliary data structure that * XXX describes relative performances. */ cap->cc_kalg[kalg] = flags | CRYPTO_ALG_FLAG_SUPPORTED; if (bootverbose) printf("crypto: %s registers key alg %u flags %u\n" , device_get_nameunit(cap->cc_dev) , kalg , flags ); err = 0; } else err = EINVAL; CRYPTO_DRIVER_UNLOCK(); return err; } /* * Register support for a non-key-related algorithm. This routine * is called once for each such algorithm supported by a driver. */ int crypto_register(u_int32_t driverid, int alg, u_int16_t maxoplen, u_int32_t flags) { struct cryptocap *cap; int err; CRYPTO_DRIVER_LOCK(); cap = crypto_checkdriver(driverid); /* NB: algorithms are in the range [1..max] */ if (cap != NULL && (CRYPTO_ALGORITHM_MIN <= alg && alg <= CRYPTO_ALGORITHM_MAX)) { /* * XXX Do some performance testing to determine placing. * XXX We probably need an auxiliary data structure that * XXX describes relative performances. */ cap->cc_alg[alg] = flags | CRYPTO_ALG_FLAG_SUPPORTED; cap->cc_max_op_len[alg] = maxoplen; if (bootverbose) printf("crypto: %s registers alg %u flags %u maxoplen %u\n" , device_get_nameunit(cap->cc_dev) , alg , flags , maxoplen ); cap->cc_sessions = 0; /* Unmark */ err = 0; } else err = EINVAL; CRYPTO_DRIVER_UNLOCK(); return err; } static void driver_finis(struct cryptocap *cap) { u_int32_t ses, kops; CRYPTO_DRIVER_ASSERT(); ses = cap->cc_sessions; kops = cap->cc_koperations; bzero(cap, sizeof(*cap)); if (ses != 0 || kops != 0) { /* * If there are pending sessions, * just mark as invalid. */ cap->cc_flags |= CRYPTOCAP_F_CLEANUP; cap->cc_sessions = ses; cap->cc_koperations = kops; } } /* * Unregister a crypto driver. If there are pending sessions using it, * leave enough information around so that subsequent calls using those * sessions will correctly detect the driver has been unregistered and * reroute requests. */ int crypto_unregister(u_int32_t driverid, int alg) { struct cryptocap *cap; int i, err; CRYPTO_DRIVER_LOCK(); cap = crypto_checkdriver(driverid); if (cap != NULL && (CRYPTO_ALGORITHM_MIN <= alg && alg <= CRYPTO_ALGORITHM_MAX) && cap->cc_alg[alg] != 0) { cap->cc_alg[alg] = 0; cap->cc_max_op_len[alg] = 0; /* Was this the last algorithm ? */ for (i = 1; i <= CRYPTO_ALGORITHM_MAX; i++) if (cap->cc_alg[i] != 0) break; if (i == CRYPTO_ALGORITHM_MAX + 1) driver_finis(cap); err = 0; } else err = EINVAL; CRYPTO_DRIVER_UNLOCK(); return err; } /* * Unregister all algorithms associated with a crypto driver. * If there are pending sessions using it, leave enough information * around so that subsequent calls using those sessions will * correctly detect the driver has been unregistered and reroute * requests. */ int crypto_unregister_all(u_int32_t driverid) { struct cryptocap *cap; int err; CRYPTO_DRIVER_LOCK(); cap = crypto_checkdriver(driverid); if (cap != NULL) { driver_finis(cap); err = 0; } else err = EINVAL; CRYPTO_DRIVER_UNLOCK(); return err; } /* * Clear blockage on a driver. The what parameter indicates whether * the driver is now ready for cryptop's and/or cryptokop's. */ int crypto_unblock(u_int32_t driverid, int what) { struct cryptocap *cap; int err; CRYPTO_Q_LOCK(); cap = crypto_checkdriver(driverid); if (cap != NULL) { if (what & CRYPTO_SYMQ) cap->cc_qblocked = 0; if (what & CRYPTO_ASYMQ) cap->cc_kqblocked = 0; if (crp_sleep) wakeup_one(&crp_q); err = 0; } else err = EINVAL; CRYPTO_Q_UNLOCK(); return err; } /* * Add a crypto request to a queue, to be processed by the kernel thread. */ int crypto_dispatch(struct cryptop *crp) { struct cryptocap *cap; u_int32_t hid; int result; cryptostats.cs_ops++; #ifdef CRYPTO_TIMING if (crypto_timing) binuptime(&crp->crp_tstamp); #endif crp->crp_retw_id = ((uintptr_t)crp->crp_session) % crypto_workers_num; if (CRYPTOP_ASYNC(crp)) { if (crp->crp_flags & CRYPTO_F_ASYNC_KEEPORDER) { struct crypto_ret_worker *ret_worker; ret_worker = CRYPTO_RETW(crp->crp_retw_id); CRYPTO_RETW_LOCK(ret_worker); crp->crp_seq = ret_worker->reorder_ops++; CRYPTO_RETW_UNLOCK(ret_worker); } TASK_INIT(&crp->crp_task, 0, crypto_task_invoke, crp); taskqueue_enqueue(crypto_tq, &crp->crp_task); return (0); } if ((crp->crp_flags & CRYPTO_F_BATCH) == 0) { hid = crypto_ses2hid(crp->crp_session); /* * Caller marked the request to be processed * immediately; dispatch it directly to the * driver unless the driver is currently blocked. */ cap = crypto_checkdriver(hid); /* Driver cannot disappeared when there is an active session. */ KASSERT(cap != NULL, ("%s: Driver disappeared.", __func__)); if (!cap->cc_qblocked) { result = crypto_invoke(cap, crp, 0); if (result != ERESTART) return (result); /* * The driver ran out of resources, put the request on * the queue. */ } } crypto_batch_enqueue(crp); return 0; } void crypto_batch_enqueue(struct cryptop *crp) { CRYPTO_Q_LOCK(); TAILQ_INSERT_TAIL(&crp_q, crp, crp_next); if (crp_sleep) wakeup_one(&crp_q); CRYPTO_Q_UNLOCK(); } /* * Add an asymetric crypto request to a queue, * to be processed by the kernel thread. */ int crypto_kdispatch(struct cryptkop *krp) { int error; cryptostats.cs_kops++; error = crypto_kinvoke(krp, krp->krp_crid); if (error == ERESTART) { CRYPTO_Q_LOCK(); TAILQ_INSERT_TAIL(&crp_kq, krp, krp_next); if (crp_sleep) wakeup_one(&crp_q); CRYPTO_Q_UNLOCK(); error = 0; } return error; } /* * Verify a driver is suitable for the specified operation. */ static __inline int kdriver_suitable(const struct cryptocap *cap, const struct cryptkop *krp) { return (cap->cc_kalg[krp->krp_op] & CRYPTO_ALG_FLAG_SUPPORTED) != 0; } /* * Select a driver for an asym operation. The driver must * support the necessary algorithm. The caller can constrain * which device is selected with the flags parameter. The * algorithm we use here is pretty stupid; just use the first * driver that supports the algorithms we need. If there are * multiple suitable drivers we choose the driver with the * fewest active operations. We prefer hardware-backed * drivers to software ones when either may be used. */ static struct cryptocap * crypto_select_kdriver(const struct cryptkop *krp, int flags) { struct cryptocap *cap, *best; int match, hid; CRYPTO_DRIVER_ASSERT(); /* * Look first for hardware crypto devices if permitted. */ if (flags & CRYPTOCAP_F_HARDWARE) match = CRYPTOCAP_F_HARDWARE; else match = CRYPTOCAP_F_SOFTWARE; best = NULL; again: for (hid = 0; hid < crypto_drivers_num; hid++) { cap = &crypto_drivers[hid]; /* * If it's not initialized, is in the process of * going away, or is not appropriate (hardware * or software based on match), then skip. */ if (cap->cc_dev == NULL || (cap->cc_flags & CRYPTOCAP_F_CLEANUP) || (cap->cc_flags & match) == 0) continue; /* verify all the algorithms are supported. */ if (kdriver_suitable(cap, krp)) { if (best == NULL || cap->cc_koperations < best->cc_koperations) best = cap; } } if (best != NULL) return best; if (match == CRYPTOCAP_F_HARDWARE && (flags & CRYPTOCAP_F_SOFTWARE)) { /* sort of an Algol 68-style for loop */ match = CRYPTOCAP_F_SOFTWARE; goto again; } return best; } /* * Dispatch an asymmetric crypto request. */ static int crypto_kinvoke(struct cryptkop *krp, int crid) { struct cryptocap *cap = NULL; int error; KASSERT(krp != NULL, ("%s: krp == NULL", __func__)); KASSERT(krp->krp_callback != NULL, ("%s: krp->crp_callback == NULL", __func__)); CRYPTO_DRIVER_LOCK(); if ((crid & (CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE)) == 0) { cap = crypto_checkdriver(crid); if (cap != NULL) { /* * Driver present, it must support the necessary * algorithm and, if s/w drivers are excluded, * it must be registered as hardware-backed. */ if (!kdriver_suitable(cap, krp) || (!crypto_devallowsoft && (cap->cc_flags & CRYPTOCAP_F_HARDWARE) == 0)) cap = NULL; } } else { /* * No requested driver; select based on crid flags. */ if (!crypto_devallowsoft) /* NB: disallow s/w drivers */ crid &= ~CRYPTOCAP_F_SOFTWARE; cap = crypto_select_kdriver(krp, crid); } if (cap != NULL && !cap->cc_kqblocked) { krp->krp_hid = cap - crypto_drivers; cap->cc_koperations++; CRYPTO_DRIVER_UNLOCK(); error = CRYPTODEV_KPROCESS(cap->cc_dev, krp, 0); CRYPTO_DRIVER_LOCK(); if (error == ERESTART) { cap->cc_koperations--; CRYPTO_DRIVER_UNLOCK(); return (error); } } else { /* * NB: cap is !NULL if device is blocked; in * that case return ERESTART so the operation * is resubmitted if possible. */ error = (cap == NULL) ? ENODEV : ERESTART; } CRYPTO_DRIVER_UNLOCK(); if (error) { krp->krp_status = error; crypto_kdone(krp); } return 0; } #ifdef CRYPTO_TIMING static void crypto_tstat(struct cryptotstat *ts, struct bintime *bt) { struct bintime now, delta; struct timespec t; uint64_t u; binuptime(&now); u = now.frac; delta.frac = now.frac - bt->frac; delta.sec = now.sec - bt->sec; if (u < delta.frac) delta.sec--; bintime2timespec(&delta, &t); - timespecadd(&ts->acc, &t); + timespecadd(&ts->acc, &t, &ts->acc); if (timespeccmp(&t, &ts->min, <)) ts->min = t; if (timespeccmp(&t, &ts->max, >)) ts->max = t; ts->count++; *bt = now; } #endif static void crypto_task_invoke(void *ctx, int pending) { struct cryptocap *cap; struct cryptop *crp; int hid, result; crp = (struct cryptop *)ctx; hid = crypto_ses2hid(crp->crp_session); cap = crypto_checkdriver(hid); result = crypto_invoke(cap, crp, 0); if (result == ERESTART) crypto_batch_enqueue(crp); } /* * Dispatch a crypto request to the appropriate crypto devices. */ static int crypto_invoke(struct cryptocap *cap, struct cryptop *crp, int hint) { KASSERT(crp != NULL, ("%s: crp == NULL", __func__)); KASSERT(crp->crp_callback != NULL, ("%s: crp->crp_callback == NULL", __func__)); KASSERT(crp->crp_desc != NULL, ("%s: crp->crp_desc == NULL", __func__)); #ifdef CRYPTO_TIMING if (crypto_timing) crypto_tstat(&cryptostats.cs_invoke, &crp->crp_tstamp); #endif if (cap->cc_flags & CRYPTOCAP_F_CLEANUP) { struct cryptodesc *crd; crypto_session_t nses; /* * Driver has unregistered; migrate the session and return * an error to the caller so they'll resubmit the op. * * XXX: What if there are more already queued requests for this * session? */ crypto_freesession(crp->crp_session); for (crd = crp->crp_desc; crd->crd_next; crd = crd->crd_next) crd->CRD_INI.cri_next = &(crd->crd_next->CRD_INI); /* XXX propagate flags from initial session? */ if (crypto_newsession(&nses, &(crp->crp_desc->CRD_INI), CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE) == 0) crp->crp_session = nses; crp->crp_etype = EAGAIN; crypto_done(crp); return 0; } else { /* * Invoke the driver to process the request. */ return CRYPTODEV_PROCESS(cap->cc_dev, crp, hint); } } /* * Release a set of crypto descriptors. */ void crypto_freereq(struct cryptop *crp) { struct cryptodesc *crd; if (crp == NULL) return; #ifdef DIAGNOSTIC { struct cryptop *crp2; struct crypto_ret_worker *ret_worker; CRYPTO_Q_LOCK(); TAILQ_FOREACH(crp2, &crp_q, crp_next) { KASSERT(crp2 != crp, ("Freeing cryptop from the crypto queue (%p).", crp)); } CRYPTO_Q_UNLOCK(); FOREACH_CRYPTO_RETW(ret_worker) { CRYPTO_RETW_LOCK(ret_worker); TAILQ_FOREACH(crp2, &ret_worker->crp_ret_q, crp_next) { KASSERT(crp2 != crp, ("Freeing cryptop from the return queue (%p).", crp)); } CRYPTO_RETW_UNLOCK(ret_worker); } } #endif while ((crd = crp->crp_desc) != NULL) { crp->crp_desc = crd->crd_next; uma_zfree(cryptodesc_zone, crd); } uma_zfree(cryptop_zone, crp); } /* * Acquire a set of crypto descriptors. */ struct cryptop * crypto_getreq(int num) { struct cryptodesc *crd; struct cryptop *crp; crp = uma_zalloc(cryptop_zone, M_NOWAIT|M_ZERO); if (crp != NULL) { while (num--) { crd = uma_zalloc(cryptodesc_zone, M_NOWAIT|M_ZERO); if (crd == NULL) { crypto_freereq(crp); return NULL; } crd->crd_next = crp->crp_desc; crp->crp_desc = crd; } } return crp; } /* * Invoke the callback on behalf of the driver. */ void crypto_done(struct cryptop *crp) { KASSERT((crp->crp_flags & CRYPTO_F_DONE) == 0, ("crypto_done: op already done, flags 0x%x", crp->crp_flags)); crp->crp_flags |= CRYPTO_F_DONE; if (crp->crp_etype != 0) cryptostats.cs_errs++; #ifdef CRYPTO_TIMING if (crypto_timing) crypto_tstat(&cryptostats.cs_done, &crp->crp_tstamp); #endif /* * CBIMM means unconditionally do the callback immediately; * CBIFSYNC means do the callback immediately only if the * operation was done synchronously. Both are used to avoid * doing extraneous context switches; the latter is mostly * used with the software crypto driver. */ if (!CRYPTOP_ASYNC_KEEPORDER(crp) && ((crp->crp_flags & CRYPTO_F_CBIMM) || ((crp->crp_flags & CRYPTO_F_CBIFSYNC) && (crypto_ses2caps(crp->crp_session) & CRYPTOCAP_F_SYNC)))) { /* * Do the callback directly. This is ok when the * callback routine does very little (e.g. the * /dev/crypto callback method just does a wakeup). */ #ifdef CRYPTO_TIMING if (crypto_timing) { /* * NB: We must copy the timestamp before * doing the callback as the cryptop is * likely to be reclaimed. */ struct bintime t = crp->crp_tstamp; crypto_tstat(&cryptostats.cs_cb, &t); crp->crp_callback(crp); crypto_tstat(&cryptostats.cs_finis, &t); } else #endif crp->crp_callback(crp); } else { struct crypto_ret_worker *ret_worker; bool wake; ret_worker = CRYPTO_RETW(crp->crp_retw_id); wake = false; /* * Normal case; queue the callback for the thread. */ CRYPTO_RETW_LOCK(ret_worker); if (CRYPTOP_ASYNC_KEEPORDER(crp)) { struct cryptop *tmp; TAILQ_FOREACH_REVERSE(tmp, &ret_worker->crp_ordered_ret_q, cryptop_q, crp_next) { if (CRYPTO_SEQ_GT(crp->crp_seq, tmp->crp_seq)) { TAILQ_INSERT_AFTER(&ret_worker->crp_ordered_ret_q, tmp, crp, crp_next); break; } } if (tmp == NULL) { TAILQ_INSERT_HEAD(&ret_worker->crp_ordered_ret_q, crp, crp_next); } if (crp->crp_seq == ret_worker->reorder_cur_seq) wake = true; } else { if (CRYPTO_RETW_EMPTY(ret_worker)) wake = true; TAILQ_INSERT_TAIL(&ret_worker->crp_ret_q, crp, crp_next); } if (wake) wakeup_one(&ret_worker->crp_ret_q); /* shared wait channel */ CRYPTO_RETW_UNLOCK(ret_worker); } } /* * Invoke the callback on behalf of the driver. */ void crypto_kdone(struct cryptkop *krp) { struct crypto_ret_worker *ret_worker; struct cryptocap *cap; if (krp->krp_status != 0) cryptostats.cs_kerrs++; CRYPTO_DRIVER_LOCK(); /* XXX: What if driver is loaded in the meantime? */ if (krp->krp_hid < crypto_drivers_num) { cap = &crypto_drivers[krp->krp_hid]; KASSERT(cap->cc_koperations > 0, ("cc_koperations == 0")); cap->cc_koperations--; if (cap->cc_flags & CRYPTOCAP_F_CLEANUP) crypto_remove(cap); } CRYPTO_DRIVER_UNLOCK(); ret_worker = CRYPTO_RETW(0); CRYPTO_RETW_LOCK(ret_worker); if (CRYPTO_RETW_EMPTY(ret_worker)) wakeup_one(&ret_worker->crp_ret_q); /* shared wait channel */ TAILQ_INSERT_TAIL(&ret_worker->crp_ret_kq, krp, krp_next); CRYPTO_RETW_UNLOCK(ret_worker); } int crypto_getfeat(int *featp) { int hid, kalg, feat = 0; CRYPTO_DRIVER_LOCK(); for (hid = 0; hid < crypto_drivers_num; hid++) { const struct cryptocap *cap = &crypto_drivers[hid]; if ((cap->cc_flags & CRYPTOCAP_F_SOFTWARE) && !crypto_devallowsoft) { continue; } for (kalg = 0; kalg < CRK_ALGORITHM_MAX; kalg++) if (cap->cc_kalg[kalg] & CRYPTO_ALG_FLAG_SUPPORTED) feat |= 1 << kalg; } CRYPTO_DRIVER_UNLOCK(); *featp = feat; return (0); } /* * Terminate a thread at module unload. The process that * initiated this is waiting for us to signal that we're gone; * wake it up and exit. We use the driver table lock to insure * we don't do the wakeup before they're waiting. There is no * race here because the waiter sleeps on the proc lock for the * thread so it gets notified at the right time because of an * extra wakeup that's done in exit1(). */ static void crypto_finis(void *chan) { CRYPTO_DRIVER_LOCK(); wakeup_one(chan); CRYPTO_DRIVER_UNLOCK(); kproc_exit(0); } /* * Crypto thread, dispatches crypto requests. */ static void crypto_proc(void) { struct cryptop *crp, *submit; struct cryptkop *krp; struct cryptocap *cap; u_int32_t hid; int result, hint; #if defined(__i386__) || defined(__amd64__) || defined(__aarch64__) fpu_kern_thread(FPU_KERN_NORMAL); #endif CRYPTO_Q_LOCK(); for (;;) { /* * Find the first element in the queue that can be * processed and look-ahead to see if multiple ops * are ready for the same driver. */ submit = NULL; hint = 0; TAILQ_FOREACH(crp, &crp_q, crp_next) { hid = crypto_ses2hid(crp->crp_session); cap = crypto_checkdriver(hid); /* * Driver cannot disappeared when there is an active * session. */ KASSERT(cap != NULL, ("%s:%u Driver disappeared.", __func__, __LINE__)); if (cap == NULL || cap->cc_dev == NULL) { /* Op needs to be migrated, process it. */ if (submit == NULL) submit = crp; break; } if (!cap->cc_qblocked) { if (submit != NULL) { /* * We stop on finding another op, * regardless whether its for the same * driver or not. We could keep * searching the queue but it might be * better to just use a per-driver * queue instead. */ if (crypto_ses2hid(submit->crp_session) == hid) hint = CRYPTO_HINT_MORE; break; } else { submit = crp; if ((submit->crp_flags & CRYPTO_F_BATCH) == 0) break; /* keep scanning for more are q'd */ } } } if (submit != NULL) { TAILQ_REMOVE(&crp_q, submit, crp_next); hid = crypto_ses2hid(submit->crp_session); cap = crypto_checkdriver(hid); KASSERT(cap != NULL, ("%s:%u Driver disappeared.", __func__, __LINE__)); result = crypto_invoke(cap, submit, hint); if (result == ERESTART) { /* * The driver ran out of resources, mark the * driver ``blocked'' for cryptop's and put * the request back in the queue. It would * best to put the request back where we got * it but that's hard so for now we put it * at the front. This should be ok; putting * it at the end does not work. */ /* XXX validate sid again? */ crypto_drivers[crypto_ses2hid(submit->crp_session)].cc_qblocked = 1; TAILQ_INSERT_HEAD(&crp_q, submit, crp_next); cryptostats.cs_blocks++; } } /* As above, but for key ops */ TAILQ_FOREACH(krp, &crp_kq, krp_next) { cap = crypto_checkdriver(krp->krp_hid); if (cap == NULL || cap->cc_dev == NULL) { /* * Operation needs to be migrated, invalidate * the assigned device so it will reselect a * new one below. Propagate the original * crid selection flags if supplied. */ krp->krp_hid = krp->krp_crid & (CRYPTOCAP_F_SOFTWARE|CRYPTOCAP_F_HARDWARE); if (krp->krp_hid == 0) krp->krp_hid = CRYPTOCAP_F_SOFTWARE|CRYPTOCAP_F_HARDWARE; break; } if (!cap->cc_kqblocked) break; } if (krp != NULL) { TAILQ_REMOVE(&crp_kq, krp, krp_next); result = crypto_kinvoke(krp, krp->krp_hid); if (result == ERESTART) { /* * The driver ran out of resources, mark the * driver ``blocked'' for cryptkop's and put * the request back in the queue. It would * best to put the request back where we got * it but that's hard so for now we put it * at the front. This should be ok; putting * it at the end does not work. */ /* XXX validate sid again? */ crypto_drivers[krp->krp_hid].cc_kqblocked = 1; TAILQ_INSERT_HEAD(&crp_kq, krp, krp_next); cryptostats.cs_kblocks++; } } if (submit == NULL && krp == NULL) { /* * Nothing more to be processed. Sleep until we're * woken because there are more ops to process. * This happens either by submission or by a driver * becoming unblocked and notifying us through * crypto_unblock. Note that when we wakeup we * start processing each queue again from the * front. It's not clear that it's important to * preserve this ordering since ops may finish * out of order if dispatched to different devices * and some become blocked while others do not. */ crp_sleep = 1; msleep(&crp_q, &crypto_q_mtx, PWAIT, "crypto_wait", 0); crp_sleep = 0; if (cryptoproc == NULL) break; cryptostats.cs_intrs++; } } CRYPTO_Q_UNLOCK(); crypto_finis(&crp_q); } /* * Crypto returns thread, does callbacks for processed crypto requests. * Callbacks are done here, rather than in the crypto drivers, because * callbacks typically are expensive and would slow interrupt handling. */ static void crypto_ret_proc(struct crypto_ret_worker *ret_worker) { struct cryptop *crpt; struct cryptkop *krpt; CRYPTO_RETW_LOCK(ret_worker); for (;;) { /* Harvest return q's for completed ops */ crpt = TAILQ_FIRST(&ret_worker->crp_ordered_ret_q); if (crpt != NULL) { if (crpt->crp_seq == ret_worker->reorder_cur_seq) { TAILQ_REMOVE(&ret_worker->crp_ordered_ret_q, crpt, crp_next); ret_worker->reorder_cur_seq++; } else { crpt = NULL; } } if (crpt == NULL) { crpt = TAILQ_FIRST(&ret_worker->crp_ret_q); if (crpt != NULL) TAILQ_REMOVE(&ret_worker->crp_ret_q, crpt, crp_next); } krpt = TAILQ_FIRST(&ret_worker->crp_ret_kq); if (krpt != NULL) TAILQ_REMOVE(&ret_worker->crp_ret_kq, krpt, krp_next); if (crpt != NULL || krpt != NULL) { CRYPTO_RETW_UNLOCK(ret_worker); /* * Run callbacks unlocked. */ if (crpt != NULL) { #ifdef CRYPTO_TIMING if (crypto_timing) { /* * NB: We must copy the timestamp before * doing the callback as the cryptop is * likely to be reclaimed. */ struct bintime t = crpt->crp_tstamp; crypto_tstat(&cryptostats.cs_cb, &t); crpt->crp_callback(crpt); crypto_tstat(&cryptostats.cs_finis, &t); } else #endif crpt->crp_callback(crpt); } if (krpt != NULL) krpt->krp_callback(krpt); CRYPTO_RETW_LOCK(ret_worker); } else { /* * Nothing more to be processed. Sleep until we're * woken because there are more returns to process. */ msleep(&ret_worker->crp_ret_q, &ret_worker->crypto_ret_mtx, PWAIT, "crypto_ret_wait", 0); if (ret_worker->cryptoretproc == NULL) break; cryptostats.cs_rets++; } } CRYPTO_RETW_UNLOCK(ret_worker); crypto_finis(&ret_worker->crp_ret_q); } #ifdef DDB static void db_show_drivers(void) { int hid; db_printf("%12s %4s %4s %8s %2s %2s\n" , "Device" , "Ses" , "Kops" , "Flags" , "QB" , "KB" ); for (hid = 0; hid < crypto_drivers_num; hid++) { const struct cryptocap *cap = &crypto_drivers[hid]; if (cap->cc_dev == NULL) continue; db_printf("%-12s %4u %4u %08x %2u %2u\n" , device_get_nameunit(cap->cc_dev) , cap->cc_sessions , cap->cc_koperations , cap->cc_flags , cap->cc_qblocked , cap->cc_kqblocked ); } } DB_SHOW_COMMAND(crypto, db_show_crypto) { struct cryptop *crp; struct crypto_ret_worker *ret_worker; db_show_drivers(); db_printf("\n"); db_printf("%4s %8s %4s %4s %4s %4s %8s %8s\n", "HID", "Caps", "Ilen", "Olen", "Etype", "Flags", "Desc", "Callback"); TAILQ_FOREACH(crp, &crp_q, crp_next) { db_printf("%4u %08x %4u %4u %4u %04x %8p %8p\n" , (int) crypto_ses2hid(crp->crp_session) , (int) crypto_ses2caps(crp->crp_session) , crp->crp_ilen, crp->crp_olen , crp->crp_etype , crp->crp_flags , crp->crp_desc , crp->crp_callback ); } FOREACH_CRYPTO_RETW(ret_worker) { db_printf("\n%8s %4s %4s %4s %8s\n", "ret_worker", "HID", "Etype", "Flags", "Callback"); if (!TAILQ_EMPTY(&ret_worker->crp_ret_q)) { TAILQ_FOREACH(crp, &ret_worker->crp_ret_q, crp_next) { db_printf("%8td %4u %4u %04x %8p\n" , CRYPTO_RETW_ID(ret_worker) , (int) crypto_ses2hid(crp->crp_session) , crp->crp_etype , crp->crp_flags , crp->crp_callback ); } } } } DB_SHOW_COMMAND(kcrypto, db_show_kcrypto) { struct cryptkop *krp; struct crypto_ret_worker *ret_worker; db_show_drivers(); db_printf("\n"); db_printf("%4s %5s %4s %4s %8s %4s %8s\n", "Op", "Status", "#IP", "#OP", "CRID", "HID", "Callback"); TAILQ_FOREACH(krp, &crp_kq, krp_next) { db_printf("%4u %5u %4u %4u %08x %4u %8p\n" , krp->krp_op , krp->krp_status , krp->krp_iparams, krp->krp_oparams , krp->krp_crid, krp->krp_hid , krp->krp_callback ); } ret_worker = CRYPTO_RETW(0); if (!TAILQ_EMPTY(&ret_worker->crp_ret_q)) { db_printf("%4s %5s %8s %4s %8s\n", "Op", "Status", "CRID", "HID", "Callback"); TAILQ_FOREACH(krp, &ret_worker->crp_ret_kq, krp_next) { db_printf("%4u %5u %08x %4u %8p\n" , krp->krp_op , krp->krp_status , krp->krp_crid, krp->krp_hid , krp->krp_callback ); } } } #endif int crypto_modevent(module_t mod, int type, void *unused); /* * Initialization code, both for static and dynamic loading. * Note this is not invoked with the usual MODULE_DECLARE * mechanism but instead is listed as a dependency by the * cryptosoft driver. This guarantees proper ordering of * calls on module load/unload. */ int crypto_modevent(module_t mod, int type, void *unused) { int error = EINVAL; switch (type) { case MOD_LOAD: error = crypto_init(); if (error == 0 && bootverbose) printf("crypto: \n"); break; case MOD_UNLOAD: /*XXX disallow if active sessions */ error = 0; crypto_destroy(); return 0; } return error; } MODULE_VERSION(crypto, 1); MODULE_DEPEND(crypto, zlib, 1, 1, 1); Index: head/sys/sys/param.h =================================================================== --- head/sys/sys/param.h (revision 336913) +++ head/sys/sys/param.h (revision 336914) @@ -1,365 +1,365 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)param.h 8.3 (Berkeley) 4/4/95 * $FreeBSD$ */ #ifndef _SYS_PARAM_H_ #define _SYS_PARAM_H_ #include #define BSD 199506 /* System version (year & month). */ #define BSD4_3 1 #define BSD4_4 1 /* * __FreeBSD_version numbers are documented in the Porter's Handbook. * If you bump the version for any reason, you should update the documentation * there. * Currently this lives here in the doc/ repository: * * head/en_US.ISO8859-1/books/porters-handbook/versions/chapter.xml * * scheme is: Rxx * 'R' is in the range 0 to 4 if this is a release branch or * X.0-CURRENT before releng/X.0 is created, otherwise 'R' is * in the range 5 to 9. */ #undef __FreeBSD_version -#define __FreeBSD_version 1200075 /* Master, propagated to newvers */ +#define __FreeBSD_version 1200076 /* Master, propagated to newvers */ /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, * which by definition is always true on FreeBSD. This macro is also defined * on other systems that use the kernel of FreeBSD, such as GNU/kFreeBSD. * * It is tempting to use this macro in userland code when we want to enable * kernel-specific routines, and in fact it's fine to do this in code that * is part of FreeBSD itself. However, be aware that as presence of this * macro is still not widespread (e.g. older FreeBSD versions, 3rd party * compilers, etc), it is STRONGLY DISCOURAGED to check for this macro in * external applications without also checking for __FreeBSD__ as an * alternative. */ #undef __FreeBSD_kernel__ #define __FreeBSD_kernel__ #if defined(_KERNEL) || defined(IN_RTLD) #define P_OSREL_SIGWAIT 700000 #define P_OSREL_SIGSEGV 700004 #define P_OSREL_MAP_ANON 800104 #define P_OSREL_MAP_FSTRICT 1100036 #define P_OSREL_SHUTDOWN_ENOTCONN 1100077 #define P_OSREL_MAP_GUARD 1200035 #define P_OSREL_WRFSBASE 1200041 #define P_OSREL_CK_CYLGRP 1200046 #define P_OSREL_VMTOTAL64 1200054 #define P_OSREL_MAJOR(x) ((x) / 100000) #endif #ifndef LOCORE #include #endif /* * Machine-independent constants (some used in following include files). * Redefined constants are from POSIX 1003.1 limits file. * * MAXCOMLEN should be >= sizeof(ac_comm) (see ) */ #include #define MAXCOMLEN 19 /* max command name remembered */ #define MAXINTERP PATH_MAX /* max interpreter file name length */ #define MAXLOGNAME 33 /* max login name length (incl. NUL) */ #define MAXUPRC CHILD_MAX /* max simultaneous processes */ #define NCARGS ARG_MAX /* max bytes for an exec function */ #define NGROUPS (NGROUPS_MAX+1) /* max number groups */ #define NOFILE OPEN_MAX /* max open files per process */ #define NOGROUP 65535 /* marker for empty group set member */ #define MAXHOSTNAMELEN 256 /* max hostname size */ #define SPECNAMELEN 63 /* max length of devicename */ /* More types and definitions used throughout the kernel. */ #ifdef _KERNEL #include #include #ifndef LOCORE #include #include #endif #ifndef FALSE #define FALSE 0 #endif #ifndef TRUE #define TRUE 1 #endif #endif #ifndef _KERNEL /* Signals. */ #include #endif /* Machine type dependent parameters. */ #include #ifndef _KERNEL #include #endif #ifndef DEV_BSHIFT #define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ #endif #define DEV_BSIZE (1<>PAGE_SHIFT) #endif /* * btodb() is messy and perhaps slow because `bytes' may be an off_t. We * want to shift an unsigned type to avoid sign extension and we don't * want to widen `bytes' unnecessarily. Assume that the result fits in * a daddr_t. */ #ifndef btodb #define btodb(bytes) /* calculates (bytes / DEV_BSIZE) */ \ (sizeof (bytes) > sizeof(long) \ ? (daddr_t)((unsigned long long)(bytes) >> DEV_BSHIFT) \ : (daddr_t)((unsigned long)(bytes) >> DEV_BSHIFT)) #endif #ifndef dbtob #define dbtob(db) /* calculates (db * DEV_BSIZE) */ \ ((off_t)(db) << DEV_BSHIFT) #endif #define PRIMASK 0x0ff #define PCATCH 0x100 /* OR'd with pri for tsleep to check signals */ #define PDROP 0x200 /* OR'd with pri to stop re-entry of interlock mutex */ #define NZERO 0 /* default "nice" */ #define NBBY 8 /* number of bits in a byte */ #define NBPW sizeof(int) /* number of bytes per word (integer) */ #define CMASK 022 /* default file mask: S_IWGRP|S_IWOTH */ #define NODEV (dev_t)(-1) /* non-existent device */ /* * File system parameters and macros. * * MAXBSIZE - Filesystems are made out of blocks of at most MAXBSIZE bytes * per block. MAXBSIZE may be made larger without effecting * any existing filesystems as long as it does not exceed MAXPHYS, * and may be made smaller at the risk of not being able to use * filesystems which require a block size exceeding MAXBSIZE. * * MAXBCACHEBUF - Maximum size of a buffer in the buffer cache. This must * be >= MAXBSIZE and can be set differently for different * architectures by defining it in . * Making this larger allows NFS to do larger reads/writes. * * BKVASIZE - Nominal buffer space per buffer, in bytes. BKVASIZE is the * minimum KVM memory reservation the kernel is willing to make. * Filesystems can of course request smaller chunks. Actual * backing memory uses a chunk size of a page (PAGE_SIZE). * The default value here can be overridden on a per-architecture * basis by defining it in . * * If you make BKVASIZE too small you risk seriously fragmenting * the buffer KVM map which may slow things down a bit. If you * make it too big the kernel will not be able to optimally use * the KVM memory reserved for the buffer cache and will wind * up with too-few buffers. * * The default is 16384, roughly 2x the block size used by a * normal UFS filesystem. */ #define MAXBSIZE 65536 /* must be power of 2 */ #ifndef MAXBCACHEBUF #define MAXBCACHEBUF MAXBSIZE /* must be a power of 2 >= MAXBSIZE */ #endif #ifndef BKVASIZE #define BKVASIZE 16384 /* must be power of 2 */ #endif #define BKVAMASK (BKVASIZE-1) /* * MAXPATHLEN defines the longest permissible path length after expanding * symbolic links. It is used to allocate a temporary buffer from the buffer * pool in which to do the name expansion, hence should be a power of two, * and must be less than or equal to MAXBSIZE. MAXSYMLINKS defines the * maximum number of symbolic links that may be expanded in a path name. * It should be set high enough to allow all legitimate uses, but halt * infinite loops reasonably quickly. */ #define MAXPATHLEN PATH_MAX #define MAXSYMLINKS 32 /* Bit map related macros. */ #define setbit(a,i) (((unsigned char *)(a))[(i)/NBBY] |= 1<<((i)%NBBY)) #define clrbit(a,i) (((unsigned char *)(a))[(i)/NBBY] &= ~(1<<((i)%NBBY))) #define isset(a,i) \ (((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) #define isclr(a,i) \ ((((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) == 0) /* Macros for counting and rounding. */ #ifndef howmany #define howmany(x, y) (((x)+((y)-1))/(y)) #endif #define nitems(x) (sizeof((x)) / sizeof((x)[0])) #define rounddown(x, y) (((x)/(y))*(y)) #define rounddown2(x, y) ((x)&(~((y)-1))) /* if y is power of two */ #define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */ #define roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */ #define powerof2(x) ((((x)-1)&(x))==0) /* Macros for min/max. */ #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) #ifdef _KERNEL /* * Basic byte order function prototypes for non-inline functions. */ #ifndef LOCORE #ifndef _BYTEORDER_PROTOTYPED #define _BYTEORDER_PROTOTYPED __BEGIN_DECLS __uint32_t htonl(__uint32_t); __uint16_t htons(__uint16_t); __uint32_t ntohl(__uint32_t); __uint16_t ntohs(__uint16_t); __END_DECLS #endif #endif #ifndef _BYTEORDER_FUNC_DEFINED #define _BYTEORDER_FUNC_DEFINED #define htonl(x) __htonl(x) #define htons(x) __htons(x) #define ntohl(x) __ntohl(x) #define ntohs(x) __ntohs(x) #endif /* !_BYTEORDER_FUNC_DEFINED */ #endif /* _KERNEL */ /* * Scale factor for scaled integers used to count %cpu time and load avgs. * * The number of CPU `tick's that map to a unique `%age' can be expressed * by the formula (1 / (2 ^ (FSHIFT - 11))). The maximum load average that * can be calculated (assuming 32 bits) can be closely approximated using * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15). * * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age', * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024. */ #define FSHIFT 11 /* bits to right of fixed binary point */ #define FSCALE (1<> (PAGE_SHIFT - DEV_BSHIFT)) #define ctodb(db) /* calculates pages to devblks */ \ ((db) << (PAGE_SHIFT - DEV_BSHIFT)) /* * Old spelling of __containerof(). */ #define member2struct(s, m, x) \ ((struct s *)(void *)((char *)(x) - offsetof(struct s, m))) /* * Access a variable length array that has been declared as a fixed * length array. */ #define __PAST_END(array, offset) (((__typeof__(*(array)) *)(array))[offset]) #endif /* _SYS_PARAM_H_ */ Index: head/sys/sys/time.h =================================================================== --- head/sys/sys/time.h (revision 336913) +++ head/sys/sys/time.h (revision 336914) @@ -1,564 +1,568 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)time.h 8.5 (Berkeley) 5/4/95 * $FreeBSD$ */ #ifndef _SYS_TIME_H_ #define _SYS_TIME_H_ #include #include #include struct timezone { int tz_minuteswest; /* minutes west of Greenwich */ int tz_dsttime; /* type of dst correction */ }; #define DST_NONE 0 /* not on dst */ #define DST_USA 1 /* USA style dst */ #define DST_AUST 2 /* Australian style dst */ #define DST_WET 3 /* Western European dst */ #define DST_MET 4 /* Middle European dst */ #define DST_EET 5 /* Eastern European dst */ #define DST_CAN 6 /* Canada */ #if __BSD_VISIBLE struct bintime { time_t sec; uint64_t frac; }; static __inline void bintime_addx(struct bintime *_bt, uint64_t _x) { uint64_t _u; _u = _bt->frac; _bt->frac += _x; if (_u > _bt->frac) _bt->sec++; } static __inline void bintime_add(struct bintime *_bt, const struct bintime *_bt2) { uint64_t _u; _u = _bt->frac; _bt->frac += _bt2->frac; if (_u > _bt->frac) _bt->sec++; _bt->sec += _bt2->sec; } static __inline void bintime_sub(struct bintime *_bt, const struct bintime *_bt2) { uint64_t _u; _u = _bt->frac; _bt->frac -= _bt2->frac; if (_u < _bt->frac) _bt->sec--; _bt->sec -= _bt2->sec; } static __inline void bintime_mul(struct bintime *_bt, u_int _x) { uint64_t _p1, _p2; _p1 = (_bt->frac & 0xffffffffull) * _x; _p2 = (_bt->frac >> 32) * _x + (_p1 >> 32); _bt->sec *= _x; _bt->sec += (_p2 >> 32); _bt->frac = (_p2 << 32) | (_p1 & 0xffffffffull); } static __inline void bintime_shift(struct bintime *_bt, int _exp) { if (_exp > 0) { _bt->sec <<= _exp; _bt->sec |= _bt->frac >> (64 - _exp); _bt->frac <<= _exp; } else if (_exp < 0) { _bt->frac >>= -_exp; _bt->frac |= (uint64_t)_bt->sec << (64 + _exp); _bt->sec >>= -_exp; } } #define bintime_clear(a) ((a)->sec = (a)->frac = 0) #define bintime_isset(a) ((a)->sec || (a)->frac) #define bintime_cmp(a, b, cmp) \ (((a)->sec == (b)->sec) ? \ ((a)->frac cmp (b)->frac) : \ ((a)->sec cmp (b)->sec)) #define SBT_1S ((sbintime_t)1 << 32) #define SBT_1M (SBT_1S * 60) #define SBT_1MS (SBT_1S / 1000) #define SBT_1US (SBT_1S / 1000000) #define SBT_1NS (SBT_1S / 1000000000) /* beware rounding, see nstosbt() */ #define SBT_MAX 0x7fffffffffffffffLL static __inline int sbintime_getsec(sbintime_t _sbt) { return (_sbt >> 32); } static __inline sbintime_t bttosbt(const struct bintime _bt) { return (((sbintime_t)_bt.sec << 32) + (_bt.frac >> 32)); } static __inline struct bintime sbttobt(sbintime_t _sbt) { struct bintime _bt; _bt.sec = _sbt >> 32; _bt.frac = _sbt << 32; return (_bt); } /* * Decimal<->sbt conversions. Multiplying or dividing by SBT_1NS results in * large roundoff errors which sbttons() and nstosbt() avoid. Millisecond and * microsecond functions are also provided for completeness. */ static __inline int64_t sbttons(sbintime_t _sbt) { return ((1000000000 * _sbt) >> 32); } static __inline sbintime_t nstosbt(int64_t _ns) { return ((_ns * (((uint64_t)1 << 63) / 500000000)) >> 32); } static __inline int64_t sbttous(sbintime_t _sbt) { return ((1000000 * _sbt) >> 32); } static __inline sbintime_t ustosbt(int64_t _us) { return ((_us * (((uint64_t)1 << 63) / 500000)) >> 32); } static __inline int64_t sbttoms(sbintime_t _sbt) { return ((1000 * _sbt) >> 32); } static __inline sbintime_t mstosbt(int64_t _ms) { return ((_ms * (((uint64_t)1 << 63) / 500)) >> 32); } /*- * Background information: * * When converting between timestamps on parallel timescales of differing * resolutions it is historical and scientific practice to round down rather * than doing 4/5 rounding. * * The date changes at midnight, not at noon. * * Even at 15:59:59.999999999 it's not four'o'clock. * * time_second ticks after N.999999999 not after N.4999999999 */ static __inline void bintime2timespec(const struct bintime *_bt, struct timespec *_ts) { _ts->tv_sec = _bt->sec; _ts->tv_nsec = ((uint64_t)1000000000 * (uint32_t)(_bt->frac >> 32)) >> 32; } static __inline void timespec2bintime(const struct timespec *_ts, struct bintime *_bt) { _bt->sec = _ts->tv_sec; /* 18446744073 = int(2^64 / 1000000000) */ _bt->frac = _ts->tv_nsec * (uint64_t)18446744073LL; } static __inline void bintime2timeval(const struct bintime *_bt, struct timeval *_tv) { _tv->tv_sec = _bt->sec; _tv->tv_usec = ((uint64_t)1000000 * (uint32_t)(_bt->frac >> 32)) >> 32; } static __inline void timeval2bintime(const struct timeval *_tv, struct bintime *_bt) { _bt->sec = _tv->tv_sec; /* 18446744073709 = int(2^64 / 1000000) */ _bt->frac = _tv->tv_usec * (uint64_t)18446744073709LL; } static __inline struct timespec sbttots(sbintime_t _sbt) { struct timespec _ts; _ts.tv_sec = _sbt >> 32; _ts.tv_nsec = sbttons((uint32_t)_sbt); return (_ts); } static __inline sbintime_t tstosbt(struct timespec _ts) { return (((sbintime_t)_ts.tv_sec << 32) + nstosbt(_ts.tv_nsec)); } static __inline struct timeval sbttotv(sbintime_t _sbt) { struct timeval _tv; _tv.tv_sec = _sbt >> 32; _tv.tv_usec = sbttous((uint32_t)_sbt); return (_tv); } static __inline sbintime_t tvtosbt(struct timeval _tv) { return (((sbintime_t)_tv.tv_sec << 32) + ustosbt(_tv.tv_usec)); } #endif /* __BSD_VISIBLE */ #ifdef _KERNEL /* * Simple macros to convert ticks to milliseconds * or microseconds and vice-versa. The answer * will always be at least 1. Note the return * value is a uint32_t however we step up the * operations to 64 bit to avoid any overflow/underflow * problems. */ #define TICKS_2_MSEC(t) max(1, (uint32_t)(hz == 1000) ? \ (t) : (((uint64_t)(t) * (uint64_t)1000)/(uint64_t)hz)) #define TICKS_2_USEC(t) max(1, (uint32_t)(hz == 1000) ? \ ((t) * 1000) : (((uint64_t)(t) * (uint64_t)1000000)/(uint64_t)hz)) #define MSEC_2_TICKS(m) max(1, (uint32_t)((hz == 1000) ? \ (m) : ((uint64_t)(m) * (uint64_t)hz)/(uint64_t)1000)) #define USEC_2_TICKS(u) max(1, (uint32_t)((hz == 1000) ? \ ((u) / 1000) : ((uint64_t)(u) * (uint64_t)hz)/(uint64_t)1000000)) +#endif /* Operations on timespecs */ #define timespecclear(tvp) ((tvp)->tv_sec = (tvp)->tv_nsec = 0) #define timespecisset(tvp) ((tvp)->tv_sec || (tvp)->tv_nsec) #define timespeccmp(tvp, uvp, cmp) \ (((tvp)->tv_sec == (uvp)->tv_sec) ? \ ((tvp)->tv_nsec cmp (uvp)->tv_nsec) : \ ((tvp)->tv_sec cmp (uvp)->tv_sec)) -#define timespecadd(vvp, uvp) \ + +#define timespecadd(tsp, usp, vsp) \ do { \ - (vvp)->tv_sec += (uvp)->tv_sec; \ - (vvp)->tv_nsec += (uvp)->tv_nsec; \ - if ((vvp)->tv_nsec >= 1000000000) { \ - (vvp)->tv_sec++; \ - (vvp)->tv_nsec -= 1000000000; \ + (vsp)->tv_sec = (tsp)->tv_sec + (usp)->tv_sec; \ + (vsp)->tv_nsec = (tsp)->tv_nsec + (usp)->tv_nsec; \ + if ((vsp)->tv_nsec >= 1000000000L) { \ + (vsp)->tv_sec++; \ + (vsp)->tv_nsec -= 1000000000L; \ } \ } while (0) -#define timespecsub(vvp, uvp) \ +#define timespecsub(tsp, usp, vsp) \ do { \ - (vvp)->tv_sec -= (uvp)->tv_sec; \ - (vvp)->tv_nsec -= (uvp)->tv_nsec; \ - if ((vvp)->tv_nsec < 0) { \ - (vvp)->tv_sec--; \ - (vvp)->tv_nsec += 1000000000; \ + (vsp)->tv_sec = (tsp)->tv_sec - (usp)->tv_sec; \ + (vsp)->tv_nsec = (tsp)->tv_nsec - (usp)->tv_nsec; \ + if ((vsp)->tv_nsec < 0) { \ + (vsp)->tv_sec--; \ + (vsp)->tv_nsec += 1000000000L; \ } \ } while (0) + +#ifdef _KERNEL /* Operations on timevals. */ #define timevalclear(tvp) ((tvp)->tv_sec = (tvp)->tv_usec = 0) #define timevalisset(tvp) ((tvp)->tv_sec || (tvp)->tv_usec) #define timevalcmp(tvp, uvp, cmp) \ (((tvp)->tv_sec == (uvp)->tv_sec) ? \ ((tvp)->tv_usec cmp (uvp)->tv_usec) : \ ((tvp)->tv_sec cmp (uvp)->tv_sec)) /* timevaladd and timevalsub are not inlined */ #endif /* _KERNEL */ #ifndef _KERNEL /* NetBSD/OpenBSD compatible interfaces */ #define timerclear(tvp) ((tvp)->tv_sec = (tvp)->tv_usec = 0) #define timerisset(tvp) ((tvp)->tv_sec || (tvp)->tv_usec) #define timercmp(tvp, uvp, cmp) \ (((tvp)->tv_sec == (uvp)->tv_sec) ? \ ((tvp)->tv_usec cmp (uvp)->tv_usec) : \ ((tvp)->tv_sec cmp (uvp)->tv_sec)) #define timeradd(tvp, uvp, vvp) \ do { \ (vvp)->tv_sec = (tvp)->tv_sec + (uvp)->tv_sec; \ (vvp)->tv_usec = (tvp)->tv_usec + (uvp)->tv_usec; \ if ((vvp)->tv_usec >= 1000000) { \ (vvp)->tv_sec++; \ (vvp)->tv_usec -= 1000000; \ } \ } while (0) #define timersub(tvp, uvp, vvp) \ do { \ (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ if ((vvp)->tv_usec < 0) { \ (vvp)->tv_sec--; \ (vvp)->tv_usec += 1000000; \ } \ } while (0) #endif /* * Names of the interval timers, and structure * defining a timer setting. */ #define ITIMER_REAL 0 #define ITIMER_VIRTUAL 1 #define ITIMER_PROF 2 struct itimerval { struct timeval it_interval; /* timer interval */ struct timeval it_value; /* current value */ }; /* * Getkerninfo clock information structure */ struct clockinfo { int hz; /* clock frequency */ int tick; /* micro-seconds per hz tick */ int spare; int stathz; /* statistics clock frequency */ int profhz; /* profiling clock frequency */ }; /* These macros are also in time.h. */ #ifndef CLOCK_REALTIME #define CLOCK_REALTIME 0 #define CLOCK_VIRTUAL 1 #define CLOCK_PROF 2 #define CLOCK_MONOTONIC 4 #define CLOCK_UPTIME 5 /* FreeBSD-specific. */ #define CLOCK_UPTIME_PRECISE 7 /* FreeBSD-specific. */ #define CLOCK_UPTIME_FAST 8 /* FreeBSD-specific. */ #define CLOCK_REALTIME_PRECISE 9 /* FreeBSD-specific. */ #define CLOCK_REALTIME_FAST 10 /* FreeBSD-specific. */ #define CLOCK_MONOTONIC_PRECISE 11 /* FreeBSD-specific. */ #define CLOCK_MONOTONIC_FAST 12 /* FreeBSD-specific. */ #define CLOCK_SECOND 13 /* FreeBSD-specific. */ #define CLOCK_THREAD_CPUTIME_ID 14 #define CLOCK_PROCESS_CPUTIME_ID 15 #endif #ifndef TIMER_ABSTIME #define TIMER_RELTIME 0x0 /* relative timer */ #define TIMER_ABSTIME 0x1 /* absolute timer */ #endif #if __BSD_VISIBLE #define CPUCLOCK_WHICH_PID 0 #define CPUCLOCK_WHICH_TID 1 #endif #ifdef _KERNEL /* * Kernel to clock driver interface. */ void inittodr(time_t base); void resettodr(void); extern volatile time_t time_second; extern volatile time_t time_uptime; extern struct bintime tc_tick_bt; extern sbintime_t tc_tick_sbt; extern struct bintime tick_bt; extern sbintime_t tick_sbt; extern int tc_precexp; extern int tc_timepercentage; extern struct bintime bt_timethreshold; extern struct bintime bt_tickthreshold; extern sbintime_t sbt_timethreshold; extern sbintime_t sbt_tickthreshold; extern volatile int rtc_generation; /* * Functions for looking at our clock: [get]{bin,nano,micro}[up]time() * * Functions without the "get" prefix returns the best timestamp * we can produce in the given format. * * "bin" == struct bintime == seconds + 64 bit fraction of seconds. * "nano" == struct timespec == seconds + nanoseconds. * "micro" == struct timeval == seconds + microseconds. * * Functions containing "up" returns time relative to boot and * should be used for calculating time intervals. * * Functions without "up" returns UTC time. * * Functions with the "get" prefix returns a less precise result * much faster than the functions without "get" prefix and should * be used where a precision of 1/hz seconds is acceptable or where * performance is priority. (NB: "precision", _not_ "resolution" !) */ void binuptime(struct bintime *bt); void nanouptime(struct timespec *tsp); void microuptime(struct timeval *tvp); static __inline sbintime_t sbinuptime(void) { struct bintime _bt; binuptime(&_bt); return (bttosbt(_bt)); } void bintime(struct bintime *bt); void nanotime(struct timespec *tsp); void microtime(struct timeval *tvp); void getbinuptime(struct bintime *bt); void getnanouptime(struct timespec *tsp); void getmicrouptime(struct timeval *tvp); static __inline sbintime_t getsbinuptime(void) { struct bintime _bt; getbinuptime(&_bt); return (bttosbt(_bt)); } void getbintime(struct bintime *bt); void getnanotime(struct timespec *tsp); void getmicrotime(struct timeval *tvp); void getboottime(struct timeval *boottime); void getboottimebin(struct bintime *boottimebin); /* Other functions */ int itimerdecr(struct itimerval *itp, int usec); int itimerfix(struct timeval *tv); int ppsratecheck(struct timeval *, int *, int); int ratecheck(struct timeval *, const struct timeval *); void timevaladd(struct timeval *t1, const struct timeval *t2); void timevalsub(struct timeval *t1, const struct timeval *t2); int tvtohz(struct timeval *tv); #define TC_DEFAULTPERC 5 #define BT2FREQ(bt) \ (((uint64_t)0x8000000000000000 + ((bt)->frac >> 2)) / \ ((bt)->frac >> 1)) #define SBT2FREQ(sbt) ((SBT_1S + ((sbt) >> 1)) / (sbt)) #define FREQ2BT(freq, bt) \ { \ (bt)->sec = 0; \ (bt)->frac = ((uint64_t)0x8000000000000000 / (freq)) << 1; \ } #define TIMESEL(sbt, sbt2) \ (((sbt2) >= sbt_timethreshold) ? \ ((*(sbt) = getsbinuptime()), 1) : ((*(sbt) = sbinuptime()), 0)) #else /* !_KERNEL */ #include #include #include __BEGIN_DECLS int setitimer(int, const struct itimerval *, struct itimerval *); int utimes(const char *, const struct timeval *); #if __BSD_VISIBLE int adjtime(const struct timeval *, struct timeval *); int clock_getcpuclockid2(id_t, int, clockid_t *); int futimes(int, const struct timeval *); int futimesat(int, const char *, const struct timeval [2]); int lutimes(const char *, const struct timeval *); int settimeofday(const struct timeval *, const struct timezone *); #endif #if __XSI_VISIBLE int getitimer(int, struct itimerval *); int gettimeofday(struct timeval *, struct timezone *); #endif __END_DECLS #endif /* !_KERNEL */ #endif /* !_SYS_TIME_H_ */ Index: head/sys/ufs/ffs/ffs_snapshot.c =================================================================== --- head/sys/ufs/ffs/ffs_snapshot.c (revision 336913) +++ head/sys/ufs/ffs/ffs_snapshot.c (revision 336914) @@ -1,2706 +1,2706 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved. * * Further information about snapshots can be obtained from: * * Marshall Kirk McKusick http://www.mckusick.com/softdep/ * 1614 Oxford Street mckusick@mckusick.com * Berkeley, CA 94709-1608 +1-510-843-9542 * USA * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00 */ #include __FBSDID("$FreeBSD$"); #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define KERNCRED thread0.td_ucred #define DEBUG 1 #include "opt_ffs.h" #ifdef NO_FFS_SNAPSHOT int ffs_snapshot(mp, snapfile) struct mount *mp; char *snapfile; { return (EINVAL); } int ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, wkhd) struct fs *fs; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; enum vtype vtype; struct workhead *wkhd; { return (EINVAL); } void ffs_snapremove(vp) struct vnode *vp; { } void ffs_snapshot_mount(mp) struct mount *mp; { } void ffs_snapshot_unmount(mp) struct mount *mp; { } void ffs_snapgone(ip) struct inode *ip; { } int ffs_copyonwrite(devvp, bp) struct vnode *devvp; struct buf *bp; { return (EINVAL); } void ffs_sync_snap(mp, waitfor) struct mount *mp; int waitfor; { } #else FEATURE(ffs_snapshot, "FFS snapshot support"); LIST_HEAD(, snapdata) snapfree; static struct mtx snapfree_lock; MTX_SYSINIT(ffs_snapfree, &snapfree_lock, "snapdata free list", MTX_DEF); static int cgaccount(int, struct vnode *, struct buf *, int); static int expunge_ufs1(struct vnode *, struct inode *, struct fs *, int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int), int, int); static int indiracct_ufs1(struct vnode *, struct vnode *, int, ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int), int); static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); static int expunge_ufs2(struct vnode *, struct inode *, struct fs *, int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int), int, int); static int indiracct_ufs2(struct vnode *, struct vnode *, int, ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *, int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int), int); static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t); static void try_free_snapdata(struct vnode *devvp); static struct snapdata *ffs_snapdata_acquire(struct vnode *devvp); static int ffs_bp_snapblk(struct vnode *, struct buf *); /* * To ensure the consistency of snapshots across crashes, we must * synchronously write out copied blocks before allowing the * originals to be modified. Because of the rather severe speed * penalty that this imposes, the code normally only ensures * persistence for the filesystem metadata contained within a * snapshot. Setting the following flag allows this crash * persistence to be enabled for file contents. */ int dopersistence = 0; #ifdef DEBUG #include SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, ""); static int snapdebug = 0; SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, ""); int collectsnapstats = 0; SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats, 0, ""); #endif /* DEBUG */ /* * Create a snapshot file and initialize it for the filesystem. */ int ffs_snapshot(mp, snapfile) struct mount *mp; char *snapfile; { ufs2_daddr_t numblks, blkno, *blkp, *snapblklist; int error, cg, snaploc; int i, size, len, loc; ufs2_daddr_t blockno; uint64_t flag; struct timespec starttime = {0, 0}, endtime; char saved_nice = 0; long redo = 0, snaplistsize = 0; int32_t *lp; void *space; struct fs *copy_fs = NULL, *fs; struct thread *td = curthread; struct inode *ip, *xp; struct buf *bp, *nbp, *ibp; struct nameidata nd; struct mount *wrtmp; struct vattr vat; struct vnode *vp, *xvp, *mvp, *devvp; struct uio auio; struct iovec aiov; struct snapdata *sn; struct ufsmount *ump; ump = VFSTOUFS(mp); fs = ump->um_fs; sn = NULL; /* * At the moment, journaled soft updates cannot support * taking snapshots. */ if (MOUNTEDSUJ(mp)) { vfs_mount_error(mp, "%s: Snapshots are not yet supported when " "running with journaled soft updates", fs->fs_fsmnt); return (EOPNOTSUPP); } MNT_ILOCK(mp); flag = mp->mnt_flag; MNT_IUNLOCK(mp); /* * Need to serialize access to snapshot code per filesystem. */ /* * Assign a snapshot slot in the superblock. */ UFS_LOCK(ump); for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) if (fs->fs_snapinum[snaploc] == 0) break; UFS_UNLOCK(ump); if (snaploc == FSMAXSNAP) return (ENOSPC); /* * Create the snapshot file. */ restart: NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF | NOCACHE, UIO_SYSSPACE, snapfile, td); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp != NULL) { vput(nd.ni_vp); error = EEXIST; } if (nd.ni_dvp->v_mount != mp) error = EXDEV; if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); return (error); } VATTR_NULL(&vat); vat.va_type = VREG; vat.va_mode = S_IRUSR; vat.va_vaflags |= VA_EXCLUSIVE; if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp)) wrtmp = NULL; if (wrtmp != mp) panic("ffs_snapshot: mount mismatch"); vfs_rel(wrtmp); if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &wrtmp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat); VOP_UNLOCK(nd.ni_dvp, 0); if (error) { NDFREE(&nd, NDF_ONLY_PNBUF); vn_finished_write(wrtmp); vrele(nd.ni_dvp); return (error); } vp = nd.ni_vp; vp->v_vflag |= VV_SYSTEM; ip = VTOI(vp); devvp = ITODEVVP(ip); /* * Allocate and copy the last block contents so as to be able * to set size to that of the filesystem. */ numblks = howmany(fs->fs_size, fs->fs_frag); error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); if (error) goto out; ip->i_size = lblktosize(fs, (off_t)numblks); DIP_SET(ip, i_size, ip->i_size); ip->i_flag |= IN_CHANGE | IN_UPDATE; error = readblock(vp, bp, numblks - 1); bawrite(bp); if (error != 0) goto out; /* * Preallocate critical data structures so that we can copy * them in without further allocation after we suspend all * operations on the filesystem. We would like to just release * the allocated buffers without writing them since they will * be filled in below once we are ready to go, but this upsets * the soft update code, so we go ahead and write the new buffers. * * Allocate all indirect blocks and mark all of them as not * needing to be copied. */ for (blkno = UFS_NDADDR; blkno < numblks; blkno += NINDIR(fs)) { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp); if (error) goto out; bawrite(ibp); } /* * Allocate copies for the superblock and its summary information. */ error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); blkno = fragstoblks(fs, fs->fs_csaddr); len = howmany(fs->fs_cssize, fs->fs_bsize); for (loc = 0; loc < len; loc++) { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); } /* * Allocate all cylinder group blocks. */ for (cg = 0; cg < fs->fs_ncg; cg++) { error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; bawrite(nbp); if (cg % 10 == 0) ffs_syncvnode(vp, MNT_WAIT, 0); } /* * Copy all the cylinder group maps. Although the * filesystem is still active, we hope that only a few * cylinder groups will change between now and when we * suspend operations. Thus, we will be able to quickly * touch up the few cylinder groups that changed during * the suspension period. */ len = howmany(fs->fs_ncg, NBBY); space = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO); UFS_LOCK(ump); fs->fs_active = space; UFS_UNLOCK(ump); for (cg = 0; cg < fs->fs_ncg; cg++) { error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out; error = cgaccount(cg, vp, nbp, 1); bawrite(nbp); if (cg % 10 == 0) ffs_syncvnode(vp, MNT_WAIT, 0); if (error) goto out; } /* * Change inode to snapshot type file. */ ip->i_flags |= SF_SNAPSHOT; DIP_SET(ip, i_flags, ip->i_flags); ip->i_flag |= IN_CHANGE | IN_UPDATE; /* * Ensure that the snapshot is completely on disk. * Since we have marked it as a snapshot it is safe to * unlock it as no process will be allowed to write to it. */ if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) goto out; VOP_UNLOCK(vp, 0); /* * All allocations are done, so we can now snapshot the system. * * Recind nice scheduling while running with the filesystem suspended. */ if (td->td_proc->p_nice > 0) { struct proc *p; p = td->td_proc; PROC_LOCK(p); saved_nice = p->p_nice; sched_nice(p, 0); PROC_UNLOCK(p); } /* * Suspend operation on filesystem. */ for (;;) { vn_finished_write(wrtmp); if ((error = vfs_write_suspend(vp->v_mount, 0)) != 0) { vn_start_write(NULL, &wrtmp, V_WAIT); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); goto out; } if (mp->mnt_kern_flag & MNTK_SUSPENDED) break; vn_start_write(NULL, &wrtmp, V_WAIT); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (ip->i_effnlink == 0) { error = ENOENT; /* Snapshot file unlinked */ goto out1; } if (collectsnapstats) nanotime(&starttime); /* The last block might have changed. Copy it again to be sure. */ error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)), fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); if (error != 0) goto out1; error = readblock(vp, bp, numblks - 1); bp->b_flags |= B_VALIDSUSPWRT; bawrite(bp); if (error != 0) goto out1; /* * First, copy all the cylinder group maps that have changed. */ for (cg = 0; cg < fs->fs_ncg; cg++) { if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0) continue; redo++; error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)), fs->fs_bsize, KERNCRED, 0, &nbp); if (error) goto out1; error = cgaccount(cg, vp, nbp, 2); bawrite(nbp); if (error) goto out1; } /* * Grab a copy of the superblock and its summary information. * We delay writing it until the suspension is released below. */ copy_fs = malloc((u_long)fs->fs_bsize, M_UFSMNT, M_WAITOK); bcopy(fs, copy_fs, fs->fs_sbsize); if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0) copy_fs->fs_clean = 1; size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE; if (fs->fs_sbsize < size) bzero(&((char *)copy_fs)[fs->fs_sbsize], size - fs->fs_sbsize); size = blkroundup(fs, fs->fs_cssize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); space = malloc((u_long)size, M_UFSMNT, M_WAITOK); copy_fs->fs_csp = space; bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize); space = (char *)space + fs->fs_cssize; loc = howmany(fs->fs_cssize, fs->fs_fsize); i = fs->fs_frag - loc % fs->fs_frag; len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize; if (len > 0) { if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc), len, KERNCRED, &bp)) != 0) { brelse(bp); free(copy_fs->fs_csp, M_UFSMNT); free(copy_fs, M_UFSMNT); copy_fs = NULL; goto out1; } bcopy(bp->b_data, space, (u_int)len); space = (char *)space + len; bp->b_flags |= B_INVAL | B_NOCACHE; brelse(bp); } if (fs->fs_contigsumsize > 0) { copy_fs->fs_maxcluster = lp = space; for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; } /* * We must check for active files that have been unlinked * (e.g., with a zero link count). We have to expunge all * trace of these files from the snapshot so that they are * not reclaimed prematurely by fsck or unnecessarily dumped. * We turn off the MNTK_SUSPENDED flag to avoid a panic from * spec_strategy about writing on a suspended filesystem. * Note that we skip unlinked snapshot files as they will * be handled separately below. * * We also calculate the needed size for the snapshot list. */ snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) + FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */; MNT_ILOCK(mp); mp->mnt_kern_flag &= ~MNTK_SUSPENDED; MNT_IUNLOCK(mp); loop: MNT_VNODE_FOREACH_ALL(xvp, mp, mvp) { if ((xvp->v_usecount == 0 && (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) || xvp->v_type == VNON || IS_SNAPSHOT(VTOI(xvp))) { VI_UNLOCK(xvp); continue; } /* * We can skip parent directory vnode because it must have * this snapshot file in it. */ if (xvp == nd.ni_dvp) { VI_UNLOCK(xvp); continue; } vholdl(xvp); if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) { MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); vdrop(xvp); goto loop; } VI_LOCK(xvp); if (xvp->v_usecount == 0 && (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) { VI_UNLOCK(xvp); VOP_UNLOCK(xvp, 0); vdrop(xvp); continue; } VI_UNLOCK(xvp); if (snapdebug) vn_printf(xvp, "ffs_snapshot: busy vnode "); if (VOP_GETATTR(xvp, &vat, td->td_ucred) == 0 && vat.va_nlink > 0) { VOP_UNLOCK(xvp, 0); vdrop(xvp); continue; } xp = VTOI(xvp); if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) { VOP_UNLOCK(xvp, 0); vdrop(xvp); continue; } /* * If there is a fragment, clear it here. */ blkno = 0; loc = howmany(xp->i_size, fs->fs_bsize) - 1; if (loc < UFS_NDADDR) { len = fragroundup(fs, blkoff(fs, xp->i_size)); if (len != 0 && len < fs->fs_bsize) { ffs_blkfree(ump, copy_fs, vp, DIP(xp, i_db[loc]), len, xp->i_number, xvp->v_type, NULL); blkno = DIP(xp, i_db[loc]); DIP_SET(xp, i_db[loc], 0); } } snaplistsize += 1; if (I_IS_UFS1(xp)) error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, BLK_NOCOPY, 1); else error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, BLK_NOCOPY, 1); if (blkno) DIP_SET(xp, i_db[loc], blkno); if (!error) error = ffs_freefile(ump, copy_fs, vp, xp->i_number, xp->i_mode, NULL); VOP_UNLOCK(xvp, 0); vdrop(xvp); if (error) { free(copy_fs->fs_csp, M_UFSMNT); free(copy_fs, M_UFSMNT); copy_fs = NULL; MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto out1; } } /* * Erase the journal file from the snapshot. */ if (fs->fs_flags & FS_SUJ) { error = softdep_journal_lookup(mp, &xvp); if (error) { free(copy_fs->fs_csp, M_UFSMNT); free(copy_fs, M_UFSMNT); copy_fs = NULL; goto out1; } xp = VTOI(xvp); if (I_IS_UFS1(xp)) error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1, BLK_NOCOPY, 0); else error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2, BLK_NOCOPY, 0); vput(xvp); } /* * Acquire a lock on the snapdata structure, creating it if necessary. */ sn = ffs_snapdata_acquire(devvp); /* * Change vnode to use shared snapshot lock instead of the original * private lock. */ vp->v_vnlock = &sn->sn_lock; lockmgr(&vp->v_lock, LK_RELEASE, NULL); xp = TAILQ_FIRST(&sn->sn_head); /* * If this is the first snapshot on this filesystem, then we need * to allocate the space for the list of preallocated snapshot blocks. * This list will be refined below, but this preliminary one will * keep us out of deadlock until the full one is ready. */ if (xp == NULL) { snapblklist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); blkp = &snapblklist[1]; *blkp++ = lblkno(fs, fs->fs_sblockloc); blkno = fragstoblks(fs, fs->fs_csaddr); for (cg = 0; cg < fs->fs_ncg; cg++) { if (fragstoblks(fs, cgtod(fs, cg) > blkno)) break; *blkp++ = fragstoblks(fs, cgtod(fs, cg)); } len = howmany(fs->fs_cssize, fs->fs_bsize); for (loc = 0; loc < len; loc++) *blkp++ = blkno + loc; for (; cg < fs->fs_ncg; cg++) *blkp++ = fragstoblks(fs, cgtod(fs, cg)); snapblklist[0] = blkp - snapblklist; VI_LOCK(devvp); if (sn->sn_blklist != NULL) panic("ffs_snapshot: non-empty list"); sn->sn_blklist = snapblklist; sn->sn_listsize = blkp - snapblklist; VI_UNLOCK(devvp); } /* * Record snapshot inode. Since this is the newest snapshot, * it must be placed at the end of the list. */ VI_LOCK(devvp); fs->fs_snapinum[snaploc] = ip->i_number; if (ip->i_nextsnap.tqe_prev != 0) panic("ffs_snapshot: %ju already on list", (uintmax_t)ip->i_number); TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap); devvp->v_vflag |= VV_COPYONWRITE; VI_UNLOCK(devvp); ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp"); out1: KASSERT((sn != NULL && copy_fs != NULL && error == 0) || (sn == NULL && copy_fs == NULL && error != 0), ("email phk@ and mckusick@")); /* * Resume operation on filesystem. */ vfs_write_resume(vp->v_mount, VR_START_WRITE | VR_NO_SUSPCLR); if (collectsnapstats && starttime.tv_sec > 0) { nanotime(&endtime); - timespecsub(&endtime, &starttime); + timespecsub(&endtime, &starttime, &endtime); printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n", vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec, endtime.tv_nsec / 1000000, redo, fs->fs_ncg); } if (copy_fs == NULL) goto out; /* * Copy allocation information from all the snapshots in * this snapshot and then expunge them from its view. */ TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) { if (xp == ip) break; if (I_IS_UFS1(xp)) error = expunge_ufs1(vp, xp, fs, snapacct_ufs1, BLK_SNAP, 0); else error = expunge_ufs2(vp, xp, fs, snapacct_ufs2, BLK_SNAP, 0); if (error == 0 && xp->i_effnlink == 0) { error = ffs_freefile(ump, copy_fs, vp, xp->i_number, xp->i_mode, NULL); } if (error) { fs->fs_snapinum[snaploc] = 0; goto done; } } /* * Allocate space for the full list of preallocated snapshot blocks. */ snapblklist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); ip->i_snapblklist = &snapblklist[1]; /* * Expunge the blocks used by the snapshots from the set of * blocks marked as used in the snapshot bitmaps. Also, collect * the list of allocated blocks in i_snapblklist. */ if (I_IS_UFS1(ip)) error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1, BLK_SNAP, 0); else error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2, BLK_SNAP, 0); if (error) { fs->fs_snapinum[snaploc] = 0; free(snapblklist, M_UFSMNT); goto done; } if (snaplistsize < ip->i_snapblklist - snapblklist) panic("ffs_snapshot: list too small"); snaplistsize = ip->i_snapblklist - snapblklist; snapblklist[0] = snaplistsize; ip->i_snapblklist = 0; /* * Write out the list of allocated blocks to the end of the snapshot. */ auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = (void *)snapblklist; aiov.iov_len = snaplistsize * sizeof(daddr_t); auio.uio_resid = aiov.iov_len; auio.uio_offset = ip->i_size; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { fs->fs_snapinum[snaploc] = 0; free(snapblklist, M_UFSMNT); goto done; } /* * Write the superblock and its summary information * to the snapshot. */ blkno = fragstoblks(fs, fs->fs_csaddr); len = howmany(fs->fs_cssize, fs->fs_bsize); space = copy_fs->fs_csp; for (loc = 0; loc < len; loc++) { error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp); if (error) { brelse(nbp); fs->fs_snapinum[snaploc] = 0; free(snapblklist, M_UFSMNT); goto done; } bcopy(space, nbp->b_data, fs->fs_bsize); space = (char *)space + fs->fs_bsize; bawrite(nbp); } error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize, KERNCRED, &nbp); if (error) { brelse(nbp); } else { loc = blkoff(fs, fs->fs_sblockloc); bcopy((char *)copy_fs, &nbp->b_data[loc], (u_int)fs->fs_sbsize); bawrite(nbp); } /* * As this is the newest list, it is the most inclusive, so * should replace the previous list. */ VI_LOCK(devvp); space = sn->sn_blklist; sn->sn_blklist = snapblklist; sn->sn_listsize = snaplistsize; VI_UNLOCK(devvp); if (space != NULL) free(space, M_UFSMNT); /* * Preallocate all the direct blocks in the snapshot inode so * that we never have to write the inode itself to commit an * update to the contents of the snapshot. Note that once * created, the size of the snapshot will never change, so * there will never be a need to write the inode except to * update the non-integrity-critical time fields and * allocated-block count. */ for (blockno = 0; blockno < UFS_NDADDR; blockno++) { if (DIP(ip, i_db[blockno]) != 0) continue; error = UFS_BALLOC(vp, lblktosize(fs, blockno), fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp); if (error) break; error = readblock(vp, bp, blockno); bawrite(bp); if (error != 0) break; } done: free(copy_fs->fs_csp, M_UFSMNT); free(copy_fs, M_UFSMNT); copy_fs = NULL; out: NDFREE(&nd, NDF_ONLY_PNBUF); if (saved_nice > 0) { struct proc *p; p = td->td_proc; PROC_LOCK(p); sched_nice(td->td_proc, saved_nice); PROC_UNLOCK(td->td_proc); } UFS_LOCK(ump); if (fs->fs_active != 0) { free(fs->fs_active, M_DEVBUF); fs->fs_active = 0; } UFS_UNLOCK(ump); MNT_ILOCK(mp); mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA); MNT_IUNLOCK(mp); if (error) (void) ffs_truncate(vp, (off_t)0, 0, NOCRED); (void) ffs_syncvnode(vp, MNT_WAIT, 0); if (error) vput(vp); else VOP_UNLOCK(vp, 0); vrele(nd.ni_dvp); vn_finished_write(wrtmp); process_deferred_inactive(mp); return (error); } /* * Copy a cylinder group map. All the unallocated blocks are marked * BLK_NOCOPY so that the snapshot knows that it need not copy them * if they are later written. If passno is one, then this is a first * pass, so only setting needs to be done. If passno is 2, then this * is a revision to a previous pass which must be undone as the * replacement pass is done. */ static int cgaccount(cg, vp, nbp, passno) int cg; struct vnode *vp; struct buf *nbp; int passno; { struct buf *bp, *ibp; struct inode *ip; struct cg *cgp; struct fs *fs; ufs2_daddr_t base, numblks; int error, len, loc, indiroff; ip = VTOI(vp); fs = ITOFS(ip); if ((error = ffs_getcg(fs, ITODEVVP(ip), cg, &bp, &cgp)) != 0) return (error); UFS_LOCK(ITOUMP(ip)); ACTIVESET(fs, cg); /* * Recomputation of summary information might not have been performed * at mount time. Sync up summary information for current cylinder * group while data is in memory to ensure that result of background * fsck is slightly more consistent. */ fs->fs_cs(fs, cg) = cgp->cg_cs; UFS_UNLOCK(ITOUMP(ip)); bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize); if (fs->fs_cgsize < fs->fs_bsize) bzero(&nbp->b_data[fs->fs_cgsize], fs->fs_bsize - fs->fs_cgsize); cgp = (struct cg *)nbp->b_data; bqrelse(bp); if (passno == 2) nbp->b_flags |= B_VALIDSUSPWRT; numblks = howmany(fs->fs_size, fs->fs_frag); len = howmany(fs->fs_fpg, fs->fs_frag); base = cgbase(fs, cg) / fs->fs_frag; if (base + len >= numblks) len = numblks - base - 1; loc = 0; if (base < UFS_NDADDR) { for ( ; loc < UFS_NDADDR; loc++) { if (ffs_isblock(fs, cg_blksfree(cgp), loc)) DIP_SET(ip, i_db[loc], BLK_NOCOPY); else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY) DIP_SET(ip, i_db[loc], 0); else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY) panic("ffs_snapshot: lost direct block"); } } error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) { goto out; } indiroff = (base + loc - UFS_NDADDR) % NINDIR(fs); for ( ; loc < len; loc++, indiroff++) { if (indiroff >= NINDIR(fs)) { if (passno == 2) ibp->b_flags |= B_VALIDSUSPWRT; bawrite(ibp); error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) { goto out; } indiroff = 0; } if (I_IS_UFS1(ip)) { if (ffs_isblock(fs, cg_blksfree(cgp), loc)) ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0; else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) panic("ffs_snapshot: lost indirect block"); continue; } if (ffs_isblock(fs, cg_blksfree(cgp), loc)) ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; else if (passno == 2 && ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0; else if (passno == 1 && ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY) panic("ffs_snapshot: lost indirect block"); } if (passno == 2) ibp->b_flags |= B_VALIDSUSPWRT; bdwrite(ibp); out: /* * We have to calculate the crc32c here rather than just setting the * BX_CYLGRP b_xflags because the allocation of the block for the * the cylinder group map will always be a full size block (fs_bsize) * even though the cylinder group may be smaller (fs_cgsize). The * crc32c must be computed only over fs_cgsize whereas the BX_CYLGRP * flag causes it to be computed over the size of the buffer. */ if ((fs->fs_metackhash & CK_CYLGRP) != 0) { ((struct cg *)nbp->b_data)->cg_ckhash = 0; ((struct cg *)nbp->b_data)->cg_ckhash = calculate_crc32c(~0L, nbp->b_data, fs->fs_cgsize); } return (error); } /* * Before expunging a snapshot inode, note all the * blocks that it claims with BLK_SNAP so that fsck will * be able to account for those blocks properly and so * that this snapshot knows that it need not copy them * if the other snapshot holding them is freed. This code * is reproduced once each for UFS1 and UFS2. */ static int expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype, clearmode) struct vnode *snapvp; struct inode *cancelip; struct fs *fs; int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; int clearmode; { int i, error, indiroff; ufs_lbn_t lbn, rlbn; ufs2_daddr_t len, blkno, numblks, blksperindir; struct ufs1_dinode *dip; struct thread *td = curthread; struct buf *bp; /* * Prepare to expunge the inode. If its inode block has not * yet been copied, then allocate and fill the copy. */ lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); blkno = 0; if (lbn < UFS_NDADDR) { blkno = VTOI(snapvp)->i_din1->di_db[lbn]; } else { if (DOINGSOFTDEP(snapvp)) softdep_prealloc(snapvp, MNT_WAIT); td->td_pflags |= TDP_COWINPROGRESS; error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) return (error); indiroff = (lbn - UFS_NDADDR) % NINDIR(fs); blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff]; bqrelse(bp); } if (blkno != 0) { if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) return (error); } else { error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &bp); if (error) return (error); if ((error = readblock(snapvp, bp, lbn)) != 0) return (error); } /* * Set a snapshot inode to be a zero length file, regular files * or unlinked snapshots to be completely unallocated. */ dip = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, cancelip->i_number); if (clearmode || cancelip->i_effnlink == 0) dip->di_mode = 0; dip->di_size = 0; dip->di_blocks = 0; dip->di_flags &= ~SF_SNAPSHOT; bzero(&dip->di_db[0], (UFS_NDADDR + UFS_NIADDR) * sizeof(ufs1_daddr_t)); bdwrite(bp); /* * Now go through and expunge all the blocks in the file * using the function requested. */ numblks = howmany(cancelip->i_size, fs->fs_bsize); if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0], &cancelip->i_din1->di_db[UFS_NDADDR], fs, 0, expungetype))) return (error); if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0], &cancelip->i_din1->di_ib[UFS_NIADDR], fs, -1, expungetype))) return (error); blksperindir = 1; lbn = -UFS_NDADDR; len = numblks - UFS_NDADDR; rlbn = UFS_NDADDR; for (i = 0; len > 0 && i < UFS_NIADDR; i++) { error = indiracct_ufs1(snapvp, ITOV(cancelip), i, cancelip->i_din1->di_ib[i], lbn, rlbn, len, blksperindir, fs, acctfunc, expungetype); if (error) return (error); blksperindir *= NINDIR(fs); lbn -= blksperindir + 1; len -= blksperindir; rlbn += blksperindir; } return (0); } /* * Descend an indirect block chain for vnode cancelvp accounting for all * its indirect blocks in snapvp. */ static int indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir, fs, acctfunc, expungetype) struct vnode *snapvp; struct vnode *cancelvp; int level; ufs1_daddr_t blkno; ufs_lbn_t lbn; ufs_lbn_t rlbn; ufs_lbn_t remblks; ufs_lbn_t blksperindir; struct fs *fs; int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; { int error, num, i; ufs_lbn_t subblksperindir; struct indir indirs[UFS_NIADDR + 2]; ufs1_daddr_t last, *bap; struct buf *bp; if (blkno == 0) { if (expungetype == BLK_NOCOPY) return (0); panic("indiracct_ufs1: missing indir"); } if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) return (error); if (lbn != indirs[num - 1 - level].in_lbn || num < 2) panic("indiracct_ufs1: botched params"); /* * We have to expand bread here since it will deadlock looking * up the block number for any blocks that are not in the cache. */ bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); bp->b_blkno = fsbtodb(fs, blkno); if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 && (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { brelse(bp); return (error); } /* * Account for the block pointers in this indirect block. */ last = howmany(remblks, blksperindir); if (last > NINDIR(fs)) last = NINDIR(fs); bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK); bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); bqrelse(bp); error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, level == 0 ? rlbn : -1, expungetype); if (error || level == 0) goto out; /* * Account for the block pointers in each of the indirect blocks * in the levels below us. */ subblksperindir = blksperindir / NINDIR(fs); for (lbn++, level--, i = 0; i < last; i++) { error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn, rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); if (error) goto out; rlbn += blksperindir; lbn -= blksperindir; remblks -= blksperindir; } out: free(bap, M_DEVBUF); return (error); } /* * Do both snap accounting and map accounting. */ static int fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype) struct vnode *vp; ufs1_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int exptype; /* BLK_SNAP or BLK_NOCOPY */ { int error; if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype))) return (error); return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)); } /* * Identify a set of blocks allocated in a snapshot inode. */ static int snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs1_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; /* BLK_SNAP or BLK_NOCOPY */ { struct inode *ip = VTOI(vp); ufs1_daddr_t blkno, *blkp; ufs_lbn_t lbn; struct buf *ibp; int error; for ( ; oldblkp < lastblkp; oldblkp++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) continue; lbn = fragstoblks(fs, blkno); if (lbn < UFS_NDADDR) { blkp = &ip->i_din1->di_db[lbn]; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else { error = ffs_balloc_ufs1(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) return (error); blkp = &((ufs1_daddr_t *)(ibp->b_data)) [(lbn - UFS_NDADDR) % NINDIR(fs)]; } /* * If we are expunging a snapshot vnode and we * find a block marked BLK_NOCOPY, then it is * one that has been allocated to this snapshot after * we took our current snapshot and can be ignored. */ if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { if (lbn >= UFS_NDADDR) brelse(ibp); } else { if (*blkp != 0) panic("snapacct_ufs1: bad block"); *blkp = expungetype; if (lbn >= UFS_NDADDR) bdwrite(ibp); } } return (0); } /* * Account for a set of blocks allocated in a snapshot inode. */ static int mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs1_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; { ufs1_daddr_t blkno; struct inode *ip; ino_t inum; int acctit; ip = VTOI(vp); inum = ip->i_number; if (lblkno == -1) acctit = 0; else acctit = 1; for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY) continue; if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum, vp->v_type, NULL); } return (0); } /* * Before expunging a snapshot inode, note all the * blocks that it claims with BLK_SNAP so that fsck will * be able to account for those blocks properly and so * that this snapshot knows that it need not copy them * if the other snapshot holding them is freed. This code * is reproduced once each for UFS1 and UFS2. */ static int expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype, clearmode) struct vnode *snapvp; struct inode *cancelip; struct fs *fs; int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; int clearmode; { int i, error, indiroff; ufs_lbn_t lbn, rlbn; ufs2_daddr_t len, blkno, numblks, blksperindir; struct ufs2_dinode *dip; struct thread *td = curthread; struct buf *bp; /* * Prepare to expunge the inode. If its inode block has not * yet been copied, then allocate and fill the copy. */ lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number)); blkno = 0; if (lbn < UFS_NDADDR) { blkno = VTOI(snapvp)->i_din2->di_db[lbn]; } else { if (DOINGSOFTDEP(snapvp)) softdep_prealloc(snapvp, MNT_WAIT); td->td_pflags |= TDP_COWINPROGRESS; error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &bp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) return (error); indiroff = (lbn - UFS_NDADDR) % NINDIR(fs); blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff]; bqrelse(bp); } if (blkno != 0) { if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp))) return (error); } else { error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &bp); if (error) return (error); if ((error = readblock(snapvp, bp, lbn)) != 0) return (error); } /* * Set a snapshot inode to be a zero length file, regular files * to be completely unallocated. */ dip = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, cancelip->i_number); if (clearmode || cancelip->i_effnlink == 0) dip->di_mode = 0; dip->di_size = 0; dip->di_blocks = 0; dip->di_flags &= ~SF_SNAPSHOT; bzero(&dip->di_db[0], (UFS_NDADDR + UFS_NIADDR) * sizeof(ufs2_daddr_t)); bdwrite(bp); /* * Now go through and expunge all the blocks in the file * using the function requested. */ numblks = howmany(cancelip->i_size, fs->fs_bsize); if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0], &cancelip->i_din2->di_db[UFS_NDADDR], fs, 0, expungetype))) return (error); if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0], &cancelip->i_din2->di_ib[UFS_NIADDR], fs, -1, expungetype))) return (error); blksperindir = 1; lbn = -UFS_NDADDR; len = numblks - UFS_NDADDR; rlbn = UFS_NDADDR; for (i = 0; len > 0 && i < UFS_NIADDR; i++) { error = indiracct_ufs2(snapvp, ITOV(cancelip), i, cancelip->i_din2->di_ib[i], lbn, rlbn, len, blksperindir, fs, acctfunc, expungetype); if (error) return (error); blksperindir *= NINDIR(fs); lbn -= blksperindir + 1; len -= blksperindir; rlbn += blksperindir; } return (0); } /* * Descend an indirect block chain for vnode cancelvp accounting for all * its indirect blocks in snapvp. */ static int indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks, blksperindir, fs, acctfunc, expungetype) struct vnode *snapvp; struct vnode *cancelvp; int level; ufs2_daddr_t blkno; ufs_lbn_t lbn; ufs_lbn_t rlbn; ufs_lbn_t remblks; ufs_lbn_t blksperindir; struct fs *fs; int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *, ufs_lbn_t, int); int expungetype; { int error, num, i; ufs_lbn_t subblksperindir; struct indir indirs[UFS_NIADDR + 2]; ufs2_daddr_t last, *bap; struct buf *bp; if (blkno == 0) { if (expungetype == BLK_NOCOPY) return (0); panic("indiracct_ufs2: missing indir"); } if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0) return (error); if (lbn != indirs[num - 1 - level].in_lbn || num < 2) panic("indiracct_ufs2: botched params"); /* * We have to expand bread here since it will deadlock looking * up the block number for any blocks that are not in the cache. */ bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0); bp->b_blkno = fsbtodb(fs, blkno); if ((bp->b_flags & B_CACHE) == 0 && (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) { brelse(bp); return (error); } /* * Account for the block pointers in this indirect block. */ last = howmany(remblks, blksperindir); if (last > NINDIR(fs)) last = NINDIR(fs); bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK); bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize); bqrelse(bp); error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs, level == 0 ? rlbn : -1, expungetype); if (error || level == 0) goto out; /* * Account for the block pointers in each of the indirect blocks * in the levels below us. */ subblksperindir = blksperindir / NINDIR(fs); for (lbn++, level--, i = 0; i < last; i++) { error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn, rlbn, remblks, subblksperindir, fs, acctfunc, expungetype); if (error) goto out; rlbn += blksperindir; lbn -= blksperindir; remblks -= blksperindir; } out: free(bap, M_DEVBUF); return (error); } /* * Do both snap accounting and map accounting. */ static int fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype) struct vnode *vp; ufs2_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int exptype; /* BLK_SNAP or BLK_NOCOPY */ { int error; if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype))) return (error); return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)); } /* * Identify a set of blocks allocated in a snapshot inode. */ static int snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs2_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; /* BLK_SNAP or BLK_NOCOPY */ { struct inode *ip = VTOI(vp); ufs2_daddr_t blkno, *blkp; ufs_lbn_t lbn; struct buf *ibp; int error; for ( ; oldblkp < lastblkp; oldblkp++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP) continue; lbn = fragstoblks(fs, blkno); if (lbn < UFS_NDADDR) { blkp = &ip->i_din2->di_db[lbn]; ip->i_flag |= IN_CHANGE | IN_UPDATE; } else { error = ffs_balloc_ufs2(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) return (error); blkp = &((ufs2_daddr_t *)(ibp->b_data)) [(lbn - UFS_NDADDR) % NINDIR(fs)]; } /* * If we are expunging a snapshot vnode and we * find a block marked BLK_NOCOPY, then it is * one that has been allocated to this snapshot after * we took our current snapshot and can be ignored. */ if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) { if (lbn >= UFS_NDADDR) brelse(ibp); } else { if (*blkp != 0) panic("snapacct_ufs2: bad block"); *blkp = expungetype; if (lbn >= UFS_NDADDR) bdwrite(ibp); } } return (0); } /* * Account for a set of blocks allocated in a snapshot inode. */ static int mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype) struct vnode *vp; ufs2_daddr_t *oldblkp, *lastblkp; struct fs *fs; ufs_lbn_t lblkno; int expungetype; { ufs2_daddr_t blkno; struct inode *ip; ino_t inum; int acctit; ip = VTOI(vp); inum = ip->i_number; if (lblkno == -1) acctit = 0; else acctit = 1; for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) { blkno = *oldblkp; if (blkno == 0 || blkno == BLK_NOCOPY) continue; if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP) *ip->i_snapblklist++ = lblkno; if (blkno == BLK_SNAP) blkno = blkstofrags(fs, lblkno); ffs_blkfree(ITOUMP(ip), fs, vp, blkno, fs->fs_bsize, inum, vp->v_type, NULL); } return (0); } /* * Decrement extra reference on snapshot when last name is removed. * It will not be freed until the last open reference goes away. */ void ffs_snapgone(ip) struct inode *ip; { struct inode *xp; struct fs *fs; int snaploc; struct snapdata *sn; struct ufsmount *ump; /* * Find snapshot in incore list. */ xp = NULL; sn = ITODEVVP(ip)->v_rdev->si_snapdata; if (sn != NULL) TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) if (xp == ip) break; if (xp != NULL) vrele(ITOV(ip)); else if (snapdebug) printf("ffs_snapgone: lost snapshot vnode %ju\n", (uintmax_t)ip->i_number); /* * Delete snapshot inode from superblock. Keep list dense. */ ump = ITOUMP(ip); fs = ump->um_fs; UFS_LOCK(ump); for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) if (fs->fs_snapinum[snaploc] == ip->i_number) break; if (snaploc < FSMAXSNAP) { for (snaploc++; snaploc < FSMAXSNAP; snaploc++) { if (fs->fs_snapinum[snaploc] == 0) break; fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc]; } fs->fs_snapinum[snaploc - 1] = 0; } UFS_UNLOCK(ump); } /* * Prepare a snapshot file for being removed. */ void ffs_snapremove(vp) struct vnode *vp; { struct inode *ip; struct vnode *devvp; struct buf *ibp; struct fs *fs; ufs2_daddr_t numblks, blkno, dblk; int error, i, last, loc; struct snapdata *sn; ip = VTOI(vp); fs = ITOFS(ip); devvp = ITODEVVP(ip); /* * If active, delete from incore list (this snapshot may * already have been in the process of being deleted, so * would not have been active). * * Clear copy-on-write flag if last snapshot. */ VI_LOCK(devvp); if (ip->i_nextsnap.tqe_prev != 0) { sn = devvp->v_rdev->si_snapdata; TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap); ip->i_nextsnap.tqe_prev = 0; VI_UNLOCK(devvp); lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL); for (i = 0; i < sn->sn_lock.lk_recurse; i++) lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL); KASSERT(vp->v_vnlock == &sn->sn_lock, ("ffs_snapremove: lost lock mutation")); vp->v_vnlock = &vp->v_lock; VI_LOCK(devvp); while (sn->sn_lock.lk_recurse > 0) lockmgr(&sn->sn_lock, LK_RELEASE, NULL); lockmgr(&sn->sn_lock, LK_RELEASE, NULL); try_free_snapdata(devvp); } else VI_UNLOCK(devvp); /* * Clear all BLK_NOCOPY fields. Pass any block claims to other * snapshots that want them (see ffs_snapblkfree below). */ for (blkno = 1; blkno < UFS_NDADDR; blkno++) { dblk = DIP(ip, i_db[blkno]); if (dblk == 0) continue; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) DIP_SET(ip, i_db[blkno], 0); else if ((dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ITODEVVP(ip), dblk, fs->fs_bsize, ip->i_number, vp->v_type, NULL))) { DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - btodb(fs->fs_bsize)); DIP_SET(ip, i_db[blkno], 0); } } numblks = howmany(ip->i_size, fs->fs_bsize); for (blkno = UFS_NDADDR; blkno < numblks; blkno += NINDIR(fs)) { error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); if (error) continue; if (fs->fs_size - blkno > NINDIR(fs)) last = NINDIR(fs); else last = fs->fs_size - blkno; for (loc = 0; loc < last; loc++) { if (I_IS_UFS1(ip)) { dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc]; if (dblk == 0) continue; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; else if ((dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ITODEVVP(ip), dblk, fs->fs_bsize, ip->i_number, vp->v_type, NULL))) { ip->i_din1->di_blocks -= btodb(fs->fs_bsize); ((ufs1_daddr_t *)(ibp->b_data))[loc]= 0; } continue; } dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc]; if (dblk == 0) continue; if (dblk == BLK_NOCOPY || dblk == BLK_SNAP) ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; else if ((dblk == blkstofrags(fs, blkno) && ffs_snapblkfree(fs, ITODEVVP(ip), dblk, fs->fs_bsize, ip->i_number, vp->v_type, NULL))) { ip->i_din2->di_blocks -= btodb(fs->fs_bsize); ((ufs2_daddr_t *)(ibp->b_data))[loc] = 0; } } bawrite(ibp); } /* * Clear snapshot flag and drop reference. */ ip->i_flags &= ~SF_SNAPSHOT; DIP_SET(ip, i_flags, ip->i_flags); ip->i_flag |= IN_CHANGE | IN_UPDATE; /* * The dirtied indirects must be written out before * softdep_setup_freeblocks() is called. Otherwise indir_trunc() * may find indirect pointers using the magic BLK_* values. */ if (DOINGSOFTDEP(vp)) ffs_syncvnode(vp, MNT_WAIT, 0); #ifdef QUOTA /* * Reenable disk quotas for ex-snapshot file. */ if (!getinoquota(ip)) (void) chkdq(ip, DIP(ip, i_blocks), KERNCRED, FORCE); #endif } /* * Notification that a block is being freed. Return zero if the free * should be allowed to proceed. Return non-zero if the snapshot file * wants to claim the block. The block will be claimed if it is an * uncopied part of one of the snapshots. It will be freed if it is * either a BLK_NOCOPY or has already been copied in all of the snapshots. * If a fragment is being freed, then all snapshots that care about * it must make a copy since a snapshot file can only claim full sized * blocks. Note that if more than one snapshot file maps the block, * we can pick one at random to claim it. Since none of the snapshots * can change, we are assurred that they will all see the same unmodified * image. When deleting a snapshot file (see ffs_snapremove above), we * must push any of these claimed blocks to one of the other snapshots * that maps it. These claimed blocks are easily identified as they will * have a block number equal to their logical block number within the * snapshot. A copied block can never have this property because they * must always have been allocated from a BLK_NOCOPY location. */ int ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, wkhd) struct fs *fs; struct vnode *devvp; ufs2_daddr_t bno; long size; ino_t inum; enum vtype vtype; struct workhead *wkhd; { struct buf *ibp, *cbp, *savedcbp = NULL; struct thread *td = curthread; struct inode *ip; struct vnode *vp = NULL; ufs_lbn_t lbn; ufs2_daddr_t blkno; int indiroff = 0, error = 0, claimedblk = 0; struct snapdata *sn; lbn = fragstoblks(fs, bno); retry: VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL) { VI_UNLOCK(devvp); return (0); } if (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp)) != 0) goto retry; TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { vp = ITOV(ip); if (DOINGSOFTDEP(vp)) softdep_prealloc(vp, MNT_WAIT); /* * Lookup block being written. */ if (lbn < UFS_NDADDR) { blkno = DIP(ip, i_db[lbn]); } else { td->td_pflags |= TDP_COWINPROGRESS; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; indiroff = (lbn - UFS_NDADDR) % NINDIR(fs); if (I_IS_UFS1(ip)) blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; else blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; } /* * Check to see if block needs to be copied. */ if (blkno == 0) { /* * A block that we map is being freed. If it has not * been claimed yet, we will claim or copy it (below). */ claimedblk = 1; } else if (blkno == BLK_SNAP) { /* * No previous snapshot claimed the block, * so it will be freed and become a BLK_NOCOPY * (don't care) for us. */ if (claimedblk) panic("snapblkfree: inconsistent block type"); if (lbn < UFS_NDADDR) { DIP_SET(ip, i_db[lbn], BLK_NOCOPY); ip->i_flag |= IN_CHANGE | IN_UPDATE; } else if (I_IS_UFS1(ip)) { ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; bdwrite(ibp); } else { ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY; bdwrite(ibp); } continue; } else /* BLK_NOCOPY or default */ { /* * If the snapshot has already copied the block * (default), or does not care about the block, * it is not needed. */ if (lbn >= UFS_NDADDR) bqrelse(ibp); continue; } /* * If this is a full size block, we will just grab it * and assign it to the snapshot inode. Otherwise we * will proceed to copy it. See explanation for this * routine as to why only a single snapshot needs to * claim this block. */ if (size == fs->fs_bsize) { #ifdef DEBUG if (snapdebug) printf("%s %ju lbn %jd from inum %ju\n", "Grabonremove: snapino", (uintmax_t)ip->i_number, (intmax_t)lbn, (uintmax_t)inum); #endif /* * If journaling is tracking this write we must add * the work to the inode or indirect being written. */ if (wkhd != NULL) { if (lbn < UFS_NDADDR) softdep_inode_append(ip, curthread->td_ucred, wkhd); else softdep_buf_append(ibp, wkhd); } if (lbn < UFS_NDADDR) { DIP_SET(ip, i_db[lbn], bno); } else if (I_IS_UFS1(ip)) { ((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno; bdwrite(ibp); } else { ((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno; bdwrite(ibp); } DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size)); ip->i_flag |= IN_CHANGE | IN_UPDATE; lockmgr(vp->v_vnlock, LK_RELEASE, NULL); return (1); } if (lbn >= UFS_NDADDR) bqrelse(ibp); /* * Allocate the block into which to do the copy. Note that this * allocation will never require any additional allocations for * the snapshot inode. */ td->td_pflags |= TDP_COWINPROGRESS; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &cbp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; #ifdef DEBUG if (snapdebug) printf("%s%ju lbn %jd %s %ju size %ld to blkno %jd\n", "Copyonremove: snapino ", (uintmax_t)ip->i_number, (intmax_t)lbn, "for inum", (uintmax_t)inum, size, (intmax_t)cbp->b_blkno); #endif /* * If we have already read the old block contents, then * simply copy them to the new block. Note that we need * to synchronously write snapshots that have not been * unlinked, and hence will be visible after a crash, * to ensure their integrity. At a minimum we ensure the * integrity of the filesystem metadata, but use the * dopersistence sysctl-setable flag to decide on the * persistence needed for file content data. */ if (savedcbp != NULL) { bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); bawrite(cbp); if ((vtype == VDIR || dopersistence) && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); continue; } /* * Otherwise, read the old block contents into the buffer. */ if ((error = readblock(vp, cbp, lbn)) != 0) { bzero(cbp->b_data, fs->fs_bsize); bawrite(cbp); if ((vtype == VDIR || dopersistence) && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); break; } savedcbp = cbp; } /* * Note that we need to synchronously write snapshots that * have not been unlinked, and hence will be visible after * a crash, to ensure their integrity. At a minimum we * ensure the integrity of the filesystem metadata, but * use the dopersistence sysctl-setable flag to decide on * the persistence needed for file content data. */ if (savedcbp) { vp = savedcbp->b_vp; bawrite(savedcbp); if ((vtype == VDIR || dopersistence) && VTOI(vp)->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); } /* * If we have been unable to allocate a block in which to do * the copy, then return non-zero so that the fragment will * not be freed. Although space will be lost, the snapshot * will stay consistent. */ if (error != 0 && wkhd != NULL) softdep_freework(wkhd); lockmgr(&sn->sn_lock, LK_RELEASE, NULL); return (error); } /* * Associate snapshot files when mounting. */ void ffs_snapshot_mount(mp) struct mount *mp; { struct ufsmount *ump = VFSTOUFS(mp); struct vnode *devvp = ump->um_devvp; struct fs *fs = ump->um_fs; struct thread *td = curthread; struct snapdata *sn; struct vnode *vp; struct vnode *lastvp; struct inode *ip; struct uio auio; struct iovec aiov; void *snapblklist; char *reason; daddr_t snaplistsize; int error, snaploc, loc; /* * XXX The following needs to be set before ffs_truncate or * VOP_READ can be called. */ mp->mnt_stat.f_iosize = fs->fs_bsize; /* * Process each snapshot listed in the superblock. */ vp = NULL; lastvp = NULL; sn = NULL; for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) { if (fs->fs_snapinum[snaploc] == 0) break; if ((error = ffs_vget(mp, fs->fs_snapinum[snaploc], LK_EXCLUSIVE, &vp)) != 0){ printf("ffs_snapshot_mount: vget failed %d\n", error); continue; } ip = VTOI(vp); if (!IS_SNAPSHOT(ip) || ip->i_size == lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) { if (!IS_SNAPSHOT(ip)) { reason = "non-snapshot"; } else { reason = "old format snapshot"; (void)ffs_truncate(vp, (off_t)0, 0, NOCRED); (void)ffs_syncvnode(vp, MNT_WAIT, 0); } printf("ffs_snapshot_mount: %s inode %d\n", reason, fs->fs_snapinum[snaploc]); vput(vp); vp = NULL; for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) { if (fs->fs_snapinum[loc] == 0) break; fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc]; } fs->fs_snapinum[loc - 1] = 0; snaploc--; continue; } /* * Acquire a lock on the snapdata structure, creating it if * necessary. */ sn = ffs_snapdata_acquire(devvp); /* * Change vnode to use shared snapshot lock instead of the * original private lock. */ vp->v_vnlock = &sn->sn_lock; lockmgr(&vp->v_lock, LK_RELEASE, NULL); /* * Link it onto the active snapshot list. */ VI_LOCK(devvp); if (ip->i_nextsnap.tqe_prev != 0) panic("ffs_snapshot_mount: %ju already on list", (uintmax_t)ip->i_number); else TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap); vp->v_vflag |= VV_SYSTEM; VI_UNLOCK(devvp); VOP_UNLOCK(vp, 0); lastvp = vp; } vp = lastvp; /* * No usable snapshots found. */ if (sn == NULL || vp == NULL) return; /* * Allocate the space for the block hints list. We always want to * use the list from the newest snapshot. */ auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = (void *)&snaplistsize; aiov.iov_len = sizeof(snaplistsize); auio.uio_resid = aiov.iov_len; auio.uio_offset = lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)); auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { printf("ffs_snapshot_mount: read_1 failed %d\n", error); VOP_UNLOCK(vp, 0); return; } snapblklist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK); auio.uio_iovcnt = 1; aiov.iov_base = snapblklist; aiov.iov_len = snaplistsize * sizeof (daddr_t); auio.uio_resid = aiov.iov_len; auio.uio_offset -= sizeof(snaplistsize); if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) { printf("ffs_snapshot_mount: read_2 failed %d\n", error); VOP_UNLOCK(vp, 0); free(snapblklist, M_UFSMNT); return; } VOP_UNLOCK(vp, 0); VI_LOCK(devvp); ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount"); sn->sn_listsize = snaplistsize; sn->sn_blklist = (daddr_t *)snapblklist; devvp->v_vflag |= VV_COPYONWRITE; VI_UNLOCK(devvp); } /* * Disassociate snapshot files when unmounting. */ void ffs_snapshot_unmount(mp) struct mount *mp; { struct vnode *devvp = VFSTOUFS(mp)->um_devvp; struct snapdata *sn; struct inode *xp; struct vnode *vp; VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; while (sn != NULL && (xp = TAILQ_FIRST(&sn->sn_head)) != NULL) { vp = ITOV(xp); TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap); xp->i_nextsnap.tqe_prev = 0; lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE, VI_MTX(devvp)); lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL); KASSERT(vp->v_vnlock == &sn->sn_lock, ("ffs_snapshot_unmount: lost lock mutation")); vp->v_vnlock = &vp->v_lock; lockmgr(&vp->v_lock, LK_RELEASE, NULL); lockmgr(&sn->sn_lock, LK_RELEASE, NULL); if (xp->i_effnlink > 0) vrele(vp); VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; } try_free_snapdata(devvp); ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount"); } /* * Check the buffer block to be belong to device buffer that shall be * locked after snaplk. devvp shall be locked on entry, and will be * leaved locked upon exit. */ static int ffs_bp_snapblk(devvp, bp) struct vnode *devvp; struct buf *bp; { struct snapdata *sn; struct fs *fs; ufs2_daddr_t lbn, *snapblklist; int lower, upper, mid; ASSERT_VI_LOCKED(devvp, "ffs_bp_snapblk"); KASSERT(devvp->v_type == VCHR, ("Not a device %p", devvp)); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL) return (0); fs = ITOFS(TAILQ_FIRST(&sn->sn_head)); lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); snapblklist = sn->sn_blklist; upper = sn->sn_listsize - 1; lower = 1; while (lower <= upper) { mid = (lower + upper) / 2; if (snapblklist[mid] == lbn) break; if (snapblklist[mid] < lbn) lower = mid + 1; else upper = mid - 1; } if (lower <= upper) return (1); return (0); } void ffs_bdflush(bo, bp) struct bufobj *bo; struct buf *bp; { struct thread *td; struct vnode *vp, *devvp; struct buf *nbp; int bp_bdskip; if (bo->bo_dirty.bv_cnt <= dirtybufthresh) return; td = curthread; vp = bp->b_vp; devvp = bo2vnode(bo); KASSERT(vp == devvp, ("devvp != vp %p %p", bo, bp)); VI_LOCK(devvp); bp_bdskip = ffs_bp_snapblk(devvp, bp); if (bp_bdskip) bdwriteskip++; VI_UNLOCK(devvp); if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10 && !bp_bdskip) { (void) VOP_FSYNC(vp, MNT_NOWAIT, td); altbufferflushes++; } else { BO_LOCK(bo); /* * Try to find a buffer to flush. */ TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) { if ((nbp->b_vflags & BV_BKGRDINPROG) || BUF_LOCK(nbp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) continue; if (bp == nbp) panic("bdwrite: found ourselves"); BO_UNLOCK(bo); /* * Don't countdeps with the bo lock * held. */ if (buf_countdeps(nbp, 0)) { BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } if (bp_bdskip) { VI_LOCK(devvp); if (!ffs_bp_snapblk(vp, nbp)) { VI_UNLOCK(devvp); BO_LOCK(bo); BUF_UNLOCK(nbp); continue; } VI_UNLOCK(devvp); } if (nbp->b_flags & B_CLUSTEROK) { vfs_bio_awrite(nbp); } else { bremfree(nbp); bawrite(nbp); } dirtybufferflushes++; break; } if (nbp == NULL) BO_UNLOCK(bo); } } /* * Check for need to copy block that is about to be written, * copying the block if necessary. */ int ffs_copyonwrite(devvp, bp) struct vnode *devvp; struct buf *bp; { struct snapdata *sn; struct buf *ibp, *cbp, *savedcbp = NULL; struct thread *td = curthread; struct fs *fs; struct inode *ip; struct vnode *vp = NULL; ufs2_daddr_t lbn, blkno, *snapblklist; int lower, upper, mid, indiroff, error = 0; int launched_async_io, prev_norunningbuf; long saved_runningbufspace; if (devvp != bp->b_vp && IS_SNAPSHOT(VTOI(bp->b_vp))) return (0); /* Update on a snapshot file */ if (td->td_pflags & TDP_COWINPROGRESS) panic("ffs_copyonwrite: recursive call"); /* * First check to see if it is in the preallocated list. * By doing this check we avoid several potential deadlocks. */ VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_EMPTY(&sn->sn_head)) { VI_UNLOCK(devvp); return (0); /* No snapshot */ } ip = TAILQ_FIRST(&sn->sn_head); fs = ITOFS(ip); lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno)); snapblklist = sn->sn_blklist; upper = sn->sn_listsize - 1; lower = 1; while (lower <= upper) { mid = (lower + upper) / 2; if (snapblklist[mid] == lbn) break; if (snapblklist[mid] < lbn) lower = mid + 1; else upper = mid - 1; } if (lower <= upper) { VI_UNLOCK(devvp); return (0); } launched_async_io = 0; prev_norunningbuf = td->td_pflags & TDP_NORUNNINGBUF; /* * Since I/O on bp isn't yet in progress and it may be blocked * for a long time waiting on snaplk, back it out of * runningbufspace, possibly waking other threads waiting for space. */ saved_runningbufspace = bp->b_runningbufspace; if (saved_runningbufspace != 0) runningbufwakeup(bp); /* * Not in the precomputed list, so check the snapshots. */ while (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp)) != 0) { VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_EMPTY(&sn->sn_head)) { VI_UNLOCK(devvp); if (saved_runningbufspace != 0) { bp->b_runningbufspace = saved_runningbufspace; atomic_add_long(&runningbufspace, bp->b_runningbufspace); } return (0); /* Snapshot gone */ } } TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { vp = ITOV(ip); if (DOINGSOFTDEP(vp)) softdep_prealloc(vp, MNT_WAIT); /* * We ensure that everything of our own that needs to be * copied will be done at the time that ffs_snapshot is * called. Thus we can skip the check here which can * deadlock in doing the lookup in UFS_BALLOC. */ if (bp->b_vp == vp) continue; /* * Check to see if block needs to be copied. We do not have * to hold the snapshot lock while doing this lookup as it * will never require any additional allocations for the * snapshot inode. */ if (lbn < UFS_NDADDR) { blkno = DIP(ip, i_db[lbn]); } else { td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; indiroff = (lbn - UFS_NDADDR) % NINDIR(fs); if (I_IS_UFS1(ip)) blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff]; else blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff]; bqrelse(ibp); } #ifdef INVARIANTS if (blkno == BLK_SNAP && bp->b_lblkno >= 0) panic("ffs_copyonwrite: bad copy block"); #endif if (blkno != 0) continue; /* * Allocate the block into which to do the copy. Since * multiple processes may all try to copy the same block, * we have to recheck our need to do a copy if we sleep * waiting for the lock. * * Because all snapshots on a filesystem share a single * lock, we ensure that we will never be in competition * with another process to allocate a block. */ td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF; error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn), fs->fs_bsize, KERNCRED, 0, &cbp); td->td_pflags &= ~TDP_COWINPROGRESS; if (error) break; #ifdef DEBUG if (snapdebug) { printf("Copyonwrite: snapino %ju lbn %jd for ", (uintmax_t)ip->i_number, (intmax_t)lbn); if (bp->b_vp == devvp) printf("fs metadata"); else printf("inum %ju", (uintmax_t)VTOI(bp->b_vp)->i_number); printf(" lblkno %jd to blkno %jd\n", (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno); } #endif /* * If we have already read the old block contents, then * simply copy them to the new block. Note that we need * to synchronously write snapshots that have not been * unlinked, and hence will be visible after a crash, * to ensure their integrity. At a minimum we ensure the * integrity of the filesystem metadata, but use the * dopersistence sysctl-setable flag to decide on the * persistence needed for file content data. */ if (savedcbp != NULL) { bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize); bawrite(cbp); if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR || dopersistence) && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); else launched_async_io = 1; continue; } /* * Otherwise, read the old block contents into the buffer. */ if ((error = readblock(vp, cbp, lbn)) != 0) { bzero(cbp->b_data, fs->fs_bsize); bawrite(cbp); if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR || dopersistence) && ip->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); else launched_async_io = 1; break; } savedcbp = cbp; } /* * Note that we need to synchronously write snapshots that * have not been unlinked, and hence will be visible after * a crash, to ensure their integrity. At a minimum we * ensure the integrity of the filesystem metadata, but * use the dopersistence sysctl-setable flag to decide on * the persistence needed for file content data. */ if (savedcbp) { vp = savedcbp->b_vp; bawrite(savedcbp); if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR || dopersistence) && VTOI(vp)->i_effnlink > 0) (void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT); else launched_async_io = 1; } lockmgr(vp->v_vnlock, LK_RELEASE, NULL); td->td_pflags = (td->td_pflags & ~TDP_NORUNNINGBUF) | prev_norunningbuf; if (launched_async_io && (td->td_pflags & TDP_NORUNNINGBUF) == 0) waitrunningbufspace(); /* * I/O on bp will now be started, so count it in runningbufspace. */ if (saved_runningbufspace != 0) { bp->b_runningbufspace = saved_runningbufspace; atomic_add_long(&runningbufspace, bp->b_runningbufspace); } return (error); } /* * sync snapshots to force freework records waiting on snapshots to claim * blocks to free. */ void ffs_sync_snap(mp, waitfor) struct mount *mp; int waitfor; { struct snapdata *sn; struct vnode *devvp; struct vnode *vp; struct inode *ip; devvp = VFSTOUFS(mp)->um_devvp; if ((devvp->v_vflag & VV_COPYONWRITE) == 0) return; for (;;) { VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL) { VI_UNLOCK(devvp); return; } if (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp)) == 0) break; } TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) { vp = ITOV(ip); ffs_syncvnode(vp, waitfor, NO_INO_UPDT); } lockmgr(&sn->sn_lock, LK_RELEASE, NULL); } /* * Read the specified block into the given buffer. * Much of this boiler-plate comes from bwrite(). */ static int readblock(vp, bp, lbn) struct vnode *vp; struct buf *bp; ufs2_daddr_t lbn; { struct inode *ip; struct bio *bip; struct fs *fs; ip = VTOI(vp); fs = ITOFS(ip); bip = g_alloc_bio(); bip->bio_cmd = BIO_READ; bip->bio_offset = dbtob(fsbtodb(fs, blkstofrags(fs, lbn))); bip->bio_data = bp->b_data; bip->bio_length = bp->b_bcount; bip->bio_done = NULL; g_io_request(bip, ITODEVVP(ip)->v_bufobj.bo_private); bp->b_error = biowait(bip, "snaprdb"); g_destroy_bio(bip); return (bp->b_error); } #endif /* * Process file deletes that were deferred by ufs_inactive() due to * the file system being suspended. Transfer IN_LAZYACCESS into * IN_MODIFIED for vnodes that were accessed during suspension. */ void process_deferred_inactive(struct mount *mp) { struct vnode *vp, *mvp; struct inode *ip; struct thread *td; int error; td = curthread; (void) vn_start_secondary_write(NULL, &mp, V_WAIT); loop: MNT_VNODE_FOREACH_ALL(vp, mp, mvp) { /* * IN_LAZYACCESS is checked here without holding any * vnode lock, but this flag is set only while holding * vnode interlock. */ if (vp->v_type == VNON || ((VTOI(vp)->i_flag & IN_LAZYACCESS) == 0 && ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0))) { VI_UNLOCK(vp); continue; } vholdl(vp); error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK); if (error != 0) { vdrop(vp); if (error == ENOENT) continue; /* vnode recycled */ MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp); goto loop; } ip = VTOI(vp); if ((ip->i_flag & IN_LAZYACCESS) != 0) { ip->i_flag &= ~IN_LAZYACCESS; ip->i_flag |= IN_MODIFIED; } VI_LOCK(vp); if ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0) { VI_UNLOCK(vp); VOP_UNLOCK(vp, 0); vdrop(vp); continue; } vinactive(vp, td); VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp, ("process_deferred_inactive: got VI_OWEINACT")); VI_UNLOCK(vp); VOP_UNLOCK(vp, 0); vdrop(vp); } vn_finished_secondary_write(mp); } #ifndef NO_FFS_SNAPSHOT static struct snapdata * ffs_snapdata_alloc(void) { struct snapdata *sn; /* * Fetch a snapdata from the free list if there is one available. */ mtx_lock(&snapfree_lock); sn = LIST_FIRST(&snapfree); if (sn != NULL) LIST_REMOVE(sn, sn_link); mtx_unlock(&snapfree_lock); if (sn != NULL) return (sn); /* * If there were no free snapdatas allocate one. */ sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO); TAILQ_INIT(&sn->sn_head); lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT, LK_CANRECURSE | LK_NOSHARE); return (sn); } /* * The snapdata is never freed because we can not be certain that * there are no threads sleeping on the snap lock. Persisting * them permanently avoids costly synchronization in ffs_lock(). */ static void ffs_snapdata_free(struct snapdata *sn) { mtx_lock(&snapfree_lock); LIST_INSERT_HEAD(&snapfree, sn, sn_link); mtx_unlock(&snapfree_lock); } /* Try to free snapdata associated with devvp */ static void try_free_snapdata(struct vnode *devvp) { struct snapdata *sn; ufs2_daddr_t *snapblklist; ASSERT_VI_LOCKED(devvp, "try_free_snapdata"); sn = devvp->v_rdev->si_snapdata; if (sn == NULL || TAILQ_FIRST(&sn->sn_head) != NULL || (devvp->v_vflag & VV_COPYONWRITE) == 0) { VI_UNLOCK(devvp); return; } devvp->v_rdev->si_snapdata = NULL; devvp->v_vflag &= ~VV_COPYONWRITE; lockmgr(&sn->sn_lock, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp)); snapblklist = sn->sn_blklist; sn->sn_blklist = NULL; sn->sn_listsize = 0; lockmgr(&sn->sn_lock, LK_RELEASE, NULL); if (snapblklist != NULL) free(snapblklist, M_UFSMNT); ffs_snapdata_free(sn); } static struct snapdata * ffs_snapdata_acquire(struct vnode *devvp) { struct snapdata *nsn, *sn; int error; /* * Allocate a free snapdata. This is done before acquiring the * devvp lock to avoid allocation while the devvp interlock is * held. */ nsn = ffs_snapdata_alloc(); for (;;) { VI_LOCK(devvp); sn = devvp->v_rdev->si_snapdata; if (sn == NULL) { /* * This is the first snapshot on this * filesystem and we use our pre-allocated * snapdata. Publish sn with the sn_lock * owned by us, to avoid the race. */ error = lockmgr(&nsn->sn_lock, LK_EXCLUSIVE | LK_NOWAIT, NULL); if (error != 0) panic("leaked sn, lockmgr error %d", error); sn = devvp->v_rdev->si_snapdata = nsn; VI_UNLOCK(devvp); nsn = NULL; break; } /* * There is a snapshots which already exists on this * filesystem, grab a reference to the common lock. */ error = lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL, VI_MTX(devvp)); if (error == 0) break; } /* * Free any unused snapdata. */ if (nsn != NULL) ffs_snapdata_free(nsn); return (sn); } #endif Index: head/sys/x86/iommu/intel_dmar.h =================================================================== --- head/sys/x86/iommu/intel_dmar.h (revision 336913) +++ head/sys/x86/iommu/intel_dmar.h (revision 336914) @@ -1,557 +1,556 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2013-2015 The FreeBSD Foundation * All rights reserved. * * This software was developed by Konstantin Belousov * under sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __X86_IOMMU_INTEL_DMAR_H #define __X86_IOMMU_INTEL_DMAR_H /* Host or physical memory address, after translation. */ typedef uint64_t dmar_haddr_t; /* Guest or bus address, before translation. */ typedef uint64_t dmar_gaddr_t; struct dmar_qi_genseq { u_int gen; uint32_t seq; }; struct dmar_map_entry { dmar_gaddr_t start; dmar_gaddr_t end; dmar_gaddr_t free_after; /* Free space after the entry */ dmar_gaddr_t free_down; /* Max free space below the current R/B tree node */ u_int flags; TAILQ_ENTRY(dmar_map_entry) dmamap_link; /* Link for dmamap entries */ RB_ENTRY(dmar_map_entry) rb_entry; /* Links for domain entries */ TAILQ_ENTRY(dmar_map_entry) unroll_link; /* Link for unroll after dmamap_load failure */ struct dmar_domain *domain; struct dmar_qi_genseq gseq; }; RB_HEAD(dmar_gas_entries_tree, dmar_map_entry); RB_PROTOTYPE(dmar_gas_entries_tree, dmar_map_entry, rb_entry, dmar_gas_cmp_entries); #define DMAR_MAP_ENTRY_PLACE 0x0001 /* Fake entry */ #define DMAR_MAP_ENTRY_RMRR 0x0002 /* Permanent, not linked by dmamap_link */ #define DMAR_MAP_ENTRY_MAP 0x0004 /* Busdma created, linked by dmamap_link */ #define DMAR_MAP_ENTRY_UNMAPPED 0x0010 /* No backing pages */ #define DMAR_MAP_ENTRY_QI_NF 0x0020 /* qi task, do not free entry */ #define DMAR_MAP_ENTRY_READ 0x1000 /* Read permitted */ #define DMAR_MAP_ENTRY_WRITE 0x2000 /* Write permitted */ #define DMAR_MAP_ENTRY_SNOOP 0x4000 /* Snoop */ #define DMAR_MAP_ENTRY_TM 0x8000 /* Transient */ /* * Locking annotations: * (u) - Protected by dmar unit lock * (d) - Protected by domain lock * (c) - Immutable after initialization */ /* * The domain abstraction. Most non-constant members of the domain * are protected by owning dmar unit lock, not by the domain lock. * Most important, the dmar lock protects the contexts list. * * The domain lock protects the address map for the domain, and list * of unload entries delayed. * * Page tables pages and pages content is protected by the vm object * lock pgtbl_obj, which contains the page tables pages. */ struct dmar_domain { int domain; /* (c) DID, written in context entry */ int mgaw; /* (c) Real max address width */ int agaw; /* (c) Adjusted guest address width */ int pglvl; /* (c) The pagelevel */ int awlvl; /* (c) The pagelevel as the bitmask, to set in context entry */ dmar_gaddr_t end; /* (c) Highest address + 1 in the guest AS */ u_int ctx_cnt; /* (u) Number of contexts owned */ u_int refs; /* (u) Refs, including ctx */ struct dmar_unit *dmar; /* (c) */ struct mtx lock; /* (c) */ LIST_ENTRY(dmar_domain) link; /* (u) Member in the dmar list */ LIST_HEAD(, dmar_ctx) contexts; /* (u) */ vm_object_t pgtbl_obj; /* (c) Page table pages */ u_int flags; /* (u) */ u_int entries_cnt; /* (d) */ struct dmar_gas_entries_tree rb_root; /* (d) */ struct dmar_map_entries_tailq unload_entries; /* (d) Entries to unload */ struct dmar_map_entry *first_place, *last_place; /* (d) */ struct task unload_task; /* (c) */ u_int batch_no; }; struct dmar_ctx { struct bus_dma_tag_dmar ctx_tag; /* (c) Root tag */ uint16_t rid; /* (c) pci RID */ uint64_t last_fault_rec[2]; /* Last fault reported */ struct dmar_domain *domain; /* (c) */ LIST_ENTRY(dmar_ctx) link; /* (u) Member in the domain list */ u_int refs; /* (u) References from tags */ u_int flags; /* (u) */ u_long loads; /* atomic updates, for stat only */ u_long unloads; /* same */ }; #define DMAR_DOMAIN_GAS_INITED 0x0001 #define DMAR_DOMAIN_PGTBL_INITED 0x0002 #define DMAR_DOMAIN_IDMAP 0x0010 /* Domain uses identity page table */ #define DMAR_DOMAIN_RMRR 0x0020 /* Domain contains RMRR entry, cannot be turned off */ /* struct dmar_ctx flags */ #define DMAR_CTX_FAULTED 0x0001 /* Fault was reported, last_fault_rec is valid */ #define DMAR_CTX_DISABLED 0x0002 /* Device is disabled, the ephemeral reference is kept to prevent context destruction */ #define DMAR_DOMAIN_PGLOCK(dom) VM_OBJECT_WLOCK((dom)->pgtbl_obj) #define DMAR_DOMAIN_PGTRYLOCK(dom) VM_OBJECT_TRYWLOCK((dom)->pgtbl_obj) #define DMAR_DOMAIN_PGUNLOCK(dom) VM_OBJECT_WUNLOCK((dom)->pgtbl_obj) #define DMAR_DOMAIN_ASSERT_PGLOCKED(dom) \ VM_OBJECT_ASSERT_WLOCKED((dom)->pgtbl_obj) #define DMAR_DOMAIN_LOCK(dom) mtx_lock(&(dom)->lock) #define DMAR_DOMAIN_UNLOCK(dom) mtx_unlock(&(dom)->lock) #define DMAR_DOMAIN_ASSERT_LOCKED(dom) mtx_assert(&(dom)->lock, MA_OWNED) struct dmar_msi_data { int irq; int irq_rid; struct resource *irq_res; void *intr_handle; int (*handler)(void *); int msi_data_reg; int msi_addr_reg; int msi_uaddr_reg; void (*enable_intr)(struct dmar_unit *); void (*disable_intr)(struct dmar_unit *); const char *name; }; #define DMAR_INTR_FAULT 0 #define DMAR_INTR_QI 1 #define DMAR_INTR_TOTAL 2 struct dmar_unit { device_t dev; int unit; uint16_t segment; uint64_t base; /* Resources */ int reg_rid; struct resource *regs; struct dmar_msi_data intrs[DMAR_INTR_TOTAL]; /* Hardware registers cache */ uint32_t hw_ver; uint64_t hw_cap; uint64_t hw_ecap; uint32_t hw_gcmd; /* Data for being a dmar */ struct mtx lock; LIST_HEAD(, dmar_domain) domains; struct unrhdr *domids; vm_object_t ctx_obj; u_int barrier_flags; /* Fault handler data */ struct mtx fault_lock; uint64_t *fault_log; int fault_log_head; int fault_log_tail; int fault_log_size; struct task fault_task; struct taskqueue *fault_taskqueue; /* QI */ int qi_enabled; vm_offset_t inv_queue; vm_size_t inv_queue_size; uint32_t inv_queue_avail; uint32_t inv_queue_tail; volatile uint32_t inv_waitd_seq_hw; /* hw writes there on wait descr completion */ uint64_t inv_waitd_seq_hw_phys; uint32_t inv_waitd_seq; /* next sequence number to use for wait descr */ u_int inv_waitd_gen; /* seq number generation AKA seq overflows */ u_int inv_seq_waiters; /* count of waiters for seq */ u_int inv_queue_full; /* informational counter */ /* IR */ int ir_enabled; vm_paddr_t irt_phys; dmar_irte_t *irt; u_int irte_cnt; vmem_t *irtids; /* Delayed freeing of map entries queue processing */ struct dmar_map_entries_tailq tlb_flush_entries; struct task qi_task; struct taskqueue *qi_taskqueue; /* Busdma delayed map load */ struct task dmamap_load_task; TAILQ_HEAD(, bus_dmamap_dmar) delayed_maps; struct taskqueue *delayed_taskqueue; int dma_enabled; }; #define DMAR_LOCK(dmar) mtx_lock(&(dmar)->lock) #define DMAR_UNLOCK(dmar) mtx_unlock(&(dmar)->lock) #define DMAR_ASSERT_LOCKED(dmar) mtx_assert(&(dmar)->lock, MA_OWNED) #define DMAR_FAULT_LOCK(dmar) mtx_lock_spin(&(dmar)->fault_lock) #define DMAR_FAULT_UNLOCK(dmar) mtx_unlock_spin(&(dmar)->fault_lock) #define DMAR_FAULT_ASSERT_LOCKED(dmar) mtx_assert(&(dmar)->fault_lock, MA_OWNED) #define DMAR_IS_COHERENT(dmar) (((dmar)->hw_ecap & DMAR_ECAP_C) != 0) #define DMAR_HAS_QI(dmar) (((dmar)->hw_ecap & DMAR_ECAP_QI) != 0) #define DMAR_X2APIC(dmar) \ (x2apic_mode && ((dmar)->hw_ecap & DMAR_ECAP_EIM) != 0) /* Barrier ids */ #define DMAR_BARRIER_RMRR 0 #define DMAR_BARRIER_USEQ 1 struct dmar_unit *dmar_find(device_t dev); struct dmar_unit *dmar_find_hpet(device_t dev, uint16_t *rid); struct dmar_unit *dmar_find_ioapic(u_int apic_id, uint16_t *rid); u_int dmar_nd2mask(u_int nd); bool dmar_pglvl_supported(struct dmar_unit *unit, int pglvl); int domain_set_agaw(struct dmar_domain *domain, int mgaw); int dmar_maxaddr2mgaw(struct dmar_unit *unit, dmar_gaddr_t maxaddr, bool allow_less); vm_pindex_t pglvl_max_pages(int pglvl); int domain_is_sp_lvl(struct dmar_domain *domain, int lvl); dmar_gaddr_t pglvl_page_size(int total_pglvl, int lvl); dmar_gaddr_t domain_page_size(struct dmar_domain *domain, int lvl); int calc_am(struct dmar_unit *unit, dmar_gaddr_t base, dmar_gaddr_t size, dmar_gaddr_t *isizep); struct vm_page *dmar_pgalloc(vm_object_t obj, vm_pindex_t idx, int flags); void dmar_pgfree(vm_object_t obj, vm_pindex_t idx, int flags); void *dmar_map_pgtbl(vm_object_t obj, vm_pindex_t idx, int flags, struct sf_buf **sf); void dmar_unmap_pgtbl(struct sf_buf *sf); int dmar_load_root_entry_ptr(struct dmar_unit *unit); int dmar_inv_ctx_glob(struct dmar_unit *unit); int dmar_inv_iotlb_glob(struct dmar_unit *unit); int dmar_flush_write_bufs(struct dmar_unit *unit); void dmar_flush_pte_to_ram(struct dmar_unit *unit, dmar_pte_t *dst); void dmar_flush_ctx_to_ram(struct dmar_unit *unit, dmar_ctx_entry_t *dst); void dmar_flush_root_to_ram(struct dmar_unit *unit, dmar_root_entry_t *dst); int dmar_enable_translation(struct dmar_unit *unit); int dmar_disable_translation(struct dmar_unit *unit); int dmar_load_irt_ptr(struct dmar_unit *unit); int dmar_enable_ir(struct dmar_unit *unit); int dmar_disable_ir(struct dmar_unit *unit); bool dmar_barrier_enter(struct dmar_unit *dmar, u_int barrier_id); void dmar_barrier_exit(struct dmar_unit *dmar, u_int barrier_id); uint64_t dmar_get_timeout(void); void dmar_update_timeout(uint64_t newval); int dmar_fault_intr(void *arg); void dmar_enable_fault_intr(struct dmar_unit *unit); void dmar_disable_fault_intr(struct dmar_unit *unit); int dmar_init_fault_log(struct dmar_unit *unit); void dmar_fini_fault_log(struct dmar_unit *unit); int dmar_qi_intr(void *arg); void dmar_enable_qi_intr(struct dmar_unit *unit); void dmar_disable_qi_intr(struct dmar_unit *unit); int dmar_init_qi(struct dmar_unit *unit); void dmar_fini_qi(struct dmar_unit *unit); void dmar_qi_invalidate_locked(struct dmar_domain *domain, dmar_gaddr_t start, dmar_gaddr_t size, struct dmar_qi_genseq *psec, bool emit_wait); void dmar_qi_invalidate_ctx_glob_locked(struct dmar_unit *unit); void dmar_qi_invalidate_iotlb_glob_locked(struct dmar_unit *unit); void dmar_qi_invalidate_iec_glob(struct dmar_unit *unit); void dmar_qi_invalidate_iec(struct dmar_unit *unit, u_int start, u_int cnt); vm_object_t domain_get_idmap_pgtbl(struct dmar_domain *domain, dmar_gaddr_t maxaddr); void put_idmap_pgtbl(vm_object_t obj); int domain_map_buf(struct dmar_domain *domain, dmar_gaddr_t base, dmar_gaddr_t size, vm_page_t *ma, uint64_t pflags, int flags); int domain_unmap_buf(struct dmar_domain *domain, dmar_gaddr_t base, dmar_gaddr_t size, int flags); void domain_flush_iotlb_sync(struct dmar_domain *domain, dmar_gaddr_t base, dmar_gaddr_t size); int domain_alloc_pgtbl(struct dmar_domain *domain); void domain_free_pgtbl(struct dmar_domain *domain); struct dmar_ctx *dmar_instantiate_ctx(struct dmar_unit *dmar, device_t dev, bool rmrr); struct dmar_ctx *dmar_get_ctx_for_dev(struct dmar_unit *dmar, device_t dev, uint16_t rid, bool id_mapped, bool rmrr_init); int dmar_move_ctx_to_domain(struct dmar_domain *domain, struct dmar_ctx *ctx); void dmar_free_ctx_locked(struct dmar_unit *dmar, struct dmar_ctx *ctx); void dmar_free_ctx(struct dmar_ctx *ctx); struct dmar_ctx *dmar_find_ctx_locked(struct dmar_unit *dmar, uint16_t rid); void dmar_domain_unload_entry(struct dmar_map_entry *entry, bool free); void dmar_domain_unload(struct dmar_domain *domain, struct dmar_map_entries_tailq *entries, bool cansleep); void dmar_domain_free_entry(struct dmar_map_entry *entry, bool free); int dmar_init_busdma(struct dmar_unit *unit); void dmar_fini_busdma(struct dmar_unit *unit); device_t dmar_get_requester(device_t dev, uint16_t *rid); void dmar_gas_init_domain(struct dmar_domain *domain); void dmar_gas_fini_domain(struct dmar_domain *domain); struct dmar_map_entry *dmar_gas_alloc_entry(struct dmar_domain *domain, u_int flags); void dmar_gas_free_entry(struct dmar_domain *domain, struct dmar_map_entry *entry); void dmar_gas_free_space(struct dmar_domain *domain, struct dmar_map_entry *entry); int dmar_gas_map(struct dmar_domain *domain, const struct bus_dma_tag_common *common, dmar_gaddr_t size, int offset, u_int eflags, u_int flags, vm_page_t *ma, struct dmar_map_entry **res); void dmar_gas_free_region(struct dmar_domain *domain, struct dmar_map_entry *entry); int dmar_gas_map_region(struct dmar_domain *domain, struct dmar_map_entry *entry, u_int eflags, u_int flags, vm_page_t *ma); int dmar_gas_reserve_region(struct dmar_domain *domain, dmar_gaddr_t start, dmar_gaddr_t end); void dmar_dev_parse_rmrr(struct dmar_domain *domain, device_t dev, struct dmar_map_entries_tailq *rmrr_entries); int dmar_instantiate_rmrr_ctxs(struct dmar_unit *dmar); void dmar_quirks_post_ident(struct dmar_unit *dmar); void dmar_quirks_pre_use(struct dmar_unit *dmar); int dmar_init_irt(struct dmar_unit *unit); void dmar_fini_irt(struct dmar_unit *unit); #define DMAR_GM_CANWAIT 0x0001 #define DMAR_GM_CANSPLIT 0x0002 #define DMAR_PGF_WAITOK 0x0001 #define DMAR_PGF_ZERO 0x0002 #define DMAR_PGF_ALLOC 0x0004 #define DMAR_PGF_NOALLOC 0x0008 #define DMAR_PGF_OBJL 0x0010 extern dmar_haddr_t dmar_high; extern int haw; extern int dmar_tbl_pagecnt; extern int dmar_match_verbose; extern int dmar_batch_coalesce; extern int dmar_check_free; static inline uint32_t dmar_read4(const struct dmar_unit *unit, int reg) { return (bus_read_4(unit->regs, reg)); } static inline uint64_t dmar_read8(const struct dmar_unit *unit, int reg) { #ifdef __i386__ uint32_t high, low; low = bus_read_4(unit->regs, reg); high = bus_read_4(unit->regs, reg + 4); return (low | ((uint64_t)high << 32)); #else return (bus_read_8(unit->regs, reg)); #endif } static inline void dmar_write4(const struct dmar_unit *unit, int reg, uint32_t val) { KASSERT(reg != DMAR_GCMD_REG || (val & DMAR_GCMD_TE) == (unit->hw_gcmd & DMAR_GCMD_TE), ("dmar%d clearing TE 0x%08x 0x%08x", unit->unit, unit->hw_gcmd, val)); bus_write_4(unit->regs, reg, val); } static inline void dmar_write8(const struct dmar_unit *unit, int reg, uint64_t val) { KASSERT(reg != DMAR_GCMD_REG, ("8byte GCMD write")); #ifdef __i386__ uint32_t high, low; low = val; high = val >> 32; bus_write_4(unit->regs, reg, low); bus_write_4(unit->regs, reg + 4, high); #else bus_write_8(unit->regs, reg, val); #endif } /* * dmar_pte_store and dmar_pte_clear ensure that on i386, 32bit writes * are issued in the correct order. For store, the lower word, * containing the P or R and W bits, is set only after the high word * is written. For clear, the P bit is cleared first, then the high * word is cleared. * * dmar_pte_update updates the pte. For amd64, the update is atomic. * For i386, it first disables the entry by clearing the word * containing the P bit, and then defer to dmar_pte_store. The locked * cmpxchg8b is probably available on any machine having DMAR support, * but interrupt translation table may be mapped uncached. */ static inline void dmar_pte_store1(volatile uint64_t *dst, uint64_t val) { #ifdef __i386__ volatile uint32_t *p; uint32_t hi, lo; hi = val >> 32; lo = val; p = (volatile uint32_t *)dst; *(p + 1) = hi; *p = lo; #else *dst = val; #endif } static inline void dmar_pte_store(volatile uint64_t *dst, uint64_t val) { KASSERT(*dst == 0, ("used pte %p oldval %jx newval %jx", dst, (uintmax_t)*dst, (uintmax_t)val)); dmar_pte_store1(dst, val); } static inline void dmar_pte_update(volatile uint64_t *dst, uint64_t val) { #ifdef __i386__ volatile uint32_t *p; p = (volatile uint32_t *)dst; *p = 0; #endif dmar_pte_store1(dst, val); } static inline void dmar_pte_clear(volatile uint64_t *dst) { #ifdef __i386__ volatile uint32_t *p; p = (volatile uint32_t *)dst; *p = 0; *(p + 1) = 0; #else *dst = 0; #endif } static inline bool dmar_test_boundary(dmar_gaddr_t start, dmar_gaddr_t size, dmar_gaddr_t boundary) { if (boundary == 0) return (true); return (start + size <= ((start + boundary) & ~(boundary - 1))); } extern struct timespec dmar_hw_timeout; #define DMAR_WAIT_UNTIL(cond) \ { \ struct timespec last, curr; \ bool forever; \ \ if (dmar_hw_timeout.tv_sec == 0 && \ dmar_hw_timeout.tv_nsec == 0) { \ forever = true; \ } else { \ forever = false; \ nanouptime(&curr); \ - last = curr; \ - timespecadd(&last, &dmar_hw_timeout); \ + timespecadd(&curr, &dmar_hw_timeout, &last); \ } \ for (;;) { \ if (cond) { \ error = 0; \ break; \ } \ nanouptime(&curr); \ if (!forever && timespeccmp(&last, &curr, <)) { \ error = ETIMEDOUT; \ break; \ } \ cpu_spinwait(); \ } \ } #ifdef INVARIANTS #define TD_PREP_PINNED_ASSERT \ int old_td_pinned; \ old_td_pinned = curthread->td_pinned #define TD_PINNED_ASSERT \ KASSERT(curthread->td_pinned == old_td_pinned, \ ("pin count leak: %d %d %s:%d", curthread->td_pinned, \ old_td_pinned, __FILE__, __LINE__)) #else #define TD_PREP_PINNED_ASSERT #define TD_PINNED_ASSERT #endif #endif Index: head/tools/regression/posixsem/posixsem.c =================================================================== --- head/tools/regression/posixsem/posixsem.c (revision 336913) +++ head/tools/regression/posixsem/posixsem.c (revision 336914) @@ -1,1443 +1,1414 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2008 Yahoo!, Inc. * All rights reserved. * Written by: John Baldwin * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the author nor the names of any co-contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "test.h" -/* Cut and pasted from kernel header, bah! */ - -/* Operations on timespecs */ -#define timespecclear(tvp) ((tvp)->tv_sec = (tvp)->tv_nsec = 0) -#define timespecisset(tvp) ((tvp)->tv_sec || (tvp)->tv_nsec) -#define timespeccmp(tvp, uvp, cmp) \ - (((tvp)->tv_sec == (uvp)->tv_sec) ? \ - ((tvp)->tv_nsec cmp (uvp)->tv_nsec) : \ - ((tvp)->tv_sec cmp (uvp)->tv_sec)) -#define timespecadd(vvp, uvp) \ - do { \ - (vvp)->tv_sec += (uvp)->tv_sec; \ - (vvp)->tv_nsec += (uvp)->tv_nsec; \ - if ((vvp)->tv_nsec >= 1000000000) { \ - (vvp)->tv_sec++; \ - (vvp)->tv_nsec -= 1000000000; \ - } \ - } while (0) -#define timespecsub(vvp, uvp) \ - do { \ - (vvp)->tv_sec -= (uvp)->tv_sec; \ - (vvp)->tv_nsec -= (uvp)->tv_nsec; \ - if ((vvp)->tv_nsec < 0) { \ - (vvp)->tv_sec--; \ - (vvp)->tv_nsec += 1000000000; \ - } \ - } while (0) - - #define TEST_PATH "/tmp/posixsem_regression_test" #define ELAPSED(elapsed, limit) (abs((elapsed) - (limit)) < 100) /* Macros for passing child status to parent over a pipe. */ #define CSTAT(class, error) ((class) << 16 | (error)) #define CSTAT_CLASS(stat) ((stat) >> 16) #define CSTAT_ERROR(stat) ((stat) & 0xffff) /* * Helper routine for tests that use a child process. This routine * creates a pipe and forks a child process. The child process runs * the 'func' routine which returns a status integer. The status * integer gets written over the pipe to the parent and returned in * '*stat'. If there is an error in pipe(), fork(), or wait() this * returns -1 and fails the test. */ static int child_worker(int (*func)(void *arg), void *arg, int *stat) { pid_t pid; int pfd[2], cstat; if (pipe(pfd) < 0) { fail_errno("pipe"); return (-1); } pid = fork(); switch (pid) { case -1: /* Error. */ fail_errno("fork"); close(pfd[0]); close(pfd[1]); return (-1); case 0: /* Child. */ cstat = func(arg); write(pfd[1], &cstat, sizeof(cstat)); exit(0); } if (read(pfd[0], stat, sizeof(*stat)) < 0) { fail_errno("read(pipe)"); close(pfd[0]); close(pfd[1]); return (-1); } if (waitpid(pid, NULL, 0) < 0) { fail_errno("wait"); close(pfd[0]); close(pfd[1]); return (-1); } close(pfd[0]); close(pfd[1]); return (0); } /* * Attempt a ksem_open() that should fail with an expected error of * 'error'. */ static void ksem_open_should_fail(const char *path, int flags, mode_t mode, unsigned int value, int error) { semid_t id; if (ksem_open(&id, path, flags, mode, value) >= 0) { fail_err("ksem_open() didn't fail"); ksem_close(id); return; } if (errno != error) { fail_errno("ksem_open"); return; } pass(); } /* * Attempt a ksem_unlink() that should fail with an expected error of * 'error'. */ static void ksem_unlink_should_fail(const char *path, int error) { if (ksem_unlink(path) >= 0) { fail_err("ksem_unlink() didn't fail"); return; } if (errno != error) { fail_errno("ksem_unlink"); return; } pass(); } /* * Attempt a ksem_close() that should fail with an expected error of * 'error'. */ static void ksem_close_should_fail(semid_t id, int error) { if (ksem_close(id) >= 0) { fail_err("ksem_close() didn't fail"); return; } if (errno != error) { fail_errno("ksem_close"); return; } pass(); } /* * Attempt a ksem_init() that should fail with an expected error of * 'error'. */ static void ksem_init_should_fail(unsigned int value, int error) { semid_t id; if (ksem_init(&id, value) >= 0) { fail_err("ksem_init() didn't fail"); ksem_destroy(id); return; } if (errno != error) { fail_errno("ksem_init"); return; } pass(); } /* * Attempt a ksem_destroy() that should fail with an expected error of * 'error'. */ static void ksem_destroy_should_fail(semid_t id, int error) { if (ksem_destroy(id) >= 0) { fail_err("ksem_destroy() didn't fail"); return; } if (errno != error) { fail_errno("ksem_destroy"); return; } pass(); } /* * Attempt a ksem_post() that should fail with an expected error of * 'error'. */ static void ksem_post_should_fail(semid_t id, int error) { if (ksem_post(id) >= 0) { fail_err("ksem_post() didn't fail"); return; } if (errno != error) { fail_errno("ksem_post"); return; } pass(); } static void open_after_unlink(void) { semid_t id; if (ksem_open(&id, TEST_PATH, O_CREAT, 0777, 1) < 0) { fail_errno("ksem_open(1)"); return; } ksem_close(id); if (ksem_unlink(TEST_PATH) < 0) { fail_errno("ksem_unlink"); return; } ksem_open_should_fail(TEST_PATH, O_RDONLY, 0777, 1, ENOENT); } TEST(open_after_unlink, "open after unlink"); static void open_invalid_path(void) { ksem_open_should_fail("blah", 0, 0777, 1, EINVAL); } TEST(open_invalid_path, "open invalid path"); static void open_extra_flags(void) { ksem_open_should_fail(TEST_PATH, O_RDONLY | O_DIRECT, 0777, 1, EINVAL); } TEST(open_extra_flags, "open with extra flags"); static void open_bad_value(void) { (void)ksem_unlink(TEST_PATH); ksem_open_should_fail(TEST_PATH, O_CREAT, 0777, UINT_MAX, EINVAL); } TEST(open_bad_value, "open with invalid initial value"); static void open_bad_path_pointer(void) { ksem_open_should_fail((char *)1024, O_RDONLY, 0777, 1, EFAULT); } TEST(open_bad_path_pointer, "open bad path pointer"); static void open_path_too_long(void) { char *page; page = malloc(MAXPATHLEN + 1); memset(page, 'a', MAXPATHLEN); page[MAXPATHLEN] = '\0'; ksem_open_should_fail(page, O_RDONLY, 0777, 1, ENAMETOOLONG); free(page); } TEST(open_path_too_long, "open pathname too long"); static void open_nonexisting_semaphore(void) { ksem_open_should_fail("/notreallythere", 0, 0777, 1, ENOENT); } TEST(open_nonexisting_semaphore, "open nonexistent semaphore"); static void exclusive_create_existing_semaphore(void) { semid_t id; if (ksem_open(&id, TEST_PATH, O_CREAT, 0777, 1) < 0) { fail_errno("ksem_open(O_CREAT)"); return; } ksem_close(id); ksem_open_should_fail(TEST_PATH, O_CREAT | O_EXCL, 0777, 1, EEXIST); ksem_unlink(TEST_PATH); } TEST(exclusive_create_existing_semaphore, "O_EXCL of existing semaphore"); static void init_bad_value(void) { ksem_init_should_fail(UINT_MAX, EINVAL); } TEST(init_bad_value, "init with invalid initial value"); static void unlink_bad_path_pointer(void) { ksem_unlink_should_fail((char *)1024, EFAULT); } TEST(unlink_bad_path_pointer, "unlink bad path pointer"); static void unlink_path_too_long(void) { char *page; page = malloc(MAXPATHLEN + 1); memset(page, 'a', MAXPATHLEN); page[MAXPATHLEN] = '\0'; ksem_unlink_should_fail(page, ENAMETOOLONG); free(page); } TEST(unlink_path_too_long, "unlink pathname too long"); static void destroy_named_semaphore(void) { semid_t id; if (ksem_open(&id, TEST_PATH, O_CREAT, 0777, 1) < 0) { fail_errno("ksem_open(O_CREAT)"); return; } ksem_destroy_should_fail(id, EINVAL); ksem_close(id); ksem_unlink(TEST_PATH); } TEST(destroy_named_semaphore, "destroy named semaphore"); static void close_unnamed_semaphore(void) { semid_t id; if (ksem_init(&id, 1) < 0) { fail_errno("ksem_init"); return; } ksem_close_should_fail(id, EINVAL); ksem_destroy(id); } TEST(close_unnamed_semaphore, "close unnamed semaphore"); static void destroy_invalid_fd(void) { ksem_destroy_should_fail(STDERR_FILENO, EINVAL); } TEST(destroy_invalid_fd, "destroy non-semaphore file descriptor"); static void close_invalid_fd(void) { ksem_close_should_fail(STDERR_FILENO, EINVAL); } TEST(close_invalid_fd, "close non-semaphore file descriptor"); static void create_unnamed_semaphore(void) { semid_t id; if (ksem_init(&id, 1) < 0) { fail_errno("ksem_init"); return; } if (ksem_destroy(id) < 0) { fail_errno("ksem_destroy"); return; } pass(); } TEST(create_unnamed_semaphore, "create unnamed semaphore"); static void open_named_semaphore(void) { semid_t id; if (ksem_open(&id, TEST_PATH, O_CREAT, 0777, 1) < 0) { fail_errno("ksem_open(O_CREAT)"); return; } if (ksem_close(id) < 0) { fail_errno("ksem_close"); return; } if (ksem_unlink(TEST_PATH) < 0) { fail_errno("ksem_unlink"); return; } pass(); } TEST(open_named_semaphore, "create named semaphore"); static void getvalue_invalid_semaphore(void) { int val; if (ksem_getvalue(STDERR_FILENO, &val) >= 0) { fail_err("ksem_getvalue() didn't fail"); return; } if (errno != EINVAL) { fail_errno("ksem_getvalue"); return; } pass(); } TEST(getvalue_invalid_semaphore, "get value of invalid semaphore"); static void post_invalid_semaphore(void) { ksem_post_should_fail(STDERR_FILENO, EINVAL); } TEST(post_invalid_semaphore, "post of invalid semaphore"); static void wait_invalid_semaphore(void) { if (ksem_wait(STDERR_FILENO) >= 0) { fail_err("ksem_wait() didn't fail"); return; } if (errno != EINVAL) { fail_errno("ksem_wait"); return; } pass(); } TEST(wait_invalid_semaphore, "wait for invalid semaphore"); static void trywait_invalid_semaphore(void) { if (ksem_trywait(STDERR_FILENO) >= 0) { fail_err("ksem_trywait() didn't fail"); return; } if (errno != EINVAL) { fail_errno("ksem_trywait"); return; } pass(); } TEST(trywait_invalid_semaphore, "try wait for invalid semaphore"); static void timedwait_invalid_semaphore(void) { if (ksem_timedwait(STDERR_FILENO, NULL) >= 0) { fail_err("ksem_timedwait() didn't fail"); return; } if (errno != EINVAL) { fail_errno("ksem_timedwait"); return; } pass(); } TEST(timedwait_invalid_semaphore, "timed wait for invalid semaphore"); static int checkvalue(semid_t id, int expected) { int val; if (ksem_getvalue(id, &val) < 0) { fail_errno("ksem_getvalue"); return (-1); } if (val != expected) { fail_err("sem value should be %d instead of %d", expected, val); return (-1); } return (0); } static void post_test(void) { semid_t id; if (ksem_init(&id, 1) < 0) { fail_errno("ksem_init"); return; } if (checkvalue(id, 1) < 0) { ksem_destroy(id); return; } if (ksem_post(id) < 0) { fail_errno("ksem_post"); ksem_destroy(id); return; } if (checkvalue(id, 2) < 0) { ksem_destroy(id); return; } if (ksem_destroy(id) < 0) { fail_errno("ksem_destroy"); return; } pass(); } TEST(post_test, "simple post"); static void use_after_unlink_test(void) { semid_t id; /* * Create named semaphore with value of 1 and then unlink it * while still retaining the initial reference. */ if (ksem_open(&id, TEST_PATH, O_CREAT | O_EXCL, 0777, 1) < 0) { fail_errno("ksem_open(O_CREAT | O_EXCL)"); return; } if (ksem_unlink(TEST_PATH) < 0) { fail_errno("ksem_unlink"); ksem_close(id); return; } if (checkvalue(id, 1) < 0) { ksem_close(id); return; } /* Post the semaphore to set its value to 2. */ if (ksem_post(id) < 0) { fail_errno("ksem_post"); ksem_close(id); return; } if (checkvalue(id, 2) < 0) { ksem_close(id); return; } /* Wait on the semaphore which should set its value to 1. */ if (ksem_wait(id) < 0) { fail_errno("ksem_wait"); ksem_close(id); return; } if (checkvalue(id, 1) < 0) { ksem_close(id); return; } if (ksem_close(id) < 0) { fail_errno("ksem_close"); return; } pass(); } TEST(use_after_unlink_test, "use named semaphore after unlink"); static void unlocked_trywait(void) { semid_t id; if (ksem_init(&id, 1) < 0) { fail_errno("ksem_init"); return; } /* This should succeed and decrement the value to 0. */ if (ksem_trywait(id) < 0) { fail_errno("ksem_trywait()"); ksem_destroy(id); return; } if (checkvalue(id, 0) < 0) { ksem_destroy(id); return; } if (ksem_destroy(id) < 0) { fail_errno("ksem_destroy"); return; } pass(); } TEST(unlocked_trywait, "unlocked trywait"); static void locked_trywait(void) { semid_t id; if (ksem_init(&id, 0) < 0) { fail_errno("ksem_init"); return; } /* This should fail with EAGAIN and leave the value at 0. */ if (ksem_trywait(id) >= 0) { fail_err("ksem_trywait() didn't fail"); ksem_destroy(id); return; } if (errno != EAGAIN) { fail_errno("wrong error from ksem_trywait()"); ksem_destroy(id); return; } if (checkvalue(id, 0) < 0) { ksem_destroy(id); return; } if (ksem_destroy(id) < 0) { fail_errno("ksem_destroy"); return; } pass(); } TEST(locked_trywait, "locked trywait"); /* * Use a timer to post a specific semaphore after a timeout. A timer * is scheduled via schedule_post(). check_alarm() must be called * afterwards to clean up and check for errors. */ static semid_t alarm_id = -1; static int alarm_errno; static int alarm_handler_installed; static void alarm_handler(int signo) { if (ksem_post(alarm_id) < 0) alarm_errno = errno; } static int check_alarm(int just_clear) { struct itimerval it; bzero(&it, sizeof(it)); if (just_clear) { setitimer(ITIMER_REAL, &it, NULL); alarm_errno = 0; alarm_id = -1; return (0); } if (setitimer(ITIMER_REAL, &it, NULL) < 0) { fail_errno("setitimer"); return (-1); } if (alarm_errno != 0 && !just_clear) { errno = alarm_errno; fail_errno("ksem_post() (via timeout)"); alarm_errno = 0; return (-1); } alarm_id = -1; return (0); } static int schedule_post(semid_t id, u_int msec) { struct itimerval it; if (!alarm_handler_installed) { if (signal(SIGALRM, alarm_handler) == SIG_ERR) { fail_errno("signal(SIGALRM)"); return (-1); } alarm_handler_installed = 1; } if (alarm_id != -1) { fail_err("ksem_post() already scheduled"); return (-1); } alarm_id = id; bzero(&it, sizeof(it)); it.it_value.tv_sec = msec / 1000; it.it_value.tv_usec = (msec % 1000) * 1000; if (setitimer(ITIMER_REAL, &it, NULL) < 0) { fail_errno("setitimer"); return (-1); } return (0); } static int timedwait(semid_t id, u_int msec, u_int *delta, int error) { struct timespec start, end; if (clock_gettime(CLOCK_REALTIME, &start) < 0) { fail_errno("clock_gettime(CLOCK_REALTIME)"); return (-1); } end.tv_sec = msec / 1000; end.tv_nsec = msec % 1000 * 1000000; - timespecadd(&end, &start); + timespecadd(&end, &start, &end); if (ksem_timedwait(id, &end) < 0) { if (errno != error) { fail_errno("ksem_timedwait"); return (-1); } } else if (error != 0) { fail_err("ksem_timedwait() didn't fail"); return (-1); } if (clock_gettime(CLOCK_REALTIME, &end) < 0) { fail_errno("clock_gettime(CLOCK_REALTIME)"); return (-1); } - timespecsub(&end, &start); + timespecsub(&end, &start, &end); *delta = end.tv_nsec / 1000000; *delta += end.tv_sec * 1000; return (0); } static void unlocked_timedwait(void) { semid_t id; u_int elapsed; if (ksem_init(&id, 1) < 0) { fail_errno("ksem_init"); return; } /* This should succeed right away and set the value to 0. */ if (timedwait(id, 5000, &elapsed, 0) < 0) { ksem_destroy(id); return; } if (!ELAPSED(elapsed, 0)) { fail_err("ksem_timedwait() of unlocked sem took %ums", elapsed); ksem_destroy(id); return; } if (checkvalue(id, 0) < 0) { ksem_destroy(id); return; } if (ksem_destroy(id) < 0) { fail_errno("ksem_destroy"); return; } pass(); } TEST(unlocked_timedwait, "unlocked timedwait"); static void expired_timedwait(void) { semid_t id; u_int elapsed; if (ksem_init(&id, 0) < 0) { fail_errno("ksem_init"); return; } /* This should fail with a timeout and leave the value at 0. */ if (timedwait(id, 2500, &elapsed, ETIMEDOUT) < 0) { ksem_destroy(id); return; } if (!ELAPSED(elapsed, 2500)) { fail_err( "ksem_timedwait() of locked sem took %ums instead of 2500ms", elapsed); ksem_destroy(id); return; } if (checkvalue(id, 0) < 0) { ksem_destroy(id); return; } if (ksem_destroy(id) < 0) { fail_errno("ksem_destroy"); return; } pass(); } TEST(expired_timedwait, "locked timedwait timeout"); static void locked_timedwait(void) { semid_t id; u_int elapsed; if (ksem_init(&id, 0) < 0) { fail_errno("ksem_init"); return; } /* * Schedule a post to trigger after 1000 ms. The subsequent * timedwait should succeed after 1000 ms as a result w/o * timing out. */ if (schedule_post(id, 1000) < 0) { ksem_destroy(id); return; } if (timedwait(id, 2000, &elapsed, 0) < 0) { check_alarm(1); ksem_destroy(id); return; } if (!ELAPSED(elapsed, 1000)) { fail_err( "ksem_timedwait() with delayed post took %ums instead of 1000ms", elapsed); check_alarm(1); ksem_destroy(id); return; } if (check_alarm(0) < 0) { ksem_destroy(id); return; } if (ksem_destroy(id) < 0) { fail_errno("ksem_destroy"); return; } pass(); } TEST(locked_timedwait, "locked timedwait"); static int testwait(semid_t id, u_int *delta) { struct timespec start, end; if (clock_gettime(CLOCK_REALTIME, &start) < 0) { fail_errno("clock_gettime(CLOCK_REALTIME)"); return (-1); } if (ksem_wait(id) < 0) { fail_errno("ksem_wait"); return (-1); } if (clock_gettime(CLOCK_REALTIME, &end) < 0) { fail_errno("clock_gettime(CLOCK_REALTIME)"); return (-1); } - timespecsub(&end, &start); + timespecsub(&end, &start, &end); *delta = end.tv_nsec / 1000000; *delta += end.tv_sec * 1000; return (0); } static void unlocked_wait(void) { semid_t id; u_int elapsed; if (ksem_init(&id, 1) < 0) { fail_errno("ksem_init"); return; } /* This should succeed right away and set the value to 0. */ if (testwait(id, &elapsed) < 0) { ksem_destroy(id); return; } if (!ELAPSED(elapsed, 0)) { fail_err("ksem_wait() of unlocked sem took %ums", elapsed); ksem_destroy(id); return; } if (checkvalue(id, 0) < 0) { ksem_destroy(id); return; } if (ksem_destroy(id) < 0) { fail_errno("ksem_destroy"); return; } pass(); } TEST(unlocked_wait, "unlocked wait"); static void locked_wait(void) { semid_t id; u_int elapsed; if (ksem_init(&id, 0) < 0) { fail_errno("ksem_init"); return; } /* * Schedule a post to trigger after 1000 ms. The subsequent * wait should succeed after 1000 ms as a result. */ if (schedule_post(id, 1000) < 0) { ksem_destroy(id); return; } if (testwait(id, &elapsed) < 0) { check_alarm(1); ksem_destroy(id); return; } if (!ELAPSED(elapsed, 1000)) { fail_err( "ksem_wait() with delayed post took %ums instead of 1000ms", elapsed); check_alarm(1); ksem_destroy(id); return; } if (check_alarm(0) < 0) { ksem_destroy(id); return; } if (ksem_destroy(id) < 0) { fail_errno("ksem_destroy"); return; } pass(); } TEST(locked_wait, "locked wait"); /* * Fork off a child process. The child will open the semaphore via * the same name. The child will then block on the semaphore waiting * for the parent to post it. */ static int wait_twoproc_child(void *arg) { semid_t id; if (ksem_open(&id, TEST_PATH, 0, 0, 0) < 0) return (CSTAT(1, errno)); if (ksem_wait(id) < 0) return (CSTAT(2, errno)); if (ksem_close(id) < 0) return (CSTAT(3, errno)); return (CSTAT(0, 0)); } static void wait_twoproc_test(void) { semid_t id; int stat; if (ksem_open(&id, TEST_PATH, O_CREAT, 0777, 0)) { fail_errno("ksem_open"); return; } if (schedule_post(id, 500) < 0) { ksem_close(id); ksem_unlink(TEST_PATH); return; } if (child_worker(wait_twoproc_child, NULL, &stat) < 0) { check_alarm(1); ksem_close(id); ksem_unlink(TEST_PATH); return; } errno = CSTAT_ERROR(stat); switch (CSTAT_CLASS(stat)) { case 0: pass(); break; case 1: fail_errno("child ksem_open()"); break; case 2: fail_errno("child ksem_wait()"); break; case 3: fail_errno("child ksem_close()"); break; default: fail_err("bad child state %#x", stat); break; } check_alarm(1); ksem_close(id); ksem_unlink(TEST_PATH); } TEST(wait_twoproc_test, "two proc wait"); static void maxvalue_test(void) { semid_t id; int val; if (ksem_init(&id, SEM_VALUE_MAX) < 0) { fail_errno("ksem_init"); return; } if (ksem_getvalue(id, &val) < 0) { fail_errno("ksem_getvalue"); ksem_destroy(id); return; } if (val != SEM_VALUE_MAX) { fail_err("value %d != SEM_VALUE_MAX"); ksem_destroy(id); return; } if (val < 0) { fail_err("value < 0"); ksem_destroy(id); return; } if (ksem_destroy(id) < 0) { fail_errno("ksem_destroy"); return; } pass(); } TEST(maxvalue_test, "get value of SEM_VALUE_MAX semaphore"); static void maxvalue_post_test(void) { semid_t id; if (ksem_init(&id, SEM_VALUE_MAX) < 0) { fail_errno("ksem_init"); return; } ksem_post_should_fail(id, EOVERFLOW); ksem_destroy(id); } TEST(maxvalue_post_test, "post SEM_VALUE_MAX semaphore"); static void busy_destroy_test(void) { char errbuf[_POSIX2_LINE_MAX]; struct kinfo_proc *kp; semid_t id; pid_t pid; kvm_t *kd; int count; kd = kvm_openfiles(NULL, "/dev/null", NULL, O_RDONLY, errbuf); if (kd == NULL) { fail_err("kvm_openfiles: %s", errbuf); return; } if (ksem_init(&id, 0) < 0) { fail_errno("ksem_init"); kvm_close(kd); return; } pid = fork(); switch (pid) { case -1: /* Error. */ fail_errno("fork"); ksem_destroy(id); kvm_close(kd); return; case 0: /* Child. */ ksem_wait(id); exit(0); } /* * Wait for the child process to block on the semaphore. This * is a bit gross. */ for (;;) { kp = kvm_getprocs(kd, KERN_PROC_PID, pid, &count); if (kp == NULL) { fail_err("kvm_getprocs: %s", kvm_geterr(kd)); kvm_close(kd); ksem_destroy(id); return; } if (kp->ki_stat == SSLEEP && (strcmp(kp->ki_wmesg, "sem") == 0 || strcmp(kp->ki_wmesg, "ksem") == 0)) break; usleep(1000); } kvm_close(kd); ksem_destroy_should_fail(id, EBUSY); /* Cleanup. */ ksem_post(id); waitpid(pid, NULL, 0); ksem_destroy(id); } TEST(busy_destroy_test, "destroy unnamed semaphore with waiter"); static int exhaust_unnamed_child(void *arg) { semid_t id; int i, max; max = (intptr_t)arg; for (i = 0; i < max + 1; i++) { if (ksem_init(&id, 1) < 0) { if (errno == ENOSPC) return (CSTAT(0, 0)); return (CSTAT(1, errno)); } } return (CSTAT(2, 0)); } static void exhaust_unnamed_sems(void) { size_t len; int nsems_max, stat; len = sizeof(nsems_max); if (sysctlbyname("p1003_1b.sem_nsems_max", &nsems_max, &len, NULL, 0) < 0) { fail_errno("sysctl(p1003_1b.sem_nsems_max)"); return; } if (child_worker(exhaust_unnamed_child, (void *)(uintptr_t)nsems_max, &stat)) return; errno = CSTAT_ERROR(stat); switch (CSTAT_CLASS(stat)) { case 0: pass(); break; case 1: fail_errno("ksem_init"); break; case 2: fail_err("Limit of %d semaphores not enforced", nsems_max); break; default: fail_err("bad child state %#x", stat); break; } } TEST(exhaust_unnamed_sems, "exhaust unnamed semaphores (1)"); static int exhaust_named_child(void *arg) { char buffer[64]; semid_t id; int i, max; max = (intptr_t)arg; for (i = 0; i < max + 1; i++) { snprintf(buffer, sizeof(buffer), "%s%d", TEST_PATH, i); if (ksem_open(&id, buffer, O_CREAT, 0777, 1) < 0) { if (errno == ENOSPC || errno == EMFILE || errno == ENFILE) return (CSTAT(0, 0)); return (CSTAT(1, errno)); } } return (CSTAT(2, errno)); } static void exhaust_named_sems(void) { char buffer[64]; size_t len; int i, nsems_max, stat; len = sizeof(nsems_max); if (sysctlbyname("p1003_1b.sem_nsems_max", &nsems_max, &len, NULL, 0) < 0) { fail_errno("sysctl(p1003_1b.sem_nsems_max)"); return; } if (child_worker(exhaust_named_child, (void *)(uintptr_t)nsems_max, &stat) < 0) return; errno = CSTAT_ERROR(stat); switch (CSTAT_CLASS(stat)) { case 0: pass(); break; case 1: fail_errno("ksem_open"); break; case 2: fail_err("Limit of %d semaphores not enforced", nsems_max); break; default: fail_err("bad child state %#x", stat); break; } /* Cleanup any semaphores created by the child. */ for (i = 0; i < nsems_max + 1; i++) { snprintf(buffer, sizeof(buffer), "%s%d", TEST_PATH, i); ksem_unlink(buffer); } } TEST(exhaust_named_sems, "exhaust named semaphores (1)"); static int fdlimit_set(void *arg) { struct rlimit rlim; int max; max = (intptr_t)arg; if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) return (CSTAT(3, errno)); rlim.rlim_cur = max; if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) return (CSTAT(4, errno)); return (0); } static int fdlimit_unnamed_child(void *arg) { int stat; stat = fdlimit_set(arg); if (stat == 0) stat = exhaust_unnamed_child(arg); return (stat); } static void fdlimit_unnamed_sems(void) { int nsems_max, stat; nsems_max = 10; if (child_worker(fdlimit_unnamed_child, (void *)(uintptr_t)nsems_max, &stat)) return; errno = CSTAT_ERROR(stat); switch (CSTAT_CLASS(stat)) { case 0: pass(); break; case 1: fail_errno("ksem_init"); break; case 2: fail_err("Limit of %d semaphores not enforced", nsems_max); break; case 3: fail_errno("getrlimit"); break; case 4: fail_errno("getrlimit"); break; default: fail_err("bad child state %#x", stat); break; } } TEST(fdlimit_unnamed_sems, "exhaust unnamed semaphores (2)"); static int fdlimit_named_child(void *arg) { int stat; stat = fdlimit_set(arg); if (stat == 0) stat = exhaust_named_child(arg); return (stat); } static void fdlimit_named_sems(void) { char buffer[64]; int i, nsems_max, stat; nsems_max = 10; if (child_worker(fdlimit_named_child, (void *)(uintptr_t)nsems_max, &stat) < 0) return; errno = CSTAT_ERROR(stat); switch (CSTAT_CLASS(stat)) { case 0: pass(); break; case 1: fail_errno("ksem_open"); break; case 2: fail_err("Limit of %d semaphores not enforced", nsems_max); break; case 3: fail_errno("getrlimit"); break; case 4: fail_errno("getrlimit"); break; default: fail_err("bad child state %#x", stat); break; } /* Cleanup any semaphores created by the child. */ for (i = 0; i < nsems_max + 1; i++) { snprintf(buffer, sizeof(buffer), "%s%d", TEST_PATH, i); ksem_unlink(buffer); } } TEST(fdlimit_named_sems, "exhaust named semaphores (2)"); int main(int argc, char *argv[]) { signal(SIGSYS, SIG_IGN); run_tests(); return (0); } Index: head/tools/regression/sockets/udp_pingpong/udp_pingpong.c =================================================================== --- head/tools/regression/sockets/udp_pingpong/udp_pingpong.c (revision 336913) +++ head/tools/regression/sockets/udp_pingpong/udp_pingpong.c (revision 336914) @@ -1,651 +1,626 @@ /*- * Copyright (c) 2017 Maksym Sobolyev * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * The test that setups two processes A and B and make A sending * B UDP packet(s) and B send it back. The time of sending is recorded * in the payload and time of the arrival is either determined by * reading clock after recv() completes or using kernel-supplied * via recvmsg(). End-to-end time t(A->B->A) is then calculated * and compared against time for both t(A->B) + t(B->A) to make * sure it makes sense. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define NPKTS 1000 #define PKT_SIZE 128 /* Timeout to receive pong on the side A, 100ms */ #define SRECV_TIMEOUT (1 * 100) /* * Timeout to receive ping on the side B. 4x as large as on the side A, * so that in the case of packet loss the side A will have a chance to * realize that and send few more before B bails out. */ #define RRECV_TIMEOUT (SRECV_TIMEOUT * 4) #define MIN_NRECV ((NPKTS * 99) / 100) /* 99% */ //#define SIMULATE_PLOSS struct trip_ts { struct timespec sent; struct timespec recvd; }; struct test_pkt { int pnum; struct trip_ts tss[2]; int lost; unsigned char data[PKT_SIZE]; }; struct test_ctx { const char *name; int fds[2]; struct pollfd pfds[2]; union { struct sockaddr_in v4; struct sockaddr_in6 v6; } sin[2]; struct test_pkt test_pkts[NPKTS]; int nsent; int nrecvd; clockid_t clock; int use_recvmsg; int ts_type; }; struct rtt { struct timespec a2b; struct timespec b2a; struct timespec e2e; struct timespec a2b_b2a; }; #define SEC(x) ((x)->tv_sec) #define NSEC(x) ((x)->tv_nsec) #define NSEC_MAX 1000000000L #define NSEC_IN_USEC 1000L -#define timespecsub2(r, v, u) \ - do { \ - SEC(r) = SEC(v) - SEC(u); \ - NSEC(r) = NSEC(v) - NSEC(u); \ - if (NSEC(r) < 0 && (SEC(r) > 0 || NSEC(r) <= -NSEC_MAX)) { \ - SEC(r)--; \ - NSEC(r) += NSEC_MAX; \ - } \ - } while (0); - -#define timespecadd2(r, v, u) \ - do { \ - SEC(r) = SEC(v) + SEC(u); \ - NSEC(r) = NSEC(v) + NSEC(u); \ - if (NSEC(r) >= NSEC_MAX) { \ - SEC(r)++; \ - NSEC(r) -= NSEC_MAX; \ - } \ - } while (0); - -#define timespeccmp(t, c, u) \ - ((SEC(t) == SEC(u)) ? \ - (NSEC(t) c NSEC(u)) : \ - (SEC(t) c SEC(u))) - #define timeval2timespec(tv, ts) \ do { \ SEC(ts) = (tv)->tv_sec; \ NSEC(ts) = (tv)->tv_usec * NSEC_IN_USEC; \ } while (0); static const struct timespec zero_ts; /* 0.01s, should be more than enough for the loopback communication */ static const struct timespec max_ts = {.tv_nsec = (NSEC_MAX / 100)}; enum ts_types {TT_TIMESTAMP = -2, TT_BINTIME = -1, TT_REALTIME_MICRO = SO_TS_REALTIME_MICRO, TT_TS_BINTIME = SO_TS_BINTIME, TT_REALTIME = SO_TS_REALTIME, TT_MONOTONIC = SO_TS_MONOTONIC}; static clockid_t get_clock_type(struct test_ctx *tcp) { switch (tcp->ts_type) { case TT_TIMESTAMP: case TT_BINTIME: case TT_REALTIME_MICRO: case TT_TS_BINTIME: case TT_REALTIME: return (CLOCK_REALTIME); case TT_MONOTONIC: return (CLOCK_MONOTONIC); } abort(); } static int get_scm_type(struct test_ctx *tcp) { switch (tcp->ts_type) { case TT_TIMESTAMP: case TT_REALTIME_MICRO: return (SCM_TIMESTAMP); case TT_BINTIME: case TT_TS_BINTIME: return (SCM_BINTIME); case TT_REALTIME: return (SCM_REALTIME); case TT_MONOTONIC: return (SCM_MONOTONIC); } abort(); } static size_t get_scm_size(struct test_ctx *tcp) { switch (tcp->ts_type) { case TT_TIMESTAMP: case TT_REALTIME_MICRO: return (sizeof(struct timeval)); case TT_BINTIME: case TT_TS_BINTIME: return (sizeof(struct bintime)); case TT_REALTIME: case TT_MONOTONIC: return (sizeof(struct timespec)); } abort(); } static void setup_ts_sockopt(struct test_ctx *tcp, int fd) { int rval, oname1, oname2, sval1, sval2; oname1 = SO_TIMESTAMP; oname2 = -1; sval2 = -1; switch (tcp->ts_type) { case TT_REALTIME_MICRO: case TT_TS_BINTIME: case TT_REALTIME: case TT_MONOTONIC: oname2 = SO_TS_CLOCK; sval2 = tcp->ts_type; break; case TT_TIMESTAMP: break; case TT_BINTIME: oname1 = SO_BINTIME; break; default: abort(); } sval1 = 1; rval = setsockopt(fd, SOL_SOCKET, oname1, &sval1, sizeof(sval1)); if (rval != 0) { err(1, "%s: setup_udp: setsockopt(%d, %d, 1)", tcp->name, fd, oname1); } if (oname2 == -1) return; rval = setsockopt(fd, SOL_SOCKET, oname2, &sval2, sizeof(sval2)); if (rval != 0) { err(1, "%s: setup_udp: setsockopt(%d, %d, %d)", tcp->name, fd, oname2, sval2); } } static void setup_udp(struct test_ctx *tcp) { int i; socklen_t sin_len, af_len; af_len = sizeof(tcp->sin[0].v4); for (i = 0; i < 2; i++) { tcp->sin[i].v4.sin_len = af_len; tcp->sin[i].v4.sin_family = AF_INET; tcp->sin[i].v4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); tcp->fds[i] = socket(PF_INET, SOCK_DGRAM, 0); if (tcp->fds[i] < 0) err(1, "%s: setup_udp: socket", tcp->name); if (bind(tcp->fds[i], (struct sockaddr *)&tcp->sin[i], af_len) < 0) err(1, "%s: setup_udp: bind(%s, %d)", tcp->name, inet_ntoa(tcp->sin[i].v4.sin_addr), 0); sin_len = af_len; if (getsockname(tcp->fds[i], (struct sockaddr *)&tcp->sin[i], &sin_len) < 0) err(1, "%s: setup_udp: getsockname(%d)", tcp->name, tcp->fds[i]); if (tcp->use_recvmsg != 0) { setup_ts_sockopt(tcp, tcp->fds[i]); } tcp->pfds[i].fd = tcp->fds[i]; tcp->pfds[i].events = POLLIN; } if (connect(tcp->fds[0], (struct sockaddr *)&tcp->sin[1], af_len) < 0) err(1, "%s: setup_udp: connect(%s, %d)", tcp->name, inet_ntoa(tcp->sin[1].v4.sin_addr), ntohs(tcp->sin[1].v4.sin_port)); if (connect(tcp->fds[1], (struct sockaddr *)&tcp->sin[0], af_len) < 0) err(1, "%s: setup_udp: connect(%s, %d)", tcp->name, inet_ntoa(tcp->sin[0].v4.sin_addr), ntohs(tcp->sin[0].v4.sin_port)); } static char * inet_ntoa6(const void *sin6_addr) { static char straddr[INET6_ADDRSTRLEN]; inet_ntop(AF_INET6, sin6_addr, straddr, sizeof(straddr)); return (straddr); } static void setup_udp6(struct test_ctx *tcp) { int i; socklen_t sin_len, af_len; af_len = sizeof(tcp->sin[0].v6); for (i = 0; i < 2; i++) { tcp->sin[i].v6.sin6_len = af_len; tcp->sin[i].v6.sin6_family = AF_INET6; tcp->sin[i].v6.sin6_addr = in6addr_loopback; tcp->fds[i] = socket(PF_INET6, SOCK_DGRAM, 0); if (tcp->fds[i] < 0) err(1, "%s: setup_udp: socket", tcp->name); if (bind(tcp->fds[i], (struct sockaddr *)&tcp->sin[i], af_len) < 0) err(1, "%s: setup_udp: bind(%s, %d)", tcp->name, inet_ntoa6(&tcp->sin[i].v6.sin6_addr), 0); sin_len = af_len; if (getsockname(tcp->fds[i], (struct sockaddr *)&tcp->sin[i], &sin_len) < 0) err(1, "%s: setup_udp: getsockname(%d)", tcp->name, tcp->fds[i]); if (tcp->use_recvmsg != 0) { setup_ts_sockopt(tcp, tcp->fds[i]); } tcp->pfds[i].fd = tcp->fds[i]; tcp->pfds[i].events = POLLIN; } if (connect(tcp->fds[0], (struct sockaddr *)&tcp->sin[1], af_len) < 0) err(1, "%s: setup_udp: connect(%s, %d)", tcp->name, inet_ntoa6(&tcp->sin[1].v6.sin6_addr), ntohs(tcp->sin[1].v6.sin6_port)); if (connect(tcp->fds[1], (struct sockaddr *)&tcp->sin[0], af_len) < 0) err(1, "%s: setup_udp: connect(%s, %d)", tcp->name, inet_ntoa6(&tcp->sin[0].v6.sin6_addr), ntohs(tcp->sin[0].v6.sin6_port)); } static void teardown_udp(struct test_ctx *tcp) { close(tcp->fds[0]); close(tcp->fds[1]); } static void send_pkt(struct test_ctx *tcp, int pnum, int fdidx, const char *face) { ssize_t r; size_t slen; slen = sizeof(tcp->test_pkts[pnum]); clock_gettime(get_clock_type(tcp), &tcp->test_pkts[pnum].tss[fdidx].sent); r = send(tcp->fds[fdidx], &tcp->test_pkts[pnum], slen, 0); if (r < 0) { err(1, "%s: %s: send(%d)", tcp->name, face, tcp->fds[fdidx]); } if (r < (ssize_t)slen) { errx(1, "%s: %s: send(%d): short send", tcp->name, face, tcp->fds[fdidx]); } tcp->nsent += 1; } #define PDATA(tcp, i) ((tcp)->test_pkts[(i)].data) static void hdr_extract_ts(struct test_ctx *tcp, struct msghdr *mhp, struct timespec *tp) { int scm_type; size_t scm_size; union { struct timespec ts; struct bintime bt; struct timeval tv; } tdata; struct cmsghdr *cmsg; scm_type = get_scm_type(tcp); scm_size = get_scm_size(tcp); for (cmsg = CMSG_FIRSTHDR(mhp); cmsg != NULL; cmsg = CMSG_NXTHDR(mhp, cmsg)) { if ((cmsg->cmsg_level == SOL_SOCKET) && (cmsg->cmsg_type == scm_type)) { memcpy(&tdata, CMSG_DATA(cmsg), scm_size); break; } } if (cmsg == NULL) { abort(); } switch (tcp->ts_type) { case TT_REALTIME: case TT_MONOTONIC: *tp = tdata.ts; break; case TT_TIMESTAMP: case TT_REALTIME_MICRO: timeval2timespec(&tdata.tv, tp); break; case TT_BINTIME: case TT_TS_BINTIME: bintime2timespec(&tdata.bt, tp); break; default: abort(); } } static void recv_pkt_recvmsg(struct test_ctx *tcp, int fdidx, const char *face, void *buf, size_t rlen, struct timespec *tp) { /* We use a union to make sure hdr is aligned */ union { struct cmsghdr hdr; unsigned char buf[CMSG_SPACE(1024)]; } cmsgbuf; struct msghdr msg; struct iovec iov; ssize_t rval; memset(&msg, '\0', sizeof(msg)); iov.iov_base = buf; iov.iov_len = rlen; msg.msg_iov = &iov; msg.msg_iovlen = 1; msg.msg_control = cmsgbuf.buf; msg.msg_controllen = sizeof(cmsgbuf.buf); rval = recvmsg(tcp->fds[fdidx], &msg, 0); if (rval < 0) { err(1, "%s: %s: recvmsg(%d)", tcp->name, face, tcp->fds[fdidx]); } if (rval < (ssize_t)rlen) { errx(1, "%s: %s: recvmsg(%d): short recv", tcp->name, face, tcp->fds[fdidx]); } hdr_extract_ts(tcp, &msg, tp); } static void recv_pkt_recv(struct test_ctx *tcp, int fdidx, const char *face, void *buf, size_t rlen, struct timespec *tp) { ssize_t rval; rval = recv(tcp->fds[fdidx], buf, rlen, 0); clock_gettime(get_clock_type(tcp), tp); if (rval < 0) { err(1, "%s: %s: recv(%d)", tcp->name, face, tcp->fds[fdidx]); } if (rval < (ssize_t)rlen) { errx(1, "%s: %s: recv(%d): short recv", tcp->name, face, tcp->fds[fdidx]); } } static int recv_pkt(struct test_ctx *tcp, int fdidx, const char *face, int tout) { int pr; struct test_pkt recv_buf; size_t rlen; pr = poll(&tcp->pfds[fdidx], 1, tout); if (pr < 0) { err(1, "%s: %s: poll(%d)", tcp->name, face, tcp->fds[fdidx]); } if (pr == 0) { return (-1); } if(tcp->pfds[fdidx].revents != POLLIN) { errx(1, "%s: %s: poll(%d): unexpected result", tcp->name, face, tcp->fds[fdidx]); } rlen = sizeof(recv_buf); if (tcp->use_recvmsg == 0) { recv_pkt_recv(tcp, fdidx, face, &recv_buf, rlen, &recv_buf.tss[fdidx].recvd); } else { recv_pkt_recvmsg(tcp, fdidx, face, &recv_buf, rlen, &recv_buf.tss[fdidx].recvd); } if (recv_buf.pnum < 0 || recv_buf.pnum >= NPKTS || memcmp(recv_buf.data, PDATA(tcp, recv_buf.pnum), PKT_SIZE) != 0) { errx(1, "%s: %s: recv(%d): corrupted data, packet %d", tcp->name, face, tcp->fds[fdidx], recv_buf.pnum); } tcp->nrecvd += 1; memcpy(tcp->test_pkts[recv_buf.pnum].tss, recv_buf.tss, sizeof(recv_buf.tss)); tcp->test_pkts[recv_buf.pnum].lost = 0; return (recv_buf.pnum); } static void test_server(struct test_ctx *tcp) { int i, j; for (i = 0; i < NPKTS; i++) { send_pkt(tcp, i, 0, __FUNCTION__); j = recv_pkt(tcp, 0, __FUNCTION__, SRECV_TIMEOUT); if (j < 0) { warnx("packet %d is lost", i); /* timeout */ continue; } } } static void test_client(struct test_ctx *tcp) { int i, j; for (i = 0; i < NPKTS; i++) { j = recv_pkt(tcp, 1, __FUNCTION__, RRECV_TIMEOUT); if (j < 0) { /* timeout */ return; } #if defined(SIMULATE_PLOSS) if ((i % 99) == 0) { warnx("dropping packet %d", i); continue; } #endif send_pkt(tcp, j, 1, __FUNCTION__); } } static void calc_rtt(struct test_pkt *tpp, struct rtt *rttp) { - timespecsub2(&rttp->a2b, &tpp->tss[1].recvd, &tpp->tss[0].sent); - timespecsub2(&rttp->b2a, &tpp->tss[0].recvd, &tpp->tss[1].sent); - timespecadd2(&rttp->a2b_b2a, &rttp->a2b, &rttp->b2a); - timespecsub2(&rttp->e2e, &tpp->tss[0].recvd, &tpp->tss[0].sent); + timespecsub(&tpp->tss[1].recvd, &tpp->tss[0].sent, &rttp->a2b); + timespecsub(&tpp->tss[0].recvd, &tpp->tss[1].sent, &rttp->b2a); + timespecadd(&rttp->a2b, &rttp->b2a, &rttp->a2b_b2a); + timespecsub(&tpp->tss[0].recvd, &tpp->tss[0].sent, &rttp->e2e); } static void test_run(int ts_type, int use_ipv6, int use_recvmsg, const char *name) { struct test_ctx test_ctx; pid_t pid, cpid; int i, j, status; printf("Testing %s via %s: ", name, (use_ipv6 == 0) ? "IPv4" : "IPv6"); fflush(stdout); bzero(&test_ctx, sizeof(test_ctx)); test_ctx.name = name; test_ctx.use_recvmsg = use_recvmsg; test_ctx.ts_type = ts_type; if (use_ipv6 == 0) { setup_udp(&test_ctx); } else { setup_udp6(&test_ctx); } for (i = 0; i < NPKTS; i++) { test_ctx.test_pkts[i].pnum = i; test_ctx.test_pkts[i].lost = 1; for (j = 0; j < PKT_SIZE; j++) { test_ctx.test_pkts[i].data[j] = (unsigned char)random(); } } cpid = fork(); if (cpid < 0) { err(1, "%s: fork()", test_ctx.name); } if (cpid == 0) { test_client(&test_ctx); exit(0); } test_server(&test_ctx); pid = waitpid(cpid, &status, 0); if (pid == (pid_t)-1) { err(1, "%s: waitpid(%d)", test_ctx.name, cpid); } if (WIFEXITED(status)) { if (WEXITSTATUS(status) != EXIT_SUCCESS) { errx(1, "client exit status is %d", WEXITSTATUS(status)); } } else { if (WIFSIGNALED(status)) errx(1, "abnormal termination of client, signal %d%s", WTERMSIG(status), WCOREDUMP(status) ? " (core file generated)" : ""); else errx(1, "termination of client, unknown status"); } if (test_ctx.nrecvd < MIN_NRECV) { errx(1, "packet loss is too high %d received out of %d, min %d", test_ctx.nrecvd, test_ctx.nsent, MIN_NRECV); } for (i = 0; i < NPKTS; i++) { struct rtt rtt; if (test_ctx.test_pkts[i].lost != 0) { continue; } calc_rtt(&test_ctx.test_pkts[i], &rtt); - if (!timespeccmp(&rtt.e2e, >, &rtt.a2b_b2a)) + if (!timespeccmp(&rtt.e2e, &rtt.a2b_b2a, >)) errx(1, "end-to-end trip time is too small"); - if (!timespeccmp(&rtt.e2e, <, &max_ts)) + if (!timespeccmp(&rtt.e2e, &max_ts, <)) errx(1, "end-to-end trip time is too large"); - if (!timespeccmp(&rtt.a2b, >, &zero_ts)) + if (!timespeccmp(&rtt.a2b, &zero_ts, >)) errx(1, "A2B trip time is not positive"); - if (!timespeccmp(&rtt.b2a, >, &zero_ts)) + if (!timespeccmp(&rtt.b2a, &zero_ts, >)) errx(1, "B2A trip time is not positive"); } teardown_udp(&test_ctx); } int main(void) { int i; srandomdev(); for (i = 0; i < 2; i++) { test_run(0, i, 0, "send()/recv()"); printf("OK\n"); test_run(TT_TIMESTAMP, i, 1, "send()/recvmsg(), setsockopt(SO_TIMESTAMP, 1)"); printf("OK\n"); if (i == 0) { test_run(TT_BINTIME, i, 1, "send()/recvmsg(), setsockopt(SO_BINTIME, 1)"); printf("OK\n"); } test_run(TT_REALTIME_MICRO, i, 1, "send()/recvmsg(), setsockopt(SO_TS_CLOCK, SO_TS_REALTIME_MICRO)"); printf("OK\n"); test_run(TT_TS_BINTIME, i, 1, "send()/recvmsg(), setsockopt(SO_TS_CLOCK, SO_TS_BINTIME)"); printf("OK\n"); test_run(TT_REALTIME, i, 1, "send()/recvmsg(), setsockopt(SO_TS_CLOCK, SO_TS_REALTIME)"); printf("OK\n"); test_run(TT_MONOTONIC, i, 1, "send()/recvmsg(), setsockopt(SO_TS_CLOCK, SO_TS_MONOTONIC)"); printf("OK\n"); } exit(0); } Index: head/tools/regression/sockets/unix_cmsg/uc_check_time.c =================================================================== --- head/tools/regression/sockets/unix_cmsg/uc_check_time.c (revision 336913) +++ head/tools/regression/sockets/unix_cmsg/uc_check_time.c (revision 336914) @@ -1,101 +1,87 @@ /*- * Copyright (c) 2016 Maksym Sobolyev * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include "uc_check_time.h" static const struct timeval max_diff_tv = {.tv_sec = 1, .tv_usec = 0}; static const struct timespec max_diff_ts = {.tv_sec = 1, .tv_nsec = 0}; -#define timespeccmp(tvp, uvp, cmp) \ - (((tvp)->tv_sec == (uvp)->tv_sec) ? \ - ((tvp)->tv_nsec cmp (uvp)->tv_nsec) : \ - ((tvp)->tv_sec cmp (uvp)->tv_sec)) -#define timespecsub(vvp, uvp) \ - do { \ - (vvp)->tv_sec -= (uvp)->tv_sec; \ - (vvp)->tv_nsec -= (uvp)->tv_nsec; \ - if ((vvp)->tv_nsec < 0) { \ - (vvp)->tv_sec--; \ - (vvp)->tv_nsec += 1000000000; \ - } \ - } while (0) - int uc_check_bintime(const struct bintime *mt) { struct timespec bt; bintime2timespec(mt, &bt); return (uc_check_timespec_real(&bt)); } int uc_check_timeval(const struct timeval *bt) { struct timeval ct, dt; if (gettimeofday(&ct, NULL) < 0) return (-1); timersub(&ct, bt, &dt); if (!timercmp(&dt, &max_diff_tv, <)) return (-1); return (0); } int uc_check_timespec_real(const struct timespec *bt) { struct timespec ct; if (clock_gettime(CLOCK_REALTIME, &ct) < 0) return (-1); - timespecsub(&ct, bt); + timespecsub(&ct, bt, &ct); if (!timespeccmp(&ct, &max_diff_ts, <)) return (-1); return (0); } int uc_check_timespec_mono(const struct timespec *bt) { struct timespec ct; if (clock_gettime(CLOCK_MONOTONIC, &ct) < 0) return (-1); - timespecsub(&ct, bt); + timespecsub(&ct, bt, &ct); if (!timespeccmp(&ct, &max_diff_ts, <)) return (-1); return (0); } Index: head/tools/tools/netrate/juggle/juggle.c =================================================================== --- head/tools/tools/netrate/juggle/juggle.c (revision 336913) +++ head/tools/tools/netrate/juggle/juggle.c (revision 336914) @@ -1,592 +1,579 @@ /*- * Copyright (c) 2005 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * juggle is a simple IPC/context switch performance test, which works on * pairs of file descriptors of various types. In various runs, it considers * the cost of bouncing a message synchronously across the descriptor pair, * either in the same thread, two different threads, or two different * processes. Timing measurements for each series of I/O's are reported, but * the first measurement in each series discarded as "warmup" on the IPC * primitive. Variations on the test permit for pipelining, or the insertion * of more than one packet into the stream at a time, intended to permit * greater parallelism, hopefully allowing performance numbers to reflect * use of available parallelism, and/or intelligence in context switching to * avoid premature switching when multiple messages are queued. */ /* * The UDP test uses UDP over the loopback interface. Two arbitrary but * fixed port numbers. */ #define UDP_PORT1 2020 #define UDP_PORT2 2021 /* * Size of each message. Must be smaller than the socket buffer or pipe * buffer maximum size, as we want to send it atomically without blocking. * If pipelining is in use, must be able to fit PIPELINE_MAX of these * messages into the send queue. */ #define MESSAGELEN 128 /* * Number of message cycles -- into fd1, out of fd2, into fd2, and out of * fd1. By counting in cycles, we allow the master thread or process to * perform timing without explicitly synchronizing with the secondary thread * or process. */ #define NUMCYCLES 1024 /* * Number of times to run each test. */ #define LOOPS 10 /* * Number of in-flight messages per cycle. I adjusting this value, be * careful not to exceed the socket/etc buffer depth, or messages may be lost * or result in blocking. */ #define PIPELINE_MAX 4 -/* - * As in all programs, steal timespecsub() from time.h. - */ -#define timespecsub(vvp, uvp) \ - do { \ - (vvp)->tv_sec -= (uvp)->tv_sec; \ - (vvp)->tv_nsec -= (uvp)->tv_nsec; \ - if ((vvp)->tv_nsec < 0) { \ - (vvp)->tv_sec--; \ - (vvp)->tv_nsec += 1000000000; \ - } \ - } while (0) - static int udp_create(int *fd1p, int *fd2p) { struct sockaddr_in sin1, sin2; int sock1, sock2; sock1 = socket(PF_INET, SOCK_DGRAM, 0); if (sock1 == -1) return (-1); sock2 = socket(PF_INET, SOCK_DGRAM, 0); if (sock2 == -1) { close(sock1); return (-1); } bzero(&sin1, sizeof(sin1)); sin1.sin_len = sizeof(sin1); sin1.sin_family = AF_INET; sin1.sin_addr.s_addr = htonl(INADDR_LOOPBACK); sin1.sin_port = htons(UDP_PORT1); bzero(&sin2, sizeof(sin2)); sin2.sin_len = sizeof(sin2); sin2.sin_family = AF_INET; sin2.sin_addr.s_addr = htonl(INADDR_LOOPBACK); sin2.sin_port = htons(UDP_PORT2); if (bind(sock1, (struct sockaddr *) &sin1, sizeof(sin1)) < 0) { close(sock1); close(sock2); return (-1); } if (bind(sock2, (struct sockaddr *) &sin2, sizeof(sin2)) < 0) { close(sock1); close(sock2); return (-1); } if (connect(sock1, (struct sockaddr *) &sin2, sizeof(sin2)) < 0) { close(sock1); close(sock2); return (-1); } if (connect(sock2, (struct sockaddr *) &sin1, sizeof(sin1)) < 0) { close(sock1); close(sock2); return (-1); } *fd1p = sock1; *fd2p = sock2; return (0); } static int pipe_create(int *fd1p, int *fd2p) { int fds[2]; if (pipe(fds) < 0) return (-1); *fd1p = fds[0]; *fd2p = fds[1]; return (0); } static int socketpairdgram_create(int *fd1p, int *fd2p) { int fds[2]; if (socketpair(PF_LOCAL, SOCK_DGRAM, 0, fds) < 0) return (-1); *fd1p = fds[0]; *fd2p = fds[1]; return (0); } static int socketpairstream_create(int *fd1p, int *fd2p) { int fds[2]; if (socketpair(PF_LOCAL, SOCK_STREAM, 0, fds) < 0) return (-1); *fd1p = fds[0]; *fd2p = fds[1]; return (0); } static int message_send(int s) { u_char buffer[MESSAGELEN]; ssize_t len; bzero(buffer, sizeof(buffer)); len = write(s, buffer, sizeof(buffer)); if (len == -1) return (-1); if (len != sizeof(buffer)) { errno = EMSGSIZE; return (-1); } return (0); } static int message_recv(int s) { u_char buffer[MESSAGELEN]; ssize_t len; len = read(s, buffer, sizeof(buffer)); if (len == -1) return (-1); if (len != sizeof(buffer)) { errno = EMSGSIZE; return (-1); } return (0); } /* * Juggle messages between two file descriptors in a single thread/process, * so simply a measure of IPC performance. */ static struct timespec juggle(int fd1, int fd2, int pipeline) { struct timespec tstart, tfinish; int i, j; if (clock_gettime(CLOCK_REALTIME, &tstart) < 0) err(-1, "juggle: clock_gettime"); for (i = 0; i < NUMCYCLES; i++) { for (j = 0; j < pipeline; j++) { if (message_send(fd1) < 0) err(-1, "message_send fd1"); } for (j = 0; j < pipeline; j++) { if (message_recv(fd2) < 0) err(-1, "message_recv fd2"); if (message_send(fd2) < 0) err(-1, "message_send fd2"); } for (j = 0; j < pipeline; j++) { if (message_recv(fd1) < 0) err(-1, "message_recv fd1"); } } if (clock_gettime(CLOCK_REALTIME, &tfinish) < 0) err(-1, "juggle: clock_gettime"); - timespecsub(&tfinish, &tstart); + timespecsub(&tfinish, &tstart, &tfinish); return (tfinish); } /* * Juggle messages between two file descriptors in two threads, so measure * the cost of IPC and the cost of a thread context switch. * * In order to avoid measuring thread creation time, we make use of a * condition variable to decide when both threads are ready to begin * juggling. */ static int threaded_child_ready; static pthread_mutex_t threaded_mtx; static pthread_cond_t threaded_cond; static int threaded_pipeline; static void * juggling_thread(void *arg) { int fd2, i, j; fd2 = *(int *)arg; if (pthread_mutex_lock(&threaded_mtx) != 0) err(-1, "juggling_thread: pthread_mutex_lock"); threaded_child_ready = 1; if (pthread_cond_signal(&threaded_cond) != 0) err(-1, "juggling_thread: pthread_cond_signal"); if (pthread_mutex_unlock(&threaded_mtx) != 0) err(-1, "juggling_thread: pthread_mutex_unlock"); for (i = 0; i < NUMCYCLES; i++) { for (j = 0; j < threaded_pipeline; j++) { if (message_recv(fd2) < 0) err(-1, "message_recv fd2"); if (message_send(fd2) < 0) err(-1, "message_send fd2"); } } return (NULL); } static struct timespec thread_juggle(int fd1, int fd2, int pipeline) { struct timespec tstart, tfinish; pthread_t thread; int i, j; threaded_pipeline = pipeline; if (pthread_mutex_init(&threaded_mtx, NULL) != 0) err(-1, "thread_juggle: pthread_mutex_init"); if (pthread_create(&thread, NULL, juggling_thread, &fd2) != 0) err(-1, "thread_juggle: pthread_create"); if (pthread_mutex_lock(&threaded_mtx) != 0) err(-1, "thread_juggle: pthread_mutex_lock"); while (!threaded_child_ready) { if (pthread_cond_wait(&threaded_cond, &threaded_mtx) != 0) err(-1, "thread_juggle: pthread_cond_wait"); } if (pthread_mutex_unlock(&threaded_mtx) != 0) err(-1, "thread_juggle: pthread_mutex_unlock"); if (clock_gettime(CLOCK_REALTIME, &tstart) < 0) err(-1, "thread_juggle: clock_gettime"); for (i = 0; i < NUMCYCLES; i++) { for (j = 0; j < pipeline; j++) { if (message_send(fd1) < 0) err(-1, "message_send fd1"); } for (j = 0; j < pipeline; j++) { if (message_recv(fd1) < 0) err(-1, "message_recv fd1"); } } if (clock_gettime(CLOCK_REALTIME, &tfinish) < 0) err(-1, "thread_juggle: clock_gettime"); if (pthread_join(thread, NULL) != 0) err(-1, "thread_juggle: pthread_join"); - timespecsub(&tfinish, &tstart); + timespecsub(&tfinish, &tstart, &tfinish); return (tfinish); } /* * Juggle messages between two file descriptors in two processes, so measure * the cost of IPC and the cost of a process context switch. * * Since we can't use a mutex between the processes, we simply do an extra * write on the child to let the parent know that it's ready to start. */ static struct timespec process_juggle(int fd1, int fd2, int pipeline) { struct timespec tstart, tfinish; pid_t pid, ppid, wpid; int error, i, j; ppid = getpid(); pid = fork(); if (pid < 0) err(-1, "process_juggle: fork"); if (pid == 0) { if (message_send(fd2) < 0) { error = errno; kill(ppid, SIGTERM); errno = error; err(-1, "process_juggle: child: message_send"); } for (i = 0; i < NUMCYCLES; i++) { for (j = 0; j < pipeline; j++) { if (message_send(fd2) < 0) err(-1, "message_send fd2"); if (message_recv(fd2) < 0) err(-1, "message_recv fd2"); } } exit(0); } else { if (message_recv(fd1) < 0) { error = errno; kill(pid, SIGTERM); errno = error; err(-1, "process_juggle: parent: message_recv"); } if (clock_gettime(CLOCK_REALTIME, &tstart) < 0) err(-1, "process_juggle: clock_gettime"); for (i = 0; i < NUMCYCLES; i++) { for (j = 0; j < pipeline; j++) { if (message_send(fd1) < 0) { error = errno; kill(pid, SIGTERM); errno = error; err(-1, "message_send fd1"); } } for (j = 0; j < pipeline; j++) { if (message_recv(fd1) < 0) { error = errno; kill(pid, SIGTERM); errno = error; err(-1, "message_recv fd1"); } } } if (clock_gettime(CLOCK_REALTIME, &tfinish) < 0) err(-1, "process_juggle: clock_gettime"); } wpid = waitpid(pid, NULL, 0); if (wpid < 0) err(-1, "process_juggle: waitpid"); if (wpid != pid) errx(-1, "process_juggle: waitpid: pid != wpid"); - timespecsub(&tfinish, &tstart); + timespecsub(&tfinish, &tstart, &tfinish); return (tfinish); } /* * When we print out results for larger pipeline sizes, we scale back by the * depth of the pipeline. This generally means dividing by the pipeline * depth. Except when it means dividing by zero. */ static void scale_timespec(struct timespec *ts, int p) { if (p == 0) return; ts->tv_sec /= p; ts->tv_nsec /= p; } static const struct ipctype { int (*it_create)(int *fd1p, int *fd2p); const char *it_name; } ipctypes[] = { { pipe_create, "pipe" }, { udp_create, "udp" }, { socketpairdgram_create, "socketpairdgram" }, { socketpairstream_create, "socketpairstream" }, }; static const int ipctypes_len = (sizeof(ipctypes) / sizeof(struct ipctype)); int main(int argc, char *argv[]) { struct timespec juggle_results[LOOPS], process_results[LOOPS]; struct timespec thread_results[LOOPS]; int fd1, fd2, i, j, p; struct utsname uts; printf("version, juggle.c %s\n", "$FreeBSD$"); if (uname(&uts) < 0) err(-1, "utsname"); printf("sysname, %s\n", uts.sysname); printf("nodename, %s\n", uts.nodename); printf("release, %s\n", uts.release); printf("version, %s\n", uts.version); printf("machine, %s\n", uts.machine); printf("\n"); printf("MESSAGELEN, %d\n", MESSAGELEN); printf("NUMCYCLES, %d\n", NUMCYCLES); printf("LOOPS, %d\n", LOOPS); printf("PIPELINE_MAX, %d\n", PIPELINE_MAX); printf("\n\n"); printf("ipctype, test, pipeline_depth"); for (j = 0; j < LOOPS; j++) printf(", data%d", j); printf("\n"); fflush(stdout); for (p = 0; p < PIPELINE_MAX + 1; p++) { for (i = 0; i < ipctypes_len; i++) { if (ipctypes[i].it_create(&fd1, &fd2) < 0) err(-1, "main: %s", ipctypes[i].it_name); /* * For each test, do one uncounted warmup, then LOOPS * runs of the actual test. */ juggle(fd1, fd2, p); for (j = 0; j < LOOPS; j++) juggle_results[j] = juggle(fd1, fd2, p); process_juggle(fd1, fd2, p); for (j = 0; j < LOOPS; j++) process_results[j] = process_juggle(fd1, fd2, p); thread_juggle(fd1, fd2, p); for (j = 0; j < LOOPS; j++) thread_results[j] = thread_juggle(fd1, fd2, p); for (j = 0; j < LOOPS; j++) { thread_results[j].tv_sec = 0; thread_results[j].tv_nsec = 0; } close(fd1); close(fd2); } /* * When printing results for the round, normalize the results * with respect to the pipeline depth. We're doing p times * as much work, and are we taking p times as long? */ for (i = 0; i < ipctypes_len; i++) { printf("%s, juggle, %d, ", ipctypes[i].it_name, p); for (j = 0; j < LOOPS; j++) { if (j != 0) printf(", "); scale_timespec(&juggle_results[j], p); printf("%jd.%09lu", (intmax_t)juggle_results[j].tv_sec, juggle_results[j].tv_nsec); } printf("\n"); printf("%s, process_juggle, %d, ", ipctypes[i].it_name, p); for (j = 0; j < LOOPS; j++) { if (j != 0) printf(", "); scale_timespec(&process_results[j], p); printf("%jd.%09lu", (intmax_t)process_results[j].tv_sec, process_results[j].tv_nsec); } printf("\n"); printf("%s, thread_juggle, %d, ", ipctypes[i].it_name, p); for (j = 0; j < LOOPS; j++) { if (j != 0) printf(", "); scale_timespec(&thread_results[j], p); printf("%jd.%09lu", (intmax_t)thread_results[j].tv_sec, thread_results[j].tv_nsec); } printf("\n"); } fflush(stdout); } return (0); } Index: head/tools/tools/netrate/tcpp/tcpp_client.c =================================================================== --- head/tools/tools/netrate/tcpp/tcpp_client.c (revision 336913) +++ head/tools/tools/netrate/tcpp/tcpp_client.c (revision 336914) @@ -1,377 +1,367 @@ /*- * Copyright (c) 2008-2009 Robert N. M. Watson * Copyright (c) 2010 Juniper Networks, Inc. * All rights reserved. * * This software was developed by Robert N. M. Watson under contract * to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "tcpp.h" #define min(x, y) (x < y ? x : y) -#define timespecsub(vvp, uvp) \ - do { \ - (vvp)->tv_sec -= (uvp)->tv_sec; \ - (vvp)->tv_nsec -= (uvp)->tv_nsec; \ - if ((vvp)->tv_nsec < 0) { \ - (vvp)->tv_sec--; \ - (vvp)->tv_nsec += 1000000000; \ - } \ - } while (0) - /* * Gist of each client worker: build up to mflag connections at a time, and * pump data in to them somewhat fairly until tflag connections have been * completed. */ #define CONNECTION_MAGIC 0x87a3f56e struct connection { uint32_t conn_magic; /* Just magic. */ int conn_fd; struct tcpp_header conn_header; /* Header buffer. */ u_int conn_header_sent; /* Header bytes sent. */ u_int64_t conn_data_sent; /* Data bytes sent.*/ }; static u_char buffer[256 * 1024]; /* Buffer to send. */ static pid_t *pid_list; static int kq; static int started; /* Number started so far. */ static int finished; /* Number finished so far. */ static int counter; /* IP number offset. */ static uint64_t payload_len; static struct connection * tcpp_client_newconn(void) { struct sockaddr_in sin; struct connection *conn; struct kevent kev; int fd, i; /* * Spread load over available IPs, rotating through them as we go. No * attempt to localize IPs to particular workers. */ sin = localipbase; sin.sin_addr.s_addr = htonl(ntohl(localipbase.sin_addr.s_addr) + (counter++ % Mflag)); fd = socket(PF_INET, SOCK_STREAM, 0); if (fd < 0) err(-1, "socket"); if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0) err(-1, "fcntl"); i = 1; if (setsockopt(fd, SOL_SOCKET, SO_NOSIGPIPE, &i, sizeof(i)) < 0) err(-1, "setsockopt"); i = 1; if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &i, sizeof(i)) < 0) err(-1, "setsockopt"); #if 0 i = 1; if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &i, sizeof(i)) < 0) err(-1, "setsockopt"); #endif if (lflag) { if (bind(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) err(-1, "bind"); } if (connect(fd, (struct sockaddr *)&remoteip, sizeof(remoteip)) < 0 && errno != EINPROGRESS) err(-1, "connect"); conn = malloc(sizeof(*conn)); if (conn == NULL) return (NULL); bzero(conn, sizeof(*conn)); conn->conn_magic = CONNECTION_MAGIC; conn->conn_fd = fd; conn->conn_header.th_magic = TCPP_MAGIC; conn->conn_header.th_len = payload_len; tcpp_header_encode(&conn->conn_header); EV_SET(&kev, fd, EVFILT_WRITE, EV_ADD, 0, 0, conn); if (kevent(kq, &kev, 1, NULL, 0, NULL) < 0) err(-1, "newconn kevent"); started++; return (conn); } static void tcpp_client_closeconn(struct connection *conn) { close(conn->conn_fd); bzero(conn, sizeof(*conn)); free(conn); finished++; } static void tcpp_client_handleconn(struct kevent *kev) { struct connection *conn; struct iovec iov[2]; ssize_t len, header_left; conn = kev->udata; if (conn->conn_magic != CONNECTION_MAGIC) errx(-1, "tcpp_client_handleconn: magic"); if (conn->conn_header_sent < sizeof(conn->conn_header)) { header_left = sizeof(conn->conn_header) - conn->conn_header_sent; iov[0].iov_base = ((u_char *)&conn->conn_header) + conn->conn_header_sent; iov[0].iov_len = header_left; iov[1].iov_base = buffer; iov[1].iov_len = min(sizeof(buffer), payload_len); len = writev(conn->conn_fd, iov, 2); if (len < 0) { tcpp_client_closeconn(conn); err(-1, "tcpp_client_handleconn: header write"); } if (len == 0) { tcpp_client_closeconn(conn); errx(-1, "tcpp_client_handleconn: header write " "premature EOF"); } if (len > header_left) { conn->conn_data_sent += (len - header_left); conn->conn_header_sent += header_left; } else conn->conn_header_sent += len; } else { len = write(conn->conn_fd, buffer, min(sizeof(buffer), payload_len - conn->conn_data_sent)); if (len < 0) { tcpp_client_closeconn(conn); err(-1, "tcpp_client_handleconn: data write"); } if (len == 0) { tcpp_client_closeconn(conn); errx(-1, "tcpp_client_handleconn: data write: " "premature EOF"); } conn->conn_data_sent += len; } if (conn->conn_data_sent >= payload_len) { /* * All is well. */ tcpp_client_closeconn(conn); } } static void tcpp_client_worker(int workernum) { struct kevent *kev_array; int i, numevents, kev_bytes; #if defined(CPU_SETSIZE) && 0 cpu_set_t mask; int ncpus; size_t len; if (Pflag) { len = sizeof(ncpus); if (sysctlbyname(SYSCTLNAME_CPUS, &ncpus, &len, NULL, 0) < 0) err(-1, "sysctlbyname: %s", SYSCTLNAME_CPUS); if (len != sizeof(ncpus)) errx(-1, "sysctlbyname: %s: len %jd", SYSCTLNAME_CPUS, (intmax_t)len); CPU_ZERO(&mask); CPU_SET(workernum % ncpus, &mask); if (sched_setaffinity(0, CPU_SETSIZE, &mask) < 0) err(-1, "sched_setaffinity"); } #endif setproctitle("tcpp_client %d", workernum); /* * Add the worker number to the remote port. */ remoteip.sin_port = htons(rflag + workernum); kev_bytes = sizeof(*kev_array) * mflag; kev_array = malloc(kev_bytes); if (kev_array == NULL) err(-1, "malloc"); bzero(kev_array, kev_bytes); kq = kqueue(); if (kq < 0) err(-1, "kqueue"); while (finished < tflag) { while ((started - finished < mflag) && (started < tflag)) (void)tcpp_client_newconn(); numevents = kevent(kq, NULL, 0, kev_array, mflag, NULL); if (numevents < 0) err(-1, "kevent"); if (numevents > mflag) errx(-1, "kevent: %d", numevents); for (i = 0; i < numevents; i++) tcpp_client_handleconn(&kev_array[i]); } /* printf("Worker %d done - %d finished\n", workernum, finished); */ } void tcpp_client(void) { struct timespec ts_start, ts_finish; long cp_time_start[CPUSTATES], cp_time_finish[CPUSTATES]; long ticks; size_t size; pid_t pid; int i, failed, status; if (bflag < sizeof(struct tcpp_header)) errx(-1, "Can't use -b less than %zu\n", sizeof(struct tcpp_header)); payload_len = bflag - sizeof(struct tcpp_header); pid_list = malloc(sizeof(*pid_list) * pflag); if (pid_list == NULL) err(-1, "malloc pid_list"); bzero(pid_list, sizeof(*pid_list) * pflag); /* * Start workers. */ size = sizeof(cp_time_start); if (sysctlbyname(SYSCTLNAME_CPTIME, &cp_time_start, &size, NULL, 0) < 0) err(-1, "sysctlbyname: %s", SYSCTLNAME_CPTIME); if (clock_gettime(CLOCK_REALTIME, &ts_start) < 0) err(-1, "clock_gettime"); for (i = 0; i < pflag; i++) { pid = fork(); if (pid < 0) { warn("fork"); for (i = 0; i < pflag; i++) { if (pid_list[i] != 0) (void)kill(pid_list[i], SIGKILL); } exit(-1); } if (pid == 0) { tcpp_client_worker(i); exit(0); } pid_list[i] = pid; } /* * GC workers. */ failed = 0; for (i = 0; i < pflag; i++) { if (pid_list[i] != 0) { while (waitpid(pid_list[i], &status, 0) != pid_list[i]); if (WEXITSTATUS(status) != 0) failed = 1; } } if (clock_gettime(CLOCK_REALTIME, &ts_finish) < 0) err(-1, "clock_gettime"); size = sizeof(cp_time_finish); if (sysctlbyname(SYSCTLNAME_CPTIME, &cp_time_finish, &size, NULL, 0) < 0) err(-1, "sysctlbyname: %s", SYSCTLNAME_CPTIME); - timespecsub(&ts_finish, &ts_start); + timespecsub(&ts_finish, &ts_start, &ts_finish); if (failed) errx(-1, "Too many errors"); if (hflag) printf("bytes,seconds,conn/s,Gb/s,user%%,nice%%,sys%%," "intr%%,idle%%\n"); /* * Configuration parameters. */ printf("%jd,", bflag * tflag * pflag); printf("%jd.%09jd,", (intmax_t)ts_finish.tv_sec, (intmax_t)(ts_finish.tv_nsec)); /* * Effective transmit rates. */ printf("%f,", (double)(pflag * tflag)/ (ts_finish.tv_sec + ts_finish.tv_nsec * 1e-9)); printf("%f,", (double)(bflag * tflag * pflag * 8) / (ts_finish.tv_sec + ts_finish.tv_nsec * 1e-9) * 1e-9); /* * CPU time (est). */ ticks = 0; for (i = 0; i < CPUSTATES; i++) { cp_time_finish[i] -= cp_time_start[i]; ticks += cp_time_finish[i]; } printf("%0.02f,", (float)(100 * cp_time_finish[CP_USER]) / ticks); printf("%0.02f,", (float)(100 * cp_time_finish[CP_NICE]) / ticks); printf("%0.02f,", (float)(100 * cp_time_finish[CP_SYS]) / ticks); printf("%0.02f,", (float)(100 * cp_time_finish[CP_INTR]) / ticks); printf("%0.02f", (float)(100 * cp_time_finish[CP_IDLE]) / ticks); printf("\n"); } Index: head/tools/tools/syscall_timing/syscall_timing.c =================================================================== --- head/tools/tools/syscall_timing/syscall_timing.c (revision 336913) +++ head/tools/tools/syscall_timing/syscall_timing.c (revision 336914) @@ -1,1143 +1,1133 @@ /*- * Copyright (c) 2003-2004, 2010 Robert N. M. Watson * All rights reserved. * * Portions of this software were developed at the University of Cambridge * Computer Laboratory with support from a grant from Google, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef WITH_PTHREAD #include #endif #include #include #include #include #include #include static struct timespec ts_start, ts_end; static int alarm_timeout; static volatile int alarm_fired; -#define timespecsub(vvp, uvp) \ - do { \ - (vvp)->tv_sec -= (uvp)->tv_sec; \ - (vvp)->tv_nsec -= (uvp)->tv_nsec; \ - if ((vvp)->tv_nsec < 0) { \ - (vvp)->tv_sec--; \ - (vvp)->tv_nsec += 1000000000; \ - } \ - } while (0) - #define BENCHMARK_FOREACH(I, NUM) for (I = 0; I < NUM && alarm_fired == 0; I++) static void alarm_handler(int signum __unused) { alarm_fired = 1; } static void benchmark_start(void) { int error; alarm_fired = 0; if (alarm_timeout) { signal(SIGALRM, alarm_handler); alarm(alarm_timeout); } error = clock_gettime(CLOCK_REALTIME, &ts_start); assert(error == 0); } static void benchmark_stop(void) { int error; error = clock_gettime(CLOCK_REALTIME, &ts_end); assert(error == 0); } static uintmax_t test_access(uintmax_t num, uintmax_t int_arg __unused, const char *path) { uintmax_t i; int fd; fd = access(path, O_RDONLY); if (fd < 0) err(-1, "test_access: %s", path); close(fd); benchmark_start(); BENCHMARK_FOREACH(i, num) { access(path, O_RDONLY); close(fd); } benchmark_stop(); return (i); } static uintmax_t test_bad_open(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused) { uintmax_t i; benchmark_start(); BENCHMARK_FOREACH(i, num) { open("", O_RDONLY); } benchmark_stop(); return (i); } static uintmax_t test_chroot(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused) { uintmax_t i; if (chroot("/") < 0) err(-1, "test_chroot: chroot"); benchmark_start(); BENCHMARK_FOREACH(i, num) { if (chroot("/") < 0) err(-1, "test_chroot: chroot"); } benchmark_stop(); return (i); } static uintmax_t test_clock_gettime(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused) { struct timespec ts; uintmax_t i; benchmark_start(); BENCHMARK_FOREACH(i, num) { (void)clock_gettime(CLOCK_REALTIME, &ts); } benchmark_stop(); return (i); } static uintmax_t test_create_unlink(uintmax_t num, uintmax_t int_arg __unused, const char *path) { uintmax_t i; int fd; (void)unlink(path); fd = open(path, O_RDWR | O_CREAT, 0600); if (fd < 0) err(-1, "test_create_unlink: create: %s", path); close(fd); if (unlink(path) < 0) err(-1, "test_create_unlink: unlink: %s", path); benchmark_start(); BENCHMARK_FOREACH(i, num) { fd = open(path, O_RDWR | O_CREAT, 0600); if (fd < 0) err(-1, "test_create_unlink: create: %s", path); close(fd); if (unlink(path) < 0) err(-1, "test_create_unlink: unlink: %s", path); } benchmark_stop(); return (i); } static uintmax_t test_fork(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused) { pid_t pid; uintmax_t i; pid = fork(); if (pid < 0) err(-1, "test_fork: fork"); if (pid == 0) _exit(0); if (waitpid(pid, NULL, 0) < 0) err(-1, "test_fork: waitpid"); benchmark_start(); BENCHMARK_FOREACH(i, num) { pid = fork(); if (pid < 0) err(-1, "test_fork: fork"); if (pid == 0) _exit(0); if (waitpid(pid, NULL, 0) < 0) err(-1, "test_fork: waitpid"); } benchmark_stop(); return (i); } #define USR_BIN_TRUE "/usr/bin/true" static char *execve_args[] = { __DECONST(char *, USR_BIN_TRUE), NULL}; extern char **environ; static uintmax_t test_fork_exec(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused) { pid_t pid; uintmax_t i; pid = fork(); if (pid < 0) err(-1, "test_fork_exec: fork"); if (pid == 0) { (void)execve(USR_BIN_TRUE, execve_args, environ); err(-1, "execve"); } if (waitpid(pid, NULL, 0) < 0) err(-1, "test_fork: waitpid"); benchmark_start(); BENCHMARK_FOREACH(i, num) { pid = fork(); if (pid < 0) err(-1, "test_fork_exec: fork"); if (pid == 0) { (void)execve(USR_BIN_TRUE, execve_args, environ); err(-1, "test_fork_exec: execve"); } if (waitpid(pid, NULL, 0) < 0) err(-1, "test_fork_exec: waitpid"); } benchmark_stop(); return (i); } static uintmax_t test_getppid(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused) { uintmax_t i; /* * This is process-local, but can change, so will require a * lock. */ benchmark_start(); BENCHMARK_FOREACH(i, num) { getppid(); } benchmark_stop(); return (i); } static uintmax_t test_getpriority(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused) { uintmax_t i; benchmark_start(); BENCHMARK_FOREACH(i, num) { (void)getpriority(PRIO_PROCESS, 0); } benchmark_stop(); return (i); } /* * The point of this one is to figure out the cost of a call into libc, * through PLT, and back. */ static uintmax_t test_getprogname(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused) { uintmax_t i; benchmark_start(); BENCHMARK_FOREACH(i, num) { (void)getprogname(); } benchmark_stop(); return (i); } static uintmax_t test_getresuid(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused) { uid_t ruid, euid, suid; uintmax_t i; benchmark_start(); BENCHMARK_FOREACH(i, num) { (void)getresuid(&ruid, &euid, &suid); } benchmark_stop(); return (i); } static uintmax_t test_gettimeofday(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused) { struct timeval tv; uintmax_t i; benchmark_start(); BENCHMARK_FOREACH(i, num) { (void)gettimeofday(&tv, NULL); } benchmark_stop(); return (i); } static uintmax_t test_getuid(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused) { uintmax_t i; /* * Thread-local data should require no locking if system * call is MPSAFE. */ benchmark_start(); BENCHMARK_FOREACH(i, num) { getuid(); } benchmark_stop(); return (i); } static uintmax_t test_memcpy(uintmax_t num, uintmax_t int_arg, const char *path __unused) { char buf[int_arg], buf2[int_arg]; uintmax_t i; benchmark_start(); BENCHMARK_FOREACH(i, num) { /* * Copy the memory there and back, to match the total amount * moved by pipeping/pipepingtd tests. */ memcpy(buf2, buf, int_arg); memcpy(buf, buf2, int_arg); } benchmark_stop(); return (i); } static uintmax_t test_open_close(uintmax_t num, uintmax_t int_arg __unused, const char *path) { uintmax_t i; int fd; fd = open(path, O_RDONLY); if (fd < 0) err(-1, "test_open_close: %s", path); close(fd); benchmark_start(); BENCHMARK_FOREACH(i, num) { fd = open(path, O_RDONLY); if (fd < 0) err(-1, "test_open_close: %s", path); close(fd); } benchmark_stop(); return (i); } static uintmax_t test_open_read_close(uintmax_t num, uintmax_t int_arg, const char *path) { char buf[int_arg]; uintmax_t i; int fd; fd = open(path, O_RDONLY); if (fd < 0) err(-1, "test_open_read_close: %s", path); (void)read(fd, buf, int_arg); close(fd); benchmark_start(); BENCHMARK_FOREACH(i, num) { fd = open(path, O_RDONLY); if (fd < 0) err(-1, "test_open_read_close: %s", path); (void)read(fd, buf, int_arg); close(fd); } benchmark_stop(); return (i); } static uintmax_t test_pipe(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused) { int fd[2]; uintmax_t i; /* * pipe creation is expensive, as it will allocate a new file * descriptor, allocate a new pipe, hook it all up, and return. * Destroying is also expensive, as we now have to free up * the file descriptors and return the pipe. */ if (pipe(fd) < 0) err(-1, "test_pipe: pipe"); close(fd[0]); close(fd[1]); benchmark_start(); BENCHMARK_FOREACH(i, num) { if (pipe(fd) == -1) err(-1, "test_pipe: pipe"); close(fd[0]); close(fd[1]); } benchmark_stop(); return (i); } static void readx(int fd, char *buf, size_t size) { ssize_t ret; do { ret = read(fd, buf, size); if (ret == -1) err(1, "read"); assert((size_t)ret <= size); size -= ret; buf += ret; } while (size > 0); } static void writex(int fd, const char *buf, size_t size) { ssize_t ret; do { ret = write(fd, buf, size); if (ret == -1) err(1, "write"); assert((size_t)ret <= size); size -= ret; buf += ret; } while (size > 0); } static uintmax_t test_pipeping(uintmax_t num, uintmax_t int_arg, const char *path __unused) { char buf[int_arg]; uintmax_t i; pid_t pid; int fd[2], procfd; if (pipe(fd) < 0) err(-1, "pipe"); pid = pdfork(&procfd, 0); if (pid < 0) err(1, "pdfork"); if (pid == 0) { close(fd[0]); for (;;) { readx(fd[1], buf, int_arg); writex(fd[1], buf, int_arg); } } close(fd[1]); benchmark_start(); BENCHMARK_FOREACH(i, num) { writex(fd[0], buf, int_arg); readx(fd[0], buf, int_arg); } benchmark_stop(); close(procfd); return (i); } #ifdef WITH_PTHREAD struct pipepingtd_ctx { int fd; uintmax_t int_arg; }; static void * pipepingtd_proc(void *arg) { struct pipepingtd_ctx *ctxp; int fd; void *buf; uintmax_t int_arg; ctxp = arg; fd = ctxp->fd; int_arg = ctxp->int_arg; buf = malloc(int_arg); if (buf == NULL) err(1, "malloc"); for (;;) { readx(fd, buf, int_arg); writex(fd, buf, int_arg); } } static uintmax_t test_pipepingtd(uintmax_t num, uintmax_t int_arg, const char *path __unused) { struct pipepingtd_ctx ctx; char buf[int_arg]; pthread_t td; uintmax_t i; int error, fd[2]; if (pipe(fd) < 0) err(-1, "pipe"); ctx.fd = fd[1]; ctx.int_arg = int_arg; error = pthread_create(&td, NULL, pipepingtd_proc, &ctx); if (error != 0) err(1, "pthread_create"); benchmark_start(); BENCHMARK_FOREACH(i, num) { writex(fd[0], buf, int_arg); readx(fd[0], buf, int_arg); } benchmark_stop(); pthread_cancel(td); return (i); } #endif /* WITH_PTHREAD */ static uintmax_t test_read(uintmax_t num, uintmax_t int_arg, const char *path) { char buf[int_arg]; uintmax_t i; int fd; fd = open(path, O_RDONLY); if (fd < 0) err(-1, "test_open_read: %s", path); (void)pread(fd, buf, int_arg, 0); benchmark_start(); BENCHMARK_FOREACH(i, num) { (void)pread(fd, buf, int_arg, 0); } benchmark_stop(); close(fd); return (i); } static uintmax_t test_select(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused) { fd_set readfds, writefds, exceptfds; struct timeval tv; uintmax_t i; FD_ZERO(&readfds); FD_ZERO(&writefds); FD_ZERO(&exceptfds); tv.tv_sec = 0; tv.tv_usec = 0; benchmark_start(); BENCHMARK_FOREACH(i, num) { (void)select(0, &readfds, &writefds, &exceptfds, &tv); } benchmark_stop(); return (i); } static uintmax_t test_semaping(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused) { uintmax_t i; pid_t pid; sem_t *buf; int error, j, procfd; buf = mmap(0, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANON | MAP_SHARED, -1, 0); if (buf == MAP_FAILED) err(1, "mmap"); for (j = 0; j < 2; j++) { error = sem_init(&buf[j], 1, 0); if (error != 0) err(1, "sem_init"); } pid = pdfork(&procfd, 0); if (pid < 0) err(1, "pdfork"); if (pid == 0) { for (;;) { error = sem_wait(&buf[0]); if (error != 0) err(1, "sem_wait"); error = sem_post(&buf[1]); if (error != 0) err(1, "sem_post"); } } benchmark_start(); BENCHMARK_FOREACH(i, num) { error = sem_post(&buf[0]); if (error != 0) err(1, "sem_post"); error = sem_wait(&buf[1]); if (error != 0) err(1, "sem_wait"); } benchmark_stop(); close(procfd); for (j = 0; j < 2; j++) { error = sem_destroy(&buf[j]); if (error != 0) err(1, "sem_destroy"); } error = munmap(buf, PAGE_SIZE); if (error != 0) err(1, "munmap"); return (i); } static uintmax_t test_setuid(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused) { uid_t uid; uintmax_t i; uid = getuid(); if (setuid(uid) < 0) err(-1, "test_setuid: setuid"); benchmark_start(); BENCHMARK_FOREACH(i, num) { if (setuid(uid) < 0) err(-1, "test_setuid: setuid"); } benchmark_stop(); return (i); } static uintmax_t test_shmfd(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused) { uintmax_t i; int shmfd; shmfd = shm_open(SHM_ANON, O_CREAT | O_RDWR, 0600); if (shmfd < 0) err(-1, "test_shmfd: shm_open"); close(shmfd); benchmark_start(); BENCHMARK_FOREACH(i, num) { shmfd = shm_open(SHM_ANON, O_CREAT | O_RDWR, 0600); if (shmfd < 0) err(-1, "test_shmfd: shm_open"); close(shmfd); } benchmark_stop(); return (i); } static uintmax_t test_shmfd_dup(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused) { uintmax_t i; int fd, shmfd; shmfd = shm_open(SHM_ANON, O_CREAT | O_RDWR, 0600); if (shmfd < 0) err(-1, "test_shmfd_dup: shm_open"); fd = dup(shmfd); if (fd >= 0) close(fd); benchmark_start(); BENCHMARK_FOREACH(i, num) { fd = dup(shmfd); if (fd >= 0) close(fd); } benchmark_stop(); close(shmfd); return (i); } static uintmax_t test_shmfd_fstat(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused) { struct stat sb; uintmax_t i; int shmfd; shmfd = shm_open(SHM_ANON, O_CREAT | O_RDWR, 0600); if (shmfd < 0) err(-1, "test_shmfd_fstat: shm_open"); if (fstat(shmfd, &sb) < 0) err(-1, "test_shmfd_fstat: fstat"); benchmark_start(); BENCHMARK_FOREACH(i, num) { (void)fstat(shmfd, &sb); } benchmark_stop(); close(shmfd); return (i); } static uintmax_t test_socket_stream(uintmax_t num, uintmax_t int_arg, const char *path __unused) { uintmax_t i; int so; so = socket(int_arg, SOCK_STREAM, 0); if (so < 0) err(-1, "test_socket_stream: socket"); close(so); benchmark_start(); BENCHMARK_FOREACH(i, num) { so = socket(int_arg, SOCK_STREAM, 0); if (so == -1) err(-1, "test_socket_stream: socket"); close(so); } benchmark_stop(); return (i); } static uintmax_t test_socket_dgram(uintmax_t num, uintmax_t int_arg, const char *path __unused) { uintmax_t i; int so; so = socket(int_arg, SOCK_DGRAM, 0); if (so < 0) err(-1, "test_socket_dgram: socket"); close(so); benchmark_start(); BENCHMARK_FOREACH(i, num) { so = socket(int_arg, SOCK_DGRAM, 0); if (so == -1) err(-1, "test_socket_dgram: socket"); close(so); } benchmark_stop(); return (i); } static uintmax_t test_socketpair_stream(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused) { uintmax_t i; int so[2]; if (socketpair(PF_LOCAL, SOCK_STREAM, 0, so) == -1) err(-1, "test_socketpair_stream: socketpair"); close(so[0]); close(so[1]); benchmark_start(); BENCHMARK_FOREACH(i, num) { if (socketpair(PF_LOCAL, SOCK_STREAM, 0, so) == -1) err(-1, "test_socketpair_stream: socketpair"); close(so[0]); close(so[1]); } benchmark_stop(); return (i); } static uintmax_t test_socketpair_dgram(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused) { uintmax_t i; int so[2]; if (socketpair(PF_LOCAL, SOCK_DGRAM, 0, so) == -1) err(-1, "test_socketpair_dgram: socketpair"); close(so[0]); close(so[1]); benchmark_start(); BENCHMARK_FOREACH(i, num) { if (socketpair(PF_LOCAL, SOCK_DGRAM, 0, so) == -1) err(-1, "test_socketpair_dgram: socketpair"); close(so[0]); close(so[1]); } benchmark_stop(); return (i); } static uintmax_t test_vfork(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused) { pid_t pid; uintmax_t i; pid = vfork(); if (pid < 0) err(-1, "test_vfork: vfork"); if (pid == 0) _exit(0); if (waitpid(pid, NULL, 0) < 0) err(-1, "test_vfork: waitpid"); benchmark_start(); BENCHMARK_FOREACH(i, num) { pid = vfork(); if (pid < 0) err(-1, "test_vfork: vfork"); if (pid == 0) _exit(0); if (waitpid(pid, NULL, 0) < 0) err(-1, "test_vfork: waitpid"); } benchmark_stop(); return (i); } static uintmax_t test_vfork_exec(uintmax_t num, uintmax_t int_arg __unused, const char *path __unused) { pid_t pid; uintmax_t i; pid = vfork(); if (pid < 0) err(-1, "test_vfork_exec: vfork"); if (pid == 0) { (void)execve(USR_BIN_TRUE, execve_args, environ); err(-1, "test_vfork_exec: execve"); } if (waitpid(pid, NULL, 0) < 0) err(-1, "test_vfork_exec: waitpid"); benchmark_start(); BENCHMARK_FOREACH(i, num) { pid = vfork(); if (pid < 0) err(-1, "test_vfork_exec: vfork"); if (pid == 0) { (void)execve(USR_BIN_TRUE, execve_args, environ); err(-1, "execve"); } if (waitpid(pid, NULL, 0) < 0) err(-1, "test_vfork_exec: waitpid"); } benchmark_stop(); return (i); } struct test { const char *t_name; uintmax_t (*t_func)(uintmax_t, uintmax_t, const char *); int t_flags; uintmax_t t_int; }; #define FLAG_PATH 0x00000001 static const struct test tests[] = { { "access", test_access, .t_flags = FLAG_PATH }, { "bad_open", test_bad_open, .t_flags = 0 }, { "chroot", test_chroot, .t_flags = 0 }, { "clock_gettime", test_clock_gettime, .t_flags = 0 }, { "create_unlink", test_create_unlink, .t_flags = FLAG_PATH }, { "fork", test_fork, .t_flags = 0 }, { "fork_exec", test_fork_exec, .t_flags = 0 }, { "getppid", test_getppid, .t_flags = 0 }, { "getpriority", test_getpriority, .t_flags = 0 }, { "getprogname", test_getprogname, .t_flags = 0 }, { "getresuid", test_getresuid, .t_flags = 0 }, { "gettimeofday", test_gettimeofday, .t_flags = 0 }, { "getuid", test_getuid, .t_flags = 0 }, { "memcpy_1", test_memcpy, .t_flags = 0, .t_int = 1 }, { "memcpy_10", test_memcpy, .t_flags = 0, .t_int = 10 }, { "memcpy_100", test_memcpy, .t_flags = 0, .t_int = 100 }, { "memcpy_1000", test_memcpy, .t_flags = 0, .t_int = 1000 }, { "memcpy_10000", test_memcpy, .t_flags = 0, .t_int = 10000 }, { "memcpy_100000", test_memcpy, .t_flags = 0, .t_int = 100000 }, { "memcpy_1000000", test_memcpy, .t_flags = 0, .t_int = 1000000 }, { "open_close", test_open_close, .t_flags = FLAG_PATH }, { "open_read_close_1", test_open_read_close, .t_flags = FLAG_PATH, .t_int = 1 }, { "open_read_close_10", test_open_read_close, .t_flags = FLAG_PATH, .t_int = 10 }, { "open_read_close_100", test_open_read_close, .t_flags = FLAG_PATH, .t_int = 100 }, { "open_read_close_1000", test_open_read_close, .t_flags = FLAG_PATH, .t_int = 1000 }, { "open_read_close_10000", test_open_read_close, .t_flags = FLAG_PATH, .t_int = 10000 }, { "open_read_close_100000", test_open_read_close, .t_flags = FLAG_PATH, .t_int = 100000 }, { "open_read_close_1000000", test_open_read_close, .t_flags = FLAG_PATH, .t_int = 1000000 }, { "pipe", test_pipe, .t_flags = 0 }, { "pipeping_1", test_pipeping, .t_flags = 0, .t_int = 1 }, { "pipeping_10", test_pipeping, .t_flags = 0, .t_int = 10 }, { "pipeping_100", test_pipeping, .t_flags = 0, .t_int = 100 }, { "pipeping_1000", test_pipeping, .t_flags = 0, .t_int = 1000 }, { "pipeping_10000", test_pipeping, .t_flags = 0, .t_int = 10000 }, { "pipeping_100000", test_pipeping, .t_flags = 0, .t_int = 100000 }, { "pipeping_1000000", test_pipeping, .t_flags = 0, .t_int = 1000000 }, #ifdef WITH_PTHREAD { "pipepingtd_1", test_pipepingtd, .t_flags = 0, .t_int = 1 }, { "pipepingtd_10", test_pipepingtd, .t_flags = 0, .t_int = 10 }, { "pipepingtd_100", test_pipepingtd, .t_flags = 0, .t_int = 100 }, { "pipepingtd_1000", test_pipepingtd, .t_flags = 0, .t_int = 1000 }, { "pipepingtd_10000", test_pipepingtd, .t_flags = 0, .t_int = 10000 }, { "pipepingtd_100000", test_pipepingtd, .t_flags = 0, .t_int = 100000 }, { "pipepingtd_1000000", test_pipepingtd, .t_flags = 0, .t_int = 1000000 }, #endif { "read_1", test_read, .t_flags = FLAG_PATH, .t_int = 1 }, { "read_10", test_read, .t_flags = FLAG_PATH, .t_int = 10 }, { "read_100", test_read, .t_flags = FLAG_PATH, .t_int = 100 }, { "read_1000", test_read, .t_flags = FLAG_PATH, .t_int = 1000 }, { "read_10000", test_read, .t_flags = FLAG_PATH, .t_int = 10000 }, { "read_100000", test_read, .t_flags = FLAG_PATH, .t_int = 100000 }, { "read_1000000", test_read, .t_flags = FLAG_PATH, .t_int = 1000000 }, { "select", test_select, .t_flags = 0 }, { "semaping", test_semaping, .t_flags = 0 }, { "setuid", test_setuid, .t_flags = 0 }, { "shmfd", test_shmfd, .t_flags = 0 }, { "shmfd_dup", test_shmfd_dup, .t_flags = 0 }, { "shmfd_fstat", test_shmfd_fstat, .t_flags = 0 }, { "socket_local_stream", test_socket_stream, .t_int = PF_LOCAL }, { "socket_local_dgram", test_socket_dgram, .t_int = PF_LOCAL }, { "socketpair_stream", test_socketpair_stream, .t_flags = 0 }, { "socketpair_dgram", test_socketpair_dgram, .t_flags = 0 }, { "socket_tcp", test_socket_stream, .t_int = PF_INET }, { "socket_udp", test_socket_dgram, .t_int = PF_INET }, { "vfork", test_vfork, .t_flags = 0 }, { "vfork_exec", test_vfork_exec, .t_flags = 0 }, }; static const int tests_count = sizeof(tests) / sizeof(tests[0]); static void usage(void) { int i; fprintf(stderr, "syscall_timing [-i iterations] [-l loops] " "[-p path] [-s seconds] test\n"); for (i = 0; i < tests_count; i++) fprintf(stderr, " %s\n", tests[i].t_name); exit(-1); } int main(int argc, char *argv[]) { struct timespec ts_res; const struct test *the_test; const char *path; char *tmp_dir, *tmp_path; long long ll; char *endp; int ch, fd, error, i, j, rv; uintmax_t iterations, k, loops; alarm_timeout = 1; iterations = 0; loops = 10; path = NULL; tmp_path = NULL; while ((ch = getopt(argc, argv, "i:l:p:s:")) != -1) { switch (ch) { case 'i': ll = strtol(optarg, &endp, 10); if (*endp != 0 || ll < 1) usage(); iterations = ll; break; case 'l': ll = strtol(optarg, &endp, 10); if (*endp != 0 || ll < 1 || ll > 100000) usage(); loops = ll; break; case 'p': path = optarg; break; case 's': ll = strtol(optarg, &endp, 10); if (*endp != 0 || ll < 1 || ll > 60*60) usage(); alarm_timeout = ll; break; case '?': default: usage(); } } argc -= optind; argv += optind; if (iterations < 1 && alarm_timeout < 1) usage(); if (iterations < 1) iterations = UINT64_MAX; if (loops < 1) loops = 1; if (argc < 1) usage(); /* * Validate test list and that, if a path is required, it is * defined. */ for (j = 0; j < argc; j++) { the_test = NULL; for (i = 0; i < tests_count; i++) { if (strcmp(argv[j], tests[i].t_name) == 0) the_test = &tests[i]; } if (the_test == NULL) usage(); if ((the_test->t_flags & FLAG_PATH) && (path == NULL)) { tmp_dir = strdup("/tmp/syscall_timing.XXXXXXXX"); if (tmp_dir == NULL) err(1, "strdup"); tmp_dir = mkdtemp(tmp_dir); if (tmp_dir == NULL) err(1, "mkdtemp"); rv = asprintf(&tmp_path, "%s/testfile", tmp_dir); if (rv <= 0) err(1, "asprintf"); } } error = clock_getres(CLOCK_REALTIME, &ts_res); assert(error == 0); printf("Clock resolution: %ju.%09ju\n", (uintmax_t)ts_res.tv_sec, (uintmax_t)ts_res.tv_nsec); printf("test\tloop\ttime\titerations\tperiteration\n"); for (j = 0; j < argc; j++) { uintmax_t calls, nsecsperit; the_test = NULL; for (i = 0; i < tests_count; i++) { if (strcmp(argv[j], tests[i].t_name) == 0) the_test = &tests[i]; } if (tmp_path != NULL) { fd = open(tmp_path, O_WRONLY | O_CREAT, 0700); if (fd < 0) err(1, "cannot open %s", tmp_path); error = ftruncate(fd, 1000000); if (error != 0) err(1, "ftruncate"); error = close(fd); if (error != 0) err(1, "close"); path = tmp_path; } /* * Run one warmup, then do the real thing (loops) times. */ the_test->t_func(iterations, the_test->t_int, path); calls = 0; for (k = 0; k < loops; k++) { calls = the_test->t_func(iterations, the_test->t_int, path); - timespecsub(&ts_end, &ts_start); + timespecsub(&ts_end, &ts_start, &ts_end); printf("%s\t%ju\t", the_test->t_name, k); printf("%ju.%09ju\t%ju\t", (uintmax_t)ts_end.tv_sec, (uintmax_t)ts_end.tv_nsec, calls); /* * Note. This assumes that each iteration takes less than * a second, and that our total nanoseconds doesn't exceed * the room in our arithmetic unit. Fine for system calls, * but not for long things. */ nsecsperit = ts_end.tv_sec * 1000000000; nsecsperit += ts_end.tv_nsec; nsecsperit /= calls; printf("0.%09ju\n", (uintmax_t)nsecsperit); } } if (tmp_path != NULL) { error = unlink(tmp_path); if (error != 0 && errno != ENOENT) warn("cannot unlink %s", tmp_path); error = rmdir(tmp_dir); if (error != 0) warn("cannot rmdir %s", tmp_dir); } return (0); } Index: head/usr.bin/truss/setup.c =================================================================== --- head/usr.bin/truss/setup.c (revision 336913) +++ head/usr.bin/truss/setup.c (revision 336914) @@ -1,748 +1,749 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright 1997 Sean Eric Fagan * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Sean Eric Fagan * 4. Neither the name of the author may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * Various setup functions for truss. Not the cleanest-written code, * I'm afraid. */ #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "truss.h" #include "syscall.h" #include "extern.h" SET_DECLARE(procabi, struct procabi); static sig_atomic_t detaching; static void enter_syscall(struct trussinfo *, struct threadinfo *, struct ptrace_lwpinfo *); static void new_proc(struct trussinfo *, pid_t, lwpid_t); /* * setup_and_wait() is called to start a process. All it really does * is fork(), enable tracing in the child, and then exec the given * command. At that point, the child process stops, and the parent * can wake up and deal with it. */ void setup_and_wait(struct trussinfo *info, char *command[]) { pid_t pid; pid = vfork(); if (pid == -1) err(1, "fork failed"); if (pid == 0) { /* Child */ ptrace(PT_TRACE_ME, 0, 0, 0); execvp(command[0], command); err(1, "execvp %s", command[0]); } /* Only in the parent here */ if (waitpid(pid, NULL, 0) < 0) err(1, "unexpect stop in waitpid"); new_proc(info, pid, 0); } /* * start_tracing is called to attach to an existing process. */ void start_tracing(struct trussinfo *info, pid_t pid) { int ret, retry; retry = 10; do { ret = ptrace(PT_ATTACH, pid, NULL, 0); usleep(200); } while (ret && retry-- > 0); if (ret) err(1, "can not attach to target process"); if (waitpid(pid, NULL, 0) < 0) err(1, "Unexpect stop in waitpid"); new_proc(info, pid, 0); } /* * Restore a process back to it's pre-truss state. * Called for SIGINT, SIGTERM, SIGQUIT. This only * applies if truss was told to monitor an already-existing * process. */ void restore_proc(int signo __unused) { detaching = 1; } static void detach_proc(pid_t pid) { /* stop the child so that we can detach */ kill(pid, SIGSTOP); if (waitpid(pid, NULL, 0) < 0) err(1, "Unexpected stop in waitpid"); if (ptrace(PT_DETACH, pid, (caddr_t)1, 0) < 0) err(1, "Can not detach the process"); kill(pid, SIGCONT); } /* * Determine the ABI. This is called after every exec, and when * a process is first monitored. */ static struct procabi * find_abi(pid_t pid) { struct procabi **pabi; size_t len; int error; int mib[4]; char progt[32]; len = sizeof(progt); mib[0] = CTL_KERN; mib[1] = KERN_PROC; mib[2] = KERN_PROC_SV_NAME; mib[3] = pid; error = sysctl(mib, 4, progt, &len, NULL, 0); if (error != 0) err(2, "can not get sysvec name"); SET_FOREACH(pabi, procabi) { if (strcmp((*pabi)->type, progt) == 0) return (*pabi); } warnx("ABI %s for pid %ld is not supported", progt, (long)pid); return (NULL); } static struct threadinfo * new_thread(struct procinfo *p, lwpid_t lwpid) { struct threadinfo *nt; /* * If this happens it means there is a bug in truss. Unfortunately * this will kill any processes truss is attached to. */ LIST_FOREACH(nt, &p->threadlist, entries) { if (nt->tid == lwpid) errx(1, "Duplicate thread for LWP %ld", (long)lwpid); } nt = calloc(1, sizeof(struct threadinfo)); if (nt == NULL) err(1, "calloc() failed"); nt->proc = p; nt->tid = lwpid; LIST_INSERT_HEAD(&p->threadlist, nt, entries); return (nt); } static void free_thread(struct threadinfo *t) { LIST_REMOVE(t, entries); free(t); } static void add_threads(struct trussinfo *info, struct procinfo *p) { struct ptrace_lwpinfo pl; struct threadinfo *t; lwpid_t *lwps; int i, nlwps; nlwps = ptrace(PT_GETNUMLWPS, p->pid, NULL, 0); if (nlwps == -1) err(1, "Unable to fetch number of LWPs"); assert(nlwps > 0); lwps = calloc(nlwps, sizeof(*lwps)); nlwps = ptrace(PT_GETLWPLIST, p->pid, (caddr_t)lwps, nlwps); if (nlwps == -1) err(1, "Unable to fetch LWP list"); for (i = 0; i < nlwps; i++) { t = new_thread(p, lwps[i]); if (ptrace(PT_LWPINFO, lwps[i], (caddr_t)&pl, sizeof(pl)) == -1) err(1, "ptrace(PT_LWPINFO)"); if (pl.pl_flags & PL_FLAG_SCE) { info->curthread = t; enter_syscall(info, t, &pl); } } free(lwps); } static void new_proc(struct trussinfo *info, pid_t pid, lwpid_t lwpid) { struct procinfo *np; /* * If this happens it means there is a bug in truss. Unfortunately * this will kill any processes truss is attached to. */ LIST_FOREACH(np, &info->proclist, entries) { if (np->pid == pid) errx(1, "Duplicate process for pid %ld", (long)pid); } if (info->flags & FOLLOWFORKS) if (ptrace(PT_FOLLOW_FORK, pid, NULL, 1) == -1) err(1, "Unable to follow forks for pid %ld", (long)pid); if (ptrace(PT_LWP_EVENTS, pid, NULL, 1) == -1) err(1, "Unable to enable LWP events for pid %ld", (long)pid); np = calloc(1, sizeof(struct procinfo)); np->pid = pid; np->abi = find_abi(pid); LIST_INIT(&np->threadlist); LIST_INSERT_HEAD(&info->proclist, np, entries); if (lwpid != 0) new_thread(np, lwpid); else add_threads(info, np); } static void free_proc(struct procinfo *p) { struct threadinfo *t, *t2; LIST_FOREACH_SAFE(t, &p->threadlist, entries, t2) { free(t); } LIST_REMOVE(p, entries); free(p); } static void detach_all_procs(struct trussinfo *info) { struct procinfo *p, *p2; LIST_FOREACH_SAFE(p, &info->proclist, entries, p2) { detach_proc(p->pid); free_proc(p); } } static struct procinfo * find_proc(struct trussinfo *info, pid_t pid) { struct procinfo *np; LIST_FOREACH(np, &info->proclist, entries) { if (np->pid == pid) return (np); } return (NULL); } /* * Change curthread member based on (pid, lwpid). */ static void find_thread(struct trussinfo *info, pid_t pid, lwpid_t lwpid) { struct procinfo *np; struct threadinfo *nt; np = find_proc(info, pid); assert(np != NULL); LIST_FOREACH(nt, &np->threadlist, entries) { if (nt->tid == lwpid) { info->curthread = nt; return; } } errx(1, "could not find thread"); } /* * When a process exits, it should have exactly one thread left. * All of the other threads should have reported thread exit events. */ static void find_exit_thread(struct trussinfo *info, pid_t pid) { struct procinfo *p; p = find_proc(info, pid); assert(p != NULL); info->curthread = LIST_FIRST(&p->threadlist); assert(info->curthread != NULL); assert(LIST_NEXT(info->curthread, entries) == NULL); } static void alloc_syscall(struct threadinfo *t, struct ptrace_lwpinfo *pl) { u_int i; assert(t->in_syscall == 0); assert(t->cs.number == 0); assert(t->cs.sc == NULL); assert(t->cs.nargs == 0); for (i = 0; i < nitems(t->cs.s_args); i++) assert(t->cs.s_args[i] == NULL); memset(t->cs.args, 0, sizeof(t->cs.args)); t->cs.number = pl->pl_syscall_code; t->in_syscall = 1; } static void free_syscall(struct threadinfo *t) { u_int i; for (i = 0; i < t->cs.nargs; i++) free(t->cs.s_args[i]); memset(&t->cs, 0, sizeof(t->cs)); t->in_syscall = 0; } static void enter_syscall(struct trussinfo *info, struct threadinfo *t, struct ptrace_lwpinfo *pl) { struct syscall *sc; u_int i, narg; alloc_syscall(t, pl); narg = MIN(pl->pl_syscall_narg, nitems(t->cs.args)); if (narg != 0 && t->proc->abi->fetch_args(info, narg) != 0) { free_syscall(t); return; } sc = get_syscall(t, t->cs.number, narg); if (sc->unknown) fprintf(info->outfile, "-- UNKNOWN %s SYSCALL %d --\n", t->proc->abi->type, t->cs.number); t->cs.nargs = sc->nargs; assert(sc->nargs <= nitems(t->cs.s_args)); t->cs.sc = sc; /* * At this point, we set up the system call arguments. * We ignore any OUT ones, however -- those are arguments that * are set by the system call, and so are probably meaningless * now. This doesn't currently support arguments that are * passed in *and* out, however. */ #if DEBUG fprintf(stderr, "syscall %s(", sc->name); #endif for (i = 0; i < t->cs.nargs; i++) { #if DEBUG fprintf(stderr, "0x%lx%s", t->cs.args[sc->args[i].offset], i < (t->cs.nargs - 1) ? "," : ""); #endif if (!(sc->args[i].type & OUT)) { t->cs.s_args[i] = print_arg(&sc->args[i], t->cs.args, 0, info); } } #if DEBUG fprintf(stderr, ")\n"); #endif clock_gettime(CLOCK_REALTIME, &t->before); } /* * When a thread exits voluntarily (including when a thread calls * exit() to trigger a process exit), the thread's internal state * holds the arguments passed to the exit system call. When the * thread's exit is reported, log that system call without a return * value. */ static void thread_exit_syscall(struct trussinfo *info) { struct threadinfo *t; t = info->curthread; if (!t->in_syscall) return; clock_gettime(CLOCK_REALTIME, &t->after); print_syscall_ret(info, 0, NULL); free_syscall(t); } static void exit_syscall(struct trussinfo *info, struct ptrace_lwpinfo *pl) { struct threadinfo *t; struct procinfo *p; struct syscall *sc; long retval[2]; u_int i; int errorp; t = info->curthread; if (!t->in_syscall) return; clock_gettime(CLOCK_REALTIME, &t->after); p = t->proc; if (p->abi->fetch_retval(info, retval, &errorp) < 0) { free_syscall(t); return; } sc = t->cs.sc; /* * Here, we only look for arguments that have OUT masked in -- * otherwise, they were handled in enter_syscall(). */ for (i = 0; i < sc->nargs; i++) { char *temp; if (sc->args[i].type & OUT) { /* * If an error occurred, then don't bother * getting the data; it may not be valid. */ if (errorp) { asprintf(&temp, "0x%lx", t->cs.args[sc->args[i].offset]); } else { temp = print_arg(&sc->args[i], t->cs.args, retval, info); } t->cs.s_args[i] = temp; } } print_syscall_ret(info, errorp, retval); free_syscall(t); /* * If the process executed a new image, check the ABI. If the * new ABI isn't supported, stop tracing this process. */ if (pl->pl_flags & PL_FLAG_EXEC) { assert(LIST_NEXT(LIST_FIRST(&p->threadlist), entries) == NULL); p->abi = find_abi(p->pid); if (p->abi == NULL) { if (ptrace(PT_DETACH, p->pid, (caddr_t)1, 0) < 0) err(1, "Can not detach the process"); free_proc(p); } } } int print_line_prefix(struct trussinfo *info) { struct timespec timediff; struct threadinfo *t; int len; len = 0; t = info->curthread; if (info->flags & (FOLLOWFORKS | DISPLAYTIDS)) { if (info->flags & FOLLOWFORKS) len += fprintf(info->outfile, "%5d", t->proc->pid); if ((info->flags & (FOLLOWFORKS | DISPLAYTIDS)) == (FOLLOWFORKS | DISPLAYTIDS)) len += fprintf(info->outfile, " "); if (info->flags & DISPLAYTIDS) len += fprintf(info->outfile, "%6d", t->tid); len += fprintf(info->outfile, ": "); } if (info->flags & ABSOLUTETIMESTAMPS) { - timespecsubt(&t->after, &info->start_time, &timediff); + timespecsub(&t->after, &info->start_time, &timediff); len += fprintf(info->outfile, "%jd.%09ld ", (intmax_t)timediff.tv_sec, timediff.tv_nsec); } if (info->flags & RELATIVETIMESTAMPS) { - timespecsubt(&t->after, &t->before, &timediff); + timespecsub(&t->after, &t->before, &timediff); len += fprintf(info->outfile, "%jd.%09ld ", (intmax_t)timediff.tv_sec, timediff.tv_nsec); } return (len); } static void report_thread_death(struct trussinfo *info) { struct threadinfo *t; t = info->curthread; clock_gettime(CLOCK_REALTIME, &t->after); print_line_prefix(info); fprintf(info->outfile, "\n", (long)t->tid); } static void report_thread_birth(struct trussinfo *info) { struct threadinfo *t; t = info->curthread; clock_gettime(CLOCK_REALTIME, &t->after); t->before = t->after; print_line_prefix(info); fprintf(info->outfile, "\n", (long)t->tid); } static void report_exit(struct trussinfo *info, siginfo_t *si) { struct threadinfo *t; t = info->curthread; clock_gettime(CLOCK_REALTIME, &t->after); print_line_prefix(info); if (si->si_code == CLD_EXITED) fprintf(info->outfile, "process exit, rval = %u\n", si->si_status); else fprintf(info->outfile, "process killed, signal = %u%s\n", si->si_status, si->si_code == CLD_DUMPED ? " (core dumped)" : ""); } static void report_new_child(struct trussinfo *info) { struct threadinfo *t; t = info->curthread; clock_gettime(CLOCK_REALTIME, &t->after); t->before = t->after; print_line_prefix(info); fprintf(info->outfile, "\n"); } void decode_siginfo(FILE *fp, siginfo_t *si) { const char *str; fprintf(fp, " code="); str = sysdecode_sigcode(si->si_signo, si->si_code); if (str == NULL) fprintf(fp, "%d", si->si_code); else fprintf(fp, "%s", str); switch (si->si_code) { case SI_NOINFO: break; case SI_QUEUE: fprintf(fp, " value=%p", si->si_value.sival_ptr); /* FALLTHROUGH */ case SI_USER: case SI_LWP: fprintf(fp, " pid=%jd uid=%jd", (intmax_t)si->si_pid, (intmax_t)si->si_uid); break; case SI_TIMER: fprintf(fp, " value=%p", si->si_value.sival_ptr); fprintf(fp, " timerid=%d", si->si_timerid); fprintf(fp, " overrun=%d", si->si_overrun); if (si->si_errno != 0) fprintf(fp, " errno=%d", si->si_errno); break; case SI_ASYNCIO: fprintf(fp, " value=%p", si->si_value.sival_ptr); break; case SI_MESGQ: fprintf(fp, " value=%p", si->si_value.sival_ptr); fprintf(fp, " mqd=%d", si->si_mqd); break; default: switch (si->si_signo) { case SIGILL: case SIGFPE: case SIGSEGV: case SIGBUS: fprintf(fp, " trapno=%d", si->si_trapno); fprintf(fp, " addr=%p", si->si_addr); break; case SIGCHLD: fprintf(fp, " pid=%jd uid=%jd", (intmax_t)si->si_pid, (intmax_t)si->si_uid); fprintf(fp, " status=%d", si->si_status); break; } } } static void report_signal(struct trussinfo *info, siginfo_t *si, struct ptrace_lwpinfo *pl) { struct threadinfo *t; const char *signame; t = info->curthread; clock_gettime(CLOCK_REALTIME, &t->after); print_line_prefix(info); signame = sysdecode_signal(si->si_status); if (signame == NULL) signame = "?"; fprintf(info->outfile, "SIGNAL %u (%s)", si->si_status, signame); if (pl->pl_event == PL_EVENT_SIGNAL && pl->pl_flags & PL_FLAG_SI) decode_siginfo(info->outfile, &pl->pl_siginfo); fprintf(info->outfile, "\n"); } /* * Wait for events until all the processes have exited or truss has been * asked to stop. */ void eventloop(struct trussinfo *info) { struct ptrace_lwpinfo pl; siginfo_t si; int pending_signal; while (!LIST_EMPTY(&info->proclist)) { if (detaching) { detach_all_procs(info); return; } if (waitid(P_ALL, 0, &si, WTRAPPED | WEXITED) == -1) { if (errno == EINTR) continue; err(1, "Unexpected error from waitid"); } assert(si.si_signo == SIGCHLD); switch (si.si_code) { case CLD_EXITED: case CLD_KILLED: case CLD_DUMPED: find_exit_thread(info, si.si_pid); if ((info->flags & COUNTONLY) == 0) { if (si.si_code == CLD_EXITED) thread_exit_syscall(info); report_exit(info, &si); } free_proc(info->curthread->proc); info->curthread = NULL; break; case CLD_TRAPPED: if (ptrace(PT_LWPINFO, si.si_pid, (caddr_t)&pl, sizeof(pl)) == -1) err(1, "ptrace(PT_LWPINFO)"); if (pl.pl_flags & PL_FLAG_CHILD) { new_proc(info, si.si_pid, pl.pl_lwpid); assert(LIST_FIRST(&info->proclist)->abi != NULL); } else if (pl.pl_flags & PL_FLAG_BORN) new_thread(find_proc(info, si.si_pid), pl.pl_lwpid); find_thread(info, si.si_pid, pl.pl_lwpid); if (si.si_status == SIGTRAP && (pl.pl_flags & (PL_FLAG_BORN|PL_FLAG_EXITED| PL_FLAG_SCE|PL_FLAG_SCX)) != 0) { if (pl.pl_flags & PL_FLAG_BORN) { if ((info->flags & COUNTONLY) == 0) report_thread_birth(info); } else if (pl.pl_flags & PL_FLAG_EXITED) { if ((info->flags & COUNTONLY) == 0) report_thread_death(info); free_thread(info->curthread); info->curthread = NULL; } else if (pl.pl_flags & PL_FLAG_SCE) enter_syscall(info, info->curthread, &pl); else if (pl.pl_flags & PL_FLAG_SCX) exit_syscall(info, &pl); pending_signal = 0; } else if (pl.pl_flags & PL_FLAG_CHILD) { if ((info->flags & COUNTONLY) == 0) report_new_child(info); pending_signal = 0; } else { if ((info->flags & NOSIGS) == 0) report_signal(info, &si, &pl); pending_signal = si.si_status; } ptrace(PT_SYSCALL, si.si_pid, (caddr_t)1, pending_signal); break; case CLD_STOPPED: errx(1, "waitid reported CLD_STOPPED"); case CLD_CONTINUED: break; } } } Index: head/usr.bin/truss/syscalls.c =================================================================== --- head/usr.bin/truss/syscalls.c (revision 336913) +++ head/usr.bin/truss/syscalls.c (revision 336914) @@ -1,2709 +1,2710 @@ /*- * SPDX-License-Identifier: BSD-4-Clause * * Copyright 1997 Sean Eric Fagan * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Sean Eric Fagan * 4. Neither the name of the author may be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /* * This file has routines used to print out system calls and their * arguments. */ #include #include #define _WANT_FREEBSD11_KEVENT #include #include #include #include #include #include #define _WANT_FREEBSD11_STAT #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "truss.h" #include "extern.h" #include "syscall.h" /* * This should probably be in its own file, sorted alphabetically. */ static struct syscall decoded_syscalls[] = { /* Native ABI */ { .name = "__acl_aclcheck_fd", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Acltype, 1 }, { Ptr, 2 } } }, { .name = "__acl_aclcheck_file", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Acltype, 1 }, { Ptr, 2 } } }, { .name = "__acl_aclcheck_link", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Acltype, 1 }, { Ptr, 2 } } }, { .name = "__acl_delete_fd", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Acltype, 1 } } }, { .name = "__acl_delete_file", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Acltype, 1 } } }, { .name = "__acl_delete_link", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Acltype, 1 } } }, { .name = "__acl_get_fd", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Acltype, 1 }, { Ptr, 2 } } }, { .name = "__acl_get_file", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Acltype, 1 }, { Ptr, 2 } } }, { .name = "__acl_get_link", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Acltype, 1 }, { Ptr, 2 } } }, { .name = "__acl_set_fd", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Acltype, 1 }, { Ptr, 2 } } }, { .name = "__acl_set_file", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Acltype, 1 }, { Ptr, 2 } } }, { .name = "__acl_set_link", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Acltype, 1 }, { Ptr, 2 } } }, { .name = "__cap_rights_get", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Int, 1 }, { CapRights | OUT, 2 } } }, { .name = "__getcwd", .ret_type = 1, .nargs = 2, .args = { { Name | OUT, 0 }, { Int, 1 } } }, { .name = "_umtx_op", .ret_type = 1, .nargs = 5, .args = { { Ptr, 0 }, { Umtxop, 1 }, { LongHex, 2 }, { Ptr, 3 }, { Ptr, 4 } } }, { .name = "accept", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Sockaddr | OUT, 1 }, { Ptr | OUT, 2 } } }, { .name = "access", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Accessmode, 1 } } }, { .name = "bind", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Sockaddr | IN, 1 }, { Socklent, 2 } } }, { .name = "bindat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Int, 1 }, { Sockaddr | IN, 2 }, { Int, 3 } } }, { .name = "break", .ret_type = 1, .nargs = 1, .args = { { Ptr, 0 } } }, { .name = "cap_fcntls_get", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { CapFcntlRights | OUT, 1 } } }, { .name = "cap_fcntls_limit", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { CapFcntlRights, 1 } } }, { .name = "cap_getmode", .ret_type = 1, .nargs = 1, .args = { { PUInt | OUT, 0 } } }, { .name = "cap_rights_limit", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { CapRights, 1 } } }, { .name = "chdir", .ret_type = 1, .nargs = 1, .args = { { Name, 0 } } }, { .name = "chflags", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { FileFlags, 1 } } }, { .name = "chflagsat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name | IN, 1 }, { FileFlags, 2 }, { Atflags, 3 } } }, { .name = "chmod", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Octal, 1 } } }, { .name = "chown", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Int, 1 }, { Int, 2 } } }, { .name = "chroot", .ret_type = 1, .nargs = 1, .args = { { Name, 0 } } }, { .name = "clock_gettime", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Timespec | OUT, 1 } } }, { .name = "close", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "compat11.fstat", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Stat11 | OUT, 1 } } }, { .name = "compat11.fstatat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name | IN, 1 }, { Stat11 | OUT, 2 }, { Atflags, 3 } } }, { .name = "compat11.kevent", .ret_type = 1, .nargs = 6, .args = { { Int, 0 }, { Kevent11, 1 }, { Int, 2 }, { Kevent11 | OUT, 3 }, { Int, 4 }, { Timespec, 5 } } }, { .name = "compat11.lstat", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Stat11 | OUT, 1 } } }, { .name = "compat11.stat", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Stat11 | OUT, 1 } } }, { .name = "connect", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Sockaddr | IN, 1 }, { Socklent, 2 } } }, { .name = "connectat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Int, 1 }, { Sockaddr | IN, 2 }, { Int, 3 } } }, { .name = "dup", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "dup2", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Int, 1 } } }, { .name = "eaccess", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Accessmode, 1 } } }, { .name = "execve", .ret_type = 1, .nargs = 3, .args = { { Name | IN, 0 }, { ExecArgs | IN, 1 }, { ExecEnv | IN, 2 } } }, { .name = "exit", .ret_type = 0, .nargs = 1, .args = { { Hex, 0 } } }, { .name = "extattr_delete_fd", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Extattrnamespace, 1 }, { Name, 2 } } }, { .name = "extattr_delete_file", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Extattrnamespace, 1 }, { Name, 2 } } }, { .name = "extattr_delete_link", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Extattrnamespace, 1 }, { Name, 2 } } }, { .name = "extattr_get_fd", .ret_type = 1, .nargs = 5, .args = { { Int, 0 }, { Extattrnamespace, 1 }, { Name, 2 }, { BinString | OUT, 3 }, { Sizet, 4 } } }, { .name = "extattr_get_file", .ret_type = 1, .nargs = 5, .args = { { Name, 0 }, { Extattrnamespace, 1 }, { Name, 2 }, { BinString | OUT, 3 }, { Sizet, 4 } } }, { .name = "extattr_get_link", .ret_type = 1, .nargs = 5, .args = { { Name, 0 }, { Extattrnamespace, 1 }, { Name, 2 }, { BinString | OUT, 3 }, { Sizet, 4 } } }, { .name = "extattr_list_fd", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { Extattrnamespace, 1 }, { BinString | OUT, 2 }, { Sizet, 3 } } }, { .name = "extattr_list_file", .ret_type = 1, .nargs = 4, .args = { { Name, 0 }, { Extattrnamespace, 1 }, { BinString | OUT, 2 }, { Sizet, 3 } } }, { .name = "extattr_list_link", .ret_type = 1, .nargs = 4, .args = { { Name, 0 }, { Extattrnamespace, 1 }, { BinString | OUT, 2 }, { Sizet, 3 } } }, { .name = "extattr_set_fd", .ret_type = 1, .nargs = 5, .args = { { Int, 0 }, { Extattrnamespace, 1 }, { Name, 2 }, { BinString | IN, 3 }, { Sizet, 4 } } }, { .name = "extattr_set_file", .ret_type = 1, .nargs = 5, .args = { { Name, 0 }, { Extattrnamespace, 1 }, { Name, 2 }, { BinString | IN, 3 }, { Sizet, 4 } } }, { .name = "extattr_set_link", .ret_type = 1, .nargs = 5, .args = { { Name, 0 }, { Extattrnamespace, 1 }, { Name, 2 }, { BinString | IN, 3 }, { Sizet, 4 } } }, { .name = "extattrctl", .ret_type = 1, .nargs = 5, .args = { { Name, 0 }, { Hex, 1 }, { Name, 2 }, { Extattrnamespace, 3 }, { Name, 4 } } }, { .name = "faccessat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name | IN, 1 }, { Accessmode, 2 }, { Atflags, 3 } } }, { .name = "fchflags", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { FileFlags, 1 } } }, { .name = "fchmod", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Octal, 1 } } }, { .name = "fchmodat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name, 1 }, { Octal, 2 }, { Atflags, 3 } } }, { .name = "fchown", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Int, 1 }, { Int, 2 } } }, { .name = "fchownat", .ret_type = 1, .nargs = 5, .args = { { Atfd, 0 }, { Name, 1 }, { Int, 2 }, { Int, 3 }, { Atflags, 4 } } }, { .name = "fcntl", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Fcntl, 1 }, { Fcntlflag, 2 } } }, { .name = "flock", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Flockop, 1 } } }, { .name = "fstat", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Stat | OUT, 1 } } }, { .name = "fstatat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name | IN, 1 }, { Stat | OUT, 2 }, { Atflags, 3 } } }, { .name = "fstatfs", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { StatFs | OUT, 1 } } }, { .name = "ftruncate", .ret_type = 1, .nargs = 2, .args = { { Int | IN, 0 }, { QuadHex | IN, 1 } } }, { .name = "futimens", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Timespec2 | IN, 1 } } }, { .name = "futimes", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Timeval2 | IN, 1 } } }, { .name = "futimesat", .ret_type = 1, .nargs = 3, .args = { { Atfd, 0 }, { Name | IN, 1 }, { Timeval2 | IN, 2 } } }, { .name = "getdirentries", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { BinString | OUT, 1 }, { Int, 2 }, { PQuadHex | OUT, 3 } } }, { .name = "getfsstat", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { Long, 1 }, { Getfsstatmode, 2 } } }, { .name = "getitimer", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Itimerval | OUT, 2 } } }, { .name = "getpeername", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Sockaddr | OUT, 1 }, { Ptr | OUT, 2 } } }, { .name = "getpgid", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "getpriority", .ret_type = 1, .nargs = 2, .args = { { Priowhich, 0 }, { Int, 1 } } }, { .name = "getrandom", .ret_type = 1, .nargs = 3, .args = { { BinString | OUT, 0 }, { Sizet, 1 }, { UInt, 2 } } }, { .name = "getrlimit", .ret_type = 1, .nargs = 2, .args = { { Resource, 0 }, { Rlimit | OUT, 1 } } }, { .name = "getrusage", .ret_type = 1, .nargs = 2, .args = { { RusageWho, 0 }, { Rusage | OUT, 1 } } }, { .name = "getsid", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "getsockname", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Sockaddr | OUT, 1 }, { Ptr | OUT, 2 } } }, { .name = "getsockopt", .ret_type = 1, .nargs = 5, .args = { { Int, 0 }, { Sockoptlevel, 1 }, { Sockoptname, 2 }, { Ptr | OUT, 3 }, { Ptr | OUT, 4 } } }, { .name = "gettimeofday", .ret_type = 1, .nargs = 2, .args = { { Timeval | OUT, 0 }, { Ptr, 1 } } }, { .name = "ioctl", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Ioctl, 1 }, { Ptr, 2 } } }, { .name = "kevent", .ret_type = 1, .nargs = 6, .args = { { Int, 0 }, { Kevent, 1 }, { Int, 2 }, { Kevent | OUT, 3 }, { Int, 4 }, { Timespec, 5 } } }, { .name = "kill", .ret_type = 1, .nargs = 2, .args = { { Int | IN, 0 }, { Signal | IN, 1 } } }, { .name = "kldfind", .ret_type = 1, .nargs = 1, .args = { { Name | IN, 0 } } }, { .name = "kldfirstmod", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "kldload", .ret_type = 1, .nargs = 1, .args = { { Name | IN, 0 } } }, { .name = "kldnext", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "kldstat", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Ptr, 1 } } }, { .name = "kldsym", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Kldsymcmd, 1 }, { Ptr, 2 } } }, { .name = "kldunload", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "kldunloadf", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Kldunloadflags, 1 } } }, { .name = "kse_release", .ret_type = 0, .nargs = 1, .args = { { Timespec, 0 } } }, { .name = "lchflags", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { FileFlags, 1 } } }, { .name = "lchmod", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Octal, 1 } } }, { .name = "lchown", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Int, 1 }, { Int, 2 } } }, { .name = "link", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Name, 1 } } }, { .name = "linkat", .ret_type = 1, .nargs = 5, .args = { { Atfd, 0 }, { Name, 1 }, { Atfd, 2 }, { Name, 3 }, { Atflags, 4 } } }, { .name = "listen", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Int, 1 } } }, { .name = "lseek", .ret_type = 2, .nargs = 3, .args = { { Int, 0 }, { QuadHex, 1 }, { Whence, 2 } } }, { .name = "lstat", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Stat | OUT, 1 } } }, { .name = "lutimes", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Timeval2 | IN, 1 } } }, { .name = "madvise", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { Sizet, 1 }, { Madvice, 2 } } }, { .name = "minherit", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { Sizet, 1 }, { Minherit, 2 } } }, { .name = "mkdir", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Octal, 1 } } }, { .name = "mkdirat", .ret_type = 1, .nargs = 3, .args = { { Atfd, 0 }, { Name, 1 }, { Octal, 2 } } }, { .name = "mkfifo", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Octal, 1 } } }, { .name = "mkfifoat", .ret_type = 1, .nargs = 3, .args = { { Atfd, 0 }, { Name, 1 }, { Octal, 2 } } }, { .name = "mknod", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Octal, 1 }, { Int, 2 } } }, { .name = "mknodat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name, 1 }, { Octal, 2 }, { Int, 3 } } }, { .name = "mlock", .ret_type = 1, .nargs = 2, .args = { { Ptr, 0 }, { Sizet, 1 } } }, { .name = "mlockall", .ret_type = 1, .nargs = 1, .args = { { Mlockall, 0 } } }, { .name = "mmap", .ret_type = 1, .nargs = 6, .args = { { Ptr, 0 }, { Sizet, 1 }, { Mprot, 2 }, { Mmapflags, 3 }, { Int, 4 }, { QuadHex, 5 } } }, { .name = "modfind", .ret_type = 1, .nargs = 1, .args = { { Name | IN, 0 } } }, { .name = "mount", .ret_type = 1, .nargs = 4, .args = { { Name, 0 }, { Name, 1 }, { Mountflags, 2 }, { Ptr, 3 } } }, { .name = "mprotect", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { Sizet, 1 }, { Mprot, 2 } } }, { .name = "msync", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { Sizet, 1 }, { Msync, 2 } } }, { .name = "munlock", .ret_type = 1, .nargs = 2, .args = { { Ptr, 0 }, { Sizet, 1 } } }, { .name = "munmap", .ret_type = 1, .nargs = 2, .args = { { Ptr, 0 }, { Sizet, 1 } } }, { .name = "nanosleep", .ret_type = 1, .nargs = 1, .args = { { Timespec, 0 } } }, { .name = "nmount", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { UInt, 1 }, { Mountflags, 2 } } }, { .name = "open", .ret_type = 1, .nargs = 3, .args = { { Name | IN, 0 }, { Open, 1 }, { Octal, 2 } } }, { .name = "openat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name | IN, 1 }, { Open, 2 }, { Octal, 3 } } }, { .name = "pathconf", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Pathconf, 1 } } }, { .name = "pipe", .ret_type = 1, .nargs = 1, .args = { { PipeFds | OUT, 0 } } }, { .name = "pipe2", .ret_type = 1, .nargs = 2, .args = { { Ptr, 0 }, { Pipe2, 1 } } }, { .name = "poll", .ret_type = 1, .nargs = 3, .args = { { Pollfd, 0 }, { Int, 1 }, { Int, 2 } } }, { .name = "posix_fadvise", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { QuadHex, 1 }, { QuadHex, 2 }, { Fadvice, 3 } } }, { .name = "posix_openpt", .ret_type = 1, .nargs = 1, .args = { { Open, 0 } } }, { .name = "pread", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { BinString | OUT, 1 }, { Sizet, 2 }, { QuadHex, 3 } } }, { .name = "procctl", .ret_type = 1, .nargs = 4, .args = { { Idtype, 0 }, { Quad, 1 }, { Procctl, 2 }, { Ptr, 3 } } }, { .name = "ptrace", .ret_type = 1, .nargs = 4, .args = { { Ptraceop, 0 }, { Int, 1 }, { Ptr, 2 }, { Int, 3 } } }, { .name = "pwrite", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { BinString | IN, 1 }, { Sizet, 2 }, { QuadHex, 3 } } }, { .name = "quotactl", .ret_type = 1, .nargs = 4, .args = { { Name, 0 }, { Quotactlcmd, 1 }, { Int, 2 }, { Ptr, 3 } } }, { .name = "read", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { BinString | OUT, 1 }, { Sizet, 2 } } }, { .name = "readlink", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Readlinkres | OUT, 1 }, { Sizet, 2 } } }, { .name = "readlinkat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name, 1 }, { Readlinkres | OUT, 2 }, { Sizet, 3 } } }, { .name = "readv", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Iovec | OUT, 1 }, { Int, 2 } } }, { .name = "reboot", .ret_type = 1, .nargs = 1, .args = { { Reboothowto, 0 } } }, { .name = "recvfrom", .ret_type = 1, .nargs = 6, .args = { { Int, 0 }, { BinString | OUT, 1 }, { Sizet, 2 }, { Msgflags, 3 }, { Sockaddr | OUT, 4 }, { Ptr | OUT, 5 } } }, { .name = "recvmsg", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Msghdr | OUT, 1 }, { Msgflags, 2 } } }, { .name = "rename", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Name, 1 } } }, { .name = "renameat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name, 1 }, { Atfd, 2 }, { Name, 3 } } }, { .name = "rfork", .ret_type = 1, .nargs = 1, .args = { { Rforkflags, 0 } } }, { .name = "rmdir", .ret_type = 1, .nargs = 1, .args = { { Name, 0 } } }, { .name = "rtprio", .ret_type = 1, .nargs = 3, .args = { { Rtpriofunc, 0 }, { Int, 1 }, { Ptr, 2 } } }, { .name = "rtprio_thread", .ret_type = 1, .nargs = 3, .args = { { Rtpriofunc, 0 }, { Int, 1 }, { Ptr, 2 } } }, { .name = "sched_get_priority_max", .ret_type = 1, .nargs = 1, .args = { { Schedpolicy, 0 } } }, { .name = "sched_get_priority_min", .ret_type = 1, .nargs = 1, .args = { { Schedpolicy, 0 } } }, { .name = "sched_getparam", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Schedparam | OUT, 1 } } }, { .name = "sched_getscheduler", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "sched_rr_get_interval", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Timespec | OUT, 1 } } }, { .name = "sched_setparam", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Schedparam, 1 } } }, { .name = "sched_setscheduler", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Schedpolicy, 1 }, { Schedparam, 2 } } }, { .name = "sctp_generic_recvmsg", .ret_type = 1, .nargs = 7, .args = { { Int, 0 }, { Iovec | OUT, 1 }, { Int, 2 }, { Sockaddr | OUT, 3 }, { Ptr | OUT, 4 }, { Sctpsndrcvinfo | OUT, 5 }, { Ptr | OUT, 6 } } }, { .name = "sctp_generic_sendmsg", .ret_type = 1, .nargs = 7, .args = { { Int, 0 }, { BinString | IN, 1 }, { Int, 2 }, { Sockaddr | IN, 3 }, { Socklent, 4 }, { Sctpsndrcvinfo | IN, 5 }, { Msgflags, 6 } } }, { .name = "sctp_generic_sendmsg_iov", .ret_type = 1, .nargs = 7, .args = { { Int, 0 }, { Iovec | IN, 1 }, { Int, 2 }, { Sockaddr | IN, 3 }, { Socklent, 4 }, { Sctpsndrcvinfo | IN, 5 }, { Msgflags, 6 } } }, { .name = "select", .ret_type = 1, .nargs = 5, .args = { { Int, 0 }, { Fd_set, 1 }, { Fd_set, 2 }, { Fd_set, 3 }, { Timeval, 4 } } }, { .name = "sendmsg", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Msghdr | IN, 1 }, { Msgflags, 2 } } }, { .name = "sendto", .ret_type = 1, .nargs = 6, .args = { { Int, 0 }, { BinString | IN, 1 }, { Sizet, 2 }, { Msgflags, 3 }, { Sockaddr | IN, 4 }, { Socklent | IN, 5 } } }, { .name = "setitimer", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Itimerval, 1 }, { Itimerval | OUT, 2 } } }, { .name = "setpriority", .ret_type = 1, .nargs = 3, .args = { { Priowhich, 0 }, { Int, 1 }, { Int, 2 } } }, { .name = "setrlimit", .ret_type = 1, .nargs = 2, .args = { { Resource, 0 }, { Rlimit | IN, 1 } } }, { .name = "setsockopt", .ret_type = 1, .nargs = 5, .args = { { Int, 0 }, { Sockoptlevel, 1 }, { Sockoptname, 2 }, { Ptr | IN, 3 }, { Socklent, 4 } } }, { .name = "shutdown", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Shutdown, 1 } } }, { .name = "sigaction", .ret_type = 1, .nargs = 3, .args = { { Signal, 0 }, { Sigaction | IN, 1 }, { Sigaction | OUT, 2 } } }, { .name = "sigpending", .ret_type = 1, .nargs = 1, .args = { { Sigset | OUT, 0 } } }, { .name = "sigprocmask", .ret_type = 1, .nargs = 3, .args = { { Sigprocmask, 0 }, { Sigset, 1 }, { Sigset | OUT, 2 } } }, { .name = "sigqueue", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Signal, 1 }, { LongHex, 2 } } }, { .name = "sigreturn", .ret_type = 1, .nargs = 1, .args = { { Ptr, 0 } } }, { .name = "sigsuspend", .ret_type = 1, .nargs = 1, .args = { { Sigset | IN, 0 } } }, { .name = "sigtimedwait", .ret_type = 1, .nargs = 3, .args = { { Sigset | IN, 0 }, { Siginfo | OUT, 1 }, { Timespec | IN, 2 } } }, { .name = "sigwait", .ret_type = 1, .nargs = 2, .args = { { Sigset | IN, 0 }, { PSig | OUT, 1 } } }, { .name = "sigwaitinfo", .ret_type = 1, .nargs = 2, .args = { { Sigset | IN, 0 }, { Siginfo | OUT, 1 } } }, { .name = "socket", .ret_type = 1, .nargs = 3, .args = { { Sockdomain, 0 }, { Socktype, 1 }, { Sockprotocol, 2 } } }, { .name = "stat", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Stat | OUT, 1 } } }, { .name = "statfs", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { StatFs | OUT, 1 } } }, { .name = "symlink", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Name, 1 } } }, { .name = "symlinkat", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Atfd, 1 }, { Name, 2 } } }, { .name = "sysarch", .ret_type = 1, .nargs = 2, .args = { { Sysarch, 0 }, { Ptr, 1 } } }, { .name = "thr_kill", .ret_type = 1, .nargs = 2, .args = { { Long, 0 }, { Signal, 1 } } }, { .name = "thr_self", .ret_type = 1, .nargs = 1, .args = { { Ptr, 0 } } }, { .name = "thr_set_name", .ret_type = 1, .nargs = 2, .args = { { Long, 0 }, { Name, 1 } } }, { .name = "truncate", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { QuadHex | IN, 1 } } }, #if 0 /* Does not exist */ { .name = "umount", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Int, 2 } } }, #endif { .name = "unlink", .ret_type = 1, .nargs = 1, .args = { { Name, 0 } } }, { .name = "unlinkat", .ret_type = 1, .nargs = 3, .args = { { Atfd, 0 }, { Name, 1 }, { Atflags, 2 } } }, { .name = "unmount", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Mountflags, 1 } } }, { .name = "utimensat", .ret_type = 1, .nargs = 4, .args = { { Atfd, 0 }, { Name | IN, 1 }, { Timespec2 | IN, 2 }, { Atflags, 3 } } }, { .name = "utimes", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Timeval2 | IN, 1 } } }, { .name = "utrace", .ret_type = 1, .nargs = 1, .args = { { Utrace, 0 } } }, { .name = "wait4", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { ExitStatus | OUT, 1 }, { Waitoptions, 2 }, { Rusage | OUT, 3 } } }, { .name = "wait6", .ret_type = 1, .nargs = 6, .args = { { Idtype, 0 }, { Quad, 1 }, { ExitStatus | OUT, 2 }, { Waitoptions, 3 }, { Rusage | OUT, 4 }, { Siginfo | OUT, 5 } } }, { .name = "write", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { BinString | IN, 1 }, { Sizet, 2 } } }, { .name = "writev", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Iovec | IN, 1 }, { Int, 2 } } }, /* Linux ABI */ { .name = "linux_access", .ret_type = 1, .nargs = 2, .args = { { Name, 0 }, { Accessmode, 1 } } }, { .name = "linux_execve", .ret_type = 1, .nargs = 3, .args = { { Name | IN, 0 }, { ExecArgs | IN, 1 }, { ExecEnv | IN, 2 } } }, { .name = "linux_lseek", .ret_type = 2, .nargs = 3, .args = { { Int, 0 }, { Int, 1 }, { Whence, 2 } } }, { .name = "linux_mkdir", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Int, 1 } } }, { .name = "linux_newfstat", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Ptr | OUT, 1 } } }, { .name = "linux_newstat", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Ptr | OUT, 1 } } }, { .name = "linux_open", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Hex, 1 }, { Octal, 2 } } }, { .name = "linux_readlink", .ret_type = 1, .nargs = 3, .args = { { Name, 0 }, { Name | OUT, 1 }, { Sizet, 2 } } }, { .name = "linux_socketcall", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { LinuxSockArgs, 1 } } }, { .name = "linux_stat64", .ret_type = 1, .nargs = 2, .args = { { Name | IN, 0 }, { Ptr | OUT, 1 } } }, /* CloudABI system calls. */ { .name = "cloudabi_sys_clock_res_get", .ret_type = 1, .nargs = 1, .args = { { CloudABIClockID, 0 } } }, { .name = "cloudabi_sys_clock_time_get", .ret_type = 1, .nargs = 2, .args = { { CloudABIClockID, 0 }, { CloudABITimestamp, 1 } } }, { .name = "cloudabi_sys_condvar_signal", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { CloudABIMFlags, 1 }, { UInt, 2 } } }, { .name = "cloudabi_sys_fd_close", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "cloudabi_sys_fd_create1", .ret_type = 1, .nargs = 1, .args = { { CloudABIFileType, 0 } } }, { .name = "cloudabi_sys_fd_create2", .ret_type = 1, .nargs = 2, .args = { { CloudABIFileType, 0 }, { PipeFds | OUT, 0 } } }, { .name = "cloudabi_sys_fd_datasync", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "cloudabi_sys_fd_dup", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "cloudabi_sys_fd_replace", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { Int, 1 } } }, { .name = "cloudabi_sys_fd_seek", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Int, 1 }, { CloudABIWhence, 2 } } }, { .name = "cloudabi_sys_fd_stat_get", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { CloudABIFDStat | OUT, 1 } } }, { .name = "cloudabi_sys_fd_stat_put", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { CloudABIFDStat | IN, 1 }, { CloudABIFDSFlags, 2 } } }, { .name = "cloudabi_sys_fd_sync", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "cloudabi_sys_file_advise", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { Int, 1 }, { Int, 2 }, { CloudABIAdvice, 3 } } }, { .name = "cloudabi_sys_file_allocate", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { Int, 1 }, { Int, 2 } } }, { .name = "cloudabi_sys_file_create", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { BinString | IN, 1 }, { CloudABIFileType, 3 } } }, { .name = "cloudabi_sys_file_link", .ret_type = 1, .nargs = 4, .args = { { CloudABILookup, 0 }, { BinString | IN, 1 }, { Int, 3 }, { BinString | IN, 4 } } }, { .name = "cloudabi_sys_file_open", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { BinString | IN, 1 }, { CloudABIOFlags, 3 }, { CloudABIFDStat | IN, 4 } } }, { .name = "cloudabi_sys_file_readdir", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { BinString | OUT, 1 }, { Int, 2 }, { Int, 3 } } }, { .name = "cloudabi_sys_file_readlink", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { BinString | IN, 1 }, { BinString | OUT, 3 }, { Int, 4 } } }, { .name = "cloudabi_sys_file_rename", .ret_type = 1, .nargs = 4, .args = { { Int, 0 }, { BinString | IN, 1 }, { Int, 3 }, { BinString | IN, 4 } } }, { .name = "cloudabi_sys_file_stat_fget", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { CloudABIFileStat | OUT, 1 } } }, { .name = "cloudabi_sys_file_stat_fput", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { CloudABIFileStat | IN, 1 }, { CloudABIFSFlags, 2 } } }, { .name = "cloudabi_sys_file_stat_get", .ret_type = 1, .nargs = 3, .args = { { CloudABILookup, 0 }, { BinString | IN, 1 }, { CloudABIFileStat | OUT, 3 } } }, { .name = "cloudabi_sys_file_stat_put", .ret_type = 1, .nargs = 4, .args = { { CloudABILookup, 0 }, { BinString | IN, 1 }, { CloudABIFileStat | IN, 3 }, { CloudABIFSFlags, 4 } } }, { .name = "cloudabi_sys_file_symlink", .ret_type = 1, .nargs = 3, .args = { { BinString | IN, 0 }, { Int, 2 }, { BinString | IN, 3 } } }, { .name = "cloudabi_sys_file_unlink", .ret_type = 1, .nargs = 3, .args = { { Int, 0 }, { BinString | IN, 1 }, { CloudABIULFlags, 3 } } }, { .name = "cloudabi_sys_lock_unlock", .ret_type = 1, .nargs = 2, .args = { { Ptr, 0 }, { CloudABIMFlags, 1 } } }, { .name = "cloudabi_sys_mem_advise", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { Int, 1 }, { CloudABIAdvice, 2 } } }, { .name = "cloudabi_sys_mem_map", .ret_type = 1, .nargs = 6, .args = { { Ptr, 0 }, { Int, 1 }, { CloudABIMProt, 2 }, { CloudABIMFlags, 3 }, { Int, 4 }, { Int, 5 } } }, { .name = "cloudabi_sys_mem_protect", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { Int, 1 }, { CloudABIMProt, 2 } } }, { .name = "cloudabi_sys_mem_sync", .ret_type = 1, .nargs = 3, .args = { { Ptr, 0 }, { Int, 1 }, { CloudABIMSFlags, 2 } } }, { .name = "cloudabi_sys_mem_unmap", .ret_type = 1, .nargs = 2, .args = { { Ptr, 0 }, { Int, 1 } } }, { .name = "cloudabi_sys_proc_exec", .ret_type = 1, .nargs = 5, .args = { { Int, 0 }, { BinString | IN, 1 }, { Int, 2 }, { IntArray, 3 }, { Int, 4 } } }, { .name = "cloudabi_sys_proc_exit", .ret_type = 1, .nargs = 1, .args = { { Int, 0 } } }, { .name = "cloudabi_sys_proc_fork", .ret_type = 1, .nargs = 0 }, { .name = "cloudabi_sys_proc_raise", .ret_type = 1, .nargs = 1, .args = { { CloudABISignal, 0 } } }, { .name = "cloudabi_sys_random_get", .ret_type = 1, .nargs = 2, .args = { { BinString | OUT, 0 }, { Int, 1 } } }, { .name = "cloudabi_sys_sock_shutdown", .ret_type = 1, .nargs = 2, .args = { { Int, 0 }, { CloudABISDFlags, 1 } } }, { .name = "cloudabi_sys_thread_exit", .ret_type = 1, .nargs = 2, .args = { { Ptr, 0 }, { CloudABIMFlags, 1 } } }, { .name = "cloudabi_sys_thread_yield", .ret_type = 1, .nargs = 0 }, { .name = 0 }, }; static STAILQ_HEAD(, syscall) syscalls; /* Xlat idea taken from strace */ struct xlat { int val; const char *str; }; #define X(a) { a, #a }, #define XEND { 0, NULL } static struct xlat poll_flags[] = { X(POLLSTANDARD) X(POLLIN) X(POLLPRI) X(POLLOUT) X(POLLERR) X(POLLHUP) X(POLLNVAL) X(POLLRDNORM) X(POLLRDBAND) X(POLLWRBAND) X(POLLINIGNEOF) XEND }; static struct xlat sigaction_flags[] = { X(SA_ONSTACK) X(SA_RESTART) X(SA_RESETHAND) X(SA_NOCLDSTOP) X(SA_NODEFER) X(SA_NOCLDWAIT) X(SA_SIGINFO) XEND }; static struct xlat linux_socketcall_ops[] = { X(LINUX_SOCKET) X(LINUX_BIND) X(LINUX_CONNECT) X(LINUX_LISTEN) X(LINUX_ACCEPT) X(LINUX_GETSOCKNAME) X(LINUX_GETPEERNAME) X(LINUX_SOCKETPAIR) X(LINUX_SEND) X(LINUX_RECV) X(LINUX_SENDTO) X(LINUX_RECVFROM) X(LINUX_SHUTDOWN) X(LINUX_SETSOCKOPT) X(LINUX_GETSOCKOPT) X(LINUX_SENDMSG) X(LINUX_RECVMSG) XEND }; #undef X #define X(a) { CLOUDABI_##a, #a }, static struct xlat cloudabi_advice[] = { X(ADVICE_DONTNEED) X(ADVICE_NOREUSE) X(ADVICE_NORMAL) X(ADVICE_RANDOM) X(ADVICE_SEQUENTIAL) X(ADVICE_WILLNEED) XEND }; static struct xlat cloudabi_clockid[] = { X(CLOCK_MONOTONIC) X(CLOCK_PROCESS_CPUTIME_ID) X(CLOCK_REALTIME) X(CLOCK_THREAD_CPUTIME_ID) XEND }; static struct xlat cloudabi_fdflags[] = { X(FDFLAG_APPEND) X(FDFLAG_DSYNC) X(FDFLAG_NONBLOCK) X(FDFLAG_RSYNC) X(FDFLAG_SYNC) XEND }; static struct xlat cloudabi_fdsflags[] = { X(FDSTAT_FLAGS) X(FDSTAT_RIGHTS) XEND }; static struct xlat cloudabi_filetype[] = { X(FILETYPE_UNKNOWN) X(FILETYPE_BLOCK_DEVICE) X(FILETYPE_CHARACTER_DEVICE) X(FILETYPE_DIRECTORY) X(FILETYPE_PROCESS) X(FILETYPE_REGULAR_FILE) X(FILETYPE_SHARED_MEMORY) X(FILETYPE_SOCKET_DGRAM) X(FILETYPE_SOCKET_STREAM) X(FILETYPE_SYMBOLIC_LINK) XEND }; static struct xlat cloudabi_fsflags[] = { X(FILESTAT_ATIM) X(FILESTAT_ATIM_NOW) X(FILESTAT_MTIM) X(FILESTAT_MTIM_NOW) X(FILESTAT_SIZE) XEND }; static struct xlat cloudabi_mflags[] = { X(MAP_ANON) X(MAP_FIXED) X(MAP_PRIVATE) X(MAP_SHARED) XEND }; static struct xlat cloudabi_mprot[] = { X(PROT_EXEC) X(PROT_WRITE) X(PROT_READ) XEND }; static struct xlat cloudabi_msflags[] = { X(MS_ASYNC) X(MS_INVALIDATE) X(MS_SYNC) XEND }; static struct xlat cloudabi_oflags[] = { X(O_CREAT) X(O_DIRECTORY) X(O_EXCL) X(O_TRUNC) XEND }; static struct xlat cloudabi_sdflags[] = { X(SHUT_RD) X(SHUT_WR) XEND }; static struct xlat cloudabi_signal[] = { X(SIGABRT) X(SIGALRM) X(SIGBUS) X(SIGCHLD) X(SIGCONT) X(SIGFPE) X(SIGHUP) X(SIGILL) X(SIGINT) X(SIGKILL) X(SIGPIPE) X(SIGQUIT) X(SIGSEGV) X(SIGSTOP) X(SIGSYS) X(SIGTERM) X(SIGTRAP) X(SIGTSTP) X(SIGTTIN) X(SIGTTOU) X(SIGURG) X(SIGUSR1) X(SIGUSR2) X(SIGVTALRM) X(SIGXCPU) X(SIGXFSZ) XEND }; static struct xlat cloudabi_ulflags[] = { X(UNLINK_REMOVEDIR) XEND }; static struct xlat cloudabi_whence[] = { X(WHENCE_CUR) X(WHENCE_END) X(WHENCE_SET) XEND }; #undef X #undef XEND /* * Searches an xlat array for a value, and returns it if found. Otherwise * return a string representation. */ static const char * lookup(struct xlat *xlat, int val, int base) { static char tmp[16]; for (; xlat->str != NULL; xlat++) if (xlat->val == val) return (xlat->str); switch (base) { case 8: sprintf(tmp, "0%o", val); break; case 16: sprintf(tmp, "0x%x", val); break; case 10: sprintf(tmp, "%u", val); break; default: errx(1,"Unknown lookup base"); break; } return (tmp); } static const char * xlookup(struct xlat *xlat, int val) { return (lookup(xlat, val, 16)); } /* * Searches an xlat array containing bitfield values. Remaining bits * set after removing the known ones are printed at the end: * IN|0x400. */ static char * xlookup_bits(struct xlat *xlat, int val) { int len, rem; static char str[512]; len = 0; rem = val; for (; xlat->str != NULL; xlat++) { if ((xlat->val & rem) == xlat->val) { /* * Don't print the "all-bits-zero" string unless all * bits are really zero. */ if (xlat->val == 0 && val != 0) continue; len += sprintf(str + len, "%s|", xlat->str); rem &= ~(xlat->val); } } /* * If we have leftover bits or didn't match anything, print * the remainder. */ if (rem || len == 0) len += sprintf(str + len, "0x%x", rem); if (len && str[len - 1] == '|') len--; str[len] = 0; return (str); } static void print_integer_arg(const char *(*decoder)(int), FILE *fp, int value) { const char *str; str = decoder(value); if (str != NULL) fputs(str, fp); else fprintf(fp, "%d", value); } static void print_mask_arg(bool (*decoder)(FILE *, int, int *), FILE *fp, int value) { int rem; if (!decoder(fp, value, &rem)) fprintf(fp, "0x%x", rem); else if (rem != 0) fprintf(fp, "|0x%x", rem); } static void print_mask_arg32(bool (*decoder)(FILE *, uint32_t, uint32_t *), FILE *fp, uint32_t value) { uint32_t rem; if (!decoder(fp, value, &rem)) fprintf(fp, "0x%x", rem); else if (rem != 0) fprintf(fp, "|0x%x", rem); } #ifndef __LP64__ /* * Add argument padding to subsequent system calls afater a Quad * syscall arguments as needed. This used to be done by hand in the * decoded_syscalls table which was ugly and error prone. It is * simpler to do the fixup of offsets at initalization time than when * decoding arguments. */ static void quad_fixup(struct syscall *sc) { int offset, prev; u_int i; offset = 0; prev = -1; for (i = 0; i < sc->nargs; i++) { /* This arg type is a dummy that doesn't use offset. */ if ((sc->args[i].type & ARG_MASK) == PipeFds) continue; assert(prev < sc->args[i].offset); prev = sc->args[i].offset; sc->args[i].offset += offset; switch (sc->args[i].type & ARG_MASK) { case Quad: case QuadHex: #ifdef __powerpc__ /* * 64-bit arguments on 32-bit powerpc must be * 64-bit aligned. If the current offset is * not aligned, the calling convention inserts * a 32-bit pad argument that should be skipped. */ if (sc->args[i].offset % 2 == 1) { sc->args[i].offset++; offset++; } #endif offset++; default: break; } } } #endif void init_syscalls(void) { struct syscall *sc; STAILQ_INIT(&syscalls); for (sc = decoded_syscalls; sc->name != NULL; sc++) { #ifndef __LP64__ quad_fixup(sc); #endif STAILQ_INSERT_HEAD(&syscalls, sc, entries); } } static struct syscall * find_syscall(struct procabi *abi, u_int number) { struct extra_syscall *es; if (number < nitems(abi->syscalls)) return (abi->syscalls[number]); STAILQ_FOREACH(es, &abi->extra_syscalls, entries) { if (es->number == number) return (es->sc); } return (NULL); } static void add_syscall(struct procabi *abi, u_int number, struct syscall *sc) { struct extra_syscall *es; if (number < nitems(abi->syscalls)) { assert(abi->syscalls[number] == NULL); abi->syscalls[number] = sc; } else { es = malloc(sizeof(*es)); es->sc = sc; es->number = number; STAILQ_INSERT_TAIL(&abi->extra_syscalls, es, entries); } } /* * If/when the list gets big, it might be desirable to do it * as a hash table or binary search. */ struct syscall * get_syscall(struct threadinfo *t, u_int number, u_int nargs) { struct syscall *sc; const char *name; char *new_name; u_int i; sc = find_syscall(t->proc->abi, number); if (sc != NULL) return (sc); name = sysdecode_syscallname(t->proc->abi->abi, number); if (name == NULL) { asprintf(&new_name, "#%d", number); name = new_name; } else new_name = NULL; STAILQ_FOREACH(sc, &syscalls, entries) { if (strcmp(name, sc->name) == 0) { add_syscall(t->proc->abi, number, sc); free(new_name); return (sc); } } /* It is unknown. Add it into the list. */ #if DEBUG fprintf(stderr, "unknown syscall %s -- setting args to %d\n", name, nargs); #endif sc = calloc(1, sizeof(struct syscall)); sc->name = name; if (new_name != NULL) sc->unknown = true; sc->ret_type = 1; sc->nargs = nargs; for (i = 0; i < nargs; i++) { sc->args[i].offset = i; /* Treat all unknown arguments as LongHex. */ sc->args[i].type = LongHex; } STAILQ_INSERT_HEAD(&syscalls, sc, entries); add_syscall(t->proc->abi, number, sc); return (sc); } /* * Copy a fixed amount of bytes from the process. */ static int get_struct(pid_t pid, void *offset, void *buf, int len) { struct ptrace_io_desc iorequest; iorequest.piod_op = PIOD_READ_D; iorequest.piod_offs = offset; iorequest.piod_addr = buf; iorequest.piod_len = len; if (ptrace(PT_IO, pid, (caddr_t)&iorequest, 0) < 0) return (-1); return (0); } #define MAXSIZE 4096 /* * Copy a string from the process. Note that it is * expected to be a C string, but if max is set, it will * only get that much. */ static char * get_string(pid_t pid, void *addr, int max) { struct ptrace_io_desc iorequest; char *buf, *nbuf; size_t offset, size, totalsize; offset = 0; if (max) size = max + 1; else { /* Read up to the end of the current page. */ size = PAGE_SIZE - ((uintptr_t)addr % PAGE_SIZE); if (size > MAXSIZE) size = MAXSIZE; } totalsize = size; buf = malloc(totalsize); if (buf == NULL) return (NULL); for (;;) { iorequest.piod_op = PIOD_READ_D; iorequest.piod_offs = (char *)addr + offset; iorequest.piod_addr = buf + offset; iorequest.piod_len = size; if (ptrace(PT_IO, pid, (caddr_t)&iorequest, 0) < 0) { free(buf); return (NULL); } if (memchr(buf + offset, '\0', size) != NULL) return (buf); offset += size; if (totalsize < MAXSIZE && max == 0) { size = MAXSIZE - totalsize; if (size > PAGE_SIZE) size = PAGE_SIZE; nbuf = realloc(buf, totalsize + size); if (nbuf == NULL) { buf[totalsize - 1] = '\0'; return (buf); } buf = nbuf; totalsize += size; } else { buf[totalsize - 1] = '\0'; return (buf); } } } static const char * strsig2(int sig) { static char tmp[32]; const char *signame; signame = sysdecode_signal(sig); if (signame == NULL) { snprintf(tmp, sizeof(tmp), "%d", sig); signame = tmp; } return (signame); } static void print_kevent(FILE *fp, struct kevent *ke) { switch (ke->filter) { case EVFILT_READ: case EVFILT_WRITE: case EVFILT_VNODE: case EVFILT_PROC: case EVFILT_TIMER: case EVFILT_PROCDESC: case EVFILT_EMPTY: fprintf(fp, "%ju", (uintmax_t)ke->ident); break; case EVFILT_SIGNAL: fputs(strsig2(ke->ident), fp); break; default: fprintf(fp, "%p", (void *)ke->ident); } fprintf(fp, ","); print_integer_arg(sysdecode_kevent_filter, fp, ke->filter); fprintf(fp, ","); print_mask_arg(sysdecode_kevent_flags, fp, ke->flags); fprintf(fp, ","); sysdecode_kevent_fflags(fp, ke->filter, ke->fflags, 16); fprintf(fp, ",%#jx,%p", (uintmax_t)ke->data, ke->udata); } static void print_utrace(FILE *fp, void *utrace_addr, size_t len) { unsigned char *utrace_buffer; fprintf(fp, "{ "); if (sysdecode_utrace(fp, utrace_addr, len)) { fprintf(fp, " }"); return; } utrace_buffer = utrace_addr; fprintf(fp, "%zu:", len); while (len--) fprintf(fp, " %02x", *utrace_buffer++); fprintf(fp, " }"); } static void print_sockaddr(FILE *fp, struct trussinfo *trussinfo, void *arg, socklen_t len) { char addr[64]; struct sockaddr_in *lsin; struct sockaddr_in6 *lsin6; struct sockaddr_un *sun; struct sockaddr *sa; u_char *q; pid_t pid = trussinfo->curthread->proc->pid; if (arg == NULL) { fputs("NULL", fp); return; } /* If the length is too small, just bail. */ if (len < sizeof(*sa)) { fprintf(fp, "%p", arg); return; } sa = calloc(1, len); if (get_struct(pid, arg, sa, len) == -1) { free(sa); fprintf(fp, "%p", arg); return; } switch (sa->sa_family) { case AF_INET: if (len < sizeof(*lsin)) goto sockaddr_short; lsin = (struct sockaddr_in *)(void *)sa; inet_ntop(AF_INET, &lsin->sin_addr, addr, sizeof(addr)); fprintf(fp, "{ AF_INET %s:%d }", addr, htons(lsin->sin_port)); break; case AF_INET6: if (len < sizeof(*lsin6)) goto sockaddr_short; lsin6 = (struct sockaddr_in6 *)(void *)sa; inet_ntop(AF_INET6, &lsin6->sin6_addr, addr, sizeof(addr)); fprintf(fp, "{ AF_INET6 [%s]:%d }", addr, htons(lsin6->sin6_port)); break; case AF_UNIX: sun = (struct sockaddr_un *)sa; fprintf(fp, "{ AF_UNIX \"%.*s\" }", (int)(len - offsetof(struct sockaddr_un, sun_path)), sun->sun_path); break; default: sockaddr_short: fprintf(fp, "{ sa_len = %d, sa_family = %d, sa_data = {", (int)sa->sa_len, (int)sa->sa_family); for (q = (u_char *)sa->sa_data; q < (u_char *)sa + len; q++) fprintf(fp, "%s 0x%02x", q == (u_char *)sa->sa_data ? "" : ",", *q); fputs(" } }", fp); } free(sa); } #define IOV_LIMIT 16 static void print_iovec(FILE *fp, struct trussinfo *trussinfo, void *arg, int iovcnt) { struct iovec iov[IOV_LIMIT]; size_t max_string = trussinfo->strsize; char tmp2[max_string + 1], *tmp3; size_t len; pid_t pid = trussinfo->curthread->proc->pid; int i; bool buf_truncated, iov_truncated; if (iovcnt <= 0) { fprintf(fp, "%p", arg); return; } if (iovcnt > IOV_LIMIT) { iovcnt = IOV_LIMIT; iov_truncated = true; } else { iov_truncated = false; } if (get_struct(pid, arg, &iov, iovcnt * sizeof(struct iovec)) == -1) { fprintf(fp, "%p", arg); return; } fputs("[", fp); for (i = 0; i < iovcnt; i++) { len = iov[i].iov_len; if (len > max_string) { len = max_string; buf_truncated = true; } else { buf_truncated = false; } fprintf(fp, "%s{", (i > 0) ? "," : ""); if (len && get_struct(pid, iov[i].iov_base, &tmp2, len) != -1) { tmp3 = malloc(len * 4 + 1); while (len) { if (strvisx(tmp3, tmp2, len, VIS_CSTYLE|VIS_TAB|VIS_NL) <= (int)max_string) break; len--; buf_truncated = true; } fprintf(fp, "\"%s\"%s", tmp3, buf_truncated ? "..." : ""); free(tmp3); } else { fprintf(fp, "%p", iov[i].iov_base); } fprintf(fp, ",%zu}", iov[i].iov_len); } fprintf(fp, "%s%s", iov_truncated ? ",..." : "", "]"); } static void print_gen_cmsg(FILE *fp, struct cmsghdr *cmsghdr) { u_char *q; fputs("{", fp); for (q = CMSG_DATA(cmsghdr); q < (u_char *)cmsghdr + cmsghdr->cmsg_len; q++) { fprintf(fp, "%s0x%02x", q == CMSG_DATA(cmsghdr) ? "" : ",", *q); } fputs("}", fp); } static void print_sctp_initmsg(FILE *fp, struct sctp_initmsg *init) { fprintf(fp, "{out=%u,", init->sinit_num_ostreams); fprintf(fp, "in=%u,", init->sinit_max_instreams); fprintf(fp, "max_rtx=%u,", init->sinit_max_attempts); fprintf(fp, "max_rto=%u}", init->sinit_max_init_timeo); } static void print_sctp_sndrcvinfo(FILE *fp, bool receive, struct sctp_sndrcvinfo *info) { fprintf(fp, "{sid=%u,", info->sinfo_stream); if (receive) { fprintf(fp, "ssn=%u,", info->sinfo_ssn); } fputs("flgs=", fp); sysdecode_sctp_sinfo_flags(fp, info->sinfo_flags); fprintf(fp, ",ppid=%u,", ntohl(info->sinfo_ppid)); if (!receive) { fprintf(fp, "ctx=%u,", info->sinfo_context); fprintf(fp, "ttl=%u,", info->sinfo_timetolive); } if (receive) { fprintf(fp, "tsn=%u,", info->sinfo_tsn); fprintf(fp, "cumtsn=%u,", info->sinfo_cumtsn); } fprintf(fp, "id=%u}", info->sinfo_assoc_id); } static void print_sctp_sndinfo(FILE *fp, struct sctp_sndinfo *info) { fprintf(fp, "{sid=%u,", info->snd_sid); fputs("flgs=", fp); print_mask_arg(sysdecode_sctp_snd_flags, fp, info->snd_flags); fprintf(fp, ",ppid=%u,", ntohl(info->snd_ppid)); fprintf(fp, "ctx=%u,", info->snd_context); fprintf(fp, "id=%u}", info->snd_assoc_id); } static void print_sctp_rcvinfo(FILE *fp, struct sctp_rcvinfo *info) { fprintf(fp, "{sid=%u,", info->rcv_sid); fprintf(fp, "ssn=%u,", info->rcv_ssn); fputs("flgs=", fp); print_mask_arg(sysdecode_sctp_rcv_flags, fp, info->rcv_flags); fprintf(fp, ",ppid=%u,", ntohl(info->rcv_ppid)); fprintf(fp, "tsn=%u,", info->rcv_tsn); fprintf(fp, "cumtsn=%u,", info->rcv_cumtsn); fprintf(fp, "ctx=%u,", info->rcv_context); fprintf(fp, "id=%u}", info->rcv_assoc_id); } static void print_sctp_nxtinfo(FILE *fp, struct sctp_nxtinfo *info) { fprintf(fp, "{sid=%u,", info->nxt_sid); fputs("flgs=", fp); print_mask_arg(sysdecode_sctp_nxt_flags, fp, info->nxt_flags); fprintf(fp, ",ppid=%u,", ntohl(info->nxt_ppid)); fprintf(fp, "len=%u,", info->nxt_length); fprintf(fp, "id=%u}", info->nxt_assoc_id); } static void print_sctp_prinfo(FILE *fp, struct sctp_prinfo *info) { fputs("{pol=", fp); print_integer_arg(sysdecode_sctp_pr_policy, fp, info->pr_policy); fprintf(fp, ",val=%u}", info->pr_value); } static void print_sctp_authinfo(FILE *fp, struct sctp_authinfo *info) { fprintf(fp, "{num=%u}", info->auth_keynumber); } static void print_sctp_ipv4_addr(FILE *fp, struct in_addr *addr) { char buf[INET_ADDRSTRLEN]; const char *s; s = inet_ntop(AF_INET, addr, buf, INET_ADDRSTRLEN); if (s != NULL) fprintf(fp, "{addr=%s}", s); else fputs("{addr=???}", fp); } static void print_sctp_ipv6_addr(FILE *fp, struct in6_addr *addr) { char buf[INET6_ADDRSTRLEN]; const char *s; s = inet_ntop(AF_INET6, addr, buf, INET6_ADDRSTRLEN); if (s != NULL) fprintf(fp, "{addr=%s}", s); else fputs("{addr=???}", fp); } static void print_sctp_cmsg(FILE *fp, bool receive, struct cmsghdr *cmsghdr) { void *data; socklen_t len; len = cmsghdr->cmsg_len; data = CMSG_DATA(cmsghdr); switch (cmsghdr->cmsg_type) { case SCTP_INIT: if (len == CMSG_LEN(sizeof(struct sctp_initmsg))) print_sctp_initmsg(fp, (struct sctp_initmsg *)data); else print_gen_cmsg(fp, cmsghdr); break; case SCTP_SNDRCV: if (len == CMSG_LEN(sizeof(struct sctp_sndrcvinfo))) print_sctp_sndrcvinfo(fp, receive, (struct sctp_sndrcvinfo *)data); else print_gen_cmsg(fp, cmsghdr); break; #if 0 case SCTP_EXTRCV: if (len == CMSG_LEN(sizeof(struct sctp_extrcvinfo))) print_sctp_extrcvinfo(fp, (struct sctp_extrcvinfo *)data); else print_gen_cmsg(fp, cmsghdr); break; #endif case SCTP_SNDINFO: if (len == CMSG_LEN(sizeof(struct sctp_sndinfo))) print_sctp_sndinfo(fp, (struct sctp_sndinfo *)data); else print_gen_cmsg(fp, cmsghdr); break; case SCTP_RCVINFO: if (len == CMSG_LEN(sizeof(struct sctp_rcvinfo))) print_sctp_rcvinfo(fp, (struct sctp_rcvinfo *)data); else print_gen_cmsg(fp, cmsghdr); break; case SCTP_NXTINFO: if (len == CMSG_LEN(sizeof(struct sctp_nxtinfo))) print_sctp_nxtinfo(fp, (struct sctp_nxtinfo *)data); else print_gen_cmsg(fp, cmsghdr); break; case SCTP_PRINFO: if (len == CMSG_LEN(sizeof(struct sctp_prinfo))) print_sctp_prinfo(fp, (struct sctp_prinfo *)data); else print_gen_cmsg(fp, cmsghdr); break; case SCTP_AUTHINFO: if (len == CMSG_LEN(sizeof(struct sctp_authinfo))) print_sctp_authinfo(fp, (struct sctp_authinfo *)data); else print_gen_cmsg(fp, cmsghdr); break; case SCTP_DSTADDRV4: if (len == CMSG_LEN(sizeof(struct in_addr))) print_sctp_ipv4_addr(fp, (struct in_addr *)data); else print_gen_cmsg(fp, cmsghdr); break; case SCTP_DSTADDRV6: if (len == CMSG_LEN(sizeof(struct in6_addr))) print_sctp_ipv6_addr(fp, (struct in6_addr *)data); else print_gen_cmsg(fp, cmsghdr); break; default: print_gen_cmsg(fp, cmsghdr); } } static void print_cmsgs(FILE *fp, pid_t pid, bool receive, struct msghdr *msghdr) { struct cmsghdr *cmsghdr; char *cmsgbuf; const char *temp; socklen_t len; int level, type; bool first; len = msghdr->msg_controllen; if (len == 0) { fputs("{}", fp); return; } cmsgbuf = calloc(1, len); if (get_struct(pid, msghdr->msg_control, cmsgbuf, len) == -1) { fprintf(fp, "%p", msghdr->msg_control); free(cmsgbuf); return; } msghdr->msg_control = cmsgbuf; first = true; fputs("{", fp); for (cmsghdr = CMSG_FIRSTHDR(msghdr); cmsghdr != NULL; cmsghdr = CMSG_NXTHDR(msghdr, cmsghdr)) { level = cmsghdr->cmsg_level; type = cmsghdr->cmsg_type; len = cmsghdr->cmsg_len; fprintf(fp, "%s{level=", first ? "" : ","); print_integer_arg(sysdecode_sockopt_level, fp, level); fputs(",type=", fp); temp = sysdecode_cmsg_type(level, type); if (temp) { fputs(temp, fp); } else { fprintf(fp, "%d", type); } fputs(",data=", fp); switch (level) { case IPPROTO_SCTP: print_sctp_cmsg(fp, receive, cmsghdr); break; default: print_gen_cmsg(fp, cmsghdr); break; } fputs("}", fp); first = false; } fputs("}", fp); free(cmsgbuf); } /* * Converts a syscall argument into a string. Said string is * allocated via malloc(), so needs to be free()'d. sc is * a pointer to the syscall description (see above); args is * an array of all of the system call arguments. */ char * print_arg(struct syscall_args *sc, unsigned long *args, long *retval, struct trussinfo *trussinfo) { FILE *fp; char *tmp; size_t tmplen; pid_t pid; fp = open_memstream(&tmp, &tmplen); pid = trussinfo->curthread->proc->pid; switch (sc->type & ARG_MASK) { case Hex: fprintf(fp, "0x%x", (int)args[sc->offset]); break; case Octal: fprintf(fp, "0%o", (int)args[sc->offset]); break; case Int: fprintf(fp, "%d", (int)args[sc->offset]); break; case UInt: fprintf(fp, "%u", (unsigned int)args[sc->offset]); break; case PUInt: { unsigned int val; if (get_struct(pid, (void *)args[sc->offset], &val, sizeof(val)) == 0) fprintf(fp, "{ %u }", val); else fprintf(fp, "0x%lx", args[sc->offset]); break; } case LongHex: fprintf(fp, "0x%lx", args[sc->offset]); break; case Long: fprintf(fp, "%ld", args[sc->offset]); break; case Sizet: fprintf(fp, "%zu", (size_t)args[sc->offset]); break; case Name: { /* NULL-terminated string. */ char *tmp2; tmp2 = get_string(pid, (void*)args[sc->offset], 0); fprintf(fp, "\"%s\"", tmp2); free(tmp2); break; } case BinString: { /* * Binary block of data that might have printable characters. * XXX If type|OUT, assume that the length is the syscall's * return value. Otherwise, assume that the length of the block * is in the next syscall argument. */ int max_string = trussinfo->strsize; char tmp2[max_string + 1], *tmp3; int len; int truncated = 0; if (sc->type & OUT) len = retval[0]; else len = args[sc->offset + 1]; /* * Don't print more than max_string characters, to avoid word * wrap. If we have to truncate put some ... after the string. */ if (len > max_string) { len = max_string; truncated = 1; } if (len && get_struct(pid, (void*)args[sc->offset], &tmp2, len) != -1) { tmp3 = malloc(len * 4 + 1); while (len) { if (strvisx(tmp3, tmp2, len, VIS_CSTYLE|VIS_TAB|VIS_NL) <= max_string) break; len--; truncated = 1; } fprintf(fp, "\"%s\"%s", tmp3, truncated ? "..." : ""); free(tmp3); } else { fprintf(fp, "0x%lx", args[sc->offset]); } break; } case ExecArgs: case ExecEnv: case StringArray: { uintptr_t addr; union { char *strarray[0]; char buf[PAGE_SIZE]; } u; char *string; size_t len; u_int first, i; /* * Only parse argv[] and environment arrays from exec calls * if requested. */ if (((sc->type & ARG_MASK) == ExecArgs && (trussinfo->flags & EXECVEARGS) == 0) || ((sc->type & ARG_MASK) == ExecEnv && (trussinfo->flags & EXECVEENVS) == 0)) { fprintf(fp, "0x%lx", args[sc->offset]); break; } /* * Read a page of pointers at a time. Punt if the top-level * pointer is not aligned. Note that the first read is of * a partial page. */ addr = args[sc->offset]; if (addr % sizeof(char *) != 0) { fprintf(fp, "0x%lx", args[sc->offset]); break; } len = PAGE_SIZE - (addr & PAGE_MASK); if (get_struct(pid, (void *)addr, u.buf, len) == -1) { fprintf(fp, "0x%lx", args[sc->offset]); break; } fputc('[', fp); first = 1; i = 0; while (u.strarray[i] != NULL) { string = get_string(pid, u.strarray[i], 0); fprintf(fp, "%s \"%s\"", first ? "" : ",", string); free(string); first = 0; i++; if (i == len / sizeof(char *)) { addr += len; len = PAGE_SIZE; if (get_struct(pid, (void *)addr, u.buf, len) == -1) { fprintf(fp, ", "); break; } i = 0; } } fputs(" ]", fp); break; } #ifdef __LP64__ case Quad: fprintf(fp, "%ld", args[sc->offset]); break; case QuadHex: fprintf(fp, "0x%lx", args[sc->offset]); break; #else case Quad: case QuadHex: { unsigned long long ll; #if _BYTE_ORDER == _LITTLE_ENDIAN ll = (unsigned long long)args[sc->offset + 1] << 32 | args[sc->offset]; #else ll = (unsigned long long)args[sc->offset] << 32 | args[sc->offset + 1]; #endif if ((sc->type & ARG_MASK) == Quad) fprintf(fp, "%lld", ll); else fprintf(fp, "0x%llx", ll); break; } #endif case PQuadHex: { uint64_t val; if (get_struct(pid, (void *)args[sc->offset], &val, sizeof(val)) == 0) fprintf(fp, "{ 0x%jx }", (uintmax_t)val); else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Ptr: fprintf(fp, "0x%lx", args[sc->offset]); break; case Readlinkres: { char *tmp2; if (retval[0] == -1) break; tmp2 = get_string(pid, (void*)args[sc->offset], retval[0]); fprintf(fp, "\"%s\"", tmp2); free(tmp2); break; } case Ioctl: { const char *temp; unsigned long cmd; cmd = args[sc->offset]; temp = sysdecode_ioctlname(cmd); if (temp) fputs(temp, fp); else { fprintf(fp, "0x%lx { IO%s%s 0x%lx('%c'), %lu, %lu }", cmd, cmd & IOC_OUT ? "R" : "", cmd & IOC_IN ? "W" : "", IOCGROUP(cmd), isprint(IOCGROUP(cmd)) ? (char)IOCGROUP(cmd) : '?', cmd & 0xFF, IOCPARM_LEN(cmd)); } break; } case Timespec: { struct timespec ts; if (get_struct(pid, (void *)args[sc->offset], &ts, sizeof(ts)) != -1) fprintf(fp, "{ %jd.%09ld }", (intmax_t)ts.tv_sec, ts.tv_nsec); else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Timespec2: { struct timespec ts[2]; const char *sep; unsigned int i; if (get_struct(pid, (void *)args[sc->offset], &ts, sizeof(ts)) != -1) { fputs("{ ", fp); sep = ""; for (i = 0; i < nitems(ts); i++) { fputs(sep, fp); sep = ", "; switch (ts[i].tv_nsec) { case UTIME_NOW: fprintf(fp, "UTIME_NOW"); break; case UTIME_OMIT: fprintf(fp, "UTIME_OMIT"); break; default: fprintf(fp, "%jd.%09ld", (intmax_t)ts[i].tv_sec, ts[i].tv_nsec); break; } } fputs(" }", fp); } else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Timeval: { struct timeval tv; if (get_struct(pid, (void *)args[sc->offset], &tv, sizeof(tv)) != -1) fprintf(fp, "{ %jd.%06ld }", (intmax_t)tv.tv_sec, tv.tv_usec); else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Timeval2: { struct timeval tv[2]; if (get_struct(pid, (void *)args[sc->offset], &tv, sizeof(tv)) != -1) fprintf(fp, "{ %jd.%06ld, %jd.%06ld }", (intmax_t)tv[0].tv_sec, tv[0].tv_usec, (intmax_t)tv[1].tv_sec, tv[1].tv_usec); else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Itimerval: { struct itimerval itv; if (get_struct(pid, (void *)args[sc->offset], &itv, sizeof(itv)) != -1) fprintf(fp, "{ %jd.%06ld, %jd.%06ld }", (intmax_t)itv.it_interval.tv_sec, itv.it_interval.tv_usec, (intmax_t)itv.it_value.tv_sec, itv.it_value.tv_usec); else fprintf(fp, "0x%lx", args[sc->offset]); break; } case LinuxSockArgs: { struct linux_socketcall_args largs; if (get_struct(pid, (void *)args[sc->offset], (void *)&largs, sizeof(largs)) != -1) fprintf(fp, "{ %s, 0x%lx }", lookup(linux_socketcall_ops, largs.what, 10), (long unsigned int)largs.args); else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Pollfd: { /* * XXX: A Pollfd argument expects the /next/ syscall argument * to be the number of fds in the array. This matches the poll * syscall. */ struct pollfd *pfd; int numfds = args[sc->offset + 1]; size_t bytes = sizeof(struct pollfd) * numfds; int i; if ((pfd = malloc(bytes)) == NULL) err(1, "Cannot malloc %zu bytes for pollfd array", bytes); if (get_struct(pid, (void *)args[sc->offset], pfd, bytes) != -1) { fputs("{", fp); for (i = 0; i < numfds; i++) { fprintf(fp, " %d/%s", pfd[i].fd, xlookup_bits(poll_flags, pfd[i].events)); } fputs(" }", fp); } else { fprintf(fp, "0x%lx", args[sc->offset]); } free(pfd); break; } case Fd_set: { /* * XXX: A Fd_set argument expects the /first/ syscall argument * to be the number of fds in the array. This matches the * select syscall. */ fd_set *fds; int numfds = args[0]; size_t bytes = _howmany(numfds, _NFDBITS) * _NFDBITS; int i; if ((fds = malloc(bytes)) == NULL) err(1, "Cannot malloc %zu bytes for fd_set array", bytes); if (get_struct(pid, (void *)args[sc->offset], fds, bytes) != -1) { fputs("{", fp); for (i = 0; i < numfds; i++) { if (FD_ISSET(i, fds)) fprintf(fp, " %d", i); } fputs(" }", fp); } else fprintf(fp, "0x%lx", args[sc->offset]); free(fds); break; } case Signal: fputs(strsig2(args[sc->offset]), fp); break; case Sigset: { long sig; sigset_t ss; int i, first; sig = args[sc->offset]; if (get_struct(pid, (void *)args[sc->offset], (void *)&ss, sizeof(ss)) == -1) { fprintf(fp, "0x%lx", args[sc->offset]); break; } fputs("{ ", fp); first = 1; for (i = 1; i < sys_nsig; i++) { if (sigismember(&ss, i)) { fprintf(fp, "%s%s", !first ? "|" : "", strsig2(i)); first = 0; } } if (!first) fputc(' ', fp); fputc('}', fp); break; } case Sigprocmask: print_integer_arg(sysdecode_sigprocmask_how, fp, args[sc->offset]); break; case Fcntlflag: /* XXX: Output depends on the value of the previous argument. */ if (sysdecode_fcntl_arg_p(args[sc->offset - 1])) sysdecode_fcntl_arg(fp, args[sc->offset - 1], args[sc->offset], 16); break; case Open: print_mask_arg(sysdecode_open_flags, fp, args[sc->offset]); break; case Fcntl: print_integer_arg(sysdecode_fcntl_cmd, fp, args[sc->offset]); break; case Mprot: print_mask_arg(sysdecode_mmap_prot, fp, args[sc->offset]); break; case Mmapflags: print_mask_arg(sysdecode_mmap_flags, fp, args[sc->offset]); break; case Whence: print_integer_arg(sysdecode_whence, fp, args[sc->offset]); break; case Sockdomain: print_integer_arg(sysdecode_socketdomain, fp, args[sc->offset]); break; case Socktype: print_mask_arg(sysdecode_socket_type, fp, args[sc->offset]); break; case Shutdown: print_integer_arg(sysdecode_shutdown_how, fp, args[sc->offset]); break; case Resource: print_integer_arg(sysdecode_rlimit, fp, args[sc->offset]); break; case RusageWho: print_integer_arg(sysdecode_getrusage_who, fp, args[sc->offset]); break; case Pathconf: print_integer_arg(sysdecode_pathconf_name, fp, args[sc->offset]); break; case Rforkflags: print_mask_arg(sysdecode_rfork_flags, fp, args[sc->offset]); break; case Sockaddr: { socklen_t len; if (args[sc->offset] == 0) { fputs("NULL", fp); break; } /* * Extract the address length from the next argument. If * this is an output sockaddr (OUT is set), then the * next argument is a pointer to a socklen_t. Otherwise * the next argument contains a socklen_t by value. */ if (sc->type & OUT) { if (get_struct(pid, (void *)args[sc->offset + 1], &len, sizeof(len)) == -1) { fprintf(fp, "0x%lx", args[sc->offset]); break; } } else len = args[sc->offset + 1]; print_sockaddr(fp, trussinfo, (void *)args[sc->offset], len); break; } case Sigaction: { struct sigaction sa; if (get_struct(pid, (void *)args[sc->offset], &sa, sizeof(sa)) != -1) { fputs("{ ", fp); if (sa.sa_handler == SIG_DFL) fputs("SIG_DFL", fp); else if (sa.sa_handler == SIG_IGN) fputs("SIG_IGN", fp); else fprintf(fp, "%p", sa.sa_handler); fprintf(fp, " %s ss_t }", xlookup_bits(sigaction_flags, sa.sa_flags)); } else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Kevent: { /* * XXX XXX: The size of the array is determined by either the * next syscall argument, or by the syscall return value, * depending on which argument number we are. This matches the * kevent syscall, but luckily that's the only syscall that uses * them. */ struct kevent *ke; int numevents = -1; size_t bytes; int i; if (sc->offset == 1) numevents = args[sc->offset+1]; else if (sc->offset == 3 && retval[0] != -1) numevents = retval[0]; if (numevents >= 0) { bytes = sizeof(struct kevent) * numevents; if ((ke = malloc(bytes)) == NULL) err(1, "Cannot malloc %zu bytes for kevent array", bytes); } else ke = NULL; if (numevents >= 0 && get_struct(pid, (void *)args[sc->offset], ke, bytes) != -1) { fputc('{', fp); for (i = 0; i < numevents; i++) { fputc(' ', fp); print_kevent(fp, &ke[i]); } fputs(" }", fp); } else { fprintf(fp, "0x%lx", args[sc->offset]); } free(ke); break; } case Kevent11: { struct kevent_freebsd11 *ke11; struct kevent ke; int numevents = -1; size_t bytes; int i; if (sc->offset == 1) numevents = args[sc->offset+1]; else if (sc->offset == 3 && retval[0] != -1) numevents = retval[0]; if (numevents >= 0) { bytes = sizeof(struct kevent_freebsd11) * numevents; if ((ke11 = malloc(bytes)) == NULL) err(1, "Cannot malloc %zu bytes for kevent array", bytes); } else ke11 = NULL; memset(&ke, 0, sizeof(ke)); if (numevents >= 0 && get_struct(pid, (void *)args[sc->offset], ke11, bytes) != -1) { fputc('{', fp); for (i = 0; i < numevents; i++) { fputc(' ', fp); ke.ident = ke11[i].ident; ke.filter = ke11[i].filter; ke.flags = ke11[i].flags; ke.fflags = ke11[i].fflags; ke.data = ke11[i].data; ke.udata = ke11[i].udata; print_kevent(fp, &ke); } fputs(" }", fp); } else { fprintf(fp, "0x%lx", args[sc->offset]); } free(ke11); break; } case Stat: { struct stat st; if (get_struct(pid, (void *)args[sc->offset], &st, sizeof(st)) != -1) { char mode[12]; strmode(st.st_mode, mode); fprintf(fp, "{ mode=%s,inode=%ju,size=%jd,blksize=%ld }", mode, (uintmax_t)st.st_ino, (intmax_t)st.st_size, (long)st.st_blksize); } else { fprintf(fp, "0x%lx", args[sc->offset]); } break; } case Stat11: { struct freebsd11_stat st; if (get_struct(pid, (void *)args[sc->offset], &st, sizeof(st)) != -1) { char mode[12]; strmode(st.st_mode, mode); fprintf(fp, "{ mode=%s,inode=%ju,size=%jd,blksize=%ld }", mode, (uintmax_t)st.st_ino, (intmax_t)st.st_size, (long)st.st_blksize); } else { fprintf(fp, "0x%lx", args[sc->offset]); } break; } case StatFs: { unsigned int i; struct statfs buf; if (get_struct(pid, (void *)args[sc->offset], &buf, sizeof(buf)) != -1) { char fsid[17]; bzero(fsid, sizeof(fsid)); if (buf.f_fsid.val[0] != 0 || buf.f_fsid.val[1] != 0) { for (i = 0; i < sizeof(buf.f_fsid); i++) snprintf(&fsid[i*2], sizeof(fsid) - (i*2), "%02x", ((u_char *)&buf.f_fsid)[i]); } fprintf(fp, "{ fstypename=%s,mntonname=%s,mntfromname=%s," "fsid=%s }", buf.f_fstypename, buf.f_mntonname, buf.f_mntfromname, fsid); } else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Rusage: { struct rusage ru; if (get_struct(pid, (void *)args[sc->offset], &ru, sizeof(ru)) != -1) { fprintf(fp, "{ u=%jd.%06ld,s=%jd.%06ld,in=%ld,out=%ld }", (intmax_t)ru.ru_utime.tv_sec, ru.ru_utime.tv_usec, (intmax_t)ru.ru_stime.tv_sec, ru.ru_stime.tv_usec, ru.ru_inblock, ru.ru_oublock); } else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Rlimit: { struct rlimit rl; if (get_struct(pid, (void *)args[sc->offset], &rl, sizeof(rl)) != -1) { fprintf(fp, "{ cur=%ju,max=%ju }", rl.rlim_cur, rl.rlim_max); } else fprintf(fp, "0x%lx", args[sc->offset]); break; } case ExitStatus: { int status; if (get_struct(pid, (void *)args[sc->offset], &status, sizeof(status)) != -1) { fputs("{ ", fp); if (WIFCONTINUED(status)) fputs("CONTINUED", fp); else if (WIFEXITED(status)) fprintf(fp, "EXITED,val=%d", WEXITSTATUS(status)); else if (WIFSIGNALED(status)) fprintf(fp, "SIGNALED,sig=%s%s", strsig2(WTERMSIG(status)), WCOREDUMP(status) ? ",cored" : ""); else fprintf(fp, "STOPPED,sig=%s", strsig2(WTERMSIG(status))); fputs(" }", fp); } else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Waitoptions: print_mask_arg(sysdecode_wait6_options, fp, args[sc->offset]); break; case Idtype: print_integer_arg(sysdecode_idtype, fp, args[sc->offset]); break; case Procctl: print_integer_arg(sysdecode_procctl_cmd, fp, args[sc->offset]); break; case Umtxop: print_integer_arg(sysdecode_umtx_op, fp, args[sc->offset]); break; case Atfd: print_integer_arg(sysdecode_atfd, fp, args[sc->offset]); break; case Atflags: print_mask_arg(sysdecode_atflags, fp, args[sc->offset]); break; case Accessmode: print_mask_arg(sysdecode_access_mode, fp, args[sc->offset]); break; case Sysarch: print_integer_arg(sysdecode_sysarch_number, fp, args[sc->offset]); break; case PipeFds: /* * The pipe() system call in the kernel returns its * two file descriptors via return values. However, * the interface exposed by libc is that pipe() * accepts a pointer to an array of descriptors. * Format the output to match the libc API by printing * the returned file descriptors as a fake argument. * * Overwrite the first retval to signal a successful * return as well. */ fprintf(fp, "{ %ld, %ld }", retval[0], retval[1]); retval[0] = 0; break; case Utrace: { size_t len; void *utrace_addr; len = args[sc->offset + 1]; utrace_addr = calloc(1, len); if (get_struct(pid, (void *)args[sc->offset], (void *)utrace_addr, len) != -1) print_utrace(fp, utrace_addr, len); else fprintf(fp, "0x%lx", args[sc->offset]); free(utrace_addr); break; } case IntArray: { int descriptors[16]; unsigned long i, ndescriptors; bool truncated; ndescriptors = args[sc->offset + 1]; truncated = false; if (ndescriptors > nitems(descriptors)) { ndescriptors = nitems(descriptors); truncated = true; } if (get_struct(pid, (void *)args[sc->offset], descriptors, ndescriptors * sizeof(descriptors[0])) != -1) { fprintf(fp, "{"); for (i = 0; i < ndescriptors; i++) fprintf(fp, i == 0 ? " %d" : ", %d", descriptors[i]); fprintf(fp, truncated ? ", ... }" : " }"); } else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Pipe2: print_mask_arg(sysdecode_pipe2_flags, fp, args[sc->offset]); break; case CapFcntlRights: { uint32_t rights; if (sc->type & OUT) { if (get_struct(pid, (void *)args[sc->offset], &rights, sizeof(rights)) == -1) { fprintf(fp, "0x%lx", args[sc->offset]); break; } } else rights = args[sc->offset]; print_mask_arg32(sysdecode_cap_fcntlrights, fp, rights); break; } case Fadvice: print_integer_arg(sysdecode_fadvice, fp, args[sc->offset]); break; case FileFlags: { fflags_t rem; if (!sysdecode_fileflags(fp, args[sc->offset], &rem)) fprintf(fp, "0x%x", rem); else if (rem != 0) fprintf(fp, "|0x%x", rem); break; } case Flockop: print_mask_arg(sysdecode_flock_operation, fp, args[sc->offset]); break; case Getfsstatmode: print_integer_arg(sysdecode_getfsstat_mode, fp, args[sc->offset]); break; case Kldsymcmd: print_integer_arg(sysdecode_kldsym_cmd, fp, args[sc->offset]); break; case Kldunloadflags: print_integer_arg(sysdecode_kldunload_flags, fp, args[sc->offset]); break; case Madvice: print_integer_arg(sysdecode_madvice, fp, args[sc->offset]); break; case Socklent: fprintf(fp, "%u", (socklen_t)args[sc->offset]); break; case Sockprotocol: { const char *temp; int domain, protocol; domain = args[sc->offset - 2]; protocol = args[sc->offset]; if (protocol == 0) { fputs("0", fp); } else { temp = sysdecode_socket_protocol(domain, protocol); if (temp) { fputs(temp, fp); } else { fprintf(fp, "%d", protocol); } } break; } case Sockoptlevel: print_integer_arg(sysdecode_sockopt_level, fp, args[sc->offset]); break; case Sockoptname: { const char *temp; int level, name; level = args[sc->offset - 1]; name = args[sc->offset]; temp = sysdecode_sockopt_name(level, name); if (temp) { fputs(temp, fp); } else { fprintf(fp, "%d", name); } break; } case Msgflags: print_mask_arg(sysdecode_msg_flags, fp, args[sc->offset]); break; case CapRights: { cap_rights_t rights; if (get_struct(pid, (void *)args[sc->offset], &rights, sizeof(rights)) != -1) { fputs("{ ", fp); sysdecode_cap_rights(fp, &rights); fputs(" }", fp); } else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Acltype: print_integer_arg(sysdecode_acltype, fp, args[sc->offset]); break; case Extattrnamespace: print_integer_arg(sysdecode_extattrnamespace, fp, args[sc->offset]); break; case Minherit: print_integer_arg(sysdecode_minherit_inherit, fp, args[sc->offset]); break; case Mlockall: print_mask_arg(sysdecode_mlockall_flags, fp, args[sc->offset]); break; case Mountflags: print_mask_arg(sysdecode_mount_flags, fp, args[sc->offset]); break; case Msync: print_mask_arg(sysdecode_msync_flags, fp, args[sc->offset]); break; case Priowhich: print_integer_arg(sysdecode_prio_which, fp, args[sc->offset]); break; case Ptraceop: print_integer_arg(sysdecode_ptrace_request, fp, args[sc->offset]); break; case Quotactlcmd: if (!sysdecode_quotactl_cmd(fp, args[sc->offset])) fprintf(fp, "%#x", (int)args[sc->offset]); break; case Reboothowto: print_mask_arg(sysdecode_reboot_howto, fp, args[sc->offset]); break; case Rtpriofunc: print_integer_arg(sysdecode_rtprio_function, fp, args[sc->offset]); break; case Schedpolicy: print_integer_arg(sysdecode_scheduler_policy, fp, args[sc->offset]); break; case Schedparam: { struct sched_param sp; if (get_struct(pid, (void *)args[sc->offset], &sp, sizeof(sp)) != -1) fprintf(fp, "{ %d }", sp.sched_priority); else fprintf(fp, "0x%lx", args[sc->offset]); break; } case PSig: { int sig; if (get_struct(pid, (void *)args[sc->offset], &sig, sizeof(sig)) == 0) fprintf(fp, "{ %s }", strsig2(sig)); else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Siginfo: { siginfo_t si; if (get_struct(pid, (void *)args[sc->offset], &si, sizeof(si)) != -1) { fprintf(fp, "{ signo=%s", strsig2(si.si_signo)); decode_siginfo(fp, &si); fprintf(fp, " }"); } else fprintf(fp, "0x%lx", args[sc->offset]); break; } case Iovec: /* * Print argument as an array of struct iovec, where the next * syscall argument is the number of elements of the array. */ print_iovec(fp, trussinfo, (void *)args[sc->offset], (int)args[sc->offset + 1]); break; case Sctpsndrcvinfo: { struct sctp_sndrcvinfo info; if (get_struct(pid, (void *)args[sc->offset], &info, sizeof(struct sctp_sndrcvinfo)) == -1) { fprintf(fp, "0x%lx", args[sc->offset]); break; } print_sctp_sndrcvinfo(fp, sc->type & OUT, &info); break; } case Msghdr: { struct msghdr msghdr; if (get_struct(pid, (void *)args[sc->offset], &msghdr, sizeof(struct msghdr)) == -1) { fprintf(fp, "0x%lx", args[sc->offset]); break; } fputs("{", fp); print_sockaddr(fp, trussinfo, msghdr.msg_name, msghdr.msg_namelen); fprintf(fp, ",%d,", msghdr.msg_namelen); print_iovec(fp, trussinfo, msghdr.msg_iov, msghdr.msg_iovlen); fprintf(fp, ",%d,", msghdr.msg_iovlen); print_cmsgs(fp, pid, sc->type & OUT, &msghdr); fprintf(fp, ",%u,", msghdr.msg_controllen); print_mask_arg(sysdecode_msg_flags, fp, msghdr.msg_flags); fputs("}", fp); break; } case CloudABIAdvice: fputs(xlookup(cloudabi_advice, args[sc->offset]), fp); break; case CloudABIClockID: fputs(xlookup(cloudabi_clockid, args[sc->offset]), fp); break; case CloudABIFDSFlags: fputs(xlookup_bits(cloudabi_fdsflags, args[sc->offset]), fp); break; case CloudABIFDStat: { cloudabi_fdstat_t fds; if (get_struct(pid, (void *)args[sc->offset], &fds, sizeof(fds)) != -1) { fprintf(fp, "{ %s, ", xlookup(cloudabi_filetype, fds.fs_filetype)); fprintf(fp, "%s, ... }", xlookup_bits(cloudabi_fdflags, fds.fs_flags)); } else fprintf(fp, "0x%lx", args[sc->offset]); break; } case CloudABIFileStat: { cloudabi_filestat_t fsb; if (get_struct(pid, (void *)args[sc->offset], &fsb, sizeof(fsb)) != -1) fprintf(fp, "{ %s, %ju }", xlookup(cloudabi_filetype, fsb.st_filetype), (uintmax_t)fsb.st_size); else fprintf(fp, "0x%lx", args[sc->offset]); break; } case CloudABIFileType: fputs(xlookup(cloudabi_filetype, args[sc->offset]), fp); break; case CloudABIFSFlags: fputs(xlookup_bits(cloudabi_fsflags, args[sc->offset]), fp); break; case CloudABILookup: if ((args[sc->offset] & CLOUDABI_LOOKUP_SYMLINK_FOLLOW) != 0) fprintf(fp, "%d|LOOKUP_SYMLINK_FOLLOW", (int)args[sc->offset]); else fprintf(fp, "%d", (int)args[sc->offset]); break; case CloudABIMFlags: fputs(xlookup_bits(cloudabi_mflags, args[sc->offset]), fp); break; case CloudABIMProt: fputs(xlookup_bits(cloudabi_mprot, args[sc->offset]), fp); break; case CloudABIMSFlags: fputs(xlookup_bits(cloudabi_msflags, args[sc->offset]), fp); break; case CloudABIOFlags: fputs(xlookup_bits(cloudabi_oflags, args[sc->offset]), fp); break; case CloudABISDFlags: fputs(xlookup_bits(cloudabi_sdflags, args[sc->offset]), fp); break; case CloudABISignal: fputs(xlookup(cloudabi_signal, args[sc->offset]), fp); break; case CloudABITimestamp: fprintf(fp, "%lu.%09lus", args[sc->offset] / 1000000000, args[sc->offset] % 1000000000); break; case CloudABIULFlags: fputs(xlookup_bits(cloudabi_ulflags, args[sc->offset]), fp); break; case CloudABIWhence: fputs(xlookup(cloudabi_whence, args[sc->offset]), fp); break; default: errx(1, "Invalid argument type %d\n", sc->type & ARG_MASK); } fclose(fp); return (tmp); } /* * Print (to outfile) the system call and its arguments. */ void print_syscall(struct trussinfo *trussinfo) { struct threadinfo *t; const char *name; char **s_args; int i, len, nargs; t = trussinfo->curthread; name = t->cs.sc->name; nargs = t->cs.nargs; s_args = t->cs.s_args; len = print_line_prefix(trussinfo); len += fprintf(trussinfo->outfile, "%s(", name); for (i = 0; i < nargs; i++) { if (s_args[i] != NULL) len += fprintf(trussinfo->outfile, "%s", s_args[i]); else len += fprintf(trussinfo->outfile, ""); len += fprintf(trussinfo->outfile, "%s", i < (nargs - 1) ? "," : ""); } len += fprintf(trussinfo->outfile, ")"); for (i = 0; i < 6 - (len / 8); i++) fprintf(trussinfo->outfile, "\t"); } void print_syscall_ret(struct trussinfo *trussinfo, int errorp, long *retval) { struct timespec timediff; struct threadinfo *t; struct syscall *sc; int error; t = trussinfo->curthread; sc = t->cs.sc; if (trussinfo->flags & COUNTONLY) { - timespecsubt(&t->after, &t->before, &timediff); + timespecsub(&t->after, &t->before, &timediff); timespecadd(&sc->time, &timediff, &sc->time); sc->ncalls++; if (errorp) sc->nerror++; return; } print_syscall(trussinfo); fflush(trussinfo->outfile); if (retval == NULL) { /* * This system call resulted in the current thread's exit, * so there is no return value or error to display. */ fprintf(trussinfo->outfile, "\n"); return; } if (errorp) { error = sysdecode_abi_to_freebsd_errno(t->proc->abi->abi, retval[0]); fprintf(trussinfo->outfile, " ERR#%ld '%s'\n", retval[0], error == INT_MAX ? "Unknown error" : strerror(error)); } #ifndef __LP64__ else if (sc->ret_type == 2) { off_t off; #if _BYTE_ORDER == _LITTLE_ENDIAN off = (off_t)retval[1] << 32 | retval[0]; #else off = (off_t)retval[0] << 32 | retval[1]; #endif fprintf(trussinfo->outfile, " = %jd (0x%jx)\n", (intmax_t)off, (intmax_t)off); } #endif else fprintf(trussinfo->outfile, " = %ld (0x%lx)\n", retval[0], retval[0]); } void print_summary(struct trussinfo *trussinfo) { struct timespec total = {0, 0}; struct syscall *sc; int ncall, nerror; fprintf(trussinfo->outfile, "%-20s%15s%8s%8s\n", "syscall", "seconds", "calls", "errors"); ncall = nerror = 0; STAILQ_FOREACH(sc, &syscalls, entries) if (sc->ncalls) { fprintf(trussinfo->outfile, "%-20s%5jd.%09ld%8d%8d\n", sc->name, (intmax_t)sc->time.tv_sec, sc->time.tv_nsec, sc->ncalls, sc->nerror); timespecadd(&total, &sc->time, &total); ncall += sc->ncalls; nerror += sc->nerror; } fprintf(trussinfo->outfile, "%20s%15s%8s%8s\n", "", "-------------", "-------", "-------"); fprintf(trussinfo->outfile, "%-20s%5jd.%09ld%8d%8d\n", "", (intmax_t)total.tv_sec, total.tv_nsec, ncall, nerror); } Index: head/usr.bin/truss/truss.h =================================================================== --- head/usr.bin/truss/truss.h (revision 336913) +++ head/usr.bin/truss/truss.h (revision 336914) @@ -1,141 +1,121 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright 2001 Jamey Wood * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #define FOLLOWFORKS 0x00000001 #define RELATIVETIMESTAMPS 0x00000002 #define ABSOLUTETIMESTAMPS 0x00000004 #define NOSIGS 0x00000008 #define EXECVEARGS 0x00000010 #define EXECVEENVS 0x00000020 #define COUNTONLY 0x00000040 #define DISPLAYTIDS 0x00000080 struct procinfo; struct syscall; struct trussinfo; /* * The lookup of normal system calls are optimized by using a fixed * array for the first 1024 system calls that can be indexed directly. * Unknown system calls with other IDs are stored in a linked list. */ #define SYSCALL_NORMAL_COUNT 1024 struct extra_syscall { STAILQ_ENTRY(extra_syscall) entries; struct syscall *sc; u_int number; }; struct procabi { const char *type; enum sysdecode_abi abi; int (*fetch_args)(struct trussinfo *, u_int); int (*fetch_retval)(struct trussinfo *, long *, int *); STAILQ_HEAD(, extra_syscall) extra_syscalls; struct syscall *syscalls[SYSCALL_NORMAL_COUNT]; }; #define PROCABI(abi) DATA_SET(procabi, abi) /* * This is confusingly named. It holds per-thread state about the * currently executing system call. syscall.h defines a struct * syscall that holds metadata used to format system call arguments. * * NB: args[] stores the raw argument values (e.g. from registers) * passed to the system call. s_args[] stores a string representation * of a system call's arguments. These do not necessarily map one to * one. A system call description may omit individual arguments * (padding) or combine adjacent arguments (e.g. when passing an off_t * argument on a 32-bit system). The nargs member contains the count * of valid pointers in s_args[], not args[]. */ struct current_syscall { struct syscall *sc; unsigned int number; unsigned int nargs; unsigned long args[10]; char *s_args[10]; /* the printable arguments */ }; struct threadinfo { LIST_ENTRY(threadinfo) entries; struct procinfo *proc; lwpid_t tid; int in_syscall; struct current_syscall cs; struct timespec before; struct timespec after; }; struct procinfo { LIST_ENTRY(procinfo) entries; pid_t pid; struct procabi *abi; LIST_HEAD(, threadinfo) threadlist; }; struct trussinfo { int flags; int strsize; FILE *outfile; struct timespec start_time; struct threadinfo *curthread; LIST_HEAD(, procinfo) proclist; }; - -#define timespecsubt(tvp, uvp, vvp) \ - do { \ - (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ - (vvp)->tv_nsec = (tvp)->tv_nsec - (uvp)->tv_nsec; \ - if ((vvp)->tv_nsec < 0) { \ - (vvp)->tv_sec--; \ - (vvp)->tv_nsec += 1000000000; \ - } \ - } while (0) - -#define timespecadd(tvp, uvp, vvp) \ - do { \ - (vvp)->tv_sec = (tvp)->tv_sec + (uvp)->tv_sec; \ - (vvp)->tv_nsec = (tvp)->tv_nsec + (uvp)->tv_nsec; \ - if ((vvp)->tv_nsec > 1000000000) { \ - (vvp)->tv_sec++; \ - (vvp)->tv_nsec -= 1000000000; \ - } \ - } while (0) Index: head/usr.sbin/camdd/camdd.c =================================================================== --- head/usr.sbin/camdd/camdd.c (revision 336913) +++ head/usr.sbin/camdd/camdd.c (revision 336914) @@ -1,3527 +1,3510 @@ /*- * Copyright (c) 1997-2007 Kenneth D. Merry * Copyright (c) 2013, 2014, 2015 Spectra Logic Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * * Authors: Ken Merry (Spectra Logic Corporation) */ /* * This is eventually intended to be: * - A basic data transfer/copy utility * - A simple benchmark utility * - An example of how to use the asynchronous pass(4) driver interface. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef enum { CAMDD_CMD_NONE = 0x00000000, CAMDD_CMD_HELP = 0x00000001, CAMDD_CMD_WRITE = 0x00000002, CAMDD_CMD_READ = 0x00000003 } camdd_cmdmask; typedef enum { CAMDD_ARG_NONE = 0x00000000, CAMDD_ARG_VERBOSE = 0x00000001, CAMDD_ARG_DEVICE = 0x00000002, CAMDD_ARG_BUS = 0x00000004, CAMDD_ARG_TARGET = 0x00000008, CAMDD_ARG_LUN = 0x00000010, CAMDD_ARG_UNIT = 0x00000020, CAMDD_ARG_TIMEOUT = 0x00000040, CAMDD_ARG_ERR_RECOVER = 0x00000080, CAMDD_ARG_RETRIES = 0x00000100 } camdd_argmask; typedef enum { CAMDD_DEV_NONE = 0x00, CAMDD_DEV_PASS = 0x01, CAMDD_DEV_FILE = 0x02 } camdd_dev_type; struct camdd_io_opts { camdd_dev_type dev_type; char *dev_name; uint64_t blocksize; uint64_t queue_depth; uint64_t offset; int min_cmd_size; int write_dev; uint64_t debug; }; typedef enum { CAMDD_BUF_NONE, CAMDD_BUF_DATA, CAMDD_BUF_INDIRECT } camdd_buf_type; struct camdd_buf_indirect { /* * Pointer to the source buffer. */ struct camdd_buf *src_buf; /* * Offset into the source buffer, in bytes. */ uint64_t offset; /* * Pointer to the starting point in the source buffer. */ uint8_t *start_ptr; /* * Length of this chunk in bytes. */ size_t len; }; struct camdd_buf_data { /* * Buffer allocated when we allocate this camdd_buf. This should * be the size of the blocksize for this device. */ uint8_t *buf; /* * The amount of backing store allocated in buf. Generally this * will be the blocksize of the device. */ uint32_t alloc_len; /* * The amount of data that was put into the buffer (on reads) or * the amount of data we have put onto the src_list so far (on * writes). */ uint32_t fill_len; /* * The amount of data that was not transferred. */ uint32_t resid; /* * Starting byte offset on the reader. */ uint64_t src_start_offset; /* * CCB used for pass(4) device targets. */ union ccb ccb; /* * Number of scatter/gather segments. */ int sg_count; /* * Set if we had to tack on an extra buffer to round the transfer * up to a sector size. */ int extra_buf; /* * Scatter/gather list used generally when we're the writer for a * pass(4) device. */ bus_dma_segment_t *segs; /* * Scatter/gather list used generally when we're the writer for a * file or block device; */ struct iovec *iovec; }; union camdd_buf_types { struct camdd_buf_indirect indirect; struct camdd_buf_data data; }; typedef enum { CAMDD_STATUS_NONE, CAMDD_STATUS_OK, CAMDD_STATUS_SHORT_IO, CAMDD_STATUS_EOF, CAMDD_STATUS_ERROR } camdd_buf_status; struct camdd_buf { camdd_buf_type buf_type; union camdd_buf_types buf_type_spec; camdd_buf_status status; uint64_t lba; size_t len; /* * A reference count of how many indirect buffers point to this * buffer. */ int refcount; /* * A link back to our parent device. */ struct camdd_dev *dev; STAILQ_ENTRY(camdd_buf) links; STAILQ_ENTRY(camdd_buf) work_links; /* * A count of the buffers on the src_list. */ int src_count; /* * List of buffers from our partner thread that are the components * of this buffer for the I/O. Uses src_links. */ STAILQ_HEAD(,camdd_buf) src_list; STAILQ_ENTRY(camdd_buf) src_links; }; #define NUM_DEV_TYPES 2 struct camdd_dev_pass { int scsi_dev_type; int protocol; struct cam_device *dev; uint64_t max_sector; uint32_t block_len; uint32_t cpi_maxio; }; typedef enum { CAMDD_FILE_NONE, CAMDD_FILE_REG, CAMDD_FILE_STD, CAMDD_FILE_PIPE, CAMDD_FILE_DISK, CAMDD_FILE_TAPE, CAMDD_FILE_TTY, CAMDD_FILE_MEM } camdd_file_type; typedef enum { CAMDD_FF_NONE = 0x00, CAMDD_FF_CAN_SEEK = 0x01 } camdd_file_flags; struct camdd_dev_file { int fd; struct stat sb; char filename[MAXPATHLEN + 1]; camdd_file_type file_type; camdd_file_flags file_flags; uint8_t *tmp_buf; }; struct camdd_dev_block { int fd; uint64_t size_bytes; uint32_t block_len; }; union camdd_dev_spec { struct camdd_dev_pass pass; struct camdd_dev_file file; struct camdd_dev_block block; }; typedef enum { CAMDD_DEV_FLAG_NONE = 0x00, CAMDD_DEV_FLAG_EOF = 0x01, CAMDD_DEV_FLAG_PEER_EOF = 0x02, CAMDD_DEV_FLAG_ACTIVE = 0x04, CAMDD_DEV_FLAG_EOF_SENT = 0x08, CAMDD_DEV_FLAG_EOF_QUEUED = 0x10 } camdd_dev_flags; struct camdd_dev { camdd_dev_type dev_type; union camdd_dev_spec dev_spec; camdd_dev_flags flags; char device_name[MAXPATHLEN+1]; uint32_t blocksize; uint32_t sector_size; uint64_t max_sector; uint64_t sector_io_limit; int min_cmd_size; int write_dev; int retry_count; int io_timeout; int debug; uint64_t start_offset_bytes; uint64_t next_io_pos_bytes; uint64_t next_peer_pos_bytes; uint64_t next_completion_pos_bytes; uint64_t peer_bytes_queued; uint64_t bytes_transferred; uint32_t target_queue_depth; uint32_t cur_active_io; uint8_t *extra_buf; uint32_t extra_buf_len; struct camdd_dev *peer_dev; pthread_mutex_t mutex; pthread_cond_t cond; int kq; int (*run)(struct camdd_dev *dev); int (*fetch)(struct camdd_dev *dev); /* * Buffers that are available for I/O. Uses links. */ STAILQ_HEAD(,camdd_buf) free_queue; /* * Free indirect buffers. These are used for breaking a large * buffer into multiple pieces. */ STAILQ_HEAD(,camdd_buf) free_indirect_queue; /* * Buffers that have been queued to the kernel. Uses links. */ STAILQ_HEAD(,camdd_buf) active_queue; /* * Will generally contain one of our buffers that is waiting for enough * I/O from our partner thread to be able to execute. This will * generally happen when our per-I/O-size is larger than the * partner thread's per-I/O-size. Uses links. */ STAILQ_HEAD(,camdd_buf) pending_queue; /* * Number of buffers on the pending queue */ int num_pending_queue; /* * Buffers that are filled and ready to execute. This is used when * our partner (reader) thread sends us blocks that are larger than * our blocksize, and so we have to split them into multiple pieces. */ STAILQ_HEAD(,camdd_buf) run_queue; /* * Number of buffers on the run queue. */ int num_run_queue; STAILQ_HEAD(,camdd_buf) reorder_queue; int num_reorder_queue; /* * Buffers that have been queued to us by our partner thread * (generally the reader thread) to be written out. Uses * work_links. */ STAILQ_HEAD(,camdd_buf) work_queue; /* * Buffers that have been completed by our partner thread. Uses * work_links. */ STAILQ_HEAD(,camdd_buf) peer_done_queue; /* * Number of buffers on the peer done queue. */ uint32_t num_peer_done_queue; /* * A list of buffers that we have queued to our peer thread. Uses * links. */ STAILQ_HEAD(,camdd_buf) peer_work_queue; /* * Number of buffers on the peer work queue. */ uint32_t num_peer_work_queue; }; static sem_t camdd_sem; static sig_atomic_t need_exit = 0; static sig_atomic_t error_exit = 0; static sig_atomic_t need_status = 0; #ifndef min #define min(a, b) (a < b) ? a : b #endif -/* - * XXX KDM private copy of timespecsub(). This is normally defined in - * sys/time.h, but is only enabled in the kernel. If that definition is - * enabled in userland, it breaks the build of libnetbsd. - */ -#ifndef timespecsub -#define timespecsub(vvp, uvp) \ - do { \ - (vvp)->tv_sec -= (uvp)->tv_sec; \ - (vvp)->tv_nsec -= (uvp)->tv_nsec; \ - if ((vvp)->tv_nsec < 0) { \ - (vvp)->tv_sec--; \ - (vvp)->tv_nsec += 1000000000; \ - } \ - } while (0) -#endif - /* Generically useful offsets into the peripheral private area */ #define ppriv_ptr0 periph_priv.entries[0].ptr #define ppriv_ptr1 periph_priv.entries[1].ptr #define ppriv_field0 periph_priv.entries[0].field #define ppriv_field1 periph_priv.entries[1].field #define ccb_buf ppriv_ptr0 #define CAMDD_FILE_DEFAULT_BLOCK 524288 #define CAMDD_FILE_DEFAULT_DEPTH 1 #define CAMDD_PASS_MAX_BLOCK 1048576 #define CAMDD_PASS_DEFAULT_DEPTH 6 #define CAMDD_PASS_RW_TIMEOUT 60 * 1000 static int parse_btl(char *tstr, int *bus, int *target, int *lun, camdd_argmask *arglst); void camdd_free_dev(struct camdd_dev *dev); struct camdd_dev *camdd_alloc_dev(camdd_dev_type dev_type, struct kevent *new_ke, int num_ke, int retry_count, int timeout); static struct camdd_buf *camdd_alloc_buf(struct camdd_dev *dev, camdd_buf_type buf_type); void camdd_release_buf(struct camdd_buf *buf); struct camdd_buf *camdd_get_buf(struct camdd_dev *dev, camdd_buf_type buf_type); int camdd_buf_sg_create(struct camdd_buf *buf, int iovec, uint32_t sector_size, uint32_t *num_sectors_used, int *double_buf_needed); uint32_t camdd_buf_get_len(struct camdd_buf *buf); void camdd_buf_add_child(struct camdd_buf *buf, struct camdd_buf *child_buf); int camdd_probe_tape(int fd, char *filename, uint64_t *max_iosize, uint64_t *max_blk, uint64_t *min_blk, uint64_t *blk_gran); int camdd_probe_pass_scsi(struct cam_device *cam_dev, union ccb *ccb, camdd_argmask arglist, int probe_retry_count, int probe_timeout, uint64_t *maxsector, uint32_t *block_len); struct camdd_dev *camdd_probe_file(int fd, struct camdd_io_opts *io_opts, int retry_count, int timeout); struct camdd_dev *camdd_probe_pass(struct cam_device *cam_dev, struct camdd_io_opts *io_opts, camdd_argmask arglist, int probe_retry_count, int probe_timeout, int io_retry_count, int io_timeout); void *camdd_file_worker(void *arg); camdd_buf_status camdd_ccb_status(union ccb *ccb, int protocol); int camdd_get_cgd(struct cam_device *device, struct ccb_getdev *cgd); int camdd_queue_peer_buf(struct camdd_dev *dev, struct camdd_buf *buf); int camdd_complete_peer_buf(struct camdd_dev *dev, struct camdd_buf *peer_buf); void camdd_peer_done(struct camdd_buf *buf); void camdd_complete_buf(struct camdd_dev *dev, struct camdd_buf *buf, int *error_count); int camdd_pass_fetch(struct camdd_dev *dev); int camdd_file_run(struct camdd_dev *dev); int camdd_pass_run(struct camdd_dev *dev); int camdd_get_next_lba_len(struct camdd_dev *dev, uint64_t *lba, ssize_t *len); int camdd_queue(struct camdd_dev *dev, struct camdd_buf *read_buf); void camdd_get_depth(struct camdd_dev *dev, uint32_t *our_depth, uint32_t *peer_depth, uint32_t *our_bytes, uint32_t *peer_bytes); void *camdd_worker(void *arg); void camdd_sig_handler(int sig); void camdd_print_status(struct camdd_dev *camdd_dev, struct camdd_dev *other_dev, struct timespec *start_time); int camdd_rw(struct camdd_io_opts *io_opts, int num_io_opts, uint64_t max_io, int retry_count, int timeout); int camdd_parse_io_opts(char *args, int is_write, struct camdd_io_opts *io_opts); void usage(void); /* * Parse out a bus, or a bus, target and lun in the following * format: * bus * bus:target * bus:target:lun * * Returns the number of parsed components, or 0. */ static int parse_btl(char *tstr, int *bus, int *target, int *lun, camdd_argmask *arglst) { char *tmpstr; int convs = 0; while (isspace(*tstr) && (*tstr != '\0')) tstr++; tmpstr = (char *)strtok(tstr, ":"); if ((tmpstr != NULL) && (*tmpstr != '\0')) { *bus = strtol(tmpstr, NULL, 0); *arglst |= CAMDD_ARG_BUS; convs++; tmpstr = (char *)strtok(NULL, ":"); if ((tmpstr != NULL) && (*tmpstr != '\0')) { *target = strtol(tmpstr, NULL, 0); *arglst |= CAMDD_ARG_TARGET; convs++; tmpstr = (char *)strtok(NULL, ":"); if ((tmpstr != NULL) && (*tmpstr != '\0')) { *lun = strtol(tmpstr, NULL, 0); *arglst |= CAMDD_ARG_LUN; convs++; } } } return convs; } /* * XXX KDM clean up and free all of the buffers on the queue! */ void camdd_free_dev(struct camdd_dev *dev) { if (dev == NULL) return; switch (dev->dev_type) { case CAMDD_DEV_FILE: { struct camdd_dev_file *file_dev = &dev->dev_spec.file; if (file_dev->fd != -1) close(file_dev->fd); free(file_dev->tmp_buf); break; } case CAMDD_DEV_PASS: { struct camdd_dev_pass *pass_dev = &dev->dev_spec.pass; if (pass_dev->dev != NULL) cam_close_device(pass_dev->dev); break; } default: break; } free(dev); } struct camdd_dev * camdd_alloc_dev(camdd_dev_type dev_type, struct kevent *new_ke, int num_ke, int retry_count, int timeout) { struct camdd_dev *dev = NULL; struct kevent *ke; size_t ke_size; int retval = 0; dev = calloc(1, sizeof(*dev)); if (dev == NULL) { warn("%s: unable to malloc %zu bytes", __func__, sizeof(*dev)); goto bailout; } dev->dev_type = dev_type; dev->io_timeout = timeout; dev->retry_count = retry_count; STAILQ_INIT(&dev->free_queue); STAILQ_INIT(&dev->free_indirect_queue); STAILQ_INIT(&dev->active_queue); STAILQ_INIT(&dev->pending_queue); STAILQ_INIT(&dev->run_queue); STAILQ_INIT(&dev->reorder_queue); STAILQ_INIT(&dev->work_queue); STAILQ_INIT(&dev->peer_done_queue); STAILQ_INIT(&dev->peer_work_queue); retval = pthread_mutex_init(&dev->mutex, NULL); if (retval != 0) { warnc(retval, "%s: failed to initialize mutex", __func__); goto bailout; } retval = pthread_cond_init(&dev->cond, NULL); if (retval != 0) { warnc(retval, "%s: failed to initialize condition variable", __func__); goto bailout; } dev->kq = kqueue(); if (dev->kq == -1) { warn("%s: Unable to create kqueue", __func__); goto bailout; } ke_size = sizeof(struct kevent) * (num_ke + 4); ke = calloc(1, ke_size); if (ke == NULL) { warn("%s: unable to malloc %zu bytes", __func__, ke_size); goto bailout; } if (num_ke > 0) bcopy(new_ke, ke, num_ke * sizeof(struct kevent)); EV_SET(&ke[num_ke++], (uintptr_t)&dev->work_queue, EVFILT_USER, EV_ADD|EV_ENABLE|EV_CLEAR, 0,0, 0); EV_SET(&ke[num_ke++], (uintptr_t)&dev->peer_done_queue, EVFILT_USER, EV_ADD|EV_ENABLE|EV_CLEAR, 0,0, 0); EV_SET(&ke[num_ke++], SIGINFO, EVFILT_SIGNAL, EV_ADD|EV_ENABLE, 0,0,0); EV_SET(&ke[num_ke++], SIGINT, EVFILT_SIGNAL, EV_ADD|EV_ENABLE, 0,0,0); retval = kevent(dev->kq, ke, num_ke, NULL, 0, NULL); if (retval == -1) { warn("%s: Unable to register kevents", __func__); goto bailout; } return (dev); bailout: free(dev); return (NULL); } static struct camdd_buf * camdd_alloc_buf(struct camdd_dev *dev, camdd_buf_type buf_type) { struct camdd_buf *buf = NULL; uint8_t *data_ptr = NULL; /* * We only need to allocate data space for data buffers. */ switch (buf_type) { case CAMDD_BUF_DATA: data_ptr = malloc(dev->blocksize); if (data_ptr == NULL) { warn("unable to allocate %u bytes", dev->blocksize); goto bailout_error; } break; default: break; } buf = calloc(1, sizeof(*buf)); if (buf == NULL) { warn("unable to allocate %zu bytes", sizeof(*buf)); goto bailout_error; } buf->buf_type = buf_type; buf->dev = dev; switch (buf_type) { case CAMDD_BUF_DATA: { struct camdd_buf_data *data; data = &buf->buf_type_spec.data; data->alloc_len = dev->blocksize; data->buf = data_ptr; break; } case CAMDD_BUF_INDIRECT: break; default: break; } STAILQ_INIT(&buf->src_list); return (buf); bailout_error: free(data_ptr); return (NULL); } void camdd_release_buf(struct camdd_buf *buf) { struct camdd_dev *dev; dev = buf->dev; switch (buf->buf_type) { case CAMDD_BUF_DATA: { struct camdd_buf_data *data; data = &buf->buf_type_spec.data; if (data->segs != NULL) { if (data->extra_buf != 0) { void *extra_buf; extra_buf = (void *) data->segs[data->sg_count - 1].ds_addr; free(extra_buf); data->extra_buf = 0; } free(data->segs); data->segs = NULL; data->sg_count = 0; } else if (data->iovec != NULL) { if (data->extra_buf != 0) { free(data->iovec[data->sg_count - 1].iov_base); data->extra_buf = 0; } free(data->iovec); data->iovec = NULL; data->sg_count = 0; } STAILQ_INSERT_TAIL(&dev->free_queue, buf, links); break; } case CAMDD_BUF_INDIRECT: STAILQ_INSERT_TAIL(&dev->free_indirect_queue, buf, links); break; default: err(1, "%s: Invalid buffer type %d for released buffer", __func__, buf->buf_type); break; } } struct camdd_buf * camdd_get_buf(struct camdd_dev *dev, camdd_buf_type buf_type) { struct camdd_buf *buf = NULL; switch (buf_type) { case CAMDD_BUF_DATA: buf = STAILQ_FIRST(&dev->free_queue); if (buf != NULL) { struct camdd_buf_data *data; uint8_t *data_ptr; uint32_t alloc_len; STAILQ_REMOVE_HEAD(&dev->free_queue, links); data = &buf->buf_type_spec.data; data_ptr = data->buf; alloc_len = data->alloc_len; bzero(buf, sizeof(*buf)); data->buf = data_ptr; data->alloc_len = alloc_len; } break; case CAMDD_BUF_INDIRECT: buf = STAILQ_FIRST(&dev->free_indirect_queue); if (buf != NULL) { STAILQ_REMOVE_HEAD(&dev->free_indirect_queue, links); bzero(buf, sizeof(*buf)); } break; default: warnx("Unknown buffer type %d requested", buf_type); break; } if (buf == NULL) return (camdd_alloc_buf(dev, buf_type)); else { STAILQ_INIT(&buf->src_list); buf->dev = dev; buf->buf_type = buf_type; return (buf); } } int camdd_buf_sg_create(struct camdd_buf *buf, int iovec, uint32_t sector_size, uint32_t *num_sectors_used, int *double_buf_needed) { struct camdd_buf *tmp_buf; struct camdd_buf_data *data; uint8_t *extra_buf = NULL; size_t extra_buf_len = 0; int extra_buf_attached = 0; int i, retval = 0; data = &buf->buf_type_spec.data; data->sg_count = buf->src_count; /* * Compose a scatter/gather list from all of the buffers in the list. * If the length of the buffer isn't a multiple of the sector size, * we'll have to add an extra buffer. This should only happen * at the end of a transfer. */ if ((data->fill_len % sector_size) != 0) { extra_buf_len = sector_size - (data->fill_len % sector_size); extra_buf = calloc(extra_buf_len, 1); if (extra_buf == NULL) { warn("%s: unable to allocate %zu bytes for extra " "buffer space", __func__, extra_buf_len); retval = 1; goto bailout; } data->extra_buf = 1; data->sg_count++; } if (iovec == 0) { data->segs = calloc(data->sg_count, sizeof(bus_dma_segment_t)); if (data->segs == NULL) { warn("%s: unable to allocate %zu bytes for S/G list", __func__, sizeof(bus_dma_segment_t) * data->sg_count); retval = 1; goto bailout; } } else { data->iovec = calloc(data->sg_count, sizeof(struct iovec)); if (data->iovec == NULL) { warn("%s: unable to allocate %zu bytes for S/G list", __func__, sizeof(struct iovec) * data->sg_count); retval = 1; goto bailout; } } for (i = 0, tmp_buf = STAILQ_FIRST(&buf->src_list); i < buf->src_count && tmp_buf != NULL; i++, tmp_buf = STAILQ_NEXT(tmp_buf, src_links)) { if (tmp_buf->buf_type == CAMDD_BUF_DATA) { struct camdd_buf_data *tmp_data; tmp_data = &tmp_buf->buf_type_spec.data; if (iovec == 0) { data->segs[i].ds_addr = (bus_addr_t) tmp_data->buf; data->segs[i].ds_len = tmp_data->fill_len - tmp_data->resid; } else { data->iovec[i].iov_base = tmp_data->buf; data->iovec[i].iov_len = tmp_data->fill_len - tmp_data->resid; } if (((tmp_data->fill_len - tmp_data->resid) % sector_size) != 0) *double_buf_needed = 1; } else { struct camdd_buf_indirect *tmp_ind; tmp_ind = &tmp_buf->buf_type_spec.indirect; if (iovec == 0) { data->segs[i].ds_addr = (bus_addr_t)tmp_ind->start_ptr; data->segs[i].ds_len = tmp_ind->len; } else { data->iovec[i].iov_base = tmp_ind->start_ptr; data->iovec[i].iov_len = tmp_ind->len; } if ((tmp_ind->len % sector_size) != 0) *double_buf_needed = 1; } } if (extra_buf != NULL) { if (iovec == 0) { data->segs[i].ds_addr = (bus_addr_t)extra_buf; data->segs[i].ds_len = extra_buf_len; } else { data->iovec[i].iov_base = extra_buf; data->iovec[i].iov_len = extra_buf_len; } extra_buf_attached = 1; i++; } if ((tmp_buf != NULL) || (i != data->sg_count)) { warnx("buffer source count does not match " "number of buffers in list!"); retval = 1; goto bailout; } bailout: if (retval == 0) { *num_sectors_used = (data->fill_len + extra_buf_len) / sector_size; } else if (extra_buf_attached == 0) { /* * If extra_buf isn't attached yet, we need to free it * to avoid leaking. */ free(extra_buf); data->extra_buf = 0; data->sg_count--; } return (retval); } uint32_t camdd_buf_get_len(struct camdd_buf *buf) { uint32_t len = 0; if (buf->buf_type != CAMDD_BUF_DATA) { struct camdd_buf_indirect *indirect; indirect = &buf->buf_type_spec.indirect; len = indirect->len; } else { struct camdd_buf_data *data; data = &buf->buf_type_spec.data; len = data->fill_len; } return (len); } void camdd_buf_add_child(struct camdd_buf *buf, struct camdd_buf *child_buf) { struct camdd_buf_data *data; assert(buf->buf_type == CAMDD_BUF_DATA); data = &buf->buf_type_spec.data; STAILQ_INSERT_TAIL(&buf->src_list, child_buf, src_links); buf->src_count++; data->fill_len += camdd_buf_get_len(child_buf); } typedef enum { CAMDD_TS_MAX_BLK, CAMDD_TS_MIN_BLK, CAMDD_TS_BLK_GRAN, CAMDD_TS_EFF_IOSIZE } camdd_status_item_index; static struct camdd_status_items { const char *name; struct mt_status_entry *entry; } req_status_items[] = { { "max_blk", NULL }, { "min_blk", NULL }, { "blk_gran", NULL }, { "max_effective_iosize", NULL } }; int camdd_probe_tape(int fd, char *filename, uint64_t *max_iosize, uint64_t *max_blk, uint64_t *min_blk, uint64_t *blk_gran) { struct mt_status_data status_data; char *xml_str = NULL; unsigned int i; int retval = 0; retval = mt_get_xml_str(fd, MTIOCEXTGET, &xml_str); if (retval != 0) err(1, "Couldn't get XML string from %s", filename); retval = mt_get_status(xml_str, &status_data); if (retval != XML_STATUS_OK) { warn("couldn't get status for %s", filename); retval = 1; goto bailout; } else retval = 0; if (status_data.error != 0) { warnx("%s", status_data.error_str); retval = 1; goto bailout; } for (i = 0; i < nitems(req_status_items); i++) { char *name; name = __DECONST(char *, req_status_items[i].name); req_status_items[i].entry = mt_status_entry_find(&status_data, name); if (req_status_items[i].entry == NULL) { errx(1, "Cannot find status entry %s", req_status_items[i].name); } } *max_iosize = req_status_items[CAMDD_TS_EFF_IOSIZE].entry->value_unsigned; *max_blk= req_status_items[CAMDD_TS_MAX_BLK].entry->value_unsigned; *min_blk= req_status_items[CAMDD_TS_MIN_BLK].entry->value_unsigned; *blk_gran = req_status_items[CAMDD_TS_BLK_GRAN].entry->value_unsigned; bailout: free(xml_str); mt_status_free(&status_data); return (retval); } struct camdd_dev * camdd_probe_file(int fd, struct camdd_io_opts *io_opts, int retry_count, int timeout) { struct camdd_dev *dev = NULL; struct camdd_dev_file *file_dev; uint64_t blocksize = io_opts->blocksize; dev = camdd_alloc_dev(CAMDD_DEV_FILE, NULL, 0, retry_count, timeout); if (dev == NULL) goto bailout; file_dev = &dev->dev_spec.file; file_dev->fd = fd; strlcpy(file_dev->filename, io_opts->dev_name, sizeof(file_dev->filename)); strlcpy(dev->device_name, io_opts->dev_name, sizeof(dev->device_name)); if (blocksize == 0) dev->blocksize = CAMDD_FILE_DEFAULT_BLOCK; else dev->blocksize = blocksize; if ((io_opts->queue_depth != 0) && (io_opts->queue_depth != 1)) { warnx("Queue depth %ju for %s ignored, only 1 outstanding " "command supported", (uintmax_t)io_opts->queue_depth, io_opts->dev_name); } dev->target_queue_depth = CAMDD_FILE_DEFAULT_DEPTH; dev->run = camdd_file_run; dev->fetch = NULL; /* * We can effectively access files on byte boundaries. We'll reset * this for devices like disks that can be accessed on sector * boundaries. */ dev->sector_size = 1; if ((fd != STDIN_FILENO) && (fd != STDOUT_FILENO)) { int retval; retval = fstat(fd, &file_dev->sb); if (retval != 0) { warn("Cannot stat %s", dev->device_name); goto bailout_error; } if (S_ISREG(file_dev->sb.st_mode)) { file_dev->file_type = CAMDD_FILE_REG; } else if (S_ISCHR(file_dev->sb.st_mode)) { int type; if (ioctl(fd, FIODTYPE, &type) == -1) err(1, "FIODTYPE ioctl failed on %s", dev->device_name); else { if (type & D_TAPE) file_dev->file_type = CAMDD_FILE_TAPE; else if (type & D_DISK) file_dev->file_type = CAMDD_FILE_DISK; else if (type & D_MEM) file_dev->file_type = CAMDD_FILE_MEM; else if (type & D_TTY) file_dev->file_type = CAMDD_FILE_TTY; } } else if (S_ISDIR(file_dev->sb.st_mode)) { errx(1, "cannot operate on directory %s", dev->device_name); } else if (S_ISFIFO(file_dev->sb.st_mode)) { file_dev->file_type = CAMDD_FILE_PIPE; } else errx(1, "Cannot determine file type for %s", dev->device_name); switch (file_dev->file_type) { case CAMDD_FILE_REG: if (file_dev->sb.st_size != 0) dev->max_sector = file_dev->sb.st_size - 1; else dev->max_sector = 0; file_dev->file_flags |= CAMDD_FF_CAN_SEEK; break; case CAMDD_FILE_TAPE: { uint64_t max_iosize, max_blk, min_blk, blk_gran; /* * Check block limits and maximum effective iosize. * Make sure the blocksize is within the block * limits (and a multiple of the minimum blocksize) * and that the blocksize is <= maximum effective * iosize. */ retval = camdd_probe_tape(fd, dev->device_name, &max_iosize, &max_blk, &min_blk, &blk_gran); if (retval != 0) errx(1, "Unable to probe tape %s", dev->device_name); /* * The blocksize needs to be <= the maximum * effective I/O size of the tape device. Note * that this also takes into account the maximum * blocksize reported by READ BLOCK LIMITS. */ if (dev->blocksize > max_iosize) { warnx("Blocksize %u too big for %s, limiting " "to %ju", dev->blocksize, dev->device_name, max_iosize); dev->blocksize = max_iosize; } /* * The blocksize needs to be at least min_blk; */ if (dev->blocksize < min_blk) { warnx("Blocksize %u too small for %s, " "increasing to %ju", dev->blocksize, dev->device_name, min_blk); dev->blocksize = min_blk; } /* * And the blocksize needs to be a multiple of * the block granularity. */ if ((blk_gran != 0) && (dev->blocksize % (1 << blk_gran))) { warnx("Blocksize %u for %s not a multiple of " "%d, adjusting to %d", dev->blocksize, dev->device_name, (1 << blk_gran), dev->blocksize & ~((1 << blk_gran) - 1)); dev->blocksize &= ~((1 << blk_gran) - 1); } if (dev->blocksize == 0) { errx(1, "Unable to derive valid blocksize for " "%s", dev->device_name); } /* * For tape drives, set the sector size to the * blocksize so that we make sure not to write * less than the blocksize out to the drive. */ dev->sector_size = dev->blocksize; break; } case CAMDD_FILE_DISK: { off_t media_size; unsigned int sector_size; file_dev->file_flags |= CAMDD_FF_CAN_SEEK; if (ioctl(fd, DIOCGSECTORSIZE, §or_size) == -1) { err(1, "DIOCGSECTORSIZE ioctl failed on %s", dev->device_name); } if (sector_size == 0) { errx(1, "DIOCGSECTORSIZE ioctl returned " "invalid sector size %u for %s", sector_size, dev->device_name); } if (ioctl(fd, DIOCGMEDIASIZE, &media_size) == -1) { err(1, "DIOCGMEDIASIZE ioctl failed on %s", dev->device_name); } if (media_size == 0) { errx(1, "DIOCGMEDIASIZE ioctl returned " "invalid media size %ju for %s", (uintmax_t)media_size, dev->device_name); } if (dev->blocksize % sector_size) { errx(1, "%s blocksize %u not a multiple of " "sector size %u", dev->device_name, dev->blocksize, sector_size); } dev->sector_size = sector_size; dev->max_sector = (media_size / sector_size) - 1; break; } case CAMDD_FILE_MEM: file_dev->file_flags |= CAMDD_FF_CAN_SEEK; break; default: break; } } if ((io_opts->offset != 0) && ((file_dev->file_flags & CAMDD_FF_CAN_SEEK) == 0)) { warnx("Offset %ju specified for %s, but we cannot seek on %s", io_opts->offset, io_opts->dev_name, io_opts->dev_name); goto bailout_error; } #if 0 else if ((io_opts->offset != 0) && ((io_opts->offset % dev->sector_size) != 0)) { warnx("Offset %ju for %s is not a multiple of the " "sector size %u", io_opts->offset, io_opts->dev_name, dev->sector_size); goto bailout_error; } else { dev->start_offset_bytes = io_opts->offset; } #endif bailout: return (dev); bailout_error: camdd_free_dev(dev); return (NULL); } /* * Get a get device CCB for the specified device. */ int camdd_get_cgd(struct cam_device *device, struct ccb_getdev *cgd) { union ccb *ccb; int retval = 0; ccb = cam_getccb(device); if (ccb == NULL) { warnx("%s: couldn't allocate CCB", __func__); return -1; } CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->cgd); ccb->ccb_h.func_code = XPT_GDEV_TYPE; if (cam_send_ccb(device, ccb) < 0) { warn("%s: error sending Get Device Information CCB", __func__); cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); retval = -1; goto bailout; } if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); retval = -1; goto bailout; } bcopy(&ccb->cgd, cgd, sizeof(struct ccb_getdev)); bailout: cam_freeccb(ccb); return retval; } int camdd_probe_pass_scsi(struct cam_device *cam_dev, union ccb *ccb, camdd_argmask arglist, int probe_retry_count, int probe_timeout, uint64_t *maxsector, uint32_t *block_len) { struct scsi_read_capacity_data rcap; struct scsi_read_capacity_data_long rcaplong; int retval = -1; if (ccb == NULL) { warnx("%s: error passed ccb is NULL", __func__); goto bailout; } CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio); scsi_read_capacity(&ccb->csio, /*retries*/ probe_retry_count, /*cbfcnp*/ NULL, /*tag_action*/ MSG_SIMPLE_Q_TAG, &rcap, SSD_FULL_SIZE, /*timeout*/ probe_timeout ? probe_timeout : 5000); /* Disable freezing the device queue */ ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; if (arglist & CAMDD_ARG_ERR_RECOVER) ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER; if (cam_send_ccb(cam_dev, ccb) < 0) { warn("error sending READ CAPACITY command"); cam_error_print(cam_dev, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); goto bailout; } if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { cam_error_print(cam_dev, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); goto bailout; } *maxsector = scsi_4btoul(rcap.addr); *block_len = scsi_4btoul(rcap.length); /* * A last block of 2^32-1 means that the true capacity is over 2TB, * and we need to issue the long READ CAPACITY to get the real * capacity. Otherwise, we're all set. */ if (*maxsector != 0xffffffff) { retval = 0; goto bailout; } scsi_read_capacity_16(&ccb->csio, /*retries*/ probe_retry_count, /*cbfcnp*/ NULL, /*tag_action*/ MSG_SIMPLE_Q_TAG, /*lba*/ 0, /*reladdr*/ 0, /*pmi*/ 0, (uint8_t *)&rcaplong, sizeof(rcaplong), /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ probe_timeout ? probe_timeout : 5000); /* Disable freezing the device queue */ ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; if (arglist & CAMDD_ARG_ERR_RECOVER) ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER; if (cam_send_ccb(cam_dev, ccb) < 0) { warn("error sending READ CAPACITY (16) command"); cam_error_print(cam_dev, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); goto bailout; } if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { cam_error_print(cam_dev, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); goto bailout; } *maxsector = scsi_8btou64(rcaplong.addr); *block_len = scsi_4btoul(rcaplong.length); retval = 0; bailout: return retval; } /* * Need to implement this. Do a basic probe: * - Check the inquiry data, make sure we're talking to a device that we * can reasonably expect to talk to -- direct, RBC, CD, WORM. * - Send a test unit ready, make sure the device is available. * - Get the capacity and block size. */ struct camdd_dev * camdd_probe_pass(struct cam_device *cam_dev, struct camdd_io_opts *io_opts, camdd_argmask arglist, int probe_retry_count, int probe_timeout, int io_retry_count, int io_timeout) { union ccb *ccb; uint64_t maxsector = 0; uint32_t cpi_maxio, max_iosize, pass_numblocks; uint32_t block_len = 0; struct camdd_dev *dev = NULL; struct camdd_dev_pass *pass_dev; struct kevent ke; struct ccb_getdev cgd; int retval; int scsi_dev_type; if ((retval = camdd_get_cgd(cam_dev, &cgd)) != 0) { warnx("%s: error retrieving CGD", __func__); return NULL; } ccb = cam_getccb(cam_dev); if (ccb == NULL) { warnx("%s: error allocating ccb", __func__); goto bailout; } switch (cgd.protocol) { case PROTO_SCSI: scsi_dev_type = SID_TYPE(&cam_dev->inq_data); /* * For devices that support READ CAPACITY, we'll attempt to get the * capacity. Otherwise, we really don't support tape or other * devices via SCSI passthrough, so just return an error in that case. */ switch (scsi_dev_type) { case T_DIRECT: case T_WORM: case T_CDROM: case T_OPTICAL: case T_RBC: case T_ZBC_HM: break; default: errx(1, "Unsupported SCSI device type %d", scsi_dev_type); break; /*NOTREACHED*/ } if ((retval = camdd_probe_pass_scsi(cam_dev, ccb, probe_retry_count, arglist, probe_timeout, &maxsector, &block_len))) { goto bailout; } break; default: errx(1, "Unsupported PROTO type %d", cgd.protocol); break; /*NOTREACHED*/ } if (block_len == 0) { warnx("Sector size for %s%u is 0, cannot continue", cam_dev->device_name, cam_dev->dev_unit_num); goto bailout_error; } CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->cpi); ccb->ccb_h.func_code = XPT_PATH_INQ; ccb->ccb_h.flags = CAM_DIR_NONE; ccb->ccb_h.retry_count = 1; if (cam_send_ccb(cam_dev, ccb) < 0) { warn("error sending XPT_PATH_INQ CCB"); cam_error_print(cam_dev, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); goto bailout; } EV_SET(&ke, cam_dev->fd, EVFILT_READ, EV_ADD|EV_ENABLE, 0, 0, 0); dev = camdd_alloc_dev(CAMDD_DEV_PASS, &ke, 1, io_retry_count, io_timeout); if (dev == NULL) goto bailout; pass_dev = &dev->dev_spec.pass; pass_dev->scsi_dev_type = scsi_dev_type; pass_dev->protocol = cgd.protocol; pass_dev->dev = cam_dev; pass_dev->max_sector = maxsector; pass_dev->block_len = block_len; pass_dev->cpi_maxio = ccb->cpi.maxio; snprintf(dev->device_name, sizeof(dev->device_name), "%s%u", pass_dev->dev->device_name, pass_dev->dev->dev_unit_num); dev->sector_size = block_len; dev->max_sector = maxsector; /* * Determine the optimal blocksize to use for this device. */ /* * If the controller has not specified a maximum I/O size, * just go with 128K as a somewhat conservative value. */ if (pass_dev->cpi_maxio == 0) cpi_maxio = 131072; else cpi_maxio = pass_dev->cpi_maxio; /* * If the controller has a large maximum I/O size, limit it * to something smaller so that the kernel doesn't have trouble * allocating buffers to copy data in and out for us. * XXX KDM this is until we have unmapped I/O support in the kernel. */ max_iosize = min(cpi_maxio, CAMDD_PASS_MAX_BLOCK); /* * If we weren't able to get a block size for some reason, * default to 512 bytes. */ block_len = pass_dev->block_len; if (block_len == 0) block_len = 512; /* * Figure out how many blocksize chunks will fit in the * maximum I/O size. */ pass_numblocks = max_iosize / block_len; /* * And finally, multiple the number of blocks by the LBA * length to get our maximum block size; */ dev->blocksize = pass_numblocks * block_len; if (io_opts->blocksize != 0) { if ((io_opts->blocksize % dev->sector_size) != 0) { warnx("Blocksize %ju for %s is not a multiple of " "sector size %u", (uintmax_t)io_opts->blocksize, dev->device_name, dev->sector_size); goto bailout_error; } dev->blocksize = io_opts->blocksize; } dev->target_queue_depth = CAMDD_PASS_DEFAULT_DEPTH; if (io_opts->queue_depth != 0) dev->target_queue_depth = io_opts->queue_depth; if (io_opts->offset != 0) { if (io_opts->offset > (dev->max_sector * dev->sector_size)) { warnx("Offset %ju is past the end of device %s", io_opts->offset, dev->device_name); goto bailout_error; } #if 0 else if ((io_opts->offset % dev->sector_size) != 0) { warnx("Offset %ju for %s is not a multiple of the " "sector size %u", io_opts->offset, dev->device_name, dev->sector_size); goto bailout_error; } dev->start_offset_bytes = io_opts->offset; #endif } dev->min_cmd_size = io_opts->min_cmd_size; dev->run = camdd_pass_run; dev->fetch = camdd_pass_fetch; bailout: cam_freeccb(ccb); return (dev); bailout_error: cam_freeccb(ccb); camdd_free_dev(dev); return (NULL); } void * camdd_worker(void *arg) { struct camdd_dev *dev = arg; struct camdd_buf *buf; struct timespec ts, *kq_ts; ts.tv_sec = 0; ts.tv_nsec = 0; pthread_mutex_lock(&dev->mutex); dev->flags |= CAMDD_DEV_FLAG_ACTIVE; for (;;) { struct kevent ke; int retval = 0; /* * XXX KDM check the reorder queue depth? */ if (dev->write_dev == 0) { uint32_t our_depth, peer_depth, peer_bytes, our_bytes; uint32_t target_depth = dev->target_queue_depth; uint32_t peer_target_depth = dev->peer_dev->target_queue_depth; uint32_t peer_blocksize = dev->peer_dev->blocksize; camdd_get_depth(dev, &our_depth, &peer_depth, &our_bytes, &peer_bytes); #if 0 while (((our_depth < target_depth) && (peer_depth < peer_target_depth)) || ((peer_bytes + our_bytes) < (peer_blocksize * 2))) { #endif while (((our_depth + peer_depth) < (target_depth + peer_target_depth)) || ((peer_bytes + our_bytes) < (peer_blocksize * 3))) { retval = camdd_queue(dev, NULL); if (retval == 1) break; else if (retval != 0) { error_exit = 1; goto bailout; } camdd_get_depth(dev, &our_depth, &peer_depth, &our_bytes, &peer_bytes); } } /* * See if we have any I/O that is ready to execute. */ buf = STAILQ_FIRST(&dev->run_queue); if (buf != NULL) { while (dev->target_queue_depth > dev->cur_active_io) { retval = dev->run(dev); if (retval == -1) { dev->flags |= CAMDD_DEV_FLAG_EOF; error_exit = 1; break; } else if (retval != 0) { break; } } } /* * We've reached EOF, or our partner has reached EOF. */ if ((dev->flags & CAMDD_DEV_FLAG_EOF) || (dev->flags & CAMDD_DEV_FLAG_PEER_EOF)) { if (dev->write_dev != 0) { if ((STAILQ_EMPTY(&dev->work_queue)) && (dev->num_run_queue == 0) && (dev->cur_active_io == 0)) { goto bailout; } } else { /* * If we're the reader, and the writer * got EOF, he is already done. If we got * the EOF, then we need to wait until * everything is flushed out for the writer. */ if (dev->flags & CAMDD_DEV_FLAG_PEER_EOF) { goto bailout; } else if ((dev->num_peer_work_queue == 0) && (dev->num_peer_done_queue == 0) && (dev->cur_active_io == 0) && (dev->num_run_queue == 0)) { goto bailout; } } /* * XXX KDM need to do something about the pending * queue and cleanup resources. */ } if ((dev->write_dev == 0) && (dev->cur_active_io == 0) && (dev->peer_bytes_queued < dev->peer_dev->blocksize)) kq_ts = &ts; else kq_ts = NULL; /* * Run kevent to see if there are events to process. */ pthread_mutex_unlock(&dev->mutex); retval = kevent(dev->kq, NULL, 0, &ke, 1, kq_ts); pthread_mutex_lock(&dev->mutex); if (retval == -1) { warn("%s: error returned from kevent",__func__); goto bailout; } else if (retval != 0) { switch (ke.filter) { case EVFILT_READ: if (dev->fetch != NULL) { retval = dev->fetch(dev); if (retval == -1) { error_exit = 1; goto bailout; } } break; case EVFILT_SIGNAL: /* * We register for this so we don't get * an error as a result of a SIGINFO or a * SIGINT. It will actually get handled * by the signal handler. If we get a * SIGINT, bail out without printing an * error message. Any other signals * will result in the error message above. */ if (ke.ident == SIGINT) goto bailout; break; case EVFILT_USER: retval = 0; /* * Check to see if the other thread has * queued any I/O for us to do. (In this * case we're the writer.) */ for (buf = STAILQ_FIRST(&dev->work_queue); buf != NULL; buf = STAILQ_FIRST(&dev->work_queue)) { STAILQ_REMOVE_HEAD(&dev->work_queue, work_links); retval = camdd_queue(dev, buf); /* * We keep going unless we get an * actual error. If we get EOF, we * still want to remove the buffers * from the queue and send the back * to the reader thread. */ if (retval == -1) { error_exit = 1; goto bailout; } else retval = 0; } /* * Next check to see if the other thread has * queued any completed buffers back to us. * (In this case we're the reader.) */ for (buf = STAILQ_FIRST(&dev->peer_done_queue); buf != NULL; buf = STAILQ_FIRST(&dev->peer_done_queue)){ STAILQ_REMOVE_HEAD( &dev->peer_done_queue, work_links); dev->num_peer_done_queue--; camdd_peer_done(buf); } break; default: warnx("%s: unknown kevent filter %d", __func__, ke.filter); break; } } } bailout: dev->flags &= ~CAMDD_DEV_FLAG_ACTIVE; /* XXX KDM cleanup resources here? */ pthread_mutex_unlock(&dev->mutex); need_exit = 1; sem_post(&camdd_sem); return (NULL); } /* * Simplistic translation of CCB status to our local status. */ camdd_buf_status camdd_ccb_status(union ccb *ccb, int protocol) { camdd_buf_status status = CAMDD_STATUS_NONE; cam_status ccb_status; ccb_status = ccb->ccb_h.status & CAM_STATUS_MASK; switch (protocol) { case PROTO_SCSI: switch (ccb_status) { case CAM_REQ_CMP: { if (ccb->csio.resid == 0) { status = CAMDD_STATUS_OK; } else if (ccb->csio.dxfer_len > ccb->csio.resid) { status = CAMDD_STATUS_SHORT_IO; } else { status = CAMDD_STATUS_EOF; } break; } case CAM_SCSI_STATUS_ERROR: { switch (ccb->csio.scsi_status) { case SCSI_STATUS_OK: case SCSI_STATUS_COND_MET: case SCSI_STATUS_INTERMED: case SCSI_STATUS_INTERMED_COND_MET: status = CAMDD_STATUS_OK; break; case SCSI_STATUS_CMD_TERMINATED: case SCSI_STATUS_CHECK_COND: case SCSI_STATUS_QUEUE_FULL: case SCSI_STATUS_BUSY: case SCSI_STATUS_RESERV_CONFLICT: default: status = CAMDD_STATUS_ERROR; break; } break; } default: status = CAMDD_STATUS_ERROR; break; } break; default: status = CAMDD_STATUS_ERROR; break; } return (status); } /* * Queue a buffer to our peer's work thread for writing. * * Returns 0 for success, -1 for failure, 1 if the other thread exited. */ int camdd_queue_peer_buf(struct camdd_dev *dev, struct camdd_buf *buf) { struct kevent ke; STAILQ_HEAD(, camdd_buf) local_queue; struct camdd_buf *buf1, *buf2; struct camdd_buf_data *data = NULL; uint64_t peer_bytes_queued = 0; int active = 1; int retval = 0; STAILQ_INIT(&local_queue); /* * Since we're the reader, we need to queue our I/O to the writer * in sequential order in order to make sure it gets written out * in sequential order. * * Check the next expected I/O starting offset. If this doesn't * match, put it on the reorder queue. */ if ((buf->lba * dev->sector_size) != dev->next_completion_pos_bytes) { /* * If there is nothing on the queue, there is no sorting * needed. */ if (STAILQ_EMPTY(&dev->reorder_queue)) { STAILQ_INSERT_TAIL(&dev->reorder_queue, buf, links); dev->num_reorder_queue++; goto bailout; } /* * Sort in ascending order by starting LBA. There should * be no identical LBAs. */ for (buf1 = STAILQ_FIRST(&dev->reorder_queue); buf1 != NULL; buf1 = buf2) { buf2 = STAILQ_NEXT(buf1, links); if (buf->lba < buf1->lba) { /* * If we're less than the first one, then * we insert at the head of the list * because this has to be the first element * on the list. */ STAILQ_INSERT_HEAD(&dev->reorder_queue, buf, links); dev->num_reorder_queue++; break; } else if (buf->lba > buf1->lba) { if (buf2 == NULL) { STAILQ_INSERT_TAIL(&dev->reorder_queue, buf, links); dev->num_reorder_queue++; break; } else if (buf->lba < buf2->lba) { STAILQ_INSERT_AFTER(&dev->reorder_queue, buf1, buf, links); dev->num_reorder_queue++; break; } } else { errx(1, "Found buffers with duplicate LBA %ju!", buf->lba); } } goto bailout; } else { /* * We're the next expected I/O completion, so put ourselves * on the local queue to be sent to the writer. We use * work_links here so that we can queue this to the * peer_work_queue before taking the buffer off of the * local_queue. */ dev->next_completion_pos_bytes += buf->len; STAILQ_INSERT_TAIL(&local_queue, buf, work_links); /* * Go through the reorder queue looking for more sequential * I/O and add it to the local queue. */ for (buf1 = STAILQ_FIRST(&dev->reorder_queue); buf1 != NULL; buf1 = STAILQ_FIRST(&dev->reorder_queue)) { /* * As soon as we see an I/O that is out of sequence, * we're done. */ if ((buf1->lba * dev->sector_size) != dev->next_completion_pos_bytes) break; STAILQ_REMOVE_HEAD(&dev->reorder_queue, links); dev->num_reorder_queue--; STAILQ_INSERT_TAIL(&local_queue, buf1, work_links); dev->next_completion_pos_bytes += buf1->len; } } /* * Setup the event to let the other thread know that it has work * pending. */ EV_SET(&ke, (uintptr_t)&dev->peer_dev->work_queue, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL); /* * Put this on our shadow queue so that we know what we've queued * to the other thread. */ STAILQ_FOREACH_SAFE(buf1, &local_queue, work_links, buf2) { if (buf1->buf_type != CAMDD_BUF_DATA) { errx(1, "%s: should have a data buffer, not an " "indirect buffer", __func__); } data = &buf1->buf_type_spec.data; /* * We only need to send one EOF to the writer, and don't * need to continue sending EOFs after that. */ if (buf1->status == CAMDD_STATUS_EOF) { if (dev->flags & CAMDD_DEV_FLAG_EOF_SENT) { STAILQ_REMOVE(&local_queue, buf1, camdd_buf, work_links); camdd_release_buf(buf1); retval = 1; continue; } dev->flags |= CAMDD_DEV_FLAG_EOF_SENT; } STAILQ_INSERT_TAIL(&dev->peer_work_queue, buf1, links); peer_bytes_queued += (data->fill_len - data->resid); dev->peer_bytes_queued += (data->fill_len - data->resid); dev->num_peer_work_queue++; } if (STAILQ_FIRST(&local_queue) == NULL) goto bailout; /* * Drop our mutex and pick up the other thread's mutex. We need to * do this to avoid deadlocks. */ pthread_mutex_unlock(&dev->mutex); pthread_mutex_lock(&dev->peer_dev->mutex); if (dev->peer_dev->flags & CAMDD_DEV_FLAG_ACTIVE) { /* * Put the buffers on the other thread's incoming work queue. */ for (buf1 = STAILQ_FIRST(&local_queue); buf1 != NULL; buf1 = STAILQ_FIRST(&local_queue)) { STAILQ_REMOVE_HEAD(&local_queue, work_links); STAILQ_INSERT_TAIL(&dev->peer_dev->work_queue, buf1, work_links); } /* * Send an event to the other thread's kqueue to let it know * that there is something on the work queue. */ retval = kevent(dev->peer_dev->kq, &ke, 1, NULL, 0, NULL); if (retval == -1) warn("%s: unable to add peer work_queue kevent", __func__); else retval = 0; } else active = 0; pthread_mutex_unlock(&dev->peer_dev->mutex); pthread_mutex_lock(&dev->mutex); /* * If the other side isn't active, run through the queue and * release all of the buffers. */ if (active == 0) { for (buf1 = STAILQ_FIRST(&local_queue); buf1 != NULL; buf1 = STAILQ_FIRST(&local_queue)) { STAILQ_REMOVE_HEAD(&local_queue, work_links); STAILQ_REMOVE(&dev->peer_work_queue, buf1, camdd_buf, links); dev->num_peer_work_queue--; camdd_release_buf(buf1); } dev->peer_bytes_queued -= peer_bytes_queued; retval = 1; } bailout: return (retval); } /* * Return a buffer to the reader thread when we have completed writing it. */ int camdd_complete_peer_buf(struct camdd_dev *dev, struct camdd_buf *peer_buf) { struct kevent ke; int retval = 0; /* * Setup the event to let the other thread know that we have * completed a buffer. */ EV_SET(&ke, (uintptr_t)&dev->peer_dev->peer_done_queue, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL); /* * Drop our lock and acquire the other thread's lock before * manipulating */ pthread_mutex_unlock(&dev->mutex); pthread_mutex_lock(&dev->peer_dev->mutex); /* * Put the buffer on the reader thread's peer done queue now that * we have completed it. */ STAILQ_INSERT_TAIL(&dev->peer_dev->peer_done_queue, peer_buf, work_links); dev->peer_dev->num_peer_done_queue++; /* * Send an event to the peer thread to let it know that we've added * something to its peer done queue. */ retval = kevent(dev->peer_dev->kq, &ke, 1, NULL, 0, NULL); if (retval == -1) warn("%s: unable to add peer_done_queue kevent", __func__); else retval = 0; /* * Drop the other thread's lock and reacquire ours. */ pthread_mutex_unlock(&dev->peer_dev->mutex); pthread_mutex_lock(&dev->mutex); return (retval); } /* * Free a buffer that was written out by the writer thread and returned to * the reader thread. */ void camdd_peer_done(struct camdd_buf *buf) { struct camdd_dev *dev; struct camdd_buf_data *data; dev = buf->dev; if (buf->buf_type != CAMDD_BUF_DATA) { errx(1, "%s: should have a data buffer, not an " "indirect buffer", __func__); } data = &buf->buf_type_spec.data; STAILQ_REMOVE(&dev->peer_work_queue, buf, camdd_buf, links); dev->num_peer_work_queue--; dev->peer_bytes_queued -= (data->fill_len - data->resid); if (buf->status == CAMDD_STATUS_EOF) dev->flags |= CAMDD_DEV_FLAG_PEER_EOF; STAILQ_INSERT_TAIL(&dev->free_queue, buf, links); } /* * Assumes caller holds the lock for this device. */ void camdd_complete_buf(struct camdd_dev *dev, struct camdd_buf *buf, int *error_count) { int retval = 0; /* * If we're the reader, we need to send the completed I/O * to the writer. If we're the writer, we need to just * free up resources, or let the reader know if we've * encountered an error. */ if (dev->write_dev == 0) { retval = camdd_queue_peer_buf(dev, buf); if (retval != 0) (*error_count)++; } else { struct camdd_buf *tmp_buf, *next_buf; STAILQ_FOREACH_SAFE(tmp_buf, &buf->src_list, src_links, next_buf) { struct camdd_buf *src_buf; struct camdd_buf_indirect *indirect; STAILQ_REMOVE(&buf->src_list, tmp_buf, camdd_buf, src_links); tmp_buf->status = buf->status; if (tmp_buf->buf_type == CAMDD_BUF_DATA) { camdd_complete_peer_buf(dev, tmp_buf); continue; } indirect = &tmp_buf->buf_type_spec.indirect; src_buf = indirect->src_buf; src_buf->refcount--; /* * XXX KDM we probably need to account for * exactly how many bytes we were able to * write. Allocate the residual to the * first N buffers? Or just track the * number of bytes written? Right now the reader * doesn't do anything with a residual. */ src_buf->status = buf->status; if (src_buf->refcount <= 0) camdd_complete_peer_buf(dev, src_buf); STAILQ_INSERT_TAIL(&dev->free_indirect_queue, tmp_buf, links); } STAILQ_INSERT_TAIL(&dev->free_queue, buf, links); } } /* * Fetch all completed commands from the pass(4) device. * * Returns the number of commands received, or -1 if any of the commands * completed with an error. Returns 0 if no commands are available. */ int camdd_pass_fetch(struct camdd_dev *dev) { struct camdd_dev_pass *pass_dev = &dev->dev_spec.pass; union ccb ccb; int retval = 0, num_fetched = 0, error_count = 0; pthread_mutex_unlock(&dev->mutex); /* * XXX KDM we don't distinguish between EFAULT and ENOENT. */ while ((retval = ioctl(pass_dev->dev->fd, CAMIOGET, &ccb)) != -1) { struct camdd_buf *buf; struct camdd_buf_data *data; cam_status ccb_status; union ccb *buf_ccb; buf = ccb.ccb_h.ccb_buf; data = &buf->buf_type_spec.data; buf_ccb = &data->ccb; num_fetched++; /* * Copy the CCB back out so we get status, sense data, etc. */ bcopy(&ccb, buf_ccb, sizeof(ccb)); pthread_mutex_lock(&dev->mutex); /* * We're now done, so take this off the active queue. */ STAILQ_REMOVE(&dev->active_queue, buf, camdd_buf, links); dev->cur_active_io--; ccb_status = ccb.ccb_h.status & CAM_STATUS_MASK; if (ccb_status != CAM_REQ_CMP) { cam_error_print(pass_dev->dev, &ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); } switch (pass_dev->protocol) { case PROTO_SCSI: data->resid = ccb.csio.resid; dev->bytes_transferred += (ccb.csio.dxfer_len - ccb.csio.resid); break; default: return -1; break; } if (buf->status == CAMDD_STATUS_NONE) buf->status = camdd_ccb_status(&ccb, pass_dev->protocol); if (buf->status == CAMDD_STATUS_ERROR) error_count++; else if (buf->status == CAMDD_STATUS_EOF) { /* * Once we queue this buffer to our partner thread, * he will know that we've hit EOF. */ dev->flags |= CAMDD_DEV_FLAG_EOF; } camdd_complete_buf(dev, buf, &error_count); /* * Unlock in preparation for the ioctl call. */ pthread_mutex_unlock(&dev->mutex); } pthread_mutex_lock(&dev->mutex); if (error_count > 0) return (-1); else return (num_fetched); } /* * Returns -1 for error, 0 for success/continue, and 1 for resource * shortage/stop processing. */ int camdd_file_run(struct camdd_dev *dev) { struct camdd_dev_file *file_dev = &dev->dev_spec.file; struct camdd_buf_data *data; struct camdd_buf *buf; off_t io_offset; int retval = 0, write_dev = dev->write_dev; int error_count = 0, no_resources = 0, double_buf_needed = 0; uint32_t num_sectors = 0, db_len = 0; buf = STAILQ_FIRST(&dev->run_queue); if (buf == NULL) { no_resources = 1; goto bailout; } else if ((dev->write_dev == 0) && (dev->flags & (CAMDD_DEV_FLAG_EOF | CAMDD_DEV_FLAG_EOF_SENT))) { STAILQ_REMOVE(&dev->run_queue, buf, camdd_buf, links); dev->num_run_queue--; buf->status = CAMDD_STATUS_EOF; error_count++; goto bailout; } /* * If we're writing, we need to go through the source buffer list * and create an S/G list. */ if (write_dev != 0) { retval = camdd_buf_sg_create(buf, /*iovec*/ 1, dev->sector_size, &num_sectors, &double_buf_needed); if (retval != 0) { no_resources = 1; goto bailout; } } STAILQ_REMOVE(&dev->run_queue, buf, camdd_buf, links); dev->num_run_queue--; data = &buf->buf_type_spec.data; /* * pread(2) and pwrite(2) offsets are byte offsets. */ io_offset = buf->lba * dev->sector_size; /* * Unlock the mutex while we read or write. */ pthread_mutex_unlock(&dev->mutex); /* * Note that we don't need to double buffer if we're the reader * because in that case, we have allocated a single buffer of * sufficient size to do the read. This copy is necessary on * writes because if one of the components of the S/G list is not * a sector size multiple, the kernel will reject the write. This * is unfortunate but not surprising. So this will make sure that * we're using a single buffer that is a multiple of the sector size. */ if ((double_buf_needed != 0) && (data->sg_count > 1) && (write_dev != 0)) { uint32_t cur_offset; int i; if (file_dev->tmp_buf == NULL) file_dev->tmp_buf = calloc(dev->blocksize, 1); if (file_dev->tmp_buf == NULL) { buf->status = CAMDD_STATUS_ERROR; error_count++; pthread_mutex_lock(&dev->mutex); goto bailout; } for (i = 0, cur_offset = 0; i < data->sg_count; i++) { bcopy(data->iovec[i].iov_base, &file_dev->tmp_buf[cur_offset], data->iovec[i].iov_len); cur_offset += data->iovec[i].iov_len; } db_len = cur_offset; } if (file_dev->file_flags & CAMDD_FF_CAN_SEEK) { if (write_dev == 0) { /* * XXX KDM is there any way we would need a S/G * list here? */ retval = pread(file_dev->fd, data->buf, buf->len, io_offset); } else { if (double_buf_needed != 0) { retval = pwrite(file_dev->fd, file_dev->tmp_buf, db_len, io_offset); } else if (data->sg_count == 0) { retval = pwrite(file_dev->fd, data->buf, data->fill_len, io_offset); } else { retval = pwritev(file_dev->fd, data->iovec, data->sg_count, io_offset); } } } else { if (write_dev == 0) { /* * XXX KDM is there any way we would need a S/G * list here? */ retval = read(file_dev->fd, data->buf, buf->len); } else { if (double_buf_needed != 0) { retval = write(file_dev->fd, file_dev->tmp_buf, db_len); } else if (data->sg_count == 0) { retval = write(file_dev->fd, data->buf, data->fill_len); } else { retval = writev(file_dev->fd, data->iovec, data->sg_count); } } } /* We're done, re-acquire the lock */ pthread_mutex_lock(&dev->mutex); if (retval >= (ssize_t)data->fill_len) { /* * If the bytes transferred is more than the request size, * that indicates an overrun, which should only happen at * the end of a transfer if we have to round up to a sector * boundary. */ if (buf->status == CAMDD_STATUS_NONE) buf->status = CAMDD_STATUS_OK; data->resid = 0; dev->bytes_transferred += retval; } else if (retval == -1) { warn("Error %s %s", (write_dev) ? "writing to" : "reading from", file_dev->filename); buf->status = CAMDD_STATUS_ERROR; data->resid = data->fill_len; error_count++; if (dev->debug == 0) goto bailout; if ((double_buf_needed != 0) && (write_dev != 0)) { fprintf(stderr, "%s: fd %d, DB buf %p, len %u lba %ju " "offset %ju\n", __func__, file_dev->fd, file_dev->tmp_buf, db_len, (uintmax_t)buf->lba, (uintmax_t)io_offset); } else if (data->sg_count == 0) { fprintf(stderr, "%s: fd %d, buf %p, len %u, lba %ju " "offset %ju\n", __func__, file_dev->fd, data->buf, data->fill_len, (uintmax_t)buf->lba, (uintmax_t)io_offset); } else { int i; fprintf(stderr, "%s: fd %d, len %u, lba %ju " "offset %ju\n", __func__, file_dev->fd, data->fill_len, (uintmax_t)buf->lba, (uintmax_t)io_offset); for (i = 0; i < data->sg_count; i++) { fprintf(stderr, "index %d ptr %p len %zu\n", i, data->iovec[i].iov_base, data->iovec[i].iov_len); } } } else if (retval == 0) { buf->status = CAMDD_STATUS_EOF; if (dev->debug != 0) printf("%s: got EOF from %s!\n", __func__, file_dev->filename); data->resid = data->fill_len; error_count++; } else if (retval < (ssize_t)data->fill_len) { if (buf->status == CAMDD_STATUS_NONE) buf->status = CAMDD_STATUS_SHORT_IO; data->resid = data->fill_len - retval; dev->bytes_transferred += retval; } bailout: if (buf != NULL) { if (buf->status == CAMDD_STATUS_EOF) { struct camdd_buf *buf2; dev->flags |= CAMDD_DEV_FLAG_EOF; STAILQ_FOREACH(buf2, &dev->run_queue, links) buf2->status = CAMDD_STATUS_EOF; } camdd_complete_buf(dev, buf, &error_count); } if (error_count != 0) return (-1); else if (no_resources != 0) return (1); else return (0); } /* * Execute one command from the run queue. Returns 0 for success, 1 for * stop processing, and -1 for error. */ int camdd_pass_run(struct camdd_dev *dev) { struct camdd_buf *buf = NULL; struct camdd_dev_pass *pass_dev = &dev->dev_spec.pass; struct camdd_buf_data *data; uint32_t num_blocks, sectors_used = 0; union ccb *ccb; int retval = 0, is_write = dev->write_dev; int double_buf_needed = 0; buf = STAILQ_FIRST(&dev->run_queue); if (buf == NULL) { retval = 1; goto bailout; } /* * If we're writing, we need to go through the source buffer list * and create an S/G list. */ if (is_write != 0) { retval = camdd_buf_sg_create(buf, /*iovec*/ 0,dev->sector_size, §ors_used, &double_buf_needed); if (retval != 0) { retval = -1; goto bailout; } } STAILQ_REMOVE(&dev->run_queue, buf, camdd_buf, links); dev->num_run_queue--; data = &buf->buf_type_spec.data; /* * In almost every case the number of blocks should be the device * block size. The exception may be at the end of an I/O stream * for a partial block or at the end of a device. */ if (is_write != 0) num_blocks = sectors_used; else num_blocks = data->fill_len / pass_dev->block_len; ccb = &data->ccb; switch (pass_dev->protocol) { case PROTO_SCSI: CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio); scsi_read_write(&ccb->csio, /*retries*/ dev->retry_count, /*cbfcnp*/ NULL, /*tag_action*/ MSG_SIMPLE_Q_TAG, /*readop*/ (dev->write_dev == 0) ? SCSI_RW_READ : SCSI_RW_WRITE, /*byte2*/ 0, /*minimum_cmd_size*/ dev->min_cmd_size, /*lba*/ buf->lba, /*block_count*/ num_blocks, /*data_ptr*/ (data->sg_count != 0) ? (uint8_t *)data->segs : data->buf, /*dxfer_len*/ (num_blocks * pass_dev->block_len), /*sense_len*/ SSD_FULL_SIZE, /*timeout*/ dev->io_timeout); if (data->sg_count != 0) { ccb->csio.sglist_cnt = data->sg_count; } break; default: retval = -1; goto bailout; } /* Disable freezing the device queue */ ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; if (dev->retry_count != 0) ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER; if (data->sg_count != 0) { ccb->ccb_h.flags |= CAM_DATA_SG; } /* * Store a pointer to the buffer in the CCB. The kernel will * restore this when we get it back, and we'll use it to identify * the buffer this CCB came from. */ ccb->ccb_h.ccb_buf = buf; /* * Unlock our mutex in preparation for issuing the ioctl. */ pthread_mutex_unlock(&dev->mutex); /* * Queue the CCB to the pass(4) driver. */ if (ioctl(pass_dev->dev->fd, CAMIOQUEUE, ccb) == -1) { pthread_mutex_lock(&dev->mutex); warn("%s: error sending CAMIOQUEUE ioctl to %s%u", __func__, pass_dev->dev->device_name, pass_dev->dev->dev_unit_num); warn("%s: CCB address is %p", __func__, ccb); retval = -1; STAILQ_INSERT_TAIL(&dev->free_queue, buf, links); } else { pthread_mutex_lock(&dev->mutex); dev->cur_active_io++; STAILQ_INSERT_TAIL(&dev->active_queue, buf, links); } bailout: return (retval); } int camdd_get_next_lba_len(struct camdd_dev *dev, uint64_t *lba, ssize_t *len) { struct camdd_dev_pass *pass_dev; uint32_t num_blocks; int retval = 0; pass_dev = &dev->dev_spec.pass; *lba = dev->next_io_pos_bytes / dev->sector_size; *len = dev->blocksize; num_blocks = *len / dev->sector_size; /* * If max_sector is 0, then we have no set limit. This can happen * if we're writing to a file in a filesystem, or reading from * something like /dev/zero. */ if ((dev->max_sector != 0) || (dev->sector_io_limit != 0)) { uint64_t max_sector; if ((dev->max_sector != 0) && (dev->sector_io_limit != 0)) max_sector = min(dev->sector_io_limit, dev->max_sector); else if (dev->max_sector != 0) max_sector = dev->max_sector; else max_sector = dev->sector_io_limit; /* * Check to see whether we're starting off past the end of * the device. If so, we need to just send an EOF * notification to the writer. */ if (*lba > max_sector) { *len = 0; retval = 1; } else if (((*lba + num_blocks) > max_sector + 1) || ((*lba + num_blocks) < *lba)) { /* * If we get here (but pass the first check), we * can trim the request length down to go to the * end of the device. */ num_blocks = (max_sector + 1) - *lba; *len = num_blocks * dev->sector_size; retval = 1; } } dev->next_io_pos_bytes += *len; return (retval); } /* * Returns 0 for success, 1 for EOF detected, and -1 for failure. */ int camdd_queue(struct camdd_dev *dev, struct camdd_buf *read_buf) { struct camdd_buf *buf = NULL; struct camdd_buf_data *data; struct camdd_dev_pass *pass_dev; size_t new_len; struct camdd_buf_data *rb_data; int is_write = dev->write_dev; int eof_flush_needed = 0; int retval = 0; int error; pass_dev = &dev->dev_spec.pass; /* * If we've gotten EOF or our partner has, we should not continue * queueing I/O. If we're a writer, though, we should continue * to write any buffers that don't have EOF status. */ if ((dev->flags & CAMDD_DEV_FLAG_EOF) || ((dev->flags & CAMDD_DEV_FLAG_PEER_EOF) && (is_write == 0))) { /* * Tell the worker thread that we have seen EOF. */ retval = 1; /* * If we're the writer, send the buffer back with EOF status. */ if (is_write) { read_buf->status = CAMDD_STATUS_EOF; error = camdd_complete_peer_buf(dev, read_buf); } goto bailout; } if (is_write == 0) { buf = camdd_get_buf(dev, CAMDD_BUF_DATA); if (buf == NULL) { retval = -1; goto bailout; } data = &buf->buf_type_spec.data; retval = camdd_get_next_lba_len(dev, &buf->lba, &buf->len); if (retval != 0) { buf->status = CAMDD_STATUS_EOF; if ((buf->len == 0) && ((dev->flags & (CAMDD_DEV_FLAG_EOF_SENT | CAMDD_DEV_FLAG_EOF_QUEUED)) != 0)) { camdd_release_buf(buf); goto bailout; } dev->flags |= CAMDD_DEV_FLAG_EOF_QUEUED; } data->fill_len = buf->len; data->src_start_offset = buf->lba * dev->sector_size; /* * Put this on the run queue. */ STAILQ_INSERT_TAIL(&dev->run_queue, buf, links); dev->num_run_queue++; /* We're done. */ goto bailout; } /* * Check for new EOF status from the reader. */ if ((read_buf->status == CAMDD_STATUS_EOF) || (read_buf->status == CAMDD_STATUS_ERROR)) { dev->flags |= CAMDD_DEV_FLAG_PEER_EOF; if ((STAILQ_FIRST(&dev->pending_queue) == NULL) && (read_buf->len == 0)) { camdd_complete_peer_buf(dev, read_buf); retval = 1; goto bailout; } else eof_flush_needed = 1; } /* * See if we have a buffer we're composing with pieces from our * partner thread. */ buf = STAILQ_FIRST(&dev->pending_queue); if (buf == NULL) { uint64_t lba; ssize_t len; retval = camdd_get_next_lba_len(dev, &lba, &len); if (retval != 0) { read_buf->status = CAMDD_STATUS_EOF; if (len == 0) { dev->flags |= CAMDD_DEV_FLAG_EOF; error = camdd_complete_peer_buf(dev, read_buf); goto bailout; } } /* * If we don't have a pending buffer, we need to grab a new * one from the free list or allocate another one. */ buf = camdd_get_buf(dev, CAMDD_BUF_DATA); if (buf == NULL) { retval = 1; goto bailout; } buf->lba = lba; buf->len = len; STAILQ_INSERT_TAIL(&dev->pending_queue, buf, links); dev->num_pending_queue++; } data = &buf->buf_type_spec.data; rb_data = &read_buf->buf_type_spec.data; if ((rb_data->src_start_offset != dev->next_peer_pos_bytes) && (dev->debug != 0)) { printf("%s: WARNING: reader offset %#jx != expected offset " "%#jx\n", __func__, (uintmax_t)rb_data->src_start_offset, (uintmax_t)dev->next_peer_pos_bytes); } dev->next_peer_pos_bytes = rb_data->src_start_offset + (rb_data->fill_len - rb_data->resid); new_len = (rb_data->fill_len - rb_data->resid) + data->fill_len; if (new_len < buf->len) { /* * There are three cases here: * 1. We need more data to fill up a block, so we put * this I/O on the queue and wait for more I/O. * 2. We have a pending buffer in the queue that is * smaller than our blocksize, but we got an EOF. So we * need to go ahead and flush the write out. * 3. We got an error. */ /* * Increment our fill length. */ data->fill_len += (rb_data->fill_len - rb_data->resid); /* * Add the new read buffer to the list for writing. */ STAILQ_INSERT_TAIL(&buf->src_list, read_buf, src_links); /* Increment the count */ buf->src_count++; if (eof_flush_needed == 0) { /* * We need to exit, because we don't have enough * data yet. */ goto bailout; } else { /* * Take the buffer off of the pending queue. */ STAILQ_REMOVE(&dev->pending_queue, buf, camdd_buf, links); dev->num_pending_queue--; /* * If we need an EOF flush, but there is no data * to flush, go ahead and return this buffer. */ if (data->fill_len == 0) { camdd_complete_buf(dev, buf, /*error_count*/0); retval = 1; goto bailout; } /* * Put this on the next queue for execution. */ STAILQ_INSERT_TAIL(&dev->run_queue, buf, links); dev->num_run_queue++; } } else if (new_len == buf->len) { /* * We have enough data to completey fill one block, * so we're ready to issue the I/O. */ /* * Take the buffer off of the pending queue. */ STAILQ_REMOVE(&dev->pending_queue, buf, camdd_buf, links); dev->num_pending_queue--; /* * Add the new read buffer to the list for writing. */ STAILQ_INSERT_TAIL(&buf->src_list, read_buf, src_links); /* Increment the count */ buf->src_count++; /* * Increment our fill length. */ data->fill_len += (rb_data->fill_len - rb_data->resid); /* * Put this on the next queue for execution. */ STAILQ_INSERT_TAIL(&dev->run_queue, buf, links); dev->num_run_queue++; } else { struct camdd_buf *idb; struct camdd_buf_indirect *indirect; uint32_t len_to_go, cur_offset; idb = camdd_get_buf(dev, CAMDD_BUF_INDIRECT); if (idb == NULL) { retval = 1; goto bailout; } indirect = &idb->buf_type_spec.indirect; indirect->src_buf = read_buf; read_buf->refcount++; indirect->offset = 0; indirect->start_ptr = rb_data->buf; /* * We've already established that there is more * data in read_buf than we have room for in our * current write request. So this particular chunk * of the request should just be the remainder * needed to fill up a block. */ indirect->len = buf->len - (data->fill_len - data->resid); camdd_buf_add_child(buf, idb); /* * This buffer is ready to execute, so we can take * it off the pending queue and put it on the run * queue. */ STAILQ_REMOVE(&dev->pending_queue, buf, camdd_buf, links); dev->num_pending_queue--; STAILQ_INSERT_TAIL(&dev->run_queue, buf, links); dev->num_run_queue++; cur_offset = indirect->offset + indirect->len; /* * The resulting I/O would be too large to fit in * one block. We need to split this I/O into * multiple pieces. Allocate as many buffers as needed. */ for (len_to_go = rb_data->fill_len - rb_data->resid - indirect->len; len_to_go > 0;) { struct camdd_buf *new_buf; struct camdd_buf_data *new_data; uint64_t lba; ssize_t len; retval = camdd_get_next_lba_len(dev, &lba, &len); if ((retval != 0) && (len == 0)) { /* * The device has already been marked * as EOF, and there is no space left. */ goto bailout; } new_buf = camdd_get_buf(dev, CAMDD_BUF_DATA); if (new_buf == NULL) { retval = 1; goto bailout; } new_buf->lba = lba; new_buf->len = len; idb = camdd_get_buf(dev, CAMDD_BUF_INDIRECT); if (idb == NULL) { retval = 1; goto bailout; } indirect = &idb->buf_type_spec.indirect; indirect->src_buf = read_buf; read_buf->refcount++; indirect->offset = cur_offset; indirect->start_ptr = rb_data->buf + cur_offset; indirect->len = min(len_to_go, new_buf->len); #if 0 if (((indirect->len % dev->sector_size) != 0) || ((indirect->offset % dev->sector_size) != 0)) { warnx("offset %ju len %ju not aligned with " "sector size %u", indirect->offset, (uintmax_t)indirect->len, dev->sector_size); } #endif cur_offset += indirect->len; len_to_go -= indirect->len; camdd_buf_add_child(new_buf, idb); new_data = &new_buf->buf_type_spec.data; if ((new_data->fill_len == new_buf->len) || (eof_flush_needed != 0)) { STAILQ_INSERT_TAIL(&dev->run_queue, new_buf, links); dev->num_run_queue++; } else if (new_data->fill_len < buf->len) { STAILQ_INSERT_TAIL(&dev->pending_queue, new_buf, links); dev->num_pending_queue++; } else { warnx("%s: too much data in new " "buffer!", __func__); retval = 1; goto bailout; } } } bailout: return (retval); } void camdd_get_depth(struct camdd_dev *dev, uint32_t *our_depth, uint32_t *peer_depth, uint32_t *our_bytes, uint32_t *peer_bytes) { *our_depth = dev->cur_active_io + dev->num_run_queue; if (dev->num_peer_work_queue > dev->num_peer_done_queue) *peer_depth = dev->num_peer_work_queue - dev->num_peer_done_queue; else *peer_depth = 0; *our_bytes = *our_depth * dev->blocksize; *peer_bytes = dev->peer_bytes_queued; } void camdd_sig_handler(int sig) { if (sig == SIGINFO) need_status = 1; else { need_exit = 1; error_exit = 1; } sem_post(&camdd_sem); } void camdd_print_status(struct camdd_dev *camdd_dev, struct camdd_dev *other_dev, struct timespec *start_time) { struct timespec done_time; uint64_t total_ns; long double mb_sec, total_sec; int error = 0; error = clock_gettime(CLOCK_MONOTONIC_PRECISE, &done_time); if (error != 0) { warn("Unable to get done time"); return; } - timespecsub(&done_time, start_time); + timespecsub(&done_time, start_time, &done_time); total_ns = done_time.tv_nsec + (done_time.tv_sec * 1000000000); total_sec = total_ns; total_sec /= 1000000000; fprintf(stderr, "%ju bytes %s %s\n%ju bytes %s %s\n" "%.4Lf seconds elapsed\n", (uintmax_t)camdd_dev->bytes_transferred, (camdd_dev->write_dev == 0) ? "read from" : "written to", camdd_dev->device_name, (uintmax_t)other_dev->bytes_transferred, (other_dev->write_dev == 0) ? "read from" : "written to", other_dev->device_name, total_sec); mb_sec = min(other_dev->bytes_transferred,camdd_dev->bytes_transferred); mb_sec /= 1024 * 1024; mb_sec *= 1000000000; mb_sec /= total_ns; fprintf(stderr, "%.2Lf MB/sec\n", mb_sec); } int camdd_rw(struct camdd_io_opts *io_opts, int num_io_opts, uint64_t max_io, int retry_count, int timeout) { struct cam_device *new_cam_dev = NULL; struct camdd_dev *devs[2]; struct timespec start_time; pthread_t threads[2]; int unit = 0; int error = 0; int i; if (num_io_opts != 2) { warnx("Must have one input and one output path"); error = 1; goto bailout; } bzero(devs, sizeof(devs)); for (i = 0; i < num_io_opts; i++) { switch (io_opts[i].dev_type) { case CAMDD_DEV_PASS: { if (isdigit(io_opts[i].dev_name[0])) { camdd_argmask new_arglist = CAMDD_ARG_NONE; int bus = 0, target = 0, lun = 0; int rv; /* device specified as bus:target[:lun] */ rv = parse_btl(io_opts[i].dev_name, &bus, &target, &lun, &new_arglist); if (rv < 2) { warnx("numeric device specification " "must be either bus:target, or " "bus:target:lun"); error = 1; goto bailout; } /* default to 0 if lun was not specified */ if ((new_arglist & CAMDD_ARG_LUN) == 0) { lun = 0; new_arglist |= CAMDD_ARG_LUN; } new_cam_dev = cam_open_btl(bus, target, lun, O_RDWR, NULL); } else { char name[30]; if (cam_get_device(io_opts[i].dev_name, name, sizeof name, &unit) == -1) { warnx("%s", cam_errbuf); error = 1; goto bailout; } new_cam_dev = cam_open_spec_device(name, unit, O_RDWR, NULL); } if (new_cam_dev == NULL) { warnx("%s", cam_errbuf); error = 1; goto bailout; } devs[i] = camdd_probe_pass(new_cam_dev, /*io_opts*/ &io_opts[i], CAMDD_ARG_ERR_RECOVER, /*probe_retry_count*/ 3, /*probe_timeout*/ 5000, /*io_retry_count*/ retry_count, /*io_timeout*/ timeout); if (devs[i] == NULL) { warn("Unable to probe device %s%u", new_cam_dev->device_name, new_cam_dev->dev_unit_num); error = 1; goto bailout; } break; } case CAMDD_DEV_FILE: { int fd = -1; if (io_opts[i].dev_name[0] == '-') { if (io_opts[i].write_dev != 0) fd = STDOUT_FILENO; else fd = STDIN_FILENO; } else { if (io_opts[i].write_dev != 0) { fd = open(io_opts[i].dev_name, O_RDWR | O_CREAT, S_IWUSR |S_IRUSR); } else { fd = open(io_opts[i].dev_name, O_RDONLY); } } if (fd == -1) { warn("error opening file %s", io_opts[i].dev_name); error = 1; goto bailout; } devs[i] = camdd_probe_file(fd, &io_opts[i], retry_count, timeout); if (devs[i] == NULL) { error = 1; goto bailout; } break; } default: warnx("Unknown device type %d (%s)", io_opts[i].dev_type, io_opts[i].dev_name); error = 1; goto bailout; break; /*NOTREACHED */ } devs[i]->write_dev = io_opts[i].write_dev; devs[i]->start_offset_bytes = io_opts[i].offset; if (max_io != 0) { devs[i]->sector_io_limit = (devs[i]->start_offset_bytes / devs[i]->sector_size) + (max_io / devs[i]->sector_size) - 1; } devs[i]->next_io_pos_bytes = devs[i]->start_offset_bytes; devs[i]->next_completion_pos_bytes =devs[i]->start_offset_bytes; } devs[0]->peer_dev = devs[1]; devs[1]->peer_dev = devs[0]; devs[0]->next_peer_pos_bytes = devs[0]->peer_dev->next_io_pos_bytes; devs[1]->next_peer_pos_bytes = devs[1]->peer_dev->next_io_pos_bytes; sem_init(&camdd_sem, /*pshared*/ 0, 0); signal(SIGINFO, camdd_sig_handler); signal(SIGINT, camdd_sig_handler); error = clock_gettime(CLOCK_MONOTONIC_PRECISE, &start_time); if (error != 0) { warn("Unable to get start time"); goto bailout; } for (i = 0; i < num_io_opts; i++) { error = pthread_create(&threads[i], NULL, camdd_worker, (void *)devs[i]); if (error != 0) { warnc(error, "pthread_create() failed"); goto bailout; } } for (;;) { if ((sem_wait(&camdd_sem) == -1) || (need_exit != 0)) { struct kevent ke; for (i = 0; i < num_io_opts; i++) { EV_SET(&ke, (uintptr_t)&devs[i]->work_queue, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL); devs[i]->flags |= CAMDD_DEV_FLAG_EOF; error = kevent(devs[i]->kq, &ke, 1, NULL, 0, NULL); if (error == -1) warn("%s: unable to wake up thread", __func__); error = 0; } break; } else if (need_status != 0) { camdd_print_status(devs[0], devs[1], &start_time); need_status = 0; } } for (i = 0; i < num_io_opts; i++) { pthread_join(threads[i], NULL); } camdd_print_status(devs[0], devs[1], &start_time); bailout: for (i = 0; i < num_io_opts; i++) camdd_free_dev(devs[i]); return (error + error_exit); } void usage(void) { fprintf(stderr, "usage: camdd <-i|-o pass=pass0,bs=1M,offset=1M,depth=4>\n" " <-i|-o file=/tmp/file,bs=512K,offset=1M>\n" " <-i|-o file=/dev/da0,bs=512K,offset=1M>\n" " <-i|-o file=/dev/nsa0,bs=512K>\n" " [-C retry_count][-E][-m max_io_amt][-t timeout_secs][-v][-h]\n" "Option description\n" "-i Specify input device/file and parameters\n" "-o Specify output device/file and parameters\n" "Input and Output parameters\n" "pass=name Specify a pass(4) device like pass0 or /dev/pass0\n" "file=name Specify a file or device, /tmp/foo, /dev/da0, /dev/null\n" " or - for stdin/stdout\n" "bs=blocksize Specify blocksize in bytes, or using K, M, G, etc. suffix\n" "offset=len Specify starting offset in bytes or using K, M, G suffix\n" " NOTE: offset cannot be specified on tapes, pipes, stdin/out\n" "depth=N Specify a numeric queue depth. This only applies to pass(4)\n" "mcs=N Specify a minimum cmd size for pass(4) read/write commands\n" "Optional arguments\n" "-C retry_cnt Specify a retry count for pass(4) devices\n" "-E Enable CAM error recovery for pass(4) devices\n" "-m max_io Specify the maximum amount to be transferred in bytes or\n" " using K, G, M, etc. suffixes\n" "-t timeout Specify the I/O timeout to use with pass(4) devices\n" "-v Enable verbose error recovery\n" "-h Print this message\n"); } int camdd_parse_io_opts(char *args, int is_write, struct camdd_io_opts *io_opts) { char *tmpstr, *tmpstr2; char *orig_tmpstr = NULL; int retval = 0; io_opts->write_dev = is_write; tmpstr = strdup(args); if (tmpstr == NULL) { warn("strdup failed"); retval = 1; goto bailout; } orig_tmpstr = tmpstr; while ((tmpstr2 = strsep(&tmpstr, ",")) != NULL) { char *name, *value; /* * If the user creates an empty parameter by putting in two * commas, skip over it and look for the next field. */ if (*tmpstr2 == '\0') continue; name = strsep(&tmpstr2, "="); if (*name == '\0') { warnx("Got empty I/O parameter name"); retval = 1; goto bailout; } value = strsep(&tmpstr2, "="); if ((value == NULL) || (*value == '\0')) { warnx("Empty I/O parameter value for %s", name); retval = 1; goto bailout; } if (strncasecmp(name, "file", 4) == 0) { io_opts->dev_type = CAMDD_DEV_FILE; io_opts->dev_name = strdup(value); if (io_opts->dev_name == NULL) { warn("Error allocating memory"); retval = 1; goto bailout; } } else if (strncasecmp(name, "pass", 4) == 0) { io_opts->dev_type = CAMDD_DEV_PASS; io_opts->dev_name = strdup(value); if (io_opts->dev_name == NULL) { warn("Error allocating memory"); retval = 1; goto bailout; } } else if ((strncasecmp(name, "bs", 2) == 0) || (strncasecmp(name, "blocksize", 9) == 0)) { retval = expand_number(value, &io_opts->blocksize); if (retval == -1) { warn("expand_number(3) failed on %s=%s", name, value); retval = 1; goto bailout; } } else if (strncasecmp(name, "depth", 5) == 0) { char *endptr; io_opts->queue_depth = strtoull(value, &endptr, 0); if (*endptr != '\0') { warnx("invalid queue depth %s", value); retval = 1; goto bailout; } } else if (strncasecmp(name, "mcs", 3) == 0) { char *endptr; io_opts->min_cmd_size = strtol(value, &endptr, 0); if ((*endptr != '\0') || ((io_opts->min_cmd_size > 16) || (io_opts->min_cmd_size < 0))) { warnx("invalid minimum cmd size %s", value); retval = 1; goto bailout; } } else if (strncasecmp(name, "offset", 6) == 0) { retval = expand_number(value, &io_opts->offset); if (retval == -1) { warn("expand_number(3) failed on %s=%s", name, value); retval = 1; goto bailout; } } else if (strncasecmp(name, "debug", 5) == 0) { char *endptr; io_opts->debug = strtoull(value, &endptr, 0); if (*endptr != '\0') { warnx("invalid debug level %s", value); retval = 1; goto bailout; } } else { warnx("Unrecognized parameter %s=%s", name, value); } } bailout: free(orig_tmpstr); return (retval); } int main(int argc, char **argv) { int c; camdd_argmask arglist = CAMDD_ARG_NONE; int timeout = 0, retry_count = 1; int error = 0; uint64_t max_io = 0; struct camdd_io_opts *opt_list = NULL; if (argc == 1) { usage(); exit(1); } opt_list = calloc(2, sizeof(struct camdd_io_opts)); if (opt_list == NULL) { warn("Unable to allocate option list"); error = 1; goto bailout; } while ((c = getopt(argc, argv, "C:Ehi:m:o:t:v")) != -1){ switch (c) { case 'C': retry_count = strtol(optarg, NULL, 0); if (retry_count < 0) errx(1, "retry count %d is < 0", retry_count); arglist |= CAMDD_ARG_RETRIES; break; case 'E': arglist |= CAMDD_ARG_ERR_RECOVER; break; case 'i': case 'o': if (((c == 'i') && (opt_list[0].dev_type != CAMDD_DEV_NONE)) || ((c == 'o') && (opt_list[1].dev_type != CAMDD_DEV_NONE))) { errx(1, "Only one input and output path " "allowed"); } error = camdd_parse_io_opts(optarg, (c == 'o') ? 1 : 0, (c == 'o') ? &opt_list[1] : &opt_list[0]); if (error != 0) goto bailout; break; case 'm': error = expand_number(optarg, &max_io); if (error == -1) { warn("invalid maximum I/O amount %s", optarg); error = 1; goto bailout; } break; case 't': timeout = strtol(optarg, NULL, 0); if (timeout < 0) errx(1, "invalid timeout %d", timeout); /* Convert the timeout from seconds to ms */ timeout *= 1000; arglist |= CAMDD_ARG_TIMEOUT; break; case 'v': arglist |= CAMDD_ARG_VERBOSE; break; case 'h': default: usage(); exit(1); break; /*NOTREACHED*/ } } if ((opt_list[0].dev_type == CAMDD_DEV_NONE) || (opt_list[1].dev_type == CAMDD_DEV_NONE)) errx(1, "Must specify both -i and -o"); /* * Set the timeout if the user hasn't specified one. */ if (timeout == 0) timeout = CAMDD_PASS_RW_TIMEOUT; error = camdd_rw(opt_list, 2, max_io, retry_count, timeout); bailout: free(opt_list); exit(error); }