diff --git a/lib/libprocstat/libprocstat.c b/lib/libprocstat/libprocstat.c
index 8f1f58d12957..0c7d28540e43 100644
--- a/lib/libprocstat/libprocstat.c
+++ b/lib/libprocstat/libprocstat.c
@@ -1,2786 +1,2765 @@
/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 2017 Dell EMC
 * Copyright (c) 2009 Stanislav Sedov
 * Copyright (c) 1988, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
*/ #include #include #include #include #define _WANT_UCRED #include #undef _WANT_UCRED #include #include #include #include #include #define _WANT_SOCKET #include #include #include #include #define _WANT_UNPCB #include #include #include #include #include #define _WANT_FILE #include #include #include #include #include #include #define _WANT_MOUNT #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#define _WANT_INPCB -#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "libprocstat_internal.h" #include "common_kvm.h" #include "core.h" int statfs(const char *, struct statfs *); /* XXX */ #define PROCSTAT_KVM 1 #define PROCSTAT_SYSCTL 2 #define PROCSTAT_CORE 3 static char **getargv(struct procstat *procstat, struct kinfo_proc *kp, size_t nchr, int env); static char *getmnton(kvm_t *kd, struct mount *m); static struct kinfo_vmentry * kinfo_getvmmap_core(struct procstat_core *core, int *cntp); static Elf_Auxinfo *procstat_getauxv_core(struct procstat_core *core, unsigned int *cntp); static Elf_Auxinfo *procstat_getauxv_sysctl(pid_t pid, unsigned int *cntp); static struct filestat_list *procstat_getfiles_kvm( struct procstat *procstat, struct kinfo_proc *kp, int mmapped); static struct filestat_list *procstat_getfiles_sysctl( struct procstat *procstat, struct kinfo_proc *kp, int mmapped); static int procstat_get_pipe_info_sysctl(struct filestat *fst, struct pipestat *pipe, char *errbuf); static int procstat_get_pipe_info_kvm(kvm_t *kd, struct filestat *fst, struct pipestat *pipe, char *errbuf); static int procstat_get_pts_info_sysctl(struct filestat *fst, struct ptsstat *pts, char *errbuf); static int procstat_get_pts_info_kvm(kvm_t *kd, struct filestat *fst, struct ptsstat *pts, char *errbuf); static int procstat_get_sem_info_sysctl(struct filestat *fst, struct semstat *sem, char *errbuf); static int procstat_get_sem_info_kvm(kvm_t *kd, struct filestat *fst, struct semstat *sem, char *errbuf); static int procstat_get_shm_info_sysctl(struct filestat *fst, struct shmstat *shm, char *errbuf); static int procstat_get_shm_info_kvm(kvm_t *kd, struct filestat *fst, struct shmstat *shm, char *errbuf); static int procstat_get_socket_info_sysctl(struct filestat *fst, struct sockstat *sock, char *errbuf); static int procstat_get_socket_info_kvm(kvm_t *kd, struct filestat *fst, struct sockstat *sock, char *errbuf); static int to_filestat_flags(int flags); static int procstat_get_vnode_info_kvm(kvm_t *kd, struct filestat *fst, struct vnstat *vn, char *errbuf); static int procstat_get_vnode_info_sysctl(struct filestat *fst, struct vnstat *vn, char *errbuf); static gid_t *procstat_getgroups_core(struct procstat_core *core, unsigned int *count); static gid_t * procstat_getgroups_kvm(kvm_t *kd, struct kinfo_proc *kp, unsigned int *count); static gid_t *procstat_getgroups_sysctl(pid_t pid, unsigned int *count); static struct kinfo_kstack *procstat_getkstack_sysctl(pid_t pid, int *cntp); static int procstat_getosrel_core(struct procstat_core *core, int *osrelp); static int procstat_getosrel_kvm(kvm_t *kd, struct kinfo_proc *kp, int *osrelp); static int procstat_getosrel_sysctl(pid_t pid, int *osrelp); static int procstat_getpathname_core(struct procstat_core *core, char *pathname, size_t maxlen); static int procstat_getpathname_sysctl(pid_t pid, char *pathname, size_t maxlen); static int procstat_getrlimit_core(struct 
procstat_core *core, int which, struct rlimit* rlimit); static int procstat_getrlimit_kvm(kvm_t *kd, struct kinfo_proc *kp, int which, struct rlimit* rlimit); static int procstat_getrlimit_sysctl(pid_t pid, int which, struct rlimit* rlimit); static int procstat_getumask_core(struct procstat_core *core, unsigned short *maskp); static int procstat_getumask_kvm(kvm_t *kd, struct kinfo_proc *kp, unsigned short *maskp); static int procstat_getumask_sysctl(pid_t pid, unsigned short *maskp); static int vntype2psfsttype(int type); void procstat_close(struct procstat *procstat) { assert(procstat); if (procstat->type == PROCSTAT_KVM) kvm_close(procstat->kd); else if (procstat->type == PROCSTAT_CORE) procstat_core_close(procstat->core); procstat_freeargv(procstat); procstat_freeenvv(procstat); free(procstat); } struct procstat * procstat_open_sysctl(void) { struct procstat *procstat; procstat = calloc(1, sizeof(*procstat)); if (procstat == NULL) { warn("malloc()"); return (NULL); } procstat->type = PROCSTAT_SYSCTL; return (procstat); } struct procstat * procstat_open_kvm(const char *nlistf, const char *memf) { struct procstat *procstat; kvm_t *kd; char buf[_POSIX2_LINE_MAX]; procstat = calloc(1, sizeof(*procstat)); if (procstat == NULL) { warn("malloc()"); return (NULL); } kd = kvm_openfiles(nlistf, memf, NULL, O_RDONLY, buf); if (kd == NULL) { warnx("kvm_openfiles(): %s", buf); free(procstat); return (NULL); } procstat->type = PROCSTAT_KVM; procstat->kd = kd; return (procstat); } struct procstat * procstat_open_core(const char *filename) { struct procstat *procstat; struct procstat_core *core; procstat = calloc(1, sizeof(*procstat)); if (procstat == NULL) { warn("malloc()"); return (NULL); } core = procstat_core_open(filename); if (core == NULL) { free(procstat); return (NULL); } procstat->type = PROCSTAT_CORE; procstat->core = core; return (procstat); } struct kinfo_proc * procstat_getprocs(struct procstat *procstat, int what, int arg, unsigned int *count) { struct kinfo_proc *p0, *p; size_t len, olen; int name[4]; int cnt; int error; assert(procstat); assert(count); p = NULL; if (procstat->type == PROCSTAT_KVM) { *count = 0; p0 = kvm_getprocs(procstat->kd, what, arg, &cnt); if (p0 == NULL || cnt <= 0) return (NULL); *count = cnt; len = *count * sizeof(*p); p = malloc(len); if (p == NULL) { warnx("malloc(%zu)", len); goto fail; } bcopy(p0, p, len); return (p); } else if (procstat->type == PROCSTAT_SYSCTL) { len = 0; name[0] = CTL_KERN; name[1] = KERN_PROC; name[2] = what; name[3] = arg; error = sysctl(name, nitems(name), NULL, &len, NULL, 0); if (error < 0 && errno != EPERM) { warn("sysctl(kern.proc)"); goto fail; } if (len == 0) { warnx("no processes?"); goto fail; } do { len += len / 10; p = reallocf(p, len); if (p == NULL) { warnx("reallocf(%zu)", len); goto fail; } olen = len; error = sysctl(name, nitems(name), p, &len, NULL, 0); } while (error < 0 && errno == ENOMEM && olen == len); if (error < 0 && errno != EPERM) { warn("sysctl(kern.proc)"); goto fail; } /* Perform simple consistency checks. 
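* A failed check typically means the kinfo_proc ABI of the running
* kernel differs from the one this library was built against, so the
* buffer is refused rather than misinterpreted.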
*/ if ((len % sizeof(*p)) != 0 || p->ki_structsize != sizeof(*p)) { warnx("kinfo_proc structure size mismatch (len = %zu)", len); goto fail; } *count = len / sizeof(*p); return (p); } else if (procstat->type == PROCSTAT_CORE) { p = procstat_core_get(procstat->core, PSC_TYPE_PROC, NULL, &len); if ((len % sizeof(*p)) != 0 || p->ki_structsize != sizeof(*p)) { warnx("kinfo_proc structure size mismatch"); goto fail; } *count = len / sizeof(*p); return (p); } else { warnx("unknown access method: %d", procstat->type); return (NULL); } fail: if (p) free(p); return (NULL); } void procstat_freeprocs(struct procstat *procstat __unused, struct kinfo_proc *p) { if (p != NULL) free(p); p = NULL; } struct filestat_list * procstat_getfiles(struct procstat *procstat, struct kinfo_proc *kp, int mmapped) { switch (procstat->type) { case PROCSTAT_KVM: return (procstat_getfiles_kvm(procstat, kp, mmapped)); case PROCSTAT_SYSCTL: case PROCSTAT_CORE: return (procstat_getfiles_sysctl(procstat, kp, mmapped)); default: warnx("unknown access method: %d", procstat->type); return (NULL); } } void procstat_freefiles(struct procstat *procstat, struct filestat_list *head) { struct filestat *fst, *tmp; STAILQ_FOREACH_SAFE(fst, head, next, tmp) { if (fst->fs_path != NULL) free(fst->fs_path); free(fst); } free(head); if (procstat->vmentries != NULL) { free(procstat->vmentries); procstat->vmentries = NULL; } if (procstat->files != NULL) { free(procstat->files); procstat->files = NULL; } } static struct filestat * filestat_new_entry(void *typedep, int type, int fd, int fflags, int uflags, int refcount, off_t offset, char *path, cap_rights_t *cap_rightsp) { struct filestat *entry; entry = calloc(1, sizeof(*entry)); if (entry == NULL) { warn("malloc()"); return (NULL); } entry->fs_typedep = typedep; entry->fs_fflags = fflags; entry->fs_uflags = uflags; entry->fs_fd = fd; entry->fs_type = type; entry->fs_ref_count = refcount; entry->fs_offset = offset; entry->fs_path = path; if (cap_rightsp != NULL) entry->fs_cap_rights = *cap_rightsp; else cap_rights_init(&entry->fs_cap_rights); return (entry); } static struct vnode * getctty(kvm_t *kd, struct kinfo_proc *kp) { struct pgrp pgrp; struct proc proc; struct session sess; int error; assert(kp); error = kvm_read_all(kd, (unsigned long)kp->ki_paddr, &proc, sizeof(proc)); if (error == 0) { warnx("can't read proc struct at %p for pid %d", kp->ki_paddr, kp->ki_pid); return (NULL); } if (proc.p_pgrp == NULL) return (NULL); error = kvm_read_all(kd, (unsigned long)proc.p_pgrp, &pgrp, sizeof(pgrp)); if (error == 0) { warnx("can't read pgrp struct at %p for pid %d", proc.p_pgrp, kp->ki_pid); return (NULL); } error = kvm_read_all(kd, (unsigned long)pgrp.pg_session, &sess, sizeof(sess)); if (error == 0) { warnx("can't read session struct at %p for pid %d", pgrp.pg_session, kp->ki_pid); return (NULL); } return (sess.s_ttyvp); } static int procstat_vm_map_reader(void *token, vm_map_entry_t addr, vm_map_entry_t dest) { kvm_t *kd; kd = (kvm_t *)token; return (kvm_read_all(kd, (unsigned long)addr, dest, sizeof(*dest))); } static struct filestat_list * procstat_getfiles_kvm(struct procstat *procstat, struct kinfo_proc *kp, int mmapped) { struct file file; struct filedesc filed; struct pwddesc pathsd; struct fdescenttbl *fdt; struct pwd pwd; unsigned long pwd_addr; struct vm_map_entry vmentry; struct vm_object object; struct vmspace vmspace; vm_map_entry_t entryp; vm_object_t objp; struct vnode *vp; struct filestat *entry; struct filestat_list *head; kvm_t *kd; void *data; int fflags; unsigned int i; 
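/*
 * Illustrative sketch (an editor's example, not part of this patch;
 * error handling omitted): a minimal consumer of the file list built
 * by this function, using only the public API defined in this
 * library.  Assumes <sys/param.h>, <sys/queue.h>, <sys/sysctl.h>,
 * <libprocstat.h>, <stdio.h> and <unistd.h>; link with -lprocstat.
 *
 *	struct procstat *ps = procstat_open_sysctl();
 *	unsigned int cnt;
 *	struct kinfo_proc *p = procstat_getprocs(ps, KERN_PROC_PID,
 *	    getpid(), &cnt);
 *	struct filestat_list *fhead = procstat_getfiles(ps, p, 0);
 *	struct filestat *fs;
 *	STAILQ_FOREACH(fs, fhead, next)
 *		printf("fd %d flags %#x %s\n", fs->fs_fd, fs->fs_uflags,
 *		    fs->fs_path != NULL ? fs->fs_path : "-");
 *	procstat_freefiles(ps, fhead);
 *	procstat_freeprocs(ps, p);
 *	procstat_close(ps);
 */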
int prot, type; size_t fdt_size; unsigned int nfiles; bool haspwd; assert(procstat); kd = procstat->kd; if (kd == NULL) return (NULL); if (kp->ki_fd == NULL || kp->ki_pd == NULL) return (NULL); if (!kvm_read_all(kd, (unsigned long)kp->ki_fd, &filed, sizeof(filed))) { warnx("can't read filedesc at %p", (void *)kp->ki_fd); return (NULL); } if (!kvm_read_all(kd, (unsigned long)kp->ki_pd, &pathsd, sizeof(pathsd))) { warnx("can't read pwddesc at %p", (void *)kp->ki_pd); return (NULL); } haspwd = false; pwd_addr = (unsigned long)(PWDDESC_KVM_LOAD_PWD(&pathsd)); if (pwd_addr != 0) { if (!kvm_read_all(kd, pwd_addr, &pwd, sizeof(pwd))) { warnx("can't read fd_pwd at %p", (void *)pwd_addr); return (NULL); } haspwd = true; } /* * Allocate list head. */ head = malloc(sizeof(*head)); if (head == NULL) return (NULL); STAILQ_INIT(head); /* root directory vnode, if one. */ if (haspwd) { if (pwd.pwd_rdir) { entry = filestat_new_entry(pwd.pwd_rdir, PS_FST_TYPE_VNODE, -1, PS_FST_FFLAG_READ, PS_FST_UFLAG_RDIR, 0, 0, NULL, NULL); if (entry != NULL) STAILQ_INSERT_TAIL(head, entry, next); } /* current working directory vnode. */ if (pwd.pwd_cdir) { entry = filestat_new_entry(pwd.pwd_cdir, PS_FST_TYPE_VNODE, -1, PS_FST_FFLAG_READ, PS_FST_UFLAG_CDIR, 0, 0, NULL, NULL); if (entry != NULL) STAILQ_INSERT_TAIL(head, entry, next); } /* jail root, if any. */ if (pwd.pwd_jdir) { entry = filestat_new_entry(pwd.pwd_jdir, PS_FST_TYPE_VNODE, -1, PS_FST_FFLAG_READ, PS_FST_UFLAG_JAIL, 0, 0, NULL, NULL); if (entry != NULL) STAILQ_INSERT_TAIL(head, entry, next); } } /* ktrace vnode, if one */ if (kp->ki_tracep) { entry = filestat_new_entry(kp->ki_tracep, PS_FST_TYPE_VNODE, -1, PS_FST_FFLAG_READ | PS_FST_FFLAG_WRITE, PS_FST_UFLAG_TRACE, 0, 0, NULL, NULL); if (entry != NULL) STAILQ_INSERT_TAIL(head, entry, next); } /* text vnode, if one */ if (kp->ki_textvp) { entry = filestat_new_entry(kp->ki_textvp, PS_FST_TYPE_VNODE, -1, PS_FST_FFLAG_READ, PS_FST_UFLAG_TEXT, 0, 0, NULL, NULL); if (entry != NULL) STAILQ_INSERT_TAIL(head, entry, next); } /* Controlling terminal. 
*/ if ((vp = getctty(kd, kp)) != NULL) { entry = filestat_new_entry(vp, PS_FST_TYPE_VNODE, -1, PS_FST_FFLAG_READ | PS_FST_FFLAG_WRITE, PS_FST_UFLAG_CTTY, 0, 0, NULL, NULL); if (entry != NULL) STAILQ_INSERT_TAIL(head, entry, next); } if (!kvm_read_all(kd, (unsigned long)filed.fd_files, &nfiles, sizeof(nfiles))) { warnx("can't read fd_files at %p", (void *)filed.fd_files); return (NULL); } fdt_size = sizeof(*fdt) + nfiles * sizeof(struct filedescent); fdt = malloc(fdt_size); if (fdt == NULL) { warn("malloc(%zu)", fdt_size); goto do_mmapped; } if (!kvm_read_all(kd, (unsigned long)filed.fd_files, fdt, fdt_size)) { warnx("cannot read file structures at %p", (void *)filed.fd_files); free(fdt); goto do_mmapped; } for (i = 0; i < nfiles; i++) { if (fdt->fdt_ofiles[i].fde_file == NULL) { continue; } if (!kvm_read_all(kd, (unsigned long)fdt->fdt_ofiles[i].fde_file, &file, sizeof(struct file))) { warnx("can't read file %d at %p", i, (void *)fdt->fdt_ofiles[i].fde_file); continue; } switch (file.f_type) { case DTYPE_VNODE: type = PS_FST_TYPE_VNODE; data = file.f_vnode; break; case DTYPE_SOCKET: type = PS_FST_TYPE_SOCKET; data = file.f_data; break; case DTYPE_PIPE: type = PS_FST_TYPE_PIPE; data = file.f_data; break; case DTYPE_FIFO: type = PS_FST_TYPE_FIFO; data = file.f_vnode; break; #ifdef DTYPE_PTS case DTYPE_PTS: type = PS_FST_TYPE_PTS; data = file.f_data; break; #endif case DTYPE_SEM: type = PS_FST_TYPE_SEM; data = file.f_data; break; case DTYPE_SHM: type = PS_FST_TYPE_SHM; data = file.f_data; break; case DTYPE_PROCDESC: type = PS_FST_TYPE_PROCDESC; data = file.f_data; break; case DTYPE_DEV: type = PS_FST_TYPE_DEV; data = file.f_data; break; case DTYPE_EVENTFD: type = PS_FST_TYPE_EVENTFD; data = file.f_data; break; default: continue; } /* XXXRW: No capability rights support for kvm yet. */ entry = filestat_new_entry(data, type, i, to_filestat_flags(file.f_flag), 0, 0, 0, NULL, NULL); if (entry != NULL) STAILQ_INSERT_TAIL(head, entry, next); } free(fdt); do_mmapped: /* * Process mmapped files if requested. */ if (mmapped) { if (!kvm_read_all(kd, (unsigned long)kp->ki_vmspace, &vmspace, sizeof(vmspace))) { warnx("can't read vmspace at %p", (void *)kp->ki_vmspace); goto exit; } vmentry = vmspace.vm_map.header; for (entryp = vm_map_entry_read_succ(kd, &vmentry, procstat_vm_map_reader); entryp != NULL && entryp != &kp->ki_vmspace->vm_map.header; entryp = vm_map_entry_read_succ(kd, &vmentry, procstat_vm_map_reader)) { if (vmentry.eflags & MAP_ENTRY_IS_SUB_MAP) continue; if ((objp = vmentry.object.vm_object) == NULL) continue; for (; objp; objp = object.backing_object) { if (!kvm_read_all(kd, (unsigned long)objp, &object, sizeof(object))) { warnx("can't read vm_object at %p", (void *)objp); break; } } /* We want only vnode objects. */ if (object.type != OBJT_VNODE) continue; prot = vmentry.protection; fflags = 0; if (prot & VM_PROT_READ) fflags = PS_FST_FFLAG_READ; if ((vmentry.eflags & MAP_ENTRY_COW) == 0 && prot & VM_PROT_WRITE) fflags |= PS_FST_FFLAG_WRITE; /* * Create filestat entry. */ entry = filestat_new_entry(object.handle, PS_FST_TYPE_VNODE, -1, fflags, PS_FST_UFLAG_MMAP, 0, 0, NULL, NULL); if (entry != NULL) STAILQ_INSERT_TAIL(head, entry, next); } if (entryp == NULL) warnx("can't read vm_map_entry"); } exit: return (head); } /* * kinfo types to filestat translation. 
*/ static int kinfo_type2fst(int kftype) { static struct { int kf_type; int fst_type; } kftypes2fst[] = { { KF_TYPE_PROCDESC, PS_FST_TYPE_PROCDESC }, { KF_TYPE_DEV, PS_FST_TYPE_DEV }, { KF_TYPE_FIFO, PS_FST_TYPE_FIFO }, { KF_TYPE_KQUEUE, PS_FST_TYPE_KQUEUE }, { KF_TYPE_MQUEUE, PS_FST_TYPE_MQUEUE }, { KF_TYPE_NONE, PS_FST_TYPE_NONE }, { KF_TYPE_PIPE, PS_FST_TYPE_PIPE }, { KF_TYPE_PTS, PS_FST_TYPE_PTS }, { KF_TYPE_SEM, PS_FST_TYPE_SEM }, { KF_TYPE_SHM, PS_FST_TYPE_SHM }, { KF_TYPE_SOCKET, PS_FST_TYPE_SOCKET }, { KF_TYPE_VNODE, PS_FST_TYPE_VNODE }, { KF_TYPE_EVENTFD, PS_FST_TYPE_EVENTFD }, { KF_TYPE_UNKNOWN, PS_FST_TYPE_UNKNOWN } }; #define NKFTYPES (sizeof(kftypes2fst) / sizeof(*kftypes2fst)) unsigned int i; for (i = 0; i < NKFTYPES; i++) if (kftypes2fst[i].kf_type == kftype) break; if (i == NKFTYPES) return (PS_FST_TYPE_UNKNOWN); return (kftypes2fst[i].fst_type); } /* * kinfo flags to filestat translation. */ static int kinfo_fflags2fst(int kfflags) { static struct { int kf_flag; int fst_flag; } kfflags2fst[] = { { KF_FLAG_APPEND, PS_FST_FFLAG_APPEND }, { KF_FLAG_ASYNC, PS_FST_FFLAG_ASYNC }, { KF_FLAG_CREAT, PS_FST_FFLAG_CREAT }, { KF_FLAG_DIRECT, PS_FST_FFLAG_DIRECT }, { KF_FLAG_EXCL, PS_FST_FFLAG_EXCL }, { KF_FLAG_EXEC, PS_FST_FFLAG_EXEC }, { KF_FLAG_EXLOCK, PS_FST_FFLAG_EXLOCK }, { KF_FLAG_FSYNC, PS_FST_FFLAG_SYNC }, { KF_FLAG_HASLOCK, PS_FST_FFLAG_HASLOCK }, { KF_FLAG_NOFOLLOW, PS_FST_FFLAG_NOFOLLOW }, { KF_FLAG_NONBLOCK, PS_FST_FFLAG_NONBLOCK }, { KF_FLAG_READ, PS_FST_FFLAG_READ }, { KF_FLAG_SHLOCK, PS_FST_FFLAG_SHLOCK }, { KF_FLAG_TRUNC, PS_FST_FFLAG_TRUNC }, { KF_FLAG_WRITE, PS_FST_FFLAG_WRITE } }; #define NKFFLAGS (sizeof(kfflags2fst) / sizeof(*kfflags2fst)) unsigned int i; int flags; flags = 0; for (i = 0; i < NKFFLAGS; i++) if ((kfflags & kfflags2fst[i].kf_flag) != 0) flags |= kfflags2fst[i].fst_flag; return (flags); } static int kinfo_uflags2fst(int fd) { switch (fd) { case KF_FD_TYPE_CTTY: return (PS_FST_UFLAG_CTTY); case KF_FD_TYPE_CWD: return (PS_FST_UFLAG_CDIR); case KF_FD_TYPE_JAIL: return (PS_FST_UFLAG_JAIL); case KF_FD_TYPE_TEXT: return (PS_FST_UFLAG_TEXT); case KF_FD_TYPE_TRACE: return (PS_FST_UFLAG_TRACE); case KF_FD_TYPE_ROOT: return (PS_FST_UFLAG_RDIR); } return (0); } static struct kinfo_file * kinfo_getfile_core(struct procstat_core *core, int *cntp) { int cnt; size_t len; char *buf, *bp, *eb; struct kinfo_file *kif, *kp, *kf; buf = procstat_core_get(core, PSC_TYPE_FILES, NULL, &len); if (buf == NULL) return (NULL); /* * XXXMG: The code below is just copy&past from libutil. 
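* (The records arrive packed end to end: each kinfo_file carries its
* real length in kf_structsize, so pass 1 below counts the records
* and pass 2 expands each one into a fixed-size, pre-zeroed array
* slot.)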
* The code duplication can be avoided if libutil * is extended to provide something like: * struct kinfo_file *kinfo_getfile_from_buf(const char *buf, * size_t len, int *cntp); */ /* Pass 1: count items */ cnt = 0; bp = buf; eb = buf + len; while (bp < eb) { kf = (struct kinfo_file *)(uintptr_t)bp; if (kf->kf_structsize == 0) break; bp += kf->kf_structsize; cnt++; } kif = calloc(cnt, sizeof(*kif)); if (kif == NULL) { free(buf); return (NULL); } bp = buf; eb = buf + len; kp = kif; /* Pass 2: unpack */ while (bp < eb) { kf = (struct kinfo_file *)(uintptr_t)bp; if (kf->kf_structsize == 0) break; /* Copy/expand into pre-zeroed buffer */ memcpy(kp, kf, kf->kf_structsize); /* Advance to next packed record */ bp += kf->kf_structsize; /* Set field size to fixed length, advance */ kp->kf_structsize = sizeof(*kp); kp++; } free(buf); *cntp = cnt; return (kif); /* Caller must free() return value */ } static struct filestat_list * procstat_getfiles_sysctl(struct procstat *procstat, struct kinfo_proc *kp, int mmapped) { struct kinfo_file *kif, *files; struct kinfo_vmentry *kve, *vmentries; struct filestat_list *head; struct filestat *entry; char *path; off_t offset; int cnt, fd, fflags; int i, type, uflags; int refcount; cap_rights_t cap_rights; assert(kp); switch (procstat->type) { case PROCSTAT_SYSCTL: files = kinfo_getfile(kp->ki_pid, &cnt); break; case PROCSTAT_CORE: files = kinfo_getfile_core(procstat->core, &cnt); break; default: assert(!"invalid type"); } if (files == NULL && errno != EPERM) { warn("kinfo_getfile()"); return (NULL); } procstat->files = files; /* * Allocate list head. */ head = malloc(sizeof(*head)); if (head == NULL) return (NULL); STAILQ_INIT(head); for (i = 0; i < cnt; i++) { kif = &files[i]; type = kinfo_type2fst(kif->kf_type); fd = kif->kf_fd >= 0 ? kif->kf_fd : -1; fflags = kinfo_fflags2fst(kif->kf_flags); uflags = kinfo_uflags2fst(kif->kf_fd); refcount = kif->kf_ref_count; offset = kif->kf_offset; if (*kif->kf_path != '\0') path = strdup(kif->kf_path); else path = NULL; cap_rights = kif->kf_cap_rights; /* * Create filestat entry. 
*/ entry = filestat_new_entry(kif, type, fd, fflags, uflags, refcount, offset, path, &cap_rights); if (entry != NULL) STAILQ_INSERT_TAIL(head, entry, next); } if (mmapped != 0) { vmentries = procstat_getvmmap(procstat, kp, &cnt); procstat->vmentries = vmentries; if (vmentries == NULL || cnt == 0) goto fail; for (i = 0; i < cnt; i++) { kve = &vmentries[i]; if (kve->kve_type != KVME_TYPE_VNODE) continue; fflags = 0; if (kve->kve_protection & KVME_PROT_READ) fflags = PS_FST_FFLAG_READ; if ((kve->kve_flags & KVME_FLAG_COW) == 0 && kve->kve_protection & KVME_PROT_WRITE) fflags |= PS_FST_FFLAG_WRITE; offset = kve->kve_offset; refcount = kve->kve_ref_count; if (*kve->kve_path != '\0') path = strdup(kve->kve_path); else path = NULL; entry = filestat_new_entry(kve, PS_FST_TYPE_VNODE, -1, fflags, PS_FST_UFLAG_MMAP, refcount, offset, path, NULL); if (entry != NULL) STAILQ_INSERT_TAIL(head, entry, next); } } fail: return (head); } int procstat_get_pipe_info(struct procstat *procstat, struct filestat *fst, struct pipestat *ps, char *errbuf) { assert(ps); if (procstat->type == PROCSTAT_KVM) { return (procstat_get_pipe_info_kvm(procstat->kd, fst, ps, errbuf)); } else if (procstat->type == PROCSTAT_SYSCTL || procstat->type == PROCSTAT_CORE) { return (procstat_get_pipe_info_sysctl(fst, ps, errbuf)); } else { warnx("unknown access method: %d", procstat->type); if (errbuf != NULL) snprintf(errbuf, _POSIX2_LINE_MAX, "error"); return (1); } } static int procstat_get_pipe_info_kvm(kvm_t *kd, struct filestat *fst, struct pipestat *ps, char *errbuf) { struct pipe pi; void *pipep; assert(kd); assert(ps); assert(fst); bzero(ps, sizeof(*ps)); pipep = fst->fs_typedep; if (pipep == NULL) goto fail; if (!kvm_read_all(kd, (unsigned long)pipep, &pi, sizeof(struct pipe))) { warnx("can't read pipe at %p", (void *)pipep); goto fail; } ps->addr = (uintptr_t)pipep; ps->peer = (uintptr_t)pi.pipe_peer; ps->buffer_cnt = pi.pipe_buffer.cnt; return (0); fail: if (errbuf != NULL) snprintf(errbuf, _POSIX2_LINE_MAX, "error"); return (1); } static int procstat_get_pipe_info_sysctl(struct filestat *fst, struct pipestat *ps, char *errbuf __unused) { struct kinfo_file *kif; assert(ps); assert(fst); bzero(ps, sizeof(*ps)); kif = fst->fs_typedep; if (kif == NULL) return (1); ps->addr = kif->kf_un.kf_pipe.kf_pipe_addr; ps->peer = kif->kf_un.kf_pipe.kf_pipe_peer; ps->buffer_cnt = kif->kf_un.kf_pipe.kf_pipe_buffer_cnt; return (0); } int procstat_get_pts_info(struct procstat *procstat, struct filestat *fst, struct ptsstat *pts, char *errbuf) { assert(pts); if (procstat->type == PROCSTAT_KVM) { return (procstat_get_pts_info_kvm(procstat->kd, fst, pts, errbuf)); } else if (procstat->type == PROCSTAT_SYSCTL || procstat->type == PROCSTAT_CORE) { return (procstat_get_pts_info_sysctl(fst, pts, errbuf)); } else { warnx("unknown access method: %d", procstat->type); if (errbuf != NULL) snprintf(errbuf, _POSIX2_LINE_MAX, "error"); return (1); } } static int procstat_get_pts_info_kvm(kvm_t *kd, struct filestat *fst, struct ptsstat *pts, char *errbuf) { struct tty tty; void *ttyp; assert(kd); assert(pts); assert(fst); bzero(pts, sizeof(*pts)); ttyp = fst->fs_typedep; if (ttyp == NULL) goto fail; if (!kvm_read_all(kd, (unsigned long)ttyp, &tty, sizeof(struct tty))) { warnx("can't read tty at %p", (void *)ttyp); goto fail; } pts->dev = dev2udev(kd, tty.t_dev); (void)kdevtoname(kd, tty.t_dev, pts->devname); return (0); fail: if (errbuf != NULL) snprintf(errbuf, _POSIX2_LINE_MAX, "error"); return (1); } static int procstat_get_pts_info_sysctl(struct filestat 
*fst, struct ptsstat *pts, char *errbuf __unused) { struct kinfo_file *kif; assert(pts); assert(fst); bzero(pts, sizeof(*pts)); kif = fst->fs_typedep; if (kif == NULL) return (0); pts->dev = kif->kf_un.kf_pts.kf_pts_dev; strlcpy(pts->devname, kif->kf_path, sizeof(pts->devname)); return (0); } int procstat_get_sem_info(struct procstat *procstat, struct filestat *fst, struct semstat *sem, char *errbuf) { assert(sem); if (procstat->type == PROCSTAT_KVM) { return (procstat_get_sem_info_kvm(procstat->kd, fst, sem, errbuf)); } else if (procstat->type == PROCSTAT_SYSCTL || procstat->type == PROCSTAT_CORE) { return (procstat_get_sem_info_sysctl(fst, sem, errbuf)); } else { warnx("unknown access method: %d", procstat->type); if (errbuf != NULL) snprintf(errbuf, _POSIX2_LINE_MAX, "error"); return (1); } } static int procstat_get_sem_info_kvm(kvm_t *kd, struct filestat *fst, struct semstat *sem, char *errbuf) { struct ksem ksem; void *ksemp; char *path; int i; assert(kd); assert(sem); assert(fst); bzero(sem, sizeof(*sem)); ksemp = fst->fs_typedep; if (ksemp == NULL) goto fail; if (!kvm_read_all(kd, (unsigned long)ksemp, &ksem, sizeof(struct ksem))) { warnx("can't read ksem at %p", (void *)ksemp); goto fail; } sem->mode = S_IFREG | ksem.ks_mode; sem->value = ksem.ks_value; if (fst->fs_path == NULL && ksem.ks_path != NULL) { path = malloc(MAXPATHLEN); for (i = 0; i < MAXPATHLEN - 1; i++) { if (!kvm_read_all(kd, (unsigned long)ksem.ks_path + i, path + i, 1)) break; if (path[i] == '\0') break; } path[i] = '\0'; if (i == 0) free(path); else fst->fs_path = path; } return (0); fail: if (errbuf != NULL) snprintf(errbuf, _POSIX2_LINE_MAX, "error"); return (1); } static int procstat_get_sem_info_sysctl(struct filestat *fst, struct semstat *sem, char *errbuf __unused) { struct kinfo_file *kif; assert(sem); assert(fst); bzero(sem, sizeof(*sem)); kif = fst->fs_typedep; if (kif == NULL) return (0); sem->value = kif->kf_un.kf_sem.kf_sem_value; sem->mode = kif->kf_un.kf_sem.kf_sem_mode; return (0); } int procstat_get_shm_info(struct procstat *procstat, struct filestat *fst, struct shmstat *shm, char *errbuf) { assert(shm); if (procstat->type == PROCSTAT_KVM) { return (procstat_get_shm_info_kvm(procstat->kd, fst, shm, errbuf)); } else if (procstat->type == PROCSTAT_SYSCTL || procstat->type == PROCSTAT_CORE) { return (procstat_get_shm_info_sysctl(fst, shm, errbuf)); } else { warnx("unknown access method: %d", procstat->type); if (errbuf != NULL) snprintf(errbuf, _POSIX2_LINE_MAX, "error"); return (1); } } static int procstat_get_shm_info_kvm(kvm_t *kd, struct filestat *fst, struct shmstat *shm, char *errbuf) { struct shmfd shmfd; void *shmfdp; char *path; int i; assert(kd); assert(shm); assert(fst); bzero(shm, sizeof(*shm)); shmfdp = fst->fs_typedep; if (shmfdp == NULL) goto fail; if (!kvm_read_all(kd, (unsigned long)shmfdp, &shmfd, sizeof(struct shmfd))) { warnx("can't read shmfd at %p", (void *)shmfdp); goto fail; } shm->mode = S_IFREG | shmfd.shm_mode; shm->size = shmfd.shm_size; if (fst->fs_path == NULL && shmfd.shm_path != NULL) { path = malloc(MAXPATHLEN); for (i = 0; i < MAXPATHLEN - 1; i++) { if (!kvm_read_all(kd, (unsigned long)shmfd.shm_path + i, path + i, 1)) break; if (path[i] == '\0') break; } path[i] = '\0'; if (i == 0) free(path); else fst->fs_path = path; } return (0); fail: if (errbuf != NULL) snprintf(errbuf, _POSIX2_LINE_MAX, "error"); return (1); } static int procstat_get_shm_info_sysctl(struct filestat *fst, struct shmstat *shm, char *errbuf __unused) { struct kinfo_file *kif; assert(shm); 
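/*
 * The sysctl and core paths decode the object's metadata straight
 * from the kinfo_file record populated by the kernel, so unlike the
 * kvm variants above no kernel address-space reads are needed.
 */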
assert(fst); bzero(shm, sizeof(*shm)); kif = fst->fs_typedep; if (kif == NULL) return (0); shm->size = kif->kf_un.kf_file.kf_file_size; shm->mode = kif->kf_un.kf_file.kf_file_mode; return (0); } int procstat_get_vnode_info(struct procstat *procstat, struct filestat *fst, struct vnstat *vn, char *errbuf) { assert(vn); if (procstat->type == PROCSTAT_KVM) { return (procstat_get_vnode_info_kvm(procstat->kd, fst, vn, errbuf)); } else if (procstat->type == PROCSTAT_SYSCTL || procstat->type == PROCSTAT_CORE) { return (procstat_get_vnode_info_sysctl(fst, vn, errbuf)); } else { warnx("unknown access method: %d", procstat->type); if (errbuf != NULL) snprintf(errbuf, _POSIX2_LINE_MAX, "error"); return (1); } } static int procstat_get_vnode_info_kvm(kvm_t *kd, struct filestat *fst, struct vnstat *vn, char *errbuf) { /* Filesystem specific handlers. */ #define FSTYPE(fst) {#fst, fst##_filestat} struct { const char *tag; int (*handler)(kvm_t *kd, struct vnode *vp, struct vnstat *vn); } fstypes[] = { FSTYPE(devfs), FSTYPE(isofs), FSTYPE(msdosfs), FSTYPE(nfs), FSTYPE(smbfs), FSTYPE(udf), FSTYPE(ufs), #ifdef LIBPROCSTAT_ZFS FSTYPE(zfs), #endif }; #define NTYPES (sizeof(fstypes) / sizeof(*fstypes)) struct vnode vnode; char tagstr[12]; void *vp; int error; unsigned int i; assert(kd); assert(vn); assert(fst); vp = fst->fs_typedep; if (vp == NULL) goto fail; error = kvm_read_all(kd, (unsigned long)vp, &vnode, sizeof(vnode)); if (error == 0) { warnx("can't read vnode at %p", (void *)vp); goto fail; } bzero(vn, sizeof(*vn)); vn->vn_type = vntype2psfsttype(vnode.v_type); if (vnode.v_type == VNON || vnode.v_type == VBAD) return (0); error = kvm_read_all(kd, (unsigned long)vnode.v_lock.lock_object.lo_name, tagstr, sizeof(tagstr)); if (error == 0) { warnx("can't read lo_name at %p", (void *)vp); goto fail; } tagstr[sizeof(tagstr) - 1] = '\0'; /* * Find appropriate handler. */ for (i = 0; i < NTYPES; i++) if (!strcmp(fstypes[i].tag, tagstr)) { if (fstypes[i].handler(kd, &vnode, vn) != 0) { goto fail; } break; } if (i == NTYPES) { if (errbuf != NULL) snprintf(errbuf, _POSIX2_LINE_MAX, "?(%s)", tagstr); return (1); } vn->vn_mntdir = getmnton(kd, vnode.v_mount); if ((vnode.v_type == VBLK || vnode.v_type == VCHR) && vnode.v_rdev != NULL){ vn->vn_dev = dev2udev(kd, vnode.v_rdev); (void)kdevtoname(kd, vnode.v_rdev, vn->vn_devname); } else { vn->vn_dev = -1; } return (0); fail: if (errbuf != NULL) snprintf(errbuf, _POSIX2_LINE_MAX, "error"); return (1); } /* * kinfo vnode type to filestat translation. 
*/ static int kinfo_vtype2fst(int kfvtype) { static struct { int kf_vtype; int fst_vtype; } kfvtypes2fst[] = { { KF_VTYPE_VBAD, PS_FST_VTYPE_VBAD }, { KF_VTYPE_VBLK, PS_FST_VTYPE_VBLK }, { KF_VTYPE_VCHR, PS_FST_VTYPE_VCHR }, { KF_VTYPE_VDIR, PS_FST_VTYPE_VDIR }, { KF_VTYPE_VFIFO, PS_FST_VTYPE_VFIFO }, { KF_VTYPE_VLNK, PS_FST_VTYPE_VLNK }, { KF_VTYPE_VNON, PS_FST_VTYPE_VNON }, { KF_VTYPE_VREG, PS_FST_VTYPE_VREG }, { KF_VTYPE_VSOCK, PS_FST_VTYPE_VSOCK } }; #define NKFVTYPES (sizeof(kfvtypes2fst) / sizeof(*kfvtypes2fst)) unsigned int i; for (i = 0; i < NKFVTYPES; i++) if (kfvtypes2fst[i].kf_vtype == kfvtype) break; if (i == NKFVTYPES) return (PS_FST_VTYPE_UNKNOWN); return (kfvtypes2fst[i].fst_vtype); } static int procstat_get_vnode_info_sysctl(struct filestat *fst, struct vnstat *vn, char *errbuf) { struct statfs stbuf; struct kinfo_file *kif; struct kinfo_vmentry *kve; char *name, *path; uint64_t fileid; uint64_t size; uint64_t fsid; uint64_t rdev; uint16_t mode; int vntype; int status; assert(fst); assert(vn); bzero(vn, sizeof(*vn)); if (fst->fs_typedep == NULL) return (1); if (fst->fs_uflags & PS_FST_UFLAG_MMAP) { kve = fst->fs_typedep; fileid = kve->kve_vn_fileid; fsid = kve->kve_vn_fsid; mode = kve->kve_vn_mode; path = kve->kve_path; rdev = kve->kve_vn_rdev; size = kve->kve_vn_size; vntype = kinfo_vtype2fst(kve->kve_vn_type); status = kve->kve_status; } else { kif = fst->fs_typedep; fileid = kif->kf_un.kf_file.kf_file_fileid; fsid = kif->kf_un.kf_file.kf_file_fsid; mode = kif->kf_un.kf_file.kf_file_mode; path = kif->kf_path; rdev = kif->kf_un.kf_file.kf_file_rdev; size = kif->kf_un.kf_file.kf_file_size; vntype = kinfo_vtype2fst(kif->kf_vnode_type); status = kif->kf_status; } vn->vn_type = vntype; if (vntype == PS_FST_VTYPE_VNON || vntype == PS_FST_VTYPE_VBAD) return (0); if ((status & KF_ATTR_VALID) == 0) { if (errbuf != NULL) { snprintf(errbuf, _POSIX2_LINE_MAX, "? 
(no info available)"); } return (1); } if (path && *path) { statfs(path, &stbuf); vn->vn_mntdir = strdup(stbuf.f_mntonname); } else vn->vn_mntdir = strdup("-"); vn->vn_dev = rdev; if (vntype == PS_FST_VTYPE_VBLK) { name = devname(rdev, S_IFBLK); if (name != NULL) strlcpy(vn->vn_devname, name, sizeof(vn->vn_devname)); } else if (vntype == PS_FST_VTYPE_VCHR) { name = devname(vn->vn_dev, S_IFCHR); if (name != NULL) strlcpy(vn->vn_devname, name, sizeof(vn->vn_devname)); } vn->vn_fsid = fsid; vn->vn_fileid = fileid; vn->vn_size = size; vn->vn_mode = mode; return (0); } int procstat_get_socket_info(struct procstat *procstat, struct filestat *fst, struct sockstat *sock, char *errbuf) { assert(sock); if (procstat->type == PROCSTAT_KVM) { return (procstat_get_socket_info_kvm(procstat->kd, fst, sock, errbuf)); } else if (procstat->type == PROCSTAT_SYSCTL || procstat->type == PROCSTAT_CORE) { return (procstat_get_socket_info_sysctl(fst, sock, errbuf)); } else { warnx("unknown access method: %d", procstat->type); if (errbuf != NULL) snprintf(errbuf, _POSIX2_LINE_MAX, "error"); return (1); } } static int procstat_get_socket_info_kvm(kvm_t *kd, struct filestat *fst, struct sockstat *sock, char *errbuf) { struct domain dom; - struct inpcb inpcb; struct protosw proto; struct socket s; struct unpcb unpcb; ssize_t len; void *so; assert(kd); assert(sock); assert(fst); bzero(sock, sizeof(*sock)); so = fst->fs_typedep; if (so == NULL) goto fail; sock->so_addr = (uintptr_t)so; /* fill in socket */ if (!kvm_read_all(kd, (unsigned long)so, &s, sizeof(struct socket))) { warnx("can't read sock at %p", (void *)so); goto fail; } /* fill in protosw entry */ if (!kvm_read_all(kd, (unsigned long)s.so_proto, &proto, sizeof(struct protosw))) { warnx("can't read protosw at %p", (void *)s.so_proto); goto fail; } /* fill in domain */ if (!kvm_read_all(kd, (unsigned long)proto.pr_domain, &dom, sizeof(struct domain))) { warnx("can't read domain at %p", (void *)proto.pr_domain); goto fail; } if ((len = kvm_read(kd, (unsigned long)dom.dom_name, sock->dname, sizeof(sock->dname) - 1)) < 0) { warnx("can't read domain name at %p", (void *)dom.dom_name); sock->dname[0] = '\0'; } else sock->dname[len] = '\0'; /* * Fill in known data. */ sock->type = s.so_type; sock->proto = proto.pr_protocol; sock->dom_family = dom.dom_family; sock->so_pcb = (uintptr_t)s.so_pcb; + sock->sendq = s.so_snd.sb_ccc; + sock->recvq = s.so_rcv.sb_ccc; + sock->so_rcv_sb_state = s.so_rcv.sb_state; + sock->so_snd_sb_state = s.so_snd.sb_state; /* * Protocol specific data. 
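* The queue depths and buffer states are now taken directly from
* struct socket above for every domain, so the old AF_INET/AF_INET6
* TCP branch, which had to read the inpcb out of kernel memory, is
* gone (hence the removal of the _WANT_INPCB include at the top);
* only AF_UNIX still needs per-protocol data, for the connected peer
* pointer.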
*/ switch (dom.dom_family) { - case AF_INET: - case AF_INET6: - if (proto.pr_protocol == IPPROTO_TCP) { - if (s.so_pcb) { - if (kvm_read(kd, (u_long)s.so_pcb, - (char *)&inpcb, sizeof(struct inpcb)) - != sizeof(struct inpcb)) { - warnx("can't read inpcb at %p", - (void *)s.so_pcb); - } else - sock->inp_ppcb = - (uintptr_t)inpcb.inp_ppcb; - sock->sendq = s.so_snd.sb_ccc; - sock->recvq = s.so_rcv.sb_ccc; - } - } - break; case AF_UNIX: if (s.so_pcb) { if (kvm_read(kd, (u_long)s.so_pcb, (char *)&unpcb, sizeof(struct unpcb)) != sizeof(struct unpcb)){ warnx("can't read unpcb at %p", (void *)s.so_pcb); } else if (unpcb.unp_conn) { - sock->so_rcv_sb_state = s.so_rcv.sb_state; - sock->so_snd_sb_state = s.so_snd.sb_state; sock->unp_conn = (uintptr_t)unpcb.unp_conn; - sock->sendq = s.so_snd.sb_ccc; - sock->recvq = s.so_rcv.sb_ccc; } } break; default: break; } return (0); fail: if (errbuf != NULL) snprintf(errbuf, _POSIX2_LINE_MAX, "error"); return (1); } static int procstat_get_socket_info_sysctl(struct filestat *fst, struct sockstat *sock, char *errbuf __unused) { struct kinfo_file *kif; assert(sock); assert(fst); bzero(sock, sizeof(*sock)); kif = fst->fs_typedep; if (kif == NULL) return (0); /* * Fill in known data. */ sock->type = kif->kf_sock_type; sock->proto = kif->kf_sock_protocol; sock->dom_family = kif->kf_sock_domain; sock->so_pcb = kif->kf_un.kf_sock.kf_sock_pcb; strlcpy(sock->dname, kif->kf_path, sizeof(sock->dname)); bcopy(&kif->kf_un.kf_sock.kf_sa_local, &sock->sa_local, kif->kf_un.kf_sock.kf_sa_local.ss_len); bcopy(&kif->kf_un.kf_sock.kf_sa_peer, &sock->sa_peer, kif->kf_un.kf_sock.kf_sa_peer.ss_len); /* * Protocol specific data. */ switch (sock->dom_family) { case AF_INET: case AF_INET6: if (sock->proto == IPPROTO_TCP) { - sock->inp_ppcb = kif->kf_un.kf_sock.kf_sock_inpcb; sock->sendq = kif->kf_un.kf_sock.kf_sock_sendq; sock->recvq = kif->kf_un.kf_sock.kf_sock_recvq; } break; case AF_UNIX: if (kif->kf_un.kf_sock.kf_sock_unpconn != 0) { sock->so_rcv_sb_state = kif->kf_un.kf_sock.kf_sock_rcv_sb_state; sock->so_snd_sb_state = kif->kf_un.kf_sock.kf_sock_snd_sb_state; sock->unp_conn = kif->kf_un.kf_sock.kf_sock_unpconn; sock->sendq = kif->kf_un.kf_sock.kf_sock_sendq; sock->recvq = kif->kf_un.kf_sock.kf_sock_recvq; } break; default: break; } return (0); } /* * Descriptor flags to filestat translation. */ static int to_filestat_flags(int flags) { static struct { int flag; int fst_flag; } fstflags[] = { { FREAD, PS_FST_FFLAG_READ }, { FWRITE, PS_FST_FFLAG_WRITE }, { O_APPEND, PS_FST_FFLAG_APPEND }, { O_ASYNC, PS_FST_FFLAG_ASYNC }, { O_CREAT, PS_FST_FFLAG_CREAT }, { O_DIRECT, PS_FST_FFLAG_DIRECT }, { O_EXCL, PS_FST_FFLAG_EXCL }, { O_EXEC, PS_FST_FFLAG_EXEC }, { O_EXLOCK, PS_FST_FFLAG_EXLOCK }, { O_NOFOLLOW, PS_FST_FFLAG_NOFOLLOW }, { O_NONBLOCK, PS_FST_FFLAG_NONBLOCK }, { O_SHLOCK, PS_FST_FFLAG_SHLOCK }, { O_SYNC, PS_FST_FFLAG_SYNC }, { O_TRUNC, PS_FST_FFLAG_TRUNC } }; #define NFSTFLAGS (sizeof(fstflags) / sizeof(*fstflags)) int fst_flags; unsigned int i; fst_flags = 0; for (i = 0; i < NFSTFLAGS; i++) if (flags & fstflags[i].flag) fst_flags |= fstflags[i].fst_flag; return (fst_flags); } /* * Vnode type to filestate translation. 
*/ static int vntype2psfsttype(int type) { static struct { int vtype; int fst_vtype; } vt2fst[] = { { VBAD, PS_FST_VTYPE_VBAD }, { VBLK, PS_FST_VTYPE_VBLK }, { VCHR, PS_FST_VTYPE_VCHR }, { VDIR, PS_FST_VTYPE_VDIR }, { VFIFO, PS_FST_VTYPE_VFIFO }, { VLNK, PS_FST_VTYPE_VLNK }, { VNON, PS_FST_VTYPE_VNON }, { VREG, PS_FST_VTYPE_VREG }, { VSOCK, PS_FST_VTYPE_VSOCK } }; #define NVFTYPES (sizeof(vt2fst) / sizeof(*vt2fst)) unsigned int i, fst_type; fst_type = PS_FST_VTYPE_UNKNOWN; for (i = 0; i < NVFTYPES; i++) { if (type == vt2fst[i].vtype) { fst_type = vt2fst[i].fst_vtype; break; } } return (fst_type); } static char * getmnton(kvm_t *kd, struct mount *m) { struct mount mnt; static struct mtab { struct mtab *next; struct mount *m; char mntonname[MNAMELEN + 1]; } *mhead = NULL; struct mtab *mt; for (mt = mhead; mt != NULL; mt = mt->next) if (m == mt->m) return (mt->mntonname); if (!kvm_read_all(kd, (unsigned long)m, &mnt, sizeof(struct mount))) { warnx("can't read mount table at %p", (void *)m); return (NULL); } if ((mt = malloc(sizeof (struct mtab))) == NULL) err(1, NULL); mt->m = m; bcopy(&mnt.mnt_stat.f_mntonname[0], &mt->mntonname[0], MNAMELEN); mt->mntonname[MNAMELEN] = '\0'; mt->next = mhead; mhead = mt; return (mt->mntonname); } /* * Auxiliary structures and functions to get process environment or * command line arguments. */ struct argvec { char *buf; size_t bufsize; char **argv; size_t argc; }; static struct argvec * argvec_alloc(size_t bufsize) { struct argvec *av; av = malloc(sizeof(*av)); if (av == NULL) return (NULL); av->bufsize = bufsize; av->buf = malloc(av->bufsize); if (av->buf == NULL) { free(av); return (NULL); } av->argc = 32; av->argv = malloc(sizeof(char *) * av->argc); if (av->argv == NULL) { free(av->buf); free(av); return (NULL); } return av; } static void argvec_free(struct argvec * av) { free(av->argv); free(av->buf); free(av); } static char ** getargv(struct procstat *procstat, struct kinfo_proc *kp, size_t nchr, int env) { int error, name[4], argc, i; struct argvec *av, **avp; enum psc_type type; size_t len; char *p, **argv; assert(procstat); assert(kp); if (procstat->type == PROCSTAT_KVM) { warnx("can't use kvm access method"); return (NULL); } if (procstat->type != PROCSTAT_SYSCTL && procstat->type != PROCSTAT_CORE) { warnx("unknown access method: %d", procstat->type); return (NULL); } if (nchr == 0 || nchr > ARG_MAX) nchr = ARG_MAX; avp = (struct argvec **)(env ? &procstat->argv : &procstat->envv); av = *avp; if (av == NULL) { av = argvec_alloc(nchr); if (av == NULL) { warn("malloc(%zu)", nchr); return (NULL); } *avp = av; } else if (av->bufsize < nchr) { av->buf = reallocf(av->buf, nchr); if (av->buf == NULL) { warn("malloc(%zu)", nchr); return (NULL); } } if (procstat->type == PROCSTAT_SYSCTL) { name[0] = CTL_KERN; name[1] = KERN_PROC; name[2] = env ? KERN_PROC_ENV : KERN_PROC_ARGS; name[3] = kp->ki_pid; len = nchr; error = sysctl(name, nitems(name), av->buf, &len, NULL, 0); if (error != 0 && errno != ESRCH && errno != EPERM) warn("sysctl(kern.proc.%s)", env ? "env" : "args"); if (error != 0 || len == 0) return (NULL); } else /* procstat->type == PROCSTAT_CORE */ { type = env ? PSC_TYPE_ENVV : PSC_TYPE_ARGV; len = nchr; if (procstat_core_get(procstat->core, type, av->buf, &len) == NULL) { return (NULL); } } argv = av->argv; argc = av->argc; i = 0; for (p = av->buf; p < av->buf + len; p += strlen(p) + 1) { argv[i++] = p; if (i < argc) continue; /* Grow argv. 
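* The vector is doubled each time it fills; the enlarged vector and
* count are stored back into the cached argvec so subsequent calls
* reuse them.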
*/ argc += argc; argv = realloc(argv, sizeof(char *) * argc); if (argv == NULL) { warn("malloc(%zu)", sizeof(char *) * argc); return (NULL); } av->argv = argv; av->argc = argc; } argv[i] = NULL; return (argv); } /* * Return process command line arguments. */ char ** procstat_getargv(struct procstat *procstat, struct kinfo_proc *p, size_t nchr) { return (getargv(procstat, p, nchr, 0)); } /* * Free the buffer allocated by procstat_getargv(). */ void procstat_freeargv(struct procstat *procstat) { if (procstat->argv != NULL) { argvec_free(procstat->argv); procstat->argv = NULL; } } /* * Return process environment. */ char ** procstat_getenvv(struct procstat *procstat, struct kinfo_proc *p, size_t nchr) { return (getargv(procstat, p, nchr, 1)); } /* * Free the buffer allocated by procstat_getenvv(). */ void procstat_freeenvv(struct procstat *procstat) { if (procstat->envv != NULL) { argvec_free(procstat->envv); procstat->envv = NULL; } } static struct kinfo_vmentry * kinfo_getvmmap_core(struct procstat_core *core, int *cntp) { int cnt; size_t len; char *buf, *bp, *eb; struct kinfo_vmentry *kiv, *kp, *kv; buf = procstat_core_get(core, PSC_TYPE_VMMAP, NULL, &len); if (buf == NULL) return (NULL); /* * XXXMG: The code below is just copy&past from libutil. * The code duplication can be avoided if libutil * is extended to provide something like: * struct kinfo_vmentry *kinfo_getvmmap_from_buf(const char *buf, * size_t len, int *cntp); */ /* Pass 1: count items */ cnt = 0; bp = buf; eb = buf + len; while (bp < eb) { kv = (struct kinfo_vmentry *)(uintptr_t)bp; if (kv->kve_structsize == 0) break; bp += kv->kve_structsize; cnt++; } kiv = calloc(cnt, sizeof(*kiv)); if (kiv == NULL) { free(buf); return (NULL); } bp = buf; eb = buf + len; kp = kiv; /* Pass 2: unpack */ while (bp < eb) { kv = (struct kinfo_vmentry *)(uintptr_t)bp; if (kv->kve_structsize == 0) break; /* Copy/expand into pre-zeroed buffer */ memcpy(kp, kv, kv->kve_structsize); /* Advance to next packed record */ bp += kv->kve_structsize; /* Set field size to fixed length, advance */ kp->kve_structsize = sizeof(*kp); kp++; } free(buf); *cntp = cnt; return (kiv); /* Caller must free() return value */ } struct kinfo_vmentry * procstat_getvmmap(struct procstat *procstat, struct kinfo_proc *kp, unsigned int *cntp) { switch (procstat->type) { case PROCSTAT_KVM: warnx("kvm method is not supported"); return (NULL); case PROCSTAT_SYSCTL: return (kinfo_getvmmap(kp->ki_pid, cntp)); case PROCSTAT_CORE: return (kinfo_getvmmap_core(procstat->core, cntp)); default: warnx("unknown access method: %d", procstat->type); return (NULL); } } void procstat_freevmmap(struct procstat *procstat __unused, struct kinfo_vmentry *vmmap) { free(vmmap); } static gid_t * procstat_getgroups_kvm(kvm_t *kd, struct kinfo_proc *kp, unsigned int *cntp) { struct proc proc; struct ucred ucred; gid_t *groups; size_t len; assert(kd != NULL); assert(kp != NULL); if (!kvm_read_all(kd, (unsigned long)kp->ki_paddr, &proc, sizeof(proc))) { warnx("can't read proc struct at %p for pid %d", kp->ki_paddr, kp->ki_pid); return (NULL); } if (proc.p_ucred == NOCRED) return (NULL); if (!kvm_read_all(kd, (unsigned long)proc.p_ucred, &ucred, sizeof(ucred))) { warnx("can't read ucred struct at %p for pid %d", proc.p_ucred, kp->ki_pid); return (NULL); } len = ucred.cr_ngroups * sizeof(gid_t); groups = malloc(len); if (groups == NULL) { warn("malloc(%zu)", len); return (NULL); } if (!kvm_read_all(kd, (unsigned long)ucred.cr_groups, groups, len)) { warnx("can't read groups at %p for pid %d", 
ucred.cr_groups, kp->ki_pid); free(groups); return (NULL); } *cntp = ucred.cr_ngroups; return (groups); } static gid_t * procstat_getgroups_sysctl(pid_t pid, unsigned int *cntp) { int mib[4]; size_t len; gid_t *groups; mib[0] = CTL_KERN; mib[1] = KERN_PROC; mib[2] = KERN_PROC_GROUPS; mib[3] = pid; len = (sysconf(_SC_NGROUPS_MAX) + 1) * sizeof(gid_t); groups = malloc(len); if (groups == NULL) { warn("malloc(%zu)", len); return (NULL); } if (sysctl(mib, nitems(mib), groups, &len, NULL, 0) == -1) { warn("sysctl: kern.proc.groups: %d", pid); free(groups); return (NULL); } *cntp = len / sizeof(gid_t); return (groups); } static gid_t * procstat_getgroups_core(struct procstat_core *core, unsigned int *cntp) { size_t len; gid_t *groups; groups = procstat_core_get(core, PSC_TYPE_GROUPS, NULL, &len); if (groups == NULL) return (NULL); *cntp = len / sizeof(gid_t); return (groups); } gid_t * procstat_getgroups(struct procstat *procstat, struct kinfo_proc *kp, unsigned int *cntp) { switch (procstat->type) { case PROCSTAT_KVM: return (procstat_getgroups_kvm(procstat->kd, kp, cntp)); case PROCSTAT_SYSCTL: return (procstat_getgroups_sysctl(kp->ki_pid, cntp)); case PROCSTAT_CORE: return (procstat_getgroups_core(procstat->core, cntp)); default: warnx("unknown access method: %d", procstat->type); return (NULL); } } void procstat_freegroups(struct procstat *procstat __unused, gid_t *groups) { free(groups); } static int procstat_getumask_kvm(kvm_t *kd, struct kinfo_proc *kp, unsigned short *maskp) { struct pwddesc pd; assert(kd != NULL); assert(kp != NULL); if (kp->ki_pd == NULL) return (-1); if (!kvm_read_all(kd, (unsigned long)kp->ki_pd, &pd, sizeof(pd))) { warnx("can't read pwddesc at %p for pid %d", kp->ki_pd, kp->ki_pid); return (-1); } *maskp = pd.pd_cmask; return (0); } static int procstat_getumask_sysctl(pid_t pid, unsigned short *maskp) { int error; int mib[4]; size_t len; mib[0] = CTL_KERN; mib[1] = KERN_PROC; mib[2] = KERN_PROC_UMASK; mib[3] = pid; len = sizeof(*maskp); error = sysctl(mib, nitems(mib), maskp, &len, NULL, 0); if (error != 0 && errno != ESRCH && errno != EPERM) warn("sysctl: kern.proc.umask: %d", pid); return (error); } static int procstat_getumask_core(struct procstat_core *core, unsigned short *maskp) { size_t len; unsigned short *buf; buf = procstat_core_get(core, PSC_TYPE_UMASK, NULL, &len); if (buf == NULL) return (-1); if (len < sizeof(*maskp)) { free(buf); return (-1); } *maskp = *buf; free(buf); return (0); } int procstat_getumask(struct procstat *procstat, struct kinfo_proc *kp, unsigned short *maskp) { switch (procstat->type) { case PROCSTAT_KVM: return (procstat_getumask_kvm(procstat->kd, kp, maskp)); case PROCSTAT_SYSCTL: return (procstat_getumask_sysctl(kp->ki_pid, maskp)); case PROCSTAT_CORE: return (procstat_getumask_core(procstat->core, maskp)); default: warnx("unknown access method: %d", procstat->type); return (-1); } } static int procstat_getrlimit_kvm(kvm_t *kd, struct kinfo_proc *kp, int which, struct rlimit* rlimit) { struct proc proc; unsigned long offset; assert(kd != NULL); assert(kp != NULL); assert(which >= 0 && which < RLIM_NLIMITS); if (!kvm_read_all(kd, (unsigned long)kp->ki_paddr, &proc, sizeof(proc))) { warnx("can't read proc struct at %p for pid %d", kp->ki_paddr, kp->ki_pid); return (-1); } if (proc.p_limit == NULL) return (-1); offset = (unsigned long)proc.p_limit + sizeof(struct rlimit) * which; if (!kvm_read_all(kd, offset, rlimit, sizeof(*rlimit))) { warnx("can't read rlimit struct at %p for pid %d", (void *)offset, kp->ki_pid); return (-1); } 
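/*
 * The offset arithmetic above relies on the rlimit array being the
 * first member of struct plimit.
 */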
return (0); } static int procstat_getrlimit_sysctl(pid_t pid, int which, struct rlimit* rlimit) { int error, name[5]; size_t len; name[0] = CTL_KERN; name[1] = KERN_PROC; name[2] = KERN_PROC_RLIMIT; name[3] = pid; name[4] = which; len = sizeof(struct rlimit); error = sysctl(name, nitems(name), rlimit, &len, NULL, 0); if (error < 0 && errno != ESRCH) { warn("sysctl: kern.proc.rlimit: %d", pid); return (-1); } if (error < 0 || len != sizeof(struct rlimit)) return (-1); return (0); } static int procstat_getrlimit_core(struct procstat_core *core, int which, struct rlimit* rlimit) { size_t len; struct rlimit* rlimits; if (which < 0 || which >= RLIM_NLIMITS) { errno = EINVAL; warn("getrlimit: which"); return (-1); } rlimits = procstat_core_get(core, PSC_TYPE_RLIMIT, NULL, &len); if (rlimits == NULL) return (-1); if (len < sizeof(struct rlimit) * RLIM_NLIMITS) { free(rlimits); return (-1); } *rlimit = rlimits[which]; free(rlimits); return (0); } int procstat_getrlimit(struct procstat *procstat, struct kinfo_proc *kp, int which, struct rlimit* rlimit) { switch (procstat->type) { case PROCSTAT_KVM: return (procstat_getrlimit_kvm(procstat->kd, kp, which, rlimit)); case PROCSTAT_SYSCTL: return (procstat_getrlimit_sysctl(kp->ki_pid, which, rlimit)); case PROCSTAT_CORE: return (procstat_getrlimit_core(procstat->core, which, rlimit)); default: warnx("unknown access method: %d", procstat->type); return (-1); } } static int procstat_getpathname_sysctl(pid_t pid, char *pathname, size_t maxlen) { int error, name[4]; size_t len; name[0] = CTL_KERN; name[1] = KERN_PROC; name[2] = KERN_PROC_PATHNAME; name[3] = pid; len = maxlen; error = sysctl(name, nitems(name), pathname, &len, NULL, 0); if (error != 0 && errno != ESRCH) warn("sysctl: kern.proc.pathname: %d", pid); if (len == 0) pathname[0] = '\0'; return (error); } static int procstat_getpathname_core(struct procstat_core *core, char *pathname, size_t maxlen) { struct kinfo_file *files; int cnt, i, result; files = kinfo_getfile_core(core, &cnt); if (files == NULL) return (-1); result = -1; for (i = 0; i < cnt; i++) { if (files[i].kf_fd != KF_FD_TYPE_TEXT) continue; strncpy(pathname, files[i].kf_path, maxlen); result = 0; break; } free(files); return (result); } int procstat_getpathname(struct procstat *procstat, struct kinfo_proc *kp, char *pathname, size_t maxlen) { switch (procstat->type) { case PROCSTAT_KVM: /* XXX: Return empty string. 
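* The path is not easily recovered from kernel memory alone, so the
* kvm method just reports an empty string; use the sysctl or core
* access methods when the executable path is needed.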
*/ if (maxlen > 0) pathname[0] = '\0'; return (0); case PROCSTAT_SYSCTL: return (procstat_getpathname_sysctl(kp->ki_pid, pathname, maxlen)); case PROCSTAT_CORE: return (procstat_getpathname_core(procstat->core, pathname, maxlen)); default: warnx("unknown access method: %d", procstat->type); return (-1); } } static int procstat_getosrel_kvm(kvm_t *kd, struct kinfo_proc *kp, int *osrelp) { struct proc proc; assert(kd != NULL); assert(kp != NULL); if (!kvm_read_all(kd, (unsigned long)kp->ki_paddr, &proc, sizeof(proc))) { warnx("can't read proc struct at %p for pid %d", kp->ki_paddr, kp->ki_pid); return (-1); } *osrelp = proc.p_osrel; return (0); } static int procstat_getosrel_sysctl(pid_t pid, int *osrelp) { int error, name[4]; size_t len; name[0] = CTL_KERN; name[1] = KERN_PROC; name[2] = KERN_PROC_OSREL; name[3] = pid; len = sizeof(*osrelp); error = sysctl(name, nitems(name), osrelp, &len, NULL, 0); if (error != 0 && errno != ESRCH) warn("sysctl: kern.proc.osrel: %d", pid); return (error); } static int procstat_getosrel_core(struct procstat_core *core, int *osrelp) { size_t len; int *buf; buf = procstat_core_get(core, PSC_TYPE_OSREL, NULL, &len); if (buf == NULL) return (-1); if (len < sizeof(*osrelp)) { free(buf); return (-1); } *osrelp = *buf; free(buf); return (0); } int procstat_getosrel(struct procstat *procstat, struct kinfo_proc *kp, int *osrelp) { switch (procstat->type) { case PROCSTAT_KVM: return (procstat_getosrel_kvm(procstat->kd, kp, osrelp)); case PROCSTAT_SYSCTL: return (procstat_getosrel_sysctl(kp->ki_pid, osrelp)); case PROCSTAT_CORE: return (procstat_getosrel_core(procstat->core, osrelp)); default: warnx("unknown access method: %d", procstat->type); return (-1); } } #define PROC_AUXV_MAX 256 #ifdef PS_ARCH_HAS_FREEBSD32 static const char *elf32_sv_names[] = { "Linux ELF32", "FreeBSD ELF32", }; static int is_elf32_sysctl(pid_t pid) { int error, name[4]; size_t len, i; char sv_name[32]; name[0] = CTL_KERN; name[1] = KERN_PROC; name[2] = KERN_PROC_SV_NAME; name[3] = pid; len = sizeof(sv_name); error = sysctl(name, nitems(name), sv_name, &len, NULL, 0); if (error != 0 || len == 0) return (0); for (i = 0; i < sizeof(elf32_sv_names) / sizeof(*elf32_sv_names); i++) { if (strncmp(sv_name, elf32_sv_names[i], sizeof(sv_name)) == 0) return (1); } return (0); } static Elf_Auxinfo * procstat_getauxv32_sysctl(pid_t pid, unsigned int *cntp) { Elf_Auxinfo *auxv; Elf32_Auxinfo *auxv32; size_t len; unsigned int i, count; int name[4]; name[0] = CTL_KERN; name[1] = KERN_PROC; name[2] = KERN_PROC_AUXV; name[3] = pid; len = PROC_AUXV_MAX * sizeof(Elf32_Auxinfo); auxv = NULL; auxv32 = malloc(len); if (auxv32 == NULL) { warn("malloc(%zu)", len); goto out; } if (sysctl(name, nitems(name), auxv32, &len, NULL, 0) == -1) { if (errno != ESRCH && errno != EPERM) warn("sysctl: kern.proc.auxv: %d: %d", pid, errno); goto out; } count = len / sizeof(Elf32_Auxinfo); auxv = malloc(count * sizeof(Elf_Auxinfo)); if (auxv == NULL) { warn("malloc(%zu)", count * sizeof(Elf_Auxinfo)); goto out; } for (i = 0; i < count; i++) { /* * XXX: We expect that values for a_type on a 32-bit platform * are directly mapped to values on 64-bit one, which is not * necessarily true. */ auxv[i].a_type = auxv32[i].a_type; /* * Don't sign extend values. Existing entries are positive * integers or pointers. Under freebsd32, programs typically * have a full [0, 2^32) address space (perhaps minus the last * page) and treating this as a signed integer would be * confusing since these are not kernel pointers. 
* * XXX: A more complete translation would be ABI and * type-aware. */ auxv[i].a_un.a_val = (uint32_t)auxv32[i].a_un.a_val; } *cntp = count; out: free(auxv32); return (auxv); } #endif /* PS_ARCH_HAS_FREEBSD32 */ static Elf_Auxinfo * procstat_getauxv_sysctl(pid_t pid, unsigned int *cntp) { Elf_Auxinfo *auxv; int name[4]; size_t len; #ifdef PS_ARCH_HAS_FREEBSD32 if (is_elf32_sysctl(pid)) return (procstat_getauxv32_sysctl(pid, cntp)); #endif name[0] = CTL_KERN; name[1] = KERN_PROC; name[2] = KERN_PROC_AUXV; name[3] = pid; len = PROC_AUXV_MAX * sizeof(Elf_Auxinfo); auxv = malloc(len); if (auxv == NULL) { warn("malloc(%zu)", len); return (NULL); } if (sysctl(name, nitems(name), auxv, &len, NULL, 0) == -1) { if (errno != ESRCH && errno != EPERM) warn("sysctl: kern.proc.auxv: %d: %d", pid, errno); free(auxv); return (NULL); } *cntp = len / sizeof(Elf_Auxinfo); return (auxv); } static Elf_Auxinfo * procstat_getauxv_core(struct procstat_core *core, unsigned int *cntp) { Elf_Auxinfo *auxv; size_t len; auxv = procstat_core_get(core, PSC_TYPE_AUXV, NULL, &len); if (auxv == NULL) return (NULL); *cntp = len / sizeof(Elf_Auxinfo); return (auxv); } Elf_Auxinfo * procstat_getauxv(struct procstat *procstat, struct kinfo_proc *kp, unsigned int *cntp) { switch (procstat->type) { case PROCSTAT_KVM: warnx("kvm method is not supported"); return (NULL); case PROCSTAT_SYSCTL: return (procstat_getauxv_sysctl(kp->ki_pid, cntp)); case PROCSTAT_CORE: return (procstat_getauxv_core(procstat->core, cntp)); default: warnx("unknown access method: %d", procstat->type); return (NULL); } } void procstat_freeauxv(struct procstat *procstat __unused, Elf_Auxinfo *auxv) { free(auxv); } static struct ptrace_lwpinfo * procstat_getptlwpinfo_core(struct procstat_core *core, unsigned int *cntp) { void *buf; struct ptrace_lwpinfo *pl; unsigned int cnt; size_t len; cnt = procstat_core_note_count(core, PSC_TYPE_PTLWPINFO); if (cnt == 0) return (NULL); len = cnt * sizeof(*pl); buf = calloc(1, len); pl = procstat_core_get(core, PSC_TYPE_PTLWPINFO, buf, &len); if (pl == NULL) { free(buf); return (NULL); } *cntp = len / sizeof(*pl); return (pl); } struct ptrace_lwpinfo * procstat_getptlwpinfo(struct procstat *procstat, unsigned int *cntp) { switch (procstat->type) { case PROCSTAT_KVM: warnx("kvm method is not supported"); return (NULL); case PROCSTAT_SYSCTL: warnx("sysctl method is not supported"); return (NULL); case PROCSTAT_CORE: return (procstat_getptlwpinfo_core(procstat->core, cntp)); default: warnx("unknown access method: %d", procstat->type); return (NULL); } } void procstat_freeptlwpinfo(struct procstat *procstat __unused, struct ptrace_lwpinfo *pl) { free(pl); } static struct kinfo_kstack * procstat_getkstack_sysctl(pid_t pid, int *cntp) { struct kinfo_kstack *kkstp; int error, name[4]; size_t len; name[0] = CTL_KERN; name[1] = KERN_PROC; name[2] = KERN_PROC_KSTACK; name[3] = pid; len = 0; error = sysctl(name, nitems(name), NULL, &len, NULL, 0); if (error < 0 && errno != ESRCH && errno != EPERM && errno != ENOENT) { warn("sysctl: kern.proc.kstack: %d", pid); return (NULL); } if (error == -1 && errno == ENOENT) { warnx("sysctl: kern.proc.kstack unavailable" " (options DDB or options STACK required in kernel)"); return (NULL); } if (error == -1) return (NULL); kkstp = malloc(len); if (kkstp == NULL) { warn("malloc(%zu)", len); return (NULL); } if (sysctl(name, nitems(name), kkstp, &len, NULL, 0) == -1 && errno != ENOMEM) { warn("sysctl: kern.proc.pid: %d", pid); free(kkstp); return (NULL); } *cntp = len / sizeof(*kkstp); return 
(kkstp); } struct kinfo_kstack * procstat_getkstack(struct procstat *procstat, struct kinfo_proc *kp, unsigned int *cntp) { switch (procstat->type) { case PROCSTAT_KVM: warnx("kvm method is not supported"); return (NULL); case PROCSTAT_SYSCTL: return (procstat_getkstack_sysctl(kp->ki_pid, cntp)); case PROCSTAT_CORE: warnx("core method is not supported"); return (NULL); default: warnx("unknown access method: %d", procstat->type); return (NULL); } } void procstat_freekstack(struct procstat *procstat __unused, struct kinfo_kstack *kkstp) { free(kkstp); } static struct advlock_list * procstat_getadvlock_sysctl(struct procstat *procstat __unused) { struct advlock_list *res; struct advlock *a; void *buf; char *c; struct kinfo_lockf *kl; size_t buf_len; int error; static const int kl_name[] = { CTL_KERN, KERN_LOCKF }; res = malloc(sizeof(*res)); if (res == NULL) return (NULL); STAILQ_INIT(res); buf = NULL; buf_len = 0; error = sysctl(kl_name, nitems(kl_name), NULL, &buf_len, NULL, 0); if (error != 0) { warn("sysctl KERN_LOCKF size"); goto fail; } buf_len *= 2; buf = malloc(buf_len); if (buf == NULL) { warn("malloc"); goto fail; } error = sysctl(kl_name, nitems(kl_name), buf, &buf_len, NULL, 0); if (error != 0) { warn("sysctl KERN_LOCKF data"); goto fail; } for (c = buf; (char *)c < (char *)buf + buf_len; c += kl->kl_structsize) { kl = (struct kinfo_lockf *)(void *)c; if (sizeof(*kl) < (size_t)kl->kl_structsize) { warn("ABI broken"); goto fail; } a = malloc(sizeof(*a)); if (a == NULL) { warn("malloc advlock"); goto fail; } switch (kl->kl_rw) { case KLOCKF_RW_READ: a->rw = PS_ADVLOCK_RO; break; case KLOCKF_RW_WRITE: a->rw = PS_ADVLOCK_RW; break; default: warn("ABI broken"); free(a); goto fail; } switch (kl->kl_type) { case KLOCKF_TYPE_FLOCK: a->type = PS_ADVLOCK_TYPE_FLOCK; break; case KLOCKF_TYPE_PID: a->type = PS_ADVLOCK_TYPE_PID; break; case KLOCKF_TYPE_REMOTE: a->type = PS_ADVLOCK_TYPE_REMOTE; break; default: warn("ABI broken"); free(a); goto fail; } a->pid = kl->kl_pid; a->sysid = kl->kl_sysid; a->file_fsid = kl->kl_file_fsid; a->file_rdev = kl->kl_file_rdev; a->file_fileid = kl->kl_file_fileid; a->start = kl->kl_start; a->len = kl->kl_len; if (kl->kl_path[0] != '\0') { a->path = strdup(kl->kl_path); if (a->path == NULL) { warn("malloc"); free(a); goto fail; } } else a->path = NULL; STAILQ_INSERT_TAIL(res, a, next); } free(buf); return (res); fail: free(buf); procstat_freeadvlock(procstat, res); return (NULL); } struct advlock_list * procstat_getadvlock(struct procstat *procstat) { switch (procstat->type) { case PROCSTAT_KVM: warnx("kvm method is not supported"); return (NULL); case PROCSTAT_SYSCTL: return (procstat_getadvlock_sysctl(procstat)); case PROCSTAT_CORE: warnx("core method is not supported"); return (NULL); default: warnx("unknown access method: %d", procstat->type); return (NULL); } } void procstat_freeadvlock(struct procstat *procstat __unused, struct advlock_list *lst) { struct advlock *a, *a1; STAILQ_FOREACH_SAFE(a, lst, next, a1) { free(__DECONST(char *, a->path)); free(a); } free(lst); } diff --git a/lib/libprocstat/libprocstat.h b/lib/libprocstat/libprocstat.h index 3d30b4db4018..4aba1610a05f 100644 --- a/lib/libprocstat/libprocstat.h +++ b/lib/libprocstat/libprocstat.h @@ -1,263 +1,262 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2009 Stanislav Sedov * Copyright (c) 2017 Dell EMC * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _LIBPROCSTAT_H_ #define _LIBPROCSTAT_H_ /* * XXX: sys/elf.h conflicts with zfs_context.h. Workaround this by not * including conflicting parts when building zfs code. */ #ifndef ZFS #include #endif #include /* * Vnode types. */ #define PS_FST_VTYPE_VNON 1 #define PS_FST_VTYPE_VREG 2 #define PS_FST_VTYPE_VDIR 3 #define PS_FST_VTYPE_VBLK 4 #define PS_FST_VTYPE_VCHR 5 #define PS_FST_VTYPE_VLNK 6 #define PS_FST_VTYPE_VSOCK 7 #define PS_FST_VTYPE_VFIFO 8 #define PS_FST_VTYPE_VBAD 9 #define PS_FST_VTYPE_UNKNOWN 255 /* * Descriptor types. */ #define PS_FST_TYPE_VNODE 1 #define PS_FST_TYPE_FIFO 2 #define PS_FST_TYPE_SOCKET 3 #define PS_FST_TYPE_PIPE 4 #define PS_FST_TYPE_PTS 5 #define PS_FST_TYPE_KQUEUE 6 /* was PS_FST_TYPE_CRYPTO 7 */ #define PS_FST_TYPE_MQUEUE 8 #define PS_FST_TYPE_SHM 9 #define PS_FST_TYPE_SEM 10 #define PS_FST_TYPE_UNKNOWN 11 #define PS_FST_TYPE_NONE 12 #define PS_FST_TYPE_PROCDESC 13 #define PS_FST_TYPE_DEV 14 #define PS_FST_TYPE_EVENTFD 15 /* * Special descriptor numbers. */ #define PS_FST_UFLAG_RDIR 0x0001 #define PS_FST_UFLAG_CDIR 0x0002 #define PS_FST_UFLAG_JAIL 0x0004 #define PS_FST_UFLAG_TRACE 0x0008 #define PS_FST_UFLAG_TEXT 0x0010 #define PS_FST_UFLAG_MMAP 0x0020 #define PS_FST_UFLAG_CTTY 0x0040 /* * Descriptor flags. */ #define PS_FST_FFLAG_READ 0x0001 #define PS_FST_FFLAG_WRITE 0x0002 #define PS_FST_FFLAG_NONBLOCK 0x0004 #define PS_FST_FFLAG_APPEND 0x0008 #define PS_FST_FFLAG_SHLOCK 0x0010 #define PS_FST_FFLAG_EXLOCK 0x0020 #define PS_FST_FFLAG_ASYNC 0x0040 #define PS_FST_FFLAG_SYNC 0x0080 #define PS_FST_FFLAG_NOFOLLOW 0x0100 #define PS_FST_FFLAG_CREAT 0x0200 #define PS_FST_FFLAG_TRUNC 0x0400 #define PS_FST_FFLAG_EXCL 0x0800 #define PS_FST_FFLAG_DIRECT 0x1000 #define PS_FST_FFLAG_EXEC 0x2000 #define PS_FST_FFLAG_HASLOCK 0x4000 #if !defined(__ILP32__) && !defined(__riscv) /* Target architecture supports 32-bit compat */ #define PS_ARCH_HAS_FREEBSD32 1 #endif struct kinfo_kstack; struct kinfo_proc; struct kinfo_vmentry; struct procstat; struct ptrace_lwpinfo; struct rlimit; struct filestat { int fs_type; /* Descriptor type. */ int fs_flags; /* filestat specific flags. */ int fs_fflags; /* Descriptor access flags. */ int fs_uflags; /* How this file is used. */ int fs_fd; /* File descriptor number. */ int fs_ref_count; /* Reference count. */ off_t fs_offset; /* Seek location. */ void *fs_typedep; /* Type dependent data. 
*/ char *fs_path; STAILQ_ENTRY(filestat) next; cap_rights_t fs_cap_rights; /* Capability rights, if flag set. */ }; struct vnstat { uint64_t vn_fileid; uint64_t vn_size; uint64_t vn_dev; uint64_t vn_fsid; char *vn_mntdir; int vn_type; uint16_t vn_mode; char vn_devname[SPECNAMELEN + 1]; }; struct ptsstat { uint64_t dev; char devname[SPECNAMELEN + 1]; }; struct pipestat { size_t buffer_cnt; uint64_t addr; uint64_t peer; }; struct semstat { uint32_t value; uint16_t mode; }; struct shmstat { uint64_t size; uint16_t mode; }; struct sockstat { - uint64_t inp_ppcb; uint64_t so_addr; uint64_t so_pcb; uint64_t unp_conn; int dom_family; int proto; int so_rcv_sb_state; int so_snd_sb_state; struct sockaddr_storage sa_local; /* Socket address. */ struct sockaddr_storage sa_peer; /* Peer address. */ int type; char dname[32]; unsigned int sendq; unsigned int recvq; }; STAILQ_HEAD(filestat_list, filestat); struct advlock { int rw; /* PS_ADVLOCK_RO/RW */ int type; /* PS_ADVLOCK_TYPE_ */ int pid; int sysid; uint64_t file_fsid; uint64_t file_rdev; uint64_t file_fileid; off_t start; off_t len; /* len == 0 till the EOF */ const char *path; STAILQ_ENTRY(advlock) next; }; #define PS_ADVLOCK_RO 0x01 #define PS_ADVLOCK_RW 0x02 #define PS_ADVLOCK_TYPE_FLOCK 0x01 #define PS_ADVLOCK_TYPE_PID 0x02 #define PS_ADVLOCK_TYPE_REMOTE 0x03 STAILQ_HEAD(advlock_list, advlock); __BEGIN_DECLS void procstat_close(struct procstat *procstat); void procstat_freeadvlock(struct procstat *procstat, struct advlock_list *advlocks); void procstat_freeargv(struct procstat *procstat); #ifndef ZFS void procstat_freeauxv(struct procstat *procstat, Elf_Auxinfo *auxv); #endif void procstat_freeenvv(struct procstat *procstat); void procstat_freegroups(struct procstat *procstat, gid_t *groups); void procstat_freekstack(struct procstat *procstat, struct kinfo_kstack *kkstp); void procstat_freeprocs(struct procstat *procstat, struct kinfo_proc *p); void procstat_freefiles(struct procstat *procstat, struct filestat_list *head); void procstat_freeptlwpinfo(struct procstat *procstat, struct ptrace_lwpinfo *pl); void procstat_freevmmap(struct procstat *procstat, struct kinfo_vmentry *vmmap); struct advlock_list *procstat_getadvlock(struct procstat *procstat); struct filestat_list *procstat_getfiles(struct procstat *procstat, struct kinfo_proc *kp, int mmapped); struct kinfo_proc *procstat_getprocs(struct procstat *procstat, int what, int arg, unsigned int *count); int procstat_get_pipe_info(struct procstat *procstat, struct filestat *fst, struct pipestat *pipe, char *errbuf); int procstat_get_pts_info(struct procstat *procstat, struct filestat *fst, struct ptsstat *pts, char *errbuf); int procstat_get_sem_info(struct procstat *procstat, struct filestat *fst, struct semstat *sem, char *errbuf); int procstat_get_shm_info(struct procstat *procstat, struct filestat *fst, struct shmstat *shm, char *errbuf); int procstat_get_socket_info(struct procstat *procstat, struct filestat *fst, struct sockstat *sock, char *errbuf); int procstat_get_vnode_info(struct procstat *procstat, struct filestat *fst, struct vnstat *vn, char *errbuf); char **procstat_getargv(struct procstat *procstat, struct kinfo_proc *p, size_t nchr); #ifndef ZFS Elf_Auxinfo *procstat_getauxv(struct procstat *procstat, struct kinfo_proc *kp, unsigned int *cntp); #endif struct ptrace_lwpinfo *procstat_getptlwpinfo(struct procstat *procstat, unsigned int *cntp); char **procstat_getenvv(struct procstat *procstat, struct kinfo_proc *p, size_t nchr); gid_t *procstat_getgroups(struct procstat 
*procstat, struct kinfo_proc *kp, unsigned int *count); struct kinfo_kstack *procstat_getkstack(struct procstat *procstat, struct kinfo_proc *kp, unsigned int *count); int procstat_getosrel(struct procstat *procstat, struct kinfo_proc *kp, int *osrelp); int procstat_getpathname(struct procstat *procstat, struct kinfo_proc *kp, char *pathname, size_t maxlen); int procstat_getrlimit(struct procstat *procstat, struct kinfo_proc *kp, int which, struct rlimit* rlimit); int procstat_getumask(struct procstat *procstat, struct kinfo_proc *kp, unsigned short* umask); struct kinfo_vmentry *procstat_getvmmap(struct procstat *procstat, struct kinfo_proc *kp, unsigned int *count); struct procstat *procstat_open_core(const char *filename); struct procstat *procstat_open_sysctl(void); struct procstat *procstat_open_kvm(const char *nlistf, const char *memf); __END_DECLS #endif /* !_LIBPROCSTAT_H_ */ diff --git a/lib/libprocstat/libprocstat_compat.c b/lib/libprocstat/libprocstat_compat.c index 472527973323..63eb35678752 100644 --- a/lib/libprocstat/libprocstat_compat.c +++ b/lib/libprocstat/libprocstat_compat.c @@ -1,259 +1,259 @@ /*- * Copyright (c) 2014 Gleb Kurtsou * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include "libprocstat.h" #define SPECNAMELEN_COMPAT12 63 struct freebsd11_ptsstat { uint32_t dev; char devname[SPECNAMELEN_COMPAT12 + 1]; }; struct freebsd11_vnstat { uint64_t vn_fileid; uint64_t vn_size; char *vn_mntdir; uint32_t vn_dev; uint32_t vn_fsid; int vn_type; uint16_t vn_mode; char vn_devname[SPECNAMELEN_COMPAT12 + 1]; }; struct freebsd11_semstat { uint32_t value; uint16_t mode; }; struct freebsd11_shmstat { uint64_t size; uint16_t mode; }; struct freebsd11_sockstat { uint64_t inp_ppcb; uint64_t so_addr; uint64_t so_pcb; uint64_t unp_conn; int dom_family; int proto; int so_rcv_sb_state; int so_snd_sb_state; struct sockaddr_storage sa_local; /* Socket address. */ struct sockaddr_storage sa_peer; /* Peer address. 
*/ int type; char dname[32]; }; struct freebsd12_vnstat { uint64_t vn_fileid; uint64_t vn_size; uint64_t vn_dev; uint64_t vn_fsid; char *vn_mntdir; int vn_type; uint16_t vn_mode; char vn_devname[SPECNAMELEN_COMPAT12 + 1]; }; struct freebsd12_ptsstat { uint64_t dev; char devname[SPECNAMELEN_COMPAT12 + 1]; }; int freebsd11_procstat_get_pts_info(struct procstat *procstat, struct filestat *fst, struct freebsd11_ptsstat *pts, char *errbuf); int freebsd12_procstat_get_pts_info(struct procstat *procstat, struct filestat *fst, struct freebsd12_ptsstat *pts_compat, char *errbuf); int freebsd11_procstat_get_sem_info(struct procstat *procstat, struct filestat *fst, struct freebsd11_semstat *sem, char *errbuf); int freebsd11_procstat_get_shm_info(struct procstat *procstat, struct filestat *fst, struct freebsd11_shmstat *shm, char *errbuf); int freebsd11_procstat_get_socket_info(struct procstat *procstat, struct filestat *fst, struct freebsd11_sockstat *sock, char *errbuf); int freebsd11_procstat_get_vnode_info(struct procstat *procstat, struct filestat *fst, struct freebsd11_vnstat *vn, char *errbuf); int freebsd12_procstat_get_vnode_info(struct procstat *procstat, struct filestat *fst, struct freebsd12_vnstat *vn_compat, char *errbuf); static const char trunc_name[] = ""; int freebsd11_procstat_get_pts_info(struct procstat *procstat, struct filestat *fst, struct freebsd11_ptsstat *pts_compat, char *errbuf) { struct ptsstat pts; int r; r = procstat_get_pts_info(procstat, fst, &pts, errbuf); if (r != 0) return (r); pts_compat->dev = pts.dev; if (strlen(pts.devname) >= sizeof(pts_compat->devname)) strcpy(pts_compat->devname, trunc_name); else memcpy(pts_compat->devname, pts.devname, sizeof(pts_compat->devname)); return (0); } int freebsd12_procstat_get_pts_info(struct procstat *procstat, struct filestat *fst, struct freebsd12_ptsstat *pts_compat, char *errbuf) { struct ptsstat pts; int r; r = procstat_get_pts_info(procstat, fst, &pts, errbuf); if (r != 0) return (r); pts_compat->dev = pts.dev; if (strlen(pts.devname) >= sizeof(pts_compat->devname)) strcpy(pts_compat->devname, trunc_name); else memcpy(pts_compat->devname, pts.devname, sizeof(pts_compat->devname)); return (0); } int freebsd11_procstat_get_sem_info(struct procstat *procstat, struct filestat *fst, struct freebsd11_semstat *sem_compat, char *errbuf) { struct semstat sem; int r; r = procstat_get_sem_info(procstat, fst, &sem, errbuf); if (r != 0) return (r); sem_compat->value = sem.value; sem_compat->mode = sem.mode; return (0); } int freebsd11_procstat_get_shm_info(struct procstat *procstat, struct filestat *fst, struct freebsd11_shmstat *shm_compat, char *errbuf) { struct shmstat shm; int r; r = procstat_get_shm_info(procstat, fst, &shm, errbuf); if (r != 0) return (r); shm_compat->size = shm.size; shm_compat->mode = shm.mode; return (0); } int freebsd11_procstat_get_socket_info(struct procstat *procstat, struct filestat *fst, struct freebsd11_sockstat *sock_compat, char *errbuf) { struct sockstat sock; int r; r = procstat_get_socket_info(procstat, fst, &sock, errbuf); if (r != 0) return (r); - sock_compat->inp_ppcb = sock.inp_ppcb; + sock_compat->inp_ppcb = sock.so_pcb; sock_compat->so_addr = sock.so_addr; sock_compat->so_pcb = sock.so_pcb; sock_compat->unp_conn = sock.unp_conn; sock_compat->dom_family = sock.dom_family; sock_compat->proto = sock.proto; sock_compat->so_rcv_sb_state = sock.so_rcv_sb_state; sock_compat->so_snd_sb_state = sock.so_snd_sb_state; sock_compat->sa_local = sock.sa_local; sock_compat->sa_peer = sock.sa_peer; 
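	/* type and dname are laid out identically in the old ABI. */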
sock_compat->type = sock.type; memcpy(sock_compat->dname, sock.dname, sizeof(sock.dname)); return (0); } int freebsd11_procstat_get_vnode_info(struct procstat *procstat, struct filestat *fst, struct freebsd11_vnstat *vn_compat, char *errbuf) { struct vnstat vn; int r; r = procstat_get_vnode_info(procstat, fst, &vn, errbuf); if (r != 0) return (r); vn_compat->vn_fileid = vn.vn_fileid; vn_compat->vn_size = vn.vn_size; vn_compat->vn_mntdir = vn.vn_mntdir; vn_compat->vn_dev = vn.vn_dev; vn_compat->vn_fsid = vn.vn_fsid; vn_compat->vn_type = vn.vn_type; vn_compat->vn_mode = vn.vn_mode; if (strlen(vn.vn_devname) >= sizeof(vn_compat->vn_devname)) strcpy(vn_compat->vn_devname, trunc_name); else memcpy(vn_compat->vn_devname, vn.vn_devname, sizeof(vn_compat->vn_devname)); return (0); } int freebsd12_procstat_get_vnode_info(struct procstat *procstat, struct filestat *fst, struct freebsd12_vnstat *vn_compat, char *errbuf) { struct vnstat vn; int r; r = procstat_get_vnode_info(procstat, fst, &vn, errbuf); if (r != 0) return (r); vn_compat->vn_fileid = vn.vn_fileid; vn_compat->vn_size = vn.vn_size; vn_compat->vn_mntdir = vn.vn_mntdir; vn_compat->vn_dev = vn.vn_dev; vn_compat->vn_fsid = vn.vn_fsid; vn_compat->vn_type = vn.vn_type; vn_compat->vn_mode = vn.vn_mode; if (strlen(vn.vn_devname) >= sizeof(vn_compat->vn_devname)) strcpy(vn_compat->vn_devname, trunc_name); else memcpy(vn_compat->vn_devname, vn.vn_devname, sizeof(vn_compat->vn_devname)); return (0); } __sym_compat(procstat_get_pts_info, freebsd11_procstat_get_pts_info, FBSD_1.2); __sym_compat(procstat_get_socket_info, freebsd11_procstat_get_socket_info, FBSD_1.2); __sym_compat(procstat_get_vnode_info, freebsd11_procstat_get_vnode_info, FBSD_1.2); __sym_compat(procstat_get_sem_info, freebsd11_procstat_get_sem_info, FBSD_1.3); __sym_compat(procstat_get_shm_info, freebsd11_procstat_get_shm_info, FBSD_1.3); __sym_compat(procstat_get_pts_info, freebsd12_procstat_get_pts_info, FBSD_1.5); __sym_compat(procstat_get_vnode_info, freebsd12_procstat_get_vnode_info, FBSD_1.5); diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c index 7854a4ebdf52..55837e4cb6ce 100644 --- a/sys/kern/sys_socket.c +++ b/sys/kern/sys_socket.c @@ -1,836 +1,832 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* XXX */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static SYSCTL_NODE(_kern_ipc, OID_AUTO, aio, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "socket AIO stats"); static int empty_results; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, empty_results, CTLFLAG_RD, &empty_results, 0, "socket operation returned EAGAIN"); static int empty_retries; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, empty_retries, CTLFLAG_RD, &empty_retries, 0, "socket operation retries"); static fo_rdwr_t soo_read; static fo_rdwr_t soo_write; static fo_ioctl_t soo_ioctl; static fo_poll_t soo_poll; extern fo_kqfilter_t soo_kqfilter; static fo_stat_t soo_stat; static fo_close_t soo_close; static fo_fill_kinfo_t soo_fill_kinfo; static fo_aio_queue_t soo_aio_queue; static void soo_aio_cancel(struct kaiocb *job); struct fileops socketops = { .fo_read = soo_read, .fo_write = soo_write, .fo_truncate = invfo_truncate, .fo_ioctl = soo_ioctl, .fo_poll = soo_poll, .fo_kqfilter = soo_kqfilter, .fo_stat = soo_stat, .fo_close = soo_close, .fo_chmod = invfo_chmod, .fo_chown = invfo_chown, .fo_sendfile = invfo_sendfile, .fo_fill_kinfo = soo_fill_kinfo, .fo_aio_queue = soo_aio_queue, .fo_cmp = file_kcmp_generic, .fo_flags = DFLAG_PASSABLE }; static int soo_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct socket *so = fp->f_data; int error; #ifdef MAC error = mac_socket_check_receive(active_cred, so); if (error) return (error); #endif error = soreceive(so, 0, uio, 0, 0, 0); return (error); } static int soo_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) { struct socket *so = fp->f_data; int error; #ifdef MAC error = mac_socket_check_send(active_cred, so); if (error) return (error); #endif error = sousrsend(so, NULL, uio, NULL, 0, NULL); return (error); } static int soo_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { struct socket *so = fp->f_data; int error = 0; switch (cmd) { case FIONBIO: SOCK_LOCK(so); if (*(int *)data) so->so_state |= SS_NBIO; else so->so_state &= ~SS_NBIO; SOCK_UNLOCK(so); break; case FIOASYNC: if (*(int *)data) { SOCK_LOCK(so); so->so_state |= SS_ASYNC; if (SOLISTENING(so)) { so->sol_sbrcv_flags |= SB_ASYNC; so->sol_sbsnd_flags |= SB_ASYNC; } else { SOCK_RECVBUF_LOCK(so); so->so_rcv.sb_flags |= SB_ASYNC; SOCK_RECVBUF_UNLOCK(so); SOCK_SENDBUF_LOCK(so); so->so_snd.sb_flags |= SB_ASYNC; SOCK_SENDBUF_UNLOCK(so); } SOCK_UNLOCK(so); } else { SOCK_LOCK(so); so->so_state &= ~SS_ASYNC; if (SOLISTENING(so)) { so->sol_sbrcv_flags &= ~SB_ASYNC; so->sol_sbsnd_flags &= ~SB_ASYNC; } else { SOCK_RECVBUF_LOCK(so); so->so_rcv.sb_flags &= ~SB_ASYNC; SOCK_RECVBUF_UNLOCK(so); SOCK_SENDBUF_LOCK(so); 
so->so_snd.sb_flags &= ~SB_ASYNC; SOCK_SENDBUF_UNLOCK(so); } SOCK_UNLOCK(so); } break; case FIONREAD: SOCK_RECVBUF_LOCK(so); if (SOLISTENING(so)) { error = EINVAL; } else { *(int *)data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; } SOCK_RECVBUF_UNLOCK(so); break; case FIONWRITE: /* Unlocked read. */ if (SOLISTENING(so)) { error = EINVAL; } else { *(int *)data = sbavail(&so->so_snd); } break; case FIONSPACE: /* Unlocked read. */ if (SOLISTENING(so)) { error = EINVAL; } else { if ((so->so_snd.sb_hiwat < sbused(&so->so_snd)) || (so->so_snd.sb_mbmax < so->so_snd.sb_mbcnt)) { *(int *)data = 0; } else { *(int *)data = sbspace(&so->so_snd); } } break; case FIOSETOWN: error = fsetown(*(int *)data, &so->so_sigio); break; case FIOGETOWN: *(int *)data = fgetown(&so->so_sigio); break; case SIOCSPGRP: error = fsetown(-(*(int *)data), &so->so_sigio); break; case SIOCGPGRP: *(int *)data = -fgetown(&so->so_sigio); break; case SIOCATMARK: /* Unlocked read. */ if (SOLISTENING(so)) { error = EINVAL; } else { *(int *)data = (so->so_rcv.sb_state & SBS_RCVATMARK) != 0; } break; default: /* * Interface/routing/protocol specific ioctls: interface and * routing ioctls should have a different entry since a * socket is unnecessary. */ if (IOCGROUP(cmd) == 'i') error = ifioctl(so, cmd, data, td); else if (IOCGROUP(cmd) == 'r') { CURVNET_SET(so->so_vnet); error = rtioctl_fib(cmd, data, so->so_fibnum); CURVNET_RESTORE(); } else { CURVNET_SET(so->so_vnet); error = so->so_proto->pr_control(so, cmd, data, 0, td); CURVNET_RESTORE(); } break; } return (error); } static int soo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { struct socket *so = fp->f_data; #ifdef MAC int error; error = mac_socket_check_poll(active_cred, so); if (error) return (error); #endif return (sopoll(so, events, fp->f_cred, td)); } static int soo_stat(struct file *fp, struct stat *ub, struct ucred *active_cred) { struct socket *so = fp->f_data; int error = 0; bzero((caddr_t)ub, sizeof (*ub)); ub->st_mode = S_IFSOCK; #ifdef MAC error = mac_socket_check_stat(active_cred, so); if (error) return (error); #endif SOCK_LOCK(so); if (!SOLISTENING(so)) { struct sockbuf *sb; /* * If SBS_CANTRCVMORE is set, but there's still data left * in the receive buffer, the socket is still readable. */ sb = &so->so_rcv; SOCK_RECVBUF_LOCK(so); if ((sb->sb_state & SBS_CANTRCVMORE) == 0 || sbavail(sb)) ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH; ub->st_size = sbavail(sb) - sb->sb_ctl; SOCK_RECVBUF_UNLOCK(so); sb = &so->so_snd; SOCK_SENDBUF_LOCK(so); if ((sb->sb_state & SBS_CANTSENDMORE) == 0) ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH; SOCK_SENDBUF_UNLOCK(so); } ub->st_uid = so->so_cred->cr_uid; ub->st_gid = so->so_cred->cr_gid; if (so->so_proto->pr_sense) error = so->so_proto->pr_sense(so, ub); SOCK_UNLOCK(so); return (error); } /* * API socket close on file pointer. We call soclose() to close the socket * (including initiating closing protocols). soclose() will sorele() the * file reference but the actual socket will not go away until the socket's * ref count hits 0. 
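 * The file's f_ops and f_data are cleared before soclose() is called, so
 * no further fileops can reach the socket through this file.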
*/ static int soo_close(struct file *fp, struct thread *td) { int error = 0; struct socket *so; so = fp->f_data; fp->f_ops = &badfileops; fp->f_data = NULL; if (so) error = soclose(so); return (error); } static int soo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct sockaddr_storage ss = { .ss_len = sizeof(ss) }; - struct inpcb *inpcb; struct unpcb *unpcb; struct socket *so; int error; kif->kf_type = KF_TYPE_SOCKET; so = fp->f_data; CURVNET_SET(so->so_vnet); kif->kf_un.kf_sock.kf_sock_domain0 = so->so_proto->pr_domain->dom_family; kif->kf_un.kf_sock.kf_sock_type0 = so->so_type; kif->kf_un.kf_sock.kf_sock_protocol0 = so->so_proto->pr_protocol; kif->kf_un.kf_sock.kf_sock_pcb = (uintptr_t)so->so_pcb; switch (kif->kf_un.kf_sock.kf_sock_domain0) { case AF_INET: case AF_INET6: - if (so->so_pcb != NULL) { - inpcb = (struct inpcb *)(so->so_pcb); - kif->kf_un.kf_sock.kf_sock_inpcb = - (uintptr_t)inpcb->inp_ppcb; - } + /* XXX: kf_sock_inpcb is obsolete. It may be removed. */ + kif->kf_un.kf_sock.kf_sock_inpcb = (uintptr_t)so->so_pcb; kif->kf_un.kf_sock.kf_sock_rcv_sb_state = so->so_rcv.sb_state; kif->kf_un.kf_sock.kf_sock_snd_sb_state = so->so_snd.sb_state; kif->kf_un.kf_sock.kf_sock_sendq = sbused(&so->so_snd); kif->kf_un.kf_sock.kf_sock_recvq = sbused(&so->so_rcv); break; case AF_UNIX: if (so->so_pcb != NULL) { unpcb = (struct unpcb *)(so->so_pcb); if (unpcb->unp_conn) { kif->kf_un.kf_sock.kf_sock_unpconn = (uintptr_t)unpcb->unp_conn; kif->kf_un.kf_sock.kf_sock_rcv_sb_state = so->so_rcv.sb_state; kif->kf_un.kf_sock.kf_sock_snd_sb_state = so->so_snd.sb_state; kif->kf_un.kf_sock.kf_sock_sendq = sbused(&so->so_snd); kif->kf_un.kf_sock.kf_sock_recvq = sbused(&so->so_rcv); } } break; } error = sosockaddr(so, (struct sockaddr *)&ss); if (error == 0 && ss.ss_len <= sizeof(kif->kf_un.kf_sock.kf_sa_local)) { bcopy(&ss, &kif->kf_un.kf_sock.kf_sa_local, ss.ss_len); } ss.ss_len = sizeof(ss); error = sopeeraddr(so, (struct sockaddr *)&ss); if (error == 0 && ss.ss_len <= sizeof(kif->kf_un.kf_sock.kf_sa_peer)) { bcopy(&ss, &kif->kf_un.kf_sock.kf_sa_peer, ss.ss_len); } strncpy(kif->kf_path, so->so_proto->pr_domain->dom_name, sizeof(kif->kf_path)); CURVNET_RESTORE(); return (0); } /* * Use the 'backend3' field in AIO jobs to store the amount of data * completed by the AIO job so far. */ #define aio_done backend3 static STAILQ_HEAD(, task) soaio_jobs; static struct mtx soaio_jobs_lock; static struct task soaio_kproc_task; static int soaio_starting, soaio_idle, soaio_queued; static struct unrhdr *soaio_kproc_unr; static int soaio_max_procs = MAX_AIO_PROCS; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, max_procs, CTLFLAG_RW, &soaio_max_procs, 0, "Maximum number of kernel processes to use for async socket IO"); static int soaio_num_procs; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, num_procs, CTLFLAG_RD, &soaio_num_procs, 0, "Number of active kernel processes for async socket IO"); static int soaio_target_procs = TARGET_AIO_PROCS; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, target_procs, CTLFLAG_RD, &soaio_target_procs, 0, "Preferred number of ready kernel processes for async socket IO"); static int soaio_lifetime; SYSCTL_INT(_kern_ipc_aio, OID_AUTO, lifetime, CTLFLAG_RW, &soaio_lifetime, 0, "Maximum lifetime for idle aiod"); static void soaio_kproc_loop(void *arg) { struct proc *p; struct vmspace *myvm; struct task *task; int error, id, pending; id = (intptr_t)arg; /* * Grab an extra reference on the daemon's vmspace so that it * doesn't get freed by jobs that switch to a different * vmspace. 
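	 * vmspace_switch_aio() below depends on this reference when the
	 * daemon switches back to its own vmspace between jobs.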
*/ p = curproc; myvm = vmspace_acquire_ref(p); mtx_lock(&soaio_jobs_lock); MPASS(soaio_starting > 0); soaio_starting--; for (;;) { while (!STAILQ_EMPTY(&soaio_jobs)) { task = STAILQ_FIRST(&soaio_jobs); STAILQ_REMOVE_HEAD(&soaio_jobs, ta_link); soaio_queued--; pending = task->ta_pending; task->ta_pending = 0; mtx_unlock(&soaio_jobs_lock); task->ta_func(task->ta_context, pending); mtx_lock(&soaio_jobs_lock); } MPASS(soaio_queued == 0); if (p->p_vmspace != myvm) { mtx_unlock(&soaio_jobs_lock); vmspace_switch_aio(myvm); mtx_lock(&soaio_jobs_lock); continue; } soaio_idle++; error = mtx_sleep(&soaio_idle, &soaio_jobs_lock, 0, "-", soaio_lifetime); soaio_idle--; if (error == EWOULDBLOCK && STAILQ_EMPTY(&soaio_jobs) && soaio_num_procs > soaio_target_procs) break; } soaio_num_procs--; mtx_unlock(&soaio_jobs_lock); free_unr(soaio_kproc_unr, id); kproc_exit(0); } static void soaio_kproc_create(void *context, int pending) { struct proc *p; int error, id; mtx_lock(&soaio_jobs_lock); for (;;) { if (soaio_num_procs < soaio_target_procs) { /* Must create */ } else if (soaio_num_procs >= soaio_max_procs) { /* * Hit the limit on kernel processes, don't * create another one. */ break; } else if (soaio_queued <= soaio_idle + soaio_starting) { /* * No more AIO jobs waiting for a process to be * created, so stop. */ break; } soaio_starting++; mtx_unlock(&soaio_jobs_lock); id = alloc_unr(soaio_kproc_unr); error = kproc_create(soaio_kproc_loop, (void *)(intptr_t)id, &p, 0, 0, "soaiod%d", id); if (error != 0) { free_unr(soaio_kproc_unr, id); mtx_lock(&soaio_jobs_lock); soaio_starting--; break; } mtx_lock(&soaio_jobs_lock); soaio_num_procs++; } mtx_unlock(&soaio_jobs_lock); } void soaio_enqueue(struct task *task) { mtx_lock(&soaio_jobs_lock); MPASS(task->ta_pending == 0); task->ta_pending++; STAILQ_INSERT_TAIL(&soaio_jobs, task, ta_link); soaio_queued++; if (soaio_queued <= soaio_idle) wakeup_one(&soaio_idle); else if (soaio_num_procs < soaio_max_procs) taskqueue_enqueue(taskqueue_thread, &soaio_kproc_task); mtx_unlock(&soaio_jobs_lock); } static void soaio_init(void) { soaio_lifetime = AIOD_LIFETIME_DEFAULT; STAILQ_INIT(&soaio_jobs); mtx_init(&soaio_jobs_lock, "soaio jobs", NULL, MTX_DEF); soaio_kproc_unr = new_unrhdr(1, INT_MAX, NULL); TASK_INIT(&soaio_kproc_task, 0, soaio_kproc_create, NULL); } SYSINIT(soaio, SI_SUB_VFS, SI_ORDER_ANY, soaio_init, NULL); static __inline int soaio_ready(struct socket *so, struct sockbuf *sb) { return (sb == &so->so_rcv ? soreadable(so) : sowriteable(so)); } static void soaio_process_job(struct socket *so, sb_which which, struct kaiocb *job) { struct ucred *td_savedcred; struct thread *td; struct sockbuf *sb = sobuf(so, which); #ifdef MAC struct file *fp = job->fd_file; #endif size_t cnt, done, job_total_nbytes __diagused; long ru_before; int error, flags; SOCK_BUF_UNLOCK(so, which); aio_switch_vmspace(job); td = curthread; retry: td_savedcred = td->td_ucred; td->td_ucred = job->cred; job_total_nbytes = job->uiop->uio_resid + job->aio_done; done = job->aio_done; cnt = job->uiop->uio_resid; job->uiop->uio_offset = 0; job->uiop->uio_td = td; flags = MSG_NBIO; /* * For resource usage accounting, only count a completed request * as a single message to avoid counting multiple calls to * sosend/soreceive on a blocking socket. 
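	 * For example, an aio_write() that is requeued and completed by
	 * several sousrsend() calls still counts as one message sent.
	 */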
*/ if (sb == &so->so_rcv) { ru_before = td->td_ru.ru_msgrcv; #ifdef MAC error = mac_socket_check_receive(fp->f_cred, so); if (error == 0) #endif error = soreceive(so, NULL, job->uiop, NULL, NULL, &flags); if (td->td_ru.ru_msgrcv != ru_before) job->msgrcv = 1; } else { if (!TAILQ_EMPTY(&sb->sb_aiojobq)) flags |= MSG_MORETOCOME; ru_before = td->td_ru.ru_msgsnd; #ifdef MAC error = mac_socket_check_send(fp->f_cred, so); if (error == 0) #endif error = sousrsend(so, NULL, job->uiop, NULL, flags, job->userproc); if (td->td_ru.ru_msgsnd != ru_before) job->msgsnd = 1; } done += cnt - job->uiop->uio_resid; job->aio_done = done; td->td_ucred = td_savedcred; if (error == EWOULDBLOCK) { /* * The request was either partially completed or not * completed at all due to racing with a read() or * write() on the socket. If the socket is * non-blocking, return with any partial completion. * If the socket is blocking or if no progress has * been made, requeue this request at the head of the * queue to try again when the socket is ready. */ MPASS(done != job_total_nbytes); SOCK_BUF_LOCK(so, which); if (done == 0 || !(so->so_state & SS_NBIO)) { empty_results++; if (soaio_ready(so, sb)) { empty_retries++; SOCK_BUF_UNLOCK(so, which); goto retry; } if (!aio_set_cancel_function(job, soo_aio_cancel)) { SOCK_BUF_UNLOCK(so, which); if (done != 0) aio_complete(job, done, 0); else aio_cancel(job); SOCK_BUF_LOCK(so, which); } else { TAILQ_INSERT_HEAD(&sb->sb_aiojobq, job, list); } return; } SOCK_BUF_UNLOCK(so, which); } if (done != 0 && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; if (error) aio_complete(job, -1, error); else aio_complete(job, done, 0); SOCK_BUF_LOCK(so, which); } static void soaio_process_sb(struct socket *so, sb_which which) { struct kaiocb *job; struct sockbuf *sb = sobuf(so, which); CURVNET_SET(so->so_vnet); SOCK_BUF_LOCK(so, which); while (!TAILQ_EMPTY(&sb->sb_aiojobq) && soaio_ready(so, sb)) { job = TAILQ_FIRST(&sb->sb_aiojobq); TAILQ_REMOVE(&sb->sb_aiojobq, job, list); if (!aio_clear_cancel_function(job)) continue; soaio_process_job(so, which, job); } /* * If there are still pending requests, the socket must not be * ready so set SB_AIO to request a wakeup when the socket * becomes ready. 
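	 * The wakeup comes via sowakeup_aio(), which clears SB_AIO and
	 * re-enqueues sb_aiotask unless SB_AIO_RUNNING is already set.
	 */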
*/ if (!TAILQ_EMPTY(&sb->sb_aiojobq)) sb->sb_flags |= SB_AIO; sb->sb_flags &= ~SB_AIO_RUNNING; SOCK_BUF_UNLOCK(so, which); sorele(so); CURVNET_RESTORE(); } void soaio_rcv(void *context, int pending) { struct socket *so; so = context; soaio_process_sb(so, SO_RCV); } void soaio_snd(void *context, int pending) { struct socket *so; so = context; soaio_process_sb(so, SO_SND); } void sowakeup_aio(struct socket *so, sb_which which) { struct sockbuf *sb = sobuf(so, which); SOCK_BUF_LOCK_ASSERT(so, which); sb->sb_flags &= ~SB_AIO; if (sb->sb_flags & SB_AIO_RUNNING) return; sb->sb_flags |= SB_AIO_RUNNING; soref(so); soaio_enqueue(&sb->sb_aiotask); } static void soo_aio_cancel(struct kaiocb *job) { struct socket *so; struct sockbuf *sb; long done; int opcode; sb_which which; so = job->fd_file->f_data; opcode = job->uaiocb.aio_lio_opcode; if (opcode & LIO_READ) { sb = &so->so_rcv; which = SO_RCV; } else { MPASS(opcode & LIO_WRITE); sb = &so->so_snd; which = SO_SND; } SOCK_BUF_LOCK(so, which); if (!aio_cancel_cleared(job)) TAILQ_REMOVE(&sb->sb_aiojobq, job, list); if (TAILQ_EMPTY(&sb->sb_aiojobq)) sb->sb_flags &= ~SB_AIO; SOCK_BUF_UNLOCK(so, which); done = job->aio_done; if (done != 0) aio_complete(job, done, 0); else aio_cancel(job); } static int soo_aio_queue(struct file *fp, struct kaiocb *job) { struct socket *so; struct sockbuf *sb; sb_which which; int error; so = fp->f_data; error = so->so_proto->pr_aio_queue(so, job); if (error == 0) return (0); /* Lock through the socket, since this may be a listening socket. */ switch (job->uaiocb.aio_lio_opcode & (LIO_WRITE | LIO_READ)) { case LIO_READ: SOCK_RECVBUF_LOCK(so); sb = &so->so_rcv; which = SO_RCV; break; case LIO_WRITE: SOCK_SENDBUF_LOCK(so); sb = &so->so_snd; which = SO_SND; break; default: return (EINVAL); } if (SOLISTENING(so)) { SOCK_BUF_UNLOCK(so, which); return (EINVAL); } if (!aio_set_cancel_function(job, soo_aio_cancel)) panic("new job was cancelled"); TAILQ_INSERT_TAIL(&sb->sb_aiojobq, job, list); if (!(sb->sb_flags & SB_AIO_RUNNING)) { if (soaio_ready(so, sb)) sowakeup_aio(so, which); else sb->sb_flags |= SB_AIO; } SOCK_BUF_UNLOCK(so, which); return (0); } diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index d966f8e386fe..d9caad6417ef 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -1,3507 +1,3498 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1991, 1993, 1995 * The Regents of the University of California. * Copyright (c) 2007-2009 Robert N. M. Watson * Copyright (c) 2010-2011 Juniper Networks, Inc. * Copyright (c) 2021-2022 Gleb Smirnoff * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include "opt_ddb.h" #include "opt_ipsec.h" #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ratelimit.h" #include "opt_route.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DDB #include #endif #include #include #include #include #include #include #include #include #include #include #if defined(INET) || defined(INET6) #include #include #include #include #ifdef INET #include #include #endif #include #ifdef INET6 #include #include #include #include #endif /* INET6 */ #include #endif #include #include #define INPCBLBGROUP_SIZMIN 8 #define INPCBLBGROUP_SIZMAX 256 #define INP_FREED 0x00000200 /* Went through in_pcbfree(). */ #define INP_INLBGROUP 0x01000000 /* Inserted into inpcblbgroup. */ /* * These configure the range of local port addresses assigned to * "unspecified" outgoing connections/packets/whatever. */ VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */ VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */ VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */ VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */ VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */ /* * Reserved ports accessible only to root. There are significant * security considerations that must be accounted for when changing these, * but the security benefits can be great. Please be careful. */ VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */ VNET_DEFINE(int, ipport_reservedlow); /* Enable random ephemeral port allocation by default. 
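 * Runtime-tunable via the net.inet.ip.portrange.randomized sysctl.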
*/ VNET_DEFINE(int, ipport_randomized) = 1; #ifdef INET static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, uint8_t numa_domain); #define RANGECHK(var, min, max) \ if ((var) < (min)) { (var) = (min); } \ else if ((var) > (max)) { (var) = (max); } static int sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) { int error; error = sysctl_handle_int(oidp, arg1, arg2, req); if (error == 0) { RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); } return (error); } #undef RANGECHK static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "IP Ports"); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, &VNET_NAME(ipport_reservedhigh), 0, ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, ""); SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); #ifdef RATELIMIT counter_u64_t rate_limit_new; counter_u64_t rate_limit_chg; counter_u64_t rate_limit_active; counter_u64_t rate_limit_alloc_fail; counter_u64_t rate_limit_set_ok; static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "IP Rate Limiting"); SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD, &rate_limit_active, "Active rate limited connections"); SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD, &rate_limit_alloc_fail, "Rate limited connection failures"); SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD, &rate_limit_set_ok, "Rate limited setting succeeded"); SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD, &rate_limit_new, "Total Rate limit new attempts"); SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD, &rate_limit_chg, "Total Rate limited change attempts"); #endif /* RATELIMIT */ #endif /* INET */ VNET_DEFINE(uint32_t, in_pcbhashseed); static void in_pcbhashseed_init(void) { V_in_pcbhashseed = 
arc4random(); } VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, in_pcbhashseed_init, 0); static void in_pcbremhash(struct inpcb *); /* * in_pcb.c: manage the Protocol Control Blocks. * * NOTE: It is assumed that most of these functions will be called with * the pcbinfo lock held, and often, the inpcb lock held, as these utility * functions often modify hash chains or addresses in pcbs. */ static struct inpcblbgroup * in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, struct ucred *cred, u_char vflag, uint16_t port, const union in_dependaddr *addr, int size, uint8_t numa_domain) { struct inpcblbgroup *grp; size_t bytes; bytes = __offsetof(struct inpcblbgroup, il_inp[size]); grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT); if (grp == NULL) return (NULL); grp->il_cred = crhold(cred); grp->il_vflag = vflag; grp->il_lport = port; grp->il_numa_domain = numa_domain; grp->il_dependladdr = *addr; grp->il_inpsiz = size; CK_LIST_INSERT_HEAD(hdr, grp, il_list); return (grp); } static void in_pcblbgroup_free_deferred(epoch_context_t ctx) { struct inpcblbgroup *grp; grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx); crfree(grp->il_cred); free(grp, M_PCB); } static void in_pcblbgroup_free(struct inpcblbgroup *grp) { CK_LIST_REMOVE(grp, il_list); NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx); } static struct inpcblbgroup * in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, struct inpcblbgroup *old_grp, int size) { struct inpcblbgroup *grp; int i; grp = in_pcblbgroup_alloc(hdr, old_grp->il_cred, old_grp->il_vflag, old_grp->il_lport, &old_grp->il_dependladdr, size, old_grp->il_numa_domain); if (grp == NULL) return (NULL); KASSERT(old_grp->il_inpcnt < grp->il_inpsiz, ("invalid new local group size %d and old local group count %d", grp->il_inpsiz, old_grp->il_inpcnt)); for (i = 0; i < old_grp->il_inpcnt; ++i) grp->il_inp[i] = old_grp->il_inp[i]; grp->il_inpcnt = old_grp->il_inpcnt; in_pcblbgroup_free(old_grp); return (grp); } /* * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i] * and shrink group if possible. */ static void in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp, int i) { struct inpcblbgroup *grp, *new_grp; grp = *grpp; for (; i + 1 < grp->il_inpcnt; ++i) grp->il_inp[i] = grp->il_inp[i + 1]; grp->il_inpcnt--; if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN && grp->il_inpcnt <= grp->il_inpsiz / 4) { /* Shrink this group. */ new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2); if (new_grp != NULL) *grpp = new_grp; } } /* * Add PCB to load balance group for SO_REUSEPORT_LB option. */ static int in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain) { const static struct timeval interval = { 60, 0 }; static struct timeval lastprint; struct inpcbinfo *pcbinfo; struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; uint32_t idx; pcbinfo = inp->inp_pcbinfo; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); #ifdef INET6 /* * Don't allow IPv4 mapped INET6 wild socket. 
*/ if ((inp->inp_vflag & INP_IPV4) && inp->inp_laddr.s_addr == INADDR_ANY && INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) { return (0); } #endif idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask); hdr = &pcbinfo->ipi_lbgrouphashbase[idx]; CK_LIST_FOREACH(grp, hdr, il_list) { if (grp->il_cred->cr_prison == inp->inp_cred->cr_prison && grp->il_vflag == inp->inp_vflag && grp->il_lport == inp->inp_lport && grp->il_numa_domain == numa_domain && memcmp(&grp->il_dependladdr, &inp->inp_inc.inc_ie.ie_dependladdr, sizeof(grp->il_dependladdr)) == 0) { break; } } if (grp == NULL) { /* Create new load balance group. */ grp = in_pcblbgroup_alloc(hdr, inp->inp_cred, inp->inp_vflag, inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, INPCBLBGROUP_SIZMIN, numa_domain); if (grp == NULL) return (ENOBUFS); } else if (grp->il_inpcnt == grp->il_inpsiz) { if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) { if (ratecheck(&lastprint, &interval)) printf("lb group port %d, limit reached\n", ntohs(grp->il_lport)); return (0); } /* Expand this local group. */ grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2); if (grp == NULL) return (ENOBUFS); } KASSERT(grp->il_inpcnt < grp->il_inpsiz, ("invalid local group size %d and count %d", grp->il_inpsiz, grp->il_inpcnt)); grp->il_inp[grp->il_inpcnt] = inp; grp->il_inpcnt++; inp->inp_flags |= INP_INLBGROUP; return (0); } /* * Remove PCB from load balance group. */ static void in_pcbremlbgrouphash(struct inpcb *inp) { struct inpcbinfo *pcbinfo; struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; int i; pcbinfo = inp->inp_pcbinfo; INP_WLOCK_ASSERT(inp); MPASS(inp->inp_flags & INP_INLBGROUP); INP_HASH_WLOCK_ASSERT(pcbinfo); hdr = &pcbinfo->ipi_lbgrouphashbase[ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; CK_LIST_FOREACH(grp, hdr, il_list) { for (i = 0; i < grp->il_inpcnt; ++i) { if (grp->il_inp[i] != inp) continue; if (grp->il_inpcnt == 1) { /* We are the last, free this local group. */ in_pcblbgroup_free(grp); } else { /* Pull up inpcbs, shrink group if possible. */ in_pcblbgroup_reorder(hdr, &grp, i); } inp->inp_flags &= ~INP_INLBGROUP; return; } } KASSERT(0, ("%s: did not find %p", __func__, inp)); } int in_pcblbgroup_numa(struct inpcb *inp, int arg) { struct inpcbinfo *pcbinfo; struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; int err, i; uint8_t numa_domain; switch (arg) { case TCP_REUSPORT_LB_NUMA_NODOM: numa_domain = M_NODOM; break; case TCP_REUSPORT_LB_NUMA_CURDOM: numa_domain = PCPU_GET(domain); break; default: if (arg < 0 || arg >= vm_ndomains) return (EINVAL); numa_domain = arg; } err = 0; pcbinfo = inp->inp_pcbinfo; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK(pcbinfo); hdr = &pcbinfo->ipi_lbgrouphashbase[ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; CK_LIST_FOREACH(grp, hdr, il_list) { for (i = 0; i < grp->il_inpcnt; ++i) { if (grp->il_inp[i] != inp) continue; if (grp->il_numa_domain == numa_domain) { goto abort_with_hash_wlock; } /* Remove it from the old group. */ in_pcbremlbgrouphash(inp); /* Add it to the new group based on numa domain. */ in_pcbinslbgrouphash(inp, numa_domain); goto abort_with_hash_wlock; } } err = ENOENT; abort_with_hash_wlock: INP_HASH_WUNLOCK(pcbinfo); return (err); } /* Make sure it is safe to use hashinit(9) on CK_LIST. */ CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb))); /* * Initialize an inpcbinfo - a per-VNET instance of connections db. 
*/ void in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor, u_int hash_nelements, u_int porthash_nelements) { mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF); mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name, NULL, MTX_DEF); #ifdef VIMAGE pcbinfo->ipi_vnet = curvnet; #endif CK_LIST_INIT(&pcbinfo->ipi_listhead); pcbinfo->ipi_count = 0; pcbinfo->ipi_hash_exact = hashinit(hash_nelements, M_PCB, &pcbinfo->ipi_hashmask); pcbinfo->ipi_hash_wild = hashinit(hash_nelements, M_PCB, &pcbinfo->ipi_hashmask); porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1); pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_porthashmask); pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_lbgrouphashmask); pcbinfo->ipi_zone = pcbstor->ips_zone; pcbinfo->ipi_portzone = pcbstor->ips_portzone; pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone); } /* * Destroy an inpcbinfo. */ void in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) { KASSERT(pcbinfo->ipi_count == 0, ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); hashdestroy(pcbinfo->ipi_hash_exact, M_PCB, pcbinfo->ipi_hashmask); hashdestroy(pcbinfo->ipi_hash_wild, M_PCB, pcbinfo->ipi_hashmask); hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, pcbinfo->ipi_porthashmask); hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, pcbinfo->ipi_lbgrouphashmask); mtx_destroy(&pcbinfo->ipi_hash_lock); mtx_destroy(&pcbinfo->ipi_lock); } /* * Initialize a pcbstorage - per protocol zones to allocate inpcbs. */ static void inpcb_fini(void *, int); void in_pcbstorage_init(void *arg) { struct inpcbstorage *pcbstor = arg; pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name, pcbstor->ips_size, NULL, NULL, pcbstor->ips_pcbinit, inpcb_fini, UMA_ALIGN_CACHE, UMA_ZONE_SMR); pcbstor->ips_portzone = uma_zcreate(pcbstor->ips_portzone_name, sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_smr(pcbstor->ips_portzone, uma_zone_get_smr(pcbstor->ips_zone)); } /* * Destroy a pcbstorage - used by unloadable protocols. */ void in_pcbstorage_destroy(void *arg) { struct inpcbstorage *pcbstor = arg; uma_zdestroy(pcbstor->ips_zone); uma_zdestroy(pcbstor->ips_portzone); } /* * Allocate a PCB and associate it with the socket. * On success return with the PCB locked. 
*/ int in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) { struct inpcb *inp; #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) int error; #endif inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT); if (inp == NULL) return (ENOBUFS); bzero(&inp->inp_start_zero, inp_zero_size); #ifdef NUMA inp->inp_numa_domain = M_NODOM; #endif inp->inp_pcbinfo = pcbinfo; inp->inp_socket = so; inp->inp_cred = crhold(so->so_cred); inp->inp_inc.inc_fibnum = so->so_fibnum; #ifdef MAC error = mac_inpcb_init(inp, M_NOWAIT); if (error != 0) goto out; mac_inpcb_create(so, inp); #endif #if defined(IPSEC) || defined(IPSEC_SUPPORT) error = ipsec_init_pcbpolicy(inp); if (error != 0) { #ifdef MAC mac_inpcb_destroy(inp); #endif goto out; } #endif /*IPSEC*/ #ifdef INET6 if (INP_SOCKAF(so) == AF_INET6) { inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6; if (V_ip6_v6only) inp->inp_flags |= IN6P_IPV6_V6ONLY; #ifdef INET else inp->inp_vflag |= INP_IPV4; #endif if (V_ip6_auto_flowlabel) inp->inp_flags |= IN6P_AUTOFLOWLABEL; inp->in6p_hops = -1; /* use kernel default */ } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET inp->inp_vflag |= INP_IPV4; #endif inp->inp_smr = SMR_SEQ_INVALID; /* * Routes in inpcb's can cache L2 as well; they are guaranteed * to be cleaned up. */ inp->inp_route.ro_flags = RT_LLE_CACHE; refcount_init(&inp->inp_refcount, 1); /* Reference from socket. */ INP_WLOCK(inp); INP_INFO_WLOCK(pcbinfo); pcbinfo->ipi_count++; inp->inp_gencnt = ++pcbinfo->ipi_gencnt; CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list); INP_INFO_WUNLOCK(pcbinfo); so->so_pcb = inp; return (0); #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) out: uma_zfree_smr(pcbinfo->ipi_zone, inp); return (error); #endif } #ifdef INET int in_pcbbind(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred) { int anonport, error; KASSERT(sin == NULL || sin->sin_family == AF_INET, ("%s: invalid address family for %p", __func__, sin)); KASSERT(sin == NULL || sin->sin_len == sizeof(struct sockaddr_in), ("%s: invalid address length for %p", __func__, sin)); INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) return (EINVAL); anonport = sin == NULL || sin->sin_port == 0; error = in_pcbbind_setup(inp, sin, &inp->inp_laddr.s_addr, &inp->inp_lport, cred); if (error) return (error); if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; return (EAGAIN); } if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } #endif #if defined(INET) || defined(INET6) /* * Assign a local port like in_pcb_lport(), but also used with connect() * and a foreign address and port. If fsa is non-NULL, choose a local port * that is unused with those, otherwise one that is completely unused. * lsa can be NULL for IPv6. */ int in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp, struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags) { struct inpcbinfo *pcbinfo; struct inpcb *tmpinp; unsigned short *lastport; int count, error; u_short aux, first, last, lport; #ifdef INET struct in_addr laddr, faddr; #endif #ifdef INET6 struct in6_addr *laddr6, *faddr6; #endif pcbinfo = inp->inp_pcbinfo; /* * Because no actual state changes occur here, a global write lock on * the pcbinfo isn't required. 
 */
*/ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(pcbinfo); if (inp->inp_flags & INP_HIGHPORT) { first = V_ipport_hifirstauto; /* sysctl */ last = V_ipport_hilastauto; lastport = &pcbinfo->ipi_lasthi; } else if (inp->inp_flags & INP_LOWPORT) { error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT); if (error) return (error); first = V_ipport_lowfirstauto; /* 1023 */ last = V_ipport_lowlastauto; /* 600 */ lastport = &pcbinfo->ipi_lastlow; } else { first = V_ipport_firstauto; /* sysctl */ last = V_ipport_lastauto; lastport = &pcbinfo->ipi_lastport; } /* * Instead of having two loops further down counting up or down * make sure that first is always <= last and go with only one * code path implementing all logic. */ if (first > last) { aux = first; first = last; last = aux; } #ifdef INET laddr.s_addr = INADDR_ANY; /* used by INET6+INET below too */ if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) { if (lsa != NULL) laddr = ((struct sockaddr_in *)lsa)->sin_addr; if (fsa != NULL) faddr = ((struct sockaddr_in *)fsa)->sin_addr; } #endif #ifdef INET6 laddr6 = NULL; if ((inp->inp_vflag & INP_IPV6) != 0) { if (lsa != NULL) laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr; if (fsa != NULL) faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr; } #endif tmpinp = NULL; lport = *lportp; if (V_ipport_randomized) *lastport = first + (arc4random() % (last - first)); count = last - first; do { if (count-- < 0) /* completely used? */ return (EADDRNOTAVAIL); ++*lastport; if (*lastport < first || *lastport > last) *lastport = first; lport = htons(*lastport); if (fsa != NULL) { #ifdef INET if (lsa->sa_family == AF_INET) { tmpinp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, lookupflags, M_NODOM); } #endif #ifdef INET6 if (lsa->sa_family == AF_INET6) { tmpinp = in6_pcblookup_hash_locked(pcbinfo, faddr6, fport, laddr6, lport, lookupflags, M_NODOM); } #endif } else { #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { tmpinp = in6_pcblookup_local(pcbinfo, &inp->in6p_laddr, lport, lookupflags, cred); #ifdef INET if (tmpinp == NULL && (inp->inp_vflag & INP_IPV4)) tmpinp = in_pcblookup_local(pcbinfo, laddr, lport, lookupflags, cred); #endif } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET tmpinp = in_pcblookup_local(pcbinfo, laddr, lport, lookupflags, cred); #endif } } while (tmpinp != NULL); *lportp = lport; return (0); } /* * Select a local port (number) to use. */ int in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, struct ucred *cred, int lookupflags) { struct sockaddr_in laddr; if (laddrp) { bzero(&laddr, sizeof(laddr)); laddr.sin_family = AF_INET; laddr.sin_addr = *laddrp; } return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr : NULL, lportp, NULL, 0, cred, lookupflags)); } #endif /* INET || INET6 */ #ifdef INET /* * Set up a bind operation on a PCB, performing port allocation * as required, but do not actually modify the PCB. Callers can * either complete the bind by setting inp_laddr/inp_lport and * calling in_pcbinshash(), or they can just use the resulting * port and address to authorise the sending of a once-off packet. * * On error, the values of *laddrp and *lportp are not changed. 
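 *
 * When no specific port is requested, in_pcbbind_setup() falls back on the
 * randomized allocator above.  Its probe loop, distilled (a sketch only;
 * port_busy() is a hypothetical stand-in for the in*_pcblookup_*()
 * collision checks):
 *
 *	count = last - first;
 *	if (V_ipport_randomized)
 *		*lastport = first + (arc4random() % (last - first));
 *	do {
 *		if (count-- < 0)
 *			return (EADDRNOTAVAIL);
 *		++*lastport;
 *		if (*lastport < first || *lastport > last)
 *			*lastport = first;
 *		lport = htons(*lastport);
 *	} while (port_busy(lport));
 *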
*/ int in_pcbbind_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp, u_short *lportp, struct ucred *cred) { struct socket *so = inp->inp_socket; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct in_addr laddr; u_short lport = 0; int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT); int error; /* * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here * so that we don't have to add to the (already messy) code below. */ int reuseport_lb = (so->so_options & SO_REUSEPORT_LB); /* * No state changes, so read locks are sufficient here. */ INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(pcbinfo); laddr.s_addr = *laddrp; if (sin != NULL && laddr.s_addr != INADDR_ANY) return (EINVAL); if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0) lookupflags = INPLOOKUP_WILDCARD; if (sin == NULL) { if ((error = prison_local_ip4(cred, &laddr)) != 0) return (error); } else { KASSERT(sin->sin_family == AF_INET, ("%s: invalid family for address %p", __func__, sin)); KASSERT(sin->sin_len == sizeof(*sin), ("%s: invalid length for address %p", __func__, sin)); error = prison_local_ip4(cred, &sin->sin_addr); if (error) return (error); if (sin->sin_port != *lportp) { /* Don't allow the port to change. */ if (*lportp != 0) return (EINVAL); lport = sin->sin_port; } /* NB: lport is left as 0 if the port isn't being changed. */ if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { /* * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; * allow complete duplication of binding if * SO_REUSEPORT is set, or if SO_REUSEADDR is set * and a multicast address is bound on both * new and duplicated sockets. */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) reuseport = SO_REUSEADDR|SO_REUSEPORT; /* * XXX: How to deal with SO_REUSEPORT_LB here? * Treat same as SO_REUSEPORT for now. */ if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0) reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB; } else if (sin->sin_addr.s_addr != INADDR_ANY) { sin->sin_port = 0; /* yech... */ bzero(&sin->sin_zero, sizeof(sin->sin_zero)); /* * Is the address a local IP address? * If INP_BINDANY is set, then the socket may be bound * to any endpoint address, local or not. */ if ((inp->inp_flags & INP_BINDANY) == 0 && ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) return (EADDRNOTAVAIL); } laddr = sin->sin_addr; if (lport) { struct inpcb *t; /* GROSS */ if (ntohs(lport) <= V_ipport_reservedhigh && ntohs(lport) >= V_ipport_reservedlow && priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT)) return (EACCES); if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) { t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, INPLOOKUP_WILDCARD, cred); /* * XXX * This entire block sorely needs a rewrite. 
 */ if (t != NULL && (so->so_type != SOCK_STREAM || ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (t->inp_socket->so_options & SO_REUSEPORT) || (t->inp_socket->so_options & SO_REUSEPORT_LB) == 0) && (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); } t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, lookupflags, cred); if (t != NULL && (reuseport & t->inp_socket->so_options) == 0 && (reuseport_lb & t->inp_socket->so_options) == 0) { #ifdef INET6 if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || (inp->inp_vflag & INP_IPV6PROTO) == 0 || (t->inp_vflag & INP_IPV6PROTO) == 0) #endif return (EADDRINUSE); } } } if (*lportp != 0) lport = *lportp; if (lport == 0) { error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags); if (error != 0) return (error); } *laddrp = laddr.s_addr; *lportp = lport; return (0); } /* * Connect from a socket to a specified address. * Both address and port must be specified in argument sin. * If we don't have a local address for this socket yet, * then pick one. */ int in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred, bool rehash __unused) { u_short lport, fport; in_addr_t laddr, faddr; int anonport, error; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); KASSERT(in_nullhost(inp->inp_faddr), ("%s: inp is already connected", __func__)); lport = inp->inp_lport; laddr = inp->inp_laddr.s_addr; anonport = (lport == 0); error = in_pcbconnect_setup(inp, sin, &laddr, &lport, &faddr, &fport, cred); if (error) return (error); inp->inp_faddr.s_addr = faddr; inp->inp_fport = fport; /* Do the initial binding of the local address if required. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; if (in_pcbinshash(inp) != 0) { inp->inp_laddr.s_addr = inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_lport = inp->inp_fport = 0; return (EAGAIN); } } else { inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; if ((inp->inp_flags & INP_INHASHLIST) != 0) in_pcbrehash(inp); else in_pcbinshash(inp); } if (anonport) inp->inp_flags |= INP_ANONPORT; return (0); } /* * Do proper source address selection on an unbound socket in case * of connect. Take jails into account as well. */ int in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, struct ucred *cred) { struct ifaddr *ifa; struct sockaddr *sa; struct sockaddr_in *sin, dst; struct nhop_object *nh; int error; NET_EPOCH_ASSERT(); KASSERT(laddr != NULL, ("%s: laddr NULL", __func__)); /* * Bypass source address selection and use the primary jail IP * if requested. */ if (!prison_saddrsel_ip4(cred, laddr)) return (0); error = 0; nh = NULL; bzero(&dst, sizeof(dst)); sin = &dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(struct sockaddr_in); sin->sin_addr.s_addr = faddr->s_addr; /* * If the route is known, our src addr is taken from the i/f; * else punt. * * Find out route to destination. */ if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr, 0, NHR_NONE, 0); /* * If we found a route, use the address corresponding to * the outgoing interface. * * Otherwise assume faddr is reachable on a directly connected * network and try to find a corresponding interface to take * the source address from.
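 *
 * Note that callers of in_pcbladdr() must be in the network epoch; a
 * typical invocation is simply (a sketch):
 *
 *	struct epoch_tracker et;
 *
 *	NET_EPOCH_ENTER(et);
 *	error = in_pcbladdr(inp, &faddr, &laddr, cred);
 *	NET_EPOCH_EXIT(et);
 *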
*/ if (nh == NULL || nh->nh_ifp == NULL) { struct in_ifaddr *ia; struct ifnet *ifp; ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin, inp->inp_socket->so_fibnum)); if (ia == NULL) { ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0, inp->inp_socket->so_fibnum)); } if (ia == NULL) { error = ENETUNREACH; goto done; } if (!prison_flag(cred, PR_IP4)) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } ifp = ia->ia_ifp; ia = NULL; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } /* * If the outgoing interface on the route found is not * a loopback interface, use the address from that interface. * In case of jails do those three steps: * 1. check if the interface address belongs to the jail. If so use it. * 2. check if we have any address on the outgoing interface * belonging to this jail. If so use it. * 3. as a last resort return the 'default' jail address. */ if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) { struct in_ifaddr *ia; struct ifnet *ifp; /* If not jailed, use the default returned. */ if (!prison_flag(cred, PR_IP4)) { ia = (struct in_ifaddr *)nh->nh_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* Jailed. */ /* 1. Check if the iface address belongs to the jail. */ sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)nh->nh_ifa; laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* * 2. Check if we have any address on the outgoing interface * belonging to this jail. */ ia = NULL; ifp = nh->nh_ifp; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } /* * The outgoing interface is marked with 'loopback net', so a route * to ourselves is here. * Try to find the interface of the destination address and then * take the address from there. That interface is not necessarily * a loopback interface. * In case of jails, check that it is an address of the jail * and if we cannot find, fall back to the 'default' jail address. */ if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) { struct in_ifaddr *ia; ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst), inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0, inp->inp_socket->so_fibnum)); if (ia == NULL) ia = ifatoia(ifa_ifwithaddr(sintosa(&dst))); if (!prison_flag(cred, PR_IP4)) { if (ia == NULL) { error = ENETUNREACH; goto done; } laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } /* Jailed. 
*/ if (ia != NULL) { struct ifnet *ifp; ifp = ia->ia_ifp; ia = NULL; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sa = ifa->ifa_addr; if (sa->sa_family != AF_INET) continue; sin = (struct sockaddr_in *)sa; if (prison_check_ip4(cred, &sin->sin_addr) == 0) { ia = (struct in_ifaddr *)ifa; break; } } if (ia != NULL) { laddr->s_addr = ia->ia_addr.sin_addr.s_addr; goto done; } } /* 3. As a last resort return the 'default' jail address. */ error = prison_get_ip4(cred, laddr); goto done; } done: if (error == 0 && laddr->s_addr == INADDR_ANY) return (EHOSTUNREACH); return (error); } /* * Set up for a connect from a socket to the specified address. * On entry, *laddrp and *lportp should contain the current local * address and port for the PCB; these are updated to the values * that should be placed in inp_laddr and inp_lport to complete * the connect. * * On success, *faddrp and *fportp will be set to the remote address * and port. These are not updated in the error case. */ int in_pcbconnect_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp, struct ucred *cred) { struct in_ifaddr *ia; struct in_addr laddr, faddr; u_short lport, fport; int error; KASSERT(sin->sin_family == AF_INET, ("%s: invalid address family for %p", __func__, sin)); KASSERT(sin->sin_len == sizeof(*sin), ("%s: invalid address length for %p", __func__, sin)); /* * Because a global state change doesn't actually occur here, a read * lock is sufficient. */ NET_EPOCH_ASSERT(); INP_LOCK_ASSERT(inp); INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); if (sin->sin_port == 0) return (EADDRNOTAVAIL); laddr.s_addr = *laddrp; lport = *lportp; faddr = sin->sin_addr; fport = sin->sin_port; #ifdef ROUTE_MPATH if (CALC_FLOWID_OUTBOUND) { uint32_t hash_val, hash_type; hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport, inp->inp_socket->so_proto->pr_protocol, &hash_type); inp->inp_flowid = hash_val; inp->inp_flowtype = hash_type; } #endif if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) { /* * If the destination address is INADDR_ANY, * use the primary local address. * If the supplied address is INADDR_BROADCAST, * and the primary interface supports broadcast, * choose the broadcast address for that interface. */ if (faddr.s_addr == INADDR_ANY) { faddr = IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; if ((error = prison_get_ip4(cred, &faddr)) != 0) return (error); } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) { if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags & IFF_BROADCAST) faddr = satosin(&CK_STAILQ_FIRST( &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; } } if (laddr.s_addr == INADDR_ANY) { error = in_pcbladdr(inp, &faddr, &laddr, cred); /* * If the destination address is multicast and an outgoing * interface has been set as a multicast option, prefer the * address of that interface as our source address. 
 */ if (IN_MULTICAST(ntohl(faddr.s_addr)) && inp->inp_moptions != NULL) { struct ip_moptions *imo; struct ifnet *ifp; imo = inp->inp_moptions; if (imo->imo_multicast_ifp != NULL) { ifp = imo->imo_multicast_ifp; CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { if (ia->ia_ifp == ifp && prison_check_ip4(cred, &ia->ia_addr.sin_addr) == 0) break; } if (ia == NULL) error = EADDRNOTAVAIL; else { laddr = ia->ia_addr.sin_addr; error = 0; } } } if (error) return (error); } if (lport != 0) { if (in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, fport, laddr, lport, 0, M_NODOM) != NULL) return (EADDRINUSE); } else { struct sockaddr_in lsin, fsin; bzero(&lsin, sizeof(lsin)); bzero(&fsin, sizeof(fsin)); lsin.sin_family = AF_INET; lsin.sin_addr = laddr; fsin.sin_family = AF_INET; fsin.sin_addr = faddr; error = in_pcb_lport_dest(inp, (struct sockaddr *)&lsin, &lport, (struct sockaddr *)&fsin, fport, cred, INPLOOKUP_WILDCARD); if (error) return (error); } *laddrp = laddr.s_addr; *lportp = lport; *faddrp = faddr.s_addr; *fportp = fport; return (0); } void in_pcbdisconnect(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); KASSERT(inp->inp_smr == SMR_SEQ_INVALID, ("%s: inp %p was already disconnected", __func__, inp)); in_pcbremhash_locked(inp); /* See the comment in in_pcbinshash(). */ inp->inp_smr = smr_advance(inp->inp_pcbinfo->ipi_smr); inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_fport = 0; } #endif /* INET */ /* * inpcb hash lookups are protected by an SMR section. * * Once the desired pcb has been found, switching from the SMR section to a * pcb lock is performed with inp_smr_lock(). We cannot use INP_(W|R)LOCK * here because SMR is a critical section. * In 99%+ of cases inp_smr_lock() obtains the lock immediately. */ void inp_lock(struct inpcb *inp, const inp_lookup_t lock) { lock == INPLOOKUP_RLOCKPCB ? rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock); } void inp_unlock(struct inpcb *inp, const inp_lookup_t lock) { lock == INPLOOKUP_RLOCKPCB ? rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock); } int inp_trylock(struct inpcb *inp, const inp_lookup_t lock) { return (lock == INPLOOKUP_RLOCKPCB ? rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock)); } static inline bool _inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock, const int ignflags) { MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB); SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr); if (__predict_true(inp_trylock(inp, lock))) { if (__predict_false(inp->inp_flags & ignflags)) { smr_exit(inp->inp_pcbinfo->ipi_smr); inp_unlock(inp, lock); return (false); } smr_exit(inp->inp_pcbinfo->ipi_smr); return (true); } if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) { smr_exit(inp->inp_pcbinfo->ipi_smr); inp_lock(inp, lock); if (__predict_false(in_pcbrele(inp, lock))) return (false); /* * An inp acquired through refcount & lock is guaranteed not to * have gone through uma_zfree(). However, it may already have * gone through in_pcbfree() and have another reference that * prevented its release by our in_pcbrele(). */ if (__predict_false(inp->inp_flags & ignflags)) { inp_unlock(inp, lock); return (false); } return (true); } else { smr_exit(inp->inp_pcbinfo->ipi_smr); return (false); } } bool inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock) { /* * The in_pcblookup() family of functions ignores not only freed entries, * which may be found due to lockless access to the hash, but dropped * entries, too.
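 *
 * A lockless lookup therefore resolves to a locked pcb as in this sketch
 * (foo_hash_walk() is a hypothetical stand-in for the hash traversal):
 *
 *	smr_enter(pcbinfo->ipi_smr);
 *	inp = foo_hash_walk(pcbinfo);
 *	if (inp == NULL)
 *		smr_exit(pcbinfo->ipi_smr);
 *	else if (inp_smr_lock(inp, INPLOOKUP_RLOCKPCB)) {
 *		(use the rlocked, revalidated inp)
 *		INP_RUNLOCK(inp);
 *	}
 *
 * On failure inp_smr_lock() has already exited the SMR section itself.
 *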
 */ return (_inp_smr_lock(inp, lock, INP_FREED | INP_DROPPED)); } /* * inp_next() - inpcb hash/list traversal iterator * * Requires an initialized struct inpcb_iterator for context. * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR(). * * - The iterator can have either write-lock or read-lock semantics; they * cannot be changed later. * - The iterator can iterate either over the list of all pcbs (INP_ALL_LIST), * or through a single hash slot. Note: only rip_input() does the latter. * - The iterator may have an optional bool matching function. The matching * function is executed for each inpcb in the SMR context, so it cannot * acquire locks and can safely access only immutable fields of the inpcb. * * A freshly initialized iterator has a NULL inpcb in its context, which * means that the first inp_next() call returns the very first inpcb on the * list, locked with the desired semantics. In all following calls the * context pointer shall hold the current inpcb pointer. The KPI user is not * supposed to unlock the current inpcb! Upon end of traversal inp_next() * returns NULL and writes NULL to its context; after that the iterator can * be reused. * * List traversals have the following features/constraints: * - New entries won't be seen, as they are always added to the head of a list. * - Removed entries won't stop traversal as long as they are not added to * a different list. This is violated by in_pcbrehash(). */ #define II_LIST_FIRST(ipi, hash) \ (((hash) == INP_ALL_LIST) ? \ CK_LIST_FIRST(&(ipi)->ipi_listhead) : \ CK_LIST_FIRST(&(ipi)->ipi_hash_exact[(hash)])) #define II_LIST_NEXT(inp, hash) \ (((hash) == INP_ALL_LIST) ? \ CK_LIST_NEXT((inp), inp_list) : \ CK_LIST_NEXT((inp), inp_hash_exact)) #define II_LOCK_ASSERT(inp, lock) \ rw_assert(&(inp)->inp_lock, \ (lock) == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED ) struct inpcb * inp_next(struct inpcb_iterator *ii) { const struct inpcbinfo *ipi = ii->ipi; inp_match_t *match = ii->match; void *ctx = ii->ctx; inp_lookup_t lock = ii->lock; int hash = ii->hash; struct inpcb *inp; if (ii->inp == NULL) { /* First call. */ smr_enter(ipi->ipi_smr); /* This is unrolled CK_LIST_FOREACH(). */ for (inp = II_LIST_FIRST(ipi, hash); inp != NULL; inp = II_LIST_NEXT(inp, hash)) { if (match != NULL && (match)(inp, ctx) == false) continue; if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED))) break; else { smr_enter(ipi->ipi_smr); MPASS(inp != II_LIST_FIRST(ipi, hash)); inp = II_LIST_FIRST(ipi, hash); if (inp == NULL) break; } } if (inp == NULL) smr_exit(ipi->ipi_smr); else ii->inp = inp; return (inp); } /* Not the first call. */ smr_enter(ipi->ipi_smr); restart: inp = ii->inp; II_LOCK_ASSERT(inp, lock); next: inp = II_LIST_NEXT(inp, hash); if (inp == NULL) { smr_exit(ipi->ipi_smr); goto found; } if (match != NULL && (match)(inp, ctx) == false) goto next; if (__predict_true(inp_trylock(inp, lock))) { if (__predict_false(inp->inp_flags & INP_FREED)) { /* * Entries are never inserted in the middle of a list, * thus as long as we are in SMR, we can continue * traversal. A jump to 'restart' should yield the same * result, but could produce unnecessary looping. Could * this looping be unbounded? */ inp_unlock(inp, lock); goto next; } else { smr_exit(ipi->ipi_smr); goto found; } } /* * Can't obtain the lock immediately, so we go the hard way. Once we * exit the SMR section we can no longer jump to 'next', and our only * stable anchoring point is ii->inp, which we keep locked for this * case, so we jump to 'restart'.
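 *
 * None of this is visible to the KPI consumer, whose loop is simply
 * (cf. inp_apply_all() below):
 *
 *	struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
 *	    INPLOOKUP_RLOCKPCB);
 *	struct inpcb *inp;
 *
 *	while ((inp = inp_next(&inpi)) != NULL) {
 *		(inp is rlocked here; do not unlock it)
 *	}
 *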
 */ if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) { smr_exit(ipi->ipi_smr); inp_lock(inp, lock); if (__predict_false(in_pcbrele(inp, lock))) { smr_enter(ipi->ipi_smr); goto restart; } /* * See comment in inp_smr_lock(). */ if (__predict_false(inp->inp_flags & INP_FREED)) { inp_unlock(inp, lock); smr_enter(ipi->ipi_smr); goto restart; } } else goto next; found: inp_unlock(ii->inp, lock); ii->inp = inp; return (ii->inp); } /* * in_pcbref() bumps the reference count on an inpcb in order to maintain * the stability of an inpcb pointer despite the inpcb lock being released * or the SMR section exited. * * To release the reference later, in_pcbrele_(r|w)locked() must be called. */ void in_pcbref(struct inpcb *inp) { u_int old __diagused; old = refcount_acquire(&inp->inp_refcount); KASSERT(old > 0, ("%s: refcount 0", __func__)); } /* * Drop a refcount on an inpcb elevated using in_pcbref(), potentially * freeing the pcb, if the reference was the very last one. */ bool in_pcbrele_rlocked(struct inpcb *inp) { INP_RLOCK_ASSERT(inp); if (!refcount_release(&inp->inp_refcount)) return (false); MPASS(inp->inp_flags & INP_FREED); MPASS(inp->inp_socket == NULL); crfree(inp->inp_cred); #ifdef INVARIANTS inp->inp_cred = NULL; #endif INP_RUNLOCK(inp); uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp); return (true); } bool in_pcbrele_wlocked(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); if (!refcount_release(&inp->inp_refcount)) return (false); MPASS(inp->inp_flags & INP_FREED); MPASS(inp->inp_socket == NULL); crfree(inp->inp_cred); #ifdef INVARIANTS inp->inp_cred = NULL; #endif INP_WUNLOCK(inp); uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp); return (true); } bool in_pcbrele(struct inpcb *inp, const inp_lookup_t lock) { return (lock == INPLOOKUP_RLOCKPCB ? in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp)); } /* * Unconditionally schedule an inpcb to be freed by decrementing its * reference count, which should occur only after the inpcb has been detached * from its socket. If another thread holds a temporary reference (acquired * using in_pcbref()) then the free is deferred until that reference is * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked * in either case. * Almost all work, including removal from global lists, is done in this * context, where the pcbinfo lock is held. */ void in_pcbfree(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; #ifdef INET struct ip_moptions *imo; #endif #ifdef INET6 struct ip6_moptions *im6o; #endif INP_WLOCK_ASSERT(inp); KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); KASSERT((inp->inp_flags & INP_FREED) == 0, ("%s: called twice for pcb %p", __func__, inp)); /* * in_pcblookup_local() and in6_pcblookup_local() may return an inpcb * from the hash without acquiring the inpcb lock, as they rely on the * hash lock; thus in_pcbremhash() should be the first action.
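 *
 * A holder of a temporary reference that can race with this free follows
 * the pattern sketched below (cf. in_pcbref() above):
 *
 *	in_pcbref(inp);
 *	INP_WUNLOCK(inp);
 *	(work without the inpcb lock, possibly sleeping)
 *	INP_WLOCK(inp);
 *	if (in_pcbrele_wlocked(inp))
 *		return;		(pcb was freed, lock no longer held)
 *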
 */ if (inp->inp_flags & INP_INHASHLIST) in_pcbremhash(inp); INP_INFO_WLOCK(pcbinfo); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; pcbinfo->ipi_count--; CK_LIST_REMOVE(inp, inp_list); INP_INFO_WUNLOCK(pcbinfo); #ifdef RATELIMIT if (inp->inp_snd_tag != NULL) in_pcbdetach_txrtlmt(inp); #endif inp->inp_flags |= INP_FREED; inp->inp_socket->so_pcb = NULL; inp->inp_socket = NULL; RO_INVALIDATE_CACHE(&inp->inp_route); #ifdef MAC mac_inpcb_destroy(inp); #endif #if defined(IPSEC) || defined(IPSEC_SUPPORT) if (inp->inp_sp != NULL) ipsec_delete_pcbpolicy(inp); #endif #ifdef INET if (inp->inp_options) (void)m_free(inp->inp_options); DEBUG_POISON_POINTER(inp->inp_options); imo = inp->inp_moptions; DEBUG_POISON_POINTER(inp->inp_moptions); #endif #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) { ip6_freepcbopts(inp->in6p_outputopts); DEBUG_POISON_POINTER(inp->in6p_outputopts); im6o = inp->in6p_moptions; DEBUG_POISON_POINTER(inp->in6p_moptions); } else im6o = NULL; #endif if (__predict_false(in_pcbrele_wlocked(inp) == false)) { INP_WUNLOCK(inp); } #ifdef INET6 ip6_freemoptions(im6o); #endif #ifdef INET inp_freemoptions(imo); #endif } /* * Different protocols initialize their inpcbs differently, giving a * different name to the lock. But they are all disposed of the same way. */ static void inpcb_fini(void *mem, int size) { struct inpcb *inp = mem; INP_LOCK_DESTROY(inp); } /* * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and * port reservation, and preventing it from being returned by inpcb lookups. * * It is used by TCP to mark an inpcb as unused and avoid future packet * delivery or event notification when a socket remains open but TCP has * closed. This might occur as a result of a shutdown()-initiated TCP close * or a RST on the wire, and allows the port binding to be reused while still * maintaining the invariant that so_pcb always points to a valid inpcb until * in_pcbdetach(). * * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by * in_pcbpurgeif0()? */ void in_pcbdrop(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); -#ifdef INVARIANTS - if (inp->inp_socket != NULL && inp->inp_ppcb != NULL) - MPASS(inp->inp_refcount > 1); -#endif inp->inp_flags |= INP_DROPPED; if (inp->inp_flags & INP_INHASHLIST) in_pcbremhash(inp); } #ifdef INET /* * Common routines to return the socket addresses associated with inpcbs.
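 *
 * These back getsockname(2) and getpeername(2) on inet sockets.  Seen from
 * userland (illustration only):
 *
 *	struct sockaddr_in sin;
 *	socklen_t len = sizeof(sin);
 *
 *	if (getsockname(s, (struct sockaddr *)&sin, &len) == 0)
 *		printf("bound to %s:%u\n", inet_ntoa(sin.sin_addr),
 *		    ntohs(sin.sin_port));
 *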
*/ int in_getsockaddr(struct socket *so, struct sockaddr *sa) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); *(struct sockaddr_in *)sa = (struct sockaddr_in ){ .sin_len = sizeof(struct sockaddr_in), .sin_family = AF_INET, .sin_port = inp->inp_lport, .sin_addr = inp->inp_laddr, }; return (0); } int in_getpeeraddr(struct socket *so, struct sockaddr *sa) { struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); *(struct sockaddr_in *)sa = (struct sockaddr_in ){ .sin_len = sizeof(struct sockaddr_in), .sin_family = AF_INET, .sin_port = inp->inp_fport, .sin_addr = inp->inp_faddr, }; return (0); } static bool inp_v4_multi_match(const struct inpcb *inp, void *v __unused) { if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL) return (true); else return (false); } void in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB, inp_v4_multi_match, NULL); struct inpcb *inp; struct in_multi *inm; struct in_mfilter *imf; struct ip_moptions *imo; IN_MULTI_LOCK_ASSERT(); while ((inp = inp_next(&inpi)) != NULL) { INP_WLOCK_ASSERT(inp); imo = inp->inp_moptions; /* * Unselect the outgoing interface if it is being * detached. */ if (imo->imo_multicast_ifp == ifp) imo->imo_multicast_ifp = NULL; /* * Drop multicast group membership if we joined * through the interface being detached. * * XXX This can all be deferred to an epoch_call */ restart: IP_MFILTER_FOREACH(imf, &imo->imo_head) { if ((inm = imf->imf_inm) == NULL) continue; if (inm->inm_ifp != ifp) continue; ip_mfilter_remove(&imo->imo_head, imf); in_leavegroup_locked(inm, NULL); ip_mfilter_free(imf); goto restart; } } } /* * Lookup a PCB based on the local address and port. Caller must hold the * hash lock. No inpcb locks or references are acquired. */ #define INP_LOOKUP_MAPPED_PCB_COST 3 struct inpcb * in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, u_short lport, int lookupflags, struct ucred *cred) { struct inpcb *inp; #ifdef INET6 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; #else int matchwild = 3; #endif int wildcard; KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); INP_HASH_LOCK_ASSERT(pcbinfo); if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { struct inpcbhead *head; /* * Look for an unconnected (wildcard foreign addr) PCB that * matches the local address and port we're looking for. */ head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport, pcbinfo->ipi_hashmask)]; CK_LIST_FOREACH(inp, head, inp_hash_wild) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; #endif if (inp->inp_faddr.s_addr == INADDR_ANY && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_lport == lport) { /* * Found? */ if (prison_equal_ip4(cred->cr_prison, inp->inp_cred->cr_prison)) return (inp); } } /* * Not found. */ return (NULL); } else { struct inpcbporthead *porthash; struct inpcbport *phd; struct inpcb *match = NULL; /* * Best fit PCB lookup. * * First see if this local port is in use by looking on the * port hash list. */ porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)]; CK_LIST_FOREACH(phd, porthash, phd_hash) { if (phd->phd_port == lport) break; } if (phd != NULL) { /* * Port is in use by one or more PCBs. Look for best * fit. 
*/ CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { wildcard = 0; if (!prison_equal_ip4(inp->inp_cred->cr_prison, cred->cr_prison)) continue; #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) continue; /* * We never select the PCB that has * INP_IPV6 flag and is bound to :: if * we have another PCB which is bound * to 0.0.0.0. If a PCB has the * INP_IPV6 flag, then we set its cost * higher than IPv4 only PCBs. * * Note that the case only happens * when a socket is bound to ::, under * the condition that the use of the * mapped address is allowed. */ if ((inp->inp_vflag & INP_IPV6) != 0) wildcard += INP_LOOKUP_MAPPED_PCB_COST; #endif if (inp->inp_faddr.s_addr != INADDR_ANY) wildcard++; if (inp->inp_laddr.s_addr != INADDR_ANY) { if (laddr.s_addr == INADDR_ANY) wildcard++; else if (inp->inp_laddr.s_addr != laddr.s_addr) continue; } else { if (laddr.s_addr != INADDR_ANY) wildcard++; } if (wildcard < matchwild) { match = inp; matchwild = wildcard; if (matchwild == 0) break; } } } return (match); } } #undef INP_LOOKUP_MAPPED_PCB_COST static bool in_pcblookup_lb_numa_match(const struct inpcblbgroup *grp, int domain) { return (domain == M_NODOM || domain == grp->il_numa_domain); } static struct inpcb * in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, const struct in_addr *faddr, uint16_t fport, const struct in_addr *laddr, uint16_t lport, int domain) { const struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild; INP_HASH_LOCK_ASSERT(pcbinfo); hdr = &pcbinfo->ipi_lbgrouphashbase[ INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)]; /* * Search for an LB group match based on the following criteria: * - prefer jailed groups to non-jailed groups * - prefer exact source address matches to wildcard matches * - prefer groups bound to the specified NUMA domain */ jail_exact = jail_wild = local_exact = local_wild = NULL; CK_LIST_FOREACH(grp, hdr, il_list) { bool injail; #ifdef INET6 if (!(grp->il_vflag & INP_IPV4)) continue; #endif if (grp->il_lport != lport) continue; injail = prison_flag(grp->il_cred, PR_IP4) != 0; if (injail && prison_check_ip4_locked(grp->il_cred->cr_prison, laddr) != 0) continue; if (grp->il_laddr.s_addr == laddr->s_addr) { if (injail) { jail_exact = grp; if (in_pcblookup_lb_numa_match(grp, domain)) /* This is a perfect match. 
*/ goto out; } else if (local_exact == NULL || in_pcblookup_lb_numa_match(grp, domain)) { local_exact = grp; } } else if (grp->il_laddr.s_addr == INADDR_ANY) { if (injail) { if (jail_wild == NULL || in_pcblookup_lb_numa_match(grp, domain)) jail_wild = grp; } else if (local_wild == NULL || in_pcblookup_lb_numa_match(grp, domain)) { local_wild = grp; } } } if (jail_exact != NULL) grp = jail_exact; else if (jail_wild != NULL) grp = jail_wild; else if (local_exact != NULL) grp = local_exact; else grp = local_wild; if (grp == NULL) return (NULL); out: return (grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) % grp->il_inpcnt]); } static bool in_pcblookup_exact_match(const struct inpcb *inp, struct in_addr faddr, u_short fport, struct in_addr laddr, u_short lport) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) return (false); #endif if (inp->inp_faddr.s_addr == faddr.s_addr && inp->inp_laddr.s_addr == laddr.s_addr && inp->inp_fport == fport && inp->inp_lport == lport) return (true); return (false); } static struct inpcb * in_pcblookup_hash_exact(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_short fport, struct in_addr laddr, u_short lport) { struct inpcbhead *head; struct inpcb *inp; INP_HASH_LOCK_ASSERT(pcbinfo); head = &pcbinfo->ipi_hash_exact[INP_PCBHASH(&faddr, lport, fport, pcbinfo->ipi_hashmask)]; CK_LIST_FOREACH(inp, head, inp_hash_exact) { if (in_pcblookup_exact_match(inp, faddr, fport, laddr, lport)) return (inp); } return (NULL); } typedef enum { INPLOOKUP_MATCH_NONE = 0, INPLOOKUP_MATCH_WILD = 1, INPLOOKUP_MATCH_LADDR = 2, } inp_lookup_match_t; static inp_lookup_match_t in_pcblookup_wild_match(const struct inpcb *inp, struct in_addr laddr, u_short lport) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) return (INPLOOKUP_MATCH_NONE); #endif if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) return (INPLOOKUP_MATCH_NONE); if (inp->inp_laddr.s_addr == INADDR_ANY) return (INPLOOKUP_MATCH_WILD); if (inp->inp_laddr.s_addr == laddr.s_addr) return (INPLOOKUP_MATCH_LADDR); return (INPLOOKUP_MATCH_NONE); } #define INP_LOOKUP_AGAIN ((struct inpcb *)(uintptr_t)-1) static struct inpcb * in_pcblookup_hash_wild_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_short fport, struct in_addr laddr, u_short lport, const inp_lookup_t lockflags) { struct inpcbhead *head; struct inpcb *inp; KASSERT(SMR_ENTERED(pcbinfo->ipi_smr), ("%s: not in SMR read section", __func__)); head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport, pcbinfo->ipi_hashmask)]; CK_LIST_FOREACH(inp, head, inp_hash_wild) { inp_lookup_match_t match; match = in_pcblookup_wild_match(inp, laddr, lport); if (match == INPLOOKUP_MATCH_NONE) continue; if (__predict_true(inp_smr_lock(inp, lockflags))) { match = in_pcblookup_wild_match(inp, laddr, lport); if (match != INPLOOKUP_MATCH_NONE && prison_check_ip4_locked(inp->inp_cred->cr_prison, &laddr) == 0) return (inp); inp_unlock(inp, lockflags); } /* * The matching socket disappeared out from under us. Fall back * to a serialized lookup. */ return (INP_LOOKUP_AGAIN); } return (NULL); } static struct inpcb * in_pcblookup_hash_wild_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_short fport, struct in_addr laddr, u_short lport) { struct inpcbhead *head; struct inpcb *inp, *local_wild, *local_exact, *jail_wild; #ifdef INET6 struct inpcb *local_wild_mapped; #endif INP_HASH_LOCK_ASSERT(pcbinfo); /* * Order of socket selection - we always prefer jails. * 1. jailed, non-wild. * 2. jailed, wild. * 3. 
non-jailed, non-wild. * 4. non-jailed, wild. */ head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport, pcbinfo->ipi_hashmask)]; local_wild = local_exact = jail_wild = NULL; #ifdef INET6 local_wild_mapped = NULL; #endif CK_LIST_FOREACH(inp, head, inp_hash_wild) { inp_lookup_match_t match; bool injail; match = in_pcblookup_wild_match(inp, laddr, lport); if (match == INPLOOKUP_MATCH_NONE) continue; injail = prison_flag(inp->inp_cred, PR_IP4) != 0; if (injail) { if (prison_check_ip4_locked(inp->inp_cred->cr_prison, &laddr) != 0) continue; } else { if (local_exact != NULL) continue; } if (match == INPLOOKUP_MATCH_LADDR) { if (injail) return (inp); local_exact = inp; } else { #ifdef INET6 /* XXX inp locking, NULL check */ if (inp->inp_vflag & INP_IPV6PROTO) local_wild_mapped = inp; else #endif if (injail) jail_wild = inp; else local_wild = inp; } } if (jail_wild != NULL) return (jail_wild); if (local_exact != NULL) return (local_exact); if (local_wild != NULL) return (local_wild); #ifdef INET6 if (local_wild_mapped != NULL) return (local_wild_mapped); #endif return (NULL); } /* * Lookup PCB in hash list, using pcbinfo tables. This variation assumes * that the caller has either locked the hash list, which usually happens * for bind(2) operations, or is in SMR section, which happens when sorting * out incoming packets. */ static struct inpcb * in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, uint8_t numa_domain) { struct inpcb *inp; const u_short fport = fport_arg, lport = lport_arg; KASSERT((lookupflags & ~INPLOOKUP_WILDCARD) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT(faddr.s_addr != INADDR_ANY, ("%s: invalid foreign address", __func__)); KASSERT(laddr.s_addr != INADDR_ANY, ("%s: invalid local address", __func__)); INP_HASH_WLOCK_ASSERT(pcbinfo); inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport); if (inp != NULL) return (inp); if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport, &laddr, lport, numa_domain); if (inp == NULL) { inp = in_pcblookup_hash_wild_locked(pcbinfo, faddr, fport, laddr, lport); } } return (inp); } static struct inpcb * in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, uint8_t numa_domain) { struct inpcb *inp; const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK; KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); INP_HASH_WLOCK(pcbinfo); inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, lookupflags & ~INPLOOKUP_LOCKMASK, numa_domain); if (inp != NULL && !inp_trylock(inp, lockflags)) { in_pcbref(inp); INP_HASH_WUNLOCK(pcbinfo); inp_lock(inp, lockflags); if (in_pcbrele(inp, lockflags)) /* XXX-MJ or retry until we get a negative match? 
*/ inp = NULL; } else { INP_HASH_WUNLOCK(pcbinfo); } return (inp); } static struct inpcb * in_pcblookup_hash_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, uint8_t numa_domain) { struct inpcb *inp; const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK; const u_short fport = fport_arg, lport = lport_arg; KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, ("%s: LOCKPCB not set", __func__)); smr_enter(pcbinfo->ipi_smr); inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport); if (inp != NULL) { if (__predict_true(inp_smr_lock(inp, lockflags))) { /* * Revalidate the 4-tuple, the socket could have been * disconnected. */ if (__predict_true(in_pcblookup_exact_match(inp, faddr, fport, laddr, lport))) return (inp); inp_unlock(inp, lockflags); } /* * We failed to lock the inpcb, or its connection state changed * out from under us. Fall back to a precise search. */ return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, numa_domain)); } if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport, &laddr, lport, numa_domain); if (inp != NULL) { if (__predict_true(inp_smr_lock(inp, lockflags))) { if (__predict_true(in_pcblookup_wild_match(inp, laddr, lport) != INPLOOKUP_MATCH_NONE)) return (inp); inp_unlock(inp, lockflags); } inp = INP_LOOKUP_AGAIN; } else { inp = in_pcblookup_hash_wild_smr(pcbinfo, faddr, fport, laddr, lport, lockflags); } if (inp == INP_LOOKUP_AGAIN) { return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, lookupflags, numa_domain)); } } if (inp == NULL) smr_exit(pcbinfo->ipi_smr); return (inp); } /* * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf * from which a pre-calculated hash value may be extracted. */ struct inpcb * in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp __unused) { return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport, lookupflags, M_NODOM)); } struct inpcb * in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp __unused, struct mbuf *m) { return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport, lookupflags, m->m_pkthdr.numa_domain)); } #endif /* INET */ static bool in_pcbjailed(const struct inpcb *inp, unsigned int flag) { return (prison_flag(inp->inp_cred, flag) != 0); } /* * Insert the PCB into a hash chain using ordering rules which ensure that * in_pcblookup_hash_wild_*() always encounter the highest-ranking PCB first. * * Specifically, keep jailed PCBs in front of non-jailed PCBs, and keep PCBs * with exact local addresses ahead of wildcard PCBs. Unbound v4-mapped v6 PCBs * always appear last no matter whether they are jailed. 
*/ static void _in_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp) { struct inpcb *last; bool bound, injail; INP_LOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); last = NULL; bound = inp->inp_laddr.s_addr != INADDR_ANY; if (!bound && (inp->inp_vflag & INP_IPV6PROTO) != 0) { CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) { if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); return; } } CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); return; } injail = in_pcbjailed(inp, PR_IP4); if (!injail) { CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) { if (!in_pcbjailed(last, PR_IP4)) break; if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); return; } } } else if (!CK_LIST_EMPTY(pcbhash) && !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP4)) { CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); return; } if (!bound) { CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) { if (last->inp_laddr.s_addr == INADDR_ANY) break; if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); return; } } } if (last == NULL) CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); else CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild); } #ifdef INET6 /* * See the comment above _in_pcbinshash_wild(). */ static void _in6_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp) { struct inpcb *last; bool bound, injail; INP_LOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); last = NULL; bound = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr); injail = in_pcbjailed(inp, PR_IP6); if (!injail) { CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) { if (!in_pcbjailed(last, PR_IP6)) break; if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); return; } } } else if (!CK_LIST_EMPTY(pcbhash) && !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP6)) { CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); return; } if (!bound) { CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) { if (IN6_IS_ADDR_UNSPECIFIED(&last->in6p_laddr)) break; if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); return; } } } if (last == NULL) CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); else CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild); } #endif /* * Insert PCB onto various hash lists. */ int in_pcbinshash(struct inpcb *inp) { struct inpcbhead *pcbhash; struct inpcbporthead *pcbporthash; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbport *phd; uint32_t hash; bool connected; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, ("in_pcbinshash: INP_INHASHLIST")); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask); connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr); } else #endif { hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask); connected = !in_nullhost(inp->inp_faddr); } if (connected) pcbhash = &pcbinfo->ipi_hash_exact[hash]; else pcbhash = &pcbinfo->ipi_hash_wild[hash]; pcbporthash = &pcbinfo->ipi_porthashbase[ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; /* * Add entry to load balance group. * Only do this if SO_REUSEPORT_LB is set. 
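 *
 * From userland, membership in such a group is requested by setting the
 * option before bind(2) (illustration):
 *
 *	int one = 1;
 *
 *	setsockopt(s, SOL_SOCKET, SO_REUSEPORT_LB, &one, sizeof(one));
 *	bind(s, (struct sockaddr *)&sin, sizeof(sin));
 *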
*/ if ((inp->inp_socket->so_options & SO_REUSEPORT_LB) != 0) { int error = in_pcbinslbgrouphash(inp, M_NODOM); if (error != 0) return (error); } /* * Go through port list and look for a head for this lport. */ CK_LIST_FOREACH(phd, pcbporthash, phd_hash) { if (phd->phd_port == inp->inp_lport) break; } /* * If none exists, malloc one and tack it on. */ if (phd == NULL) { phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT); if (phd == NULL) { if ((inp->inp_flags & INP_INLBGROUP) != 0) in_pcbremlbgrouphash(inp); return (ENOMEM); } phd->phd_port = inp->inp_lport; CK_LIST_INIT(&phd->phd_pcblist); CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); } inp->inp_phd = phd; CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); /* * The PCB may have been disconnected in the past. Before we can safely * make it visible in the hash table, we must wait for all readers which * may be traversing this PCB to finish. */ if (inp->inp_smr != SMR_SEQ_INVALID) { smr_wait(pcbinfo->ipi_smr, inp->inp_smr); inp->inp_smr = SMR_SEQ_INVALID; } if (connected) CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact); else { #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) _in6_pcbinshash_wild(pcbhash, inp); else #endif _in_pcbinshash_wild(pcbhash, inp); } inp->inp_flags |= INP_INHASHLIST; return (0); } void in_pcbremhash_locked(struct inpcb *inp) { struct inpcbport *phd = inp->inp_phd; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); MPASS(inp->inp_flags & INP_INHASHLIST); if ((inp->inp_flags & INP_INLBGROUP) != 0) in_pcbremlbgrouphash(inp); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) CK_LIST_REMOVE(inp, inp_hash_wild); else CK_LIST_REMOVE(inp, inp_hash_exact); } else #endif { if (in_nullhost(inp->inp_faddr)) CK_LIST_REMOVE(inp, inp_hash_wild); else CK_LIST_REMOVE(inp, inp_hash_exact); } CK_LIST_REMOVE(inp, inp_portlist); if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { CK_LIST_REMOVE(phd, phd_hash); uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd); } inp->inp_flags &= ~INP_INHASHLIST; } static void in_pcbremhash(struct inpcb *inp) { INP_HASH_WLOCK(inp->inp_pcbinfo); in_pcbremhash_locked(inp); INP_HASH_WUNLOCK(inp->inp_pcbinfo); } /* * Move PCB to the proper hash bucket when { faddr, fport } have been * changed. NOTE: This does not handle the case of the lport changing (the * hashed port list would have to be updated as well), so the lport must * not change after in_pcbinshash() has been called. */ void in_pcbrehash(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbhead *head; uint32_t hash; bool connected; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); KASSERT(inp->inp_flags & INP_INHASHLIST, ("%s: !INP_INHASHLIST", __func__)); KASSERT(inp->inp_smr == SMR_SEQ_INVALID, ("%s: inp was disconnected", __func__)); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask); connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr); } else #endif { hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask); connected = !in_nullhost(inp->inp_faddr); } /* * When rehashing, the caller must ensure that either the new or the old * foreign address was unspecified. 
*/ if (connected) CK_LIST_REMOVE(inp, inp_hash_wild); else CK_LIST_REMOVE(inp, inp_hash_exact); if (connected) { head = &pcbinfo->ipi_hash_exact[hash]; CK_LIST_INSERT_HEAD(head, inp, inp_hash_exact); } else { head = &pcbinfo->ipi_hash_wild[hash]; CK_LIST_INSERT_HEAD(head, inp, inp_hash_wild); } } /* * Check for alternatives when higher level complains * about service problems. For now, invalidate cached * routing information. If the route was created dynamically * (by a redirect), time to try a default gateway again. */ void in_losing(struct inpcb *inp) { RO_INVALIDATE_CACHE(&inp->inp_route); return; } /* * A set label operation has occurred at the socket layer, propagate the * label change into the in_pcb for the socket. */ void in_pcbsosetlabel(struct socket *so) { #ifdef MAC struct inpcb *inp; inp = sotoinpcb(so); KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); INP_WLOCK(inp); SOCK_LOCK(so); mac_inpcb_sosetlabel(so, inp); SOCK_UNLOCK(so); INP_WUNLOCK(inp); #endif } void inp_wlock(struct inpcb *inp) { INP_WLOCK(inp); } void inp_wunlock(struct inpcb *inp) { INP_WUNLOCK(inp); } void inp_rlock(struct inpcb *inp) { INP_RLOCK(inp); } void inp_runlock(struct inpcb *inp) { INP_RUNLOCK(inp); } #ifdef INVARIANT_SUPPORT void inp_lock_assert(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); } void inp_unlock_assert(struct inpcb *inp) { INP_UNLOCK_ASSERT(inp); } #endif void inp_apply_all(struct inpcbinfo *pcbinfo, void (*func)(struct inpcb *, void *), void *arg) { struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB); struct inpcb *inp; while ((inp = inp_next(&inpi)) != NULL) func(inp, arg); } struct socket * inp_inpcbtosocket(struct inpcb *inp) { INP_WLOCK_ASSERT(inp); return (inp->inp_socket); } void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, uint32_t *faddr, uint16_t *fp) { INP_LOCK_ASSERT(inp); *laddr = inp->inp_laddr.s_addr; *faddr = inp->inp_faddr.s_addr; *lp = inp->inp_lport; *fp = inp->inp_fport; } /* * Create an external-format (``xinpcb'') structure using the information in * the kernel-format in_pcb structure pointed to by inp. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. 
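 *
 * Userland consumers read these records via sysctl; e.g. the TCP pcb list
 * (a sketch, error handling omitted; the records returned there embed a
 * struct xinpcb each, framed by struct xinpgen):
 *
 *	size_t len = 0;
 *	void *buf;
 *
 *	sysctlbyname("net.inet.tcp.pcblist", NULL, &len, NULL, 0);
 *	buf = malloc(len);
 *	sysctlbyname("net.inet.tcp.pcblist", buf, &len, NULL, 0);
 *	(walk the xinpgen-framed records in buf)
 *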
 */ void in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi) { bzero(xi, sizeof(*xi)); xi->xi_len = sizeof(struct xinpcb); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xi->xi_socket); bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo)); xi->inp_gencnt = inp->inp_gencnt; - xi->inp_ppcb = (uintptr_t)inp->inp_ppcb; xi->inp_flow = inp->inp_flow; xi->inp_flowid = inp->inp_flowid; xi->inp_flowtype = inp->inp_flowtype; xi->inp_flags = inp->inp_flags; xi->inp_flags2 = inp->inp_flags2; xi->in6p_cksum = inp->in6p_cksum; xi->in6p_hops = inp->in6p_hops; xi->inp_ip_tos = inp->inp_ip_tos; xi->inp_vflag = inp->inp_vflag; xi->inp_ip_ttl = inp->inp_ip_ttl; xi->inp_ip_p = inp->inp_ip_p; xi->inp_ip_minttl = inp->inp_ip_minttl; } int sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo, int (*ctloutput_set)(struct inpcb *, struct sockopt *)) { struct sockopt sopt; struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB); struct inpcb *inp; struct sockopt_parameters *params; struct socket *so; int error; char buf[1024]; if (req->oldptr != NULL || req->oldlen != 0) return (EINVAL); if (req->newptr == NULL) return (EPERM); if (req->newlen > sizeof(buf)) return (ENOMEM); error = SYSCTL_IN(req, buf, req->newlen); if (error != 0) return (error); if (req->newlen < sizeof(struct sockopt_parameters)) return (EINVAL); params = (struct sockopt_parameters *)buf; sopt.sopt_level = params->sop_level; sopt.sopt_name = params->sop_optname; sopt.sopt_dir = SOPT_SET; sopt.sopt_val = params->sop_optval; sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters); sopt.sopt_td = NULL; #ifdef INET6 if (params->sop_inc.inc_flags & INC_ISIPV6) { if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_laddr)) params->sop_inc.inc6_laddr.s6_addr16[1] = htons(params->sop_inc.inc6_zoneid & 0xffff); if (IN6_IS_SCOPE_LINKLOCAL(&params->sop_inc.inc6_faddr)) params->sop_inc.inc6_faddr.s6_addr16[1] = htons(params->sop_inc.inc6_zoneid & 0xffff); } #endif if (params->sop_inc.inc_lport != htons(0) && params->sop_inc.inc_fport != htons(0)) { #ifdef INET6 if (params->sop_inc.inc_flags & INC_ISIPV6) inpi.hash = INP6_PCBHASH( &params->sop_inc.inc6_faddr, params->sop_inc.inc_lport, params->sop_inc.inc_fport, pcbinfo->ipi_hashmask); else #endif inpi.hash = INP_PCBHASH( &params->sop_inc.inc_faddr, params->sop_inc.inc_lport, params->sop_inc.inc_fport, pcbinfo->ipi_hashmask); } while ((inp = inp_next(&inpi)) != NULL) if (inp->inp_gencnt == params->sop_id) { if (inp->inp_flags & INP_DROPPED) { INP_WUNLOCK(inp); return (ECONNRESET); } so = inp->inp_socket; KASSERT(so != NULL, ("inp_socket == NULL")); soref(so); if (params->sop_level == SOL_SOCKET) { INP_WUNLOCK(inp); error = sosetopt(so, &sopt); } else error = (*ctloutput_set)(inp, &sopt); sorele(so); break; } if (inp == NULL) error = ESRCH; return (error); } #ifdef DDB static void db_print_indent(int indent) { int i; for (i = 0; i < indent; i++) db_printf(" "); } static void db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) { char faddr_str[48], laddr_str[48]; db_print_indent(indent); db_printf("%s at %p\n", name, inc); indent += 2; #ifdef INET6 if (inc->inc_flags & INC_ISIPV6) { /* IPv6. */ ip6_sprintf(laddr_str, &inc->inc6_laddr); ip6_sprintf(faddr_str, &inc->inc6_faddr); } else #endif { /* IPv4.
inet_ntoa_r(inc->inc_laddr, laddr_str); inet_ntoa_r(inc->inc_faddr, faddr_str); } db_print_indent(indent); db_printf("inc_laddr %s inc_lport %u\n", laddr_str, ntohs(inc->inc_lport)); db_print_indent(indent); db_printf("inc_faddr %s inc_fport %u\n", faddr_str, ntohs(inc->inc_fport)); } static void db_print_inpflags(int inp_flags) { int comma; comma = 0; if (inp_flags & INP_RECVOPTS) { db_printf("%sINP_RECVOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVRETOPTS) { db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVDSTADDR) { db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ORIGDSTADDR) { db_printf("%sINP_ORIGDSTADDR", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HDRINCL) { db_printf("%sINP_HDRINCL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_HIGHPORT) { db_printf("%sINP_HIGHPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_LOWPORT) { db_printf("%sINP_LOWPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ANONPORT) { db_printf("%sINP_ANONPORT", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVIF) { db_printf("%sINP_RECVIF", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_MTUDISC) { db_printf("%sINP_MTUDISC", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTTL) { db_printf("%sINP_RECVTTL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DONTFRAG) { db_printf("%sINP_DONTFRAG", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_RECVTOS) { db_printf("%sINP_RECVTOS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_IPV6_V6ONLY) { db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_PKTINFO) { db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPLIMIT) { db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_HOPOPTS) { db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_DSTOPTS) { db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDR) { db_printf("%sIN6P_RTHDR", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RTHDRDSTOPTS) { db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_TCLASS) { db_printf("%sIN6P_TCLASS", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_AUTOFLOWLABEL) { db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_ONESBCAST) { db_printf("%sINP_ONESBCAST", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_DROPPED) { db_printf("%sINP_DROPPED", comma ? ", " : ""); comma = 1; } if (inp_flags & INP_SOCKREF) { db_printf("%sINP_SOCKREF", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_RFC2292) { db_printf("%sIN6P_RFC2292", comma ? ", " : ""); comma = 1; } if (inp_flags & IN6P_MTU) { db_printf("%sIN6P_MTU", comma ? ", " : ""); comma = 1; } } static void db_print_inpvflag(u_char inp_vflag) { int comma; comma = 0; if (inp_vflag & INP_IPV4) { db_printf("%sINP_IPV4", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6) { db_printf("%sINP_IPV6", comma ? ", " : ""); comma = 1; } if (inp_vflag & INP_IPV6PROTO) { db_printf("%sINP_IPV6PROTO", comma ? ", " : ""); comma = 1; } } static void db_print_inpcb(struct inpcb *inp, const char *name, int indent) { db_print_indent(indent); db_printf("%s at %p\n", name, inp); indent += 2; db_print_indent(indent); db_printf("inp_flow: 0x%x\n", inp->inp_flow); db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); - db_print_indent(indent); - db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n", - inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket); - db_print_indent(indent); db_printf("inp_label: %p inp_flags: 0x%x (", inp->inp_label, inp->inp_flags); db_print_inpflags(inp->inp_flags); db_printf(")\n"); db_print_indent(indent); db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, inp->inp_vflag); db_print_inpvflag(inp->inp_vflag); db_printf(")\n"); db_print_indent(indent); db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); db_print_indent(indent); #ifdef INET6 if (inp->inp_vflag & INP_IPV6) { db_printf("in6p_options: %p in6p_outputopts: %p " "in6p_moptions: %p\n", inp->in6p_options, inp->in6p_outputopts, inp->in6p_moptions); db_printf("in6p_icmp6filt: %p in6p_cksum %d " "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, inp->in6p_hops); } else #endif { db_printf("inp_ip_tos: %d inp_ip_options: %p " "inp_ip_moptions: %p\n", inp->inp_ip_tos, inp->inp_options, inp->inp_moptions); } db_print_indent(indent); db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, (uintmax_t)inp->inp_gencnt); } DB_SHOW_COMMAND(inpcb, db_show_inpcb) { struct inpcb *inp; if (!have_addr) { db_printf("usage: show inpcb <addr>\n"); return; } inp = (struct inpcb *)addr; db_print_inpcb(inp, "inpcb", 0); } #endif /* DDB */ #ifdef RATELIMIT /* * Modify TX rate limit based on the existing "inp->inp_snd_tag", * if any. */ int in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate) { union if_snd_tag_modify_params params = { .rate_limit.max_rate = max_pacing_rate, .rate_limit.flags = M_NOWAIT, }; struct m_snd_tag *mst; int error; mst = inp->inp_snd_tag; if (mst == NULL) return (EINVAL); if (mst->sw->snd_tag_modify == NULL) { error = EOPNOTSUPP; } else { error = mst->sw->snd_tag_modify(mst, &params); } return (error); } /* * Query existing TX rate limit based on the existing * "inp->inp_snd_tag", if any. */ int in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate) { union if_snd_tag_query_params params = { }; struct m_snd_tag *mst; int error; mst = inp->inp_snd_tag; if (mst == NULL) return (EINVAL); if (mst->sw->snd_tag_query == NULL) { error = EOPNOTSUPP; } else { error = mst->sw->snd_tag_query(mst, &params); if (error == 0 && p_max_pacing_rate != NULL) *p_max_pacing_rate = params.rate_limit.max_rate; } return (error); } /* * Query existing TX queue level based on the existing * "inp->inp_snd_tag", if any.
*/ int in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level) { union if_snd_tag_query_params params = { }; struct m_snd_tag *mst; int error; mst = inp->inp_snd_tag; if (mst == NULL) return (EINVAL); if (mst->sw->snd_tag_query == NULL) return (EOPNOTSUPP); error = mst->sw->snd_tag_query(mst, &params); if (error == 0 && p_txqueue_level != NULL) *p_txqueue_level = params.rate_limit.queue_level; return (error); } /* * Allocate a new TX rate limit send tag from the network interface * given by the "ifp" argument and save it in "inp->inp_snd_tag": */ int in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st) { union if_snd_tag_alloc_params params = { .rate_limit.hdr.type = (max_pacing_rate == -1U) ? IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT, .rate_limit.hdr.flowid = flowid, .rate_limit.hdr.flowtype = flowtype, .rate_limit.hdr.numa_domain = inp->inp_numa_domain, .rate_limit.max_rate = max_pacing_rate, .rate_limit.flags = M_NOWAIT, }; int error; INP_WLOCK_ASSERT(inp); /* * If there is already a send tag, or the INP is being torn * down, allocating a new send tag is not allowed. Else send * tags may leak. */ if (*st != NULL || (inp->inp_flags & INP_DROPPED) != 0) return (EINVAL); error = m_snd_tag_alloc(ifp, &params, st); #ifdef INET if (error == 0) { counter_u64_add(rate_limit_set_ok, 1); counter_u64_add(rate_limit_active, 1); } else if (error != EOPNOTSUPP) counter_u64_add(rate_limit_alloc_fail, 1); #endif return (error); } void in_pcbdetach_tag(struct m_snd_tag *mst) { m_snd_tag_rele(mst); #ifdef INET counter_u64_add(rate_limit_active, -1); #endif } /* * Free an existing TX rate limit tag based on the "inp->inp_snd_tag", * if any: */ void in_pcbdetach_txrtlmt(struct inpcb *inp) { struct m_snd_tag *mst; INP_WLOCK_ASSERT(inp); mst = inp->inp_snd_tag; inp->inp_snd_tag = NULL; if (mst == NULL) return; m_snd_tag_rele(mst); #ifdef INET counter_u64_add(rate_limit_active, -1); #endif } int in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate) { int error; /* * If the existing send tag is for the wrong interface due to * a route change, first drop the existing tag. Set the * CHANGED flag so that we will keep trying to allocate a new * tag if we fail to allocate one this time. */ if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) { in_pcbdetach_txrtlmt(inp); inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; } /* * NOTE: When attaching to a network interface a reference is * made to ensure the network interface doesn't go away until * all ratelimit connections are gone. The network interface * pointers compared below represent valid network interfaces, * except when comparing towards NULL.
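*
* Summary of the cases handled below, in order:
* 1. No pacing requested and no send tag allocated: nothing to do.
* 2. The interface lacks IFCAP_TXRTLMT: drop any stale tag.
* 3. No tag allocated yet: allocate one, but only once a valid RSS
*    hash is available (EAGAIN until then).
* 4. Otherwise: modify the rate of the existing tag.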
*/ if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { error = 0; } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { if (inp->inp_snd_tag != NULL) in_pcbdetach_txrtlmt(inp); error = 0; } else if (inp->inp_snd_tag == NULL) { /* * In order to utilize packet pacing with RSS, we need * to wait until there is a valid RSS hash before we * can proceed: */ if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { error = EAGAIN; } else { error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag); } } else { error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); } if (error == 0 || error == EOPNOTSUPP) inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; return (error); } /* * This function should be called when the INP_RATE_LIMIT_CHANGED flag * is set in the fast path and will attach/detach/modify the TX rate * limit send tag based on the socket's so_max_pacing_rate value. */ void in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) { struct socket *socket; uint32_t max_pacing_rate; bool did_upgrade; if (inp == NULL) return; socket = inp->inp_socket; if (socket == NULL) return; if (!INP_WLOCKED(inp)) { /* * NOTE: If the write locking fails, we need to bail * out and use the non-ratelimited ring for the * transmit until there is a new chance to get the * write lock. */ if (!INP_TRY_UPGRADE(inp)) return; did_upgrade = 1; } else { did_upgrade = 0; } /* * NOTE: The so_max_pacing_rate value is read unlocked, * because atomic updates are not required since the variable * is checked at every mbuf we send. It is assumed that the * variable read itself will be atomic. */ max_pacing_rate = socket->so_max_pacing_rate; in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate); if (did_upgrade) INP_DOWNGRADE(inp); } /* * Track route changes for TX rate limiting. */ void in_pcboutput_eagain(struct inpcb *inp) { bool did_upgrade; if (inp == NULL) return; if (inp->inp_snd_tag == NULL) return; if (!INP_WLOCKED(inp)) { /* * NOTE: If the write locking fails, we need to bail * out and use the non-ratelimited ring for the * transmit until there is a new chance to get the * write lock. */ if (!INP_TRY_UPGRADE(inp)) return; did_upgrade = 1; } else { did_upgrade = 0; } /* detach rate limiting */ in_pcbdetach_txrtlmt(inp); /* make sure new mbuf send tag allocation is made */ inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; if (did_upgrade) INP_DOWNGRADE(inp); } #ifdef INET static void rl_init(void *st) { rate_limit_new = counter_u64_alloc(M_WAITOK); rate_limit_chg = counter_u64_alloc(M_WAITOK); rate_limit_active = counter_u64_alloc(M_WAITOK); rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK); rate_limit_set_ok = counter_u64_alloc(M_WAITOK); } SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL); #endif #endif /* RATELIMIT */ diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index 7e9de02b0c7c..a4b4075b3501 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -1,738 +1,737 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. * Copyright (c) 2010-2011 Juniper Networks, Inc. * All rights reserved. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _NETINET_IN_PCB_H_ #define _NETINET_IN_PCB_H_ #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #include #include #include #endif #include /* * struct inpcb is the common protocol control block structure used in most * IP transport protocols. * * Pointers to local and foreign host table entries, local and foreign socket * numbers, and pointers up (to a socket structure) and down (to a * protocol-specific control block) are stored here. */ CK_LIST_HEAD(inpcbhead, inpcb); CK_LIST_HEAD(inpcbporthead, inpcbport); CK_LIST_HEAD(inpcblbgrouphead, inpcblbgroup); typedef uint64_t inp_gen_t; /* * PCB with AF_INET6 null bind'ed laddr can receive AF_INET input packet. * So, AF_INET6 null laddr is also used as AF_INET null laddr, by utilizing * the following structure. This requires padding always be zeroed out, * which is done right after inpcb allocation and stays through its lifetime. */ struct in_addr_4in6 { u_int32_t ia46_pad32[3]; struct in_addr ia46_addr4; }; union in_dependaddr { struct in_addr_4in6 id46_addr; struct in6_addr id6_addr; }; /* * NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553. in_conninfo has * some extra padding to accomplish this. * NOTE 2: tcp_syncache.c uses first 5 32-bit words, which identify fport, * lport, faddr to generate hash, so these fields shouldn't be moved. */ struct in_endpoints { u_int16_t ie_fport; /* foreign port */ u_int16_t ie_lport; /* local port */ /* protocol dependent part, local and foreign addr */ union in_dependaddr ie_dependfaddr; /* foreign host table entry */ union in_dependaddr ie_dependladdr; /* local host table entry */ #define ie_faddr ie_dependfaddr.id46_addr.ia46_addr4 #define ie_laddr ie_dependladdr.id46_addr.ia46_addr4 #define ie6_faddr ie_dependfaddr.id6_addr #define ie6_laddr ie_dependladdr.id6_addr u_int32_t ie6_zoneid; /* scope zone id */ }; /* * XXX The defines for inc_* are hacks and should be changed to direct * references. */ struct in_conninfo { u_int8_t inc_flags; u_int8_t inc_len; u_int16_t inc_fibnum; /* XXX was pad, 16 bits is plenty */ /* protocol dependent part */ struct in_endpoints inc_ie; }; /* * Flags for inc_flags. 
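* INC_ISIPV6 marks an in_conninfo as carrying IPv6 endpoints; for
* example, sysctl_setsockopt() in in_pcb.c keys its hash selection
* off this bit (sketch, arguments abbreviated):
*
*	if (params->sop_inc.inc_flags & INC_ISIPV6)
*		hash = INP6_PCBHASH(&params->sop_inc.inc6_faddr, ...);
*	else
*		hash = INP_PCBHASH(&params->sop_inc.inc_faddr, ...);
*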
*/ #define INC_ISIPV6 0x01 #define INC_IPV6MINMTU 0x02 #define inc_fport inc_ie.ie_fport #define inc_lport inc_ie.ie_lport #define inc_faddr inc_ie.ie_faddr #define inc_laddr inc_ie.ie_laddr #define inc6_faddr inc_ie.ie6_faddr #define inc6_laddr inc_ie.ie6_laddr #define inc6_zoneid inc_ie.ie6_zoneid #if defined(_KERNEL) || defined(_WANT_INPCB) /* * struct inpcb captures the network layer state for TCP, UDP, and raw IPv4 and * IPv6 sockets. In the case of TCP and UDP, further per-connection state is - * hung off of inp_ppcb most of the time. Almost all fields of struct inpcb - * are static after creation or protected by a per-inpcb rwlock, inp_lock. + * located in a larger protocol-specific structure that embeds the inpcb. + * Almost all fields of struct inpcb are static after creation or protected by + * a per-inpcb rwlock, inp_lock. * * An inpcb database is indexed by an addresses/ports hash as well as by a list * of all pcbs that belong to a certain proto. Database lookups or list * traversals are performed inside an SMR section. Once the desired PCB is * found, its own lock is obtained and the SMR section exited. * * Key: * (c) - Constant after initialization * (e) - Protected by the SMR section * (i) - Protected by the inpcb lock * (p) - Protected by the pcbinfo lock for the inpcb * (h) - Protected by the pcbhash lock for the inpcb * (s) - Protected by another subsystem's locks * (x) - Undefined locking * * A few other notes: * * When a read lock is held, stability of the field is guaranteed; to write * to a field, a write lock must generally be held. * * netinet/netinet6-layer code should not assume that the inp_socket pointer * is safe to dereference without inp_lock being held; there may be * close(2)-related races. * * The inp_vflag field is overloaded, and would otherwise ideally be (c). */ struct icmp6_filter; struct inpcbpolicy; struct m_snd_tag; struct inpcb { /* Cache line #1 (amd64) */ CK_LIST_ENTRY(inpcb) inp_hash_exact; /* hash table linkage */ CK_LIST_ENTRY(inpcb) inp_hash_wild; /* hash table linkage */ struct rwlock inp_lock; /* Cache line #2 (amd64) */ #define inp_start_zero inp_refcount #define inp_zero_size (sizeof(struct inpcb) - \ offsetof(struct inpcb, inp_start_zero)) u_int inp_refcount; /* (i) refcount */ int inp_flags; /* (i) generic IP/datagram flags */ int inp_flags2; /* (i) generic IP/datagram flags #2 */ uint8_t inp_numa_domain; /* numa domain */ - void *inp_ppcb; /* (i) pointer to per-protocol pcb */ struct socket *inp_socket; /* (i) back pointer to socket */ struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */ struct ucred *inp_cred; /* (c) cache of socket cred */ u_int32_t inp_flow; /* (i) IPv6 flow information */ u_char inp_vflag; /* (i) IP version flag (v4/v6) */ u_char inp_ip_ttl; /* (i) time to live proto */ u_char inp_ip_p; /* (c) protocol proto */ u_char inp_ip_minttl; /* (i) minimum TTL or drop */ uint32_t inp_flowid; /* (x) flow id / queue id */ smr_seq_t inp_smr; /* (i) sequence number at disconnect */ struct m_snd_tag *inp_snd_tag; /* (i) send tag for outgoing mbufs */ uint32_t inp_flowtype; /* (x) M_HASHTYPE value */ /* Local and foreign ports, local and foreign addr. */ struct in_conninfo inp_inc; /* (i,h) list for PCB's local port */ /* MAC and IPSEC policy information. */ struct label *inp_label; /* (i) MAC label */ struct inpcbpolicy *inp_sp; /* (s) for IPSEC */ /* Protocol-dependent part; options.
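* The anonymous IPv4 and IPv6 blocks below are both present in the
* structure; which one a consumer should look at is decided by
* inp_vflag, as in the DDB printer in in_pcb.c (sketch):
*
*	if (inp->inp_vflag & INP_IPV6)
*		... in6p_outputopts / in6p_moptions ...
*	else
*		... inp_options / inp_moptions ...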
*/ struct { u_char inp_ip_tos; /* (i) type of service proto */ struct mbuf *inp_options; /* (i) IP options */ struct ip_moptions *inp_moptions; /* (i) mcast options */ }; struct { /* (i) IP options */ struct mbuf *in6p_options; /* (i) IP6 options for outgoing packets */ struct ip6_pktopts *in6p_outputopts; /* (i) IP multicast options */ struct ip6_moptions *in6p_moptions; /* (i) ICMPv6 code type filter */ struct icmp6_filter *in6p_icmp6filt; /* (i) IPV6_CHECKSUM setsockopt */ int in6p_cksum; short in6p_hops; }; CK_LIST_ENTRY(inpcb) inp_portlist; /* (r:e/w:h) port list */ struct inpcbport *inp_phd; /* (r:e/w:h) head of this list */ inp_gen_t inp_gencnt; /* (c) generation count */ void *spare_ptr; /* Spare pointer. */ rt_gen_t inp_rt_cookie; /* generation for route entry */ union { /* cached L3 information */ struct route inp_route; struct route_in6 inp_route6; }; CK_LIST_ENTRY(inpcb) inp_list; /* (r:e/w:p) all PCBs for proto */ }; #endif /* _KERNEL */ #define inp_fport inp_inc.inc_fport #define inp_lport inp_inc.inc_lport #define inp_faddr inp_inc.inc_faddr #define inp_laddr inp_inc.inc_laddr #define in6p_faddr inp_inc.inc6_faddr #define in6p_laddr inp_inc.inc6_laddr #define in6p_zoneid inp_inc.inc6_zoneid #define inp_vnet inp_pcbinfo->ipi_vnet /* * The range of the generation count, as used in this implementation, is 9e19. * We would have to create 300 billion connections per second for this number * to roll over in a year. This seems sufficiently unlikely that we simply * don't concern ourselves with that possibility. */ /* * Interface exported to userland by various protocols which use inpcbs. Hack * alert -- only define if struct xsocket is in scope. * Fields prefixed with "xi_" are unique to this structure, and the rest * match fields in the struct inpcb, to ease coding and porting. * * Legend: * (s) - used by userland utilities in src * (p) - used by utilities in ports * (3) - is known to be used by third party software not in ports * (n) - no known usage */ #ifdef _SYS_SOCKETVAR_H_ struct xinpcb { ksize_t xi_len; /* length of this structure */ struct xsocket xi_socket; /* (s,p) */ struct in_conninfo inp_inc; /* (s,p) */ uint64_t inp_gencnt; /* (s,p) */ - kvaddr_t inp_ppcb; /* (s) netstat(1) */ - int64_t inp_spare64[4]; + int64_t inp_spare64[5]; uint32_t inp_flow; /* (s) */ uint32_t inp_flowid; /* (s) */ uint32_t inp_flowtype; /* (s) */ int32_t inp_flags; /* (s,p) */ int32_t inp_flags2; /* (s) */ uint32_t inp_unused; int32_t in6p_cksum; /* (n) */ int32_t inp_spare32[4]; uint16_t in6p_hops; /* (n) */ uint8_t inp_ip_tos; /* (n) */ int8_t pad8; uint8_t inp_vflag; /* (s,p) */ uint8_t inp_ip_ttl; /* (n) */ uint8_t inp_ip_p; /* (n) */ uint8_t inp_ip_minttl; /* (n) */ int8_t inp_spare8[4]; } __aligned(8); struct xinpgen { ksize_t xig_len; /* length of this structure */ u_int xig_count; /* number of PCBs at this time */ uint32_t _xig_spare32; inp_gen_t xig_gen; /* generation count at this time */ so_gen_t xig_sogen; /* socket generation count this time */ uint64_t _xig_spare64[4]; } __aligned(8); struct sockopt_parameters { struct in_conninfo sop_inc; uint64_t sop_id; int sop_level; int sop_optname; char sop_optval[]; }; #ifdef _KERNEL int sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo, int (*ctloutput_set)(struct inpcb *, struct sockopt *)); void in_pcbtoxinpcb(const struct inpcb *, struct xinpcb *); #endif #endif /* _SYS_SOCKETVAR_H_ */ #ifdef _KERNEL /* * Per-VNET pcb database for each high-level protocol (UDP, TCP, ...) in both * IPv4 and IPv6. 
* * The pcbs are protected with SMR section and thus all lists in inpcbinfo * are CK-lists. Locking is required to insert a pcb into database. Two * locks are provided: one for the hash and one for the global list of pcbs, * as well as overall count and generation count. * * Locking key: * * (c) Constant or nearly constant after initialisation * (e) Protected by SMR section * (g) Locked by ipi_lock * (h) Locked by ipi_hash_lock */ struct inpcbinfo { /* * Global lock protecting inpcb list modification */ struct mtx ipi_lock; struct inpcbhead ipi_listhead; /* (r:e/w:g) */ u_int ipi_count; /* (g) */ /* * Generation count -- incremented each time a connection is allocated * or freed. */ u_quad_t ipi_gencnt; /* (g) */ /* * Fields associated with port lookup and allocation. */ u_short ipi_lastport; /* (h) */ u_short ipi_lastlow; /* (h) */ u_short ipi_lasthi; /* (h) */ /* * UMA zone from which inpcbs are allocated for this protocol. */ uma_zone_t ipi_zone; /* (c) */ uma_zone_t ipi_portzone; /* (c) */ smr_t ipi_smr; /* (c) */ /* * Global hash of inpcbs, hashed by local and foreign addresses and * port numbers. The "exact" hash holds PCBs connected to a foreign * address, and "wild" holds the rest. */ struct mtx ipi_hash_lock; struct inpcbhead *ipi_hash_exact; /* (r:e/w:h) */ struct inpcbhead *ipi_hash_wild; /* (r:e/w:h) */ u_long ipi_hashmask; /* (c) */ /* * Global hash of inpcbs, hashed by only local port number. */ struct inpcbporthead *ipi_porthashbase; /* (h) */ u_long ipi_porthashmask; /* (h) */ /* * Load balance groups used for the SO_REUSEPORT_LB option, * hashed by local port. */ struct inpcblbgrouphead *ipi_lbgrouphashbase; /* (r:e/w:h) */ u_long ipi_lbgrouphashmask; /* (h) */ /* * Pointer to network stack instance */ struct vnet *ipi_vnet; /* (c) */ }; /* * Global allocation storage for each high-level protocol (UDP, TCP, ...). * Each corresponding per-VNET inpcbinfo points into this one. */ struct inpcbstorage { uma_zone_t ips_zone; uma_zone_t ips_portzone; uma_init ips_pcbinit; size_t ips_size; const char * ips_zone_name; const char * ips_portzone_name; const char * ips_infolock_name; const char * ips_hashlock_name; }; #define INPCBSTORAGE_DEFINE(prot, ppcb, lname, zname, iname, hname) \ static int \ prot##_inpcb_init(void *mem, int size __unused, int flags __unused) \ { \ struct inpcb *inp = mem; \ \ rw_init_flags(&inp->inp_lock, lname, RW_RECURSE | RW_DUPOK); \ return (0); \ } \ static struct inpcbstorage prot = { \ .ips_size = sizeof(struct ppcb), \ .ips_pcbinit = prot##_inpcb_init, \ .ips_zone_name = zname, \ .ips_portzone_name = zname " ports", \ .ips_infolock_name = iname, \ .ips_hashlock_name = hname, \ }; \ SYSINIT(prot##_inpcbstorage_init, SI_SUB_PROTO_DOMAIN, \ SI_ORDER_SECOND, in_pcbstorage_init, &prot); \ SYSUNINIT(prot##_inpcbstorage_uninit, SI_SUB_PROTO_DOMAIN, \ SI_ORDER_SECOND, in_pcbstorage_destroy, &prot) /* * Load balance groups used for the SO_REUSEPORT_LB socket option. Each group * (or unique address:port combination) can be re-used at most * INPCBLBGROUP_SIZMAX (256) times. The inpcbs are stored in il_inp which * is dynamically resized as processes bind/unbind to that specific group. 
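*
* Userland joins such a group with a per-socket option (sketch):
*
*	int one = 1;
*
*	setsockopt(s, SOL_SOCKET, SO_REUSEPORT_LB, &one, sizeof(one));
*	bind(s, (struct sockaddr *)&sin, sizeof(sin));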
*/ struct inpcblbgroup { CK_LIST_ENTRY(inpcblbgroup) il_list; struct epoch_context il_epoch_ctx; struct ucred *il_cred; uint16_t il_lport; /* (c) */ u_char il_vflag; /* (c) */ uint8_t il_numa_domain; uint32_t il_pad2; union in_dependaddr il_dependladdr; /* (c) */ #define il_laddr il_dependladdr.id46_addr.ia46_addr4 #define il6_laddr il_dependladdr.id6_addr uint32_t il_inpsiz; /* max count in il_inp[] (h) */ uint32_t il_inpcnt; /* cur count in il_inp[] (h) */ struct inpcb *il_inp[]; /* (h) */ }; #define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock) #define INP_RLOCK(inp) rw_rlock(&(inp)->inp_lock) #define INP_WLOCK(inp) rw_wlock(&(inp)->inp_lock) #define INP_TRY_RLOCK(inp) rw_try_rlock(&(inp)->inp_lock) #define INP_TRY_WLOCK(inp) rw_try_wlock(&(inp)->inp_lock) #define INP_RUNLOCK(inp) rw_runlock(&(inp)->inp_lock) #define INP_WUNLOCK(inp) rw_wunlock(&(inp)->inp_lock) #define INP_UNLOCK(inp) rw_unlock(&(inp)->inp_lock) #define INP_TRY_UPGRADE(inp) rw_try_upgrade(&(inp)->inp_lock) #define INP_DOWNGRADE(inp) rw_downgrade(&(inp)->inp_lock) #define INP_WLOCKED(inp) rw_wowned(&(inp)->inp_lock) #define INP_LOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_LOCKED) #define INP_RLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_RLOCKED) #define INP_WLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_WLOCKED) #define INP_UNLOCK_ASSERT(inp) rw_assert(&(inp)->inp_lock, RA_UNLOCKED) /* * These locking functions are for inpcb consumers outside of sys/netinet, * more specifically, they were added for the benefit of TOE drivers. The * macros are reserved for use by the stack. */ void inp_wlock(struct inpcb *); void inp_wunlock(struct inpcb *); void inp_rlock(struct inpcb *); void inp_runlock(struct inpcb *); #ifdef INVARIANT_SUPPORT void inp_lock_assert(struct inpcb *); void inp_unlock_assert(struct inpcb *); #else #define inp_lock_assert(inp) do {} while (0) #define inp_unlock_assert(inp) do {} while (0) #endif void inp_apply_all(struct inpcbinfo *, void (*func)(struct inpcb *, void *), void *arg); struct socket * inp_inpcbtosocket(struct inpcb *inp); void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, uint32_t *faddr, uint16_t *fp); #endif /* _KERNEL */ #define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_lock) #define INP_INFO_WLOCKED(ipi) mtx_owned(&(ipi)->ipi_lock) #define INP_INFO_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_lock) #define INP_INFO_LOCK_ASSERT(ipi) MPASS(SMR_ENTERED((ipi)->ipi_smr) || \ mtx_owned(&(ipi)->ipi_lock)) #define INP_INFO_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_lock, MA_OWNED) #define INP_INFO_WUNLOCK_ASSERT(ipi) \ mtx_assert(&(ipi)->ipi_lock, MA_NOTOWNED) #define INP_HASH_WLOCK(ipi) mtx_lock(&(ipi)->ipi_hash_lock) #define INP_HASH_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_hash_lock) #define INP_HASH_LOCK_ASSERT(ipi) MPASS(SMR_ENTERED((ipi)->ipi_smr) || \ mtx_owned(&(ipi)->ipi_hash_lock)) #define INP_HASH_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_hash_lock, \ MA_OWNED) /* * Wildcard matching hash is not just a microoptimisation! The hash for * wildcard IPv4 and wildcard IPv6 must be the same, otherwise AF_INET6 * wildcard bound pcb won't be able to receive AF_INET connections, while: * jenkins_hash(&zeroes, 1, s) != jenkins_hash(&zeroes, 4, s) * See also comment above struct in_addr_4in6. */ #define IN_ADDR_JHASH32(addr) \ ((addr)->s_addr == INADDR_ANY ? V_in_pcbhashseed : \ jenkins_hash32((&(addr)->s_addr), 1, V_in_pcbhashseed)) #define IN6_ADDR_JHASH32(addr) \ (memcmp((addr), &in6addr_any, sizeof(in6addr_any)) == 0 ? 
\ V_in_pcbhashseed : \ jenkins_hash32((addr)->__u6_addr.__u6_addr32, \ nitems((addr)->__u6_addr.__u6_addr32), V_in_pcbhashseed)) #define INP_PCBHASH(faddr, lport, fport, mask) \ ((IN_ADDR_JHASH32(faddr) ^ ntohs((lport) ^ (fport))) & (mask)) #define INP6_PCBHASH(faddr, lport, fport, mask) \ ((IN6_ADDR_JHASH32(faddr) ^ ntohs((lport) ^ (fport))) & (mask)) #define INP_PCBHASH_WILD(lport, mask) \ ((V_in_pcbhashseed ^ ntohs(lport)) & (mask)) #define INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) \ (IN_ADDR_JHASH32(faddr) ^ ntohs((lport) ^ (fport))) #define INP6_PCBLBGROUP_PKTHASH(faddr, lport, fport) \ (IN6_ADDR_JHASH32(faddr) ^ ntohs((lport) ^ (fport))) #define INP_PCBPORTHASH(lport, mask) (ntohs((lport)) & (mask)) /* * Flags for inp_vflags -- historically version flags only */ #define INP_IPV4 0x1 #define INP_IPV6 0x2 #define INP_IPV6PROTO 0x4 /* opened under IPv6 protocol */ /* * Flags for inp_flags. */ #define INP_RECVOPTS 0x00000001 /* receive incoming IP options */ #define INP_RECVRETOPTS 0x00000002 /* receive IP options for reply */ #define INP_RECVDSTADDR 0x00000004 /* receive IP dst address */ #define INP_HDRINCL 0x00000008 /* user supplies entire IP header */ #define INP_HIGHPORT 0x00000010 /* user wants "high" port binding */ #define INP_LOWPORT 0x00000020 /* user wants "low" port binding */ #define INP_ANONPORT 0x00000040 /* read by netstat(1) */ #define INP_RECVIF 0x00000080 /* receive incoming interface */ #define INP_MTUDISC 0x00000100 /* user can do MTU discovery */ /* INP_FREED 0x00000200 private to in_pcb.c */ #define INP_RECVTTL 0x00000400 /* receive incoming IP TTL */ #define INP_DONTFRAG 0x00000800 /* don't fragment packet */ #define INP_BINDANY 0x00001000 /* allow bind to any address */ #define INP_INHASHLIST 0x00002000 /* in_pcbinshash() has been called */ #define INP_RECVTOS 0x00004000 /* receive incoming IP TOS */ #define IN6P_IPV6_V6ONLY 0x00008000 /* restrict AF_INET6 socket for v6 */ #define IN6P_PKTINFO 0x00010000 /* receive IP6 dst and I/F */ #define IN6P_HOPLIMIT 0x00020000 /* receive hoplimit */ #define IN6P_HOPOPTS 0x00040000 /* receive hop-by-hop options */ #define IN6P_DSTOPTS 0x00080000 /* receive dst options after rthdr */ #define IN6P_RTHDR 0x00100000 /* receive routing header */ #define IN6P_RTHDRDSTOPTS 0x00200000 /* receive dstoptions before rthdr */ #define IN6P_TCLASS 0x00400000 /* receive traffic class value */ #define IN6P_AUTOFLOWLABEL 0x00800000 /* attach flowlabel automatically */ /* INP_INLBGROUP 0x01000000 private to in_pcb.c */ #define INP_ONESBCAST 0x02000000 /* send all-ones broadcast */ #define INP_DROPPED 0x04000000 /* protocol drop flag */ #define INP_SOCKREF 0x08000000 /* strong socket reference */ #define INP_RESERVED_0 0x10000000 /* reserved field */ #define INP_RESERVED_1 0x20000000 /* reserved field */ #define IN6P_RFC2292 0x40000000 /* used RFC2292 API on the socket */ #define IN6P_MTU 0x80000000 /* receive path MTU */ #define INP_CONTROLOPTS (INP_RECVOPTS|INP_RECVRETOPTS|INP_RECVDSTADDR|\ INP_RECVIF|INP_RECVTTL|INP_RECVTOS|\ IN6P_PKTINFO|IN6P_HOPLIMIT|IN6P_HOPOPTS|\ IN6P_DSTOPTS|IN6P_RTHDR|IN6P_RTHDRDSTOPTS|\ IN6P_TCLASS|IN6P_AUTOFLOWLABEL|IN6P_RFC2292|\ IN6P_MTU) /* * Flags for inp_flags2. 
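* The INP_2PCP_* definitions below carve a 3-bit Ethernet PCP value,
* plus a "set explicitly" bit, out of inp_flags2; the stored value is
* recovered with (sketch):
*
*	pcp = (inp->inp_flags2 & INP_2PCP_MASK) >> INP_2PCP_SHIFT;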
*/ /* 0x00000001 */ /* 0x00000002 */ /* 0x00000004 */ /* 0x00000008 */ /* 0x00000010 */ /* 0x00000020 */ /* 0x00000040 */ /* 0x00000080 */ #define INP_RECVFLOWID 0x00000100 /* populate recv datagram with flow info */ #define INP_RECVRSSBUCKETID 0x00000200 /* populate recv datagram with bucket id */ #define INP_RATE_LIMIT_CHANGED 0x00000400 /* rate limit needs attention */ #define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */ /* 0x00001000 */ /* 0x00002000 */ /* 0x00004000 */ /* 0x00008000 */ /* 0x00010000 */ #define INP_2PCP_SET 0x00020000 /* If the Eth PCP should be set explicitly */ #define INP_2PCP_BIT0 0x00040000 /* Eth PCP Bit 0 */ #define INP_2PCP_BIT1 0x00080000 /* Eth PCP Bit 1 */ #define INP_2PCP_BIT2 0x00100000 /* Eth PCP Bit 2 */ #define INP_2PCP_BASE INP_2PCP_BIT0 #define INP_2PCP_MASK (INP_2PCP_BIT0 | INP_2PCP_BIT1 | INP_2PCP_BIT2) #define INP_2PCP_SHIFT 18 /* shift PCP field in/out of inp_flags2 */ /* * Flags passed to in_pcblookup*(), inp_smr_lock() and inp_next(). */ typedef enum { INPLOOKUP_WILDCARD = 0x00000001, /* Allow wildcard sockets. */ INPLOOKUP_RLOCKPCB = 0x00000002, /* Return inpcb read-locked. */ INPLOOKUP_WLOCKPCB = 0x00000004, /* Return inpcb write-locked. */ } inp_lookup_t; #define INPLOOKUP_MASK (INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB | \ INPLOOKUP_WLOCKPCB) #define INPLOOKUP_LOCKMASK (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB) #define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb) #define INP_SOCKAF(so) so->so_proto->pr_domain->dom_family #define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af) #ifdef _KERNEL VNET_DECLARE(int, ipport_reservedhigh); VNET_DECLARE(int, ipport_reservedlow); VNET_DECLARE(int, ipport_lowfirstauto); VNET_DECLARE(int, ipport_lowlastauto); VNET_DECLARE(int, ipport_firstauto); VNET_DECLARE(int, ipport_lastauto); VNET_DECLARE(int, ipport_hifirstauto); VNET_DECLARE(int, ipport_hilastauto); VNET_DECLARE(int, ipport_randomized); #define V_ipport_reservedhigh VNET(ipport_reservedhigh) #define V_ipport_reservedlow VNET(ipport_reservedlow) #define V_ipport_lowfirstauto VNET(ipport_lowfirstauto) #define V_ipport_lowlastauto VNET(ipport_lowlastauto) #define V_ipport_firstauto VNET(ipport_firstauto) #define V_ipport_lastauto VNET(ipport_lastauto) #define V_ipport_hifirstauto VNET(ipport_hifirstauto) #define V_ipport_hilastauto VNET(ipport_hilastauto) #define V_ipport_randomized VNET(ipport_randomized) void in_pcbinfo_init(struct inpcbinfo *, struct inpcbstorage *, u_int, u_int); void in_pcbinfo_destroy(struct inpcbinfo *); void in_pcbstorage_init(void *); void in_pcbstorage_destroy(void *); void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *); int in_pcballoc(struct socket *, struct inpcbinfo *); int in_pcbbind(struct inpcb *, struct sockaddr_in *, struct ucred *); int in_pcbbind_setup(struct inpcb *, struct sockaddr_in *, in_addr_t *, u_short *, struct ucred *); int in_pcbconnect(struct inpcb *, struct sockaddr_in *, struct ucred *, bool); int in_pcbconnect_setup(struct inpcb *, struct sockaddr_in *, in_addr_t *, u_short *, in_addr_t *, u_short *, struct ucred *); void in_pcbdisconnect(struct inpcb *); void in_pcbdrop(struct inpcb *); void in_pcbfree(struct inpcb *); int in_pcbinshash(struct inpcb *); int in_pcbladdr(struct inpcb *, struct in_addr *, struct in_addr *, struct ucred *); int in_pcblbgroup_numa(struct inpcb *, int arg); struct inpcb * in_pcblookup(struct inpcbinfo *, struct in_addr, u_int, struct in_addr, u_int, int, struct ifnet *); struct inpcb * in_pcblookup_mbuf(struct inpcbinfo *, struct in_addr, u_int, struct 
in_addr, u_int, int, struct ifnet *, struct mbuf *); void in_pcbref(struct inpcb *); void in_pcbrehash(struct inpcb *); void in_pcbremhash_locked(struct inpcb *); bool in_pcbrele(struct inpcb *, inp_lookup_t); bool in_pcbrele_rlocked(struct inpcb *); bool in_pcbrele_wlocked(struct inpcb *); typedef bool inp_match_t(const struct inpcb *, void *); struct inpcb_iterator { const struct inpcbinfo *ipi; struct inpcb *inp; inp_match_t *match; void *ctx; int hash; #define INP_ALL_LIST -1 const inp_lookup_t lock; }; /* Note: sparse initializers guarantee .inp = NULL. */ #define INP_ITERATOR(_ipi, _lock, _match, _ctx) \ { \ .ipi = (_ipi), \ .lock = (_lock), \ .hash = INP_ALL_LIST, \ .match = (_match), \ .ctx = (_ctx), \ } #define INP_ALL_ITERATOR(_ipi, _lock) \ { \ .ipi = (_ipi), \ .lock = (_lock), \ .hash = INP_ALL_LIST, \ } struct inpcb *inp_next(struct inpcb_iterator *); void in_losing(struct inpcb *); void in_pcbsetsolabel(struct socket *so); int in_getpeeraddr(struct socket *, struct sockaddr *sa); int in_getsockaddr(struct socket *, struct sockaddr *sa); void in_pcbsosetlabel(struct socket *so); #ifdef RATELIMIT int in_pcboutput_txrtlmt_locked(struct inpcb *, struct ifnet *, struct mbuf *, uint32_t); int in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t, uint32_t, struct m_snd_tag **); void in_pcbdetach_txrtlmt(struct inpcb *); void in_pcbdetach_tag(struct m_snd_tag *); int in_pcbmodify_txrtlmt(struct inpcb *, uint32_t); int in_pcbquery_txrtlmt(struct inpcb *, uint32_t *); int in_pcbquery_txrlevel(struct inpcb *, uint32_t *); void in_pcboutput_txrtlmt(struct inpcb *, struct ifnet *, struct mbuf *); void in_pcboutput_eagain(struct inpcb *); #endif #endif /* _KERNEL */ #endif /* !_NETINET_IN_PCB_H_ */ diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index a6f84c297688..db335a890b28 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -1,4754 +1,4747 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_kern_tls.h" #include #include #include #include #include #ifdef TCP_HHOOK #include #endif #include #ifdef TCP_HHOOK #include #endif #ifdef KERN_TLS #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #include #include #endif #include #ifdef INVARIANTS #define TCPSTATES #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPPCAP #include #endif #ifdef TCP_OFFLOAD #include #endif #include #include #ifdef INET6 #include #endif #include #include #include #include #ifdef INET6 static ip6proto_ctlinput_t tcp6_ctlinput; static udp_tun_icmp_t tcp6_ctlinput_viaudp; #endif VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS; #ifdef INET6 VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS; #endif #ifdef TCP_SAD_DETECTION /* Sack attack detection thresholds and such */ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack_attack, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Sack Attack detection thresholds"); int32_t tcp_force_detection = 0; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, force_detection, CTLFLAG_RW, &tcp_force_detection, 0, "Do we force detection even if the INP has it off?"); int32_t tcp_sad_limit = 10000; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, limit, CTLFLAG_RW, &tcp_sad_limit, 10000, "If SaD is enabled, what is the limit to sendmap entries (0 = unlimited)?"); int32_t tcp_sack_to_ack_thresh = 700; /* 70 % */ SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sack_to_ack_thresh, CTLFLAG_RW, &tcp_sack_to_ack_thresh, 700, "Percentage of sacks to acks we must see above (10.1 percent is 101)?"); int32_t tcp_sack_to_move_thresh = 600; /* 60 % */ SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, move_thresh, CTLFLAG_RW, &tcp_sack_to_move_thresh, 600, "Percentage of sack moves we must see above (10.1 percent is 101)"); int32_t tcp_restoral_thresh = 450; /* 45 % (sack:2:ack -25%) (mv:ratio -15%) **/ SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, restore_thresh, CTLFLAG_RW, &tcp_restoral_thresh, 450, "Percentage of sack to ack percentage we must see below to restore(10.1 percent is 101)"); int32_t tcp_sad_decay_val = 800; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, decay_per, CTLFLAG_RW, &tcp_sad_decay_val, 800, "The decay percentage (10.1 percent equals 101 )"); int32_t tcp_map_minimum = 500; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, nummaps, CTLFLAG_RW, &tcp_map_minimum, 500, "Number of Map enteries before we start detection"); int32_t tcp_sad_pacing_interval = 2000; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sad_pacing_int, CTLFLAG_RW, &tcp_sad_pacing_interval, 2000, "What is the minimum pacing interval for a classified attacker?"); int32_t tcp_sad_low_pps = 100; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sad_low_pps, CTLFLAG_RW, &tcp_sad_low_pps, 100, "What is the input pps that below which we do not decay?"); #endif uint32_t tcp_ack_war_time_window = 1000; SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_timewindow, CTLFLAG_RW, &tcp_ack_war_time_window, 1000, "If the tcp_stack does ack-war prevention how many milliseconds are in its time window?"); uint32_t tcp_ack_war_cnt = 5; SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_cnt, CTLFLAG_RW, 
&tcp_ack_war_cnt, 5, "If the tcp_stack does ack-war prevention how many acks can be sent in its time window?"); struct rwlock tcp_function_lock; static int sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_mssdflt = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(tcp_mssdflt), 0, &sysctl_net_inet_tcp_mss_check, "I", "Default TCP Maximum Segment Size"); #ifdef INET6 static int sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_v6mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_v6mssdflt = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(tcp_v6mssdflt), 0, &sysctl_net_inet_tcp_mss_v6_check, "I", "Default TCP Maximum Segment Size for IPv6"); #endif /* INET6 */ /* * Minimum MSS we accept and use. This prevents DoS attacks where * we are forced to a ridiculous low MSS like 20 and send hundreds * of packets instead of one. The effect scales with the available * bandwidth and quickly saturates the CPU and network interface * with packet generation and sending. Set to zero to disable MINMSS * checking. This setting prevents us from sending too small packets. */ VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS; SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_minmss), 0, "Minimum TCP Maximum Segment Size"); VNET_DEFINE(int, tcp_do_rfc1323) = 1; SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc1323), 0, "Enable rfc1323 (high performance TCP) extensions"); /* * As of June 2021, several TCP stacks violate RFC 7323 from September 2014. * Some stacks negotiate TS, but never send them after connection setup. Some * stacks negotiate TS, but don't send them when sending keep-alive segments. * These include modern widely deployed TCP stacks. * Therefore tolerating violations for now... 
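* Strict RFC 7323 enforcement can be restored via the sysctl declared
* below, e.g.:
*	sysctl net.inet.tcp.tolerate_missing_ts=0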
*/ VNET_DEFINE(int, tcp_tolerate_missing_ts) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tolerate_missing_ts, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_tolerate_missing_ts), 0, "Tolerate missing TCP timestamps"); VNET_DEFINE(int, tcp_ts_offset_per_conn) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, ts_offset_per_conn, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ts_offset_per_conn), 0, "Initialize TCP timestamps per connection instead of per host pair"); /* How many connections are pacing */ static volatile uint32_t number_of_tcp_connections_pacing = 0; static uint32_t shadow_num_connections = 0; static counter_u64_t tcp_pacing_failures; static counter_u64_t tcp_dgp_failures; static uint32_t shadow_tcp_pacing_dgp = 0; static volatile uint32_t number_of_dgp_connections = 0; static int tcp_pacing_limit = 10000; SYSCTL_INT(_net_inet_tcp, OID_AUTO, pacing_limit, CTLFLAG_RW, &tcp_pacing_limit, 1000, "If the TCP stack does pacing, is there a limit (-1 = no, 0 = no pacing N = number of connections)"); static int tcp_dgp_limit = -1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, dgp_limit, CTLFLAG_RW, &tcp_dgp_limit, -1, "If the TCP stack does DGP, is there a limit (-1 = no, 0 = no dgp N = number of connections)"); SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pacing_count, CTLFLAG_RD, &shadow_num_connections, 0, "Number of TCP connections being paced"); SYSCTL_COUNTER_U64(_net_inet_tcp, OID_AUTO, pacing_failures, CTLFLAG_RD, &tcp_pacing_failures, "Number of times we failed to enable pacing to avoid exceeding the limit"); SYSCTL_COUNTER_U64(_net_inet_tcp, OID_AUTO, dgp_failures, CTLFLAG_RD, &tcp_dgp_failures, "Number of times we failed to enable dgp to avoid exceeding the limit"); static int tcp_log_debug = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); /* * Target size of TCP PCB hash tables. Must be a power of two. 
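* (A value of 0, the compiled-in default below, makes the kernel pick
* a size automatically at boot based on the socket limits.)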
* * Note that this can be overridden by the kernel environment * variable net.inet.tcp.tcbhashsize */ #ifndef TCBHASHSIZE #define TCBHASHSIZE 0 #endif static int tcp_tcbhashsize = TCBHASHSIZE; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); static int do_tcpdrain = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs"); VNET_DEFINE_STATIC(int, icmp_may_rst) = 1; #define V_icmp_may_rst VNET(icmp_may_rst) SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_may_rst), 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); VNET_DEFINE_STATIC(int, tcp_isn_reseed_interval) = 0; #define V_tcp_isn_reseed_interval VNET(tcp_isn_reseed_interval) SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_isn_reseed_interval), 0, "Seconds between reseeding of ISN secret"); static int tcp_soreceive_stream; SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN, &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets"); VNET_DEFINE(uma_zone_t, sack_hole_zone); #define V_sack_hole_zone VNET(sack_hole_zone) VNET_DEFINE(uint32_t, tcp_map_entries_limit) = 0; /* unlimited */ static int sysctl_net_inet_tcp_map_limit_check(SYSCTL_HANDLER_ARGS) { int error; uint32_t new; new = V_tcp_map_entries_limit; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { /* only allow "0" and value > minimum */ if (new > 0 && new < TCP_MIN_MAP_ENTRIES_LIMIT) error = EINVAL; else V_tcp_map_entries_limit = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, map_limit, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(tcp_map_entries_limit), 0, &sysctl_net_inet_tcp_map_limit_check, "IU", "Total sendmap entries limit"); VNET_DEFINE(uint32_t, tcp_map_split_limit) = 0; /* unlimited */ SYSCTL_UINT(_net_inet_tcp, OID_AUTO, split_limit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_map_split_limit), 0, "Total sendmap split entries limit"); #ifdef TCP_HHOOK VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]); #endif #define TS_OFFSET_SECRET_LENGTH SIPHASH_KEY_LENGTH VNET_DEFINE_STATIC(u_char, ts_offset_secret[TS_OFFSET_SECRET_LENGTH]); #define V_ts_offset_secret VNET(ts_offset_secret) static int tcp_default_fb_init(struct tcpcb *tp, void **ptr); static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged); static int tcp_default_handoff_ok(struct tcpcb *tp); static struct inpcb *tcp_notify(struct inpcb *, int); static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int); static struct inpcb *tcp_mtudisc(struct inpcb *, int); static struct inpcb *tcp_drop_syn_sent(struct inpcb *, int); static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr, const void *ip6hdr); static void tcp_default_switch_failed(struct tcpcb *tp); static ipproto_ctlinput_t tcp_ctlinput; static udp_tun_icmp_t tcp_ctlinput_viaudp; static struct tcp_function_block tcp_def_funcblk = { .tfb_tcp_block_name = "freebsd", .tfb_tcp_output = tcp_default_output, .tfb_tcp_do_segment = tcp_do_segment, .tfb_tcp_ctloutput = tcp_default_ctloutput, .tfb_tcp_handoff_ok = tcp_default_handoff_ok, .tfb_tcp_fb_init = tcp_default_fb_init, .tfb_tcp_fb_fini = 
tcp_default_fb_fini, .tfb_switch_failed = tcp_default_switch_failed, }; static int tcp_fb_cnt = 0; struct tcp_funchead t_functions; VNET_DEFINE_STATIC(struct tcp_function_block *, tcp_func_set_ptr) = &tcp_def_funcblk; #define V_tcp_func_set_ptr VNET(tcp_func_set_ptr) void tcp_record_dsack(struct tcpcb *tp, tcp_seq start, tcp_seq end, int tlp) { TCPSTAT_INC(tcps_dsack_count); tp->t_dsack_pack++; if (tlp == 0) { if (SEQ_GT(end, start)) { tp->t_dsack_bytes += (end - start); TCPSTAT_ADD(tcps_dsack_bytes, (end - start)); } else { tp->t_dsack_tlp_bytes += (start - end); TCPSTAT_ADD(tcps_dsack_bytes, (start - end)); } } else { if (SEQ_GT(end, start)) { tp->t_dsack_bytes += (end - start); TCPSTAT_ADD(tcps_dsack_tlp_bytes, (end - start)); } else { tp->t_dsack_tlp_bytes += (start - end); TCPSTAT_ADD(tcps_dsack_tlp_bytes, (start - end)); } } } static struct tcp_function_block * find_tcp_functions_locked(struct tcp_function_set *fs) { struct tcp_function *f; struct tcp_function_block *blk=NULL; TAILQ_FOREACH(f, &t_functions, tf_next) { if (strcmp(f->tf_name, fs->function_set_name) == 0) { blk = f->tf_fb; break; } } return(blk); } static struct tcp_function_block * find_tcp_fb_locked(struct tcp_function_block *blk, struct tcp_function **s) { struct tcp_function_block *rblk=NULL; struct tcp_function *f; TAILQ_FOREACH(f, &t_functions, tf_next) { if (f->tf_fb == blk) { rblk = blk; if (s) { *s = f; } break; } } return (rblk); } struct tcp_function_block * find_and_ref_tcp_functions(struct tcp_function_set *fs) { struct tcp_function_block *blk; rw_rlock(&tcp_function_lock); blk = find_tcp_functions_locked(fs); if (blk) refcount_acquire(&blk->tfb_refcnt); rw_runlock(&tcp_function_lock); return(blk); } struct tcp_function_block * find_and_ref_tcp_fb(struct tcp_function_block *blk) { struct tcp_function_block *rblk; rw_rlock(&tcp_function_lock); rblk = find_tcp_fb_locked(blk, NULL); if (rblk) refcount_acquire(&rblk->tfb_refcnt); rw_runlock(&tcp_function_lock); return(rblk); } /* Find a matching alias for the given tcp_function_block. */ int find_tcp_function_alias(struct tcp_function_block *blk, struct tcp_function_set *fs) { struct tcp_function *f; int found; found = 0; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { if ((f->tf_fb == blk) && (strncmp(f->tf_name, blk->tfb_tcp_block_name, TCP_FUNCTION_NAME_LEN_MAX) != 0)) { /* Matching function block with different name. */ strncpy(fs->function_set_name, f->tf_name, TCP_FUNCTION_NAME_LEN_MAX); found = 1; break; } } /* Null terminate the string appropriately. */ if (found) { fs->function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; } else { fs->function_set_name[0] = '\0'; } rw_runlock(&tcp_function_lock); return (found); } static struct tcp_function_block * find_and_ref_tcp_default_fb(void) { struct tcp_function_block *rblk; rw_rlock(&tcp_function_lock); rblk = V_tcp_func_set_ptr; refcount_acquire(&rblk->tfb_refcnt); rw_runlock(&tcp_function_lock); return (rblk); } void tcp_switch_back_to_default(struct tcpcb *tp) { struct tcp_function_block *tfb; void *ptr = NULL; KASSERT(tp->t_fb != &tcp_def_funcblk, ("%s: called by the built-in default stack", __func__)); if (tp->t_fb->tfb_tcp_timer_stop_all != NULL) tp->t_fb->tfb_tcp_timer_stop_all(tp); /* * Now, we'll find a new function block to use. * Start by trying the current user-selected * default, unless this stack is the user-selected * default. 
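*
* The overall fallback order implemented below is:
* 1. the user-selected default stack, provided it is a different
*    stack, accepts the handoff, and initializes successfully;
* 2. the built-in default stack, which may never refuse or fail
*    (enforced by the panics below).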
*/ tfb = find_and_ref_tcp_default_fb(); if (tfb == tp->t_fb) { refcount_release(&tfb->tfb_refcnt); tfb = NULL; } /* Does the stack accept this connection? */ if (tfb != NULL && tfb->tfb_tcp_handoff_ok != NULL && (*tfb->tfb_tcp_handoff_ok)(tp)) { refcount_release(&tfb->tfb_refcnt); tfb = NULL; } /* Try to use that stack. */ if (tfb != NULL) { /* Initialize the new stack. If it succeeds, we are done. */ if (tfb->tfb_tcp_fb_init == NULL || (*tfb->tfb_tcp_fb_init)(tp, &ptr) == 0) { /* Release the old stack */ if (tp->t_fb->tfb_tcp_fb_fini != NULL) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); refcount_release(&tp->t_fb->tfb_refcnt); /* Now set in all the pointers */ tp->t_fb = tfb; tp->t_fb_ptr = ptr; return; } /* * Initialization failed. Release the reference count on * the looked up default stack. */ refcount_release(&tfb->tfb_refcnt); } /* * If that wasn't feasible, use the built-in default * stack which is not allowed to reject anyone. */ tfb = find_and_ref_tcp_fb(&tcp_def_funcblk); if (tfb == NULL) { /* there always should be a default */ panic("Can't refer to tcp_def_funcblk"); } if (tfb->tfb_tcp_handoff_ok != NULL) { if ((*tfb->tfb_tcp_handoff_ok) (tp)) { /* The default stack cannot say no */ panic("Default stack rejects a new session?"); } } if (tfb->tfb_tcp_fb_init != NULL && (*tfb->tfb_tcp_fb_init)(tp, &ptr)) { /* The default stack cannot fail */ panic("Default stack initialization failed"); } /* Now release the old stack */ if (tp->t_fb->tfb_tcp_fb_fini != NULL) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); refcount_release(&tp->t_fb->tfb_refcnt); /* And set in the pointers to the new */ tp->t_fb = tfb; tp->t_fb_ptr = ptr; } static bool tcp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *inp, const struct sockaddr *sa, void *ctx) { struct ip *iph; #ifdef INET6 struct ip6_hdr *ip6; #endif struct udphdr *uh; struct tcphdr *th; int thlen; uint16_t port; TCPSTAT_INC(tcps_tunneled_pkts); if ((m->m_flags & M_PKTHDR) == 0) { /* Can't handle one that is not a pkt hdr */ TCPSTAT_INC(tcps_tunneled_errs); goto out; } thlen = sizeof(struct tcphdr); if (m->m_len < off + sizeof(struct udphdr) + thlen && (m = m_pullup(m, off + sizeof(struct udphdr) + thlen)) == NULL) { TCPSTAT_INC(tcps_tunneled_errs); goto out; } iph = mtod(m, struct ip *); uh = (struct udphdr *)((caddr_t)iph + off); th = (struct tcphdr *)(uh + 1); thlen = th->th_off << 2; if (m->m_len < off + sizeof(struct udphdr) + thlen) { m = m_pullup(m, off + sizeof(struct udphdr) + thlen); if (m == NULL) { TCPSTAT_INC(tcps_tunneled_errs); goto out; } else { iph = mtod(m, struct ip *); uh = (struct udphdr *)((caddr_t)iph + off); th = (struct tcphdr *)(uh + 1); } } m->m_pkthdr.tcp_tun_port = port = uh->uh_sport; bcopy(th, uh, m->m_len - off); m->m_len -= sizeof(struct udphdr); m->m_pkthdr.len -= sizeof(struct udphdr); /* * We use the same algorithm for * both UDP and TCP for c-sum. So * the code in tcp_input will skip * the checksum. So we do nothing * with the flag (m->m_pkthdr.csum_flags). 
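* At this point the UDP header has been overwritten by the start of
* the TCP header (bcopy() above) and m_len/m_pkthdr.len shrunk by
* sizeof(struct udphdr); the switch below also shrinks the IP/IPv6
* payload length, then re-injects the segment into the normal TCP
* input path, tagged with the UDP source port so replies can be
* tunneled back.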
*/ switch (iph->ip_v) { #ifdef INET case IPVERSION: iph->ip_len = htons(ntohs(iph->ip_len) - sizeof(struct udphdr)); tcp_input_with_port(&m, &off, IPPROTO_TCP, port); break; #endif #ifdef INET6 case IPV6_VERSION >> 4: ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - sizeof(struct udphdr)); tcp6_input_with_port(&m, &off, IPPROTO_TCP, port); break; #endif default: goto out; break; } return (true); out: m_freem(m); return (true); } static int sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS) { int error=ENOENT; struct tcp_function_set fs; struct tcp_function_block *blk; memset(&fs, 0, sizeof(fs)); rw_rlock(&tcp_function_lock); blk = find_tcp_fb_locked(V_tcp_func_set_ptr, NULL); if (blk) { /* Found him */ strcpy(fs.function_set_name, blk->tfb_tcp_block_name); fs.pcbcnt = blk->tfb_refcnt; } rw_runlock(&tcp_function_lock); error = sysctl_handle_string(oidp, fs.function_set_name, sizeof(fs.function_set_name), req); /* Check for error or no change */ if (error != 0 || req->newptr == NULL) return(error); rw_wlock(&tcp_function_lock); blk = find_tcp_functions_locked(&fs); if ((blk == NULL) || (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) { error = ENOENT; goto done; } V_tcp_func_set_ptr = blk; done: rw_wunlock(&tcp_function_lock); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_default, CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_net_inet_default_tcp_functions, "A", "Set/get the default TCP functions"); static int sysctl_net_inet_list_available(SYSCTL_HANDLER_ARGS) { int error, cnt, linesz; struct tcp_function *f; char *buffer, *cp; size_t bufsz, outsz; bool alias; cnt = 0; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { cnt++; } rw_runlock(&tcp_function_lock); bufsz = (cnt+2) * ((TCP_FUNCTION_NAME_LEN_MAX * 2) + 13) + 1; buffer = malloc(bufsz, M_TEMP, M_WAITOK); error = 0; cp = buffer; linesz = snprintf(cp, bufsz, "\n%-32s%c %-32s %s\n", "Stack", 'D', "Alias", "PCB count"); cp += linesz; bufsz -= linesz; outsz = linesz; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { alias = (f->tf_name != f->tf_fb->tfb_tcp_block_name); linesz = snprintf(cp, bufsz, "%-32s%c %-32s %u\n", f->tf_fb->tfb_tcp_block_name, (f->tf_fb == V_tcp_func_set_ptr) ? '*' : ' ', alias ? 
f->tf_name : "-", f->tf_fb->tfb_refcnt); if (linesz >= bufsz) { error = EOVERFLOW; break; } cp += linesz; bufsz -= linesz; outsz += linesz; } rw_runlock(&tcp_function_lock); if (error == 0) error = sysctl_handle_string(oidp, buffer, outsz + 1, req); free(buffer, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available, CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_net_inet_list_available, "A", "list available TCP Function sets"); VNET_DEFINE(int, tcp_udp_tunneling_port) = TCP_TUNNELING_PORT_DEFAULT; #ifdef INET VNET_DEFINE(struct socket *, udp4_tun_socket) = NULL; #define V_udp4_tun_socket VNET(udp4_tun_socket) #endif #ifdef INET6 VNET_DEFINE(struct socket *, udp6_tun_socket) = NULL; #define V_udp6_tun_socket VNET(udp6_tun_socket) #endif static struct sx tcpoudp_lock; static void tcp_over_udp_stop(void) { sx_assert(&tcpoudp_lock, SA_XLOCKED); #ifdef INET if (V_udp4_tun_socket != NULL) { soclose(V_udp4_tun_socket); V_udp4_tun_socket = NULL; } #endif #ifdef INET6 if (V_udp6_tun_socket != NULL) { soclose(V_udp6_tun_socket); V_udp6_tun_socket = NULL; } #endif } static int tcp_over_udp_start(void) { uint16_t port; int ret; #ifdef INET struct sockaddr_in sin; #endif #ifdef INET6 struct sockaddr_in6 sin6; #endif sx_assert(&tcpoudp_lock, SA_XLOCKED); port = V_tcp_udp_tunneling_port; if (ntohs(port) == 0) { /* Must have a port set */ return (EINVAL); } #ifdef INET if (V_udp4_tun_socket != NULL) { /* Already running -- must stop first */ return (EALREADY); } #endif #ifdef INET6 if (V_udp6_tun_socket != NULL) { /* Already running -- must stop first */ return (EALREADY); } #endif #ifdef INET if ((ret = socreate(PF_INET, &V_udp4_tun_socket, SOCK_DGRAM, IPPROTO_UDP, curthread->td_ucred, curthread))) { tcp_over_udp_stop(); return (ret); } /* Call the special UDP hook. */ if ((ret = udp_set_kernel_tunneling(V_udp4_tun_socket, tcp_recv_udp_tunneled_packet, tcp_ctlinput_viaudp, NULL))) { tcp_over_udp_stop(); return (ret); } /* Ok, we have a socket, bind it to the port. */ memset(&sin, 0, sizeof(struct sockaddr_in)); sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_port = htons(port); if ((ret = sobind(V_udp4_tun_socket, (struct sockaddr *)&sin, curthread))) { tcp_over_udp_stop(); return (ret); } #endif #ifdef INET6 if ((ret = socreate(PF_INET6, &V_udp6_tun_socket, SOCK_DGRAM, IPPROTO_UDP, curthread->td_ucred, curthread))) { tcp_over_udp_stop(); return (ret); } /* Call the special UDP hook. */ if ((ret = udp_set_kernel_tunneling(V_udp6_tun_socket, tcp_recv_udp_tunneled_packet, tcp6_ctlinput_viaudp, NULL))) { tcp_over_udp_stop(); return (ret); } /* Ok, we have a socket, bind it to the port. 
*/ memset(&sin6, 0, sizeof(struct sockaddr_in6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_port = htons(port); if ((ret = sobind(V_udp6_tun_socket, (struct sockaddr *)&sin6, curthread))) { tcp_over_udp_stop(); return (ret); } #endif return (0); } static int sysctl_net_inet_tcp_udp_tunneling_port_check(SYSCTL_HANDLER_ARGS) { int error; uint32_t old, new; old = V_tcp_udp_tunneling_port; new = old; error = sysctl_handle_int(oidp, &new, 0, req); if ((error == 0) && (req->newptr != NULL)) { if ((new < TCP_TUNNELING_PORT_MIN) || (new > TCP_TUNNELING_PORT_MAX)) { error = EINVAL; } else { sx_xlock(&tcpoudp_lock); V_tcp_udp_tunneling_port = new; if (old != 0) { tcp_over_udp_stop(); } if (new != 0) { error = tcp_over_udp_start(); if (error != 0) { V_tcp_udp_tunneling_port = 0; } } sx_xunlock(&tcpoudp_lock); } } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_port, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(tcp_udp_tunneling_port), 0, &sysctl_net_inet_tcp_udp_tunneling_port_check, "IU", "Tunneling port for tcp over udp"); VNET_DEFINE(int, tcp_udp_tunneling_overhead) = TCP_TUNNELING_OVERHEAD_DEFAULT; static int sysctl_net_inet_tcp_udp_tunneling_overhead_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_udp_tunneling_overhead; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if ((new < TCP_TUNNELING_OVERHEAD_MIN) || (new > TCP_TUNNELING_OVERHEAD_MAX)) error = EINVAL; else V_tcp_udp_tunneling_overhead = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_overhead, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(tcp_udp_tunneling_overhead), 0, &sysctl_net_inet_tcp_udp_tunneling_overhead_check, "IU", "MSS reduction when using tcp over udp"); /* * Exports one (struct tcp_function_info) for each alias/name. */ static int sysctl_net_inet_list_func_info(SYSCTL_HANDLER_ARGS) { int cnt, error; struct tcp_function *f; struct tcp_function_info tfi; /* * We don't allow writes. */ if (req->newptr != NULL) return (EINVAL); /* * Wire the old buffer so we can directly copy the functions to * user space without dropping the lock. */ if (req->oldptr != NULL) { error = sysctl_wire_old_buffer(req, 0); if (error) return (error); } /* * Walk the list and copy out matching entries. If INVARIANTS * is compiled in, also walk the list to verify the length of * the list matches what we have recorded. */ rw_rlock(&tcp_function_lock); cnt = 0; #ifndef INVARIANTS if (req->oldptr == NULL) { cnt = tcp_fb_cnt; goto skip_loop; } #endif TAILQ_FOREACH(f, &t_functions, tf_next) { #ifdef INVARIANTS cnt++; #endif if (req->oldptr != NULL) { bzero(&tfi, sizeof(tfi)); tfi.tfi_refcnt = f->tf_fb->tfb_refcnt; tfi.tfi_id = f->tf_fb->tfb_id; (void)strlcpy(tfi.tfi_alias, f->tf_name, sizeof(tfi.tfi_alias)); (void)strlcpy(tfi.tfi_name, f->tf_fb->tfb_tcp_block_name, sizeof(tfi.tfi_name)); error = SYSCTL_OUT(req, &tfi, sizeof(tfi)); /* * Don't stop on error, as that is the * mechanism we use to accumulate length * information if the buffer was too short. 
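 *
 * (Editor's sketch, not in the original: the intended userland pattern,
 * with hypothetical buffer names, is to call the sysctl twice, once to
 * learn the size and once to fetch the records:
 *
 *	size_t len = 0;
 *	sysctlbyname("net.inet.tcp.function_info", NULL, &len, NULL, 0);
 *	buf = malloc(len);
 *	sysctlbyname("net.inet.tcp.function_info", buf, &len, NULL, 0);
 * )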
	 */
		}
	}
	KASSERT(cnt == tcp_fb_cnt, ("%s: cnt (%d) != tcp_fb_cnt (%d)",
	    __func__, cnt, tcp_fb_cnt));
#ifndef INVARIANTS
skip_loop:
#endif
	rw_runlock(&tcp_function_lock);
	if (req->oldptr == NULL)
		error = SYSCTL_OUT(req, NULL,
		    (cnt + 1) * sizeof(struct tcp_function_info));

	return (error);
}

SYSCTL_PROC(_net_inet_tcp, OID_AUTO, function_info,
    CTLTYPE_OPAQUE | CTLFLAG_SKIP | CTLFLAG_RD | CTLFLAG_MPSAFE,
    NULL, 0, sysctl_net_inet_list_func_info, "S,tcp_function_info",
    "List TCP function block name-to-ID mappings");

/*
 * tfb_tcp_handoff_ok() function for the default stack.
 * Note that we'll basically try to take all comers.
 */
static int
tcp_default_handoff_ok(struct tcpcb *tp)
{
	return (0);
}

/*
 * tfb_tcp_fb_init() function for the default stack.
 *
 * This handles making sure we have appropriate timers set if you are
 * transitioning a socket that has some amount of setup done.
 *
 * The init() function from the default can *never* return non-zero,
 * i.e. it is required to always succeed since it is the stack of last
 * resort!
 */
static int
tcp_default_fb_init(struct tcpcb *tp, void **ptr)
{
	struct socket *so = tptosocket(tp);
	int rexmt;

	INP_WLOCK_ASSERT(tptoinpcb(tp));
	/* We don't use the pointer */
	*ptr = NULL;

	KASSERT(tp->t_state >= 0 && tp->t_state < TCPS_TIME_WAIT,
	    ("%s: connection %p in unexpected state %d", __func__, tp,
	    tp->t_state));

	/* Make sure we get no interesting mbuf queuing behavior */
	/* All mbuf queue/ack compress flags should be off */
	tcp_lro_features_off(tp);

	/* Cancel the GP measurement in progress */
	tp->t_flags &= ~TF_GPUTINPROG;
	/* Validate the timers are not in usec, if they are convert */
	tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS);
	if ((tp->t_state == TCPS_SYN_SENT) ||
	    (tp->t_state == TCPS_SYN_RECEIVED))
		rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift];
	else
		rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
	if (tp->t_rxtshift == 0)
		tp->t_rxtcur = rexmt;
	else
		TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin,
		    TCPTV_REXMTMAX);

	/*
	 * Nothing to do for ESTABLISHED or LISTEN states. And, we don't
	 * know what to do for unexpected states (which includes TIME_WAIT).
	 */
	if (tp->t_state <= TCPS_LISTEN || tp->t_state >= TCPS_TIME_WAIT)
		return (0);

	/*
	 * Make sure some kind of transmission timer is set if there is
	 * outstanding data.
	 */
	if ((!TCPS_HAVEESTABLISHED(tp->t_state) || sbavail(&so->so_snd) ||
	    tp->snd_una != tp->snd_max) && !(tcp_timer_active(tp, TT_REXMT) ||
	    tcp_timer_active(tp, TT_PERSIST))) {
		/*
		 * If the session is established and it looks like it should
		 * be in the persist state, set the persist timer. Otherwise,
		 * set the retransmit timer.
		 */
		if (TCPS_HAVEESTABLISHED(tp->t_state) && tp->snd_wnd == 0 &&
		    (int32_t)(tp->snd_nxt - tp->snd_una) <
		    (int32_t)sbavail(&so->so_snd))
			tcp_setpersist(tp);
		else
			tcp_timer_activate(tp, TT_REXMT, TP_RXTCUR(tp));
	}

	/* All non-embryonic sessions get a keepalive timer. */
	if (!tcp_timer_active(tp, TT_KEEP))
		tcp_timer_activate(tp, TT_KEEP,
		    TCPS_HAVEESTABLISHED(tp->t_state) ? TP_KEEPIDLE(tp) :
		    TP_KEEPINIT(tp));

	/*
	 * Make sure critical variables are initialized
	 * if transitioning while in Recovery.
	 */
	if (IN_FASTRECOVERY(tp->t_flags)) {
		if (tp->sackhint.recover_fs == 0)
			tp->sackhint.recover_fs = max(1,
			    tp->snd_nxt - tp->snd_una);
	}

	return (0);
}

/*
 * tfb_tcp_fb_fini() function for the default stack.
 *
 * This changes state as necessary (or prudent) to prepare for another stack
 * to assume responsibility for the connection.
*/ static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged) { INP_WLOCK_ASSERT(tptoinpcb(tp)); #ifdef TCP_BLACKBOX tcp_log_flowend(tp); #endif tp->t_acktime = 0; return; } MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); MALLOC_DEFINE(M_TCPFUNCTIONS, "tcpfunc", "TCP function set memory"); static struct mtx isn_mtx; #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) #define ISN_LOCK() mtx_lock(&isn_mtx) #define ISN_UNLOCK() mtx_unlock(&isn_mtx) INPCBSTORAGE_DEFINE(tcpcbstor, tcpcb, "tcpinp", "tcp_inpcb", "tcp", "tcphash"); /* * Take a value and get the next power of 2 that doesn't overflow. * Used to size the tcp_inpcb hash buckets. */ static int maketcp_hashsize(int size) { int hashsize; /* * auto tune. * get the next power of 2 higher than maxsockets. */ hashsize = 1 << fls(size); /* catch overflow, and just go one power of 2 smaller */ if (hashsize < size) { hashsize = 1 << (fls(size) - 1); } return (hashsize); } static volatile int next_tcp_stack_id = 1; /* * Register a TCP function block with the name provided in the names * array. (Note that this function does NOT automatically register * blk->tfb_tcp_block_name as a stack name. Therefore, you should * explicitly include blk->tfb_tcp_block_name in the list of names if * you wish to register the stack with that name.) * * Either all name registrations will succeed or all will fail. If * a name registration fails, the function will update the num_names * argument to point to the array index of the name that encountered * the failure. * * Returns 0 on success, or an error code on failure. */ int register_tcp_functions_as_names(struct tcp_function_block *blk, int wait, const char *names[], int *num_names) { struct tcp_function *n; struct tcp_function_set fs; int error, i; KASSERT(names != NULL && *num_names > 0, ("%s: Called with 0-length name list", __func__)); KASSERT(names != NULL, ("%s: Called with NULL name list", __func__)); KASSERT(rw_initialized(&tcp_function_lock), ("%s: called too early", __func__)); if ((blk->tfb_tcp_output == NULL) || (blk->tfb_tcp_do_segment == NULL) || (blk->tfb_tcp_ctloutput == NULL) || (strlen(blk->tfb_tcp_block_name) == 0)) { /* * These functions are required and you * need a name. */ *num_names = 0; return (EINVAL); } if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) { *num_names = 0; return (EINVAL); } refcount_init(&blk->tfb_refcnt, 0); blk->tfb_id = atomic_fetchadd_int(&next_tcp_stack_id, 1); for (i = 0; i < *num_names; i++) { n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait); if (n == NULL) { error = ENOMEM; goto cleanup; } n->tf_fb = blk; (void)strlcpy(fs.function_set_name, names[i], sizeof(fs.function_set_name)); rw_wlock(&tcp_function_lock); if (find_tcp_functions_locked(&fs) != NULL) { /* Duplicate name space not allowed */ rw_wunlock(&tcp_function_lock); free(n, M_TCPFUNCTIONS); error = EALREADY; goto cleanup; } (void)strlcpy(n->tf_name, names[i], sizeof(n->tf_name)); TAILQ_INSERT_TAIL(&t_functions, n, tf_next); tcp_fb_cnt++; rw_wunlock(&tcp_function_lock); } return(0); cleanup: /* * Deregister the names we just added. Because registration failed * for names[i], we don't need to deregister that name. 
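 * (Editor's example: if the malloc for names[2] failed, then i == 2 and
 * only names[0] and names[1] were linked onto t_functions, so the loop
 * below walks i back down and removes exactly those two entries.)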
*/ *num_names = i; rw_wlock(&tcp_function_lock); while (--i >= 0) { TAILQ_FOREACH(n, &t_functions, tf_next) { if (!strncmp(n->tf_name, names[i], TCP_FUNCTION_NAME_LEN_MAX)) { TAILQ_REMOVE(&t_functions, n, tf_next); tcp_fb_cnt--; n->tf_fb = NULL; free(n, M_TCPFUNCTIONS); break; } } } rw_wunlock(&tcp_function_lock); return (error); } /* * Register a TCP function block using the name provided in the name * argument. * * Returns 0 on success, or an error code on failure. */ int register_tcp_functions_as_name(struct tcp_function_block *blk, const char *name, int wait) { const char *name_list[1]; int num_names, rv; num_names = 1; if (name != NULL) name_list[0] = name; else name_list[0] = blk->tfb_tcp_block_name; rv = register_tcp_functions_as_names(blk, wait, name_list, &num_names); return (rv); } /* * Register a TCP function block using the name defined in * blk->tfb_tcp_block_name. * * Returns 0 on success, or an error code on failure. */ int register_tcp_functions(struct tcp_function_block *blk, int wait) { return (register_tcp_functions_as_name(blk, NULL, wait)); } /* * Deregister all names associated with a function block. This * functionally removes the function block from use within the system. * * When called with a true quiesce argument, mark the function block * as being removed so no more stacks will use it and determine * whether the removal would succeed. * * When called with a false quiesce argument, actually attempt the * removal. * * When called with a force argument, attempt to switch all TCBs to * use the default stack instead of returning EBUSY. * * Returns 0 on success (or if the removal would succeed), or an error * code on failure. */ int deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce, bool force) { struct tcp_function *f; VNET_ITERATOR_DECL(vnet_iter); if (blk == &tcp_def_funcblk) { /* You can't un-register the default */ return (EPERM); } rw_wlock(&tcp_function_lock); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); if (blk == V_tcp_func_set_ptr) { /* You can't free the current default in some vnet. */ CURVNET_RESTORE(); VNET_LIST_RUNLOCK_NOSLEEP(); rw_wunlock(&tcp_function_lock); return (EBUSY); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); /* Mark the block so no more stacks can use it. */ blk->tfb_flags |= TCP_FUNC_BEING_REMOVED; /* * If TCBs are still attached to the stack, attempt to switch them * to the default stack. */ if (force && blk->tfb_refcnt) { struct inpcb *inp; struct tcpcb *tp; VNET_ITERATOR_DECL(vnet_iter); rw_wunlock(&tcp_function_lock); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, INPLOOKUP_WLOCKPCB); while ((inp = inp_next(&inpi)) != NULL) { tp = intotcpcb(inp); if (tp == NULL || tp->t_fb != blk) continue; tcp_switch_back_to_default(tp); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); rw_wlock(&tcp_function_lock); } if (blk->tfb_refcnt) { /* TCBs still attached. */ rw_wunlock(&tcp_function_lock); return (EBUSY); } if (quiesce) { /* Skip removal. */ rw_wunlock(&tcp_function_lock); return (0); } /* Remove any function names that map to this function block. 
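 * (Editor's sketch, hypothetical function block: a module unload path
 * would typically quiesce first and then remove,
 *
 *	error = deregister_tcp_functions(&my_funcblk, true, false);
 *	...
 *	error = deregister_tcp_functions(&my_funcblk, false, false);
 *
 * where either call may fail with EBUSY while TCBs still reference the
 * stack, unless force is used to switch them back to the default.)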
*/ while (find_tcp_fb_locked(blk, &f) != NULL) { TAILQ_REMOVE(&t_functions, f, tf_next); tcp_fb_cnt--; f->tf_fb = NULL; free(f, M_TCPFUNCTIONS); } rw_wunlock(&tcp_function_lock); return (0); } static void tcp_drain(void) { struct epoch_tracker et; VNET_ITERATOR_DECL(vnet_iter); if (!do_tcpdrain) return; NET_EPOCH_ENTER(et); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, INPLOOKUP_WLOCKPCB); struct inpcb *inpb; struct tcpcb *tcpb; /* * Walk the tcpbs, if existing, and flush the reassembly queue, * if there is one... * XXX: The "Net/3" implementation doesn't imply that the TCP * reassembly queue should be flushed, but in a situation * where we're really low on mbufs, this is potentially * useful. */ while ((inpb = inp_next(&inpi)) != NULL) { if ((tcpb = intotcpcb(inpb)) != NULL) { tcp_reass_flush(tcpb); tcp_clean_sackreport(tcpb); #ifdef TCP_BLACKBOX tcp_log_drain(tcpb); #endif #ifdef TCPPCAP if (tcp_pcap_aggressive_free) { /* Free the TCP PCAP queues. */ tcp_pcap_drain(&(tcpb->t_inpkts)); tcp_pcap_drain(&(tcpb->t_outpkts)); } #endif } } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); NET_EPOCH_EXIT(et); } static void tcp_vnet_init(void *arg __unused) { #ifdef TCP_HHOOK if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, &V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); #endif #ifdef STATS if (tcp_stats_init()) printf("%s: WARNING: unable to initialise TCP stats\n", __func__); #endif in_pcbinfo_init(&V_tcbinfo, &tcpcbstor, tcp_tcbhashsize, tcp_tcbhashsize); syncache_init(); tcp_hc_init(); TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack); V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); tcp_fastopen_init(); COUNTER_ARRAY_ALLOC(V_tcps_states, TCP_NSTATES, M_WAITOK); VNET_PCPUSTAT_ALLOC(tcpstat, M_WAITOK); V_tcp_msl = TCPTV_MSL; } VNET_SYSINIT(tcp_vnet_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, tcp_vnet_init, NULL); static void tcp_init(void *arg __unused) { int hashsize; tcp_reass_global_init(); /* XXX virtualize those below? */ tcp_delacktime = TCPTV_DELACK; tcp_keepinit = TCPTV_KEEP_INIT; tcp_keepidle = TCPTV_KEEP_IDLE; tcp_keepintvl = TCPTV_KEEPINTVL; tcp_maxpersistidle = TCPTV_KEEP_IDLE; tcp_rexmit_initial = TCPTV_RTOBASE; if (tcp_rexmit_initial < 1) tcp_rexmit_initial = 1; tcp_rexmit_min = TCPTV_MIN; if (tcp_rexmit_min < 1) tcp_rexmit_min = 1; tcp_persmin = TCPTV_PERSMIN; tcp_persmax = TCPTV_PERSMAX; tcp_rexmit_slop = TCPTV_CPU_VAR; tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; /* Setup the tcp function block list */ TAILQ_INIT(&t_functions); rw_init(&tcp_function_lock, "tcp_func_lock"); register_tcp_functions(&tcp_def_funcblk, M_WAITOK); sx_init(&tcpoudp_lock, "TCP over UDP configuration"); #ifdef TCP_BLACKBOX /* Initialize the TCP logging data. 
*/ tcp_log_init(); #endif arc4rand(&V_ts_offset_secret, sizeof(V_ts_offset_secret), 0); if (tcp_soreceive_stream) { #ifdef INET tcp_protosw.pr_soreceive = soreceive_stream; #endif #ifdef INET6 tcp6_protosw.pr_soreceive = soreceive_stream; #endif /* INET6 */ } #ifdef INET6 max_protohdr_grow(sizeof(struct ip6_hdr) + sizeof(struct tcphdr)); #else /* INET6 */ max_protohdr_grow(sizeof(struct tcpiphdr)); #endif /* INET6 */ ISN_LOCK_INIT(); EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, SHUTDOWN_PRI_DEFAULT); EVENTHANDLER_REGISTER(vm_lowmem, tcp_drain, NULL, LOWMEM_PRI_DEFAULT); EVENTHANDLER_REGISTER(mbuf_lowmem, tcp_drain, NULL, LOWMEM_PRI_DEFAULT); tcp_inp_lro_direct_queue = counter_u64_alloc(M_WAITOK); tcp_inp_lro_wokeup_queue = counter_u64_alloc(M_WAITOK); tcp_inp_lro_compressed = counter_u64_alloc(M_WAITOK); tcp_inp_lro_locks_taken = counter_u64_alloc(M_WAITOK); tcp_extra_mbuf = counter_u64_alloc(M_WAITOK); tcp_would_have_but = counter_u64_alloc(M_WAITOK); tcp_comp_total = counter_u64_alloc(M_WAITOK); tcp_uncomp_total = counter_u64_alloc(M_WAITOK); tcp_bad_csums = counter_u64_alloc(M_WAITOK); tcp_pacing_failures = counter_u64_alloc(M_WAITOK); tcp_dgp_failures = counter_u64_alloc(M_WAITOK); #ifdef TCPPCAP tcp_pcap_init(); #endif hashsize = tcp_tcbhashsize; if (hashsize == 0) { /* * Auto tune the hash size based on maxsockets. * A perfect hash would have a 1:1 mapping * (hashsize = maxsockets) however it's been * suggested that O(2) average is better. */ hashsize = maketcp_hashsize(maxsockets / 4); /* * Our historical default is 512, * do not autotune lower than this. */ if (hashsize < 512) hashsize = 512; if (bootverbose) printf("%s: %s auto tuned to %d\n", __func__, "net.inet.tcp.tcbhashsize", hashsize); } /* * We require a hashsize to be a power of two. * Previously if it was not a power of two we would just reset it * back to 512, which could be a nasty surprise if you did not notice * the error message. * Instead what we do is clip it to the closest power of two lower * than the specified hash value. */ if (!powerof2(hashsize)) { int oldhashsize = hashsize; hashsize = maketcp_hashsize(hashsize); /* prevent absurdly low value */ if (hashsize < 16) hashsize = 16; printf("%s: WARNING: TCB hash size not a power of 2, " "clipped from %d to %d.\n", __func__, oldhashsize, hashsize); } tcp_tcbhashsize = hashsize; #ifdef INET IPPROTO_REGISTER(IPPROTO_TCP, tcp_input, tcp_ctlinput); #endif #ifdef INET6 IP6PROTO_REGISTER(IPPROTO_TCP, tcp6_input, tcp6_ctlinput); #endif } SYSINIT(tcp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, tcp_init, NULL); #ifdef VIMAGE static void tcp_destroy(void *unused __unused) { int n; #ifdef TCP_HHOOK int error; #endif /* * All our processes are gone, all our sockets should be cleaned * up, which means, we should be past the tcp_discardcb() calls. * Sleep to let all tcpcb timers really disappear and cleanup. */ for (;;) { INP_INFO_WLOCK(&V_tcbinfo); n = V_tcbinfo.ipi_count; INP_INFO_WUNLOCK(&V_tcbinfo); if (n == 0) break; pause("tcpdes", hz / 10); } tcp_hc_destroy(); syncache_destroy(); in_pcbinfo_destroy(&V_tcbinfo); /* tcp_discardcb() clears the sack_holes up. */ uma_zdestroy(V_sack_hole_zone); /* * Cannot free the zone until all tcpcbs are released as we attach * the allocations to them. 
*/ tcp_fastopen_destroy(); COUNTER_ARRAY_FREE(V_tcps_states, TCP_NSTATES); VNET_PCPUSTAT_FREE(tcpstat); #ifdef TCP_HHOOK error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_IN]); if (error != 0) { printf("%s: WARNING: unable to deregister helper hook " "type=%d, id=%d: error %d returned\n", __func__, HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, error); } error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_OUT]); if (error != 0) { printf("%s: WARNING: unable to deregister helper hook " "type=%d, id=%d: error %d returned\n", __func__, HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, error); } #endif } VNET_SYSUNINIT(tcp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, tcp_destroy, NULL); #endif void tcp_fini(void *xtp) { } /* * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. * tcp_template used to store this data in mbufs, but we now recopy it out * of the tcpcb each time to conserve mbufs. */ void tcpip_fillheaders(struct inpcb *inp, uint16_t port, void *ip_ptr, void *tcp_ptr) { struct tcphdr *th = (struct tcphdr *)tcp_ptr; INP_WLOCK_ASSERT(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)ip_ptr; ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (inp->inp_flow & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); if (port == 0) ip6->ip6_nxt = IPPROTO_TCP; else ip6->ip6_nxt = IPPROTO_UDP; ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET { struct ip *ip; ip = (struct ip *)ip_ptr; ip->ip_v = IPVERSION; ip->ip_hl = 5; ip->ip_tos = inp->inp_ip_tos; ip->ip_len = 0; ip->ip_id = 0; ip->ip_off = 0; ip->ip_ttl = inp->inp_ip_ttl; ip->ip_sum = 0; if (port == 0) ip->ip_p = IPPROTO_TCP; else ip->ip_p = IPPROTO_UDP; ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; } #endif /* INET */ th->th_sport = inp->inp_lport; th->th_dport = inp->inp_fport; th->th_seq = 0; th->th_ack = 0; th->th_off = 5; tcp_set_flags(th, 0); th->th_win = 0; th->th_urp = 0; th->th_sum = 0; /* in_pseudo() is called later for ipv4 */ } /* * Create template to be used to send tcp packets on a connection. * Allocates an mbuf and fills in a skeletal tcp/ip header. The only * use for this function is in keepalives, which use tcp_respond. */ struct tcptemp * tcpip_maketemplate(struct inpcb *inp) { struct tcptemp *t; t = malloc(sizeof(*t), M_TEMP, M_NOWAIT); if (t == NULL) return (NULL); tcpip_fillheaders(inp, 0, (void *)&t->tt_ipgen, (void *)&t->tt_t); return (t); } /* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == NULL, then we make a copy * of the tcpiphdr at th and send directly to the addressed host. * This is used to force keep alive messages out using the TCP * template for a connection. If flags are given then we send * a message back to the TCP which originated the segment th, * and discard the mbuf containing it and any other attached mbufs. * * In any case the ack and sequence number of the transmitted * segment are as specified by the parameters. * * NOTE: If m != NULL, then th must point to *inside* the mbuf. 
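 *
 * (Editor's sketch of the keepalive-style call, mirroring the TT_KEEP
 * timer handler and assuming a template t from tcpip_maketemplate():
 *
 *	t = tcpip_maketemplate(inp);
 *	if (t != NULL) {
 *		tcp_respond(tp, t->tt_ipgen, &t->tt_t, NULL,
 *		    tp->rcv_nxt, tp->snd_una - 1, 0);
 *		free(t, M_TEMP);
 *	}
 * )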
*/ void tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, tcp_seq ack, tcp_seq seq, uint16_t flags) { struct tcpopt to; struct inpcb *inp; struct ip *ip; struct mbuf *optm; struct udphdr *uh = NULL; struct tcphdr *nth; struct tcp_log_buffer *lgb; u_char *optp; #ifdef INET6 struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ int optlen, tlen, win, ulen; int ect = 0; bool incl_opts; uint16_t port; int output_ret; #ifdef INVARIANTS int thflags = tcp_get_flags(th); #endif KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); NET_EPOCH_ASSERT(); #ifdef INET6 isipv6 = ((struct ip *)ipgen)->ip_v == (IPV6_VERSION >> 4); ip6 = ipgen; #endif /* INET6 */ ip = ipgen; if (tp != NULL) { inp = tptoinpcb(tp); INP_LOCK_ASSERT(inp); } else inp = NULL; if (m != NULL) { #ifdef INET6 if (isipv6 && ip6 && (ip6->ip6_nxt == IPPROTO_UDP)) port = m->m_pkthdr.tcp_tun_port; else #endif if (ip && (ip->ip_p == IPPROTO_UDP)) port = m->m_pkthdr.tcp_tun_port; else port = 0; } else port = tp->t_port; incl_opts = false; win = 0; if (tp != NULL) { if (!(flags & TH_RST)) { win = sbspace(&inp->inp_socket->so_rcv); if (win > TCP_MAXWIN << tp->rcv_scale) win = TCP_MAXWIN << tp->rcv_scale; } if ((tp->t_flags & TF_NOOPT) == 0) incl_opts = true; } if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return; m->m_data += max_linkhdr; #ifdef INET6 if (isipv6) { bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(m, struct ip6_hdr *); nth = (struct tcphdr *)(ip6 + 1); if (port) { /* Insert a UDP header */ uh = (struct udphdr *)nth; uh->uh_sport = htons(V_tcp_udp_tunneling_port); uh->uh_dport = port; nth = (struct tcphdr *)(uh + 1); } } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); ip = mtod(m, struct ip *); nth = (struct tcphdr *)(ip + 1); if (port) { /* Insert a UDP header */ uh = (struct udphdr *)nth; uh->uh_sport = htons(V_tcp_udp_tunneling_port); uh->uh_dport = port; nth = (struct tcphdr *)(uh + 1); } } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); flags = TH_ACK; } else if ((!M_WRITABLE(m)) || (port != 0)) { struct mbuf *n; /* Can't reuse 'm', allocate a new mbuf. */ n = m_gethdr(M_NOWAIT, MT_DATA); if (n == NULL) { m_freem(m); return; } if (!m_dup_pkthdr(n, m, M_NOWAIT)) { m_freem(m); m_freem(n); return; } n->m_data += max_linkhdr; /* m_len is set later */ #define xchg(a,b,type) { type t; t=a; a=b; b=t; } #ifdef INET6 if (isipv6) { bcopy((caddr_t)ip6, mtod(n, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(n, struct ip6_hdr *); xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); if (port) { /* Insert a UDP header */ uh = (struct udphdr *)nth; uh->uh_sport = htons(V_tcp_udp_tunneling_port); uh->uh_dport = port; nth = (struct tcphdr *)(uh + 1); } } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(n, caddr_t), sizeof(struct ip)); ip = mtod(n, struct ip *); xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); nth = (struct tcphdr *)(ip + 1); if (port) { /* Insert a UDP header */ uh = (struct udphdr *)nth; uh->uh_sport = htons(V_tcp_udp_tunneling_port); uh->uh_dport = port; nth = (struct tcphdr *)(uh + 1); } } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); xchg(nth->th_dport, nth->th_sport, uint16_t); th = nth; m_freem(m); m = n; } else { /* * reuse the mbuf. * XXX MRT We inherit the FIB, which is lucky. 
*/ m_freem(m->m_next); m->m_next = NULL; m->m_data = (caddr_t)ipgen; /* clear any receive flags for proper bpf timestamping */ m->m_flags &= ~(M_TSTMP | M_TSTMP_LRO); /* m_len is set later */ #ifdef INET6 if (isipv6) { xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); nth = (struct tcphdr *)(ip + 1); } if (th != nth) { /* * this is usually a case when an extension header * exists between the IPv6 header and the * TCP header. */ nth->th_sport = th->th_sport; nth->th_dport = th->th_dport; } xchg(nth->th_dport, nth->th_sport, uint16_t); #undef xchg } tlen = 0; #ifdef INET6 if (isipv6) tlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET tlen = sizeof (struct tcpiphdr); #endif if (port) tlen += sizeof (struct udphdr); #ifdef INVARIANTS m->m_len = 0; KASSERT(M_TRAILINGSPACE(m) >= tlen, ("Not enough trailing space for message (m=%p, need=%d, have=%ld)", m, tlen, (long)M_TRAILINGSPACE(m))); #endif m->m_len = tlen; to.to_flags = 0; if (incl_opts) { ect = tcp_ecn_output_established(tp, &flags, 0, false); /* Make sure we have room. */ if (M_TRAILINGSPACE(m) < TCP_MAXOLEN) { m->m_next = m_get(M_NOWAIT, MT_DATA); if (m->m_next) { optp = mtod(m->m_next, u_char *); optm = m->m_next; } else incl_opts = false; } else { optp = (u_char *) (nth + 1); optm = m; } } if (incl_opts) { /* Timestamps. */ if (tp->t_flags & TF_RCVD_TSTMP) { to.to_tsval = tcp_ts_getticks() + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* TCP-MD5 (RFC2385). */ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif /* Add the options. */ tlen += optlen = tcp_addoptions(&to, optp); /* Update m_len in the correct mbuf. */ optm->m_len += optlen; } else optlen = 0; #ifdef INET6 if (isipv6) { if (uh) { ulen = tlen - sizeof(struct ip6_hdr); uh->uh_ulen = htons(ulen); } ip6->ip6_flow = htonl(ect << IPV6_FLOWLABEL_LEN); ip6->ip6_vfc = IPV6_VERSION; if (port) ip6->ip6_nxt = IPPROTO_UDP; else ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons(tlen - sizeof(*ip6)); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { if (uh) { ulen = tlen - sizeof(struct ip); uh->uh_ulen = htons(ulen); } ip->ip_len = htons(tlen); if (inp != NULL) { ip->ip_tos = inp->inp_ip_tos & ~IPTOS_ECN_MASK; ip->ip_ttl = inp->inp_ip_ttl; } else { ip->ip_tos = 0; ip->ip_ttl = V_ip_defttl; } ip->ip_tos |= ect; if (port) { ip->ip_p = IPPROTO_UDP; } else { ip->ip_p = IPPROTO_TCP; } if (V_path_mtu_discovery) ip->ip_off |= htons(IP_DF); } #endif m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef MAC if (inp != NULL) { /* * Packet is associated with a socket, so allow the * label of the response to reflect the socket label. */ INP_LOCK_ASSERT(inp); mac_inpcb_create_mbuf(inp, m); } else { /* * Packet is not associated with a socket, so possibly * update the label in place. 
*/ mac_netinet_tcp_reply(m); } #endif nth->th_seq = htonl(seq); nth->th_ack = htonl(ack); nth->th_off = (sizeof (struct tcphdr) + optlen) >> 2; tcp_set_flags(nth, flags); if (tp && (flags & TH_RST)) { /* Log the reset */ tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); } if (tp != NULL) nth->th_win = htons((u_short) (win >> tp->rcv_scale)); else nth->th_win = htons((u_short)win); nth->th_urp = 0; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (to.to_flags & TOF_SIGNATURE) { if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, nth, to.to_signature) != 0) { m_freem(m); return; } } #endif #ifdef INET6 if (isipv6) { if (port) { m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); uh->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); nth->th_sum = 0; } else { m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); nth->th_sum = in6_cksum_pseudo(ip6, tlen - sizeof(struct ip6_hdr), IPPROTO_TCP, 0); } ip6->ip6_hlim = in6_selecthlim(inp, NULL); } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET { if (port) { uh->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); nth->th_sum = 0; } else { m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); } } #endif /* INET */ TCP_PROBE3(debug__output, tp, th, m); if (flags & TH_RST) TCP_PROBE5(accept__refused, NULL, NULL, m, tp, nth); lgb = NULL; if ((tp != NULL) && tcp_bblogging_on(tp)) { if (INP_WLOCKED(inp)) { union tcp_log_stackspecific log; struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = tcp_in_hpts(tp); log.u_bbr.flex8 = 4; log.u_bbr.pkts_out = tp->t_maxseg; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.delivered = 0; lgb = tcp_log_event(tp, nth, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 0, &log, false, NULL, NULL, 0, &tv); } else { /* * We can not log the packet, since we only own the * read lock, but a write lock is needed. The read lock * is not upgraded to a write lock, since only getting * the read lock was done intentionally to improve the * handling of SYN flooding attacks. * This happens only for pure SYN segments received in * the initial CLOSED state, or received in a more * advanced state than listen and the UDP encapsulation * port is unexpected. * The incoming SYN segments do not really belong to * the TCP connection and the handling does not change * the state of the TCP connection. Therefore, the * sending of the RST segments is not logged. Please * note that also the incoming SYN segments are not * logged. * * The following code ensures that the above description * is and stays correct. */ KASSERT((thflags & (TH_ACK|TH_SYN)) == TH_SYN && (tp->t_state == TCPS_CLOSED || (tp->t_state > TCPS_LISTEN && tp->t_port != port)), ("%s: Logging of TCP segment with flags 0x%b and " "UDP encapsulation port %u skipped in state %s", __func__, thflags, PRINT_TH_FLAGS, ntohs(port), tcpstates[tp->t_state])); } } if (flags & TH_ACK) TCPSTAT_INC(tcps_sndacks); else if (flags & (TH_SYN|TH_FIN|TH_RST)) TCPSTAT_INC(tcps_sndctrl); TCPSTAT_INC(tcps_sndtotal); #ifdef INET6 if (isipv6) { TCP_PROBE5(send, NULL, tp, ip6, tp, nth); output_ret = ip6_output(m, inp ? 
inp->in6p_outputopts : NULL, NULL, 0, NULL, NULL, inp); } #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET { TCP_PROBE5(send, NULL, tp, ip, tp, nth); output_ret = ip_output(m, NULL, NULL, 0, NULL, inp); } #endif if (lgb != NULL) lgb->tlb_errno = output_ret; } /* * Create a new TCP control block, making an empty reassembly queue and hooking * it to the argument protocol control block. The `inp' parameter must have * come from the zone allocator set up by tcpcbstor declaration. */ struct tcpcb * tcp_newtcpcb(struct inpcb *inp) { struct tcpcb *tp = intotcpcb(inp); #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ /* * Historically allocation was done with M_ZERO. There is a lot of * code that rely on that. For now take safe approach and zero whole * tcpcb. This definitely can be optimized. */ bzero(&tp->t_start_zero, t_zero_size); /* Initialise cc_var struct for this tcpcb. */ tp->t_ccv.type = IPPROTO_TCP; tp->t_ccv.ccvc.tcp = tp; rw_rlock(&tcp_function_lock); tp->t_fb = V_tcp_func_set_ptr; refcount_acquire(&tp->t_fb->tfb_refcnt); rw_runlock(&tcp_function_lock); /* * Use the current system default CC algorithm. */ cc_attach(tp, CC_DEFAULT_ALGO()); if (CC_ALGO(tp)->cb_init != NULL) if (CC_ALGO(tp)->cb_init(&tp->t_ccv, NULL) > 0) { cc_detach(tp); if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); refcount_release(&tp->t_fb->tfb_refcnt); return (NULL); } #ifdef TCP_HHOOK if (khelp_init_osd(HELPER_CLASS_TCP, &tp->t_osd)) { if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); refcount_release(&tp->t_fb->tfb_refcnt); return (NULL); } #endif TAILQ_INIT(&tp->t_segq); STAILQ_INIT(&tp->t_inqueue); tp->t_maxseg = #ifdef INET6 isipv6 ? V_tcp_v6mssdflt : #endif /* INET6 */ V_tcp_mssdflt; /* All mbuf queue/ack compress flags should be off */ tcp_lro_features_off(tp); tp->t_hpts_cpu = HPTS_CPU_NONE; tp->t_lro_cpu = HPTS_CPU_NONE; callout_init_rw(&tp->t_callout, &inp->inp_lock, CALLOUT_RETURNUNLOCKED); for (int i = 0; i < TT_N; i++) tp->t_timers[i] = SBT_MAX; switch (V_tcp_do_rfc1323) { case 0: break; default: case 1: tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); break; case 2: tp->t_flags = TF_REQ_SCALE; break; case 3: tp->t_flags = TF_REQ_TSTMP; break; } if (V_tcp_do_sack) tp->t_flags |= TF_SACK_PERMIT; TAILQ_INIT(&tp->snd_holes); /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives * reasonable initial retransmit time. */ tp->t_srtt = TCPTV_SRTTBASE; tp->t_rttvar = ((tcp_rexmit_initial - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; tp->t_rttmin = tcp_rexmit_min; tp->t_rxtcur = tcp_rexmit_initial; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; /* We always start with ticks granularity */ tp->t_tmr_granularity = TCP_TMR_GRANULARITY_TICKS; /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = V_ip_defttl; #ifdef TCPPCAP /* * Init the TCP PCAP queues. */ tcp_pcap_tcpcb_init(tp); #endif #ifdef TCP_BLACKBOX /* Initialize the per-TCPCB log data. 
*/ tcp_log_tcpcbinit(tp); #endif tp->t_pacing_rate = -1; if (tp->t_fb->tfb_tcp_fb_init) { if ((*tp->t_fb->tfb_tcp_fb_init)(tp, &tp->t_fb_ptr)) { refcount_release(&tp->t_fb->tfb_refcnt); return (NULL); } } #ifdef STATS if (V_tcp_perconn_stats_enable == 1) tp->t_stats = stats_blob_alloc(V_tcp_perconn_stats_dflt_tpl, 0); #endif if (V_tcp_do_lrd) tp->t_flags |= TF_LRD; return (tp); } /* * Drop a TCP connection, reporting * the specified error. If connection is synchronized, * then send a RST to peer. */ struct tcpcb * tcp_drop(struct tcpcb *tp, int errno) { struct socket *so = tptosocket(tp); NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(tptoinpcb(tp)); if (TCPS_HAVERCVDSYN(tp->t_state)) { tcp_state_change(tp, TCPS_CLOSED); /* Don't use tcp_output() here due to possible recursion. */ (void)tcp_output_nodrop(tp); TCPSTAT_INC(tcps_drops); } else TCPSTAT_INC(tcps_conndrops); if (errno == ETIMEDOUT && tp->t_softerror) errno = tp->t_softerror; so->so_error = errno; return (tcp_close(tp)); } void tcp_discardcb(struct tcpcb *tp) { struct inpcb *inp = tptoinpcb(tp); struct socket *so = tptosocket(tp); struct mbuf *m; #ifdef INET6 bool isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif INP_WLOCK_ASSERT(inp); MPASS(!callout_active(&tp->t_callout)); MPASS(TAILQ_EMPTY(&tp->snd_holes)); /* free the reassembly queue, if any */ tcp_reass_flush(tp); #ifdef TCP_OFFLOAD /* Disconnect offload device, if any. */ if (tp->t_flags & TF_TOE) tcp_offload_detach(tp); #endif #ifdef TCPPCAP /* Free the TCP PCAP queues. */ tcp_pcap_drain(&(tp->t_inpkts)); tcp_pcap_drain(&(tp->t_outpkts)); #endif /* Allow the CC algorithm to clean up after itself. */ if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(&tp->t_ccv); CC_DATA(tp) = NULL; /* Detach from the CC algorithm */ cc_detach(tp); #ifdef TCP_HHOOK khelp_destroy_osd(&tp->t_osd); #endif #ifdef STATS stats_blob_destroy(tp->t_stats); #endif CC_ALGO(tp) = NULL; if ((m = STAILQ_FIRST(&tp->t_inqueue)) != NULL) { struct mbuf *prev; STAILQ_INIT(&tp->t_inqueue); STAILQ_FOREACH_FROM_SAFE(m, &tp->t_inqueue, m_stailqpkt, prev) m_freem(m); } TCPSTATES_DEC(tp->t_state); if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); MPASS(!tcp_in_hpts(tp)); #ifdef TCP_BLACKBOX tcp_log_tcpcbfini(tp); #endif /* * If we got enough samples through the srtt filter, * save the rtt and rttvar in the routing entry. * 'Enough' is arbitrarily defined as 4 rtt samples. * 4 samples is enough for the srtt filter to converge * to within enough % of the correct value; fewer samples * and we could save a bogus rtt. The danger is not high * as tcp quickly recovers from everything. * XXX: Works very well but needs some more statistics! * * XXXRRS: Updating must be after the stack fini() since * that may be converting some internal representation of * say srtt etc into the general one used by other stacks. * Lets also at least protect against the so being NULL * as RW stated below. */ if ((tp->t_rttupdated >= 4) && (so != NULL)) { struct hc_metrics_lite metrics; uint32_t ssthresh; bzero(&metrics, sizeof(metrics)); /* * Update the ssthresh always when the conditions below * are satisfied. This gives us better new start value * for the congestion avoidance for new connections. * ssthresh is only set if packet loss occurred on a session. * * XXXRW: 'so' may be NULL here, and/or socket buffer may be * being torn down. Ideally this code would not use 'so'. 
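 *
 * (Editor's worked example of the conversion below, assuming IPv4 and
 * t_maxseg = 1460: a byte ssthresh of 14600 rounds to 10 segments,
 * which is stored as 10 * (1460 + 40) = 15000 bytes, i.e. user data
 * plus IP and TCP headers.)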
*/ ssthresh = tp->snd_ssthresh; if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) { /* * convert the limit from user data bytes to * packets then to packet data bytes. */ ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg; if (ssthresh < 2) ssthresh = 2; ssthresh *= (tp->t_maxseg + #ifdef INET6 (isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : #endif sizeof (struct tcpiphdr) #ifdef INET6 ) #endif ); } else ssthresh = 0; metrics.rmx_ssthresh = ssthresh; metrics.rmx_rtt = tp->t_srtt; metrics.rmx_rttvar = tp->t_rttvar; metrics.rmx_cwnd = tp->snd_cwnd; metrics.rmx_sendpipe = 0; metrics.rmx_recvpipe = 0; tcp_hc_update(&inp->inp_inc, &metrics); } refcount_release(&tp->t_fb->tfb_refcnt); } /* * Attempt to close a TCP control block, marking it as dropped, and freeing * the socket if we hold the only reference. */ struct tcpcb * tcp_close(struct tcpcb *tp) { struct inpcb *inp = tptoinpcb(tp); struct socket *so = tptosocket(tp); INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD if (tp->t_state == TCPS_LISTEN) tcp_offload_listen_stop(tp); #endif /* * This releases the TFO pending counter resource for TFO listen * sockets as well as passively-created TFO sockets that transition * from SYN_RECEIVED to CLOSED. */ if (tp->t_tfo_pending) { tcp_fastopen_decrement_counter(tp->t_tfo_pending); tp->t_tfo_pending = NULL; } tcp_timer_stop(tp); if (tp->t_fb->tfb_tcp_timer_stop_all != NULL) tp->t_fb->tfb_tcp_timer_stop_all(tp); in_pcbdrop(inp); TCPSTAT_INC(tcps_closed); if (tp->t_state != TCPS_CLOSED) tcp_state_change(tp, TCPS_CLOSED); KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); tcp_free_sackholes(tp); soisdisconnected(so); if (inp->inp_flags & INP_SOCKREF) { inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); sorele(so); return (NULL); } return (tp); } /* * Notify a tcp user of an asynchronous error; * store error as soft error, but wake up user * (for now, won't do anything until can select for soft error). * * Do not wake up user since there currently is no mechanism for * reporting soft errors (yet - a kqueue filter may be added). */ static struct inpcb * tcp_notify(struct inpcb *inp, int error) { struct tcpcb *tp; INP_WLOCK_ASSERT(inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_notify: tp == NULL")); /* * Ignore some errors if we are hooked up. * If connection hasn't completed, has retransmitted several times, * and receives a second error, give up now. This is better * than waiting a long time to establish a connection that * can never complete. 
*/ if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { if (inp->inp_route.ro_nh) { NH_FREE(inp->inp_route.ro_nh); inp->inp_route.ro_nh = (struct nhop_object *)NULL; } return (inp); } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && tp->t_softerror) { tp = tcp_drop(tp, error); if (tp != NULL) return (inp); else return (NULL); } else { tp->t_softerror = error; return (inp); } #if 0 wakeup( &so->so_timeo); sorwakeup(so); sowwakeup(so); #endif } static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, INPLOOKUP_RLOCKPCB); struct xinpgen xig; struct inpcb *inp; int error; if (req->newptr != NULL) return (EPERM); if (req->oldptr == NULL) { int n; n = V_tcbinfo.ipi_count + counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb); return (0); } if ((error = sysctl_wire_old_buffer(req, 0)) != 0) return (error); bzero(&xig, sizeof(xig)); xig.xig_len = sizeof xig; xig.xig_count = V_tcbinfo.ipi_count + counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); xig.xig_gen = V_tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); error = syncache_pcblist(req); if (error) return (error); while ((inp = inp_next(&inpi)) != NULL) { if (inp->inp_gencnt <= xig.xig_gen && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { struct xtcpcb xt; tcp_inptoxtp(inp, &xt); error = SYSCTL_OUT(req, &xt, sizeof xt); if (error) { INP_RUNLOCK(inp); break; } else continue; } } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. 
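 *
 * (Editor's sketch of the consumer-side check, hypothetical variable
 * names: after reading the trailing xinpgen, restart the walk if the
 * generation moved:
 *
 *	if (xig_end.xig_gen != xig_start.xig_gen)
 *		goto again;
 * )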
*/ xig.xig_gen = V_tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_tcbinfo.ipi_count + counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); error = SYSCTL_OUT(req, &xig, sizeof xig); } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_NEEDGIANT, NULL, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); #ifdef INET static int tcp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct epoch_tracker et; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); NET_EPOCH_ENTER(et); inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL); NET_EPOCH_EXIT(et); if (inp != NULL) { if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_NEEDGIANT, 0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); #endif /* INET */ #ifdef INET6 static int tcp6_getcred(SYSCTL_HANDLER_ARGS) { struct epoch_tracker et; struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; int error; #ifdef INET int mapped = 0; #endif error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 || (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { return (error); } if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) mapped = 1; else #endif return (EINVAL); } NET_EPOCH_ENTER(et); #ifdef INET if (mapped == 1) inp = in_pcblookup(&V_tcbinfo, *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); else #endif inp = in6_pcblookup(&V_tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); NET_EPOCH_EXIT(et); if (inp != NULL) { if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_NEEDGIANT, 0, 0, tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); #endif /* INET6 */ #ifdef INET /* Path MTU to try next when a fragmentation-needed message is received. */ static inline int tcp_next_pmtu(const struct icmp *icp, const struct ip *ip) { int mtu = ntohs(icp->icmp_nextmtu); /* If no alternative MTU was proposed, try the next smaller one. 
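 * (Editor's note: ip_next_mtu() steps down the RFC 1191 plateau table,
 * so e.g. a 1500-byte datagram with no proposed MTU is retried at 1492;
 * the clamp below keeps the derived MSS from falling under
 * V_tcp_minmss.)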
	 */
	if (!mtu)
		mtu = ip_next_mtu(ntohs(ip->ip_len), 1);
	if (mtu < V_tcp_minmss + sizeof(struct tcpiphdr))
		mtu = V_tcp_minmss + sizeof(struct tcpiphdr);

	return (mtu);
}

static void
tcp_ctlinput_with_port(struct icmp *icp, uint16_t port)
{
	struct ip *ip;
	struct tcphdr *th;
	struct inpcb *inp;
	struct tcpcb *tp;
	struct inpcb *(*notify)(struct inpcb *, int);
	struct in_conninfo inc;
	tcp_seq icmp_tcp_seq;
	int errno, mtu;

	errno = icmp_errmap(icp);
	switch (errno) {
	case 0:
		return;
	case EMSGSIZE:
		notify = tcp_mtudisc_notify;
		break;
	case ECONNREFUSED:
		if (V_icmp_may_rst)
			notify = tcp_drop_syn_sent;
		else
			notify = tcp_notify;
		break;
	case EHOSTUNREACH:
		if (V_icmp_may_rst && icp->icmp_type == ICMP_TIMXCEED)
			notify = tcp_drop_syn_sent;
		else
			notify = tcp_notify;
		break;
	default:
		notify = tcp_notify;
	}

	ip = &icp->icmp_ip;
	th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
	icmp_tcp_seq = th->th_seq;
	inp = in_pcblookup(&V_tcbinfo, ip->ip_dst, th->th_dport, ip->ip_src,
	    th->th_sport, INPLOOKUP_WLOCKPCB, NULL);
	if (inp != NULL) {
		tp = intotcpcb(inp);
#ifdef TCP_OFFLOAD
		if (tp->t_flags & TF_TOE && errno == EMSGSIZE) {
			/*
			 * MTU discovery for offloaded connections. Let
			 * the TOE driver verify seq# and process it.
			 */
			mtu = tcp_next_pmtu(icp, ip);
			tcp_offload_pmtu_update(tp, icmp_tcp_seq, mtu);
			goto out;
		}
#endif
		if (tp->t_port != port)
			goto out;
		if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) &&
		    SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) {
			if (errno == EMSGSIZE) {
				/*
				 * MTU discovery: we got a needfrag and
				 * will potentially try a lower MTU.
				 */
				mtu = tcp_next_pmtu(icp, ip);

				/*
				 * Only process the offered MTU if it
				 * is smaller than the current one.
				 */
				if (mtu < tp->t_maxseg +
				    sizeof(struct tcpiphdr)) {
					bzero(&inc, sizeof(inc));
					inc.inc_faddr = ip->ip_dst;
					inc.inc_fibnum =
					    inp->inp_inc.inc_fibnum;
					tcp_hc_updatemtu(&inc, mtu);
					inp = tcp_mtudisc(inp, mtu);
				}
			} else
				inp = (*notify)(inp, errno);
		}
	} else {
		bzero(&inc, sizeof(inc));
		inc.inc_fport = th->th_dport;
		inc.inc_lport = th->th_sport;
		inc.inc_faddr = ip->ip_dst;
		inc.inc_laddr = ip->ip_src;
		syncache_unreach(&inc, icmp_tcp_seq, port);
	}
out:
	if (inp != NULL)
		INP_WUNLOCK(inp);
}

static void
tcp_ctlinput(struct icmp *icmp)
{
	tcp_ctlinput_with_port(icmp, htons(0));
}

static void
tcp_ctlinput_viaudp(udp_tun_icmp_param_t param)
{
	/* It's a tunneled TCP over UDP icmp */
	struct icmp *icmp = param.icmp;
	struct ip *outer_ip, *inner_ip;
	struct udphdr *udp;
	struct tcphdr *th, ttemp;
	int i_hlen, o_len;
	uint16_t port;

	outer_ip = (struct ip *)((caddr_t)icmp - sizeof(struct ip));
	inner_ip = &icmp->icmp_ip;
	i_hlen = inner_ip->ip_hl << 2;
	o_len = ntohs(outer_ip->ip_len);
	if (o_len < (sizeof(struct ip) + 8 + i_hlen +
	    sizeof(struct udphdr) + offsetof(struct tcphdr, th_ack))) {
		/* Not enough data present */
		return;
	}
	/*
	 * OK, let's strip out the inner UDP header by copying the TCP
	 * header up on top of it.
	 */
	udp = (struct udphdr *)(((caddr_t)inner_ip) + i_hlen);
	if (ntohs(udp->uh_sport) != V_tcp_udp_tunneling_port) {
		return;
	}
	port = udp->uh_dport;
	th = (struct tcphdr *)(udp + 1);
	memcpy(&ttemp, th, sizeof(struct tcphdr));
	memcpy(udp, &ttemp, sizeof(struct tcphdr));
	/* Now adjust down the size of the outer IP header */
	o_len -= sizeof(struct udphdr);
	outer_ip->ip_len = htons(o_len);
	/* Now call in to the normal handling code */
	tcp_ctlinput_with_port(icmp, port);
}
#endif /* INET */

#ifdef INET6
static inline int
tcp6_next_pmtu(const struct icmp6_hdr *icmp6)
{
	int mtu = ntohl(icmp6->icmp6_mtu);

	/*
	 * If no alternative MTU was proposed, or the proposed MTU was too
	 * small, set to the min.
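 * (Editor's note: IPV6_MMTU is 1280, so e.g. a packet-too-big message
 * carrying icmp6_mtu = 0 yields 1272 here; the 8-byte slack is the
 * adjustment questioned in the XXX below.)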
*/ if (mtu < IPV6_MMTU) mtu = IPV6_MMTU - 8; /* XXXNP: what is the adjustment for? */ return (mtu); } static void tcp6_ctlinput_with_port(struct ip6ctlparam *ip6cp, uint16_t port) { struct in6_addr *dst; struct inpcb *(*notify)(struct inpcb *, int); struct ip6_hdr *ip6; struct mbuf *m; struct inpcb *inp; struct tcpcb *tp; struct icmp6_hdr *icmp6; struct in_conninfo inc; struct tcp_ports { uint16_t th_sport; uint16_t th_dport; } t_ports; tcp_seq icmp_tcp_seq; unsigned int mtu; unsigned int off; int errno; icmp6 = ip6cp->ip6c_icmp6; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; dst = &ip6cp->ip6c_finaldst->sin6_addr; errno = icmp6_errmap(icmp6); switch (errno) { case 0: return; case EMSGSIZE: notify = tcp_mtudisc_notify; break; case ECONNREFUSED: if (V_icmp_may_rst) notify = tcp_drop_syn_sent; else notify = tcp_notify; break; case EHOSTUNREACH: /* * There are only four ICMPs that may reset connection: * - administratively prohibited * - port unreachable * - time exceeded in transit * - unknown next header */ if (V_icmp_may_rst && ((icmp6->icmp6_type == ICMP6_DST_UNREACH && (icmp6->icmp6_code == ICMP6_DST_UNREACH_ADMIN || icmp6->icmp6_code == ICMP6_DST_UNREACH_NOPORT)) || (icmp6->icmp6_type == ICMP6_TIME_EXCEEDED && icmp6->icmp6_code == ICMP6_TIME_EXCEED_TRANSIT) || (icmp6->icmp6_type == ICMP6_PARAM_PROB && icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER))) notify = tcp_drop_syn_sent; else notify = tcp_notify; break; default: notify = tcp_notify; } /* Check if we can safely get the ports from the tcp hdr */ if (m == NULL || (m->m_pkthdr.len < (int32_t) (off + sizeof(struct tcp_ports)))) { return; } bzero(&t_ports, sizeof(struct tcp_ports)); m_copydata(m, off, sizeof(struct tcp_ports), (caddr_t)&t_ports); inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_dst, t_ports.th_dport, &ip6->ip6_src, t_ports.th_sport, INPLOOKUP_WLOCKPCB, NULL); off += sizeof(struct tcp_ports); if (m->m_pkthdr.len < (int32_t) (off + sizeof(tcp_seq))) { goto out; } m_copydata(m, off, sizeof(tcp_seq), (caddr_t)&icmp_tcp_seq); if (inp != NULL) { tp = intotcpcb(inp); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE && errno == EMSGSIZE) { /* MTU discovery for offloaded connections. */ mtu = tcp6_next_pmtu(icmp6); tcp_offload_pmtu_update(tp, icmp_tcp_seq, mtu); goto out; } #endif if (tp->t_port != port) goto out; if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) && SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) { if (errno == EMSGSIZE) { /* * MTU discovery: * If we got a needfrag set the MTU * in the route to the suggested new * value (if given) and then notify. */ mtu = tcp6_next_pmtu(icmp6); bzero(&inc, sizeof(inc)); inc.inc_fibnum = M_GETFIB(m); inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = *dst; if (in6_setscope(&inc.inc6_faddr, m->m_pkthdr.rcvif, NULL)) goto out; /* * Only process the offered MTU if it * is smaller than the current one. 
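 * (Editor's worked example: with t_maxseg = 1440 the current packet is
 * 1440 + 20 + 40 = 1500 bytes on the wire, so an offered MTU of 1400
 * is accepted and recorded in the hostcache, while an offer of 1500 or
 * more is ignored.)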
*/ if (mtu < tp->t_maxseg + sizeof (struct tcphdr) + sizeof (struct ip6_hdr)) { tcp_hc_updatemtu(&inc, mtu); tcp_mtudisc(inp, mtu); ICMP6STAT_INC(icp6s_pmtuchg); } } else inp = (*notify)(inp, errno); } } else { bzero(&inc, sizeof(inc)); inc.inc_fibnum = M_GETFIB(m); inc.inc_flags |= INC_ISIPV6; inc.inc_fport = t_ports.th_dport; inc.inc_lport = t_ports.th_sport; inc.inc6_faddr = *dst; inc.inc6_laddr = ip6->ip6_src; syncache_unreach(&inc, icmp_tcp_seq, port); } out: if (inp != NULL) INP_WUNLOCK(inp); } static void tcp6_ctlinput(struct ip6ctlparam *ctl) { tcp6_ctlinput_with_port(ctl, htons(0)); } static void tcp6_ctlinput_viaudp(udp_tun_icmp_param_t param) { struct ip6ctlparam *ip6cp = param.ip6cp; struct mbuf *m; struct udphdr *udp; uint16_t port; m = m_pulldown(ip6cp->ip6c_m, ip6cp->ip6c_off, sizeof(struct udphdr), NULL); if (m == NULL) { return; } udp = mtod(m, struct udphdr *); if (ntohs(udp->uh_sport) != V_tcp_udp_tunneling_port) { return; } port = udp->uh_dport; m_adj(m, sizeof(struct udphdr)); if ((m->m_flags & M_PKTHDR) == 0) { ip6cp->ip6c_m->m_pkthdr.len -= sizeof(struct udphdr); } /* Now call in to the normal handling code */ tcp6_ctlinput_with_port(ip6cp, port); } #endif /* INET6 */ static uint32_t tcp_keyed_hash(struct in_conninfo *inc, u_char *key, u_int len) { SIPHASH_CTX ctx; uint32_t hash[2]; KASSERT(len >= SIPHASH_KEY_LENGTH, ("%s: keylen %u too short ", __func__, len)); SipHash24_Init(&ctx); SipHash_SetKey(&ctx, (uint8_t *)key); SipHash_Update(&ctx, &inc->inc_fport, sizeof(uint16_t)); SipHash_Update(&ctx, &inc->inc_lport, sizeof(uint16_t)); switch (inc->inc_flags & INC_ISIPV6) { #ifdef INET case 0: SipHash_Update(&ctx, &inc->inc_faddr, sizeof(struct in_addr)); SipHash_Update(&ctx, &inc->inc_laddr, sizeof(struct in_addr)); break; #endif #ifdef INET6 case INC_ISIPV6: SipHash_Update(&ctx, &inc->inc6_faddr, sizeof(struct in6_addr)); SipHash_Update(&ctx, &inc->inc6_laddr, sizeof(struct in6_addr)); break; #endif } SipHash_Final((uint8_t *)hash, &ctx); return (hash[0] ^ hash[1]); } uint32_t tcp_new_ts_offset(struct in_conninfo *inc) { struct in_conninfo inc_store, *local_inc; if (!V_tcp_ts_offset_per_conn) { memcpy(&inc_store, inc, sizeof(struct in_conninfo)); inc_store.inc_lport = 0; inc_store.inc_fport = 0; local_inc = &inc_store; } else { local_inc = inc; } return (tcp_keyed_hash(local_inc, V_ts_offset_secret, sizeof(V_ts_offset_secret))); } /* * Following is where TCP initial sequence number generation occurs. * * There are two places where we must use initial sequence numbers: * 1. In SYN-ACK packets. * 2. In SYN packets. * * All ISNs for SYN-ACK packets are generated by the syncache. See * tcp_syncache.c for details. * * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling * depends on this property. In addition, these ISNs should be * unguessable so as to prevent connection hijacking. To satisfy * the requirements of this situation, the algorithm outlined in * RFC 1948 is used, with only small modifications. * * Implementation details: * * Time is based off the system timer, and is corrected so that it * increases by one megabyte per second. This allows for proper * recycling on high speed LANs while still leaving over an hour * before rollover. * * As reading the *exact* system time is too expensive to be done * whenever setting up a TCP connection, we increment the time * offset in two ways. First, a small random positive increment * is added to isn_offset for each connection that is set up. 
* Second, the function tcp_isn_tick fires once per clock tick * and increments isn_offset as necessary so that sequence numbers * are incremented at approximately ISN_BYTES_PER_SECOND. The * random positive increments serve only to ensure that the same * exact sequence number is never sent out twice (as could otherwise * happen when a port is recycled in less than the system tick * interval.) * * net.inet.tcp.isn_reseed_interval controls the number of seconds * between seeding of isn_secret. This is normally set to zero, * as reseeding should not be necessary. * * Locking of the global variables isn_secret, isn_last_reseed, isn_offset, * isn_offset_old, and isn_ctx is performed using the ISN lock. In * general, this means holding an exclusive (write) lock. */ #define ISN_BYTES_PER_SECOND 1048576 #define ISN_STATIC_INCREMENT 4096 #define ISN_RANDOM_INCREMENT (4096 - 1) #define ISN_SECRET_LENGTH SIPHASH_KEY_LENGTH VNET_DEFINE_STATIC(u_char, isn_secret[ISN_SECRET_LENGTH]); VNET_DEFINE_STATIC(int, isn_last); VNET_DEFINE_STATIC(int, isn_last_reseed); VNET_DEFINE_STATIC(u_int32_t, isn_offset); VNET_DEFINE_STATIC(u_int32_t, isn_offset_old); #define V_isn_secret VNET(isn_secret) #define V_isn_last VNET(isn_last) #define V_isn_last_reseed VNET(isn_last_reseed) #define V_isn_offset VNET(isn_offset) #define V_isn_offset_old VNET(isn_offset_old) tcp_seq tcp_new_isn(struct in_conninfo *inc) { tcp_seq new_isn; u_int32_t projected_offset; ISN_LOCK(); /* Seed if this is the first use, reseed if requested. */ if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) && (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz) < (u_int)ticks))) { arc4rand(&V_isn_secret, sizeof(V_isn_secret), 0); V_isn_last_reseed = ticks; } /* Compute the hash and return the ISN. */ new_isn = (tcp_seq)tcp_keyed_hash(inc, V_isn_secret, sizeof(V_isn_secret)); V_isn_offset += ISN_STATIC_INCREMENT + (arc4random() & ISN_RANDOM_INCREMENT); if (ticks != V_isn_last) { projected_offset = V_isn_offset_old + ISN_BYTES_PER_SECOND / hz * (ticks - V_isn_last); if (SEQ_GT(projected_offset, V_isn_offset)) V_isn_offset = projected_offset; V_isn_offset_old = V_isn_offset; V_isn_last = ticks; } new_isn += V_isn_offset; ISN_UNLOCK(); return (new_isn); } /* * When a specific ICMP unreachable message is received and the * connection state is SYN-SENT, drop the connection. This behavior * is controlled by the icmp_may_rst sysctl. */ static struct inpcb * tcp_drop_syn_sent(struct inpcb *inp, int errno) { struct tcpcb *tp; NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); tp = intotcpcb(inp); if (tp->t_state != TCPS_SYN_SENT) return (inp); if (tp->t_flags & TF_FASTOPEN) tcp_fastopen_disable_path(tp); tp = tcp_drop(tp, errno); if (tp != NULL) return (inp); else return (NULL); } /* * When `need fragmentation' ICMP is received, update our idea of the MSS * based on the new value. Also nudge TCP to send something, since we * know the packet we just sent was dropped. * This duplicates some code in the tcp_mss() function in tcp_input.c. */ static struct inpcb * tcp_mtudisc_notify(struct inpcb *inp, int error) { return (tcp_mtudisc(inp, -1)); } static struct inpcb * tcp_mtudisc(struct inpcb *inp, int mtuoffer) { struct tcpcb *tp; struct socket *so; INP_WLOCK_ASSERT(inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); tcp_mss_update(tp, -1, mtuoffer, NULL, NULL); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_snd); /* If the mss is larger than the socket buffer, decrease the mss. 
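 * (Editor's note: e.g. a socket with a 4096-byte send buffer is capped
 * at an MSS of 4096, since a larger segment could never be filled from
 * so_snd.)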
*/ if (so->so_snd.sb_hiwat < tp->t_maxseg) tp->t_maxseg = so->so_snd.sb_hiwat; SOCKBUF_UNLOCK(&so->so_snd); TCPSTAT_INC(tcps_mturesent); tp->t_rtttime = 0; tp->snd_nxt = tp->snd_una; tcp_free_sackholes(tp); tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_SACK_PERMIT) EXIT_FASTRECOVERY(tp->t_flags); if (tp->t_fb->tfb_tcp_mtu_chg != NULL) { /* * Conceptually the snd_nxt setting * and freeing sack holes should * be done by the default stack's * own tfb_tcp_mtu_chg(). */ tp->t_fb->tfb_tcp_mtu_chg(tp); } if (tcp_output(tp) < 0) return (NULL); else return (inp); } #ifdef INET /* * Look up the routing entry to the peer of this inpcb. If no route * is found and it cannot be allocated, then return 0. This routine * is called by TCP routines that access the rmx structure and by * tcp_mss_update to get the peer/interface MTU. */ uint32_t tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct nhop_object *nh; struct ifnet *ifp; uint32_t maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); if (inc->inc_faddr.s_addr != INADDR_ANY) { nh = fib4_lookup(inc->inc_fibnum, inc->inc_faddr, 0, NHR_NONE, 0); if (nh == NULL) return (0); ifp = nh->nh_ifp; maxmtu = nh->nh_mtu; /* Report additional interface capabilities. */ if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO4 && ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } } return (maxmtu); } #endif /* INET */ #ifdef INET6 uint32_t tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct nhop_object *nh; struct in6_addr dst6; uint32_t scopeid; struct ifnet *ifp; uint32_t maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); if (inc->inc_flags & INC_IPV6MINMTU) return (IPV6_MMTU); if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { in6_splitscope(&inc->inc6_faddr, &dst6, &scopeid); nh = fib6_lookup(inc->inc_fibnum, &dst6, scopeid, NHR_NONE, 0); if (nh == NULL) return (0); ifp = nh->nh_ifp; maxmtu = nh->nh_mtu; /* Report additional interface capabilities. */ if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO6 && ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } } return (maxmtu); } /* * Handle setsockopt(IPV6_USE_MIN_MTU) by a TCP stack. * * XXXGL: we are updating inpcb here with INC_IPV6MINMTU flag. * The right place to do that is ip6_setpktopt() that has just been * executed. By the way it just filled ip6po_minmtu for us. */ void tcp6_use_min_mtu(struct tcpcb *tp) { struct inpcb *inp = tptoinpcb(tp); INP_WLOCK_ASSERT(inp); /* * In the case of the IPV6_USE_MIN_MTU socket * option, we set the INC_IPV6MINMTU flag to announce * a corresponding MSS during the initial * handshake. If the TCP connection is not in * the front states, just reduce the MSS being * used. This avoids the sending of TCP * segments which will be fragmented at the * IPv6 layer. */ inp->inp_inc.inc_flags |= INC_IPV6MINMTU; if ((tp->t_state >= TCPS_SYN_SENT) && (inp->inp_inc.inc_flags & INC_ISIPV6)) { struct ip6_pktopts *opt; opt = inp->in6p_outputopts; if (opt != NULL && opt->ip6po_minmtu == IP6PO_MINMTU_ALL && tp->t_maxseg > TCP6_MSS) tp->t_maxseg = TCP6_MSS; } } #endif /* INET6 */ /* * Calculate the effective SMSS per the RFC 5681 definition for a given TCP * connection at its current state, taking into account SACK, etc. */
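/*
 * Illustrative sketch (not part of this change): the effective SMSS is
 * t_maxseg minus the padded length of the options carried on most
 * segments. The numbers below assume the common case of RFC 7323
 * timestamps (a 10-byte option, padded to 12) on an established
 * connection with a 1500-byte MTU.
 */
#if 0	/* user-space example; never compiled into the kernel */
#include <stdio.h>

/* Same rounding rule as the PAD()/PADTCPOLEN() macros used below. */
#define EX_PADTCPOLEN(len)	((((len) / 4) + !!((len) % 4)) * 4)

int
main(void)
{
	unsigned int t_maxseg = 1460;			/* assumed MSS */
	unsigned int optlen = EX_PADTCPOLEN(10);	/* timestamps */

	/* 1460 - 12 = 1448 bytes of payload per segment. */
	printf("effective SMSS: %u\n", t_maxseg - optlen);
	return (0);
}
#endif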
u_int tcp_maxseg(const struct tcpcb *tp) { u_int optlen; if (tp->t_flags & TF_NOOPT) return (tp->t_maxseg); /* * Here we have simplified code from tcp_addoptions(), * without a proper loop, and with most of the paddings hardcoded. * We might make mistakes with padding here in some edge cases, * but this is harmless, since the result of tcp_maxseg() is used * only in cwnd and ssthresh estimations. */ if (TCPS_HAVEESTABLISHED(tp->t_state)) { if (tp->t_flags & TF_RCVD_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = 0; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PADTCPOLEN(TCPOLEN_SIGNATURE); #endif if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) { optlen += TCPOLEN_SACKHDR; optlen += tp->rcv_numsacks * TCPOLEN_SACK; optlen = PADTCPOLEN(optlen); } } else { if (tp->t_flags & TF_REQ_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = PADTCPOLEN(TCPOLEN_MAXSEG); if (tp->t_flags & TF_REQ_SCALE) optlen += PADTCPOLEN(TCPOLEN_WINDOW); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PADTCPOLEN(TCPOLEN_SIGNATURE); #endif if (tp->t_flags & TF_SACK_PERMIT) optlen += PADTCPOLEN(TCPOLEN_SACK_PERMITTED); } optlen = min(optlen, TCP_MAXOLEN); return (tp->t_maxseg - optlen); } u_int tcp_fixed_maxseg(const struct tcpcb *tp) { int optlen; if (tp->t_flags & TF_NOOPT) return (tp->t_maxseg); /* * Here we have simplified code from tcp_addoptions(), * without a proper loop, and with most of the paddings hardcoded. * We only consider fixed options that we would send every * time, i.e. SACK is not considered. This is important * for cc modules to figure out what the modulo of the * cwnd should be. */ #define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4) if (TCPS_HAVEESTABLISHED(tp->t_state)) { if (tp->t_flags & TF_RCVD_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = 0; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PAD(TCPOLEN_SIGNATURE); #endif } else { if (tp->t_flags & TF_REQ_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = PAD(TCPOLEN_MAXSEG); if (tp->t_flags & TF_REQ_SCALE) optlen += PAD(TCPOLEN_WINDOW); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PAD(TCPOLEN_SIGNATURE); #endif if (tp->t_flags & TF_SACK_PERMIT) optlen += PAD(TCPOLEN_SACK_PERMITTED); } #undef PAD optlen = min(optlen, TCP_MAXOLEN); return (tp->t_maxseg - optlen); } static int sysctl_drop(SYSCTL_HANDLER_ARGS) { /* addrs[0] is a foreign socket, addrs[1] is a local one.
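 */
/*
 * A user-space sketch of driving this net.inet.tcp.drop handler
 * (essentially what tcpdrop(8) does). Hedged illustration only: it
 * assumes IPv4 sockaddrs whose sin_len, sin_family, addresses and
 * ports the caller has already filled in.
 */
#if 0	/* example; not kernel code */
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <netinet/in.h>
#include <string.h>

static int
drop_tcp_connection(const struct sockaddr_in *faddr,
    const struct sockaddr_in *laddr)
{
	struct sockaddr_storage addrs[2];

	memset(addrs, 0, sizeof(addrs));
	memcpy(&addrs[0], faddr, sizeof(*faddr));	/* foreign first */
	memcpy(&addrs[1], laddr, sizeof(*laddr));	/* local second */
	/* The handler takes new data only; requesting old data fails. */
	return (sysctlbyname("net.inet.tcp.drop", NULL, NULL,
	    addrs, sizeof(addrs)));
}
#endif
/* addrs layout, as noted above: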
*/ struct sockaddr_storage addrs[2]; struct inpcb *inp; struct tcpcb *tp; #ifdef INET struct sockaddr_in *fin = NULL, *lin = NULL; #endif struct epoch_tracker et; #ifdef INET6 struct sockaddr_in6 *fin6, *lin6; #endif int error; inp = NULL; #ifdef INET6 fin6 = lin6 = NULL; #endif error = 0; if (req->oldptr != NULL || req->oldlen != 0) return (EINVAL); if (req->newptr == NULL) return (EPERM); if (req->newlen < sizeof(addrs)) return (ENOMEM); error = SYSCTL_IN(req, &addrs, sizeof(addrs)); if (error) return (error); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: fin6 = (struct sockaddr_in6 *)&addrs[0]; lin6 = (struct sockaddr_in6 *)&addrs[1]; if (fin6->sin6_len != sizeof(struct sockaddr_in6) || lin6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) return (EINVAL); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); #ifdef INET fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; #endif break; } error = sa6_embedscope(fin6, V_ip6_use_defzone); if (error) return (error); error = sa6_embedscope(lin6, V_ip6_use_defzone); if (error) return (error); break; #endif #ifdef INET case AF_INET: fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; if (fin->sin_len != sizeof(struct sockaddr_in) || lin->sin_len != sizeof(struct sockaddr_in)) return (EINVAL); break; #endif default: return (EINVAL); } NET_EPOCH_ENTER(et); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr, fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif #ifdef INET case AF_INET: inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port, lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif } if (inp != NULL) { if (!SOLISTENING(inp->inp_socket)) { tp = intotcpcb(inp); tp = tcp_drop(tp, ECONNABORTED); if (tp != NULL) INP_WUNLOCK(inp); } else INP_WUNLOCK(inp); } else error = ESRCH; NET_EPOCH_EXIT(et); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_drop, "", "Drop TCP connection"); static int tcp_sysctl_setsockopt(SYSCTL_HANDLER_ARGS) { return (sysctl_setsockopt(oidp, arg1, arg2, req, &V_tcbinfo, &tcp_ctloutput_set)); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, setsockopt, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP | CTLFLAG_MPSAFE, NULL, 0, tcp_sysctl_setsockopt, "", "Set socket option for TCP endpoint"); #ifdef KERN_TLS static int sysctl_switch_tls(SYSCTL_HANDLER_ARGS) { /* addrs[0] is a foreign socket, addrs[1] is a local one. 
*/ struct sockaddr_storage addrs[2]; struct inpcb *inp; #ifdef INET struct sockaddr_in *fin = NULL, *lin = NULL; #endif struct epoch_tracker et; #ifdef INET6 struct sockaddr_in6 *fin6, *lin6; #endif int error; inp = NULL; #ifdef INET6 fin6 = lin6 = NULL; #endif error = 0; if (req->oldptr != NULL || req->oldlen != 0) return (EINVAL); if (req->newptr == NULL) return (EPERM); if (req->newlen < sizeof(addrs)) return (ENOMEM); error = SYSCTL_IN(req, &addrs, sizeof(addrs)); if (error) return (error); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: fin6 = (struct sockaddr_in6 *)&addrs[0]; lin6 = (struct sockaddr_in6 *)&addrs[1]; if (fin6->sin6_len != sizeof(struct sockaddr_in6) || lin6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) return (EINVAL); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); #ifdef INET fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; #endif break; } error = sa6_embedscope(fin6, V_ip6_use_defzone); if (error) return (error); error = sa6_embedscope(lin6, V_ip6_use_defzone); if (error) return (error); break; #endif #ifdef INET case AF_INET: fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; if (fin->sin_len != sizeof(struct sockaddr_in) || lin->sin_len != sizeof(struct sockaddr_in)) return (EINVAL); break; #endif default: return (EINVAL); } NET_EPOCH_ENTER(et); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr, fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif #ifdef INET case AF_INET: inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port, lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif } NET_EPOCH_EXIT(et); if (inp != NULL) { struct socket *so; so = inp->inp_socket; soref(so); error = ktls_set_tx_mode(so, arg2 == 0 ? TCP_TLS_MODE_SW : TCP_TLS_MODE_IFNET); INP_WUNLOCK(inp); sorele(so); } else error = ESRCH; return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, switch_to_sw_tls, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_switch_tls, "", "Switch TCP connection to SW TLS"); SYSCTL_PROC(_net_inet_tcp, OID_AUTO, switch_to_ifnet_tls, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, NULL, 1, sysctl_switch_tls, "", "Switch TCP connection to ifnet TLS"); #endif /* * Generate a standardized TCP log line for use throughout the * tcp subsystem. Memory allocation is done with M_NOWAIT to * allow use in the interrupt context. * * NB: The caller MUST free(s, M_TCPLOG) the returned string. * NB: The function may return NULL if memory allocation failed. * * Due to header inclusion and ordering limitations the struct ip * and ip6_hdr pointers have to be passed as void pointers. */ char * tcp_log_vain(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr, const void *ip6hdr) { /* Is logging enabled? */ if (V_tcp_log_in_vain == 0) return (NULL); return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); } char * tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr, const void *ip6hdr) { /* Is logging enabled? 
*/ if (tcp_log_debug == 0) return (NULL); return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); } static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr, const void *ip6hdr) { char *s, *sp; size_t size; #ifdef INET const struct ip *ip = (const struct ip *)ip4hdr; #endif #ifdef INET6 const struct ip6_hdr *ip6 = (const struct ip6_hdr *)ip6hdr; #endif /* INET6 */ /* * The log line looks like this: * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2" */ size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") + sizeof(PRINT_TH_FLAGS) + 1 + #ifdef INET6 2 * INET6_ADDRSTRLEN; #else 2 * INET_ADDRSTRLEN; #endif /* INET6 */ s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT); if (s == NULL) return (NULL); strcat(s, "TCP: ["); sp = s + strlen(s); if (inc && ((inc->inc_flags & INC_ISIPV6) == 0)) { inet_ntoa_r(inc->inc_faddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); inet_ntoa_r(inc->inc_laddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); #ifdef INET6 } else if (inc) { ip6_sprintf(sp, &inc->inc6_faddr); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); ip6_sprintf(sp, &inc->inc6_laddr); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); } else if (ip6 && th) { ip6_sprintf(sp, &ip6->ip6_src); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); ip6_sprintf(sp, &ip6->ip6_dst); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET6 */ #ifdef INET } else if (ip && th) { inet_ntoa_r(ip->ip_src, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); inet_ntoa_r(ip->ip_dst, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET */ } else { free(s, M_TCPLOG); return (NULL); } sp = s + strlen(s); if (th) sprintf(sp, " tcpflags 0x%b", tcp_get_flags(th), PRINT_TH_FLAGS); if (*(s + size - 1) != '\0') panic("%s: string too long", __func__); return (s); } /* * A subroutine which makes it easy to track TCP state changes with DTrace. * This function shouldn't be called for t_state initializations that don't * correspond to actual TCP state transitions. */ void tcp_state_change(struct tcpcb *tp, int newstate) { #if defined(KDTRACE_HOOKS) int pstate = tp->t_state; #endif TCPSTATES_DEC(tp->t_state); TCPSTATES_INC(newstate); tp->t_state = newstate; TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, pstate); } /* * Create an external-format (``xtcpcb'') structure using the information in * the kernel-format tcpcb structure pointed to by tp. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. 
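 *
 * A hedged user-space reader sketch (essentially what netstat(1) does)
 * that walks the xtcpcb records returned by the net.inet.tcp.pcblist
 * sysctl; the header list and the xinpgen walking idiom are assumptions
 * based on the usual consumers, not part of this change.
 */
#if 0	/* example; not kernel code */
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <stdio.h>
#include <stdlib.h>

static void
dump_tcp_pcbs(void)
{
	struct xinpgen *head, *xig;
	char *buf;
	size_t len = 0;

	if (sysctlbyname("net.inet.tcp.pcblist", NULL, &len, NULL, 0) == -1)
		return;
	if ((buf = malloc(len)) == NULL)
		return;
	if (sysctlbyname("net.inet.tcp.pcblist", buf, &len, NULL, 0) == -1) {
		free(buf);
		return;
	}
	/* Records sit between a leading and a trailing xinpgen. */
	head = (struct xinpgen *)buf;
	for (xig = (struct xinpgen *)((char *)head + head->xig_len);
	    xig->xig_len > sizeof(struct xinpgen);
	    xig = (struct xinpgen *)((char *)xig + xig->xig_len)) {
		const struct xtcpcb *xt = (const struct xtcpcb *)xig;

		printf("state %d cwnd %u\n", (int)xt->t_state,
		    (unsigned int)xt->t_snd_cwnd);
	}
	free(buf);
}
#endif
/* tcp_inptoxtp() fills those records: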
*/ void tcp_inptoxtp(const struct inpcb *inp, struct xtcpcb *xt) { struct tcpcb *tp = intotcpcb(inp); sbintime_t now; bzero(xt, sizeof(*xt)); xt->t_state = tp->t_state; xt->t_logstate = tcp_get_bblog_state(tp); xt->t_flags = tp->t_flags; xt->t_sndzerowin = tp->t_sndzerowin; xt->t_sndrexmitpack = tp->t_sndrexmitpack; xt->t_rcvoopack = tp->t_rcvoopack; xt->t_rcv_wnd = tp->rcv_wnd; xt->t_snd_wnd = tp->snd_wnd; xt->t_snd_cwnd = tp->snd_cwnd; xt->t_snd_ssthresh = tp->snd_ssthresh; xt->t_dsack_bytes = tp->t_dsack_bytes; xt->t_dsack_tlp_bytes = tp->t_dsack_tlp_bytes; xt->t_dsack_pack = tp->t_dsack_pack; xt->t_maxseg = tp->t_maxseg; xt->xt_ecn = (tp->t_flags2 & TF2_ECN_PERMIT) ? 1 : 0 + (tp->t_flags2 & TF2_ACE_PERMIT) ? 2 : 0; now = getsbinuptime(); #define COPYTIMER(which,where) do { \ if (tp->t_timers[which] != SBT_MAX) \ xt->where = (tp->t_timers[which] - now) / SBT_1MS; \ else \ xt->where = 0; \ } while (0) COPYTIMER(TT_DELACK, tt_delack); COPYTIMER(TT_REXMT, tt_rexmt); COPYTIMER(TT_PERSIST, tt_persist); COPYTIMER(TT_KEEP, tt_keep); COPYTIMER(TT_2MSL, tt_2msl); #undef COPYTIMER xt->t_rcvtime = 1000 * (ticks - tp->t_rcvtime) / hz; xt->xt_encaps_port = tp->t_port; bcopy(tp->t_fb->tfb_tcp_block_name, xt->xt_stack, TCP_FUNCTION_NAME_LEN_MAX); bcopy(CC_ALGO(tp)->name, xt->xt_cc, TCP_CA_NAME_MAX); #ifdef TCP_BLACKBOX (void)tcp_log_get_id(tp, xt->xt_logid); #endif xt->xt_len = sizeof(struct xtcpcb); in_pcbtoxinpcb(inp, &xt->xt_inp); - /* - * TCP doesn't use inp_ppcb pointer, we embed inpcb into tcpcb. - * Fixup the pointer that in_pcbtoxinpcb() has set. When printing - * TCP netstat(1) used to use this pointer, so this fixup needs to - * stay for stable/14. - */ - xt->xt_inp.inp_ppcb = (uintptr_t)tp; } void tcp_log_end_status(struct tcpcb *tp, uint8_t status) { uint32_t bit, i; if ((tp == NULL) || (status > TCP_EI_STATUS_MAX_VALUE) || (status == 0)) { /* Invalid */ return; } if (status > (sizeof(uint32_t) * 8)) { /* Should this be a KASSERT? 
*/ return; } bit = 1U << (status - 1); if (bit & tp->t_end_info_status) { /* already logged */ return; } for (i = 0; i < TCP_END_BYTE_INFO; i++) { if (tp->t_end_info_bytes[i] == TCP_EI_EMPTY_SLOT) { tp->t_end_info_bytes[i] = status; tp->t_end_info_status |= bit; break; } } } int tcp_can_enable_pacing(void) { if ((tcp_pacing_limit == -1) || (tcp_pacing_limit > number_of_tcp_connections_pacing)) { atomic_fetchadd_int(&number_of_tcp_connections_pacing, 1); shadow_num_connections = number_of_tcp_connections_pacing; return (1); } else { counter_u64_add(tcp_pacing_failures, 1); return (0); } } int tcp_incr_dgp_pacing_cnt(void) { if ((tcp_dgp_limit == -1) || (tcp_dgp_limit > number_of_dgp_connections)) { atomic_fetchadd_int(&number_of_dgp_connections, 1); shadow_tcp_pacing_dgp = number_of_dgp_connections; return (1); } else { counter_u64_add(tcp_dgp_failures, 1); return (0); } } static uint8_t tcp_dgp_warning = 0; void tcp_dec_dgp_pacing_cnt(void) { uint32_t ret; ret = atomic_fetchadd_int(&number_of_dgp_connections, -1); shadow_tcp_pacing_dgp = number_of_dgp_connections; KASSERT(ret != 0, ("number_of_dgp_connections -1 would cause wrap?")); if (ret == 0) { if (tcp_dgp_limit != -1) { printf("Warning all DGP is now disabled, count decrements invalidly!\n"); tcp_dgp_limit = 0; tcp_dgp_warning = 1; } else if (tcp_dgp_warning == 0) { printf("Warning DGP pacing is invalid, invalid decrement\n"); tcp_dgp_warning = 1; } } } static uint8_t tcp_pacing_warning = 0; void tcp_decrement_paced_conn(void) { uint32_t ret; ret = atomic_fetchadd_int(&number_of_tcp_connections_pacing, -1); shadow_num_connections = number_of_tcp_connections_pacing; KASSERT(ret != 0, ("tcp_paced_connection_exits -1 would cause wrap?")); if (ret == 0) { if (tcp_pacing_limit != -1) { printf("Warning all pacing is now disabled, count decrements invalidly!\n"); tcp_pacing_limit = 0; } else if (tcp_pacing_warning == 0) { printf("Warning pacing count is invalid, invalid decrement\n"); tcp_pacing_warning = 1; } } } static void tcp_default_switch_failed(struct tcpcb *tp) { /* * If a switch fails we only need to * care about two things: * a) The t_flags2 * and * b) The timer granularity. * Timeouts, at least for now, don't use the * old callout system in the other stacks so * those are hopefully safe. */ tcp_lro_features_off(tp); tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS); } #ifdef TCP_ACCOUNTING int tcp_do_ack_accounting(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, uint32_t tiwin, int mss) { if (SEQ_LT(th->th_ack, tp->snd_una)) { /* Do we have a SACK? */ if (to->to_flags & TOF_SACK) { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_SACK]++; } return (ACK_SACK); } else { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_BEHIND]++; } return (ACK_BEHIND); } } else if (th->th_ack == tp->snd_una) { /* Do we have a SACK? 
*/ if (to->to_flags & TOF_SACK) { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_SACK]++; } return (ACK_SACK); } else if (tiwin != tp->snd_wnd) { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_RWND]++; } return (ACK_RWND); } else { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_DUPACK]++; } return (ACK_DUPACK); } } else { if (!SEQ_GT(th->th_ack, tp->snd_max)) { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[CNT_OF_ACKS_IN] += (((th->th_ack - tp->snd_una) + mss - 1)/mss); } } if (to->to_flags & TOF_SACK) { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_CUMACK_SACK]++; } return (ACK_CUMACK_SACK); } else { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_CUMACK]++; } return (ACK_CUMACK); } } } #endif void tcp_change_time_units(struct tcpcb *tp, int granularity) { if (tp->t_tmr_granularity == granularity) { /* We are there */ return; } if (granularity == TCP_TMR_GRANULARITY_USEC) { KASSERT((tp->t_tmr_granularity == TCP_TMR_GRANULARITY_TICKS), ("Granularity is not TICKS its %u in tp:%p", tp->t_tmr_granularity, tp)); tp->t_rttlow = TICKS_2_USEC(tp->t_rttlow); if (tp->t_srtt > 1) { uint32_t val, frac; val = tp->t_srtt >> TCP_RTT_SHIFT; frac = tp->t_srtt & 0x1f; tp->t_srtt = TICKS_2_USEC(val); /* * frac is the fractional part of the srtt (if any), * but it's in ticks and every bit represents * 1/32nd of a hz. */ if (frac) { if (hz == 1000) { frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE); } else { frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE)); } tp->t_srtt += frac; } } if (tp->t_rttvar) { uint32_t val, frac; val = tp->t_rttvar >> TCP_RTTVAR_SHIFT; frac = tp->t_rttvar & 0x1f; tp->t_rttvar = TICKS_2_USEC(val); /* * frac is the fractional part of the rttvar (if any), * but it's in ticks and every bit represents * 1/32nd of a hz. */ if (frac) { if (hz == 1000) { frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE); } else { frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE)); } tp->t_rttvar += frac; } } tp->t_tmr_granularity = TCP_TMR_GRANULARITY_USEC; } else if (granularity == TCP_TMR_GRANULARITY_TICKS) { /* Convert back to ticks. */ KASSERT((tp->t_tmr_granularity == TCP_TMR_GRANULARITY_USEC), ("Granularity is not USEC its %u in tp:%p", tp->t_tmr_granularity, tp)); if (tp->t_srtt > 1) { uint32_t val, frac; val = USEC_2_TICKS(tp->t_srtt); frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz); tp->t_srtt = val << TCP_RTT_SHIFT; /* * frac here is the fractional part left * over from converting to hz and shifting. * We need to convert this to the 5 bit * remainder. */ if (frac) { if (hz == 1000) { frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC); } else { frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC); } tp->t_srtt += frac; } } if (tp->t_rttvar) { uint32_t val, frac; val = USEC_2_TICKS(tp->t_rttvar); frac = tp->t_rttvar % (HPTS_USEC_IN_SEC / hz); tp->t_rttvar = val << TCP_RTTVAR_SHIFT; /* * frac here is the fractional part left * over from converting to hz and shifting. * We need to convert this to the 5 bit * remainder.
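 *
 * To make the conversion concrete (a worked example, assuming
 * hz = 1000, illustrated with t_srtt and its TCP_RTT_SHIFT of 5):
 * a stored value of 4500 usec gives val = USEC_2_TICKS(4500) = 4
 * ticks with frac = 500 usec left over; 500 usec maps to
 * 500 * TCP_RTT_SCALE / HPTS_USEC_IN_MSEC = 16/32nds of a tick, so
 * the fixed-point result is (4 << TCP_RTT_SHIFT) + 16 = 144,
 * i.e. 4.5 ticks.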
*/ if (frac) { if (hz == 1000) { frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC); } else { frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC); } tp->t_rttvar += frac; } } tp->t_rttlow = USEC_2_TICKS(tp->t_rttlow); tp->t_tmr_granularity = TCP_TMR_GRANULARITY_TICKS; } #ifdef INVARIANTS else { panic("Unknown granularity:%d tp:%p", granularity, tp); } #endif } void tcp_handle_orphaned_packets(struct tcpcb *tp) { struct mbuf *save, *m, *prev; /* * Called when a stack switch is occurring from the fini() * of the old stack. We assume the init() of the new stack * has already been run and that it has set the t_flags2 to * what it supports. This function then deals with any * differences, i.e. it cleans up packets that may be queued * which the new stack does not support. */ if (tp->t_flags2 & TF2_MBUF_L_ACKS) return; if ((tp->t_flags2 & TF2_SUPPORTS_MBUFQ) == 0 && !STAILQ_EMPTY(&tp->t_inqueue)) { /* * It is unsafe to process the packets since a * reset may be lurking in them (it's rare but it * can occur). If we were to find a RST, then we * would end up dropping the connection and the * INP lock, so when we return the caller (tcp_usrreq) * will blow up when it tries to unlock the inp. * This new stack does not do any fancy LRO features * so all we can do is toss the packets. */ m = STAILQ_FIRST(&tp->t_inqueue); STAILQ_INIT(&tp->t_inqueue); STAILQ_FOREACH_FROM_SAFE(m, &tp->t_inqueue, m_stailqpkt, save) m_freem(m); } else { /* * Here we have a stack that does mbuf queuing but * does not support compressed ACKs. We must * walk all the mbufs and discard any compressed ACKs. */ STAILQ_FOREACH_SAFE(m, &tp->t_inqueue, m_stailqpkt, save) { if (m->m_flags & M_ACKCMP) { if (m == STAILQ_FIRST(&tp->t_inqueue)) STAILQ_REMOVE_HEAD(&tp->t_inqueue, m_stailqpkt); else STAILQ_REMOVE_AFTER(&tp->t_inqueue, prev, m_stailqpkt); m_freem(m); } else prev = m; } } } #ifdef TCP_REQUEST_TRK uint32_t tcp_estimate_tls_overhead(struct socket *so, uint64_t tls_usr_bytes) { #ifdef KERN_TLS struct ktls_session *tls; uint32_t rec_oh, records; tls = so->so_snd.sb_tls_info; if (tls == NULL) return (0); rec_oh = tls->params.tls_hlen + tls->params.tls_tlen; records = ((tls_usr_bytes + tls->params.max_frame_len - 1)/tls->params.max_frame_len); return (records * rec_oh); #else return (0); #endif } extern uint32_t tcp_stale_entry_time; uint32_t tcp_stale_entry_time = 250000; SYSCTL_UINT(_net_inet_tcp, OID_AUTO, usrlog_stale, CTLFLAG_RW, &tcp_stale_entry_time, 250000, "Time that a tcpreq entry without a sendfile ages out"); void tcp_req_log_req_info(struct tcpcb *tp, struct tcp_sendfile_track *req, uint16_t slot, uint8_t val, uint64_t offset, uint64_t nbytes) { if (tcp_bblogging_on(tp)) { union tcp_log_stackspecific log; struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = tcp_in_hpts(tp); log.u_bbr.flex8 = val; log.u_bbr.rttProp = req->timestamp; log.u_bbr.delRate = req->start; log.u_bbr.cur_del_rate = req->end; log.u_bbr.flex1 = req->start_seq; log.u_bbr.flex2 = req->end_seq; log.u_bbr.flex3 = req->flags; log.u_bbr.flex4 = ((req->localtime >> 32) & 0x00000000ffffffff); log.u_bbr.flex5 = (req->localtime & 0x00000000ffffffff); log.u_bbr.flex7 = slot; log.u_bbr.bw_inuse = offset; /* nbytes = flex6 | epoch */ log.u_bbr.flex6 = ((nbytes >> 32) & 0x00000000ffffffff); log.u_bbr.epoch = (nbytes & 0x00000000ffffffff); /* cspr = lt_epoch | pkts_out */ log.u_bbr.lt_epoch = ((req->cspr >> 32) & 0x00000000ffffffff); log.u_bbr.pkts_out |= (req->cspr &
0x00000000ffffffff); log.u_bbr.applimited = tp->t_tcpreq_closed; log.u_bbr.applimited <<= 8; log.u_bbr.applimited |= tp->t_tcpreq_open; log.u_bbr.applimited <<= 8; log.u_bbr.applimited |= tp->t_tcpreq_req; log.u_bbr.timeStamp = tcp_get_usecs(&tv); TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv, &tptosocket(tp)->so_snd, TCP_LOG_REQ_T, 0, 0, &log, false, &tv); } } void tcp_req_free_a_slot(struct tcpcb *tp, struct tcp_sendfile_track *ent) { if (tp->t_tcpreq_req > 0) tp->t_tcpreq_req--; if (ent->flags & TCP_TRK_TRACK_FLG_OPEN) { if (tp->t_tcpreq_open > 0) tp->t_tcpreq_open--; } else { if (tp->t_tcpreq_closed > 0) tp->t_tcpreq_closed--; } ent->flags = TCP_TRK_TRACK_FLG_EMPTY; } static void tcp_req_check_for_stale_entries(struct tcpcb *tp, uint64_t ts, int rm_oldest) { struct tcp_sendfile_track *ent; uint64_t time_delta, oldest_delta; int i, oldest, oldest_set = 0, cnt_rm = 0; for(i = 0; i < MAX_TCP_TRK_REQ; i++) { ent = &tp->t_tcpreq_info[i]; if (ent->flags != TCP_TRK_TRACK_FLG_USED) { /* * We only care about closed end ranges * that are allocated and have no sendfile * ever touching them. They would be in * state USED. */ continue; } if (ts >= ent->localtime) time_delta = ts - ent->localtime; else time_delta = 0; if (time_delta && ((oldest_delta < time_delta) || (oldest_set == 0))) { oldest_set = 1; oldest = i; oldest_delta = time_delta; } if (tcp_stale_entry_time && (time_delta >= tcp_stale_entry_time)) { /* * No sendfile within our time limit; * time to purge it. */ cnt_rm++; tcp_req_log_req_info(tp, &tp->t_tcpreq_info[i], i, TCP_TRK_REQ_LOG_STALE, time_delta, 0); tcp_req_free_a_slot(tp, ent); } } if ((cnt_rm == 0) && rm_oldest && oldest_set) { ent = &tp->t_tcpreq_info[oldest]; tcp_req_log_req_info(tp, ent, oldest, TCP_TRK_REQ_LOG_STALE, oldest_delta, 1); tcp_req_free_a_slot(tp, ent); } } int tcp_req_check_for_comp(struct tcpcb *tp, tcp_seq ack_point) { int i, ret=0; struct tcp_sendfile_track *ent; /* Clean up any old closed end requests that are now completed */ if (tp->t_tcpreq_req == 0) return(0); if (tp->t_tcpreq_closed == 0) return(0); for(i = 0; i < MAX_TCP_TRK_REQ; i++) { ent = &tp->t_tcpreq_info[i]; /* Skip empty ones */ if (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) continue; /* Skip open ones */ if (ent->flags & TCP_TRK_TRACK_FLG_OPEN) continue; if (SEQ_GEQ(ack_point, ent->end_seq)) { /* We are past it -- free it */ tcp_req_log_req_info(tp, ent, i, TCP_TRK_REQ_LOG_FREED, 0, 0); tcp_req_free_a_slot(tp, ent); ret++; } } return (ret); } int tcp_req_is_entry_comp(struct tcpcb *tp, struct tcp_sendfile_track *ent, tcp_seq ack_point) { if (tp->t_tcpreq_req == 0) return(-1); if (tp->t_tcpreq_closed == 0) return(-1); if (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) return(-1); if (SEQ_GEQ(ack_point, ent->end_seq)) { return (1); } return (0); } struct tcp_sendfile_track * tcp_req_find_a_req_that_is_completed_by(struct tcpcb *tp, tcp_seq th_ack, int *ip) { /* * Given an ack point (th_ack), walk through our entries and * return the first one found that th_ack goes past the * end_seq.
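 *
 * For example (illustrative numbers only): with closed entries whose
 * end_seq values are 1000 and 2000, a th_ack of 1500 returns the
 * entry ending at 1000 and leaves the other in place.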
*/ struct tcp_sendfile_track *ent; int i; if (tp->t_tcpreq_req == 0) { /* none open */ return (NULL); } for(i = 0; i < MAX_TCP_TRK_REQ; i++) { ent = &tp->t_tcpreq_info[i]; if (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) continue; if ((ent->flags & TCP_TRK_TRACK_FLG_OPEN) == 0) { if (SEQ_GEQ(th_ack, ent->end_seq)) { *ip = i; return (ent); } } } return (NULL); } struct tcp_sendfile_track * tcp_req_find_req_for_seq(struct tcpcb *tp, tcp_seq seq) { struct tcp_sendfile_track *ent; int i; if (tp->t_tcpreq_req == 0) { /* none open */ return (NULL); } for(i = 0; i < MAX_TCP_TRK_REQ; i++) { ent = &tp->t_tcpreq_info[i]; tcp_req_log_req_info(tp, ent, i, TCP_TRK_REQ_LOG_SEARCH, (uint64_t)seq, 0); if (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) { continue; } if (ent->flags & TCP_TRK_TRACK_FLG_OPEN) { /* * An open end request only needs to * match the beginning seq or be * all we have (once we keep going on * an open end request we may have a seq * wrap). */ if ((SEQ_GEQ(seq, ent->start_seq)) || (tp->t_tcpreq_closed == 0)) return (ent); } else { /* * For this one we need to * be a bit more careful, at least * if it's completed. */ if ((SEQ_GEQ(seq, ent->start_seq)) && (SEQ_LT(seq, ent->end_seq))) { return (ent); } } } return (NULL); } /* Should this be in its own file, tcp_req.c? */ struct tcp_sendfile_track * tcp_req_alloc_req_full(struct tcpcb *tp, struct tcp_snd_req *req, uint64_t ts, int rec_dups) { struct tcp_sendfile_track *fil; int i, allocated; /* In case the stack does not check for completions, do so now */ tcp_req_check_for_comp(tp, tp->snd_una); /* Check for stale entries */ if (tp->t_tcpreq_req) tcp_req_check_for_stale_entries(tp, ts, (tp->t_tcpreq_req >= MAX_TCP_TRK_REQ)); /* Check to see if this is a duplicate of one not started */ if (tp->t_tcpreq_req) { for(i = 0, allocated = 0; i < MAX_TCP_TRK_REQ; i++) { fil = &tp->t_tcpreq_info[i]; if ((fil->flags & TCP_TRK_TRACK_FLG_USED) == 0) continue; if ((fil->timestamp == req->timestamp) && (fil->start == req->start) && ((fil->flags & TCP_TRK_TRACK_FLG_OPEN) || (fil->end == req->end))) { /* * We already have this request * and it has not been started with sendfile. * This probably means the user was returned * a 4xx of some sort and it's going to age * out; let's not duplicate it. */ return(fil); } } } /* OK, if there is no room at the inn, we are in trouble */ if (tp->t_tcpreq_req >= MAX_TCP_TRK_REQ) { tcp_trace_point(tp, TCP_TP_REQ_LOG_FAIL); for(i = 0; i < MAX_TCP_TRK_REQ; i++) { tcp_req_log_req_info(tp, &tp->t_tcpreq_info[i], i, TCP_TRK_REQ_LOG_ALLOCFAIL, 0, 0); } return (NULL); } for(i = 0, allocated = 0; i < MAX_TCP_TRK_REQ; i++) { fil = &tp->t_tcpreq_info[i]; if (fil->flags == TCP_TRK_TRACK_FLG_EMPTY) { allocated = 1; fil->flags = TCP_TRK_TRACK_FLG_USED; fil->timestamp = req->timestamp; fil->playout_ms = req->playout_ms; fil->localtime = ts; fil->start = req->start; if (req->flags & TCP_LOG_HTTPD_RANGE_END) { fil->end = req->end; } else { fil->end = 0; fil->flags |= TCP_TRK_TRACK_FLG_OPEN; } /* * We can set the min boundaries to the TCP sequence space, * but it might be found to be further up when sendfile * actually runs on this range (if it ever does). */ fil->sbcc_at_s = tptosocket(tp)->so_snd.sb_ccc; fil->start_seq = tp->snd_una + tptosocket(tp)->so_snd.sb_ccc; if (req->flags & TCP_LOG_HTTPD_RANGE_END) fil->end_seq = (fil->start_seq + ((uint32_t)(fil->end - fil->start))); else fil->end_seq = 0; if (tptosocket(tp)->so_snd.sb_tls_info) { /* * This session is doing TLS. Take a swag guess * at the overhead.
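 *
 * As a rough worked example (assuming 16 KB TLS records and about
 * 29 bytes of header plus trailer overhead per record): a 1 MB range
 * spans 64 records, so tcp_estimate_tls_overhead() adds roughly
 * 64 * 29 = 1856 bytes.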
*/ fil->end_seq += tcp_estimate_tls_overhead( tptosocket(tp), (fil->end - fil->start)); } tp->t_tcpreq_req++; if (fil->flags & TCP_TRK_TRACK_FLG_OPEN) tp->t_tcpreq_open++; else tp->t_tcpreq_closed++; tcp_req_log_req_info(tp, fil, i, TCP_TRK_REQ_LOG_NEW, 0, 0); break; } else fil = NULL; } return (fil); } void tcp_req_alloc_req(struct tcpcb *tp, union tcp_log_userdata *user, uint64_t ts) { (void)tcp_req_alloc_req_full(tp, &user->tcp_req, ts, 1); } #endif void tcp_log_socket_option(struct tcpcb *tp, uint32_t option_num, uint32_t option_val, int err) { if (tcp_bblogging_on(tp)) { struct tcp_log_buffer *l; l = tcp_log_event(tp, NULL, &tptosocket(tp)->so_rcv, &tptosocket(tp)->so_snd, TCP_LOG_SOCKET_OPT, err, 0, NULL, 1, NULL, NULL, 0, NULL); if (l) { l->tlb_flex1 = option_num; l->tlb_flex2 = option_val; } } } uint32_t tcp_get_srtt(struct tcpcb *tp, int granularity) { uint32_t srtt; KASSERT(granularity == TCP_TMR_GRANULARITY_USEC || granularity == TCP_TMR_GRANULARITY_TICKS, ("%s: called with unexpected granularity %d", __func__, granularity)); srtt = tp->t_srtt; /* * We only support two granularities. If the stored granularity * does not match the granularity requested by the caller, * convert the stored value to the requested unit of granularity. */ if (tp->t_tmr_granularity != granularity) { if (granularity == TCP_TMR_GRANULARITY_USEC) srtt = TICKS_2_USEC(srtt); else srtt = USEC_2_TICKS(srtt); } /* * If the srtt is stored with ticks granularity, we need to * unshift to get the actual value. We do this after the * conversion above (if one was necessary) in order to maximize * precision. */ if (tp->t_tmr_granularity == TCP_TMR_GRANULARITY_TICKS) srtt = srtt >> TCP_RTT_SHIFT; return (srtt); } void tcp_account_for_send(struct tcpcb *tp, uint32_t len, uint8_t is_rxt, uint8_t is_tlp, bool hw_tls) { if (is_tlp) { tp->t_sndtlppack++; tp->t_sndtlpbyte += len; } /* To get total bytes sent you must add t_snd_rxt_bytes to t_sndbytes */ if (is_rxt) tp->t_snd_rxt_bytes += len; else tp->t_sndbytes += len; #ifdef KERN_TLS if (hw_tls && is_rxt && len != 0) { uint64_t rexmit_percent; rexmit_percent = (1000ULL * tp->t_snd_rxt_bytes) / (10ULL * (tp->t_snd_rxt_bytes + tp->t_sndbytes)); if (rexmit_percent > ktls_ifnet_max_rexmit_pct) ktls_disable_ifnet(tp); } #endif } diff --git a/sys/sys/user.h b/sys/sys/user.h index d70cfe3bd718..e76b2a66ae94 100644 --- a/sys/sys/user.h +++ b/sys/sys/user.h @@ -1,690 +1,690 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. * Copyright (c) 2007 Robert N. M. Watson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _SYS_USER_H_ #define _SYS_USER_H_ #include #ifndef _KERNEL /* stuff that *used* to be included by user.h, or is now needed */ #include #include #include #include #include #include #include #include #include #include /* XXX */ #include /* XXX */ #include /* XXX */ #include /* XXX */ #endif /* !_KERNEL */ #ifndef _SYS_RESOURCEVAR_H_ #include #endif #ifndef _SYS_SIGNALVAR_H_ #include #endif #ifndef _SYS_SOCKET_VAR_H_ #include #endif #include /* * KERN_PROC subtype ops return arrays of selected proc structure entries: * * This struct includes several arrays of spare space, with different arrays * for different standard C-types. When adding new variables to this struct, * the space for byte-aligned data should be taken from the ki_sparestring, * pointers from ki_spareptrs, word-aligned data from ki_spareints, and * doubleword-aligned data from ki_sparelongs. Make sure the space for new * variables come from the array which matches the size and alignment of * those variables on ALL hardware platforms, and then adjust the appropriate * KI_NSPARE_* value(s) to match. * * Always verify that sizeof(struct kinfo_proc) == KINFO_PROC_SIZE on all * platforms after you have added new variables. Note that if you change * the value of KINFO_PROC_SIZE, then many userland programs will stop * working until they are recompiled! * * Once you have added the new field, you will need to add code to initialize * it in two places: function fill_kinfo_proc in sys/kern/kern_proc.c and * function kvm_proclist in lib/libkvm/kvm_proc.c . */ #define KI_NSPARE_INT 2 #define KI_NSPARE_LONG 12 #define KI_NSPARE_PTR 5 #ifndef _KERNEL #ifndef KINFO_PROC_SIZE #error "Unknown architecture" #endif #endif /* !_KERNEL */ #define WMESGLEN 8 /* size of returned wchan message */ #define LOCKNAMELEN 8 /* size of returned lock name */ #define TDNAMLEN 16 /* size of returned thread name */ #define COMMLEN 19 /* size of returned ki_comm name */ #define KI_EMULNAMELEN 16 /* size of returned ki_emul */ #define KI_NGROUPS 16 /* number of groups in ki_groups */ #define LOGNAMELEN 17 /* size of returned ki_login */ #define LOGINCLASSLEN 17 /* size of returned ki_loginclass */ #ifndef BURN_BRIDGES #define OCOMMLEN TDNAMLEN #define ki_ocomm ki_tdname #endif /* Flags for the process credential. */ #define KI_CRF_CAPABILITY_MODE 0x00000001 /* * Steal a bit from ki_cr_flags to indicate that the cred had more than * KI_NGROUPS groups. 
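 *
 * As a hedged illustration of the KERN_PROC interface described above,
 * a minimal user-space fetch of a single kinfo_proc record:
 */
#if 0	/* example; not part of this header */
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/user.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct kinfo_proc kp;
	size_t len = sizeof(kp);
	int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PID, (int)getpid() };

	if (sysctl(mib, 4, &kp, &len, NULL, 0) == -1)
		return (1);
	/* ki_structsize lets consumers detect ABI drift. */
	printf("pid %d comm %s structsize %d\n", (int)kp.ki_pid,
	    kp.ki_comm, kp.ki_structsize);
	return (0);
}
#endif
/* Overflow indicator: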
*/ #define KI_CRF_GRP_OVERFLOW 0x80000000 struct kinfo_proc { int ki_structsize; /* size of this structure */ int ki_layout; /* reserved: layout identifier */ struct pargs *ki_args; /* address of command arguments */ struct proc *ki_paddr; /* address of proc */ struct user *ki_addr; /* kernel virtual addr of u-area */ struct vnode *ki_tracep; /* pointer to trace file */ struct vnode *ki_textvp; /* pointer to executable file */ struct filedesc *ki_fd; /* pointer to open file info */ struct vmspace *ki_vmspace; /* pointer to kernel vmspace struct */ const void *ki_wchan; /* sleep address */ pid_t ki_pid; /* Process identifier */ pid_t ki_ppid; /* parent process id */ pid_t ki_pgid; /* process group id */ pid_t ki_tpgid; /* tty process group id */ pid_t ki_sid; /* Process session ID */ pid_t ki_tsid; /* Terminal session ID */ short ki_jobc; /* job control counter */ short ki_spare_short1; /* unused (just here for alignment) */ uint32_t ki_tdev_freebsd11; /* controlling tty dev */ sigset_t ki_siglist; /* Signals arrived but not delivered */ sigset_t ki_sigmask; /* Current signal mask */ sigset_t ki_sigignore; /* Signals being ignored */ sigset_t ki_sigcatch; /* Signals being caught by user */ uid_t ki_uid; /* effective user id */ uid_t ki_ruid; /* Real user id */ uid_t ki_svuid; /* Saved effective user id */ gid_t ki_rgid; /* Real group id */ gid_t ki_svgid; /* Saved effective group id */ short ki_ngroups; /* number of groups */ short ki_spare_short2; /* unused (just here for alignment) */ gid_t ki_groups[KI_NGROUPS]; /* groups */ vm_size_t ki_size; /* virtual size */ segsz_t ki_rssize; /* current resident set size in pages */ segsz_t ki_swrss; /* resident set size before last swap */ segsz_t ki_tsize; /* text size (pages) XXX */ segsz_t ki_dsize; /* data size (pages) XXX */ segsz_t ki_ssize; /* stack size (pages) */ u_short ki_xstat; /* Exit status for wait & stop signal */ u_short ki_acflag; /* Accounting flags */ fixpt_t ki_pctcpu; /* %cpu for process during ki_swtime */ u_int ki_estcpu; /* Time averaged value of ki_cpticks */ u_int ki_slptime; /* Time since last blocked */ u_int ki_swtime; /* Time swapped in or out */ u_int ki_cow; /* number of copy-on-write faults */ u_int64_t ki_runtime; /* Real time in microsec */ struct timeval ki_start; /* starting time */ struct timeval ki_childtime; /* time used by process children */ long ki_flag; /* P_* flags */ long ki_kiflag; /* KI_* flags (below) */ int ki_traceflag; /* Kernel trace points */ char ki_stat; /* S* process status */ signed char ki_nice; /* Process "nice" value */ char ki_lock; /* Process lock (prevent swap) count */ char ki_rqindex; /* Run queue index */ u_char ki_oncpu_old; /* Which cpu we are on (legacy) */ u_char ki_lastcpu_old; /* Last cpu we were on (legacy) */ char ki_tdname[TDNAMLEN+1]; /* thread name */ char ki_wmesg[WMESGLEN+1]; /* wchan message */ char ki_login[LOGNAMELEN+1]; /* setlogin name */ char ki_lockname[LOCKNAMELEN+1]; /* lock name */ char ki_comm[COMMLEN+1]; /* command name */ char ki_emul[KI_EMULNAMELEN+1]; /* emulation name */ char ki_loginclass[LOGINCLASSLEN+1]; /* login class */ char ki_moretdname[MAXCOMLEN-TDNAMLEN+1]; /* more thread name */ /* * When adding new variables, take space for char-strings from the * front of ki_sparestrings, and ints from the end of ki_spareints. * That way the spare room from both arrays will remain contiguous. 
*/ char ki_sparestrings[46]; /* spare string space */ int ki_spareints[KI_NSPARE_INT]; /* spare room for growth */ uint64_t ki_tdev; /* controlling tty dev */ int ki_oncpu; /* Which cpu we are on */ int ki_lastcpu; /* Last cpu we were on */ int ki_tracer; /* Pid of tracing process */ int ki_flag2; /* P2_* flags */ int ki_fibnum; /* Default FIB number */ u_int ki_cr_flags; /* Credential flags */ int ki_jid; /* Process jail ID */ int ki_numthreads; /* XXXKSE number of threads in total */ lwpid_t ki_tid; /* XXXKSE thread id */ struct priority ki_pri; /* process priority */ struct rusage ki_rusage; /* process rusage statistics */ /* XXX - most fields in ki_rusage_ch are not (yet) filled in */ struct rusage ki_rusage_ch; /* rusage of children processes */ struct pcb *ki_pcb; /* kernel virtual addr of pcb */ void *ki_kstack; /* kernel virtual addr of stack */ void *ki_udata; /* User convenience pointer */ struct thread *ki_tdaddr; /* address of thread */ /* * When adding new variables, take space for pointers from the * front of ki_spareptrs, and longs from the end of ki_sparelongs. * That way the spare room from both arrays will remain contiguous. */ struct pwddesc *ki_pd; /* pointer to process paths info */ void *ki_spareptrs[KI_NSPARE_PTR]; /* spare room for growth */ long ki_sparelongs[KI_NSPARE_LONG]; /* spare room for growth */ long ki_sflag; /* PS_* flags */ long ki_tdflags; /* XXXKSE kthread flag */ }; void fill_kinfo_proc(struct proc *, struct kinfo_proc *); /* XXX - the following two defines are temporary */ #define ki_childstime ki_rusage_ch.ru_stime #define ki_childutime ki_rusage_ch.ru_utime /* * Legacy PS_ flag. This moved to p_flag but is maintained for * compatibility. */ #define PS_INMEM 0x00001 /* Loaded into memory. */ /* ki_sessflag values */ #define KI_CTTY 0x00000001 /* controlling tty vnode active */ #define KI_SLEADER 0x00000002 /* session leader */ #define KI_LOCKBLOCK 0x00000004 /* proc blocked on lock ki_lockname */ /* * This used to be the per-process structure containing data that * isn't needed in core when the process is swapped out, but now it * remains only for the benefit of a.out core dumps. */ struct user { struct pstats u_stats; /* *p_stats */ struct kinfo_proc u_kproc; /* eproc */ }; /* * The KERN_PROC_FILE sysctl allows a process to dump the file descriptor * array of another process. 
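 *
 * The usual consumer is libutil's kinfo_getfile(3); a minimal, hedged
 * sketch of walking the returned array:
 */
#if 0	/* example; not part of this header */
#include <sys/param.h>
#include <sys/user.h>
#include <libutil.h>
#include <stdio.h>
#include <stdlib.h>

static void
dump_files(pid_t pid)
{
	struct kinfo_file *kf;
	int cnt, i;

	if ((kf = kinfo_getfile(pid, &cnt)) == NULL)
		return;
	for (i = 0; i < cnt; i++)
		printf("fd %d type %d %s\n", kf[i].kf_fd, kf[i].kf_type,
		    kf[i].kf_path);
	free(kf);	/* one allocation, freed by the caller */
}
#endif
/* Per-descriptor constants: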
*/ #define KF_ATTR_VALID 0x0001 #define KF_TYPE_NONE 0 #define KF_TYPE_VNODE 1 #define KF_TYPE_SOCKET 2 #define KF_TYPE_PIPE 3 #define KF_TYPE_FIFO 4 #define KF_TYPE_KQUEUE 5 /* was KF_TYPE_CRYPTO 6 */ #define KF_TYPE_MQUEUE 7 #define KF_TYPE_SHM 8 #define KF_TYPE_SEM 9 #define KF_TYPE_PTS 10 #define KF_TYPE_PROCDESC 11 #define KF_TYPE_DEV 12 #define KF_TYPE_EVENTFD 13 #define KF_TYPE_TIMERFD 14 #define KF_TYPE_UNKNOWN 255 #define KF_VTYPE_VNON 0 #define KF_VTYPE_VREG 1 #define KF_VTYPE_VDIR 2 #define KF_VTYPE_VBLK 3 #define KF_VTYPE_VCHR 4 #define KF_VTYPE_VLNK 5 #define KF_VTYPE_VSOCK 6 #define KF_VTYPE_VFIFO 7 #define KF_VTYPE_VBAD 8 #define KF_VTYPE_UNKNOWN 255 #define KF_FD_TYPE_CWD -1 /* Current working directory */ #define KF_FD_TYPE_ROOT -2 /* Root directory */ #define KF_FD_TYPE_JAIL -3 /* Jail directory */ #define KF_FD_TYPE_TRACE -4 /* Ktrace vnode */ #define KF_FD_TYPE_TEXT -5 /* Text vnode */ #define KF_FD_TYPE_CTTY -6 /* Controlling terminal */ #define KF_FLAG_READ 0x00000001 #define KF_FLAG_WRITE 0x00000002 #define KF_FLAG_APPEND 0x00000004 #define KF_FLAG_ASYNC 0x00000008 #define KF_FLAG_FSYNC 0x00000010 #define KF_FLAG_NONBLOCK 0x00000020 #define KF_FLAG_DIRECT 0x00000040 #define KF_FLAG_HASLOCK 0x00000080 #define KF_FLAG_SHLOCK 0x00000100 #define KF_FLAG_EXLOCK 0x00000200 #define KF_FLAG_NOFOLLOW 0x00000400 #define KF_FLAG_CREAT 0x00000800 #define KF_FLAG_TRUNC 0x00001000 #define KF_FLAG_EXCL 0x00002000 #define KF_FLAG_EXEC 0x00004000 /* * Old format. Has variable hidden padding due to alignment. * This is a compatibility hack for pre-build 7.1 packages. */ #if defined(__amd64__) #define KINFO_OFILE_SIZE 1328 #endif #if defined(__i386__) #define KINFO_OFILE_SIZE 1324 #endif struct kinfo_ofile { int kf_structsize; /* Size of kinfo_file. */ int kf_type; /* Descriptor type. */ int kf_fd; /* Array index. */ int kf_ref_count; /* Reference count. */ int kf_flags; /* Flags. */ /* XXX Hidden alignment padding here on amd64 */ off_t kf_offset; /* Seek location. */ int kf_vnode_type; /* Vnode type. */ int kf_sock_domain; /* Socket domain. */ int kf_sock_type; /* Socket type. */ int kf_sock_protocol; /* Socket protocol. */ char kf_path[PATH_MAX]; /* Path to file, if any. */ struct sockaddr_storage kf_sa_local; /* Socket address. */ struct sockaddr_storage kf_sa_peer; /* Peer address. */ }; #if defined(__amd64__) || defined(__i386__) /* * This size should never be changed. If you really need to, you must provide * backward ABI compatibility by allocating a new sysctl MIB that will return * the new structure. The current structure has to be returned by the current * sysctl MIB. See how it is done for the kinfo_ofile structure. */ #define KINFO_FILE_SIZE 1392 #endif struct kinfo_file { int kf_structsize; /* Variable size of record. */ int kf_type; /* Descriptor type. */ int kf_fd; /* Array index. */ int kf_ref_count; /* Reference count. */ int kf_flags; /* Flags. */ int kf_pad0; /* Round to 64 bit alignment. */ int64_t kf_offset; /* Seek location. */ union { struct { /* API compatibility with FreeBSD < 12. */ int kf_vnode_type; int kf_sock_domain; int kf_sock_type; int kf_sock_protocol; struct sockaddr_storage kf_sa_local; struct sockaddr_storage kf_sa_peer; }; union { struct { /* Sendq size */ uint32_t kf_sock_sendq; /* Socket domain. */ int kf_sock_domain0; /* Socket type. */ int kf_sock_type0; /* Socket protocol. */ int kf_sock_protocol0; /* Socket address. */ struct sockaddr_storage kf_sa_local; /* Peer address. */ struct sockaddr_storage kf_sa_peer; /* Address of so_pcb. 
*/ uint64_t kf_sock_pcb; - /* Address of inp_ppcb. */ + /* Obsolete! May be reused as a spare. */ uint64_t kf_sock_inpcb; /* Address of unp_conn. */ uint64_t kf_sock_unpconn; /* Send buffer state. */ uint16_t kf_sock_snd_sb_state; /* Receive buffer state. */ uint16_t kf_sock_rcv_sb_state; /* Recvq size. */ uint32_t kf_sock_recvq; } kf_sock; struct { /* Vnode type. */ int kf_file_type; /* Space for future use */ int kf_spareint[3]; uint64_t kf_spareint64[29]; /* Number of references to file. */ uint64_t kf_file_nlink; /* Vnode filesystem id. */ uint64_t kf_file_fsid; /* File device. */ uint64_t kf_file_rdev; /* Global file id. */ uint64_t kf_file_fileid; /* File size. */ uint64_t kf_file_size; /* Vnode filesystem id, FreeBSD 11 compat. */ uint32_t kf_file_fsid_freebsd11; /* File device, FreeBSD 11 compat. */ uint32_t kf_file_rdev_freebsd11; /* File mode. */ uint16_t kf_file_mode; /* Round to 64 bit alignment. */ uint16_t kf_file_pad0; uint32_t kf_file_pad1; } kf_file; struct { uint32_t kf_spareint[4]; uint64_t kf_spareint64[32]; uint32_t kf_sem_value; uint16_t kf_sem_mode; } kf_sem; struct { uint32_t kf_spareint[4]; uint64_t kf_spareint64[32]; uint64_t kf_pipe_addr; uint64_t kf_pipe_peer; uint32_t kf_pipe_buffer_cnt; uint32_t kf_pipe_buffer_in; uint32_t kf_pipe_buffer_out; uint32_t kf_pipe_buffer_size; } kf_pipe; struct { uint32_t kf_spareint[4]; uint64_t kf_spareint64[32]; uint32_t kf_pts_dev_freebsd11; uint32_t kf_pts_pad0; uint64_t kf_pts_dev; /* Round to 64 bit alignment. */ uint32_t kf_pts_pad1[4]; } kf_pts; struct { uint32_t kf_spareint[4]; uint64_t kf_spareint64[32]; pid_t kf_pid; } kf_proc; struct { uint64_t kf_eventfd_value; uint32_t kf_eventfd_flags; uint32_t kf_eventfd_spareint[3]; uint64_t kf_eventfd_addr; } kf_eventfd; struct { uint32_t kf_timerfd_clockid; uint32_t kf_timerfd_flags; uint64_t kf_timerfd_addr; } kf_timerfd; struct { uint64_t kf_kqueue_addr; int32_t kf_kqueue_count; int32_t kf_kqueue_state; } kf_kqueue; } kf_un; }; uint16_t kf_status; /* Status flags. */ uint16_t kf_pad1; /* Round to 32 bit alignment. */ int _kf_ispare0; /* Space for more stuff. */ cap_rights_t kf_cap_rights; /* Capability rights. */ uint64_t _kf_cap_spare; /* Space for future cap_rights_t. */ /* Truncated before copyout in sysctl */ char kf_path[PATH_MAX]; /* Path to file, if any. */ }; struct kinfo_lockf { int kl_structsize; /* Variable size of record. */ int kl_rw; int kl_type; int kl_pid; int kl_sysid; int kl_pad0; uint64_t kl_file_fsid; uint64_t kl_file_rdev; uint64_t kl_file_fileid; off_t kl_start; off_t kl_len; /* len == 0 till the EOF */ char kl_path[PATH_MAX]; }; #define KLOCKF_RW_READ 0x01 #define KLOCKF_RW_WRITE 0x02 #define KLOCKF_TYPE_FLOCK 0x01 #define KLOCKF_TYPE_PID 0x02 #define KLOCKF_TYPE_REMOTE 0x03 /* * The KERN_PROC_VMMAP sysctl allows a process to dump the VM layout of * another process as a series of entries. 
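 *
 * The usual consumer is libutil's kinfo_getvmmap(3); a minimal, hedged
 * sketch:
 */
#if 0	/* example; not part of this header */
#include <sys/param.h>
#include <sys/user.h>
#include <libutil.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static void
dump_vmmap(pid_t pid)
{
	struct kinfo_vmentry *kv;
	int cnt, i;

	if ((kv = kinfo_getvmmap(pid, &cnt)) == NULL)
		return;
	for (i = 0; i < cnt; i++)
		printf("%#jx-%#jx %s\n", (uintmax_t)kv[i].kve_start,
		    (uintmax_t)kv[i].kve_end, kv[i].kve_path);
	free(kv);
}
#endif
/* Entry types: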
*/ #define KVME_TYPE_NONE 0 #define KVME_TYPE_DEFAULT 1 /* no longer returned */ #define KVME_TYPE_VNODE 2 #define KVME_TYPE_SWAP 3 #define KVME_TYPE_DEVICE 4 #define KVME_TYPE_PHYS 5 #define KVME_TYPE_DEAD 6 #define KVME_TYPE_SG 7 #define KVME_TYPE_MGTDEVICE 8 #define KVME_TYPE_GUARD 9 #define KVME_TYPE_UNKNOWN 255 #define KVME_PROT_READ 0x00000001 #define KVME_PROT_WRITE 0x00000002 #define KVME_PROT_EXEC 0x00000004 #define KVME_FLAG_COW 0x00000001 #define KVME_FLAG_NEEDS_COPY 0x00000002 #define KVME_FLAG_NOCOREDUMP 0x00000004 #define KVME_FLAG_SUPER 0x00000008 #define KVME_FLAG_GROWS_UP 0x00000010 #define KVME_FLAG_GROWS_DOWN 0x00000020 #define KVME_FLAG_USER_WIRED 0x00000040 #if defined(__amd64__) #define KINFO_OVMENTRY_SIZE 1168 #endif #if defined(__i386__) #define KINFO_OVMENTRY_SIZE 1128 #endif struct kinfo_ovmentry { int kve_structsize; /* Size of kinfo_vmmapentry. */ int kve_type; /* Type of map entry. */ void *kve_start; /* Starting address. */ void *kve_end; /* Finishing address. */ int kve_flags; /* Flags on map entry. */ int kve_resident; /* Number of resident pages. */ int kve_private_resident; /* Number of private pages. */ int kve_protection; /* Protection bitmask. */ int kve_ref_count; /* VM obj ref count. */ int kve_shadow_count; /* VM obj shadow count. */ char kve_path[PATH_MAX]; /* Path to VM obj, if any. */ void *_kve_pspare[8]; /* Space for more stuff. */ off_t kve_offset; /* Mapping offset in object */ uint64_t kve_fileid; /* inode number if vnode */ uint32_t kve_fsid; /* dev_t of vnode location */ int _kve_ispare[3]; /* Space for more stuff. */ }; #if defined(__amd64__) || defined(__i386__) #define KINFO_VMENTRY_SIZE 1160 #endif struct kinfo_vmentry { int kve_structsize; /* Variable size of record. */ int kve_type; /* Type of map entry. */ uint64_t kve_start; /* Starting address. */ uint64_t kve_end; /* Finishing address. */ uint64_t kve_offset; /* Mapping offset in object */ uint64_t kve_vn_fileid; /* inode number if vnode */ uint32_t kve_vn_fsid_freebsd11; /* dev_t of vnode location */ int kve_flags; /* Flags on map entry. */ int kve_resident; /* Number of resident pages. */ int kve_private_resident; /* Number of private pages. */ int kve_protection; /* Protection bitmask. */ int kve_ref_count; /* VM obj ref count. */ int kve_shadow_count; /* VM obj shadow count. */ int kve_vn_type; /* Vnode type. */ uint64_t kve_vn_size; /* File size. */ uint32_t kve_vn_rdev_freebsd11; /* Device id if device. */ uint16_t kve_vn_mode; /* File mode. */ uint16_t kve_status; /* Status flags. */ union { uint64_t _kve_vn_fsid; /* dev_t of vnode location */ uint64_t _kve_obj; /* handle of anon obj */ } kve_type_spec; uint64_t kve_vn_rdev; /* Device id if device. */ int _kve_ispare[8]; /* Space for more stuff. */ /* Truncated before copyout in sysctl */ char kve_path[PATH_MAX]; /* Path to VM obj, if any. */ }; #define kve_vn_fsid kve_type_spec._kve_vn_fsid #define kve_obj kve_type_spec._kve_obj /* * The "vm.objects" sysctl provides a list of all VM objects in the system * via an array of these entries. */ struct kinfo_vmobject { int kvo_structsize; /* Variable size of record. */ int kvo_type; /* Object type: KVME_TYPE_*. */ uint64_t kvo_size; /* Object size in pages. */ uint64_t kvo_vn_fileid; /* inode number if vnode. */ uint32_t kvo_vn_fsid_freebsd11; /* dev_t of vnode location. */ int kvo_ref_count; /* Reference count. */ int kvo_shadow_count; /* Shadow count. */ int kvo_memattr; /* Memory attribute. */ uint64_t kvo_resident; /* Number of resident pages. 
*/ uint64_t kvo_active; /* Number of active pages. */ uint64_t kvo_inactive; /* Number of inactive pages. */ union { uint64_t _kvo_vn_fsid; uint64_t _kvo_backing_obj; /* Handle for the backing obj */ } kvo_type_spec; /* Type-specific union */ uint64_t kvo_me; /* Uniq handle for anon obj */ uint64_t _kvo_qspare[6]; uint32_t kvo_swapped; /* Number of swapped pages */ uint32_t _kvo_ispare[7]; char kvo_path[PATH_MAX]; /* Pathname, if any. */ }; #define kvo_vn_fsid kvo_type_spec._kvo_vn_fsid #define kvo_backing_obj kvo_type_spec._kvo_backing_obj /* * The KERN_PROC_KSTACK sysctl allows a process to dump the kernel stacks of * another process as a series of entries. Each stack is represented by a * series of symbol names and offsets as generated by stack_sbuf_print(9). */ #define KKST_MAXLEN 1024 #define KKST_STATE_STACKOK 0 /* Stack is valid. */ #define KKST_STATE_SWAPPED 1 /* Stack swapped out. */ #define KKST_STATE_RUNNING 2 /* Stack ephemeral. */ #if defined(__amd64__) || defined(__i386__) #define KINFO_KSTACK_SIZE 1096 #endif struct kinfo_kstack { lwpid_t kkst_tid; /* ID of thread. */ int kkst_state; /* Validity of stack. */ char kkst_trace[KKST_MAXLEN]; /* String representing stack. */ int _kkst_ispare[16]; /* Space for more stuff. */ }; struct kinfo_sigtramp { void *ksigtramp_start; void *ksigtramp_end; void *ksigtramp_spare[4]; }; #define KMAP_FLAG_WIREFUTURE 0x01 /* all future mappings will be wired */ #define KMAP_FLAG_ASLR 0x02 /* ASLR is applied to mappings */ #define KMAP_FLAG_ASLR_IGNSTART 0x04 /* ASLR may map into sbrk grow region */ #define KMAP_FLAG_WXORX 0x08 /* W^X mapping policy is enforced */ #define KMAP_FLAG_ASLR_STACK 0x10 /* the stack location is randomized */ #define KMAP_FLAG_ASLR_SHARED_PAGE 0x20 /* the shared page location is randomized */ struct kinfo_vm_layout { uintptr_t kvm_min_user_addr; uintptr_t kvm_max_user_addr; uintptr_t kvm_text_addr; size_t kvm_text_size; uintptr_t kvm_data_addr; size_t kvm_data_size; uintptr_t kvm_stack_addr; size_t kvm_stack_size; int kvm_map_flags; uintptr_t kvm_shp_addr; size_t kvm_shp_size; uintptr_t kvm_spare[12]; }; #ifdef _KERNEL /* Flags for kern_proc_out function. */ #define KERN_PROC_NOTHREADS 0x1 #define KERN_PROC_MASK32 0x2 /* Flags for kern_proc_filedesc_out. */ #define KERN_FILEDESC_PACK_KINFO 0x00000001U /* Flags for kern_proc_vmmap_out. */ #define KERN_VMMAP_PACK_KINFO 0x00000001U struct sbuf; /* * The kern_proc out functions are helper functions to dump miscellaneous * process kinfo structures to an sbuf. The main consumers are the KERN_PROC * sysctls, but they may also be used by other kernel subsystems. * * The functions manipulate the process locking state and expect the process * to be locked on entry. On return the process is unlocked. */ int kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen, int flags); int kern_proc_cwd_out(struct proc *p, struct sbuf *sb, ssize_t maxlen); int kern_proc_out(struct proc *p, struct sbuf *sb, int flags); int kern_proc_vmmap_out(struct proc *p, struct sbuf *sb, ssize_t maxlen, int flags); int vntype_to_kinfo(int vtype); void pack_kinfo(struct kinfo_file *kif); #endif /* _KERNEL */ #endif diff --git a/usr.bin/fstat/fstat.c b/usr.bin/fstat/fstat.c index 328e35e9138f..e5d0755062d0 100644 --- a/usr.bin/fstat/fstat.c +++ b/usr.bin/fstat/fstat.c @@ -1,648 +1,643 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2009 Stanislav Sedov * Copyright (c) 1988, 1993 * The Regents of the University of California. All rights reserved.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "functions.h" static int fsflg, /* show files on same filesystem as file(s) argument */ pflg, /* show files open by a particular pid */ sflg, /* show socket details */ uflg; /* show files open by a particular (effective) user */ static int checkfile; /* restrict to particular files or filesystems */ static int nflg; /* (numerical) display f.s. 
and rdev as dev_t */ static int mflg; /* include memory-mapped files */ static int vflg; /* be verbose */ typedef struct devs { struct devs *next; uint64_t fsid; uint64_t ino; const char *name; } DEVS; static DEVS *devs; static char *memf, *nlistf; static int getfname(const char *filename); static void dofiles(struct procstat *procstat, struct kinfo_proc *p); static void print_access_flags(int flags); static void print_file_info(struct procstat *procstat, struct filestat *fst, const char *uname, const char *cmd, int pid); static void print_pipe_info(struct procstat *procstat, struct filestat *fst); static void print_pts_info(struct procstat *procstat, struct filestat *fst); static void print_sem_info(struct procstat *procstat, struct filestat *fst); static void print_shm_info(struct procstat *procstat, struct filestat *fst); static void print_socket_info(struct procstat *procstat, struct filestat *fst); static void print_vnode_info(struct procstat *procstat, struct filestat *fst); static void usage(void) __dead2; int do_fstat(int argc, char **argv) { struct kinfo_proc *p; struct passwd *passwd; struct procstat *procstat; int arg, ch, what; int cnt, i; arg = 0; what = KERN_PROC_PROC; nlistf = memf = NULL; while ((ch = getopt(argc, argv, "fmnp:su:vN:M:")) != -1) switch((char)ch) { case 'f': fsflg = 1; break; case 'M': memf = optarg; break; case 'N': nlistf = optarg; break; case 'm': mflg = 1; break; case 'n': nflg = 1; break; case 'p': if (pflg++) usage(); if (!isdigit(*optarg)) { warnx("-p requires a process id"); usage(); } what = KERN_PROC_PID; arg = atoi(optarg); break; case 's': sflg = 1; break; case 'u': if (uflg++) usage(); if (!(passwd = getpwnam(optarg))) errx(1, "%s: unknown uid", optarg); what = KERN_PROC_UID; arg = passwd->pw_uid; break; case 'v': vflg = 1; break; case '?': default: usage(); } if (*(argv += optind)) { for (; *argv; ++argv) { if (getfname(*argv)) checkfile = 1; } if (!checkfile) /* file(s) specified, but none accessible */ exit(1); } if (fsflg && !checkfile) { /* -f with no files means use wd */ if (getfname(".") == 0) exit(1); checkfile = 1; } if (memf != NULL) procstat = procstat_open_kvm(nlistf, memf); else procstat = procstat_open_sysctl(); if (procstat == NULL) errx(1, "procstat_open()"); p = procstat_getprocs(procstat, what, arg, &cnt); if (p == NULL) errx(1, "procstat_getprocs()"); /* * Print header. */ if (nflg) printf("%s", "USER CMD PID FD DEV INUM MODE SZ|DV R/W"); else printf("%s", "USER CMD PID FD MOUNT INUM MODE SZ|DV R/W"); if (checkfile && fsflg == 0) printf(" NAME\n"); else putchar('\n'); /* * Go through the process list. 
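getfname() (further below) records each command-line name as a (st_dev, st_ino) pair on the devs list, and print_file_info() then matches every open vnode against that list by filesystem id and file id. The comparison boils down to something like this hypothetical helper (the name and shape are illustrative, not part of fstat; vnstat fields as declared in libprocstat.h):

	#include <sys/param.h>
	#include <sys/queue.h>
	#include <sys/socket.h>
	#include <sys/stat.h>
	#include <stdint.h>
	#include <libprocstat.h>

	/*
	 * A named file and an open vnode refer to the same object when
	 * both the filesystem id and the file id agree.
	 */
	static int
	matches_named_file(const struct vnstat *vn, const struct stat *sb)
	{
		return (vn->vn_fsid == (uint64_t)sb->st_dev &&
		    vn->vn_fileid == (uint64_t)sb->st_ino);
	}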
*/ for (i = 0; i < cnt; i++) { if (p[i].ki_stat == SZOMB) continue; dofiles(procstat, &p[i]); } procstat_freeprocs(procstat, p); procstat_close(procstat); return (0); } static void dofiles(struct procstat *procstat, struct kinfo_proc *kp) { const char *cmd; const char *uname; struct filestat *fst; struct filestat_list *head; int pid; uname = user_from_uid(kp->ki_uid, 0); pid = kp->ki_pid; cmd = kp->ki_comm; head = procstat_getfiles(procstat, kp, mflg); if (head == NULL) return; STAILQ_FOREACH(fst, head, next) print_file_info(procstat, fst, uname, cmd, pid); procstat_freefiles(procstat, head); } static void print_file_info(struct procstat *procstat, struct filestat *fst, const char *uname, const char *cmd, int pid) { struct vnstat vn; DEVS *d; const char *filename; int error, fsmatch = 0; char errbuf[_POSIX2_LINE_MAX]; filename = NULL; if (checkfile != 0) { if (fst->fs_type != PS_FST_TYPE_VNODE && fst->fs_type != PS_FST_TYPE_FIFO) return; error = procstat_get_vnode_info(procstat, fst, &vn, errbuf); if (error != 0) return; for (d = devs; d != NULL; d = d->next) if (d->fsid == vn.vn_fsid) { fsmatch = 1; if (d->ino == vn.vn_fileid) { filename = d->name; break; } } if (fsmatch == 0 || (filename == NULL && fsflg == 0)) return; } /* * Print entry prefix. */ printf("%-8.8s %-10s %5d", uname, cmd, pid); if (fst->fs_uflags & PS_FST_UFLAG_TEXT) printf(" text"); else if (fst->fs_uflags & PS_FST_UFLAG_CDIR) printf(" wd"); else if (fst->fs_uflags & PS_FST_UFLAG_RDIR) printf(" root"); else if (fst->fs_uflags & PS_FST_UFLAG_TRACE) printf(" tr"); else if (fst->fs_uflags & PS_FST_UFLAG_MMAP) printf(" mmap"); else if (fst->fs_uflags & PS_FST_UFLAG_JAIL) printf(" jail"); else if (fst->fs_uflags & PS_FST_UFLAG_CTTY) printf(" ctty"); else printf(" %4d", fst->fs_fd); /* * Print type-specific data. 
*/ switch (fst->fs_type) { case PS_FST_TYPE_FIFO: case PS_FST_TYPE_VNODE: print_vnode_info(procstat, fst); break; case PS_FST_TYPE_SOCKET: print_socket_info(procstat, fst); break; case PS_FST_TYPE_PIPE: print_pipe_info(procstat, fst); break; case PS_FST_TYPE_PTS: print_pts_info(procstat, fst); break; case PS_FST_TYPE_SHM: print_shm_info(procstat, fst); break; case PS_FST_TYPE_SEM: print_sem_info(procstat, fst); break; case PS_FST_TYPE_DEV: break; default: if (vflg) fprintf(stderr, "unknown file type %d for file %d of pid %d\n", fst->fs_type, fst->fs_fd, pid); } if (filename && !fsflg) printf(" %s", filename); putchar('\n'); } static char * addr_to_string(struct sockaddr_storage *ss, char *buffer, int buflen) { char buffer2[INET6_ADDRSTRLEN]; struct sockaddr_in6 *sin6; struct sockaddr_in *sin; struct sockaddr_un *sun; switch (ss->ss_family) { case AF_LOCAL: sun = (struct sockaddr_un *)ss; if (strlen(sun->sun_path) == 0) strlcpy(buffer, "-", buflen); else strlcpy(buffer, sun->sun_path, buflen); break; case AF_INET: sin = (struct sockaddr_in *)ss; if (sin->sin_addr.s_addr == INADDR_ANY) snprintf(buffer, buflen, "%s:%d", "*", ntohs(sin->sin_port)); else if (inet_ntop(AF_INET, &sin->sin_addr, buffer2, sizeof(buffer2)) != NULL) snprintf(buffer, buflen, "%s:%d", buffer2, ntohs(sin->sin_port)); break; case AF_INET6: sin6 = (struct sockaddr_in6 *)ss; if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) snprintf(buffer, buflen, "%s.%d", "*", ntohs(sin6->sin6_port)); else if (inet_ntop(AF_INET6, &sin6->sin6_addr, buffer2, sizeof(buffer2)) != NULL) snprintf(buffer, buflen, "%s.%d", buffer2, ntohs(sin6->sin6_port)); else strlcpy(buffer, "-", buflen); break; default: strlcpy(buffer, "", buflen); break; } return buffer; } static void print_socket_info(struct procstat *procstat, struct filestat *fst) { static const char *stypename[] = { "unused", /* 0 */ "stream", /* 1 */ "dgram", /* 2 */ "raw", /* 3 */ "rdm", /* 4 */ "seqpak" /* 5 */ }; #define STYPEMAX 5 struct sockstat sock; struct protoent *pe; char errbuf[_POSIX2_LINE_MAX]; char src_addr[PATH_MAX], dst_addr[PATH_MAX]; struct sockaddr_un *sun; int error; static int isopen; error = procstat_get_socket_info(procstat, fst, &sock, errbuf); if (error != 0) { printf("* error"); return; } if (sock.type > STYPEMAX) printf("* %s ?%d", sock.dname, sock.type); else printf("* %s %s", sock.dname, stypename[sock.type]); /* * protocol specific formatting * - * Try to find interesting things to print. For tcp, the interesting - * thing is the address of the tcpcb, for udp and others, just the - * inpcb (socket pcb). For unix domain, its the address of the socket - * pcb and the address of the connected pcb (if connected). Otherwise - * just print the protocol number and address of the socket itself. + * Try to find interesting things to print. For internet and unix + * sockets, it's the address of the socket pcb. For unix it is also the + * address of the connected pcb (if connected). Otherwise just print + * the protocol number and address of the socket itself. * The idea is not to duplicate netstat, but to make available enough * information for further analysis.
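With the tcpcb special case gone, the address printed for an internet socket is simply sock.so_pcb, which libprocstat reports for any socket. A condensed sketch of the call sequence that gets there (error paths and the non-socket cases trimmed; this is not fstat itself):

	#include <sys/param.h>
	#include <sys/queue.h>
	#include <sys/socket.h>
	#include <sys/sysctl.h>
	#include <sys/user.h>
	#include <libprocstat.h>
	#include <limits.h>
	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		struct procstat *ps;
		struct kinfo_proc *procs;
		struct filestat_list *head;
		struct filestat *fst;
		struct sockstat sock;
		char errbuf[_POSIX2_LINE_MAX];
		unsigned int cnt, i;

		if ((ps = procstat_open_sysctl()) == NULL)
			return (1);
		if ((procs = procstat_getprocs(ps, KERN_PROC_PROC, 0,
		    &cnt)) == NULL)
			return (1);
		for (i = 0; i < cnt; i++) {
			if ((head = procstat_getfiles(ps, &procs[i], 0)) == NULL)
				continue;
			STAILQ_FOREACH(fst, head, next) {
				if (fst->fs_type != PS_FST_TYPE_SOCKET)
					continue;
				/* One pcb address per socket, TCP or not. */
				if (procstat_get_socket_info(ps, fst, &sock,
				    errbuf) == 0 && sock.so_pcb != 0)
					printf("%d fd %d: pcb %#jx\n",
					    (int)procs[i].ki_pid, fst->fs_fd,
					    (uintmax_t)sock.so_pcb);
			}
			procstat_freefiles(ps, head);
		}
		procstat_freeprocs(ps, procs);
		procstat_close(ps);
		return (0);
	}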
*/ switch (sock.dom_family) { case AF_INET: case AF_INET6: if (!isopen) setprotoent(++isopen); if ((pe = getprotobynumber(sock.proto)) != NULL) printf(" %s", pe->p_name); else printf(" %d", sock.proto); - if (sock.proto == IPPROTO_TCP ) { - if (sock.inp_ppcb != 0) - printf(" %lx", (u_long)sock.inp_ppcb); - } - else if (sock.so_pcb != 0) + if (sock.so_pcb != 0) printf(" %lx", (u_long)sock.so_pcb); if (!sflg) break; printf(" %s <-> %s", addr_to_string(&sock.sa_local, src_addr, sizeof(src_addr)), addr_to_string(&sock.sa_peer, dst_addr, sizeof(dst_addr))); break; case AF_UNIX: /* print address of pcb and connected pcb */ if (sock.so_pcb != 0) { printf(" %lx", (u_long)sock.so_pcb); if (sock.unp_conn) { char shoconn[4], *cp; cp = shoconn; if (!(sock.so_rcv_sb_state & SBS_CANTRCVMORE)) *cp++ = '<'; *cp++ = '-'; if (!(sock.so_snd_sb_state & SBS_CANTSENDMORE)) *cp++ = '>'; *cp = '\0'; printf(" %s %lx", shoconn, (u_long)sock.unp_conn); } } if (!sflg) break; sun = (struct sockaddr_un *)&sock.sa_local; /* * While generally we like to print two addresses, * local and peer, for sockets, it turns out to be * more useful to print the first non-null address for * local sockets, as typically they aren't bound and * connected, and the path strings can get long. */ if (sun->sun_path[0] != 0) addr_to_string(&sock.sa_local, src_addr, sizeof(src_addr)); else addr_to_string(&sock.sa_peer, src_addr, sizeof(src_addr)); printf(" %s", src_addr); break; default: /* print protocol number and socket address */ printf(" %d %lx", sock.proto, (u_long)sock.so_addr); } } static void print_pipe_info(struct procstat *procstat, struct filestat *fst) { struct pipestat ps; char errbuf[_POSIX2_LINE_MAX]; int error; error = procstat_get_pipe_info(procstat, fst, &ps, errbuf); if (error != 0) { printf("* error"); return; } printf("* pipe %8lx <-> %8lx", (u_long)ps.addr, (u_long)ps.peer); printf(" %6zd", ps.buffer_cnt); print_access_flags(fst->fs_fflags); } static void print_pts_info(struct procstat *procstat, struct filestat *fst) { struct ptsstat pts; char errbuf[_POSIX2_LINE_MAX]; int error; error = procstat_get_pts_info(procstat, fst, &pts, errbuf); if (error != 0) { printf("* error"); return; } printf("* pseudo-terminal master "); if (nflg || !*pts.devname) { printf("%#10jx", (uintmax_t)pts.dev); } else { printf("%10s", pts.devname); } print_access_flags(fst->fs_fflags); } static void print_sem_info(struct procstat *procstat, struct filestat *fst) { struct semstat sem; char errbuf[_POSIX2_LINE_MAX]; char mode[15]; int error; error = procstat_get_sem_info(procstat, fst, &sem, errbuf); if (error != 0) { printf("* error"); return; } if (nflg) { printf(" "); (void)snprintf(mode, sizeof(mode), "%o", sem.mode); } else { printf(" %-15s", fst->fs_path != NULL ? fst->fs_path : "-"); strmode(sem.mode, mode); } printf(" %10s %6u", mode, sem.value); print_access_flags(fst->fs_fflags); } static void print_shm_info(struct procstat *procstat, struct filestat *fst) { struct shmstat shm; char errbuf[_POSIX2_LINE_MAX]; char mode[15]; int error; error = procstat_get_shm_info(procstat, fst, &shm, errbuf); if (error != 0) { printf("* error"); return; } if (nflg) { printf(" "); (void)snprintf(mode, sizeof(mode), "%o", shm.mode); } else { printf(" %-15s", fst->fs_path != NULL ? 
fst->fs_path : "-"); strmode(shm.mode, mode); } printf(" %10s %6ju", mode, shm.size); print_access_flags(fst->fs_fflags); } static void print_vnode_info(struct procstat *procstat, struct filestat *fst) { struct vnstat vn; char errbuf[_POSIX2_LINE_MAX]; char mode[15]; const char *badtype; int error; badtype = NULL; error = procstat_get_vnode_info(procstat, fst, &vn, errbuf); if (error != 0) badtype = errbuf; else if (vn.vn_type == PS_FST_VTYPE_VBAD) badtype = "bad"; else if (vn.vn_type == PS_FST_VTYPE_VNON) badtype = "none"; if (badtype != NULL) { printf(" - - %10s -", badtype); return; } if (nflg) printf(" %#5jx", (uintmax_t)vn.vn_fsid); else if (vn.vn_mntdir != NULL) (void)printf(" %-8s", vn.vn_mntdir); /* * Print access mode. */ if (nflg) (void)snprintf(mode, sizeof(mode), "%o", vn.vn_mode); else { strmode(vn.vn_mode, mode); } (void)printf(" %6jd %10s", (intmax_t)vn.vn_fileid, mode); if (vn.vn_type == PS_FST_VTYPE_VBLK || vn.vn_type == PS_FST_VTYPE_VCHR) { if (nflg || !*vn.vn_devname) printf(" %#6jx", (uintmax_t)vn.vn_dev); else { printf(" %6s", vn.vn_devname); } } else printf(" %6ju", (uintmax_t)vn.vn_size); print_access_flags(fst->fs_fflags); } static void print_access_flags(int flags) { char rw[3]; rw[0] = '\0'; if (flags & PS_FST_FFLAG_READ) strcat(rw, "r"); if (flags & PS_FST_FFLAG_WRITE) strcat(rw, "w"); printf(" %2s", rw); } int getfname(const char *filename) { struct stat statbuf; DEVS *cur; if (stat(filename, &statbuf)) { warn("%s", filename); return (0); } if ((cur = malloc(sizeof(DEVS))) == NULL) err(1, NULL); cur->next = devs; devs = cur; cur->ino = statbuf.st_ino; cur->fsid = statbuf.st_dev; cur->name = filename; return (1); } static void usage(void) { (void)fprintf(stderr, "usage: fstat [-fmnv] [-M core] [-N system] [-p pid] [-u user] [file ...]\n"); exit(1); } diff --git a/usr.bin/systat/netstat.c b/usr.bin/systat/netstat.c index bc4acabe4eda..6d188bf1dd0d 100644 --- a/usr.bin/systat/netstat.c +++ b/usr.bin/systat/netstat.c @@ -1,635 +1,633 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1980, 1992, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * netstat */ #include #include #include #include #define _WANT_SOCKET #include #include #include #include #include #include #include #ifdef INET6 #include #endif #define _WANT_INPCB #include #include #include #include #include #include #include #define TCPSTATES #include #include #define _WANT_TCPCB #include #include #include #include #include #include #include #include #include "systat.h" #include "extern.h" static struct netinfo *enter(struct in_conninfo *, uint8_t, int, const char *); static void enter_kvm(struct inpcb *, struct socket *, int, const char *); static void enter_sysctl(struct xinpcb *, struct xsocket *, int, const char *); static void fetchnetstat_kvm(void); static void fetchnetstat_sysctl(void); static char *inetname(struct sockaddr *); static void inetprint(struct sockaddr *, const char *); #define streq(a,b) (strcmp(a,b)==0) #define YMAX(w) (getmaxy(w)-2) WINDOW * opennetstat(void) { sethostent(1); setnetent(1); return (subwin(stdscr, LINES-3-1, 0, MAINWIN_ROW, 0)); } struct netinfo { TAILQ_ENTRY(netinfo) chain; short ni_line; /* line on screen */ short ni_seen; /* 0 when not present in list */ short ni_flags; #define NIF_LACHG 0x1 /* local address changed */ #define NIF_FACHG 0x2 /* foreign address changed */ short ni_state; /* tcp state */ const char *ni_proto; /* protocol */ struct sockaddr_storage ni_lsa; /* local address */ struct sockaddr_storage ni_fsa; /* foreign address */ u_int ni_rcvcc; /* rcv buffer character count */ u_int ni_sndcc; /* snd buffer character count */ }; static TAILQ_HEAD(netinfohead, netinfo) netcb = TAILQ_HEAD_INITIALIZER(netcb); static int aflag = 0; static int nflag = 0; static int lastrow = 1; void closenetstat(WINDOW *w) { struct netinfo *p; endhostent(); endnetent(); TAILQ_FOREACH(p, &netcb, chain) { if (p->ni_line != -1) lastrow--; p->ni_line = -1; } if (w != NULL) { wclear(w); wrefresh(w); delwin(w); } } static const char *miblist[] = { "net.inet.tcp.pcblist", "net.inet.udp.pcblist" }; static char tcb[] = "tcb", udb[] = "udb"; struct nlist namelist[] = { #define X_TCB 0 { .n_name = tcb }, #define X_UDB 1 { .n_name = udb }, { .n_name = NULL }, }; int initnetstat(void) { protos = TCP|UDP; return(1); } void fetchnetstat(void) { if (use_kvm) fetchnetstat_kvm(); else fetchnetstat_sysctl(); } static void fetchnetstat_kvm(void) { - struct inpcb *next; struct netinfo *p; struct inpcbhead head; - struct inpcb inpcb; struct socket sockb; struct tcpcb tcpcb; + struct inpcb *inpcb; void *off; int istcp; if (namelist[X_TCB].n_value == 0) return; TAILQ_FOREACH(p, &netcb, chain) p->ni_seen = 0; if (protos&TCP) { off = NPTR(X_TCB); istcp = 1; } else if (protos&UDP) { off = NPTR(X_UDB); istcp = 0; } else { error("No protocols to display"); return; } again: KREAD(off, &head, sizeof (struct inpcbhead)); - LIST_FOREACH(next, &head, inp_list) { - KREAD(next, &inpcb, sizeof (inpcb)); - next = &inpcb; + LIST_FOREACH(inpcb, &head, inp_list) { + KREAD(inpcb, &tcpcb, istcp ? 
sizeof(struct tcpcb) : sizeof(struct inpcb)); + inpcb = (struct inpcb *)&tcpcb; if (!aflag) { - if (inpcb.inp_vflag & INP_IPV4) { - if (inpcb.inp_laddr.s_addr == INADDR_ANY) + if (inpcb->inp_vflag & INP_IPV4) { + if (inpcb->inp_laddr.s_addr == INADDR_ANY) continue; } #ifdef INET6 - else if (inpcb.inp_vflag & INP_IPV6) { - if (memcmp(&inpcb.in6p_laddr, + else if (inpcb->inp_vflag & INP_IPV6) { + if (memcmp(&inpcb->in6p_laddr, &in6addr_any, sizeof(in6addr_any)) == 0) continue; } #endif } - if (nhosts && !checkhost(&inpcb.inp_inc)) + if (nhosts && !checkhost(&inpcb->inp_inc)) continue; - if (nports && !checkport(&inpcb.inp_inc)) + if (nports && !checkport(&inpcb->inp_inc)) continue; if (istcp) { - KREAD(inpcb.inp_socket, &sockb, sizeof (sockb)); - KREAD(inpcb.inp_ppcb, &tcpcb, sizeof (tcpcb)); - enter_kvm(&inpcb, &sockb, tcpcb.t_state, "tcp"); + KREAD(inpcb->inp_socket, &sockb, sizeof (sockb)); + enter_kvm(inpcb, &sockb, tcpcb.t_state, "tcp"); } else - enter_kvm(&inpcb, &sockb, 0, "udp"); + enter_kvm(inpcb, &sockb, 0, "udp"); } if (istcp && (protos&UDP)) { istcp = 0; off = NPTR(X_UDB); goto again; } } static void fetchnetstat_sysctl(void) { struct netinfo *p; int idx; struct xinpgen *inpg; char *cur, *end; struct xinpcb *xip = NULL; struct xtcpcb *xtp = NULL; int plen; size_t lsz; TAILQ_FOREACH(p, &netcb, chain) p->ni_seen = 0; if (protos&TCP) { idx = 0; } else if (protos&UDP) { idx = 1; } else { error("No protocols to display"); return; } for (;idx < 2; idx++) { if (idx == 1 && !(protos&UDP)) break; inpg = (struct xinpgen *)sysctl_dynread(miblist[idx], &lsz); if (inpg == NULL) { error("sysctl(%s...) failed", miblist[idx]); continue; } /* * We currently do not require a consistent pcb list. * Try to be robust in case of struct size changes */ cur = ((char *)inpg) + inpg->xig_len; /* There is also a trailing struct xinpgen */ end = ((char *)inpg) + lsz - inpg->xig_len; if (end <= cur) { free(inpg); continue; } if (idx == 0) { /* TCP */ xtp = (struct xtcpcb *)cur; plen = xtp->xt_len; } else { xip = (struct xinpcb *)cur; plen = xip->xi_len; } while (cur + plen <= end) { if (idx == 0) { /* TCP */ xtp = (struct xtcpcb *)cur; xip = &xtp->xt_inp; } else { xip = (struct xinpcb *)cur; } cur += plen; if (!aflag) { if (xip->inp_vflag & INP_IPV4) { if (xip->inp_laddr.s_addr == INADDR_ANY) continue; } #ifdef INET6 else if (xip->inp_vflag & INP_IPV6) { if (memcmp(&xip->in6p_laddr, &in6addr_any, sizeof(in6addr_any)) == 0) continue; } #endif } if (nhosts && !checkhost(&xip->inp_inc)) continue; if (nports && !checkport(&xip->inp_inc)) continue; if (idx == 0) enter_sysctl(xip, &xip->xi_socket, xtp->t_state, "tcp"); else enter_sysctl(xip, &xip->xi_socket, 0, "udp"); } free(inpg); } } static void enter_kvm(struct inpcb *inp, struct socket *so, int state, const char *proto) { struct netinfo *p; if ((p = enter(&inp->inp_inc, inp->inp_vflag, state, proto)) != NULL) { p->ni_rcvcc = so->so_rcv.sb_ccc; p->ni_sndcc = so->so_snd.sb_ccc; } } static void enter_sysctl(struct xinpcb *xip, struct xsocket *so, int state, const char *proto) { struct netinfo *p; if ((p = enter(&xip->inp_inc, xip->inp_vflag, state, proto)) != NULL) { p->ni_rcvcc = so->so_rcv.sb_cc; p->ni_sndcc = so->so_snd.sb_cc; } } static struct netinfo * enter(struct in_conninfo *inc, uint8_t vflag, int state, const char *proto) { struct netinfo *p; struct sockaddr_storage lsa, fsa; struct sockaddr_in *sa4; #ifdef INET6 struct sockaddr_in6 *sa6; #endif memset(&lsa, 0, sizeof(lsa)); memset(&fsa, 0, sizeof(fsa)); if (vflag & INP_IPV4) { sa4 = (struct sockaddr_in *)&lsa;
sa4->sin_addr = inc->inc_laddr; sa4->sin_port = inc->inc_lport; sa4->sin_family = AF_INET; sa4->sin_len = sizeof(struct sockaddr_in); sa4 = (struct sockaddr_in *)&fsa; sa4->sin_addr = inc->inc_faddr; sa4->sin_port = inc->inc_fport; sa4->sin_family = AF_INET; sa4->sin_len = sizeof(struct sockaddr_in); } #ifdef INET6 else if (vflag & INP_IPV6) { sa6 = (struct sockaddr_in6 *)&lsa; memcpy(&sa6->sin6_addr, &inc->inc6_laddr, sizeof(struct in6_addr)); sa6->sin6_port = inc->inc_lport; sa6->sin6_family = AF_INET6; sa6->sin6_len = sizeof(struct sockaddr_in6); sa6 = (struct sockaddr_in6 *)&fsa; memcpy(&sa6->sin6_addr, &inc->inc6_faddr, sizeof(struct in6_addr)); sa6->sin6_port = inc->inc_fport; sa6->sin6_family = AF_INET6; sa6->sin6_len = sizeof(struct sockaddr_in6); } #endif else return NULL; /* * Only take exact matches, any sockets with * previously unbound addresses will be deleted * below in the display routine because they * will appear as ``not seen'' in the kernel * data structures. */ TAILQ_FOREACH(p, &netcb, chain) { if (!streq(proto, p->ni_proto)) continue; if (p->ni_lsa.ss_family != lsa.ss_family || memcmp(&p->ni_lsa, &lsa, lsa.ss_len) != 0) continue; if (p->ni_fsa.ss_family == fsa.ss_family && memcmp(&p->ni_fsa, &fsa, fsa.ss_len) == 0) break; } if (p == NULL) { if ((p = malloc(sizeof(*p))) == NULL) { error("Out of memory"); return NULL; } TAILQ_INSERT_HEAD(&netcb, p, chain); p->ni_line = -1; memcpy(&p->ni_lsa, &lsa, lsa.ss_len); memcpy(&p->ni_fsa, &fsa, fsa.ss_len); p->ni_proto = strdup(proto); p->ni_flags = NIF_LACHG|NIF_FACHG; } p->ni_state = state; p->ni_seen = 1; return p; } /* column locations */ #define LADDR 0 #define FADDR LADDR+23 #define PROTO FADDR+23 #define RCVCC PROTO+6 #define SNDCC RCVCC+7 #define STATE SNDCC+7 void labelnetstat(void) { if (use_kvm && namelist[X_TCB].n_type == 0) return; wmove(wnd, 0, 0); wclrtobot(wnd); mvwaddstr(wnd, 0, LADDR, "Local Address"); mvwaddstr(wnd, 0, FADDR, "Foreign Address"); mvwaddstr(wnd, 0, PROTO, "Proto"); mvwaddstr(wnd, 0, RCVCC, "Recv-Q"); mvwaddstr(wnd, 0, SNDCC, "Send-Q"); mvwaddstr(wnd, 0, STATE, "(state)"); } void shownetstat(void) { struct netinfo *p, *q; char proto[6]; const char *family = ""; /* * First, delete any connections that have gone * away and adjust the position of connections * below to reflect the deleted line. */ p = TAILQ_FIRST(&netcb); while (p != NULL) { if (p->ni_line == -1 || p->ni_seen) { p = TAILQ_NEXT(p, chain); continue; } wmove(wnd, p->ni_line, 0); wdeleteln(wnd); TAILQ_FOREACH(q, &netcb, chain) if (q != p && q->ni_line > p->ni_line) { q->ni_line--; /* this shouldn't be necessary */ q->ni_flags |= NIF_LACHG|NIF_FACHG; } lastrow--; q = TAILQ_NEXT(p, chain); TAILQ_REMOVE(&netcb, p, chain); free(p); p = q; } /* * Update existing connections and add new ones. */ TAILQ_FOREACH(p, &netcb, chain) { if (p->ni_line == -1) { /* * Add a new entry if possible. */ if (lastrow > YMAX(wnd)) continue; p->ni_line = lastrow++; p->ni_flags |= NIF_LACHG|NIF_FACHG; } if (p->ni_flags & NIF_LACHG) { wmove(wnd, p->ni_line, LADDR); inetprint((struct sockaddr *)&p->ni_lsa, p->ni_proto); p->ni_flags &= ~NIF_LACHG; } if (p->ni_flags & NIF_FACHG) { wmove(wnd, p->ni_line, FADDR); inetprint((struct sockaddr *)&p->ni_fsa, p->ni_proto); p->ni_flags &= ~NIF_FACHG; } #ifdef INET6 family = (p->ni_lsa.ss_family == AF_INET) ? 
"4" : "6"; #endif snprintf(proto, sizeof(proto), "%s%s", p->ni_proto, family); mvwaddstr(wnd, p->ni_line, PROTO, proto); mvwprintw(wnd, p->ni_line, RCVCC, "%6u", p->ni_rcvcc); mvwprintw(wnd, p->ni_line, SNDCC, "%6u", p->ni_sndcc); if (streq(p->ni_proto, "tcp")) { if (p->ni_state < 0 || p->ni_state >= TCP_NSTATES) mvwprintw(wnd, p->ni_line, STATE, "%d", p->ni_state); else mvwaddstr(wnd, p->ni_line, STATE, tcpstates[p->ni_state]); } wclrtoeol(wnd); } if (lastrow < YMAX(wnd)) { wmove(wnd, lastrow, 0); wclrtobot(wnd); wmove(wnd, YMAX(wnd), 0); wdeleteln(wnd); /* XXX */ } } /* * Pretty print an Internet address (net address + port). * If the nflag was specified, use numbers instead of names. */ static void inetprint(struct sockaddr *sa, const char *proto) { struct servent *sp = 0; char line[80], *cp; int port; switch (sa->sa_family) { case AF_INET: port = ((struct sockaddr_in *)sa)->sin_port; break; #ifdef INET6 case AF_INET6: port = ((struct sockaddr_in6 *)sa)->sin6_port; break; #endif default: port = 0; break; } snprintf(line, sizeof(line), "%.*s.", 16, inetname(sa)); cp = strchr(line, '\0'); if (!nflag && port) sp = getservbyport(port, proto); if (sp || port == 0) snprintf(cp, sizeof(line) - (cp - line), "%.8s", sp ? sp->s_name : "*"); else snprintf(cp, sizeof(line) - (cp - line), "%d", ntohs((u_short)port)); /* pad to full column to clear any garbage */ cp = strchr(line, '\0'); while (cp - line < 22) *cp++ = ' '; line[22] = '\0'; waddstr(wnd, line); } /* * Construct an Internet address representation. * If the nflag has been supplied, give * numeric value, otherwise try for symbolic name. */ static char * inetname(struct sockaddr *sa) { char *cp = 0; static char line[NI_MAXHOST]; struct hostent *hp; struct in_addr in; #ifdef INET6 if (sa->sa_family == AF_INET6) { if (memcmp(&((struct sockaddr_in6 *)sa)->sin6_addr, &in6addr_any, sizeof(in6addr_any)) == 0) strcpy(line, "*"); else getnameinfo(sa, sa->sa_len, line, sizeof(line), NULL, 0, nflag ? NI_NUMERICHOST : 0); return (line); } #endif in = ((struct sockaddr_in *)sa)->sin_addr; if (!nflag && in.s_addr != INADDR_ANY) { hp = gethostbyaddr((char *)&in, sizeof (in), AF_INET); if (hp) cp = hp->h_name; } if (in.s_addr == INADDR_ANY) strcpy(line, "*"); else if (cp) snprintf(line, sizeof(line), "%s", cp); else { in.s_addr = ntohl(in.s_addr); #define C(x) ((x) & 0xff) snprintf(line, sizeof(line), "%u.%u.%u.%u", C(in.s_addr >> 24), C(in.s_addr >> 16), C(in.s_addr >> 8), C(in.s_addr)); } return (line); } int cmdnetstat(const char *cmd, const char *args) { if (prefix(cmd, "all")) { aflag = !aflag; goto fixup; } if (prefix(cmd, "numbers") || prefix(cmd, "names")) { struct netinfo *p; int new; new = prefix(cmd, "numbers"); if (new == nflag) return (1); TAILQ_FOREACH(p, &netcb, chain) { if (p->ni_line == -1) continue; p->ni_flags |= NIF_LACHG|NIF_FACHG; } nflag = new; goto redisplay; } if (!netcmd(cmd, args)) return (0); fixup: fetchnetstat(); redisplay: shownetstat(); refresh(); return (1); }