Index: head/sys/cam/scsi/scsi_pass.c =================================================================== --- head/sys/cam/scsi/scsi_pass.c (revision 299863) +++ head/sys/cam/scsi/scsi_pass.c (revision 299864) @@ -1,2222 +1,2220 @@ /*- * Copyright (c) 1997, 1998, 2000 Justin T. Gibbs. * Copyright (c) 1997, 1998, 1999 Kenneth D. Merry. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification, immediately at the beginning of the file. * 2. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); -#include "opt_kdtrace.h" - #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef enum { PASS_FLAG_OPEN = 0x01, PASS_FLAG_LOCKED = 0x02, PASS_FLAG_INVALID = 0x04, PASS_FLAG_INITIAL_PHYSPATH = 0x08, PASS_FLAG_ZONE_INPROG = 0x10, PASS_FLAG_ZONE_VALID = 0x20, PASS_FLAG_UNMAPPED_CAPABLE = 0x40, PASS_FLAG_ABANDONED_REF_SET = 0x80 } pass_flags; typedef enum { PASS_STATE_NORMAL } pass_state; typedef enum { PASS_CCB_BUFFER_IO, PASS_CCB_QUEUED_IO } pass_ccb_types; #define ccb_type ppriv_field0 #define ccb_ioreq ppriv_ptr1 /* * The maximum number of memory segments we preallocate. */ #define PASS_MAX_SEGS 16 typedef enum { PASS_IO_NONE = 0x00, PASS_IO_USER_SEG_MALLOC = 0x01, PASS_IO_KERN_SEG_MALLOC = 0x02, PASS_IO_ABANDONED = 0x04 } pass_io_flags; struct pass_io_req { union ccb ccb; union ccb *alloced_ccb; union ccb *user_ccb_ptr; camq_entry user_periph_links; ccb_ppriv_area user_periph_priv; struct cam_periph_map_info mapinfo; pass_io_flags flags; ccb_flags data_flags; int num_user_segs; bus_dma_segment_t user_segs[PASS_MAX_SEGS]; int num_kern_segs; bus_dma_segment_t kern_segs[PASS_MAX_SEGS]; bus_dma_segment_t *user_segptr; bus_dma_segment_t *kern_segptr; int num_bufs; uint32_t dirs[CAM_PERIPH_MAXMAPS]; uint32_t lengths[CAM_PERIPH_MAXMAPS]; uint8_t *user_bufs[CAM_PERIPH_MAXMAPS]; uint8_t *kern_bufs[CAM_PERIPH_MAXMAPS]; struct bintime start_time; TAILQ_ENTRY(pass_io_req) links; }; struct pass_softc { pass_state state; pass_flags flags; u_int8_t pd_type; union ccb saved_ccb; int open_count; u_int maxio; struct devstat *device_stats; struct cdev *dev; struct cdev *alias_dev; struct task add_physpath_task; struct task shutdown_kqueue_task; struct selinfo read_select; TAILQ_HEAD(, pass_io_req) incoming_queue; 
TAILQ_HEAD(, pass_io_req) active_queue; TAILQ_HEAD(, pass_io_req) abandoned_queue; TAILQ_HEAD(, pass_io_req) done_queue; struct cam_periph *periph; char zone_name[12]; char io_zone_name[12]; uma_zone_t pass_zone; uma_zone_t pass_io_zone; size_t io_zone_size; }; static d_open_t passopen; static d_close_t passclose; static d_ioctl_t passioctl; static d_ioctl_t passdoioctl; static d_poll_t passpoll; static d_kqfilter_t passkqfilter; static void passreadfiltdetach(struct knote *kn); static int passreadfilt(struct knote *kn, long hint); static periph_init_t passinit; static periph_ctor_t passregister; static periph_oninv_t passoninvalidate; static periph_dtor_t passcleanup; static periph_start_t passstart; static void pass_shutdown_kqueue(void *context, int pending); static void pass_add_physpath(void *context, int pending); static void passasync(void *callback_arg, u_int32_t code, struct cam_path *path, void *arg); static void passdone(struct cam_periph *periph, union ccb *done_ccb); static int passcreatezone(struct cam_periph *periph); static void passiocleanup(struct pass_softc *softc, struct pass_io_req *io_req); static int passcopysglist(struct cam_periph *periph, struct pass_io_req *io_req, ccb_flags direction); static int passmemsetup(struct cam_periph *periph, struct pass_io_req *io_req); static int passmemdone(struct cam_periph *periph, struct pass_io_req *io_req); static int passerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags); static int passsendccb(struct cam_periph *periph, union ccb *ccb, union ccb *inccb); static struct periph_driver passdriver = { passinit, "pass", TAILQ_HEAD_INITIALIZER(passdriver.units), /* generation */ 0 }; PERIPHDRIVER_DECLARE(pass, passdriver); static struct cdevsw pass_cdevsw = { .d_version = D_VERSION, .d_flags = D_TRACKCLOSE, .d_open = passopen, .d_close = passclose, .d_ioctl = passioctl, .d_poll = passpoll, .d_kqfilter = passkqfilter, .d_name = "pass", }; static struct filterops passread_filtops = { .f_isfd = 
1, .f_detach = passreadfiltdetach, .f_event = passreadfilt }; static MALLOC_DEFINE(M_SCSIPASS, "scsi_pass", "scsi passthrough buffers"); static void passinit(void) { cam_status status; /* * Install a global async callback. This callback will * receive async callbacks like "new device found". */ status = xpt_register_async(AC_FOUND_DEVICE, passasync, NULL, NULL); if (status != CAM_REQ_CMP) { printf("pass: Failed to attach master async callback " "due to status 0x%x!\n", status); } } static void passrejectios(struct cam_periph *periph) { struct pass_io_req *io_req, *io_req2; struct pass_softc *softc; softc = (struct pass_softc *)periph->softc; /* * The user can no longer get status for I/O on the done queue, so * clean up all outstanding I/O on the done queue. */ TAILQ_FOREACH_SAFE(io_req, &softc->done_queue, links, io_req2) { TAILQ_REMOVE(&softc->done_queue, io_req, links); passiocleanup(softc, io_req); uma_zfree(softc->pass_zone, io_req); } /* * The underlying device is gone, so we can't issue these I/Os. * The devfs node has been shut down, so we can't return status to * the user. Free any I/O left on the incoming queue. */ TAILQ_FOREACH_SAFE(io_req, &softc->incoming_queue, links, io_req2) { TAILQ_REMOVE(&softc->incoming_queue, io_req, links); passiocleanup(softc, io_req); uma_zfree(softc->pass_zone, io_req); } /* * Normally we would put I/Os on the abandoned queue and acquire a * reference when we saw the final close. But, the device went * away and devfs may have moved everything off to deadfs by the * time the I/O done callback is called; as a result, we won't see * any more closes. So, if we have any active I/Os, we need to put * them on the abandoned queue. When the abandoned queue is empty, * we'll release the remaining reference (see below) to the peripheral. 
*/ TAILQ_FOREACH_SAFE(io_req, &softc->active_queue, links, io_req2) { TAILQ_REMOVE(&softc->active_queue, io_req, links); io_req->flags |= PASS_IO_ABANDONED; TAILQ_INSERT_TAIL(&softc->abandoned_queue, io_req, links); } /* * If we put any I/O on the abandoned queue, acquire a reference. */ if ((!TAILQ_EMPTY(&softc->abandoned_queue)) && ((softc->flags & PASS_FLAG_ABANDONED_REF_SET) == 0)) { cam_periph_doacquire(periph); softc->flags |= PASS_FLAG_ABANDONED_REF_SET; } } static void passdevgonecb(void *arg) { struct cam_periph *periph; struct mtx *mtx; struct pass_softc *softc; int i; periph = (struct cam_periph *)arg; mtx = cam_periph_mtx(periph); mtx_lock(mtx); softc = (struct pass_softc *)periph->softc; KASSERT(softc->open_count >= 0, ("Negative open count %d", softc->open_count)); /* * When we get this callback, we will get no more close calls from * devfs. So if we have any dangling opens, we need to release the * reference held for that particular context. */ for (i = 0; i < softc->open_count; i++) cam_periph_release_locked(periph); softc->open_count = 0; /* * Release the reference held for the device node, it is gone now. * Accordingly, inform all queued I/Os of their fate. */ cam_periph_release_locked(periph); passrejectios(periph); /* * We reference the SIM lock directly here, instead of using * cam_periph_unlock(). The reason is that the final call to * cam_periph_release_locked() above could result in the periph * getting freed. If that is the case, dereferencing the periph * with a cam_periph_unlock() call would cause a page fault. */ mtx_unlock(mtx); /* * We have to remove our kqueue context from a thread because it * may sleep. It would be nice if we could get a callback from * kqueue when it is done cleaning up resources. */ taskqueue_enqueue(taskqueue_thread, &softc->shutdown_kqueue_task); } static void passoninvalidate(struct cam_periph *periph) { struct pass_softc *softc; softc = (struct pass_softc *)periph->softc; /* * De-register any async callbacks. 
*/
	xpt_register_async(0, passasync, periph, periph->path);

	softc->flags |= PASS_FLAG_INVALID;

	/*
	 * Tell devfs this device has gone away, and ask for a callback
	 * when it has cleaned up its state.
	 */
	destroy_dev_sched_cb(softc->dev, passdevgonecb, periph);
}

/*
 * Peripheral destructor.  Entered with the periph lock held (asserted).
 * All four request queues must already be empty (asserted).  Removes the
 * devstat entry, temporarily drops the periph lock to drain the physpath
 * task, re-takes the lock and frees the softc.
 */
static void
passcleanup(struct cam_periph *periph)
{
	struct pass_softc *softc;

	softc = (struct pass_softc *)periph->softc;

	cam_periph_assert(periph, MA_OWNED);
	KASSERT(TAILQ_EMPTY(&softc->active_queue),
		("%s called when there are commands on the active queue!\n",
		__func__));
	KASSERT(TAILQ_EMPTY(&softc->abandoned_queue),
		("%s called when there are commands on the abandoned queue!\n",
		__func__));
	KASSERT(TAILQ_EMPTY(&softc->incoming_queue),
		("%s called when there are commands on the incoming queue!\n",
		__func__));
	KASSERT(TAILQ_EMPTY(&softc->done_queue),
		("%s called when there are commands on the done queue!\n",
		__func__));

	devstat_remove_entry(softc->device_stats);

	/* The lock is dropped here because taskqueue_drain() below can sleep. */
	cam_periph_unlock(periph);

	/*
	 * We call taskqueue_drain() for the physpath task to make sure it
	 * is complete.  We drop the lock because this can potentially
	 * sleep.  XXX KDM that is bad.  Need a way to get a callback when
	 * a taskqueue is drained.
	 *
	 * Note that we don't drain the kqueue shutdown task queue.  This
	 * is because we hold a reference on the periph for kqueue, and
	 * release that reference from the kqueue shutdown task queue.  So
	 * we cannot come into this routine unless we've released that
	 * reference.  Also, because that could be the last reference, we
	 * could be called from the cam_periph_release() call in
	 * pass_shutdown_kqueue().  In that case, the taskqueue_drain()
	 * would deadlock.  It would be preferable if we had a way to
	 * get a callback when a taskqueue is done.
	 */
	taskqueue_drain(taskqueue_thread, &softc->add_physpath_task);

	cam_periph_lock(periph);

	/* Matches the M_DEVBUF allocation of the softc in passregister(). */
	free(softc, M_DEVBUF);
}

/*
 * Taskqueue handler (queued from passdevgonecb()): clears and destroys the
 * knote list behind read_select, then drops the periph reference that was
 * held on behalf of kqueue.
 *
 * context: the struct cam_periph this task was initialized with.
 * pending: taskqueue pending count; not used here.
 */
static void
pass_shutdown_kqueue(void *context, int pending)
{
	struct cam_periph *periph;
	struct pass_softc *softc;

	periph = context;
	softc = periph->softc;

	knlist_clear(&softc->read_select.si_note, /*is_locked*/ 0);
	knlist_destroy(&softc->read_select.si_note);

	/*
	 * Release the reference we held for kqueue.
	 */
	cam_periph_release(periph);
}

/*
 * Taskqueue handler that creates (or updates) the devfs physical-path alias
 * for this device.  If the periph is still valid and a non-empty
 * "GEOM::physpath" attribute can be fetched, the periph mutex is dropped
 * around make_dev_physpath_alias().  In every case PASS_FLAG_INITIAL_PHYSPATH
 * ends up set and one periph reference is released per pending enqueue.
 *
 * context: the struct cam_periph this task was initialized with.
 * pending: number of times the task was enqueued before running; each
 *          enqueue is paired with a reference that is released here.
 */
static void
pass_add_physpath(void *context, int pending)
{
	struct cam_periph *periph;
	struct pass_softc *softc;
	struct mtx *mtx;
	char *physpath;

	/*
	 * If we have one, create a devfs alias for our
	 * physical path.
	 */
	periph = context;
	softc = periph->softc;
	/* M_WAITOK: the allocation may sleep, so do it before taking the mutex. */
	physpath = malloc(MAXPATHLEN, M_DEVBUF, M_WAITOK);
	mtx = cam_periph_mtx(periph);
	mtx_lock(mtx);

	if (periph->flags & CAM_PERIPH_INVALID)
		goto out;

	if (xpt_getattr(physpath, MAXPATHLEN,
			"GEOM::physpath", periph->path) == 0
	 && strlen(physpath) != 0) {

		/* Called with MAKEDEV_WAITOK, so the mutex is released around it. */
		mtx_unlock(mtx);
		make_dev_physpath_alias(MAKEDEV_WAITOK, &softc->alias_dev,
					softc->dev, softc->alias_dev, physpath);
		mtx_lock(mtx);
	}

out:
	/*
	 * Now that we've made our alias, we no longer have to have a
	 * reference to the device.
	 */
	if ((softc->flags & PASS_FLAG_INITIAL_PHYSPATH) == 0)
		softc->flags |= PASS_FLAG_INITIAL_PHYSPATH;

	/*
	 * We always acquire a reference to the periph before queueing this
	 * task queue function, so it won't go away before we run.
	 */
	while (pending-- > 0)
		cam_periph_release_locked(periph);
	/* Unlock via the saved mutex pointer, not through the periph. */
	mtx_unlock(mtx);

	free(physpath, M_DEVBUF);
}

/*
 * Async event callback.  AC_FOUND_DEVICE allocates a new pass periph for the
 * reported device; other event codes are handled further down.
 */
static void
passasync(void *callback_arg, u_int32_t code,
	  struct cam_path *path, void *arg)
{
	struct cam_periph *periph;

	periph = (struct cam_periph *)callback_arg;

	switch (code) {
	case AC_FOUND_DEVICE:
	{
		struct ccb_getdev *cgd;
		cam_status status;

		cgd = (struct ccb_getdev *)arg;
		if (cgd == NULL)
			break;

		/*
		 * Allocate a peripheral instance for
		 * this device and start the probe
		 * process.
*/
		status = cam_periph_alloc(passregister, passoninvalidate,
					  passcleanup, passstart, "pass",
					  CAM_PERIPH_BIO, path,
					  passasync, AC_FOUND_DEVICE, cgd);

		/* CAM_REQ_INPROG is not treated as a failure here. */
		if (status != CAM_REQ_CMP
		 && status != CAM_REQ_INPROG) {
			const struct cam_status_entry *entry;

			entry = cam_fetch_status_entry(status);

			printf("passasync: Unable to attach new device "
			       "due to status %#x: %s\n", status, entry ?
			       entry->status_text : "Unknown");
		}

		break;
	}
	case AC_ADVINFO_CHANGED:
	{
		uintptr_t buftype;

		buftype = (uintptr_t)arg;
		/* Only a physical-path change triggers the alias task. */
		if (buftype == CDAI_TYPE_PHYS_PATH) {
			struct pass_softc *softc;
			cam_status status;

			softc = (struct pass_softc *)periph->softc;
			/*
			 * Acquire a reference to the periph before we
			 * start the taskqueue, so that we don't run into
			 * a situation where the periph goes away before
			 * the task queue has a chance to run.
			 */
			status = cam_periph_acquire(periph);
			if (status != CAM_REQ_CMP)
				break;

			taskqueue_enqueue(taskqueue_thread,
					  &softc->add_physpath_task);
		}
		break;
	}
	default:
		cam_periph_async(periph, code, path, arg);
		break;
	}
}

/*
 * Peripheral constructor, passed to cam_periph_alloc() from passasync().
 *
 * arg is the struct ccb_getdev describing the device; NULL is rejected.
 * Allocates and zeroes the softc, derives the device type from the
 * protocol, initializes the four request queues, the UMA zone names and
 * the knote list, queries the path for maxio and unmapped-I/O capability,
 * creates the devstat entry and the devfs node, queues the physpath task
 * and registers for AC_LOST_DEVICE / AC_ADVINFO_CHANGED.
 *
 * The periph lock is dropped before devstat_new_entry() and re-taken
 * before every return that follows.
 *
 * Returns CAM_REQ_CMP on success and CAM_REQ_CMP_ERR on any failure.
 *
 * NOTE(review): the failure returns after "periph->softc = softc" do not
 * free the softc or the devstat entry in this function, and the later
 * ones return with earlier cam_periph_acquire() references still held
 * (see the notes at each site).  Whether the caller unwinds any of this
 * is not visible here -- confirm.
 */
static cam_status
passregister(struct cam_periph *periph, void *arg)
{
	struct pass_softc *softc;
	struct ccb_getdev *cgd;
	struct ccb_pathinq cpi;
	struct make_dev_args args;
	int error, no_tags;

	cgd = (struct ccb_getdev *)arg;
	if (cgd == NULL) {
		printf("%s: no getdev CCB, can't register device\n", __func__);
		return(CAM_REQ_CMP_ERR);
	}

	/* M_NOWAIT: the allocation may fail and is checked below. */
	softc = (struct pass_softc *)malloc(sizeof(*softc),
					    M_DEVBUF, M_NOWAIT);

	if (softc == NULL) {
		printf("%s: Unable to probe new device. "
		       "Unable to allocate softc\n", __func__);
		return(CAM_REQ_CMP_ERR);
	}

	bzero(softc, sizeof(*softc));
	softc->state = PASS_STATE_NORMAL;
	/* Device type: from inquiry data for SCSI/ATAPI, fixed otherwise. */
	if (cgd->protocol == PROTO_SCSI || cgd->protocol == PROTO_ATAPI)
		softc->pd_type = SID_TYPE(&cgd->inq_data);
	else if (cgd->protocol == PROTO_SATAPM)
		softc->pd_type = T_ENCLOSURE;
	else
		softc->pd_type = T_DIRECT;

	periph->softc = softc;
	softc->periph = periph;
	TAILQ_INIT(&softc->incoming_queue);
	TAILQ_INIT(&softc->active_queue);
	TAILQ_INIT(&softc->abandoned_queue);
	TAILQ_INIT(&softc->done_queue);
	/* Names later handed to uma_zcreate() by passcreatezone(). */
	snprintf(softc->zone_name, sizeof(softc->zone_name), "%s%d",
		 periph->periph_name, periph->unit_number);
	snprintf(softc->io_zone_name, sizeof(softc->io_zone_name), "%s%dIO",
		 periph->periph_name, periph->unit_number);
	softc->io_zone_size = MAXPHYS;
	knlist_init_mtx(&softc->read_select.si_note, cam_periph_mtx(periph));

	/* Ask the path for its limits (XPT_PATH_INQ). */
	bzero(&cpi, sizeof(cpi));
	xpt_setup_ccb(&cpi.ccb_h, periph->path, CAM_PRIORITY_NORMAL);
	cpi.ccb_h.func_code = XPT_PATH_INQ;
	xpt_action((union ccb *)&cpi);

	if (cpi.maxio == 0)
		softc->maxio = DFLTPHYS;	/* traditional default */
	else if (cpi.maxio > MAXPHYS)
		softc->maxio = MAXPHYS;		/* for safety */
	else
		softc->maxio = cpi.maxio;	/* real value */

	if (cpi.hba_misc & PIM_UNMAPPED)
		softc->flags |= PASS_FLAG_UNMAPPED_CAPABLE;

	/*
	 * We pass in 0 for a blocksize, since we don't 
	 * know what the blocksize of this device is, if 
	 * it even has a blocksize.
	 */
	cam_periph_unlock(periph);
	no_tags = (cgd->inq_data.flags & SID_CmdQue) == 0;
	softc->device_stats = devstat_new_entry("pass",
			  periph->unit_number, 0,
			  DEVSTAT_NO_BLOCKSIZE
			  | (no_tags ? DEVSTAT_NO_ORDERED_TAGS : 0),
			  softc->pd_type |
			  XPORT_DEVSTAT_TYPE(cpi.transport) |
			  DEVSTAT_TYPE_PASS,
			  DEVSTAT_PRIORITY_PASS);

	/*
	 * Initialize the taskqueue handler for shutting down kqueue.
	 */
	TASK_INIT(&softc->shutdown_kqueue_task, /*priority*/ 0,
		  pass_shutdown_kqueue, periph);

	/*
	 * Acquire a reference to the periph that we can release once we've
	 * cleaned up the kqueue.
	 */
	if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
		xpt_print(periph->path, "%s: lost periph during "
			  "registration!\n", __func__);
		cam_periph_lock(periph);
		return (CAM_REQ_CMP_ERR);
	}

	/*
	 * Acquire a reference to the periph before we create the devfs
	 * instance for it.  We'll release this reference once the devfs
	 * instance has been freed.
	 */
	if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
		xpt_print(periph->path, "%s: lost periph during "
			  "registration!\n", __func__);
		cam_periph_lock(periph);
		/* NOTE(review): the kqueue reference taken above is still held. */
		return (CAM_REQ_CMP_ERR);
	}

	/* Register the device */
	make_dev_args_init(&args);
	args.mda_devsw = &pass_cdevsw;
	args.mda_unit = periph->unit_number;
	args.mda_uid = UID_ROOT;
	args.mda_gid = GID_OPERATOR;
	args.mda_mode = 0600;
	/* passopen()/passclose() recover the periph from si_drv1. */
	args.mda_si_drv1 = periph;
	error = make_dev_s(&args, &softc->dev, "%s%d", periph->periph_name,
	    periph->unit_number);
	if (error != 0) {
		cam_periph_lock(periph);
		/*
		 * NOTE(review): two references were acquired above but only
		 * one is released on this path -- confirm this is intended.
		 */
		cam_periph_release_locked(periph);
		return (CAM_REQ_CMP_ERR);
	}

	/*
	 * Hold a reference to the periph before we create the physical
	 * path alias so it can't go away.
	 */
	if (cam_periph_acquire(periph) != CAM_REQ_CMP) {
		xpt_print(periph->path, "%s: lost periph during "
			  "registration!\n", __func__);
		cam_periph_lock(periph);
		/*
		 * NOTE(review): the device node created above and both
		 * earlier references are left in place on this path.
		 */
		return (CAM_REQ_CMP_ERR);
	}

	cam_periph_lock(periph);

	TASK_INIT(&softc->add_physpath_task, /*priority*/0,
		  pass_add_physpath, periph);

	/*
	 * See if physical path information is already available.
	 */
	taskqueue_enqueue(taskqueue_thread, &softc->add_physpath_task);

	/*
	 * Add an async callback so that we get notified if
	 * this device goes away or its physical path
	 * (stored in the advanced info data of the EDT) has
	 * changed.
	 */
	xpt_register_async(AC_LOST_DEVICE | AC_ADVINFO_CHANGED,
			   passasync, periph, periph->path);

	if (bootverbose)
		xpt_announce_periph(periph, NULL);

	return(CAM_REQ_CMP);
}

/*
 * d_open handler.  Takes a periph reference for the open and rejects the
 * open (dropping that reference again) when the device has been
 * invalidated, when securelevel_gt(..., 1) reports an error, or when the
 * descriptor is not opened for both reading and writing.
 */
static int
passopen(struct cdev *dev, int flags, int fmt, struct thread *td)
{
	struct cam_periph *periph;
	struct pass_softc *softc;
	int error;

	periph = (struct cam_periph *)dev->si_drv1;
	if (cam_periph_acquire(periph) != CAM_REQ_CMP)
		return (ENXIO);

	cam_periph_lock(periph);

	softc = (struct pass_softc *)periph->softc;

	if (softc->flags & PASS_FLAG_INVALID) {
		cam_periph_release_locked(periph);
		cam_periph_unlock(periph);
		return(ENXIO);
	}

	/*
	 * Don't allow access when we're running at a high securelevel.
	 */
	error = securelevel_gt(td->td_ucred, 1);
	if (error) {
		cam_periph_release_locked(periph);
		cam_periph_unlock(periph);
		return(error);
	}

	/*
	 * Only allow read-write access.
	 */
	if (((flags & FWRITE) == 0) || ((flags & FREAD) == 0)) {
		cam_periph_release_locked(periph);
		cam_periph_unlock(periph);
		return(EPERM);
	}

	/*
	 * We don't allow nonblocking access.
*/ if ((flags & O_NONBLOCK) != 0) { xpt_print(periph->path, "can't do nonblocking access\n"); cam_periph_release_locked(periph); cam_periph_unlock(periph); return(EINVAL); } softc->open_count++; cam_periph_unlock(periph); return (error); } static int passclose(struct cdev *dev, int flag, int fmt, struct thread *td) { struct cam_periph *periph; struct pass_softc *softc; struct mtx *mtx; periph = (struct cam_periph *)dev->si_drv1; mtx = cam_periph_mtx(periph); mtx_lock(mtx); softc = periph->softc; softc->open_count--; if (softc->open_count == 0) { struct pass_io_req *io_req, *io_req2; TAILQ_FOREACH_SAFE(io_req, &softc->done_queue, links, io_req2) { TAILQ_REMOVE(&softc->done_queue, io_req, links); passiocleanup(softc, io_req); uma_zfree(softc->pass_zone, io_req); } TAILQ_FOREACH_SAFE(io_req, &softc->incoming_queue, links, io_req2) { TAILQ_REMOVE(&softc->incoming_queue, io_req, links); passiocleanup(softc, io_req); uma_zfree(softc->pass_zone, io_req); } /* * If there are any active I/Os, we need to forcibly acquire a * reference to the peripheral so that we don't go away * before they complete. We'll release the reference when * the abandoned queue is empty. */ io_req = TAILQ_FIRST(&softc->active_queue); if ((io_req != NULL) && (softc->flags & PASS_FLAG_ABANDONED_REF_SET) == 0) { cam_periph_doacquire(periph); softc->flags |= PASS_FLAG_ABANDONED_REF_SET; } /* * Since the I/O in the active queue is not under our * control, just set a flag so that we can clean it up when * it completes and put it on the abandoned queue. This * will prevent our sending spurious completions in the * event that the device is opened again before these I/Os * complete. 
*/ TAILQ_FOREACH_SAFE(io_req, &softc->active_queue, links, io_req2) { TAILQ_REMOVE(&softc->active_queue, io_req, links); io_req->flags |= PASS_IO_ABANDONED; TAILQ_INSERT_TAIL(&softc->abandoned_queue, io_req, links); } } cam_periph_release_locked(periph); /* * We reference the lock directly here, instead of using * cam_periph_unlock(). The reason is that the call to * cam_periph_release_locked() above could result in the periph * getting freed. If that is the case, dereferencing the periph * with a cam_periph_unlock() call would cause a page fault. * * cam_periph_release() avoids this problem using the same method, * but we're manually acquiring and dropping the lock here to * protect the open count and avoid another lock acquisition and * release. */ mtx_unlock(mtx); return (0); } static void passstart(struct cam_periph *periph, union ccb *start_ccb) { struct pass_softc *softc; softc = (struct pass_softc *)periph->softc; switch (softc->state) { case PASS_STATE_NORMAL: { struct pass_io_req *io_req; /* * Check for any queued I/O requests that require an * allocated slot. */ io_req = TAILQ_FIRST(&softc->incoming_queue); if (io_req == NULL) { xpt_release_ccb(start_ccb); break; } TAILQ_REMOVE(&softc->incoming_queue, io_req, links); TAILQ_INSERT_TAIL(&softc->active_queue, io_req, links); /* * Merge the user's CCB into the allocated CCB. */ xpt_merge_ccb(start_ccb, &io_req->ccb); start_ccb->ccb_h.ccb_type = PASS_CCB_QUEUED_IO; start_ccb->ccb_h.ccb_ioreq = io_req; start_ccb->ccb_h.cbfcnp = passdone; io_req->alloced_ccb = start_ccb; binuptime(&io_req->start_time); devstat_start_transaction(softc->device_stats, &io_req->start_time); xpt_action(start_ccb); /* * If we have any more I/O waiting, schedule ourselves again. 
*/
		if (!TAILQ_EMPTY(&softc->incoming_queue))
			xpt_schedule(periph, CAM_PRIORITY_NORMAL);
		break;
	}
	default:
		break;
	}
}

/*
 * Completion callback for CCBs issued by passstart() (installed there as
 * cbfcnp).  Must be called with the periph lock held (asserted).
 *
 * For a PASS_CCB_QUEUED_IO request it:
 *   - runs passerror() when the CCB failed, the user asked for error
 *     recovery (CAM_PASS_ERR_RECOVER) and the request is not abandoned,
 *     returning immediately if a retry was scheduled (ERESTART);
 *   - copies the completed CCB into io_req->ccb;
 *   - ends the devstat transaction according to the CCB function code;
 *   - either moves the request to the done queue and notifies select/kqueue
 *     waiters, or, for an abandoned request, frees it, releases the CCB
 *     and drops the abandoned-I/O periph reference once that queue is empty.
 */
static void
passdone(struct cam_periph *periph, union ccb *done_ccb)
{ 
	struct pass_softc *softc;
	struct ccb_scsiio *csio;

	softc = (struct pass_softc *)periph->softc;

	cam_periph_assert(periph, MA_OWNED);

	csio = &done_ccb->csio;
	switch (csio->ccb_h.ccb_type) {
	case PASS_CCB_QUEUED_IO: {
		struct pass_io_req *io_req;

		/* Stored in the CCB's private area by passstart(). */
		io_req = done_ccb->ccb_h.ccb_ioreq;
#if 0
		xpt_print(periph->path, "%s: called for user CCB %p\n",
			  __func__, io_req->user_ccb_ptr);
#endif
		if (((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)
		 && (done_ccb->ccb_h.flags & CAM_PASS_ERR_RECOVER)
		 && ((io_req->flags & PASS_IO_ABANDONED) == 0)) {
			int error;

			error = passerror(done_ccb, CAM_RETRY_SELTO,
					  SF_RETRY_UA | SF_NO_PRINT);

			if (error == ERESTART) {
				/*
				 * A retry was scheduled, so
 				 * just return.
				 */
				return;
			}
		}

		/*
		 * Copy the allocated CCB contents back to the malloced CCB
		 * so we can give status back to the user when he requests it.
		 */
		bcopy(done_ccb, &io_req->ccb, sizeof(*done_ccb));

		/*
		 * Log data/transaction completion with devstat(9).
		 */
		switch (done_ccb->ccb_h.func_code) {
		case XPT_SCSI_IO:
			devstat_end_transaction(softc->device_stats,
			    done_ccb->csio.dxfer_len - done_ccb->csio.resid,
			    done_ccb->csio.tag_action & 0x3,
			    ((done_ccb->ccb_h.flags & CAM_DIR_MASK) ==
			    CAM_DIR_NONE) ? DEVSTAT_NO_DATA :
			    (done_ccb->ccb_h.flags & CAM_DIR_OUT) ?
			    DEVSTAT_WRITE : DEVSTAT_READ, NULL,
			    &io_req->start_time);
			break;
		case XPT_ATA_IO:
			devstat_end_transaction(softc->device_stats,
			    done_ccb->ataio.dxfer_len - done_ccb->ataio.resid,
			    0, /* Not used in ATA */
			    ((done_ccb->ccb_h.flags & CAM_DIR_MASK) ==
			    CAM_DIR_NONE) ? DEVSTAT_NO_DATA : 
			    (done_ccb->ccb_h.flags & CAM_DIR_OUT) ?
			    DEVSTAT_WRITE : DEVSTAT_READ, NULL,
			    &io_req->start_time);
			break;
		case XPT_SMP_IO:
			/*
			 * XXX KDM this isn't quite right, but there isn't
			 * currently an easy way to represent a bidirectional 
			 * transfer in devstat.  The only way to do it
			 * and have the byte counts come out right would
			 * mean that we would have to record two
			 * transactions, one for the request and one for the
			 * response.  For now, so that we report something,
			 * just treat the entire thing as a read.
			 */
			devstat_end_transaction(softc->device_stats,
			    done_ccb->smpio.smp_request_len +
			    done_ccb->smpio.smp_response_len,
			    DEVSTAT_TAG_SIMPLE, DEVSTAT_READ, NULL,
			    &io_req->start_time);
			break;
		default:
			/* Other function codes are logged as zero-byte, no-data. */
			devstat_end_transaction(softc->device_stats, 0,
			    DEVSTAT_TAG_NONE, DEVSTAT_NO_DATA, NULL,
			    &io_req->start_time);
			break;
		}

		/*
		 * In the normal case, take the completed I/O off of the
		 * active queue and put it on the done queue.  Notify the
		 * user that we have a completed I/O.
		 */
		if ((io_req->flags & PASS_IO_ABANDONED) == 0) {
			TAILQ_REMOVE(&softc->active_queue, io_req, links);
			TAILQ_INSERT_TAIL(&softc->done_queue, io_req, links);
			selwakeuppri(&softc->read_select, PRIBIO);
			KNOTE_LOCKED(&softc->read_select.si_note, 0);
		} else {
			/*
			 * In the case of an abandoned I/O (final close
			 * without fetching the I/O), take it off of the
			 * abandoned queue and free it.
			 */
			TAILQ_REMOVE(&softc->abandoned_queue, io_req, links);
			passiocleanup(softc, io_req);
			uma_zfree(softc->pass_zone, io_req);

			/*
			 * Release the done_ccb here, since we may wind up
			 * freeing the peripheral when we decrement the
			 * reference count below.
			 */
			xpt_release_ccb(done_ccb);

			/*
			 * If the abandoned queue is empty, we can release
			 * our reference to the periph since we won't have
			 * any more completions coming.
			 */
			if ((TAILQ_EMPTY(&softc->abandoned_queue))
			 && (softc->flags & PASS_FLAG_ABANDONED_REF_SET)) {
				softc->flags &= ~PASS_FLAG_ABANDONED_REF_SET;
				cam_periph_release_locked(periph);
			}

			/*
			 * We have already released the CCB, so we can
			 * return.
*/ return; } break; } } xpt_release_ccb(done_ccb); } static int passcreatezone(struct cam_periph *periph) { struct pass_softc *softc; int error; error = 0; softc = (struct pass_softc *)periph->softc; cam_periph_assert(periph, MA_OWNED); KASSERT(((softc->flags & PASS_FLAG_ZONE_VALID) == 0), ("%s called when the pass(4) zone is valid!\n", __func__)); KASSERT((softc->pass_zone == NULL), ("%s called when the pass(4) zone is allocated!\n", __func__)); if ((softc->flags & PASS_FLAG_ZONE_INPROG) == 0) { /* * We're the first context through, so we need to create * the pass(4) UMA zone for I/O requests. */ softc->flags |= PASS_FLAG_ZONE_INPROG; /* * uma_zcreate() does a blocking (M_WAITOK) allocation, * so we cannot hold a mutex while we call it. */ cam_periph_unlock(periph); softc->pass_zone = uma_zcreate(softc->zone_name, sizeof(struct pass_io_req), NULL, NULL, NULL, NULL, /*align*/ 0, /*flags*/ 0); softc->pass_io_zone = uma_zcreate(softc->io_zone_name, softc->io_zone_size, NULL, NULL, NULL, NULL, /*align*/ 0, /*flags*/ 0); cam_periph_lock(periph); if ((softc->pass_zone == NULL) || (softc->pass_io_zone == NULL)) { if (softc->pass_zone == NULL) xpt_print(periph->path, "unable to allocate " "IO Req UMA zone\n"); else xpt_print(periph->path, "unable to allocate " "IO UMA zone\n"); softc->flags &= ~PASS_FLAG_ZONE_INPROG; goto bailout; } /* * Set the flags appropriately and notify any other waiters. */ softc->flags &= PASS_FLAG_ZONE_INPROG; softc->flags |= PASS_FLAG_ZONE_VALID; wakeup(&softc->pass_zone); } else { /* * In this case, the UMA zone has not yet been created, but * another context is in the process of creating it. We * need to sleep until the creation is either done or has * failed. */ while ((softc->flags & PASS_FLAG_ZONE_INPROG) && ((softc->flags & PASS_FLAG_ZONE_VALID) == 0)) { error = msleep(&softc->pass_zone, cam_periph_mtx(periph), PRIBIO, "paszon", 0); if (error != 0) goto bailout; } /* * If the zone creation failed, no luck for the user. 
*/ if ((softc->flags & PASS_FLAG_ZONE_VALID) == 0){ error = ENOMEM; goto bailout; } } bailout: return (error); } static void passiocleanup(struct pass_softc *softc, struct pass_io_req *io_req) { union ccb *ccb; u_int8_t **data_ptrs[CAM_PERIPH_MAXMAPS]; int i, numbufs; ccb = &io_req->ccb; switch (ccb->ccb_h.func_code) { case XPT_DEV_MATCH: numbufs = min(io_req->num_bufs, 2); if (numbufs == 1) { data_ptrs[0] = (u_int8_t **)&ccb->cdm.matches; } else { data_ptrs[0] = (u_int8_t **)&ccb->cdm.patterns; data_ptrs[1] = (u_int8_t **)&ccb->cdm.matches; } break; case XPT_SCSI_IO: case XPT_CONT_TARGET_IO: data_ptrs[0] = &ccb->csio.data_ptr; numbufs = min(io_req->num_bufs, 1); break; case XPT_ATA_IO: data_ptrs[0] = &ccb->ataio.data_ptr; numbufs = min(io_req->num_bufs, 1); break; case XPT_SMP_IO: numbufs = min(io_req->num_bufs, 2); data_ptrs[0] = &ccb->smpio.smp_request; data_ptrs[1] = &ccb->smpio.smp_response; break; case XPT_DEV_ADVINFO: numbufs = min(io_req->num_bufs, 1); data_ptrs[0] = (uint8_t **)&ccb->cdai.buf; break; default: /* allow ourselves to be swapped once again */ return; break; /* NOTREACHED */ } if (io_req->flags & PASS_IO_USER_SEG_MALLOC) { free(io_req->user_segptr, M_SCSIPASS); io_req->user_segptr = NULL; } /* * We only want to free memory we malloced. 
*/ if (io_req->data_flags == CAM_DATA_VADDR) { for (i = 0; i < io_req->num_bufs; i++) { if (io_req->kern_bufs[i] == NULL) continue; free(io_req->kern_bufs[i], M_SCSIPASS); io_req->kern_bufs[i] = NULL; } } else if (io_req->data_flags == CAM_DATA_SG) { for (i = 0; i < io_req->num_kern_segs; i++) { if ((uint8_t *)(uintptr_t) io_req->kern_segptr[i].ds_addr == NULL) continue; uma_zfree(softc->pass_io_zone, (uint8_t *)(uintptr_t) io_req->kern_segptr[i].ds_addr); io_req->kern_segptr[i].ds_addr = 0; } } if (io_req->flags & PASS_IO_KERN_SEG_MALLOC) { free(io_req->kern_segptr, M_SCSIPASS); io_req->kern_segptr = NULL; } if (io_req->data_flags != CAM_DATA_PADDR) { for (i = 0; i < numbufs; i++) { /* * Restore the user's buffer pointers to their * previous values. */ if (io_req->user_bufs[i] != NULL) *data_ptrs[i] = io_req->user_bufs[i]; } } } static int passcopysglist(struct cam_periph *periph, struct pass_io_req *io_req, ccb_flags direction) { bus_size_t kern_watermark, user_watermark, len_copied, len_to_copy; bus_dma_segment_t *user_sglist, *kern_sglist; int i, j, error; error = 0; kern_watermark = 0; user_watermark = 0; len_to_copy = 0; len_copied = 0; user_sglist = io_req->user_segptr; kern_sglist = io_req->kern_segptr; for (i = 0, j = 0; i < io_req->num_user_segs && j < io_req->num_kern_segs;) { uint8_t *user_ptr, *kern_ptr; len_to_copy = min(user_sglist[i].ds_len -user_watermark, kern_sglist[j].ds_len - kern_watermark); user_ptr = (uint8_t *)(uintptr_t)user_sglist[i].ds_addr; user_ptr = user_ptr + user_watermark; kern_ptr = (uint8_t *)(uintptr_t)kern_sglist[j].ds_addr; kern_ptr = kern_ptr + kern_watermark; user_watermark += len_to_copy; kern_watermark += len_to_copy; if (!useracc(user_ptr, len_to_copy, (direction == CAM_DIR_IN) ? 
VM_PROT_WRITE : VM_PROT_READ)) { xpt_print(periph->path, "%s: unable to access user " "S/G list element %p len %zu\n", __func__, user_ptr, len_to_copy); error = EFAULT; goto bailout; } if (direction == CAM_DIR_IN) { error = copyout(kern_ptr, user_ptr, len_to_copy); if (error != 0) { xpt_print(periph->path, "%s: copyout of %u " "bytes from %p to %p failed with " "error %d\n", __func__, len_to_copy, kern_ptr, user_ptr, error); goto bailout; } } else { error = copyin(user_ptr, kern_ptr, len_to_copy); if (error != 0) { xpt_print(periph->path, "%s: copyin of %u " "bytes from %p to %p failed with " "error %d\n", __func__, len_to_copy, user_ptr, kern_ptr, error); goto bailout; } } len_copied += len_to_copy; if (user_sglist[i].ds_len == user_watermark) { i++; user_watermark = 0; } if (kern_sglist[j].ds_len == kern_watermark) { j++; kern_watermark = 0; } } bailout: return (error); } static int passmemsetup(struct cam_periph *periph, struct pass_io_req *io_req) { union ccb *ccb; struct pass_softc *softc; int numbufs, i; uint8_t **data_ptrs[CAM_PERIPH_MAXMAPS]; uint32_t lengths[CAM_PERIPH_MAXMAPS]; uint32_t dirs[CAM_PERIPH_MAXMAPS]; uint32_t num_segs; uint16_t *seg_cnt_ptr; size_t maxmap; int error; cam_periph_assert(periph, MA_NOTOWNED); softc = periph->softc; error = 0; ccb = &io_req->ccb; maxmap = 0; num_segs = 0; seg_cnt_ptr = NULL; switch(ccb->ccb_h.func_code) { case XPT_DEV_MATCH: if (ccb->cdm.match_buf_len == 0) { printf("%s: invalid match buffer length 0\n", __func__); return(EINVAL); } if (ccb->cdm.pattern_buf_len > 0) { data_ptrs[0] = (u_int8_t **)&ccb->cdm.patterns; lengths[0] = ccb->cdm.pattern_buf_len; dirs[0] = CAM_DIR_OUT; data_ptrs[1] = (u_int8_t **)&ccb->cdm.matches; lengths[1] = ccb->cdm.match_buf_len; dirs[1] = CAM_DIR_IN; numbufs = 2; } else { data_ptrs[0] = (u_int8_t **)&ccb->cdm.matches; lengths[0] = ccb->cdm.match_buf_len; dirs[0] = CAM_DIR_IN; numbufs = 1; } io_req->data_flags = CAM_DATA_VADDR; break; case XPT_SCSI_IO: case XPT_CONT_TARGET_IO: if 
((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) return(0); /* * The user shouldn't be able to supply a bio. */ if ((ccb->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_BIO) return (EINVAL); io_req->data_flags = ccb->ccb_h.flags & CAM_DATA_MASK; data_ptrs[0] = &ccb->csio.data_ptr; lengths[0] = ccb->csio.dxfer_len; dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK; num_segs = ccb->csio.sglist_cnt; seg_cnt_ptr = &ccb->csio.sglist_cnt; numbufs = 1; maxmap = softc->maxio; break; case XPT_ATA_IO: if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) return(0); /* * We only support a single virtual address for ATA I/O. */ if ((ccb->ccb_h.flags & CAM_DATA_MASK) != CAM_DATA_VADDR) return (EINVAL); io_req->data_flags = CAM_DATA_VADDR; data_ptrs[0] = &ccb->ataio.data_ptr; lengths[0] = ccb->ataio.dxfer_len; dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK; numbufs = 1; maxmap = softc->maxio; break; case XPT_SMP_IO: io_req->data_flags = CAM_DATA_VADDR; data_ptrs[0] = &ccb->smpio.smp_request; lengths[0] = ccb->smpio.smp_request_len; dirs[0] = CAM_DIR_OUT; data_ptrs[1] = &ccb->smpio.smp_response; lengths[1] = ccb->smpio.smp_response_len; dirs[1] = CAM_DIR_IN; numbufs = 2; maxmap = softc->maxio; break; case XPT_DEV_ADVINFO: if (ccb->cdai.bufsiz == 0) return (0); io_req->data_flags = CAM_DATA_VADDR; data_ptrs[0] = (uint8_t **)&ccb->cdai.buf; lengths[0] = ccb->cdai.bufsiz; dirs[0] = CAM_DIR_IN; numbufs = 1; break; default: return(EINVAL); break; /* NOTREACHED */ } io_req->num_bufs = numbufs; /* * If there is a maximum, check to make sure that the user's * request fits within the limit. In general, we should only have * a maximum length for requests that go to hardware. Otherwise it * is whatever we're able to malloc. 
*/ for (i = 0; i < numbufs; i++) { io_req->user_bufs[i] = *data_ptrs[i]; io_req->dirs[i] = dirs[i]; io_req->lengths[i] = lengths[i]; if (maxmap == 0) continue; if (lengths[i] <= maxmap) continue; xpt_print(periph->path, "%s: data length %u > max allowed %u " "bytes\n", __func__, lengths[i], maxmap); error = EINVAL; goto bailout; } switch (io_req->data_flags) { case CAM_DATA_VADDR: /* Map or copy the buffer into kernel address space */ for (i = 0; i < numbufs; i++) { uint8_t *tmp_buf; /* * If for some reason no length is specified, we * don't need to allocate anything. */ if (io_req->lengths[i] == 0) continue; /* * Make sure that the user's buffer is accessible * to that process. */ if (!useracc(io_req->user_bufs[i], io_req->lengths[i], (io_req->dirs[i] == CAM_DIR_IN) ? VM_PROT_WRITE : VM_PROT_READ)) { xpt_print(periph->path, "%s: user address %p " "length %u is not accessible\n", __func__, io_req->user_bufs[i], io_req->lengths[i]); error = EFAULT; goto bailout; } tmp_buf = malloc(lengths[i], M_SCSIPASS, M_WAITOK | M_ZERO); io_req->kern_bufs[i] = tmp_buf; *data_ptrs[i] = tmp_buf; #if 0 xpt_print(periph->path, "%s: malloced %p len %u, user " "buffer %p, operation: %s\n", __func__, tmp_buf, lengths[i], io_req->user_bufs[i], (dirs[i] == CAM_DIR_IN) ? "read" : "write"); #endif /* * We only need to copy in if the user is writing. */ if (dirs[i] != CAM_DIR_OUT) continue; error = copyin(io_req->user_bufs[i], io_req->kern_bufs[i], lengths[i]); if (error != 0) { xpt_print(periph->path, "%s: copy of user " "buffer from %p to %p failed with " "error %d\n", __func__, io_req->user_bufs[i], io_req->kern_bufs[i], error); goto bailout; } } break; case CAM_DATA_PADDR: /* Pass down the pointer as-is */ break; case CAM_DATA_SG: { size_t sg_length, size_to_go, alloc_size; uint32_t num_segs_needed; /* * Copy the user S/G list in, and then copy in the * individual segments. */ /* * We shouldn't see this, but check just in case. 
*/ if (numbufs != 1) { xpt_print(periph->path, "%s: cannot currently handle " "more than one S/G list per CCB\n", __func__); error = EINVAL; goto bailout; } /* * We have to have at least one segment. */ if (num_segs == 0) { xpt_print(periph->path, "%s: CAM_DATA_SG flag set, " "but sglist_cnt=0!\n", __func__); error = EINVAL; goto bailout; } /* * Make sure the user specified the total length and didn't * just leave it to us to decode the S/G list. */ if (lengths[0] == 0) { xpt_print(periph->path, "%s: no dxfer_len specified, " "but CAM_DATA_SG flag is set!\n", __func__); error = EINVAL; goto bailout; } /* * We allocate buffers in io_zone_size increments for an * S/G list. This will generally be MAXPHYS. */ if (lengths[0] <= softc->io_zone_size) num_segs_needed = 1; else { num_segs_needed = lengths[0] / softc->io_zone_size; if ((lengths[0] % softc->io_zone_size) != 0) num_segs_needed++; } /* Figure out the size of the S/G list */ sg_length = num_segs * sizeof(bus_dma_segment_t); io_req->num_user_segs = num_segs; io_req->num_kern_segs = num_segs_needed; /* Save the user's S/G list pointer for later restoration */ io_req->user_bufs[0] = *data_ptrs[0]; /* * If we have enough segments allocated by default to handle * the length of the user's S/G list, */ if (num_segs > PASS_MAX_SEGS) { io_req->user_segptr = malloc(sizeof(bus_dma_segment_t) * num_segs, M_SCSIPASS, M_WAITOK | M_ZERO); io_req->flags |= PASS_IO_USER_SEG_MALLOC; } else io_req->user_segptr = io_req->user_segs; if (!useracc(*data_ptrs[0], sg_length, VM_PROT_READ)) { xpt_print(periph->path, "%s: unable to access user " "S/G list at %p\n", __func__, *data_ptrs[0]); error = EFAULT; goto bailout; } error = copyin(*data_ptrs[0], io_req->user_segptr, sg_length); if (error != 0) { xpt_print(periph->path, "%s: copy of user S/G list " "from %p to %p failed with error %d\n", __func__, *data_ptrs[0], io_req->user_segptr, error); goto bailout; } if (num_segs_needed > PASS_MAX_SEGS) { io_req->kern_segptr = 
malloc(sizeof(bus_dma_segment_t) * num_segs_needed, M_SCSIPASS, M_WAITOK | M_ZERO); io_req->flags |= PASS_IO_KERN_SEG_MALLOC; } else { io_req->kern_segptr = io_req->kern_segs; } /* * Allocate the kernel S/G list. */ for (size_to_go = lengths[0], i = 0; size_to_go > 0 && i < num_segs_needed; i++, size_to_go -= alloc_size) { uint8_t *kern_ptr; alloc_size = min(size_to_go, softc->io_zone_size); kern_ptr = uma_zalloc(softc->pass_io_zone, M_WAITOK); io_req->kern_segptr[i].ds_addr = (bus_addr_t)(uintptr_t)kern_ptr; io_req->kern_segptr[i].ds_len = alloc_size; } if (size_to_go > 0) { printf("%s: size_to_go = %zu, software error!\n", __func__, size_to_go); error = EINVAL; goto bailout; } *data_ptrs[0] = (uint8_t *)io_req->kern_segptr; *seg_cnt_ptr = io_req->num_kern_segs; /* * We only need to copy data here if the user is writing. */ if (dirs[0] == CAM_DIR_OUT) error = passcopysglist(periph, io_req, dirs[0]); break; } case CAM_DATA_SG_PADDR: { size_t sg_length; /* * We shouldn't see this, but check just in case. */ if (numbufs != 1) { printf("%s: cannot currently handle more than one " "S/G list per CCB\n", __func__); error = EINVAL; goto bailout; } /* * We have to have at least one segment. */ if (num_segs == 0) { xpt_print(periph->path, "%s: CAM_DATA_SG_PADDR flag " "set, but sglist_cnt=0!\n", __func__); error = EINVAL; goto bailout; } /* * Make sure the user specified the total length and didn't * just leave it to us to decode the S/G list. 
*/ if (lengths[0] == 0) { xpt_print(periph->path, "%s: no dxfer_len specified, " "but CAM_DATA_SG flag is set!\n", __func__); error = EINVAL; goto bailout; } /* Figure out the size of the S/G list */ sg_length = num_segs * sizeof(bus_dma_segment_t); io_req->num_user_segs = num_segs; io_req->num_kern_segs = io_req->num_user_segs; /* Save the user's S/G list pointer for later restoration */ io_req->user_bufs[0] = *data_ptrs[0]; if (num_segs > PASS_MAX_SEGS) { io_req->user_segptr = malloc(sizeof(bus_dma_segment_t) * num_segs, M_SCSIPASS, M_WAITOK | M_ZERO); io_req->flags |= PASS_IO_USER_SEG_MALLOC; } else io_req->user_segptr = io_req->user_segs; io_req->kern_segptr = io_req->user_segptr; error = copyin(*data_ptrs[0], io_req->user_segptr, sg_length); if (error != 0) { xpt_print(periph->path, "%s: copy of user S/G list " "from %p to %p failed with error %d\n", __func__, *data_ptrs[0], io_req->user_segptr, error); goto bailout; } break; } default: case CAM_DATA_BIO: /* * A user shouldn't be attaching a bio to the CCB. It * isn't a user-accessible structure. */ error = EINVAL; break; } bailout: if (error != 0) passiocleanup(softc, io_req); return (error); } static int passmemdone(struct cam_periph *periph, struct pass_io_req *io_req) { struct pass_softc *softc; union ccb *ccb; int error; int i; error = 0; softc = (struct pass_softc *)periph->softc; ccb = &io_req->ccb; switch (io_req->data_flags) { case CAM_DATA_VADDR: /* * Copy back to the user buffer if this was a read. */ for (i = 0; i < io_req->num_bufs; i++) { if (io_req->dirs[i] != CAM_DIR_IN) continue; error = copyout(io_req->kern_bufs[i], io_req->user_bufs[i], io_req->lengths[i]); if (error != 0) { xpt_print(periph->path, "Unable to copy %u " "bytes from %p to user address %p\n", io_req->lengths[i], io_req->kern_bufs[i], io_req->user_bufs[i]); goto bailout; } } break; case CAM_DATA_PADDR: /* Do nothing. 
The pointer is a physical address already */ break; case CAM_DATA_SG: /* * Copy back to the user buffer if this was a read. * Restore the user's S/G list buffer pointer. */ if (io_req->dirs[0] == CAM_DIR_IN) error = passcopysglist(periph, io_req, io_req->dirs[0]); break; case CAM_DATA_SG_PADDR: /* * Restore the user's S/G list buffer pointer. No need to * copy. */ break; default: case CAM_DATA_BIO: error = EINVAL; break; } bailout: /* * Reset the user's pointers to their original values and free * allocated memory. */ passiocleanup(softc, io_req); return (error); } static int passioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { int error; if ((error = passdoioctl(dev, cmd, addr, flag, td)) == ENOTTY) { error = cam_compat_ioctl(dev, cmd, addr, flag, td, passdoioctl); } return (error); } static int passdoioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { struct cam_periph *periph; struct pass_softc *softc; int error; uint32_t priority; periph = (struct cam_periph *)dev->si_drv1; cam_periph_lock(periph); softc = (struct pass_softc *)periph->softc; error = 0; switch (cmd) { case CAMIOCOMMAND: { union ccb *inccb; union ccb *ccb; int ccb_malloced; inccb = (union ccb *)addr; /* * Some CCB types, like scan bus and scan lun can only go * through the transport layer device. */ if (inccb->ccb_h.func_code & XPT_FC_XPT_ONLY) { xpt_print(periph->path, "CCB function code %#x is " "restricted to the XPT device\n", inccb->ccb_h.func_code); error = ENODEV; break; } /* Compatibility for RL/priority-unaware code. */ priority = inccb->ccb_h.pinfo.priority; if (priority <= CAM_PRIORITY_OOB) priority += CAM_PRIORITY_OOB + 1; /* * Non-immediate CCBs need a CCB from the per-device pool * of CCBs, which is scheduled by the transport layer. * Immediate CCBs and user-supplied CCBs should just be * malloced. 
*/ if ((inccb->ccb_h.func_code & XPT_FC_QUEUED) && ((inccb->ccb_h.func_code & XPT_FC_USER_CCB) == 0)) { ccb = cam_periph_getccb(periph, priority); ccb_malloced = 0; } else { ccb = xpt_alloc_ccb_nowait(); if (ccb != NULL) xpt_setup_ccb(&ccb->ccb_h, periph->path, priority); ccb_malloced = 1; } if (ccb == NULL) { xpt_print(periph->path, "unable to allocate CCB\n"); error = ENOMEM; break; } error = passsendccb(periph, ccb, inccb); if (ccb_malloced) xpt_free_ccb(ccb); else xpt_release_ccb(ccb); break; } case CAMIOQUEUE: { struct pass_io_req *io_req; union ccb **user_ccb, *ccb; xpt_opcode fc; if ((softc->flags & PASS_FLAG_ZONE_VALID) == 0) { error = passcreatezone(periph); if (error != 0) goto bailout; } /* * We're going to do a blocking allocation for this I/O * request, so we have to drop the lock. */ cam_periph_unlock(periph); io_req = uma_zalloc(softc->pass_zone, M_WAITOK | M_ZERO); ccb = &io_req->ccb; user_ccb = (union ccb **)addr; /* * Unlike the CAMIOCOMMAND ioctl above, we only have a * pointer to the user's CCB, so we have to copy the whole * thing in to a buffer we have allocated (above) instead * of allowing the ioctl code to malloc a buffer and copy * it in. * * This is an advantage for this asynchronous interface, * since we don't want the memory to get freed while the * CCB is outstanding. */ #if 0 xpt_print(periph->path, "Copying user CCB %p to " "kernel address %p\n", *user_ccb, ccb); #endif error = copyin(*user_ccb, ccb, sizeof(*ccb)); if (error != 0) { xpt_print(periph->path, "Copy of user CCB %p to " "kernel address %p failed with error %d\n", *user_ccb, ccb, error); uma_zfree(softc->pass_zone, io_req); cam_periph_lock(periph); break; } /* * Some CCB types, like scan bus and scan lun can only go * through the transport layer device. 
*/ if (ccb->ccb_h.func_code & XPT_FC_XPT_ONLY) { xpt_print(periph->path, "CCB function code %#x is " "restricted to the XPT device\n", ccb->ccb_h.func_code); uma_zfree(softc->pass_zone, io_req); cam_periph_lock(periph); error = ENODEV; break; } /* * Save the user's CCB pointer as well as his linked list * pointers and peripheral private area so that we can * restore these later. */ io_req->user_ccb_ptr = *user_ccb; io_req->user_periph_links = ccb->ccb_h.periph_links; io_req->user_periph_priv = ccb->ccb_h.periph_priv; /* * Now that we've saved the user's values, we can set our * own peripheral private entry. */ ccb->ccb_h.ccb_ioreq = io_req; /* Compatibility for RL/priority-unaware code. */ priority = ccb->ccb_h.pinfo.priority; if (priority <= CAM_PRIORITY_OOB) priority += CAM_PRIORITY_OOB + 1; /* * Setup fields in the CCB like the path and the priority. * The path in particular cannot be done in userland, since * it is a pointer to a kernel data structure. */ xpt_setup_ccb_flags(&ccb->ccb_h, periph->path, priority, ccb->ccb_h.flags); /* * Setup our done routine. There is no way for the user to * have a valid pointer here. */ ccb->ccb_h.cbfcnp = passdone; fc = ccb->ccb_h.func_code; /* * If this function code has memory that can be mapped in * or out, we need to call passmemsetup(). */ if ((fc == XPT_SCSI_IO) || (fc == XPT_ATA_IO) || (fc == XPT_SMP_IO) || (fc == XPT_DEV_MATCH) || (fc == XPT_DEV_ADVINFO)) { error = passmemsetup(periph, io_req); if (error != 0) { uma_zfree(softc->pass_zone, io_req); cam_periph_lock(periph); break; } } else io_req->mapinfo.num_bufs_used = 0; cam_periph_lock(periph); /* * Everything goes on the incoming queue initially. */ TAILQ_INSERT_TAIL(&softc->incoming_queue, io_req, links); /* * If the CCB is queued, and is not a user CCB, then * we need to allocate a slot for it. Call xpt_schedule() * so that our start routine will get called when a CCB is * available. 
*/ if ((fc & XPT_FC_QUEUED) && ((fc & XPT_FC_USER_CCB) == 0)) { xpt_schedule(periph, priority); break; } /* * At this point, the CCB in question is either an * immediate CCB (like XPT_DEV_ADVINFO) or it is a user CCB * and therefore should be malloced, not allocated via a slot. * Remove the CCB from the incoming queue and add it to the * active queue. */ TAILQ_REMOVE(&softc->incoming_queue, io_req, links); TAILQ_INSERT_TAIL(&softc->active_queue, io_req, links); xpt_action(ccb); /* * If this is not a queued CCB (i.e. it is an immediate CCB), * then it is already done. We need to put it on the done * queue for the user to fetch. */ if ((fc & XPT_FC_QUEUED) == 0) { TAILQ_REMOVE(&softc->active_queue, io_req, links); TAILQ_INSERT_TAIL(&softc->done_queue, io_req, links); } break; } case CAMIOGET: { union ccb **user_ccb; struct pass_io_req *io_req; int old_error; user_ccb = (union ccb **)addr; old_error = 0; io_req = TAILQ_FIRST(&softc->done_queue); if (io_req == NULL) { error = ENOENT; break; } /* * Remove the I/O from the done queue. */ TAILQ_REMOVE(&softc->done_queue, io_req, links); /* * We have to drop the lock during the copyout because the * copyout can result in VM faults that require sleeping. */ cam_periph_unlock(periph); /* * Do any needed copies (e.g. for reads) and revert the * pointers in the CCB back to the user's pointers. */ error = passmemdone(periph, io_req); old_error = error; io_req->ccb.ccb_h.periph_links = io_req->user_periph_links; io_req->ccb.ccb_h.periph_priv = io_req->user_periph_priv; #if 0 xpt_print(periph->path, "Copying to user CCB %p from " "kernel address %p\n", *user_ccb, &io_req->ccb); #endif error = copyout(&io_req->ccb, *user_ccb, sizeof(union ccb)); if (error != 0) { xpt_print(periph->path, "Copy to user CCB %p from " "kernel address %p failed with error %d\n", *user_ccb, &io_req->ccb, error); } /* * Prefer the first error we got back, and make sure we * don't overwrite bad status with good. 
*/ if (old_error != 0) error = old_error; cam_periph_lock(periph); /* * At this point, if there was an error, we could potentially * re-queue the I/O and try again. But why? The error * would almost certainly happen again. We might as well * not leak memory. */ uma_zfree(softc->pass_zone, io_req); break; } default: error = cam_periph_ioctl(periph, cmd, addr, passerror); break; } bailout: cam_periph_unlock(periph); return(error); } static int passpoll(struct cdev *dev, int poll_events, struct thread *td) { struct cam_periph *periph; struct pass_softc *softc; int revents; periph = (struct cam_periph *)dev->si_drv1; softc = (struct pass_softc *)periph->softc; revents = poll_events & (POLLOUT | POLLWRNORM); if ((poll_events & (POLLIN | POLLRDNORM)) != 0) { cam_periph_lock(periph); if (!TAILQ_EMPTY(&softc->done_queue)) { revents |= poll_events & (POLLIN | POLLRDNORM); } cam_periph_unlock(periph); if (revents == 0) selrecord(td, &softc->read_select); } return (revents); } static int passkqfilter(struct cdev *dev, struct knote *kn) { struct cam_periph *periph; struct pass_softc *softc; periph = (struct cam_periph *)dev->si_drv1; softc = (struct pass_softc *)periph->softc; kn->kn_hook = (caddr_t)periph; kn->kn_fop = &passread_filtops; knlist_add(&softc->read_select.si_note, kn, 0); return (0); } static void passreadfiltdetach(struct knote *kn) { struct cam_periph *periph; struct pass_softc *softc; periph = (struct cam_periph *)kn->kn_hook; softc = (struct pass_softc *)periph->softc; knlist_remove(&softc->read_select.si_note, kn, 0); } static int passreadfilt(struct knote *kn, long hint) { struct cam_periph *periph; struct pass_softc *softc; int retval; periph = (struct cam_periph *)kn->kn_hook; softc = (struct pass_softc *)periph->softc; cam_periph_assert(periph, MA_OWNED); if (TAILQ_EMPTY(&softc->done_queue)) retval = 0; else retval = 1; return (retval); } /* * Generally, "ccb" should be the CCB supplied by the kernel. 
"inccb" * should be the CCB that is copied in from the user. */ static int passsendccb(struct cam_periph *periph, union ccb *ccb, union ccb *inccb) { struct pass_softc *softc; struct cam_periph_map_info mapinfo; xpt_opcode fc; int error; softc = (struct pass_softc *)periph->softc; /* * There are some fields in the CCB header that need to be * preserved, the rest we get from the user. */ xpt_merge_ccb(ccb, inccb); /* */ ccb->ccb_h.cbfcnp = passdone; /* * Let cam_periph_mapmem do a sanity check on the data pointer format. * Even if no data transfer is needed, it's a cheap check and it * simplifies the code. */ fc = ccb->ccb_h.func_code; if ((fc == XPT_SCSI_IO) || (fc == XPT_ATA_IO) || (fc == XPT_SMP_IO) || (fc == XPT_DEV_MATCH) || (fc == XPT_DEV_ADVINFO)) { bzero(&mapinfo, sizeof(mapinfo)); /* * cam_periph_mapmem calls into proc and vm functions that can * sleep as well as trigger I/O, so we can't hold the lock. * Dropping it here is reasonably safe. */ cam_periph_unlock(periph); error = cam_periph_mapmem(ccb, &mapinfo, softc->maxio); cam_periph_lock(periph); /* * cam_periph_mapmem returned an error, we can't continue. * Return the error to the user. */ if (error) return(error); } else /* Ensure that the unmap call later on is a no-op. */ mapinfo.num_bufs_used = 0; /* * If the user wants us to perform any error recovery, then honor * that request. Otherwise, it's up to the user to perform any * error recovery. */ cam_periph_runccb(ccb, passerror, /* cam_flags */ CAM_RETRY_SELTO, /* sense_flags */ ((ccb->ccb_h.flags & CAM_PASS_ERR_RECOVER) ? 
SF_RETRY_UA : SF_NO_RECOVERY) | SF_NO_PRINT, softc->device_stats); cam_periph_unmapmem(ccb, &mapinfo); ccb->ccb_h.cbfcnp = NULL; ccb->ccb_h.periph_priv = inccb->ccb_h.periph_priv; bcopy(ccb, inccb, sizeof(union ccb)); return(0); } static int passerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags) { struct cam_periph *periph; struct pass_softc *softc; periph = xpt_path_periph(ccb->ccb_h.path); softc = (struct pass_softc *)periph->softc; return(cam_periph_error(ccb, cam_flags, sense_flags, &softc->saved_ccb)); } Index: head/sys/modules/cam/Makefile =================================================================== --- head/sys/modules/cam/Makefile (revision 299863) +++ head/sys/modules/cam/Makefile (revision 299864) @@ -1,47 +1,46 @@ # $FreeBSD$ S= ${.CURDIR}/../.. .PATH: $S/cam $S/cam/scsi $S/cam/ata $S/${MACHINE}/${MACHINE} KMOD= cam # See sys/conf/options for the flags that go into the different opt_*.h files. SRCS= opt_cam.h SRCS+= opt_ada.h SRCS+= opt_scsi.h SRCS+= opt_cd.h -SRCS+= opt_kdtrace.h SRCS+= opt_pt.h SRCS+= opt_sa.h SRCS+= opt_ses.h SRCS+= device_if.h bus_if.h vnode_if.h SRCS+= cam.c SRCS+= cam_compat.c .if exists($S/${MACHINE}/${MACHINE}/cam_machdep.c) SRCS+= cam_machdep.c .endif SRCS+= cam_iosched.c cam_periph.c cam_queue.c cam_sim.c cam_xpt.c SRCS+= scsi_all.c scsi_cd.c scsi_ch.c SRCS+= scsi_da.c SRCS+= scsi_pass.c SRCS+= scsi_pt.c SRCS+= scsi_sa.c SRCS+= scsi_enc.c SRCS+= scsi_enc_ses.c SRCS+= scsi_enc_safte.c SRCS+= scsi_sg.c SRCS+= scsi_targ_bh.c scsi_target.c SRCS+= scsi_xpt.c SRCS+= smp_all.c SRCS+= ata_all.c SRCS+= ata_xpt.c SRCS+= ata_da.c .if exists($S/${MACHINE}/${MACHINE}/ata_machdep.c) SRCS+= ata_machdep.c .endif SRCS+= ata_pmp.c EXPORT_SYMS= YES # XXX evaluate .include Index: head/sys/modules/tcp/fastpath/Makefile =================================================================== --- head/sys/modules/tcp/fastpath/Makefile (revision 299863) +++ head/sys/modules/tcp/fastpath/Makefile (revision 299864) @@ -1,18 +1,18 @@ # # 
$FreeBSD$ # .PATH: ${.CURDIR}/../../../netinet/tcp_stacks KMOD= fastpath SRCS= fastpath.c -SRCS+= opt_ipfw.h opt_inet.h opt_inet6.h opt_ipsec.h opt_kdtrace.h +SRCS+= opt_ipfw.h opt_inet.h opt_inet6.h opt_ipsec.h SRCS+= opt_tcpdebug.h # # Enable full debugging # #CFLAGS += -g .include Index: head/sys/netinet/tcp_stacks/fastpath.c =================================================================== --- head/sys/netinet/tcp_stacks/fastpath.c (revision 299863) +++ head/sys/netinet/tcp_stacks/fastpath.c (revision 299864) @@ -1,2455 +1,2454 @@ /*- * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * Copyright (c) 2007-2008,2010 * Swinburne University of Technology, Melbourne, Australia. * Copyright (c) 2009-2010 Lawrence Stewart * Copyright (c) 2010 The FreeBSD Foundation * Copyright (c) 2010-2011 Juniper Networks, Inc. * Copyright (c) 2015 Netflix Inc. * All rights reserved. * * Portions of this software were developed at the Centre for Advanced Internet * Architectures, Swinburne University of Technology, by Lawrence Stewart, * James Healy and David Hayes, made possible in part by a grant from the Cisco * University Research Program Fund at Community Foundation Silicon Valley. * * Portions of this software were developed at the Centre for Advanced * Internet Architectures, Swinburne University of Technology, Melbourne, * Australia by David Hayes under sponsorship from the FreeBSD Foundation. * * Portions of this software were developed by Robert N. M. Watson under * contract to Juniper Networks, Inc. * * Portions of this software were developed by Randall R. Stewart while * working for Netflix Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_ipfw.h" /* for ipfw_fwd */ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" -#include "opt_kdtrace.h" #include "opt_tcpdebug.h" #include #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include /* before tcp_seq.h, for tcp_random18() */ #include #include #include #define TCPSTATES /* for logging */ #include #include #include #include #include #include /* required for icmp_var.h */ #include /* for ICMP_BANDLIM */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPDEBUG #include #endif /* TCPDEBUG */ #ifdef TCP_OFFLOAD #include #endif #ifdef IPSEC #include #include #endif /*IPSEC*/ #include #include VNET_DECLARE(int, tcp_autorcvbuf_inc); #define V_tcp_autorcvbuf_inc VNET(tcp_autorcvbuf_inc) VNET_DECLARE(int, tcp_autorcvbuf_max); #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) VNET_DECLARE(int, tcp_do_rfc3042); #define V_tcp_do_rfc3042 VNET(tcp_do_rfc3042) VNET_DECLARE(int, tcp_do_autorcvbuf); #define V_tcp_do_autorcvbuf VNET(tcp_do_autorcvbuf) VNET_DECLARE(int, tcp_insecure_rst); #define V_tcp_insecure_rst VNET(tcp_insecure_rst) VNET_DECLARE(int, tcp_insecure_syn); #define V_tcp_insecure_syn VNET(tcp_insecure_syn) static void tcp_do_segment_fastslow(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, int); static void tcp_do_segment_fastack(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, int); /* * Indicate whether this ack should be delayed. We can delay the ack if * following conditions are met: * - There is no delayed ack timer in progress. * - Our last ack wasn't a 0-sized window. We never want to delay * the ack that opens up a 0-sized window. * - LRO wasn't used for this segment. 
We make sure by checking that the * segment size is not larger than the MSS. */ #define DELAY_ACK(tp, tlen) \ ((!tcp_timer_active(tp, TT_DELACK) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) && \ (tlen <= tp->t_maxseg) && \ (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) /* * So how is this faster than the normal fast ack? * It basically allows us to also stay in the fastpath * when a window-update ack also arrives. In testing * we saw only 25-30% of connections doing fastpath * due to the fact that along with moving forward * in sequence the window was also updated. */ static void tcp_do_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, int ti_locked, u_long tiwin) { int acked; int winup_only=0; #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif /* * The following if statement will be true if * we are doing the win_up_in_fp * - We have more new data (SEQ_LT(tp->snd_wl1, th->th_seq)) * - No more new data, but we have an ack for new data * (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack)) * - No more new data, the same ack point but the window grew * (tp->snd_wl1 == th->th_seq && tp->snd_wl2 == th->th_ack && twin > tp->snd_wnd) */ if ((SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) { winup_only = 1; TCPSTAT_INC(tcps_rcvwinupd); } tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; } /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. 
* NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * This is a pure ack for outstanding data. */ if (ti_locked == TI_RLOCKED) { INP_INFO_RUNLOCK(&V_tcbinfo); } ti_locked = TI_UNLOCKED; TCPSTAT_INC(tcps_predack); /* * "bad retransmit" recovery. */ if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && (int)(ticks - tp->t_badrxtwin) < 0) { cc_cong_signal(tp, th, CC_RTO_ERR); } /* * Recalculate the transmit timer / rtt. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { u_int t; t = tcp_ts_getticks() - to->to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } if (winup_only == 0) { acked = BYTES_THIS_ACK(tp, th); /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, to); TCPSTAT_ADD(tcps_rcvackbyte, acked); sbdrop(&so->so_snd, acked); if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; /* * Let the congestion control algorithm update * congestion control related information. This * typically means increasing the congestion * window. */ cc_ack_received(tp, th, CC_ACK); tp->snd_una = th->th_ack; /* * Pull snd_wl2 up to prevent seq wrap relative * to th_ack. */ tp->snd_wl2 = th->th_ack; tp->t_dupacks = 0; /* * If all outstanding data are acked, stop * retransmit timer, otherwise restart timer * using current (possibly backed-off) value. 
* If process is waiting for space, * wakeup/selwakeup/signal. If data * are ready to send, let tcp_output * decide between more output or persist. */ #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, mtod(m, const char *)); m_freem(m); if (tp->snd_una == tp->snd_max) tcp_timer_activate(tp, TT_REXMT, 0); else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); } else { /* * Window update only, just free the mbufs and * send out whatever we can. */ m_freem(m); } sowwakeup(so); if (sbavail(&so->so_snd)) (void) tcp_output(tp); KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", __func__, ti_locked)); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); } INP_WUNLOCK(tp->t_inpcb); } /* * tcp_do_fastnewdata: handler for the fast-data path. * Nothing here is really faster; it's just that the fast-data path * has been broken out into its own function, just like the fast-ack. * * In order, the function: * - records the peer timestamp when TOF_TS is set and th_seq is at or * before last_ack_sent; * - releases the tcbinfo read lock if ti_locked is TI_RLOCKED; * - advances rcv_nxt by tlen and pulls snd_wl1 and rcv_up along; * - optionally picks a larger receive buffer size (newsize); * - appends the data to so_rcv after trimming drop_hdrlen bytes, or * frees m when SBS_CANTRCVMORE is set; * - sets TF_DELACK, or sets TF_ACKNOW and calls tcp_output(); * - arms the TT_DELACK timer if TF_DELACK is set, then drops the * inpcb write lock before returning. * * The tiwin parameter is not referenced in this function. */ static void tcp_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, int ti_locked, u_long tiwin) { int newsize = 0; /* automatic sockbuf scaling */ #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif /* * If last ACK falls within this segment's sequence numbers, * record the timestamp. * NOTE that the test is modified according to the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * This is a pure, in-sequence data packet with * nothing on the reassembly queue and we have enough * buffer space to take it.
*/ if (ti_locked == TI_RLOCKED) { INP_INFO_RUNLOCK(&V_tcbinfo); } ti_locked = TI_UNLOCKED; /* Clean receiver SACK report if present */ if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks) tcp_clean_sackreport(tp); TCPSTAT_INC(tcps_preddat); tp->rcv_nxt += tlen; /* * Pull snd_wl1 up to prevent seq wrap relative to * th_seq. */ tp->snd_wl1 = th->th_seq; /* * Pull rcv_up up to prevent seq wrap relative to * rcv_nxt. */ tp->rcv_up = tp->rcv_nxt; TCPSTAT_ADD(tcps_rcvbyte, tlen); #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, mtod(m, const char *)); /* * Automatic sizing of receive socket buffer. Often the send * buffer size is not optimally adjusted to the actual network * conditions at hand (delay bandwidth product). Setting the * buffer size too small limits throughput on links with high * bandwidth and high delay (eg. trans-continental/oceanic links). * * On the receive side the socket buffer memory is only rarely * used to any significant extent. This allows us to be much * more aggressive in scaling the receive socket buffer. For * the case that the buffer space is actually used to a large * extent and we run out of kernel memory we can simply drop * the new segments; TCP on the sender will just retransmit it * later. Setting the buffer size too big may only consume too * much kernel memory if the application doesn't read() from * the socket or packet loss or reordering makes use of the * reassembly queue. * * The criteria to step up the receive buffer one notch are: * 1. Application has not set receive buffer size with * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE. * 2. the number of bytes received during the time it takes * one timestamp to be reflected back to us (the RTT); * 3. received bytes per RTT is within seven eighths of the * current socket buffer size; * 4.
receive buffer size has not hit maximal automatic size; * * This algorithm does one step per RTT at most and only if * we receive a bulk stream w/o packet losses or reorderings. * Shrinking the buffer during idle times is not necessary as * it doesn't consume any memory when idle. * * TODO: Only step up if the application is actually serving * the buffer to better manage the socket buffer resources. */ if (V_tcp_do_autorcvbuf && (to->to_flags & TOF_TS) && to->to_tsecr && (so->so_rcv.sb_flags & SB_AUTOSIZE)) { if (TSTMP_GT(to->to_tsecr, tp->rfbuf_ts) && to->to_tsecr - tp->rfbuf_ts < hz) { if (tp->rfbuf_cnt > (so->so_rcv.sb_hiwat / 8 * 7) && so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) { newsize = min(so->so_rcv.sb_hiwat + V_tcp_autorcvbuf_inc, V_tcp_autorcvbuf_max); } /* Start over with next RTT. */ tp->rfbuf_ts = 0; tp->rfbuf_cnt = 0; } else tp->rfbuf_cnt += tlen; /* add up */ } /* Add data to socket buffer. */ SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); } else { /* * Set new socket buffer size. * Give up when limit is reached. */ if (newsize) if (!sbreserve_locked(&so->so_rcv, newsize, so, NULL)) so->so_rcv.sb_flags &= ~SB_AUTOSIZE; m_adj(m, drop_hdrlen); /* delayed header drop */ sbappendstream_locked(&so->so_rcv, m, 0); } /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); if (DELAY_ACK(tp, tlen)) { tp->t_flags |= TF_DELACK; } else { tp->t_flags |= TF_ACKNOW; tcp_output(tp); } KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", __func__, ti_locked)); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); } INP_WUNLOCK(tp->t_inpcb); } /* * The slow-path is the clone of the long long part * of tcp_do_segment past all the fast-path stuff. We * use it here by two different callers, the fast/slow and * the fastack only.
*/ /* * tcp_do_slowpath: general (non-fast-path) input processing for one * segment. It recomputes rcv_wnd from the receive buffer space, resets * the receive buffer autoscaling counters, and then works through, in * order: the SYN_RECEIVED and SYN_SENT special cases, RST and SYN * validation (with challenge ACKs), PAWS, trimming of the segment to the * receive window, ACK processing including duplicate ACKs, the window * update (step6), URG handling, data and FIN processing (dodata), and * finally any pending output. * * The exit labels dropafterack, dropwithreset and drop release the * tcbinfo read lock when ti_locked is TI_RLOCKED and unlock the inpcb * (the latter two only if tp is non-NULL). The two tcp_twstart() paths * return without an explicit INP_WUNLOCK; presumably tcp_twstart() drops * the inpcb lock itself -- TODO confirm. */ static void tcp_do_slowpath(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, int ti_locked, u_long tiwin, int thflags) { int acked, ourfinisacked, needoutput = 0; int rstreason, todrop, win; char *s; struct in_conninfo *inc; struct mbuf *mfree = NULL; #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, * now IPv6. */ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; #endif /* * Calculate amount of space in receive window, * and then do TCP input processing. * Receive window is amount of space in rcv queue, * but not less than advertised window. */ inc = &tp->t_inpcb->inp_inc; win = sbspace(&so->so_rcv); if (win < 0) win = 0; tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); /* Reset receive buffer auto scaling when not in bulk receive mode. */ tp->rfbuf_ts = 0; tp->rfbuf_cnt = 0; switch (tp->t_state) { /* * If the state is SYN_RECEIVED: * if seg contains an ACK, but not for our SYN/ACK, send a RST. */ case TCPS_SYN_RECEIVED: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } break; /* * If the state is SYN_SENT: * if seg contains an ACK, but not for our SYN, drop the input. * if seg contains a RST, then drop the connection. * if seg does not contain SYN, then drop it. * Otherwise this is an acceptable SYN segment * initialize tp->rcv_nxt and tp->irs * if seg contains ack then advance tp->snd_una * if seg contains an ECE and ECN support is enabled, the stream * is ECN capable.
* if SYN has been acked change to ESTABLISHED else SYN_RCVD state * arrange for segment to be acked (eventually) * continue processing rest of data/controls, beginning with URG */ case TCPS_SYN_SENT: if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) { TCP_PROBE5(connect__refused, NULL, tp, mtod(m, const char *), tp, th); tp = tcp_drop(tp, ECONNREFUSED); } if (thflags & TH_RST) goto drop; if (!(thflags & TH_SYN)) goto drop; tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { TCPSTAT_INC(tcps_connects); soisconnected(so); #ifdef MAC mac_socketpeer_set_from_mbuf(m, so); #endif /* Do window scaling on this connection? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; } tp->rcv_adv += imin(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale); tp->snd_una++; /* SYN is acked */ /* * If there's data, delay ACK; if there's also a FIN * ACKNOW will be turned on later. */ if (DELAY_ACK(tp, tlen) && tlen != 0) tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); else tp->t_flags |= TF_ACKNOW; if ((thflags & TH_ECE) && V_tcp_do_ecn) { tp->t_flags |= TF_ECN_PERMIT; TCPSTAT_INC(tcps_ecn_shs); } /* * Received in SYN_SENT[*] state. * Transitions: * SYN_SENT --> ESTABLISHED * SYN_SENT* --> FIN_WAIT_1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(connect__established, NULL, tp, mtod(m, const char *), tp, th); cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); } } else { /* * Received initial SYN in SYN-SENT[*] state => * simultaneous open. * If it succeeds, connection is * half-synchronized.
* Otherwise, do 3-way handshake: * SYN-SENT -> SYN-RECEIVED * SYN-SENT* -> SYN-RECEIVED* */ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tcp_timer_activate(tp, TT_REXMT, 0); tcp_state_change(tp, TCPS_SYN_RECEIVED); } KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: " "ti_locked %d", __func__, ti_locked)); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); /* * Advance th->th_seq to correspond to first data byte. * If data, trim to stay within window, * dropping FIN if necessary. */ th->th_seq++; if (tlen > tp->rcv_wnd) { todrop = tlen - tp->rcv_wnd; m_adj(m, -todrop); tlen = tp->rcv_wnd; thflags &= ~TH_FIN; TCPSTAT_INC(tcps_rcvpackafterwin); TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); } tp->snd_wl1 = th->th_seq - 1; tp->rcv_up = th->th_seq; /* * Client side of transaction: already sent SYN and data. * If the remote host used T/TCP to validate the SYN, * our data will be ACK'd; if so, enter normal data segment * processing in the middle of step 5, ack processing. * Otherwise, goto step 6. */ if (thflags & TH_ACK) goto process_ACK; goto step6; /* * If the state is LAST_ACK or CLOSING or TIME_WAIT: * do normal processing. * * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. */ case TCPS_LAST_ACK: case TCPS_CLOSING: break; /* continue normal processing */ } /* * States other than LISTEN or SYN_SENT. * First check the RST flag and sequence number since reset segments * are exempt from the timestamp and connection count tests. This * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix * below which allowed reset segments in half the sequence space * to fall through and be processed (which gives forged reset * segments with a random sequence number a 50 percent chance of * killing a connection). * Then check timestamp, if present. * Then check the connection count, if present. * Then check that at least some bytes of segment are within * receive window.
If segment begins before rcv_nxt, * drop leading data (and SYN); if nothing left, just ack. */ if (thflags & TH_RST) { /* * RFC5961 Section 3.2 * * - RST drops connection only if SEG.SEQ == RCV.NXT. * - If RST is in window, we send challenge ACK. * * Note: to take into account delayed ACKs, we should * test against last_ack_sent instead of rcv_nxt. * Note 2: we handle special case of closed window, not * covered by the RFC. */ if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); KASSERT(ti_locked == TI_RLOCKED, ("%s: TH_RST ti_locked %d, th %p tp %p", __func__, ti_locked, th, tp)); KASSERT(tp->t_state != TCPS_SYN_SENT, ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", __func__, th, tp)); if (V_tcp_insecure_rst || tp->last_ack_sent == th->th_seq) { TCPSTAT_INC(tcps_drops); /* Drop the connection. */ switch (tp->t_state) { case TCPS_SYN_RECEIVED: so->so_error = ECONNREFUSED; goto close; case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: so->so_error = ECONNRESET; close: tcp_state_change(tp, TCPS_CLOSED); /* FALLTHROUGH */ default: tp = tcp_close(tp); } } else { TCPSTAT_INC(tcps_badrst); /* * Send challenge ACK. * NOTE(review): m is passed to tcp_respond() and then * set to NULL before the jump to drop, where * TCP_PROBE3() evaluates mtod(m, ...) -- confirm that * this is safe with m == NULL. */ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; m = NULL; } } goto drop; } /* * RFC5961 Section 4.2 * Send challenge ACK for any SYN in synchronized state. */ if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT) { KASSERT(ti_locked == TI_RLOCKED, ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked)); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); TCPSTAT_INC(tcps_badsyn); if (V_tcp_insecure_syn && SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { tp = tcp_drop(tp, ECONNRESET); rstreason = BANDLIM_UNLIMITED; } else { /* Send challenge ACK.
*/ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, tp->snd_nxt, TH_ACK); tp->last_ack_sent = tp->rcv_nxt; m = NULL; } goto drop; } /* * RFC 1323 PAWS: If we have a timestamp reply on this segment * and it's less than ts_recent, drop it. */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { /* Check to see if ts_recent is over 24 days old. */ if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { /* * Invalidate ts_recent. If this segment updates * ts_recent, the age will be reset later and ts_recent * will get a valid value. If it does not, setting * ts_recent to zero will at least satisfy the * requirement that zero be placed in the timestamp * echo reply when ts_recent isn't valid. The * age isn't reset until we get a valid ts_recent * because we don't want out-of-order segments to be * dropped when ts_recent is old. */ tp->ts_recent = 0; } else { TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, tlen); TCPSTAT_INC(tcps_pawsdrop); if (tlen) goto dropafterack; goto drop; } } /* * In the SYN-RECEIVED state, validate that the packet belongs to * this connection before trimming the data to fit the receive * window. Check the sequence number versus IRS since we know * the sequence numbers haven't wrapped. This is a partial fix * for the "LAND" DoS attack. */ if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } todrop = tp->rcv_nxt - th->th_seq; if (todrop > 0) { if (thflags & TH_SYN) { thflags &= ~TH_SYN; th->th_seq++; if (th->th_urp > 1) th->th_urp--; else thflags &= ~TH_URG; todrop--; } /* * Following if statement from Stevens, vol. 2, p. 960. */ if (todrop > tlen || (todrop == tlen && (thflags & TH_FIN) == 0)) { /* * Any valid FIN must be to the left of the window. * At this point the FIN must be a duplicate or out * of sequence; drop it. */ thflags &= ~TH_FIN; /* * Send an ACK to resynchronize and drop any data.
* But keep on processing for RST or ACK. */ tp->t_flags |= TF_ACKNOW; todrop = tlen; TCPSTAT_INC(tcps_rcvduppack); TCPSTAT_ADD(tcps_rcvdupbyte, todrop); } else { TCPSTAT_INC(tcps_rcvpartduppack); TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); } drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; if (th->th_urp > todrop) th->th_urp -= todrop; else { thflags &= ~TH_URG; th->th_urp = 0; } } /* * If new data are received on a connection after the * user processes are gone, then RST the other end. */ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { KASSERT(ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && " "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked)); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data " "after socket was closed, " "sending RST and removing tcpcb\n", s, __func__, tcpstates[tp->t_state], tlen); free(s, M_TCPLOG); } tp = tcp_close(tp); TCPSTAT_INC(tcps_rcvafterclose); rstreason = BANDLIM_UNLIMITED; goto dropwithreset; } /* * If segment ends after window, drop trailing data * (and PUSH and FIN); if nothing left, just ACK. */ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); if (todrop > 0) { TCPSTAT_INC(tcps_rcvpackafterwin); if (todrop >= tlen) { TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); /* * If window is closed can only take segments at * window edge, and have to drop data and PUSH from * incoming segments. Continue processing, but * remember to ack. Otherwise, drop segment * and ack. */ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_rcvwinprobe); } else goto dropafterack; } else TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); m_adj(m, -todrop); tlen -= todrop; thflags &= ~(TH_PUSH|TH_FIN); } /* * If last ACK falls within this segment's sequence numbers, * record its timestamp.
* NOTE: * 1) That the test incorporates suggestions from the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). * 2) That updating only on newer timestamps interferes with * our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. * 3) That we modify the segment boundary check to be * Last.ACK.Sent <= SEG.SEQ + SEG.Len * instead of RFC1323's * Last.ACK.Sent < SEG.SEQ + SEG.Len, * This modified check allows us to overcome RFC1323's * limitations as described in Stevens TCP/IP Illustrated * Vol. 2 p.869. In such cases, we can still calculate the * RTT correctly when RCV.NXT == Last.ACK.Sent. */ if ((to->to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN|TH_FIN)) != 0))) { tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN * flag is on (half-synchronized state), then queue data for * later processing; else drop segment and return. */ if ((thflags & TH_ACK) == 0) { if (tp->t_state == TCPS_SYN_RECEIVED || (tp->t_flags & TF_NEEDSYN)) goto step6; else if (tp->t_flags & TF_ACKNOW) goto dropafterack; else goto drop; } /* * Ack processing. */ switch (tp->t_state) { /* * In SYN_RECEIVED state, the ack ACKs our SYN, so enter * ESTABLISHED state and continue processing. * The ACK was checked above. */ case TCPS_SYN_RECEIVED: TCPSTAT_INC(tcps_connects); soisconnected(so); /* Do window scaling?
*/ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; tp->snd_wnd = tiwin; } /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED * SYN-RECEIVED* -> FIN-WAIT-1 */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; } else { tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(accept__established, NULL, tp, mtod(m, const char *), tp, th); cc_conn_init(tp); tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); } /* * If segment contains data or ACK, will call tcp_reass() * later; if not, do so now to pass queued data to user. */ if (tlen == 0 && (thflags & TH_FIN) == 0) (void) tcp_reass(tp, (struct tcphdr *)0, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; /* FALLTHROUGH */ /* * In ESTABLISHED state: drop duplicate ACKs; ACK out of range * ACKs. If the ack is in the range * tp->snd_una < th->th_ack <= tp->snd_max * then advance tp->snd_una to th->th_ack and drop * data from the retransmission queue. If this ACK reflects * more up to date window information we update our window information. */ case TCPS_ESTABLISHED: case TCPS_FIN_WAIT_1: case TCPS_FIN_WAIT_2: case TCPS_CLOSE_WAIT: case TCPS_CLOSING: case TCPS_LAST_ACK: if (SEQ_GT(th->th_ack, tp->snd_max)) { TCPSTAT_INC(tcps_rcvacktoomuch); goto dropafterack; } if ((tp->t_flags & TF_SACK_PERMIT) && ((to->to_flags & TOF_SACK) || !TAILQ_EMPTY(&tp->snd_holes))) tcp_sack_doack(tp, to, th->th_ack); else /* * Reset the value so that previous (valid) value * from the last ack with SACK doesn't get used. */ tp->sackhint.sacked_bytes = 0; /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, to); if (SEQ_LEQ(th->th_ack, tp->snd_una)) { if (tlen == 0 && tiwin == tp->snd_wnd) { /* * If this is the first time we've seen a * FIN from the remote, this is not a * duplicate and it needs to be processed * normally. This happens during a * simultaneous close.
*/ if ((thflags & TH_FIN) && (TCPS_HAVERCVDFIN(tp->t_state) == 0)) { tp->t_dupacks = 0; break; } TCPSTAT_INC(tcps_rcvdupack); /* * If we have outstanding data (other than * a window probe), this is a completely * duplicate ack (ie, window info didn't * change and FIN isn't set), * the ack is the biggest we've * seen and we've seen exactly our rexmt * threshold of them, assume a packet * has been dropped and retransmit it. * Kludge snd_nxt & the congestion * window so we send only this one * packet. * * We know we're losing at the current * window size so do congestion avoidance * (set ssthresh to half the current window * and pull our congestion window back to * the new ssthresh). * * Dup acks mean that packets have left the * network (they're now cached at the receiver) * so bump cwnd by the amount in the receiver * to keep a constant cwnd packets in the * network. * * When using TCP ECN, notify the peer that * we reduced the cwnd. */ if (!tcp_timer_active(tp, TT_REXMT) || th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || IN_FASTRECOVERY(tp->t_flags)) { cc_ack_received(tp, th, CC_DUPACK); if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags)) { int awnd; /* * Compute the amount of data in flight first. * We can inject new data into the pipe iff * we have less than 1/2 the original window's * worth of data in flight. */ if (V_tcp_do_rfc6675_pipe) awnd = tcp_compute_pipe(tp); else awnd = (tp->snd_nxt - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit; if (awnd < tp->snd_ssthresh) { tp->snd_cwnd += tp->t_maxseg; if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; } } else tp->snd_cwnd += tp->t_maxseg; (void) tp->t_fb->tfb_tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; /* * If we're doing sack, check to * see if we're already in sack * recovery. If we're not doing sack, * check to see if we're in newreno * recovery.
*/ if (tp->t_flags & TF_SACK_PERMIT) { if (IN_FASTRECOVERY(tp->t_flags)) { tp->t_dupacks = 0; break; } } else { if (SEQ_LEQ(th->th_ack, tp->snd_recover)) { tp->t_dupacks = 0; break; } } /* Congestion signal before ack. */ cc_cong_signal(tp, th, CC_NDUPACK); cc_ack_received(tp, th, CC_DUPACK); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; if (tp->t_flags & TF_SACK_PERMIT) { TCPSTAT_INC( tcps_sack_recovery_episode); tp->sack_newdata = tp->snd_nxt; tp->snd_cwnd = tp->t_maxseg; (void) tp->t_fb->tfb_tcp_output(tp); goto drop; } tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; (void) tp->t_fb->tfb_tcp_output(tp); KASSERT(tp->snd_limited <= 2, ("%s: tp->snd_limited too big", __func__)); tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * (tp->t_dupacks - tp->snd_limited); if (SEQ_GT(onxt, tp->snd_nxt)) tp->snd_nxt = onxt; goto drop; } else if (V_tcp_do_rfc3042) { /* * Process first and second duplicate * ACKs. Each indicates a segment * leaving the network, creating room * for more. Make sure we can send a * packet on reception of each duplicate * ACK by increasing snd_cwnd by one * segment. Restore the original * snd_cwnd after packet transmission. */ cc_ack_received(tp, th, CC_DUPACK); u_long oldcwnd = tp->snd_cwnd; tcp_seq oldsndmax = tp->snd_max; u_int sent; int avail; KASSERT(tp->t_dupacks == 1 || tp->t_dupacks == 2, ("%s: dupacks not 1 or 2", __func__)); if (tp->t_dupacks == 1) tp->snd_limited = 0; tp->snd_cwnd = (tp->snd_nxt - tp->snd_una) + (tp->t_dupacks - tp->snd_limited) * tp->t_maxseg; /* * Only call tcp_output when there * is new data available to be sent. * Otherwise we would send pure ACKs.
*/ SOCKBUF_LOCK(&so->so_snd); avail = sbavail(&so->so_snd) - (tp->snd_nxt - tp->snd_una); SOCKBUF_UNLOCK(&so->so_snd); if (avail > 0) (void) tp->t_fb->tfb_tcp_output(tp); sent = tp->snd_max - oldsndmax; if (sent > tp->t_maxseg) { KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) || (sent == tp->t_maxseg + 1 && tp->t_flags & TF_SENTFIN), ("%s: sent too much", __func__)); tp->snd_limited = 2; } else if (sent > 0) ++tp->snd_limited; tp->snd_cwnd = oldcwnd; goto drop; } } else tp->t_dupacks = 0; break; } KASSERT(SEQ_GT(th->th_ack, tp->snd_una), ("%s: th_ack <= snd_una", __func__)); /* * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ if (IN_FASTRECOVERY(tp->t_flags)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { if (tp->t_flags & TF_SACK_PERMIT) tcp_sack_partialack(tp, th); else tcp_newreno_partial_ack(tp, th); } else cc_post_recovery(tp, th); } tp->t_dupacks = 0; /* * If we reach this point, ACK is not a duplicate, * i.e., it ACKs something we sent. */ if (tp->t_flags & TF_NEEDSYN) { /* * T/TCP: Connection was half-synchronized, and our * SYN has been ACK'd (so connection is now fully * synchronized). Go to non-starred state, * increment snd_una for ACK of SYN, and check if * we can do window scaling. */ tp->t_flags &= ~TF_NEEDSYN; tp->snd_una++; /* Do window scaling? */ if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) == (TF_RCVD_SCALE|TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; /* Send window already scaled. */ } } process_ACK: INP_WLOCK_ASSERT(tp->t_inpcb); acked = BYTES_THIS_ACK(tp, th); TCPSTAT_INC(tcps_rcvackpack); TCPSTAT_ADD(tcps_rcvackbyte, acked); /* * If we just performed our first retransmit, and the ACK * arrives within our recovery window, then it was a mistake * to do the retransmit in the first place. Recover our * original cwnd and ssthresh, and proceed to transmit where * we left off.
*/ if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && (int)(ticks - tp->t_badrxtwin) < 0) cc_cong_signal(tp, th, CC_RTO_ERR); /* * If we have a timestamp reply, update smoothed * round trip time. If no timestamp is present but * transmit timer is running and timed sequence * number was acked, update smoothed round trip time. * Since we now have an rtt measurement, cancel the * timer backoff (cf., Phil Karn's retransmit alg.). * Recompute the initial retransmit timer. * * Some boxes send broken timestamp replies * during the SYN+ACK phase, ignore * timestamps of 0 or we could calculate a * huge RTT and blow up the retransmit timer. */ if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { u_int t; t = tcp_ts_getticks() - to->to_tsecr; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) tp->t_rttlow = ticks - tp->t_rtttime; tcp_xmit_timer(tp, ticks - tp->t_rtttime); } /* * If all outstanding data is acked, stop retransmit * timer and remember to restart (more output or persist). * If there is more data to be acked, restart retransmit * timer, using current (possibly backed-off) value. */ if (th->th_ack == tp->snd_max) { tcp_timer_activate(tp, TT_REXMT, 0); needoutput = 1; } else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); /* * If no data (only SYN) was ACK'd, * skip rest of ACK processing. */ if (acked == 0) goto step6; /* * Let the congestion control algorithm update congestion * control related information. This typically means increasing * the congestion window.
*/ cc_ack_received(tp, th, CC_ACK); SOCKBUF_LOCK(&so->so_snd); if (acked > sbavail(&so->so_snd)) { tp->snd_wnd -= sbavail(&so->so_snd); mfree = sbcut_locked(&so->so_snd, (int)sbavail(&so->so_snd)); ourfinisacked = 1; } else { mfree = sbcut_locked(&so->so_snd, acked); tp->snd_wnd -= acked; ourfinisacked = 0; } /* NB: sowwakeup_locked() does an implicit unlock. */ sowwakeup_locked(so); m_freem(mfree); /* Detect una wraparound. */ if (!IN_RECOVERY(tp->t_flags) && SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; /* XXXLAS: Can this be moved up into cc_post_recovery? */ if (IN_RECOVERY(tp->t_flags) && SEQ_GEQ(th->th_ack, tp->snd_recover)) { EXIT_RECOVERY(tp->t_flags); } tp->snd_una = th->th_ack; if (tp->t_flags & TF_SACK_PERMIT) { if (SEQ_GT(tp->snd_una, tp->snd_recover)) tp->snd_recover = tp->snd_una; } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; switch (tp->t_state) { /* * In FIN_WAIT_1 STATE in addition to the processing * for the ESTABLISHED state if our FIN is now acknowledged * then enter FIN_WAIT_2. */ case TCPS_FIN_WAIT_1: if (ourfinisacked) { /* * If we can't receive any more * data, then closing user can proceed. * Starting the timer is contrary to the * specification, but if we don't get a FIN * we'll hang forever. * * XXXjl: * we should release the tp also, and use a * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); tcp_timer_activate(tp, TT_2MSL, (tcp_fast_finwait2_recycle ? tcp_finwait2_timeout : TP_MAXIDLE(tp))); } tcp_state_change(tp, TCPS_FIN_WAIT_2); } break; /* * In CLOSING STATE in addition to the processing for * the ESTABLISHED state if the ACK acknowledges our FIN * then enter the TIME-WAIT state, otherwise ignore * the segment.
*/ case TCPS_CLOSING: if (ourfinisacked) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tcp_twstart(tp); INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return; } break; /* * In LAST_ACK, we may still be waiting for data to drain * and/or to be acked, as well as for the ack of our FIN. * If our FIN is now acknowledged, delete the TCB, * enter the closed state and return. */ case TCPS_LAST_ACK: if (ourfinisacked) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tp = tcp_close(tp); goto drop; } break; } } step6: INP_WLOCK_ASSERT(tp->t_inpcb); /* * Update window information. * Don't look at window if no ACK: TAC's send garbage on first SYN. */ if ((thflags & TH_ACK) && (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { /* keep track of pure window updates */ if (tlen == 0 && tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) TCPSTAT_INC(tcps_rcvwinupd); tp->snd_wnd = tiwin; tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; needoutput = 1; } /* * Process segments with URG. */ if ((thflags & TH_URG) && th->th_urp && TCPS_HAVERCVDFIN(tp->t_state) == 0) { /* * This is a kludge, but if we receive and accept * random urgent pointers, we'll crash in * soreceive. It's hard to imagine someone * actually wanting to send this much urgent data. */ SOCKBUF_LOCK(&so->so_rcv); if (th->th_urp + sbavail(&so->so_rcv) > sb_max) { th->th_urp = 0; /* XXX */ thflags &= ~TH_URG; /* XXX */ SOCKBUF_UNLOCK(&so->so_rcv); /* XXX */ goto dodata; /* XXX */ } /* * If this segment advances the known urgent pointer, * then mark the data stream. This should not happen * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since * a FIN has been received from the remote side. * In these states we ignore the URG. * * According to RFC961 (Assigned Protocols), * the urgent pointer points to the last octet * of urgent data.
We continue, however, * to consider it to indicate the first octet * of data past the urgent section as the original * spec states (in one of two places). */ if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) { tp->rcv_up = th->th_seq + th->th_urp; so->so_oobmark = sbavail(&so->so_rcv) + (tp->rcv_up - tp->rcv_nxt) - 1; if (so->so_oobmark == 0) so->so_rcv.sb_state |= SBS_RCVATMARK; sohasoutofband(so); tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); } SOCKBUF_UNLOCK(&so->so_rcv); /* * Remove out of band data so doesn't get presented to user. * This can happen independent of advancing the URG pointer, * but if two URG's are pending at once, some out-of-band * data may creep in... ick. */ if (th->th_urp <= (u_long)tlen && !(so->so_options & SO_OOBINLINE)) { /* hdr drop is delayed */ tcp_pulloutofband(so, th, m, drop_hdrlen); } } else { /* * If no out of band data is expected, * pull receive urgent pointer along * with the receive window. */ if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) tp->rcv_up = tp->rcv_nxt; } dodata: /* XXX */ INP_WLOCK_ASSERT(tp->t_inpcb); /* * Process the segment text, merging it into the TCP sequencing queue, * and arranging for acknowledgment of receipt if necessary. * This process logically involves adjusting tp->rcv_wnd as data * is presented to the user (this happens in tcp_usrreq.c, * case PRU_RCVD). If a FIN has already been received on this * connection then we just ignore the text. */ if ((tlen || (thflags & TH_FIN)) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; m_adj(m, drop_hdrlen); /* delayed header drop */ /* * Insert segment which includes th into TCP reassembly queue * with control block tp. Set thflags to whether reassembly now * includes a segment with FIN. This handles the common case * inline (segment is the next to be received on an established * connection, and the queue is empty), avoiding linkage into * and removal from the queue and repetition of various * conversions.
* Set DELACK for segments received in order, but ack * immediately when segments are out of order (so * fast retransmit can work). */ if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq) && TCPS_HAVEESTABLISHED(tp->t_state)) { if (DELAY_ACK(tp, tlen)) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt += tlen; thflags = th->th_flags & TH_FIN; TCPSTAT_INC(tcps_rcvpack); TCPSTAT_ADD(tcps_rcvbyte, tlen); SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) m_freem(m); else sbappendstream_locked(&so->so_rcv, m, 0); /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); } else { /* * XXX: Due to the header drop above "th" is * theoretically invalid by now. Fortunately * m_adj() doesn't actually free any mbufs * when trimming from the head. */ thflags = tcp_reass(tp, th, &tlen, m); tp->t_flags |= TF_ACKNOW; } if (tlen > 0 && (tp->t_flags & TF_SACK_PERMIT)) tcp_update_sack_list(tp, save_start, save_start + tlen); #if 0 /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's * buffer size. * XXX: Unused. */ if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt)) len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt); else len = so->so_rcv.sb_hiwat; #endif } else { m_freem(m); thflags &= ~TH_FIN; } /* * If FIN is received ACK the FIN and let the user know * that the connection is closing. */ if (thflags & TH_FIN) { if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { socantrcvmore(so); /* * If connection is half-synchronized * (ie NEEDSYN flag on) then delay ACK, * so it may be piggybacked when SYN is sent. * Otherwise, since we received a FIN then no * more input can be expected, send ACK now. */ if (tp->t_flags & TF_NEEDSYN) tp->t_flags |= TF_DELACK; else tp->t_flags |= TF_ACKNOW; tp->rcv_nxt++; } switch (tp->t_state) { /* * In SYN_RECEIVED and ESTABLISHED STATES * enter the CLOSE_WAIT state.
*/ case TCPS_SYN_RECEIVED: tp->t_starttime = ticks; /* FALLTHROUGH */ case TCPS_ESTABLISHED: tcp_state_change(tp, TCPS_CLOSE_WAIT); break; /* * If still in FIN_WAIT_1 STATE FIN has not been acked so * enter the CLOSING state. */ case TCPS_FIN_WAIT_1: tcp_state_change(tp, TCPS_CLOSING); break; /* * In FIN_WAIT_2 state enter the TIME_WAIT state, * starting the time-wait timer, turning off the other * standard timers. */ case TCPS_FIN_WAIT_2: INP_INFO_RLOCK_ASSERT(&V_tcbinfo); KASSERT(ti_locked == TI_RLOCKED, ("%s: dodata " "TCP_FIN_WAIT_2 ti_locked: %d", __func__, ti_locked)); tcp_twstart(tp); INP_INFO_RUNLOCK(&V_tcbinfo); return; } } if (ti_locked == TI_RLOCKED) { INP_INFO_RUNLOCK(&V_tcbinfo); } ti_locked = TI_UNLOCKED; #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, mtod(m, const char *)); /* * NOTE(review): the TCP_PROBE3() just above still evaluates * mtod(m, ...) although, in the dodata section, m was already * appended to so_rcv, handed to tcp_reass(), or freed -- confirm * the probe cannot touch a stale mbuf. * * Return any desired output. */ if (needoutput || (tp->t_flags & TF_ACKNOW)) (void) tp->t_fb->tfb_tcp_output(tp); KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d", __func__, ti_locked)); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); } INP_WUNLOCK(tp->t_inpcb); return; dropafterack: /* * Generate an ACK dropping incoming segment if it occupies * sequence space, where the ACK reflects our state. * * We can now skip the test for the RST flag since all * paths to this code happen after packets containing * RST have been dropped. * * In the SYN-RECEIVED state, don't send an ACK unless the * segment we received passes the SYN-RECEIVED ACK test. * If it fails send a RST. This breaks the loop in the * "LAND" DoS attack, and also prevents an ACK storm * between two listening ports that have been sent forged * SYN segments, each with the source address of the other.
*/ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && (SEQ_GT(tp->snd_una, th->th_ack) || SEQ_GT(th->th_ack, tp->snd_max)) ) { rstreason = BANDLIM_RST_OPENPORT; goto dropwithreset; } #ifdef TCPDEBUG if (so->so_options & SO_DEBUG) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__drop, tp, th, mtod(m, const char *)); if (ti_locked == TI_RLOCKED) { INP_INFO_RUNLOCK(&V_tcbinfo); } ti_locked = TI_UNLOCKED; tp->t_flags |= TF_ACKNOW; (void) tp->t_fb->tfb_tcp_output(tp); INP_WUNLOCK(tp->t_inpcb); m_freem(m); return; dropwithreset: if (ti_locked == TI_RLOCKED) { INP_INFO_RUNLOCK(&V_tcbinfo); } ti_locked = TI_UNLOCKED; if (tp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(tp->t_inpcb); } else tcp_dropwithreset(m, th, NULL, tlen, rstreason); return; drop: if (ti_locked == TI_RLOCKED) { INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; } #ifdef INVARIANTS else INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); #endif /* * Drop space held by incoming segment and return. */ #ifdef TCPDEBUG if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__drop, tp, th, mtod(m, const char *)); if (tp != NULL) INP_WUNLOCK(tp->t_inpcb); m_freem(m); } /* * Do fast slow is a combination of the original * tcp_dosegment and a split fastpath, one function * for the fast-ack which also includes allowing fastpath * for window advanced in sequence acks. And also a * sub-function that handles the insequence data.
*/
/*
 * tcp_do_segment_fastslow: segment input routine referenced by the
 * "fastslow" function block (__tcp_fastslow, later in this file).
 *
 * m		mbuf holding the received segment
 * th		TCP header of the segment
 * so		socket the segment was matched to
 * tp		TCP control block; its inpcb write lock is asserted on entry
 * drop_hdrlen	passed through unchanged to the selected handler
 * tlen		compared against zero and sbspace() here and passed
 *		through to the selected handler
 * iptos	IP TOS byte; only its ECN bits are examined here
 * ti_locked	TI_RLOCKED or TI_UNLOCKED, describing the caller's hold on
 *		the tcbinfo lock (checked by the assertions below)
 *
 * After the common preprocessing (timers, window unscaling, ECN, option
 * parsing) exactly one of tcp_do_fastack(), tcp_do_fastnewdata() or
 * tcp_do_slowpath() is called.  This function itself neither frees m nor
 * releases any lock; presumably the handler that is called does so.
 */
void
tcp_do_segment_fastslow(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
    int ti_locked)
{
	int thflags;
	u_long tiwin;
	char *s;
	int can_enter;
	struct in_conninfo *inc;
	struct tcpopt to;

	thflags = th->th_flags;
	tp->sackhint.last_sack_ack = 0;
	inc = &tp->t_inpcb->inp_inc;
	/*
	 * If this is either a state-changing packet or current state isn't
	 * established, we require the tcbinfo read lock (asserted below).
	 * Otherwise, we allow the tcbinfo to be either locked or unlocked,
	 * as the caller may have unnecessarily acquired the lock due to a
	 * race.
	 */
	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
	    tp->t_state != TCPS_ESTABLISHED) {
		KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for "
		    "SYN/FIN/RST/!EST", __func__, ti_locked));
		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
	} else {
#ifdef INVARIANTS
		if (ti_locked == TI_RLOCKED) {
			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
		} else {
			KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST "
			    "ti_locked: %d", __func__, ti_locked));
			INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
		}
#endif
	}
	INP_WLOCK_ASSERT(tp->t_inpcb);
	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
	    __func__));
	KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT",
	    __func__));

	/*
	 * Segment received on connection.
	 * Reset idle time and keep-alive timer.
	 * XXX: This should be done after segment
	 * validation to ignore broken/spoofed segs.
	 */
	tp->t_rcvtime = ticks;
	if (TCPS_HAVEESTABLISHED(tp->t_state))
		tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp));

	/*
	 * Unscale the window into a 32-bit value.
	 * For the SYN_SENT state the scale is zero.
	 */
	tiwin = th->th_win << tp->snd_scale;

	/*
	 * TCP ECN processing.
	 */
	if (tp->t_flags & TF_ECN_PERMIT) {
		if (thflags & TH_CWR)
			tp->t_flags &= ~TF_ECN_SND_ECE;
		switch (iptos & IPTOS_ECN_MASK) {
		case IPTOS_ECN_CE:
			tp->t_flags |= TF_ECN_SND_ECE;
			TCPSTAT_INC(tcps_ecn_ce);
			break;
		case IPTOS_ECN_ECT0:
			TCPSTAT_INC(tcps_ecn_ect0);
			break;
		case IPTOS_ECN_ECT1:
			TCPSTAT_INC(tcps_ecn_ect1);
			break;
		}
		/* Congestion experienced. */
		if (thflags & TH_ECE) {
			cc_cong_signal(tp, th, CC_ECN);
		}
	}

	/*
	 * Parse options on any incoming segment.
	 */
	tcp_dooptions(&to, (u_char *)(th + 1),
	    (th->th_off << 2) - sizeof(struct tcphdr),
	    (thflags & TH_SYN) ? TO_SYN : 0);

	/*
	 * If echoed timestamp is later than the current time,
	 * fall back to non RFC1323 RTT calculation.  Normalize
	 * timestamp if syncookies were used when this connection
	 * was established.
	 */
	if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
		to.to_tsecr -= tp->ts_offset;
		if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
			to.to_tsecr = 0;
	}
	/*
	 * If timestamps were negotiated during SYN/ACK they should
	 * appear on every segment during this session and vice versa.
	 * A mismatch is only logged; the segment is still processed.
	 */
	if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) {
		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
			log(LOG_DEBUG, "%s; %s: Timestamp missing, "
			    "no action\n", s, __func__);
			free(s, M_TCPLOG);
		}
	}
	if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) {
		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
			log(LOG_DEBUG, "%s; %s: Timestamp not expected, "
			    "no action\n", s, __func__);
			free(s, M_TCPLOG);
		}
	}

	/*
	 * Process options only when we get SYN/ACK back. The SYN case
	 * for incoming connections is handled in tcp_syncache.
	 * According to RFC1323 the window field in a SYN (i.e., a <SYN>
	 * or <SYN,ACK>) segment itself is never scaled.
	 * XXX this is traditional behavior, may need to be cleaned up.
	 */
	if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
		if ((to.to_flags & TOF_SCALE) &&
		    (tp->t_flags & TF_REQ_SCALE)) {
			tp->t_flags |= TF_RCVD_SCALE;
			tp->snd_scale = to.to_wscale;
		}
		/*
		 * Initial send window.  It will be updated with
		 * the next incoming segment to the scaled value.
		 */
		tp->snd_wnd = th->th_win;
		if (to.to_flags & TOF_TS) {
			tp->t_flags |= TF_RCVD_TSTMP;
			tp->ts_recent = to.to_tsval;
			tp->ts_recent_age = tcp_ts_getticks();
		}
		if (to.to_flags & TOF_MSS)
			tcp_mss(tp, to.to_mss);
		if ((tp->t_flags & TF_SACK_PERMIT) &&
		    (to.to_flags & TOF_SACKPERM) == 0)
			tp->t_flags &= ~TF_SACK_PERMIT;
	}
	/*
	 * can_enter records whether the advertised window allows the
	 * fast path at all; it is one term of the header-prediction
	 * test further down.
	 */
	can_enter = 0;
	if (__predict_true((tlen == 0))) {
		/*
		 * The ack moved forward and we have a window (non-zero)
		 * <or>
		 * The ack did not move forward, but the window increased.
		 */
		if (__predict_true((SEQ_GT(th->th_ack, tp->snd_una) && tiwin) ||
		    ((th->th_ack == tp->snd_una) && tiwin && (tiwin > tp->snd_wnd)))) {
			can_enter = 1;
		}
	} else {
		/*
		 * Data incoming, use the old entry criteria
		 * for fast-path with data.
		 */
		if ((tiwin && tiwin == tp->snd_wnd)) {
			can_enter = 1;
		}
	}
	/*
	 * Header prediction: check for the two common cases
	 * of a uni-directional data xfer.  If the packet has
	 * no control flags, is in-sequence, the window didn't
	 * change and we're not retransmitting, it's a
	 * candidate.  If the length is zero and the ack moved
	 * forward, we're the sender side of the xfer.  Just
	 * free the data acked & wake any higher level process
	 * that was blocked waiting for space.  If the length
	 * is non-zero and the ack didn't move, we're the
	 * receiver side.  If we're getting packets in-order
	 * (the reassembly queue is empty), add the data to
	 * the socket buffer and note that we need a delayed ack.
	 * Make sure that the hidden state-flags are also off.
	 * Since we check for TCPS_ESTABLISHED first, it can only
	 * be TF_NEEDSYN.
	 */
	if (__predict_true(tp->t_state == TCPS_ESTABLISHED &&
	    th->th_seq == tp->rcv_nxt &&
	    (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
	    tp->snd_nxt == tp->snd_max &&
	    can_enter &&
	    ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) &&
	    LIST_EMPTY(&tp->t_segq) &&
	    ((to.to_flags & TOF_TS) == 0 ||
	    TSTMP_GEQ(to.to_tsval, tp->ts_recent)))) {
		if (__predict_true((tlen == 0) &&
		    (SEQ_LEQ(th->th_ack, tp->snd_max) &&
		    !IN_RECOVERY(tp->t_flags) &&
		    (to.to_flags & TOF_SACK) == 0 &&
		    TAILQ_EMPTY(&tp->snd_holes)))) {
			/* We are done */
			tcp_do_fastack(m, th, so, tp, &to, drop_hdrlen, tlen,
			    ti_locked, tiwin);
			return;
		} else if ((tlen) &&
		    (th->th_ack == tp->snd_una &&
		    tlen <= sbspace(&so->so_rcv))) {
			tcp_do_fastnewdata(m, th, so, tp, &to, drop_hdrlen, tlen,
			    ti_locked, tiwin);
			/* We are done */
			return;
		}
	}
	/* Everything that did not qualify above takes the slow path. */
	tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen,
	    ti_locked, tiwin, thflags);
}

/*
 * This subfunction is used to try to highly optimize the
 * fast path. We again allow window updates that are
 * in sequence to remain in the fast-path. We also add
 * in the __predict's to attempt to help the compiler.
 * Note that if we return a 0, then we can *not* process
 * it and the caller should push the packet into the
 * slow-path.
 *
 * Returns 0 before touching tp, so or m when any of the
 * early-out tests below rejects the segment.  Returns 1 after
 * the ACK has been processed; by then the tcbinfo read lock
 * (if ti_locked said it was held) has been released, m has
 * been freed and the inpcb write lock has been dropped.
 * drop_hdrlen and tlen are accepted but not used here.
 */
static int
tcp_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen,
    int ti_locked, u_long tiwin)
{
	int acked;
	int winup_only=0;
#ifdef TCPDEBUG
	/*
	 * The size of tcp_saveipgen must be the size of the max ip header,
	 * now IPv6.
	 * NOTE(review): neither tcp_saveipgen nor tcp_savetcp is written
	 * anywhere in this function before being handed to tcp_trace().
	 */
	u_char tcp_saveipgen[IP6_HDR_LEN];
	struct tcphdr tcp_savetcp;
	short ostate = 0;
#endif

	if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) {
		/* Old ack, behind (or duplicate to) the last one rcv'd */
		return (0);
	}
	/*
	 * NOTE(review): the test above presumably already returns when
	 * th_ack == snd_una, which would make this test unreachable and
	 * keep pure window updates out of this fast path -- confirm the
	 * intent against SEQ_LEQ's definition.
	 */
	if (__predict_false(th->th_ack == tp->snd_una) &&
	    __predict_false(tiwin <= tp->snd_wnd)) {
		/* duplicate ack <or> a shrinking dup ack with shrinking window */
		return (0);
	}
	if (__predict_false(tiwin == 0)) {
		/* zero window */
		return (0);
	}
	if (__predict_false(SEQ_GT(th->th_ack, tp->snd_max))) {
		/* Above what we have sent? */
		return (0);
	}
	if (__predict_false(tp->snd_nxt != tp->snd_max)) {
		/* We are retransmitting */
		return (0);
	}
	if (__predict_false(tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN))) {
		/* We need a SYN or a FIN, unlikely.. */
		return (0);
	}
	if((to->to_flags & TOF_TS) && __predict_false(TSTMP_LT(to->to_tsval, tp->ts_recent))) {
		/* Timestamp is behind .. old ack with seq wrap? */
		return (0);
	}
	if (__predict_false(IN_RECOVERY(tp->t_flags))) {
		/* Still recovering */
		return (0);
	}
	if (__predict_false(to->to_flags & TOF_SACK)) {
		/* Sack included in the ack..  */
		return (0);
	}
	if (!TAILQ_EMPTY(&tp->snd_holes)) {
		/* We have sack holes on our scoreboard */
		return (0);
	}
	/* Ok if we reach here, we can process a fast-ack */

	/* Did the window get updated? */
	if (tiwin != tp->snd_wnd) {
		/* keep track of pure window updates */
		if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) {
			winup_only = 1;
			TCPSTAT_INC(tcps_rcvwinupd);
		}
		tp->snd_wnd = tiwin;
		tp->snd_wl1 = th->th_seq;
		if (tp->snd_wnd > tp->max_sndwnd)
			tp->max_sndwnd = tp->snd_wnd;
	}
	/*
	 * Pull snd_wl2 up to prevent seq wrap relative
	 * to th_ack.
	 */
	tp->snd_wl2 = th->th_ack;
	/*
	 * If last ACK falls within this segment's sequence numbers,
	 * record the timestamp.
	 * NOTE that the test is modified according to the latest
	 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
	 */
	if ((to->to_flags & TOF_TS) != 0 &&
	    SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
		tp->ts_recent_age = tcp_ts_getticks();
		tp->ts_recent = to->to_tsval;
	}
	/*
	 * This is a pure ack for outstanding data.
	 * The tcbinfo read lock is not needed past this point.
	 */
	if (ti_locked == TI_RLOCKED) {
		INP_INFO_RUNLOCK(&V_tcbinfo);
	}
	ti_locked = TI_UNLOCKED;

	TCPSTAT_INC(tcps_predack);

	/*
	 * "bad retransmit" recovery.
	 */
	if (tp->t_rxtshift == 1 &&
	    tp->t_flags & TF_PREVVALID &&
	    (int)(ticks - tp->t_badrxtwin) < 0) {
		cc_cong_signal(tp, th, CC_RTO_ERR);
	}

	/*
	 * Recalculate the transmit timer / rtt.
	 *
	 * Some boxes send broken timestamp replies
	 * during the SYN+ACK phase, ignore
	 * timestamps of 0 or we could calculate a
	 * huge RTT and blow up the retransmit timer.
	 */
	if ((to->to_flags & TOF_TS) != 0 &&
	    to->to_tsecr) {
		u_int t;

		t = tcp_ts_getticks() - to->to_tsecr;
		if (!tp->t_rttlow || tp->t_rttlow > t)
			tp->t_rttlow = t;
		tcp_xmit_timer(tp,
		    TCP_TS_TO_TICKS(t) + 1);
	} else if (tp->t_rtttime &&
	    SEQ_GT(th->th_ack, tp->t_rtseq)) {
		if (!tp->t_rttlow ||
		    tp->t_rttlow > ticks - tp->t_rtttime)
			tp->t_rttlow = ticks - tp->t_rtttime;
		tcp_xmit_timer(tp,
		    ticks - tp->t_rtttime);
	}
	if (winup_only == 0) {
		acked = BYTES_THIS_ACK(tp, th);

		/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
		hhook_run_tcp_est_in(tp, th, to);

		TCPSTAT_ADD(tcps_rcvackbyte, acked);
		sbdrop(&so->so_snd, acked);
		if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
		    SEQ_LEQ(th->th_ack, tp->snd_recover))
			tp->snd_recover = th->th_ack - 1;

		/*
		 * Let the congestion control algorithm update
		 * congestion control related information. This
		 * typically means increasing the congestion
		 * window.
		 */
		cc_ack_received(tp, th, CC_ACK);

		tp->snd_una = th->th_ack;
		tp->t_dupacks = 0;

		/*
		 * If all outstanding data are acked, stop
		 * retransmit timer, otherwise restart timer
		 * using current (possibly backed-off) value.
		 * If process is waiting for space,
		 * wakeup/selwakeup/signal.  If data
		 * are ready to send, let tcp_output
		 * decide between more output or persist.
		 */
#ifdef TCPDEBUG
		if (so->so_options & SO_DEBUG)
			tcp_trace(TA_INPUT, ostate, tp,
			    (void *)tcp_saveipgen,
			    &tcp_savetcp, 0);
#endif
		TCP_PROBE3(debug__input, tp, th, mtod(m, const char *));
		m_freem(m);
		if (tp->snd_una == tp->snd_max)
			tcp_timer_activate(tp, TT_REXMT, 0);
		else if (!tcp_timer_active(tp, TT_PERSIST))
			tcp_timer_activate(tp, TT_REXMT,
			    tp->t_rxtcur);
		/* Wake up the socket if we have room to write more */
		sowwakeup(so);
	} else {
		/*
		 * Window update only, just free the mbufs and
		 * send out whatever we can.
		 */
		m_freem(m);
	}
	if (sbavail(&so->so_snd))
		(void) tcp_output(tp);
	KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
	    __func__, ti_locked));
	INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
	INP_WLOCK_ASSERT(tp->t_inpcb);

	/* Turn a pending delayed-ACK request into an armed timer. */
	if (tp->t_flags & TF_DELACK) {
		tp->t_flags &= ~TF_DELACK;
		tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
	}
	INP_WUNLOCK(tp->t_inpcb);
	return (1);
}

/*
 * This tcp-do-segment concentrates on making the fastest
 * ack processing path. It does not have a fast-path for
 * data (it possibly could which would then eliminate the
 * need for fast-slow above). For a content distributor having
 * large outgoing elephants and very very little coming in
 * having no fastpath for data does not really help (since you
 * don't get much data in). The most important thing is
 * processing ack's quickly and getting the rest of the data
 * output to the peer as quickly as possible. This routine
 * seems to be about an overall 3% faster than the old
 * tcp_do_segment and keeps us in the fast-path for packets
 * much more (by allowing window updates to also stay in the fastpath).
*/ void tcp_do_segment_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos, int ti_locked) { int thflags; u_long tiwin; char *s; struct in_conninfo *inc; struct tcpopt to; thflags = th->th_flags; tp->sackhint.last_sack_ack = 0; inc = &tp->t_inpcb->inp_inc; /* * If this is either a state-changing packet or current state isn't * established, we require a write lock on tcbinfo. Otherwise, we * allow the tcbinfo to be in either alocked or unlocked, as the * caller may have unnecessarily acquired a write lock due to a race. */ if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || tp->t_state != TCPS_ESTABLISHED) { KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for " "SYN/FIN/RST/!EST", __func__, ti_locked)); INP_INFO_RLOCK_ASSERT(&V_tcbinfo); } else { #ifdef INVARIANTS if (ti_locked == TI_RLOCKED) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); } else { KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " "ti_locked: %d", __func__, ti_locked)); INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); } #endif } INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", __func__)); /* * Segment received on connection. * Reset idle time and keep-alive timer. * XXX: This should be done after segment * validation to ignore broken/spoofed segs. */ tp->t_rcvtime = ticks; if (TCPS_HAVEESTABLISHED(tp->t_state)) tcp_timer_activate(tp, TT_KEEP, TP_KEEPIDLE(tp)); /* * Unscale the window into a 32-bit value. * For the SYN_SENT state the scale is zero. */ tiwin = th->th_win << tp->snd_scale; /* * TCP ECN processing. 
*/ if (tp->t_flags & TF_ECN_PERMIT) { if (thflags & TH_CWR) tp->t_flags &= ~TF_ECN_SND_ECE; switch (iptos & IPTOS_ECN_MASK) { case IPTOS_ECN_CE: tp->t_flags |= TF_ECN_SND_ECE; TCPSTAT_INC(tcps_ecn_ce); break; case IPTOS_ECN_ECT0: TCPSTAT_INC(tcps_ecn_ect0); break; case IPTOS_ECN_ECT1: TCPSTAT_INC(tcps_ecn_ect1); break; } /* Congestion experienced. */ if (thflags & TH_ECE) { cc_cong_signal(tp, th, CC_ECN); } } /* * Parse options on any incoming segment. */ tcp_dooptions(&to, (u_char *)(th + 1), (th->th_off << 2) - sizeof(struct tcphdr), (thflags & TH_SYN) ? TO_SYN : 0); /* * If echoed timestamp is later than the current time, * fall back to non RFC1323 RTT calculation. Normalize * timestamp if syncookies were used when this connection * was established. */ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { to.to_tsecr -= tp->ts_offset; if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) to.to_tsecr = 0; } /* * If timestamps were negotiated during SYN/ACK they should * appear on every segment during this session and vice versa. */ if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp missing, " "no action\n", s, __func__); free(s, M_TCPLOG); } } if (!(tp->t_flags & TF_RCVD_TSTMP) && (to.to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp not expected, " "no action\n", s, __func__); free(s, M_TCPLOG); } } /* * Process options only when we get SYN/ACK back. The SYN case * for incoming connections is handled in tcp_syncache. * According to RFC1323 the window field in a SYN (i.e., a * or ) segment itself is never scaled. * XXX this is traditional behavior, may need to be cleaned up. */ if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { if ((to.to_flags & TOF_SCALE) && (tp->t_flags & TF_REQ_SCALE)) { tp->t_flags |= TF_RCVD_SCALE; tp->snd_scale = to.to_wscale; } /* * Initial send window. 
It will be updated with * the next incoming segment to the scaled value. */ tp->snd_wnd = th->th_win; if (to.to_flags & TOF_TS) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; tp->ts_recent_age = tcp_ts_getticks(); } if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); if ((tp->t_flags & TF_SACK_PERMIT) && (to.to_flags & TOF_SACKPERM) == 0) tp->t_flags &= ~TF_SACK_PERMIT; } /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has * no control flags, is in-sequence, the window didn't * change and we're not retransmitting, it's a * candidate. If the length is zero and the ack moved * forward, we're the sender side of the xfer. Just * free the data acked & wake any higher level process * that was blocked waiting for space. If the length * is non-zero and the ack didn't move, we're the * receiver side. If we're getting packets in-order * (the reassembly queue is empty), add the data to * the socket buffer and note that we need a delayed ack. * Make sure that the hidden state-flags are also off. * Since we check for TCPS_ESTABLISHED first, it can only * be TH_NEEDSYN. 
*/ if (__predict_true(tp->t_state == TCPS_ESTABLISHED) && __predict_true(((to.to_flags & TOF_SACK) == 0)) && __predict_true(tlen == 0) && __predict_true((thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK) && __predict_true(LIST_EMPTY(&tp->t_segq)) && __predict_true(th->th_seq == tp->rcv_nxt)) { if (tcp_fastack(m, th, so, tp, &to, drop_hdrlen, tlen, ti_locked, tiwin)) { return; } } tcp_do_slowpath(m, th, so, tp, &to, drop_hdrlen, tlen, ti_locked, tiwin, thflags); } struct tcp_function_block __tcp_fastslow = { "fastslow", tcp_output, tcp_do_segment_fastslow, tcp_default_ctloutput, NULL, NULL, NULL, NULL, NULL, NULL, 0, 0 }; struct tcp_function_block __tcp_fastack = { "fastack", tcp_output, tcp_do_segment_fastack, tcp_default_ctloutput, NULL, NULL, NULL, NULL, NULL, NULL, 0, 0 }; static int tcp_addfastpaths(module_t mod, int type, void *data) { int err=0; switch (type) { case MOD_LOAD: err = register_tcp_functions(&__tcp_fastack, M_WAITOK); if (err) { printf("Failed to register fastack module -- err:%d\n", err); return(err); } err = register_tcp_functions(&__tcp_fastslow, M_WAITOK); if (err) { printf("Failed to register fastslow module -- err:%d\n", err); deregister_tcp_functions(&__tcp_fastack); return(err); } break; case MOD_QUIESCE: if ((__tcp_fastslow.tfb_refcnt) ||( __tcp_fastack.tfb_refcnt)) { return(EBUSY); } break; case MOD_UNLOAD: err = deregister_tcp_functions(&__tcp_fastack); if (err == EBUSY) break; err = deregister_tcp_functions(&__tcp_fastslow); if (err == EBUSY) break; err = 0; break; default: return (EOPNOTSUPP); } return (err); } static moduledata_t new_tcp_fastpaths = { .name = "tcp_fastpaths", .evhand = tcp_addfastpaths, .priv = 0 }; MODULE_VERSION(kern_tcpfastpaths, 1); DECLARE_MODULE(kern_tcpfastpaths, new_tcp_fastpaths, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);