Index: sys/cam/ctl/ctl_ha.c =================================================================== --- sys/cam/ctl/ctl_ha.c +++ sys/cam/ctl/ctl_ha.c @@ -458,44 +458,19 @@ static int ctl_ha_accept(struct ha_softc *softc) { - struct socket *so; + struct socket *lso, *so; struct sockaddr *sap; int error; - ACCEPT_LOCK(); - if (softc->ha_lso->so_rcv.sb_state & SBS_CANTRCVMORE) - softc->ha_lso->so_error = ECONNABORTED; - if (softc->ha_lso->so_error) { - error = softc->ha_lso->so_error; - softc->ha_lso->so_error = 0; - ACCEPT_UNLOCK(); + lso = softc->ha_lso; + SOLISTEN_LOCK(lso); + error = solisten_dequeue(lso, &so, 0); + if (error == EWOULDBLOCK) + return (error); + if (error) { printf("%s: socket error %d\n", __func__, error); goto out; } - so = TAILQ_FIRST(&softc->ha_lso->so_comp); - if (so == NULL) { - ACCEPT_UNLOCK(); - return (EWOULDBLOCK); - } - KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP")); - KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP")); - - /* - * Before changing the flags on the socket, we have to bump the - * reference count. Otherwise, if the protocol calls sofree(), - * the socket will be released due to a zero refcount. - */ - SOCK_LOCK(so); /* soref() and so_state update */ - soref(so); /* file descriptor reference */ - - TAILQ_REMOVE(&softc->ha_lso->so_comp, so, so_list); - softc->ha_lso->so_qlen--; - so->so_state |= SS_NBIO; - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; - - SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); sap = NULL; error = soaccept(so, &sap); @@ -556,9 +531,6 @@ printf("%s: REUSEPORT setting failed %d\n", __func__, error); } - SOCKBUF_LOCK(&softc->ha_lso->so_rcv); - soupcall_set(softc->ha_lso, SO_RCV, ctl_ha_lupcall, softc); - SOCKBUF_UNLOCK(&softc->ha_lso->so_rcv); } memcpy(&sa, &softc->ha_peer_in, sizeof(sa)); @@ -572,6 +544,10 @@ printf("%s: solisten() error %d\n", __func__, error); goto out; } + SOLISTEN_LOCK(softc->ha_lso); + softc->ha_lso->so_state |= SS_NBIO; + solisten_upcall_set(softc->ha_lso, ctl_ha_lupcall, softc); + SOLISTEN_UNLOCK(softc->ha_lso); return (0); out: Index: sys/dev/iscsi/icl_soft_proxy.c =================================================================== --- sys/dev/iscsi/icl_soft_proxy.c +++ sys/dev/iscsi/icl_soft_proxy.c @@ -92,7 +92,6 @@ struct icl_listen *ils_listen; struct socket *ils_socket; bool ils_running; - bool ils_disconnecting; int ils_id; }; @@ -184,7 +183,9 @@ while (ils->ils_running) { ICL_DEBUG("waiting for accept thread to terminate"); sx_xunlock(&il->il_lock); - ils->ils_disconnecting = true; + SOLISTEN_LOCK(ils->ils_socket); + ils->ils_socket->so_error = ENOTCONN; + SOLISTEN_UNLOCK(ils->ils_socket); wakeup(&ils->ils_socket->so_timeo); pause("icl_unlisten", 1 * hz); sx_xlock(&il->il_lock); @@ -200,9 +201,9 @@ } /* - * XXX: Doing accept in a separate thread in each socket might not be the best way - * to do stuff, but it's pretty clean and debuggable - and you probably won't - * have hundreds of listening sockets anyway. + * XXX: Doing accept in a separate thread in each socket might not be the + * best way to do stuff, but it's pretty clean and debuggable - and you + * probably won't have hundreds of listening sockets anyway. */ static void icl_accept_thread(void *arg) @@ -218,55 +219,22 @@ ils->ils_running = true; for (;;) { - ACCEPT_LOCK(); - while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0 && ils->ils_disconnecting == false) { - if (head->so_rcv.sb_state & SBS_CANTRCVMORE) { - head->so_error = ECONNABORTED; - break; - } - error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH, - "accept", 0); - if (error) { - ACCEPT_UNLOCK(); - ICL_WARN("msleep failed with error %d", error); - continue; - } - if (ils->ils_disconnecting) { - ACCEPT_UNLOCK(); - ICL_DEBUG("terminating"); - ils->ils_running = false; - kthread_exit(); - return; - } + SOLISTEN_LOCK(head); + error = solisten_dequeue(head, &so, 0); + if (error == ENOTCONN) { + /* + * XXXGL: ENOTCONN is our mark from icl_listen_free(). + * Neither socket code, nor msleep(9) may return it. + */ + ICL_DEBUG("terminating"); + ils->ils_running = false; + kthread_exit(); + return; } - if (head->so_error) { - error = head->so_error; - head->so_error = 0; - ACCEPT_UNLOCK(); - ICL_WARN("socket error %d", error); + if (error) { + ICL_WARN("solisten_dequeue error %d", error); continue; } - so = TAILQ_FIRST(&head->so_comp); - KASSERT(so != NULL, ("NULL so")); - KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP")); - KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP")); - - /* - * Before changing the flags on the socket, we have to bump the - * reference count. Otherwise, if the protocol calls sofree(), - * the socket will be released due to a zero refcount. - */ - SOCK_LOCK(so); /* soref() and so_state update */ - soref(so); /* file descriptor reference */ - - TAILQ_REMOVE(&head->so_comp, so, so_list); - head->so_qlen--; - so->so_state |= (head->so_state & SS_NBIO); - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; - - SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); sa = NULL; error = soaccept(so, &sa); Index: sys/kern/sys_socket.c =================================================================== --- sys/kern/sys_socket.c +++ sys/kern/sys_socket.c @@ -170,32 +170,36 @@ break; case FIOASYNC: - /* - * XXXRW: This code separately acquires SOCK_LOCK(so) and - * SOCKBUF_LOCK(&so->so_rcv) even though they are the same - * mutex to avoid introducing the assumption that they are - * the same. - */ if (*(int *)data) { SOCK_LOCK(so); so->so_state |= SS_ASYNC; + if (SOLISTENING(so)) { + so->sol_sbrcv_flags |= SB_ASYNC; + so->sol_sbsnd_flags |= SB_ASYNC; + } else { + SOCKBUF_LOCK(&so->so_rcv); + so->so_rcv.sb_flags |= SB_ASYNC; + SOCKBUF_UNLOCK(&so->so_rcv); + SOCKBUF_LOCK(&so->so_snd); + so->so_snd.sb_flags |= SB_ASYNC; + SOCKBUF_UNLOCK(&so->so_snd); + } SOCK_UNLOCK(so); - SOCKBUF_LOCK(&so->so_rcv); - so->so_rcv.sb_flags |= SB_ASYNC; - SOCKBUF_UNLOCK(&so->so_rcv); - SOCKBUF_LOCK(&so->so_snd); - so->so_snd.sb_flags |= SB_ASYNC; - SOCKBUF_UNLOCK(&so->so_snd); } else { SOCK_LOCK(so); so->so_state &= ~SS_ASYNC; + if (SOLISTENING(so)) { + so->sol_sbrcv_flags &= ~SB_ASYNC; + so->sol_sbsnd_flags &= ~SB_ASYNC; + } else { + SOCKBUF_LOCK(&so->so_rcv); + so->so_rcv.sb_flags &= ~SB_ASYNC; + SOCKBUF_UNLOCK(&so->so_rcv); + SOCKBUF_LOCK(&so->so_snd); + so->so_snd.sb_flags &= ~SB_ASYNC; + SOCKBUF_UNLOCK(&so->so_snd); + } SOCK_UNLOCK(so); - SOCKBUF_LOCK(&so->so_rcv); - so->so_rcv.sb_flags &= ~SB_ASYNC; - SOCKBUF_UNLOCK(&so->so_rcv); - SOCKBUF_LOCK(&so->so_snd); - so->so_snd.sb_flags &= ~SB_ASYNC; - SOCKBUF_UNLOCK(&so->so_snd); } break; @@ -695,7 +699,6 @@ sb->sb_flags &= ~SB_AIO_RUNNING; SOCKBUF_UNLOCK(sb); - ACCEPT_LOCK(); SOCK_LOCK(so); sorele(so); } Index: sys/kern/uipc_accf.c =================================================================== --- sys/kern/uipc_accf.c +++ sys/kern/uipc_accf.c @@ -162,26 +162,25 @@ } int -do_getopt_accept_filter(struct socket *so, struct sockopt *sopt) +accept_filt_getopt(struct socket *so, struct sockopt *sopt) { struct accept_filter_arg *afap; int error; error = 0; - afap = malloc(sizeof(*afap), M_TEMP, - M_WAITOK | M_ZERO); + afap = malloc(sizeof(*afap), M_TEMP, M_WAITOK | M_ZERO); SOCK_LOCK(so); if ((so->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto out; } - if ((so->so_options & SO_ACCEPTFILTER) == 0) { + if (so->sol_accept_filter == NULL) { error = EINVAL; goto out; } - strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name); - if (so->so_accf->so_accept_filter_str != NULL) - strcpy(afap->af_arg, so->so_accf->so_accept_filter_str); + strcpy(afap->af_name, so->sol_accept_filter->accf_name); + if (so->sol_accept_filter_str != NULL) + strcpy(afap->af_arg, so->sol_accept_filter_str); out: SOCK_UNLOCK(so); if (error == 0) @@ -191,12 +190,13 @@ } int -do_setopt_accept_filter(struct socket *so, struct sockopt *sopt) +accept_filt_setopt(struct socket *so, struct sockopt *sopt) { struct accept_filter_arg *afap; struct accept_filter *afp; - struct so_accf *newaf; - int error = 0; + char *accept_filter_str = NULL; + void *accept_filter_arg = NULL; + int error; /* * Handle the simple delete case first. @@ -207,18 +207,15 @@ SOCK_UNLOCK(so); return (EINVAL); } - if (so->so_accf != NULL) { - struct so_accf *af = so->so_accf; - if (af->so_accept_filter != NULL && - af->so_accept_filter->accf_destroy != NULL) { - af->so_accept_filter->accf_destroy(so); - } - if (af->so_accept_filter_str != NULL) - free(af->so_accept_filter_str, M_ACCF); - free(af, M_ACCF); - so->so_accf = NULL; + if (so->sol_accept_filter != NULL) { + if (so->sol_accept_filter->accf_destroy != NULL) + so->sol_accept_filter->accf_destroy(so); + if (so->sol_accept_filter_str != NULL) + free(so->sol_accept_filter_str, M_ACCF); + so->sol_accept_filter = NULL; + so->sol_accept_filter_arg = NULL; + so->sol_accept_filter_str = NULL; } - so->so_options &= ~SO_ACCEPTFILTER; SOCK_UNLOCK(so); return (0); } @@ -227,8 +224,7 @@ * Pre-allocate any memory we may need later to avoid blocking at * untimely moments. This does not optimize for invalid arguments. */ - afap = malloc(sizeof(*afap), M_TEMP, - M_WAITOK); + afap = malloc(sizeof(*afap), M_TEMP, M_WAITOK); error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap); afap->af_name[sizeof(afap->af_name)-1] = '\0'; afap->af_arg[sizeof(afap->af_arg)-1] = '\0'; @@ -241,19 +237,10 @@ free(afap, M_TEMP); return (ENOENT); } - /* - * Allocate the new accept filter instance storage. We may - * have to free it again later if we fail to attach it. If - * attached properly, 'newaf' is NULLed to avoid a free() - * while in use. - */ - newaf = malloc(sizeof(*newaf), M_ACCF, M_WAITOK | - M_ZERO); if (afp->accf_create != NULL && afap->af_name[0] != '\0') { size_t len = strlen(afap->af_name) + 1; - newaf->so_accept_filter_str = malloc(len, M_ACCF, - M_WAITOK); - strcpy(newaf->so_accept_filter_str, afap->af_name); + accept_filter_str = malloc(len, M_ACCF, M_WAITOK); + strcpy(accept_filter_str, afap->af_name); } /* @@ -261,8 +248,8 @@ * without first removing it. */ SOCK_LOCK(so); - if (((so->so_options & SO_ACCEPTCONN) == 0) || - (so->so_accf != NULL)) { + if ((so->so_options & SO_ACCEPTCONN) == 0 || + so->sol_accept_filter != NULL) { error = EINVAL; goto out; } @@ -273,25 +260,19 @@ * can't block. */ if (afp->accf_create != NULL) { - newaf->so_accept_filter_arg = - afp->accf_create(so, afap->af_arg); - if (newaf->so_accept_filter_arg == NULL) { + accept_filter_arg = afp->accf_create(so, afap->af_arg); + if (accept_filter_arg == NULL) { error = EINVAL; goto out; } } - newaf->so_accept_filter = afp; - so->so_accf = newaf; - so->so_options |= SO_ACCEPTFILTER; - newaf = NULL; + so->sol_accept_filter = afp; + so->sol_accept_filter_arg = accept_filter_arg; + so->sol_accept_filter_str = accept_filter_str; out: SOCK_UNLOCK(so); - if (newaf != NULL) { - if (newaf->so_accept_filter_str != NULL) - free(newaf->so_accept_filter_str, M_ACCF); - free(newaf, M_ACCF); - } - if (afap != NULL) - free(afap, M_TEMP); + if (accept_filter_str != NULL) + free(accept_filter_str, M_ACCF); + free(afap, M_TEMP); return (error); } Index: sys/kern/uipc_debug.c =================================================================== --- sys/kern/uipc_debug.c +++ sys/kern/uipc_debug.c @@ -448,8 +448,6 @@ db_printf(")\n"); db_print_indent(indent); - db_printf("so_qstate: 0x%x (", so->so_qstate); - db_print_soqstate(so->so_qstate); db_printf(") "); db_printf("so_pcb: %p ", so->so_pcb); db_printf("so_proto: %p\n", so->so_proto); @@ -458,24 +456,28 @@ db_print_protosw(so->so_proto, "so_proto", indent); db_print_indent(indent); - db_printf("so_head: %p ", so->so_head); - db_printf("so_incomp first: %p ", TAILQ_FIRST(&so->so_incomp)); - db_printf("so_comp first: %p\n", TAILQ_FIRST(&so->so_comp)); - - db_print_indent(indent); - /* so_list skipped */ - db_printf("so_qlen: %u ", so->so_qlen); - db_printf("so_incqlen: %u ", so->so_incqlen); - db_printf("so_qlimit: %u ", so->so_qlimit); - db_printf("so_timeo: %d ", so->so_timeo); - db_printf("so_error: %d\n", so->so_error); - - db_print_indent(indent); - db_printf("so_sigio: %p ", so->so_sigio); - db_printf("so_oobmark: %lu ", so->so_oobmark); - - db_print_sockbuf(&so->so_rcv, "so_rcv", indent); - db_print_sockbuf(&so->so_snd, "so_snd", indent); + if (so->so_options & SO_ACCEPTCONN) { + db_printf("sol_incomp first: %p ", + TAILQ_FIRST(&so->sol_incomp)); + db_printf("sol_comp first: %p\n", TAILQ_FIRST(&so->sol_comp)); + db_printf("sol_qlen: %d ", so->sol_qlen); + db_printf("sol_incqlen: %d ", so->sol_incqlen); + db_printf("sol_qlimit: %d ", so->sol_qlimit); + } else { + db_printf("so_qstate: 0x%x (", so->so_qstate); + db_print_soqstate(so->so_qstate); + db_printf("so_listen: %p ", so->so_listen); + /* so_list skipped */ + db_printf("so_timeo: %d ", so->so_timeo); + db_printf("so_error: %d\n", so->so_error); + + db_print_indent(indent); + db_printf("so_sigio: %p ", so->so_sigio); + db_printf("so_oobmark: %lu ", so->so_oobmark); + + db_print_sockbuf(&so->so_rcv, "so_rcv", indent); + db_print_sockbuf(&so->so_snd, "so_snd", indent); + } } DB_SHOW_COMMAND(socket, db_show_socket) Index: sys/kern/uipc_sockbuf.c =================================================================== --- sys/kern/uipc_sockbuf.c +++ sys/kern/uipc_sockbuf.c @@ -314,14 +314,14 @@ SOCKBUF_LOCK_ASSERT(sb); - selwakeuppri(&sb->sb_sel, PSOCK); - if (!SEL_WAITING(&sb->sb_sel)) + selwakeuppri(sb->sb_sel, PSOCK); + if (!SEL_WAITING(sb->sb_sel)) sb->sb_flags &= ~SB_SEL; if (sb->sb_flags & SB_WAIT) { sb->sb_flags &= ~SB_WAIT; wakeup(&sb->sb_acc); } - KNOTE_LOCKED(&sb->sb_sel.si_note, 0); + KNOTE_LOCKED(&sb->sb_sel->si_note, 0); if (sb->sb_upcall != NULL) { ret = sb->sb_upcall(so, sb->sb_upcallarg, M_NOWAIT); if (ret == SU_ISCONNECTED) { Index: sys/kern/uipc_socket.c =================================================================== --- sys/kern/uipc_socket.c +++ sys/kern/uipc_socket.c @@ -106,6 +106,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_compat.h" +#include "opt_sctp.h" #include #include @@ -154,13 +155,22 @@ static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags); +static void so_rdknl_lock(void *); +static void so_rdknl_unlock(void *); +static void so_rdknl_assert_locked(void *); +static void so_rdknl_assert_unlocked(void *); +static void so_wrknl_lock(void *); +static void so_wrknl_unlock(void *); +static void so_wrknl_assert_locked(void *); +static void so_wrknl_assert_unlocked(void *); static void filt_sordetach(struct knote *kn); static int filt_soread(struct knote *kn, long hint); static void filt_sowdetach(struct knote *kn); static int filt_sowrite(struct knote *kn, long hint); -static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); static int filt_soempty(struct knote *kn, long hint); +static void solisten_wakeup(struct socket *); +static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); fo_kqfilter_t soo_kqfilter; static struct filterops soread_filtops = { @@ -393,8 +403,16 @@ return (NULL); } + /* + * The socket locking protocol allows to lock 2 sockets at a time, + * however, the first one must be a listening socket. WITNESS lacks + * a feature to change class of an existing lock, so we use DUPOK. + */ + mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK); SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd"); SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv"); + so->so_rcv.sb_sel = &so->so_rdsel; + so->so_snd.sb_sel = &so->so_wrsel; sx_init(&so->so_snd.sb_sx, "so_snd_sx"); sx_init(&so->so_rcv.sb_sx, "so_rcv_sx"); TAILQ_INIT(&so->so_snd.sb_aiojobq); @@ -450,9 +468,6 @@ if (so->so_snd.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); - /* remove accept filter if one is present. */ - if (so->so_accf != NULL) - do_setopt_accept_filter(so, NULL); #ifdef MAC mac_socket_destroy(so); #endif @@ -460,10 +475,16 @@ crfree(so->so_cred); khelp_destroy_osd(&so->osd); - sx_destroy(&so->so_snd.sb_sx); - sx_destroy(&so->so_rcv.sb_sx); - SOCKBUF_LOCK_DESTROY(&so->so_snd); - SOCKBUF_LOCK_DESTROY(&so->so_rcv); + if (SOLISTENING(so)) { + if (so->sol_accept_filter != NULL) + accept_filt_setopt(so, NULL); + } else { + sx_destroy(&so->so_snd.sb_sx); + sx_destroy(&so->so_rcv.sb_sx); + SOCKBUF_LOCK_DESTROY(&so->so_snd); + SOCKBUF_LOCK_DESTROY(&so->so_rcv); + } + mtx_destroy(&so->so_lock); uma_zfree(socket_zone, so); } @@ -506,8 +527,6 @@ if (so == NULL) return (ENOBUFS); - TAILQ_INIT(&so->so_incomp); - TAILQ_INIT(&so->so_comp); so->so_type = type; so->so_cred = crhold(cred); if ((prp->pr_domain->dom_family == PF_INET) || @@ -520,9 +539,10 @@ #ifdef MAC mac_socket_create(cred, so); #endif - knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv)); - knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd)); - so->so_count = 1; + knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, + so_rdknl_assert_locked, so_rdknl_assert_unlocked); + knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, + so_wrknl_assert_locked, so_wrknl_assert_unlocked); /* * Auto-sizing of socket buffers is managed by the protocols and * the appropriate flags must be set in the pru_attach function. @@ -531,12 +551,10 @@ error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); CURVNET_RESTORE(); if (error) { - KASSERT(so->so_count == 1, ("socreate: so_count %d", - so->so_count)); - so->so_count = 0; sodealloc(so); return (error); } + soref(so); *aso = so; return (0); } @@ -564,11 +582,11 @@ static int overcount; struct socket *so; - int over; + u_int over; - ACCEPT_LOCK(); - over = (head->so_qlen > 3 * head->so_qlimit / 2); - ACCEPT_UNLOCK(); + SOLISTEN_LOCK(head); + over = (head->sol_qlen > 3 * head->sol_qlimit / 2); + SOLISTEN_UNLOCK(head); #ifdef REGRESSION if (regression_sonewconn_earlytest && over) { #else @@ -580,15 +598,15 @@ log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: " "%i already in queue awaiting acceptance " "(%d occurrences)\n", - __func__, head->so_pcb, head->so_qlen, overcount); + __func__, head->so_pcb, head->sol_qlen, overcount); overcount = 0; } return (NULL); } - VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", - __func__, __LINE__, head)); + VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL", + __func__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " @@ -596,9 +614,9 @@ __func__, head->so_pcb); return (NULL); } - if ((head->so_options & SO_ACCEPTFILTER) != 0) + if (head->sol_accept_filter != NULL) connstatus = 0; - so->so_head = head; + so->so_listen = head; so->so_type = head->so_type; so->so_options = head->so_options &~ SO_ACCEPTCONN; so->so_linger = head->so_linger; @@ -609,10 +627,12 @@ #ifdef MAC mac_socket_newconn(head, so); #endif - knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv)); - knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd)); + knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, + so_rdknl_assert_locked, so_rdknl_assert_unlocked); + knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, + so_wrknl_assert_locked, so_wrknl_assert_unlocked); VNET_SO_ASSERT(head); - if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { + if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); @@ -624,32 +644,21 @@ __func__, head->so_pcb); return (NULL); } - so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; - so->so_snd.sb_lowat = head->so_snd.sb_lowat; - so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; - so->so_snd.sb_timeo = head->so_snd.sb_timeo; - so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; - so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; + so->so_rcv.sb_lowat = head->sol_sbrcv_lowat; + so->so_snd.sb_lowat = head->sol_sbsnd_lowat; + so->so_rcv.sb_timeo = head->sol_sbrcv_timeo; + so->so_snd.sb_timeo = head->sol_sbsnd_timeo; + so->so_rcv.sb_flags |= head->sol_sbrcv_flags & SB_AUTOSIZE; + so->so_snd.sb_flags |= head->sol_sbsnd_flags & SB_AUTOSIZE; so->so_state |= connstatus; - ACCEPT_LOCK(); - /* - * The accept socket may be tearing down but we just - * won a race on the ACCEPT_LOCK. - * However, if sctp_peeloff() is called on a 1-to-many - * style socket, the SO_ACCEPTCONN doesn't need to be set. - */ - if (!(head->so_options & SO_ACCEPTCONN) && - ((head->so_proto->pr_protocol != IPPROTO_SCTP) || - (head->so_type != SOCK_SEQPACKET))) { - SOCK_LOCK(so); - so->so_head = NULL; - sofree(so); /* NB: returns ACCEPT_UNLOCK'ed. */ - return (NULL); - } + + SOLISTEN_LOCK(head); + soref(head); /* A socket on (in)complete queue refs head. */ if (connstatus) { - TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); - so->so_qstate |= SQ_COMP; - head->so_qlen++; + TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); + so->so_qstate = SQ_COMP; + head->sol_qlen++; + solisten_wakeup(head); /* unlocks */ } else { /* * Keep removing sockets from the head until there's room for @@ -658,28 +667,86 @@ * threads and soabort() requires dropping locks, we must * loop waiting for the condition to be true. */ - while (head->so_incqlen > head->so_qlimit) { + while (head->sol_incqlen > head->sol_qlimit) { struct socket *sp; - sp = TAILQ_FIRST(&head->so_incomp); - TAILQ_REMOVE(&head->so_incomp, sp, so_list); - head->so_incqlen--; - sp->so_qstate &= ~SQ_INCOMP; - sp->so_head = NULL; - ACCEPT_UNLOCK(); + + sp = TAILQ_FIRST(&head->sol_incomp); + TAILQ_REMOVE(&head->sol_incomp, sp, so_list); + head->sol_incqlen--; + SOCK_LOCK(sp); + sp->so_qstate = SQ_NONE; + sp->so_listen = NULL; + SOCK_UNLOCK(sp); + sorele(head); /* does SOLISTEN_UNLOCK, head stays */ soabort(sp); - ACCEPT_LOCK(); + SOLISTEN_LOCK(head); } - TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); - so->so_qstate |= SQ_INCOMP; - head->so_incqlen++; + TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list); + so->so_qstate = SQ_INCOMP; + head->sol_incqlen++; + SOLISTEN_UNLOCK(head); } - ACCEPT_UNLOCK(); - if (connstatus) { - sorwakeup(head); - wakeup_one(&head->so_timeo); + return (so); +} + +#ifdef SCTP +/* + * Socket part of sctp_peeloff(). Detach a new socket from an + * association. The new socket is returned with a reference. + */ +struct socket * +sopeeloff(struct socket *head) +{ + struct socket *so; + + VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", + __func__, __LINE__, head)); + so = soalloc(head->so_vnet); + if (so == NULL) { + log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " + "limit reached or out of memory\n", + __func__, head->so_pcb); + return (NULL); + } + so->so_type = head->so_type; + so->so_options = head->so_options; + so->so_linger = head->so_linger; + so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED; + so->so_fibnum = head->so_fibnum; + so->so_proto = head->so_proto; + so->so_cred = crhold(head->so_cred); +#ifdef MAC + mac_socket_newconn(head, so); +#endif + knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, + so_rdknl_assert_locked, so_rdknl_assert_unlocked); + knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, + so_wrknl_assert_locked, so_wrknl_assert_unlocked); + VNET_SO_ASSERT(head); + if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { + sodealloc(so); + log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", + __func__, head->so_pcb); + return (NULL); + } + if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { + sodealloc(so); + log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", + __func__, head->so_pcb); + return (NULL); } + so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; + so->so_snd.sb_lowat = head->so_snd.sb_lowat; + so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; + so->so_snd.sb_timeo = head->so_snd.sb_timeo; + so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; + so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; + + soref(so); + return (so); } +#endif /* SCTP */ int sobind(struct socket *so, struct sockaddr *nam, struct thread *td) @@ -741,13 +808,137 @@ void solisten_proto(struct socket *so, int backlog) { + int sbrcv_lowat, sbsnd_lowat; + u_int sbrcv_hiwat, sbsnd_hiwat; + short sbrcv_flags, sbsnd_flags; + sbintime_t sbrcv_timeo, sbsnd_timeo; SOCK_LOCK_ASSERT(so); + if (SOLISTENING(so)) + goto listening; + + /* + * Change this socket to listening state. + */ + sbrcv_lowat = so->so_rcv.sb_lowat; + sbsnd_lowat = so->so_snd.sb_lowat; + sbrcv_hiwat = so->so_rcv.sb_hiwat; + sbsnd_hiwat = so->so_snd.sb_hiwat; + sbrcv_flags = so->so_rcv.sb_flags; + sbsnd_flags = so->so_snd.sb_flags; + sbrcv_timeo = so->so_rcv.sb_timeo; + sbsnd_timeo = so->so_snd.sb_timeo; + + sbdestroy(&so->so_snd, so); + sbdestroy(&so->so_rcv, so); + sx_destroy(&so->so_snd.sb_sx); + sx_destroy(&so->so_rcv.sb_sx); + SOCKBUF_LOCK_DESTROY(&so->so_snd); + SOCKBUF_LOCK_DESTROY(&so->so_rcv); + +#ifdef INVARIANTS + bzero(&so->so_rcv, + sizeof(struct socket) - offsetof(struct socket, so_rcv)); +#endif + + so->sol_sbrcv_lowat = sbrcv_lowat; + so->sol_sbsnd_lowat = sbsnd_lowat; + so->sol_sbrcv_hiwat = sbrcv_hiwat; + so->sol_sbsnd_hiwat = sbsnd_hiwat; + so->sol_sbrcv_flags = sbrcv_flags; + so->sol_sbsnd_flags = sbsnd_flags; + so->sol_sbrcv_timeo = sbrcv_timeo; + so->sol_sbsnd_timeo = sbsnd_timeo; + + so->sol_qlen = so->sol_incqlen = 0; + TAILQ_INIT(&so->sol_incomp); + TAILQ_INIT(&so->sol_comp); + + so->sol_accept_filter = NULL; + so->sol_accept_filter_arg = NULL; + so->sol_accept_filter_str = NULL; + + so->so_options |= SO_ACCEPTCONN; + +listening: if (backlog < 0 || backlog > somaxconn) backlog = somaxconn; - so->so_qlimit = backlog; - so->so_options |= SO_ACCEPTCONN; + so->sol_qlimit = backlog; +} + +/* + * Wakeup listeners/subsystems once we have a complete connection. + * Enters with lock, returns unlocked. + */ +static void +solisten_wakeup(struct socket *sol) +{ + + if (sol->sol_upcall != NULL) + (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT); + else { + selwakeuppri(&sol->so_rdsel, PSOCK); + KNOTE_LOCKED(&sol->so_rdsel.si_note, 0); + } + SOLISTEN_UNLOCK(sol); + wakeup_one(&sol->sol_comp); +} + +/* + * Return single connection off a listening socket queue. Main consumer of + * the function is kern_accept4(). Some modules, that do their own accept + * management also use the function. + * + * Listening socket must be locked on entry and is returned unlocked on + * return. + * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT. + */ +int +solisten_dequeue(struct socket *head, struct socket **ret, int flags) +{ + struct socket *so; + int error; + + SOLISTEN_LOCK_ASSERT(head); + + while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) && + head->so_error == 0) { + error = msleep(&head->sol_comp, &head->so_lock, PSOCK | PCATCH, + "accept", 0); + if (error != 0) { + SOLISTEN_UNLOCK(head); + return (error); + } + } + if (head->so_error) { + error = head->so_error; + head->so_error = 0; + SOLISTEN_UNLOCK(head); + return (error); + } + if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) { + SOLISTEN_UNLOCK(head); + return (EWOULDBLOCK); + } + so = TAILQ_FIRST(&head->sol_comp); + SOCK_LOCK(so); + KASSERT(so->so_qstate == SQ_COMP, + ("%s: so %p not SQ_COMP", __func__, so)); + soref(so); + head->sol_qlen--; + so->so_qstate = SQ_NONE; + so->so_listen = NULL; + TAILQ_REMOVE(&head->sol_comp, so, so_list); + if (flags & ACCEPT4_INHERIT) + so->so_state |= (head->so_state & SS_NBIO); + else + so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; + SOCK_UNLOCK(so); + sorele(head); + + *ret = so; + return (0); } /* @@ -774,44 +965,62 @@ sofree(struct socket *so) { struct protosw *pr = so->so_proto; - struct socket *head; - ACCEPT_LOCK_ASSERT(); SOCK_LOCK_ASSERT(so); if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 || - (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) { + (so->so_state & SS_PROTOREF) || (so->so_qstate == SQ_COMP)) { SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); return; } - head = so->so_head; - if (head != NULL) { - KASSERT((so->so_qstate & SQ_COMP) != 0 || - (so->so_qstate & SQ_INCOMP) != 0, - ("sofree: so_head != NULL, but neither SQ_COMP nor " - "SQ_INCOMP")); - KASSERT((so->so_qstate & SQ_COMP) == 0 || - (so->so_qstate & SQ_INCOMP) == 0, - ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP")); - TAILQ_REMOVE(&head->so_incomp, so, so_list); - head->so_incqlen--; - so->so_qstate &= ~SQ_INCOMP; - so->so_head = NULL; - } - KASSERT((so->so_qstate & SQ_COMP) == 0 && - (so->so_qstate & SQ_INCOMP) == 0, - ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)", - so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP)); - if (so->so_options & SO_ACCEPTCONN) { - KASSERT((TAILQ_EMPTY(&so->so_comp)), - ("sofree: so_comp populated")); - KASSERT((TAILQ_EMPTY(&so->so_incomp)), - ("sofree: so_incomp populated")); + if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) { + struct socket *sol; + + sol = so->so_listen; + KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so)); + + /* + * To solve race between close of a listening socket and + * a socket on its incomplete queue, we need to lock both. + * The order is first listening socket, then regular. + * Since we don't have SS_NOFDREF neither SS_PROTOREF, this + * function and the listening socket are the only pointers + * to so. To preserve so and sol, we reference both and then + * relock. + * After relock the socket may not move to so_comp since it + * doesn't have PCB already, but it may be removed from + * so_incomp. If that happens, we share responsiblity on + * freeing the socket, but soclose() has already removed + * it from queue. + */ + soref(sol); + soref(so); + SOCK_UNLOCK(so); + SOLISTEN_LOCK(sol); + SOCK_LOCK(so); + if (so->so_qstate == SQ_INCOMP) { + KASSERT(so->so_listen == sol, + ("%s: so %p migrated out of sol %p", + __func__, so, sol)); + TAILQ_REMOVE(&sol->sol_incomp, so, so_list); + sol->sol_incqlen--; + /* This is guarenteed not to be the last. */ + refcount_release(&sol->so_count); + so->so_qstate = SQ_NONE; + so->so_listen = NULL; + } else + KASSERT(so->so_listen == NULL, + ("%s: so %p not on (in)comp with so_listen", + __func__, so)); + sorele(sol); + KASSERT(so->so_count == 1, + ("%s: so %p count %u", __func__, so, so->so_count)); + so->so_count = 0; } + if (SOLISTENING(so)) + so->so_error = ECONNABORTED; SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); VNET_SO_ASSERT(so); if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) @@ -833,12 +1042,14 @@ * before calling pru_detach. This means that protocols shold not * assume they can perform socket wakeups, etc, in their detach code. */ - sbdestroy(&so->so_snd, so); - sbdestroy(&so->so_rcv, so); - seldrain(&so->so_snd.sb_sel); - seldrain(&so->so_rcv.sb_sel); - knlist_destroy(&so->so_rcv.sb_sel.si_note); - knlist_destroy(&so->so_snd.sb_sel.si_note); + if (!SOLISTENING(so)) { + sbdestroy(&so->so_snd, so); + sbdestroy(&so->so_rcv, so); + } + seldrain(&so->so_rdsel); + seldrain(&so->so_wrsel); + knlist_destroy(&so->so_rdsel.si_note); + knlist_destroy(&so->so_wrsel.si_note); sodealloc(so); } @@ -853,6 +1064,8 @@ int soclose(struct socket *so) { + struct accept_queue lqueue; + bool listening; int error = 0; KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); @@ -885,41 +1098,42 @@ drop: if (so->so_proto->pr_usrreqs->pru_close != NULL) (*so->so_proto->pr_usrreqs->pru_close)(so); - ACCEPT_LOCK(); - if (so->so_options & SO_ACCEPTCONN) { + + SOCK_LOCK(so); + if ((listening = (so->so_options & SO_ACCEPTCONN))) { struct socket *sp; - /* - * Prevent new additions to the accept queues due - * to ACCEPT_LOCK races while we are draining them. - */ - so->so_options &= ~SO_ACCEPTCONN; - while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) { - TAILQ_REMOVE(&so->so_incomp, sp, so_list); - so->so_incqlen--; - sp->so_qstate &= ~SQ_INCOMP; - sp->so_head = NULL; - ACCEPT_UNLOCK(); - soabort(sp); - ACCEPT_LOCK(); - } - while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) { - TAILQ_REMOVE(&so->so_comp, sp, so_list); - so->so_qlen--; - sp->so_qstate &= ~SQ_COMP; - sp->so_head = NULL; - ACCEPT_UNLOCK(); - soabort(sp); - ACCEPT_LOCK(); + + TAILQ_INIT(&lqueue); + TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list); + TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list); + + so->sol_qlen = so->sol_incqlen = 0; + + TAILQ_FOREACH(sp, &lqueue, so_list) { + SOCK_LOCK(sp); + sp->so_qstate = SQ_NONE; + sp->so_listen = NULL; + SOCK_UNLOCK(sp); + /* Guaranteed not to be the last. */ + refcount_release(&so->so_count); } - KASSERT((TAILQ_EMPTY(&so->so_comp)), - ("%s: so_comp populated", __func__)); - KASSERT((TAILQ_EMPTY(&so->so_incomp)), - ("%s: so_incomp populated", __func__)); } - SOCK_LOCK(so); KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); so->so_state |= SS_NOFDREF; - sorele(so); /* NB: Returns with ACCEPT_UNLOCK(). */ + sorele(so); + if (listening) { + struct socket *sp; + + TAILQ_FOREACH(sp, &lqueue, so_list) { + SOCK_LOCK(sp); + if (sp->so_count == 0) { + SOCK_UNLOCK(sp); + soabort(sp); + } else + /* sp is now in sofree() */ + SOCK_UNLOCK(sp); + } + } CURVNET_RESTORE(); return (error); } @@ -951,13 +1165,11 @@ KASSERT(so->so_count == 0, ("soabort: so_count")); KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF")); KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF")); - KASSERT((so->so_state & SQ_COMP) == 0, ("soabort: SQ_COMP")); - KASSERT((so->so_state & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP")); + KASSERT(so->so_qstate == SQ_NONE, ("soabort: !SQ_NONE")); VNET_SO_ASSERT(so); if (so->so_proto->pr_usrreqs->pru_abort != NULL) (*so->so_proto->pr_usrreqs->pru_abort)(so); - ACCEPT_LOCK(); SOCK_LOCK(so); sofree(so); } @@ -2511,7 +2723,7 @@ } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: - error = do_setopt_accept_filter(so, sopt); + error = accept_filt_setopt(so, sopt); if (error) goto bad; break; @@ -2769,7 +2981,7 @@ } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: - error = do_getopt_accept_filter(so, sopt); + error = accept_filt_getopt(so, sopt); break; case SO_LINGER: @@ -2877,15 +3089,15 @@ break; case SO_LISTENQLIMIT: - optval = so->so_qlimit; + optval = SOLISTENING(so) ? so->sol_qlimit : 0; goto integer; case SO_LISTENQLEN: - optval = so->so_qlen; + optval = SOLISTENING(so) ? so->sol_qlen : 0; goto integer; case SO_LISTENINCQLEN: - optval = so->so_incqlen; + optval = SOLISTENING(so) ? so->sol_incqlen : 0; goto integer; case SO_TS_CLOCK: @@ -3032,7 +3244,7 @@ if (so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGURG, 0); - selwakeuppri(&so->so_rcv.sb_sel, PSOCK); + selwakeuppri(&so->so_rdsel, PSOCK); } int @@ -3052,44 +3264,54 @@ sopoll_generic(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { - int revents = 0; - - SOCKBUF_LOCK(&so->so_snd); - SOCKBUF_LOCK(&so->so_rcv); - if (events & (POLLIN | POLLRDNORM)) - if (soreadabledata(so)) - revents |= events & (POLLIN | POLLRDNORM); - - if (events & (POLLOUT | POLLWRNORM)) - if (sowriteable(so)) - revents |= events & (POLLOUT | POLLWRNORM); + int revents; - if (events & (POLLPRI | POLLRDBAND)) - if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) - revents |= events & (POLLPRI | POLLRDBAND); - - if ((events & POLLINIGNEOF) == 0) { - if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { - revents |= events & (POLLIN | POLLRDNORM); - if (so->so_snd.sb_state & SBS_CANTSENDMORE) - revents |= POLLHUP; + SOCK_LOCK(so); + if (SOLISTENING(so)) { + if (!(events & (POLLIN | POLLRDNORM))) + revents = 0; + else if (!TAILQ_EMPTY(&so->sol_comp)) + revents = events & (POLLIN | POLLRDNORM); + else { + selrecord(td, &so->so_rdsel); + revents = 0; } - } - - if (revents == 0) { - if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { - selrecord(td, &so->so_rcv.sb_sel); - so->so_rcv.sb_flags |= SB_SEL; + } else { + revents = 0; + SOCKBUF_LOCK(&so->so_snd); + SOCKBUF_LOCK(&so->so_rcv); + if (events & (POLLIN | POLLRDNORM)) + if (soreadabledata(so)) + revents |= events & (POLLIN | POLLRDNORM); + if (events & (POLLOUT | POLLWRNORM)) + if (sowriteable(so)) + revents |= events & (POLLOUT | POLLWRNORM); + if (events & (POLLPRI | POLLRDBAND)) + if (so->so_oobmark || + (so->so_rcv.sb_state & SBS_RCVATMARK)) + revents |= events & (POLLPRI | POLLRDBAND); + if ((events & POLLINIGNEOF) == 0) { + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { + revents |= events & (POLLIN | POLLRDNORM); + if (so->so_snd.sb_state & SBS_CANTSENDMORE) + revents |= POLLHUP; + } } - - if (events & (POLLOUT | POLLWRNORM)) { - selrecord(td, &so->so_snd.sb_sel); - so->so_snd.sb_flags |= SB_SEL; + if (revents == 0) { + if (events & + (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { + selrecord(td, &so->so_rdsel); + so->so_rcv.sb_flags |= SB_SEL; + } + if (events & (POLLOUT | POLLWRNORM)) { + selrecord(td, &so->so_wrsel); + so->so_snd.sb_flags |= SB_SEL; + } } + SOCKBUF_UNLOCK(&so->so_rcv); + SOCKBUF_UNLOCK(&so->so_snd); } - - SOCKBUF_UNLOCK(&so->so_rcv); - SOCKBUF_UNLOCK(&so->so_snd); + SOCK_UNLOCK(so); return (revents); } @@ -3098,28 +3320,38 @@ { struct socket *so = kn->kn_fp->f_data; struct sockbuf *sb; + struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &soread_filtops; + knl = &so->so_rdsel.si_note; sb = &so->so_rcv; break; case EVFILT_WRITE: kn->kn_fop = &sowrite_filtops; + knl = &so->so_wrsel.si_note; sb = &so->so_snd; break; case EVFILT_EMPTY: kn->kn_fop = &soempty_filtops; + knl = &so->so_wrsel.si_note; sb = &so->so_snd; break; default: return (EINVAL); } - SOCKBUF_LOCK(sb); - knlist_add(&sb->sb_sel.si_note, kn, 1); - sb->sb_flags |= SB_KNOTE; - SOCKBUF_UNLOCK(sb); + SOCK_LOCK(so); + if (SOLISTENING(so)) { + knlist_add(knl, kn, 1); + } else { + SOCKBUF_LOCK(sb); + knlist_add(knl, kn, 1); + sb->sb_flags |= SB_KNOTE; + SOCKBUF_UNLOCK(sb); + } + SOCK_UNLOCK(so); return (0); } @@ -3298,11 +3530,11 @@ { struct socket *so = kn->kn_fp->f_data; - SOCKBUF_LOCK(&so->so_rcv); - knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1); - if (knlist_empty(&so->so_rcv.sb_sel.si_note)) + so_rdknl_lock(so); + knlist_remove(&so->so_rdsel.si_note, kn, 1); + if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) so->so_rcv.sb_flags &= ~SB_KNOTE; - SOCKBUF_UNLOCK(&so->so_rcv); + so_rdknl_unlock(so); } /*ARGSUSED*/ @@ -3312,11 +3544,13 @@ struct socket *so; so = kn->kn_fp->f_data; - if (so->so_options & SO_ACCEPTCONN) { - kn->kn_data = so->so_qlen; - return (!TAILQ_EMPTY(&so->so_comp)); + if (SOLISTENING(so)) { + SOCK_LOCK_ASSERT(so); + kn->kn_data = so->sol_qlen; + return (!TAILQ_EMPTY(&so->sol_comp)); } + SOCKBUF_LOCK_ASSERT(&so->so_rcv); kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; @@ -3342,11 +3576,11 @@ { struct socket *so = kn->kn_fp->f_data; - SOCKBUF_LOCK(&so->so_snd); - knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1); - if (knlist_empty(&so->so_snd.sb_sel.si_note)) + so_wrknl_lock(so); + knlist_remove(&so->so_wrsel.si_note, kn, 1); + if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) so->so_snd.sb_flags &= ~SB_KNOTE; - SOCKBUF_UNLOCK(&so->so_snd); + so_wrknl_unlock(so); } /*ARGSUSED*/ @@ -3356,6 +3590,10 @@ struct socket *so; so = kn->kn_fp->f_data; + + if (SOLISTENING(so)) + return (0); + SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbspace(&so->so_snd); @@ -3382,6 +3620,10 @@ struct socket *so; so = kn->kn_fp->f_data; + + if (SOLISTENING(so)) + return (1); + SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbused(&so->so_snd); @@ -3451,31 +3693,31 @@ int ret; restart: - ACCEPT_LOCK(); + if ((head = so->so_listen) != NULL) + SOLISTEN_LOCK(head); SOCK_LOCK(so); + /* + * XXXGL: should we re-check so->so_listen? + */ so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); so->so_state |= SS_ISCONNECTED; - head = so->so_head; - if (head != NULL && (so->so_qstate & SQ_INCOMP)) { + if (head != NULL && (so->so_qstate == SQ_INCOMP)) { if ((so->so_options & SO_ACCEPTFILTER) == 0) { + TAILQ_REMOVE(&head->sol_incomp, so, so_list); + head->sol_incqlen--; + TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); + head->sol_qlen++; + so->so_qstate = SQ_COMP; SOCK_UNLOCK(so); - TAILQ_REMOVE(&head->so_incomp, so, so_list); - head->so_incqlen--; - so->so_qstate &= ~SQ_INCOMP; - TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); - head->so_qlen++; - so->so_qstate |= SQ_COMP; - ACCEPT_UNLOCK(); - sorwakeup(head); - wakeup_one(&head->so_timeo); + solisten_wakeup(head); /* unlocks */ } else { - ACCEPT_UNLOCK(); + SOLISTEN_UNLOCK(head); soupcall_set(so, SO_RCV, - head->so_accf->so_accept_filter->accf_callback, - head->so_accf->so_accept_filter_arg); + head->sol_accept_filter->accf_callback, + head->sol_accept_filter_arg); so->so_options &= ~SO_ACCEPTFILTER; - ret = head->so_accf->so_accept_filter->accf_callback(so, - head->so_accf->so_accept_filter_arg, M_NOWAIT); + ret = head->sol_accept_filter->accf_callback(so, + head->sol_accept_filter_arg, M_NOWAIT); if (ret == SU_ISCONNECTED) soupcall_clear(so, SO_RCV); SOCK_UNLOCK(so); @@ -3484,8 +3726,9 @@ } return; } + if (head != NULL) + SOLISTEN_UNLOCK(head); SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); wakeup(&so->so_timeo); sorwakeup(so); sowwakeup(so); @@ -3495,16 +3738,17 @@ soisdisconnecting(struct socket *so) { - /* - * Note: This code assumes that SOCK_LOCK(so) and - * SOCKBUF_LOCK(&so->so_rcv) are the same. - */ - SOCKBUF_LOCK(&so->so_rcv); + SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTING; so->so_state |= SS_ISDISCONNECTING; - socantrcvmore_locked(so); - SOCKBUF_LOCK(&so->so_snd); - socantsendmore_locked(so); + + if (!SOLISTENING(so)) { + SOCKBUF_LOCK(&so->so_rcv); + socantrcvmore_locked(so); + SOCKBUF_LOCK(&so->so_snd); + socantsendmore_locked(so); + } + SOCK_UNLOCK(so); wakeup(&so->so_timeo); } @@ -3512,17 +3756,18 @@ soisdisconnected(struct socket *so) { - /* - * Note: This code assumes that SOCK_LOCK(so) and - * SOCKBUF_LOCK(&so->so_rcv) are the same. - */ - SOCKBUF_LOCK(&so->so_rcv); + SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISDISCONNECTED; - socantrcvmore_locked(so); - SOCKBUF_LOCK(&so->so_snd); - sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); - socantsendmore_locked(so); + + if (!SOLISTENING(so)) { + SOCKBUF_LOCK(&so->so_rcv); + socantrcvmore_locked(so); + SOCKBUF_LOCK(&so->so_snd); + sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); + socantsendmore_locked(so); + } + SOCK_UNLOCK(so); wakeup(&so->so_timeo); } @@ -3544,11 +3789,12 @@ * Register per-socket buffer upcalls. */ void -soupcall_set(struct socket *so, int which, - int (*func)(struct socket *, void *, int), void *arg) +soupcall_set(struct socket *so, int which, so_upcall_t func, void *arg) { struct sockbuf *sb; + KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); + switch (which) { case SO_RCV: sb = &so->so_rcv; @@ -3560,10 +3806,6 @@ panic("soupcall_set: bad which"); } SOCKBUF_LOCK_ASSERT(sb); -#if 0 - /* XXX: accf_http actually wants to do this on purpose. */ - KASSERT(sb->sb_upcall == NULL, ("soupcall_set: overwriting upcall")); -#endif sb->sb_upcall = func; sb->sb_upcallarg = arg; sb->sb_flags |= SB_UPCALL; @@ -3574,6 +3816,8 @@ { struct sockbuf *sb; + KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); + switch (which) { case SO_RCV: sb = &so->so_rcv; @@ -3585,12 +3829,110 @@ panic("soupcall_clear: bad which"); } SOCKBUF_LOCK_ASSERT(sb); - KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear")); + KASSERT(sb->sb_upcall != NULL, + ("%s: so %p no upcall to clear", __func__, so)); sb->sb_upcall = NULL; sb->sb_upcallarg = NULL; sb->sb_flags &= ~SB_UPCALL; } +void +solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg) +{ + + SOLISTEN_LOCK_ASSERT(so); + so->sol_upcall = func; + so->sol_upcallarg = arg; +} + +static void +so_rdknl_lock(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_LOCK(so); + else + SOCKBUF_LOCK(&so->so_rcv); +} + +static void +so_rdknl_unlock(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_UNLOCK(so); + else + SOCKBUF_UNLOCK(&so->so_rcv); +} + +static void +so_rdknl_assert_locked(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_LOCK_ASSERT(so); + else + SOCKBUF_LOCK_ASSERT(&so->so_rcv); +} + +static void +so_rdknl_assert_unlocked(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_UNLOCK_ASSERT(so); + else + SOCKBUF_UNLOCK_ASSERT(&so->so_rcv); +} + +static void +so_wrknl_lock(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_LOCK(so); + else + SOCKBUF_LOCK(&so->so_snd); +} + +static void +so_wrknl_unlock(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_UNLOCK(so); + else + SOCKBUF_UNLOCK(&so->so_snd); +} + +static void +so_wrknl_assert_locked(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_LOCK_ASSERT(so); + else + SOCKBUF_LOCK_ASSERT(&so->so_snd); +} + +static void +so_wrknl_assert_unlocked(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_UNLOCK_ASSERT(so); + else + SOCKBUF_UNLOCK_ASSERT(&so->so_snd); +} + /* * Create an external-format (``xsocket'') structure using the information in * the kernel-format socket structure pointed to by so. This is done to @@ -3612,32 +3954,24 @@ xso->so_pcb = so->so_pcb; xso->xso_protocol = so->so_proto->pr_protocol; xso->xso_family = so->so_proto->pr_domain->dom_family; - xso->so_qlen = so->so_qlen; - xso->so_incqlen = so->so_incqlen; - xso->so_qlimit = so->so_qlimit; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; - xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; - xso->so_oobmark = so->so_oobmark; - sbtoxsockbuf(&so->so_snd, &xso->so_snd); - sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); xso->so_uid = so->so_cred->cr_uid; -} - - -/* - * Socket accessor functions to provide external consumers with - * a safe interface to socket state - * - */ - -void -so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), - void *arg) -{ - - TAILQ_FOREACH(so, &so->so_comp, so_list) - func(so, arg); + xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; + if (SOLISTENING(so)) { + xso->so_qlen = so->sol_qlen; + xso->so_incqlen = so->sol_incqlen; + xso->so_qlimit = so->sol_qlimit; + xso->so_oobmark = 0; + bzero(&xso->so_snd, sizeof(xso->so_snd)); + bzero(&xso->so_rcv, sizeof(xso->so_rcv)); + } else { + xso->so_state |= so->so_qstate; + xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0; + xso->so_oobmark = so->so_oobmark; + sbtoxsockbuf(&so->so_snd, &xso->so_snd); + sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); + } } struct sockbuf * Index: sys/kern/uipc_syscalls.c =================================================================== --- sys/kern/uipc_syscalls.c +++ sys/kern/uipc_syscalls.c @@ -68,13 +68,6 @@ #include #include -/* - * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC - * and SOCK_NONBLOCK. - */ -#define ACCEPT4_INHERIT 0x1 -#define ACCEPT4_COMPAT 0x2 - static int sendit(struct thread *td, int s, struct msghdr *mp, int flags); static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp); @@ -350,59 +343,22 @@ (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0, &fcaps); if (error != 0) goto done; - ACCEPT_LOCK(); - if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { - ACCEPT_UNLOCK(); - error = EWOULDBLOCK; - goto noconnection; - } - while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { - if (head->so_rcv.sb_state & SBS_CANTRCVMORE) { - head->so_error = ECONNABORTED; - break; - } - error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH, - "accept", 0); - if (error != 0) { - ACCEPT_UNLOCK(); - goto noconnection; - } - } - if (head->so_error) { - error = head->so_error; - head->so_error = 0; - ACCEPT_UNLOCK(); + SOCK_LOCK(head); + if (!SOLISTENING(head)) { + SOCK_UNLOCK(head); + error = EINVAL; goto noconnection; } - so = TAILQ_FIRST(&head->so_comp); - KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP")); - KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP")); - /* - * Before changing the flags on the socket, we have to bump the - * reference count. Otherwise, if the protocol calls sofree(), - * the socket will be released due to a zero refcount. - */ - SOCK_LOCK(so); /* soref() and so_state update */ - soref(so); /* file descriptor reference */ - - TAILQ_REMOVE(&head->so_comp, so, so_list); - head->so_qlen--; - if (flags & ACCEPT4_INHERIT) - so->so_state |= (head->so_state & SS_NBIO); - else - so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; - - SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); + error = solisten_dequeue(head, &so, flags); + if (error != 0) + goto noconnection; /* An extra reference on `nfp' has been held for us by falloc(). */ td->td_retval[0] = fd; - /* connection has been removed from the listen queue */ - KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0); + /* Connection has been removed from the listen queue. */ + KNOTE_UNLOCKED(&head->so_rdsel.si_note, 0); if (flags & ACCEPT4_INHERIT) { pgid = fgetown(&head->so_sigio); @@ -420,7 +376,6 @@ (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td); tmp = fflag & FASYNC; (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td); - sa = NULL; error = soaccept(so, &sa); if (error != 0) goto noconnection; @@ -558,7 +513,7 @@ } SOCK_LOCK(so); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { - error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH, + error = msleep(&so->so_timeo, &so->so_lock, PSOCK | PCATCH, "connec", 0); if (error != 0) { if (error == EINTR || error == ERESTART) Index: sys/kern/uipc_usrreq.c =================================================================== --- sys/kern/uipc_usrreq.c +++ sys/kern/uipc_usrreq.c @@ -430,7 +430,7 @@ unp->unp_socket = so; so->so_pcb = unp; unp->unp_refcount = 1; - if (so->so_head != NULL) + if (so->so_listen != NULL) unp->unp_flags |= UNP_NASCENT; UNP_LIST_LOCK(); @@ -552,7 +552,7 @@ UNP_LINK_WLOCK(); UNP_PCB_LOCK(unp); - VOP_UNP_BIND(vp, unp->unp_socket); + VOP_UNP_BIND(vp, unp); unp->unp_vnode = vp; unp->unp_addr = soun; unp->unp_flags &= ~UNP_BINDING; @@ -607,6 +607,7 @@ uipc_close(struct socket *so) { struct unpcb *unp, *unp2; + struct vnode *vp = NULL; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_close: unp == NULL")); @@ -619,8 +620,14 @@ unp_disconnect(unp, unp2); UNP_PCB_UNLOCK(unp2); } + if (SOLISTENING(so) && ((vp = unp->unp_vnode) != NULL)) { + VOP_UNP_DETACH(vp); + unp->unp_vnode = NULL; + } UNP_PCB_UNLOCK(unp); UNP_LINK_WUNLOCK(); + if (vp) + vrele(vp); } static int @@ -663,16 +670,11 @@ --unp_count; UNP_LIST_UNLOCK(); - if ((unp->unp_flags & UNP_NASCENT) != 0) { - UNP_PCB_LOCK(unp); - goto teardown; - } UNP_LINK_WLOCK(); UNP_PCB_LOCK(unp); + if ((unp->unp_flags & UNP_NASCENT) != 0) + goto teardown; - /* - * XXXRW: Should assert vp->v_socket == so. - */ if ((vp = unp->unp_vnode) != NULL) { VOP_UNP_DETACH(vp); unp->unp_vnode = NULL; @@ -696,8 +698,8 @@ UNP_PCB_UNLOCK(ref); } local_unp_rights = unp_rights; - UNP_LINK_WUNLOCK(); teardown: + UNP_LINK_WUNLOCK(); unp->unp_socket->so_pcb = NULL; saved_unp_addr = unp->unp_addr; unp->unp_addr = NULL; @@ -761,7 +763,6 @@ error = solisten_proto_check(so); if (error == 0) { cru2x(td->td_ucred, &unp->unp_peercred); - unp->unp_flags |= UNP_HAVEPCCACHED; solisten_proto(so, backlog); } SOCK_UNLOCK(so); @@ -1319,7 +1320,7 @@ { struct sockaddr_un *soun = (struct sockaddr_un *)nam; struct vnode *vp; - struct socket *so2, *so3; + struct socket *so2; struct unpcb *unp, *unp2, *unp3; struct nameidata nd; char buf[SOCK_MAXADDRLEN]; @@ -1386,31 +1387,30 @@ * and to protect simultaneous locking of multiple pcbs. */ UNP_LINK_WLOCK(); - VOP_UNP_CONNECT(vp, &so2); - if (so2 == NULL) { + VOP_UNP_CONNECT(vp, &unp2); + if (unp2 == NULL) { error = ECONNREFUSED; goto bad2; } + so2 = unp2->unp_socket; if (so->so_type != so2->so_type) { error = EPROTOTYPE; goto bad2; } + UNP_PCB_LOCK(unp); + UNP_PCB_LOCK(unp2); if (so->so_proto->pr_flags & PR_CONNREQUIRED) { if (so2->so_options & SO_ACCEPTCONN) { CURVNET_SET(so2->so_vnet); - so3 = sonewconn(so2, 0); + so2 = sonewconn(so2, 0); CURVNET_RESTORE(); } else - so3 = NULL; - if (so3 == NULL) { + so2 = NULL; + if (so2 == NULL) { error = ECONNREFUSED; - goto bad2; + goto bad3; } - unp = sotounpcb(so); - unp2 = sotounpcb(so2); - unp3 = sotounpcb(so3); - UNP_PCB_LOCK(unp); - UNP_PCB_LOCK(unp2); + unp3 = sotounpcb(so2); UNP_PCB_LOCK(unp3); if (unp2->unp_addr != NULL) { bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len); @@ -1431,30 +1431,24 @@ * listen(); uipc_listen() cached that process's credentials * at that time so we can use them now. */ - KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED, - ("unp_connect: listener without cached peercred")); memcpy(&unp->unp_peercred, &unp2->unp_peercred, sizeof(unp->unp_peercred)); unp->unp_flags |= UNP_HAVEPC; if (unp2->unp_flags & UNP_WANTCRED) unp3->unp_flags |= UNP_WANTCRED; - UNP_PCB_UNLOCK(unp3); UNP_PCB_UNLOCK(unp2); - UNP_PCB_UNLOCK(unp); + unp2 = unp3; #ifdef MAC - mac_socketpeer_set_from_socket(so, so3); - mac_socketpeer_set_from_socket(so3, so); + mac_socketpeer_set_from_socket(so, so2); + mac_socketpeer_set_from_socket(so2, so); #endif - - so2 = so3; } - unp = sotounpcb(so); - KASSERT(unp != NULL, ("unp_connect: unp == NULL")); - unp2 = sotounpcb(so2); - KASSERT(unp2 != NULL, ("unp_connect: unp2 == NULL")); - UNP_PCB_LOCK(unp); - UNP_PCB_LOCK(unp2); + + KASSERT(unp2 != NULL && so2 != NULL && unp2->unp_socket == so2 && + sotounpcb(so2) == unp2, + ("%s: unp2 %p so2 %p", __func__, unp2, so2)); error = unp_connect2(so, so2, PRU_CONNECT); +bad3: UNP_PCB_UNLOCK(unp2); UNP_PCB_UNLOCK(unp); bad2: @@ -2237,8 +2231,7 @@ static void unp_gc_process(struct unpcb *unp) { - struct socket *soa; - struct socket *so; + struct socket *so, *soa; struct file *fp; /* Already processed. */ @@ -2258,28 +2251,30 @@ return; } - /* - * Mark all sockets we reference with RIGHTS. - */ so = unp->unp_socket; - if ((unp->unp_gcflag & UNPGC_IGNORE_RIGHTS) == 0) { - SOCKBUF_LOCK(&so->so_rcv); - unp_scan(so->so_rcv.sb_mb, unp_accessable); - SOCKBUF_UNLOCK(&so->so_rcv); - } - - /* - * Mark all sockets in our accept queue. - */ - ACCEPT_LOCK(); - TAILQ_FOREACH(soa, &so->so_comp, so_list) { - if ((sotounpcb(soa)->unp_gcflag & UNPGC_IGNORE_RIGHTS) != 0) - continue; - SOCKBUF_LOCK(&soa->so_rcv); - unp_scan(soa->so_rcv.sb_mb, unp_accessable); - SOCKBUF_UNLOCK(&soa->so_rcv); + SOCK_LOCK(so); + if (SOLISTENING(so)) { + /* + * Mark all sockets in our accept queue. + */ + TAILQ_FOREACH(soa, &so->sol_comp, so_list) { + if (sotounpcb(soa)->unp_gcflag & UNPGC_IGNORE_RIGHTS) + continue; + SOCKBUF_LOCK(&soa->so_rcv); + unp_scan(soa->so_rcv.sb_mb, unp_accessable); + SOCKBUF_UNLOCK(&soa->so_rcv); + } + } else { + /* + * Mark all sockets we reference with RIGHTS. + */ + if ((unp->unp_gcflag & UNPGC_IGNORE_RIGHTS) == 0) { + SOCKBUF_LOCK(&so->so_rcv); + unp_scan(so->so_rcv.sb_mb, unp_accessable); + SOCKBUF_UNLOCK(&so->so_rcv); + } } - ACCEPT_UNLOCK(); + SOCK_UNLOCK(so); unp->unp_gcflag |= UNPGC_SCANNED; } @@ -2399,7 +2394,8 @@ UNP_LIST_LOCK(); unp->unp_gcflag |= UNPGC_IGNORE_RIGHTS; UNP_LIST_UNLOCK(); - unp_dispose_mbuf(so->so_rcv.sb_mb); + if (!SOLISTENING(so)) + unp_dispose_mbuf(so->so_rcv.sb_mb); } static void @@ -2454,7 +2450,6 @@ void vfs_unp_reclaim(struct vnode *vp) { - struct socket *so; struct unpcb *unp; int active; @@ -2464,10 +2459,7 @@ active = 0; UNP_LINK_WLOCK(); - VOP_UNP_CONNECT(vp, &so); - if (so == NULL) - goto done; - unp = sotounpcb(so); + VOP_UNP_CONNECT(vp, &unp); if (unp == NULL) goto done; UNP_PCB_LOCK(unp); @@ -2503,10 +2495,6 @@ db_printf("%sUNP_HAVEPC", comma ? ", " : ""); comma = 1; } - if (unp_flags & UNP_HAVEPCCACHED) { - db_printf("%sUNP_HAVEPCCACHED", comma ? ", " : ""); - comma = 1; - } if (unp_flags & UNP_WANTCRED) { db_printf("%sUNP_WANTCRED", comma ? ", " : ""); comma = 1; Index: sys/kern/vfs_default.c =================================================================== --- sys/kern/vfs_default.c +++ sys/kern/vfs_default.c @@ -1128,7 +1128,7 @@ vop_stdunp_bind(struct vop_unp_bind_args *ap) { - ap->a_vp->v_socket = ap->a_socket; + ap->a_vp->v_unpcb = ap->a_unpcb; return (0); } @@ -1136,7 +1136,7 @@ vop_stdunp_connect(struct vop_unp_connect_args *ap) { - *ap->a_socket = ap->a_vp->v_socket; + *ap->a_unpcb = ap->a_vp->v_unpcb; return (0); } @@ -1144,7 +1144,7 @@ vop_stdunp_detach(struct vop_unp_detach_args *ap) { - ap->a_vp->v_socket = NULL; + ap->a_vp->v_unpcb = NULL; return (0); } Index: sys/kern/vfs_subr.c =================================================================== --- sys/kern/vfs_subr.c +++ sys/kern/vfs_subr.c @@ -2992,7 +2992,10 @@ /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */ vp->v_op = NULL; #endif - bzero(&vp->v_un, sizeof(vp->v_un)); + vp->v_mountedhere = NULL; + vp->v_unpcb = NULL; + vp->v_rdev = NULL; + vp->v_fifoinfo = NULL; vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; vp->v_iflag = 0; vp->v_vflag = 0; Index: sys/kern/vnode_if.src =================================================================== --- sys/kern/vnode_if.src +++ sys/kern/vnode_if.src @@ -662,7 +662,7 @@ vop_unp_bind { IN struct vnode *vp; - IN struct socket *socket; + IN struct unpcb *unpcb; }; @@ -670,7 +670,7 @@ vop_unp_connect { IN struct vnode *vp; - OUT struct socket **socket; + OUT struct unpcb **unpcb; }; Index: sys/netgraph/bluetooth/socket/ng_btsocket_l2cap.c =================================================================== --- sys/netgraph/bluetooth/socket/ng_btsocket_l2cap.c +++ sys/netgraph/bluetooth/socket/ng_btsocket_l2cap.c @@ -614,21 +614,13 @@ pcb = ng_btsocket_l2cap_pcb_by_addr(&rt->src, ip->psm); if (pcb != NULL) { - struct socket *so1 = NULL; + struct socket *so1; mtx_lock(&pcb->pcb_mtx); - /* - * First check the pending connections queue and if we have - * space then create new socket and set proper source address. - */ - - if (pcb->so->so_qlen <= pcb->so->so_qlimit) { - CURVNET_SET(pcb->so->so_vnet); - so1 = sonewconn(pcb->so, 0); - CURVNET_RESTORE(); - } - + CURVNET_SET(pcb->so->so_vnet); + so1 = sonewconn(pcb->so, 0); + CURVNET_RESTORE(); if (so1 == NULL) { result = NG_L2CAP_NO_RESOURCES; goto respond; Index: sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c =================================================================== --- sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c +++ sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c @@ -1149,7 +1149,7 @@ { ng_btsocket_rfcomm_pcb_p pcb = NULL, pcb1 = NULL; ng_btsocket_l2cap_pcb_p l2pcb = NULL; - struct socket *so1 = NULL; + struct socket *so1; mtx_assert(&s->session_mtx, MA_OWNED); @@ -1171,11 +1171,9 @@ mtx_lock(&pcb->pcb_mtx); - if (pcb->so->so_qlen <= pcb->so->so_qlimit) { - CURVNET_SET(pcb->so->so_vnet); - so1 = sonewconn(pcb->so, 0); - CURVNET_RESTORE(); - } + CURVNET_SET(pcb->so->so_vnet); + so1 = sonewconn(pcb->so, 0); + CURVNET_RESTORE(); mtx_unlock(&pcb->pcb_mtx); @@ -1405,47 +1403,25 @@ static int ng_btsocket_rfcomm_session_accept(ng_btsocket_rfcomm_session_p s0) { - struct socket *l2so = NULL; + struct socket *l2so; struct sockaddr_l2cap *l2sa = NULL; ng_btsocket_l2cap_pcb_t *l2pcb = NULL; ng_btsocket_rfcomm_session_p s = NULL; - int error = 0; + int error; mtx_assert(&ng_btsocket_rfcomm_sessions_mtx, MA_OWNED); mtx_assert(&s0->session_mtx, MA_OWNED); - /* Check if there is a complete L2CAP connection in the queue */ - if ((error = s0->l2so->so_error) != 0) { + SOLISTEN_LOCK(s0->l2so); + error = solisten_dequeue(s0->l2so, &l2so, 0); + if (error == EWOULDBLOCK) + return (error); + if (error) { NG_BTSOCKET_RFCOMM_ERR( "%s: Could not accept connection on L2CAP socket, error=%d\n", __func__, error); - s0->l2so->so_error = 0; - return (error); } - ACCEPT_LOCK(); - if (TAILQ_EMPTY(&s0->l2so->so_comp)) { - ACCEPT_UNLOCK(); - if (s0->l2so->so_rcv.sb_state & SBS_CANTRCVMORE) - return (ECONNABORTED); - return (EWOULDBLOCK); - } - - /* Accept incoming L2CAP connection */ - l2so = TAILQ_FIRST(&s0->l2so->so_comp); - if (l2so == NULL) - panic("%s: l2so == NULL\n", __func__); - - TAILQ_REMOVE(&s0->l2so->so_comp, l2so, so_list); - s0->l2so->so_qlen --; - l2so->so_qstate &= ~SQ_COMP; - l2so->so_head = NULL; - SOCK_LOCK(l2so); - soref(l2so); - l2so->so_state |= SS_NBIO; - SOCK_UNLOCK(l2so); - ACCEPT_UNLOCK(); - error = soaccept(l2so, (struct sockaddr **) &l2sa); if (error != 0) { NG_BTSOCKET_RFCOMM_ERR( Index: sys/netgraph/bluetooth/socket/ng_btsocket_sco.c =================================================================== --- sys/netgraph/bluetooth/socket/ng_btsocket_sco.c +++ sys/netgraph/bluetooth/socket/ng_btsocket_sco.c @@ -471,20 +471,13 @@ pcb = ng_btsocket_sco_pcb_by_addr(&rt->src); if (pcb != NULL) { - struct socket *so1 = NULL; + struct socket *so1; /* pcb is locked */ - /* - * First check the pending connections queue and if we have - * space then create new socket and set proper source address. - */ - - if (pcb->so->so_qlen <= pcb->so->so_qlimit) { - CURVNET_SET(pcb->so->so_vnet); - so1 = sonewconn(pcb->so, 0); - CURVNET_RESTORE(); - } + CURVNET_SET(pcb->so->so_vnet); + so1 = sonewconn(pcb->so, 0); + CURVNET_RESTORE(); if (so1 == NULL) { status = 0x0d; /* Rejected due to limited resources */ Index: sys/netgraph/ng_ksocket.c =================================================================== --- sys/netgraph/ng_ksocket.c +++ sys/netgraph/ng_ksocket.c @@ -153,8 +153,7 @@ }; /* Helper functions */ -static int ng_ksocket_check_accept(priv_p); -static void ng_ksocket_finish_accept(priv_p); +static int ng_ksocket_accept(priv_p); static int ng_ksocket_incoming(struct socket *so, void *arg, int waitflag); static int ng_ksocket_parse(const struct ng_ksocket_alias *aliases, const char *s, int family); @@ -698,6 +697,7 @@ ERROUT(ENXIO); /* Listen */ + so->so_state |= SS_NBIO; error = solisten(so, *((int32_t *)msg->data), td); break; } @@ -716,21 +716,16 @@ if (priv->flags & KSF_ACCEPTING) ERROUT(EALREADY); - error = ng_ksocket_check_accept(priv); - if (error != 0 && error != EWOULDBLOCK) - ERROUT(error); - /* * If a connection is already complete, take it. * Otherwise let the upcall function deal with * the connection when it comes in. */ + error = ng_ksocket_accept(priv); + if (error != 0 && error != EWOULDBLOCK) + ERROUT(error); priv->response_token = msg->header.token; raddr = priv->response_addr = NGI_RETADDR(item); - if (error == 0) { - ng_ksocket_finish_accept(priv); - } else - priv->flags |= KSF_ACCEPTING; break; } @@ -1068,13 +1063,8 @@ } /* Check whether a pending accept operation has completed */ - if (priv->flags & KSF_ACCEPTING) { - error = ng_ksocket_check_accept(priv); - if (error != EWOULDBLOCK) - priv->flags &= ~KSF_ACCEPTING; - if (error == 0) - ng_ksocket_finish_accept(priv); - } + if (priv->flags & KSF_ACCEPTING) + (void )ng_ksocket_accept(priv); /* * If we don't have a hook, we must handle data events later. When @@ -1171,35 +1161,8 @@ } } -/* - * Check for a completed incoming connection and return 0 if one is found. - * Otherwise return the appropriate error code. - */ static int -ng_ksocket_check_accept(priv_p priv) -{ - struct socket *const head = priv->so; - int error; - - if ((error = head->so_error) != 0) { - head->so_error = 0; - return error; - } - /* Unlocked read. */ - if (TAILQ_EMPTY(&head->so_comp)) { - if (head->so_rcv.sb_state & SBS_CANTRCVMORE) - return ECONNABORTED; - return EWOULDBLOCK; - } - return 0; -} - -/* - * Handle the first completed incoming connection, assumed to be already - * on the socket's so_comp queue. - */ -static void -ng_ksocket_finish_accept(priv_p priv) +ng_ksocket_accept(priv_p priv) { struct socket *const head = priv->so; struct socket *so; @@ -1211,23 +1174,15 @@ int len; int error; - ACCEPT_LOCK(); - so = TAILQ_FIRST(&head->so_comp); - if (so == NULL) { /* Should never happen */ - ACCEPT_UNLOCK(); - return; + SOLISTEN_LOCK(head); + error = solisten_dequeue(head, &so, SOCK_NONBLOCK); + if (error == EWOULDBLOCK) { + priv->flags |= KSF_ACCEPTING; + return (error); } - TAILQ_REMOVE(&head->so_comp, so, so_list); - head->so_qlen--; - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; - SOCK_LOCK(so); - soref(so); - so->so_state |= SS_NBIO; - SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); - - /* XXX KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0); */ + priv->flags &= ~KSF_ACCEPTING; + if (error) + return (error); soaccept(so, &sa); @@ -1288,6 +1243,8 @@ out: if (sa != NULL) free(sa, M_SONAME); + + return (0); } /* Index: sys/netinet/sctp_input.c =================================================================== --- sys/netinet/sctp_input.c +++ sys/netinet/sctp_input.c @@ -161,13 +161,12 @@ *abort_no_unlock = 1; goto outnow; } - /* We are only accepting if we have a socket with positive - * so_qlimit. */ + /* We are only accepting if we have a listening socket. */ if ((stcb == NULL) && ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || (inp->sctp_socket == NULL) || - (inp->sctp_socket->so_qlimit == 0))) { + (!SOLISTENING(inp->sctp_socket)))) { /* * FIX ME ?? What about TCP model and we have a * match/restart case? Actually no fix is needed. the lookup @@ -1605,7 +1604,7 @@ sctp_stop_all_cookie_timers(stcb); if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) && - (inp->sctp_socket->so_qlimit == 0) + (!SOLISTENING(inp->sctp_socket)) ) { #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; @@ -1806,7 +1805,7 @@ if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) && - (inp->sctp_socket->so_qlimit == 0)) { + (!SOLISTENING(inp->sctp_socket))) { #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; #endif @@ -2317,7 +2316,7 @@ *notification = SCTP_NOTIFY_ASSOC_UP; if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) && - (inp->sctp_socket->so_qlimit == 0)) { + (!SOLISTENING(inp->sctp_socket))) { /* * This is an endpoint that called connect() how it got a * cookie that is NEW is a bit of a mystery. It must be that @@ -2343,7 +2342,7 @@ SCTP_SOCKET_UNLOCK(so, 1); #endif } else if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && - (inp->sctp_socket->so_qlimit)) { + SOLISTENING(inp->sctp_socket)) { /* * We don't want to do anything with this one. Since it is * the listening guy. The timer will get started for @@ -5205,7 +5204,7 @@ * longer listening. */ - if ((stcb == NULL) && (inp->sctp_socket->so_qlen >= inp->sctp_socket->so_qlimit)) { + if ((stcb == NULL) && (!SOLISTENING(inp->sctp_socket))) { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && (SCTP_BASE_SYSCTL(sctp_abort_if_one_2_one_hits_limit))) { op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); Index: sys/netinet/sctp_output.c =================================================================== --- sys/netinet/sctp_output.c +++ sys/netinet/sctp_output.c @@ -12595,7 +12595,7 @@ (void *)addr, sndlen); if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && - (inp->sctp_socket->so_qlimit)) { + SOLISTENING(inp->sctp_socket)) { /* The listener can NOT send */ SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP_OUTPUT, ENOTCONN); error = ENOTCONN; Index: sys/netinet/sctp_pcb.c =================================================================== --- sys/netinet/sctp_pcb.c +++ sys/netinet/sctp_pcb.c @@ -1311,7 +1311,7 @@ * it is the acceptor, then do the special_lookup to hash * and find the real inp. */ - if ((inp->sctp_socket) && (inp->sctp_socket->so_qlimit)) { + if ((inp->sctp_socket) && SOLISTENING(inp->sctp_socket)) { /* to is peer addr, from is my addr */ stcb = sctp_tcb_special_locate(inp_p, remote, local, netp, inp->def_vrf_id); @@ -1884,7 +1884,7 @@ if (tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) { continue; } - if (tinp->sctp_socket->so_qlimit) { + if (SOLISTENING(tinp->sctp_socket)) { continue; } SCTP_INP_WLOCK(tinp); Index: sys/netinet/sctp_syscalls.c =================================================================== --- sys/netinet/sctp_syscalls.c +++ sys/netinet/sctp_syscalls.c @@ -152,29 +152,11 @@ td->td_retval[0] = fd; CURVNET_SET(head->so_vnet); - so = sonewconn(head, SS_ISCONNECTED); + so = sopeeloff(head); if (so == NULL) { error = ENOMEM; goto noconnection; } - /* - * Before changing the flags on the socket, we have to bump the - * reference count. Otherwise, if the protocol calls sofree(), - * the socket will be released due to a zero refcount. - */ - SOCK_LOCK(so); - soref(so); /* file descriptor reference */ - SOCK_UNLOCK(so); - - ACCEPT_LOCK(); - - TAILQ_REMOVE(&head->so_comp, so, so_list); - head->so_qlen--; - so->so_state |= (head->so_state & SS_NBIO); - so->so_state &= ~SS_NOFDREF; - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; - ACCEPT_UNLOCK(); finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name); if (error != 0) Index: sys/netinet/sctp_sysctl.c =================================================================== --- sys/netinet/sctp_sysctl.c +++ sys/netinet/sctp_sysctl.c @@ -410,16 +410,17 @@ xinpcb.socket = inp->sctp_socket; so = inp->sctp_socket; if ((so == NULL) || + !SOLISTENING(so) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) { xinpcb.qlen = 0; xinpcb.maxqlen = 0; } else { - xinpcb.qlen = so->so_qlen; - xinpcb.qlen_old = so->so_qlen > USHRT_MAX ? - USHRT_MAX : (uint16_t)so->so_qlen; - xinpcb.maxqlen = so->so_qlimit; - xinpcb.maxqlen_old = so->so_qlimit > USHRT_MAX ? - USHRT_MAX : (uint16_t)so->so_qlimit; + xinpcb.qlen = so->sol_qlen; + xinpcb.qlen_old = so->sol_qlen > USHRT_MAX ? + USHRT_MAX : (uint16_t)so->sol_qlen; + xinpcb.maxqlen = so->sol_qlimit; + xinpcb.maxqlen_old = so->sol_qlimit > USHRT_MAX ? + USHRT_MAX : (uint16_t)so->sol_qlimit; } SCTP_INP_INCR_REF(inp); SCTP_INP_RUNLOCK(inp); Index: sys/netinet/sctp_usrreq.c =================================================================== --- sys/netinet/sctp_usrreq.c +++ sys/netinet/sctp_usrreq.c @@ -7005,6 +7005,8 @@ SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); return (ECONNRESET); } + if (inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) + return (EOPNOTSUPP); if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) { /* See if we have a listener */ struct sctp_inpcb *tinp; @@ -7034,7 +7036,7 @@ if (tinp && (tinp != inp) && ((tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) == 0) && ((tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) && - (tinp->sctp_socket->so_qlimit)) { + SOLISTENING(tinp->sctp_socket)) { /* * we have a listener already and * its not this inp. @@ -7078,7 +7080,7 @@ if (tinp && (tinp != inp) && ((tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) == 0) && ((tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) && - (tinp->sctp_socket->so_qlimit)) { + SOLISTENING(tinp->sctp_socket)) { /* * we have a listener already and its not * this inp. @@ -7133,16 +7135,7 @@ } } SOCK_LOCK(so); - /* It appears for 7.0 and on, we must always call this. */ solisten_proto(so, backlog); - if (inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) { - /* remove the ACCEPTCONN flag for one-to-many sockets */ - so->so_options &= ~SO_ACCEPTCONN; - } - if (backlog == 0) { - /* turning off listen */ - so->so_options &= ~SO_ACCEPTCONN; - } SOCK_UNLOCK(so); return (error); } Index: sys/netinet/tcp_input.c =================================================================== --- sys/netinet/tcp_input.c +++ sys/netinet/tcp_input.c @@ -1387,10 +1387,10 @@ TCP_PROBE3(debug__input, tp, th, m); tcp_dooptions(&to, optp, optlen, TO_SYN); #ifdef TCP_RFC7413 - if (syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL)) + if (syncache_add(&inc, &to, th, inp, so, m, NULL, NULL)) goto tfo_socket_result; #else - syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL); + syncache_add(&inc, &to, th, inp, so, m, NULL, NULL); #endif /* * Entry added to syncache and mbuf consumed. Index: sys/netinet/tcp_subr.c =================================================================== --- sys/netinet/tcp_subr.c +++ sys/netinet/tcp_subr.c @@ -1576,7 +1576,6 @@ ("tcp_close: !SS_PROTOREF")); inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); - ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); Index: sys/netinet/tcp_syncache.h =================================================================== --- sys/netinet/tcp_syncache.h +++ sys/netinet/tcp_syncache.h @@ -42,7 +42,7 @@ int syncache_expand(struct in_conninfo *, struct tcpopt *, struct tcphdr *, struct socket **, struct mbuf *); int syncache_add(struct in_conninfo *, struct tcpopt *, - struct tcphdr *, struct inpcb *, struct socket **, struct mbuf *, + struct tcphdr *, struct inpcb *, struct socket *, struct mbuf *, void *, void *); void syncache_chkrst(struct in_conninfo *, struct tcphdr *); void syncache_badack(struct in_conninfo *); Index: sys/netinet/tcp_syncache.c =================================================================== --- sys/netinet/tcp_syncache.c +++ sys/netinet/tcp_syncache.c @@ -146,9 +146,9 @@ struct socket *); static void syncookie_reseed(void *); #ifdef INVARIANTS -static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, - struct syncache *sc, struct tcphdr *th, struct tcpopt *to, - struct socket *lso); +static int syncookie_cmp(struct in_conninfo *, struct syncache_head *, + struct syncache *, struct tcphdr *, struct tcpopt *, + struct socket *); #endif /* @@ -1160,10 +1160,11 @@ } #ifdef TCP_RFC7413 -static void -syncache_tfo_expand(struct syncache *sc, struct socket **lsop, struct mbuf *m, +static struct socket * +syncache_tfo_expand(struct syncache *sc, struct socket *lso, struct mbuf *m, uint64_t response_cookie) { + struct socket *so; struct inpcb *inp; struct tcpcb *tp; unsigned int *pending_counter; @@ -1175,12 +1176,12 @@ INP_INFO_RLOCK_ASSERT(&V_tcbinfo); pending_counter = intotcpcb(sotoinpcb(*lsop))->t_tfo_pending; - *lsop = syncache_socket(sc, *lsop, m); - if (*lsop == NULL) { + so = syncache_socket(sc, lso, m); + if (so == NULL) { TCPSTAT_INC(tcps_sc_aborted); atomic_subtract_int(pending_counter, 1); } else { - inp = sotoinpcb(*lsop); + inp = sotoinpcb(so); tp = intotcpcb(inp); tp->t_flags |= TF_FASTOPEN; tp->t_tfo_cookie = response_cookie; @@ -1189,6 +1190,7 @@ tp->t_tfo_pending = pending_counter; TCPSTAT_INC(tcps_sc_completed); } + return (so); } #endif /* TCP_RFC7413 */ @@ -1213,11 +1215,10 @@ */ int syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, - struct inpcb *inp, struct socket **lsop, struct mbuf *m, void *tod, + struct inpcb *inp, struct socket *so, struct mbuf *m, void *tod, void *todctx) { struct tcpcb *tp; - struct socket *so; struct syncache *sc = NULL; struct syncache_head *sch; struct mbuf *ipopts = NULL; @@ -1248,7 +1249,7 @@ * Combine all so/tp operations very early to drop the INP lock as * soon as possible. */ - so = *lsop; + KASSERT(SOLISTENING(so), ("%s: %p not listening", __func__, so)); tp = sototcpcb(so); cred = crhold(so->so_cred); @@ -1259,7 +1260,7 @@ #endif ip_ttl = inp->inp_ip_ttl; ip_tos = inp->inp_ip_tos; - win = sbspace(&so->so_rcv); + win = so->sol_sbrcv_hiwat; ltflags = (tp->t_flags & (TF_NOOPT | TF_SIGNATURE)); #ifdef TCP_RFC7413 @@ -1272,7 +1273,7 @@ * listen queue with bogus TFO connections. */ if (atomic_fetchadd_int(tp->t_tfo_pending, 1) <= - (so->so_qlimit / 2)) { + (so->sol_qlimit / 2)) { int result; result = tcp_fastopen_check_cookie(inc, @@ -1566,10 +1567,7 @@ } done: - if (m) { - *lsop = NULL; - m_freem(m); - } + m_freem(m); #ifdef TCP_RFC7413 /* * If tfo_pending is not NULL here, then a TFO SYN that did not @@ -2098,7 +2096,7 @@ sc->sc_flags |= SCF_WINSCALE; } - wnd = sbspace(&lso->so_rcv); + wnd = lso->sol_sbrcv_hiwat; wnd = imax(wnd, 0); wnd = imin(wnd, TCP_MAXWIN); sc->sc_wnd = wnd; Index: sys/netinet/tcp_timewait.c =================================================================== --- sys/netinet/tcp_timewait.c +++ sys/netinet/tcp_timewait.c @@ -352,7 +352,6 @@ ("tcp_twstart: !SS_PROTOREF")); inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); - ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); @@ -491,7 +490,6 @@ if (inp->inp_flags & INP_SOCKREF) { inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); - ACCEPT_LOCK(); SOCK_LOCK(so); KASSERT(so->so_state & SS_PROTOREF, ("tcp_twclose: INP_SOCKREF && !SS_PROTOREF")); Index: sys/netinet/toecore.c =================================================================== --- sys/netinet/toecore.c +++ sys/netinet/toecore.c @@ -331,7 +331,7 @@ INP_WLOCK_ASSERT(inp); - syncache_add(inc, to, th, inp, &lso, NULL, tod, todctx); + syncache_add(inc, to, th, inp, lso, NULL, tod, todctx); } int Index: sys/ofed/drivers/infiniband/core/iwcm.c =================================================================== --- sys/ofed/drivers/infiniband/core/iwcm.c +++ sys/ofed/drivers/infiniband/core/iwcm.c @@ -416,34 +416,19 @@ { struct socket *so; struct sockaddr_in *remote; + int error; - ACCEPT_LOCK(); - so = TAILQ_FIRST(&head->so_comp); - if (!so) { - ACCEPT_UNLOCK(); - return NULL; - } - - SOCK_LOCK(so); - /* - * Before changing the flags on the socket, we have to bump the - * reference count. Otherwise, if the protocol calls sofree(), - * the socket will be released due to a zero refcount. - */ - soref(so); - TAILQ_REMOVE(&head->so_comp, so, so_list); - head->so_qlen--; - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; - so->so_state |= SS_NBIO; - SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); + SOLISTEN_LOCK(head); + error = solisten_dequeue(head, &so, SOCK_NONBLOCK); + if (error == EWOULDBLOCK) + return (NULL); remote = NULL; soaccept(so, (struct sockaddr **)&remote); free(remote, M_SONAME); return so; } + static void iw_so_event_handler(struct work_struct *_work) { @@ -485,18 +470,17 @@ #endif return; } + static int iw_so_upcall(struct socket *parent_so, void *arg, int waitflag) { struct iwcm_listen_work *work; - struct socket *so; struct iw_cm_id *cm_id = arg; /* check whether iw_so_event_handler() already dequeued this 'so' */ - so = TAILQ_FIRST(&parent_so->so_comp); - if (!so) + if (TAILQ_EMPTY(&parent_so->sol_comp)) return SU_OK; - work = kzalloc(sizeof(*work), M_NOWAIT); + work = kzalloc(sizeof(*work), waitflag); if (!work) return -ENOMEM; work->cm_id = cm_id; @@ -507,17 +491,21 @@ return SU_OK; } -static void -iw_init_sock(struct iw_cm_id *cm_id) +static int +iw_create_listen(struct iw_cm_id *cm_id, int backlog) { struct sockopt sopt; struct socket *so = cm_id->so; int on = 1; + int rc; - SOCK_LOCK(so); - soupcall_set(so, SO_RCV, iw_so_upcall, cm_id); + rc = -solisten(cm_id->so, backlog, curthread); + if (rc != 0) + return (rc); + SOLISTEN_LOCK(so); + solisten_upcall_set(so, iw_so_upcall, cm_id); so->so_state |= SS_NBIO; - SOCK_UNLOCK(so); + SOLISTEN_UNLOCK(so); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = IPPROTO_TCP; sopt.sopt_name = TCP_NODELAY; @@ -525,37 +513,18 @@ sopt.sopt_valsize = sizeof(on); sopt.sopt_td = NULL; sosetopt(so, &sopt); -} - -static int -iw_uninit_socket(struct iw_cm_id *cm_id) -{ - struct socket *so = cm_id->so; - - SOCK_LOCK(so); - soupcall_clear(so, SO_RCV); - SOCK_UNLOCK(so); - return (0); } static int -iw_create_listen(struct iw_cm_id *cm_id, int backlog) -{ - int rc; - - iw_init_sock(cm_id); - rc = -solisten(cm_id->so, backlog, curthread); - if (rc != 0) - iw_uninit_socket(cm_id); - return (rc); -} - -static int iw_destroy_listen(struct iw_cm_id *cm_id) { + struct socket *so = cm_id->so; - return (iw_uninit_socket(cm_id)); + SOLISTEN_LOCK(so); + solisten_upcall_set(so, NULL, NULL); + SOLISTEN_UNLOCK(so); + return (0); } Index: sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c =================================================================== --- sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c +++ sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c @@ -310,7 +310,6 @@ ("sdp_closed: !SS_PROTOREF")); ssk->flags &= ~SDP_SOCKREF; SDP_WUNLOCK(ssk); - ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); Index: sys/rpc/svc_vc.c =================================================================== --- sys/rpc/svc_vc.c +++ sys/rpc/svc_vc.c @@ -96,6 +96,7 @@ struct sockaddr *raddr); static int svc_vc_accept(struct socket *head, struct socket **sop); static int svc_vc_soupcall(struct socket *so, void *arg, int waitflag); +static int svc_vc_rendezvous_soupcall(struct socket *, void *, int); static struct xp_ops svc_vc_rendezvous_ops = { .xp_recv = svc_vc_rendezvous_recv, @@ -183,10 +184,10 @@ solisten(so, -1, curthread); - SOCKBUF_LOCK(&so->so_rcv); + SOLISTEN_LOCK(so); xprt->xp_upcallset = 1; - soupcall_set(so, SO_RCV, svc_vc_soupcall, xprt); - SOCKBUF_UNLOCK(&so->so_rcv); + solisten_upcall_set(so, svc_vc_rendezvous_soupcall, xprt); + SOLISTEN_UNLOCK(so); return (xprt); @@ -316,9 +317,11 @@ int svc_vc_accept(struct socket *head, struct socket **sop) { - int error = 0; struct socket *so; + int error = 0; + short nbio; + /* XXXGL: shouldn't that be an assertion? */ if ((head->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto done; @@ -328,38 +331,26 @@ if (error != 0) goto done; #endif - ACCEPT_LOCK(); - if (TAILQ_EMPTY(&head->so_comp)) { - ACCEPT_UNLOCK(); - error = EWOULDBLOCK; - goto done; - } - so = TAILQ_FIRST(&head->so_comp); - KASSERT(!(so->so_qstate & SQ_INCOMP), ("svc_vc_accept: so SQ_INCOMP")); - KASSERT(so->so_qstate & SQ_COMP, ("svc_vc_accept: so not SQ_COMP")); - /* - * Before changing the flags on the socket, we have to bump the - * reference count. Otherwise, if the protocol calls sofree(), - * the socket will be released due to a zero refcount. - * XXX might not need soref() since this is simpler than kern_accept. + * XXXGL: we want non-blocking semantics. The socket could be a + * socket created by kernel as well as socket shared with userland, + * so we can't be sure about presense of SS_NBIO. We also shall not + * toggle it on the socket, since that may surprise userland. So we + * set SS_NBIO only temporarily. */ - SOCK_LOCK(so); /* soref() and so_state update */ - soref(so); /* file descriptor reference */ - - TAILQ_REMOVE(&head->so_comp, so, so_list); - head->so_qlen--; - so->so_state |= (head->so_state & SS_NBIO); - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; - - SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); + SOLISTEN_LOCK(head); + nbio = head->so_state & SS_NBIO; + head->so_state |= SS_NBIO; + error = solisten_dequeue(head, &so, 0); + head->so_state &= (nbio & ~SS_NBIO); + if (error) + goto done; + so->so_state |= nbio; *sop = so; /* connection has been removed from the listen queue */ - KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0); + KNOTE_UNLOCKED(&head->so_rdsel.si_note, 0); done: return (error); } @@ -392,21 +383,21 @@ * connection arrives after our call to accept fails * with EWOULDBLOCK. */ - ACCEPT_LOCK(); - if (TAILQ_EMPTY(&xprt->xp_socket->so_comp)) + SOLISTEN_LOCK(xprt->xp_socket); + if (TAILQ_EMPTY(&xprt->xp_socket->sol_comp)) xprt_inactive_self(xprt); - ACCEPT_UNLOCK(); + SOLISTEN_UNLOCK(xprt->xp_socket); sx_xunlock(&xprt->xp_lock); return (FALSE); } if (error) { - SOCKBUF_LOCK(&xprt->xp_socket->so_rcv); + SOLISTEN_LOCK(xprt->xp_socket); if (xprt->xp_upcallset) { xprt->xp_upcallset = 0; soupcall_clear(xprt->xp_socket, SO_RCV); } - SOCKBUF_UNLOCK(&xprt->xp_socket->so_rcv); + SOLISTEN_UNLOCK(xprt->xp_socket); xprt_inactive_self(xprt); sx_xunlock(&xprt->xp_lock); return (FALSE); @@ -453,12 +444,6 @@ static void svc_vc_destroy_common(SVCXPRT *xprt) { - SOCKBUF_LOCK(&xprt->xp_socket->so_rcv); - if (xprt->xp_upcallset) { - xprt->xp_upcallset = 0; - soupcall_clear(xprt->xp_socket, SO_RCV); - } - SOCKBUF_UNLOCK(&xprt->xp_socket->so_rcv); if (xprt->xp_socket) (void)soclose(xprt->xp_socket); @@ -472,6 +457,13 @@ svc_vc_rendezvous_destroy(SVCXPRT *xprt) { + SOLISTEN_LOCK(xprt->xp_socket); + if (xprt->xp_upcallset) { + xprt->xp_upcallset = 0; + solisten_upcall_set(xprt->xp_socket, NULL, NULL); + } + SOLISTEN_UNLOCK(xprt->xp_socket); + svc_vc_destroy_common(xprt); } @@ -480,6 +472,13 @@ { struct cf_conn *cd = (struct cf_conn *)xprt->xp_p1; + SOCKBUF_LOCK(&xprt->xp_socket->so_rcv); + if (xprt->xp_upcallset) { + xprt->xp_upcallset = 0; + soupcall_clear(xprt->xp_socket, SO_RCV); + } + SOCKBUF_UNLOCK(&xprt->xp_socket->so_rcv); + svc_vc_destroy_common(xprt); if (cd->mreq) @@ -958,6 +957,16 @@ return (SU_OK); } +static int +svc_vc_rendezvous_soupcall(struct socket *head, void *arg, int waitflag) +{ + SVCXPRT *xprt = (SVCXPRT *) arg; + + if (!TAILQ_EMPTY(&head->sol_comp)) + xprt_active(xprt); + return (SU_OK); +} + #if 0 /* * Get the effective UID of the sending process. Used by rpcbind, keyserv Index: sys/sys/sockbuf.h =================================================================== --- sys/sys/sockbuf.h +++ sys/sys/sockbuf.h @@ -32,7 +32,6 @@ */ #ifndef _SYS_SOCKBUF_H_ #define _SYS_SOCKBUF_H_ -#include /* for struct selinfo */ #include #include #include @@ -64,6 +63,7 @@ struct sockaddr; struct socket; struct thread; +struct selinfo; struct xsockbuf { u_int sb_cc; @@ -84,9 +84,9 @@ * (a) locked by SOCKBUF_LOCK(). */ struct sockbuf { - struct selinfo sb_sel; /* process selecting read/write */ - struct mtx sb_mtx; /* sockbuf lock */ - struct sx sb_sx; /* prevent I/O interlacing */ + struct mtx sb_mtx; /* sockbuf lock */ + struct sx sb_sx; /* prevent I/O interlacing */ + struct selinfo *sb_sel; /* process selecting read/write */ short sb_state; /* (a) socket state on sockbuf */ #define sb_startzero sb_mb struct mbuf *sb_mb; /* (a) the mbuf chain */ Index: sys/sys/socket.h =================================================================== --- sys/sys/socket.h +++ sys/sys/socket.h @@ -111,7 +111,15 @@ */ #define SOCK_CLOEXEC 0x10000000 #define SOCK_NONBLOCK 0x20000000 -#endif +#ifdef _KERNEL +/* + * Flags for accept1(), kern_accept4() and solisten_dequeue, in addition + * to SOCK_CLOEXEC and SOCK_NONBLOCK. + */ +#define ACCEPT4_INHERIT 0x1 +#define ACCEPT4_COMPAT 0x2 +#endif /* _KERNEL */ +#endif /* __BSD_VISIBLE */ /* * Option flags per-socket. @@ -704,9 +712,5 @@ void so_lock(struct socket *so); void so_unlock(struct socket *so); -void so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), void *arg); - -#endif - - +#endif /* _KERNEL */ #endif /* !_SYS_SOCKET_H_ */ Index: sys/sys/socketvar.h =================================================================== --- sys/sys/socketvar.h +++ sys/sys/socketvar.h @@ -55,7 +55,9 @@ * handle on protocol and pointer to protocol * private data and error information. */ -typedef u_quad_t so_gen_t; +typedef uint64_t so_gen_t; +typedef int so_upcall_t(struct socket *, void *, int); + struct socket; @@ -63,60 +65,35 @@ * Locking key to struct socket: * (a) constant after allocation, no locking required. * (b) locked by SOCK_LOCK(so). - * (c) locked by SOCKBUF_LOCK(&so->so_rcv). - * (e) locked by ACCEPT_LOCK(). + * (cr) locked by SOCKBUF_LOCK(&so->so_rcv). + * (cs) locked by SOCKBUF_LOCK(&so->so_rcv). + * (e) locked by SOLISTEN_LOCK() of corresponding listening socket. * (f) not locked since integer reads/writes are atomic. * (g) used only as a sleep/wakeup address, no value. * (h) locked by global mutex so_global_mtx. */ +TAILQ_HEAD(accept_queue, socket); struct socket { - int so_count; /* (b) reference count */ + struct mtx so_lock; + volatile u_int so_count; /* (b / refcount) */ + struct selinfo so_rdsel; /* (b/cr) for so_rcv/so_comp */ + struct selinfo so_wrsel; /* (b/cs) for so_snd */ short so_type; /* (a) generic type, see socket.h */ - short so_options; /* from socket call, see socket.h */ - short so_linger; /* time to linger while closing */ + short so_options; /* (b) from socket call, see socket.h */ + short so_linger; /* time to linger close(2) */ short so_state; /* (b) internal state flags SS_* */ - int so_qstate; /* (e) internal state flags SQ_* */ void *so_pcb; /* protocol control block */ struct vnet *so_vnet; /* (a) network stack instance */ struct protosw *so_proto; /* (a) protocol handle */ -/* - * Variables for connection queuing. - * Socket where accepts occur is so_head in all subsidiary sockets. - * If so_head is 0, socket is not related to an accept. - * For head socket so_incomp queues partially completed connections, - * while so_comp is a queue of connections ready to be accepted. - * If a connection is aborted and it has so_head set, then - * it has to be pulled out of either so_incomp or so_comp. - * We allow connections to queue up based on current queue lengths - * and limit on number of queued connections for this socket. - */ - struct socket *so_head; /* (e) back pointer to listen socket */ - TAILQ_HEAD(, socket) so_incomp; /* (e) queue of partial unaccepted connections */ - TAILQ_HEAD(, socket) so_comp; /* (e) queue of complete unaccepted connections */ - TAILQ_ENTRY(socket) so_list; /* (e) list of unaccepted connections */ - u_int so_qlen; /* (e) number of unaccepted connections */ - u_int so_incqlen; /* (e) number of unaccepted incomplete - connections */ - u_int so_qlimit; /* (e) max number queued connections */ short so_timeo; /* (g) connection timeout */ u_short so_error; /* (f) error affecting connection */ struct sigio *so_sigio; /* [sg] information for async I/O or out of band data (SIGURG) */ - u_long so_oobmark; /* (c) chars to oob mark */ - - struct sockbuf so_rcv, so_snd; - struct ucred *so_cred; /* (a) user credentials */ struct label *so_label; /* (b) MAC label for socket */ - struct label *so_peerlabel; /* (b) cached MAC label for peer */ /* NB: generation count must not be first. */ so_gen_t so_gencnt; /* (h) generation count */ void *so_emuldata; /* (b) private data for emulators */ - struct so_accf { - struct accept_filter *so_accept_filter; - void *so_accept_filter_arg; /* saved filter args */ - char *so_accept_filter_str; /* saved user args */ - } *so_accf; struct osd osd; /* Object Specific extensions */ /* * so_fibnum, so_user_cookie and friends can be used to attach @@ -129,39 +106,92 @@ int so_ts_clock; /* type of the clock used for timestamps */ uint32_t so_max_pacing_rate; /* (f) TX rate limit in bytes/s */ - - void *so_pspare[2]; /* general use */ - int so_ispare[2]; /* general use */ + union { + /* Regular (data flow) socket. */ + struct { + /* (cr, cs) Receive and send buffers. */ + struct sockbuf so_rcv, so_snd; + + /* (e) Our place on accept queue. */ + TAILQ_ENTRY(socket) so_list; + struct socket *so_listen; /* (b) */ + enum { + SQ_NONE = 0, + SQ_INCOMP = 0x0800, /* on sol_incomp */ + SQ_COMP = 0x1000, /* on sol_comp */ + } so_qstate; /* (b) */ + + /* (b) cached MAC label for peer */ + struct label *so_peerlabel; + u_long so_oobmark; /* chars to oob mark */ + }; + /* + * Listening socket, where accepts occur, is so_listen in all + * subsidiary sockets. If so_listen is NULL, socket is not + * related to an accept. For a listening socket itself + * sol_incomp queues partially completed connections, while + * sol_comp is a queue of connections ready to be accepted. + * If a connection is aborted and it has so_listen set, then + * it has to be pulled out of either sol_incomp or sol_comp. + * We allow connections to queue up based on current queue + * lengths and limit on number of queued connections for this + * socket. + */ + struct { + /* (e) queue of partial unaccepted connections */ + struct accept_queue sol_incomp; + /* (e) queue of complete unaccepted connections */ + struct accept_queue sol_comp; + u_int sol_qlen; /* (e) sol_comp length */ + u_int sol_incqlen; /* (e) sol_incomp length */ + u_int sol_qlimit; /* (e) queue limit */ + + /* accept_filter(9) optional data */ + struct accept_filter *sol_accept_filter; + void *sol_accept_filter_arg; /* saved filter args */ + char *sol_accept_filter_str; /* saved user args */ + + /* Optional upcall, for kernel socket. */ + so_upcall_t *sol_upcall; /* (e) */ + void *sol_upcallarg; /* (e) */ + + /* Socket buffer parameters, to be copied to + * dataflow sockets, accepted from this one. */ + int sol_sbrcv_lowat; + int sol_sbsnd_lowat; + u_int sol_sbrcv_hiwat; + u_int sol_sbsnd_hiwat; + short sol_sbrcv_flags; + short sol_sbsnd_flags; + sbintime_t sol_sbrcv_timeo; + sbintime_t sol_sbsnd_timeo; + }; + }; }; -/* - * Global accept mutex to serialize access to accept queues and - * fields associated with multiple sockets. This allows us to - * avoid defining a lock order between listen and accept sockets - * until such time as it proves to be a good idea. - */ -extern struct mtx accept_mtx; -#define ACCEPT_LOCK_ASSERT() mtx_assert(&accept_mtx, MA_OWNED) -#define ACCEPT_UNLOCK_ASSERT() mtx_assert(&accept_mtx, MA_NOTOWNED) -#define ACCEPT_LOCK() mtx_lock(&accept_mtx) -#define ACCEPT_UNLOCK() mtx_unlock(&accept_mtx) - -/* - * Per-socket mutex: we reuse the receive socket buffer mutex for space - * efficiency. This decision should probably be revisited as we optimize - * locking for the socket code. - */ -#define SOCK_MTX(_so) SOCKBUF_MTX(&(_so)->so_rcv) -#define SOCK_LOCK(_so) SOCKBUF_LOCK(&(_so)->so_rcv) -#define SOCK_OWNED(_so) SOCKBUF_OWNED(&(_so)->so_rcv) -#define SOCK_UNLOCK(_so) SOCKBUF_UNLOCK(&(_so)->so_rcv) -#define SOCK_LOCK_ASSERT(_so) SOCKBUF_LOCK_ASSERT(&(_so)->so_rcv) - -/* - * Socket state bits stored in so_qstate. - */ -#define SQ_INCOMP 0x0800 /* unaccepted, incomplete connection */ -#define SQ_COMP 0x1000 /* unaccepted, complete connection */ +#define SOCK_MTX(so) &(so)->so_lock +#define SOCK_LOCK(so) mtx_lock(&(so)->so_lock) +#define SOCK_OWNED(so) mtx_owned(&(so)->so_lock) +#define SOCK_UNLOCK(so) mtx_unlock(&(so)->so_lock) +#define SOCK_LOCK_ASSERT(so) mtx_assert(&(so)->so_lock, MA_OWNED) +#define SOCK_UNLOCK_ASSERT(so) mtx_assert(&(so)->so_lock, MA_NOTOWNED) + +#define SOLISTENING(sol) (((sol)->so_options & SO_ACCEPTCONN) != 0) +#define SOLISTEN_LOCK(sol) do { \ + mtx_lock(&(sol)->so_lock); \ + KASSERT(SOLISTENING(sol), \ + ("%s: %p not listening", __func__, (sol))); \ +} while (0) +#define SOLISTEN_UNLOCK(sol) do { \ + KASSERT(SOLISTENING(sol), \ + ("%s: %p not listening", __func__, (sol))); \ + mtx_unlock(&(sol)->so_lock); \ +} while (0) +#define SOLISTEN_LOCK_ASSERT(sol) do { \ + mtx_assert(&(sol)->so_lock, MA_OWNED); \ + KASSERT(SOLISTENING(sol), \ + ("%s: %p not listening", __func__, (sol))); \ +} while (0) /* * Externalized form of struct socket used by the sysctl(3) interface. @@ -212,8 +242,7 @@ /* can we read something from so? */ #define soreadabledata(so) \ - (sbavail(&(so)->so_rcv) >= (so)->so_rcv.sb_lowat || \ - !TAILQ_EMPTY(&(so)->so_comp) || (so)->so_error) + (sbavail(&(so)->so_rcv) >= (so)->so_rcv.sb_lowat || (so)->so_error) #define soreadable(so) \ (soreadabledata(so) || ((so)->so_rcv.sb_state & SBS_CANTRCVMORE)) @@ -226,26 +255,19 @@ (so)->so_error) /* - * soref()/sorele() ref-count the socket structure. Note that you must - * still explicitly close the socket, but the last ref count will free - * the structure. + * soref()/sorele() ref-count the socket structure. + * soref() may be called without owning socket lock, but in that case a + * caller must own something that holds socket, and so_count must be not 0. + * Note that you must still explicitly close the socket, but the last ref + * count will free the structure. */ -#define soref(so) do { \ - SOCK_LOCK_ASSERT(so); \ - ++(so)->so_count; \ -} while (0) - +#define soref(so) refcount_acquire(&(so)->so_count) #define sorele(so) do { \ - ACCEPT_LOCK_ASSERT(); \ SOCK_LOCK_ASSERT(so); \ - if ((so)->so_count <= 0) \ - panic("sorele"); \ - if (--(so)->so_count == 0) \ + if (refcount_release(&(so)->so_count)) \ sofree(so); \ - else { \ + else \ SOCK_UNLOCK(so); \ - ACCEPT_UNLOCK(); \ - } \ } while (0) /* @@ -368,10 +390,11 @@ int solisten(struct socket *so, int backlog, struct thread *td); void solisten_proto(struct socket *so, int backlog); int solisten_proto_check(struct socket *so); +int solisten_dequeue(struct socket *, struct socket **, int); struct socket * sonewconn(struct socket *head, int connstatus); - - +struct socket * + sopeeloff(struct socket *); int sopoll(struct socket *so, int events, struct ucred *active_cred, struct thread *td); int sopoll_generic(struct socket *so, int events, @@ -403,6 +426,7 @@ void soupcall_clear(struct socket *so, int which); void soupcall_set(struct socket *so, int which, int (*func)(struct socket *, void *, int), void *arg); +void solisten_upcall_set(struct socket *, so_upcall_t, void *); void sowakeup(struct socket *so, struct sockbuf *sb); void sowakeup_aio(struct socket *so, struct sockbuf *sb); int selsocket(struct socket *so, int events, struct timeval *tv, Index: sys/sys/sockopt.h =================================================================== --- sys/sys/sockopt.h +++ sys/sys/sockopt.h @@ -64,8 +64,8 @@ int soopt_getm(struct sockopt *sopt, struct mbuf **mp); int soopt_mcopyin(struct sockopt *sopt, struct mbuf *m); int soopt_mcopyout(struct sockopt *sopt, struct mbuf *m); -int do_getopt_accept_filter(struct socket *so, struct sockopt *sopt); -int do_setopt_accept_filter(struct socket *so, struct sockopt *sopt); +int accept_filt_getopt(struct socket *, struct sockopt *); +int accept_filt_setopt(struct socket *, struct sockopt *); int so_setsockopt(struct socket *so, int level, int optname, void *optval, size_t optlen); Index: sys/sys/unpcb.h =================================================================== --- sys/sys/unpcb.h +++ sys/sys/unpcb.h @@ -92,14 +92,8 @@ * and is really the credentials of the connected peer. This is used * to determine whether the contents should be sent to the user or * not. - * - * UNP_HAVEPCCACHED - indicates that the unp_peercred member is filled - * in, but does *not* contain the credentials of the connected peer - * (there may not even be a peer). This is set in unp_listen() when - * it fills in unp_peercred for later consumption by unp_connect(). */ #define UNP_HAVEPC 0x001 -#define UNP_HAVEPCCACHED 0x002 #define UNP_WANTCRED 0x004 /* credentials wanted */ #define UNP_CONNWAIT 0x008 /* connect blocks until accepted */ Index: sys/sys/vnode.h =================================================================== --- sys/sys/vnode.h +++ sys/sys/vnode.h @@ -112,14 +112,13 @@ /* * Type specific fields, only one applies to any given vnode. - * See #defines below for renaming to v_* namespace. */ union { - struct mount *vu_mount; /* v ptr to mountpoint (VDIR) */ - struct socket *vu_socket; /* v unix domain net (VSOCK) */ - struct cdev *vu_cdev; /* v device (VCHR, VBLK) */ - struct fifoinfo *vu_fifoinfo; /* v fifo (VFIFO) */ - } v_un; + struct mount *v_mountedhere; /* v ptr to mountpoint (VDIR) */ + struct unpcb *v_unpcb; /* v unix domain net (VSOCK) */ + struct cdev *v_rdev; /* v device (VCHR, VBLK) */ + struct fifoinfo *v_fifoinfo; /* v fifo (VFIFO) */ + }; /* * vfs_hash: (mount + inode) -> vnode hash. The hash value @@ -175,11 +174,6 @@ #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */ -#define v_mountedhere v_un.vu_mount -#define v_socket v_un.vu_socket -#define v_rdev v_un.vu_cdev -#define v_fifoinfo v_un.vu_fifoinfo - #define bo2vnode(bo) __containerof((bo), struct vnode, v_bufobj) /* XXX: These are temporary to avoid a source sweep at this time */ @@ -200,7 +194,7 @@ long xv_numoutput; /* num of writes in progress */ enum vtype xv_type; /* vnode type */ union { - void *xvu_socket; /* socket, if VSOCK */ + void *xvu_socket; /* unpcb, if VSOCK */ void *xvu_fifo; /* fifo, if VFIFO */ dev_t xvu_rdev; /* maj/min, if VBLK/VCHR */ struct { Index: usr.bin/netstat/inet.c =================================================================== --- usr.bin/netstat/inet.c +++ usr.bin/netstat/inet.c @@ -170,14 +170,17 @@ if (kread((uintptr_t)proto.pr_domain, &domain, sizeof(domain)) != 0) return (-1); xso->xso_family = domain.dom_family; - xso->so_qlen = so->so_qlen; - xso->so_incqlen = so->so_incqlen; - xso->so_qlimit = so->so_qlimit; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; - xso->so_oobmark = so->so_oobmark; - sbtoxsockbuf(&so->so_snd, &xso->so_snd); - sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); + if (SOLISTENING(so)) { + xso->so_qlen = so->sol_qlen; + xso->so_incqlen = so->sol_incqlen; + xso->so_qlimit = so->sol_qlimit; + } else { + sbtoxsockbuf(&so->so_snd, &xso->so_snd); + sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); + xso->so_oobmark = so->so_oobmark; + } return (0); }