Index: head/sys/cam/ctl/ctl_ha.c =================================================================== --- head/sys/cam/ctl/ctl_ha.c +++ head/sys/cam/ctl/ctl_ha.c @@ -458,45 +458,20 @@ static int ctl_ha_accept(struct ha_softc *softc) { - struct socket *so; + struct socket *lso, *so; struct sockaddr *sap; int error; - ACCEPT_LOCK(); - if (softc->ha_lso->so_rcv.sb_state & SBS_CANTRCVMORE) - softc->ha_lso->so_error = ECONNABORTED; - if (softc->ha_lso->so_error) { - error = softc->ha_lso->so_error; - softc->ha_lso->so_error = 0; - ACCEPT_UNLOCK(); + lso = softc->ha_lso; + SOLISTEN_LOCK(lso); + error = solisten_dequeue(lso, &so, 0); + if (error == EWOULDBLOCK) + return (error); + if (error) { printf("%s: socket error %d\n", __func__, error); goto out; } - so = TAILQ_FIRST(&softc->ha_lso->so_comp); - if (so == NULL) { - ACCEPT_UNLOCK(); - return (EWOULDBLOCK); - } - KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP")); - KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP")); - /* - * Before changing the flags on the socket, we have to bump the - * reference count. Otherwise, if the protocol calls sofree(), - * the socket will be released due to a zero refcount. - */ - SOCK_LOCK(so); /* soref() and so_state update */ - soref(so); /* file descriptor reference */ - - TAILQ_REMOVE(&softc->ha_lso->so_comp, so, so_list); - softc->ha_lso->so_qlen--; - so->so_state |= SS_NBIO; - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; - - SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); - sap = NULL; error = soaccept(so, &sap); if (error != 0) { @@ -556,9 +531,6 @@ printf("%s: REUSEPORT setting failed %d\n", __func__, error); } - SOCKBUF_LOCK(&softc->ha_lso->so_rcv); - soupcall_set(softc->ha_lso, SO_RCV, ctl_ha_lupcall, softc); - SOCKBUF_UNLOCK(&softc->ha_lso->so_rcv); } memcpy(&sa, &softc->ha_peer_in, sizeof(sa)); @@ -572,6 +544,10 @@ printf("%s: solisten() error %d\n", __func__, error); goto out; } + SOLISTEN_LOCK(softc->ha_lso); + softc->ha_lso->so_state |= SS_NBIO; + solisten_upcall_set(softc->ha_lso, ctl_ha_lupcall, softc); + SOLISTEN_UNLOCK(softc->ha_lso); return (0); out: Index: head/sys/dev/iscsi/icl_soft_proxy.c =================================================================== --- head/sys/dev/iscsi/icl_soft_proxy.c +++ head/sys/dev/iscsi/icl_soft_proxy.c @@ -92,7 +92,6 @@ struct icl_listen *ils_listen; struct socket *ils_socket; bool ils_running; - bool ils_disconnecting; int ils_id; }; @@ -184,7 +183,9 @@ while (ils->ils_running) { ICL_DEBUG("waiting for accept thread to terminate"); sx_xunlock(&il->il_lock); - ils->ils_disconnecting = true; + SOLISTEN_LOCK(ils->ils_socket); + ils->ils_socket->so_error = ENOTCONN; + SOLISTEN_UNLOCK(ils->ils_socket); wakeup(&ils->ils_socket->so_timeo); pause("icl_unlisten", 1 * hz); sx_xlock(&il->il_lock); @@ -200,9 +201,9 @@ } /* - * XXX: Doing accept in a separate thread in each socket might not be the best way - * to do stuff, but it's pretty clean and debuggable - and you probably won't - * have hundreds of listening sockets anyway. + * XXX: Doing accept in a separate thread in each socket might not be the + * best way to do stuff, but it's pretty clean and debuggable - and you + * probably won't have hundreds of listening sockets anyway. */ static void icl_accept_thread(void *arg) @@ -218,55 +219,22 @@ ils->ils_running = true; for (;;) { - ACCEPT_LOCK(); - while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0 && ils->ils_disconnecting == false) { - if (head->so_rcv.sb_state & SBS_CANTRCVMORE) { - head->so_error = ECONNABORTED; - break; - } - error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH, - "accept", 0); - if (error) { - ACCEPT_UNLOCK(); - ICL_WARN("msleep failed with error %d", error); - continue; - } - if (ils->ils_disconnecting) { - ACCEPT_UNLOCK(); - ICL_DEBUG("terminating"); - ils->ils_running = false; - kthread_exit(); - return; - } + SOLISTEN_LOCK(head); + error = solisten_dequeue(head, &so, 0); + if (error == ENOTCONN) { + /* + * XXXGL: ENOTCONN is our mark from icl_listen_free(). + * Neither socket code, nor msleep(9) may return it. + */ + ICL_DEBUG("terminating"); + ils->ils_running = false; + kthread_exit(); + return; } - if (head->so_error) { - error = head->so_error; - head->so_error = 0; - ACCEPT_UNLOCK(); - ICL_WARN("socket error %d", error); + if (error) { + ICL_WARN("solisten_dequeue error %d", error); continue; } - so = TAILQ_FIRST(&head->so_comp); - KASSERT(so != NULL, ("NULL so")); - KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP")); - KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP")); - - /* - * Before changing the flags on the socket, we have to bump the - * reference count. Otherwise, if the protocol calls sofree(), - * the socket will be released due to a zero refcount. - */ - SOCK_LOCK(so); /* soref() and so_state update */ - soref(so); /* file descriptor reference */ - - TAILQ_REMOVE(&head->so_comp, so, so_list); - head->so_qlen--; - so->so_state |= (head->so_state & SS_NBIO); - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; - - SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); sa = NULL; error = soaccept(so, &sa); Index: head/sys/kern/sys_socket.c =================================================================== --- head/sys/kern/sys_socket.c +++ head/sys/kern/sys_socket.c @@ -170,32 +170,36 @@ break; case FIOASYNC: - /* - * XXXRW: This code separately acquires SOCK_LOCK(so) and - * SOCKBUF_LOCK(&so->so_rcv) even though they are the same - * mutex to avoid introducing the assumption that they are - * the same. - */ if (*(int *)data) { SOCK_LOCK(so); so->so_state |= SS_ASYNC; + if (SOLISTENING(so)) { + so->sol_sbrcv_flags |= SB_ASYNC; + so->sol_sbsnd_flags |= SB_ASYNC; + } else { + SOCKBUF_LOCK(&so->so_rcv); + so->so_rcv.sb_flags |= SB_ASYNC; + SOCKBUF_UNLOCK(&so->so_rcv); + SOCKBUF_LOCK(&so->so_snd); + so->so_snd.sb_flags |= SB_ASYNC; + SOCKBUF_UNLOCK(&so->so_snd); + } SOCK_UNLOCK(so); - SOCKBUF_LOCK(&so->so_rcv); - so->so_rcv.sb_flags |= SB_ASYNC; - SOCKBUF_UNLOCK(&so->so_rcv); - SOCKBUF_LOCK(&so->so_snd); - so->so_snd.sb_flags |= SB_ASYNC; - SOCKBUF_UNLOCK(&so->so_snd); } else { SOCK_LOCK(so); so->so_state &= ~SS_ASYNC; + if (SOLISTENING(so)) { + so->sol_sbrcv_flags &= ~SB_ASYNC; + so->sol_sbsnd_flags &= ~SB_ASYNC; + } else { + SOCKBUF_LOCK(&so->so_rcv); + so->so_rcv.sb_flags &= ~SB_ASYNC; + SOCKBUF_UNLOCK(&so->so_rcv); + SOCKBUF_LOCK(&so->so_snd); + so->so_snd.sb_flags &= ~SB_ASYNC; + SOCKBUF_UNLOCK(&so->so_snd); + } SOCK_UNLOCK(so); - SOCKBUF_LOCK(&so->so_rcv); - so->so_rcv.sb_flags &= ~SB_ASYNC; - SOCKBUF_UNLOCK(&so->so_rcv); - SOCKBUF_LOCK(&so->so_snd); - so->so_snd.sb_flags &= ~SB_ASYNC; - SOCKBUF_UNLOCK(&so->so_snd); } break; @@ -706,7 +710,6 @@ sb->sb_flags &= ~SB_AIO_RUNNING; SOCKBUF_UNLOCK(sb); - ACCEPT_LOCK(); SOCK_LOCK(so); sorele(so); } Index: head/sys/kern/uipc_accf.c =================================================================== --- head/sys/kern/uipc_accf.c +++ head/sys/kern/uipc_accf.c @@ -173,13 +173,13 @@ error = EINVAL; goto out; } - if ((so->so_options & SO_ACCEPTFILTER) == 0) { + if (so->sol_accept_filter == NULL) { error = EINVAL; goto out; } - strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name); - if (so->so_accf->so_accept_filter_str != NULL) - strcpy(afap->af_arg, so->so_accf->so_accept_filter_str); + strcpy(afap->af_name, so->sol_accept_filter->accf_name); + if (so->sol_accept_filter_str != NULL) + strcpy(afap->af_arg, so->sol_accept_filter_str); out: SOCK_UNLOCK(so); if (error == 0) @@ -193,31 +193,57 @@ { struct accept_filter_arg *afap; struct accept_filter *afp; - struct so_accf *newaf; - int error = 0; + char *accept_filter_str = NULL; + void *accept_filter_arg = NULL; + int error; /* * Handle the simple delete case first. */ if (sopt == NULL || sopt->sopt_val == NULL) { + struct socket *sp, *sp1; + int wakeup; + SOCK_LOCK(so); if ((so->so_options & SO_ACCEPTCONN) == 0) { SOCK_UNLOCK(so); return (EINVAL); } - if (so->so_accf != NULL) { - struct so_accf *af = so->so_accf; - if (af->so_accept_filter != NULL && - af->so_accept_filter->accf_destroy != NULL) { - af->so_accept_filter->accf_destroy(so); - } - if (af->so_accept_filter_str != NULL) - free(af->so_accept_filter_str, M_ACCF); - free(af, M_ACCF); - so->so_accf = NULL; + if (so->sol_accept_filter == NULL) { + SOCK_UNLOCK(so); + return (0); } + if (so->sol_accept_filter->accf_destroy != NULL) + so->sol_accept_filter->accf_destroy(so); + if (so->sol_accept_filter_str != NULL) + free(so->sol_accept_filter_str, M_ACCF); + so->sol_accept_filter = NULL; + so->sol_accept_filter_arg = NULL; + so->sol_accept_filter_str = NULL; so->so_options &= ~SO_ACCEPTFILTER; - SOCK_UNLOCK(so); + + /* + * Move from incomplete queue to complete only those + * connections, that are blocked by us. + */ + wakeup = 0; + TAILQ_FOREACH_SAFE(sp, &so->sol_incomp, so_list, sp1) { + SOCK_LOCK(sp); + if (sp->so_options & SO_ACCEPTFILTER) { + TAILQ_REMOVE(&so->sol_incomp, sp, so_list); + TAILQ_INSERT_TAIL(&so->sol_comp, sp, so_list); + sp->so_qstate = SQ_COMP; + sp->so_options &= ~SO_ACCEPTFILTER; + so->sol_incqlen--; + so->sol_qlen++; + wakeup = 1; + } + SOCK_UNLOCK(sp); + } + if (wakeup) + solisten_wakeup(so); /* unlocks */ + else + SOLISTEN_UNLOCK(so); return (0); } @@ -238,17 +264,10 @@ free(afap, M_TEMP); return (ENOENT); } - /* - * Allocate the new accept filter instance storage. We may - * have to free it again later if we fail to attach it. If - * attached properly, 'newaf' is NULLed to avoid a free() - * while in use. - */ - newaf = malloc(sizeof(*newaf), M_ACCF, M_WAITOK | M_ZERO); if (afp->accf_create != NULL && afap->af_name[0] != '\0') { size_t len = strlen(afap->af_name) + 1; - newaf->so_accept_filter_str = malloc(len, M_ACCF, M_WAITOK); - strcpy(newaf->so_accept_filter_str, afap->af_name); + accept_filter_str = malloc(len, M_ACCF, M_WAITOK); + strcpy(accept_filter_str, afap->af_name); } /* @@ -256,8 +275,8 @@ * without first removing it. */ SOCK_LOCK(so); - if (((so->so_options & SO_ACCEPTCONN) == 0) || - (so->so_accf != NULL)) { + if ((so->so_options & SO_ACCEPTCONN) == 0 || + so->sol_accept_filter != NULL) { error = EINVAL; goto out; } @@ -268,25 +287,20 @@ * can't block. */ if (afp->accf_create != NULL) { - newaf->so_accept_filter_arg = - afp->accf_create(so, afap->af_arg); - if (newaf->so_accept_filter_arg == NULL) { + accept_filter_arg = afp->accf_create(so, afap->af_arg); + if (accept_filter_arg == NULL) { error = EINVAL; goto out; } } - newaf->so_accept_filter = afp; - so->so_accf = newaf; + so->sol_accept_filter = afp; + so->sol_accept_filter_arg = accept_filter_arg; + so->sol_accept_filter_str = accept_filter_str; so->so_options |= SO_ACCEPTFILTER; - newaf = NULL; out: SOCK_UNLOCK(so); - if (newaf != NULL) { - if (newaf->so_accept_filter_str != NULL) - free(newaf->so_accept_filter_str, M_ACCF); - free(newaf, M_ACCF); - } - if (afap != NULL) - free(afap, M_TEMP); + if (accept_filter_str != NULL) + free(accept_filter_str, M_ACCF); + free(afap, M_TEMP); return (error); } Index: head/sys/kern/uipc_debug.c =================================================================== --- head/sys/kern/uipc_debug.c +++ head/sys/kern/uipc_debug.c @@ -448,8 +448,6 @@ db_printf(")\n"); db_print_indent(indent); - db_printf("so_qstate: 0x%x (", so->so_qstate); - db_print_soqstate(so->so_qstate); db_printf(") "); db_printf("so_pcb: %p ", so->so_pcb); db_printf("so_proto: %p\n", so->so_proto); @@ -458,24 +456,28 @@ db_print_protosw(so->so_proto, "so_proto", indent); db_print_indent(indent); - db_printf("so_head: %p ", so->so_head); - db_printf("so_incomp first: %p ", TAILQ_FIRST(&so->so_incomp)); - db_printf("so_comp first: %p\n", TAILQ_FIRST(&so->so_comp)); + if (so->so_options & SO_ACCEPTCONN) { + db_printf("sol_incomp first: %p ", + TAILQ_FIRST(&so->sol_incomp)); + db_printf("sol_comp first: %p\n", TAILQ_FIRST(&so->sol_comp)); + db_printf("sol_qlen: %d ", so->sol_qlen); + db_printf("sol_incqlen: %d ", so->sol_incqlen); + db_printf("sol_qlimit: %d ", so->sol_qlimit); + } else { + db_printf("so_qstate: 0x%x (", so->so_qstate); + db_print_soqstate(so->so_qstate); + db_printf("so_listen: %p ", so->so_listen); + /* so_list skipped */ + db_printf("so_timeo: %d ", so->so_timeo); + db_printf("so_error: %d\n", so->so_error); - db_print_indent(indent); - /* so_list skipped */ - db_printf("so_qlen: %u ", so->so_qlen); - db_printf("so_incqlen: %u ", so->so_incqlen); - db_printf("so_qlimit: %u ", so->so_qlimit); - db_printf("so_timeo: %d ", so->so_timeo); - db_printf("so_error: %d\n", so->so_error); + db_print_indent(indent); + db_printf("so_sigio: %p ", so->so_sigio); + db_printf("so_oobmark: %lu ", so->so_oobmark); - db_print_indent(indent); - db_printf("so_sigio: %p ", so->so_sigio); - db_printf("so_oobmark: %lu ", so->so_oobmark); - - db_print_sockbuf(&so->so_rcv, "so_rcv", indent); - db_print_sockbuf(&so->so_snd, "so_snd", indent); + db_print_sockbuf(&so->so_rcv, "so_rcv", indent); + db_print_sockbuf(&so->so_snd, "so_snd", indent); + } } DB_SHOW_COMMAND(socket, db_show_socket) Index: head/sys/kern/uipc_sockbuf.c =================================================================== --- head/sys/kern/uipc_sockbuf.c +++ head/sys/kern/uipc_sockbuf.c @@ -314,14 +314,14 @@ SOCKBUF_LOCK_ASSERT(sb); - selwakeuppri(&sb->sb_sel, PSOCK); - if (!SEL_WAITING(&sb->sb_sel)) + selwakeuppri(sb->sb_sel, PSOCK); + if (!SEL_WAITING(sb->sb_sel)) sb->sb_flags &= ~SB_SEL; if (sb->sb_flags & SB_WAIT) { sb->sb_flags &= ~SB_WAIT; wakeup(&sb->sb_acc); } - KNOTE_LOCKED(&sb->sb_sel.si_note, 0); + KNOTE_LOCKED(&sb->sb_sel->si_note, 0); if (sb->sb_upcall != NULL && !(so->so_state & SS_ISDISCONNECTED)) { ret = sb->sb_upcall(so, sb->sb_upcallarg, M_NOWAIT); if (ret == SU_ISCONNECTED) { Index: head/sys/kern/uipc_socket.c =================================================================== --- head/sys/kern/uipc_socket.c +++ head/sys/kern/uipc_socket.c @@ -106,6 +106,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_compat.h" +#include "opt_sctp.h" #include #include @@ -154,13 +155,21 @@ static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags); +static void so_rdknl_lock(void *); +static void so_rdknl_unlock(void *); +static void so_rdknl_assert_locked(void *); +static void so_rdknl_assert_unlocked(void *); +static void so_wrknl_lock(void *); +static void so_wrknl_unlock(void *); +static void so_wrknl_assert_locked(void *); +static void so_wrknl_assert_unlocked(void *); static void filt_sordetach(struct knote *kn); static int filt_soread(struct knote *kn, long hint); static void filt_sowdetach(struct knote *kn); static int filt_sowrite(struct knote *kn, long hint); -static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); static int filt_soempty(struct knote *kn, long hint); +static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); fo_kqfilter_t soo_kqfilter; static struct filterops soread_filtops = { @@ -393,8 +402,16 @@ return (NULL); } + /* + * The socket locking protocol allows to lock 2 sockets at a time, + * however, the first one must be a listening socket. WITNESS lacks + * a feature to change class of an existing lock, so we use DUPOK. + */ + mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK); SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd"); SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv"); + so->so_rcv.sb_sel = &so->so_rdsel; + so->so_snd.sb_sel = &so->so_wrsel; sx_init(&so->so_snd.sb_sx, "so_snd_sx"); sx_init(&so->so_rcv.sb_sx, "so_rcv_sx"); TAILQ_INIT(&so->so_snd.sb_aiojobq); @@ -450,9 +467,6 @@ if (so->so_snd.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); - /* remove accept filter if one is present. */ - if (so->so_accf != NULL) - accept_filt_setopt(so, NULL); #ifdef MAC mac_socket_destroy(so); #endif @@ -460,10 +474,16 @@ crfree(so->so_cred); khelp_destroy_osd(&so->osd); - sx_destroy(&so->so_snd.sb_sx); - sx_destroy(&so->so_rcv.sb_sx); - SOCKBUF_LOCK_DESTROY(&so->so_snd); - SOCKBUF_LOCK_DESTROY(&so->so_rcv); + if (SOLISTENING(so)) { + if (so->sol_accept_filter != NULL) + accept_filt_setopt(so, NULL); + } else { + sx_destroy(&so->so_snd.sb_sx); + sx_destroy(&so->so_rcv.sb_sx); + SOCKBUF_LOCK_DESTROY(&so->so_snd); + SOCKBUF_LOCK_DESTROY(&so->so_rcv); + } + mtx_destroy(&so->so_lock); uma_zfree(socket_zone, so); } @@ -506,8 +526,6 @@ if (so == NULL) return (ENOBUFS); - TAILQ_INIT(&so->so_incomp); - TAILQ_INIT(&so->so_comp); so->so_type = type; so->so_cred = crhold(cred); if ((prp->pr_domain->dom_family == PF_INET) || @@ -520,9 +538,10 @@ #ifdef MAC mac_socket_create(cred, so); #endif - knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv)); - knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd)); - so->so_count = 1; + knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, + so_rdknl_assert_locked, so_rdknl_assert_unlocked); + knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, + so_wrknl_assert_locked, so_wrknl_assert_unlocked); /* * Auto-sizing of socket buffers is managed by the protocols and * the appropriate flags must be set in the pru_attach function. @@ -531,12 +550,10 @@ error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); CURVNET_RESTORE(); if (error) { - KASSERT(so->so_count == 1, ("socreate: so_count %d", - so->so_count)); - so->so_count = 0; sodealloc(so); return (error); } + soref(so); *aso = so; return (0); } @@ -564,11 +581,11 @@ static int overcount; struct socket *so; - int over; + u_int over; - ACCEPT_LOCK(); - over = (head->so_qlen > 3 * head->so_qlimit / 2); - ACCEPT_UNLOCK(); + SOLISTEN_LOCK(head); + over = (head->sol_qlen > 3 * head->sol_qlimit / 2); + SOLISTEN_UNLOCK(head); #ifdef REGRESSION if (regression_sonewconn_earlytest && over) { #else @@ -580,15 +597,15 @@ log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: " "%i already in queue awaiting acceptance " "(%d occurrences)\n", - __func__, head->so_pcb, head->so_qlen, overcount); + __func__, head->so_pcb, head->sol_qlen, overcount); overcount = 0; } return (NULL); } - VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", - __func__, __LINE__, head)); + VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL", + __func__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " @@ -596,11 +613,8 @@ __func__, head->so_pcb); return (NULL); } - if ((head->so_options & SO_ACCEPTFILTER) != 0) - connstatus = 0; - so->so_head = head; + so->so_listen = head; so->so_type = head->so_type; - so->so_options = head->so_options &~ SO_ACCEPTCONN; so->so_linger = head->so_linger; so->so_state = head->so_state | SS_NOFDREF; so->so_fibnum = head->so_fibnum; @@ -609,10 +623,12 @@ #ifdef MAC mac_socket_newconn(head, so); #endif - knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv)); - knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd)); + knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, + so_rdknl_assert_locked, so_rdknl_assert_unlocked); + knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, + so_wrknl_assert_locked, so_wrknl_assert_unlocked); VNET_SO_ASSERT(head); - if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { + if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); @@ -624,32 +640,24 @@ __func__, head->so_pcb); return (NULL); } - so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; - so->so_snd.sb_lowat = head->so_snd.sb_lowat; - so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; - so->so_snd.sb_timeo = head->so_snd.sb_timeo; - so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; - so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; + so->so_rcv.sb_lowat = head->sol_sbrcv_lowat; + so->so_snd.sb_lowat = head->sol_sbsnd_lowat; + so->so_rcv.sb_timeo = head->sol_sbrcv_timeo; + so->so_snd.sb_timeo = head->sol_sbsnd_timeo; + so->so_rcv.sb_flags |= head->sol_sbrcv_flags & SB_AUTOSIZE; + so->so_snd.sb_flags |= head->sol_sbsnd_flags & SB_AUTOSIZE; + + SOLISTEN_LOCK(head); + if (head->sol_accept_filter != NULL) + connstatus = 0; so->so_state |= connstatus; - ACCEPT_LOCK(); - /* - * The accept socket may be tearing down but we just - * won a race on the ACCEPT_LOCK. - * However, if sctp_peeloff() is called on a 1-to-many - * style socket, the SO_ACCEPTCONN doesn't need to be set. - */ - if (!(head->so_options & SO_ACCEPTCONN) && - ((head->so_proto->pr_protocol != IPPROTO_SCTP) || - (head->so_type != SOCK_SEQPACKET))) { - SOCK_LOCK(so); - so->so_head = NULL; - sofree(so); /* NB: returns ACCEPT_UNLOCK'ed. */ - return (NULL); - } + so->so_options = head->so_options & ~SO_ACCEPTCONN; + soref(head); /* A socket on (in)complete queue refs head. */ if (connstatus) { - TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); - so->so_qstate |= SQ_COMP; - head->so_qlen++; + TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); + so->so_qstate = SQ_COMP; + head->sol_qlen++; + solisten_wakeup(head); /* unlocks */ } else { /* * Keep removing sockets from the head until there's room for @@ -658,28 +666,86 @@ * threads and soabort() requires dropping locks, we must * loop waiting for the condition to be true. */ - while (head->so_incqlen > head->so_qlimit) { + while (head->sol_incqlen > head->sol_qlimit) { struct socket *sp; - sp = TAILQ_FIRST(&head->so_incomp); - TAILQ_REMOVE(&head->so_incomp, sp, so_list); - head->so_incqlen--; - sp->so_qstate &= ~SQ_INCOMP; - sp->so_head = NULL; - ACCEPT_UNLOCK(); + + sp = TAILQ_FIRST(&head->sol_incomp); + TAILQ_REMOVE(&head->sol_incomp, sp, so_list); + head->sol_incqlen--; + SOCK_LOCK(sp); + sp->so_qstate = SQ_NONE; + sp->so_listen = NULL; + SOCK_UNLOCK(sp); + sorele(head); /* does SOLISTEN_UNLOCK, head stays */ soabort(sp); - ACCEPT_LOCK(); + SOLISTEN_LOCK(head); } - TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); - so->so_qstate |= SQ_INCOMP; - head->so_incqlen++; + TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list); + so->so_qstate = SQ_INCOMP; + head->sol_incqlen++; + SOLISTEN_UNLOCK(head); } - ACCEPT_UNLOCK(); - if (connstatus) { - sorwakeup(head); - wakeup_one(&head->so_timeo); + return (so); +} + +#ifdef SCTP +/* + * Socket part of sctp_peeloff(). Detach a new socket from an + * association. The new socket is returned with a reference. + */ +struct socket * +sopeeloff(struct socket *head) +{ + struct socket *so; + + VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", + __func__, __LINE__, head)); + so = soalloc(head->so_vnet); + if (so == NULL) { + log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " + "limit reached or out of memory\n", + __func__, head->so_pcb); + return (NULL); } + so->so_type = head->so_type; + so->so_options = head->so_options; + so->so_linger = head->so_linger; + so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED; + so->so_fibnum = head->so_fibnum; + so->so_proto = head->so_proto; + so->so_cred = crhold(head->so_cred); +#ifdef MAC + mac_socket_newconn(head, so); +#endif + knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, + so_rdknl_assert_locked, so_rdknl_assert_unlocked); + knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, + so_wrknl_assert_locked, so_wrknl_assert_unlocked); + VNET_SO_ASSERT(head); + if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { + sodealloc(so); + log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", + __func__, head->so_pcb); + return (NULL); + } + if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { + sodealloc(so); + log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", + __func__, head->so_pcb); + return (NULL); + } + so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; + so->so_snd.sb_lowat = head->so_snd.sb_lowat; + so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; + so->so_snd.sb_timeo = head->so_snd.sb_timeo; + so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; + so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; + + soref(so); + return (so); } +#endif /* SCTP */ int sobind(struct socket *so, struct sockaddr *nam, struct thread *td) @@ -741,16 +807,140 @@ void solisten_proto(struct socket *so, int backlog) { + int sbrcv_lowat, sbsnd_lowat; + u_int sbrcv_hiwat, sbsnd_hiwat; + short sbrcv_flags, sbsnd_flags; + sbintime_t sbrcv_timeo, sbsnd_timeo; SOCK_LOCK_ASSERT(so); + if (SOLISTENING(so)) + goto listening; + + /* + * Change this socket to listening state. + */ + sbrcv_lowat = so->so_rcv.sb_lowat; + sbsnd_lowat = so->so_snd.sb_lowat; + sbrcv_hiwat = so->so_rcv.sb_hiwat; + sbsnd_hiwat = so->so_snd.sb_hiwat; + sbrcv_flags = so->so_rcv.sb_flags; + sbsnd_flags = so->so_snd.sb_flags; + sbrcv_timeo = so->so_rcv.sb_timeo; + sbsnd_timeo = so->so_snd.sb_timeo; + + sbdestroy(&so->so_snd, so); + sbdestroy(&so->so_rcv, so); + sx_destroy(&so->so_snd.sb_sx); + sx_destroy(&so->so_rcv.sb_sx); + SOCKBUF_LOCK_DESTROY(&so->so_snd); + SOCKBUF_LOCK_DESTROY(&so->so_rcv); + +#ifdef INVARIANTS + bzero(&so->so_rcv, + sizeof(struct socket) - offsetof(struct socket, so_rcv)); +#endif + + so->sol_sbrcv_lowat = sbrcv_lowat; + so->sol_sbsnd_lowat = sbsnd_lowat; + so->sol_sbrcv_hiwat = sbrcv_hiwat; + so->sol_sbsnd_hiwat = sbsnd_hiwat; + so->sol_sbrcv_flags = sbrcv_flags; + so->sol_sbsnd_flags = sbsnd_flags; + so->sol_sbrcv_timeo = sbrcv_timeo; + so->sol_sbsnd_timeo = sbsnd_timeo; + + so->sol_qlen = so->sol_incqlen = 0; + TAILQ_INIT(&so->sol_incomp); + TAILQ_INIT(&so->sol_comp); + + so->sol_accept_filter = NULL; + so->sol_accept_filter_arg = NULL; + so->sol_accept_filter_str = NULL; + + so->so_options |= SO_ACCEPTCONN; + +listening: if (backlog < 0 || backlog > somaxconn) backlog = somaxconn; - so->so_qlimit = backlog; - so->so_options |= SO_ACCEPTCONN; + so->sol_qlimit = backlog; } /* + * Wakeup listeners/subsystems once we have a complete connection. + * Enters with lock, returns unlocked. + */ +void +solisten_wakeup(struct socket *sol) +{ + + if (sol->sol_upcall != NULL) + (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT); + else { + selwakeuppri(&sol->so_rdsel, PSOCK); + KNOTE_LOCKED(&sol->so_rdsel.si_note, 0); + } + SOLISTEN_UNLOCK(sol); + wakeup_one(&sol->sol_comp); +} + +/* + * Return single connection off a listening socket queue. Main consumer of + * the function is kern_accept4(). Some modules, that do their own accept + * management also use the function. + * + * Listening socket must be locked on entry and is returned unlocked on + * return. + * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT. + */ +int +solisten_dequeue(struct socket *head, struct socket **ret, int flags) +{ + struct socket *so; + int error; + + SOLISTEN_LOCK_ASSERT(head); + + while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) && + head->so_error == 0) { + error = msleep(&head->sol_comp, &head->so_lock, PSOCK | PCATCH, + "accept", 0); + if (error != 0) { + SOLISTEN_UNLOCK(head); + return (error); + } + } + if (head->so_error) { + error = head->so_error; + head->so_error = 0; + SOLISTEN_UNLOCK(head); + return (error); + } + if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) { + SOLISTEN_UNLOCK(head); + return (EWOULDBLOCK); + } + so = TAILQ_FIRST(&head->sol_comp); + SOCK_LOCK(so); + KASSERT(so->so_qstate == SQ_COMP, + ("%s: so %p not SQ_COMP", __func__, so)); + soref(so); + head->sol_qlen--; + so->so_qstate = SQ_NONE; + so->so_listen = NULL; + TAILQ_REMOVE(&head->sol_comp, so, so_list); + if (flags & ACCEPT4_INHERIT) + so->so_state |= (head->so_state & SS_NBIO); + else + so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; + SOCK_UNLOCK(so); + sorele(head); + + *ret = so; + return (0); +} + +/* * Evaluate the reference count and named references on a socket; if no * references remain, free it. This should be called whenever a reference is * released, such as in sorele(), but also when named reference flags are @@ -774,44 +964,62 @@ sofree(struct socket *so) { struct protosw *pr = so->so_proto; - struct socket *head; - ACCEPT_LOCK_ASSERT(); SOCK_LOCK_ASSERT(so); if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 || - (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) { + (so->so_state & SS_PROTOREF) || (so->so_qstate == SQ_COMP)) { SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); return; } - head = so->so_head; - if (head != NULL) { - KASSERT((so->so_qstate & SQ_COMP) != 0 || - (so->so_qstate & SQ_INCOMP) != 0, - ("sofree: so_head != NULL, but neither SQ_COMP nor " - "SQ_INCOMP")); - KASSERT((so->so_qstate & SQ_COMP) == 0 || - (so->so_qstate & SQ_INCOMP) == 0, - ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP")); - TAILQ_REMOVE(&head->so_incomp, so, so_list); - head->so_incqlen--; - so->so_qstate &= ~SQ_INCOMP; - so->so_head = NULL; + if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) { + struct socket *sol; + + sol = so->so_listen; + KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so)); + + /* + * To solve race between close of a listening socket and + * a socket on its incomplete queue, we need to lock both. + * The order is first listening socket, then regular. + * Since we don't have SS_NOFDREF neither SS_PROTOREF, this + * function and the listening socket are the only pointers + * to so. To preserve so and sol, we reference both and then + * relock. + * After relock the socket may not move to so_comp since it + * doesn't have PCB already, but it may be removed from + * so_incomp. If that happens, we share responsiblity on + * freeing the socket, but soclose() has already removed + * it from queue. + */ + soref(sol); + soref(so); + SOCK_UNLOCK(so); + SOLISTEN_LOCK(sol); + SOCK_LOCK(so); + if (so->so_qstate == SQ_INCOMP) { + KASSERT(so->so_listen == sol, + ("%s: so %p migrated out of sol %p", + __func__, so, sol)); + TAILQ_REMOVE(&sol->sol_incomp, so, so_list); + sol->sol_incqlen--; + /* This is guarenteed not to be the last. */ + refcount_release(&sol->so_count); + so->so_qstate = SQ_NONE; + so->so_listen = NULL; + } else + KASSERT(so->so_listen == NULL, + ("%s: so %p not on (in)comp with so_listen", + __func__, so)); + sorele(sol); + KASSERT(so->so_count == 1, + ("%s: so %p count %u", __func__, so, so->so_count)); + so->so_count = 0; } - KASSERT((so->so_qstate & SQ_COMP) == 0 && - (so->so_qstate & SQ_INCOMP) == 0, - ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)", - so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP)); - if (so->so_options & SO_ACCEPTCONN) { - KASSERT((TAILQ_EMPTY(&so->so_comp)), - ("sofree: so_comp populated")); - KASSERT((TAILQ_EMPTY(&so->so_incomp)), - ("sofree: so_incomp populated")); - } + if (SOLISTENING(so)) + so->so_error = ECONNABORTED; SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); VNET_SO_ASSERT(so); if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) @@ -833,12 +1041,14 @@ * before calling pru_detach. This means that protocols shold not * assume they can perform socket wakeups, etc, in their detach code. */ - sbdestroy(&so->so_snd, so); - sbdestroy(&so->so_rcv, so); - seldrain(&so->so_snd.sb_sel); - seldrain(&so->so_rcv.sb_sel); - knlist_destroy(&so->so_rcv.sb_sel.si_note); - knlist_destroy(&so->so_snd.sb_sel.si_note); + if (!SOLISTENING(so)) { + sbdestroy(&so->so_snd, so); + sbdestroy(&so->so_rcv, so); + } + seldrain(&so->so_rdsel); + seldrain(&so->so_wrsel); + knlist_destroy(&so->so_rdsel.si_note); + knlist_destroy(&so->so_wrsel.si_note); sodealloc(so); } @@ -853,6 +1063,8 @@ int soclose(struct socket *so) { + struct accept_queue lqueue; + bool listening; int error = 0; KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); @@ -885,41 +1097,42 @@ drop: if (so->so_proto->pr_usrreqs->pru_close != NULL) (*so->so_proto->pr_usrreqs->pru_close)(so); - ACCEPT_LOCK(); - if (so->so_options & SO_ACCEPTCONN) { + + SOCK_LOCK(so); + if ((listening = (so->so_options & SO_ACCEPTCONN))) { struct socket *sp; - /* - * Prevent new additions to the accept queues due - * to ACCEPT_LOCK races while we are draining them. - */ - so->so_options &= ~SO_ACCEPTCONN; - while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) { - TAILQ_REMOVE(&so->so_incomp, sp, so_list); - so->so_incqlen--; - sp->so_qstate &= ~SQ_INCOMP; - sp->so_head = NULL; - ACCEPT_UNLOCK(); - soabort(sp); - ACCEPT_LOCK(); + + TAILQ_INIT(&lqueue); + TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list); + TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list); + + so->sol_qlen = so->sol_incqlen = 0; + + TAILQ_FOREACH(sp, &lqueue, so_list) { + SOCK_LOCK(sp); + sp->so_qstate = SQ_NONE; + sp->so_listen = NULL; + SOCK_UNLOCK(sp); + /* Guaranteed not to be the last. */ + refcount_release(&so->so_count); } - while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) { - TAILQ_REMOVE(&so->so_comp, sp, so_list); - so->so_qlen--; - sp->so_qstate &= ~SQ_COMP; - sp->so_head = NULL; - ACCEPT_UNLOCK(); - soabort(sp); - ACCEPT_LOCK(); - } - KASSERT((TAILQ_EMPTY(&so->so_comp)), - ("%s: so_comp populated", __func__)); - KASSERT((TAILQ_EMPTY(&so->so_incomp)), - ("%s: so_incomp populated", __func__)); } - SOCK_LOCK(so); KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); so->so_state |= SS_NOFDREF; - sorele(so); /* NB: Returns with ACCEPT_UNLOCK(). */ + sorele(so); + if (listening) { + struct socket *sp; + + TAILQ_FOREACH(sp, &lqueue, so_list) { + SOCK_LOCK(sp); + if (sp->so_count == 0) { + SOCK_UNLOCK(sp); + soabort(sp); + } else + /* sp is now in sofree() */ + SOCK_UNLOCK(sp); + } + } CURVNET_RESTORE(); return (error); } @@ -951,13 +1164,11 @@ KASSERT(so->so_count == 0, ("soabort: so_count")); KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF")); KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF")); - KASSERT((so->so_state & SQ_COMP) == 0, ("soabort: SQ_COMP")); - KASSERT((so->so_state & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP")); + KASSERT(so->so_qstate == SQ_NONE, ("soabort: !SQ_NONE")); VNET_SO_ASSERT(so); if (so->so_proto->pr_usrreqs->pru_abort != NULL) (*so->so_proto->pr_usrreqs->pru_abort)(so); - ACCEPT_LOCK(); SOCK_LOCK(so); sofree(so); } @@ -2892,15 +3103,15 @@ break; case SO_LISTENQLIMIT: - optval = so->so_qlimit; + optval = SOLISTENING(so) ? so->sol_qlimit : 0; goto integer; case SO_LISTENQLEN: - optval = so->so_qlen; + optval = SOLISTENING(so) ? so->sol_qlen : 0; goto integer; case SO_LISTENINCQLEN: - optval = so->so_incqlen; + optval = SOLISTENING(so) ? so->sol_incqlen : 0; goto integer; case SO_TS_CLOCK: @@ -3047,7 +3258,7 @@ if (so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGURG, 0); - selwakeuppri(&so->so_rcv.sb_sel, PSOCK); + selwakeuppri(&so->so_rdsel, PSOCK); } int @@ -3067,44 +3278,54 @@ sopoll_generic(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { - int revents = 0; + int revents; - SOCKBUF_LOCK(&so->so_snd); - SOCKBUF_LOCK(&so->so_rcv); - if (events & (POLLIN | POLLRDNORM)) - if (soreadabledata(so)) - revents |= events & (POLLIN | POLLRDNORM); - - if (events & (POLLOUT | POLLWRNORM)) - if (sowriteable(so)) - revents |= events & (POLLOUT | POLLWRNORM); - - if (events & (POLLPRI | POLLRDBAND)) - if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) - revents |= events & (POLLPRI | POLLRDBAND); - - if ((events & POLLINIGNEOF) == 0) { - if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { - revents |= events & (POLLIN | POLLRDNORM); - if (so->so_snd.sb_state & SBS_CANTSENDMORE) - revents |= POLLHUP; + SOCK_LOCK(so); + if (SOLISTENING(so)) { + if (!(events & (POLLIN | POLLRDNORM))) + revents = 0; + else if (!TAILQ_EMPTY(&so->sol_comp)) + revents = events & (POLLIN | POLLRDNORM); + else { + selrecord(td, &so->so_rdsel); + revents = 0; } - } - - if (revents == 0) { - if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { - selrecord(td, &so->so_rcv.sb_sel); - so->so_rcv.sb_flags |= SB_SEL; + } else { + revents = 0; + SOCKBUF_LOCK(&so->so_snd); + SOCKBUF_LOCK(&so->so_rcv); + if (events & (POLLIN | POLLRDNORM)) + if (soreadabledata(so)) + revents |= events & (POLLIN | POLLRDNORM); + if (events & (POLLOUT | POLLWRNORM)) + if (sowriteable(so)) + revents |= events & (POLLOUT | POLLWRNORM); + if (events & (POLLPRI | POLLRDBAND)) + if (so->so_oobmark || + (so->so_rcv.sb_state & SBS_RCVATMARK)) + revents |= events & (POLLPRI | POLLRDBAND); + if ((events & POLLINIGNEOF) == 0) { + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { + revents |= events & (POLLIN | POLLRDNORM); + if (so->so_snd.sb_state & SBS_CANTSENDMORE) + revents |= POLLHUP; + } } - - if (events & (POLLOUT | POLLWRNORM)) { - selrecord(td, &so->so_snd.sb_sel); - so->so_snd.sb_flags |= SB_SEL; + if (revents == 0) { + if (events & + (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { + selrecord(td, &so->so_rdsel); + so->so_rcv.sb_flags |= SB_SEL; + } + if (events & (POLLOUT | POLLWRNORM)) { + selrecord(td, &so->so_wrsel); + so->so_snd.sb_flags |= SB_SEL; + } } + SOCKBUF_UNLOCK(&so->so_rcv); + SOCKBUF_UNLOCK(&so->so_snd); } - - SOCKBUF_UNLOCK(&so->so_rcv); - SOCKBUF_UNLOCK(&so->so_snd); + SOCK_UNLOCK(so); return (revents); } @@ -3113,28 +3334,38 @@ { struct socket *so = kn->kn_fp->f_data; struct sockbuf *sb; + struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &soread_filtops; + knl = &so->so_rdsel.si_note; sb = &so->so_rcv; break; case EVFILT_WRITE: kn->kn_fop = &sowrite_filtops; + knl = &so->so_wrsel.si_note; sb = &so->so_snd; break; case EVFILT_EMPTY: kn->kn_fop = &soempty_filtops; + knl = &so->so_wrsel.si_note; sb = &so->so_snd; break; default: return (EINVAL); } - SOCKBUF_LOCK(sb); - knlist_add(&sb->sb_sel.si_note, kn, 1); - sb->sb_flags |= SB_KNOTE; - SOCKBUF_UNLOCK(sb); + SOCK_LOCK(so); + if (SOLISTENING(so)) { + knlist_add(knl, kn, 1); + } else { + SOCKBUF_LOCK(sb); + knlist_add(knl, kn, 1); + sb->sb_flags |= SB_KNOTE; + SOCKBUF_UNLOCK(sb); + } + SOCK_UNLOCK(so); return (0); } @@ -3313,11 +3544,11 @@ { struct socket *so = kn->kn_fp->f_data; - SOCKBUF_LOCK(&so->so_rcv); - knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1); - if (knlist_empty(&so->so_rcv.sb_sel.si_note)) + so_rdknl_lock(so); + knlist_remove(&so->so_rdsel.si_note, kn, 1); + if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) so->so_rcv.sb_flags &= ~SB_KNOTE; - SOCKBUF_UNLOCK(&so->so_rcv); + so_rdknl_unlock(so); } /*ARGSUSED*/ @@ -3327,11 +3558,13 @@ struct socket *so; so = kn->kn_fp->f_data; - if (so->so_options & SO_ACCEPTCONN) { - kn->kn_data = so->so_qlen; - return (!TAILQ_EMPTY(&so->so_comp)); + if (SOLISTENING(so)) { + SOCK_LOCK_ASSERT(so); + kn->kn_data = so->sol_qlen; + return (!TAILQ_EMPTY(&so->sol_comp)); } + SOCKBUF_LOCK_ASSERT(&so->so_rcv); kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; @@ -3357,11 +3590,11 @@ { struct socket *so = kn->kn_fp->f_data; - SOCKBUF_LOCK(&so->so_snd); - knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1); - if (knlist_empty(&so->so_snd.sb_sel.si_note)) + so_wrknl_lock(so); + knlist_remove(&so->so_wrsel.si_note, kn, 1); + if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) so->so_snd.sb_flags &= ~SB_KNOTE; - SOCKBUF_UNLOCK(&so->so_snd); + so_wrknl_unlock(so); } /*ARGSUSED*/ @@ -3371,6 +3604,10 @@ struct socket *so; so = kn->kn_fp->f_data; + + if (SOLISTENING(so)) + return (0); + SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbspace(&so->so_snd); @@ -3397,6 +3634,10 @@ struct socket *so; so = kn->kn_fp->f_data; + + if (SOLISTENING(so)) + return (1); + SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbused(&so->so_snd); @@ -3465,42 +3706,52 @@ struct socket *head; int ret; + /* + * XXXGL: this is the only place where we acquire socket locks + * in reverse order: first child, then listening socket. To + * avoid possible LOR, use try semantics. + */ restart: - ACCEPT_LOCK(); SOCK_LOCK(so); + if ((head = so->so_listen) != NULL && + __predict_false(SOLISTEN_TRYLOCK(head) == 0)) { + SOCK_UNLOCK(so); + goto restart; + } so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); so->so_state |= SS_ISCONNECTED; - head = so->so_head; - if (head != NULL && (so->so_qstate & SQ_INCOMP)) { + if (head != NULL && (so->so_qstate == SQ_INCOMP)) { +again: if ((so->so_options & SO_ACCEPTFILTER) == 0) { + TAILQ_REMOVE(&head->sol_incomp, so, so_list); + head->sol_incqlen--; + TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); + head->sol_qlen++; + so->so_qstate = SQ_COMP; SOCK_UNLOCK(so); - TAILQ_REMOVE(&head->so_incomp, so, so_list); - head->so_incqlen--; - so->so_qstate &= ~SQ_INCOMP; - TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); - head->so_qlen++; - so->so_qstate |= SQ_COMP; - ACCEPT_UNLOCK(); - sorwakeup(head); - wakeup_one(&head->so_timeo); + solisten_wakeup(head); /* unlocks */ } else { - ACCEPT_UNLOCK(); + SOCKBUF_LOCK(&so->so_rcv); soupcall_set(so, SO_RCV, - head->so_accf->so_accept_filter->accf_callback, - head->so_accf->so_accept_filter_arg); + head->sol_accept_filter->accf_callback, + head->sol_accept_filter_arg); so->so_options &= ~SO_ACCEPTFILTER; - ret = head->so_accf->so_accept_filter->accf_callback(so, - head->so_accf->so_accept_filter_arg, M_NOWAIT); - if (ret == SU_ISCONNECTED) + ret = head->sol_accept_filter->accf_callback(so, + head->sol_accept_filter_arg, M_NOWAIT); + if (ret == SU_ISCONNECTED) { soupcall_clear(so, SO_RCV); + SOCKBUF_UNLOCK(&so->so_rcv); + goto again; + } + SOCKBUF_UNLOCK(&so->so_rcv); SOCK_UNLOCK(so); - if (ret == SU_ISCONNECTED) - goto restart; + SOLISTEN_UNLOCK(head); } return; } + if (head != NULL) + SOLISTEN_UNLOCK(head); SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); wakeup(&so->so_timeo); sorwakeup(so); sowwakeup(so); @@ -3510,16 +3761,17 @@ soisdisconnecting(struct socket *so) { - /* - * Note: This code assumes that SOCK_LOCK(so) and - * SOCKBUF_LOCK(&so->so_rcv) are the same. - */ - SOCKBUF_LOCK(&so->so_rcv); + SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTING; so->so_state |= SS_ISDISCONNECTING; - socantrcvmore_locked(so); - SOCKBUF_LOCK(&so->so_snd); - socantsendmore_locked(so); + + if (!SOLISTENING(so)) { + SOCKBUF_LOCK(&so->so_rcv); + socantrcvmore_locked(so); + SOCKBUF_LOCK(&so->so_snd); + socantsendmore_locked(so); + } + SOCK_UNLOCK(so); wakeup(&so->so_timeo); } @@ -3527,17 +3779,18 @@ soisdisconnected(struct socket *so) { - /* - * Note: This code assumes that SOCK_LOCK(so) and - * SOCKBUF_LOCK(&so->so_rcv) are the same. - */ - SOCKBUF_LOCK(&so->so_rcv); + SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISDISCONNECTED; - socantrcvmore_locked(so); - SOCKBUF_LOCK(&so->so_snd); - sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); - socantsendmore_locked(so); + + if (!SOLISTENING(so)) { + SOCKBUF_LOCK(&so->so_rcv); + socantrcvmore_locked(so); + SOCKBUF_LOCK(&so->so_snd); + sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); + socantsendmore_locked(so); + } + SOCK_UNLOCK(so); wakeup(&so->so_timeo); } @@ -3563,6 +3816,8 @@ { struct sockbuf *sb; + KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); + switch (which) { case SO_RCV: sb = &so->so_rcv; @@ -3584,6 +3839,8 @@ { struct sockbuf *sb; + KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so)); + switch (which) { case SO_RCV: sb = &so->so_rcv; @@ -3595,12 +3852,110 @@ panic("soupcall_clear: bad which"); } SOCKBUF_LOCK_ASSERT(sb); - KASSERT(sb->sb_upcall != NULL, ("soupcall_clear: no upcall to clear")); + KASSERT(sb->sb_upcall != NULL, + ("%s: so %p no upcall to clear", __func__, so)); sb->sb_upcall = NULL; sb->sb_upcallarg = NULL; sb->sb_flags &= ~SB_UPCALL; } +void +solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg) +{ + + SOLISTEN_LOCK_ASSERT(so); + so->sol_upcall = func; + so->sol_upcallarg = arg; +} + +static void +so_rdknl_lock(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_LOCK(so); + else + SOCKBUF_LOCK(&so->so_rcv); +} + +static void +so_rdknl_unlock(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_UNLOCK(so); + else + SOCKBUF_UNLOCK(&so->so_rcv); +} + +static void +so_rdknl_assert_locked(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_LOCK_ASSERT(so); + else + SOCKBUF_LOCK_ASSERT(&so->so_rcv); +} + +static void +so_rdknl_assert_unlocked(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_UNLOCK_ASSERT(so); + else + SOCKBUF_UNLOCK_ASSERT(&so->so_rcv); +} + +static void +so_wrknl_lock(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_LOCK(so); + else + SOCKBUF_LOCK(&so->so_snd); +} + +static void +so_wrknl_unlock(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_UNLOCK(so); + else + SOCKBUF_UNLOCK(&so->so_snd); +} + +static void +so_wrknl_assert_locked(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_LOCK_ASSERT(so); + else + SOCKBUF_LOCK_ASSERT(&so->so_snd); +} + +static void +so_wrknl_assert_unlocked(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_UNLOCK_ASSERT(so); + else + SOCKBUF_UNLOCK_ASSERT(&so->so_snd); +} + /* * Create an external-format (``xsocket'') structure using the information in * the kernel-format socket structure pointed to by so. This is done to @@ -3622,32 +3977,24 @@ xso->so_pcb = so->so_pcb; xso->xso_protocol = so->so_proto->pr_protocol; xso->xso_family = so->so_proto->pr_domain->dom_family; - xso->so_qlen = so->so_qlen; - xso->so_incqlen = so->so_incqlen; - xso->so_qlimit = so->so_qlimit; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; - xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; - xso->so_oobmark = so->so_oobmark; - sbtoxsockbuf(&so->so_snd, &xso->so_snd); - sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); xso->so_uid = so->so_cred->cr_uid; -} - - -/* - * Socket accessor functions to provide external consumers with - * a safe interface to socket state - * - */ - -void -so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), - void *arg) -{ - - TAILQ_FOREACH(so, &so->so_comp, so_list) - func(so, arg); + xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; + if (SOLISTENING(so)) { + xso->so_qlen = so->sol_qlen; + xso->so_incqlen = so->sol_incqlen; + xso->so_qlimit = so->sol_qlimit; + xso->so_oobmark = 0; + bzero(&xso->so_snd, sizeof(xso->so_snd)); + bzero(&xso->so_rcv, sizeof(xso->so_rcv)); + } else { + xso->so_state |= so->so_qstate; + xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0; + xso->so_oobmark = so->so_oobmark; + sbtoxsockbuf(&so->so_snd, &xso->so_snd); + sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); + } } struct sockbuf * Index: head/sys/kern/uipc_syscalls.c =================================================================== --- head/sys/kern/uipc_syscalls.c +++ head/sys/kern/uipc_syscalls.c @@ -68,13 +68,6 @@ #include #include -/* - * Flags for accept1() and kern_accept4(), in addition to SOCK_CLOEXEC - * and SOCK_NONBLOCK. - */ -#define ACCEPT4_INHERIT 0x1 -#define ACCEPT4_COMPAT 0x2 - static int sendit(struct thread *td, int s, struct msghdr *mp, int flags); static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp); @@ -350,59 +343,22 @@ (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0, &fcaps); if (error != 0) goto done; - ACCEPT_LOCK(); - if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { - ACCEPT_UNLOCK(); - error = EWOULDBLOCK; + SOCK_LOCK(head); + if (!SOLISTENING(head)) { + SOCK_UNLOCK(head); + error = EINVAL; goto noconnection; } - while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { - if (head->so_rcv.sb_state & SBS_CANTRCVMORE) { - head->so_error = ECONNABORTED; - break; - } - error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH, - "accept", 0); - if (error != 0) { - ACCEPT_UNLOCK(); - goto noconnection; - } - } - if (head->so_error) { - error = head->so_error; - head->so_error = 0; - ACCEPT_UNLOCK(); + + error = solisten_dequeue(head, &so, flags); + if (error != 0) goto noconnection; - } - so = TAILQ_FIRST(&head->so_comp); - KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP")); - KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP")); - /* - * Before changing the flags on the socket, we have to bump the - * reference count. Otherwise, if the protocol calls sofree(), - * the socket will be released due to a zero refcount. - */ - SOCK_LOCK(so); /* soref() and so_state update */ - soref(so); /* file descriptor reference */ - - TAILQ_REMOVE(&head->so_comp, so, so_list); - head->so_qlen--; - if (flags & ACCEPT4_INHERIT) - so->so_state |= (head->so_state & SS_NBIO); - else - so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; - - SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); - /* An extra reference on `nfp' has been held for us by falloc(). */ td->td_retval[0] = fd; - /* connection has been removed from the listen queue */ - KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0); + /* Connection has been removed from the listen queue. */ + KNOTE_UNLOCKED(&head->so_rdsel.si_note, 0); if (flags & ACCEPT4_INHERIT) { pgid = fgetown(&head->so_sigio); @@ -420,7 +376,6 @@ (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td); tmp = fflag & FASYNC; (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td); - sa = NULL; error = soaccept(so, &sa); if (error != 0) goto noconnection; @@ -558,7 +513,7 @@ } SOCK_LOCK(so); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { - error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH, + error = msleep(&so->so_timeo, &so->so_lock, PSOCK | PCATCH, "connec", 0); if (error != 0) { if (error == EINTR || error == ERESTART) Index: head/sys/kern/uipc_usrreq.c =================================================================== --- head/sys/kern/uipc_usrreq.c +++ head/sys/kern/uipc_usrreq.c @@ -189,10 +189,9 @@ /* * Locking and synchronization: * - * Three types of locks exit in the local domain socket implementation: a - * global list mutex, a global linkage rwlock, and per-unpcb mutexes. Of the - * global locks, the list lock protects the socket count, global generation - * number, and stream/datagram global lists. The linkage lock protects the + * Two types of locks exist in the local domain socket implementation: a + * a global linkage rwlock and per-unpcb mutexes. The linkage lock protects + * the socket count, global generation number, stream/datagram global lists and * interconnection of unpcbs, the v_socket and unp_vnode pointers, and can be * held exclusively over the acquisition of multiple unpcb locks to prevent * deadlock. @@ -233,7 +232,6 @@ * to perform namei() and other file system operations. */ static struct rwlock unp_link_rwlock; -static struct mtx unp_list_lock; static struct mtx unp_defers_lock; #define UNP_LINK_LOCK_INIT() rw_init(&unp_link_rwlock, \ @@ -250,12 +248,8 @@ #define UNP_LINK_WUNLOCK() rw_wunlock(&unp_link_rwlock) #define UNP_LINK_WLOCK_ASSERT() rw_assert(&unp_link_rwlock, \ RA_WLOCKED) +#define UNP_LINK_WOWNED() rw_wowned(&unp_link_rwlock) -#define UNP_LIST_LOCK_INIT() mtx_init(&unp_list_lock, \ - "unp_list_lock", NULL, MTX_DEF) -#define UNP_LIST_LOCK() mtx_lock(&unp_list_lock) -#define UNP_LIST_UNLOCK() mtx_unlock(&unp_list_lock) - #define UNP_DEFERRED_LOCK_INIT() mtx_init(&unp_defers_lock, \ "unp_defer", NULL, MTX_DEF) #define UNP_DEFERRED_LOCK() mtx_lock(&unp_defers_lock) @@ -396,6 +390,7 @@ u_long sendspace, recvspace; struct unpcb *unp; int error; + bool locked; KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL")); if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { @@ -430,10 +425,12 @@ unp->unp_socket = so; so->so_pcb = unp; unp->unp_refcount = 1; - if (so->so_head != NULL) + if (so->so_listen != NULL) unp->unp_flags |= UNP_NASCENT; - UNP_LIST_LOCK(); + if ((locked = UNP_LINK_WOWNED()) == false) + UNP_LINK_WLOCK(); + unp->unp_gencnt = ++unp_gencnt; unp_count++; switch (so->so_type) { @@ -452,8 +449,10 @@ default: panic("uipc_attach"); } - UNP_LIST_UNLOCK(); + if (locked == false) + UNP_LINK_WUNLOCK(); + return (0); } @@ -607,6 +606,7 @@ uipc_close(struct socket *so) { struct unpcb *unp, *unp2; + struct vnode *vp = NULL; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_close: unp == NULL")); @@ -619,8 +619,14 @@ unp_disconnect(unp, unp2); UNP_PCB_UNLOCK(unp2); } + if (SOLISTENING(so) && ((vp = unp->unp_vnode) != NULL)) { + VOP_UNP_DETACH(vp); + unp->unp_vnode = NULL; + } UNP_PCB_UNLOCK(unp); UNP_LINK_WUNLOCK(); + if (vp) + vrele(vp); } static int @@ -657,18 +663,13 @@ vp = NULL; local_unp_rights = 0; - UNP_LIST_LOCK(); + UNP_LINK_WLOCK(); LIST_REMOVE(unp, unp_link); unp->unp_gencnt = ++unp_gencnt; --unp_count; - UNP_LIST_UNLOCK(); - - if ((unp->unp_flags & UNP_NASCENT) != 0) { - UNP_PCB_LOCK(unp); - goto teardown; - } - UNP_LINK_WLOCK(); UNP_PCB_LOCK(unp); + if ((unp->unp_flags & UNP_NASCENT) != 0) + goto teardown; if ((vp = unp->unp_vnode) != NULL) { VOP_UNP_DETACH(vp); @@ -693,8 +694,8 @@ UNP_PCB_UNLOCK(ref); } local_unp_rights = unp_rights; - UNP_LINK_WUNLOCK(); teardown: + UNP_LINK_WUNLOCK(); unp->unp_socket->so_pcb = NULL; saved_unp_addr = unp->unp_addr; unp->unp_addr = NULL; @@ -1315,7 +1316,7 @@ { struct sockaddr_un *soun = (struct sockaddr_un *)nam; struct vnode *vp; - struct socket *so2, *so3; + struct socket *so2; struct unpcb *unp, *unp2, *unp3; struct nameidata nd; char buf[SOCK_MAXADDRLEN]; @@ -1392,22 +1393,20 @@ error = EPROTOTYPE; goto bad2; } + UNP_PCB_LOCK(unp); + UNP_PCB_LOCK(unp2); if (so->so_proto->pr_flags & PR_CONNREQUIRED) { if (so2->so_options & SO_ACCEPTCONN) { CURVNET_SET(so2->so_vnet); - so3 = sonewconn(so2, 0); + so2 = sonewconn(so2, 0); CURVNET_RESTORE(); } else - so3 = NULL; - if (so3 == NULL) { + so2 = NULL; + if (so2 == NULL) { error = ECONNREFUSED; - goto bad2; + goto bad3; } - unp = sotounpcb(so); - unp2 = sotounpcb(so2); - unp3 = sotounpcb(so3); - UNP_PCB_LOCK(unp); - UNP_PCB_LOCK(unp2); + unp3 = sotounpcb(so2); UNP_PCB_LOCK(unp3); if (unp2->unp_addr != NULL) { bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len); @@ -1433,23 +1432,19 @@ unp->unp_flags |= UNP_HAVEPC; if (unp2->unp_flags & UNP_WANTCRED) unp3->unp_flags |= UNP_WANTCRED; - UNP_PCB_UNLOCK(unp3); UNP_PCB_UNLOCK(unp2); - UNP_PCB_UNLOCK(unp); + unp2 = unp3; #ifdef MAC - mac_socketpeer_set_from_socket(so, so3); - mac_socketpeer_set_from_socket(so3, so); + mac_socketpeer_set_from_socket(so, so2); + mac_socketpeer_set_from_socket(so2, so); #endif - - so2 = so3; } - unp = sotounpcb(so); - KASSERT(unp != NULL, ("unp_connect: unp == NULL")); - unp2 = sotounpcb(so2); - KASSERT(unp2 != NULL, ("unp_connect: unp2 == NULL")); - UNP_PCB_LOCK(unp); - UNP_PCB_LOCK(unp2); + + KASSERT(unp2 != NULL && so2 != NULL && unp2->unp_socket == so2 && + sotounpcb(so2) == unp2, + ("%s: unp2 %p so2 %p", __func__, unp2, so2)); error = unp_connect2(so, so2, PRU_CONNECT); +bad3: UNP_PCB_UNLOCK(unp2); UNP_PCB_UNLOCK(unp); bad2: @@ -1591,10 +1586,10 @@ * OK, now we're committed to doing something. */ xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK); - UNP_LIST_LOCK(); + UNP_LINK_RLOCK(); gencnt = unp_gencnt; n = unp_count; - UNP_LIST_UNLOCK(); + UNP_LINK_RUNLOCK(); xug->xug_len = sizeof *xug; xug->xug_count = n; @@ -1608,7 +1603,7 @@ unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK); - UNP_LIST_LOCK(); + UNP_LINK_RLOCK(); for (unp = LIST_FIRST(head), i = 0; unp && i < n; unp = LIST_NEXT(unp, unp_link)) { UNP_PCB_LOCK(unp); @@ -1623,7 +1618,7 @@ } UNP_PCB_UNLOCK(unp); } - UNP_LIST_UNLOCK(); + UNP_LINK_RUNLOCK(); n = i; /* In case we lost some during malloc. */ error = 0; @@ -1881,7 +1876,6 @@ TIMEOUT_TASK_INIT(taskqueue_thread, &unp_gc_task, 0, unp_gc, NULL); TASK_INIT(&unp_defer_task, 0, unp_process_defers, NULL); UNP_LINK_LOCK_INIT(); - UNP_LIST_LOCK_INIT(); UNP_DEFERRED_LOCK_INIT(); } @@ -2232,8 +2226,7 @@ static void unp_gc_process(struct unpcb *unp) { - struct socket *soa; - struct socket *so; + struct socket *so, *soa; struct file *fp; /* Already processed. */ @@ -2253,28 +2246,30 @@ return; } - /* - * Mark all sockets we reference with RIGHTS. - */ so = unp->unp_socket; - if ((unp->unp_gcflag & UNPGC_IGNORE_RIGHTS) == 0) { - SOCKBUF_LOCK(&so->so_rcv); - unp_scan(so->so_rcv.sb_mb, unp_accessable); - SOCKBUF_UNLOCK(&so->so_rcv); + SOCK_LOCK(so); + if (SOLISTENING(so)) { + /* + * Mark all sockets in our accept queue. + */ + TAILQ_FOREACH(soa, &so->sol_comp, so_list) { + if (sotounpcb(soa)->unp_gcflag & UNPGC_IGNORE_RIGHTS) + continue; + SOCKBUF_LOCK(&soa->so_rcv); + unp_scan(soa->so_rcv.sb_mb, unp_accessable); + SOCKBUF_UNLOCK(&soa->so_rcv); + } + } else { + /* + * Mark all sockets we reference with RIGHTS. + */ + if ((unp->unp_gcflag & UNPGC_IGNORE_RIGHTS) == 0) { + SOCKBUF_LOCK(&so->so_rcv); + unp_scan(so->so_rcv.sb_mb, unp_accessable); + SOCKBUF_UNLOCK(&so->so_rcv); + } } - - /* - * Mark all sockets in our accept queue. - */ - ACCEPT_LOCK(); - TAILQ_FOREACH(soa, &so->so_comp, so_list) { - if ((sotounpcb(soa)->unp_gcflag & UNPGC_IGNORE_RIGHTS) != 0) - continue; - SOCKBUF_LOCK(&soa->so_rcv); - unp_scan(soa->so_rcv.sb_mb, unp_accessable); - SOCKBUF_UNLOCK(&soa->so_rcv); - } - ACCEPT_UNLOCK(); + SOCK_UNLOCK(so); unp->unp_gcflag |= UNPGC_SCANNED; } @@ -2297,7 +2292,7 @@ int i, total; unp_taskcount++; - UNP_LIST_LOCK(); + UNP_LINK_RLOCK(); /* * First clear all gc flags from previous runs, apart from * UNPGC_IGNORE_RIGHTS. @@ -2320,7 +2315,7 @@ LIST_FOREACH(unp, *head, unp_link) unp_gc_process(unp); } while (unp_marked); - UNP_LIST_UNLOCK(); + UNP_LINK_RUNLOCK(); if (unp_unreachable == 0) return; @@ -2335,7 +2330,6 @@ * as as unreachable and store them locally. */ UNP_LINK_RLOCK(); - UNP_LIST_LOCK(); for (total = 0, head = heads; *head != NULL; head++) LIST_FOREACH(unp, *head, unp_link) if ((unp->unp_gcflag & UNPGC_DEAD) != 0) { @@ -2348,7 +2342,6 @@ KASSERT(total <= unp_unreachable, ("unp_gc: incorrect unreachable count.")); } - UNP_LIST_UNLOCK(); UNP_LINK_RUNLOCK(); /* @@ -2391,10 +2384,11 @@ struct unpcb *unp; unp = sotounpcb(so); - UNP_LIST_LOCK(); + UNP_LINK_WLOCK(); unp->unp_gcflag |= UNPGC_IGNORE_RIGHTS; - UNP_LIST_UNLOCK(); - unp_dispose_mbuf(so->so_rcv.sb_mb); + UNP_LINK_WUNLOCK(); + if (!SOLISTENING(so)) + unp_dispose_mbuf(so->so_rcv.sb_mb); } static void Index: head/sys/netgraph/bluetooth/socket/ng_btsocket_l2cap.c =================================================================== --- head/sys/netgraph/bluetooth/socket/ng_btsocket_l2cap.c +++ head/sys/netgraph/bluetooth/socket/ng_btsocket_l2cap.c @@ -614,21 +614,13 @@ pcb = ng_btsocket_l2cap_pcb_by_addr(&rt->src, ip->psm); if (pcb != NULL) { - struct socket *so1 = NULL; + struct socket *so1; mtx_lock(&pcb->pcb_mtx); - /* - * First check the pending connections queue and if we have - * space then create new socket and set proper source address. - */ - - if (pcb->so->so_qlen <= pcb->so->so_qlimit) { - CURVNET_SET(pcb->so->so_vnet); - so1 = sonewconn(pcb->so, 0); - CURVNET_RESTORE(); - } - + CURVNET_SET(pcb->so->so_vnet); + so1 = sonewconn(pcb->so, 0); + CURVNET_RESTORE(); if (so1 == NULL) { result = NG_L2CAP_NO_RESOURCES; goto respond; Index: head/sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c =================================================================== --- head/sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c +++ head/sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c @@ -1149,7 +1149,7 @@ { ng_btsocket_rfcomm_pcb_p pcb = NULL, pcb1 = NULL; ng_btsocket_l2cap_pcb_p l2pcb = NULL; - struct socket *so1 = NULL; + struct socket *so1; mtx_assert(&s->session_mtx, MA_OWNED); @@ -1171,11 +1171,9 @@ mtx_lock(&pcb->pcb_mtx); - if (pcb->so->so_qlen <= pcb->so->so_qlimit) { - CURVNET_SET(pcb->so->so_vnet); - so1 = sonewconn(pcb->so, 0); - CURVNET_RESTORE(); - } + CURVNET_SET(pcb->so->so_vnet); + so1 = sonewconn(pcb->so, 0); + CURVNET_RESTORE(); mtx_unlock(&pcb->pcb_mtx); @@ -1405,46 +1403,24 @@ static int ng_btsocket_rfcomm_session_accept(ng_btsocket_rfcomm_session_p s0) { - struct socket *l2so = NULL; + struct socket *l2so; struct sockaddr_l2cap *l2sa = NULL; ng_btsocket_l2cap_pcb_t *l2pcb = NULL; ng_btsocket_rfcomm_session_p s = NULL; - int error = 0; + int error; mtx_assert(&ng_btsocket_rfcomm_sessions_mtx, MA_OWNED); mtx_assert(&s0->session_mtx, MA_OWNED); - /* Check if there is a complete L2CAP connection in the queue */ - if ((error = s0->l2so->so_error) != 0) { + SOLISTEN_LOCK(s0->l2so); + error = solisten_dequeue(s0->l2so, &l2so, 0); + if (error == EWOULDBLOCK) + return (error); + if (error) { NG_BTSOCKET_RFCOMM_ERR( "%s: Could not accept connection on L2CAP socket, error=%d\n", __func__, error); - s0->l2so->so_error = 0; - return (error); } - - ACCEPT_LOCK(); - if (TAILQ_EMPTY(&s0->l2so->so_comp)) { - ACCEPT_UNLOCK(); - if (s0->l2so->so_rcv.sb_state & SBS_CANTRCVMORE) - return (ECONNABORTED); - return (EWOULDBLOCK); - } - - /* Accept incoming L2CAP connection */ - l2so = TAILQ_FIRST(&s0->l2so->so_comp); - if (l2so == NULL) - panic("%s: l2so == NULL\n", __func__); - - TAILQ_REMOVE(&s0->l2so->so_comp, l2so, so_list); - s0->l2so->so_qlen --; - l2so->so_qstate &= ~SQ_COMP; - l2so->so_head = NULL; - SOCK_LOCK(l2so); - soref(l2so); - l2so->so_state |= SS_NBIO; - SOCK_UNLOCK(l2so); - ACCEPT_UNLOCK(); error = soaccept(l2so, (struct sockaddr **) &l2sa); if (error != 0) { Index: head/sys/netgraph/bluetooth/socket/ng_btsocket_sco.c =================================================================== --- head/sys/netgraph/bluetooth/socket/ng_btsocket_sco.c +++ head/sys/netgraph/bluetooth/socket/ng_btsocket_sco.c @@ -471,20 +471,13 @@ pcb = ng_btsocket_sco_pcb_by_addr(&rt->src); if (pcb != NULL) { - struct socket *so1 = NULL; + struct socket *so1; /* pcb is locked */ - /* - * First check the pending connections queue and if we have - * space then create new socket and set proper source address. - */ - - if (pcb->so->so_qlen <= pcb->so->so_qlimit) { - CURVNET_SET(pcb->so->so_vnet); - so1 = sonewconn(pcb->so, 0); - CURVNET_RESTORE(); - } + CURVNET_SET(pcb->so->so_vnet); + so1 = sonewconn(pcb->so, 0); + CURVNET_RESTORE(); if (so1 == NULL) { status = 0x0d; /* Rejected due to limited resources */ Index: head/sys/netgraph/ng_ksocket.c =================================================================== --- head/sys/netgraph/ng_ksocket.c +++ head/sys/netgraph/ng_ksocket.c @@ -153,8 +153,7 @@ }; /* Helper functions */ -static int ng_ksocket_check_accept(priv_p); -static void ng_ksocket_finish_accept(priv_p); +static int ng_ksocket_accept(priv_p); static int ng_ksocket_incoming(struct socket *so, void *arg, int waitflag); static int ng_ksocket_parse(const struct ng_ksocket_alias *aliases, const char *s, int family); @@ -698,6 +697,7 @@ ERROUT(ENXIO); /* Listen */ + so->so_state |= SS_NBIO; error = solisten(so, *((int32_t *)msg->data), td); break; } @@ -716,21 +716,16 @@ if (priv->flags & KSF_ACCEPTING) ERROUT(EALREADY); - error = ng_ksocket_check_accept(priv); - if (error != 0 && error != EWOULDBLOCK) - ERROUT(error); - /* * If a connection is already complete, take it. * Otherwise let the upcall function deal with * the connection when it comes in. */ + error = ng_ksocket_accept(priv); + if (error != 0 && error != EWOULDBLOCK) + ERROUT(error); priv->response_token = msg->header.token; raddr = priv->response_addr = NGI_RETADDR(item); - if (error == 0) { - ng_ksocket_finish_accept(priv); - } else - priv->flags |= KSF_ACCEPTING; break; } @@ -1068,13 +1063,8 @@ } /* Check whether a pending accept operation has completed */ - if (priv->flags & KSF_ACCEPTING) { - error = ng_ksocket_check_accept(priv); - if (error != EWOULDBLOCK) - priv->flags &= ~KSF_ACCEPTING; - if (error == 0) - ng_ksocket_finish_accept(priv); - } + if (priv->flags & KSF_ACCEPTING) + (void )ng_ksocket_accept(priv); /* * If we don't have a hook, we must handle data events later. When @@ -1171,37 +1161,10 @@ } } -/* - * Check for a completed incoming connection and return 0 if one is found. - * Otherwise return the appropriate error code. - */ static int -ng_ksocket_check_accept(priv_p priv) +ng_ksocket_accept(priv_p priv) { struct socket *const head = priv->so; - int error; - - if ((error = head->so_error) != 0) { - head->so_error = 0; - return error; - } - /* Unlocked read. */ - if (TAILQ_EMPTY(&head->so_comp)) { - if (head->so_rcv.sb_state & SBS_CANTRCVMORE) - return ECONNABORTED; - return EWOULDBLOCK; - } - return 0; -} - -/* - * Handle the first completed incoming connection, assumed to be already - * on the socket's so_comp queue. - */ -static void -ng_ksocket_finish_accept(priv_p priv) -{ - struct socket *const head = priv->so; struct socket *so; struct sockaddr *sa = NULL; struct ng_mesg *resp; @@ -1211,24 +1174,16 @@ int len; int error; - ACCEPT_LOCK(); - so = TAILQ_FIRST(&head->so_comp); - if (so == NULL) { /* Should never happen */ - ACCEPT_UNLOCK(); - return; + SOLISTEN_LOCK(head); + error = solisten_dequeue(head, &so, SOCK_NONBLOCK); + if (error == EWOULDBLOCK) { + priv->flags |= KSF_ACCEPTING; + return (error); } - TAILQ_REMOVE(&head->so_comp, so, so_list); - head->so_qlen--; - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; - SOCK_LOCK(so); - soref(so); - so->so_state |= SS_NBIO; - SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); + priv->flags &= ~KSF_ACCEPTING; + if (error) + return (error); - /* XXX KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0); */ - soaccept(so, &sa); len = OFFSETOF(struct ng_ksocket_accept, addr); @@ -1288,6 +1243,8 @@ out: if (sa != NULL) free(sa, M_SONAME); + + return (0); } /* Index: head/sys/netinet/sctp_input.c =================================================================== --- head/sys/netinet/sctp_input.c +++ head/sys/netinet/sctp_input.c @@ -5200,11 +5200,21 @@ * listening responded to a INIT-ACK and then * closed. We opened and bound.. and are now no * longer listening. + * + * XXXGL: notes on checking listen queue length. + * 1) SCTP_IS_LISTENING() doesn't necessarily mean + * SOLISTENING(), because a listening "UDP type" + * socket isn't listening in terms of the socket + * layer. It is a normal data flow socket, that + * can fork off new connections. Thus, we should + * look into sol_qlen only in case we are !UDP. + * 2) Checking sol_qlen in general requires locking + * the socket, and this code lacks that. */ - if ((stcb == NULL) && (!SCTP_IS_LISTENING(inp) || - inp->sctp_socket->so_qlen >= inp->sctp_socket->so_qlimit)) { + (!(inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) && + inp->sctp_socket->sol_qlen >= inp->sctp_socket->sol_qlimit))) { if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && (SCTP_BASE_SYSCTL(sctp_abort_if_one_2_one_hits_limit))) { op_err = sctp_generate_cause(SCTP_CAUSE_OUT_OF_RESC, ""); Index: head/sys/netinet/sctp_syscalls.c =================================================================== --- head/sys/netinet/sctp_syscalls.c +++ head/sys/netinet/sctp_syscalls.c @@ -152,29 +152,11 @@ td->td_retval[0] = fd; CURVNET_SET(head->so_vnet); - so = sonewconn(head, SS_ISCONNECTED); + so = sopeeloff(head); if (so == NULL) { error = ENOMEM; goto noconnection; } - /* - * Before changing the flags on the socket, we have to bump the - * reference count. Otherwise, if the protocol calls sofree(), - * the socket will be released due to a zero refcount. - */ - SOCK_LOCK(so); - soref(so); /* file descriptor reference */ - SOCK_UNLOCK(so); - - ACCEPT_LOCK(); - - TAILQ_REMOVE(&head->so_comp, so, so_list); - head->so_qlen--; - so->so_state |= (head->so_state & SS_NBIO); - so->so_state &= ~SS_NOFDREF; - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; - ACCEPT_UNLOCK(); finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name); if (error != 0) Index: head/sys/netinet/sctp_sysctl.c =================================================================== --- head/sys/netinet/sctp_sysctl.c +++ head/sys/netinet/sctp_sysctl.c @@ -415,12 +415,12 @@ xinpcb.qlen = 0; xinpcb.maxqlen = 0; } else { - xinpcb.qlen = so->so_qlen; - xinpcb.qlen_old = so->so_qlen > USHRT_MAX ? - USHRT_MAX : (uint16_t)so->so_qlen; - xinpcb.maxqlen = so->so_qlimit; - xinpcb.maxqlen_old = so->so_qlimit > USHRT_MAX ? - USHRT_MAX : (uint16_t)so->so_qlimit; + xinpcb.qlen = so->sol_qlen; + xinpcb.qlen_old = so->sol_qlen > USHRT_MAX ? + USHRT_MAX : (uint16_t)so->sol_qlen; + xinpcb.maxqlen = so->sol_qlimit; + xinpcb.maxqlen_old = so->sol_qlimit > USHRT_MAX ? + USHRT_MAX : (uint16_t)so->sol_qlimit; } SCTP_INP_INCR_REF(inp); SCTP_INP_RUNLOCK(inp); Index: head/sys/netinet/sctp_usrreq.c =================================================================== --- head/sys/netinet/sctp_usrreq.c +++ head/sys/netinet/sctp_usrreq.c @@ -7138,19 +7138,12 @@ } } SCTP_INP_WLOCK(inp); - SOCK_LOCK(so); - /* It appears for 7.0 and on, we must always call this. */ - solisten_proto(so, backlog); - if (inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) { - /* remove the ACCEPTCONN flag for one-to-many sockets */ - so->so_options &= ~SO_ACCEPTCONN; + if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) == 0) { + SOCK_LOCK(so); + solisten_proto(so, backlog); + SOCK_UNLOCK(so); } - if (backlog > 0) { - inp->sctp_flags |= SCTP_PCB_FLAGS_ACCEPTING; - } else { - inp->sctp_flags &= ~SCTP_PCB_FLAGS_ACCEPTING; - } - SOCK_UNLOCK(so); + inp->sctp_flags |= SCTP_PCB_FLAGS_ACCEPTING; SCTP_INP_WUNLOCK(inp); return (error); } Index: head/sys/netinet/tcp_subr.c =================================================================== --- head/sys/netinet/tcp_subr.c +++ head/sys/netinet/tcp_subr.c @@ -1664,7 +1664,6 @@ ("tcp_close: !SS_PROTOREF")); inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); - ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); Index: head/sys/netinet/tcp_syncache.c =================================================================== --- head/sys/netinet/tcp_syncache.c +++ head/sys/netinet/tcp_syncache.c @@ -1264,6 +1264,7 @@ * soon as possible. */ so = *lsop; + KASSERT(SOLISTENING(so), ("%s: %p not listening", __func__, so)); tp = sototcpcb(so); cred = crhold(so->so_cred); @@ -1274,7 +1275,7 @@ #endif ip_ttl = inp->inp_ip_ttl; ip_tos = inp->inp_ip_tos; - win = sbspace(&so->so_rcv); + win = so->sol_sbrcv_hiwat; ltflags = (tp->t_flags & (TF_NOOPT | TF_SIGNATURE)); #ifdef TCP_RFC7413 @@ -1287,7 +1288,7 @@ * listen queue with bogus TFO connections. */ if (atomic_fetchadd_int(tp->t_tfo_pending, 1) <= - (so->so_qlimit / 2)) { + (so->sol_qlimit / 2)) { int result; result = tcp_fastopen_check_cookie(inc, @@ -2115,7 +2116,7 @@ sc->sc_flags |= SCF_WINSCALE; } - wnd = sbspace(&lso->so_rcv); + wnd = lso->sol_sbrcv_hiwat; wnd = imax(wnd, 0); wnd = imin(wnd, TCP_MAXWIN); sc->sc_wnd = wnd; Index: head/sys/netinet/tcp_timewait.c =================================================================== --- head/sys/netinet/tcp_timewait.c +++ head/sys/netinet/tcp_timewait.c @@ -352,7 +352,6 @@ ("tcp_twstart: !SS_PROTOREF")); inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); - ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); @@ -491,7 +490,6 @@ if (inp->inp_flags & INP_SOCKREF) { inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); - ACCEPT_LOCK(); SOCK_LOCK(so); KASSERT(so->so_state & SS_PROTOREF, ("tcp_twclose: INP_SOCKREF && !SS_PROTOREF")); Index: head/sys/ofed/drivers/infiniband/core/iwcm.c =================================================================== --- head/sys/ofed/drivers/infiniband/core/iwcm.c +++ head/sys/ofed/drivers/infiniband/core/iwcm.c @@ -416,34 +416,19 @@ { struct socket *so; struct sockaddr_in *remote; + int error; - ACCEPT_LOCK(); - so = TAILQ_FIRST(&head->so_comp); - if (!so) { - ACCEPT_UNLOCK(); - return NULL; - } - - SOCK_LOCK(so); - /* - * Before changing the flags on the socket, we have to bump the - * reference count. Otherwise, if the protocol calls sofree(), - * the socket will be released due to a zero refcount. - */ - soref(so); - TAILQ_REMOVE(&head->so_comp, so, so_list); - head->so_qlen--; - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; - so->so_state |= SS_NBIO; - SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); + SOLISTEN_LOCK(head); + error = solisten_dequeue(head, &so, SOCK_NONBLOCK); + if (error == EWOULDBLOCK) + return (NULL); remote = NULL; soaccept(so, (struct sockaddr **)&remote); free(remote, M_SONAME); return so; } + static void iw_so_event_handler(struct work_struct *_work) { @@ -485,18 +470,17 @@ #endif return; } + static int iw_so_upcall(struct socket *parent_so, void *arg, int waitflag) { struct iwcm_listen_work *work; - struct socket *so; struct iw_cm_id *cm_id = arg; /* check whether iw_so_event_handler() already dequeued this 'so' */ - so = TAILQ_FIRST(&parent_so->so_comp); - if (!so) + if (TAILQ_EMPTY(&parent_so->sol_comp)) return SU_OK; - work = kzalloc(sizeof(*work), M_NOWAIT); + work = kzalloc(sizeof(*work), waitflag); if (!work) return -ENOMEM; work->cm_id = cm_id; @@ -507,17 +491,21 @@ return SU_OK; } -static void -iw_init_sock(struct iw_cm_id *cm_id) +static int +iw_create_listen(struct iw_cm_id *cm_id, int backlog) { struct sockopt sopt; struct socket *so = cm_id->so; int on = 1; + int rc; - SOCK_LOCK(so); - soupcall_set(so, SO_RCV, iw_so_upcall, cm_id); + rc = -solisten(cm_id->so, backlog, curthread); + if (rc != 0) + return (rc); + SOLISTEN_LOCK(so); + solisten_upcall_set(so, iw_so_upcall, cm_id); so->so_state |= SS_NBIO; - SOCK_UNLOCK(so); + SOLISTEN_UNLOCK(so); sopt.sopt_dir = SOPT_SET; sopt.sopt_level = IPPROTO_TCP; sopt.sopt_name = TCP_NODELAY; @@ -525,37 +513,18 @@ sopt.sopt_valsize = sizeof(on); sopt.sopt_td = NULL; sosetopt(so, &sopt); -} - -static int -iw_uninit_socket(struct iw_cm_id *cm_id) -{ - struct socket *so = cm_id->so; - - SOCK_LOCK(so); - soupcall_clear(so, SO_RCV); - SOCK_UNLOCK(so); - return (0); } static int -iw_create_listen(struct iw_cm_id *cm_id, int backlog) -{ - int rc; - - iw_init_sock(cm_id); - rc = -solisten(cm_id->so, backlog, curthread); - if (rc != 0) - iw_uninit_socket(cm_id); - return (rc); -} - -static int iw_destroy_listen(struct iw_cm_id *cm_id) { + struct socket *so = cm_id->so; - return (iw_uninit_socket(cm_id)); + SOLISTEN_LOCK(so); + solisten_upcall_set(so, NULL, NULL); + SOLISTEN_UNLOCK(so); + return (0); } Index: head/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c =================================================================== --- head/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c +++ head/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c @@ -310,7 +310,6 @@ ("sdp_closed: !SS_PROTOREF")); ssk->flags &= ~SDP_SOCKREF; SDP_WUNLOCK(ssk); - ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); Index: head/sys/rpc/svc_vc.c =================================================================== --- head/sys/rpc/svc_vc.c +++ head/sys/rpc/svc_vc.c @@ -96,6 +96,7 @@ struct sockaddr *raddr); static int svc_vc_accept(struct socket *head, struct socket **sop); static int svc_vc_soupcall(struct socket *so, void *arg, int waitflag); +static int svc_vc_rendezvous_soupcall(struct socket *, void *, int); static struct xp_ops svc_vc_rendezvous_ops = { .xp_recv = svc_vc_rendezvous_recv, @@ -183,10 +184,10 @@ solisten(so, -1, curthread); - SOCKBUF_LOCK(&so->so_rcv); + SOLISTEN_LOCK(so); xprt->xp_upcallset = 1; - soupcall_set(so, SO_RCV, svc_vc_soupcall, xprt); - SOCKBUF_UNLOCK(&so->so_rcv); + solisten_upcall_set(so, svc_vc_rendezvous_soupcall, xprt); + SOLISTEN_UNLOCK(so); return (xprt); @@ -316,9 +317,11 @@ int svc_vc_accept(struct socket *head, struct socket **sop) { - int error = 0; struct socket *so; + int error = 0; + short nbio; + /* XXXGL: shouldn't that be an assertion? */ if ((head->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto done; @@ -328,38 +331,26 @@ if (error != 0) goto done; #endif - ACCEPT_LOCK(); - if (TAILQ_EMPTY(&head->so_comp)) { - ACCEPT_UNLOCK(); - error = EWOULDBLOCK; - goto done; - } - so = TAILQ_FIRST(&head->so_comp); - KASSERT(!(so->so_qstate & SQ_INCOMP), ("svc_vc_accept: so SQ_INCOMP")); - KASSERT(so->so_qstate & SQ_COMP, ("svc_vc_accept: so not SQ_COMP")); - /* - * Before changing the flags on the socket, we have to bump the - * reference count. Otherwise, if the protocol calls sofree(), - * the socket will be released due to a zero refcount. - * XXX might not need soref() since this is simpler than kern_accept. + * XXXGL: we want non-blocking semantics. The socket could be a + * socket created by kernel as well as socket shared with userland, + * so we can't be sure about presense of SS_NBIO. We also shall not + * toggle it on the socket, since that may surprise userland. So we + * set SS_NBIO only temporarily. */ - SOCK_LOCK(so); /* soref() and so_state update */ - soref(so); /* file descriptor reference */ + SOLISTEN_LOCK(head); + nbio = head->so_state & SS_NBIO; + head->so_state |= SS_NBIO; + error = solisten_dequeue(head, &so, 0); + head->so_state &= (nbio & ~SS_NBIO); + if (error) + goto done; - TAILQ_REMOVE(&head->so_comp, so, so_list); - head->so_qlen--; - so->so_state |= (head->so_state & SS_NBIO); - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; - - SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); - + so->so_state |= nbio; *sop = so; /* connection has been removed from the listen queue */ - KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0); + KNOTE_UNLOCKED(&head->so_rdsel.si_note, 0); done: return (error); } @@ -392,21 +383,21 @@ * connection arrives after our call to accept fails * with EWOULDBLOCK. */ - ACCEPT_LOCK(); - if (TAILQ_EMPTY(&xprt->xp_socket->so_comp)) + SOLISTEN_LOCK(xprt->xp_socket); + if (TAILQ_EMPTY(&xprt->xp_socket->sol_comp)) xprt_inactive_self(xprt); - ACCEPT_UNLOCK(); + SOLISTEN_UNLOCK(xprt->xp_socket); sx_xunlock(&xprt->xp_lock); return (FALSE); } if (error) { - SOCKBUF_LOCK(&xprt->xp_socket->so_rcv); + SOLISTEN_LOCK(xprt->xp_socket); if (xprt->xp_upcallset) { xprt->xp_upcallset = 0; soupcall_clear(xprt->xp_socket, SO_RCV); } - SOCKBUF_UNLOCK(&xprt->xp_socket->so_rcv); + SOLISTEN_UNLOCK(xprt->xp_socket); xprt_inactive_self(xprt); sx_xunlock(&xprt->xp_lock); return (FALSE); @@ -453,12 +444,6 @@ static void svc_vc_destroy_common(SVCXPRT *xprt) { - SOCKBUF_LOCK(&xprt->xp_socket->so_rcv); - if (xprt->xp_upcallset) { - xprt->xp_upcallset = 0; - soupcall_clear(xprt->xp_socket, SO_RCV); - } - SOCKBUF_UNLOCK(&xprt->xp_socket->so_rcv); if (xprt->xp_socket) (void)soclose(xprt->xp_socket); @@ -472,6 +457,13 @@ svc_vc_rendezvous_destroy(SVCXPRT *xprt) { + SOLISTEN_LOCK(xprt->xp_socket); + if (xprt->xp_upcallset) { + xprt->xp_upcallset = 0; + solisten_upcall_set(xprt->xp_socket, NULL, NULL); + } + SOLISTEN_UNLOCK(xprt->xp_socket); + svc_vc_destroy_common(xprt); } @@ -480,6 +472,13 @@ { struct cf_conn *cd = (struct cf_conn *)xprt->xp_p1; + SOCKBUF_LOCK(&xprt->xp_socket->so_rcv); + if (xprt->xp_upcallset) { + xprt->xp_upcallset = 0; + soupcall_clear(xprt->xp_socket, SO_RCV); + } + SOCKBUF_UNLOCK(&xprt->xp_socket->so_rcv); + svc_vc_destroy_common(xprt); if (cd->mreq) @@ -954,6 +953,16 @@ SVCXPRT *xprt = (SVCXPRT *) arg; if (soreadable(xprt->xp_socket)) + xprt_active(xprt); + return (SU_OK); +} + +static int +svc_vc_rendezvous_soupcall(struct socket *head, void *arg, int waitflag) +{ + SVCXPRT *xprt = (SVCXPRT *) arg; + + if (!TAILQ_EMPTY(&head->sol_comp)) xprt_active(xprt); return (SU_OK); } Index: head/sys/sys/sockbuf.h =================================================================== --- head/sys/sys/sockbuf.h +++ head/sys/sys/sockbuf.h @@ -32,7 +32,6 @@ */ #ifndef _SYS_SOCKBUF_H_ #define _SYS_SOCKBUF_H_ -#include /* for struct selinfo */ #include #include #include @@ -64,6 +63,7 @@ struct sockaddr; struct socket; struct thread; +struct selinfo; struct xsockbuf { u_int sb_cc; @@ -84,9 +84,9 @@ * (a) locked by SOCKBUF_LOCK(). */ struct sockbuf { - struct selinfo sb_sel; /* process selecting read/write */ - struct mtx sb_mtx; /* sockbuf lock */ - struct sx sb_sx; /* prevent I/O interlacing */ + struct mtx sb_mtx; /* sockbuf lock */ + struct sx sb_sx; /* prevent I/O interlacing */ + struct selinfo *sb_sel; /* process selecting read/write */ short sb_state; /* (a) socket state on sockbuf */ #define sb_startzero sb_mb struct mbuf *sb_mb; /* (a) the mbuf chain */ Index: head/sys/sys/socket.h =================================================================== --- head/sys/sys/socket.h +++ head/sys/sys/socket.h @@ -111,7 +111,15 @@ */ #define SOCK_CLOEXEC 0x10000000 #define SOCK_NONBLOCK 0x20000000 -#endif +#ifdef _KERNEL +/* + * Flags for accept1(), kern_accept4() and solisten_dequeue, in addition + * to SOCK_CLOEXEC and SOCK_NONBLOCK. + */ +#define ACCEPT4_INHERIT 0x1 +#define ACCEPT4_COMPAT 0x2 +#endif /* _KERNEL */ +#endif /* __BSD_VISIBLE */ /* * Option flags per-socket. @@ -704,9 +712,5 @@ void so_lock(struct socket *so); void so_unlock(struct socket *so); -void so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), void *arg); - -#endif - - +#endif /* _KERNEL */ #endif /* !_SYS_SOCKET_H_ */ Index: head/sys/sys/socketvar.h =================================================================== --- head/sys/sys/socketvar.h +++ head/sys/sys/socketvar.h @@ -64,60 +64,35 @@ * Locking key to struct socket: * (a) constant after allocation, no locking required. * (b) locked by SOCK_LOCK(so). - * (c) locked by SOCKBUF_LOCK(&so->so_rcv). - * (e) locked by ACCEPT_LOCK(). + * (cr) locked by SOCKBUF_LOCK(&so->so_rcv). + * (cs) locked by SOCKBUF_LOCK(&so->so_rcv). + * (e) locked by SOLISTEN_LOCK() of corresponding listening socket. * (f) not locked since integer reads/writes are atomic. * (g) used only as a sleep/wakeup address, no value. * (h) locked by global mutex so_global_mtx. */ +TAILQ_HEAD(accept_queue, socket); struct socket { - int so_count; /* (b) reference count */ + struct mtx so_lock; + volatile u_int so_count; /* (b / refcount) */ + struct selinfo so_rdsel; /* (b/cr) for so_rcv/so_comp */ + struct selinfo so_wrsel; /* (b/cs) for so_snd */ short so_type; /* (a) generic type, see socket.h */ - short so_options; /* from socket call, see socket.h */ - short so_linger; /* time to linger while closing */ + short so_options; /* (b) from socket call, see socket.h */ + short so_linger; /* time to linger close(2) */ short so_state; /* (b) internal state flags SS_* */ - int so_qstate; /* (e) internal state flags SQ_* */ void *so_pcb; /* protocol control block */ struct vnet *so_vnet; /* (a) network stack instance */ struct protosw *so_proto; /* (a) protocol handle */ -/* - * Variables for connection queuing. - * Socket where accepts occur is so_head in all subsidiary sockets. - * If so_head is 0, socket is not related to an accept. - * For head socket so_incomp queues partially completed connections, - * while so_comp is a queue of connections ready to be accepted. - * If a connection is aborted and it has so_head set, then - * it has to be pulled out of either so_incomp or so_comp. - * We allow connections to queue up based on current queue lengths - * and limit on number of queued connections for this socket. - */ - struct socket *so_head; /* (e) back pointer to listen socket */ - TAILQ_HEAD(, socket) so_incomp; /* (e) queue of partial unaccepted connections */ - TAILQ_HEAD(, socket) so_comp; /* (e) queue of complete unaccepted connections */ - TAILQ_ENTRY(socket) so_list; /* (e) list of unaccepted connections */ - u_int so_qlen; /* (e) number of unaccepted connections */ - u_int so_incqlen; /* (e) number of unaccepted incomplete - connections */ - u_int so_qlimit; /* (e) max number queued connections */ short so_timeo; /* (g) connection timeout */ u_short so_error; /* (f) error affecting connection */ struct sigio *so_sigio; /* [sg] information for async I/O or out of band data (SIGURG) */ - u_long so_oobmark; /* (c) chars to oob mark */ - - struct sockbuf so_rcv, so_snd; - struct ucred *so_cred; /* (a) user credentials */ struct label *so_label; /* (b) MAC label for socket */ - struct label *so_peerlabel; /* (b) cached MAC label for peer */ /* NB: generation count must not be first. */ so_gen_t so_gencnt; /* (h) generation count */ void *so_emuldata; /* (b) private data for emulators */ - struct so_accf { - struct accept_filter *so_accept_filter; - void *so_accept_filter_arg; /* saved filter args */ - char *so_accept_filter_str; /* saved user args */ - } *so_accf; struct osd osd; /* Object Specific extensions */ /* * so_fibnum, so_user_cookie and friends can be used to attach @@ -130,41 +105,95 @@ int so_ts_clock; /* type of the clock used for timestamps */ uint32_t so_max_pacing_rate; /* (f) TX rate limit in bytes/s */ + union { + /* Regular (data flow) socket. */ + struct { + /* (cr, cs) Receive and send buffers. */ + struct sockbuf so_rcv, so_snd; - void *so_pspare[2]; /* general use */ - int so_ispare[2]; /* general use */ + /* (e) Our place on accept queue. */ + TAILQ_ENTRY(socket) so_list; + struct socket *so_listen; /* (b) */ + enum { + SQ_NONE = 0, + SQ_INCOMP = 0x0800, /* on sol_incomp */ + SQ_COMP = 0x1000, /* on sol_comp */ + } so_qstate; /* (b) */ + + /* (b) cached MAC label for peer */ + struct label *so_peerlabel; + u_long so_oobmark; /* chars to oob mark */ + }; + /* + * Listening socket, where accepts occur, is so_listen in all + * subsidiary sockets. If so_listen is NULL, socket is not + * related to an accept. For a listening socket itself + * sol_incomp queues partially completed connections, while + * sol_comp is a queue of connections ready to be accepted. + * If a connection is aborted and it has so_listen set, then + * it has to be pulled out of either sol_incomp or sol_comp. + * We allow connections to queue up based on current queue + * lengths and limit on number of queued connections for this + * socket. + */ + struct { + /* (e) queue of partial unaccepted connections */ + struct accept_queue sol_incomp; + /* (e) queue of complete unaccepted connections */ + struct accept_queue sol_comp; + u_int sol_qlen; /* (e) sol_comp length */ + u_int sol_incqlen; /* (e) sol_incomp length */ + u_int sol_qlimit; /* (e) queue limit */ + + /* accept_filter(9) optional data */ + struct accept_filter *sol_accept_filter; + void *sol_accept_filter_arg; /* saved filter args */ + char *sol_accept_filter_str; /* saved user args */ + + /* Optional upcall, for kernel socket. */ + so_upcall_t *sol_upcall; /* (e) */ + void *sol_upcallarg; /* (e) */ + + /* Socket buffer parameters, to be copied to + * dataflow sockets, accepted from this one. */ + int sol_sbrcv_lowat; + int sol_sbsnd_lowat; + u_int sol_sbrcv_hiwat; + u_int sol_sbsnd_hiwat; + short sol_sbrcv_flags; + short sol_sbsnd_flags; + sbintime_t sol_sbrcv_timeo; + sbintime_t sol_sbsnd_timeo; + }; + }; }; -/* - * Global accept mutex to serialize access to accept queues and - * fields associated with multiple sockets. This allows us to - * avoid defining a lock order between listen and accept sockets - * until such time as it proves to be a good idea. - */ -extern struct mtx accept_mtx; -#define ACCEPT_LOCK_ASSERT() mtx_assert(&accept_mtx, MA_OWNED) -#define ACCEPT_UNLOCK_ASSERT() mtx_assert(&accept_mtx, MA_NOTOWNED) -#define ACCEPT_LOCK() mtx_lock(&accept_mtx) -#define ACCEPT_UNLOCK() mtx_unlock(&accept_mtx) +#define SOCK_MTX(so) &(so)->so_lock +#define SOCK_LOCK(so) mtx_lock(&(so)->so_lock) +#define SOCK_OWNED(so) mtx_owned(&(so)->so_lock) +#define SOCK_UNLOCK(so) mtx_unlock(&(so)->so_lock) +#define SOCK_LOCK_ASSERT(so) mtx_assert(&(so)->so_lock, MA_OWNED) +#define SOCK_UNLOCK_ASSERT(so) mtx_assert(&(so)->so_lock, MA_NOTOWNED) -/* - * Per-socket mutex: we reuse the receive socket buffer mutex for space - * efficiency. This decision should probably be revisited as we optimize - * locking for the socket code. - */ -#define SOCK_MTX(_so) SOCKBUF_MTX(&(_so)->so_rcv) -#define SOCK_LOCK(_so) SOCKBUF_LOCK(&(_so)->so_rcv) -#define SOCK_OWNED(_so) SOCKBUF_OWNED(&(_so)->so_rcv) -#define SOCK_UNLOCK(_so) SOCKBUF_UNLOCK(&(_so)->so_rcv) -#define SOCK_LOCK_ASSERT(_so) SOCKBUF_LOCK_ASSERT(&(_so)->so_rcv) +#define SOLISTENING(sol) (((sol)->so_options & SO_ACCEPTCONN) != 0) +#define SOLISTEN_LOCK(sol) do { \ + mtx_lock(&(sol)->so_lock); \ + KASSERT(SOLISTENING(sol), \ + ("%s: %p not listening", __func__, (sol))); \ +} while (0) +#define SOLISTEN_TRYLOCK(sol) mtx_trylock(&(sol)->so_lock) +#define SOLISTEN_UNLOCK(sol) do { \ + KASSERT(SOLISTENING(sol), \ + ("%s: %p not listening", __func__, (sol))); \ + mtx_unlock(&(sol)->so_lock); \ +} while (0) +#define SOLISTEN_LOCK_ASSERT(sol) do { \ + mtx_assert(&(sol)->so_lock, MA_OWNED); \ + KASSERT(SOLISTENING(sol), \ + ("%s: %p not listening", __func__, (sol))); \ +} while (0) /* - * Socket state bits stored in so_qstate. - */ -#define SQ_INCOMP 0x0800 /* unaccepted, incomplete connection */ -#define SQ_COMP 0x1000 /* unaccepted, complete connection */ - -/* * Externalized form of struct socket used by the sysctl(3) interface. */ struct xsocket { @@ -213,8 +242,7 @@ /* can we read something from so? */ #define soreadabledata(so) \ - (sbavail(&(so)->so_rcv) >= (so)->so_rcv.sb_lowat || \ - !TAILQ_EMPTY(&(so)->so_comp) || (so)->so_error) + (sbavail(&(so)->so_rcv) >= (so)->so_rcv.sb_lowat || (so)->so_error) #define soreadable(so) \ (soreadabledata(so) || ((so)->so_rcv.sb_state & SBS_CANTRCVMORE)) @@ -227,26 +255,19 @@ (so)->so_error) /* - * soref()/sorele() ref-count the socket structure. Note that you must - * still explicitly close the socket, but the last ref count will free - * the structure. + * soref()/sorele() ref-count the socket structure. + * soref() may be called without owning socket lock, but in that case a + * caller must own something that holds socket, and so_count must be not 0. + * Note that you must still explicitly close the socket, but the last ref + * count will free the structure. */ -#define soref(so) do { \ - SOCK_LOCK_ASSERT(so); \ - ++(so)->so_count; \ -} while (0) - +#define soref(so) refcount_acquire(&(so)->so_count) #define sorele(so) do { \ - ACCEPT_LOCK_ASSERT(); \ SOCK_LOCK_ASSERT(so); \ - if ((so)->so_count <= 0) \ - panic("sorele"); \ - if (--(so)->so_count == 0) \ + if (refcount_release(&(so)->so_count)) \ sofree(so); \ - else { \ + else \ SOCK_UNLOCK(so); \ - ACCEPT_UNLOCK(); \ - } \ } while (0) /* @@ -369,10 +390,11 @@ int solisten(struct socket *so, int backlog, struct thread *td); void solisten_proto(struct socket *so, int backlog); int solisten_proto_check(struct socket *so); +int solisten_dequeue(struct socket *, struct socket **, int); struct socket * sonewconn(struct socket *head, int connstatus); - - +struct socket * + sopeeloff(struct socket *); int sopoll(struct socket *so, int events, struct ucred *active_cred, struct thread *td); int sopoll_generic(struct socket *so, int events, @@ -403,8 +425,10 @@ void sotoxsocket(struct socket *so, struct xsocket *xso); void soupcall_clear(struct socket *, int); void soupcall_set(struct socket *, int, so_upcall_t, void *); +void solisten_upcall_set(struct socket *, so_upcall_t, void *); void sowakeup(struct socket *so, struct sockbuf *sb); void sowakeup_aio(struct socket *so, struct sockbuf *sb); +void solisten_wakeup(struct socket *); int selsocket(struct socket *so, int events, struct timeval *tv, struct thread *td); Index: head/usr.bin/netstat/inet.c =================================================================== --- head/usr.bin/netstat/inet.c +++ head/usr.bin/netstat/inet.c @@ -170,14 +170,17 @@ if (kread((uintptr_t)proto.pr_domain, &domain, sizeof(domain)) != 0) return (-1); xso->xso_family = domain.dom_family; - xso->so_qlen = so->so_qlen; - xso->so_incqlen = so->so_incqlen; - xso->so_qlimit = so->so_qlimit; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; - xso->so_oobmark = so->so_oobmark; - sbtoxsockbuf(&so->so_snd, &xso->so_snd); - sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); + if (SOLISTENING(so)) { + xso->so_qlen = so->sol_qlen; + xso->so_incqlen = so->sol_incqlen; + xso->so_qlimit = so->sol_qlimit; + } else { + sbtoxsockbuf(&so->so_snd, &xso->so_snd); + sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); + xso->so_oobmark = so->so_oobmark; + } return (0); }