Index: sys/kern/sys_socket.c =================================================================== --- sys/kern/sys_socket.c +++ sys/kern/sys_socket.c @@ -170,32 +170,36 @@ break; case FIOASYNC: - /* - * XXXRW: This code separately acquires SOCK_LOCK(so) and - * SOCKBUF_LOCK(&so->so_rcv) even though they are the same - * mutex to avoid introducing the assumption that they are - * the same. - */ if (*(int *)data) { SOCK_LOCK(so); so->so_state |= SS_ASYNC; + if (SOLISTENING(so)) { + so->sol_sbrcv_flags |= SB_ASYNC; + so->sol_sbsnd_flags |= SB_ASYNC; + } else { + SOCKBUF_LOCK(&so->so_rcv); + so->so_rcv.sb_flags |= SB_ASYNC; + SOCKBUF_UNLOCK(&so->so_rcv); + SOCKBUF_LOCK(&so->so_snd); + so->so_snd.sb_flags |= SB_ASYNC; + SOCKBUF_UNLOCK(&so->so_snd); + } SOCK_UNLOCK(so); - SOCKBUF_LOCK(&so->so_rcv); - so->so_rcv.sb_flags |= SB_ASYNC; - SOCKBUF_UNLOCK(&so->so_rcv); - SOCKBUF_LOCK(&so->so_snd); - so->so_snd.sb_flags |= SB_ASYNC; - SOCKBUF_UNLOCK(&so->so_snd); } else { SOCK_LOCK(so); so->so_state &= ~SS_ASYNC; + if (SOLISTENING(so)) { + so->sol_sbrcv_flags &= ~SB_ASYNC; + so->sol_sbsnd_flags &= ~SB_ASYNC; + } else { + SOCKBUF_LOCK(&so->so_rcv); + so->so_rcv.sb_flags &= ~SB_ASYNC; + SOCKBUF_UNLOCK(&so->so_rcv); + SOCKBUF_LOCK(&so->so_snd); + so->so_snd.sb_flags &= ~SB_ASYNC; + SOCKBUF_UNLOCK(&so->so_snd); + } SOCK_UNLOCK(so); - SOCKBUF_LOCK(&so->so_rcv); - so->so_rcv.sb_flags &= ~SB_ASYNC; - SOCKBUF_UNLOCK(&so->so_rcv); - SOCKBUF_LOCK(&so->so_snd); - so->so_snd.sb_flags &= ~SB_ASYNC; - SOCKBUF_UNLOCK(&so->so_snd); } break; @@ -695,7 +699,6 @@ sb->sb_flags &= ~SB_AIO_RUNNING; SOCKBUF_UNLOCK(sb); - ACCEPT_LOCK(); SOCK_LOCK(so); sorele(so); } Index: sys/kern/uipc_accf.c =================================================================== --- sys/kern/uipc_accf.c +++ sys/kern/uipc_accf.c @@ -162,26 +162,25 @@ } int -do_getopt_accept_filter(struct socket *so, struct sockopt *sopt) +accept_filt_getopt(struct socket *so, struct sockopt *sopt) { struct accept_filter_arg *afap; int error; error = 0; - afap = malloc(sizeof(*afap), M_TEMP, - M_WAITOK | M_ZERO); + afap = malloc(sizeof(*afap), M_TEMP, M_WAITOK | M_ZERO); SOCK_LOCK(so); if ((so->so_options & SO_ACCEPTCONN) == 0) { error = EINVAL; goto out; } - if ((so->so_options & SO_ACCEPTFILTER) == 0) { + if (so->sol_accept_filter == NULL) { error = EINVAL; goto out; } - strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name); - if (so->so_accf->so_accept_filter_str != NULL) - strcpy(afap->af_arg, so->so_accf->so_accept_filter_str); + strcpy(afap->af_name, so->sol_accept_filter->accf_name); + if (so->sol_accept_filter_str != NULL) + strcpy(afap->af_arg, so->sol_accept_filter_str); out: SOCK_UNLOCK(so); if (error == 0) @@ -191,12 +190,13 @@ } int -do_setopt_accept_filter(struct socket *so, struct sockopt *sopt) +accept_filt_setopt(struct socket *so, struct sockopt *sopt) { struct accept_filter_arg *afap; struct accept_filter *afp; - struct so_accf *newaf; - int error = 0; + char *accept_filter_str = NULL; + void *accept_filter_arg = NULL; + int error; /* * Handle the simple delete case first. @@ -207,18 +207,15 @@ SOCK_UNLOCK(so); return (EINVAL); } - if (so->so_accf != NULL) { - struct so_accf *af = so->so_accf; - if (af->so_accept_filter != NULL && - af->so_accept_filter->accf_destroy != NULL) { - af->so_accept_filter->accf_destroy(so); - } - if (af->so_accept_filter_str != NULL) - free(af->so_accept_filter_str, M_ACCF); - free(af, M_ACCF); - so->so_accf = NULL; + if (so->sol_accept_filter != NULL) { + if (so->sol_accept_filter->accf_destroy != NULL) + so->sol_accept_filter->accf_destroy(so); + if (so->sol_accept_filter_str != NULL) + free(so->sol_accept_filter_str, M_ACCF); + so->sol_accept_filter = NULL; + so->sol_accept_filter_arg = NULL; + so->sol_accept_filter_str = NULL; } - so->so_options &= ~SO_ACCEPTFILTER; SOCK_UNLOCK(so); return (0); } @@ -227,8 +224,7 @@ * Pre-allocate any memory we may need later to avoid blocking at * untimely moments. This does not optimize for invalid arguments. */ - afap = malloc(sizeof(*afap), M_TEMP, - M_WAITOK); + afap = malloc(sizeof(*afap), M_TEMP, M_WAITOK); error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap); afap->af_name[sizeof(afap->af_name)-1] = '\0'; afap->af_arg[sizeof(afap->af_arg)-1] = '\0'; @@ -241,19 +237,10 @@ free(afap, M_TEMP); return (ENOENT); } - /* - * Allocate the new accept filter instance storage. We may - * have to free it again later if we fail to attach it. If - * attached properly, 'newaf' is NULLed to avoid a free() - * while in use. - */ - newaf = malloc(sizeof(*newaf), M_ACCF, M_WAITOK | - M_ZERO); if (afp->accf_create != NULL && afap->af_name[0] != '\0') { size_t len = strlen(afap->af_name) + 1; - newaf->so_accept_filter_str = malloc(len, M_ACCF, - M_WAITOK); - strcpy(newaf->so_accept_filter_str, afap->af_name); + accept_filter_str = malloc(len, M_ACCF, M_WAITOK); + strcpy(accept_filter_str, afap->af_name); } /* @@ -261,8 +248,8 @@ * without first removing it. */ SOCK_LOCK(so); - if (((so->so_options & SO_ACCEPTCONN) == 0) || - (so->so_accf != NULL)) { + if ((so->so_options & SO_ACCEPTCONN) == 0 || + so->sol_accept_filter != NULL) { error = EINVAL; goto out; } @@ -273,25 +260,19 @@ * can't block. */ if (afp->accf_create != NULL) { - newaf->so_accept_filter_arg = - afp->accf_create(so, afap->af_arg); - if (newaf->so_accept_filter_arg == NULL) { + accept_filter_arg = afp->accf_create(so, afap->af_arg); + if (accept_filter_arg == NULL) { error = EINVAL; goto out; } } - newaf->so_accept_filter = afp; - so->so_accf = newaf; - so->so_options |= SO_ACCEPTFILTER; - newaf = NULL; + so->sol_accept_filter = afp; + so->sol_accept_filter_arg = accept_filter_arg; + so->sol_accept_filter_str = accept_filter_str; out: SOCK_UNLOCK(so); - if (newaf != NULL) { - if (newaf->so_accept_filter_str != NULL) - free(newaf->so_accept_filter_str, M_ACCF); - free(newaf, M_ACCF); - } - if (afap != NULL) - free(afap, M_TEMP); + if (accept_filter_str != NULL) + free(accept_filter_str, M_ACCF); + free(afap, M_TEMP); return (error); } Index: sys/kern/uipc_debug.c =================================================================== --- sys/kern/uipc_debug.c +++ sys/kern/uipc_debug.c @@ -448,8 +448,6 @@ db_printf(")\n"); db_print_indent(indent); - db_printf("so_qstate: 0x%x (", so->so_qstate); - db_print_soqstate(so->so_qstate); db_printf(") "); db_printf("so_pcb: %p ", so->so_pcb); db_printf("so_proto: %p\n", so->so_proto); @@ -458,24 +456,28 @@ db_print_protosw(so->so_proto, "so_proto", indent); db_print_indent(indent); - db_printf("so_head: %p ", so->so_head); - db_printf("so_incomp first: %p ", TAILQ_FIRST(&so->so_incomp)); - db_printf("so_comp first: %p\n", TAILQ_FIRST(&so->so_comp)); - - db_print_indent(indent); - /* so_list skipped */ - db_printf("so_qlen: %u ", so->so_qlen); - db_printf("so_incqlen: %u ", so->so_incqlen); - db_printf("so_qlimit: %u ", so->so_qlimit); - db_printf("so_timeo: %d ", so->so_timeo); - db_printf("so_error: %d\n", so->so_error); - - db_print_indent(indent); - db_printf("so_sigio: %p ", so->so_sigio); - db_printf("so_oobmark: %lu ", so->so_oobmark); - - db_print_sockbuf(&so->so_rcv, "so_rcv", indent); - db_print_sockbuf(&so->so_snd, "so_snd", indent); + if (so->so_options & SO_ACCEPTCONN) { + db_printf("sol_incomp first: %p ", + TAILQ_FIRST(&so->sol_incomp)); + db_printf("sol_comp first: %p\n", TAILQ_FIRST(&so->sol_comp)); + db_printf("sol_qlen: %d ", so->sol_qlen); + db_printf("sol_incqlen: %d ", so->sol_incqlen); + db_printf("sol_qlimit: %d ", so->sol_qlimit); + } else { + db_printf("so_qstate: 0x%x (", so->so_qstate); + db_print_soqstate(so->so_qstate); + db_printf("so_listen: %p ", so->so_listen); + /* so_list skipped */ + db_printf("so_timeo: %d ", so->so_timeo); + db_printf("so_error: %d\n", so->so_error); + + db_print_indent(indent); + db_printf("so_sigio: %p ", so->so_sigio); + db_printf("so_oobmark: %lu ", so->so_oobmark); + + db_print_sockbuf(&so->so_rcv, "so_rcv", indent); + db_print_sockbuf(&so->so_snd, "so_snd", indent); + } } DB_SHOW_COMMAND(socket, db_show_socket) Index: sys/kern/uipc_sockbuf.c =================================================================== --- sys/kern/uipc_sockbuf.c +++ sys/kern/uipc_sockbuf.c @@ -314,14 +314,14 @@ SOCKBUF_LOCK_ASSERT(sb); - selwakeuppri(&sb->sb_sel, PSOCK); - if (!SEL_WAITING(&sb->sb_sel)) + selwakeuppri(sb->sb_sel, PSOCK); + if (!SEL_WAITING(sb->sb_sel)) sb->sb_flags &= ~SB_SEL; if (sb->sb_flags & SB_WAIT) { sb->sb_flags &= ~SB_WAIT; wakeup(&sb->sb_acc); } - KNOTE_LOCKED(&sb->sb_sel.si_note, 0); + KNOTE_LOCKED(&sb->sb_sel->si_note, 0); if (sb->sb_upcall != NULL) { ret = sb->sb_upcall(so, sb->sb_upcallarg, M_NOWAIT); if (ret == SU_ISCONNECTED) { Index: sys/kern/uipc_socket.c =================================================================== --- sys/kern/uipc_socket.c +++ sys/kern/uipc_socket.c @@ -106,6 +106,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_compat.h" +#include "opt_sctp.h" #include #include @@ -154,13 +155,21 @@ static int soreceive_rcvoob(struct socket *so, struct uio *uio, int flags); +static void so_rdknl_lock(void *); +static void so_rdknl_unlock(void *); +static void so_rdknl_assert_locked(void *); +static void so_rdknl_assert_unlocked(void *); +static void so_wrknl_lock(void *); +static void so_wrknl_unlock(void *); +static void so_wrknl_assert_locked(void *); +static void so_wrknl_assert_unlocked(void *); static void filt_sordetach(struct knote *kn); static int filt_soread(struct knote *kn, long hint); static void filt_sowdetach(struct knote *kn); static int filt_sowrite(struct knote *kn, long hint); -static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); static int filt_soempty(struct knote *kn, long hint); +static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); fo_kqfilter_t soo_kqfilter; static struct filterops soread_filtops = { @@ -393,8 +402,16 @@ return (NULL); } + /* + * The socket locking protocol allows to lock 2 sockets at a time, + * however, the first one must be a listening socket. WITNESS lacks + * a feature to change class of an existing lock, so we use DUPOK. + */ + mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK); SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd"); SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv"); + so->so_rcv.sb_sel = &so->so_rdsel; + so->so_snd.sb_sel = &so->so_wrsel; sx_init(&so->so_snd.sb_sx, "so_snd_sx"); sx_init(&so->so_rcv.sb_sx, "so_rcv_sx"); TAILQ_INIT(&so->so_snd.sb_aiojobq); @@ -450,9 +467,6 @@ if (so->so_snd.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); - /* remove accept filter if one is present. */ - if (so->so_accf != NULL) - do_setopt_accept_filter(so, NULL); #ifdef MAC mac_socket_destroy(so); #endif @@ -460,10 +474,16 @@ crfree(so->so_cred); khelp_destroy_osd(&so->osd); - sx_destroy(&so->so_snd.sb_sx); - sx_destroy(&so->so_rcv.sb_sx); - SOCKBUF_LOCK_DESTROY(&so->so_snd); - SOCKBUF_LOCK_DESTROY(&so->so_rcv); + if (SOLISTENING(so)) { + if (so->sol_accept_filter != NULL) + accept_filt_setopt(so, NULL); + } else { + sx_destroy(&so->so_snd.sb_sx); + sx_destroy(&so->so_rcv.sb_sx); + SOCKBUF_LOCK_DESTROY(&so->so_snd); + SOCKBUF_LOCK_DESTROY(&so->so_rcv); + } + mtx_destroy(&so->so_lock); uma_zfree(socket_zone, so); } @@ -506,8 +526,6 @@ if (so == NULL) return (ENOBUFS); - TAILQ_INIT(&so->so_incomp); - TAILQ_INIT(&so->so_comp); so->so_type = type; so->so_cred = crhold(cred); if ((prp->pr_domain->dom_family == PF_INET) || @@ -520,9 +538,10 @@ #ifdef MAC mac_socket_create(cred, so); #endif - knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv)); - knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd)); - so->so_count = 1; + knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, + so_rdknl_assert_locked, so_rdknl_assert_unlocked); + knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, + so_wrknl_assert_locked, so_wrknl_assert_unlocked); /* * Auto-sizing of socket buffers is managed by the protocols and * the appropriate flags must be set in the pru_attach function. @@ -531,12 +550,10 @@ error = (*prp->pr_usrreqs->pru_attach)(so, proto, td); CURVNET_RESTORE(); if (error) { - KASSERT(so->so_count == 1, ("socreate: so_count %d", - so->so_count)); - so->so_count = 0; sodealloc(so); return (error); } + soref(so); *aso = so; return (0); } @@ -564,11 +581,11 @@ static int overcount; struct socket *so; - int over; + u_int over; - ACCEPT_LOCK(); - over = (head->so_qlen > 3 * head->so_qlimit / 2); - ACCEPT_UNLOCK(); + SOLISTEN_LOCK(head); + over = (head->sol_qlen > 3 * head->sol_qlimit / 2); + SOLISTEN_UNLOCK(head); #ifdef REGRESSION if (regression_sonewconn_earlytest && over) { #else @@ -580,15 +597,15 @@ log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: " "%i already in queue awaiting acceptance " "(%d occurrences)\n", - __func__, head->so_pcb, head->so_qlen, overcount); + __func__, head->so_pcb, head->sol_qlen, overcount); overcount = 0; } return (NULL); } - VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", - __func__, __LINE__, head)); + VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL", + __func__, head)); so = soalloc(head->so_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " @@ -596,9 +613,9 @@ __func__, head->so_pcb); return (NULL); } - if ((head->so_options & SO_ACCEPTFILTER) != 0) + if (head->sol_accept_filter != NULL) connstatus = 0; - so->so_head = head; + so->so_listen = head; so->so_type = head->so_type; so->so_options = head->so_options &~ SO_ACCEPTCONN; so->so_linger = head->so_linger; @@ -609,10 +626,12 @@ #ifdef MAC mac_socket_newconn(head, so); #endif - knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv)); - knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd)); + knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, + so_rdknl_assert_locked, so_rdknl_assert_unlocked); + knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, + so_wrknl_assert_locked, so_wrknl_assert_unlocked); VNET_SO_ASSERT(head); - if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { + if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", __func__, head->so_pcb); @@ -624,32 +643,19 @@ __func__, head->so_pcb); return (NULL); } - so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; - so->so_snd.sb_lowat = head->so_snd.sb_lowat; - so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; - so->so_snd.sb_timeo = head->so_snd.sb_timeo; - so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; - so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; + so->so_rcv.sb_lowat = head->sol_sbrcv_lowat; + so->so_snd.sb_lowat = head->sol_sbsnd_lowat; + so->so_rcv.sb_timeo = head->sol_sbrcv_timeo; + so->so_snd.sb_timeo = head->sol_sbsnd_timeo; + so->so_rcv.sb_flags |= head->sol_sbrcv_flags & SB_AUTOSIZE; + so->so_snd.sb_flags |= head->sol_sbsnd_flags & SB_AUTOSIZE; so->so_state |= connstatus; - ACCEPT_LOCK(); - /* - * The accept socket may be tearing down but we just - * won a race on the ACCEPT_LOCK. - * However, if sctp_peeloff() is called on a 1-to-many - * style socket, the SO_ACCEPTCONN doesn't need to be set. - */ - if (!(head->so_options & SO_ACCEPTCONN) && - ((head->so_proto->pr_protocol != IPPROTO_SCTP) || - (head->so_type != SOCK_SEQPACKET))) { - SOCK_LOCK(so); - so->so_head = NULL; - sofree(so); /* NB: returns ACCEPT_UNLOCK'ed. */ - return (NULL); - } + + SOLISTEN_LOCK(head); if (connstatus) { - TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); - so->so_qstate |= SQ_COMP; - head->so_qlen++; + TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); + so->so_qstate = SQ_COMP; + head->sol_qlen++; } else { /* * Keep removing sockets from the head until there's room for @@ -658,28 +664,92 @@ * threads and soabort() requires dropping locks, we must * loop waiting for the condition to be true. */ - while (head->so_incqlen > head->so_qlimit) { + while (head->sol_incqlen > head->sol_qlimit) { struct socket *sp; - sp = TAILQ_FIRST(&head->so_incomp); - TAILQ_REMOVE(&head->so_incomp, sp, so_list); - head->so_incqlen--; - sp->so_qstate &= ~SQ_INCOMP; - sp->so_head = NULL; - ACCEPT_UNLOCK(); + + sp = TAILQ_FIRST(&head->sol_incomp); + TAILQ_REMOVE(&head->sol_incomp, sp, so_list); + head->sol_incqlen--; + SOCK_LOCK(sp); + sp->so_qstate = SQ_NONE; + sp->so_listen = NULL; + SOCK_UNLOCK(sp); + sorele(head); /* does SOLISTEN_UNLOCK, head stays */ soabort(sp); - ACCEPT_LOCK(); + SOLISTEN_LOCK(head); } - TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); - so->so_qstate |= SQ_INCOMP; - head->so_incqlen++; + TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list); + so->so_qstate = SQ_INCOMP; + head->sol_incqlen++; } - ACCEPT_UNLOCK(); + soref(head); /* A socket on (in)complete queue refs head. */ + SOLISTEN_UNLOCK(head); if (connstatus) { - sorwakeup(head); - wakeup_one(&head->so_timeo); + selwakeuppri(&head->so_rdsel, PSOCK); + KNOTE_LOCKED(&head->so_rdsel.si_note, 0); + wakeup_one(&head->sol_comp); + } + return (so); +} + +#ifdef SCTP +/* + * Socket part of sctp_peeloff(). Detach a new socket from an + * association. The new socket is returned with a reference. + */ +struct socket * +sopeeloff(struct socket *head) +{ + struct socket *so; + + VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", + __func__, __LINE__, head)); + so = soalloc(head->so_vnet); + if (so == NULL) { + log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " + "limit reached or out of memory\n", + __func__, head->so_pcb); + return (NULL); + } + so->so_type = head->so_type; + so->so_options = head->so_options; + so->so_linger = head->so_linger; + so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED; + so->so_fibnum = head->so_fibnum; + so->so_proto = head->so_proto; + so->so_cred = crhold(head->so_cred); +#ifdef MAC + mac_socket_newconn(head, so); +#endif + knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock, + so_rdknl_assert_locked, so_rdknl_assert_unlocked); + knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock, + so_wrknl_assert_locked, so_wrknl_assert_unlocked); + VNET_SO_ASSERT(head); + if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { + sodealloc(so); + log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", + __func__, head->so_pcb); + return (NULL); + } + if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { + sodealloc(so); + log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", + __func__, head->so_pcb); + return (NULL); } + so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; + so->so_snd.sb_lowat = head->so_snd.sb_lowat; + so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; + so->so_snd.sb_timeo = head->so_snd.sb_timeo; + so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; + so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; + + soref(so); + return (so); } +#endif /* SCTP */ int sobind(struct socket *so, struct sockaddr *nam, struct thread *td) @@ -741,13 +811,63 @@ void solisten_proto(struct socket *so, int backlog) { + int sbrcv_lowat, sbsnd_lowat; + u_int sbrcv_hiwat, sbsnd_hiwat; + short sbrcv_flags, sbsnd_flags; + sbintime_t sbrcv_timeo, sbsnd_timeo; SOCK_LOCK_ASSERT(so); + if (SOLISTENING(so)) + goto listening; + + /* + * Change this socket to listening state. + */ + sbrcv_lowat = so->so_rcv.sb_lowat; + sbsnd_lowat = so->so_snd.sb_lowat; + sbrcv_hiwat = so->so_rcv.sb_hiwat; + sbsnd_hiwat = so->so_snd.sb_hiwat; + sbrcv_flags = so->so_rcv.sb_flags; + sbsnd_flags = so->so_snd.sb_flags; + sbrcv_timeo = so->so_rcv.sb_timeo; + sbsnd_timeo = so->so_snd.sb_timeo; + + sbdestroy(&so->so_snd, so); + sbdestroy(&so->so_rcv, so); + sx_destroy(&so->so_snd.sb_sx); + sx_destroy(&so->so_rcv.sb_sx); + SOCKBUF_LOCK_DESTROY(&so->so_snd); + SOCKBUF_LOCK_DESTROY(&so->so_rcv); + +#ifdef INVARIANTS + bzero(&so->so_rcv, + sizeof(struct socket) - offsetof(struct socket, so_rcv)); +#endif + + so->sol_sbrcv_lowat = sbrcv_lowat; + so->sol_sbsnd_lowat = sbsnd_lowat; + so->sol_sbrcv_hiwat = sbrcv_hiwat; + so->sol_sbsnd_hiwat = sbsnd_hiwat; + so->sol_sbrcv_flags = sbrcv_flags; + so->sol_sbsnd_flags = sbsnd_flags; + so->sol_sbrcv_timeo = sbrcv_timeo; + so->sol_sbsnd_timeo = sbsnd_timeo; + + so->sol_qlen = so->sol_incqlen = 0; + TAILQ_INIT(&so->sol_incomp); + TAILQ_INIT(&so->sol_comp); + + so->sol_accept_filter = NULL; + so->sol_accept_filter_arg = NULL; + so->sol_accept_filter_str = NULL; + + so->so_options |= SO_ACCEPTCONN; + +listening: if (backlog < 0 || backlog > somaxconn) backlog = somaxconn; - so->so_qlimit = backlog; - so->so_options |= SO_ACCEPTCONN; + so->sol_qlimit = backlog; } /* @@ -774,44 +894,62 @@ sofree(struct socket *so) { struct protosw *pr = so->so_proto; - struct socket *head; - ACCEPT_LOCK_ASSERT(); SOCK_LOCK_ASSERT(so); if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 || - (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) { + (so->so_state & SS_PROTOREF) || (so->so_qstate == SQ_COMP)) { SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); return; } - head = so->so_head; - if (head != NULL) { - KASSERT((so->so_qstate & SQ_COMP) != 0 || - (so->so_qstate & SQ_INCOMP) != 0, - ("sofree: so_head != NULL, but neither SQ_COMP nor " - "SQ_INCOMP")); - KASSERT((so->so_qstate & SQ_COMP) == 0 || - (so->so_qstate & SQ_INCOMP) == 0, - ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP")); - TAILQ_REMOVE(&head->so_incomp, so, so_list); - head->so_incqlen--; - so->so_qstate &= ~SQ_INCOMP; - so->so_head = NULL; - } - KASSERT((so->so_qstate & SQ_COMP) == 0 && - (so->so_qstate & SQ_INCOMP) == 0, - ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)", - so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP)); - if (so->so_options & SO_ACCEPTCONN) { - KASSERT((TAILQ_EMPTY(&so->so_comp)), - ("sofree: so_comp populated")); - KASSERT((TAILQ_EMPTY(&so->so_incomp)), - ("sofree: so_incomp populated")); + if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) { + struct socket *sol; + + sol = so->so_listen; + KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so)); + + /* + * To solve race between close of a listening socket and + * a socket on its incomplete queue, we need to lock both. + * The order is first listening socket, then regular. + * Since we don't have SS_NOFDREF neither SS_PROTOREF, this + * function and the listening socket are the only pointers + * to so. To preserve so and sol, we reference both and then + * relock. + * After relock the socket may not move to so_comp since it + * doesn't have PCB already, but it may be removed from + * so_incomp. If that happens, we share responsiblity on + * freeing the socket, but soclose() has already removed + * it from queue. + */ + soref(sol); + soref(so); + SOCK_UNLOCK(so); + SOLISTEN_LOCK(sol); + SOCK_LOCK(so); + if (so->so_qstate == SQ_INCOMP) { + KASSERT(so->so_listen == sol, + ("%s: so %p migrated out of sol %p", + __func__, so, sol)); + TAILQ_REMOVE(&sol->sol_incomp, so, so_list); + sol->sol_incqlen--; + /* This is guarenteed not to be the last. */ + refcount_release(&sol->so_count); + so->so_qstate = SQ_NONE; + so->so_listen = NULL; + } else + KASSERT(so->so_listen == NULL, + ("%s: so %p not on (in)comp with so_listen", + __func__, so)); + sorele(sol); + KASSERT(so->so_count == 1, + ("%s: so %p count %u", __func__, so, so->so_count)); + so->so_count = 0; } + if (SOLISTENING(so)) + so->so_error = ECONNABORTED; SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); VNET_SO_ASSERT(so); if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) @@ -833,12 +971,14 @@ * before calling pru_detach. This means that protocols shold not * assume they can perform socket wakeups, etc, in their detach code. */ - sbdestroy(&so->so_snd, so); - sbdestroy(&so->so_rcv, so); - seldrain(&so->so_snd.sb_sel); - seldrain(&so->so_rcv.sb_sel); - knlist_destroy(&so->so_rcv.sb_sel.si_note); - knlist_destroy(&so->so_snd.sb_sel.si_note); + if (!SOLISTENING(so)) { + sbdestroy(&so->so_snd, so); + sbdestroy(&so->so_rcv, so); + } + seldrain(&so->so_rdsel); + seldrain(&so->so_wrsel); + knlist_destroy(&so->so_rdsel.si_note); + knlist_destroy(&so->so_wrsel.si_note); sodealloc(so); } @@ -853,6 +993,8 @@ int soclose(struct socket *so) { + struct accept_queue lqueue; + bool listening; int error = 0; KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter")); @@ -885,41 +1027,42 @@ drop: if (so->so_proto->pr_usrreqs->pru_close != NULL) (*so->so_proto->pr_usrreqs->pru_close)(so); - ACCEPT_LOCK(); - if (so->so_options & SO_ACCEPTCONN) { + + SOCK_LOCK(so); + if ((listening = (so->so_options & SO_ACCEPTCONN))) { struct socket *sp; - /* - * Prevent new additions to the accept queues due - * to ACCEPT_LOCK races while we are draining them. - */ - so->so_options &= ~SO_ACCEPTCONN; - while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) { - TAILQ_REMOVE(&so->so_incomp, sp, so_list); - so->so_incqlen--; - sp->so_qstate &= ~SQ_INCOMP; - sp->so_head = NULL; - ACCEPT_UNLOCK(); - soabort(sp); - ACCEPT_LOCK(); - } - while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) { - TAILQ_REMOVE(&so->so_comp, sp, so_list); - so->so_qlen--; - sp->so_qstate &= ~SQ_COMP; - sp->so_head = NULL; - ACCEPT_UNLOCK(); - soabort(sp); - ACCEPT_LOCK(); + + TAILQ_INIT(&lqueue); + TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list); + TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list); + + so->sol_qlen = so->sol_incqlen = 0; + + TAILQ_FOREACH(sp, &lqueue, so_list) { + SOCK_LOCK(sp); + sp->so_qstate = SQ_NONE; + sp->so_listen = NULL; + SOCK_UNLOCK(sp); + /* Guaranteed not to be the last. */ + refcount_release(&so->so_count); } - KASSERT((TAILQ_EMPTY(&so->so_comp)), - ("%s: so_comp populated", __func__)); - KASSERT((TAILQ_EMPTY(&so->so_incomp)), - ("%s: so_incomp populated", __func__)); } - SOCK_LOCK(so); KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); so->so_state |= SS_NOFDREF; - sorele(so); /* NB: Returns with ACCEPT_UNLOCK(). */ + sorele(so); + if (listening) { + struct socket *sp; + + TAILQ_FOREACH(sp, &lqueue, so_list) { + SOCK_LOCK(sp); + if (sp->so_count == 0) { + SOCK_UNLOCK(sp); + soabort(sp); + } else + /* sp is now in sofree() */ + SOCK_UNLOCK(sp); + } + } CURVNET_RESTORE(); return (error); } @@ -951,13 +1094,11 @@ KASSERT(so->so_count == 0, ("soabort: so_count")); KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF")); KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF")); - KASSERT((so->so_state & SQ_COMP) == 0, ("soabort: SQ_COMP")); - KASSERT((so->so_state & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP")); + KASSERT(so->so_qstate == SQ_NONE, ("soabort: !SQ_NONE")); VNET_SO_ASSERT(so); if (so->so_proto->pr_usrreqs->pru_abort != NULL) (*so->so_proto->pr_usrreqs->pru_abort)(so); - ACCEPT_LOCK(); SOCK_LOCK(so); sofree(so); } @@ -2511,7 +2652,7 @@ } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: - error = do_setopt_accept_filter(so, sopt); + error = accept_filt_setopt(so, sopt); if (error) goto bad; break; @@ -2769,7 +2910,7 @@ } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: - error = do_getopt_accept_filter(so, sopt); + error = accept_filt_getopt(so, sopt); break; case SO_LINGER: @@ -2877,15 +3018,15 @@ break; case SO_LISTENQLIMIT: - optval = so->so_qlimit; + optval = SOLISTENING(so) ? so->sol_qlimit : 0; goto integer; case SO_LISTENQLEN: - optval = so->so_qlen; + optval = SOLISTENING(so) ? so->sol_qlen : 0; goto integer; case SO_LISTENINCQLEN: - optval = so->so_incqlen; + optval = SOLISTENING(so) ? so->sol_incqlen : 0; goto integer; case SO_TS_CLOCK: @@ -3032,7 +3173,7 @@ if (so->so_sigio != NULL) pgsigio(&so->so_sigio, SIGURG, 0); - selwakeuppri(&so->so_rcv.sb_sel, PSOCK); + selwakeuppri(&so->so_rdsel, PSOCK); } int @@ -3052,44 +3193,54 @@ sopoll_generic(struct socket *so, int events, struct ucred *active_cred, struct thread *td) { - int revents = 0; - - SOCKBUF_LOCK(&so->so_snd); - SOCKBUF_LOCK(&so->so_rcv); - if (events & (POLLIN | POLLRDNORM)) - if (soreadabledata(so)) - revents |= events & (POLLIN | POLLRDNORM); + int revents; - if (events & (POLLOUT | POLLWRNORM)) - if (sowriteable(so)) - revents |= events & (POLLOUT | POLLWRNORM); - - if (events & (POLLPRI | POLLRDBAND)) - if (so->so_oobmark || (so->so_rcv.sb_state & SBS_RCVATMARK)) - revents |= events & (POLLPRI | POLLRDBAND); - - if ((events & POLLINIGNEOF) == 0) { - if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { - revents |= events & (POLLIN | POLLRDNORM); - if (so->so_snd.sb_state & SBS_CANTSENDMORE) - revents |= POLLHUP; + SOCK_LOCK(so); + if (SOLISTENING(so)) { + if (!(events & (POLLIN | POLLRDNORM))) + revents = 0; + else if (!TAILQ_EMPTY(&so->sol_comp)) + revents = events & (POLLIN | POLLRDNORM); + else { + selrecord(td, &so->so_rdsel); + revents = 0; } - } - - if (revents == 0) { - if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { - selrecord(td, &so->so_rcv.sb_sel); - so->so_rcv.sb_flags |= SB_SEL; + } else { + revents = 0; + SOCKBUF_LOCK(&so->so_snd); + SOCKBUF_LOCK(&so->so_rcv); + if (events & (POLLIN | POLLRDNORM)) + if (soreadabledata(so)) + revents |= events & (POLLIN | POLLRDNORM); + if (events & (POLLOUT | POLLWRNORM)) + if (sowriteable(so)) + revents |= events & (POLLOUT | POLLWRNORM); + if (events & (POLLPRI | POLLRDBAND)) + if (so->so_oobmark || + (so->so_rcv.sb_state & SBS_RCVATMARK)) + revents |= events & (POLLPRI | POLLRDBAND); + if ((events & POLLINIGNEOF) == 0) { + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { + revents |= events & (POLLIN | POLLRDNORM); + if (so->so_snd.sb_state & SBS_CANTSENDMORE) + revents |= POLLHUP; + } } - - if (events & (POLLOUT | POLLWRNORM)) { - selrecord(td, &so->so_snd.sb_sel); - so->so_snd.sb_flags |= SB_SEL; + if (revents == 0) { + if (events & + (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { + selrecord(td, &so->so_rdsel); + so->so_rcv.sb_flags |= SB_SEL; + } + if (events & (POLLOUT | POLLWRNORM)) { + selrecord(td, &so->so_wrsel); + so->so_snd.sb_flags |= SB_SEL; + } } + SOCKBUF_UNLOCK(&so->so_rcv); + SOCKBUF_UNLOCK(&so->so_snd); } - - SOCKBUF_UNLOCK(&so->so_rcv); - SOCKBUF_UNLOCK(&so->so_snd); + SOCK_UNLOCK(so); return (revents); } @@ -3098,28 +3249,38 @@ { struct socket *so = kn->kn_fp->f_data; struct sockbuf *sb; + struct knlist *knl; switch (kn->kn_filter) { case EVFILT_READ: kn->kn_fop = &soread_filtops; + knl = &so->so_rdsel.si_note; sb = &so->so_rcv; break; case EVFILT_WRITE: kn->kn_fop = &sowrite_filtops; + knl = &so->so_wrsel.si_note; sb = &so->so_snd; break; case EVFILT_EMPTY: kn->kn_fop = &soempty_filtops; + knl = &so->so_wrsel.si_note; sb = &so->so_snd; break; default: return (EINVAL); } - SOCKBUF_LOCK(sb); - knlist_add(&sb->sb_sel.si_note, kn, 1); - sb->sb_flags |= SB_KNOTE; - SOCKBUF_UNLOCK(sb); + SOCK_LOCK(so); + if (SOLISTENING(so)) { + knlist_add(knl, kn, 1); + } else { + SOCKBUF_LOCK(sb); + knlist_add(knl, kn, 1); + sb->sb_flags |= SB_KNOTE; + SOCKBUF_UNLOCK(sb); + } + SOCK_UNLOCK(so); return (0); } @@ -3298,11 +3459,11 @@ { struct socket *so = kn->kn_fp->f_data; - SOCKBUF_LOCK(&so->so_rcv); - knlist_remove(&so->so_rcv.sb_sel.si_note, kn, 1); - if (knlist_empty(&so->so_rcv.sb_sel.si_note)) + so_rdknl_lock(so); + knlist_remove(&so->so_rdsel.si_note, kn, 1); + if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note)) so->so_rcv.sb_flags &= ~SB_KNOTE; - SOCKBUF_UNLOCK(&so->so_rcv); + so_rdknl_unlock(so); } /*ARGSUSED*/ @@ -3312,11 +3473,13 @@ struct socket *so; so = kn->kn_fp->f_data; - if (so->so_options & SO_ACCEPTCONN) { - kn->kn_data = so->so_qlen; - return (!TAILQ_EMPTY(&so->so_comp)); + if (SOLISTENING(so)) { + SOCK_LOCK_ASSERT(so); + kn->kn_data = so->sol_qlen; + return (!TAILQ_EMPTY(&so->sol_comp)); } + SOCKBUF_LOCK_ASSERT(&so->so_rcv); kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl; @@ -3342,11 +3505,11 @@ { struct socket *so = kn->kn_fp->f_data; - SOCKBUF_LOCK(&so->so_snd); - knlist_remove(&so->so_snd.sb_sel.si_note, kn, 1); - if (knlist_empty(&so->so_snd.sb_sel.si_note)) + so_wrknl_lock(so); + knlist_remove(&so->so_wrsel.si_note, kn, 1); + if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note)) so->so_snd.sb_flags &= ~SB_KNOTE; - SOCKBUF_UNLOCK(&so->so_snd); + so_wrknl_unlock(so); } /*ARGSUSED*/ @@ -3356,6 +3519,10 @@ struct socket *so; so = kn->kn_fp->f_data; + + if (SOLISTENING(so)) + return (0); + SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbspace(&so->so_snd); @@ -3382,6 +3549,10 @@ struct socket *so; so = kn->kn_fp->f_data; + + if (SOLISTENING(so)) + return (1); + SOCKBUF_LOCK_ASSERT(&so->so_snd); kn->kn_data = sbused(&so->so_snd); @@ -3451,31 +3622,34 @@ int ret; restart: - ACCEPT_LOCK(); + if ((head = so->so_listen) != NULL) + SOLISTEN_LOCK(head); SOCK_LOCK(so); + /* + * XXXGL: should we re-check so->so_listen? + */ so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); so->so_state |= SS_ISCONNECTED; - head = so->so_head; - if (head != NULL && (so->so_qstate & SQ_INCOMP)) { + if (head != NULL && (so->so_qstate == SQ_INCOMP)) { if ((so->so_options & SO_ACCEPTFILTER) == 0) { + TAILQ_REMOVE(&head->sol_incomp, so, so_list); + head->sol_incqlen--; + TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list); + head->sol_qlen++; + so->so_qstate = SQ_COMP; SOCK_UNLOCK(so); - TAILQ_REMOVE(&head->so_incomp, so, so_list); - head->so_incqlen--; - so->so_qstate &= ~SQ_INCOMP; - TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); - head->so_qlen++; - so->so_qstate |= SQ_COMP; - ACCEPT_UNLOCK(); - sorwakeup(head); - wakeup_one(&head->so_timeo); + selwakeuppri(&head->so_rdsel, PSOCK); + KNOTE_LOCKED(&head->so_rdsel.si_note, 0); + SOLISTEN_UNLOCK(head); + wakeup_one(&head->sol_comp); } else { - ACCEPT_UNLOCK(); + SOLISTEN_UNLOCK(head); soupcall_set(so, SO_RCV, - head->so_accf->so_accept_filter->accf_callback, - head->so_accf->so_accept_filter_arg); + head->sol_accept_filter->accf_callback, + head->sol_accept_filter_arg); so->so_options &= ~SO_ACCEPTFILTER; - ret = head->so_accf->so_accept_filter->accf_callback(so, - head->so_accf->so_accept_filter_arg, M_NOWAIT); + ret = head->sol_accept_filter->accf_callback(so, + head->sol_accept_filter_arg, M_NOWAIT); if (ret == SU_ISCONNECTED) soupcall_clear(so, SO_RCV); SOCK_UNLOCK(so); @@ -3484,8 +3658,9 @@ } return; } + if (head != NULL) + SOLISTEN_UNLOCK(head); SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); wakeup(&so->so_timeo); sorwakeup(so); sowwakeup(so); @@ -3495,16 +3670,17 @@ soisdisconnecting(struct socket *so) { - /* - * Note: This code assumes that SOCK_LOCK(so) and - * SOCKBUF_LOCK(&so->so_rcv) are the same. - */ - SOCKBUF_LOCK(&so->so_rcv); + SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTING; so->so_state |= SS_ISDISCONNECTING; - socantrcvmore_locked(so); - SOCKBUF_LOCK(&so->so_snd); - socantsendmore_locked(so); + + if (!SOLISTENING(so)) { + SOCKBUF_LOCK(&so->so_rcv); + socantrcvmore_locked(so); + SOCKBUF_LOCK(&so->so_snd); + socantsendmore_locked(so); + } + SOCK_UNLOCK(so); wakeup(&so->so_timeo); } @@ -3512,17 +3688,18 @@ soisdisconnected(struct socket *so) { - /* - * Note: This code assumes that SOCK_LOCK(so) and - * SOCKBUF_LOCK(&so->so_rcv) are the same. - */ - SOCKBUF_LOCK(&so->so_rcv); + SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); so->so_state |= SS_ISDISCONNECTED; - socantrcvmore_locked(so); - SOCKBUF_LOCK(&so->so_snd); - sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); - socantsendmore_locked(so); + + if (!SOLISTENING(so)) { + SOCKBUF_LOCK(&so->so_rcv); + socantrcvmore_locked(so); + SOCKBUF_LOCK(&so->so_snd); + sbdrop_locked(&so->so_snd, sbused(&so->so_snd)); + socantsendmore_locked(so); + } + SOCK_UNLOCK(so); wakeup(&so->so_timeo); } @@ -3591,6 +3768,94 @@ sb->sb_flags &= ~SB_UPCALL; } +static void +so_rdknl_lock(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_LOCK(so); + else + SOCKBUF_LOCK(&so->so_rcv); +} + +static void +so_rdknl_unlock(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_UNLOCK(so); + else + SOCKBUF_UNLOCK(&so->so_rcv); +} + +static void +so_rdknl_assert_locked(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_LOCK_ASSERT(so); + else + SOCKBUF_LOCK_ASSERT(&so->so_rcv); +} + +static void +so_rdknl_assert_unlocked(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_UNLOCK_ASSERT(so); + else + SOCKBUF_UNLOCK_ASSERT(&so->so_rcv); +} + +static void +so_wrknl_lock(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_LOCK(so); + else + SOCKBUF_LOCK(&so->so_snd); +} + +static void +so_wrknl_unlock(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_UNLOCK(so); + else + SOCKBUF_UNLOCK(&so->so_snd); +} + +static void +so_wrknl_assert_locked(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_LOCK_ASSERT(so); + else + SOCKBUF_LOCK_ASSERT(&so->so_snd); +} + +static void +so_wrknl_assert_unlocked(void *arg) +{ + struct socket *so = arg; + + if (SOLISTENING(so)) + SOCK_UNLOCK_ASSERT(so); + else + SOCKBUF_UNLOCK_ASSERT(&so->so_snd); +} + /* * Create an external-format (``xsocket'') structure using the information in * the kernel-format socket structure pointed to by so. This is done to @@ -3612,32 +3877,24 @@ xso->so_pcb = so->so_pcb; xso->xso_protocol = so->so_proto->pr_protocol; xso->xso_family = so->so_proto->pr_domain->dom_family; - xso->so_qlen = so->so_qlen; - xso->so_incqlen = so->so_incqlen; - xso->so_qlimit = so->so_qlimit; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; - xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; - xso->so_oobmark = so->so_oobmark; - sbtoxsockbuf(&so->so_snd, &xso->so_snd); - sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); xso->so_uid = so->so_cred->cr_uid; -} - - -/* - * Socket accessor functions to provide external consumers with - * a safe interface to socket state - * - */ - -void -so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), - void *arg) -{ - - TAILQ_FOREACH(so, &so->so_comp, so_list) - func(so, arg); + xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; + if (SOLISTENING(so)) { + xso->so_qlen = so->sol_qlen; + xso->so_incqlen = so->sol_incqlen; + xso->so_qlimit = so->sol_qlimit; + xso->so_oobmark = 0; + bzero(&xso->so_snd, sizeof(xso->so_snd)); + bzero(&xso->so_rcv, sizeof(xso->so_rcv)); + } else { + xso->so_state |= so->so_qstate; + xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0; + xso->so_oobmark = so->so_oobmark; + sbtoxsockbuf(&so->so_snd, &xso->so_snd); + sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); + } } struct sockbuf * Index: sys/kern/uipc_syscalls.c =================================================================== --- sys/kern/uipc_syscalls.c +++ sys/kern/uipc_syscalls.c @@ -350,59 +350,57 @@ (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0, &fcaps); if (error != 0) goto done; - ACCEPT_LOCK(); - if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { - ACCEPT_UNLOCK(); + SOCK_LOCK(head); + if (!SOLISTENING(head)) { + SOCK_UNLOCK(head); + error = EINVAL; + goto noconnection; + } + if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) { + SOLISTEN_UNLOCK(head); error = EWOULDBLOCK; goto noconnection; } - while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { - if (head->so_rcv.sb_state & SBS_CANTRCVMORE) { - head->so_error = ECONNABORTED; - break; - } - error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH, + while (TAILQ_EMPTY(&head->sol_comp) && head->so_error == 0) { + error = msleep(&head->sol_comp, &head->so_lock, PSOCK | PCATCH, "accept", 0); if (error != 0) { - ACCEPT_UNLOCK(); + SOLISTEN_UNLOCK(head); goto noconnection; } } if (head->so_error) { error = head->so_error; head->so_error = 0; - ACCEPT_UNLOCK(); + SOLISTEN_UNLOCK(head); goto noconnection; } - so = TAILQ_FIRST(&head->so_comp); - KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP")); - KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP")); - + so = TAILQ_FIRST(&head->sol_comp); /* * Before changing the flags on the socket, we have to bump the * reference count. Otherwise, if the protocol calls sofree(), * the socket will be released due to a zero refcount. */ - SOCK_LOCK(so); /* soref() and so_state update */ + SOCK_LOCK(so); + KASSERT(so->so_qstate == SQ_COMP, + ("%s: so %p not SQ_COMP", __func__, so)); soref(so); /* file descriptor reference */ - - TAILQ_REMOVE(&head->so_comp, so, so_list); - head->so_qlen--; + TAILQ_REMOVE(&head->sol_comp, so, so_list); + head->sol_qlen--; if (flags & ACCEPT4_INHERIT) so->so_state |= (head->so_state & SS_NBIO); else so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; - + so->so_qstate = SQ_NONE; + so->so_listen = NULL; SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); + sorele(head); /* An extra reference on `nfp' has been held for us by falloc(). */ td->td_retval[0] = fd; - /* connection has been removed from the listen queue */ - KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0); + /* Connection has been removed from the listen queue. */ + KNOTE_UNLOCKED(&head->so_rdsel.si_note, 0); if (flags & ACCEPT4_INHERIT) { pgid = fgetown(&head->so_sigio); @@ -420,7 +418,6 @@ (void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td); tmp = fflag & FASYNC; (void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td); - sa = NULL; error = soaccept(so, &sa); if (error != 0) goto noconnection; @@ -558,7 +555,7 @@ } SOCK_LOCK(so); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { - error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH, + error = msleep(&so->so_timeo, &so->so_lock, PSOCK | PCATCH, "connec", 0); if (error != 0) { if (error == EINTR || error == ERESTART) Index: sys/kern/uipc_usrreq.c =================================================================== --- sys/kern/uipc_usrreq.c +++ sys/kern/uipc_usrreq.c @@ -430,7 +430,7 @@ unp->unp_socket = so; so->so_pcb = unp; unp->unp_refcount = 1; - if (so->so_head != NULL) + if (so->so_listen != NULL) unp->unp_flags |= UNP_NASCENT; UNP_LIST_LOCK(); @@ -552,7 +552,7 @@ UNP_LINK_WLOCK(); UNP_PCB_LOCK(unp); - VOP_UNP_BIND(vp, unp->unp_socket); + VOP_UNP_BIND(vp, unp); unp->unp_vnode = vp; unp->unp_addr = soun; unp->unp_flags &= ~UNP_BINDING; @@ -607,6 +607,7 @@ uipc_close(struct socket *so) { struct unpcb *unp, *unp2; + struct vnode *vp = NULL; unp = sotounpcb(so); KASSERT(unp != NULL, ("uipc_close: unp == NULL")); @@ -619,8 +620,14 @@ unp_disconnect(unp, unp2); UNP_PCB_UNLOCK(unp2); } + if (SOLISTENING(so) && ((vp = unp->unp_vnode) != NULL)) { + VOP_UNP_DETACH(vp); + unp->unp_vnode = NULL; + } UNP_PCB_UNLOCK(unp); UNP_LINK_WUNLOCK(); + if (vp) + vrele(vp); } static int @@ -663,16 +670,11 @@ --unp_count; UNP_LIST_UNLOCK(); - if ((unp->unp_flags & UNP_NASCENT) != 0) { - UNP_PCB_LOCK(unp); - goto teardown; - } UNP_LINK_WLOCK(); UNP_PCB_LOCK(unp); + if ((unp->unp_flags & UNP_NASCENT) != 0) + goto teardown; - /* - * XXXRW: Should assert vp->v_socket == so. - */ if ((vp = unp->unp_vnode) != NULL) { VOP_UNP_DETACH(vp); unp->unp_vnode = NULL; @@ -696,8 +698,8 @@ UNP_PCB_UNLOCK(ref); } local_unp_rights = unp_rights; - UNP_LINK_WUNLOCK(); teardown: + UNP_LINK_WUNLOCK(); unp->unp_socket->so_pcb = NULL; saved_unp_addr = unp->unp_addr; unp->unp_addr = NULL; @@ -761,7 +763,6 @@ error = solisten_proto_check(so); if (error == 0) { cru2x(td->td_ucred, &unp->unp_peercred); - unp->unp_flags |= UNP_HAVEPCCACHED; solisten_proto(so, backlog); } SOCK_UNLOCK(so); @@ -1319,7 +1320,7 @@ { struct sockaddr_un *soun = (struct sockaddr_un *)nam; struct vnode *vp; - struct socket *so2, *so3; + struct socket *so2; struct unpcb *unp, *unp2, *unp3; struct nameidata nd; char buf[SOCK_MAXADDRLEN]; @@ -1386,31 +1387,30 @@ * and to protect simultaneous locking of multiple pcbs. */ UNP_LINK_WLOCK(); - VOP_UNP_CONNECT(vp, &so2); - if (so2 == NULL) { + VOP_UNP_CONNECT(vp, &unp2); + if (unp2 == NULL) { error = ECONNREFUSED; goto bad2; } + so2 = unp2->unp_socket; if (so->so_type != so2->so_type) { error = EPROTOTYPE; goto bad2; } + UNP_PCB_LOCK(unp); + UNP_PCB_LOCK(unp2); if (so->so_proto->pr_flags & PR_CONNREQUIRED) { if (so2->so_options & SO_ACCEPTCONN) { CURVNET_SET(so2->so_vnet); - so3 = sonewconn(so2, 0); + so2 = sonewconn(so2, 0); CURVNET_RESTORE(); } else - so3 = NULL; - if (so3 == NULL) { + so2 = NULL; + if (so2 == NULL) { error = ECONNREFUSED; - goto bad2; + goto bad3; } - unp = sotounpcb(so); - unp2 = sotounpcb(so2); - unp3 = sotounpcb(so3); - UNP_PCB_LOCK(unp); - UNP_PCB_LOCK(unp2); + unp3 = sotounpcb(so2); UNP_PCB_LOCK(unp3); if (unp2->unp_addr != NULL) { bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len); @@ -1431,30 +1431,24 @@ * listen(); uipc_listen() cached that process's credentials * at that time so we can use them now. */ - KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED, - ("unp_connect: listener without cached peercred")); memcpy(&unp->unp_peercred, &unp2->unp_peercred, sizeof(unp->unp_peercred)); unp->unp_flags |= UNP_HAVEPC; if (unp2->unp_flags & UNP_WANTCRED) unp3->unp_flags |= UNP_WANTCRED; - UNP_PCB_UNLOCK(unp3); UNP_PCB_UNLOCK(unp2); - UNP_PCB_UNLOCK(unp); + unp2 = unp3; #ifdef MAC - mac_socketpeer_set_from_socket(so, so3); - mac_socketpeer_set_from_socket(so3, so); + mac_socketpeer_set_from_socket(so, so2); + mac_socketpeer_set_from_socket(so2, so); #endif - - so2 = so3; } - unp = sotounpcb(so); - KASSERT(unp != NULL, ("unp_connect: unp == NULL")); - unp2 = sotounpcb(so2); - KASSERT(unp2 != NULL, ("unp_connect: unp2 == NULL")); - UNP_PCB_LOCK(unp); - UNP_PCB_LOCK(unp2); + + KASSERT(unp2 != NULL && so2 != NULL && unp2->unp_socket == so2 && + sotounpcb(so2) == unp2, + ("%s: unp2 %p so2 %p", __func__, unp2, so2)); error = unp_connect2(so, so2, PRU_CONNECT); +bad3: UNP_PCB_UNLOCK(unp2); UNP_PCB_UNLOCK(unp); bad2: @@ -2237,8 +2231,7 @@ static void unp_gc_process(struct unpcb *unp) { - struct socket *soa; - struct socket *so; + struct socket *so, *soa; struct file *fp; /* Already processed. */ @@ -2258,28 +2251,30 @@ return; } - /* - * Mark all sockets we reference with RIGHTS. - */ so = unp->unp_socket; - if ((unp->unp_gcflag & UNPGC_IGNORE_RIGHTS) == 0) { - SOCKBUF_LOCK(&so->so_rcv); - unp_scan(so->so_rcv.sb_mb, unp_accessable); - SOCKBUF_UNLOCK(&so->so_rcv); - } - - /* - * Mark all sockets in our accept queue. - */ - ACCEPT_LOCK(); - TAILQ_FOREACH(soa, &so->so_comp, so_list) { - if ((sotounpcb(soa)->unp_gcflag & UNPGC_IGNORE_RIGHTS) != 0) - continue; - SOCKBUF_LOCK(&soa->so_rcv); - unp_scan(soa->so_rcv.sb_mb, unp_accessable); - SOCKBUF_UNLOCK(&soa->so_rcv); + SOCK_LOCK(so); + if (SOLISTENING(so)) { + /* + * Mark all sockets in our accept queue. + */ + TAILQ_FOREACH(soa, &so->sol_comp, so_list) { + if (sotounpcb(soa)->unp_gcflag & UNPGC_IGNORE_RIGHTS) + continue; + SOCKBUF_LOCK(&soa->so_rcv); + unp_scan(soa->so_rcv.sb_mb, unp_accessable); + SOCKBUF_UNLOCK(&soa->so_rcv); + } + } else { + /* + * Mark all sockets we reference with RIGHTS. + */ + if ((unp->unp_gcflag & UNPGC_IGNORE_RIGHTS) == 0) { + SOCKBUF_LOCK(&so->so_rcv); + unp_scan(so->so_rcv.sb_mb, unp_accessable); + SOCKBUF_UNLOCK(&so->so_rcv); + } } - ACCEPT_UNLOCK(); + SOCK_UNLOCK(so); unp->unp_gcflag |= UNPGC_SCANNED; } @@ -2399,7 +2394,8 @@ UNP_LIST_LOCK(); unp->unp_gcflag |= UNPGC_IGNORE_RIGHTS; UNP_LIST_UNLOCK(); - unp_dispose_mbuf(so->so_rcv.sb_mb); + if (!SOLISTENING(so)) + unp_dispose_mbuf(so->so_rcv.sb_mb); } static void @@ -2454,7 +2450,6 @@ void vfs_unp_reclaim(struct vnode *vp) { - struct socket *so; struct unpcb *unp; int active; @@ -2464,10 +2459,7 @@ active = 0; UNP_LINK_WLOCK(); - VOP_UNP_CONNECT(vp, &so); - if (so == NULL) - goto done; - unp = sotounpcb(so); + VOP_UNP_CONNECT(vp, &unp); if (unp == NULL) goto done; UNP_PCB_LOCK(unp); @@ -2503,10 +2495,6 @@ db_printf("%sUNP_HAVEPC", comma ? ", " : ""); comma = 1; } - if (unp_flags & UNP_HAVEPCCACHED) { - db_printf("%sUNP_HAVEPCCACHED", comma ? ", " : ""); - comma = 1; - } if (unp_flags & UNP_WANTCRED) { db_printf("%sUNP_WANTCRED", comma ? ", " : ""); comma = 1; Index: sys/kern/vfs_default.c =================================================================== --- sys/kern/vfs_default.c +++ sys/kern/vfs_default.c @@ -1128,7 +1128,7 @@ vop_stdunp_bind(struct vop_unp_bind_args *ap) { - ap->a_vp->v_socket = ap->a_socket; + ap->a_vp->v_unpcb = ap->a_unpcb; return (0); } @@ -1136,7 +1136,7 @@ vop_stdunp_connect(struct vop_unp_connect_args *ap) { - *ap->a_socket = ap->a_vp->v_socket; + *ap->a_unpcb = ap->a_vp->v_unpcb; return (0); } @@ -1144,7 +1144,7 @@ vop_stdunp_detach(struct vop_unp_detach_args *ap) { - ap->a_vp->v_socket = NULL; + ap->a_vp->v_unpcb = NULL; return (0); } Index: sys/kern/vfs_subr.c =================================================================== --- sys/kern/vfs_subr.c +++ sys/kern/vfs_subr.c @@ -2992,7 +2992,10 @@ /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */ vp->v_op = NULL; #endif - bzero(&vp->v_un, sizeof(vp->v_un)); + vp->v_mountedhere = NULL; + vp->v_unpcb = NULL; + vp->v_rdev = NULL; + vp->v_fifoinfo = NULL; vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; vp->v_iflag = 0; vp->v_vflag = 0; Index: sys/kern/vnode_if.src =================================================================== --- sys/kern/vnode_if.src +++ sys/kern/vnode_if.src @@ -662,7 +662,7 @@ vop_unp_bind { IN struct vnode *vp; - IN struct socket *socket; + IN struct unpcb *unpcb; }; @@ -670,7 +670,7 @@ vop_unp_connect { IN struct vnode *vp; - OUT struct socket **socket; + OUT struct unpcb **unpcb; }; Index: sys/netinet/tcp_input.c =================================================================== --- sys/netinet/tcp_input.c +++ sys/netinet/tcp_input.c @@ -1387,10 +1387,10 @@ TCP_PROBE3(debug__input, tp, th, m); tcp_dooptions(&to, optp, optlen, TO_SYN); #ifdef TCP_RFC7413 - if (syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL)) + if (syncache_add(&inc, &to, th, inp, so, m, NULL, NULL)) goto tfo_socket_result; #else - syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL); + syncache_add(&inc, &to, th, inp, so, m, NULL, NULL); #endif /* * Entry added to syncache and mbuf consumed. Index: sys/netinet/tcp_subr.c =================================================================== --- sys/netinet/tcp_subr.c +++ sys/netinet/tcp_subr.c @@ -1576,7 +1576,6 @@ ("tcp_close: !SS_PROTOREF")); inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); - ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); Index: sys/netinet/tcp_syncache.h =================================================================== --- sys/netinet/tcp_syncache.h +++ sys/netinet/tcp_syncache.h @@ -42,7 +42,7 @@ int syncache_expand(struct in_conninfo *, struct tcpopt *, struct tcphdr *, struct socket **, struct mbuf *); int syncache_add(struct in_conninfo *, struct tcpopt *, - struct tcphdr *, struct inpcb *, struct socket **, struct mbuf *, + struct tcphdr *, struct inpcb *, struct socket *, struct mbuf *, void *, void *); void syncache_chkrst(struct in_conninfo *, struct tcphdr *); void syncache_badack(struct in_conninfo *); Index: sys/netinet/tcp_syncache.c =================================================================== --- sys/netinet/tcp_syncache.c +++ sys/netinet/tcp_syncache.c @@ -146,9 +146,9 @@ struct socket *); static void syncookie_reseed(void *); #ifdef INVARIANTS -static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, - struct syncache *sc, struct tcphdr *th, struct tcpopt *to, - struct socket *lso); +static int syncookie_cmp(struct in_conninfo *, struct syncache_head *, + struct syncache *, struct tcphdr *, struct tcpopt *, + struct socket *); #endif /* @@ -1160,10 +1160,11 @@ } #ifdef TCP_RFC7413 -static void -syncache_tfo_expand(struct syncache *sc, struct socket **lsop, struct mbuf *m, +static struct socket * +syncache_tfo_expand(struct syncache *sc, struct socket *lso, struct mbuf *m, uint64_t response_cookie) { + struct socket *so; struct inpcb *inp; struct tcpcb *tp; unsigned int *pending_counter; @@ -1175,12 +1176,12 @@ INP_INFO_RLOCK_ASSERT(&V_tcbinfo); pending_counter = intotcpcb(sotoinpcb(*lsop))->t_tfo_pending; - *lsop = syncache_socket(sc, *lsop, m); - if (*lsop == NULL) { + so = syncache_socket(sc, lso, m); + if (so == NULL) { TCPSTAT_INC(tcps_sc_aborted); atomic_subtract_int(pending_counter, 1); } else { - inp = sotoinpcb(*lsop); + inp = sotoinpcb(so); tp = intotcpcb(inp); tp->t_flags |= TF_FASTOPEN; tp->t_tfo_cookie = response_cookie; @@ -1189,6 +1190,7 @@ tp->t_tfo_pending = pending_counter; TCPSTAT_INC(tcps_sc_completed); } + return (so); } #endif /* TCP_RFC7413 */ @@ -1213,11 +1215,10 @@ */ int syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, - struct inpcb *inp, struct socket **lsop, struct mbuf *m, void *tod, + struct inpcb *inp, struct socket *so, struct mbuf *m, void *tod, void *todctx) { struct tcpcb *tp; - struct socket *so; struct syncache *sc = NULL; struct syncache_head *sch; struct mbuf *ipopts = NULL; @@ -1248,7 +1249,7 @@ * Combine all so/tp operations very early to drop the INP lock as * soon as possible. */ - so = *lsop; + KASSERT(SOLISTENING(so), ("%s: %p not listening", __func__, so)); tp = sototcpcb(so); cred = crhold(so->so_cred); @@ -1259,7 +1260,7 @@ #endif ip_ttl = inp->inp_ip_ttl; ip_tos = inp->inp_ip_tos; - win = sbspace(&so->so_rcv); + win = so->sol_sbrcv_hiwat; ltflags = (tp->t_flags & (TF_NOOPT | TF_SIGNATURE)); #ifdef TCP_RFC7413 @@ -1272,7 +1273,7 @@ * listen queue with bogus TFO connections. */ if (atomic_fetchadd_int(tp->t_tfo_pending, 1) <= - (so->so_qlimit / 2)) { + (so->sol_qlimit / 2)) { int result; result = tcp_fastopen_check_cookie(inc, @@ -1566,10 +1567,7 @@ } done: - if (m) { - *lsop = NULL; - m_freem(m); - } + m_freem(m); #ifdef TCP_RFC7413 /* * If tfo_pending is not NULL here, then a TFO SYN that did not @@ -2098,7 +2096,7 @@ sc->sc_flags |= SCF_WINSCALE; } - wnd = sbspace(&lso->so_rcv); + wnd = lso->sol_sbrcv_hiwat; wnd = imax(wnd, 0); wnd = imin(wnd, TCP_MAXWIN); sc->sc_wnd = wnd; Index: sys/netinet/tcp_timewait.c =================================================================== --- sys/netinet/tcp_timewait.c +++ sys/netinet/tcp_timewait.c @@ -352,7 +352,6 @@ ("tcp_twstart: !SS_PROTOREF")); inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); - ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); @@ -491,7 +490,6 @@ if (inp->inp_flags & INP_SOCKREF) { inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); - ACCEPT_LOCK(); SOCK_LOCK(so); KASSERT(so->so_state & SS_PROTOREF, ("tcp_twclose: INP_SOCKREF && !SS_PROTOREF")); Index: sys/sys/sockbuf.h =================================================================== --- sys/sys/sockbuf.h +++ sys/sys/sockbuf.h @@ -32,7 +32,6 @@ */ #ifndef _SYS_SOCKBUF_H_ #define _SYS_SOCKBUF_H_ -#include /* for struct selinfo */ #include #include #include @@ -64,6 +63,7 @@ struct sockaddr; struct socket; struct thread; +struct selinfo; struct xsockbuf { u_int sb_cc; @@ -84,9 +84,9 @@ * (a) locked by SOCKBUF_LOCK(). */ struct sockbuf { - struct selinfo sb_sel; /* process selecting read/write */ - struct mtx sb_mtx; /* sockbuf lock */ - struct sx sb_sx; /* prevent I/O interlacing */ + struct mtx sb_mtx; /* sockbuf lock */ + struct sx sb_sx; /* prevent I/O interlacing */ + struct selinfo *sb_sel; /* process selecting read/write */ short sb_state; /* (a) socket state on sockbuf */ #define sb_startzero sb_mb struct mbuf *sb_mb; /* (a) the mbuf chain */ Index: sys/sys/socket.h =================================================================== --- sys/sys/socket.h +++ sys/sys/socket.h @@ -704,9 +704,5 @@ void so_lock(struct socket *so); void so_unlock(struct socket *so); -void so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), void *arg); - -#endif - - +#endif /* _KERNEL */ #endif /* !_SYS_SOCKET_H_ */ Index: sys/sys/socketvar.h =================================================================== --- sys/sys/socketvar.h +++ sys/sys/socketvar.h @@ -63,60 +63,35 @@ * Locking key to struct socket: * (a) constant after allocation, no locking required. * (b) locked by SOCK_LOCK(so). - * (c) locked by SOCKBUF_LOCK(&so->so_rcv). - * (e) locked by ACCEPT_LOCK(). + * (cr) locked by SOCKBUF_LOCK(&so->so_rcv). + * (cs) locked by SOCKBUF_LOCK(&so->so_rcv). + * (e) locked by SOLISTEN_LOCK() of corresponding listening socket. * (f) not locked since integer reads/writes are atomic. * (g) used only as a sleep/wakeup address, no value. * (h) locked by global mutex so_global_mtx. */ +TAILQ_HEAD(accept_queue, socket); struct socket { - int so_count; /* (b) reference count */ + struct mtx so_lock; + volatile u_int so_count; /* (b / refcount) */ + struct selinfo so_rdsel; /* (b/cr) for so_rcv/so_comp */ + struct selinfo so_wrsel; /* (b/cs) for so_snd */ short so_type; /* (a) generic type, see socket.h */ - short so_options; /* from socket call, see socket.h */ - short so_linger; /* time to linger while closing */ + short so_options; /* (b) from socket call, see socket.h */ + short so_linger; /* time to linger close(2) */ short so_state; /* (b) internal state flags SS_* */ - int so_qstate; /* (e) internal state flags SQ_* */ void *so_pcb; /* protocol control block */ struct vnet *so_vnet; /* (a) network stack instance */ struct protosw *so_proto; /* (a) protocol handle */ -/* - * Variables for connection queuing. - * Socket where accepts occur is so_head in all subsidiary sockets. - * If so_head is 0, socket is not related to an accept. - * For head socket so_incomp queues partially completed connections, - * while so_comp is a queue of connections ready to be accepted. - * If a connection is aborted and it has so_head set, then - * it has to be pulled out of either so_incomp or so_comp. - * We allow connections to queue up based on current queue lengths - * and limit on number of queued connections for this socket. - */ - struct socket *so_head; /* (e) back pointer to listen socket */ - TAILQ_HEAD(, socket) so_incomp; /* (e) queue of partial unaccepted connections */ - TAILQ_HEAD(, socket) so_comp; /* (e) queue of complete unaccepted connections */ - TAILQ_ENTRY(socket) so_list; /* (e) list of unaccepted connections */ - u_int so_qlen; /* (e) number of unaccepted connections */ - u_int so_incqlen; /* (e) number of unaccepted incomplete - connections */ - u_int so_qlimit; /* (e) max number queued connections */ short so_timeo; /* (g) connection timeout */ u_short so_error; /* (f) error affecting connection */ struct sigio *so_sigio; /* [sg] information for async I/O or out of band data (SIGURG) */ - u_long so_oobmark; /* (c) chars to oob mark */ - - struct sockbuf so_rcv, so_snd; - struct ucred *so_cred; /* (a) user credentials */ struct label *so_label; /* (b) MAC label for socket */ - struct label *so_peerlabel; /* (b) cached MAC label for peer */ /* NB: generation count must not be first. */ so_gen_t so_gencnt; /* (h) generation count */ void *so_emuldata; /* (b) private data for emulators */ - struct so_accf { - struct accept_filter *so_accept_filter; - void *so_accept_filter_arg; /* saved filter args */ - char *so_accept_filter_str; /* saved user args */ - } *so_accf; struct osd osd; /* Object Specific extensions */ /* * so_fibnum, so_user_cookie and friends can be used to attach @@ -129,39 +104,82 @@ int so_ts_clock; /* type of the clock used for timestamps */ uint32_t so_max_pacing_rate; /* (f) TX rate limit in bytes/s */ - - void *so_pspare[2]; /* general use */ - int so_ispare[2]; /* general use */ + union { + /* Regular (data flow) socket. */ + struct { + /* (cr, cs) Receive and send buffers. */ + struct sockbuf so_rcv, so_snd; + + /* (e) Our place on accept queue. */ + TAILQ_ENTRY(socket) so_list; + struct socket *so_listen; /* (b) */ + enum { + SQ_NONE = 0, + SQ_INCOMP = 0x0800, /* on sol_incomp */ + SQ_COMP = 0x1000, /* on sol_comp */ + } so_qstate; /* (b) */ + + /* (b) cached MAC label for peer */ + struct label *so_peerlabel; + u_long so_oobmark; /* chars to oob mark */ + }; + /* + * Listening socket, where accepts occur, is so_listen in all + * subsidiary sockets. If so_listen is NULL, socket is not + * related to an accept. For a listening socket itself + * sol_incomp queues partially completed connections, while + * sol_comp is a queue of connections ready to be accepted. + * If a connection is aborted and it has so_listen set, then + * it has to be pulled out of either sol_incomp or sol_comp. + * We allow connections to queue up based on current queue + * lengths and limit on number of queued connections for this + * socket. + */ + struct { + /* (e) queue of partial unaccepted connections */ + struct accept_queue sol_incomp; + /* (e) queue of complete unaccepted connections */ + struct accept_queue sol_comp; + u_int sol_qlen; /* (e) sol_comp length */ + u_int sol_incqlen; /* (e) sol_incomp length */ + u_int sol_qlimit; /* (e) queue limit */ + + /* accept_filter(9) optional data */ + struct accept_filter *sol_accept_filter; + void *sol_accept_filter_arg; /* saved filter args */ + char *sol_accept_filter_str; /* saved user args */ + + /* Socket buffer parameters, to be copied to + * dataflow sockets, accepted from this one. */ + int sol_sbrcv_lowat; + int sol_sbsnd_lowat; + u_int sol_sbrcv_hiwat; + u_int sol_sbsnd_hiwat; + short sol_sbrcv_flags; + short sol_sbsnd_flags; + sbintime_t sol_sbrcv_timeo; + sbintime_t sol_sbsnd_timeo; + }; + }; }; -/* - * Global accept mutex to serialize access to accept queues and - * fields associated with multiple sockets. This allows us to - * avoid defining a lock order between listen and accept sockets - * until such time as it proves to be a good idea. - */ -extern struct mtx accept_mtx; -#define ACCEPT_LOCK_ASSERT() mtx_assert(&accept_mtx, MA_OWNED) -#define ACCEPT_UNLOCK_ASSERT() mtx_assert(&accept_mtx, MA_NOTOWNED) -#define ACCEPT_LOCK() mtx_lock(&accept_mtx) -#define ACCEPT_UNLOCK() mtx_unlock(&accept_mtx) - -/* - * Per-socket mutex: we reuse the receive socket buffer mutex for space - * efficiency. This decision should probably be revisited as we optimize - * locking for the socket code. - */ -#define SOCK_MTX(_so) SOCKBUF_MTX(&(_so)->so_rcv) -#define SOCK_LOCK(_so) SOCKBUF_LOCK(&(_so)->so_rcv) -#define SOCK_OWNED(_so) SOCKBUF_OWNED(&(_so)->so_rcv) -#define SOCK_UNLOCK(_so) SOCKBUF_UNLOCK(&(_so)->so_rcv) -#define SOCK_LOCK_ASSERT(_so) SOCKBUF_LOCK_ASSERT(&(_so)->so_rcv) - -/* - * Socket state bits stored in so_qstate. - */ -#define SQ_INCOMP 0x0800 /* unaccepted, incomplete connection */ -#define SQ_COMP 0x1000 /* unaccepted, complete connection */ +#define SOCK_LOCK(so) mtx_lock(&(so)->so_lock) +#define SOCK_OWNED(so) mtx_owned(&(so)->so_lock) +#define SOCK_UNLOCK(so) mtx_unlock(&(so)->so_lock) +#define SOCK_LOCK_ASSERT(so) mtx_assert(&(so)->so_lock, MA_OWNED) +#define SOCK_UNLOCK_ASSERT(so) mtx_assert(&(so)->so_lock, MA_NOTOWNED) + +#define SOLISTENING(so) (((so)->so_options & SO_ACCEPTCONN) != 0) +#define SOLISTEN_LOCK(sol) do { \ + mtx_lock(&(sol)->so_lock); \ + KASSERT(SOLISTENING(sol), \ + ("%s: %p not listening", __func__, (sol))); \ +} while (0) +#define SOLISTEN_UNLOCK(sol) do { \ + KASSERT(SOLISTENING(sol), \ + ("%s: %p not listening", __func__, (sol))); \ + mtx_unlock(&(sol)->so_lock); \ +} while (0) /* * Externalized form of struct socket used by the sysctl(3) interface. @@ -212,8 +230,7 @@ /* can we read something from so? */ #define soreadabledata(so) \ - (sbavail(&(so)->so_rcv) >= (so)->so_rcv.sb_lowat || \ - !TAILQ_EMPTY(&(so)->so_comp) || (so)->so_error) + (sbavail(&(so)->so_rcv) >= (so)->so_rcv.sb_lowat || (so)->so_error) #define soreadable(so) \ (soreadabledata(so) || ((so)->so_rcv.sb_state & SBS_CANTRCVMORE)) @@ -226,26 +243,19 @@ (so)->so_error) /* - * soref()/sorele() ref-count the socket structure. Note that you must - * still explicitly close the socket, but the last ref count will free - * the structure. + * soref()/sorele() ref-count the socket structure. + * soref() may be called without owning socket lock, but in that case a + * caller must own something that holds socket, and so_count must be not 0. + * Note that you must still explicitly close the socket, but the last ref + * count will free the structure. */ -#define soref(so) do { \ - SOCK_LOCK_ASSERT(so); \ - ++(so)->so_count; \ -} while (0) - +#define soref(so) refcount_acquire(&(so)->so_count) #define sorele(so) do { \ - ACCEPT_LOCK_ASSERT(); \ SOCK_LOCK_ASSERT(so); \ - if ((so)->so_count <= 0) \ - panic("sorele"); \ - if (--(so)->so_count == 0) \ + if (refcount_release(&(so)->so_count)) \ sofree(so); \ - else { \ + else \ SOCK_UNLOCK(so); \ - ACCEPT_UNLOCK(); \ - } \ } while (0) /* @@ -370,8 +380,8 @@ int solisten_proto_check(struct socket *so); struct socket * sonewconn(struct socket *head, int connstatus); - - +struct socket * + sopeeloff(struct socket *); int sopoll(struct socket *so, int events, struct ucred *active_cred, struct thread *td); int sopoll_generic(struct socket *so, int events, Index: sys/sys/sockopt.h =================================================================== --- sys/sys/sockopt.h +++ sys/sys/sockopt.h @@ -64,8 +64,8 @@ int soopt_getm(struct sockopt *sopt, struct mbuf **mp); int soopt_mcopyin(struct sockopt *sopt, struct mbuf *m); int soopt_mcopyout(struct sockopt *sopt, struct mbuf *m); -int do_getopt_accept_filter(struct socket *so, struct sockopt *sopt); -int do_setopt_accept_filter(struct socket *so, struct sockopt *sopt); +int accept_filt_getopt(struct socket *, struct sockopt *); +int accept_filt_setopt(struct socket *, struct sockopt *); int so_setsockopt(struct socket *so, int level, int optname, void *optval, size_t optlen); Index: sys/sys/unpcb.h =================================================================== --- sys/sys/unpcb.h +++ sys/sys/unpcb.h @@ -92,14 +92,8 @@ * and is really the credentials of the connected peer. This is used * to determine whether the contents should be sent to the user or * not. - * - * UNP_HAVEPCCACHED - indicates that the unp_peercred member is filled - * in, but does *not* contain the credentials of the connected peer - * (there may not even be a peer). This is set in unp_listen() when - * it fills in unp_peercred for later consumption by unp_connect(). */ #define UNP_HAVEPC 0x001 -#define UNP_HAVEPCCACHED 0x002 #define UNP_WANTCRED 0x004 /* credentials wanted */ #define UNP_CONNWAIT 0x008 /* connect blocks until accepted */ Index: sys/sys/vnode.h =================================================================== --- sys/sys/vnode.h +++ sys/sys/vnode.h @@ -112,14 +112,13 @@ /* * Type specific fields, only one applies to any given vnode. - * See #defines below for renaming to v_* namespace. */ union { - struct mount *vu_mount; /* v ptr to mountpoint (VDIR) */ - struct socket *vu_socket; /* v unix domain net (VSOCK) */ - struct cdev *vu_cdev; /* v device (VCHR, VBLK) */ - struct fifoinfo *vu_fifoinfo; /* v fifo (VFIFO) */ - } v_un; + struct mount *v_mountedhere; /* v ptr to mountpoint (VDIR) */ + struct unpcb *v_unpcb; /* v unix domain net (VSOCK) */ + struct cdev *v_rdev; /* v device (VCHR, VBLK) */ + struct fifoinfo *v_fifoinfo; /* v fifo (VFIFO) */ + }; /* * vfs_hash: (mount + inode) -> vnode hash. The hash value @@ -175,11 +174,6 @@ #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */ -#define v_mountedhere v_un.vu_mount -#define v_socket v_un.vu_socket -#define v_rdev v_un.vu_cdev -#define v_fifoinfo v_un.vu_fifoinfo - #define bo2vnode(bo) __containerof((bo), struct vnode, v_bufobj) /* XXX: These are temporary to avoid a source sweep at this time */ @@ -200,7 +194,7 @@ long xv_numoutput; /* num of writes in progress */ enum vtype xv_type; /* vnode type */ union { - void *xvu_socket; /* socket, if VSOCK */ + void *xvu_socket; /* unpcb, if VSOCK */ void *xvu_fifo; /* fifo, if VFIFO */ dev_t xvu_rdev; /* maj/min, if VBLK/VCHR */ struct { Index: usr.bin/netstat/inet.c =================================================================== --- usr.bin/netstat/inet.c +++ usr.bin/netstat/inet.c @@ -170,14 +170,17 @@ if (kread((uintptr_t)proto.pr_domain, &domain, sizeof(domain)) != 0) return (-1); xso->xso_family = domain.dom_family; - xso->so_qlen = so->so_qlen; - xso->so_incqlen = so->so_incqlen; - xso->so_qlimit = so->so_qlimit; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; - xso->so_oobmark = so->so_oobmark; - sbtoxsockbuf(&so->so_snd, &xso->so_snd); - sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); + if (SOLISTENING(so)) { + xso->so_qlen = so->sol_qlen; + xso->so_incqlen = so->sol_incqlen; + xso->so_qlimit = so->sol_qlimit; + } else { + sbtoxsockbuf(&so->so_snd, &xso->so_snd); + sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); + xso->so_oobmark = so->so_oobmark; + } return (0); }