Index: share/man/man9/accept_filter.9 =================================================================== --- share/man/man9/accept_filter.9 +++ share/man/man9/accept_filter.9 @@ -25,7 +25,7 @@ .\" .\" $FreeBSD$ .\" " -.Dd June 25, 2000 +.Dd January 26, 2017 .Dt ACCEPT_FILTER 9 .Os .Sh NAME @@ -67,8 +67,8 @@ struct accept_filter { char accf_name[16]; void (*accf_callback)(struct socket *so, void *arg, int waitflag); - void * (*accf_create)(struct socket *so, char *arg); - void (*accf_destroy)(struct socket *so); + void * (*accf_create)(struct solisten *sol, char *arg); + void (*accf_destroy)(struct solisten *sol); SLIST_ENTRY(accept_filter) accf_next; /* next on the list */ }; .Ed Index: sys/kern/kern_sendfile.c =================================================================== --- sys/kern/kern_sendfile.c +++ sys/kern/kern_sendfile.c @@ -510,7 +510,7 @@ * The socket must be a stream socket and connected. */ error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SEND), - sock_fp, NULL, NULL); + sock_fp, NULL, NULL, DTYPE_SOCKET); if (error != 0) return (error); *so = (*sock_fp)->f_data; Index: sys/kern/sys_socket.c =================================================================== --- sys/kern/sys_socket.c +++ sys/kern/sys_socket.c @@ -115,6 +115,28 @@ .fo_flags = DFLAG_PASSABLE }; +static fo_rdwr_t notconn_readwrite; +static fo_ioctl_t sol_ioctl; +static fo_poll_t sol_poll; +extern fo_kqfilter_t sol_kqfilter; +static fo_stat_t sol_stat; +static fo_close_t sol_close; + +struct fileops solistenops = { + .fo_read = notconn_readwrite, + .fo_write = notconn_readwrite, + .fo_truncate = invfo_truncate, + .fo_ioctl = sol_ioctl, + .fo_poll = sol_poll, + .fo_kqfilter = sol_kqfilter, + .fo_stat = sol_stat, /* XXXGL: to do? 
*/ + .fo_close = sol_close, + .fo_chmod = invfo_chmod, + .fo_chown = invfo_chown, + .fo_sendfile = invfo_sendfile, + .fo_flags = DFLAG_PASSABLE +}; + static int soo_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td) @@ -153,6 +175,14 @@ } static int +notconn_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, + int flags, struct thread *td) +{ + + return (ENOTCONN); +} + +static int soo_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, struct thread *td) { @@ -262,6 +292,61 @@ } static int +sol_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred, + struct thread *td) +{ + struct solisten *sol = fp->f_data; + int error = 0; + + switch (cmd) { + case FIONBIO: + SOLISTEN_LOCK(sol); + if (*(int *)data) + sol->sol_state |= SS_NBIO; + else + sol->sol_state &= ~SS_NBIO; + SOLISTEN_UNLOCK(sol); + break; + + case FIOASYNC: + SOLISTEN_LOCK(sol); + /* To be copied to child sockets. */ + if (*(int *)data) + sol->sol_state |= SS_ASYNC; + else + sol->sol_state &= ~SS_ASYNC; + SOLISTEN_UNLOCK(sol); + break; + + case FIOSETOWN: + error = fsetown(*(int *)data, &sol->sol_sigio); + break; + + case FIOGETOWN: + *(int *)data = fgetown(&sol->sol_sigio); + break; + + case SIOCSPGRP: + error = fsetown(-(*(int *)data), &sol->sol_sigio); + break; + + case SIOCGPGRP: + *(int *)data = -fgetown(&sol->sol_sigio); + break; + + case FIONREAD: + case FIONWRITE: + case FIONSPACE: + case SIOCATMARK: + /* XXXGL: find a better error code for these. ENOBUFS? 
*/ + default: + error = EINVAL; + } + + return (error); +} + +static int soo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td) { @@ -277,6 +362,22 @@ } static int +sol_poll(struct file *fp, int events, struct ucred *active_cred, + struct thread *td) +{ + struct solisten *sol = fp->f_data; +#ifdef MAC + int error; + + /* XXXGL */ + error = mac_socket_check_poll(active_cred, (struct socket *)sol); + if (error) + return (error); +#endif + return (solistenpoll(sol, events, fp->f_cred, td)); +} + +static int soo_stat(struct file *fp, struct stat *ub, struct ucred *active_cred, struct thread *td) { @@ -314,6 +415,14 @@ return (*so->so_proto->pr_usrreqs->pru_sense)(so, ub); } +static int +sol_stat(struct file *fp, struct stat *ub, struct ucred *active_cred, + struct thread *td) +{ + + return (EINVAL); +} + /* * API socket close on file pointer. We call soclose() to close the socket * (including initiating closing protocols). soclose() will sorele() the @@ -336,6 +445,17 @@ } static int +sol_close(struct file *fp, struct thread *td) +{ + struct solisten *sol = fp->f_data; + + fp->f_ops = &badfileops; + fp->f_data = NULL; + + return (solistenclose(sol)); +} + +static int soo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp) { struct sockaddr *sa; @@ -695,7 +815,6 @@ sb->sb_flags &= ~SB_AIO_RUNNING; SOCKBUF_UNLOCK(sb); - ACCEPT_LOCK(); SOCK_LOCK(so); sorele(so); } Index: sys/kern/uipc_accf.c =================================================================== --- sys/kern/uipc_accf.c +++ sys/kern/uipc_accf.c @@ -162,28 +162,23 @@ } int -do_getopt_accept_filter(struct socket *so, struct sockopt *sopt) +accept_filt_getopt(struct solisten *sol, struct sockopt *sopt) { struct accept_filter_arg *afap; int error; error = 0; - afap = malloc(sizeof(*afap), M_TEMP, - M_WAITOK | M_ZERO); - SOCK_LOCK(so); - if ((so->so_options & SO_ACCEPTCONN) == 0) { + afap = malloc(sizeof(*afap), M_TEMP, M_WAITOK | M_ZERO); + SOLISTEN_LOCK(sol); + 
if (sol->sol_accept_filter == NULL) { error = EINVAL; goto out; } - if ((so->so_options & SO_ACCEPTFILTER) == 0) { - error = EINVAL; - goto out; - } - strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name); - if (so->so_accf->so_accept_filter_str != NULL) - strcpy(afap->af_arg, so->so_accf->so_accept_filter_str); + strcpy(afap->af_name, sol->sol_accept_filter->accf_name); + if (sol->sol_accept_filter_str != NULL) + strcpy(afap->af_arg, sol->sol_accept_filter_str); out: - SOCK_UNLOCK(so); + SOLISTEN_UNLOCK(sol); if (error == 0) error = sooptcopyout(sopt, afap, sizeof(*afap)); free(afap, M_TEMP); @@ -191,35 +186,29 @@ } int -do_setopt_accept_filter(struct socket *so, struct sockopt *sopt) +accept_filt_setopt(struct solisten *sol, struct sockopt *sopt) { struct accept_filter_arg *afap; struct accept_filter *afp; - struct so_accf *newaf; - int error = 0; + char *accept_filter_str = NULL; + void *accept_filter_arg = NULL; + int error; /* * Handle the simple delete case first. */ if (sopt == NULL || sopt->sopt_val == NULL) { - SOCK_LOCK(so); - if ((so->so_options & SO_ACCEPTCONN) == 0) { - SOCK_UNLOCK(so); - return (EINVAL); + SOLISTEN_LOCK(sol); + if (sol->sol_accept_filter != NULL) { + if (sol->sol_accept_filter->accf_destroy != NULL) + sol->sol_accept_filter->accf_destroy(sol); + if (sol->sol_accept_filter_str != NULL) + free(sol->sol_accept_filter_str, M_ACCF); + sol->sol_accept_filter = NULL; + sol->sol_accept_filter_arg = NULL; + sol->sol_accept_filter_str = NULL; } - if (so->so_accf != NULL) { - struct so_accf *af = so->so_accf; - if (af->so_accept_filter != NULL && - af->so_accept_filter->accf_destroy != NULL) { - af->so_accept_filter->accf_destroy(so); - } - if (af->so_accept_filter_str != NULL) - free(af->so_accept_filter_str, M_ACCF); - free(af, M_ACCF); - so->so_accf = NULL; - } - so->so_options &= ~SO_ACCEPTFILTER; - SOCK_UNLOCK(so); + SOLISTEN_UNLOCK(sol); return (0); } @@ -227,8 +216,7 @@ * Pre-allocate any memory we may need later to avoid 
blocking at * untimely moments. This does not optimize for invalid arguments. */ - afap = malloc(sizeof(*afap), M_TEMP, - M_WAITOK); + afap = malloc(sizeof(*afap), M_TEMP, M_WAITOK); error = sooptcopyin(sopt, afap, sizeof *afap, sizeof *afap); afap->af_name[sizeof(afap->af_name)-1] = '\0'; afap->af_arg[sizeof(afap->af_arg)-1] = '\0'; @@ -241,28 +229,18 @@ free(afap, M_TEMP); return (ENOENT); } - /* - * Allocate the new accept filter instance storage. We may - * have to free it again later if we fail to attach it. If - * attached properly, 'newaf' is NULLed to avoid a free() - * while in use. - */ - newaf = malloc(sizeof(*newaf), M_ACCF, M_WAITOK | - M_ZERO); + if (afp->accf_create != NULL && afap->af_name[0] != '\0') { size_t len = strlen(afap->af_name) + 1; - newaf->so_accept_filter_str = malloc(len, M_ACCF, - M_WAITOK); - strcpy(newaf->so_accept_filter_str, afap->af_name); + accept_filter_str = malloc(len, M_ACCF, M_WAITOK); + strcpy(accept_filter_str, afap->af_name); } /* - * Require a listen socket; don't try to replace an existing filter - * without first removing it. + * Don't try to replace an existing filter without first removing it. */ - SOCK_LOCK(so); - if (((so->so_options & SO_ACCEPTCONN) == 0) || - (so->so_accf != NULL)) { + SOLISTEN_LOCK(sol); + if (sol->sol_accept_filter != NULL) { error = EINVAL; goto out; } @@ -273,25 +251,19 @@ * can't block. 
*/ if (afp->accf_create != NULL) { - newaf->so_accept_filter_arg = - afp->accf_create(so, afap->af_arg); - if (newaf->so_accept_filter_arg == NULL) { + accept_filter_arg = afp->accf_create(sol, afap->af_arg); + if (accept_filter_arg == NULL) { error = EINVAL; goto out; } } - newaf->so_accept_filter = afp; - so->so_accf = newaf; - so->so_options |= SO_ACCEPTFILTER; - newaf = NULL; + sol->sol_accept_filter = afp; + sol->sol_accept_filter_arg = accept_filter_arg; + sol->sol_accept_filter_str = accept_filter_str; out: - SOCK_UNLOCK(so); - if (newaf != NULL) { - if (newaf->so_accept_filter_str != NULL) - free(newaf->so_accept_filter_str, M_ACCF); - free(newaf, M_ACCF); - } - if (afap != NULL) - free(afap, M_TEMP); + SOLISTEN_UNLOCK(sol); + if (accept_filter_str != NULL) + free(accept_filter_str, M_ACCF); + free(afap, M_TEMP); return (error); } Index: sys/kern/uipc_debug.c =================================================================== --- sys/kern/uipc_debug.c +++ sys/kern/uipc_debug.c @@ -84,10 +84,6 @@ db_printf("%sSO_DEBUG", comma ? ", " : ""); comma = 1; } - if (so_options & SO_ACCEPTCONN) { - db_printf("%sSO_ACCEPTCONN", comma ? ", " : ""); - comma = 1; - } if (so_options & SO_REUSEADDR) { db_printf("%sSO_REUSEADDR", comma ? 
", " : ""); comma = 1; @@ -458,15 +454,8 @@ db_print_protosw(so->so_proto, "so_proto", indent); db_print_indent(indent); - db_printf("so_head: %p ", so->so_head); - db_printf("so_incomp first: %p ", TAILQ_FIRST(&so->so_incomp)); - db_printf("so_comp first: %p\n", TAILQ_FIRST(&so->so_comp)); - - db_print_indent(indent); + db_printf("so_listen: %p ", so->so_listen); /* so_list skipped */ - db_printf("so_qlen: %u ", so->so_qlen); - db_printf("so_incqlen: %u ", so->so_incqlen); - db_printf("so_qlimit: %u ", so->so_qlimit); db_printf("so_timeo: %d ", so->so_timeo); db_printf("so_error: %d\n", so->so_error); @@ -478,6 +467,29 @@ db_print_sockbuf(&so->so_snd, "so_snd", indent); } +static void +db_print_solisten(struct solisten *sol, const char *socketname, int indent) +{ + + db_print_indent(indent); + db_printf("%s at %p\n", socketname, sol); + + indent += 2; + + db_print_indent(indent); + db_printf("sol_incomp first: %p ", TAILQ_FIRST(&sol->sol_incomp)); + db_printf("sol_comp first: %p\n", TAILQ_FIRST(&sol->sol_comp)); + db_printf("sol_qlen: %d ", sol->sol_qlen); + db_printf("sol_incqlen: %d ", sol->sol_incqlen); + db_printf("sol_qlimit: %d ", sol->sol_qlimit); + db_printf(")\n"); + + db_print_indent(indent); + db_printf("sol_options: 0x%x (", sol->sol_options); + db_print_sooptions(sol->sol_options); + db_printf(")\n"); +} + DB_SHOW_COMMAND(socket, db_show_socket) { struct socket *so; @@ -491,6 +503,19 @@ db_print_socket(so, "socket", 0); } +DB_SHOW_COMMAND(solisten, db_show_solisten) +{ + struct solisten *sol; + + if (!have_addr) { + db_printf("usage: show solisten \n"); + return; + } + sol = (struct solisten *)addr; + + db_print_solisten(sol, "solisten", 0); +} + DB_SHOW_COMMAND(sockbuf, db_show_sockbuf) { struct sockbuf *sb; Index: sys/kern/uipc_socket.c =================================================================== --- sys/kern/uipc_socket.c +++ sys/kern/uipc_socket.c @@ -106,6 +106,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_compat.h" 
+#include "opt_sctp.h" #include #include @@ -159,14 +160,17 @@ static int filt_soread(struct knote *kn, long hint); static void filt_sowdetach(struct knote *kn); static int filt_sowrite(struct knote *kn, long hint); +static int filt_soempty(struct knote *kn, long hint); static int filt_solisten(struct knote *kn, long hint); +static void filt_soldetach(struct knote *kn); static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id); -static int filt_soempty(struct knote *kn, long hint); + fo_kqfilter_t soo_kqfilter; +fo_kqfilter_t sol_kqfilter; static struct filterops solisten_filtops = { .f_isfd = 1, - .f_detach = filt_sordetach, + .f_detach = filt_soldetach, .f_event = filt_solisten, }; static struct filterops soread_filtops = { @@ -187,6 +191,7 @@ so_gen_t so_gencnt; /* generation count for sockets */ +MALLOC_DEFINE(M_SOLISTEN, "solisten", "listening sockets"); MALLOC_DEFINE(M_SONAME, "soname", "socket name"); MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); @@ -439,7 +444,6 @@ { KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); - KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); mtx_lock(&so_global_mtx); so->so_gencnt = ++so_gencnt; @@ -456,9 +460,6 @@ if (so->so_snd.sb_hiwat) (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat, 0, RLIM_INFINITY); - /* remove accept filter if one is present. */ - if (so->so_accf != NULL) - do_setopt_accept_filter(so, NULL); #ifdef MAC mac_socket_destroy(so); #endif @@ -512,8 +513,6 @@ if (so == NULL) return (ENOBUFS); - TAILQ_INIT(&so->so_incomp); - TAILQ_INIT(&so->so_comp); so->so_type = type; so->so_cred = crhold(cred); if ((prp->pr_domain->dom_family == PF_INET) || @@ -563,18 +562,18 @@ * Note: the ref count on the socket is 0 on return. 
*/ struct socket * -sonewconn(struct socket *head, int connstatus) +sonewconn(struct solisten *sol, int connstatus) { static struct timeval lastover; static struct timeval overinterval = { 60, 0 }; static int overcount; struct socket *so; - int over; + u_int over; - ACCEPT_LOCK(); - over = (head->so_qlen > 3 * head->so_qlimit / 2); - ACCEPT_UNLOCK(); + SOLISTEN_LOCK(sol); + over = (sol->sol_qlen > 3 * sol->sol_qlimit / 2); + SOLISTEN_UNLOCK(sol); #ifdef REGRESSION if (regression_sonewconn_earlytest && over) { #else @@ -586,76 +585,64 @@ log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: " "%i already in queue awaiting acceptance " "(%d occurrences)\n", - __func__, head->so_pcb, head->so_qlen, overcount); + __func__, sol->sol_pcb, sol->sol_qlen, overcount); overcount = 0; } return (NULL); } - VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", - __func__, __LINE__, head)); - so = soalloc(head->so_vnet); + VNET_ASSERT(sol->sol_vnet != NULL, ("%s: sol %p vnet is NULL", + __func__, sol)); + so = soalloc(sol->sol_vnet); if (so == NULL) { log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " "limit reached or out of memory\n", - __func__, head->so_pcb); + __func__, sol->sol_pcb); return (NULL); } - if ((head->so_options & SO_ACCEPTFILTER) != 0) + if (sol->sol_accept_filter != NULL) connstatus = 0; - so->so_head = head; - so->so_type = head->so_type; - so->so_options = head->so_options &~ SO_ACCEPTCONN; - so->so_linger = head->so_linger; - so->so_state = head->so_state | SS_NOFDREF; - so->so_fibnum = head->so_fibnum; - so->so_proto = head->so_proto; - so->so_cred = crhold(head->so_cred); + so->so_listen = sol; + so->so_type = sol->sol_type; + so->so_options = sol->sol_options; + so->so_linger = sol->sol_linger; + so->so_state = sol->sol_state | SS_NOFDREF; + so->so_fibnum = sol->sol_fibnum; + so->so_proto = sol->sol_proto; + so->so_cred = crhold(sol->sol_cred); #ifdef MAC - mac_socket_newconn(head, so); + QQQ + mac_socket_newconn(sol, so); #endif 
knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv)); knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd)); - VNET_SO_ASSERT(head); - if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { + VNET_SO_ASSERT(sol); + if (soreserve(so, sol->sol_sbsnd_hiwat, sol->sol_sbrcv_hiwat)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", - __func__, head->so_pcb); + __func__, sol->sol_pcb); return (NULL); } if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { sodealloc(so); log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", - __func__, head->so_pcb); + __func__, sol->sol_pcb); return (NULL); } - so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; - so->so_snd.sb_lowat = head->so_snd.sb_lowat; - so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; - so->so_snd.sb_timeo = head->so_snd.sb_timeo; - so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; - so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; + so->so_rcv.sb_lowat = sol->sol_sbrcv_lowat; + so->so_snd.sb_lowat = sol->sol_sbsnd_lowat; + so->so_rcv.sb_timeo = sol->sol_sbrcv_timeo; + so->so_snd.sb_timeo = sol->sol_sbsnd_timeo; + so->so_rcv.sb_flags |= sol->sol_sbrcv_flags & SB_AUTOSIZE; + so->so_snd.sb_flags |= sol->sol_sbsnd_flags & SB_AUTOSIZE; so->so_state |= connstatus; - ACCEPT_LOCK(); - /* - * The accept socket may be tearing down but we just - * won a race on the ACCEPT_LOCK. - * However, if sctp_peeloff() is called on a 1-to-many - * style socket, the SO_ACCEPTCONN doesn't need to be set. - */ - if (!(head->so_options & SO_ACCEPTCONN) && - ((head->so_proto->pr_protocol != IPPROTO_SCTP) || - (head->so_type != SOCK_SEQPACKET))) { - SOCK_LOCK(so); - so->so_head = NULL; - sofree(so); /* NB: returns ACCEPT_UNLOCK'ed. 
*/ - return (NULL); - } + + SOLISTEN_LOCK(sol); if (connstatus) { - TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); + TAILQ_INSERT_TAIL(&sol->sol_comp, so, so_list); so->so_qstate |= SQ_COMP; - head->so_qlen++; + sol->sol_qlen++; } else { /* * Keep removing sockets from the head until there's room for @@ -664,29 +651,90 @@ * threads and soabort() requires dropping locks, we must * loop waiting for the condition to be true. */ - while (head->so_incqlen > head->so_qlimit) { + while (sol->sol_incqlen > sol->sol_qlimit) { struct socket *sp; - sp = TAILQ_FIRST(&head->so_incomp); - TAILQ_REMOVE(&head->so_incomp, sp, so_list); - head->so_incqlen--; + sp = TAILQ_FIRST(&sol->sol_incomp); + TAILQ_REMOVE(&sol->sol_incomp, sp, so_list); + sol->sol_incqlen--; sp->so_qstate &= ~SQ_INCOMP; - sp->so_head = NULL; - ACCEPT_UNLOCK(); + sp->so_listen = NULL; + SOLISTEN_UNLOCK(sol); soabort(sp); - ACCEPT_LOCK(); + SOLISTEN_LOCK(sol); } - TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); + TAILQ_INSERT_TAIL(&sol->sol_incomp, so, so_list); so->so_qstate |= SQ_INCOMP; - head->so_incqlen++; + sol->sol_incqlen++; } - ACCEPT_UNLOCK(); + SOLISTEN_UNLOCK(sol); if (connstatus) { - sorwakeup(head); - wakeup_one(&head->so_timeo); + /* QQQ sorwakeup(head); */ + selwakeuppri(&sol->sol_selinfo, PSOCK); + KNOTE_LOCKED(&sol->sol_selinfo.si_note, 0); + wakeup_one(&sol->sol_comp); } return (so); } +#ifdef SCTP +/* + * Socket part of sctp_peeloff(). Detach a new socket from an + * association. The new socket is returned with a reference. 
+ */ +struct socket * +sopeeloff(struct socket *head) +{ + struct socket *so; + + VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p", + __func__, __LINE__, head)); + so = soalloc(head->so_vnet); + if (so == NULL) { + log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: " + "limit reached or out of memory\n", + __func__, head->so_pcb); + return (NULL); + } + so->so_type = head->so_type; + so->so_options = head->so_options; + so->so_linger = head->so_linger; + so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED; + so->so_fibnum = head->so_fibnum; + so->so_proto = head->so_proto; + so->so_cred = crhold(head->so_cred); +#ifdef MAC + mac_socket_newconn(head, so); +#endif + knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv)); + knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd)); + VNET_SO_ASSERT(head); + if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) { + sodealloc(so); + log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n", + __func__, head->so_pcb); + return (NULL); + } + if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { + sodealloc(so); + log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n", + __func__, head->so_pcb); + return (NULL); + } + so->so_rcv.sb_lowat = head->so_rcv.sb_lowat; + so->so_snd.sb_lowat = head->so_snd.sb_lowat; + so->so_rcv.sb_timeo = head->so_rcv.sb_timeo; + so->so_snd.sb_timeo = head->so_snd.sb_timeo; + so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE; + so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE; + + SOCK_LOCK(so); + soref(so); + SOCK_UNLOCK(so); + + return (so); +} +#endif /* SCTP */ + int sobind(struct socket *so, struct sockaddr *nam, struct thread *td) { @@ -711,49 +759,128 @@ /* * solisten() transitions a socket from a non-listening state to a listening - * state, but can also be used to update the listen queue depth on an - * existing listen socket. 
The protocol will call back into the sockets - * layer using solisten_proto_check() and solisten_proto() to check and set - * socket-layer listen state. Call backs are used so that the protocol can - * acquire both protocol and socket layer locks in whatever order is required - * by the protocol. - * - * Protocol implementors are advised to hold the socket lock across the - * socket-layer test and set to avoid races at the socket layer. + * state. */ int -solisten(struct socket *so, int backlog, struct thread *td) +solisten(struct socket *so, int backlog, struct thread *td, struct file *fp) { + struct solisten *sol; int error; CURVNET_SET(so->so_vnet); - error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td); - CURVNET_RESTORE(); - return (error); -} -int -solisten_proto_check(struct socket *so) -{ - - SOCK_LOCK_ASSERT(so); + sol = malloc(sizeof(struct solisten), M_SOLISTEN, M_WAITOK | M_ZERO); + SOCK_LOCK(so); if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING | - SS_ISDISCONNECTING)) + SS_ISDISCONNECTING)) { + SOCK_UNLOCK(so); + free(sol, M_SOLISTEN); return (EINVAL); + } + /* + * XXXGL: we are going to drop socket lock, to avoid LOR + * against protocol locks, thus to block any other syscalls + * on the socket we mark it SS_ISDISCONNECTING. + * If that doesn't work well, we would need a separate flag + * to mark the fact that socket is now under listen(2) + * transmutation. 
+ */ + so->so_state |= SS_ISDISCONNECTING; + SOCK_UNLOCK(so); + + sol->sol_type = so->so_type; + sol->sol_options = so->so_options; + sol->sol_linger = so->so_linger; + sol->sol_state = so->so_state & ~SS_ISDISCONNECTING; + sol->sol_fibnum = so->so_fibnum; + sol->sol_sbrcv_lowat = so->so_rcv.sb_lowat; + sol->sol_sbsnd_lowat = so->so_snd.sb_lowat; + sol->sol_sbrcv_hiwat = so->so_rcv.sb_hiwat; + sol->sol_sbsnd_hiwat = so->so_snd.sb_hiwat; + sol->sol_sbrcv_flags = so->so_rcv.sb_flags; + sol->sol_sbsnd_flags = so->so_snd.sb_flags; + sol->sol_sbrcv_timeo = so->so_rcv.sb_timeo; + sol->sol_sbsnd_timeo = so->so_snd.sb_timeo; + sol->sol_proto = so->so_proto; + sol->sol_vnet = so->so_vnet; + sol->sol_cred = so->so_cred; + if (so->so_sigio != NULL) + fsetown(fgetown(&so->so_sigio), &sol->sol_sigio); + else + sol->sol_sigio = NULL; + sol->sol_pcb = so->so_pcb; + sol->sol_accept_filter = NULL; + sol->sol_accept_filter_arg = NULL; + sol->sol_accept_filter_str = NULL; + sol->sol_error = 0; + + mtx_init(&sol->sol_mutex, "solisten", NULL, MTX_DEF); + knlist_init_mtx(&sol->sol_selinfo.si_note, &sol->sol_mutex); + sol->sol_qlen = sol->sol_incqlen = 0; + if (backlog < 0 || backlog > somaxconn) + backlog = somaxconn; + sol->sol_qlimit = backlog; + TAILQ_INIT(&sol->sol_incomp); + TAILQ_INIT(&sol->sol_comp); + + error = (*so->so_proto->pr_usrreqs->pru_listen)(so, sol, backlog, td); + if (error) { + SOCK_LOCK(so); + so->so_state &= ~SS_ISDISCONNECTING; + SOCK_UNLOCK(so); + funsetown(&sol->sol_sigio); + knlist_destroy(&sol->sol_selinfo.si_note); + mtx_destroy(&sol->sol_mutex); + free(sol, M_SOLISTEN); + return (error); + } + + finit(fp, fp->f_flag, DTYPE_SOLISTEN, sol, &solistenops); + + SOCK_LOCK(so); + KASSERT((so->so_state & SS_NOFDREF) == 0, ("%s: NOFDREF", __func__)); + funsetown(&so->so_sigio); + so->so_state |= SS_NOFDREF; + so->so_pcb = NULL; + sorele(so); + + CURVNET_RESTORE(); + return (0); } -void -solisten_proto(struct socket *so, int backlog) +/* + * sollisten() implements 
listen(2) on already listening socket. For most + * protocols it just updates backlog length, but for SCTP it unlistens the + * socket. + */ +int +sollisten(struct solisten *sol, int backlog, struct thread *td, struct file *fp) { + int error; - SOCK_LOCK_ASSERT(so); + error = (*sol->sol_proto->pr_usrreqs->pru_listen)(NULL, + sol, backlog, td); + + /* + * If the protocol decides to do something more complex than backlog + * update, it does all the work itself and returns ENOTSOCK. For now + * this is case only for SCTP. + */ + if (error == ENOTSOCK) + return (0); + + if (error) + return (error); if (backlog < 0 || backlog > somaxconn) backlog = somaxconn; - so->so_qlimit = backlog; - so->so_options |= SO_ACCEPTCONN; + SOLISTEN_LOCK(sol); + sol->sol_qlimit = backlog; + SOLISTEN_UNLOCK(sol); + + return (0); } /* @@ -780,50 +907,52 @@ sofree(struct socket *so) { struct protosw *pr = so->so_proto; - struct socket *head; + struct solisten *sol; - ACCEPT_LOCK_ASSERT(); SOCK_LOCK_ASSERT(so); if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 || (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) { SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); return; } - head = so->so_head; - if (head != NULL) { + sol = so->so_listen; + if (sol != NULL) { KASSERT((so->so_qstate & SQ_COMP) != 0 || (so->so_qstate & SQ_INCOMP) != 0, - ("sofree: so_head != NULL, but neither SQ_COMP nor " + ("sofree: so_listen != NULL, but neither SQ_COMP nor " "SQ_INCOMP")); KASSERT((so->so_qstate & SQ_COMP) == 0 || (so->so_qstate & SQ_INCOMP) == 0, ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP")); - TAILQ_REMOVE(&head->so_incomp, so, so_list); - head->so_incqlen--; + SOLISTEN_LOCK(sol); + TAILQ_REMOVE(&sol->sol_incomp, so, so_list); + sol->sol_incqlen--; + SOLISTEN_UNLOCK(sol); so->so_qstate &= ~SQ_INCOMP; - so->so_head = NULL; + so->so_listen = NULL; } KASSERT((so->so_qstate & SQ_COMP) == 0 && (so->so_qstate & SQ_INCOMP) == 0, - ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)", 
- so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP)); - if (so->so_options & SO_ACCEPTCONN) { - KASSERT((TAILQ_EMPTY(&so->so_comp)), - ("sofree: so_comp populated")); - KASSERT((TAILQ_EMPTY(&so->so_incomp)), - ("sofree: so_incomp populated")); - } + ("%s: so_listen == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)", + __func__, so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP)); SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); VNET_SO_ASSERT(so); - if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL) - (*pr->pr_domain->dom_dispose)(so); - if (pr->pr_usrreqs->pru_detach != NULL) - (*pr->pr_usrreqs->pru_detach)(so); + + /* + * If socket has pcb pointer cleared, this means we are + * called from solisten() and pcb shall not be disposed + * and detached. + */ + if (so->so_pcb != NULL) { + if (pr->pr_flags & PR_RIGHTS && + pr->pr_domain->dom_dispose != NULL) + (*pr->pr_domain->dom_dispose)(so); + if (pr->pr_usrreqs->pru_detach != NULL) + (*pr->pr_usrreqs->pru_detach)(so); + } /* * From this point on, we assume that no other references to this @@ -891,45 +1020,60 @@ drop: if (so->so_proto->pr_usrreqs->pru_close != NULL) (*so->so_proto->pr_usrreqs->pru_close)(so); - ACCEPT_LOCK(); - if (so->so_options & SO_ACCEPTCONN) { - struct socket *sp; - /* - * Prevent new additions to the accept queues due - * to ACCEPT_LOCK races while we are draining them. 
- */ - so->so_options &= ~SO_ACCEPTCONN; - while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) { - TAILQ_REMOVE(&so->so_incomp, sp, so_list); - so->so_incqlen--; - sp->so_qstate &= ~SQ_INCOMP; - sp->so_head = NULL; - ACCEPT_UNLOCK(); - soabort(sp); - ACCEPT_LOCK(); - } - while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) { - TAILQ_REMOVE(&so->so_comp, sp, so_list); - so->so_qlen--; - sp->so_qstate &= ~SQ_COMP; - sp->so_head = NULL; - ACCEPT_UNLOCK(); - soabort(sp); - ACCEPT_LOCK(); - } - KASSERT((TAILQ_EMPTY(&so->so_comp)), - ("%s: so_comp populated", __func__)); - KASSERT((TAILQ_EMPTY(&so->so_incomp)), - ("%s: so_incomp populated", __func__)); - } SOCK_LOCK(so); KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF")); so->so_state |= SS_NOFDREF; - sorele(so); /* NB: Returns with ACCEPT_UNLOCK(). */ + sorele(so); CURVNET_RESTORE(); return (error); } +int +solistenclose(struct solisten *sol) +{ + struct accept_queue comp, incomp; + struct socket *so; + + KASSERT((sol->sol_state & SS_NOFDREF) == 0, ("%s: NOFDREF", __func__)); + + CURVNET_SET(sol->sol_vnet); + funsetown(&sol->sol_sigio); + + if (sol->sol_accept_filter != NULL) + accept_filt_setopt(sol, NULL); + + (*sol->sol_proto->pr_usrreqs->pru_listenclose)(sol); + + TAILQ_INIT(&comp); + TAILQ_INIT(&incomp); + SOLISTEN_LOCK(sol); + TAILQ_SWAP(&comp, &sol->sol_comp, socket, so_list); + TAILQ_SWAP(&incomp, &sol->sol_incomp, socket, so_list); + sol->sol_incqlen = 0; + sol->sol_qlen = 0; + SOLISTEN_UNLOCK(sol); + while ((so = TAILQ_FIRST(&incomp)) != NULL) { + TAILQ_REMOVE(&incomp, so, so_list); + so->so_qstate &= ~SQ_INCOMP; + so->so_listen = NULL; + soabort(so); + } + while ((so = TAILQ_FIRST(&comp)) != NULL) { + TAILQ_REMOVE(&comp, so, so_list); + so->so_qstate &= ~SQ_COMP; + so->so_listen = NULL; + soabort(so); + } + seldrain(&sol->sol_selinfo); + knlist_destroy(&sol->sol_selinfo.si_note); + sol->sol_state |= SS_NOFDREF; + mtx_destroy(&sol->sol_mutex); + free(sol, M_SOLISTEN); + CURVNET_RESTORE(); + + return 
(0); +} + /* * soabort() is used to abruptly tear down a connection, such as when a * resource limit is reached (listen queue depth exceeded), or if a listen @@ -963,7 +1107,6 @@ if (so->so_proto->pr_usrreqs->pru_abort != NULL) (*so->so_proto->pr_usrreqs->pru_abort)(so); - ACCEPT_LOCK(); SOCK_LOCK(so); sofree(so); } @@ -996,9 +1139,6 @@ { int error; - if (so->so_options & SO_ACCEPTCONN) - return (EOPNOTSUPP); - CURVNET_SET(so->so_vnet); /* * If protocol is connection-based, can only connect once. @@ -2509,19 +2649,13 @@ error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) { - error = (*so->so_proto->pr_ctloutput)(so, sopt); + error = (*so->so_proto->pr_ctloutput)(so->so_pcb, sopt); CURVNET_RESTORE(); return (error); } error = ENOPROTOOPT; } else { switch (sopt->sopt_name) { - case SO_ACCEPTFILTER: - error = do_setopt_accept_filter(so, sopt); - if (error) - goto bad; - break; - case SO_LINGER: error = sooptcopyin(sopt, &l, sizeof l, sizeof l); if (error) @@ -2716,7 +2850,7 @@ break; } if (error == 0 && so->so_proto->pr_ctloutput != NULL) - (void)(*so->so_proto->pr_ctloutput)(so, sopt); + (void)(*so->so_proto->pr_ctloutput)(so->so_pcb, sopt); } bad: CURVNET_RESTORE(); @@ -2767,7 +2901,7 @@ error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto->pr_ctloutput != NULL) - error = (*so->so_proto->pr_ctloutput)(so, sopt); + error = (*so->so_proto->pr_ctloutput)(so->so_pcb, sopt); else error = ENOPROTOOPT; CURVNET_RESTORE(); @@ -2775,7 +2909,7 @@ } else { switch (sopt->sopt_name) { case SO_ACCEPTFILTER: - error = do_getopt_accept_filter(so, sopt); + error = EINVAL; break; case SO_LINGER: @@ -2794,7 +2928,6 @@ case SO_REUSEPORT: case SO_BROADCAST: case SO_OOBINLINE: - case SO_ACCEPTCONN: case SO_TIMESTAMP: case SO_BINTIME: case SO_NOSIGPIPE: @@ -2855,11 +2988,11 @@ error = sooptcopyin(sopt, &extmac, sizeof(extmac), sizeof(extmac)); if (error) - goto bad; + break; error = mac_getsockopt_label(sopt->sopt_td->td_ucred, so, 
&extmac); if (error) - goto bad; + break; error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; @@ -2875,23 +3008,20 @@ error = mac_getsockopt_peerlabel( sopt->sopt_td->td_ucred, so, &extmac); if (error) - goto bad; + break; error = sooptcopyout(sopt, &extmac, sizeof extmac); #else error = EOPNOTSUPP; #endif break; +#if 0 +QQQ case SO_ACCEPTCONN: +#endif case SO_LISTENQLIMIT: - optval = so->so_qlimit; - goto integer; - case SO_LISTENQLEN: - optval = so->so_qlen; - goto integer; - case SO_LISTENINCQLEN: - optval = so->so_incqlen; + optval = 0; goto integer; case SO_TS_CLOCK: @@ -2911,9 +3041,76 @@ break; } } -#ifdef MAC -bad: -#endif + CURVNET_RESTORE(); + return (error); +} + +int +solgetopt(struct solisten *sol, struct sockopt *sopt) +{ + int optval, error; + + CURVNET_SET(sol->sol_vnet); + if (sopt->sopt_level != SOL_SOCKET) { + if (sol->sol_proto->pr_ctloutput != NULL) + error = (*sol->sol_proto->pr_ctloutput)(sol->sol_pcb, + sopt); + else + error = ENOPROTOOPT; + CURVNET_RESTORE(); + return (error); + } + + switch (sopt->sopt_name) { + case SO_ACCEPTFILTER: + error = accept_filt_getopt(sol, sopt); + break; + + case SO_LISTENQLIMIT: + optval = sol->sol_qlimit; + goto integer; + + case SO_LISTENQLEN: + optval = sol->sol_qlen; + goto integer; + + case SO_LISTENINCQLEN: + optval = sol->sol_incqlen; +integer: + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + + default: + error = ENOPROTOOPT; + } + CURVNET_RESTORE(); + return (error); +} + +int +solsetopt(struct solisten *sol, struct sockopt *sopt) +{ + int error; + + CURVNET_SET(sol->sol_vnet); + if (sopt->sopt_level != SOL_SOCKET) { + if (sol->sol_proto->pr_ctloutput != NULL) + error = (*sol->sol_proto->pr_ctloutput)(sol->sol_pcb, + sopt); + else + error = ENOPROTOOPT; + CURVNET_RESTORE(); + return (error); + } + + switch (sopt->sopt_name) { + case SO_ACCEPTFILTER: + error = accept_filt_setopt(sol, sopt); + break; + + default: + error = ENOPROTOOPT; + } CURVNET_RESTORE(); 
return (error); } @@ -3100,6 +3297,25 @@ } int +solistenpoll(struct solisten *sol, int events, struct ucred *active_cred, + struct thread *td) +{ + + if (!(events & (POLLIN | POLLRDNORM))) + return (0); + + SOLISTEN_LOCK(sol); + if (!TAILQ_EMPTY(&sol->sol_comp)) { + SOLISTEN_UNLOCK(sol); + return (events & (POLLIN | POLLRDNORM)); + } else { + selrecord(td, &sol->sol_selinfo); + SOLISTEN_UNLOCK(sol); + return (0); + } +} + +int soo_kqfilter(struct file *fp, struct knote *kn) { struct socket *so = kn->kn_fp->f_data; @@ -3107,10 +3323,7 @@ switch (kn->kn_filter) { case EVFILT_READ: - if (so->so_options & SO_ACCEPTCONN) - kn->kn_fop = &solisten_filtops; - else - kn->kn_fop = &soread_filtops; + kn->kn_fop = &soread_filtops; sb = &so->so_rcv; break; case EVFILT_WRITE: @@ -3132,6 +3345,22 @@ return (0); } +int +sol_kqfilter(struct file *fp, struct knote *kn) +{ + struct solisten *sol = kn->kn_fp->f_data; + + if (kn->kn_filter != EVFILT_READ) + return (EINVAL); + + kn->kn_fop = &solisten_filtops; + SOLISTEN_LOCK(sol); + knlist_add(&sol->sol_selinfo.si_note, kn, 1); + SOLISTEN_UNLOCK(sol); + + return (0); +} + /* * Some routines that return EOPNOTSUPP for entry points that are not * supported by a protocol. Fill in as needed. 
@@ -3210,7 +3439,8 @@ } int -pru_listen_notsupp(struct socket *so, int backlog, struct thread *td) +pru_listen_notsupp(struct socket *so, struct solisten *sol, int backlog, + struct thread *td) { return EOPNOTSUPP; @@ -3314,6 +3544,16 @@ SOCKBUF_UNLOCK(&so->so_rcv); } +static void +filt_soldetach(struct knote *kn) +{ + struct solisten *sol = kn->kn_fp->f_data; + + SOLISTEN_LOCK(sol); + knlist_remove(&sol->sol_selinfo.si_note, kn, 1); + SOLISTEN_UNLOCK(sol); +} + /*ARGSUSED*/ static int filt_soread(struct knote *kn, long hint) @@ -3401,10 +3641,10 @@ static int filt_solisten(struct knote *kn, long hint) { - struct socket *so = kn->kn_fp->f_data; + struct solisten *sol = kn->kn_fp->f_data; - kn->kn_data = so->so_qlen; - return (!TAILQ_EMPTY(&so->so_comp)); + kn->kn_data = sol->sol_qlen; + return (!TAILQ_EMPTY(&sol->sol_comp)); } int @@ -3463,35 +3703,37 @@ void soisconnected(struct socket *so) { - struct socket *head; + struct solisten *sol; int ret; restart: - ACCEPT_LOCK(); + if ((sol = so->so_listen) != NULL) + SOLISTEN_LOCK(sol); SOCK_LOCK(so); so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); so->so_state |= SS_ISCONNECTED; - head = so->so_head; - if (head != NULL && (so->so_qstate & SQ_INCOMP)) { + if (sol != NULL && (so->so_qstate & SQ_INCOMP)) { if ((so->so_options & SO_ACCEPTFILTER) == 0) { SOCK_UNLOCK(so); - TAILQ_REMOVE(&head->so_incomp, so, so_list); - head->so_incqlen--; + TAILQ_REMOVE(&sol->sol_incomp, so, so_list); + sol->sol_incqlen--; so->so_qstate &= ~SQ_INCOMP; - TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); - head->so_qlen++; + TAILQ_INSERT_TAIL(&sol->sol_comp, so, so_list); + sol->sol_qlen++; so->so_qstate |= SQ_COMP; - ACCEPT_UNLOCK(); - sorwakeup(head); - wakeup_one(&head->so_timeo); + /* QQQ sorwakeup(sol); */ + selwakeuppri(&sol->sol_selinfo, PSOCK); + KNOTE_LOCKED(&sol->sol_selinfo.si_note, 0); + SOLISTEN_UNLOCK(sol); + wakeup_one(&sol->sol_comp); } else { - ACCEPT_UNLOCK(); + SOLISTEN_UNLOCK(sol); soupcall_set(so, 
SO_RCV, - head->so_accf->so_accept_filter->accf_callback, - head->so_accf->so_accept_filter_arg); + sol->sol_accept_filter->accf_callback, + sol->sol_accept_filter_arg); so->so_options &= ~SO_ACCEPTFILTER; - ret = head->so_accf->so_accept_filter->accf_callback(so, - head->so_accf->so_accept_filter_arg, M_NOWAIT); + ret = sol->sol_accept_filter->accf_callback(so, + sol->sol_accept_filter_arg, M_NOWAIT); if (ret == SU_ISCONNECTED) soupcall_clear(so, SO_RCV); SOCK_UNLOCK(so); @@ -3500,8 +3742,9 @@ } return; } + if (sol != NULL) + SOLISTEN_UNLOCK(sol); SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); wakeup(&so->so_timeo); sorwakeup(so); sowwakeup(so); @@ -3628,9 +3871,7 @@ xso->so_pcb = so->so_pcb; xso->xso_protocol = so->so_proto->pr_protocol; xso->xso_family = so->so_proto->pr_domain->dom_family; - xso->so_qlen = so->so_qlen; - xso->so_incqlen = so->so_incqlen; - xso->so_qlimit = so->so_qlimit; + xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; @@ -3640,20 +3881,33 @@ xso->so_uid = so->so_cred->cr_uid; } - /* - * Socket accessor functions to provide external consumers with - * a safe interface to socket state - * + * Create an external-format (``xsocket'') structure using the information in + * the listening socket structure pointed to by sol. 
*/ - void -so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), - void *arg) +soltoxsocket(struct solisten *sol, struct xsocket *xso) { - TAILQ_FOREACH(so, &so->so_comp, so_list) - func(so, arg); + xso->xso_len = sizeof *xso; + xso->xso_so = (struct socket *)sol; + xso->so_type = sol->sol_type; + xso->so_options = sol->sol_options | SO_ACCEPTCONN; + xso->so_linger = sol->sol_linger; + xso->so_state = sol->sol_state; + xso->so_pcb = sol->sol_pcb; + xso->xso_protocol = sol->sol_proto->pr_protocol; + xso->xso_family = sol->sol_proto->pr_domain->dom_family; + xso->so_qlen = sol->sol_qlen; + xso->so_incqlen = sol->sol_incqlen; + xso->so_qlimit = sol->sol_qlimit; + xso->so_timeo = 0; + xso->so_error = sol->sol_error; + xso->so_pgid = sol->sol_sigio ? sol->sol_sigio->sio_pgid : 0; + xso->so_oobmark = 0; + bzero(&xso->so_snd, sizeof(xso->so_snd)); + bzero(&xso->so_rcv, sizeof(xso->so_rcv)); + xso->so_uid = sol->sol_cred->cr_uid; } struct sockbuf * Index: sys/kern/uipc_syscalls.c =================================================================== --- sys/kern/uipc_syscalls.c +++ sys/kern/uipc_syscalls.c @@ -94,7 +94,7 @@ */ int getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp, - struct file **fpp, u_int *fflagp, struct filecaps *havecapsp) + struct file **fpp, u_int *fflagp, struct filecaps *havecapsp, short type) { struct file *fp; int error; @@ -102,7 +102,7 @@ error = fget_cap(td, fd, rightsp, &fp, havecapsp); if (error != 0) return (error); - if (fp->f_type != DTYPE_SOCKET) { + if (fp->f_type != type && type != 0) { fdrop(fp, td); if (havecapsp != NULL) filecaps_free(havecapsp); @@ -191,7 +191,7 @@ AUDIT_ARG_FD(fd); AUDIT_ARG_SOCKADDR(td, dirfd, sa); error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_BIND), - &fp, NULL, NULL); + &fp, NULL, NULL, DTYPE_SOCKET); if (error != 0) return (error); so = fp->f_data; @@ -232,23 +232,40 @@ sys_listen(struct thread *td, struct listen_args *uap) { struct socket *so; + struct solisten 
*sol; struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->s); error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_LISTEN), - &fp, NULL, NULL); - if (error == 0) { + &fp, NULL, NULL, 0); + if (error) + return (error); + + switch (fp->f_type) { + case DTYPE_SOCKET: so = fp->f_data; #ifdef MAC error = mac_socket_check_listen(td->td_ucred, so); if (error == 0) #endif - error = solisten(so, uap->backlog, td); - fdrop(fp, td); + error = solisten(so, uap->backlog, td, fp); + break; + case DTYPE_SOLISTEN: + sol = fp->f_data; +#ifdef MAC + QQQ + if (error == 0) +#endif + error = sollisten(sol, uap->backlog, td, fp); + break; + default: + error = ENOTSOCK; } - return(error); + fdrop(fp, td); + + return (error); } /* @@ -308,9 +325,10 @@ kern_accept4(struct thread *td, int s, struct sockaddr **name, socklen_t *namelen, int flags, struct file **fp) { - struct file *headfp, *nfp = NULL; + struct file *solfp, *nfp = NULL; struct sockaddr *sa = NULL; - struct socket *head, *so; + struct solisten *sol; + struct socket *so; struct filecaps fcaps; cap_rights_t rights; u_int fflag; @@ -322,16 +340,17 @@ AUDIT_ARG_FD(s); error = getsock_cap(td, s, cap_rights_init(&rights, CAP_ACCEPT), - &headfp, &fflag, &fcaps); + &solfp, &fflag, &fcaps, 0); if (error != 0) return (error); - head = headfp->f_data; - if ((head->so_options & SO_ACCEPTCONN) == 0) { - error = EINVAL; + if (solfp->f_type != DTYPE_SOLISTEN) { + error = solfp->f_type == DTYPE_SOCKET ? EINVAL : ENOTSOCK; goto done; } + sol = solfp->f_data; #ifdef MAC - error = mac_socket_check_accept(td->td_ucred, head); + QQQ + error = mac_socket_check_accept(td->td_ucred, sol); if (error != 0) goto done; #endif @@ -339,31 +358,27 @@ (flags & SOCK_CLOEXEC) ? 
O_CLOEXEC : 0, &fcaps); if (error != 0) goto done; - ACCEPT_LOCK(); - if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { - ACCEPT_UNLOCK(); + SOLISTEN_LOCK(sol); + if ((sol->sol_state & SS_NBIO) && TAILQ_EMPTY(&sol->sol_comp)) { + SOLISTEN_UNLOCK(sol); error = EWOULDBLOCK; goto noconnection; } - while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { - if (head->so_rcv.sb_state & SBS_CANTRCVMORE) { - head->so_error = ECONNABORTED; - break; - } - error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH, + while (TAILQ_EMPTY(&sol->sol_comp) && sol->sol_error == 0) { + error = msleep(&sol->sol_comp, &sol->sol_mutex, PSOCK | PCATCH, "accept", 0); if (error != 0) { - ACCEPT_UNLOCK(); + SOLISTEN_UNLOCK(sol); goto noconnection; } } - if (head->so_error) { - error = head->so_error; - head->so_error = 0; - ACCEPT_UNLOCK(); + if (sol->sol_error) { + error = sol->sol_error; + sol->sol_error = 0; + SOLISTEN_UNLOCK(sol); goto noconnection; } - so = TAILQ_FIRST(&head->so_comp); + so = TAILQ_FIRST(&sol->sol_comp); KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP")); KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP")); @@ -375,26 +390,26 @@ SOCK_LOCK(so); /* soref() and so_state update */ soref(so); /* file descriptor reference */ - TAILQ_REMOVE(&head->so_comp, so, so_list); - head->so_qlen--; + TAILQ_REMOVE(&sol->sol_comp, so, so_list); + sol->sol_qlen--; if (flags & ACCEPT4_INHERIT) - so->so_state |= (head->so_state & SS_NBIO); + so->so_state |= (sol->sol_state & SS_NBIO); else so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0; so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; + so->so_listen = NULL; SOCK_UNLOCK(so); - ACCEPT_UNLOCK(); + SOLISTEN_UNLOCK(sol); /* An extra reference on `nfp' has been held for us by falloc(). */ td->td_retval[0] = fd; - /* connection has been removed from the listen queue */ - KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0); + /* Connection has been removed from the listen queue. 
*/ + KNOTE_UNLOCKED(&sol->sol_selinfo.si_note, 0); if (flags & ACCEPT4_INHERIT) { - pgid = fgetown(&head->so_sigio); + pgid = fgetown(&sol->sol_sigio); if (pgid != 0) fsetown(pgid, &so->so_sigio); } else { @@ -456,7 +471,7 @@ } if (nfp != NULL) fdrop(nfp, td); - fdrop(headfp, td); + fdrop(solfp, td); return (error); } @@ -518,7 +533,7 @@ AUDIT_ARG_FD(fd); AUDIT_ARG_SOCKADDR(td, dirfd, sa); error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_CONNECT), - &fp, NULL, NULL); + &fp, NULL, NULL, DTYPE_SOCKET); if (error != 0) return (error); so = fp->f_data; @@ -761,7 +776,7 @@ AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name); cap_rights_set(&rights, CAP_CONNECT); } - error = getsock_cap(td, s, &rights, &fp, NULL, NULL); + error = getsock_cap(td, s, &rights, &fp, NULL, NULL, DTYPE_SOCKET); if (error != 0) { m_freem(control); return (error); @@ -937,7 +952,7 @@ AUDIT_ARG_FD(s); error = getsock_cap(td, s, cap_rights_init(&rights, CAP_RECV), - &fp, NULL, NULL); + &fp, NULL, NULL, DTYPE_SOCKET); if (error != 0) return (error); so = fp->f_data; @@ -1205,28 +1220,33 @@ int sys_shutdown(struct thread *td, struct shutdown_args *uap) { - struct socket *so; struct file *fp; cap_rights_t rights; int error; AUDIT_ARG_FD(uap->s); error = getsock_cap(td, uap->s, cap_rights_init(&rights, CAP_SHUTDOWN), - &fp, NULL, NULL); - if (error == 0) { - so = fp->f_data; - error = soshutdown(so, uap->how); - /* - * Previous versions did not return ENOTCONN, but 0 in - * case the socket was not connected. Some important - * programs like syslogd up to r279016, 2015-02-19, - * still depend on this behavior. 
- */ - if (error == ENOTCONN && - td->td_proc->p_osrel < P_OSREL_SHUTDOWN_ENOTCONN) - error = 0; - fdrop(fp, td); + &fp, NULL, NULL, 0); + if (error) + return (error); + switch (fp->f_type) { + case DTYPE_SOCKET: + error = soshutdown((struct socket *)fp->f_data, uap->how); + break; + case DTYPE_SOLISTEN: + error = ENOTCONN; + break; } + /* + * Previous versions did not return ENOTCONN, but 0 in + * case the socket was not connected. Some important + * programs like syslogd up to r279016, 2015-02-19, + * still depend on this behavior. + */ + if (error == ENOTCONN && + td->td_proc->p_osrel < P_OSREL_SHUTDOWN_ENOTCONN) + error = 0; + fdrop(fp, td); return (error); } @@ -1242,7 +1262,6 @@ kern_setsockopt(struct thread *td, int s, int level, int name, void *val, enum uio_seg valseg, socklen_t valsize) { - struct socket *so; struct file *fp; struct sockopt sopt; cap_rights_t rights; @@ -1271,13 +1290,21 @@ AUDIT_ARG_FD(s); error = getsock_cap(td, s, cap_rights_init(&rights, CAP_SETSOCKOPT), - &fp, NULL, NULL); - if (error == 0) { - so = fp->f_data; - error = sosetopt(so, &sopt); - fdrop(fp, td); + &fp, NULL, NULL, 0); + if (error) + return (error); + switch (fp->f_type) { + case DTYPE_SOCKET: + error = sosetopt((struct socket *)fp->f_data, &sopt); + break; + case DTYPE_SOLISTEN: + error = solsetopt((struct solisten *)fp->f_data, &sopt); + break; + default: + error = ENOTSOCK; } - return(error); + fdrop(fp, td); + return (error); } int @@ -1308,7 +1335,6 @@ kern_getsockopt(struct thread *td, int s, int level, int name, void *val, enum uio_seg valseg, socklen_t *valsize) { - struct socket *so; struct file *fp; struct sockopt sopt; cap_rights_t rights; @@ -1337,13 +1363,23 @@ AUDIT_ARG_FD(s); error = getsock_cap(td, s, cap_rights_init(&rights, CAP_GETSOCKOPT), - &fp, NULL, NULL); - if (error == 0) { - so = fp->f_data; - error = sogetopt(so, &sopt); + &fp, NULL, NULL, 0); + if (error) + return (error); + switch (fp->f_type) { + case DTYPE_SOCKET: + error = sogetopt((struct 
socket *)fp->f_data, &sopt); *valsize = sopt.sopt_valsize; - fdrop(fp, td); + break; + case DTYPE_SOLISTEN: + error = solgetopt((struct solisten *)fp->f_data, &sopt); + *valsize = sopt.sopt_valsize; + break; + break; + default: + error = ENOTSOCK; } + fdrop(fp, td); return (error); } @@ -1390,7 +1426,7 @@ AUDIT_ARG_FD(fd); error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETSOCKNAME), - &fp, NULL, NULL); + &fp, NULL, NULL, DTYPE_SOCKET); if (error != 0) return (error); so = fp->f_data; @@ -1477,7 +1513,7 @@ AUDIT_ARG_FD(fd); error = getsock_cap(td, fd, cap_rights_init(&rights, CAP_GETPEERNAME), - &fp, NULL, NULL); + &fp, NULL, NULL, DTYPE_SOCKET); if (error != 0) return (error); so = fp->f_data; Index: sys/kern/uipc_usrreq.c =================================================================== --- sys/kern/uipc_usrreq.c +++ sys/kern/uipc_usrreq.c @@ -270,7 +270,7 @@ #define UNP_PCB_LOCK_ASSERT(unp) mtx_assert(&(unp)->unp_mtx, MA_OWNED) static int uipc_connect2(struct socket *, struct socket *); -static int uipc_ctloutput(struct socket *, struct sockopt *); +static int uipc_ctloutput(void *, struct sockopt *); static int unp_connect(struct socket *, struct sockaddr *, struct thread *); static int unp_connectat(int, struct socket *, struct sockaddr *, @@ -284,6 +284,7 @@ static void unp_gc(__unused void *, int); static void unp_scan(struct mbuf *, void (*)(struct filedescent **, int)); static void unp_discard(struct file *); +static void unp_free(struct unpcb *); static void unp_freerights(struct filedescent **, int); static void unp_init(void); static int unp_internalize(struct mbuf **, struct thread *); @@ -430,7 +431,7 @@ unp->unp_socket = so; so->so_pcb = unp; unp->unp_refcount = 1; - if (so->so_head != NULL) + if (so->so_listen != NULL) unp->unp_flags |= UNP_NASCENT; UNP_LIST_LOCK(); @@ -552,7 +553,7 @@ UNP_LINK_WLOCK(); UNP_PCB_LOCK(unp); - VOP_UNP_BIND(vp, unp->unp_socket); + VOP_UNP_BIND(vp, unp); unp->unp_vnode = vp; unp->unp_addr = soun; 
unp->unp_flags &= ~UNP_BINDING; @@ -646,14 +647,35 @@ static void uipc_detach(struct socket *so) { - struct unpcb *unp, *unp2; + struct unpcb *unp; + + unp = sotounpcb(so); + KASSERT(unp != NULL, ("%s: unp == NULL", __func__)); + + unp_free(unp); +} + +static void +uipc_listenclose(struct solisten *sol) +{ + struct unpcb *unp; + + unp = soltounpcb(sol); + KASSERT(unp != NULL, ("%s: unp == NULL", __func__)); + KASSERT(unp->unp_conn == NULL && unp->unp_flags & UNP_LISTENING, + ("%s: bad unp %p", __func__, unp)); + + unp_free(unp); +} + +static void +unp_free(struct unpcb *unp) +{ + struct unpcb *unp2; struct sockaddr_un *saved_unp_addr; struct vnode *vp; int freeunp, local_unp_rights; - unp = sotounpcb(so); - KASSERT(unp != NULL, ("uipc_detach: unp == NULL")); - vp = NULL; local_unp_rights = 0; @@ -698,7 +720,10 @@ local_unp_rights = unp_rights; UNP_LINK_WUNLOCK(); teardown: - unp->unp_socket->so_pcb = NULL; + if (unp->unp_flags & UNP_LISTENING) + unp->unp_listen->sol_pcb = NULL; + else + unp->unp_socket->so_pcb = NULL; saved_unp_addr = unp->unp_addr; unp->unp_addr = NULL; unp->unp_refcount--; @@ -738,11 +763,15 @@ } static int -uipc_listen(struct socket *so, int backlog, struct thread *td) +uipc_listen(struct socket *so, struct solisten *sol, int backlog, + struct thread *td) { struct unpcb *unp; int error; + if (so == NULL) + return (0); + if (so->so_type != SOCK_STREAM && so->so_type != SOCK_SEQPACKET) return (EOPNOTSUPP); @@ -757,16 +786,12 @@ return (error); } - SOCK_LOCK(so); - error = solisten_proto_check(so); - if (error == 0) { - cru2x(td->td_ucred, &unp->unp_peercred); - unp->unp_flags |= UNP_HAVEPCCACHED; - solisten_proto(so, backlog); - } - SOCK_UNLOCK(so); + cru2x(td->td_ucred, &unp->unp_peercred); + unp->unp_flags |= UNP_LISTENING; + unp->unp_listen = sol; + VOP_UNP_BIND(unp->unp_vnode, unp); UNP_PCB_UNLOCK(unp); - return (error); + return (0); } static int @@ -1157,6 +1182,7 @@ .pru_detach = uipc_detach, .pru_disconnect = uipc_disconnect, .pru_listen = 
uipc_listen, + .pru_listenclose = uipc_listenclose, .pru_peeraddr = uipc_peeraddr, .pru_rcvd = uipc_rcvd, .pru_send = uipc_send, @@ -1179,6 +1205,7 @@ .pru_detach = uipc_detach, .pru_disconnect = uipc_disconnect, .pru_listen = uipc_listen, + .pru_listenclose = uipc_listenclose, .pru_peeraddr = uipc_peeraddr, .pru_rcvd = uipc_rcvd, .pru_send = uipc_send, @@ -1201,6 +1228,7 @@ .pru_detach = uipc_detach, .pru_disconnect = uipc_disconnect, .pru_listen = uipc_listen, + .pru_listenclose = uipc_listenclose, .pru_peeraddr = uipc_peeraddr, .pru_rcvd = uipc_rcvd, .pru_send = uipc_send, @@ -1213,17 +1241,20 @@ }; static int -uipc_ctloutput(struct socket *so, struct sockopt *sopt) +uipc_ctloutput(void *xpcb, struct sockopt *sopt) { - struct unpcb *unp; + struct unpcb *unp = (struct unpcb *)xpcb; struct xucred xu; int error, optval; + short type; if (sopt->sopt_level != 0) return (EINVAL); - unp = sotounpcb(so); - KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL")); + if (unp->unp_flags & UNP_LISTENING) + type = unp->unp_listen->sol_type; + else + type = unp->unp_socket->so_type; error = 0; switch (sopt->sopt_dir) { case SOPT_GET: @@ -1233,7 +1264,7 @@ if (unp->unp_flags & UNP_HAVEPC) xu = unp->unp_peercred; else { - if (so->so_type == SOCK_STREAM) + if (type == SOCK_STREAM) error = ENOTCONN; else error = EINVAL; @@ -1319,7 +1350,7 @@ { struct sockaddr_un *soun = (struct sockaddr_un *)nam; struct vnode *vp; - struct socket *so2, *so3; + struct socket *so2; struct unpcb *unp, *unp2, *unp3; struct nameidata nd; char buf[SOCK_MAXADDRLEN]; @@ -1386,31 +1417,32 @@ * and to protect simultaneous locking of multiple pcbs. 
*/ UNP_LINK_WLOCK(); - VOP_UNP_CONNECT(vp, &so2); - if (so2 == NULL) { + VOP_UNP_CONNECT(vp, &unp2); + if (unp2 == NULL) { error = ECONNREFUSED; goto bad2; } - if (so->so_type != so2->so_type) { - error = EPROTOTYPE; - goto bad2; - } + UNP_PCB_LOCK(unp); + UNP_PCB_LOCK(unp2); if (so->so_proto->pr_flags & PR_CONNREQUIRED) { - if (so2->so_options & SO_ACCEPTCONN) { - CURVNET_SET(so2->so_vnet); - so3 = sonewconn(so2, 0); + if (unp2->unp_flags & UNP_LISTENING) { + struct solisten *sol; + + sol = unp2->unp_listen; + if (so->so_type != sol->sol_type) { + error = EPROTOTYPE; + goto bad3; + } + CURVNET_SET(sol->sol_vnet); + so2 = sonewconn(sol, 0); CURVNET_RESTORE(); } else - so3 = NULL; - if (so3 == NULL) { + so2 = NULL; + if (so2 == NULL) { error = ECONNREFUSED; - goto bad2; + goto bad3; } - unp = sotounpcb(so); - unp2 = sotounpcb(so2); - unp3 = sotounpcb(so3); - UNP_PCB_LOCK(unp); - UNP_PCB_LOCK(unp2); + unp3 = sotounpcb(so2); UNP_PCB_LOCK(unp3); if (unp2->unp_addr != NULL) { bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len); @@ -1431,30 +1463,34 @@ * listen(); uipc_listen() cached that process's credentials * at that time so we can use them now. 
*/ - KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED, - ("unp_connect: listener without cached peercred")); memcpy(&unp->unp_peercred, &unp2->unp_peercred, sizeof(unp->unp_peercred)); unp->unp_flags |= UNP_HAVEPC; if (unp2->unp_flags & UNP_WANTCRED) unp3->unp_flags |= UNP_WANTCRED; - UNP_PCB_UNLOCK(unp3); UNP_PCB_UNLOCK(unp2); - UNP_PCB_UNLOCK(unp); + unp2 = unp3; #ifdef MAC - mac_socketpeer_set_from_socket(so, so3); - mac_socketpeer_set_from_socket(so3, so); + mac_socketpeer_set_from_socket(so, so2); + mac_socketpeer_set_from_socket(so2, so); #endif - - so2 = so3; + } else { /* !UNP_LISTENING */ + so2 = unp2->unp_socket; + if (so2->so_type != so->so_type) { + error = EPROTOTYPE; + goto bad3; + } + if (so2->so_proto->pr_flags & PR_CONNREQUIRED) { + error = ECONNREFUSED; + goto bad3; + } } - unp = sotounpcb(so); - KASSERT(unp != NULL, ("unp_connect: unp == NULL")); - unp2 = sotounpcb(so2); - KASSERT(unp2 != NULL, ("unp_connect: unp2 == NULL")); - UNP_PCB_LOCK(unp); - UNP_PCB_LOCK(unp2); + + KASSERT(unp2 != NULL && so2 != NULL && unp2->unp_socket == so2 && + sotounpcb(so2) == unp2, + ("%s: unp2 %p so2 %p", __func__, unp2, so2)); error = unp_connect2(so, so2, PRU_CONNECT); +bad3: UNP_PCB_UNLOCK(unp2); UNP_PCB_UNLOCK(unp); bad2: @@ -1618,8 +1654,13 @@ unp = LIST_NEXT(unp, unp_link)) { UNP_PCB_LOCK(unp); if (unp->unp_gencnt <= gencnt) { - if (cr_cansee(req->td->td_ucred, - unp->unp_socket->so_cred)) { + struct ucred *cr; + + if (unp->unp_flags & UNP_LISTENING) + cr = unp->unp_listen->sol_cred; + else + cr = unp->unp_socket->so_cred; + if (cr_cansee(req->td->td_ucred, cr)) { UNP_PCB_UNLOCK(unp); continue; } @@ -1653,7 +1694,10 @@ &xu->xu_caddr, unp->unp_conn->unp_addr->sun_len); bcopy(unp, &xu->xu_unp, sizeof *unp); - sotoxsocket(unp->unp_socket, &xu->xu_socket); + if (unp->unp_flags & UNP_LISTENING) + soltoxsocket(unp->unp_listen, &xu->xu_socket); + else + sotoxsocket(unp->unp_socket, &xu->xu_socket); UNP_PCB_UNLOCK(unp); error = SYSCTL_OUT(req, xu, sizeof *xu); } else { @@ 
-2237,8 +2281,8 @@ static void unp_gc_process(struct unpcb *unp) { - struct socket *soa; struct socket *so; + struct solisten *sol; struct file *fp; /* Already processed. */ @@ -2258,28 +2302,31 @@ return; } - /* - * Mark all sockets we reference with RIGHTS. - */ - so = unp->unp_socket; - if ((unp->unp_gcflag & UNPGC_IGNORE_RIGHTS) == 0) { - SOCKBUF_LOCK(&so->so_rcv); - unp_scan(so->so_rcv.sb_mb, unp_accessable); - SOCKBUF_UNLOCK(&so->so_rcv); - } - - /* - * Mark all sockets in our accept queue. - */ - ACCEPT_LOCK(); - TAILQ_FOREACH(soa, &so->so_comp, so_list) { - if ((sotounpcb(soa)->unp_gcflag & UNPGC_IGNORE_RIGHTS) != 0) - continue; - SOCKBUF_LOCK(&soa->so_rcv); - unp_scan(soa->so_rcv.sb_mb, unp_accessable); - SOCKBUF_UNLOCK(&soa->so_rcv); + if ((unp->unp_flags & UNP_LISTENING) == 0) { + /* + * Mark all sockets we reference with RIGHTS. + */ + so = unp->unp_socket; + if ((unp->unp_gcflag & UNPGC_IGNORE_RIGHTS) == 0) { + SOCKBUF_LOCK(&so->so_rcv); + unp_scan(so->so_rcv.sb_mb, unp_accessable); + SOCKBUF_UNLOCK(&so->so_rcv); + } + } else { + /* + * Mark all sockets in our accept queue. + */ + sol = unp->unp_listen; + SOLISTEN_LOCK(sol); + TAILQ_FOREACH(so, &sol->sol_comp, so_list) { + if (sotounpcb(so)->unp_gcflag & UNPGC_IGNORE_RIGHTS) + continue; + SOCKBUF_LOCK(&so->so_rcv); + unp_scan(so->so_rcv.sb_mb, unp_accessable); + SOCKBUF_UNLOCK(&so->so_rcv); + } + SOLISTEN_UNLOCK(sol); } - ACCEPT_UNLOCK(); unp->unp_gcflag |= UNPGC_SCANNED; } @@ -2454,7 +2501,6 @@ void vfs_unp_reclaim(struct vnode *vp) { - struct socket *so; struct unpcb *unp; int active; @@ -2464,10 +2510,7 @@ active = 0; UNP_LINK_WLOCK(); - VOP_UNP_CONNECT(vp, &so); - if (so == NULL) - goto done; - unp = sotounpcb(so); + VOP_UNP_CONNECT(vp, &unp); if (unp == NULL) goto done; UNP_PCB_LOCK(unp); @@ -2503,8 +2546,8 @@ db_printf("%sUNP_HAVEPC", comma ? ", " : ""); comma = 1; } - if (unp_flags & UNP_HAVEPCCACHED) { - db_printf("%sUNP_HAVEPCCACHED", comma ? 
", " : ""); + if (unp_flags & UNP_LISTENING) { + db_printf("%sUNP_LISTENING", comma ? ", " : ""); comma = 1; } if (unp_flags & UNP_WANTCRED) { Index: sys/kern/vfs_default.c =================================================================== --- sys/kern/vfs_default.c +++ sys/kern/vfs_default.c @@ -1124,7 +1124,7 @@ vop_stdunp_bind(struct vop_unp_bind_args *ap) { - ap->a_vp->v_socket = ap->a_socket; + ap->a_vp->v_unpcb = ap->a_unpcb; return (0); } @@ -1132,7 +1132,7 @@ vop_stdunp_connect(struct vop_unp_connect_args *ap) { - *ap->a_socket = ap->a_vp->v_socket; + *ap->a_unpcb = ap->a_vp->v_unpcb; return (0); } @@ -1140,7 +1140,7 @@ vop_stdunp_detach(struct vop_unp_detach_args *ap) { - ap->a_vp->v_socket = NULL; + ap->a_vp->v_unpcb = NULL; return (0); } Index: sys/kern/vfs_subr.c =================================================================== --- sys/kern/vfs_subr.c +++ sys/kern/vfs_subr.c @@ -2992,7 +2992,10 @@ /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */ vp->v_op = NULL; #endif - bzero(&vp->v_un, sizeof(vp->v_un)); + vp->v_mountedhere = NULL; + vp->v_unpcb = NULL; + vp->v_rdev = NULL; + vp->v_fifoinfo = NULL; vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; vp->v_iflag = 0; vp->v_vflag = 0; Index: sys/kern/vnode_if.src =================================================================== --- sys/kern/vnode_if.src +++ sys/kern/vnode_if.src @@ -662,7 +662,7 @@ vop_unp_bind { IN struct vnode *vp; - IN struct socket *socket; + IN struct unpcb *unpcb; }; @@ -670,7 +670,7 @@ vop_unp_connect { IN struct vnode *vp; - OUT struct socket **socket; + OUT struct unpcb **unpcb; }; Index: sys/netinet/in_pcb.h =================================================================== --- sys/netinet/in_pcb.h +++ sys/netinet/in_pcb.h @@ -192,7 +192,10 @@ struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */ struct inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */ LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/h) group wildcard entry */ - struct socket 
*inp_socket; /* (i) back pointer to socket */ + union { + struct socket *inp_socket; /* (i) back pointer to socket */ + struct solisten *inp_solisten; /* or solisten */ + }; struct ucred *inp_cred; /* (c) cache of socket cred */ u_int32_t inp_flow; /* (i) IPv6 flow information */ int inp_flags; /* (i) generic IP/datagram flags */ @@ -618,6 +621,7 @@ #define INP_RECVFLOWID 0x00000100 /* populate recv datagram with flow info */ #define INP_RECVRSSBUCKETID 0x00000200 /* populate recv datagram with bucket id */ #define INP_RATE_LIMIT_CHANGED 0x00000400 /* rate limit needs attention */ +#define INP_LISTENING 0x00000800 /* this is a listening socket */ /* * Flags passed to in_pcblookup*() functions. @@ -631,6 +635,7 @@ #define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb) #define sotoin6pcb(so) sotoinpcb(so) /* for KAME src sync over BSD*'s */ +#define soltoinpcb(sol) ((struct inpcb *)(sol)->sol_pcb) #define INP_SOCKAF(so) so->so_proto->pr_domain->dom_family Index: sys/netinet/ip_output.c =================================================================== --- sys/netinet/ip_output.c +++ sys/netinet/ip_output.c @@ -965,9 +965,10 @@ * IP socket option processing. 
*/ int -ip_ctloutput(struct socket *so, struct sockopt *sopt) +ip_ctloutput(void *xpcb, struct sockopt *sopt) { - struct inpcb *inp = sotoinpcb(so); + struct inpcb *inp = (struct inpcb *)xpcb; + struct socket *so = inp->inp_socket; int error, optval; #ifdef RSS uint32_t rss_bucket; Index: sys/netinet/ip_var.h =================================================================== --- sys/netinet/ip_var.h +++ sys/netinet/ip_var.h @@ -203,7 +203,7 @@ int inp_getmoptions(struct inpcb *, struct sockopt *); int inp_setmoptions(struct inpcb *, struct sockopt *); -int ip_ctloutput(struct socket *, struct sockopt *sopt); +int ip_ctloutput(void *, struct sockopt *sopt); void ip_drain(void); int ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu, u_long if_hwassist_flags); @@ -223,7 +223,7 @@ struct mbuf *); void ip_slowtimo(void); void ip_fillid(struct ip *); -int rip_ctloutput(struct socket *, struct sockopt *); +int rip_ctloutput(void *, struct sockopt *); void rip_ctlinput(int, struct sockaddr *, void *); void rip_init(void); int rip_input(struct mbuf **, int *, int); Index: sys/netinet/raw_ip.c =================================================================== --- sys/netinet/raw_ip.c +++ sys/netinet/raw_ip.c @@ -562,9 +562,10 @@ * XXX-BZ inp locking? 
*/ int -rip_ctloutput(struct socket *so, struct sockopt *sopt) +rip_ctloutput(void *xpcb, struct sockopt *sopt) { - struct inpcb *inp = sotoinpcb(so); + struct inpcb *inp = (struct inpcb *)xpcb; + struct socket *so = inp->inp_socket; int error, optval; if (sopt->sopt_level != IPPROTO_IP) { @@ -626,7 +627,7 @@ break; default: - error = ip_ctloutput(so, sopt); + error = ip_ctloutput(inp, sopt); break; } break; @@ -714,7 +715,7 @@ break; default: - error = ip_ctloutput(so, sopt); + error = ip_ctloutput(inp, sopt); break; } break; Index: sys/netinet/sctp_input.c =================================================================== --- sys/netinet/sctp_input.c +++ sys/netinet/sctp_input.c @@ -161,13 +161,13 @@ *abort_no_unlock = 1; goto outnow; } - /* We are only accepting if we have a socket with positive - * so_qlimit. */ + /* + * Check if the socket is accepting. + */ if ((stcb == NULL) && ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) || (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) || - (inp->sctp_socket == NULL) || - (inp->sctp_socket->so_qlimit == 0))) { + !(inp->sctp_flags & SCTP_PCB_FLAGS_ACCEPTING))) { /* * FIX ME ?? What about TCP model and we have a * match/restart case? Actually no fix is needed. 
the lookup @@ -1605,7 +1605,7 @@ sctp_stop_all_cookie_timers(stcb); if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) && - (inp->sctp_socket->so_qlimit == 0) + !(stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_ACCEPTING) ) { #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; @@ -1806,7 +1806,7 @@ if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) && - (inp->sctp_socket->so_qlimit == 0)) { + !(stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_ACCEPTING)) { #if defined(__APPLE__) || defined(SCTP_SO_LOCK_TESTING) struct socket *so; #endif @@ -2317,7 +2317,7 @@ *notification = SCTP_NOTIFY_ASSOC_UP; if (((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) || (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) && - (inp->sctp_socket->so_qlimit == 0)) { + !(stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_ACCEPTING)) { /* * This is an endpoint that called connect() how it got a * cookie that is NEW is a bit of a mystery. It must be that @@ -2343,7 +2343,7 @@ SCTP_SOCKET_UNLOCK(so, 1); #endif } else if ((stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) && - (inp->sctp_socket->so_qlimit)) { + (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_ACCEPTING)) { /* * We don't want to do anything with this one. Since it is * the listening guy. The timer will get started for @@ -2720,13 +2720,14 @@ sctp_start_net_timers(*stcb); if ((*inp_p)->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) { if (!had_a_existing_tcb || - (((*inp_p)->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) == 0)) { + ((*inp_p)->sctp_flags & SCTP_PCB_FLAGS_ACCEPTING)) { /* * If we have a NEW cookie or the connect never * reached the connected state during collision we * must do the TCP accept thing. 
*/ - struct socket *so, *oso; + struct solisten *sol; + struct socket *so; struct sctp_inpcb *inp; if (notification == SCTP_NOTIFY_ASSOC_RESTART) { @@ -2741,12 +2742,11 @@ } return (m); } - oso = (*inp_p)->sctp_socket; + sol = (*inp_p)->sctp_solisten; atomic_add_int(&(*stcb)->asoc.refcnt, 1); SCTP_TCB_UNLOCK((*stcb)); - CURVNET_SET(oso->so_vnet); - so = sonewconn(oso, 0 - ); + CURVNET_SET(sol->sol_vnet); + so = sonewconn(sol, 0); CURVNET_RESTORE(); SCTP_TCB_LOCK((*stcb)); atomic_subtract_int(&(*stcb)->asoc.refcnt, 1); Index: sys/netinet/sctp_pcb.h =================================================================== --- sys/netinet/sctp_pcb.h +++ sys/netinet/sctp_pcb.h @@ -385,8 +385,11 @@ */ struct sctp_laddr *next_addr_touse; - /* back pointer to our socket */ - struct socket *sctp_socket; + /* back pointer to our socket / listening socket */ + union { + struct socket *sctp_socket; + struct solisten *sctp_solisten; + }; uint64_t sctp_features; /* Feature flags */ uint32_t sctp_flags; /* INP state flag set */ uint32_t sctp_mobility_features; /* Mobility Feature flags */ Index: sys/netinet/sctp_syscalls.c =================================================================== --- sys/netinet/sctp_syscalls.c +++ sys/netinet/sctp_syscalls.c @@ -152,29 +152,11 @@ td->td_retval[0] = fd; CURVNET_SET(head->so_vnet); - so = sonewconn(head, SS_ISCONNECTED); + so = sopeeloff(head); if (so == NULL) { error = ENOMEM; goto noconnection; } - /* - * Before changing the flags on the socket, we have to bump the - * reference count. Otherwise, if the protocol calls sofree(), - * the socket will be released due to a zero refcount.
- */ - SOCK_LOCK(so); - soref(so); /* file descriptor reference */ - SOCK_UNLOCK(so); - - ACCEPT_LOCK(); - - TAILQ_REMOVE(&head->so_comp, so, so_list); - head->so_qlen--; - so->so_state |= (head->so_state & SS_NBIO); - so->so_state &= ~SS_NOFDREF; - so->so_qstate &= ~SQ_COMP; - so->so_head = NULL; - ACCEPT_UNLOCK(); finit(nfp, fflag, DTYPE_SOCKET, so, &socketops); error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name); if (error != 0) Index: sys/netinet/sctp_usrreq.c =================================================================== --- sys/netinet/sctp_usrreq.c +++ sys/netinet/sctp_usrreq.c @@ -6792,7 +6792,7 @@ else #endif #ifdef INET - error = ip_ctloutput(so, sopt); + error = ip_ctloutput(inp, sopt); #endif return (error); } @@ -6985,19 +6985,11 @@ #endif int -sctp_listen(struct socket *so, int backlog, struct thread *p) +sctp_listen(struct socket *so, struct solisten *sol, int backlog, + struct thread *p) { - /* - * Note this module depends on the protocol processing being called - * AFTER any socket level flags and backlog are applied to the - * socket. The traditional way that the socket flags are applied is - * AFTER protocol processing. We have made a change to the - * sys/kern/uipc_socket.c module to reverse this but this MUST be in - * place if the socket API for SCTP is to work properly. - */ - - int error = 0; struct sctp_inpcb *inp; + int error; inp = (struct sctp_inpcb *)so->so_pcb; if (inp == NULL) { @@ -7096,13 +7088,6 @@ sctp_log_lock(inp, (struct sctp_tcb *)NULL, SCTP_LOG_LOCK_SOCK); } #endif - SOCK_LOCK(so); - error = solisten_proto_check(so); - SOCK_UNLOCK(so); - if (error) { - SCTP_INP_RUNLOCK(inp); - return (error); - } if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) && (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { /* @@ -7132,19 +7117,17 @@ return (error); } } - SOCK_LOCK(so); - /* It appears for 7.0 and on, we must always call this. 
*/ - solisten_proto(so, backlog); - if (inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) { - /* remove the ACCEPTCONN flag for one-to-many sockets */ - so->so_options &= ~SO_ACCEPTCONN; - } - if (backlog == 0) { - /* turning off listen */ - so->so_options &= ~SO_ACCEPTCONN; - } - SOCK_UNLOCK(so); - return (error); + + /* + * If it is a one-to-many socket, or backlog is 0, we refuse + * transfer to solisten. + */ + if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) || backlog == 0) + return (ENOTSOCK); + + inp->sctp_flags |= SCTP_PCB_FLAGS_ACCEPTING; + inp->sctp_solisten = sol; + return (0); } static int sctp_defered_wakeup_cnt = 0; Index: sys/netinet/tcp_input.c =================================================================== --- sys/netinet/tcp_input.c +++ sys/netinet/tcp_input.c @@ -601,6 +601,7 @@ struct inpcb *inp = NULL; struct tcpcb *tp = NULL; struct socket *so = NULL; + struct solisten *sol = NULL; u_char *optp = NULL; int off0; int optlen = 0; @@ -633,6 +634,7 @@ u_char tcp_saveipgen[IP6_HDR_LEN]; struct tcphdr tcp_savetcp; short ostate = 0; + bool debug = false; #endif #ifdef INET6 @@ -940,7 +942,7 @@ if ((inp->inp_flowtype == M_HASHTYPE_NONE) && (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) && ((inp->inp_socket == NULL) || - (inp->inp_socket->so_options & SO_ACCEPTCONN) == 0)) { + (inp->inp_flags2 & INP_LISTENING) == 0)) { inp->inp_flowid = m->m_pkthdr.flowid; inp->inp_flowtype = M_HASHTYPE_GET(m); } @@ -1077,10 +1079,21 @@ if (mac_inpcb_check_deliver(inp, m)) goto dropunlock; #endif - so = inp->inp_socket; - KASSERT(so != NULL, ("%s: so == NULL", __func__)); + if (inp->inp_flags2 & INP_LISTENING) { + sol = inp->inp_solisten; +#ifdef TCPDEBUG + debug = sol->sol_options & SO_DEBUG; +#endif + } else { + so = inp->inp_socket; +#ifdef TCPDEBUG + debug = so->so_options & SO_DEBUG; +#endif + } + KASSERT(so != NULL || sol != NULL, + ("%s: inp %p has not socket", __func__, inp)); #ifdef TCPDEBUG - if (so->so_options & SO_DEBUG) { + if (debug) { ostate = tp->t_state; #ifdef 
INET6 if (isipv6) { @@ -1096,9 +1109,10 @@ * state) we look into the SYN cache if this is a new connection * attempt or the completion of a previous one. */ - KASSERT(tp->t_state == TCPS_LISTEN || !(so->so_options & SO_ACCEPTCONN), - ("%s: so accepting but tp %p not listening", __func__, tp)); - if (tp->t_state == TCPS_LISTEN && (so->so_options & SO_ACCEPTCONN)) { + KASSERT(tp->t_state == TCPS_LISTEN || + !(inp->inp_flags2 & INP_LISTENING), + ("%s: pcb listening but tp %p not listening", __func__, tp)); + if (tp->t_state == TCPS_LISTEN && (inp->inp_flags2 & INP_LISTENING)) { struct in_conninfo inc; bzero(&inc, sizeof(inc)); @@ -1115,7 +1129,7 @@ } inc.inc_fport = th->th_sport; inc.inc_lport = th->th_dport; - inc.inc_fibnum = so->so_fibnum; + inc.inc_fibnum = sol->sol_fibnum; /* * Check for an existing connection attempt in syncache if @@ -1135,7 +1149,7 @@ * NB: syncache_expand() doesn't unlock * inp and tcpinfo locks. */ - if (!syncache_expand(&inc, &to, th, &so, m)) { + if (!syncache_expand(&inc, &to, th, sol, &so, m)) { /* * No syncache entry or ACK was not * for our SYN/ACK. Send a RST. @@ -1404,17 +1418,17 @@ * for syncache. */ #ifdef TCPDEBUG - if (so->so_options & SO_DEBUG) + if (debug) tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); tcp_dooptions(&to, optp, optlen, TO_SYN); #ifdef TCP_RFC7413 - if (syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL)) + if (syncache_add(&inc, &to, th, inp, sol, m, NULL, NULL)) goto tfo_socket_result; #else - syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL); + syncache_add(&inc, &to, th, inp, sol, m, NULL, NULL); #endif /* * Entry added to syncache and mbuf consumed. 
@@ -1428,6 +1442,7 @@ return (IPPROTO_DONE); } else if (tp->t_state == TCPS_LISTEN) { /* + * QQQ * When a listen socket is torn down the SO_ACCEPTCONN * flag is removed first while connections are drained * from the accept queue in a unlock/lock cycle of the Index: sys/netinet/tcp_subr.c =================================================================== --- sys/netinet/tcp_subr.c +++ sys/netinet/tcp_subr.c @@ -1557,12 +1557,13 @@ { struct inpcb *inp = tp->t_inpcb; struct socket *so; + bool listening = (tp->t_state == TCPS_LISTEN); INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD - if (tp->t_state == TCPS_LISTEN) + if (listening) tcp_offload_listen_stop(tp); #endif #ifdef TCP_RFC7413 @@ -1580,6 +1581,12 @@ TCPSTAT_INC(tcps_closed); if (tp->t_state != TCPS_CLOSED) tcp_state_change(tp, TCPS_CLOSED); + if (listening) { + tcp_discardcb(tp); + in_pcbdetach(inp); + in_pcbfree(inp); + return (NULL); + } KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); so = inp->inp_socket; soisdisconnected(so); @@ -1588,7 +1595,6 @@ ("tcp_close: !SS_PROTOREF")); inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); - ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); @@ -1802,9 +1808,14 @@ if (xt.xt_tp.t_timers) tcp_timer_to_xtimer(&xt.xt_tp, xt.xt_tp.t_timers, &xt.xt_timer); } - if (inp->inp_socket != NULL) - sotoxsocket(inp->inp_socket, &xt.xt_socket); - else { + if (inp->inp_socket != NULL) { + if (inp->inp_flags2 & INP_LISTENING) + soltoxsocket(inp->inp_solisten, + &xt.xt_socket); + else + sotoxsocket(inp->inp_socket, + &xt.xt_socket); + } else { bzero(&xt.xt_socket, sizeof xt.xt_socket); xt.xt_socket.xso_protocol = IPPROTO_TCP; } @@ -2964,7 +2975,7 @@ else INP_WUNLOCK(inp); } else if (!(inp->inp_flags & INP_DROPPED) && - !(inp->inp_socket->so_options & SO_ACCEPTCONN)) { + !(inp->inp_flags2 & INP_LISTENING)) { tp = intotcpcb(inp); tp = tcp_drop(tp, ECONNABORTED); if (tp != NULL) Index: sys/netinet/tcp_syncache.h 
=================================================================== --- sys/netinet/tcp_syncache.h +++ sys/netinet/tcp_syncache.h @@ -40,9 +40,10 @@ #endif void syncache_unreach(struct in_conninfo *, struct tcphdr *); int syncache_expand(struct in_conninfo *, struct tcpopt *, - struct tcphdr *, struct socket **, struct mbuf *); + struct tcphdr *, struct solisten *, struct socket **, + struct mbuf *); int syncache_add(struct in_conninfo *, struct tcpopt *, - struct tcphdr *, struct inpcb *, struct socket **, struct mbuf *, + struct tcphdr *, struct inpcb *, struct solisten *, struct mbuf *, void *, void *); void syncache_chkrst(struct in_conninfo *, struct tcphdr *); void syncache_badack(struct in_conninfo *); Index: sys/netinet/tcp_syncache.c =================================================================== --- sys/netinet/tcp_syncache.c +++ sys/netinet/tcp_syncache.c @@ -129,7 +129,7 @@ static void syncache_insert(struct syncache *, struct syncache_head *); static int syncache_respond(struct syncache *, struct syncache_head *, int, const struct mbuf *); -static struct socket *syncache_socket(struct syncache *, struct socket *, +static struct socket *syncache_socket(struct syncache *, struct solisten *, struct mbuf *m); static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout); @@ -141,12 +141,12 @@ static struct syncache *syncookie_lookup(struct in_conninfo *, struct syncache_head *, struct syncache *, struct tcphdr *, struct tcpopt *, - struct socket *); + struct solisten *); static void syncookie_reseed(void *); #ifdef INVARIANTS -static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, - struct syncache *sc, struct tcphdr *th, struct tcpopt *to, - struct socket *lso); +static int syncookie_cmp(struct in_conninfo *, struct syncache_head *, + struct syncache *, struct tcphdr *, struct tcpopt *, + struct solisten *); #endif /* @@ -635,7 +635,7 @@ * On success return the newly created socket with its 
underlying inp locked. */ static struct socket * -syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) +syncache_socket(struct syncache *sc, struct solisten *sol, struct mbuf *m) { struct tcp_function_block *blk; struct inpcb *inp = NULL; @@ -652,7 +652,7 @@ * connection when the SYN arrived. If we can't create * the connection, abort it. */ - so = sonewconn(lso, 0); + so = sonewconn(sol, 0); if (so == NULL) { /* * Drop the connection; we will either send a RST or @@ -738,12 +738,12 @@ } #ifdef IPSEC /* Copy old policy into new socket's. */ - if (ipsec_copy_policy(sotoinpcb(lso)->inp_sp, inp->inp_sp)) + if (ipsec_copy_policy(soltoinpcb(sol)->inp_sp, inp->inp_sp)) printf("syncache_socket: could not copy policy\n"); #endif #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) { - struct inpcb *oinp = sotoinpcb(lso); + struct inpcb *oinp = soltoinpcb(sol); struct in6_addr laddr6; struct sockaddr_in6 sin6; /* @@ -829,7 +829,7 @@ tp->irs = sc->sc_irs; tcp_rcvseqinit(tp); tcp_sendseqinit(tp); - blk = sototcpcb(lso)->t_fb; + blk = soltotcpcb(sol)->t_fb; if (blk != tp->t_fb) { /* * Our parents t_fb was not the default, @@ -857,7 +857,7 @@ tp->rcv_adv += tp->rcv_wnd; tp->last_ack_sent = tp->rcv_nxt; - tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY); + tp->t_flags = soltotcpcb(sol)->t_flags & (TF_NOPUSH|TF_NODELAY); if (sc->sc_flags & SCF_NOOPT) tp->t_flags |= TF_NOOPT; else { @@ -912,10 +912,10 @@ /* * Copy and activate timers. 
*/ - tp->t_keepinit = sototcpcb(lso)->t_keepinit; - tp->t_keepidle = sototcpcb(lso)->t_keepidle; - tp->t_keepintvl = sototcpcb(lso)->t_keepintvl; - tp->t_keepcnt = sototcpcb(lso)->t_keepcnt; + tp->t_keepinit = soltotcpcb(sol)->t_keepinit; + tp->t_keepidle = soltotcpcb(sol)->t_keepidle; + tp->t_keepintvl = soltotcpcb(sol)->t_keepintvl; + tp->t_keepcnt = soltotcpcb(sol)->t_keepcnt; tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); TCPSTAT_INC(tcps_accepts); @@ -941,7 +941,7 @@ */ int syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, - struct socket **lsop, struct mbuf *m) + struct solisten *sol, struct socket **so, struct mbuf *m) { struct syncache *sc; struct syncache_head *sch; @@ -965,7 +965,7 @@ * values with the reconstructed values from the cookie. */ if (sc != NULL) - syncookie_cmp(inc, sch, sc, th, to, *lsop); + syncookie_cmp(inc, sch, sc, th, to, sol); #endif if (sc == NULL) { @@ -987,7 +987,7 @@ goto failed; } bzero(&scs, sizeof(scs)); - sc = syncookie_lookup(inc, sch, &scs, th, to, *lsop); + sc = syncookie_lookup(inc, sch, &scs, th, to, sol); SCH_UNLOCK(sch); if (sc == NULL) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) @@ -1087,9 +1087,9 @@ goto failed; } - *lsop = syncache_socket(sc, *lsop, m); + *so = syncache_socket(sc, sol, m); - if (*lsop == NULL) + if (*so == NULL) TCPSTAT_INC(tcps_sc_aborted); else TCPSTAT_INC(tcps_sc_completed); @@ -1103,7 +1103,7 @@ syncache_free(sc); if (s != NULL) free(s, M_TCPLOG); - *lsop = NULL; + *so = NULL; return (0); } @@ -1161,7 +1161,7 @@ */ int syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, - struct inpcb *inp, struct socket **lsop, struct mbuf *m, void *tod, + struct inpcb *inp, struct solisten *sol, struct mbuf *m, void *tod, void *todctx) { struct tcpcb *tp; @@ -1193,12 +1193,11 @@ ("%s: unexpected tcp flags", __func__)); /* - * Combine all so/tp operations very early to drop the INP lock as - * soon as possible. 
+ * Combine all solisten/tp operations very early to drop the INP + * lock as soon as possible. */ - so = *lsop; - tp = sototcpcb(so); - cred = crhold(so->so_cred); + tp = soltotcpcb(sol); + cred = crhold(sol->sol_cred); #ifdef INET6 if ((inc->inc_flags & INC_ISIPV6) && @@ -1207,7 +1206,7 @@ #endif ip_ttl = inp->inp_ip_ttl; ip_tos = inp->inp_ip_tos; - win = sbspace(&so->so_rcv); + win = sol->sol_sbrcv_hiwat; ltflags = (tp->t_flags & (TF_NOOPT | TF_SIGNATURE)); #ifdef TCP_RFC7413 @@ -1220,7 +1219,7 @@ * listen queue with bogus TFO connections. */ if (atomic_fetchadd_int(tp->t_tfo_pending, 1) <= - (so->so_qlimit / 2)) { + (sol->sol_qlimit / 2)) { int result; result = tcp_fastopen_check_cookie(inc, @@ -1498,10 +1497,7 @@ } done: - if (m) { - *lsop = NULL; - m_freem(m); - } + m_freem(m); #ifdef TCP_RFC7413 /* * If tfo_pending is not NULL here, then a TFO SYN that did not @@ -1981,7 +1977,7 @@ static struct syncache * syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, - struct socket *lso) + struct solisten *sol) { uint32_t hash; uint8_t *secbits; @@ -2024,13 +2020,13 @@ switch (inc->inc_flags & INC_ISIPV6) { #ifdef INET case 0: - sc->sc_ip_ttl = sotoinpcb(lso)->inp_ip_ttl; - sc->sc_ip_tos = sotoinpcb(lso)->inp_ip_tos; + sc->sc_ip_ttl = soltoinpcb(sol)->inp_ip_ttl; + sc->sc_ip_tos = soltoinpcb(sol)->inp_ip_tos; break; #endif #ifdef INET6 case INC_ISIPV6: - if (sotoinpcb(lso)->inp_flags & IN6P_AUTOFLOWLABEL) + if (soltoinpcb(sol)->inp_flags & IN6P_AUTOFLOWLABEL) sc->sc_flowlabel = sc->sc_iss & IPV6_FLOWLABEL_MASK; break; #endif @@ -2049,7 +2045,7 @@ sc->sc_flags |= SCF_WINSCALE; } - wnd = sbspace(&lso->so_rcv); + wnd = sol->sol_sbrcv_hiwat; wnd = imax(wnd, 0); wnd = imin(wnd, TCP_MAXWIN); sc->sc_wnd = wnd; @@ -2077,13 +2073,13 @@ static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, - struct socket *lso) + struct 
solisten *sol) { struct syncache scs, *scx; char *s; bzero(&scs, sizeof(scs)); - scx = syncookie_lookup(inc, sch, &scs, th, to, lso); + scx = syncookie_lookup(inc, sch, &scs, th, to, sol); if ((s = tcp_log_addrs(inc, th, NULL, NULL)) == NULL) return (0); Index: sys/netinet/tcp_timewait.c =================================================================== --- sys/netinet/tcp_timewait.c +++ sys/netinet/tcp_timewait.c @@ -352,7 +352,6 @@ ("tcp_twstart: !SS_PROTOREF")); inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); - ACCEPT_LOCK(); SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); @@ -491,7 +490,6 @@ if (inp->inp_flags & INP_SOCKREF) { inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); - ACCEPT_LOCK(); SOCK_LOCK(so); KASSERT(so->so_state & SS_PROTOREF, ("tcp_twclose: INP_SOCKREF && !SS_PROTOREF")); Index: sys/netinet/tcp_usrreq.c =================================================================== --- sys/netinet/tcp_usrreq.c +++ sys/netinet/tcp_usrreq.c @@ -394,97 +394,129 @@ * Prepare to accept connections. 
*/ static int -tcp_usr_listen(struct socket *so, int backlog, struct thread *td) +tcp_usr_listen(struct socket *so, struct solisten *sol, int backlog, + struct thread *td) { - int error = 0; struct inpcb *inp; - struct tcpcb *tp = NULL; + struct tcpcb *tp; TCPDEBUG0; + if (so == NULL) + return (0); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - error = EINVAL; - goto out; + INP_WUNLOCK(inp); + return (EINVAL); } tp = intotcpcb(inp); TCPDEBUG1(); SOCK_LOCK(so); - error = solisten_proto_check(so); - INP_HASH_WLOCK(&V_tcbinfo); - if (error == 0 && inp->inp_lport == 0) + if (inp->inp_lport == 0) { + int error; + + INP_HASH_WLOCK(&V_tcbinfo); error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); - INP_HASH_WUNLOCK(&V_tcbinfo); - if (error == 0) { - tcp_state_change(tp, TCPS_LISTEN); - solisten_proto(so, backlog); + INP_HASH_WUNLOCK(&V_tcbinfo); + if (error) { + SOCK_UNLOCK(so); + INP_WUNLOCK(inp); + return (error); + } + } + tcp_state_change(tp, TCPS_LISTEN); + inp->inp_flags2 |= INP_LISTENING; + inp->inp_solisten = sol; #ifdef TCP_OFFLOAD - if ((so->so_options & SO_NO_OFFLOAD) == 0) - tcp_offload_listen_start(tp); + if ((so->so_options & SO_NO_OFFLOAD) == 0) + tcp_offload_listen_start(tp); #endif - } SOCK_UNLOCK(so); - #ifdef TCP_RFC7413 if (IS_FASTOPEN(tp->t_flags)) tp->t_tfo_pending = tcp_fastopen_alloc_counter(); #endif -out: TCPDEBUG2(PRU_LISTEN); TCP_PROBE2(debug__user, tp, PRU_LISTEN); INP_WUNLOCK(inp); - return (error); + return (0); +} + +static void +tcp_usr_listenclose(struct solisten *sol) +{ + struct inpcb *inp; + struct tcpcb *tp; + + inp = soltoinpcb(sol); + INP_INFO_RLOCK(&V_tcbinfo); + INP_WLOCK(inp); + KASSERT(inp->inp_flags2 & INP_LISTENING, + ("%s: inp %p not listening", __func__, inp)); + KASSERT(!(inp->inp_flags & INP_DROPPED), + ("%s: listening inp %p dropped", __func__, inp)); + tp = intotcpcb(inp); + TCPDEBUG2(PRU_CLOSE); + 
TCP_PROBE2(debug__user, tp, PRU_CLOSE); + tcp_disconnect(tp); + /* inp gone */ + INP_INFO_RUNLOCK(&V_tcbinfo); } #endif /* INET */ #ifdef INET6 static int -tcp6_usr_listen(struct socket *so, int backlog, struct thread *td) +tcp6_usr_listen(struct socket *so, struct solisten *sol, int backlog, + struct thread *td) { - int error = 0; struct inpcb *inp; - struct tcpcb *tp = NULL; + struct tcpcb *tp; TCPDEBUG0; + if (so == NULL) + return (0); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL")); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - error = EINVAL; - goto out; + INP_WUNLOCK(inp); + return (EINVAL); } tp = intotcpcb(inp); TCPDEBUG1(); SOCK_LOCK(so); - error = solisten_proto_check(so); - INP_HASH_WLOCK(&V_tcbinfo); - if (error == 0 && inp->inp_lport == 0) { + if (inp->inp_lport == 0) { + int error; + inp->inp_vflag &= ~INP_IPV4; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) inp->inp_vflag |= INP_IPV4; + INP_HASH_WLOCK(&V_tcbinfo); error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred); + INP_HASH_WUNLOCK(&V_tcbinfo); + if (error) { + SOCK_UNLOCK(so); + INP_WUNLOCK(inp); + return (error); + } } - INP_HASH_WUNLOCK(&V_tcbinfo); - if (error == 0) { - tcp_state_change(tp, TCPS_LISTEN); - solisten_proto(so, backlog); + tcp_state_change(tp, TCPS_LISTEN); + inp->inp_flags2 |= INP_LISTENING; + inp->inp_solisten = sol; #ifdef TCP_OFFLOAD - if ((so->so_options & SO_NO_OFFLOAD) == 0) - tcp_offload_listen_start(tp); + if ((so->so_options & SO_NO_OFFLOAD) == 0) + tcp_offload_listen_start(tp); #endif - } SOCK_UNLOCK(so); - #ifdef TCP_RFC7413 if (IS_FASTOPEN(tp->t_flags)) tp->t_tfo_pending = tcp_fastopen_alloc_counter(); #endif -out: TCPDEBUG2(PRU_LISTEN); TCP_PROBE2(debug__user, tp, PRU_LISTEN); INP_WUNLOCK(inp); - return (error); + return (0); } #endif /* INET6 */ @@ -1177,6 +1209,7 @@ .pru_detach = tcp_usr_detach, .pru_disconnect = tcp_usr_disconnect, .pru_listen = tcp_usr_listen, + .pru_listenclose = tcp_usr_listenclose, 
.pru_peeraddr = in_getpeeraddr, .pru_rcvd = tcp_usr_rcvd, .pru_rcvoob = tcp_usr_rcvoob, @@ -1200,6 +1233,7 @@ .pru_detach = tcp_usr_detach, .pru_disconnect = tcp_usr_disconnect, .pru_listen = tcp6_usr_listen, + .pru_listenclose = tcp_usr_listenclose, .pru_peeraddr = in6_mapped_peeraddr, .pru_rcvd = tcp_usr_rcvd, .pru_rcvoob = tcp_usr_rcvoob, @@ -1392,23 +1426,21 @@ #define INP_WLOCK_RECHECK(inp) INP_WLOCK_RECHECK_CLEANUP((inp), /* noop */) int -tcp_ctloutput(struct socket *so, struct sockopt *sopt) +tcp_ctloutput(void *xpcb, struct sockopt *sopt) { - int error; - struct inpcb *inp; - struct tcpcb *tp; + struct inpcb *inp = (struct inpcb *)xpcb; + struct tcpcb *tp; struct tcp_function_block *blk; struct tcp_function_set fsn; + int error; error = 0; - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL")); INP_WLOCK(inp); if (sopt->sopt_level != IPPROTO_TCP) { #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) { INP_WUNLOCK(inp); - error = ip6_ctloutput(so, sopt); + error = ip6_ctloutput(inp, sopt); } #endif /* INET6 */ #if defined(INET6) && defined(INET) @@ -1417,7 +1449,7 @@ #ifdef INET { INP_WUNLOCK(inp); - error = ip_ctloutput(so, sopt); + error = ip_ctloutput(inp, sopt); } #endif return (error); @@ -1510,18 +1542,19 @@ return (error); } /* Pass in the INP locked, called must unlock it */ - return (tp->t_fb->tfb_tcp_ctloutput(so, sopt, inp, tp)); + return (tp->t_fb->tfb_tcp_ctloutput(inp, sopt)); } int -tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp) +tcp_default_ctloutput(struct inpcb *inp, struct sockopt *sopt) { - int error, opt, optval; - u_int ui; - struct tcp_info ti; + struct tcp_info ti; struct cc_algo *algo; + struct tcpcb *tp = intotcpcb(inp); char *pbuf, buf[TCP_CA_NAME_MAX]; size_t len; + int error, opt, optval; + u_int ui; /* * For TCP_CCALGOOPT forward the control to CC module, for both @@ -1946,10 +1979,12 @@ INP_WLOCK_ASSERT(inp); /* - * Neither tcp_close() nor tcp_drop() 
should return NULL, as the - * socket is still open. + * For a regular socket neither tcp_close() nor tcp_drop() should + * return NULL, as the socket is still open. */ - if (tp->t_state < TCPS_ESTABLISHED) { + if (tp->t_state == TCPS_LISTEN) + tp = tcp_close(tp); + else if (tp->t_state < TCPS_ESTABLISHED) { tp = tcp_close(tp); KASSERT(tp != NULL, ("tcp_disconnect: tcp_close() returned NULL")); Index: sys/netinet/tcp_var.h =================================================================== --- sys/netinet/tcp_var.h +++ sys/netinet/tcp_var.h @@ -136,8 +136,7 @@ struct socket *, struct tcpcb *, int, int, uint8_t, int); - int (*tfb_tcp_ctloutput)(struct socket *so, struct sockopt *sopt, - struct inpcb *inp, struct tcpcb *tp); + int (*tfb_tcp_ctloutput)(struct inpcb *inp, struct sockopt *sopt); /* Optional memory allocation/free routine */ void (*tfb_tcp_fb_init)(struct tcpcb *); void (*tfb_tcp_fb_fini)(struct tcpcb *, int); @@ -466,6 +465,7 @@ #define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb) #define intotw(ip) ((struct tcptw *)(ip)->inp_ppcb) #define sototcpcb(so) (intotcpcb(sotoinpcb(so))) +#define soltotcpcb(sol) (intotcpcb(soltoinpcb(sol))) /* * The smoothed round-trip time and estimated variance @@ -771,7 +771,7 @@ void tcp_twstart(struct tcpcb *); void tcp_twclose(struct tcptw *, int); void tcp_ctlinput(int, struct sockaddr *, void *); -int tcp_ctloutput(struct socket *, struct sockopt *); +int tcp_ctloutput(void *, struct sockopt *); struct tcpcb * tcp_drop(struct tcpcb *, int); void tcp_drain(void); @@ -810,7 +810,7 @@ int deregister_tcp_functions(struct tcp_function_block *blk); struct tcp_function_block *find_and_ref_tcp_functions(struct tcp_function_set *fs); struct tcp_function_block *find_and_ref_tcp_fb(struct tcp_function_block *blk); -int tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp); +int tcp_default_ctloutput(struct inpcb *inp, struct sockopt *sopt); uint32_t tcp_maxmtu(struct in_conninfo *, 
struct tcp_ifcap *); uint32_t tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *); Index: sys/netinet/udp_usrreq.c =================================================================== --- sys/netinet/udp_usrreq.c +++ sys/netinet/udp_usrreq.c @@ -988,16 +988,15 @@ #endif /* INET */ int -udp_ctloutput(struct socket *so, struct sockopt *sopt) +udp_ctloutput(void *xpcb, struct sockopt *sopt) { - struct inpcb *inp; + struct inpcb *inp = (struct inpcb *)xpcb; + struct socket *so = inp->inp_socket; struct udpcb *up; int isudplite, error, optval; error = 0; isudplite = (so->so_proto->pr_protocol == IPPROTO_UDPLITE) ? 1 : 0; - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); if (sopt->sopt_level != so->so_proto->pr_protocol) { #ifdef INET6 @@ -1012,7 +1011,7 @@ #ifdef INET { INP_WUNLOCK(inp); - error = ip_ctloutput(so, sopt); + error = ip_ctloutput(inp, sopt); } #endif return (error); @@ -1027,8 +1026,6 @@ sizeof optval); if (error) break; - inp = sotoinpcb(so); - KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); INP_WLOCK(inp); #ifdef IPSEC_NAT_T up = intoudpcb(inp); Index: sys/netinet/udp_var.h =================================================================== --- sys/netinet/udp_var.h +++ sys/netinet/udp_var.h @@ -168,7 +168,7 @@ void udp_ctlinput(int, struct sockaddr *, void *); void udplite_ctlinput(int, struct sockaddr *, void *); -int udp_ctloutput(struct socket *, struct sockopt *); +int udp_ctloutput(void *, struct sockopt *); void udp_init(void); void udplite_init(void); int udp_input(struct mbuf **, int *, int); Index: sys/netinet6/ip6_output.c =================================================================== --- sys/netinet6/ip6_output.c +++ sys/netinet6/ip6_output.c @@ -1410,11 +1410,12 @@ * IP6 socket option processing. 
*/ int -ip6_ctloutput(struct socket *so, struct sockopt *sopt) +ip6_ctloutput(void *xpcb, struct sockopt *sopt) { + struct inpcb *in6p = (struct inpcb *)xpcb; + struct socket *so = in6p->inp_socket; int optdatalen, uproto; void *optdata; - struct inpcb *in6p = sotoinpcb(so); int error, optval; int level, op, optname; int optlen; Index: sys/netinet6/ip6_var.h =================================================================== --- sys/netinet6/ip6_var.h +++ sys/netinet6/ip6_var.h @@ -385,7 +385,7 @@ int, struct ip6_moptions *, struct ifnet **, struct inpcb *); -int ip6_ctloutput(struct socket *, struct sockopt *); +int ip6_ctloutput(void *, struct sockopt *); int ip6_raw_ctloutput(struct socket *, struct sockopt *); void ip6_initpktopts(struct ip6_pktopts *); int ip6_setpktopts(struct mbuf *, struct ip6_pktopts *, @@ -407,7 +407,7 @@ void rip6_init(void); int rip6_input(struct mbuf **, int *, int); void rip6_ctlinput(int, struct sockaddr *, void *); -int rip6_ctloutput(struct socket *, struct sockopt *); +int rip6_ctloutput(void *, struct sockopt *); int rip6_output(struct mbuf *, struct socket *, ...); int rip6_usrreq(struct socket *, int, struct mbuf *, struct mbuf *, struct mbuf *, struct thread *); Index: sys/netinet6/raw_ip6.c =================================================================== --- sys/netinet6/raw_ip6.c +++ sys/netinet6/raw_ip6.c @@ -560,9 +560,10 @@ * Raw IPv6 socket option processing. 
*/ int -rip6_ctloutput(struct socket *so, struct sockopt *sopt) +rip6_ctloutput(void *xpcb, struct sockopt *sopt) { - struct inpcb *inp; + struct inpcb *inp = (struct inpcb *)xpcb; + struct socket *so = inp->inp_socket; int error; if (sopt->sopt_level == IPPROTO_ICMPV6) @@ -574,7 +575,6 @@ else if (sopt->sopt_level != IPPROTO_IPV6) { if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_SETFIB) { - inp = sotoinpcb(so); INP_WLOCK(inp); inp->inp_inc.inc_fibnum = so->so_fibnum; INP_WUNLOCK(inp); @@ -623,7 +623,7 @@ error = ip6_raw_ctloutput(so, sopt); break; default: - error = ip6_ctloutput(so, sopt); + error = ip6_ctloutput(inp, sopt); break; } break; Index: sys/sys/file.h =================================================================== --- sys/sys/file.h +++ sys/sys/file.h @@ -54,7 +54,7 @@ #endif /* _KERNEL */ #define DTYPE_VNODE 1 /* file */ -#define DTYPE_SOCKET 2 /* communications endpoint */ +#define DTYPE_SOCKET 2 /* regular (data flow) socket */ #define DTYPE_PIPE 3 /* pipe */ #define DTYPE_FIFO 4 /* fifo (named pipe) */ #define DTYPE_KQUEUE 5 /* event queue */ @@ -66,6 +66,7 @@ #define DTYPE_DEV 11 /* Device specific fd type */ #define DTYPE_PROCDESC 12 /* process descriptor */ #define DTYPE_LINUXEFD 13 /* emulation eventfd type */ +#define DTYPE_SOLISTEN 14 /* listen(2)ing socket */ #ifdef _KERNEL @@ -224,6 +225,7 @@ extern struct fileops vnops; extern struct fileops badfileops; extern struct fileops socketops; +extern struct fileops solistenops; extern int maxfiles; /* kernel limit on number of open files */ extern int maxfilesperproc; /* per process limit on number of open files */ extern volatile int openfiles; /* actual number of open files */ Index: sys/sys/protosw.h =================================================================== --- sys/sys/protosw.h +++ sys/sys/protosw.h @@ -39,6 +39,7 @@ struct thread; struct sockaddr; struct socket; +struct solisten; struct sockopt; /*#ifdef _KERNEL*/ @@ -68,7 +69,7 @@ typedef int pr_input_t (struct 
mbuf **, int*, int); typedef int pr_output_t (struct mbuf *, struct socket *, ...); typedef void pr_ctlinput_t (int, struct sockaddr *, void *); -typedef int pr_ctloutput_t (struct socket *, struct sockopt *); +typedef int pr_ctloutput_t (void *, struct sockopt *); typedef void pr_init_t (void); typedef void pr_fasttimo_t (void); typedef void pr_slowtimo_t (void); @@ -196,8 +197,8 @@ struct ifnet *ifp, struct thread *td); void (*pru_detach)(struct socket *so); int (*pru_disconnect)(struct socket *so); - int (*pru_listen)(struct socket *so, int backlog, - struct thread *td); + int (*pru_listen)(struct socket *so, struct solisten *sol, + int backlog, struct thread *td); int (*pru_peeraddr)(struct socket *so, struct sockaddr **nam); int (*pru_rcvd)(struct socket *so, int flags); int (*pru_rcvoob)(struct socket *so, struct mbuf *m, int flags); @@ -223,6 +224,7 @@ struct ucred *cred, struct thread *td); void (*pru_sosetlabel)(struct socket *so); void (*pru_close)(struct socket *so); + void (*pru_listenclose)(struct solisten *sol); int (*pru_bindat)(int fd, struct socket *so, struct sockaddr *nam, struct thread *td); int (*pru_connectat)(int fd, struct socket *so, @@ -248,7 +250,8 @@ int pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp, struct thread *td); int pru_disconnect_notsupp(struct socket *so); -int pru_listen_notsupp(struct socket *so, int backlog, struct thread *td); +int pru_listen_notsupp(struct socket *so, struct solisten *sol, + int backlog, struct thread *td); int pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam); int pru_rcvd_notsupp(struct socket *so, int flags); int pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags); Index: sys/sys/socket.h =================================================================== --- sys/sys/socket.h +++ sys/sys/socket.h @@ -117,7 +117,9 @@ * Option flags per-socket. 
*/ #define SO_DEBUG 0x0001 /* turn on debugging info recording */ +#if 0 #define SO_ACCEPTCONN 0x0002 /* socket has had listen() */ +#endif #define SO_REUSEADDR 0x0004 /* allow local address reuse */ #define SO_KEEPALIVE 0x0008 /* keep connections alive */ #define SO_DONTROUTE 0x0010 /* just use interface addresses */ @@ -704,9 +706,5 @@ void so_lock(struct socket *so); void so_unlock(struct socket *so); -void so_listeners_apply_all(struct socket *so, void (*func)(struct socket *, void *), void *arg); - -#endif - - +#endif /* _KERNEL */ #endif /* !_SYS_SOCKET_H_ */ Index: sys/sys/socketvar.h =================================================================== --- sys/sys/socketvar.h +++ sys/sys/socketvar.h @@ -58,13 +58,14 @@ typedef u_quad_t so_gen_t; struct socket; +struct solisten; /*- * Locking key to struct socket: * (a) constant after allocation, no locking required. * (b) locked by SOCK_LOCK(so). * (c) locked by SOCKBUF_LOCK(&so->so_rcv). - * (e) locked by ACCEPT_LOCK(). + * (e) locked by SOLISTEN_LOCK() of corresponding listening socket. * (f) not locked since integer reads/writes are atomic. * (g) used only as a sleep/wakeup address, no value. * (h) locked by global mutex so_global_mtx. @@ -79,25 +80,8 @@ void *so_pcb; /* protocol control block */ struct vnet *so_vnet; /* (a) network stack instance */ struct protosw *so_proto; /* (a) protocol handle */ -/* - * Variables for connection queuing. - * Socket where accepts occur is so_head in all subsidiary sockets. - * If so_head is 0, socket is not related to an accept. - * For head socket so_incomp queues partially completed connections, - * while so_comp is a queue of connections ready to be accepted. - * If a connection is aborted and it has so_head set, then - * it has to be pulled out of either so_incomp or so_comp. - * We allow connections to queue up based on current queue lengths - * and limit on number of queued connections for this socket. 
- */ - struct socket *so_head; /* (e) back pointer to listen socket */ - TAILQ_HEAD(, socket) so_incomp; /* (e) queue of partial unaccepted connections */ - TAILQ_HEAD(, socket) so_comp; /* (e) queue of complete unaccepted connections */ + struct solisten *so_listen; /* (e) back pointer to listen socket */ TAILQ_ENTRY(socket) so_list; /* (e) list of unaccepted connections */ - u_int so_qlen; /* (e) number of unaccepted connections */ - u_int so_incqlen; /* (e) number of unaccepted incomplete - connections */ - u_int so_qlimit; /* (e) max number queued connections */ short so_timeo; /* (g) connection timeout */ u_short so_error; /* (f) error affecting connection */ struct sigio *so_sigio; /* [sg] information for async I/O or @@ -112,11 +96,6 @@ /* NB: generation count must not be first. */ so_gen_t so_gencnt; /* (h) generation count */ void *so_emuldata; /* (b) private data for emulators */ - struct so_accf { - struct accept_filter *so_accept_filter; - void *so_accept_filter_arg; /* saved filter args */ - char *so_accept_filter_str; /* saved user args */ - } *so_accf; struct osd osd; /* Object Specific extensions */ /* * so_fibnum, so_user_cookie and friends can be used to attach @@ -135,16 +114,63 @@ }; /* - * Global accept mutex to serialize access to accept queues and - * fields associated with multiple sockets. This allows us to - * avoid defining a lock order between listen and accept sockets - * until such time as it proves to be a good idea. - */ -extern struct mtx accept_mtx; -#define ACCEPT_LOCK_ASSERT() mtx_assert(&accept_mtx, MA_OWNED) -#define ACCEPT_UNLOCK_ASSERT() mtx_assert(&accept_mtx, MA_NOTOWNED) -#define ACCEPT_LOCK() mtx_lock(&accept_mtx) -#define ACCEPT_UNLOCK() mtx_unlock(&accept_mtx) + * Structure for listening socket. + * Socket where accepts occur is so_listen in all subsidiary sockets. + * If so_listen is NULL, socket is not related to an accept. 
+ * For head socket sol_incomp queues partially completed connections, + * while sol_comp is a queue of connections ready to be accepted. + * If a connection is aborted and it has so_listen set, then + * it has to be pulled out of either sol_incomp or sol_comp. + * We allow connections to queue up based on current queue lengths + * and limit on number of queued connections for this socket. + */ +TAILQ_HEAD(accept_queue, socket); +struct solisten { + /* + * Data inherited from 'struct socket' before listen(2) + * mutation, to be copied to our children sockets. + */ + short sol_type; + short sol_options; + short sol_linger; + short sol_state; + int sol_fibnum; + int sol_sbrcv_lowat; + int sol_sbsnd_lowat; + u_int sol_sbrcv_hiwat; + u_int sol_sbsnd_hiwat; + short sol_sbrcv_flags; + short sol_sbsnd_flags; + sbintime_t sol_sbrcv_timeo; + sbintime_t sol_sbsnd_timeo; + struct protosw *sol_proto; + struct vnet *sol_vnet; + struct ucred *sol_cred; + struct label *sol_label; + struct sigio *sol_sigio; + void *sol_pcb; + + /* accept_filter(9) optional data */ + struct accept_filter *sol_accept_filter; + void *sol_accept_filter_arg; /* saved filter args */ + char *sol_accept_filter_str; /* saved user args */ + + /* Actual queue management. */ + struct mtx sol_mutex; + struct selinfo sol_selinfo; + u_int sol_qlen; /* (e) number of unaccepted connections */ + u_int sol_incqlen; /* (e) number of unaccepted incomplete + connections */ + u_int sol_qlimit; /* (e) max number queued connections */ + u_int sol_error; + /* (e) queue of partial unaccepted connections */ + struct accept_queue sol_incomp; + /* (e) queue of complete unaccepted connections */ + struct accept_queue sol_comp; +}; + +#define SOLISTEN_LOCK(sol) mtx_lock(&(sol)->sol_mutex) +#define SOLISTEN_UNLOCK(sol) mtx_unlock(&(sol)->sol_mutex) /* * Per-socket mutex: we reuse the receive socket buffer mutex for space @@ -212,8 +238,7 @@ /* can we read something from so?
*/ #define soreadabledata(so) \ - (sbavail(&(so)->so_rcv) >= (so)->so_rcv.sb_lowat || \ - !TAILQ_EMPTY(&(so)->so_comp) || (so)->so_error) + (sbavail(&(so)->so_rcv) >= (so)->so_rcv.sb_lowat || (so)->so_error) #define soreadable(so) \ (soreadabledata(so) || ((so)->so_rcv.sb_state & SBS_CANTRCVMORE)) @@ -236,16 +261,13 @@ } while (0) #define sorele(so) do { \ - ACCEPT_LOCK_ASSERT(); \ SOCK_LOCK_ASSERT(so); \ if ((so)->so_count <= 0) \ panic("sorele"); \ if (--(so)->so_count == 0) \ sofree(so); \ - else { \ + else \ SOCK_UNLOCK(so); \ - ACCEPT_UNLOCK(); \ - } \ } while (0) /* @@ -285,17 +307,18 @@ struct accept_filter { char accf_name[16]; int (*accf_callback) - (struct socket *so, void *arg, int waitflag); + (struct socket *, void *, int); void * (*accf_create) - (struct socket *so, char *arg); + (struct solisten *, char *); void (*accf_destroy) - (struct socket *so); + (struct solisten *); SLIST_ENTRY(accept_filter) accf_next; }; #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_ACCF); MALLOC_DECLARE(M_PCB); +MALLOC_DECLARE(M_SOLISTEN); MALLOC_DECLARE(M_SONAME); #endif @@ -344,7 +367,8 @@ */ int getsockaddr(struct sockaddr **namp, caddr_t uaddr, size_t len); int getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp, - struct file **fpp, u_int *fflagp, struct filecaps *havecaps); + struct file **fpp, u_int *fflagp, struct filecaps *havecaps, + short type); void soabort(struct socket *so); int soaccept(struct socket *so, struct sockaddr **nam); void soaio_enqueue(struct task *task); @@ -365,13 +389,8 @@ struct sockaddr *sodupsockaddr(const struct sockaddr *sa, int mflags); void sofree(struct socket *so); void sohasoutofband(struct socket *so); -int solisten(struct socket *so, int backlog, struct thread *td); -void solisten_proto(struct socket *so, int backlog); -int solisten_proto_check(struct socket *so); -struct socket * - sonewconn(struct socket *head, int connstatus); - - +struct socket * sonewconn(struct solisten *, int); +struct socket * sopeeloff(struct socket *); 
int sopoll(struct socket *so, int events, struct ucred *active_cred, struct thread *td); int sopoll_generic(struct socket *so, int events, @@ -409,6 +428,17 @@ struct thread *td); /* + * Listening sockets. + */ +int solisten(struct socket *so, int backlog, struct thread *td, + struct file *fp); +int sollisten(struct solisten *so, int backlog, struct thread *td, + struct file *fp); +int solistenpoll(struct solisten *, int, struct ucred *a, struct thread *); +int solistenclose(struct solisten *); +void soltoxsocket(struct solisten *so, struct xsocket *xso); + +/* * Accept filter functions (duh). */ int accept_filt_add(struct accept_filter *filt); Index: sys/sys/sockopt.h =================================================================== --- sys/sys/sockopt.h +++ sys/sys/sockopt.h @@ -40,6 +40,7 @@ struct thread; struct socket; +struct solisten; /* * Argument structure for sosetopt et seq. This is in the KERNEL @@ -58,14 +59,16 @@ int sosetopt(struct socket *so, struct sockopt *sopt); int sogetopt(struct socket *so, struct sockopt *sopt); +int solsetopt(struct solisten *so, struct sockopt *sopt); +int solgetopt(struct solisten *so, struct sockopt *sopt); int sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen); int sooptcopyout(struct sockopt *sopt, const void *buf, size_t len); /* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. 
*/ int soopt_getm(struct sockopt *sopt, struct mbuf **mp); int soopt_mcopyin(struct sockopt *sopt, struct mbuf *m); int soopt_mcopyout(struct sockopt *sopt, struct mbuf *m); -int do_getopt_accept_filter(struct socket *so, struct sockopt *sopt); -int do_setopt_accept_filter(struct socket *so, struct sockopt *sopt); +int accept_filt_getopt(struct solisten *, struct sockopt *); +int accept_filt_setopt(struct solisten *, struct sockopt *); int so_setsockopt(struct socket *so, int level, int optname, void *optval, size_t optlen); Index: sys/sys/unpcb.h =================================================================== --- sys/sys/unpcb.h +++ sys/sys/unpcb.h @@ -66,7 +66,10 @@ struct unpcb { LIST_ENTRY(unpcb) unp_link; /* glue on list of all PCBs */ - struct socket *unp_socket; /* pointer back to socket */ + union { + struct socket *unp_socket; /* pointer back to socket */ + struct solisten *unp_listen; /* to listening socket */ + }; struct file *unp_file; /* back-pointer to file for gc. */ struct vnode *unp_vnode; /* if associated with file */ ino_t unp_ino; /* fake inode number */ @@ -93,13 +96,14 @@ * to determine whether the contents should be sent to the user or * not. * - * UNP_HAVEPCCACHED - indicates that the unp_peercred member is filled - * in, but does *not* contain the credentials of the connected peer + * UNP_LISTENING - indicates that unp is listening, and unp_listen should + * be dereferenced instead of unp_socket. The unp_peercred member is + * filled in, but does *not* contain the credentials of the connected peer * (there may not even be a peer). This is set in unp_listen() when * it fills in unp_peercred for later consumption by unp_connect(). 
*/ #define UNP_HAVEPC 0x001 -#define UNP_HAVEPCCACHED 0x002 +#define UNP_LISTENING 0x002 #define UNP_WANTCRED 0x004 /* credentials wanted */ #define UNP_CONNWAIT 0x008 /* connect blocks until accepted */ @@ -121,6 +125,7 @@ #define UNPGC_IGNORE_RIGHTS 0x8 /* Attached rights are freed */ #define sotounpcb(so) ((struct unpcb *)((so)->so_pcb)) +#define soltounpcb(sol) ((struct unpcb *)((sol)->sol_pcb)) /* Hack alert -- this structure depends on <sys/socketvar.h>. */ #ifdef _SYS_SOCKETVAR_H_ Index: sys/sys/vnode.h =================================================================== --- sys/sys/vnode.h +++ sys/sys/vnode.h @@ -112,14 +112,13 @@ /* * Type specific fields, only one applies to any given vnode. - * See #defines below for renaming to v_* namespace. */ union { - struct mount *vu_mount; /* v ptr to mountpoint (VDIR) */ - struct socket *vu_socket; /* v unix domain net (VSOCK) */ - struct cdev *vu_cdev; /* v device (VCHR, VBLK) */ - struct fifoinfo *vu_fifoinfo; /* v fifo (VFIFO) */ - } v_un; + struct mount *v_mountedhere; /* v ptr to mountpoint (VDIR) */ + struct unpcb *v_unpcb; /* v unix domain net (VSOCK) */ + struct cdev *v_rdev; /* v device (VCHR, VBLK) */ + struct fifoinfo *v_fifoinfo; /* v fifo (VFIFO) */ + }; /* * vfs_hash: (mount + inode) -> vnode hash.
The hash value @@ -175,11 +174,6 @@ #endif /* defined(_KERNEL) || defined(_KVM_VNODE) */ -#define v_mountedhere v_un.vu_mount -#define v_socket v_un.vu_socket -#define v_rdev v_un.vu_cdev -#define v_fifoinfo v_un.vu_fifoinfo - #define bo2vnode(bo) __containerof((bo), struct vnode, v_bufobj) /* XXX: These are temporary to avoid a source sweep at this time */ @@ -200,7 +194,7 @@ long xv_numoutput; /* num of writes in progress */ enum vtype xv_type; /* vnode type */ union { - void *xvu_socket; /* socket, if VSOCK */ + void *xvu_socket; /* unpcb, if VSOCK */ void *xvu_fifo; /* fifo, if VFIFO */ dev_t xvu_rdev; /* maj/min, if VBLK/VCHR */ struct { Index: usr.bin/netstat/inet.c =================================================================== --- usr.bin/netstat/inet.c +++ usr.bin/netstat/inet.c @@ -170,9 +170,6 @@ if (kread((uintptr_t)proto.pr_domain, &domain, sizeof(domain)) != 0) return (-1); xso->xso_family = domain.dom_family; - xso->so_qlen = so->so_qlen; - xso->so_incqlen = so->so_incqlen; - xso->so_qlimit = so->so_qlimit; xso->so_timeo = so->so_timeo; xso->so_error = so->so_error; xso->so_oobmark = so->so_oobmark; @@ -181,6 +178,33 @@ return (0); } +int +soltoxsocket(struct solisten *sol, struct xsocket *xso) +{ + struct protosw proto; + struct domain domain; + + bzero(xso, sizeof *xso); + xso->xso_len = sizeof *xso; + xso->xso_so = (struct socket *)sol; + xso->so_type = sol->sol_type; + xso->so_options = sol->sol_options | SO_ACCEPTCONN; + xso->so_linger = sol->sol_linger; + xso->so_state = sol->sol_state; + xso->so_pcb = sol->sol_pcb; + if (kread((uintptr_t)sol->sol_proto, &proto, sizeof(proto)) != 0) + return (-1); + xso->xso_protocol = proto.pr_protocol; + if (kread((uintptr_t)proto.pr_domain, &domain, sizeof(domain)) != 0) + return (-1); + xso->xso_family = domain.dom_family; + xso->so_qlen = sol->sol_qlen; + xso->so_incqlen = sol->sol_incqlen; + xso->so_qlimit = sol->sol_qlimit; + xso->so_error = sol->sol_error; + return (0); +} + static int 
pcblist_kvm(u_long off, char **bufp, int istcp) { @@ -191,6 +215,7 @@ struct xinpgen xig; struct xtcpcb xt; struct socket so; + struct solisten sol; struct xsocket *xso; char *buf, *p; size_t len; @@ -264,7 +289,11 @@ KREAD(inp->inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); } - if (inp->inp_socket) { + if (inp->inp_flags2 & INP_LISTENING) { + KREAD(inp->inp_solisten, &sol, sizeof(sol)); + if (soltoxsocket(&sol, xso) != 0) + goto fail; + } else if (inp->inp_socket != NULL) { KREAD(inp->inp_socket, &so, sizeof(so)); if (sotoxsocket(&so, xso) != 0) goto fail; Index: usr.bin/netstat/netstat.h =================================================================== --- usr.bin/netstat/netstat.h +++ usr.bin/netstat/netstat.h @@ -75,10 +75,10 @@ const char *plurales(uintmax_t); const char *pluralies(uintmax_t); -struct sockaddr; -struct socket; -struct xsocket; +#ifdef _SYS_SOCKETVAR_H_ int sotoxsocket(struct socket *, struct xsocket *); +int soltoxsocket(struct solisten *, struct xsocket *); +#endif void protopr(u_long, const char *, int, int); void tcp_stats(u_long, const char *, int, int); void udp_stats(u_long, const char *, int, int); @@ -142,9 +142,11 @@ void rt_stats(void); void flowtable_stats(void); +#ifdef _SYS_SOCKETVAR_H_ char *routename(struct sockaddr *, int); const char *netname(struct sockaddr *, struct sockaddr *); void routepr(int, int); +#endif #ifdef NETGRAPH void netgraphprotopr(u_long, const char *, int, int); Index: usr.bin/netstat/unix.c =================================================================== --- usr.bin/netstat/unix.c +++ usr.bin/netstat/unix.c @@ -103,6 +103,7 @@ struct unpcb *unp, unp_conn; u_char sun_len; struct socket so; + struct solisten sol; struct xunpgen xug; struct xunpcb xu; unp_gen_t unp_gencnt; @@ -167,9 +168,15 @@ KREAD(unp_conn.unp_addr, &xu.xu_caddr, sun_len); } } - KREAD(unp->unp_socket, &so, sizeof(so)); - if (sotoxsocket(&so, &xu.xu_socket) != 0) - goto fail; + if (unp->unp_flags & UNP_LISTENING) { + KREAD(unp->unp_listen, &sol, 
sizeof(sol)); + if (soltoxsocket(&sol, &xu.xu_socket) != 0) + goto fail; + } else { + KREAD(unp->unp_socket, &so, sizeof(so)); + if (sotoxsocket(&so, &xu.xu_socket) != 0) + goto fail; + } COPYOUT(&xu, sizeof(xu)); }