Index: head/sys/ddb/db_variables.c =================================================================== --- head/sys/ddb/db_variables.c (revision 298353) +++ head/sys/ddb/db_variables.c (revision 298354) @@ -1,159 +1,158 @@ /*- * Mach Operating System * Copyright (c) 1991,1990 Carnegie Mellon University * All Rights Reserved. * * Permission to use, copy, modify and distribute this software and its * documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Author: David B. Golub, Carnegie Mellon University * Date: 7/90 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include static int db_find_variable(struct db_variable **varp); static struct db_variable db_vars[] = { { "radix", &db_radix, FCN_NULL }, { "maxoff", &db_maxoff, FCN_NULL }, { "maxwidth", &db_max_width, FCN_NULL }, { "tabstops", &db_tab_stop_width, FCN_NULL }, { "lines", &db_lines_per_page, FCN_NULL }, { "curcpu", NULL, db_var_curcpu }, { "db_cpu", NULL, db_var_db_cpu }, #ifdef VIMAGE { "curvnet", NULL, db_var_curvnet }, { "db_vnet", NULL, db_var_db_vnet }, #endif }; -static struct db_variable *db_evars = - db_vars + nitems(db_vars); +static struct db_variable *db_evars = db_vars + nitems(db_vars); static int db_find_variable(struct db_variable **varp) { struct db_variable *vp; int t; t = db_read_token(); if (t == tIDENT) { for (vp = db_vars; vp < db_evars; vp++) { if (!strcmp(db_tok_string, vp->name)) { *varp = vp; return (1); } } for (vp = db_regs; vp < db_eregs; vp++) { if (!strcmp(db_tok_string, vp->name)) { *varp = vp; return (1); } } } db_error("Unknown variable\n"); return (0); } int db_get_variable(db_expr_t *valuep) { struct db_variable *vp; if (!db_find_variable(&vp)) return (0); return (db_read_variable(vp, valuep)); } int db_set_variable(db_expr_t value) { struct db_variable *vp; if (!db_find_variable(&vp)) return (0); return (db_write_variable(vp, value)); } int db_read_variable(struct db_variable *vp, db_expr_t *valuep) { db_varfcn_t *func = vp->fcn; if (func == FCN_NULL) { *valuep = *(vp->valuep); return (1); } return ((*func)(vp, valuep, DB_VAR_GET)); } int db_write_variable(struct db_variable *vp, db_expr_t value) { db_varfcn_t *func = vp->fcn; if (func == FCN_NULL) { *(vp->valuep) = value; return (1); } return ((*func)(vp, &value, DB_VAR_SET)); } void db_set_cmd(db_expr_t dummy1, bool dummy2, db_expr_t dummy3, char *dummy4) { struct db_variable *vp; db_expr_t value; int t; t = db_read_token(); if (t != tDOLLAR) { db_error("Unknown variable\n"); return; } if (!db_find_variable(&vp)) { db_error("Unknown variable\n"); return; } t = db_read_token(); if (t != tEQ) db_unread_token(t); if (!db_expression(&value)) { db_error("No value\n"); return; } if (db_read_token() != tEOL) db_error("?\n"); db_write_variable(vp, value); } Index: head/sys/geom/part/g_part_bsd.c =================================================================== --- head/sys/geom/part/g_part_bsd.c (revision 298353) +++ head/sys/geom/part/g_part_bsd.c (revision 298354) @@ -1,540 +1,539 @@ /*- * Copyright (c) 2007 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "g_part_if.h" #define BOOT1_SIZE 512 #define LABEL_SIZE 512 #define BOOT2_OFF (BOOT1_SIZE + LABEL_SIZE) #define BOOT2_SIZE (BBSIZE - BOOT2_OFF) FEATURE(geom_part_bsd, "GEOM partitioning class for BSD disklabels"); struct g_part_bsd_table { struct g_part_table base; u_char *bbarea; uint32_t offset; }; struct g_part_bsd_entry { struct g_part_entry base; struct partition part; }; static int g_part_bsd_add(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static int g_part_bsd_bootcode(struct g_part_table *, struct g_part_parms *); static int g_part_bsd_create(struct g_part_table *, struct g_part_parms *); static int g_part_bsd_destroy(struct g_part_table *, struct g_part_parms *); static void g_part_bsd_dumpconf(struct g_part_table *, struct g_part_entry *, struct sbuf *, const char *); static int g_part_bsd_dumpto(struct g_part_table *, struct g_part_entry *); static int g_part_bsd_modify(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static const char *g_part_bsd_name(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_bsd_probe(struct g_part_table *, struct g_consumer *); static int g_part_bsd_read(struct g_part_table *, struct g_consumer *); static const char *g_part_bsd_type(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_bsd_write(struct g_part_table *, struct g_consumer *); static int g_part_bsd_resize(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static kobj_method_t g_part_bsd_methods[] = { KOBJMETHOD(g_part_add, g_part_bsd_add), KOBJMETHOD(g_part_bootcode, g_part_bsd_bootcode), KOBJMETHOD(g_part_create, g_part_bsd_create), KOBJMETHOD(g_part_destroy, g_part_bsd_destroy), KOBJMETHOD(g_part_dumpconf, g_part_bsd_dumpconf), KOBJMETHOD(g_part_dumpto, g_part_bsd_dumpto), KOBJMETHOD(g_part_modify, g_part_bsd_modify), KOBJMETHOD(g_part_resize, g_part_bsd_resize), KOBJMETHOD(g_part_name, g_part_bsd_name), KOBJMETHOD(g_part_probe, g_part_bsd_probe), KOBJMETHOD(g_part_read, g_part_bsd_read), KOBJMETHOD(g_part_type, g_part_bsd_type), KOBJMETHOD(g_part_write, g_part_bsd_write), { 0, 0 } }; static struct g_part_scheme g_part_bsd_scheme = { "BSD", g_part_bsd_methods, sizeof(struct g_part_bsd_table), .gps_entrysz = sizeof(struct g_part_bsd_entry), .gps_minent = 8, .gps_maxent = 20, /* Only 22 entries fit in 512 byte sectors */ .gps_bootcodesz = BBSIZE, }; G_PART_SCHEME_DECLARE(g_part_bsd); static struct g_part_bsd_alias { uint8_t type; int alias; } bsd_alias_match[] = { { FS_BSDFFS, G_PART_ALIAS_FREEBSD_UFS }, { FS_SWAP, G_PART_ALIAS_FREEBSD_SWAP }, { FS_ZFS, G_PART_ALIAS_FREEBSD_ZFS }, { FS_VINUM, G_PART_ALIAS_FREEBSD_VINUM }, { FS_NANDFS, G_PART_ALIAS_FREEBSD_NANDFS }, { FS_HAMMER, G_PART_ALIAS_DFBSD_HAMMER }, { FS_HAMMER2, G_PART_ALIAS_DFBSD_HAMMER2 }, }; static int bsd_parse_type(const char *type, uint8_t *fstype) { const char *alias; char *endp; long lt; int i; if (type[0] == '!') { lt = strtol(type + 1, &endp, 0); if (type[1] == '\0' || *endp != '\0' || lt <= 0 || lt >= 256) return (EINVAL); *fstype = (u_int)lt; return (0); } - for (i = 0; - i < nitems(bsd_alias_match); i++) { + for (i = 0; i < nitems(bsd_alias_match); i++) { alias = g_part_alias_name(bsd_alias_match[i].alias); if (strcasecmp(type, alias) == 0) { *fstype = bsd_alias_match[i].type; return (0); } } return (EINVAL); } static int g_part_bsd_add(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_part_bsd_entry *entry; struct g_part_bsd_table *table; if (gpp->gpp_parms & G_PART_PARM_LABEL) return (EINVAL); entry = (struct g_part_bsd_entry *)baseentry; table = (struct g_part_bsd_table *)basetable; entry->part.p_size = gpp->gpp_size; entry->part.p_offset = gpp->gpp_start + table->offset; entry->part.p_fsize = 0; entry->part.p_frag = 0; entry->part.p_cpg = 0; return (bsd_parse_type(gpp->gpp_type, &entry->part.p_fstype)); } static int g_part_bsd_bootcode(struct g_part_table *basetable, struct g_part_parms *gpp) { struct g_part_bsd_table *table; const u_char *codeptr; if (gpp->gpp_codesize != BOOT1_SIZE && gpp->gpp_codesize != BBSIZE) return (ENODEV); table = (struct g_part_bsd_table *)basetable; codeptr = gpp->gpp_codeptr; bcopy(codeptr, table->bbarea, BOOT1_SIZE); if (gpp->gpp_codesize == BBSIZE) bcopy(codeptr + BOOT2_OFF, table->bbarea + BOOT2_OFF, BOOT2_SIZE); return (0); } static int g_part_bsd_create(struct g_part_table *basetable, struct g_part_parms *gpp) { struct g_provider *pp; struct g_part_entry *baseentry; struct g_part_bsd_entry *entry; struct g_part_bsd_table *table; u_char *ptr; uint32_t msize, ncyls, secpercyl; pp = gpp->gpp_provider; if (pp->sectorsize < sizeof(struct disklabel)) return (ENOSPC); if (BBSIZE % pp->sectorsize) return (ENOTBLK); msize = MIN(pp->mediasize / pp->sectorsize, UINT32_MAX); secpercyl = basetable->gpt_sectors * basetable->gpt_heads; ncyls = msize / secpercyl; table = (struct g_part_bsd_table *)basetable; table->bbarea = g_malloc(BBSIZE, M_WAITOK | M_ZERO); ptr = table->bbarea + pp->sectorsize; le32enc(ptr + 0, DISKMAGIC); /* d_magic */ le32enc(ptr + 40, pp->sectorsize); /* d_secsize */ le32enc(ptr + 44, basetable->gpt_sectors); /* d_nsectors */ le32enc(ptr + 48, basetable->gpt_heads); /* d_ntracks */ le32enc(ptr + 52, ncyls); /* d_ncylinders */ le32enc(ptr + 56, secpercyl); /* d_secpercyl */ le32enc(ptr + 60, msize); /* d_secperunit */ le16enc(ptr + 72, 3600); /* d_rpm */ le32enc(ptr + 132, DISKMAGIC); /* d_magic2 */ le16enc(ptr + 138, basetable->gpt_entries); /* d_npartitions */ le32enc(ptr + 140, BBSIZE); /* d_bbsize */ basetable->gpt_first = 0; basetable->gpt_last = msize - 1; basetable->gpt_isleaf = 1; baseentry = g_part_new_entry(basetable, RAW_PART + 1, basetable->gpt_first, basetable->gpt_last); baseentry->gpe_internal = 1; entry = (struct g_part_bsd_entry *)baseentry; entry->part.p_size = basetable->gpt_last + 1; entry->part.p_offset = table->offset; return (0); } static int g_part_bsd_destroy(struct g_part_table *basetable, struct g_part_parms *gpp) { struct g_part_bsd_table *table; table = (struct g_part_bsd_table *)basetable; if (table->bbarea != NULL) g_free(table->bbarea); table->bbarea = NULL; /* Wipe the second sector to clear the partitioning. */ basetable->gpt_smhead |= 2; return (0); } static void g_part_bsd_dumpconf(struct g_part_table *table, struct g_part_entry *baseentry, struct sbuf *sb, const char *indent) { struct g_part_bsd_entry *entry; entry = (struct g_part_bsd_entry *)baseentry; if (indent == NULL) { /* conftxt: libdisk compatibility */ sbuf_printf(sb, " xs BSD xt %u", entry->part.p_fstype); } else if (entry != NULL) { /* confxml: partition entry information */ sbuf_printf(sb, "%s%u\n", indent, entry->part.p_fstype); } else { /* confxml: scheme information */ } } static int g_part_bsd_dumpto(struct g_part_table *table, struct g_part_entry *baseentry) { struct g_part_bsd_entry *entry; /* Allow dumping to a swap partition or an unused partition. */ entry = (struct g_part_bsd_entry *)baseentry; return ((entry->part.p_fstype == FS_UNUSED || entry->part.p_fstype == FS_SWAP) ? 1 : 0); } static int g_part_bsd_modify(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_part_bsd_entry *entry; if (gpp->gpp_parms & G_PART_PARM_LABEL) return (EINVAL); entry = (struct g_part_bsd_entry *)baseentry; if (gpp->gpp_parms & G_PART_PARM_TYPE) return (bsd_parse_type(gpp->gpp_type, &entry->part.p_fstype)); return (0); } static void bsd_set_rawsize(struct g_part_table *basetable, struct g_provider *pp) { struct g_part_bsd_table *table; struct g_part_bsd_entry *entry; struct g_part_entry *baseentry; uint32_t msize; table = (struct g_part_bsd_table *)basetable; msize = MIN(pp->mediasize / pp->sectorsize, UINT32_MAX); le32enc(table->bbarea + pp->sectorsize + 60, msize); /* d_secperunit */ basetable->gpt_last = msize - 1; LIST_FOREACH(baseentry, &basetable->gpt_entry, gpe_entry) { if (baseentry->gpe_index != RAW_PART + 1) continue; baseentry->gpe_end = basetable->gpt_last; entry = (struct g_part_bsd_entry *)baseentry; entry->part.p_size = msize; return; } } static int g_part_bsd_resize(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_part_bsd_entry *entry; struct g_provider *pp; if (baseentry == NULL) { pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider; bsd_set_rawsize(basetable, pp); return (0); } entry = (struct g_part_bsd_entry *)baseentry; baseentry->gpe_end = baseentry->gpe_start + gpp->gpp_size - 1; entry->part.p_size = gpp->gpp_size; return (0); } static const char * g_part_bsd_name(struct g_part_table *table, struct g_part_entry *baseentry, char *buf, size_t bufsz) { snprintf(buf, bufsz, "%c", 'a' + baseentry->gpe_index - 1); return (buf); } static int g_part_bsd_probe(struct g_part_table *table, struct g_consumer *cp) { struct g_provider *pp; u_char *buf; uint32_t magic1, magic2; int error; pp = cp->provider; /* Sanity-check the provider. */ if (pp->sectorsize < sizeof(struct disklabel) || pp->mediasize < BBSIZE) return (ENOSPC); if (BBSIZE % pp->sectorsize) return (ENOTBLK); /* Check that there's a disklabel. */ buf = g_read_data(cp, pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) return (error); magic1 = le32dec(buf + 0); magic2 = le32dec(buf + 132); g_free(buf); return ((magic1 == DISKMAGIC && magic2 == DISKMAGIC) ? G_PART_PROBE_PRI_HIGH : ENXIO); } static int g_part_bsd_read(struct g_part_table *basetable, struct g_consumer *cp) { struct g_provider *pp; struct g_part_bsd_table *table; struct g_part_entry *baseentry; struct g_part_bsd_entry *entry; struct partition part; u_char *buf, *p; off_t chs, msize; u_int sectors, heads; int error, index; pp = cp->provider; table = (struct g_part_bsd_table *)basetable; msize = MIN(pp->mediasize / pp->sectorsize, UINT32_MAX); table->bbarea = g_read_data(cp, 0, BBSIZE, &error); if (table->bbarea == NULL) return (error); buf = table->bbarea + pp->sectorsize; if (le32dec(buf + 40) != pp->sectorsize) goto invalid_label; sectors = le32dec(buf + 44); if (sectors < 1 || sectors > 255) goto invalid_label; if (sectors != basetable->gpt_sectors && !basetable->gpt_fixgeom) { g_part_geometry_heads(msize, sectors, &chs, &heads); if (chs != 0) { basetable->gpt_sectors = sectors; basetable->gpt_heads = heads; } } heads = le32dec(buf + 48); if (heads < 1 || heads > 255) goto invalid_label; if (heads != basetable->gpt_heads && !basetable->gpt_fixgeom) basetable->gpt_heads = heads; chs = le32dec(buf + 60); if (chs < 1) goto invalid_label; /* Fix-up a sysinstall bug. */ if (chs > msize) { chs = msize; le32enc(buf + 60, msize); } basetable->gpt_first = 0; basetable->gpt_last = msize - 1; basetable->gpt_isleaf = 1; basetable->gpt_entries = le16dec(buf + 138); if (basetable->gpt_entries < g_part_bsd_scheme.gps_minent || basetable->gpt_entries > g_part_bsd_scheme.gps_maxent) goto invalid_label; table->offset = le32dec(buf + 148 + RAW_PART * 16 + 4); for (index = basetable->gpt_entries - 1; index >= 0; index--) { p = buf + 148 + index * 16; part.p_size = le32dec(p + 0); part.p_offset = le32dec(p + 4); part.p_fsize = le32dec(p + 8); part.p_fstype = p[12]; part.p_frag = p[13]; part.p_cpg = le16dec(p + 14); if (part.p_size == 0) continue; if (part.p_offset < table->offset) continue; if (part.p_offset - table->offset > basetable->gpt_last) goto invalid_label; baseentry = g_part_new_entry(basetable, index + 1, part.p_offset - table->offset, part.p_offset - table->offset + part.p_size - 1); entry = (struct g_part_bsd_entry *)baseentry; entry->part = part; if (index == RAW_PART) baseentry->gpe_internal = 1; } return (0); invalid_label: printf("GEOM: %s: invalid disklabel.\n", pp->name); g_free(table->bbarea); table->bbarea = NULL; return (EINVAL); } static const char * g_part_bsd_type(struct g_part_table *basetable, struct g_part_entry *baseentry, char *buf, size_t bufsz) { struct g_part_bsd_entry *entry; int type; entry = (struct g_part_bsd_entry *)baseentry; type = entry->part.p_fstype; if (type == FS_NANDFS) return (g_part_alias_name(G_PART_ALIAS_FREEBSD_NANDFS)); if (type == FS_SWAP) return (g_part_alias_name(G_PART_ALIAS_FREEBSD_SWAP)); if (type == FS_BSDFFS) return (g_part_alias_name(G_PART_ALIAS_FREEBSD_UFS)); if (type == FS_VINUM) return (g_part_alias_name(G_PART_ALIAS_FREEBSD_VINUM)); if (type == FS_ZFS) return (g_part_alias_name(G_PART_ALIAS_FREEBSD_ZFS)); snprintf(buf, bufsz, "!%d", type); return (buf); } static int g_part_bsd_write(struct g_part_table *basetable, struct g_consumer *cp) { struct g_provider *pp; struct g_part_entry *baseentry; struct g_part_bsd_entry *entry; struct g_part_bsd_table *table; uint16_t sum; u_char *label, *p, *pe; int error, index; pp = cp->provider; table = (struct g_part_bsd_table *)basetable; baseentry = LIST_FIRST(&basetable->gpt_entry); label = table->bbarea + pp->sectorsize; for (index = 1; index <= basetable->gpt_entries; index++) { p = label + 148 + (index - 1) * 16; entry = (baseentry != NULL && index == baseentry->gpe_index) ? (struct g_part_bsd_entry *)baseentry : NULL; if (entry != NULL && !baseentry->gpe_deleted) { le32enc(p + 0, entry->part.p_size); le32enc(p + 4, entry->part.p_offset); le32enc(p + 8, entry->part.p_fsize); p[12] = entry->part.p_fstype; p[13] = entry->part.p_frag; le16enc(p + 14, entry->part.p_cpg); } else bzero(p, 16); if (entry != NULL) baseentry = LIST_NEXT(baseentry, gpe_entry); } /* Calculate checksum. */ le16enc(label + 136, 0); pe = label + 148 + basetable->gpt_entries * 16; sum = 0; for (p = label; p < pe; p += 2) sum ^= le16dec(p); le16enc(label + 136, sum); error = g_write_data(cp, 0, table->bbarea, BBSIZE); return (error); } Index: head/sys/geom/part/g_part_ebr.c =================================================================== --- head/sys/geom/part/g_part_ebr.c (revision 298353) +++ head/sys/geom/part/g_part_ebr.c (revision 298354) @@ -1,696 +1,694 @@ /*- * Copyright (c) 2007-2009 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "opt_geom.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "g_part_if.h" FEATURE(geom_part_ebr, "GEOM partitioning class for extended boot records support"); #if defined(GEOM_PART_EBR_COMPAT) FEATURE(geom_part_ebr_compat, "GEOM EBR partitioning class: backward-compatible partition names"); #endif #define EBRSIZE 512 struct g_part_ebr_table { struct g_part_table base; #ifndef GEOM_PART_EBR_COMPAT u_char ebr[EBRSIZE]; #endif }; struct g_part_ebr_entry { struct g_part_entry base; struct dos_partition ent; }; static int g_part_ebr_add(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static int g_part_ebr_create(struct g_part_table *, struct g_part_parms *); static int g_part_ebr_destroy(struct g_part_table *, struct g_part_parms *); static void g_part_ebr_dumpconf(struct g_part_table *, struct g_part_entry *, struct sbuf *, const char *); static int g_part_ebr_dumpto(struct g_part_table *, struct g_part_entry *); #if defined(GEOM_PART_EBR_COMPAT) static void g_part_ebr_fullname(struct g_part_table *, struct g_part_entry *, struct sbuf *, const char *); #endif static int g_part_ebr_modify(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static const char *g_part_ebr_name(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_ebr_precheck(struct g_part_table *, enum g_part_ctl, struct g_part_parms *); static int g_part_ebr_probe(struct g_part_table *, struct g_consumer *); static int g_part_ebr_read(struct g_part_table *, struct g_consumer *); static int g_part_ebr_setunset(struct g_part_table *, struct g_part_entry *, const char *, unsigned int); static const char *g_part_ebr_type(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_ebr_write(struct g_part_table *, struct g_consumer *); static int g_part_ebr_resize(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static kobj_method_t g_part_ebr_methods[] = { KOBJMETHOD(g_part_add, g_part_ebr_add), KOBJMETHOD(g_part_create, g_part_ebr_create), KOBJMETHOD(g_part_destroy, g_part_ebr_destroy), KOBJMETHOD(g_part_dumpconf, g_part_ebr_dumpconf), KOBJMETHOD(g_part_dumpto, g_part_ebr_dumpto), #if defined(GEOM_PART_EBR_COMPAT) KOBJMETHOD(g_part_fullname, g_part_ebr_fullname), #endif KOBJMETHOD(g_part_modify, g_part_ebr_modify), KOBJMETHOD(g_part_name, g_part_ebr_name), KOBJMETHOD(g_part_precheck, g_part_ebr_precheck), KOBJMETHOD(g_part_probe, g_part_ebr_probe), KOBJMETHOD(g_part_read, g_part_ebr_read), KOBJMETHOD(g_part_resize, g_part_ebr_resize), KOBJMETHOD(g_part_setunset, g_part_ebr_setunset), KOBJMETHOD(g_part_type, g_part_ebr_type), KOBJMETHOD(g_part_write, g_part_ebr_write), { 0, 0 } }; static struct g_part_scheme g_part_ebr_scheme = { "EBR", g_part_ebr_methods, sizeof(struct g_part_ebr_table), .gps_entrysz = sizeof(struct g_part_ebr_entry), .gps_minent = 1, .gps_maxent = INT_MAX, }; G_PART_SCHEME_DECLARE(g_part_ebr); static struct g_part_ebr_alias { u_char typ; int alias; } ebr_alias_match[] = { { DOSPTYP_386BSD, G_PART_ALIAS_FREEBSD }, { DOSPTYP_NTFS, G_PART_ALIAS_MS_NTFS }, { DOSPTYP_FAT32, G_PART_ALIAS_MS_FAT32 }, { DOSPTYP_LINSWP, G_PART_ALIAS_LINUX_SWAP }, { DOSPTYP_LINUX, G_PART_ALIAS_LINUX_DATA }, { DOSPTYP_LINLVM, G_PART_ALIAS_LINUX_LVM }, { DOSPTYP_LINRAID, G_PART_ALIAS_LINUX_RAID }, }; static void ebr_set_chs(struct g_part_table *, uint32_t, u_char *, u_char *, u_char *); static void ebr_entry_decode(const char *p, struct dos_partition *ent) { ent->dp_flag = p[0]; ent->dp_shd = p[1]; ent->dp_ssect = p[2]; ent->dp_scyl = p[3]; ent->dp_typ = p[4]; ent->dp_ehd = p[5]; ent->dp_esect = p[6]; ent->dp_ecyl = p[7]; ent->dp_start = le32dec(p + 8); ent->dp_size = le32dec(p + 12); } static void ebr_entry_link(struct g_part_table *table, uint32_t start, uint32_t end, u_char *buf) { buf[0] = 0 /* dp_flag */; ebr_set_chs(table, start, &buf[3] /* dp_scyl */, &buf[1] /* dp_shd */, &buf[2] /* dp_ssect */); buf[4] = 5 /* dp_typ */; ebr_set_chs(table, end, &buf[7] /* dp_ecyl */, &buf[5] /* dp_ehd */, &buf[6] /* dp_esect */); le32enc(buf + 8, start); le32enc(buf + 12, end - start + 1); } static int ebr_parse_type(const char *type, u_char *dp_typ) { const char *alias; char *endp; long lt; int i; if (type[0] == '!') { lt = strtol(type + 1, &endp, 0); if (type[1] == '\0' || *endp != '\0' || lt <= 0 || lt >= 256) return (EINVAL); *dp_typ = (u_char)lt; return (0); } - for (i = 0; - i < nitems(ebr_alias_match); i++) { + for (i = 0; i < nitems(ebr_alias_match); i++) { alias = g_part_alias_name(ebr_alias_match[i].alias); if (strcasecmp(type, alias) == 0) { *dp_typ = ebr_alias_match[i].typ; return (0); } } return (EINVAL); } static void ebr_set_chs(struct g_part_table *table, uint32_t lba, u_char *cylp, u_char *hdp, u_char *secp) { uint32_t cyl, hd, sec; sec = lba % table->gpt_sectors + 1; lba /= table->gpt_sectors; hd = lba % table->gpt_heads; lba /= table->gpt_heads; cyl = lba; if (cyl > 1023) sec = hd = cyl = ~0; *cylp = cyl & 0xff; *hdp = hd & 0xff; *secp = (sec & 0x3f) | ((cyl >> 2) & 0xc0); } static int ebr_align(struct g_part_table *basetable, uint32_t *start, uint32_t *size) { uint32_t sectors; sectors = basetable->gpt_sectors; if (*size < 2 * sectors) return (EINVAL); if (*start % sectors) { *size += (*start % sectors) - sectors; *start -= (*start % sectors) - sectors; } if (*size % sectors) *size -= (*size % sectors); if (*size < 2 * sectors) return (EINVAL); return (0); } static int g_part_ebr_add(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_provider *pp; struct g_part_ebr_entry *entry; uint32_t start, size; if (gpp->gpp_parms & G_PART_PARM_LABEL) return (EINVAL); pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider; entry = (struct g_part_ebr_entry *)baseentry; start = gpp->gpp_start; size = gpp->gpp_size; if (ebr_align(basetable, &start, &size) != 0) return (EINVAL); if (baseentry->gpe_deleted) bzero(&entry->ent, sizeof(entry->ent)); KASSERT(baseentry->gpe_start <= start, ("%s", __func__)); KASSERT(baseentry->gpe_end >= start + size - 1, ("%s", __func__)); baseentry->gpe_index = (start / basetable->gpt_sectors) + 1; baseentry->gpe_offset = (off_t)(start + basetable->gpt_sectors) * pp->sectorsize; baseentry->gpe_start = start; baseentry->gpe_end = start + size - 1; entry->ent.dp_start = basetable->gpt_sectors; entry->ent.dp_size = size - basetable->gpt_sectors; ebr_set_chs(basetable, entry->ent.dp_start, &entry->ent.dp_scyl, &entry->ent.dp_shd, &entry->ent.dp_ssect); ebr_set_chs(basetable, baseentry->gpe_end, &entry->ent.dp_ecyl, &entry->ent.dp_ehd, &entry->ent.dp_esect); return (ebr_parse_type(gpp->gpp_type, &entry->ent.dp_typ)); } static int g_part_ebr_create(struct g_part_table *basetable, struct g_part_parms *gpp) { char type[64]; struct g_consumer *cp; struct g_provider *pp; uint32_t msize; int error; pp = gpp->gpp_provider; if (pp->sectorsize < EBRSIZE) return (ENOSPC); if (pp->sectorsize > 4096) return (ENXIO); /* Check that we have a parent and that it's a MBR. */ if (basetable->gpt_depth == 0) return (ENXIO); cp = LIST_FIRST(&pp->consumers); error = g_getattr("PART::scheme", cp, &type); if (error != 0) return (error); if (strcmp(type, "MBR") != 0) return (ENXIO); error = g_getattr("PART::type", cp, &type); if (error != 0) return (error); if (strcmp(type, "ebr") != 0) return (ENXIO); msize = MIN(pp->mediasize / pp->sectorsize, UINT32_MAX); basetable->gpt_first = 0; basetable->gpt_last = msize - 1; basetable->gpt_entries = msize / basetable->gpt_sectors; return (0); } static int g_part_ebr_destroy(struct g_part_table *basetable, struct g_part_parms *gpp) { /* Wipe the first sector to clear the partitioning. */ basetable->gpt_smhead |= 1; return (0); } static void g_part_ebr_dumpconf(struct g_part_table *table, struct g_part_entry *baseentry, struct sbuf *sb, const char *indent) { struct g_part_ebr_entry *entry; entry = (struct g_part_ebr_entry *)baseentry; if (indent == NULL) { /* conftxt: libdisk compatibility */ sbuf_printf(sb, " xs MBREXT xt %u", entry->ent.dp_typ); } else if (entry != NULL) { /* confxml: partition entry information */ sbuf_printf(sb, "%s%u\n", indent, entry->ent.dp_typ); if (entry->ent.dp_flag & 0x80) sbuf_printf(sb, "%sactive\n", indent); } else { /* confxml: scheme information */ } } static int g_part_ebr_dumpto(struct g_part_table *table, struct g_part_entry *baseentry) { struct g_part_ebr_entry *entry; /* Allow dumping to a FreeBSD partition or Linux swap partition only. */ entry = (struct g_part_ebr_entry *)baseentry; return ((entry->ent.dp_typ == DOSPTYP_386BSD || entry->ent.dp_typ == DOSPTYP_LINSWP) ? 1 : 0); } #if defined(GEOM_PART_EBR_COMPAT) static void g_part_ebr_fullname(struct g_part_table *table, struct g_part_entry *entry, struct sbuf *sb, const char *pfx) { struct g_part_entry *iter; u_int idx; idx = 5; LIST_FOREACH(iter, &table->gpt_entry, gpe_entry) { if (iter == entry) break; idx++; } sbuf_printf(sb, "%.*s%u", (int)strlen(pfx) - 1, pfx, idx); } #endif static int g_part_ebr_modify(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_part_ebr_entry *entry; if (gpp->gpp_parms & G_PART_PARM_LABEL) return (EINVAL); entry = (struct g_part_ebr_entry *)baseentry; if (gpp->gpp_parms & G_PART_PARM_TYPE) return (ebr_parse_type(gpp->gpp_type, &entry->ent.dp_typ)); return (0); } static int g_part_ebr_resize(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_provider *pp; if (baseentry != NULL) return (EOPNOTSUPP); pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider; basetable->gpt_last = MIN(pp->mediasize / pp->sectorsize, UINT32_MAX) - 1; return (0); } static const char * g_part_ebr_name(struct g_part_table *table, struct g_part_entry *entry, char *buf, size_t bufsz) { snprintf(buf, bufsz, "+%08u", entry->gpe_index); return (buf); } static int g_part_ebr_precheck(struct g_part_table *table, enum g_part_ctl req, struct g_part_parms *gpp) { #if defined(GEOM_PART_EBR_COMPAT) if (req == G_PART_CTL_DESTROY) return (0); return (ECANCELED); #else /* * The index is a function of the start of the partition. * This is not something the user can override, nor is it * something the common code will do right. We can set the * index now so that we get what we need. */ if (req == G_PART_CTL_ADD) gpp->gpp_index = (gpp->gpp_start / table->gpt_sectors) + 1; return (0); #endif } static int g_part_ebr_probe(struct g_part_table *table, struct g_consumer *cp) { char type[64]; struct g_provider *pp; u_char *buf, *p; int error, index, res; uint16_t magic; pp = cp->provider; /* Sanity-check the provider. */ if (pp->sectorsize < EBRSIZE || pp->mediasize < pp->sectorsize) return (ENOSPC); if (pp->sectorsize > 4096) return (ENXIO); /* Check that we have a parent and that it's a MBR. */ if (table->gpt_depth == 0) return (ENXIO); error = g_getattr("PART::scheme", cp, &type); if (error != 0) return (error); if (strcmp(type, "MBR") != 0) return (ENXIO); /* Check that partition has type DOSPTYP_EBR. */ error = g_getattr("PART::type", cp, &type); if (error != 0) return (error); if (strcmp(type, "ebr") != 0) return (ENXIO); /* Check that there's a EBR. */ buf = g_read_data(cp, 0L, pp->sectorsize, &error); if (buf == NULL) return (error); /* We goto out on mismatch. */ res = ENXIO; magic = le16dec(buf + DOSMAGICOFFSET); if (magic != DOSMAGIC) goto out; for (index = 0; index < 2; index++) { p = buf + DOSPARTOFF + index * DOSPARTSIZE; if (p[0] != 0 && p[0] != 0x80) goto out; } res = G_PART_PROBE_PRI_NORM; out: g_free(buf); return (res); } static int g_part_ebr_read(struct g_part_table *basetable, struct g_consumer *cp) { struct dos_partition ent[2]; struct g_provider *pp; struct g_part_entry *baseentry; struct g_part_ebr_table *table; struct g_part_ebr_entry *entry; u_char *buf; off_t ofs, msize; u_int lba; int error, index; pp = cp->provider; table = (struct g_part_ebr_table *)basetable; msize = MIN(pp->mediasize / pp->sectorsize, UINT32_MAX); lba = 0; while (1) { ofs = (off_t)lba * pp->sectorsize; buf = g_read_data(cp, ofs, pp->sectorsize, &error); if (buf == NULL) return (error); ebr_entry_decode(buf + DOSPARTOFF + 0 * DOSPARTSIZE, ent + 0); ebr_entry_decode(buf + DOSPARTOFF + 1 * DOSPARTSIZE, ent + 1); /* The 3rd & 4th entries should be zeroes. */ if (le64dec(buf + DOSPARTOFF + 2 * DOSPARTSIZE) + le64dec(buf + DOSPARTOFF + 3 * DOSPARTSIZE) != 0) { basetable->gpt_corrupt = 1; printf("GEOM: %s: invalid entries in the EBR ignored.\n", pp->name); } #ifndef GEOM_PART_EBR_COMPAT /* Save the first EBR, it can contain a boot code */ if (lba == 0) bcopy(buf, table->ebr, sizeof(table->ebr)); #endif g_free(buf); if (ent[0].dp_typ == 0) break; if (ent[0].dp_typ == 5 && ent[1].dp_typ == 0) { lba = ent[0].dp_start; continue; } index = (lba / basetable->gpt_sectors) + 1; baseentry = (struct g_part_entry *)g_part_new_entry(basetable, index, lba, lba + ent[0].dp_start + ent[0].dp_size - 1); baseentry->gpe_offset = (off_t)(lba + ent[0].dp_start) * pp->sectorsize; entry = (struct g_part_ebr_entry *)baseentry; entry->ent = ent[0]; if (ent[1].dp_typ == 0) break; lba = ent[1].dp_start; } basetable->gpt_entries = msize / basetable->gpt_sectors; basetable->gpt_first = 0; basetable->gpt_last = msize - 1; return (0); } static int g_part_ebr_setunset(struct g_part_table *table, struct g_part_entry *baseentry, const char *attrib, unsigned int set) { struct g_part_entry *iter; struct g_part_ebr_entry *entry; int changed; if (baseentry == NULL) return (ENODEV); if (strcasecmp(attrib, "active") != 0) return (EINVAL); /* Only one entry can have the active attribute. */ LIST_FOREACH(iter, &table->gpt_entry, gpe_entry) { if (iter->gpe_deleted) continue; changed = 0; entry = (struct g_part_ebr_entry *)iter; if (iter == baseentry) { if (set && (entry->ent.dp_flag & 0x80) == 0) { entry->ent.dp_flag |= 0x80; changed = 1; } else if (!set && (entry->ent.dp_flag & 0x80)) { entry->ent.dp_flag &= ~0x80; changed = 1; } } else { if (set && (entry->ent.dp_flag & 0x80)) { entry->ent.dp_flag &= ~0x80; changed = 1; } } if (changed && !iter->gpe_created) iter->gpe_modified = 1; } return (0); } static const char * g_part_ebr_type(struct g_part_table *basetable, struct g_part_entry *baseentry, char *buf, size_t bufsz) { struct g_part_ebr_entry *entry; int i; entry = (struct g_part_ebr_entry *)baseentry; - for (i = 0; - i < nitems(ebr_alias_match); i++) { + for (i = 0; i < nitems(ebr_alias_match); i++) { if (ebr_alias_match[i].typ == entry->ent.dp_typ) return (g_part_alias_name(ebr_alias_match[i].alias)); } snprintf(buf, bufsz, "!%d", entry->ent.dp_typ); return (buf); } static int g_part_ebr_write(struct g_part_table *basetable, struct g_consumer *cp) { #ifndef GEOM_PART_EBR_COMPAT struct g_part_ebr_table *table; #endif struct g_provider *pp; struct g_part_entry *baseentry, *next; struct g_part_ebr_entry *entry; u_char *buf; u_char *p; int error; pp = cp->provider; buf = g_malloc(pp->sectorsize, M_WAITOK | M_ZERO); #ifndef GEOM_PART_EBR_COMPAT table = (struct g_part_ebr_table *)basetable; bcopy(table->ebr, buf, DOSPARTOFF); #endif le16enc(buf + DOSMAGICOFFSET, DOSMAGIC); baseentry = LIST_FIRST(&basetable->gpt_entry); while (baseentry != NULL && baseentry->gpe_deleted) baseentry = LIST_NEXT(baseentry, gpe_entry); /* Wipe-out the first EBR when there are no slices. */ if (baseentry == NULL) { error = g_write_data(cp, 0, buf, pp->sectorsize); goto out; } /* * If the first partition is not in LBA 0, we need to * put a "link" EBR in LBA 0. */ if (baseentry->gpe_start != 0) { ebr_entry_link(basetable, (uint32_t)baseentry->gpe_start, (uint32_t)baseentry->gpe_end, buf + DOSPARTOFF); error = g_write_data(cp, 0, buf, pp->sectorsize); if (error) goto out; } do { entry = (struct g_part_ebr_entry *)baseentry; p = buf + DOSPARTOFF; p[0] = entry->ent.dp_flag; p[1] = entry->ent.dp_shd; p[2] = entry->ent.dp_ssect; p[3] = entry->ent.dp_scyl; p[4] = entry->ent.dp_typ; p[5] = entry->ent.dp_ehd; p[6] = entry->ent.dp_esect; p[7] = entry->ent.dp_ecyl; le32enc(p + 8, entry->ent.dp_start); le32enc(p + 12, entry->ent.dp_size); next = LIST_NEXT(baseentry, gpe_entry); while (next != NULL && next->gpe_deleted) next = LIST_NEXT(next, gpe_entry); p += DOSPARTSIZE; if (next != NULL) ebr_entry_link(basetable, (uint32_t)next->gpe_start, (uint32_t)next->gpe_end, p); else bzero(p, DOSPARTSIZE); error = g_write_data(cp, baseentry->gpe_start * pp->sectorsize, buf, pp->sectorsize); #ifndef GEOM_PART_EBR_COMPAT if (baseentry->gpe_start == 0) bzero(buf, DOSPARTOFF); #endif baseentry = next; } while (!error && baseentry != NULL); out: g_free(buf); return (error); } Index: head/sys/geom/part/g_part_ldm.c =================================================================== --- head/sys/geom/part/g_part_ldm.c (revision 298353) +++ head/sys/geom/part/g_part_ldm.c (revision 298354) @@ -1,1485 +1,1483 @@ /*- * Copyright (c) 2012 Andrey V. Elsukov * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "g_part_if.h" FEATURE(geom_part_ldm, "GEOM partitioning class for LDM support"); SYSCTL_DECL(_kern_geom_part); static SYSCTL_NODE(_kern_geom_part, OID_AUTO, ldm, CTLFLAG_RW, 0, "GEOM_PART_LDM Logical Disk Manager"); static u_int ldm_debug = 0; SYSCTL_UINT(_kern_geom_part_ldm, OID_AUTO, debug, CTLFLAG_RWTUN, &ldm_debug, 0, "Debug level"); /* * This allows access to mirrored LDM volumes. Since we do not * doing mirroring here, it is not enabled by default. */ static u_int show_mirrors = 0; SYSCTL_UINT(_kern_geom_part_ldm, OID_AUTO, show_mirrors, CTLFLAG_RWTUN, &show_mirrors, 0, "Show mirrored volumes"); #define LDM_DEBUG(lvl, fmt, ...) do { \ if (ldm_debug >= (lvl)) { \ printf("GEOM_PART: " fmt "\n", __VA_ARGS__); \ } \ } while (0) #define LDM_DUMP(buf, size) do { \ if (ldm_debug > 1) { \ hexdump(buf, size, NULL, 0); \ } \ } while (0) /* * There are internal representations of LDM structures. * * We do not keep all fields of on-disk structures, only most useful. * All numbers in an on-disk structures are in big-endian format. */ /* * Private header is 512 bytes long. There are three copies on each disk. * Offset and sizes are in sectors. Location of each copy: * - the first offset is relative to the disk start; * - the second and third offset are relative to the LDM database start. * * On a disk partitioned with GPT, the LDM has not first private header. */ #define LDM_PH_MBRINDEX 0 #define LDM_PH_GPTINDEX 2 static const uint64_t ldm_ph_off[] = {6, 1856, 2047}; #define LDM_VERSION_2K 0x2000b #define LDM_VERSION_VISTA 0x2000c #define LDM_PH_VERSION_OFF 0x00c #define LDM_PH_DISKGUID_OFF 0x030 #define LDM_PH_DGGUID_OFF 0x0b0 #define LDM_PH_DGNAME_OFF 0x0f0 #define LDM_PH_START_OFF 0x11b #define LDM_PH_SIZE_OFF 0x123 #define LDM_PH_DB_OFF 0x12b #define LDM_PH_DBSIZE_OFF 0x133 #define LDM_PH_TH1_OFF 0x13b #define LDM_PH_TH2_OFF 0x143 #define LDM_PH_CONFSIZE_OFF 0x153 #define LDM_PH_LOGSIZE_OFF 0x15b #define LDM_PH_SIGN "PRIVHEAD" struct ldm_privhdr { struct uuid disk_guid; struct uuid dg_guid; u_char dg_name[32]; uint64_t start; /* logical disk start */ uint64_t size; /* logical disk size */ uint64_t db_offset; /* LDM database start */ #define LDM_DB_SIZE 2048 uint64_t db_size; /* LDM database size */ #define LDM_TH_COUNT 2 uint64_t th_offset[LDM_TH_COUNT]; /* TOC header offsets */ uint64_t conf_size; /* configuration size */ uint64_t log_size; /* size of log */ }; /* * Table of contents header is 512 bytes long. * There are two identical copies at offsets from the private header. * Offsets are relative to the LDM database start. */ #define LDM_TH_SIGN "TOCBLOCK" #define LDM_TH_NAME1 "config" #define LDM_TH_NAME2 "log" #define LDM_TH_NAME1_OFF 0x024 #define LDM_TH_CONF_OFF 0x02e #define LDM_TH_CONFSIZE_OFF 0x036 #define LDM_TH_NAME2_OFF 0x046 #define LDM_TH_LOG_OFF 0x050 #define LDM_TH_LOGSIZE_OFF 0x058 struct ldm_tochdr { uint64_t conf_offset; /* configuration offset */ uint64_t log_offset; /* log offset */ }; /* * LDM database header is 512 bytes long. */ #define LDM_VMDB_SIGN "VMDB" #define LDM_DB_LASTSEQ_OFF 0x004 #define LDM_DB_SIZE_OFF 0x008 #define LDM_DB_STATUS_OFF 0x010 #define LDM_DB_VERSION_OFF 0x012 #define LDM_DB_DGNAME_OFF 0x016 #define LDM_DB_DGGUID_OFF 0x035 struct ldm_vmdbhdr { uint32_t last_seq; /* sequence number of last VBLK */ uint32_t size; /* size of VBLK */ }; /* * The LDM database configuration section contains VMDB header and * many VBLKs. Each VBLK represents a disk group, disk partition, * component or volume. * * The most interesting for us are volumes, they are represents * partitions in the GEOM_PART meaning. But volume VBLK does not * contain all information needed to create GEOM provider. And we * should get this information from the related VBLK. This is how * VBLK releated: * Volumes <- Components <- Partitions -> Disks * * One volume can contain several components. In this case LDM * does mirroring of volume data to each component. * * Also each component can contain several partitions (spanned or * striped volumes). */ struct ldm_component { uint64_t id; /* object id */ uint64_t vol_id; /* parent volume object id */ int count; LIST_HEAD(, ldm_partition) partitions; LIST_ENTRY(ldm_component) entry; }; struct ldm_volume { uint64_t id; /* object id */ uint64_t size; /* volume size */ uint8_t number; /* used for ordering */ uint8_t part_type; /* partition type */ int count; LIST_HEAD(, ldm_component) components; LIST_ENTRY(ldm_volume) entry; }; struct ldm_disk { uint64_t id; /* object id */ struct uuid guid; /* disk guid */ LIST_ENTRY(ldm_disk) entry; }; #if 0 struct ldm_disk_group { uint64_t id; /* object id */ struct uuid guid; /* disk group guid */ u_char name[32]; /* disk group name */ LIST_ENTRY(ldm_disk_group) entry; }; #endif struct ldm_partition { uint64_t id; /* object id */ uint64_t disk_id; /* disk object id */ uint64_t comp_id; /* parent component object id */ uint64_t start; /* offset relative to disk start */ uint64_t offset; /* offset for spanned volumes */ uint64_t size; /* partition size */ LIST_ENTRY(ldm_partition) entry; }; /* * Each VBLK is 128 bytes long and has standard 16 bytes header. * Some of VBLK's fields are fixed size, but others has variable size. * Fields with variable size are prefixed with one byte length marker. * Some fields are strings and also can have fixed size and variable. * Strings with fixed size are NULL-terminated, others are not. * All VBLKs have same several first fields: * Offset Size Description * ---------------+---------------+-------------------------- * 0x00 16 standard VBLK header * 0x10 2 update status * 0x13 1 VBLK type * 0x18 PS object id * 0x18+ PN object name * * o Offset 0x18+ means '0x18 + length of all variable-width fields' * o 'P' in size column means 'prefixed' (variable-width), * 'S' - string, 'N' - number. */ #define LDM_VBLK_SIGN "VBLK" #define LDM_VBLK_SEQ_OFF 0x04 #define LDM_VBLK_GROUP_OFF 0x08 #define LDM_VBLK_INDEX_OFF 0x0c #define LDM_VBLK_COUNT_OFF 0x0e #define LDM_VBLK_TYPE_OFF 0x13 #define LDM_VBLK_OID_OFF 0x18 struct ldm_vblkhdr { uint32_t seq; /* sequence number */ uint32_t group; /* group number */ uint16_t index; /* index in the group */ uint16_t count; /* number of entries in the group */ }; #define LDM_VBLK_T_COMPONENT 0x32 #define LDM_VBLK_T_PARTITION 0x33 #define LDM_VBLK_T_DISK 0x34 #define LDM_VBLK_T_DISKGROUP 0x35 #define LDM_VBLK_T_DISK4 0x44 #define LDM_VBLK_T_DISKGROUP4 0x45 #define LDM_VBLK_T_VOLUME 0x51 struct ldm_vblk { uint8_t type; /* VBLK type */ union { uint64_t id; struct ldm_volume vol; struct ldm_component comp; struct ldm_disk disk; struct ldm_partition part; #if 0 struct ldm_disk_group disk_group; #endif } u; LIST_ENTRY(ldm_vblk) entry; }; /* * Some VBLKs contains a bit more data than can fit into 128 bytes. These * VBLKs are called eXtended VBLK. Before parsing, the data from these VBLK * should be placed into continuous memory buffer. We can determine xVBLK * by the count field in the standard VBLK header (count > 1). */ struct ldm_xvblk { uint32_t group; /* xVBLK group number */ uint32_t size; /* the total size of xVBLK */ uint8_t map; /* bitmask of currently saved VBLKs */ u_char *data; /* xVBLK data */ LIST_ENTRY(ldm_xvblk) entry; }; /* The internal representation of LDM database. */ struct ldm_db { struct ldm_privhdr ph; /* private header */ struct ldm_tochdr th; /* TOC header */ struct ldm_vmdbhdr dh; /* VMDB header */ LIST_HEAD(, ldm_volume) volumes; LIST_HEAD(, ldm_disk) disks; LIST_HEAD(, ldm_vblk) vblks; LIST_HEAD(, ldm_xvblk) xvblks; }; static struct uuid gpt_uuid_ms_ldm_metadata = GPT_ENT_TYPE_MS_LDM_METADATA; struct g_part_ldm_table { struct g_part_table base; uint64_t db_offset; int is_gpt; }; struct g_part_ldm_entry { struct g_part_entry base; uint8_t type; }; static int g_part_ldm_add(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static int g_part_ldm_bootcode(struct g_part_table *, struct g_part_parms *); static int g_part_ldm_create(struct g_part_table *, struct g_part_parms *); static int g_part_ldm_destroy(struct g_part_table *, struct g_part_parms *); static void g_part_ldm_dumpconf(struct g_part_table *, struct g_part_entry *, struct sbuf *, const char *); static int g_part_ldm_dumpto(struct g_part_table *, struct g_part_entry *); static int g_part_ldm_modify(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static const char *g_part_ldm_name(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_ldm_probe(struct g_part_table *, struct g_consumer *); static int g_part_ldm_read(struct g_part_table *, struct g_consumer *); static const char *g_part_ldm_type(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_ldm_write(struct g_part_table *, struct g_consumer *); static kobj_method_t g_part_ldm_methods[] = { KOBJMETHOD(g_part_add, g_part_ldm_add), KOBJMETHOD(g_part_bootcode, g_part_ldm_bootcode), KOBJMETHOD(g_part_create, g_part_ldm_create), KOBJMETHOD(g_part_destroy, g_part_ldm_destroy), KOBJMETHOD(g_part_dumpconf, g_part_ldm_dumpconf), KOBJMETHOD(g_part_dumpto, g_part_ldm_dumpto), KOBJMETHOD(g_part_modify, g_part_ldm_modify), KOBJMETHOD(g_part_name, g_part_ldm_name), KOBJMETHOD(g_part_probe, g_part_ldm_probe), KOBJMETHOD(g_part_read, g_part_ldm_read), KOBJMETHOD(g_part_type, g_part_ldm_type), KOBJMETHOD(g_part_write, g_part_ldm_write), { 0, 0 } }; static struct g_part_scheme g_part_ldm_scheme = { "LDM", g_part_ldm_methods, sizeof(struct g_part_ldm_table), .gps_entrysz = sizeof(struct g_part_ldm_entry) }; G_PART_SCHEME_DECLARE(g_part_ldm); static struct g_part_ldm_alias { u_char typ; int alias; } ldm_alias_match[] = { { DOSPTYP_NTFS, G_PART_ALIAS_MS_NTFS }, { DOSPTYP_FAT32, G_PART_ALIAS_MS_FAT32 }, { DOSPTYP_386BSD, G_PART_ALIAS_FREEBSD }, { DOSPTYP_LDM, G_PART_ALIAS_MS_LDM_DATA }, { DOSPTYP_LINSWP, G_PART_ALIAS_LINUX_SWAP }, { DOSPTYP_LINUX, G_PART_ALIAS_LINUX_DATA }, { DOSPTYP_LINLVM, G_PART_ALIAS_LINUX_LVM }, { DOSPTYP_LINRAID, G_PART_ALIAS_LINUX_RAID }, }; static u_char* ldm_privhdr_read(struct g_consumer *cp, uint64_t off, int *error) { struct g_provider *pp; u_char *buf; pp = cp->provider; buf = g_read_data(cp, off, pp->sectorsize, error); if (buf == NULL) return (NULL); if (memcmp(buf, LDM_PH_SIGN, strlen(LDM_PH_SIGN)) != 0) { LDM_DEBUG(1, "%s: invalid LDM private header signature", pp->name); g_free(buf); buf = NULL; *error = EINVAL; } return (buf); } static int ldm_privhdr_parse(struct g_consumer *cp, struct ldm_privhdr *hdr, const u_char *buf) { uint32_t version; int error; memset(hdr, 0, sizeof(*hdr)); version = be32dec(buf + LDM_PH_VERSION_OFF); if (version != LDM_VERSION_2K && version != LDM_VERSION_VISTA) { LDM_DEBUG(0, "%s: unsupported LDM version %u.%u", cp->provider->name, version >> 16, version & 0xFFFF); return (ENXIO); } error = parse_uuid(buf + LDM_PH_DISKGUID_OFF, &hdr->disk_guid); if (error != 0) return (error); error = parse_uuid(buf + LDM_PH_DGGUID_OFF, &hdr->dg_guid); if (error != 0) return (error); strncpy(hdr->dg_name, buf + LDM_PH_DGNAME_OFF, sizeof(hdr->dg_name)); hdr->start = be64dec(buf + LDM_PH_START_OFF); hdr->size = be64dec(buf + LDM_PH_SIZE_OFF); hdr->db_offset = be64dec(buf + LDM_PH_DB_OFF); hdr->db_size = be64dec(buf + LDM_PH_DBSIZE_OFF); hdr->th_offset[0] = be64dec(buf + LDM_PH_TH1_OFF); hdr->th_offset[1] = be64dec(buf + LDM_PH_TH2_OFF); hdr->conf_size = be64dec(buf + LDM_PH_CONFSIZE_OFF); hdr->log_size = be64dec(buf + LDM_PH_LOGSIZE_OFF); return (0); } static int ldm_privhdr_check(struct ldm_db *db, struct g_consumer *cp, int is_gpt) { struct g_consumer *cp2; struct g_provider *pp; struct ldm_privhdr hdr; uint64_t offset, last; int error, found, i; u_char *buf; pp = cp->provider; if (is_gpt) { /* * The last LBA is used in several checks below, for the * GPT case it should be calculated relative to the whole * disk. */ cp2 = LIST_FIRST(&pp->geom->consumer); last = cp2->provider->mediasize / cp2->provider->sectorsize - 1; } else last = pp->mediasize / pp->sectorsize - 1; - for (found = 0, i = is_gpt; - i < nitems(ldm_ph_off); i++) { + for (found = 0, i = is_gpt; i < nitems(ldm_ph_off); i++) { offset = ldm_ph_off[i]; /* * In the GPT case consumer is attached to the LDM metadata * partition and we don't need add db_offset. */ if (!is_gpt) offset += db->ph.db_offset; if (i == LDM_PH_MBRINDEX) { /* * Prepare to errors and setup new base offset * to read backup private headers. Assume that LDM * database is in the last 1Mbyte area. */ db->ph.db_offset = last - LDM_DB_SIZE; } buf = ldm_privhdr_read(cp, offset * pp->sectorsize, &error); if (buf == NULL) { LDM_DEBUG(1, "%s: failed to read private header " "%d at LBA %ju", pp->name, i, (uintmax_t)offset); continue; } error = ldm_privhdr_parse(cp, &hdr, buf); if (error != 0) { LDM_DEBUG(1, "%s: failed to parse private " "header %d", pp->name, i); LDM_DUMP(buf, pp->sectorsize); g_free(buf); continue; } g_free(buf); if (hdr.start > last || hdr.start + hdr.size - 1 > last || (hdr.start + hdr.size - 1 > hdr.db_offset && !is_gpt) || hdr.db_size != LDM_DB_SIZE || hdr.db_offset + LDM_DB_SIZE - 1 > last || hdr.th_offset[0] >= LDM_DB_SIZE || hdr.th_offset[1] >= LDM_DB_SIZE || hdr.conf_size + hdr.log_size >= LDM_DB_SIZE) { LDM_DEBUG(1, "%s: invalid values in the " "private header %d", pp->name, i); LDM_DEBUG(2, "%s: start: %jd, size: %jd, " "db_offset: %jd, db_size: %jd, th_offset0: %jd, " "th_offset1: %jd, conf_size: %jd, log_size: %jd, " "last: %jd", pp->name, hdr.start, hdr.size, hdr.db_offset, hdr.db_size, hdr.th_offset[0], hdr.th_offset[1], hdr.conf_size, hdr.log_size, last); continue; } if (found != 0 && memcmp(&db->ph, &hdr, sizeof(hdr)) != 0) { LDM_DEBUG(0, "%s: private headers are not equal", pp->name); if (i > 1) { /* * We have different headers in the LDM. * We can not trust this metadata. */ LDM_DEBUG(0, "%s: refuse LDM metadata", pp->name); return (EINVAL); } /* * We already have read primary private header * and it differs from this backup one. * Prefer the backup header and save it. */ found = 0; } if (found == 0) memcpy(&db->ph, &hdr, sizeof(hdr)); found = 1; } if (found == 0) { LDM_DEBUG(1, "%s: valid LDM private header not found", pp->name); return (ENXIO); } return (0); } static int ldm_gpt_check(struct ldm_db *db, struct g_consumer *cp) { struct g_part_table *gpt; struct g_part_entry *e; struct g_consumer *cp2; int error; cp2 = LIST_NEXT(cp, consumer); g_topology_lock(); gpt = cp->provider->geom->softc; error = 0; LIST_FOREACH(e, &gpt->gpt_entry, gpe_entry) { if (cp->provider == e->gpe_pp) { /* ms-ldm-metadata partition */ if (e->gpe_start != db->ph.db_offset || e->gpe_end != db->ph.db_offset + LDM_DB_SIZE - 1) error++; } else if (cp2->provider == e->gpe_pp) { /* ms-ldm-data partition */ if (e->gpe_start != db->ph.start || e->gpe_end != db->ph.start + db->ph.size - 1) error++; } if (error != 0) { LDM_DEBUG(0, "%s: GPT partition %d boundaries " "do not match with the LDM metadata", e->gpe_pp->name, e->gpe_index); error = ENXIO; break; } } g_topology_unlock(); return (error); } static int ldm_tochdr_check(struct ldm_db *db, struct g_consumer *cp) { struct g_provider *pp; struct ldm_tochdr hdr; uint64_t offset, conf_size, log_size; int error, found, i; u_char *buf; pp = cp->provider; for (i = 0, found = 0; i < LDM_TH_COUNT; i++) { offset = db->ph.db_offset + db->ph.th_offset[i]; buf = g_read_data(cp, offset * pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) { LDM_DEBUG(1, "%s: failed to read TOC header " "at LBA %ju", pp->name, (uintmax_t)offset); continue; } if (memcmp(buf, LDM_TH_SIGN, strlen(LDM_TH_SIGN)) != 0 || memcmp(buf + LDM_TH_NAME1_OFF, LDM_TH_NAME1, strlen(LDM_TH_NAME1)) != 0 || memcmp(buf + LDM_TH_NAME2_OFF, LDM_TH_NAME2, strlen(LDM_TH_NAME2)) != 0) { LDM_DEBUG(1, "%s: failed to parse TOC header " "at LBA %ju", pp->name, (uintmax_t)offset); LDM_DUMP(buf, pp->sectorsize); g_free(buf); continue; } hdr.conf_offset = be64dec(buf + LDM_TH_CONF_OFF); hdr.log_offset = be64dec(buf + LDM_TH_LOG_OFF); conf_size = be64dec(buf + LDM_TH_CONFSIZE_OFF); log_size = be64dec(buf + LDM_TH_LOGSIZE_OFF); if (conf_size != db->ph.conf_size || hdr.conf_offset + conf_size >= LDM_DB_SIZE || log_size != db->ph.log_size || hdr.log_offset + log_size >= LDM_DB_SIZE) { LDM_DEBUG(1, "%s: invalid values in the " "TOC header at LBA %ju", pp->name, (uintmax_t)offset); LDM_DUMP(buf, pp->sectorsize); g_free(buf); continue; } g_free(buf); if (found == 0) memcpy(&db->th, &hdr, sizeof(hdr)); found = 1; } if (found == 0) { LDM_DEBUG(0, "%s: valid LDM TOC header not found.", pp->name); return (ENXIO); } return (0); } static int ldm_vmdbhdr_check(struct ldm_db *db, struct g_consumer *cp) { struct g_provider *pp; struct uuid dg_guid; uint64_t offset; uint32_t version; int error; u_char *buf; pp = cp->provider; offset = db->ph.db_offset + db->th.conf_offset; buf = g_read_data(cp, offset * pp->sectorsize, pp->sectorsize, &error); if (buf == NULL) { LDM_DEBUG(0, "%s: failed to read VMDB header at " "LBA %ju", pp->name, (uintmax_t)offset); return (error); } if (memcmp(buf, LDM_VMDB_SIGN, strlen(LDM_VMDB_SIGN)) != 0) { g_free(buf); LDM_DEBUG(0, "%s: failed to parse VMDB header at " "LBA %ju", pp->name, (uintmax_t)offset); return (ENXIO); } /* Check version. */ version = be32dec(buf + LDM_DB_VERSION_OFF); if (version != 0x4000A) { g_free(buf); LDM_DEBUG(0, "%s: unsupported VMDB version %u.%u", pp->name, version >> 16, version & 0xFFFF); return (ENXIO); } /* * Check VMDB update status: * 1 - in a consistent state; * 2 - in a creation phase; * 3 - in a deletion phase; */ if (be16dec(buf + LDM_DB_STATUS_OFF) != 1) { g_free(buf); LDM_DEBUG(0, "%s: VMDB is not in a consistent state", pp->name); return (ENXIO); } db->dh.last_seq = be32dec(buf + LDM_DB_LASTSEQ_OFF); db->dh.size = be32dec(buf + LDM_DB_SIZE_OFF); error = parse_uuid(buf + LDM_DB_DGGUID_OFF, &dg_guid); /* Compare disk group name and guid from VMDB and private headers */ if (error != 0 || db->dh.size == 0 || pp->sectorsize % db->dh.size != 0 || strncmp(buf + LDM_DB_DGNAME_OFF, db->ph.dg_name, 31) != 0 || memcmp(&dg_guid, &db->ph.dg_guid, sizeof(dg_guid)) != 0 || db->dh.size * db->dh.last_seq > db->ph.conf_size * pp->sectorsize) { LDM_DEBUG(0, "%s: invalid values in the VMDB header", pp->name); LDM_DUMP(buf, pp->sectorsize); g_free(buf); return (EINVAL); } g_free(buf); return (0); } static int ldm_xvblk_handle(struct ldm_db *db, struct ldm_vblkhdr *vh, const u_char *p) { struct ldm_xvblk *blk; size_t size; size = db->dh.size - 16; LIST_FOREACH(blk, &db->xvblks, entry) if (blk->group == vh->group) break; if (blk == NULL) { blk = g_malloc(sizeof(*blk), M_WAITOK | M_ZERO); blk->group = vh->group; blk->size = size * vh->count + 16; blk->data = g_malloc(blk->size, M_WAITOK | M_ZERO); blk->map = 0xFF << vh->count; LIST_INSERT_HEAD(&db->xvblks, blk, entry); } if ((blk->map & (1 << vh->index)) != 0) { /* Block with given index has been already saved. */ return (EINVAL); } /* Copy the data block to the place related to index. */ memcpy(blk->data + size * vh->index + 16, p + 16, size); blk->map |= 1 << vh->index; return (0); } /* Read the variable-width numeric field and return new offset */ static int ldm_vnum_get(const u_char *buf, int offset, uint64_t *result, size_t range) { uint64_t num; uint8_t len; len = buf[offset++]; if (len > sizeof(uint64_t) || len + offset >= range) return (-1); for (num = 0; len > 0; len--) num = (num << 8) | buf[offset++]; *result = num; return (offset); } /* Read the variable-width string and return new offset */ static int ldm_vstr_get(const u_char *buf, int offset, u_char *result, size_t maxlen, size_t range) { uint8_t len; len = buf[offset++]; if (len >= maxlen || len + offset >= range) return (-1); memcpy(result, buf + offset, len); result[len] = '\0'; return (offset + len); } /* Just skip the variable-width variable and return new offset */ static int ldm_vparm_skip(const u_char *buf, int offset, size_t range) { uint8_t len; len = buf[offset++]; if (offset + len >= range) return (-1); return (offset + len); } static int ldm_vblk_handle(struct ldm_db *db, const u_char *p, size_t size) { struct ldm_vblk *blk; struct ldm_volume *volume, *last; const char *errstr; u_char vstr[64]; int error, offset; blk = g_malloc(sizeof(*blk), M_WAITOK | M_ZERO); blk->type = p[LDM_VBLK_TYPE_OFF]; offset = ldm_vnum_get(p, LDM_VBLK_OID_OFF, &blk->u.id, size); if (offset < 0) { errstr = "object id"; goto fail; } offset = ldm_vstr_get(p, offset, vstr, sizeof(vstr), size); if (offset < 0) { errstr = "object name"; goto fail; } switch (blk->type) { /* * Component VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+ PS volume state * 0x18+5 PN component children count * 0x1D+16 PN parent's volume object id * 0x2D+1 PN stripe size */ case LDM_VBLK_T_COMPONENT: offset = ldm_vparm_skip(p, offset, size); if (offset < 0) { errstr = "volume state"; goto fail; } offset = ldm_vparm_skip(p, offset + 5, size); if (offset < 0) { errstr = "children count"; goto fail; } offset = ldm_vnum_get(p, offset + 16, &blk->u.comp.vol_id, size); if (offset < 0) { errstr = "volume id"; goto fail; } break; /* * Partition VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+12 8 partition start offset * 0x18+20 8 volume offset * 0x18+28 PN partition size * 0x34+ PN parent's component object id * 0x34+ PN disk's object id */ case LDM_VBLK_T_PARTITION: if (offset + 28 >= size) { errstr = "too small buffer"; goto fail; } blk->u.part.start = be64dec(p + offset + 12); blk->u.part.offset = be64dec(p + offset + 20); offset = ldm_vnum_get(p, offset + 28, &blk->u.part.size, size); if (offset < 0) { errstr = "partition size"; goto fail; } offset = ldm_vnum_get(p, offset, &blk->u.part.comp_id, size); if (offset < 0) { errstr = "component id"; goto fail; } offset = ldm_vnum_get(p, offset, &blk->u.part.disk_id, size); if (offset < 0) { errstr = "disk id"; goto fail; } break; /* * Disk VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+ PS disk GUID */ case LDM_VBLK_T_DISK: errstr = "disk guid"; offset = ldm_vstr_get(p, offset, vstr, sizeof(vstr), size); if (offset < 0) goto fail; error = parse_uuid(vstr, &blk->u.disk.guid); if (error != 0) goto fail; LIST_INSERT_HEAD(&db->disks, &blk->u.disk, entry); break; /* * Disk group VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+ PS disk group GUID */ case LDM_VBLK_T_DISKGROUP: #if 0 strncpy(blk->u.disk_group.name, vstr, sizeof(blk->u.disk_group.name)); offset = ldm_vstr_get(p, offset, vstr, sizeof(vstr), size); if (offset < 0) { errstr = "disk group guid"; goto fail; } error = parse_uuid(name, &blk->u.disk_group.guid); if (error != 0) { errstr = "disk group guid"; goto fail; } LIST_INSERT_HEAD(&db->groups, &blk->u.disk_group, entry); #endif break; /* * Disk VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+ 16 disk GUID */ case LDM_VBLK_T_DISK4: be_uuid_dec(p + offset, &blk->u.disk.guid); LIST_INSERT_HEAD(&db->disks, &blk->u.disk, entry); break; /* * Disk group VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+ 16 disk GUID */ case LDM_VBLK_T_DISKGROUP4: #if 0 strncpy(blk->u.disk_group.name, vstr, sizeof(blk->u.disk_group.name)); be_uuid_dec(p + offset, &blk->u.disk.guid); LIST_INSERT_HEAD(&db->groups, &blk->u.disk_group, entry); #endif break; /* * Volume VBLK fields: * Offset Size Description * ------------+-------+------------------------ * 0x18+ PS volume type * 0x18+ PS unknown * 0x18+ 14(S) volume state * 0x18+16 1 volume number * 0x18+21 PN volume children count * 0x2D+16 PN volume size * 0x3D+4 1 partition type */ case LDM_VBLK_T_VOLUME: offset = ldm_vparm_skip(p, offset, size); if (offset < 0) { errstr = "volume type"; goto fail; } offset = ldm_vparm_skip(p, offset, size); if (offset < 0) { errstr = "unknown param"; goto fail; } if (offset + 21 >= size) { errstr = "too small buffer"; goto fail; } blk->u.vol.number = p[offset + 16]; offset = ldm_vparm_skip(p, offset + 21, size); if (offset < 0) { errstr = "children count"; goto fail; } offset = ldm_vnum_get(p, offset + 16, &blk->u.vol.size, size); if (offset < 0) { errstr = "volume size"; goto fail; } if (offset + 4 >= size) { errstr = "too small buffer"; goto fail; } blk->u.vol.part_type = p[offset + 4]; /* keep volumes ordered by volume number */ last = NULL; LIST_FOREACH(volume, &db->volumes, entry) { if (volume->number > blk->u.vol.number) break; last = volume; } if (last != NULL) LIST_INSERT_AFTER(last, &blk->u.vol, entry); else LIST_INSERT_HEAD(&db->volumes, &blk->u.vol, entry); break; default: LDM_DEBUG(1, "unknown VBLK type 0x%02x\n", blk->type); LDM_DUMP(p, size); } LIST_INSERT_HEAD(&db->vblks, blk, entry); return (0); fail: LDM_DEBUG(0, "failed to parse '%s' in VBLK of type 0x%02x\n", errstr, blk->type); LDM_DUMP(p, size); g_free(blk); return (EINVAL); } static void ldm_vmdb_free(struct ldm_db *db) { struct ldm_vblk *vblk; struct ldm_xvblk *xvblk; while (!LIST_EMPTY(&db->xvblks)) { xvblk = LIST_FIRST(&db->xvblks); LIST_REMOVE(xvblk, entry); g_free(xvblk->data); g_free(xvblk); } while (!LIST_EMPTY(&db->vblks)) { vblk = LIST_FIRST(&db->vblks); LIST_REMOVE(vblk, entry); g_free(vblk); } } static int ldm_vmdb_parse(struct ldm_db *db, struct g_consumer *cp) { struct g_provider *pp; struct ldm_vblk *vblk; struct ldm_xvblk *xvblk; struct ldm_volume *volume; struct ldm_component *comp; struct ldm_vblkhdr vh; u_char *buf, *p; size_t size, n, sectors; uint64_t offset; int error; pp = cp->provider; size = (db->dh.last_seq * db->dh.size + pp->sectorsize - 1) / pp->sectorsize; size -= 1; /* one sector takes vmdb header */ for (n = 0; n < size; n += MAXPHYS / pp->sectorsize) { offset = db->ph.db_offset + db->th.conf_offset + n + 1; sectors = (size - n) > (MAXPHYS / pp->sectorsize) ? MAXPHYS / pp->sectorsize: size - n; /* read VBLKs */ buf = g_read_data(cp, offset * pp->sectorsize, sectors * pp->sectorsize, &error); if (buf == NULL) { LDM_DEBUG(0, "%s: failed to read VBLK\n", pp->name); goto fail; } for (p = buf; p < buf + sectors * pp->sectorsize; p += db->dh.size) { if (memcmp(p, LDM_VBLK_SIGN, strlen(LDM_VBLK_SIGN)) != 0) { LDM_DEBUG(0, "%s: no VBLK signature\n", pp->name); LDM_DUMP(p, db->dh.size); goto fail; } vh.seq = be32dec(p + LDM_VBLK_SEQ_OFF); vh.group = be32dec(p + LDM_VBLK_GROUP_OFF); /* skip empty blocks */ if (vh.seq == 0 || vh.group == 0) continue; vh.index = be16dec(p + LDM_VBLK_INDEX_OFF); vh.count = be16dec(p + LDM_VBLK_COUNT_OFF); if (vh.count == 0 || vh.count > 4 || vh.seq > db->dh.last_seq) { LDM_DEBUG(0, "%s: invalid values " "in the VBLK header\n", pp->name); LDM_DUMP(p, db->dh.size); goto fail; } if (vh.count > 1) { error = ldm_xvblk_handle(db, &vh, p); if (error != 0) { LDM_DEBUG(0, "%s: xVBLK " "is corrupted\n", pp->name); LDM_DUMP(p, db->dh.size); goto fail; } continue; } if (be16dec(p + 16) != 0) LDM_DEBUG(1, "%s: VBLK update" " status is %u\n", pp->name, be16dec(p + 16)); error = ldm_vblk_handle(db, p, db->dh.size); if (error != 0) goto fail; } g_free(buf); buf = NULL; } /* Parse xVBLKs */ while (!LIST_EMPTY(&db->xvblks)) { xvblk = LIST_FIRST(&db->xvblks); if (xvblk->map == 0xFF) { error = ldm_vblk_handle(db, xvblk->data, xvblk->size); if (error != 0) goto fail; } else { LDM_DEBUG(0, "%s: incomplete or corrupt " "xVBLK found\n", pp->name); goto fail; } LIST_REMOVE(xvblk, entry); g_free(xvblk->data); g_free(xvblk); } /* construct all VBLKs relations */ LIST_FOREACH(volume, &db->volumes, entry) { LIST_FOREACH(vblk, &db->vblks, entry) if (vblk->type == LDM_VBLK_T_COMPONENT && vblk->u.comp.vol_id == volume->id) { LIST_INSERT_HEAD(&volume->components, &vblk->u.comp, entry); volume->count++; } LIST_FOREACH(comp, &volume->components, entry) LIST_FOREACH(vblk, &db->vblks, entry) if (vblk->type == LDM_VBLK_T_PARTITION && vblk->u.part.comp_id == comp->id) { LIST_INSERT_HEAD(&comp->partitions, &vblk->u.part, entry); comp->count++; } } return (0); fail: ldm_vmdb_free(db); g_free(buf); return (ENXIO); } static int g_part_ldm_add(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { return (ENOSYS); } static int g_part_ldm_bootcode(struct g_part_table *basetable, struct g_part_parms *gpp) { return (ENOSYS); } static int g_part_ldm_create(struct g_part_table *basetable, struct g_part_parms *gpp) { return (ENOSYS); } static int g_part_ldm_destroy(struct g_part_table *basetable, struct g_part_parms *gpp) { struct g_part_ldm_table *table; struct g_provider *pp; table = (struct g_part_ldm_table *)basetable; /* * To destroy LDM on a disk partitioned with GPT we should delete * ms-ldm-metadata partition, but we can't do this via standard * GEOM_PART method. */ if (table->is_gpt) return (ENOSYS); pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider; /* * To destroy LDM we should wipe MBR, first private header and * backup private headers. */ basetable->gpt_smhead = (1 << ldm_ph_off[0]) | 1; /* * Don't touch last backup private header when LDM database is * not located in the last 1MByte area. * XXX: can't remove all blocks. */ if (table->db_offset + LDM_DB_SIZE == pp->mediasize / pp->sectorsize) basetable->gpt_smtail = 1; return (0); } static void g_part_ldm_dumpconf(struct g_part_table *basetable, struct g_part_entry *baseentry, struct sbuf *sb, const char *indent) { struct g_part_ldm_entry *entry; entry = (struct g_part_ldm_entry *)baseentry; if (indent == NULL) { /* conftxt: libdisk compatibility */ sbuf_printf(sb, " xs LDM xt %u", entry->type); } else if (entry != NULL) { /* confxml: partition entry information */ sbuf_printf(sb, "%s%u\n", indent, entry->type); } else { /* confxml: scheme information */ } } static int g_part_ldm_dumpto(struct g_part_table *table, struct g_part_entry *baseentry) { return (0); } static int g_part_ldm_modify(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { return (ENOSYS); } static const char * g_part_ldm_name(struct g_part_table *table, struct g_part_entry *baseentry, char *buf, size_t bufsz) { snprintf(buf, bufsz, "s%d", baseentry->gpe_index); return (buf); } static int ldm_gpt_probe(struct g_part_table *basetable, struct g_consumer *cp) { struct g_part_ldm_table *table; struct g_part_table *gpt; struct g_part_entry *entry; struct g_consumer *cp2; struct gpt_ent *part; u_char *buf; int error; /* * XXX: We use some knowlege about GEOM_PART_GPT internal * structures, but it is easier than parse GPT by himself. */ g_topology_lock(); gpt = cp->provider->geom->softc; LIST_FOREACH(entry, &gpt->gpt_entry, gpe_entry) { part = (struct gpt_ent *)(entry + 1); /* Search ms-ldm-metadata partition */ if (memcmp(&part->ent_type, &gpt_uuid_ms_ldm_metadata, sizeof(struct uuid)) != 0 || entry->gpe_end - entry->gpe_start < LDM_DB_SIZE - 1) continue; /* Create new consumer and attach it to metadata partition */ cp2 = g_new_consumer(cp->geom); error = g_attach(cp2, entry->gpe_pp); if (error != 0) { g_destroy_consumer(cp2); g_topology_unlock(); return (ENXIO); } error = g_access(cp2, 1, 0, 0); if (error != 0) { g_detach(cp2); g_destroy_consumer(cp2); g_topology_unlock(); return (ENXIO); } g_topology_unlock(); LDM_DEBUG(2, "%s: LDM metadata partition %s found in the GPT", cp->provider->name, cp2->provider->name); /* Read the LDM private header */ buf = ldm_privhdr_read(cp2, ldm_ph_off[LDM_PH_GPTINDEX] * cp2->provider->sectorsize, &error); if (buf != NULL) { table = (struct g_part_ldm_table *)basetable; table->is_gpt = 1; g_free(buf); return (G_PART_PROBE_PRI_HIGH); } /* second consumer is no longer needed. */ g_topology_lock(); g_access(cp2, -1, 0, 0); g_detach(cp2); g_destroy_consumer(cp2); break; } g_topology_unlock(); return (ENXIO); } static int g_part_ldm_probe(struct g_part_table *basetable, struct g_consumer *cp) { struct g_provider *pp; u_char *buf, type[64]; int error, idx; pp = cp->provider; if (pp->sectorsize != 512) return (ENXIO); error = g_getattr("PART::scheme", cp, &type); if (error == 0 && strcmp(type, "GPT") == 0) { if (g_getattr("PART::type", cp, &type) != 0 || strcmp(type, "ms-ldm-data") != 0) return (ENXIO); error = ldm_gpt_probe(basetable, cp); return (error); } if (basetable->gpt_depth != 0) return (ENXIO); /* LDM has 1M metadata area */ if (pp->mediasize <= 1024 * 1024) return (ENOSPC); /* Check that there's a MBR */ buf = g_read_data(cp, 0, pp->sectorsize, &error); if (buf == NULL) return (error); if (le16dec(buf + DOSMAGICOFFSET) != DOSMAGIC) { g_free(buf); return (ENXIO); } error = ENXIO; /* Check that we have LDM partitions in the MBR */ for (idx = 0; idx < NDOSPART && error != 0; idx++) { if (buf[DOSPARTOFF + idx * DOSPARTSIZE + 4] == DOSPTYP_LDM) error = 0; } g_free(buf); if (error == 0) { LDM_DEBUG(2, "%s: LDM data partitions found in MBR", pp->name); /* Read the LDM private header */ buf = ldm_privhdr_read(cp, ldm_ph_off[LDM_PH_MBRINDEX] * pp->sectorsize, &error); if (buf == NULL) return (error); g_free(buf); return (G_PART_PROBE_PRI_HIGH); } return (error); } static int g_part_ldm_read(struct g_part_table *basetable, struct g_consumer *cp) { struct g_part_ldm_table *table; struct g_part_ldm_entry *entry; struct g_consumer *cp2; struct ldm_component *comp; struct ldm_partition *part; struct ldm_volume *vol; struct ldm_disk *disk; struct ldm_db db; int error, index, skipped; table = (struct g_part_ldm_table *)basetable; memset(&db, 0, sizeof(db)); cp2 = cp; /* ms-ldm-data */ if (table->is_gpt) cp = LIST_FIRST(&cp->geom->consumer); /* ms-ldm-metadata */ /* Read and parse LDM private headers. */ error = ldm_privhdr_check(&db, cp, table->is_gpt); if (error != 0) goto gpt_cleanup; basetable->gpt_first = table->is_gpt ? 0: db.ph.start; basetable->gpt_last = basetable->gpt_first + db.ph.size - 1; table->db_offset = db.ph.db_offset; /* Make additional checks for GPT */ if (table->is_gpt) { error = ldm_gpt_check(&db, cp); if (error != 0) goto gpt_cleanup; /* * Now we should reset database offset to zero, because our * consumer cp is attached to the ms-ldm-metadata partition * and we don't need add db_offset to read from it. */ db.ph.db_offset = 0; } /* Read and parse LDM TOC headers. */ error = ldm_tochdr_check(&db, cp); if (error != 0) goto gpt_cleanup; /* Read and parse LDM VMDB header. */ error = ldm_vmdbhdr_check(&db, cp); if (error != 0) goto gpt_cleanup; error = ldm_vmdb_parse(&db, cp); /* * For the GPT case we must detach and destroy * second consumer before return. */ gpt_cleanup: if (table->is_gpt) { g_topology_lock(); g_access(cp, -1, 0, 0); g_detach(cp); g_destroy_consumer(cp); g_topology_unlock(); cp = cp2; } if (error != 0) return (error); /* Search current disk in the disk list. */ LIST_FOREACH(disk, &db.disks, entry) if (memcmp(&disk->guid, &db.ph.disk_guid, sizeof(struct uuid)) == 0) break; if (disk == NULL) { LDM_DEBUG(1, "%s: no LDM volumes on this disk", cp->provider->name); ldm_vmdb_free(&db); return (ENXIO); } index = 1; LIST_FOREACH(vol, &db.volumes, entry) { LIST_FOREACH(comp, &vol->components, entry) { /* Skip volumes from different disks. */ part = LIST_FIRST(&comp->partitions); if (part->disk_id != disk->id) continue; skipped = 0; /* We don't support spanned and striped volumes. */ if (comp->count > 1 || part->offset != 0) { LDM_DEBUG(1, "%s: LDM volume component " "%ju has %u partitions. Skipped", cp->provider->name, (uintmax_t)comp->id, comp->count); skipped = 1; } /* * Allow mirrored volumes only when they are explicitly * allowed with kern.geom.part.ldm.show_mirrors=1. */ if (vol->count > 1 && show_mirrors == 0) { LDM_DEBUG(1, "%s: LDM volume %ju has %u " "components. Skipped", cp->provider->name, (uintmax_t)vol->id, vol->count); skipped = 1; } entry = (struct g_part_ldm_entry *)g_part_new_entry( basetable, index++, basetable->gpt_first + part->start, basetable->gpt_first + part->start + part->size - 1); /* * Mark skipped partition as ms-ldm-data partition. * We do not support them, but it is better to show * that we have something there, than just show * free space. */ if (skipped == 0) entry->type = vol->part_type; else entry->type = DOSPTYP_LDM; LDM_DEBUG(1, "%s: new volume id: %ju, start: %ju," " end: %ju, type: 0x%02x\n", cp->provider->name, (uintmax_t)part->id,(uintmax_t)part->start + basetable->gpt_first, (uintmax_t)part->start + part->size + basetable->gpt_first - 1, vol->part_type); } } ldm_vmdb_free(&db); return (error); } static const char * g_part_ldm_type(struct g_part_table *basetable, struct g_part_entry *baseentry, char *buf, size_t bufsz) { struct g_part_ldm_entry *entry; int i; entry = (struct g_part_ldm_entry *)baseentry; - for (i = 0; - i < nitems(ldm_alias_match); i++) { + for (i = 0; i < nitems(ldm_alias_match); i++) { if (ldm_alias_match[i].typ == entry->type) return (g_part_alias_name(ldm_alias_match[i].alias)); } snprintf(buf, bufsz, "!%d", entry->type); return (buf); } static int g_part_ldm_write(struct g_part_table *basetable, struct g_consumer *cp) { return (ENOSYS); } Index: head/sys/geom/part/g_part_mbr.c =================================================================== --- head/sys/geom/part/g_part_mbr.c (revision 298353) +++ head/sys/geom/part/g_part_mbr.c (revision 298354) @@ -1,607 +1,605 @@ /*- * Copyright (c) 2007, 2008 Marcel Moolenaar * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "g_part_if.h" FEATURE(geom_part_mbr, "GEOM partitioning class for MBR support"); SYSCTL_DECL(_kern_geom_part); static SYSCTL_NODE(_kern_geom_part, OID_AUTO, mbr, CTLFLAG_RW, 0, "GEOM_PART_MBR Master Boot Record"); static u_int enforce_chs = 0; SYSCTL_UINT(_kern_geom_part_mbr, OID_AUTO, enforce_chs, CTLFLAG_RWTUN, &enforce_chs, 0, "Enforce alignment to CHS addressing"); #define MBRSIZE 512 struct g_part_mbr_table { struct g_part_table base; u_char mbr[MBRSIZE]; }; struct g_part_mbr_entry { struct g_part_entry base; struct dos_partition ent; }; static int g_part_mbr_add(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static int g_part_mbr_bootcode(struct g_part_table *, struct g_part_parms *); static int g_part_mbr_create(struct g_part_table *, struct g_part_parms *); static int g_part_mbr_destroy(struct g_part_table *, struct g_part_parms *); static void g_part_mbr_dumpconf(struct g_part_table *, struct g_part_entry *, struct sbuf *, const char *); static int g_part_mbr_dumpto(struct g_part_table *, struct g_part_entry *); static int g_part_mbr_modify(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static const char *g_part_mbr_name(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_mbr_probe(struct g_part_table *, struct g_consumer *); static int g_part_mbr_read(struct g_part_table *, struct g_consumer *); static int g_part_mbr_setunset(struct g_part_table *, struct g_part_entry *, const char *, unsigned int); static const char *g_part_mbr_type(struct g_part_table *, struct g_part_entry *, char *, size_t); static int g_part_mbr_write(struct g_part_table *, struct g_consumer *); static int g_part_mbr_resize(struct g_part_table *, struct g_part_entry *, struct g_part_parms *); static kobj_method_t g_part_mbr_methods[] = { KOBJMETHOD(g_part_add, g_part_mbr_add), KOBJMETHOD(g_part_bootcode, g_part_mbr_bootcode), KOBJMETHOD(g_part_create, g_part_mbr_create), KOBJMETHOD(g_part_destroy, g_part_mbr_destroy), KOBJMETHOD(g_part_dumpconf, g_part_mbr_dumpconf), KOBJMETHOD(g_part_dumpto, g_part_mbr_dumpto), KOBJMETHOD(g_part_modify, g_part_mbr_modify), KOBJMETHOD(g_part_resize, g_part_mbr_resize), KOBJMETHOD(g_part_name, g_part_mbr_name), KOBJMETHOD(g_part_probe, g_part_mbr_probe), KOBJMETHOD(g_part_read, g_part_mbr_read), KOBJMETHOD(g_part_setunset, g_part_mbr_setunset), KOBJMETHOD(g_part_type, g_part_mbr_type), KOBJMETHOD(g_part_write, g_part_mbr_write), { 0, 0 } }; static struct g_part_scheme g_part_mbr_scheme = { "MBR", g_part_mbr_methods, sizeof(struct g_part_mbr_table), .gps_entrysz = sizeof(struct g_part_mbr_entry), .gps_minent = NDOSPART, .gps_maxent = NDOSPART, .gps_bootcodesz = MBRSIZE, }; G_PART_SCHEME_DECLARE(g_part_mbr); static struct g_part_mbr_alias { u_char typ; int alias; } mbr_alias_match[] = { { DOSPTYP_386BSD, G_PART_ALIAS_FREEBSD }, { DOSPTYP_EXT, G_PART_ALIAS_EBR }, { DOSPTYP_NTFS, G_PART_ALIAS_MS_NTFS }, { DOSPTYP_FAT16, G_PART_ALIAS_MS_FAT16 }, { DOSPTYP_FAT32, G_PART_ALIAS_MS_FAT32 }, { DOSPTYP_EXTLBA, G_PART_ALIAS_EBR }, { DOSPTYP_LDM, G_PART_ALIAS_MS_LDM_DATA }, { DOSPTYP_LINSWP, G_PART_ALIAS_LINUX_SWAP }, { DOSPTYP_LINUX, G_PART_ALIAS_LINUX_DATA }, { DOSPTYP_LINLVM, G_PART_ALIAS_LINUX_LVM }, { DOSPTYP_LINRAID, G_PART_ALIAS_LINUX_RAID }, { DOSPTYP_PPCBOOT, G_PART_ALIAS_PREP_BOOT }, { DOSPTYP_VMFS, G_PART_ALIAS_VMFS }, { DOSPTYP_VMKDIAG, G_PART_ALIAS_VMKDIAG }, { DOSPTYP_APPLE_UFS, G_PART_ALIAS_APPLE_UFS }, { DOSPTYP_APPLE_BOOT, G_PART_ALIAS_APPLE_BOOT }, { DOSPTYP_HFS, G_PART_ALIAS_APPLE_HFS }, }; static int mbr_parse_type(const char *type, u_char *dp_typ) { const char *alias; char *endp; long lt; int i; if (type[0] == '!') { lt = strtol(type + 1, &endp, 0); if (type[1] == '\0' || *endp != '\0' || lt <= 0 || lt >= 256) return (EINVAL); *dp_typ = (u_char)lt; return (0); } - for (i = 0; - i < nitems(mbr_alias_match); i++) { + for (i = 0; i < nitems(mbr_alias_match); i++) { alias = g_part_alias_name(mbr_alias_match[i].alias); if (strcasecmp(type, alias) == 0) { *dp_typ = mbr_alias_match[i].typ; return (0); } } return (EINVAL); } static int mbr_probe_bpb(u_char *bpb) { uint16_t secsz; uint8_t clstsz; #define PO2(x) ((x & (x - 1)) == 0) secsz = le16dec(bpb); if (secsz < 512 || secsz > 4096 || !PO2(secsz)) return (0); clstsz = bpb[2]; if (clstsz < 1 || clstsz > 128 || !PO2(clstsz)) return (0); #undef PO2 return (1); } static void mbr_set_chs(struct g_part_table *table, uint32_t lba, u_char *cylp, u_char *hdp, u_char *secp) { uint32_t cyl, hd, sec; sec = lba % table->gpt_sectors + 1; lba /= table->gpt_sectors; hd = lba % table->gpt_heads; lba /= table->gpt_heads; cyl = lba; if (cyl > 1023) sec = hd = cyl = ~0; *cylp = cyl & 0xff; *hdp = hd & 0xff; *secp = (sec & 0x3f) | ((cyl >> 2) & 0xc0); } static int mbr_align(struct g_part_table *basetable, uint32_t *start, uint32_t *size) { uint32_t sectors; if (enforce_chs == 0) return (0); sectors = basetable->gpt_sectors; if (*size < sectors) return (EINVAL); if (start != NULL && (*start % sectors)) { *size += (*start % sectors) - sectors; *start -= (*start % sectors) - sectors; } if (*size % sectors) *size -= (*size % sectors); if (*size < sectors) return (EINVAL); return (0); } static int g_part_mbr_add(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_part_mbr_entry *entry; uint32_t start, size; if (gpp->gpp_parms & G_PART_PARM_LABEL) return (EINVAL); entry = (struct g_part_mbr_entry *)baseentry; start = gpp->gpp_start; size = gpp->gpp_size; if (mbr_align(basetable, &start, &size) != 0) return (EINVAL); if (baseentry->gpe_deleted) bzero(&entry->ent, sizeof(entry->ent)); KASSERT(baseentry->gpe_start <= start, ("%s", __func__)); KASSERT(baseentry->gpe_end >= start + size - 1, ("%s", __func__)); baseentry->gpe_start = start; baseentry->gpe_end = start + size - 1; entry->ent.dp_start = start; entry->ent.dp_size = size; mbr_set_chs(basetable, baseentry->gpe_start, &entry->ent.dp_scyl, &entry->ent.dp_shd, &entry->ent.dp_ssect); mbr_set_chs(basetable, baseentry->gpe_end, &entry->ent.dp_ecyl, &entry->ent.dp_ehd, &entry->ent.dp_esect); return (mbr_parse_type(gpp->gpp_type, &entry->ent.dp_typ)); } static int g_part_mbr_bootcode(struct g_part_table *basetable, struct g_part_parms *gpp) { struct g_part_mbr_table *table; uint32_t dsn; if (gpp->gpp_codesize != MBRSIZE) return (ENODEV); table = (struct g_part_mbr_table *)basetable; dsn = *(uint32_t *)(table->mbr + DOSDSNOFF); bcopy(gpp->gpp_codeptr, table->mbr, DOSPARTOFF); if (dsn != 0) *(uint32_t *)(table->mbr + DOSDSNOFF) = dsn; return (0); } static int g_part_mbr_create(struct g_part_table *basetable, struct g_part_parms *gpp) { struct g_provider *pp; struct g_part_mbr_table *table; pp = gpp->gpp_provider; if (pp->sectorsize < MBRSIZE) return (ENOSPC); basetable->gpt_first = basetable->gpt_sectors; basetable->gpt_last = MIN(pp->mediasize / pp->sectorsize, UINT32_MAX) - 1; table = (struct g_part_mbr_table *)basetable; le16enc(table->mbr + DOSMAGICOFFSET, DOSMAGIC); return (0); } static int g_part_mbr_destroy(struct g_part_table *basetable, struct g_part_parms *gpp) { /* Wipe the first sector to clear the partitioning. */ basetable->gpt_smhead |= 1; return (0); } static void g_part_mbr_dumpconf(struct g_part_table *table, struct g_part_entry *baseentry, struct sbuf *sb, const char *indent) { struct g_part_mbr_entry *entry; entry = (struct g_part_mbr_entry *)baseentry; if (indent == NULL) { /* conftxt: libdisk compatibility */ sbuf_printf(sb, " xs MBR xt %u", entry->ent.dp_typ); } else if (entry != NULL) { /* confxml: partition entry information */ sbuf_printf(sb, "%s%u\n", indent, entry->ent.dp_typ); if (entry->ent.dp_flag & 0x80) sbuf_printf(sb, "%sactive\n", indent); } else { /* confxml: scheme information */ } } static int g_part_mbr_dumpto(struct g_part_table *table, struct g_part_entry *baseentry) { struct g_part_mbr_entry *entry; /* Allow dumping to a FreeBSD partition or Linux swap partition only. */ entry = (struct g_part_mbr_entry *)baseentry; return ((entry->ent.dp_typ == DOSPTYP_386BSD || entry->ent.dp_typ == DOSPTYP_LINSWP) ? 1 : 0); } static int g_part_mbr_modify(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_part_mbr_entry *entry; if (gpp->gpp_parms & G_PART_PARM_LABEL) return (EINVAL); entry = (struct g_part_mbr_entry *)baseentry; if (gpp->gpp_parms & G_PART_PARM_TYPE) return (mbr_parse_type(gpp->gpp_type, &entry->ent.dp_typ)); return (0); } static int g_part_mbr_resize(struct g_part_table *basetable, struct g_part_entry *baseentry, struct g_part_parms *gpp) { struct g_part_mbr_entry *entry; struct g_provider *pp; uint32_t size; if (baseentry == NULL) { pp = LIST_FIRST(&basetable->gpt_gp->consumer)->provider; basetable->gpt_last = MIN(pp->mediasize / pp->sectorsize, UINT32_MAX) - 1; return (0); } size = gpp->gpp_size; if (mbr_align(basetable, NULL, &size) != 0) return (EINVAL); /* XXX: prevent unexpected shrinking. */ pp = baseentry->gpe_pp; if ((g_debugflags & 0x10) == 0 && size < gpp->gpp_size && pp->mediasize / pp->sectorsize > size) return (EBUSY); entry = (struct g_part_mbr_entry *)baseentry; baseentry->gpe_end = baseentry->gpe_start + size - 1; entry->ent.dp_size = size; mbr_set_chs(basetable, baseentry->gpe_end, &entry->ent.dp_ecyl, &entry->ent.dp_ehd, &entry->ent.dp_esect); return (0); } static const char * g_part_mbr_name(struct g_part_table *table, struct g_part_entry *baseentry, char *buf, size_t bufsz) { snprintf(buf, bufsz, "s%d", baseentry->gpe_index); return (buf); } static int g_part_mbr_probe(struct g_part_table *table, struct g_consumer *cp) { char psn[8]; struct g_provider *pp; u_char *buf, *p; int error, index, res, sum; uint16_t magic; pp = cp->provider; /* Sanity-check the provider. */ if (pp->sectorsize < MBRSIZE || pp->mediasize < pp->sectorsize) return (ENOSPC); if (pp->sectorsize > 4096) return (ENXIO); /* We don't nest under an MBR (see EBR instead). */ error = g_getattr("PART::scheme", cp, &psn); if (error == 0 && strcmp(psn, g_part_mbr_scheme.name) == 0) return (ELOOP); /* Check that there's a MBR. */ buf = g_read_data(cp, 0L, pp->sectorsize, &error); if (buf == NULL) return (error); /* We goto out on mismatch. */ res = ENXIO; magic = le16dec(buf + DOSMAGICOFFSET); if (magic != DOSMAGIC) goto out; for (index = 0; index < NDOSPART; index++) { p = buf + DOSPARTOFF + index * DOSPARTSIZE; if (p[0] != 0 && p[0] != 0x80) goto out; } /* * If the partition table does not consist of all zeroes, * assume we have a MBR. If it's all zeroes, we could have * a boot sector. For example, a boot sector that doesn't * have boot code -- common on non-i386 hardware. In that * case we check if we have a possible BPB. If so, then we * assume we have a boot sector instead. */ sum = 0; for (index = 0; index < NDOSPART * DOSPARTSIZE; index++) sum += buf[DOSPARTOFF + index]; if (sum != 0 || !mbr_probe_bpb(buf + 0x0b)) res = G_PART_PROBE_PRI_NORM; out: g_free(buf); return (res); } static int g_part_mbr_read(struct g_part_table *basetable, struct g_consumer *cp) { struct dos_partition ent; struct g_provider *pp; struct g_part_mbr_table *table; struct g_part_mbr_entry *entry; u_char *buf, *p; off_t chs, msize, first; u_int sectors, heads; int error, index; pp = cp->provider; table = (struct g_part_mbr_table *)basetable; first = basetable->gpt_sectors; msize = MIN(pp->mediasize / pp->sectorsize, UINT32_MAX); buf = g_read_data(cp, 0L, pp->sectorsize, &error); if (buf == NULL) return (error); bcopy(buf, table->mbr, sizeof(table->mbr)); for (index = NDOSPART - 1; index >= 0; index--) { p = buf + DOSPARTOFF + index * DOSPARTSIZE; ent.dp_flag = p[0]; ent.dp_shd = p[1]; ent.dp_ssect = p[2]; ent.dp_scyl = p[3]; ent.dp_typ = p[4]; ent.dp_ehd = p[5]; ent.dp_esect = p[6]; ent.dp_ecyl = p[7]; ent.dp_start = le32dec(p + 8); ent.dp_size = le32dec(p + 12); if (ent.dp_typ == 0 || ent.dp_typ == DOSPTYP_PMBR) continue; if (ent.dp_start == 0 || ent.dp_size == 0) continue; sectors = ent.dp_esect & 0x3f; if (sectors > basetable->gpt_sectors && !basetable->gpt_fixgeom) { g_part_geometry_heads(msize, sectors, &chs, &heads); if (chs != 0) { basetable->gpt_sectors = sectors; basetable->gpt_heads = heads; } } if (ent.dp_start < first) first = ent.dp_start; entry = (struct g_part_mbr_entry *)g_part_new_entry(basetable, index + 1, ent.dp_start, ent.dp_start + ent.dp_size - 1); entry->ent = ent; } basetable->gpt_entries = NDOSPART; basetable->gpt_first = basetable->gpt_sectors; basetable->gpt_last = msize - 1; if (first < basetable->gpt_first) basetable->gpt_first = 1; g_free(buf); return (0); } static int g_part_mbr_setunset(struct g_part_table *table, struct g_part_entry *baseentry, const char *attrib, unsigned int set) { struct g_part_entry *iter; struct g_part_mbr_entry *entry; int changed; if (baseentry == NULL) return (ENODEV); if (strcasecmp(attrib, "active") != 0) return (EINVAL); /* Only one entry can have the active attribute. */ LIST_FOREACH(iter, &table->gpt_entry, gpe_entry) { if (iter->gpe_deleted) continue; changed = 0; entry = (struct g_part_mbr_entry *)iter; if (iter == baseentry) { if (set && (entry->ent.dp_flag & 0x80) == 0) { entry->ent.dp_flag |= 0x80; changed = 1; } else if (!set && (entry->ent.dp_flag & 0x80)) { entry->ent.dp_flag &= ~0x80; changed = 1; } } else { if (set && (entry->ent.dp_flag & 0x80)) { entry->ent.dp_flag &= ~0x80; changed = 1; } } if (changed && !iter->gpe_created) iter->gpe_modified = 1; } return (0); } static const char * g_part_mbr_type(struct g_part_table *basetable, struct g_part_entry *baseentry, char *buf, size_t bufsz) { struct g_part_mbr_entry *entry; int i; entry = (struct g_part_mbr_entry *)baseentry; - for (i = 0; - i < nitems(mbr_alias_match); i++) { + for (i = 0; i < nitems(mbr_alias_match); i++) { if (mbr_alias_match[i].typ == entry->ent.dp_typ) return (g_part_alias_name(mbr_alias_match[i].alias)); } snprintf(buf, bufsz, "!%d", entry->ent.dp_typ); return (buf); } static int g_part_mbr_write(struct g_part_table *basetable, struct g_consumer *cp) { struct g_part_entry *baseentry; struct g_part_mbr_entry *entry; struct g_part_mbr_table *table; u_char *p; int error, index; table = (struct g_part_mbr_table *)basetable; baseentry = LIST_FIRST(&basetable->gpt_entry); for (index = 1; index <= basetable->gpt_entries; index++) { p = table->mbr + DOSPARTOFF + (index - 1) * DOSPARTSIZE; entry = (baseentry != NULL && index == baseentry->gpe_index) ? (struct g_part_mbr_entry *)baseentry : NULL; if (entry != NULL && !baseentry->gpe_deleted) { p[0] = entry->ent.dp_flag; p[1] = entry->ent.dp_shd; p[2] = entry->ent.dp_ssect; p[3] = entry->ent.dp_scyl; p[4] = entry->ent.dp_typ; p[5] = entry->ent.dp_ehd; p[6] = entry->ent.dp_esect; p[7] = entry->ent.dp_ecyl; le32enc(p + 8, entry->ent.dp_start); le32enc(p + 12, entry->ent.dp_size); } else bzero(p, DOSPARTSIZE); if (entry != NULL) baseentry = LIST_NEXT(baseentry, gpe_entry); } error = g_write_data(cp, 0, table->mbr, cp->provider->sectorsize); return (error); } Index: head/sys/kern/sysv_msg.c =================================================================== --- head/sys/kern/sysv_msg.c (revision 298353) +++ head/sys/kern/sysv_msg.c (revision 298354) @@ -1,1590 +1,1589 @@ /*- * Implementation of SVID messages * * Author: Daniel Boulet * * Copyright 1993 Daniel Boulet and RTMX Inc. * * This system call was implemented by Daniel Boulet under contract from RTMX. * * Redistribution and use in source forms, with and without modification, * are permitted provided that this entire comment appears intact. * * Redistribution in binary form may occur without any restrictions. * Obviously, it would be nice if you gave credit where credit is due * but requiring it would be too onerous. * * This software is provided ``AS IS'' without any warranties of any kind. */ /*- * Copyright (c) 2003-2005 McAfee, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project in part by McAfee * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research * program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_sysvipc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(sysv_msg, "System V message queues support"); static MALLOC_DEFINE(M_MSG, "msg", "SVID compatible message queues"); static int msginit(void); static int msgunload(void); static int sysvmsg_modload(struct module *, int, void *); #ifdef MSG_DEBUG #define DPRINTF(a) printf a #else #define DPRINTF(a) (void)0 #endif static void msg_freehdr(struct msg *msghdr); #ifndef MSGSSZ #define MSGSSZ 8 /* Each segment must be 2^N long */ #endif #ifndef MSGSEG #define MSGSEG 2048 /* must be less than 32767 */ #endif #define MSGMAX (MSGSSZ*MSGSEG) #ifndef MSGMNB #define MSGMNB 2048 /* max # of bytes in a queue */ #endif #ifndef MSGMNI #define MSGMNI 40 #endif #ifndef MSGTQL #define MSGTQL 40 #endif /* * Based on the configuration parameters described in an SVR2 (yes, two) * config(1m) man page. * * Each message is broken up and stored in segments that are msgssz bytes * long. For efficiency reasons, this should be a power of two. Also, * it doesn't make sense if it is less than 8 or greater than about 256. * Consequently, msginit in kern/sysv_msg.c checks that msgssz is a power of * two between 8 and 1024 inclusive (and panic's if it isn't). */ struct msginfo msginfo = { MSGMAX, /* max chars in a message */ MSGMNI, /* # of message queue identifiers */ MSGMNB, /* max chars in a queue */ MSGTQL, /* max messages in system */ MSGSSZ, /* size of a message segment */ /* (must be small power of 2 greater than 4) */ MSGSEG /* number of message segments */ }; /* * macros to convert between msqid_ds's and msqid's. * (specific to this implementation) */ #define MSQID(ix,ds) ((ix) & 0xffff | (((ds).msg_perm.seq << 16) & 0xffff0000)) #define MSQID_IX(id) ((id) & 0xffff) #define MSQID_SEQ(id) (((id) >> 16) & 0xffff) /* * The rest of this file is specific to this particular implementation. */ struct msgmap { short next; /* next segment in buffer */ /* -1 -> available */ /* 0..(MSGSEG-1) -> index of next segment */ }; #define MSG_LOCKED 01000 /* Is this msqid_ds locked? */ static int nfree_msgmaps; /* # of free map entries */ static short free_msgmaps; /* head of linked list of free map entries */ static struct msg *free_msghdrs;/* list of free msg headers */ static char *msgpool; /* MSGMAX byte long msg buffer pool */ static struct msgmap *msgmaps; /* MSGSEG msgmap structures */ static struct msg *msghdrs; /* MSGTQL msg headers */ static struct msqid_kernel *msqids; /* MSGMNI msqid_kernel struct's */ static struct mtx msq_mtx; /* global mutex for message queues. */ static struct syscall_helper_data msg_syscalls[] = { SYSCALL_INIT_HELPER(msgctl), SYSCALL_INIT_HELPER(msgget), SYSCALL_INIT_HELPER(msgsnd), SYSCALL_INIT_HELPER(msgrcv), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL_INIT_HELPER(msgsys), SYSCALL_INIT_HELPER_COMPAT(freebsd7_msgctl), #endif SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include #include static struct syscall_helper_data msg32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_msgctl), SYSCALL32_INIT_HELPER(freebsd32_msgsnd), SYSCALL32_INIT_HELPER(freebsd32_msgrcv), SYSCALL32_INIT_HELPER_COMPAT(msgget), SYSCALL32_INIT_HELPER(freebsd32_msgsys), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL32_INIT_HELPER(freebsd7_freebsd32_msgctl), #endif SYSCALL_INIT_LAST }; #endif static int msginit() { int i, error; msginfo.msgmax = msginfo.msgseg * msginfo.msgssz; msgpool = malloc(msginfo.msgmax, M_MSG, M_WAITOK); msgmaps = malloc(sizeof(struct msgmap) * msginfo.msgseg, M_MSG, M_WAITOK); msghdrs = malloc(sizeof(struct msg) * msginfo.msgtql, M_MSG, M_WAITOK); msqids = malloc(sizeof(struct msqid_kernel) * msginfo.msgmni, M_MSG, M_WAITOK); /* * msginfo.msgssz should be a power of two for efficiency reasons. * It is also pretty silly if msginfo.msgssz is less than 8 * or greater than about 256 so ... */ i = 8; while (i < 1024 && i != msginfo.msgssz) i <<= 1; if (i != msginfo.msgssz) { DPRINTF(("msginfo.msgssz=%d (0x%x)\n", msginfo.msgssz, msginfo.msgssz)); panic("msginfo.msgssz not a small power of 2"); } if (msginfo.msgseg > 32767) { DPRINTF(("msginfo.msgseg=%d\n", msginfo.msgseg)); panic("msginfo.msgseg > 32767"); } for (i = 0; i < msginfo.msgseg; i++) { if (i > 0) msgmaps[i-1].next = i; msgmaps[i].next = -1; /* implies entry is available */ } free_msgmaps = 0; nfree_msgmaps = msginfo.msgseg; for (i = 0; i < msginfo.msgtql; i++) { msghdrs[i].msg_type = 0; if (i > 0) msghdrs[i-1].msg_next = &msghdrs[i]; msghdrs[i].msg_next = NULL; #ifdef MAC mac_sysvmsg_init(&msghdrs[i]); #endif } free_msghdrs = &msghdrs[0]; for (i = 0; i < msginfo.msgmni; i++) { msqids[i].u.msg_qbytes = 0; /* implies entry is available */ msqids[i].u.msg_perm.seq = 0; /* reset to a known value */ msqids[i].u.msg_perm.mode = 0; #ifdef MAC mac_sysvmsq_init(&msqids[i]); #endif } mtx_init(&msq_mtx, "msq", NULL, MTX_DEF); error = syscall_helper_register(msg_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(msg32_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #endif return (0); } static int msgunload() { struct msqid_kernel *msqkptr; int msqid; #ifdef MAC int i; #endif syscall_helper_unregister(msg_syscalls); #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(msg32_syscalls); #endif for (msqid = 0; msqid < msginfo.msgmni; msqid++) { /* * Look for an unallocated and unlocked msqid_ds. * msqid_ds's can be locked by msgsnd or msgrcv while * they are copying the message in/out. We can't * re-use the entry until they release it. */ msqkptr = &msqids[msqid]; if (msqkptr->u.msg_qbytes != 0 || (msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) break; } if (msqid != msginfo.msgmni) return (EBUSY); #ifdef MAC for (i = 0; i < msginfo.msgtql; i++) mac_sysvmsg_destroy(&msghdrs[i]); for (msqid = 0; msqid < msginfo.msgmni; msqid++) mac_sysvmsq_destroy(&msqids[msqid]); #endif free(msgpool, M_MSG); free(msgmaps, M_MSG); free(msghdrs, M_MSG); free(msqids, M_MSG); mtx_destroy(&msq_mtx); return (0); } static int sysvmsg_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: error = msginit(); if (error != 0) msgunload(); break; case MOD_UNLOAD: error = msgunload(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sysvmsg_mod = { "sysvmsg", &sysvmsg_modload, NULL }; DECLARE_MODULE(sysvmsg, sysvmsg_mod, SI_SUB_SYSV_MSG, SI_ORDER_FIRST); MODULE_VERSION(sysvmsg, 1); static void msg_freehdr(msghdr) struct msg *msghdr; { while (msghdr->msg_ts > 0) { short next; if (msghdr->msg_spot < 0 || msghdr->msg_spot >= msginfo.msgseg) panic("msghdr->msg_spot out of range"); next = msgmaps[msghdr->msg_spot].next; msgmaps[msghdr->msg_spot].next = free_msgmaps; free_msgmaps = msghdr->msg_spot; nfree_msgmaps++; msghdr->msg_spot = next; if (msghdr->msg_ts >= msginfo.msgssz) msghdr->msg_ts -= msginfo.msgssz; else msghdr->msg_ts = 0; } if (msghdr->msg_spot != -1) panic("msghdr->msg_spot != -1"); msghdr->msg_next = free_msghdrs; free_msghdrs = msghdr; #ifdef MAC mac_sysvmsg_cleanup(msghdr); #endif } #ifndef _SYS_SYSPROTO_H_ struct msgctl_args { int msqid; int cmd; struct msqid_ds *buf; }; #endif int sys_msgctl(td, uap) struct thread *td; register struct msgctl_args *uap; { int msqid = uap->msqid; int cmd = uap->cmd; struct msqid_ds msqbuf; int error; DPRINTF(("call to msgctl(%d, %d, %p)\n", msqid, cmd, uap->buf)); if (cmd == IPC_SET && (error = copyin(uap->buf, &msqbuf, sizeof(msqbuf))) != 0) return (error); error = kern_msgctl(td, msqid, cmd, &msqbuf); if (cmd == IPC_STAT && error == 0) error = copyout(&msqbuf, uap->buf, sizeof(struct msqid_ds)); return (error); } int kern_msgctl(td, msqid, cmd, msqbuf) struct thread *td; int msqid; int cmd; struct msqid_ds *msqbuf; { int rval, error, msqix; register struct msqid_kernel *msqkptr; if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); msqix = IPCID_TO_IX(msqid); if (msqix < 0 || msqix >= msginfo.msgmni) { DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix, msginfo.msgmni)); return (EINVAL); } msqkptr = &msqids[msqix]; mtx_lock(&msq_mtx); if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("no such msqid\n")); error = EINVAL; goto done2; } if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("wrong sequence number\n")); error = EINVAL; goto done2; } #ifdef MAC error = mac_sysvmsq_check_msqctl(td->td_ucred, msqkptr, cmd); if (error != 0) goto done2; #endif error = 0; rval = 0; switch (cmd) { case IPC_RMID: { struct msg *msghdr; if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M))) goto done2; #ifdef MAC /* * Check that the thread has MAC access permissions to * individual msghdrs. Note: We need to do this in a * separate loop because the actual loop alters the * msq/msghdr info as it progresses, and there is no going * back if half the way through we discover that the * thread cannot free a certain msghdr. The msq will get * into an inconsistent state. */ for (msghdr = msqkptr->u.msg_first; msghdr != NULL; msghdr = msghdr->msg_next) { error = mac_sysvmsq_check_msgrmid(td->td_ucred, msghdr); if (error != 0) goto done2; } #endif racct_sub_cred(msqkptr->cred, RACCT_NMSGQ, 1); racct_sub_cred(msqkptr->cred, RACCT_MSGQQUEUED, msqkptr->u.msg_qnum); racct_sub_cred(msqkptr->cred, RACCT_MSGQSIZE, msqkptr->u.msg_cbytes); crfree(msqkptr->cred); msqkptr->cred = NULL; /* Free the message headers */ msghdr = msqkptr->u.msg_first; while (msghdr != NULL) { struct msg *msghdr_tmp; /* Free the segments of each message */ msqkptr->u.msg_cbytes -= msghdr->msg_ts; msqkptr->u.msg_qnum--; msghdr_tmp = msghdr; msghdr = msghdr->msg_next; msg_freehdr(msghdr_tmp); } if (msqkptr->u.msg_cbytes != 0) panic("msg_cbytes is screwed up"); if (msqkptr->u.msg_qnum != 0) panic("msg_qnum is screwed up"); msqkptr->u.msg_qbytes = 0; /* Mark it as free */ #ifdef MAC mac_sysvmsq_cleanup(msqkptr); #endif wakeup(msqkptr); } break; case IPC_SET: if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M))) goto done2; if (msqbuf->msg_qbytes > msqkptr->u.msg_qbytes) { error = priv_check(td, PRIV_IPC_MSGSIZE); if (error) goto done2; } if (msqbuf->msg_qbytes > msginfo.msgmnb) { DPRINTF(("can't increase msg_qbytes beyond %d" "(truncating)\n", msginfo.msgmnb)); msqbuf->msg_qbytes = msginfo.msgmnb; /* silently restrict qbytes to system limit */ } if (msqbuf->msg_qbytes == 0) { DPRINTF(("can't reduce msg_qbytes to 0\n")); error = EINVAL; /* non-standard errno! */ goto done2; } msqkptr->u.msg_perm.uid = msqbuf->msg_perm.uid; /* change the owner */ msqkptr->u.msg_perm.gid = msqbuf->msg_perm.gid; /* change the owner */ msqkptr->u.msg_perm.mode = (msqkptr->u.msg_perm.mode & ~0777) | (msqbuf->msg_perm.mode & 0777); msqkptr->u.msg_qbytes = msqbuf->msg_qbytes; msqkptr->u.msg_ctime = time_second; break; case IPC_STAT: if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) { DPRINTF(("requester doesn't have read access\n")); goto done2; } *msqbuf = msqkptr->u; break; default: DPRINTF(("invalid command %d\n", cmd)); error = EINVAL; goto done2; } if (error == 0) td->td_retval[0] = rval; done2: mtx_unlock(&msq_mtx); return (error); } #ifndef _SYS_SYSPROTO_H_ struct msgget_args { key_t key; int msgflg; }; #endif int sys_msgget(td, uap) struct thread *td; register struct msgget_args *uap; { int msqid, error = 0; int key = uap->key; int msgflg = uap->msgflg; struct ucred *cred = td->td_ucred; register struct msqid_kernel *msqkptr = NULL; DPRINTF(("msgget(0x%x, 0%o)\n", key, msgflg)); if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); mtx_lock(&msq_mtx); if (key != IPC_PRIVATE) { for (msqid = 0; msqid < msginfo.msgmni; msqid++) { msqkptr = &msqids[msqid]; if (msqkptr->u.msg_qbytes != 0 && msqkptr->u.msg_perm.key == key) break; } if (msqid < msginfo.msgmni) { DPRINTF(("found public key\n")); if ((msgflg & IPC_CREAT) && (msgflg & IPC_EXCL)) { DPRINTF(("not exclusive\n")); error = EEXIST; goto done2; } if ((error = ipcperm(td, &msqkptr->u.msg_perm, msgflg & 0700))) { DPRINTF(("requester doesn't have 0%o access\n", msgflg & 0700)); goto done2; } #ifdef MAC error = mac_sysvmsq_check_msqget(cred, msqkptr); if (error != 0) goto done2; #endif goto found; } } DPRINTF(("need to allocate the msqid_ds\n")); if (key == IPC_PRIVATE || (msgflg & IPC_CREAT)) { for (msqid = 0; msqid < msginfo.msgmni; msqid++) { /* * Look for an unallocated and unlocked msqid_ds. * msqid_ds's can be locked by msgsnd or msgrcv while * they are copying the message in/out. We can't * re-use the entry until they release it. */ msqkptr = &msqids[msqid]; if (msqkptr->u.msg_qbytes == 0 && (msqkptr->u.msg_perm.mode & MSG_LOCKED) == 0) break; } if (msqid == msginfo.msgmni) { DPRINTF(("no more msqid_ds's available\n")); error = ENOSPC; goto done2; } #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); error = racct_add(td->td_proc, RACCT_NMSGQ, 1); PROC_UNLOCK(td->td_proc); if (error != 0) { error = ENOSPC; goto done2; } } #endif DPRINTF(("msqid %d is available\n", msqid)); msqkptr->u.msg_perm.key = key; msqkptr->u.msg_perm.cuid = cred->cr_uid; msqkptr->u.msg_perm.uid = cred->cr_uid; msqkptr->u.msg_perm.cgid = cred->cr_gid; msqkptr->u.msg_perm.gid = cred->cr_gid; msqkptr->u.msg_perm.mode = (msgflg & 0777); msqkptr->cred = crhold(cred); /* Make sure that the returned msqid is unique */ msqkptr->u.msg_perm.seq = (msqkptr->u.msg_perm.seq + 1) & 0x7fff; msqkptr->u.msg_first = NULL; msqkptr->u.msg_last = NULL; msqkptr->u.msg_cbytes = 0; msqkptr->u.msg_qnum = 0; msqkptr->u.msg_qbytes = msginfo.msgmnb; msqkptr->u.msg_lspid = 0; msqkptr->u.msg_lrpid = 0; msqkptr->u.msg_stime = 0; msqkptr->u.msg_rtime = 0; msqkptr->u.msg_ctime = time_second; #ifdef MAC mac_sysvmsq_create(cred, msqkptr); #endif } else { DPRINTF(("didn't find it and wasn't asked to create it\n")); error = ENOENT; goto done2; } found: /* Construct the unique msqid */ td->td_retval[0] = IXSEQ_TO_IPCID(msqid, msqkptr->u.msg_perm); done2: mtx_unlock(&msq_mtx); return (error); } #ifndef _SYS_SYSPROTO_H_ struct msgsnd_args { int msqid; const void *msgp; size_t msgsz; int msgflg; }; #endif int kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype) struct thread *td; int msqid; const void *msgp; /* XXX msgp is actually mtext. */ size_t msgsz; int msgflg; long mtype; { int msqix, segs_needed, error = 0; register struct msqid_kernel *msqkptr; register struct msg *msghdr; short next; #ifdef RACCT size_t saved_msgsz; #endif if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); mtx_lock(&msq_mtx); msqix = IPCID_TO_IX(msqid); if (msqix < 0 || msqix >= msginfo.msgmni) { DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix, msginfo.msgmni)); error = EINVAL; goto done2; } msqkptr = &msqids[msqix]; if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("no such message queue id\n")); error = EINVAL; goto done2; } if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("wrong sequence number\n")); error = EINVAL; goto done2; } if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_W))) { DPRINTF(("requester doesn't have write access\n")); goto done2; } #ifdef MAC error = mac_sysvmsq_check_msqsnd(td->td_ucred, msqkptr); if (error != 0) goto done2; #endif #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); if (racct_add(td->td_proc, RACCT_MSGQQUEUED, 1)) { PROC_UNLOCK(td->td_proc); error = EAGAIN; goto done2; } saved_msgsz = msgsz; if (racct_add(td->td_proc, RACCT_MSGQSIZE, msgsz)) { racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1); PROC_UNLOCK(td->td_proc); error = EAGAIN; goto done2; } PROC_UNLOCK(td->td_proc); } #endif segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz; DPRINTF(("msgsz=%zu, msgssz=%d, segs_needed=%d\n", msgsz, msginfo.msgssz, segs_needed)); for (;;) { int need_more_resources = 0; /* * check msgsz * (inside this loop in case msg_qbytes changes while we sleep) */ if (msgsz > msqkptr->u.msg_qbytes) { DPRINTF(("msgsz > msqkptr->u.msg_qbytes\n")); error = EINVAL; goto done3; } if (msqkptr->u.msg_perm.mode & MSG_LOCKED) { DPRINTF(("msqid is locked\n")); need_more_resources = 1; } if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes) { DPRINTF(("msgsz + msg_cbytes > msg_qbytes\n")); need_more_resources = 1; } if (segs_needed > nfree_msgmaps) { DPRINTF(("segs_needed > nfree_msgmaps\n")); need_more_resources = 1; } if (free_msghdrs == NULL) { DPRINTF(("no more msghdrs\n")); need_more_resources = 1; } if (need_more_resources) { int we_own_it; if ((msgflg & IPC_NOWAIT) != 0) { DPRINTF(("need more resources but caller " "doesn't want to wait\n")); error = EAGAIN; goto done3; } if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) { DPRINTF(("we don't own the msqid_ds\n")); we_own_it = 0; } else { /* Force later arrivals to wait for our request */ DPRINTF(("we own the msqid_ds\n")); msqkptr->u.msg_perm.mode |= MSG_LOCKED; we_own_it = 1; } DPRINTF(("msgsnd: goodnight\n")); error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH, "msgsnd", hz); DPRINTF(("msgsnd: good morning, error=%d\n", error)); if (we_own_it) msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; if (error == EWOULDBLOCK) { DPRINTF(("msgsnd: timed out\n")); continue; } if (error != 0) { DPRINTF(("msgsnd: interrupted system call\n")); error = EINTR; goto done3; } /* * Make sure that the msq queue still exists */ if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("msqid deleted\n")); error = EIDRM; goto done3; } } else { DPRINTF(("got all the resources that we need\n")); break; } } /* * We have the resources that we need. * Make sure! */ if (msqkptr->u.msg_perm.mode & MSG_LOCKED) panic("msg_perm.mode & MSG_LOCKED"); if (segs_needed > nfree_msgmaps) panic("segs_needed > nfree_msgmaps"); if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes) panic("msgsz + msg_cbytes > msg_qbytes"); if (free_msghdrs == NULL) panic("no more msghdrs"); /* * Re-lock the msqid_ds in case we page-fault when copying in the * message */ if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) panic("msqid_ds is already locked"); msqkptr->u.msg_perm.mode |= MSG_LOCKED; /* * Allocate a message header */ msghdr = free_msghdrs; free_msghdrs = msghdr->msg_next; msghdr->msg_spot = -1; msghdr->msg_ts = msgsz; msghdr->msg_type = mtype; #ifdef MAC /* * XXXMAC: Should the mac_sysvmsq_check_msgmsq check follow here * immediately? Or, should it be checked just before the msg is * enqueued in the msgq (as it is done now)? */ mac_sysvmsg_create(td->td_ucred, msqkptr, msghdr); #endif /* * Allocate space for the message */ while (segs_needed > 0) { if (nfree_msgmaps <= 0) panic("not enough msgmaps"); if (free_msgmaps == -1) panic("nil free_msgmaps"); next = free_msgmaps; if (next <= -1) panic("next too low #1"); if (next >= msginfo.msgseg) panic("next out of range #1"); DPRINTF(("allocating segment %d to message\n", next)); free_msgmaps = msgmaps[next].next; nfree_msgmaps--; msgmaps[next].next = msghdr->msg_spot; msghdr->msg_spot = next; segs_needed--; } /* * Validate the message type */ if (msghdr->msg_type < 1) { msg_freehdr(msghdr); msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; wakeup(msqkptr); DPRINTF(("mtype (%ld) < 1\n", msghdr->msg_type)); error = EINVAL; goto done3; } /* * Copy in the message body */ next = msghdr->msg_spot; while (msgsz > 0) { size_t tlen; if (msgsz > msginfo.msgssz) tlen = msginfo.msgssz; else tlen = msgsz; if (next <= -1) panic("next too low #2"); if (next >= msginfo.msgseg) panic("next out of range #2"); mtx_unlock(&msq_mtx); if ((error = copyin(msgp, &msgpool[next * msginfo.msgssz], tlen)) != 0) { mtx_lock(&msq_mtx); DPRINTF(("error %d copying in message segment\n", error)); msg_freehdr(msghdr); msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; wakeup(msqkptr); goto done3; } mtx_lock(&msq_mtx); msgsz -= tlen; msgp = (const char *)msgp + tlen; next = msgmaps[next].next; } if (next != -1) panic("didn't use all the msg segments"); /* * We've got the message. Unlock the msqid_ds. */ msqkptr->u.msg_perm.mode &= ~MSG_LOCKED; /* * Make sure that the msqid_ds is still allocated. */ if (msqkptr->u.msg_qbytes == 0) { msg_freehdr(msghdr); wakeup(msqkptr); error = EIDRM; goto done3; } #ifdef MAC /* * Note: Since the task/thread allocates the msghdr and usually * primes it with its own MAC label, for a majority of policies, it * won't be necessary to check whether the msghdr has access * permissions to the msgq. The mac_sysvmsq_check_msqsnd check would * suffice in that case. However, this hook may be required where * individual policies derive a non-identical label for the msghdr * from the current thread label and may want to check the msghdr * enqueue permissions, along with read/write permissions to the * msgq. */ error = mac_sysvmsq_check_msgmsq(td->td_ucred, msghdr, msqkptr); if (error != 0) { msg_freehdr(msghdr); wakeup(msqkptr); goto done3; } #endif /* * Put the message into the queue */ if (msqkptr->u.msg_first == NULL) { msqkptr->u.msg_first = msghdr; msqkptr->u.msg_last = msghdr; } else { msqkptr->u.msg_last->msg_next = msghdr; msqkptr->u.msg_last = msghdr; } msqkptr->u.msg_last->msg_next = NULL; msqkptr->u.msg_cbytes += msghdr->msg_ts; msqkptr->u.msg_qnum++; msqkptr->u.msg_lspid = td->td_proc->p_pid; msqkptr->u.msg_stime = time_second; wakeup(msqkptr); td->td_retval[0] = 0; done3: #ifdef RACCT if (racct_enable && error != 0) { PROC_LOCK(td->td_proc); racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1); racct_sub(td->td_proc, RACCT_MSGQSIZE, saved_msgsz); PROC_UNLOCK(td->td_proc); } #endif done2: mtx_unlock(&msq_mtx); return (error); } int sys_msgsnd(td, uap) struct thread *td; register struct msgsnd_args *uap; { int error; long mtype; DPRINTF(("call to msgsnd(%d, %p, %zu, %d)\n", uap->msqid, uap->msgp, uap->msgsz, uap->msgflg)); if ((error = copyin(uap->msgp, &mtype, sizeof(mtype))) != 0) { DPRINTF(("error %d copying the message type\n", error)); return (error); } return (kern_msgsnd(td, uap->msqid, (const char *)uap->msgp + sizeof(mtype), uap->msgsz, uap->msgflg, mtype)); } #ifndef _SYS_SYSPROTO_H_ struct msgrcv_args { int msqid; void *msgp; size_t msgsz; long msgtyp; int msgflg; }; #endif int kern_msgrcv(td, msqid, msgp, msgsz, msgtyp, msgflg, mtype) struct thread *td; int msqid; void *msgp; /* XXX msgp is actually mtext. */ size_t msgsz; long msgtyp; int msgflg; long *mtype; { size_t len; register struct msqid_kernel *msqkptr; register struct msg *msghdr; int msqix, error = 0; short next; if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); msqix = IPCID_TO_IX(msqid); if (msqix < 0 || msqix >= msginfo.msgmni) { DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix, msginfo.msgmni)); return (EINVAL); } msqkptr = &msqids[msqix]; mtx_lock(&msq_mtx); if (msqkptr->u.msg_qbytes == 0) { DPRINTF(("no such message queue id\n")); error = EINVAL; goto done2; } if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("wrong sequence number\n")); error = EINVAL; goto done2; } if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) { DPRINTF(("requester doesn't have read access\n")); goto done2; } #ifdef MAC error = mac_sysvmsq_check_msqrcv(td->td_ucred, msqkptr); if (error != 0) goto done2; #endif msghdr = NULL; while (msghdr == NULL) { if (msgtyp == 0) { msghdr = msqkptr->u.msg_first; if (msghdr != NULL) { if (msgsz < msghdr->msg_ts && (msgflg & MSG_NOERROR) == 0) { DPRINTF(("first message on the queue " "is too big (want %zu, got %d)\n", msgsz, msghdr->msg_ts)); error = E2BIG; goto done2; } #ifdef MAC error = mac_sysvmsq_check_msgrcv(td->td_ucred, msghdr); if (error != 0) goto done2; #endif if (msqkptr->u.msg_first == msqkptr->u.msg_last) { msqkptr->u.msg_first = NULL; msqkptr->u.msg_last = NULL; } else { msqkptr->u.msg_first = msghdr->msg_next; if (msqkptr->u.msg_first == NULL) panic("msg_first/last screwed up #1"); } } } else { struct msg *previous; struct msg **prev; previous = NULL; prev = &(msqkptr->u.msg_first); while ((msghdr = *prev) != NULL) { /* * Is this message's type an exact match or is * this message's type less than or equal to * the absolute value of a negative msgtyp? * Note that the second half of this test can * NEVER be true if msgtyp is positive since * msg_type is always positive! */ if (msgtyp == msghdr->msg_type || msghdr->msg_type <= -msgtyp) { DPRINTF(("found message type %ld, " "requested %ld\n", msghdr->msg_type, msgtyp)); if (msgsz < msghdr->msg_ts && (msgflg & MSG_NOERROR) == 0) { DPRINTF(("requested message " "on the queue is too big " "(want %zu, got %hu)\n", msgsz, msghdr->msg_ts)); error = E2BIG; goto done2; } #ifdef MAC error = mac_sysvmsq_check_msgrcv( td->td_ucred, msghdr); if (error != 0) goto done2; #endif *prev = msghdr->msg_next; if (msghdr == msqkptr->u.msg_last) { if (previous == NULL) { if (prev != &msqkptr->u.msg_first) panic("msg_first/last screwed up #2"); msqkptr->u.msg_first = NULL; msqkptr->u.msg_last = NULL; } else { if (prev == &msqkptr->u.msg_first) panic("msg_first/last screwed up #3"); msqkptr->u.msg_last = previous; } } break; } previous = msghdr; prev = &(msghdr->msg_next); } } /* * We've either extracted the msghdr for the appropriate * message or there isn't one. * If there is one then bail out of this loop. */ if (msghdr != NULL) break; /* * Hmph! No message found. Does the user want to wait? */ if ((msgflg & IPC_NOWAIT) != 0) { DPRINTF(("no appropriate message found (msgtyp=%ld)\n", msgtyp)); /* The SVID says to return ENOMSG. */ error = ENOMSG; goto done2; } /* * Wait for something to happen */ DPRINTF(("msgrcv: goodnight\n")); error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH, "msgrcv", 0); DPRINTF(("msgrcv: good morning (error=%d)\n", error)); if (error != 0) { DPRINTF(("msgrcv: interrupted system call\n")); error = EINTR; goto done2; } /* * Make sure that the msq queue still exists */ if (msqkptr->u.msg_qbytes == 0 || msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) { DPRINTF(("msqid deleted\n")); error = EIDRM; goto done2; } } /* * Return the message to the user. * * First, do the bookkeeping (before we risk being interrupted). */ msqkptr->u.msg_cbytes -= msghdr->msg_ts; msqkptr->u.msg_qnum--; msqkptr->u.msg_lrpid = td->td_proc->p_pid; msqkptr->u.msg_rtime = time_second; racct_sub_cred(msqkptr->cred, RACCT_MSGQQUEUED, 1); racct_sub_cred(msqkptr->cred, RACCT_MSGQSIZE, msghdr->msg_ts); /* * Make msgsz the actual amount that we'll be returning. * Note that this effectively truncates the message if it is too long * (since msgsz is never increased). */ DPRINTF(("found a message, msgsz=%zu, msg_ts=%hu\n", msgsz, msghdr->msg_ts)); if (msgsz > msghdr->msg_ts) msgsz = msghdr->msg_ts; *mtype = msghdr->msg_type; /* * Return the segments to the user */ next = msghdr->msg_spot; for (len = 0; len < msgsz; len += msginfo.msgssz) { size_t tlen; if (msgsz - len > msginfo.msgssz) tlen = msginfo.msgssz; else tlen = msgsz - len; if (next <= -1) panic("next too low #3"); if (next >= msginfo.msgseg) panic("next out of range #3"); mtx_unlock(&msq_mtx); error = copyout(&msgpool[next * msginfo.msgssz], msgp, tlen); mtx_lock(&msq_mtx); if (error != 0) { DPRINTF(("error (%d) copying out message segment\n", error)); msg_freehdr(msghdr); wakeup(msqkptr); goto done2; } msgp = (char *)msgp + tlen; next = msgmaps[next].next; } /* * Done, return the actual number of bytes copied out. */ msg_freehdr(msghdr); wakeup(msqkptr); td->td_retval[0] = msgsz; done2: mtx_unlock(&msq_mtx); return (error); } int sys_msgrcv(td, uap) struct thread *td; register struct msgrcv_args *uap; { int error; long mtype; DPRINTF(("call to msgrcv(%d, %p, %zu, %ld, %d)\n", uap->msqid, uap->msgp, uap->msgsz, uap->msgtyp, uap->msgflg)); if ((error = kern_msgrcv(td, uap->msqid, (char *)uap->msgp + sizeof(mtype), uap->msgsz, uap->msgtyp, uap->msgflg, &mtype)) != 0) return (error); if ((error = copyout(&mtype, uap->msgp, sizeof(mtype))) != 0) DPRINTF(("error %d copying the message type\n", error)); return (error); } static int sysctl_msqids(SYSCTL_HANDLER_ARGS) { return (SYSCTL_OUT(req, msqids, sizeof(struct msqid_kernel) * msginfo.msgmni)); } SYSCTL_INT(_kern_ipc, OID_AUTO, msgmax, CTLFLAG_RD, &msginfo.msgmax, 0, "Maximum message size"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgmni, CTLFLAG_RDTUN, &msginfo.msgmni, 0, "Number of message queue identifiers"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgmnb, CTLFLAG_RDTUN, &msginfo.msgmnb, 0, "Maximum number of bytes in a queue"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgtql, CTLFLAG_RDTUN, &msginfo.msgtql, 0, "Maximum number of messages in the system"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgssz, CTLFLAG_RDTUN, &msginfo.msgssz, 0, "Size of a message segment"); SYSCTL_INT(_kern_ipc, OID_AUTO, msgseg, CTLFLAG_RDTUN, &msginfo.msgseg, 0, "Number of message segments"); SYSCTL_PROC(_kern_ipc, OID_AUTO, msqids, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, sysctl_msqids, "", "Message queue IDs"); #ifdef COMPAT_FREEBSD32 int freebsd32_msgsys(struct thread *td, struct freebsd32_msgsys_args *uap) { #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) switch (uap->which) { case 0: return (freebsd7_freebsd32_msgctl(td, (struct freebsd7_freebsd32_msgctl_args *)&uap->a2)); case 2: return (freebsd32_msgsnd(td, (struct freebsd32_msgsnd_args *)&uap->a2)); case 3: return (freebsd32_msgrcv(td, (struct freebsd32_msgrcv_args *)&uap->a2)); default: return (sys_msgsys(td, (struct msgsys_args *)uap)); } #else return (nosys(td, NULL)); #endif } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) int freebsd7_freebsd32_msgctl(struct thread *td, struct freebsd7_freebsd32_msgctl_args *uap) { struct msqid_ds msqbuf; struct msqid_ds32_old msqbuf32; int error; if (uap->cmd == IPC_SET) { error = copyin(uap->buf, &msqbuf32, sizeof(msqbuf32)); if (error) return (error); freebsd32_ipcperm_old_in(&msqbuf32.msg_perm, &msqbuf.msg_perm); PTRIN_CP(msqbuf32, msqbuf, msg_first); PTRIN_CP(msqbuf32, msqbuf, msg_last); CP(msqbuf32, msqbuf, msg_cbytes); CP(msqbuf32, msqbuf, msg_qnum); CP(msqbuf32, msqbuf, msg_qbytes); CP(msqbuf32, msqbuf, msg_lspid); CP(msqbuf32, msqbuf, msg_lrpid); CP(msqbuf32, msqbuf, msg_stime); CP(msqbuf32, msqbuf, msg_rtime); CP(msqbuf32, msqbuf, msg_ctime); } error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf); if (error) return (error); if (uap->cmd == IPC_STAT) { bzero(&msqbuf32, sizeof(msqbuf32)); freebsd32_ipcperm_old_out(&msqbuf.msg_perm, &msqbuf32.msg_perm); PTROUT_CP(msqbuf, msqbuf32, msg_first); PTROUT_CP(msqbuf, msqbuf32, msg_last); CP(msqbuf, msqbuf32, msg_cbytes); CP(msqbuf, msqbuf32, msg_qnum); CP(msqbuf, msqbuf32, msg_qbytes); CP(msqbuf, msqbuf32, msg_lspid); CP(msqbuf, msqbuf32, msg_lrpid); CP(msqbuf, msqbuf32, msg_stime); CP(msqbuf, msqbuf32, msg_rtime); CP(msqbuf, msqbuf32, msg_ctime); error = copyout(&msqbuf32, uap->buf, sizeof(struct msqid_ds32)); } return (error); } #endif int freebsd32_msgctl(struct thread *td, struct freebsd32_msgctl_args *uap) { struct msqid_ds msqbuf; struct msqid_ds32 msqbuf32; int error; if (uap->cmd == IPC_SET) { error = copyin(uap->buf, &msqbuf32, sizeof(msqbuf32)); if (error) return (error); freebsd32_ipcperm_in(&msqbuf32.msg_perm, &msqbuf.msg_perm); PTRIN_CP(msqbuf32, msqbuf, msg_first); PTRIN_CP(msqbuf32, msqbuf, msg_last); CP(msqbuf32, msqbuf, msg_cbytes); CP(msqbuf32, msqbuf, msg_qnum); CP(msqbuf32, msqbuf, msg_qbytes); CP(msqbuf32, msqbuf, msg_lspid); CP(msqbuf32, msqbuf, msg_lrpid); CP(msqbuf32, msqbuf, msg_stime); CP(msqbuf32, msqbuf, msg_rtime); CP(msqbuf32, msqbuf, msg_ctime); } error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf); if (error) return (error); if (uap->cmd == IPC_STAT) { freebsd32_ipcperm_out(&msqbuf.msg_perm, &msqbuf32.msg_perm); PTROUT_CP(msqbuf, msqbuf32, msg_first); PTROUT_CP(msqbuf, msqbuf32, msg_last); CP(msqbuf, msqbuf32, msg_cbytes); CP(msqbuf, msqbuf32, msg_qnum); CP(msqbuf, msqbuf32, msg_qbytes); CP(msqbuf, msqbuf32, msg_lspid); CP(msqbuf, msqbuf32, msg_lrpid); CP(msqbuf, msqbuf32, msg_stime); CP(msqbuf, msqbuf32, msg_rtime); CP(msqbuf, msqbuf32, msg_ctime); error = copyout(&msqbuf32, uap->buf, sizeof(struct msqid_ds32)); } return (error); } int freebsd32_msgsnd(struct thread *td, struct freebsd32_msgsnd_args *uap) { const void *msgp; long mtype; int32_t mtype32; int error; msgp = PTRIN(uap->msgp); if ((error = copyin(msgp, &mtype32, sizeof(mtype32))) != 0) return (error); mtype = mtype32; return (kern_msgsnd(td, uap->msqid, (const char *)msgp + sizeof(mtype32), uap->msgsz, uap->msgflg, mtype)); } int freebsd32_msgrcv(struct thread *td, struct freebsd32_msgrcv_args *uap) { void *msgp; long mtype; int32_t mtype32; int error; msgp = PTRIN(uap->msgp); if ((error = kern_msgrcv(td, uap->msqid, (char *)msgp + sizeof(mtype32), uap->msgsz, uap->msgtyp, uap->msgflg, &mtype)) != 0) return (error); mtype32 = (int32_t)mtype; return (copyout(&mtype32, msgp, sizeof(mtype32))); } #endif #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) /* XXX casting to (sy_call_t *) is bogus, as usual. */ static sy_call_t *msgcalls[] = { (sy_call_t *)freebsd7_msgctl, (sy_call_t *)sys_msgget, (sy_call_t *)sys_msgsnd, (sy_call_t *)sys_msgrcv }; /* * Entry point for all MSG calls. */ int sys_msgsys(td, uap) struct thread *td; /* XXX actually varargs. */ struct msgsys_args /* { int which; int a2; int a3; int a4; int a5; int a6; } */ *uap; { int error; if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); - if (uap->which < 0 || - uap->which >= nitems(msgcalls)) + if (uap->which < 0 || uap->which >= nitems(msgcalls)) return (EINVAL); error = (*msgcalls[uap->which])(td, &uap->a2); return (error); } #ifndef CP #define CP(src, dst, fld) do { (dst).fld = (src).fld; } while (0) #endif #ifndef _SYS_SYSPROTO_H_ struct freebsd7_msgctl_args { int msqid; int cmd; struct msqid_ds_old *buf; }; #endif int freebsd7_msgctl(td, uap) struct thread *td; struct freebsd7_msgctl_args *uap; { struct msqid_ds_old msqold; struct msqid_ds msqbuf; int error; DPRINTF(("call to freebsd7_msgctl(%d, %d, %p)\n", uap->msqid, uap->cmd, uap->buf)); if (uap->cmd == IPC_SET) { error = copyin(uap->buf, &msqold, sizeof(msqold)); if (error) return (error); ipcperm_old2new(&msqold.msg_perm, &msqbuf.msg_perm); CP(msqold, msqbuf, msg_first); CP(msqold, msqbuf, msg_last); CP(msqold, msqbuf, msg_cbytes); CP(msqold, msqbuf, msg_qnum); CP(msqold, msqbuf, msg_qbytes); CP(msqold, msqbuf, msg_lspid); CP(msqold, msqbuf, msg_lrpid); CP(msqold, msqbuf, msg_stime); CP(msqold, msqbuf, msg_rtime); CP(msqold, msqbuf, msg_ctime); } error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf); if (error) return (error); if (uap->cmd == IPC_STAT) { bzero(&msqold, sizeof(msqold)); ipcperm_new2old(&msqbuf.msg_perm, &msqold.msg_perm); CP(msqbuf, msqold, msg_first); CP(msqbuf, msqold, msg_last); CP(msqbuf, msqold, msg_cbytes); CP(msqbuf, msqold, msg_qnum); CP(msqbuf, msqold, msg_qbytes); CP(msqbuf, msqold, msg_lspid); CP(msqbuf, msqold, msg_lrpid); CP(msqbuf, msqold, msg_stime); CP(msqbuf, msqold, msg_rtime); CP(msqbuf, msqold, msg_ctime); error = copyout(&msqold, uap->buf, sizeof(struct msqid_ds_old)); } return (error); } #undef CP #endif /* COMPAT_FREEBSD4 || COMPAT_FREEBSD5 || COMPAT_FREEBSD6 || COMPAT_FREEBSD7 */ Index: head/sys/kern/sysv_sem.c =================================================================== --- head/sys/kern/sysv_sem.c (revision 298353) +++ head/sys/kern/sysv_sem.c (revision 298354) @@ -1,1661 +1,1660 @@ /*- * Implementation of SVID semaphores * * Author: Daniel Boulet * * This software is provided ``AS IS'' without any warranties of any kind. */ /*- * Copyright (c) 2003-2005 McAfee, Inc. * All rights reserved. * * This software was developed for the FreeBSD Project in part by McAfee * Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research * program. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_compat.h" #include "opt_sysvipc.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include FEATURE(sysv_sem, "System V semaphores support"); static MALLOC_DEFINE(M_SEM, "sem", "SVID compatible semaphores"); #ifdef SEM_DEBUG #define DPRINTF(a) printf a #else #define DPRINTF(a) #endif static int seminit(void); static int sysvsem_modload(struct module *, int, void *); static int semunload(void); static void semexit_myhook(void *arg, struct proc *p); static int sysctl_sema(SYSCTL_HANDLER_ARGS); static int semvalid(int semid, struct semid_kernel *semakptr); #ifndef _SYS_SYSPROTO_H_ struct __semctl_args; int __semctl(struct thread *td, struct __semctl_args *uap); struct semget_args; int semget(struct thread *td, struct semget_args *uap); struct semop_args; int semop(struct thread *td, struct semop_args *uap); #endif static struct sem_undo *semu_alloc(struct thread *td); static int semundo_adjust(struct thread *td, struct sem_undo **supptr, int semid, int semseq, int semnum, int adjval); static void semundo_clear(int semid, int semnum); static struct mtx sem_mtx; /* semaphore global lock */ static struct mtx sem_undo_mtx; static int semtot = 0; static struct semid_kernel *sema; /* semaphore id pool */ static struct mtx *sema_mtx; /* semaphore id pool mutexes*/ static struct sem *sem; /* semaphore pool */ LIST_HEAD(, sem_undo) semu_list; /* list of active undo structures */ LIST_HEAD(, sem_undo) semu_free_list; /* list of free undo structures */ static int *semu; /* undo structure pool */ static eventhandler_tag semexit_tag; #define SEMUNDO_MTX sem_undo_mtx #define SEMUNDO_LOCK() mtx_lock(&SEMUNDO_MTX); #define SEMUNDO_UNLOCK() mtx_unlock(&SEMUNDO_MTX); #define SEMUNDO_LOCKASSERT(how) mtx_assert(&SEMUNDO_MTX, (how)); struct sem { u_short semval; /* semaphore value */ pid_t sempid; /* pid of last operation */ u_short semncnt; /* # awaiting semval > cval */ u_short semzcnt; /* # awaiting semval = 0 */ }; /* * Undo structure (one per process) */ struct sem_undo { LIST_ENTRY(sem_undo) un_next; /* ptr to next active undo structure */ struct proc *un_proc; /* owner of this structure */ short un_cnt; /* # of active entries */ struct undo { short un_adjval; /* adjust on exit values */ short un_num; /* semaphore # */ int un_id; /* semid */ unsigned short un_seq; } un_ent[1]; /* undo entries */ }; /* * Configuration parameters */ #ifndef SEMMNI #define SEMMNI 50 /* # of semaphore identifiers */ #endif #ifndef SEMMNS #define SEMMNS 340 /* # of semaphores in system */ #endif #ifndef SEMUME #define SEMUME 50 /* max # of undo entries per process */ #endif #ifndef SEMMNU #define SEMMNU 150 /* # of undo structures in system */ #endif /* shouldn't need tuning */ #ifndef SEMMSL #define SEMMSL SEMMNS /* max # of semaphores per id */ #endif #ifndef SEMOPM #define SEMOPM 100 /* max # of operations per semop call */ #endif #define SEMVMX 32767 /* semaphore maximum value */ #define SEMAEM 16384 /* adjust on exit max value */ /* * Due to the way semaphore memory is allocated, we have to ensure that * SEMUSZ is properly aligned. */ #define SEM_ALIGN(bytes) (((bytes) + (sizeof(long) - 1)) & ~(sizeof(long) - 1)) /* actual size of an undo structure */ #define SEMUSZ SEM_ALIGN(offsetof(struct sem_undo, un_ent[SEMUME])) /* * Macro to find a particular sem_undo vector */ #define SEMU(ix) \ ((struct sem_undo *)(((intptr_t)semu)+ix * seminfo.semusz)) /* * semaphore info struct */ struct seminfo seminfo = { SEMMNI, /* # of semaphore identifiers */ SEMMNS, /* # of semaphores in system */ SEMMNU, /* # of undo structures in system */ SEMMSL, /* max # of semaphores per id */ SEMOPM, /* max # of operations per semop call */ SEMUME, /* max # of undo entries per process */ SEMUSZ, /* size in bytes of undo structure */ SEMVMX, /* semaphore maximum value */ SEMAEM /* adjust on exit max value */ }; SYSCTL_INT(_kern_ipc, OID_AUTO, semmni, CTLFLAG_RDTUN, &seminfo.semmni, 0, "Number of semaphore identifiers"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmns, CTLFLAG_RDTUN, &seminfo.semmns, 0, "Maximum number of semaphores in the system"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmnu, CTLFLAG_RDTUN, &seminfo.semmnu, 0, "Maximum number of undo structures in the system"); SYSCTL_INT(_kern_ipc, OID_AUTO, semmsl, CTLFLAG_RWTUN, &seminfo.semmsl, 0, "Max semaphores per id"); SYSCTL_INT(_kern_ipc, OID_AUTO, semopm, CTLFLAG_RDTUN, &seminfo.semopm, 0, "Max operations per semop call"); SYSCTL_INT(_kern_ipc, OID_AUTO, semume, CTLFLAG_RDTUN, &seminfo.semume, 0, "Max undo entries per process"); SYSCTL_INT(_kern_ipc, OID_AUTO, semusz, CTLFLAG_RDTUN, &seminfo.semusz, 0, "Size in bytes of undo structure"); SYSCTL_INT(_kern_ipc, OID_AUTO, semvmx, CTLFLAG_RWTUN, &seminfo.semvmx, 0, "Semaphore maximum value"); SYSCTL_INT(_kern_ipc, OID_AUTO, semaem, CTLFLAG_RWTUN, &seminfo.semaem, 0, "Adjust on exit max value"); SYSCTL_PROC(_kern_ipc, OID_AUTO, sema, CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, sysctl_sema, "", "Semaphore id pool"); static struct syscall_helper_data sem_syscalls[] = { SYSCALL_INIT_HELPER(__semctl), SYSCALL_INIT_HELPER(semget), SYSCALL_INIT_HELPER(semop), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL_INIT_HELPER(semsys), SYSCALL_INIT_HELPER_COMPAT(freebsd7___semctl), #endif SYSCALL_INIT_LAST }; #ifdef COMPAT_FREEBSD32 #include #include #include #include #include #include static struct syscall_helper_data sem32_syscalls[] = { SYSCALL32_INIT_HELPER(freebsd32_semctl), SYSCALL32_INIT_HELPER_COMPAT(semget), SYSCALL32_INIT_HELPER_COMPAT(semop), SYSCALL32_INIT_HELPER(freebsd32_semsys), #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) SYSCALL32_INIT_HELPER(freebsd7_freebsd32_semctl), #endif SYSCALL_INIT_LAST }; #endif static int seminit(void) { int i, error; sem = malloc(sizeof(struct sem) * seminfo.semmns, M_SEM, M_WAITOK); sema = malloc(sizeof(struct semid_kernel) * seminfo.semmni, M_SEM, M_WAITOK); sema_mtx = malloc(sizeof(struct mtx) * seminfo.semmni, M_SEM, M_WAITOK | M_ZERO); semu = malloc(seminfo.semmnu * seminfo.semusz, M_SEM, M_WAITOK); for (i = 0; i < seminfo.semmni; i++) { sema[i].u.sem_base = 0; sema[i].u.sem_perm.mode = 0; sema[i].u.sem_perm.seq = 0; #ifdef MAC mac_sysvsem_init(&sema[i]); #endif } for (i = 0; i < seminfo.semmni; i++) mtx_init(&sema_mtx[i], "semid", NULL, MTX_DEF); LIST_INIT(&semu_free_list); for (i = 0; i < seminfo.semmnu; i++) { struct sem_undo *suptr = SEMU(i); suptr->un_proc = NULL; LIST_INSERT_HEAD(&semu_free_list, suptr, un_next); } LIST_INIT(&semu_list); mtx_init(&sem_mtx, "sem", NULL, MTX_DEF); mtx_init(&sem_undo_mtx, "semu", NULL, MTX_DEF); semexit_tag = EVENTHANDLER_REGISTER(process_exit, semexit_myhook, NULL, EVENTHANDLER_PRI_ANY); error = syscall_helper_register(sem_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #ifdef COMPAT_FREEBSD32 error = syscall32_helper_register(sem32_syscalls, SY_THR_STATIC_KLD); if (error != 0) return (error); #endif return (0); } static int semunload(void) { int i; /* XXXKIB */ if (semtot != 0) return (EBUSY); #ifdef COMPAT_FREEBSD32 syscall32_helper_unregister(sem32_syscalls); #endif syscall_helper_unregister(sem_syscalls); EVENTHANDLER_DEREGISTER(process_exit, semexit_tag); #ifdef MAC for (i = 0; i < seminfo.semmni; i++) mac_sysvsem_destroy(&sema[i]); #endif free(sem, M_SEM); free(sema, M_SEM); free(semu, M_SEM); for (i = 0; i < seminfo.semmni; i++) mtx_destroy(&sema_mtx[i]); free(sema_mtx, M_SEM); mtx_destroy(&sem_mtx); mtx_destroy(&sem_undo_mtx); return (0); } static int sysvsem_modload(struct module *module, int cmd, void *arg) { int error = 0; switch (cmd) { case MOD_LOAD: error = seminit(); if (error != 0) semunload(); break; case MOD_UNLOAD: error = semunload(); break; case MOD_SHUTDOWN: break; default: error = EINVAL; break; } return (error); } static moduledata_t sysvsem_mod = { "sysvsem", &sysvsem_modload, NULL }; DECLARE_MODULE(sysvsem, sysvsem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST); MODULE_VERSION(sysvsem, 1); /* * Allocate a new sem_undo structure for a process * (returns ptr to structure or NULL if no more room) */ static struct sem_undo * semu_alloc(struct thread *td) { struct sem_undo *suptr; SEMUNDO_LOCKASSERT(MA_OWNED); if ((suptr = LIST_FIRST(&semu_free_list)) == NULL) return (NULL); LIST_REMOVE(suptr, un_next); LIST_INSERT_HEAD(&semu_list, suptr, un_next); suptr->un_cnt = 0; suptr->un_proc = td->td_proc; return (suptr); } static int semu_try_free(struct sem_undo *suptr) { SEMUNDO_LOCKASSERT(MA_OWNED); if (suptr->un_cnt != 0) return (0); LIST_REMOVE(suptr, un_next); LIST_INSERT_HEAD(&semu_free_list, suptr, un_next); return (1); } /* * Adjust a particular entry for a particular proc */ static int semundo_adjust(struct thread *td, struct sem_undo **supptr, int semid, int semseq, int semnum, int adjval) { struct proc *p = td->td_proc; struct sem_undo *suptr; struct undo *sunptr; int i; SEMUNDO_LOCKASSERT(MA_OWNED); /* Look for and remember the sem_undo if the caller doesn't provide it */ suptr = *supptr; if (suptr == NULL) { LIST_FOREACH(suptr, &semu_list, un_next) { if (suptr->un_proc == p) { *supptr = suptr; break; } } if (suptr == NULL) { if (adjval == 0) return(0); suptr = semu_alloc(td); if (suptr == NULL) return (ENOSPC); *supptr = suptr; } } /* * Look for the requested entry and adjust it (delete if adjval becomes * 0). */ sunptr = &suptr->un_ent[0]; for (i = 0; i < suptr->un_cnt; i++, sunptr++) { if (sunptr->un_id != semid || sunptr->un_num != semnum) continue; if (adjval != 0) { adjval += sunptr->un_adjval; if (adjval > seminfo.semaem || adjval < -seminfo.semaem) return (ERANGE); } sunptr->un_adjval = adjval; if (sunptr->un_adjval == 0) { suptr->un_cnt--; if (i < suptr->un_cnt) suptr->un_ent[i] = suptr->un_ent[suptr->un_cnt]; if (suptr->un_cnt == 0) semu_try_free(suptr); } return (0); } /* Didn't find the right entry - create it */ if (adjval == 0) return (0); if (adjval > seminfo.semaem || adjval < -seminfo.semaem) return (ERANGE); if (suptr->un_cnt != seminfo.semume) { sunptr = &suptr->un_ent[suptr->un_cnt]; suptr->un_cnt++; sunptr->un_adjval = adjval; sunptr->un_id = semid; sunptr->un_num = semnum; sunptr->un_seq = semseq; } else return (EINVAL); return (0); } static void semundo_clear(int semid, int semnum) { struct sem_undo *suptr, *suptr1; struct undo *sunptr; int i; SEMUNDO_LOCKASSERT(MA_OWNED); LIST_FOREACH_SAFE(suptr, &semu_list, un_next, suptr1) { sunptr = &suptr->un_ent[0]; for (i = 0; i < suptr->un_cnt; i++, sunptr++) { if (sunptr->un_id != semid) continue; if (semnum == -1 || sunptr->un_num == semnum) { suptr->un_cnt--; if (i < suptr->un_cnt) { suptr->un_ent[i] = suptr->un_ent[suptr->un_cnt]; continue; } semu_try_free(suptr); } if (semnum != -1) break; } } } static int semvalid(int semid, struct semid_kernel *semakptr) { return ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 || semakptr->u.sem_perm.seq != IPCID_TO_SEQ(semid) ? EINVAL : 0); } /* * Note that the user-mode half of this passes a union, not a pointer. */ #ifndef _SYS_SYSPROTO_H_ struct __semctl_args { int semid; int semnum; int cmd; union semun *arg; }; #endif int sys___semctl(struct thread *td, struct __semctl_args *uap) { struct semid_ds dsbuf; union semun arg, semun; register_t rval; int error; switch (uap->cmd) { case SEM_STAT: case IPC_SET: case IPC_STAT: case GETALL: case SETVAL: case SETALL: error = copyin(uap->arg, &arg, sizeof(arg)); if (error) return (error); break; } switch (uap->cmd) { case SEM_STAT: case IPC_STAT: semun.buf = &dsbuf; break; case IPC_SET: error = copyin(arg.buf, &dsbuf, sizeof(dsbuf)); if (error) return (error); semun.buf = &dsbuf; break; case GETALL: case SETALL: semun.array = arg.array; break; case SETVAL: semun.val = arg.val; break; } error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun, &rval); if (error) return (error); switch (uap->cmd) { case SEM_STAT: case IPC_STAT: error = copyout(&dsbuf, arg.buf, sizeof(dsbuf)); break; } if (error == 0) td->td_retval[0] = rval; return (error); } int kern_semctl(struct thread *td, int semid, int semnum, int cmd, union semun *arg, register_t *rval) { u_short *array; struct ucred *cred = td->td_ucred; int i, error; struct semid_ds *sbuf; struct semid_kernel *semakptr; struct mtx *sema_mtxp; u_short usval, count; int semidx; DPRINTF(("call to semctl(%d, %d, %d, 0x%p)\n", semid, semnum, cmd, arg)); if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); array = NULL; switch(cmd) { case SEM_STAT: /* * For this command we assume semid is an array index * rather than an IPC id. */ if (semid < 0 || semid >= seminfo.semmni) return (EINVAL); semakptr = &sema[semid]; sema_mtxp = &sema_mtx[semid]; mtx_lock(sema_mtxp); if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) { error = EINVAL; goto done2; } if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; #ifdef MAC error = mac_sysvsem_check_semctl(cred, semakptr, cmd); if (error != 0) goto done2; #endif bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds)); *rval = IXSEQ_TO_IPCID(semid, semakptr->u.sem_perm); mtx_unlock(sema_mtxp); return (0); } semidx = IPCID_TO_IX(semid); if (semidx < 0 || semidx >= seminfo.semmni) return (EINVAL); semakptr = &sema[semidx]; sema_mtxp = &sema_mtx[semidx]; if (cmd == IPC_RMID) mtx_lock(&sem_mtx); mtx_lock(sema_mtxp); #ifdef MAC error = mac_sysvsem_check_semctl(cred, semakptr, cmd); if (error != 0) goto done2; #endif error = 0; *rval = 0; switch (cmd) { case IPC_RMID: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M))) goto done2; semakptr->u.sem_perm.cuid = cred->cr_uid; semakptr->u.sem_perm.uid = cred->cr_uid; semakptr->u.sem_perm.mode = 0; racct_sub_cred(semakptr->cred, RACCT_NSEM, semakptr->u.sem_nsems); crfree(semakptr->cred); semakptr->cred = NULL; SEMUNDO_LOCK(); semundo_clear(semidx, -1); SEMUNDO_UNLOCK(); #ifdef MAC mac_sysvsem_cleanup(semakptr); #endif wakeup(semakptr); for (i = 0; i < seminfo.semmni; i++) { if ((sema[i].u.sem_perm.mode & SEM_ALLOC) && sema[i].u.sem_base > semakptr->u.sem_base) mtx_lock_flags(&sema_mtx[i], LOP_DUPOK); } for (i = semakptr->u.sem_base - sem; i < semtot; i++) sem[i] = sem[i + semakptr->u.sem_nsems]; for (i = 0; i < seminfo.semmni; i++) { if ((sema[i].u.sem_perm.mode & SEM_ALLOC) && sema[i].u.sem_base > semakptr->u.sem_base) { sema[i].u.sem_base -= semakptr->u.sem_nsems; mtx_unlock(&sema_mtx[i]); } } semtot -= semakptr->u.sem_nsems; break; case IPC_SET: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M))) goto done2; sbuf = arg->buf; semakptr->u.sem_perm.uid = sbuf->sem_perm.uid; semakptr->u.sem_perm.gid = sbuf->sem_perm.gid; semakptr->u.sem_perm.mode = (semakptr->u.sem_perm.mode & ~0777) | (sbuf->sem_perm.mode & 0777); semakptr->u.sem_ctime = time_second; break; case IPC_STAT: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds)); break; case GETNCNT: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].semncnt; break; case GETPID: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].sempid; break; case GETVAL: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].semval; break; case GETALL: /* * Unfortunately, callers of this function don't know * in advance how many semaphores are in this set. * While we could just allocate the maximum size array * and pass the actual size back to the caller, that * won't work for SETALL since we can't copyin() more * data than the user specified as we may return a * spurious EFAULT. * * Note that the number of semaphores in a set is * fixed for the life of that set. The only way that * the 'count' could change while are blocked in * malloc() is if this semaphore set were destroyed * and a new one created with the same index. * However, semvalid() will catch that due to the * sequence number unless exactly 0x8000 (or a * multiple thereof) semaphore sets for the same index * are created and destroyed while we are in malloc! * */ count = semakptr->u.sem_nsems; mtx_unlock(sema_mtxp); array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK); mtx_lock(sema_mtxp); if ((error = semvalid(semid, semakptr)) != 0) goto done2; KASSERT(count == semakptr->u.sem_nsems, ("nsems changed")); if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; for (i = 0; i < semakptr->u.sem_nsems; i++) array[i] = semakptr->u.sem_base[i].semval; mtx_unlock(sema_mtxp); error = copyout(array, arg->array, count * sizeof(*array)); mtx_lock(sema_mtxp); break; case GETZCNT: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } *rval = semakptr->u.sem_base[semnum].semzcnt; break; case SETVAL: if ((error = semvalid(semid, semakptr)) != 0) goto done2; if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W))) goto done2; if (semnum < 0 || semnum >= semakptr->u.sem_nsems) { error = EINVAL; goto done2; } if (arg->val < 0 || arg->val > seminfo.semvmx) { error = ERANGE; goto done2; } semakptr->u.sem_base[semnum].semval = arg->val; SEMUNDO_LOCK(); semundo_clear(semidx, semnum); SEMUNDO_UNLOCK(); wakeup(semakptr); break; case SETALL: /* * See comment on GETALL for why 'count' shouldn't change * and why we require a userland buffer. */ count = semakptr->u.sem_nsems; mtx_unlock(sema_mtxp); array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK); error = copyin(arg->array, array, count * sizeof(*array)); mtx_lock(sema_mtxp); if (error) break; if ((error = semvalid(semid, semakptr)) != 0) goto done2; KASSERT(count == semakptr->u.sem_nsems, ("nsems changed")); if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W))) goto done2; for (i = 0; i < semakptr->u.sem_nsems; i++) { usval = array[i]; if (usval > seminfo.semvmx) { error = ERANGE; break; } semakptr->u.sem_base[i].semval = usval; } SEMUNDO_LOCK(); semundo_clear(semidx, -1); SEMUNDO_UNLOCK(); wakeup(semakptr); break; default: error = EINVAL; break; } done2: mtx_unlock(sema_mtxp); if (cmd == IPC_RMID) mtx_unlock(&sem_mtx); if (array != NULL) free(array, M_TEMP); return(error); } #ifndef _SYS_SYSPROTO_H_ struct semget_args { key_t key; int nsems; int semflg; }; #endif int sys_semget(struct thread *td, struct semget_args *uap) { int semid, error = 0; int key = uap->key; int nsems = uap->nsems; int semflg = uap->semflg; struct ucred *cred = td->td_ucred; DPRINTF(("semget(0x%x, %d, 0%o)\n", key, nsems, semflg)); if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); mtx_lock(&sem_mtx); if (key != IPC_PRIVATE) { for (semid = 0; semid < seminfo.semmni; semid++) { if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) && sema[semid].u.sem_perm.key == key) break; } if (semid < seminfo.semmni) { DPRINTF(("found public key\n")); if ((semflg & IPC_CREAT) && (semflg & IPC_EXCL)) { DPRINTF(("not exclusive\n")); error = EEXIST; goto done2; } if ((error = ipcperm(td, &sema[semid].u.sem_perm, semflg & 0700))) { goto done2; } if (nsems > 0 && sema[semid].u.sem_nsems < nsems) { DPRINTF(("too small\n")); error = EINVAL; goto done2; } #ifdef MAC error = mac_sysvsem_check_semget(cred, &sema[semid]); if (error != 0) goto done2; #endif goto found; } } DPRINTF(("need to allocate the semid_kernel\n")); if (key == IPC_PRIVATE || (semflg & IPC_CREAT)) { if (nsems <= 0 || nsems > seminfo.semmsl) { DPRINTF(("nsems out of range (0<%d<=%d)\n", nsems, seminfo.semmsl)); error = EINVAL; goto done2; } if (nsems > seminfo.semmns - semtot) { DPRINTF(( "not enough semaphores left (need %d, got %d)\n", nsems, seminfo.semmns - semtot)); error = ENOSPC; goto done2; } for (semid = 0; semid < seminfo.semmni; semid++) { if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0) break; } if (semid == seminfo.semmni) { DPRINTF(("no more semid_kernel's available\n")); error = ENOSPC; goto done2; } #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); error = racct_add(td->td_proc, RACCT_NSEM, nsems); PROC_UNLOCK(td->td_proc); if (error != 0) { error = ENOSPC; goto done2; } } #endif DPRINTF(("semid %d is available\n", semid)); mtx_lock(&sema_mtx[semid]); KASSERT((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0, ("Lost semaphore %d", semid)); sema[semid].u.sem_perm.key = key; sema[semid].u.sem_perm.cuid = cred->cr_uid; sema[semid].u.sem_perm.uid = cred->cr_uid; sema[semid].u.sem_perm.cgid = cred->cr_gid; sema[semid].u.sem_perm.gid = cred->cr_gid; sema[semid].u.sem_perm.mode = (semflg & 0777) | SEM_ALLOC; sema[semid].cred = crhold(cred); sema[semid].u.sem_perm.seq = (sema[semid].u.sem_perm.seq + 1) & 0x7fff; sema[semid].u.sem_nsems = nsems; sema[semid].u.sem_otime = 0; sema[semid].u.sem_ctime = time_second; sema[semid].u.sem_base = &sem[semtot]; semtot += nsems; bzero(sema[semid].u.sem_base, sizeof(sema[semid].u.sem_base[0])*nsems); #ifdef MAC mac_sysvsem_create(cred, &sema[semid]); #endif mtx_unlock(&sema_mtx[semid]); DPRINTF(("sembase = %p, next = %p\n", sema[semid].u.sem_base, &sem[semtot])); } else { DPRINTF(("didn't find it and wasn't asked to create it\n")); error = ENOENT; goto done2; } found: td->td_retval[0] = IXSEQ_TO_IPCID(semid, sema[semid].u.sem_perm); done2: mtx_unlock(&sem_mtx); return (error); } #ifndef _SYS_SYSPROTO_H_ struct semop_args { int semid; struct sembuf *sops; size_t nsops; }; #endif int sys_semop(struct thread *td, struct semop_args *uap) { #define SMALL_SOPS 8 struct sembuf small_sops[SMALL_SOPS]; int semid = uap->semid; size_t nsops = uap->nsops; struct sembuf *sops; struct semid_kernel *semakptr; struct sembuf *sopptr = NULL; struct sem *semptr = NULL; struct sem_undo *suptr; struct mtx *sema_mtxp; size_t i, j, k; int error; int do_wakeup, do_undos; unsigned short seq; #ifdef SEM_DEBUG sops = NULL; #endif DPRINTF(("call to semop(%d, %p, %u)\n", semid, sops, nsops)); if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); semid = IPCID_TO_IX(semid); /* Convert back to zero origin */ if (semid < 0 || semid >= seminfo.semmni) return (EINVAL); /* Allocate memory for sem_ops */ if (nsops <= SMALL_SOPS) sops = small_sops; else if (nsops > seminfo.semopm) { DPRINTF(("too many sops (max=%d, nsops=%d)\n", seminfo.semopm, nsops)); return (E2BIG); } else { #ifdef RACCT if (racct_enable) { PROC_LOCK(td->td_proc); if (nsops > racct_get_available(td->td_proc, RACCT_NSEMOP)) { PROC_UNLOCK(td->td_proc); return (E2BIG); } PROC_UNLOCK(td->td_proc); } #endif sops = malloc(nsops * sizeof(*sops), M_TEMP, M_WAITOK); } if ((error = copyin(uap->sops, sops, nsops * sizeof(sops[0]))) != 0) { DPRINTF(("error = %d from copyin(%p, %p, %d)\n", error, uap->sops, sops, nsops * sizeof(sops[0]))); if (sops != small_sops) free(sops, M_SEM); return (error); } semakptr = &sema[semid]; sema_mtxp = &sema_mtx[semid]; mtx_lock(sema_mtxp); if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) { error = EINVAL; goto done2; } seq = semakptr->u.sem_perm.seq; if (seq != IPCID_TO_SEQ(uap->semid)) { error = EINVAL; goto done2; } /* * Initial pass thru sops to see what permissions are needed. * Also perform any checks that don't need repeating on each * attempt to satisfy the request vector. */ j = 0; /* permission needed */ do_undos = 0; for (i = 0; i < nsops; i++) { sopptr = &sops[i]; if (sopptr->sem_num >= semakptr->u.sem_nsems) { error = EFBIG; goto done2; } if (sopptr->sem_flg & SEM_UNDO && sopptr->sem_op != 0) do_undos = 1; j |= (sopptr->sem_op == 0) ? SEM_R : SEM_A; } if ((error = ipcperm(td, &semakptr->u.sem_perm, j))) { DPRINTF(("error = %d from ipaccess\n", error)); goto done2; } #ifdef MAC error = mac_sysvsem_check_semop(td->td_ucred, semakptr, j); if (error != 0) goto done2; #endif /* * Loop trying to satisfy the vector of requests. * If we reach a point where we must wait, any requests already * performed are rolled back and we go to sleep until some other * process wakes us up. At this point, we start all over again. * * This ensures that from the perspective of other tasks, a set * of requests is atomic (never partially satisfied). */ for (;;) { do_wakeup = 0; error = 0; /* error return if necessary */ for (i = 0; i < nsops; i++) { sopptr = &sops[i]; semptr = &semakptr->u.sem_base[sopptr->sem_num]; DPRINTF(( "semop: semakptr=%p, sem_base=%p, " "semptr=%p, sem[%d]=%d : op=%d, flag=%s\n", semakptr, semakptr->u.sem_base, semptr, sopptr->sem_num, semptr->semval, sopptr->sem_op, (sopptr->sem_flg & IPC_NOWAIT) ? "nowait" : "wait")); if (sopptr->sem_op < 0) { if (semptr->semval + sopptr->sem_op < 0) { DPRINTF(("semop: can't do it now\n")); break; } else { semptr->semval += sopptr->sem_op; if (semptr->semval == 0 && semptr->semzcnt > 0) do_wakeup = 1; } } else if (sopptr->sem_op == 0) { if (semptr->semval != 0) { DPRINTF(("semop: not zero now\n")); break; } } else if (semptr->semval + sopptr->sem_op > seminfo.semvmx) { error = ERANGE; break; } else { if (semptr->semncnt > 0) do_wakeup = 1; semptr->semval += sopptr->sem_op; } } /* * Did we get through the entire vector? */ if (i >= nsops) goto done; /* * No ... rollback anything that we've already done */ DPRINTF(("semop: rollback 0 through %d\n", i-1)); for (j = 0; j < i; j++) semakptr->u.sem_base[sops[j].sem_num].semval -= sops[j].sem_op; /* If we detected an error, return it */ if (error != 0) goto done2; /* * If the request that we couldn't satisfy has the * NOWAIT flag set then return with EAGAIN. */ if (sopptr->sem_flg & IPC_NOWAIT) { error = EAGAIN; goto done2; } if (sopptr->sem_op == 0) semptr->semzcnt++; else semptr->semncnt++; DPRINTF(("semop: good night!\n")); error = msleep(semakptr, sema_mtxp, (PZERO - 4) | PCATCH, "semwait", 0); DPRINTF(("semop: good morning (error=%d)!\n", error)); /* return code is checked below, after sem[nz]cnt-- */ /* * Make sure that the semaphore still exists */ seq = semakptr->u.sem_perm.seq; if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 || seq != IPCID_TO_SEQ(uap->semid)) { error = EIDRM; goto done2; } /* * Renew the semaphore's pointer after wakeup since * during msleep sem_base may have been modified and semptr * is not valid any more */ semptr = &semakptr->u.sem_base[sopptr->sem_num]; /* * The semaphore is still alive. Readjust the count of * waiting processes. */ if (sopptr->sem_op == 0) semptr->semzcnt--; else semptr->semncnt--; /* * Is it really morning, or was our sleep interrupted? * (Delayed check of msleep() return code because we * need to decrement sem[nz]cnt either way.) */ if (error != 0) { error = EINTR; goto done2; } DPRINTF(("semop: good morning!\n")); } done: /* * Process any SEM_UNDO requests. */ if (do_undos) { SEMUNDO_LOCK(); suptr = NULL; for (i = 0; i < nsops; i++) { /* * We only need to deal with SEM_UNDO's for non-zero * op's. */ int adjval; if ((sops[i].sem_flg & SEM_UNDO) == 0) continue; adjval = sops[i].sem_op; if (adjval == 0) continue; error = semundo_adjust(td, &suptr, semid, seq, sops[i].sem_num, -adjval); if (error == 0) continue; /* * Oh-Oh! We ran out of either sem_undo's or undo's. * Rollback the adjustments to this point and then * rollback the semaphore ups and down so we can return * with an error with all structures restored. We * rollback the undo's in the exact reverse order that * we applied them. This guarantees that we won't run * out of space as we roll things back out. */ for (j = 0; j < i; j++) { k = i - j - 1; if ((sops[k].sem_flg & SEM_UNDO) == 0) continue; adjval = sops[k].sem_op; if (adjval == 0) continue; if (semundo_adjust(td, &suptr, semid, seq, sops[k].sem_num, adjval) != 0) panic("semop - can't undo undos"); } for (j = 0; j < nsops; j++) semakptr->u.sem_base[sops[j].sem_num].semval -= sops[j].sem_op; DPRINTF(("error = %d from semundo_adjust\n", error)); SEMUNDO_UNLOCK(); goto done2; } /* loop through the sops */ SEMUNDO_UNLOCK(); } /* if (do_undos) */ /* We're definitely done - set the sempid's and time */ for (i = 0; i < nsops; i++) { sopptr = &sops[i]; semptr = &semakptr->u.sem_base[sopptr->sem_num]; semptr->sempid = td->td_proc->p_pid; } semakptr->u.sem_otime = time_second; /* * Do a wakeup if any semaphore was up'd whilst something was * sleeping on it. */ if (do_wakeup) { DPRINTF(("semop: doing wakeup\n")); wakeup(semakptr); DPRINTF(("semop: back from wakeup\n")); } DPRINTF(("semop: done\n")); td->td_retval[0] = 0; done2: mtx_unlock(sema_mtxp); if (sops != small_sops) free(sops, M_SEM); return (error); } /* * Go through the undo structures for this process and apply the adjustments to * semaphores. */ static void semexit_myhook(void *arg, struct proc *p) { struct sem_undo *suptr; struct semid_kernel *semakptr; struct mtx *sema_mtxp; int semid, semnum, adjval, ix; unsigned short seq; /* * Go through the chain of undo vectors looking for one * associated with this process. */ SEMUNDO_LOCK(); LIST_FOREACH(suptr, &semu_list, un_next) { if (suptr->un_proc == p) break; } if (suptr == NULL) { SEMUNDO_UNLOCK(); return; } LIST_REMOVE(suptr, un_next); DPRINTF(("proc @%p has undo structure with %d entries\n", p, suptr->un_cnt)); /* * If there are any active undo elements then process them. */ if (suptr->un_cnt > 0) { SEMUNDO_UNLOCK(); for (ix = 0; ix < suptr->un_cnt; ix++) { semid = suptr->un_ent[ix].un_id; semnum = suptr->un_ent[ix].un_num; adjval = suptr->un_ent[ix].un_adjval; seq = suptr->un_ent[ix].un_seq; semakptr = &sema[semid]; sema_mtxp = &sema_mtx[semid]; mtx_lock(sema_mtxp); if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 || (semakptr->u.sem_perm.seq != seq)) { mtx_unlock(sema_mtxp); continue; } if (semnum >= semakptr->u.sem_nsems) panic("semexit - semnum out of range"); DPRINTF(( "semexit: %p id=%d num=%d(adj=%d) ; sem=%d\n", suptr->un_proc, suptr->un_ent[ix].un_id, suptr->un_ent[ix].un_num, suptr->un_ent[ix].un_adjval, semakptr->u.sem_base[semnum].semval)); if (adjval < 0 && semakptr->u.sem_base[semnum].semval < -adjval) semakptr->u.sem_base[semnum].semval = 0; else semakptr->u.sem_base[semnum].semval += adjval; wakeup(semakptr); DPRINTF(("semexit: back from wakeup\n")); mtx_unlock(sema_mtxp); } SEMUNDO_LOCK(); } /* * Deallocate the undo vector. */ DPRINTF(("removing vector\n")); suptr->un_proc = NULL; suptr->un_cnt = 0; LIST_INSERT_HEAD(&semu_free_list, suptr, un_next); SEMUNDO_UNLOCK(); } static int sysctl_sema(SYSCTL_HANDLER_ARGS) { return (SYSCTL_OUT(req, sema, sizeof(struct semid_kernel) * seminfo.semmni)); } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) /* XXX casting to (sy_call_t *) is bogus, as usual. */ static sy_call_t *semcalls[] = { (sy_call_t *)freebsd7___semctl, (sy_call_t *)sys_semget, (sy_call_t *)sys_semop }; /* * Entry point for all SEM calls. */ int sys_semsys(td, uap) struct thread *td; /* XXX actually varargs. */ struct semsys_args /* { int which; int a2; int a3; int a4; int a5; } */ *uap; { int error; if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC)) return (ENOSYS); - if (uap->which < 0 || - uap->which >= nitems(semcalls)) + if (uap->which < 0 || uap->which >= nitems(semcalls)) return (EINVAL); error = (*semcalls[uap->which])(td, &uap->a2); return (error); } #ifndef CP #define CP(src, dst, fld) do { (dst).fld = (src).fld; } while (0) #endif #ifndef _SYS_SYSPROTO_H_ struct freebsd7___semctl_args { int semid; int semnum; int cmd; union semun_old *arg; }; #endif int freebsd7___semctl(struct thread *td, struct freebsd7___semctl_args *uap) { struct semid_ds_old dsold; struct semid_ds dsbuf; union semun_old arg; union semun semun; register_t rval; int error; switch (uap->cmd) { case SEM_STAT: case IPC_SET: case IPC_STAT: case GETALL: case SETVAL: case SETALL: error = copyin(uap->arg, &arg, sizeof(arg)); if (error) return (error); break; } switch (uap->cmd) { case SEM_STAT: case IPC_STAT: semun.buf = &dsbuf; break; case IPC_SET: error = copyin(arg.buf, &dsold, sizeof(dsold)); if (error) return (error); ipcperm_old2new(&dsold.sem_perm, &dsbuf.sem_perm); CP(dsold, dsbuf, sem_base); CP(dsold, dsbuf, sem_nsems); CP(dsold, dsbuf, sem_otime); CP(dsold, dsbuf, sem_ctime); semun.buf = &dsbuf; break; case GETALL: case SETALL: semun.array = arg.array; break; case SETVAL: semun.val = arg.val; break; } error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun, &rval); if (error) return (error); switch (uap->cmd) { case SEM_STAT: case IPC_STAT: bzero(&dsold, sizeof(dsold)); ipcperm_new2old(&dsbuf.sem_perm, &dsold.sem_perm); CP(dsbuf, dsold, sem_base); CP(dsbuf, dsold, sem_nsems); CP(dsbuf, dsold, sem_otime); CP(dsbuf, dsold, sem_ctime); error = copyout(&dsold, arg.buf, sizeof(dsold)); break; } if (error == 0) td->td_retval[0] = rval; return (error); } #endif /* COMPAT_FREEBSD{4,5,6,7} */ #ifdef COMPAT_FREEBSD32 int freebsd32_semsys(struct thread *td, struct freebsd32_semsys_args *uap) { #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) switch (uap->which) { case 0: return (freebsd7_freebsd32_semctl(td, (struct freebsd7_freebsd32_semctl_args *)&uap->a2)); default: return (sys_semsys(td, (struct semsys_args *)uap)); } #else return (nosys(td, NULL)); #endif } #if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \ defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7) int freebsd7_freebsd32_semctl(struct thread *td, struct freebsd7_freebsd32_semctl_args *uap) { struct semid_ds32_old dsbuf32; struct semid_ds dsbuf; union semun semun; union semun32 arg; register_t rval; int error; switch (uap->cmd) { case SEM_STAT: case IPC_SET: case IPC_STAT: case GETALL: case SETVAL: case SETALL: error = copyin(uap->arg, &arg, sizeof(arg)); if (error) return (error); break; } switch (uap->cmd) { case SEM_STAT: case IPC_STAT: semun.buf = &dsbuf; break; case IPC_SET: error = copyin(PTRIN(arg.buf), &dsbuf32, sizeof(dsbuf32)); if (error) return (error); freebsd32_ipcperm_old_in(&dsbuf32.sem_perm, &dsbuf.sem_perm); PTRIN_CP(dsbuf32, dsbuf, sem_base); CP(dsbuf32, dsbuf, sem_nsems); CP(dsbuf32, dsbuf, sem_otime); CP(dsbuf32, dsbuf, sem_ctime); semun.buf = &dsbuf; break; case GETALL: case SETALL: semun.array = PTRIN(arg.array); break; case SETVAL: semun.val = arg.val; break; } error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun, &rval); if (error) return (error); switch (uap->cmd) { case SEM_STAT: case IPC_STAT: bzero(&dsbuf32, sizeof(dsbuf32)); freebsd32_ipcperm_old_out(&dsbuf.sem_perm, &dsbuf32.sem_perm); PTROUT_CP(dsbuf, dsbuf32, sem_base); CP(dsbuf, dsbuf32, sem_nsems); CP(dsbuf, dsbuf32, sem_otime); CP(dsbuf, dsbuf32, sem_ctime); error = copyout(&dsbuf32, PTRIN(arg.buf), sizeof(dsbuf32)); break; } if (error == 0) td->td_retval[0] = rval; return (error); } #endif int freebsd32_semctl(struct thread *td, struct freebsd32_semctl_args *uap) { struct semid_ds32 dsbuf32; struct semid_ds dsbuf; union semun semun; union semun32 arg; register_t rval; int error; switch (uap->cmd) { case SEM_STAT: case IPC_SET: case IPC_STAT: case GETALL: case SETVAL: case SETALL: error = copyin(uap->arg, &arg, sizeof(arg)); if (error) return (error); break; } switch (uap->cmd) { case SEM_STAT: case IPC_STAT: semun.buf = &dsbuf; break; case IPC_SET: error = copyin(PTRIN(arg.buf), &dsbuf32, sizeof(dsbuf32)); if (error) return (error); freebsd32_ipcperm_in(&dsbuf32.sem_perm, &dsbuf.sem_perm); PTRIN_CP(dsbuf32, dsbuf, sem_base); CP(dsbuf32, dsbuf, sem_nsems); CP(dsbuf32, dsbuf, sem_otime); CP(dsbuf32, dsbuf, sem_ctime); semun.buf = &dsbuf; break; case GETALL: case SETALL: semun.array = PTRIN(arg.array); break; case SETVAL: semun.val = arg.val; break; } error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun, &rval); if (error) return (error); switch (uap->cmd) { case SEM_STAT: case IPC_STAT: bzero(&dsbuf32, sizeof(dsbuf32)); freebsd32_ipcperm_out(&dsbuf.sem_perm, &dsbuf32.sem_perm); PTROUT_CP(dsbuf, dsbuf32, sem_base); CP(dsbuf, dsbuf32, sem_nsems); CP(dsbuf, dsbuf32, sem_otime); CP(dsbuf, dsbuf32, sem_ctime); error = copyout(&dsbuf32, PTRIN(arg.buf), sizeof(dsbuf32)); break; } if (error == 0) td->td_retval[0] = rval; return (error); } #endif /* COMPAT_FREEBSD32 */ Index: head/sys/netinet/tcp_syncache.c =================================================================== --- head/sys/netinet/tcp_syncache.c (revision 298353) +++ head/sys/netinet/tcp_syncache.c (revision 298354) @@ -1,2158 +1,2157 @@ /*- * Copyright (c) 2001 McAfee, Inc. * Copyright (c) 2006,2013 Andre Oppermann, Internet Business Solutions AG * All rights reserved. * * This software was developed for the FreeBSD Project by Jonathan Lemon * and McAfee Research, the Security Research Division of McAfee, Inc. under * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the * DARPA CHATS research program. [2001 McAfee, Inc.] * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_pcbgroup.h" #include #include #include #include #include #include #include #include #include #include #include #include /* for proc0 declaration */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #endif #include #ifdef TCP_RFC7413 #include #endif #include #include #include #include #include #ifdef INET6 #include #endif #ifdef TCP_OFFLOAD #include #endif #ifdef IPSEC #include #ifdef INET6 #include #endif #include #endif /*IPSEC*/ #include #include static VNET_DEFINE(int, tcp_syncookies) = 1; #define V_tcp_syncookies VNET(tcp_syncookies) SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_syncookies), 0, "Use TCP SYN cookies if the syncache overflows"); static VNET_DEFINE(int, tcp_syncookiesonly) = 0; #define V_tcp_syncookiesonly VNET(tcp_syncookiesonly) SYSCTL_INT(_net_inet_tcp, OID_AUTO, syncookies_only, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_syncookiesonly), 0, "Use only TCP SYN cookies"); #ifdef TCP_OFFLOAD #define ADDED_BY_TOE(sc) ((sc)->sc_tod != NULL) #endif static void syncache_drop(struct syncache *, struct syncache_head *); static void syncache_free(struct syncache *); static void syncache_insert(struct syncache *, struct syncache_head *); static int syncache_respond(struct syncache *, struct syncache_head *, int); static struct socket *syncache_socket(struct syncache *, struct socket *, struct mbuf *m); static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout); static void syncache_timer(void *); static uint32_t syncookie_mac(struct in_conninfo *, tcp_seq, uint8_t, uint8_t *, uintptr_t); static tcp_seq syncookie_generate(struct syncache_head *, struct syncache *); static struct syncache *syncookie_lookup(struct in_conninfo *, struct syncache_head *, struct syncache *, struct tcphdr *, struct tcpopt *, struct socket *); static void syncookie_reseed(void *); #ifdef INVARIANTS static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, struct socket *lso); #endif /* * Transmit the SYN,ACK fewer times than TCP_MAXRXTSHIFT specifies. * 3 retransmits corresponds to a timeout of 3 * (1 + 2 + 4 + 8) == 45 seconds, * the odds are that the user has given up attempting to connect by then. */ #define SYNCACHE_MAXREXMTS 3 /* Arbitrary values */ #define TCP_SYNCACHE_HASHSIZE 512 #define TCP_SYNCACHE_BUCKETLIMIT 30 static VNET_DEFINE(struct tcp_syncache, tcp_syncache); #define V_tcp_syncache VNET(tcp_syncache) static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, syncache, CTLFLAG_RW, 0, "TCP SYN cache"); SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, bucketlimit, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_syncache.bucket_limit), 0, "Per-bucket hash limit for syncache"); SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, cachelimit, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_syncache.cache_limit), 0, "Overall entry limit for syncache"); SYSCTL_UMA_CUR(_net_inet_tcp_syncache, OID_AUTO, count, CTLFLAG_VNET, &VNET_NAME(tcp_syncache.zone), "Current number of entries in syncache"); SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, hashsize, CTLFLAG_VNET | CTLFLAG_RDTUN, &VNET_NAME(tcp_syncache.hashsize), 0, "Size of TCP syncache hashtable"); SYSCTL_UINT(_net_inet_tcp_syncache, OID_AUTO, rexmtlimit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_syncache.rexmt_limit), 0, "Limit on SYN/ACK retransmissions"); VNET_DEFINE(int, tcp_sc_rst_sock_fail) = 1; SYSCTL_INT(_net_inet_tcp_syncache, OID_AUTO, rst_on_sock_fail, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_sc_rst_sock_fail), 0, "Send reset on socket allocation failure"); static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache"); #define SCH_LOCK(sch) mtx_lock(&(sch)->sch_mtx) #define SCH_UNLOCK(sch) mtx_unlock(&(sch)->sch_mtx) #define SCH_LOCK_ASSERT(sch) mtx_assert(&(sch)->sch_mtx, MA_OWNED) /* * Requires the syncache entry to be already removed from the bucket list. */ static void syncache_free(struct syncache *sc) { if (sc->sc_ipopts) (void) m_free(sc->sc_ipopts); if (sc->sc_cred) crfree(sc->sc_cred); #ifdef MAC mac_syncache_destroy(&sc->sc_label); #endif uma_zfree(V_tcp_syncache.zone, sc); } void syncache_init(void) { int i; V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; V_tcp_syncache.bucket_limit = TCP_SYNCACHE_BUCKETLIMIT; V_tcp_syncache.rexmt_limit = SYNCACHE_MAXREXMTS; V_tcp_syncache.hash_secret = arc4random(); TUNABLE_INT_FETCH("net.inet.tcp.syncache.hashsize", &V_tcp_syncache.hashsize); TUNABLE_INT_FETCH("net.inet.tcp.syncache.bucketlimit", &V_tcp_syncache.bucket_limit); if (!powerof2(V_tcp_syncache.hashsize) || V_tcp_syncache.hashsize == 0) { printf("WARNING: syncache hash size is not a power of 2.\n"); V_tcp_syncache.hashsize = TCP_SYNCACHE_HASHSIZE; } V_tcp_syncache.hashmask = V_tcp_syncache.hashsize - 1; /* Set limits. */ V_tcp_syncache.cache_limit = V_tcp_syncache.hashsize * V_tcp_syncache.bucket_limit; TUNABLE_INT_FETCH("net.inet.tcp.syncache.cachelimit", &V_tcp_syncache.cache_limit); /* Allocate the hash table. */ V_tcp_syncache.hashbase = malloc(V_tcp_syncache.hashsize * sizeof(struct syncache_head), M_SYNCACHE, M_WAITOK | M_ZERO); #ifdef VIMAGE V_tcp_syncache.vnet = curvnet; #endif /* Initialize the hash buckets. */ for (i = 0; i < V_tcp_syncache.hashsize; i++) { TAILQ_INIT(&V_tcp_syncache.hashbase[i].sch_bucket); mtx_init(&V_tcp_syncache.hashbase[i].sch_mtx, "tcp_sc_head", NULL, MTX_DEF); callout_init_mtx(&V_tcp_syncache.hashbase[i].sch_timer, &V_tcp_syncache.hashbase[i].sch_mtx, 0); V_tcp_syncache.hashbase[i].sch_length = 0; V_tcp_syncache.hashbase[i].sch_sc = &V_tcp_syncache; } /* Create the syncache entry zone. */ V_tcp_syncache.zone = uma_zcreate("syncache", sizeof(struct syncache), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_tcp_syncache.cache_limit = uma_zone_set_max(V_tcp_syncache.zone, V_tcp_syncache.cache_limit); /* Start the SYN cookie reseeder callout. */ callout_init(&V_tcp_syncache.secret.reseed, 1); arc4rand(V_tcp_syncache.secret.key[0], SYNCOOKIE_SECRET_SIZE, 0); arc4rand(V_tcp_syncache.secret.key[1], SYNCOOKIE_SECRET_SIZE, 0); callout_reset(&V_tcp_syncache.secret.reseed, SYNCOOKIE_LIFETIME * hz, syncookie_reseed, &V_tcp_syncache); } #ifdef VIMAGE void syncache_destroy(void) { struct syncache_head *sch; struct syncache *sc, *nsc; int i; /* * Stop the re-seed timer before freeing resources. No need to * possibly schedule it another time. */ callout_drain(&V_tcp_syncache.secret.reseed); /* Cleanup hash buckets: stop timers, free entries, destroy locks. */ for (i = 0; i < V_tcp_syncache.hashsize; i++) { sch = &V_tcp_syncache.hashbase[i]; callout_drain(&sch->sch_timer); SCH_LOCK(sch); TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) syncache_drop(sc, sch); SCH_UNLOCK(sch); KASSERT(TAILQ_EMPTY(&sch->sch_bucket), ("%s: sch->sch_bucket not empty", __func__)); KASSERT(sch->sch_length == 0, ("%s: sch->sch_length %d not 0", __func__, sch->sch_length)); mtx_destroy(&sch->sch_mtx); } KASSERT(uma_zone_get_cur(V_tcp_syncache.zone) == 0, ("%s: cache_count not 0", __func__)); /* Free the allocated global resources. */ uma_zdestroy(V_tcp_syncache.zone); free(V_tcp_syncache.hashbase, M_SYNCACHE); } #endif /* * Inserts a syncache entry into the specified bucket row. * Locks and unlocks the syncache_head autonomously. */ static void syncache_insert(struct syncache *sc, struct syncache_head *sch) { struct syncache *sc2; SCH_LOCK(sch); /* * Make sure that we don't overflow the per-bucket limit. * If the bucket is full, toss the oldest element. */ if (sch->sch_length >= V_tcp_syncache.bucket_limit) { KASSERT(!TAILQ_EMPTY(&sch->sch_bucket), ("sch->sch_length incorrect")); sc2 = TAILQ_LAST(&sch->sch_bucket, sch_head); syncache_drop(sc2, sch); TCPSTAT_INC(tcps_sc_bucketoverflow); } /* Put it into the bucket. */ TAILQ_INSERT_HEAD(&sch->sch_bucket, sc, sc_hash); sch->sch_length++; #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; tod->tod_syncache_added(tod, sc->sc_todctx); } #endif /* Reinitialize the bucket row's timer. */ if (sch->sch_length == 1) sch->sch_nextc = ticks + INT_MAX; syncache_timeout(sc, sch, 1); SCH_UNLOCK(sch); TCPSTATES_INC(TCPS_SYN_RECEIVED); TCPSTAT_INC(tcps_sc_added); } /* * Remove and free entry from syncache bucket row. * Expects locked syncache head. */ static void syncache_drop(struct syncache *sc, struct syncache_head *sch) { SCH_LOCK_ASSERT(sch); TCPSTATES_DEC(TCPS_SYN_RECEIVED); TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; tod->tod_syncache_removed(tod, sc->sc_todctx); } #endif syncache_free(sc); } /* * Engage/reengage time on bucket row. */ static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, int docallout) { sc->sc_rxttime = ticks + TCPTV_RTOBASE * (tcp_syn_backoff[sc->sc_rxmits]); sc->sc_rxmits++; if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) { sch->sch_nextc = sc->sc_rxttime; if (docallout) callout_reset(&sch->sch_timer, sch->sch_nextc - ticks, syncache_timer, (void *)sch); } } /* * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted. * If we have retransmitted an entry the maximum number of times, expire it. * One separate timer for each bucket row. */ static void syncache_timer(void *xsch) { struct syncache_head *sch = (struct syncache_head *)xsch; struct syncache *sc, *nsc; int tick = ticks; char *s; CURVNET_SET(sch->sch_sc->vnet); /* NB: syncache_head has already been locked by the callout. */ SCH_LOCK_ASSERT(sch); /* * In the following cycle we may remove some entries and/or * advance some timeouts, so re-initialize the bucket timer. */ sch->sch_nextc = tick + INT_MAX; TAILQ_FOREACH_SAFE(sc, &sch->sch_bucket, sc_hash, nsc) { /* * We do not check if the listen socket still exists * and accept the case where the listen socket may be * gone by the time we resend the SYN/ACK. We do * not expect this to happens often. If it does, * then the RST will be sent by the time the remote * host does the SYN/ACK->ACK. */ if (TSTMP_GT(sc->sc_rxttime, tick)) { if (TSTMP_LT(sc->sc_rxttime, sch->sch_nextc)) sch->sch_nextc = sc->sc_rxttime; continue; } if (sc->sc_rxmits > V_tcp_syncache.rexmt_limit) { if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Retransmits exhausted, " "giving up and removing syncache entry\n", s, __func__); free(s, M_TCPLOG); } syncache_drop(sc, sch); TCPSTAT_INC(tcps_sc_stale); continue; } if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Response timeout, " "retransmitting (%u) SYN|ACK\n", s, __func__, sc->sc_rxmits); free(s, M_TCPLOG); } syncache_respond(sc, sch, 1); TCPSTAT_INC(tcps_sc_retransmitted); syncache_timeout(sc, sch, 0); } if (!TAILQ_EMPTY(&(sch)->sch_bucket)) callout_reset(&(sch)->sch_timer, (sch)->sch_nextc - tick, syncache_timer, (void *)(sch)); CURVNET_RESTORE(); } /* * Find an entry in the syncache. * Returns always with locked syncache_head plus a matching entry or NULL. */ static struct syncache * syncache_lookup(struct in_conninfo *inc, struct syncache_head **schp) { struct syncache *sc; struct syncache_head *sch; uint32_t hash; /* * The hash is built on foreign port + local port + foreign address. * We rely on the fact that struct in_conninfo starts with 16 bits * of foreign port, then 16 bits of local port then followed by 128 * bits of foreign address. In case of IPv4 address, the first 3 * 32-bit words of the address always are zeroes. */ hash = jenkins_hash32((uint32_t *)&inc->inc_ie, 5, V_tcp_syncache.hash_secret) & V_tcp_syncache.hashmask; sch = &V_tcp_syncache.hashbase[hash]; *schp = sch; SCH_LOCK(sch); /* Circle through bucket row to find matching entry. */ TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) if (bcmp(&inc->inc_ie, &sc->sc_inc.inc_ie, sizeof(struct in_endpoints)) == 0) break; return (sc); /* Always returns with locked sch. */ } /* * This function is called when we get a RST for a * non-existent connection, so that we can see if the * connection is in the syn cache. If it is, zap it. */ void syncache_chkrst(struct in_conninfo *inc, struct tcphdr *th) { struct syncache *sc; struct syncache_head *sch; char *s = NULL; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); /* * Any RST to our SYN|ACK must not carry ACK, SYN or FIN flags. * See RFC 793 page 65, section SEGMENT ARRIVES. */ if (th->th_flags & (TH_ACK|TH_SYN|TH_FIN)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious RST with ACK, SYN or " "FIN flag set, segment ignored\n", s, __func__); TCPSTAT_INC(tcps_badrst); goto done; } /* * No corresponding connection was found in syncache. * If syncookies are enabled and possibly exclusively * used, or we are under memory pressure, a valid RST * may not find a syncache entry. In that case we're * done and no SYN|ACK retransmissions will happen. * Otherwise the RST was misdirected or spoofed. */ if (sc == NULL) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious RST without matching " "syncache entry (possibly syncookie only), " "segment ignored\n", s, __func__); TCPSTAT_INC(tcps_badrst); goto done; } /* * If the RST bit is set, check the sequence number to see * if this is a valid reset segment. * RFC 793 page 37: * In all states except SYN-SENT, all reset (RST) segments * are validated by checking their SEQ-fields. A reset is * valid if its sequence number is in the window. * * The sequence number in the reset segment is normally an * echo of our outgoing acknowlegement numbers, but some hosts * send a reset with the sequence number at the rightmost edge * of our receive window, and we have to handle this case. */ if (SEQ_GEQ(th->th_seq, sc->sc_irs) && SEQ_LEQ(th->th_seq, sc->sc_irs + sc->sc_wnd)) { syncache_drop(sc, sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Our SYN|ACK was rejected, " "connection attempt aborted by remote endpoint\n", s, __func__); TCPSTAT_INC(tcps_sc_reset); } else { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: RST with invalid SEQ %u != " "IRS %u (+WND %u), segment ignored\n", s, __func__, th->th_seq, sc->sc_irs, sc->sc_wnd); TCPSTAT_INC(tcps_badrst); } done: if (s != NULL) free(s, M_TCPLOG); SCH_UNLOCK(sch); } void syncache_badack(struct in_conninfo *inc) { struct syncache *sc; struct syncache_head *sch; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); if (sc != NULL) { syncache_drop(sc, sch); TCPSTAT_INC(tcps_sc_badack); } SCH_UNLOCK(sch); } void syncache_unreach(struct in_conninfo *inc, struct tcphdr *th) { struct syncache *sc; struct syncache_head *sch; sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); if (sc == NULL) goto done; /* If the sequence number != sc_iss, then it's a bogus ICMP msg */ if (ntohl(th->th_seq) != sc->sc_iss) goto done; /* * If we've rertransmitted 3 times and this is our second error, * we remove the entry. Otherwise, we allow it to continue on. * This prevents us from incorrectly nuking an entry during a * spurious network outage. * * See tcp_notify(). */ if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxmits < 3 + 1) { sc->sc_flags |= SCF_UNREACH; goto done; } syncache_drop(sc, sch); TCPSTAT_INC(tcps_sc_unreach); done: SCH_UNLOCK(sch); } /* * Build a new TCP socket structure from a syncache entry. * * On success return the newly created socket with its underlying inp locked. */ static struct socket * syncache_socket(struct syncache *sc, struct socket *lso, struct mbuf *m) { struct tcp_function_block *blk; struct inpcb *inp = NULL; struct socket *so; struct tcpcb *tp; int error; char *s; INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* * Ok, create the full blown connection, and set things up * as they would have been set up if we had created the * connection when the SYN arrived. If we can't create * the connection, abort it. */ so = sonewconn(lso, 0); if (so == NULL) { /* * Drop the connection; we will either send a RST or * have the peer retransmit its SYN again after its * RTO and try again. */ TCPSTAT_INC(tcps_listendrop); if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Socket create failed " "due to limits or memory shortage\n", s, __func__); free(s, M_TCPLOG); } goto abort2; } #ifdef MAC mac_socketpeer_set_from_mbuf(m, so); #endif inp = sotoinpcb(so); inp->inp_inc.inc_fibnum = so->so_fibnum; INP_WLOCK(inp); /* * Exclusive pcbinfo lock is not required in syncache socket case even * if two inpcb locks can be acquired simultaneously: * - the inpcb in LISTEN state, * - the newly created inp. * * In this case, an inp cannot be at same time in LISTEN state and * just created by an accept() call. */ INP_HASH_WLOCK(&V_tcbinfo); /* Insert new socket into PCB hash list. */ inp->inp_inc.inc_flags = sc->sc_inc.inc_flags; #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) { inp->in6p_laddr = sc->sc_inc.inc6_laddr; } else { inp->inp_vflag &= ~INP_IPV6; inp->inp_vflag |= INP_IPV4; #endif inp->inp_laddr = sc->sc_inc.inc_laddr; #ifdef INET6 } #endif /* * If there's an mbuf and it has a flowid, then let's initialise the * inp with that particular flowid. */ if (m != NULL && M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { inp->inp_flowid = m->m_pkthdr.flowid; inp->inp_flowtype = M_HASHTYPE_GET(m); } /* * Install in the reservation hash table for now, but don't yet * install a connection group since the full 4-tuple isn't yet * configured. */ inp->inp_lport = sc->sc_inc.inc_lport; if ((error = in_pcbinshash_nopcbgroup(inp)) != 0) { /* * Undo the assignments above if we failed to * put the PCB on the hash lists. */ #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) inp->in6p_laddr = in6addr_any; else #endif inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_lport = 0; if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: in_pcbinshash failed " "with error %i\n", s, __func__, error); free(s, M_TCPLOG); } INP_HASH_WUNLOCK(&V_tcbinfo); goto abort; } #ifdef IPSEC /* Copy old policy into new socket's. */ if (ipsec_copy_policy(sotoinpcb(lso)->inp_sp, inp->inp_sp)) printf("syncache_socket: could not copy policy\n"); #endif #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) { struct inpcb *oinp = sotoinpcb(lso); struct in6_addr laddr6; struct sockaddr_in6 sin6; /* * Inherit socket options from the listening socket. * Note that in6p_inputopts are not (and should not be) * copied, since it stores previously received options and is * used to detect if each new option is different than the * previous one and hence should be passed to a user. * If we copied in6p_inputopts, a user would not be able to * receive options just after calling the accept system call. */ inp->inp_flags |= oinp->inp_flags & INP_CONTROLOPTS; if (oinp->in6p_outputopts) inp->in6p_outputopts = ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT); sin6.sin6_family = AF_INET6; sin6.sin6_len = sizeof(sin6); sin6.sin6_addr = sc->sc_inc.inc6_faddr; sin6.sin6_port = sc->sc_inc.inc_fport; sin6.sin6_flowinfo = sin6.sin6_scope_id = 0; laddr6 = inp->in6p_laddr; if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) inp->in6p_laddr = sc->sc_inc.inc6_laddr; if ((error = in6_pcbconnect_mbuf(inp, (struct sockaddr *)&sin6, thread0.td_ucred, m)) != 0) { inp->in6p_laddr = laddr6; if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: in6_pcbconnect failed " "with error %i\n", s, __func__, error); free(s, M_TCPLOG); } INP_HASH_WUNLOCK(&V_tcbinfo); goto abort; } /* Override flowlabel from in6_pcbconnect. */ inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; inp->inp_flow |= sc->sc_flowlabel; } #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET { struct in_addr laddr; struct sockaddr_in sin; inp->inp_options = (m) ? ip_srcroute(m) : NULL; if (inp->inp_options == NULL) { inp->inp_options = sc->sc_ipopts; sc->sc_ipopts = NULL; } sin.sin_family = AF_INET; sin.sin_len = sizeof(sin); sin.sin_addr = sc->sc_inc.inc_faddr; sin.sin_port = sc->sc_inc.inc_fport; bzero((caddr_t)sin.sin_zero, sizeof(sin.sin_zero)); laddr = inp->inp_laddr; if (inp->inp_laddr.s_addr == INADDR_ANY) inp->inp_laddr = sc->sc_inc.inc_laddr; if ((error = in_pcbconnect_mbuf(inp, (struct sockaddr *)&sin, thread0.td_ucred, m)) != 0) { inp->inp_laddr = laddr; if ((s = tcp_log_addrs(&sc->sc_inc, NULL, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: in_pcbconnect failed " "with error %i\n", s, __func__, error); free(s, M_TCPLOG); } INP_HASH_WUNLOCK(&V_tcbinfo); goto abort; } } #endif /* INET */ INP_HASH_WUNLOCK(&V_tcbinfo); tp = intotcpcb(inp); tcp_state_change(tp, TCPS_SYN_RECEIVED); tp->iss = sc->sc_iss; tp->irs = sc->sc_irs; tcp_rcvseqinit(tp); tcp_sendseqinit(tp); blk = sototcpcb(lso)->t_fb; if (blk != tp->t_fb) { /* * Our parents t_fb was not the default, * we need to release our ref on tp->t_fb and * pickup one on the new entry. */ struct tcp_function_block *rblk; rblk = find_and_ref_tcp_fb(blk); KASSERT(rblk != NULL, ("cannot find blk %p out of syncache?", blk)); if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp); refcount_release(&tp->t_fb->tfb_refcnt); tp->t_fb = rblk; if (tp->t_fb->tfb_tcp_fb_init) { (*tp->t_fb->tfb_tcp_fb_init)(tp); } } tp->snd_wl1 = sc->sc_irs; tp->snd_max = tp->iss + 1; tp->snd_nxt = tp->iss + 1; tp->rcv_up = sc->sc_irs + 1; tp->rcv_wnd = sc->sc_wnd; tp->rcv_adv += tp->rcv_wnd; tp->last_ack_sent = tp->rcv_nxt; tp->t_flags = sototcpcb(lso)->t_flags & (TF_NOPUSH|TF_NODELAY); if (sc->sc_flags & SCF_NOOPT) tp->t_flags |= TF_NOOPT; else { if (sc->sc_flags & SCF_WINSCALE) { tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE; tp->snd_scale = sc->sc_requested_s_scale; tp->request_r_scale = sc->sc_requested_r_scale; } if (sc->sc_flags & SCF_TIMESTAMP) { tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; tp->ts_recent = sc->sc_tsreflect; tp->ts_recent_age = tcp_ts_getticks(); tp->ts_offset = sc->sc_tsoff; } #ifdef TCP_SIGNATURE if (sc->sc_flags & SCF_SIGNATURE) tp->t_flags |= TF_SIGNATURE; #endif if (sc->sc_flags & SCF_SACK) tp->t_flags |= TF_SACK_PERMIT; } if (sc->sc_flags & SCF_ECN) tp->t_flags |= TF_ECN_PERMIT; /* * Set up MSS and get cached values from tcp_hostcache. * This might overwrite some of the defaults we just set. */ tcp_mss(tp, sc->sc_peer_mss); /* * If the SYN,ACK was retransmitted, indicate that CWND to be * limited to one segment in cc_conn_init(). * NB: sc_rxmits counts all SYN,ACK transmits, not just retransmits. */ if (sc->sc_rxmits > 1) tp->snd_cwnd = 1; #ifdef TCP_OFFLOAD /* * Allow a TOE driver to install its hooks. Note that we hold the * pcbinfo lock too and that prevents tcp_usr_accept from accepting a * new connection before the TOE driver has done its thing. */ if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; tod->tod_offload_socket(tod, sc->sc_todctx, so); } #endif /* * Copy and activate timers. */ tp->t_keepinit = sototcpcb(lso)->t_keepinit; tp->t_keepidle = sototcpcb(lso)->t_keepidle; tp->t_keepintvl = sototcpcb(lso)->t_keepintvl; tp->t_keepcnt = sototcpcb(lso)->t_keepcnt; tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp)); soisconnected(so); TCPSTAT_INC(tcps_accepts); return (so); abort: INP_WUNLOCK(inp); abort2: if (so != NULL) soabort(so); return (NULL); } /* * This function gets called when we receive an ACK for a * socket in the LISTEN state. We look up the connection * in the syncache, and if its there, we pull it out of * the cache and turn it into a full-blown connection in * the SYN-RECEIVED state. * * On syncache_socket() success the newly created socket * has its underlying inp locked. */ int syncache_expand(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct socket **lsop, struct mbuf *m) { struct syncache *sc; struct syncache_head *sch; struct syncache scs; char *s; /* * Global TCP locks are held because we manipulate the PCB lists * and create a new socket. */ INP_INFO_RLOCK_ASSERT(&V_tcbinfo); KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK, ("%s: can handle only ACK", __func__)); sc = syncache_lookup(inc, &sch); /* returns locked sch */ SCH_LOCK_ASSERT(sch); #ifdef INVARIANTS /* * Test code for syncookies comparing the syncache stored * values with the reconstructed values from the cookie. */ if (sc != NULL) syncookie_cmp(inc, sch, sc, th, to, *lsop); #endif if (sc == NULL) { /* * There is no syncache entry, so see if this ACK is * a returning syncookie. To do this, first: * A. See if this socket has had a syncache entry dropped in * the past. We don't want to accept a bogus syncookie * if we've never received a SYN. * B. check that the syncookie is valid. If it is, then * cobble up a fake syncache entry, and return. */ if (!V_tcp_syncookies) { SCH_UNLOCK(sch); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Spurious ACK, " "segment rejected (syncookies disabled)\n", s, __func__); goto failed; } bzero(&scs, sizeof(scs)); sc = syncookie_lookup(inc, sch, &scs, th, to, *lsop); SCH_UNLOCK(sch); if (sc == NULL) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Segment failed " "SYNCOOKIE authentication, segment rejected " "(probably spoofed)\n", s, __func__); goto failed; } } else { /* * Pull out the entry to unlock the bucket row. * * NOTE: We must decrease TCPS_SYN_RECEIVED count here, not * tcp_state_change(). The tcpcb is not existent at this * moment. A new one will be allocated via syncache_socket-> * sonewconn->tcp_usr_attach in TCPS_CLOSED state, then * syncache_socket() will change it to TCPS_SYN_RECEIVED. */ TCPSTATES_DEC(TCPS_SYN_RECEIVED); TAILQ_REMOVE(&sch->sch_bucket, sc, sc_hash); sch->sch_length--; #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; tod->tod_syncache_removed(tod, sc->sc_todctx); } #endif SCH_UNLOCK(sch); } /* * Segment validation: * ACK must match our initial sequence number + 1 (the SYN|ACK). */ if (th->th_ack != sc->sc_iss + 1) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: ACK %u != ISS+1 %u, segment " "rejected\n", s, __func__, th->th_ack, sc->sc_iss); goto failed; } /* * The SEQ must fall in the window starting at the received * initial receive sequence number + 1 (the SYN). */ if (SEQ_LEQ(th->th_seq, sc->sc_irs) || SEQ_GT(th->th_seq, sc->sc_irs + sc->sc_wnd)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: SEQ %u != IRS+1 %u, segment " "rejected\n", s, __func__, th->th_seq, sc->sc_irs); goto failed; } /* * If timestamps were not negotiated during SYN/ACK they * must not appear on any segment during this session. */ if (!(sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: Timestamp not expected, " "segment rejected\n", s, __func__); goto failed; } /* * If timestamps were negotiated during SYN/ACK they should * appear on every segment during this session. * XXXAO: This is only informal as there have been unverified * reports of non-compliants stacks. */ if ((sc->sc_flags & SCF_TIMESTAMP) && !(to->to_flags & TOF_TS)) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Timestamp missing, " "no action\n", s, __func__); free(s, M_TCPLOG); s = NULL; } } /* * If timestamps were negotiated the reflected timestamp * must be equal to what we actually sent in the SYN|ACK. */ if ((to->to_flags & TOF_TS) && to->to_tsecr != sc->sc_ts) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) log(LOG_DEBUG, "%s; %s: TSECR %u != TS %u, " "segment rejected\n", s, __func__, to->to_tsecr, sc->sc_ts); goto failed; } *lsop = syncache_socket(sc, *lsop, m); if (*lsop == NULL) TCPSTAT_INC(tcps_sc_aborted); else TCPSTAT_INC(tcps_sc_completed); /* how do we find the inp for the new socket? */ if (sc != &scs) syncache_free(sc); return (1); failed: if (sc != NULL && sc != &scs) syncache_free(sc); if (s != NULL) free(s, M_TCPLOG); *lsop = NULL; return (0); } #ifdef TCP_RFC7413 static void syncache_tfo_expand(struct syncache *sc, struct socket **lsop, struct mbuf *m, uint64_t response_cookie) { struct inpcb *inp; struct tcpcb *tp; unsigned int *pending_counter; /* * Global TCP locks are held because we manipulate the PCB lists * and create a new socket. */ INP_INFO_RLOCK_ASSERT(&V_tcbinfo); pending_counter = intotcpcb(sotoinpcb(*lsop))->t_tfo_pending; *lsop = syncache_socket(sc, *lsop, m); if (*lsop == NULL) { TCPSTAT_INC(tcps_sc_aborted); atomic_subtract_int(pending_counter, 1); } else { inp = sotoinpcb(*lsop); tp = intotcpcb(inp); tp->t_flags |= TF_FASTOPEN; tp->t_tfo_cookie = response_cookie; tp->snd_max = tp->iss; tp->snd_nxt = tp->iss; tp->t_tfo_pending = pending_counter; TCPSTAT_INC(tcps_sc_completed); } } #endif /* TCP_RFC7413 */ /* * Given a LISTEN socket and an inbound SYN request, add * this to the syn cache, and send back a segment: * * to the source. * * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN. * Doing so would require that we hold onto the data and deliver it * to the application. However, if we are the target of a SYN-flood * DoS attack, an attacker could send data which would eventually * consume all available buffer space if it were ACKed. By not ACKing * the data, we avoid this DoS scenario. * * The exception to the above is when a SYN with a valid TCP Fast Open (TFO) * cookie is processed, V_tcp_fastopen_enabled set to true, and the * TCP_FASTOPEN socket option is set. In this case, a new socket is created * and returned via lsop, the mbuf is not freed so that tcp_input() can * queue its data to the socket, and 1 is returned to indicate the * TFO-socket-creation path was taken. */ int syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct inpcb *inp, struct socket **lsop, struct mbuf *m, void *tod, void *todctx) { struct tcpcb *tp; struct socket *so; struct syncache *sc = NULL; struct syncache_head *sch; struct mbuf *ipopts = NULL; u_int ltflags; int win, sb_hiwat, ip_ttl, ip_tos; char *s; int rv = 0; #ifdef INET6 int autoflowlabel = 0; #endif #ifdef MAC struct label *maclabel; #endif struct syncache scs; struct ucred *cred; #ifdef TCP_RFC7413 uint64_t tfo_response_cookie; int tfo_cookie_valid = 0; int tfo_response_cookie_valid = 0; #endif INP_WLOCK_ASSERT(inp); /* listen socket */ KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN, ("%s: unexpected tcp flags", __func__)); /* * Combine all so/tp operations very early to drop the INP lock as * soon as possible. */ so = *lsop; tp = sototcpcb(so); cred = crhold(so->so_cred); #ifdef INET6 if ((inc->inc_flags & INC_ISIPV6) && (inp->inp_flags & IN6P_AUTOFLOWLABEL)) autoflowlabel = 1; #endif ip_ttl = inp->inp_ip_ttl; ip_tos = inp->inp_ip_tos; win = sbspace(&so->so_rcv); sb_hiwat = so->so_rcv.sb_hiwat; ltflags = (tp->t_flags & (TF_NOOPT | TF_SIGNATURE)); #ifdef TCP_RFC7413 if (V_tcp_fastopen_enabled && (tp->t_flags & TF_FASTOPEN) && (tp->t_tfo_pending != NULL) && (to->to_flags & TOF_FASTOPEN)) { /* * Limit the number of pending TFO connections to * approximately half of the queue limit. This prevents TFO * SYN floods from starving the service by filling the * listen queue with bogus TFO connections. */ if (atomic_fetchadd_int(tp->t_tfo_pending, 1) <= (so->so_qlimit / 2)) { int result; result = tcp_fastopen_check_cookie(inc, to->to_tfo_cookie, to->to_tfo_len, &tfo_response_cookie); tfo_cookie_valid = (result > 0); tfo_response_cookie_valid = (result >= 0); } else atomic_subtract_int(tp->t_tfo_pending, 1); } #endif /* By the time we drop the lock these should no longer be used. */ so = NULL; tp = NULL; #ifdef MAC if (mac_syncache_init(&maclabel) != 0) { INP_WUNLOCK(inp); goto done; } else mac_syncache_create(maclabel, inp); #endif #ifdef TCP_RFC7413 if (!tfo_cookie_valid) #endif INP_WUNLOCK(inp); /* * Remember the IP options, if any. */ #ifdef INET6 if (!(inc->inc_flags & INC_ISIPV6)) #endif #ifdef INET ipopts = (m) ? ip_srcroute(m) : NULL; #else ipopts = NULL; #endif /* * See if we already have an entry for this connection. * If we do, resend the SYN,ACK, and reset the retransmit timer. * * XXX: should the syncache be re-initialized with the contents * of the new SYN here (which may have different options?) * * XXX: We do not check the sequence number to see if this is a * real retransmit or a new connection attempt. The question is * how to handle such a case; either ignore it as spoofed, or * drop the current entry and create a new one? */ sc = syncache_lookup(inc, &sch); /* returns locked entry */ SCH_LOCK_ASSERT(sch); if (sc != NULL) { #ifdef TCP_RFC7413 if (tfo_cookie_valid) INP_WUNLOCK(inp); #endif TCPSTAT_INC(tcps_sc_dupsyn); if (ipopts) { /* * If we were remembering a previous source route, * forget it and use the new one we've been given. */ if (sc->sc_ipopts) (void) m_free(sc->sc_ipopts); sc->sc_ipopts = ipopts; } /* * Update timestamp if present. */ if ((sc->sc_flags & SCF_TIMESTAMP) && (to->to_flags & TOF_TS)) sc->sc_tsreflect = to->to_tsval; else sc->sc_flags &= ~SCF_TIMESTAMP; #ifdef MAC /* * Since we have already unconditionally allocated label * storage, free it up. The syncache entry will already * have an initialized label we can use. */ mac_syncache_destroy(&maclabel); #endif /* Retransmit SYN|ACK and reset retransmit count. */ if ((s = tcp_log_addrs(&sc->sc_inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: Received duplicate SYN, " "resetting timer and retransmitting SYN|ACK\n", s, __func__); free(s, M_TCPLOG); } if (syncache_respond(sc, sch, 1) == 0) { sc->sc_rxmits = 0; syncache_timeout(sc, sch, 1); TCPSTAT_INC(tcps_sndacks); TCPSTAT_INC(tcps_sndtotal); } SCH_UNLOCK(sch); goto done; } #ifdef TCP_RFC7413 if (tfo_cookie_valid) { bzero(&scs, sizeof(scs)); sc = &scs; goto skip_alloc; } #endif sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); if (sc == NULL) { /* * The zone allocator couldn't provide more entries. * Treat this as if the cache was full; drop the oldest * entry and insert the new one. */ TCPSTAT_INC(tcps_sc_zonefail); if ((sc = TAILQ_LAST(&sch->sch_bucket, sch_head)) != NULL) syncache_drop(sc, sch); sc = uma_zalloc(V_tcp_syncache.zone, M_NOWAIT | M_ZERO); if (sc == NULL) { if (V_tcp_syncookies) { bzero(&scs, sizeof(scs)); sc = &scs; } else { SCH_UNLOCK(sch); if (ipopts) (void) m_free(ipopts); goto done; } } } #ifdef TCP_RFC7413 skip_alloc: if (!tfo_cookie_valid && tfo_response_cookie_valid) sc->sc_tfo_cookie = &tfo_response_cookie; #endif /* * Fill in the syncache values. */ #ifdef MAC sc->sc_label = maclabel; #endif sc->sc_cred = cred; cred = NULL; sc->sc_ipopts = ipopts; bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); #ifdef INET6 if (!(inc->inc_flags & INC_ISIPV6)) #endif { sc->sc_ip_tos = ip_tos; sc->sc_ip_ttl = ip_ttl; } #ifdef TCP_OFFLOAD sc->sc_tod = tod; sc->sc_todctx = todctx; #endif sc->sc_irs = th->th_seq; sc->sc_iss = arc4random(); sc->sc_flags = 0; sc->sc_flowlabel = 0; /* * Initial receive window: clip sbspace to [0 .. TCP_MAXWIN]. * win was derived from socket earlier in the function. */ win = imax(win, 0); win = imin(win, TCP_MAXWIN); sc->sc_wnd = win; if (V_tcp_do_rfc1323) { /* * A timestamp received in a SYN makes * it ok to send timestamp requests and replies. */ if (to->to_flags & TOF_TS) { sc->sc_tsreflect = to->to_tsval; sc->sc_ts = tcp_ts_getticks(); sc->sc_flags |= SCF_TIMESTAMP; } if (to->to_flags & TOF_SCALE) { int wscale = 0; /* * Pick the smallest possible scaling factor that * will still allow us to scale up to sb_max, aka * kern.ipc.maxsockbuf. * * We do this because there are broken firewalls that * will corrupt the window scale option, leading to * the other endpoint believing that our advertised * window is unscaled. At scale factors larger than * 5 the unscaled window will drop below 1500 bytes, * leading to serious problems when traversing these * broken firewalls. * * With the default maxsockbuf of 256K, a scale factor * of 3 will be chosen by this algorithm. Those who * choose a larger maxsockbuf should watch out * for the compatiblity problems mentioned above. * * RFC1323: The Window field in a SYN (i.e., a * or ) segment itself is never scaled. */ while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < sb_max) wscale++; sc->sc_requested_r_scale = wscale; sc->sc_requested_s_scale = to->to_wscale; sc->sc_flags |= SCF_WINSCALE; } } #ifdef TCP_SIGNATURE /* * If listening socket requested TCP digests, OR received SYN * contains the option, flag this in the syncache so that * syncache_respond() will do the right thing with the SYN+ACK. */ if (to->to_flags & TOF_SIGNATURE || ltflags & TF_SIGNATURE) sc->sc_flags |= SCF_SIGNATURE; #endif if (to->to_flags & TOF_SACKPERM) sc->sc_flags |= SCF_SACK; if (to->to_flags & TOF_MSS) sc->sc_peer_mss = to->to_mss; /* peer mss may be zero */ if (ltflags & TF_NOOPT) sc->sc_flags |= SCF_NOOPT; if ((th->th_flags & (TH_ECE|TH_CWR)) && V_tcp_do_ecn) sc->sc_flags |= SCF_ECN; if (V_tcp_syncookies) sc->sc_iss = syncookie_generate(sch, sc); #ifdef INET6 if (autoflowlabel) { if (V_tcp_syncookies) sc->sc_flowlabel = sc->sc_iss; else sc->sc_flowlabel = ip6_randomflowlabel(); sc->sc_flowlabel = htonl(sc->sc_flowlabel) & IPV6_FLOWLABEL_MASK; } #endif SCH_UNLOCK(sch); #ifdef TCP_RFC7413 if (tfo_cookie_valid) { syncache_tfo_expand(sc, lsop, m, tfo_response_cookie); /* INP_WUNLOCK(inp) will be performed by the called */ rv = 1; goto tfo_done; } #endif /* * Do a standard 3-way handshake. */ if (syncache_respond(sc, sch, 0) == 0) { if (V_tcp_syncookies && V_tcp_syncookiesonly && sc != &scs) syncache_free(sc); else if (sc != &scs) syncache_insert(sc, sch); /* locks and unlocks sch */ TCPSTAT_INC(tcps_sndacks); TCPSTAT_INC(tcps_sndtotal); } else { if (sc != &scs) syncache_free(sc); TCPSTAT_INC(tcps_sc_dropped); } done: if (m) { *lsop = NULL; m_freem(m); } #ifdef TCP_RFC7413 tfo_done: #endif if (cred != NULL) crfree(cred); #ifdef MAC if (sc == &scs) mac_syncache_destroy(&maclabel); #endif return (rv); } static int syncache_respond(struct syncache *sc, struct syncache_head *sch, int locked) { struct ip *ip = NULL; struct mbuf *m; struct tcphdr *th = NULL; int optlen, error = 0; /* Make compiler happy */ u_int16_t hlen, tlen, mssopt; struct tcpopt to; #ifdef INET6 struct ip6_hdr *ip6 = NULL; #endif #ifdef TCP_SIGNATURE struct secasvar *sav; #endif hlen = #ifdef INET6 (sc->sc_inc.inc_flags & INC_ISIPV6) ? sizeof(struct ip6_hdr) : #endif sizeof(struct ip); tlen = hlen + sizeof(struct tcphdr); /* Determine MSS we advertize to other end of connection. */ mssopt = tcp_mssopt(&sc->sc_inc); if (sc->sc_peer_mss) mssopt = max( min(sc->sc_peer_mss, mssopt), V_tcp_minmss); /* XXX: Assume that the entire packet will fit in a header mbuf. */ KASSERT(max_linkhdr + tlen + TCP_MAXOLEN <= MHLEN, ("syncache: mbuf too small")); /* Create the IP+TCP header from scratch. */ m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return (ENOBUFS); #ifdef MAC mac_syncache_create_mbuf(sc->sc_label, m); #endif m->m_data += max_linkhdr; m->m_len = tlen; m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) { ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_vfc = IPV6_VERSION; ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_src = sc->sc_inc.inc6_laddr; ip6->ip6_dst = sc->sc_inc.inc6_faddr; ip6->ip6_plen = htons(tlen - hlen); /* ip6_hlim is set after checksum */ ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK; ip6->ip6_flow |= sc->sc_flowlabel; th = (struct tcphdr *)(ip6 + 1); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(struct ip) >> 2; ip->ip_len = htons(tlen); ip->ip_id = 0; ip->ip_off = 0; ip->ip_sum = 0; ip->ip_p = IPPROTO_TCP; ip->ip_src = sc->sc_inc.inc_laddr; ip->ip_dst = sc->sc_inc.inc_faddr; ip->ip_ttl = sc->sc_ip_ttl; ip->ip_tos = sc->sc_ip_tos; /* * See if we should do MTU discovery. Route lookups are * expensive, so we will only unset the DF bit if: * * 1) path_mtu_discovery is disabled * 2) the SCF_UNREACH flag has been set */ if (V_path_mtu_discovery && ((sc->sc_flags & SCF_UNREACH) == 0)) ip->ip_off |= htons(IP_DF); th = (struct tcphdr *)(ip + 1); } #endif /* INET */ th->th_sport = sc->sc_inc.inc_lport; th->th_dport = sc->sc_inc.inc_fport; th->th_seq = htonl(sc->sc_iss); th->th_ack = htonl(sc->sc_irs + 1); th->th_off = sizeof(struct tcphdr) >> 2; th->th_x2 = 0; th->th_flags = TH_SYN|TH_ACK; th->th_win = htons(sc->sc_wnd); th->th_urp = 0; if (sc->sc_flags & SCF_ECN) { th->th_flags |= TH_ECE; TCPSTAT_INC(tcps_ecn_shs); } /* Tack on the TCP options. */ if ((sc->sc_flags & SCF_NOOPT) == 0) { to.to_flags = 0; to.to_mss = mssopt; to.to_flags = TOF_MSS; if (sc->sc_flags & SCF_WINSCALE) { to.to_wscale = sc->sc_requested_r_scale; to.to_flags |= TOF_SCALE; } if (sc->sc_flags & SCF_TIMESTAMP) { /* Virgin timestamp or TCP cookie enhanced one. */ to.to_tsval = sc->sc_ts; to.to_tsecr = sc->sc_tsreflect; to.to_flags |= TOF_TS; } if (sc->sc_flags & SCF_SACK) to.to_flags |= TOF_SACKPERM; #ifdef TCP_SIGNATURE sav = NULL; if (sc->sc_flags & SCF_SIGNATURE) { sav = tcp_get_sav(m, IPSEC_DIR_OUTBOUND); if (sav != NULL) to.to_flags |= TOF_SIGNATURE; else { /* * We've got SCF_SIGNATURE flag * inherited from listening socket, * but no SADB key for given source * address. Assume signature is not * required and remove signature flag * instead of silently dropping * connection. */ if (locked == 0) SCH_LOCK(sch); sc->sc_flags &= ~SCF_SIGNATURE; if (locked == 0) SCH_UNLOCK(sch); } } #endif #ifdef TCP_RFC7413 if (sc->sc_tfo_cookie) { to.to_flags |= TOF_FASTOPEN; to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; to.to_tfo_cookie = sc->sc_tfo_cookie; /* don't send cookie again when retransmitting response */ sc->sc_tfo_cookie = NULL; } #endif optlen = tcp_addoptions(&to, (u_char *)(th + 1)); /* Adjust headers by option size. */ th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; m->m_len += optlen; m->m_pkthdr.len += optlen; #ifdef TCP_SIGNATURE if (sc->sc_flags & SCF_SIGNATURE) tcp_signature_do_compute(m, 0, optlen, to.to_signature, sav); #endif #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) + optlen); else #endif ip->ip_len = htons(ntohs(ip->ip_len) + optlen); } else optlen = 0; M_SETFIB(m, sc->sc_inc.inc_fibnum); m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); #ifdef INET6 if (sc->sc_inc.inc_flags & INC_ISIPV6) { m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; th->th_sum = in6_cksum_pseudo(ip6, tlen + optlen - hlen, IPPROTO_TCP, 0); ip6->ip6_hlim = in6_selecthlim(NULL, NULL); #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; error = tod->tod_syncache_respond(tod, sc->sc_todctx, m); return (error); } #endif error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET { m->m_pkthdr.csum_flags = CSUM_TCP; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(tlen + optlen - hlen + IPPROTO_TCP)); #ifdef TCP_OFFLOAD if (ADDED_BY_TOE(sc)) { struct toedev *tod = sc->sc_tod; error = tod->tod_syncache_respond(tod, sc->sc_todctx, m); return (error); } #endif error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL); } #endif return (error); } /* * The purpose of syncookies is to handle spoofed SYN flooding DoS attacks * that exceed the capacity of the syncache by avoiding the storage of any * of the SYNs we receive. Syncookies defend against blind SYN flooding * attacks where the attacker does not have access to our responses. * * Syncookies encode and include all necessary information about the * connection setup within the SYN|ACK that we send back. That way we * can avoid keeping any local state until the ACK to our SYN|ACK returns * (if ever). Normally the syncache and syncookies are running in parallel * with the latter taking over when the former is exhausted. When matching * syncache entry is found the syncookie is ignored. * * The only reliable information persisting the 3WHS is our inital sequence * number ISS of 32 bits. Syncookies embed a cryptographically sufficient * strong hash (MAC) value and a few bits of TCP SYN options in the ISS * of our SYN|ACK. The MAC can be recomputed when the ACK to our SYN|ACK * returns and signifies a legitimate connection if it matches the ACK. * * The available space of 32 bits to store the hash and to encode the SYN * option information is very tight and we should have at least 24 bits for * the MAC to keep the number of guesses by blind spoofing reasonably high. * * SYN option information we have to encode to fully restore a connection: * MSS: is imporant to chose an optimal segment size to avoid IP level * fragmentation along the path. The common MSS values can be encoded * in a 3-bit table. Uncommon values are captured by the next lower value * in the table leading to a slight increase in packetization overhead. * WSCALE: is necessary to allow large windows to be used for high delay- * bandwidth product links. Not scaling the window when it was initially * negotiated is bad for performance as lack of scaling further decreases * the apparent available send window. We only need to encode the WSCALE * we received from the remote end. Our end can be recalculated at any * time. The common WSCALE values can be encoded in a 3-bit table. * Uncommon values are captured by the next lower value in the table * making us under-estimate the available window size halving our * theoretically possible maximum throughput for that connection. * SACK: Greatly assists in packet loss recovery and requires 1 bit. * TIMESTAMP and SIGNATURE is not encoded because they are permanent options * that are included in all segments on a connection. We enable them when * the ACK has them. * * Security of syncookies and attack vectors: * * The MAC is computed over (faddr||laddr||fport||lport||irs||flags||secmod) * together with the gloabl secret to make it unique per connection attempt. * Thus any change of any of those parameters results in a different MAC output * in an unpredictable way unless a collision is encountered. 24 bits of the * MAC are embedded into the ISS. * * To prevent replay attacks two rotating global secrets are updated with a * new random value every 15 seconds. The life-time of a syncookie is thus * 15-30 seconds. * * Vector 1: Attacking the secret. This requires finding a weakness in the * MAC itself or the way it is used here. The attacker can do a chosen plain * text attack by varying and testing the all parameters under his control. * The strength depends on the size and randomness of the secret, and the * cryptographic security of the MAC function. Due to the constant updating * of the secret the attacker has at most 29.999 seconds to find the secret * and launch spoofed connections. After that he has to start all over again. * * Vector 2: Collision attack on the MAC of a single ACK. With a 24 bit MAC * size an average of 4,823 attempts are required for a 50% chance of success * to spoof a single syncookie (birthday collision paradox). However the * attacker is blind and doesn't know if one of his attempts succeeded unless * he has a side channel to interfere success from. A single connection setup * success average of 90% requires 8,790 packets, 99.99% requires 17,578 packets. * This many attempts are required for each one blind spoofed connection. For * every additional spoofed connection he has to launch another N attempts. * Thus for a sustained rate 100 spoofed connections per second approximately * 1,800,000 packets per second would have to be sent. * * NB: The MAC function should be fast so that it doesn't become a CPU * exhaustion attack vector itself. * * References: * RFC4987 TCP SYN Flooding Attacks and Common Mitigations * SYN cookies were first proposed by cryptographer Dan J. Bernstein in 1996 * http://cr.yp.to/syncookies.html (overview) * http://cr.yp.to/syncookies/archive (details) * * * Schematic construction of a syncookie enabled Initial Sequence Number: * 0 1 2 3 * 12345678901234567890123456789012 * |xxxxxxxxxxxxxxxxxxxxxxxxWWWMMMSP| * * x 24 MAC (truncated) * W 3 Send Window Scale index * M 3 MSS index * S 1 SACK permitted * P 1 Odd/even secret */ /* * Distribution and probability of certain MSS values. Those in between are * rounded down to the next lower one. * [An Analysis of TCP Maximum Segment Sizes, S. Alcock and R. Nelson, 2011] * .2% .3% 5% 7% 7% 20% 15% 45% */ static int tcp_sc_msstab[] = { 216, 536, 1200, 1360, 1400, 1440, 1452, 1460 }; /* * Distribution and probability of certain WSCALE values. We have to map the * (send) window scale (shift) option with a range of 0-14 from 4 bits into 3 * bits based on prevalence of certain values. Where we don't have an exact * match for are rounded down to the next lower one letting us under-estimate * the true available window. At the moment this would happen only for the * very uncommon values 3, 5 and those above 8 (more than 16MB socket buffer * and window size). The absence of the WSCALE option (no scaling in either * direction) is encoded with index zero. * [WSCALE values histograms, Allman, 2012] * X 10 10 35 5 6 14 10% by host * X 11 4 5 5 18 49 3% by connections */ static int tcp_sc_wstab[] = { 0, 0, 1, 2, 4, 6, 7, 8 }; /* * Compute the MAC for the SYN cookie. SIPHASH-2-4 is chosen for its speed * and good cryptographic properties. */ static uint32_t syncookie_mac(struct in_conninfo *inc, tcp_seq irs, uint8_t flags, uint8_t *secbits, uintptr_t secmod) { SIPHASH_CTX ctx; uint32_t siphash[2]; SipHash24_Init(&ctx); SipHash_SetKey(&ctx, secbits); switch (inc->inc_flags & INC_ISIPV6) { #ifdef INET case 0: SipHash_Update(&ctx, &inc->inc_faddr, sizeof(inc->inc_faddr)); SipHash_Update(&ctx, &inc->inc_laddr, sizeof(inc->inc_laddr)); break; #endif #ifdef INET6 case INC_ISIPV6: SipHash_Update(&ctx, &inc->inc6_faddr, sizeof(inc->inc6_faddr)); SipHash_Update(&ctx, &inc->inc6_laddr, sizeof(inc->inc6_laddr)); break; #endif } SipHash_Update(&ctx, &inc->inc_fport, sizeof(inc->inc_fport)); SipHash_Update(&ctx, &inc->inc_lport, sizeof(inc->inc_lport)); SipHash_Update(&ctx, &irs, sizeof(irs)); SipHash_Update(&ctx, &flags, sizeof(flags)); SipHash_Update(&ctx, &secmod, sizeof(secmod)); SipHash_Final((u_int8_t *)&siphash, &ctx); return (siphash[0] ^ siphash[1]); } static tcp_seq syncookie_generate(struct syncache_head *sch, struct syncache *sc) { u_int i, mss, secbit, wscale; uint32_t iss, hash; uint8_t *secbits; union syncookie cookie; SCH_LOCK_ASSERT(sch); cookie.cookie = 0; /* Map our computed MSS into the 3-bit index. */ mss = min(tcp_mssopt(&sc->sc_inc), max(sc->sc_peer_mss, V_tcp_minmss)); - for (i = nitems(tcp_sc_msstab) - 1; - tcp_sc_msstab[i] > mss && i > 0; + for (i = nitems(tcp_sc_msstab) - 1; tcp_sc_msstab[i] > mss && i > 0; i--) ; cookie.flags.mss_idx = i; /* * Map the send window scale into the 3-bit index but only if * the wscale option was received. */ if (sc->sc_flags & SCF_WINSCALE) { wscale = sc->sc_requested_s_scale; for (i = nitems(tcp_sc_wstab) - 1; - tcp_sc_wstab[i] > wscale && i > 0; + tcp_sc_wstab[i] > wscale && i > 0; i--) ; cookie.flags.wscale_idx = i; } /* Can we do SACK? */ if (sc->sc_flags & SCF_SACK) cookie.flags.sack_ok = 1; /* Which of the two secrets to use. */ secbit = sch->sch_sc->secret.oddeven & 0x1; cookie.flags.odd_even = secbit; secbits = sch->sch_sc->secret.key[secbit]; hash = syncookie_mac(&sc->sc_inc, sc->sc_irs, cookie.cookie, secbits, (uintptr_t)sch); /* * Put the flags into the hash and XOR them to get better ISS number * variance. This doesn't enhance the cryptographic strength and is * done to prevent the 8 cookie bits from showing up directly on the * wire. */ iss = hash & ~0xff; iss |= cookie.cookie ^ (hash >> 24); /* Randomize the timestamp. */ if (sc->sc_flags & SCF_TIMESTAMP) { sc->sc_ts = arc4random(); sc->sc_tsoff = sc->sc_ts - tcp_ts_getticks(); } TCPSTAT_INC(tcps_sc_sendcookie); return (iss); } static struct syncache * syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, struct socket *lso) { uint32_t hash; uint8_t *secbits; tcp_seq ack, seq; int wnd, wscale = 0; union syncookie cookie; SCH_LOCK_ASSERT(sch); /* * Pull information out of SYN-ACK/ACK and revert sequence number * advances. */ ack = th->th_ack - 1; seq = th->th_seq - 1; /* * Unpack the flags containing enough information to restore the * connection. */ cookie.cookie = (ack & 0xff) ^ (ack >> 24); /* Which of the two secrets to use. */ secbits = sch->sch_sc->secret.key[cookie.flags.odd_even]; hash = syncookie_mac(inc, seq, cookie.cookie, secbits, (uintptr_t)sch); /* The recomputed hash matches the ACK if this was a genuine cookie. */ if ((ack & ~0xff) != (hash & ~0xff)) return (NULL); /* Fill in the syncache values. */ sc->sc_flags = 0; bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); sc->sc_ipopts = NULL; sc->sc_irs = seq; sc->sc_iss = ack; switch (inc->inc_flags & INC_ISIPV6) { #ifdef INET case 0: sc->sc_ip_ttl = sotoinpcb(lso)->inp_ip_ttl; sc->sc_ip_tos = sotoinpcb(lso)->inp_ip_tos; break; #endif #ifdef INET6 case INC_ISIPV6: if (sotoinpcb(lso)->inp_flags & IN6P_AUTOFLOWLABEL) sc->sc_flowlabel = sc->sc_iss & IPV6_FLOWLABEL_MASK; break; #endif } sc->sc_peer_mss = tcp_sc_msstab[cookie.flags.mss_idx]; /* We can simply recompute receive window scale we sent earlier. */ while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < sb_max) wscale++; /* Only use wscale if it was enabled in the orignal SYN. */ if (cookie.flags.wscale_idx > 0) { sc->sc_requested_r_scale = wscale; sc->sc_requested_s_scale = tcp_sc_wstab[cookie.flags.wscale_idx]; sc->sc_flags |= SCF_WINSCALE; } wnd = sbspace(&lso->so_rcv); wnd = imax(wnd, 0); wnd = imin(wnd, TCP_MAXWIN); sc->sc_wnd = wnd; if (cookie.flags.sack_ok) sc->sc_flags |= SCF_SACK; if (to->to_flags & TOF_TS) { sc->sc_flags |= SCF_TIMESTAMP; sc->sc_tsreflect = to->to_tsval; sc->sc_ts = to->to_tsecr; sc->sc_tsoff = to->to_tsecr - tcp_ts_getticks(); } if (to->to_flags & TOF_SIGNATURE) sc->sc_flags |= SCF_SIGNATURE; sc->sc_rxmits = 0; TCPSTAT_INC(tcps_sc_recvcookie); return (sc); } #ifdef INVARIANTS static int syncookie_cmp(struct in_conninfo *inc, struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, struct socket *lso) { struct syncache scs, *scx; char *s; bzero(&scs, sizeof(scs)); scx = syncookie_lookup(inc, sch, &scs, th, to, lso); if ((s = tcp_log_addrs(inc, th, NULL, NULL)) == NULL) return (0); if (scx != NULL) { if (sc->sc_peer_mss != scx->sc_peer_mss) log(LOG_DEBUG, "%s; %s: mss different %i vs %i\n", s, __func__, sc->sc_peer_mss, scx->sc_peer_mss); if (sc->sc_requested_r_scale != scx->sc_requested_r_scale) log(LOG_DEBUG, "%s; %s: rwscale different %i vs %i\n", s, __func__, sc->sc_requested_r_scale, scx->sc_requested_r_scale); if (sc->sc_requested_s_scale != scx->sc_requested_s_scale) log(LOG_DEBUG, "%s; %s: swscale different %i vs %i\n", s, __func__, sc->sc_requested_s_scale, scx->sc_requested_s_scale); if ((sc->sc_flags & SCF_SACK) != (scx->sc_flags & SCF_SACK)) log(LOG_DEBUG, "%s; %s: SACK different\n", s, __func__); } if (s != NULL) free(s, M_TCPLOG); return (0); } #endif /* INVARIANTS */ static void syncookie_reseed(void *arg) { struct tcp_syncache *sc = arg; uint8_t *secbits; int secbit; /* * Reseeding the secret doesn't have to be protected by a lock. * It only must be ensured that the new random values are visible * to all CPUs in a SMP environment. The atomic with release * semantics ensures that. */ secbit = (sc->secret.oddeven & 0x1) ? 0 : 1; secbits = sc->secret.key[secbit]; arc4rand(secbits, SYNCOOKIE_SECRET_SIZE, 0); atomic_add_rel_int(&sc->secret.oddeven, 1); /* Reschedule ourself. */ callout_schedule(&sc->secret.reseed, SYNCOOKIE_LIFETIME * hz); } /* * Exports the syncache entries to userland so that netstat can display * them alongside the other sockets. This function is intended to be * called only from tcp_pcblist. * * Due to concurrency on an active system, the number of pcbs exported * may have no relation to max_pcbs. max_pcbs merely indicates the * amount of space the caller allocated for this function to use. */ int syncache_pcblist(struct sysctl_req *req, int max_pcbs, int *pcbs_exported) { struct xtcpcb xt; struct syncache *sc; struct syncache_head *sch; int count, error, i; for (count = 0, error = 0, i = 0; i < V_tcp_syncache.hashsize; i++) { sch = &V_tcp_syncache.hashbase[i]; SCH_LOCK(sch); TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { if (count >= max_pcbs) { SCH_UNLOCK(sch); goto exit; } if (cr_cansee(req->td->td_ucred, sc->sc_cred) != 0) continue; bzero(&xt, sizeof(xt)); xt.xt_len = sizeof(xt); if (sc->sc_inc.inc_flags & INC_ISIPV6) xt.xt_inp.inp_vflag = INP_IPV6; else xt.xt_inp.inp_vflag = INP_IPV4; bcopy(&sc->sc_inc, &xt.xt_inp.inp_inc, sizeof (struct in_conninfo)); xt.xt_tp.t_inpcb = &xt.xt_inp; xt.xt_tp.t_state = TCPS_SYN_RECEIVED; xt.xt_socket.xso_protocol = IPPROTO_TCP; xt.xt_socket.xso_len = sizeof (struct xsocket); xt.xt_socket.so_type = SOCK_STREAM; xt.xt_socket.so_state = SS_ISCONNECTING; error = SYSCTL_OUT(req, &xt, sizeof xt); if (error) { SCH_UNLOCK(sch); goto exit; } count++; } SCH_UNLOCK(sch); } exit: *pcbs_exported = count; return error; } Index: head/sys/netinet6/in6_proto.c =================================================================== --- head/sys/netinet6/in6_proto.c (revision 298353) +++ head/sys/netinet6/in6_proto.c (revision 298354) @@ -1,628 +1,627 @@ /*- * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the project nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $KAME: in6_proto.c,v 1.91 2001/05/27 13:28:35 itojun Exp $ */ /*- * Copyright (c) 1982, 1986, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)in_proto.c 8.1 (Berkeley) 6/10/93 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_ipstealth.h" #include "opt_sctp.h" #include "opt_mpath.h" #include "opt_route.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RADIX_MPATH #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef SCTP #include #include #include #include #include #endif /* SCTP */ #ifdef IPSEC #include #include #endif /* IPSEC */ #include /* * TCP/IP protocol family: IP6, ICMP6, UDP, TCP. */ FEATURE(inet6, "Internet Protocol version 6"); extern struct domain inet6domain; static struct pr_usrreqs nousrreqs; #define PR_LISTEN 0 #define PR_ABRTACPTDIS 0 /* Spacer for loadable protocols. */ #define IP6PROTOSPACER \ { \ .pr_domain = &inet6domain, \ .pr_protocol = PROTO_SPACER, \ .pr_usrreqs = &nousrreqs \ } struct protosw inet6sw[] = { { .pr_type = 0, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV6, .pr_init = ip6_init, #ifdef VIMAGE .pr_destroy = ip6_destroy, #endif .pr_slowtimo = frag6_slowtimo, .pr_drain = frag6_drain, .pr_usrreqs = &nousrreqs, }, { .pr_type = SOCK_DGRAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_UDP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = udp6_input, .pr_ctlinput = udp6_ctlinput, .pr_ctloutput = ip6_ctloutput, #ifndef INET /* Do not call initialization twice. */ .pr_init = udp_init, #endif .pr_usrreqs = &udp6_usrreqs, }, { .pr_type = SOCK_STREAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_TCP, .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_LISTEN, .pr_input = tcp6_input, .pr_ctlinput = tcp6_ctlinput, .pr_ctloutput = tcp_ctloutput, #ifndef INET /* don't call initialization and timeout routines twice */ .pr_init = tcp_init, .pr_slowtimo = tcp_slowtimo, #endif .pr_drain = tcp_drain, .pr_usrreqs = &tcp6_usrreqs, }, #ifdef SCTP { .pr_type = SOCK_SEQPACKET, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_WANTRCVD, .pr_input = sctp6_input, .pr_ctlinput = sctp6_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_drain = sctp_drain, #ifndef INET /* Do not call initialization twice. */ .pr_init = sctp_init, #endif .pr_usrreqs = &sctp6_usrreqs }, { .pr_type = SOCK_STREAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_SCTP, .pr_flags = PR_WANTRCVD, .pr_input = sctp6_input, .pr_ctlinput = sctp6_ctlinput, .pr_ctloutput = sctp_ctloutput, .pr_drain = sctp_drain, .pr_usrreqs = &sctp6_usrreqs }, #endif /* SCTP */ { .pr_type = SOCK_DGRAM, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_UDPLITE, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = udp6_input, .pr_ctlinput = udplite6_ctlinput, .pr_ctloutput = udp_ctloutput, #ifndef INET /* Do not call initialization twice. */ .pr_init = udplite_init, #endif .pr_usrreqs = &udp6_usrreqs, }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_RAW, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = rip6_input, .pr_output = rip6_output, .pr_ctlinput = rip6_ctlinput, .pr_ctloutput = rip6_ctloutput, #ifndef INET /* Do not call initialization twice. */ .pr_init = rip_init, #endif .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_ICMPV6, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = icmp6_input, .pr_output = rip6_output, .pr_ctlinput = rip6_ctlinput, .pr_ctloutput = rip6_ctloutput, .pr_fasttimo = icmp6_fasttimo, .pr_slowtimo = icmp6_slowtimo, .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_DSTOPTS, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = dest6_input, .pr_usrreqs = &nousrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_ROUTING, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = route6_input, .pr_usrreqs = &nousrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_FRAGMENT, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = frag6_input, .pr_usrreqs = &nousrreqs }, #ifdef IPSEC { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_AH, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = ipsec6_common_input, .pr_usrreqs = &nousrreqs, }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_ESP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = ipsec6_common_input, .pr_ctlinput = esp6_ctlinput, .pr_usrreqs = &nousrreqs, }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPCOMP, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = ipsec6_common_input, .pr_usrreqs = &nousrreqs, }, #endif /* IPSEC */ #ifdef INET { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV4, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_init = encap_init, .pr_usrreqs = &rip6_usrreqs }, #endif /* INET */ { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_IPV6, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_init = encap_init, .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_GRE, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_init = encap_init, .pr_usrreqs = &rip6_usrreqs }, { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_protocol = IPPROTO_PIM, .pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR, .pr_input = encap6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, /* Spacer n-times for loadable protocols. */ IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, IP6PROTOSPACER, /* raw wildcard */ { .pr_type = SOCK_RAW, .pr_domain = &inet6domain, .pr_flags = PR_ATOMIC|PR_ADDR, .pr_input = rip6_input, .pr_output = rip6_output, .pr_ctloutput = rip6_ctloutput, .pr_usrreqs = &rip6_usrreqs }, }; extern int in6_inithead(void **, int); #ifdef VIMAGE extern int in6_detachhead(void **, int); #endif struct domain inet6domain = { .dom_family = AF_INET6, .dom_name = "internet6", .dom_protosw = (struct protosw *)inet6sw, - .dom_protoswNPROTOSW = (struct protosw *) - &inet6sw[nitems(inet6sw)], + .dom_protoswNPROTOSW = (struct protosw *)&inet6sw[nitems(inet6sw)], #ifdef RADIX_MPATH .dom_rtattach = rn6_mpath_inithead, #else .dom_rtattach = in6_inithead, #endif #ifdef VIMAGE .dom_rtdetach = in6_detachhead, #endif .dom_ifattach = in6_domifattach, .dom_ifdetach = in6_domifdetach, .dom_ifmtu = in6_domifmtu }; VNET_DOMAIN_SET(inet6); /* * Internet configuration info */ #ifndef IPV6FORWARDING #ifdef GATEWAY6 #define IPV6FORWARDING 1 /* forward IP6 packets not for us */ #else #define IPV6FORWARDING 0 /* don't forward IP6 packets not for us */ #endif /* GATEWAY6 */ #endif /* !IPV6FORWARDING */ #ifndef IPV6_SENDREDIRECTS #define IPV6_SENDREDIRECTS 1 #endif VNET_DEFINE(int, ip6_forwarding) = IPV6FORWARDING; /* act as router? */ VNET_DEFINE(int, ip6_sendredirects) = IPV6_SENDREDIRECTS; VNET_DEFINE(int, ip6_defhlim) = IPV6_DEFHLIM; VNET_DEFINE(int, ip6_defmcasthlim) = IPV6_DEFAULT_MULTICAST_HOPS; VNET_DEFINE(int, ip6_accept_rtadv) = 0; VNET_DEFINE(int, ip6_no_radr) = 0; VNET_DEFINE(int, ip6_norbit_raif) = 0; VNET_DEFINE(int, ip6_rfc6204w3) = 0; VNET_DEFINE(int, ip6_maxfragpackets); /* initialized in frag6.c:frag6_init() */ VNET_DEFINE(int, ip6_maxfrags); /* initialized in frag6.c:frag6_init() */ VNET_DEFINE(int, ip6_log_interval) = 5; VNET_DEFINE(int, ip6_hdrnestlimit) = 15;/* How many header options will we * process? */ VNET_DEFINE(int, ip6_dad_count) = 1; /* DupAddrDetectionTransmits */ VNET_DEFINE(int, ip6_auto_flowlabel) = 1; VNET_DEFINE(int, ip6_use_deprecated) = 1;/* allow deprecated addr * (RFC2462 5.5.4) */ VNET_DEFINE(int, ip6_rr_prune) = 5; /* router renumbering prefix * walk list every 5 sec. */ VNET_DEFINE(int, ip6_mcast_pmtu) = 0; /* enable pMTU discovery for multicast? */ VNET_DEFINE(int, ip6_v6only) = 1; VNET_DEFINE(time_t, ip6_log_time) = (time_t)0L; #ifdef IPSTEALTH VNET_DEFINE(int, ip6stealth) = 0; #endif VNET_DEFINE(int, nd6_onlink_ns_rfc4861) = 0;/* allow 'on-link' nd6 NS * (RFC 4861) */ /* icmp6 */ /* * BSDI4 defines these variables in in_proto.c... * XXX: what if we don't define INET? Should we define pmtu6_expire * or so? (jinmei@kame.net 19990310) */ VNET_DEFINE(int, pmtu_expire) = 60*10; VNET_DEFINE(int, pmtu_probe) = 60*2; /* ICMPV6 parameters */ VNET_DEFINE(int, icmp6_rediraccept) = 1;/* accept and process redirects */ VNET_DEFINE(int, icmp6_redirtimeout) = 10 * 60; /* 10 minutes */ VNET_DEFINE(int, icmp6errppslim) = 100; /* 100pps */ /* control how to respond to NI queries */ VNET_DEFINE(int, icmp6_nodeinfo) = (ICMP6_NODEINFO_FQDNOK|ICMP6_NODEINFO_NODEADDROK); VNET_DEFINE(int, icmp6_nodeinfo_oldmcprefix) = 1; /* * sysctl related items. */ SYSCTL_NODE(_net, PF_INET6, inet6, CTLFLAG_RW, 0, "Internet6 Family"); /* net.inet6 */ SYSCTL_NODE(_net_inet6, IPPROTO_IPV6, ip6, CTLFLAG_RW, 0, "IP6"); SYSCTL_NODE(_net_inet6, IPPROTO_ICMPV6, icmp6, CTLFLAG_RW, 0, "ICMP6"); SYSCTL_NODE(_net_inet6, IPPROTO_UDP, udp6, CTLFLAG_RW, 0, "UDP6"); SYSCTL_NODE(_net_inet6, IPPROTO_TCP, tcp6, CTLFLAG_RW, 0, "TCP6"); #ifdef SCTP SYSCTL_NODE(_net_inet6, IPPROTO_SCTP, sctp6, CTLFLAG_RW, 0, "SCTP6"); #endif #ifdef IPSEC SYSCTL_NODE(_net_inet6, IPPROTO_ESP, ipsec6, CTLFLAG_RW, 0, "IPSEC6"); #endif /* IPSEC */ /* net.inet6.ip6 */ static int sysctl_ip6_temppltime(SYSCTL_HANDLER_ARGS) { int error = 0; int old; error = SYSCTL_OUT(req, arg1, sizeof(int)); if (error || !req->newptr) return (error); old = V_ip6_temp_preferred_lifetime; error = SYSCTL_IN(req, arg1, sizeof(int)); if (V_ip6_temp_preferred_lifetime < V_ip6_desync_factor + V_ip6_temp_regen_advance) { V_ip6_temp_preferred_lifetime = old; return (EINVAL); } return (error); } static int sysctl_ip6_tempvltime(SYSCTL_HANDLER_ARGS) { int error = 0; int old; error = SYSCTL_OUT(req, arg1, sizeof(int)); if (error || !req->newptr) return (error); old = V_ip6_temp_valid_lifetime; error = SYSCTL_IN(req, arg1, sizeof(int)); if (V_ip6_temp_valid_lifetime < V_ip6_temp_preferred_lifetime) { V_ip6_temp_preferred_lifetime = old; return (EINVAL); } return (error); } SYSCTL_INT(_net_inet6_ip6, IPV6CTL_FORWARDING, forwarding, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_forwarding), 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_SENDREDIRECTS, redirect, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_sendredirects), 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFHLIM, hlim, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_defhlim), 0, ""); SYSCTL_VNET_PCPUSTAT(_net_inet6_ip6, IPV6CTL_STATS, stats, struct ip6stat, ip6stat, "IP6 statistics (struct ip6stat, netinet6/ip6_var.h)"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, maxfragpackets, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragpackets), 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_ACCEPT_RTADV, accept_rtadv, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_accept_rtadv), 0, "Default value of per-interface flag for accepting ICMPv6 Router" "Advertisement messages"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_NO_RADR, no_radr, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_no_radr), 0, "Default value of per-interface flag to control whether routers " "sending ICMPv6 RA messages on that interface are added into the " "default router list."); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_NORBIT_RAIF, norbit_raif, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_norbit_raif), 0, "Always set 0 to R flag in ICMPv6 NA messages when accepting RA" " on the interface."); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RFC6204W3, rfc6204w3, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_rfc6204w3), 0, "Accept the default router list from ICMPv6 RA messages even " "when packet forwarding enabled."); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_LOG_INTERVAL, log_interval, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_log_interval), 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_HDRNESTLIMIT, hdrnestlimit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_hdrnestlimit), 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DAD_COUNT, dad_count, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_dad_count), 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_AUTO_FLOWLABEL, auto_flowlabel, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_auto_flowlabel), 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFMCASTHLIM, defmcasthlim, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_defmcasthlim), 0, ""); SYSCTL_STRING(_net_inet6_ip6, IPV6CTL_KAME_VERSION, kame_version, CTLFLAG_RD, __KAME_VERSION, 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEPRECATED, use_deprecated, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_deprecated), 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RR_PRUNE, rr_prune, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_rr_prune), 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USETEMPADDR, use_tempaddr, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_tempaddr), 0, ""); SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_TEMPPLTIME, temppltime, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ip6_temp_preferred_lifetime), 0, sysctl_ip6_temppltime, "I", ""); SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_TEMPVLTIME, tempvltime, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, &VNET_NAME(ip6_temp_valid_lifetime), 0, sysctl_ip6_tempvltime, "I", ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_V6ONLY, v6only, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_v6only), 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_AUTO_LINKLOCAL, auto_linklocal, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_auto_linklocal), 0, "Default value of per-interface flag for automatically adding an IPv6" " link-local address to interfaces when attached"); SYSCTL_VNET_PCPUSTAT(_net_inet6_ip6, IPV6CTL_RIP6STATS, rip6stats, struct rip6stat, rip6stat, "Raw IP6 statistics (struct rip6stat, netinet6/raw_ip6.h)"); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_PREFER_TEMPADDR, prefer_tempaddr, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_prefer_tempaddr), 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEFAULTZONE, use_defaultzone, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_defzone), 0,""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGS, maxfrags, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfrags), 0, ""); SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MCAST_PMTU, mcast_pmtu, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_mcast_pmtu), 0, ""); #ifdef IPSTEALTH SYSCTL_INT(_net_inet6_ip6, IPV6CTL_STEALTH, stealth, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6stealth), 0, ""); #endif /* net.inet6.icmp6 */ SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRACCEPT, rediraccept, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_rediraccept), 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRTIMEOUT, redirtimeout, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_redirtimeout), 0, ""); SYSCTL_VNET_PCPUSTAT(_net_inet6_icmp6, ICMPV6CTL_STATS, stats, struct icmp6stat, icmp6stat, "ICMPv6 statistics (struct icmp6stat, netinet/icmp6.h)"); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_PRUNE, nd6_prune, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_prune), 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DELAY, nd6_delay, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_delay), 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_UMAXTRIES, nd6_umaxtries, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_umaxtries), 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MMAXTRIES, nd6_mmaxtries, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_mmaxtries), 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_USELOOPBACK, nd6_useloopback, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_useloopback), 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_NODEINFO, nodeinfo, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_nodeinfo), 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_NODEINFO_OLDMCPREFIX, nodeinfo_oldmcprefix, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_nodeinfo_oldmcprefix), 0, "Join old IPv6 NI group address in draft-ietf-ipngwg-icmp-name-lookup" " for compatibility with KAME implememtation."); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ERRPPSLIMIT, errppslimit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6errppslim), 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXNUDHINT, nd6_maxnudhint, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_maxnudhint), 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DEBUG, nd6_debug, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_debug), 0, ""); SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_ONLINKNSRFC4861, nd6_onlink_ns_rfc4861, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_onlink_ns_rfc4861), 0, "Accept 'on-link' nd6 NS in compliance with RFC 4861.");