Index: sys/amd64/linux/linux_dummy.c
===================================================================
--- sys/amd64/linux/linux_dummy.c
+++ sys/amd64/linux/linux_dummy.c
@@ -132,7 +132,6 @@
 /* Linux 3.15: */
 DUMMY(kexec_file_load);
 /* Linux 3.17: */
-DUMMY(memfd_create);
 DUMMY(seccomp);
 /* Linux 3.18: */
 DUMMY(bpf);
Index: sys/amd64/linux32/linux32_dummy.c
===================================================================
--- sys/amd64/linux32/linux32_dummy.c
+++ sys/amd64/linux32/linux32_dummy.c
@@ -136,7 +136,6 @@
 DUMMY(sched_setattr);
 DUMMY(sched_getattr);
 /* Linux 3.17: */
-DUMMY(memfd_create);
 DUMMY(seccomp);
 /* Linux 3.18: */
 DUMMY(bpf);
Index: sys/arm64/linux/linux_dummy.c
===================================================================
--- sys/arm64/linux/linux_dummy.c
+++ sys/arm64/linux/linux_dummy.c
@@ -130,7 +130,6 @@
 DUMMY(sched_setattr);
 DUMMY(sched_getattr);
 /* Linux 3.17: */
-DUMMY(memfd_create);
 DUMMY(seccomp);
 /* Linux 3.18: */
 DUMMY(bpf);
Index: sys/compat/linux/linux.h
===================================================================
--- sys/compat/linux/linux.h
+++ sys/compat/linux/linux.h
@@ -146,4 +146,51 @@
 void linux_dev_shm_create(void);
 void linux_dev_shm_destroy(void);
 
+/*
+ * mask=0 is not sensible for this application, so it will be taken to mean
+ * a mask equivalent to the value.  Otherwise, (word & mask) == value maps to
+ * (word & ~mask) | value in a bitfield for the platform we're converting to.
+ */
+struct bsd_to_linux_bitmap {
+	int	bsd_mask;
+	int	bsd_value;
+	int	linux_mask;
+	int	linux_value;
+};
+
+/*
+ * These functions are used for simplification of BSD <-> Linux bit conversions.
+ * Given `value`, a bit field, these functions will walk the given bitmap table
+ * and set the appropriate bits for the target platform.  If any bits were
+ * successfully converted, then the return value is the equivalent of value
+ * represented with the bit values appropriate for the target platform.
+ * Otherwise, the value supplied as `no_value` is returned.  Bits in `value`
+ * that match no table entry are dropped without notice, so callers that must
+ * reject unknown bits have to check for them before converting.  The macros
+ * below fill in mapcnt; they require the table to be a real array.
+ */
+int bsd_to_linux_bits_(int value, struct bsd_to_linux_bitmap *bitmap,
+    size_t mapcnt, int no_value);
+int linux_to_bsd_bits_(int value, struct bsd_to_linux_bitmap *bitmap,
+    size_t mapcnt, int no_value);
+
+#define	bsd_to_linux_bits(_val, _bmap, _noval) \
+    bsd_to_linux_bits_((_val), (_bmap), nitems((_bmap)), (_noval))
+#define	linux_to_bsd_bits(_val, _bmap, _noval) \
+    linux_to_bsd_bits_((_val), (_bmap), nitems((_bmap)), (_noval))
+
+/*
+ * Easy mapping helpers.  BITMAP_EASY_LINUX represents a single bit to be
+ * translated, and the FreeBSD and Linux values are supplied.  BITMAP_1t1_LINUX
+ * is the extreme version of this, where not only is it a single bit, but the
+ * name of the macro used to represent the Linux version of a bit literally has
+ * LINUX_ prepended to the normal name.
+ */
+#define	BITMAP_EASY_LINUX(_name, _linux_name)	\
+	{					\
+		.bsd_value = (_name),		\
+		.linux_value = (_linux_name),	\
+	}
+#define	BITMAP_1t1_LINUX(_name)	BITMAP_EASY_LINUX(_name, LINUX_##_name)
+
 #endif /* _LINUX_MI_H_ */
Index: sys/compat/linux/linux.c
===================================================================
--- sys/compat/linux/linux.c
+++ sys/compat/linux/linux.c
@@ -551,3 +551,79 @@
 
 	destroy_dev(dev_shm_cdev);
 }
+
+int
+bsd_to_linux_bits_(int value, struct bsd_to_linux_bitmap *bitmap,
+    size_t mapcnt, int no_value)
+{
+	int bsd_mask, bsd_value, linux_mask, linux_value;
+	int linux_ret;
+	size_t i;
+	bool applied;
+
+	applied = false;
+	linux_ret = 0;
+	for (i = 0; i < mapcnt; ++i) {
+		bsd_mask = bitmap[i].bsd_mask;
+		bsd_value = bitmap[i].bsd_value;
+		if (bsd_mask == 0)
+			bsd_mask = bsd_value;
+
+		linux_mask = bitmap[i].linux_mask;
+		linux_value = bitmap[i].linux_value;
+		if (linux_mask == 0)
+			linux_mask = linux_value;
+
+		/*
+		 * If a mask larger than just the value is set, we explicitly
+		 * want to make sure that only this bit we mapped within that
+		 * mask is set.
+		 */
+		if ((value & bsd_mask) == bsd_value) {
+			linux_ret = (linux_ret & ~linux_mask) | linux_value;
+			applied = true;
+		}
+	}
+
+	if (!applied)
+		return (no_value);
+	return (linux_ret);
+}
+
+int
+linux_to_bsd_bits_(int value, struct bsd_to_linux_bitmap *bitmap,
+    size_t mapcnt, int no_value)
+{
+	int bsd_mask, bsd_value, linux_mask, linux_value;
+	int bsd_ret;
+	size_t i;
+	bool applied;
+
+	applied = false;
+	bsd_ret = 0;
+	for (i = 0; i < mapcnt; ++i) {
+		bsd_mask = bitmap[i].bsd_mask;
+		bsd_value = bitmap[i].bsd_value;
+		if (bsd_mask == 0)
+			bsd_mask = bsd_value;
+
+		linux_mask = bitmap[i].linux_mask;
+		linux_value = bitmap[i].linux_value;
+		if (linux_mask == 0)
+			linux_mask = linux_value;
+
+		/*
+		 * If a mask larger than just the value is set, we explicitly
+		 * want to make sure that only this bit we mapped within that
+		 * mask is set.
+		 */
+		if ((value & linux_mask) == linux_value) {
+			bsd_ret = (bsd_ret & ~bsd_mask) | bsd_value;
+			applied = true;
+		}
+	}
+
+	if (!applied)
+		return (no_value);
+	return (bsd_ret);
+}
Index: sys/compat/linux/linux_file.h
===================================================================
--- sys/compat/linux/linux_file.h
+++ sys/compat/linux/linux_file.h
@@ -113,6 +113,9 @@
 #define	LINUX_F_SETPIPE_SZ	(LINUX_F_SPECIFIC_BASE + 7)
 #define	LINUX_F_GETPIPE_SZ	(LINUX_F_SPECIFIC_BASE + 8)
 
+#define	LINUX_F_ADD_SEALS	(LINUX_F_SPECIFIC_BASE + 9)
+#define	LINUX_F_GET_SEALS	(LINUX_F_SPECIFIC_BASE + 10)
+
 #define	LINUX_F_GETLKP		36
 #define	LINUX_F_SETLKP		37
 #define	LINUX_F_SETLKPW		38
@@ -141,4 +144,29 @@
 #define	LINUX_SYNC_FILE_RANGE_WRITE		2
 #define	LINUX_SYNC_FILE_RANGE_WAIT_AFTER	4
 
+#define	LINUX_F_SEAL_SEAL	0x0001
+#define	LINUX_F_SEAL_SHRINK	0x0002
+#define	LINUX_F_SEAL_GROW	0x0004
+#define	LINUX_F_SEAL_WRITE	0x0008
+
+#define	LINUX_MFD_CLOEXEC	0x0001
+#define	LINUX_MFD_ALLOW_SEALING	0x0002
+#define	LINUX_MFD_HUGETLB	0x0004
+
+#define	LINUX_HUGETLB_FLAG_ENCODE_SHIFT	26
+#define	LINUX_HUGETLB_FLAG_ENCODE_MASK	0x3f
+
+#define	LINUX_HUGETLB_FLAG_ENCODE_64KB	(16 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
+#define	LINUX_HUGETLB_FLAG_ENCODE_512KB	(19 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
+#define	LINUX_HUGETLB_FLAG_ENCODE_1MB	(20 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
+#define	LINUX_HUGETLB_FLAG_ENCODE_2MB	(21 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
+#define	LINUX_HUGETLB_FLAG_ENCODE_8MB	(23 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
+#define	LINUX_HUGETLB_FLAG_ENCODE_16MB	(24 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
+#define	LINUX_HUGETLB_FLAG_ENCODE_32MB	(25 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
+#define	LINUX_HUGETLB_FLAG_ENCODE_256MB	(28 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
+#define	LINUX_HUGETLB_FLAG_ENCODE_512MB	(29 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
+#define	LINUX_HUGETLB_FLAG_ENCODE_1GB	(30 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
+#define	LINUX_HUGETLB_FLAG_ENCODE_2GB	(31 << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
+#define	LINUX_HUGETLB_FLAG_ENCODE_16GB	(34U << LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
+
 #endif	/* !_LINUX_FILE_H_ */
Index: sys/compat/linux/linux_file.c
===================================================================
--- sys/compat/linux/linux_file.c
+++ sys/compat/linux/linux_file.c
@@ -41,6 +41,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -67,6 +68,36 @@
 
 static int	linux_common_open(struct thread *, int, char *, int, int);
 static int	linux_getdents_error(struct thread *, int, int);
+static struct bsd_to_linux_bitmap seal_bitmap[] = {
+	BITMAP_1t1_LINUX(F_SEAL_SEAL),
+	BITMAP_1t1_LINUX(F_SEAL_SHRINK),
+	BITMAP_1t1_LINUX(F_SEAL_GROW),
+	BITMAP_1t1_LINUX(F_SEAL_WRITE),
+};
+
+#define	MFD_HUGETLB_ENTRY(_size)					\
+	{								\
+		.bsd_value = MFD_HUGE_##_size,				\
+		.linux_value = LINUX_HUGETLB_FLAG_ENCODE_##_size	\
+	}
+static struct bsd_to_linux_bitmap mfd_bitmap[] = {
+	BITMAP_1t1_LINUX(MFD_CLOEXEC),
+	BITMAP_1t1_LINUX(MFD_ALLOW_SEALING),
+	BITMAP_1t1_LINUX(MFD_HUGETLB),
+	MFD_HUGETLB_ENTRY(64KB),
+	MFD_HUGETLB_ENTRY(512KB),
+	MFD_HUGETLB_ENTRY(1MB),
+	MFD_HUGETLB_ENTRY(2MB),
+	MFD_HUGETLB_ENTRY(8MB),
+	MFD_HUGETLB_ENTRY(16MB),
+	MFD_HUGETLB_ENTRY(32MB),
+	MFD_HUGETLB_ENTRY(256MB),
+	MFD_HUGETLB_ENTRY(512MB),
+	MFD_HUGETLB_ENTRY(1GB),
+	MFD_HUGETLB_ENTRY(2GB),
+	MFD_HUGETLB_ENTRY(16GB),
+};
+#undef MFD_HUGETLB_ENTRY
 
 #ifdef LINUX_LEGACY_SYSCALLS
 int
@@ -1324,6 +1355,30 @@
 
 	case LINUX_F_DUPFD_CLOEXEC:
 		return (kern_fcntl(td, args->fd, F_DUPFD_CLOEXEC, args->arg));
+
+	/*
+	 * Our F_SEAL_* values match the Linux ones for maximum compatibility,
+	 * so only the fcntl(2) command values strictly need translating.  The
+	 * seal bits are nevertheless passed through seal_bitmap.
+	 */
+	case LINUX_F_GET_SEALS:
+		error = kern_fcntl(td, args->fd, F_GET_SEALS, 0);
+		if (error != 0)
+			return (error);
+		td->td_retval[0] = bsd_to_linux_bits(td->td_retval[0],
+		    seal_bitmap, 0);
+		return (0);
+
+	case LINUX_F_ADD_SEALS:
+		/*
+		 * linux_to_bsd_bits() drops bits that are not in seal_bitmap,
+		 * so seals unknown to us have to be rejected up front.
+		 */
+		if ((args->arg & ~(LINUX_F_SEAL_SEAL | LINUX_F_SEAL_SHRINK |
+		    LINUX_F_SEAL_GROW | LINUX_F_SEAL_WRITE)) != 0)
+			return (EINVAL);
+		return (kern_fcntl(td, args->fd, F_ADD_SEALS,
+		    linux_to_bsd_bits(args->arg, seal_bitmap, 0)));
 	}
 
 	return (EINVAL);
@@ -1606,3 +1661,61 @@
 
 	return (error);
 }
+#define	LINUX_MEMFD_PREFIX	"memfd:"
+#define	LINUX_MEMFD_PREFIX_LEN	(sizeof(LINUX_MEMFD_PREFIX) - 1)
+/* Huge page size field, and every MFD flag bit linux_file.h declares. */
+#define	LINUX_MEMFD_HUGE_MASK						\
+	((unsigned int)LINUX_HUGETLB_FLAG_ENCODE_MASK <<		\
+	    LINUX_HUGETLB_FLAG_ENCODE_SHIFT)
+#define	LINUX_MEMFD_VALID_FLAGS						\
+	(LINUX_MFD_CLOEXEC | LINUX_MFD_ALLOW_SEALING | LINUX_MFD_HUGETLB | \
+	    LINUX_MEMFD_HUGE_MASK)
+
+int
+linux_memfd_create(struct thread *td, struct linux_memfd_create_args *args)
+{
+	char memfd_name[LINUX_NAME_MAX + 1];
+	unsigned int lflags;
+	int error, flags, shmflags, oflags;
+
+	/*
+	 * Copy the name straight into the stack buffer, just past the room
+	 * reserved for the prefix.  This avoids a heap allocation, so nothing
+	 * below has to worry about cleanup on the way out.  The limit is the
+	 * space left after the prefix, terminating NUL included.
+	 */
+	error = copyinstr(args->uname_ptr,
+	    memfd_name + LINUX_MEMFD_PREFIX_LEN,
+	    sizeof(memfd_name) - LINUX_MEMFD_PREFIX_LEN, NULL);
+	if (error != 0) {
+		if (error == ENAMETOOLONG)
+			error = EINVAL;
+		return (error);
+	}
+	memcpy(memfd_name, LINUX_MEMFD_PREFIX, LINUX_MEMFD_PREFIX_LEN);
+
+	/*
+	 * Validate the raw Linux flags: linux_to_bsd_bits() only carries over
+	 * bits listed in mfd_bitmap, so an unknown bit could never be noticed
+	 * in the translated value.
+	 */
+	lflags = args->flags;
+	if ((lflags & ~LINUX_MEMFD_VALID_FLAGS) != 0)
+		return (EINVAL);
+	/* Size specified but no HUGETLB. */
+	if ((lflags & LINUX_MEMFD_HUGE_MASK) != 0 &&
+	    (lflags & LINUX_MFD_HUGETLB) == 0)
+		return (EINVAL);
+	/* We don't actually support HUGETLB. */
+	if ((lflags & LINUX_MFD_HUGETLB) != 0)
+		return (ENOSYS);
+
+	flags = linux_to_bsd_bits(args->flags, mfd_bitmap, 0);
+	oflags = O_RDWR;
+	shmflags = 0;
+	if ((flags & MFD_CLOEXEC) != 0)
+		oflags |= O_CLOEXEC;
+	if ((flags & MFD_ALLOW_SEALING) != 0)
+		shmflags |= SHM_ALLOW_SEALING;
+	return (kern_shm_open2(td, SHM_ANON, oflags, 0, shmflags, memfd_name));
+}
Index: sys/i386/linux/linux_dummy.c
===================================================================
--- sys/i386/linux/linux_dummy.c
+++ sys/i386/linux/linux_dummy.c
@@ -132,7 +132,6 @@
 DUMMY(sched_setattr);
 DUMMY(sched_getattr);
 /* Linux 3.17: */
-DUMMY(memfd_create);
 DUMMY(seccomp);
 /* Linux 3.18: */
 DUMMY(bpf);