diff --git a/sbin/Makefile b/sbin/Makefile --- a/sbin/Makefile +++ b/sbin/Makefile @@ -44,6 +44,7 @@ mount_msdosfs \ mount_nfs \ mount_nullfs \ + mount_qemufwcfg \ mount_udf \ mount_unionfs \ newfs \ diff --git a/sbin/mount_qemufwcfg/Makefile b/sbin/mount_qemufwcfg/Makefile new file mode 100644 --- /dev/null +++ b/sbin/mount_qemufwcfg/Makefile @@ -0,0 +1,15 @@ +.include + +PROG_CXX=mount_qemufwcfg +SRCS= mount_qemufwcfg.cc +MAN= mount_qemufwcfg.8 + +WARNS?= 5 + +CXXSTD= c++20 + +CXXFLAGS+=-I${SYSDIR}/fs/fuse -I${SYSDIR} + +NO_SHARED?=NO + +.include diff --git a/sbin/mount_qemufwcfg/mount_qemufwcfg.8 b/sbin/mount_qemufwcfg/mount_qemufwcfg.8 new file mode 100644 --- /dev/null +++ b/sbin/mount_qemufwcfg/mount_qemufwcfg.8 @@ -0,0 +1,96 @@ +.\" $NetBSD: mount_qemufwcfg.8,v 1.3 2020/04/29 09:54:43 gson Exp $ +.\" +.\" Copyright (c) 2017 The NetBSD Foundation, Inc. +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS +.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +.\" PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS +.\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +.\" POSSIBILITY OF SUCH DAMAGE. +.\" +.Dd April 29, 2020 +.Dt MOUNT_QEMUFWCFG 8 +.Os +.Sh NAME +.Nm mount_qemufwcfg +.Nd provide QEMU fw_cfg data as a file system +.Sh SYNOPSIS +.Nm +.Op Fl F Ar path +.Op Fl g Ar gid +.Op Fl M Ar dir-mode +.Op Fl m Ar file-mode +.Op Fl u Ar uid +.Op Ar fuse-options +.Ar node +.Sh DESCRIPTION +The +.Nm +command provides the QEMU fw_cfg configuration files in a file system +tree at point +.Ar node . +The directory specified by +.Ar node +is converted to an absolute path before use. +.Pp +The options are as follows: +.Bl -tag -width Ds +.It Fl F Ar path +Use +.Ar path +instead of +.Pa /dev/qemufwcfg +for the QEMU device. +.It Fl g Ar gid +Use +.Ar gid +as group for files in the file system instead of the active group id. +.It Fl M Ar dir-mode +Use +.Ar dir-mode +as permissions for directories instead of the default +.Ar 0555 . +.It Fl m Ar file-mode +Use +.Ar file-mode +as permissions for files instead of the default +.Ar 0444 . +.It Fl u Ar uid +Use +.Ar uid +as user for files in the file system instead of the active user id. +.El +.Sh SEE ALSO +.Xr qemufwcfg 4 +.Sh HISTORY +A +.Nm +command first appeared in +.Nx 9.0 . +The +.Fx +version is a reimplementation to avoid the libfuse dependency and add +Capsicum support. 
+.Sh AUTHORS +The utility was written by +.An David Chisnall . +This man page and the +.Nx +version were written by +.An Jared McNeill Aq Mt jmcneill@invisible.ca . diff --git a/sbin/mount_qemufwcfg/mount_qemufwcfg.cc b/sbin/mount_qemufwcfg/mount_qemufwcfg.cc new file mode 100644 --- /dev/null +++ b/sbin/mount_qemufwcfg/mount_qemufwcfg.cc @@ -0,0 +1,742 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 David Chisnall + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef WITHOUT_CAPSICUM +#if __has_include() +#include +#else +#define WITHOUT_CAPSICUM +#endif +#endif + +#include + +#include + +#include + +#include +#include +#include + +#include "tinyfuse.hh" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +/** + * Enable debugging messages if we are compiling a debug build. + */ +constexpr bool Debug = false; + +/** + * Enable caching of files. This adds a very small performance improvement + * (typically under 10%)in exchange for some memory overhead. This defaults to + * off because performance is very rarely a requirement for this. We can add a + * configuration option to enable it if someone has a use case. + */ +constexpr bool Cache = false; + +/** + * Class implementing a FUSE filesystem for the QEMU FW CFG device. + * + * The underlying device contains a set of blobs indexed by a 16-bit + * identifier. One of these is a catalogue, providing name to index mappings. + * Those names may contain slashes and so can be interpreted as paths. This + * filesystem builds a virtual directory structure from the names and exposes + * them as a real filesystem. + */ +class QemuFWCfgFuseFS : public FuseFS { + /** + * File structure. Contains the information from the device: the size + * and the selector used to access this 'file'. + */ + struct File { + /// The file size. + uint32_t size = 0; + /// The selector for this file. + uint16_t selector = 0; + }; + + /** + * QEMU Firmware Config 'file'. This is the catalog entry returned from + * the device to describe the names of the other entries. + */ + struct FWCfgFile { + /// The number of bytes of the 'file' that this describes. + uint32_t size; /* size of referenced fw_cfg item, big-endian */ + /// The selector used to access this file. + uint16_t selector; /* selector key of fw_cfg item, big-endian */ + /// Padding + uint16_t reserved; + /// Full file path, null-terminated string. 
+ char name[56]; /* fw_cfg item name, NUL-terminated ascii */ + }; + + struct Directory; + + /** + * Helper for (shared) pointers to directories. + */ + using DirectoryPointer = std::shared_ptr; + + /** + * Objects in the 'filesystem' are either other directories or files. + */ + using FilesystemObject = std::variant; + + /** + * Directory. Contains a map of names to children, which may be files + * or other directories. + */ + struct Directory { + /** + * The first inode to allocate to a directory. + */ + static constexpr uint32_t FirstDirectoryInode = + std::numeric_limits::max() + 1; + + /** + * Next inode number to assign to directories. Directory inodes + * are allocated after selectors. + */ + inline static uint32_t nextDirectoryInode = FirstDirectoryInode; + + /** + * Ordered map from file names to children. + */ + std::map children; + + /** + * Cached version of the directory entries. + */ + VariableSizeResponse direntCache; + + /** + * Inode for this directory. + */ + const uint32_t inode; + + /** + * Default constructor, allocates a directory with the next + * available inode. + */ + Directory() + : Directory(nextDirectoryInode++) + { + } + + /** + * Construct a directory with the specified inode. + */ + Directory(uint32_t inode) + : inode(inode) + { + } + }; + + /** + * Look up the inode for an object in the filesystem. This is either + * the selector for 'files' or a number outside the valid selector range + * for directories. + */ + uint32_t inode_for_filesystem_object(FilesystemObject filesystemObject) + { + uint32_t ret = 0; + std::visit( + [&ret](auto &&object) { + if constexpr (std::is_same_v>) { + ret = object.selector; + } else if constexpr (std::is_same_v< + DirectoryPointer, + std::remove_cvref_t< + decltype(object)>>) { + ret = object->inode; + } else { + } + }, + filesystemObject); + return ret; + } + + /** + * Add a subdirectory to the current directory. 
If the directory + * already exists, the existing one is returned, otherwise a new one is + * allocated and returned. + */ + DirectoryPointer add_subdirectory(Directory &parent, + const std::string &name) + { + auto it = parent.children.find(name); + if (it != parent.children.end()) { + if (std::holds_alternative( + it->second)) { + return std::get(it->second); + } + throw std::invalid_argument( + "Directory is a regular file"); + } + auto newDirectory = std::make_shared(); + parent.children[name] = newDirectory; + inodes[newDirectory->inode] = newDirectory; + return newDirectory; + } + + /** + * Add a file in the specified directory. + */ + void add_file(Directory &parent, const std::string &name, uint32_t size, + uint16_t selector) + { + parent.children[name] = File { size, selector }; + inodes[selector] = parent.children[name]; + } + + /** + * Returns true if the inode is a directory inode, false if not. This + * does not require any file or directory to actually exist for this + * inode number. + */ + bool is_directory(uint64_t inode) + { + return (inode == FUSE_ROOT_ID) || + inode >= Directory::FirstDirectoryInode; + } + + /** + * Root directory. + */ + DirectoryPointer root = std::make_shared(FUSE_ROOT_ID); + + /** + * Map from inode number to the object that they refer to. + */ + std::unordered_map inodes; + + /** + * The total number of files in this filesystem. + */ + uint16_t numberOfFiles = 0; + + /** + * The total number of bytes in this filesystem. + */ + uint32_t totalSize = 0; + + /** + * Buffers that cache the contents of files. We currently never + * invalidate these because the interface is not used to deliver very + * large files. The filesystem can be unmounted and remounted to clear + * the cache. + * + * If this is a problem, it's easy to add some cache invalidation later. + */ + std::unordered_map fileCaches; + + /** + * Time (in seconds) when the filesystem was mounted. All files are + * treated as being created at that time. 
+ */ + const uint32_t timeS; + + /** + * File descriptor for the QEMU FWCFG device. + */ + int qemuFWCfgFD; + + /** + * The GID used for files in this filesystem + */ + gid_t defaultGid; + + /** + * The UID used for files in this filesystem + */ + uid_t defaultUid; + + /** + * The mode for directories in this filesystem. + */ + mode_t defaultDirectoryMode; + + /** + * The mode for files in this filesystem. + */ + mode_t defaultFileMode; + + public: + /** + * Constructor. Reads the catalog from the device and prepares the + * filesystem structure. + */ + QemuFWCfgFuseFS(const char *devicePath, gid_t defaultGid, + uid_t defaultUid, mode_t defaultDirectoryMode, + mode_t defaultFileMode) + : timeS(time(nullptr)) + , defaultGid(defaultGid) + , defaultUid(defaultUid) + , defaultDirectoryMode(defaultDirectoryMode) + , defaultFileMode(defaultFileMode) + { + // Open the qemufwcfg device. This can fail if this filesystem + // is already mounted: it is designed for a single userspace + // consumer. + qemuFWCfgFD = open(devicePath, O_RDWR); + if (qemuFWCfgFD < 0) { + throw std::system_error(errno, std::generic_category(), + "Failed to open qemufwcfg device"); + } + // Set the selector to the index for the well-known blob + // containing the catalogue. + uint16_t selector = FW_CFG_FILE_DIR; + ioctl(qemuFWCfgFD, FWCFGIO_SET_INDEX, &selector); + // Read the number of entries (big endian). + uint32_t count; + if (int ret = read(qemuFWCfgFD, &count, sizeof(count)); + ret != sizeof(count)) { + throw std::system_error(errno, std::generic_category(), + "Failed to read number of entries in qemufwcfg device"); + } + debug_message("Found {} firmware entries", count); + count = ntohl(count); + numberOfFiles = count; + // Read each entry and build the required directory structure. 
+ for (uint32_t i = 0; i < count; i++) { + FWCfgFile file; + read(qemuFWCfgFD, &file, sizeof(file)); + debug_message("File name: {}, size: {}, selector: {}", + file.name, ntohl(file.size), ntohs(file.selector)); + totalSize += ntohl(file.size); + std::string_view path { file.name }; + size_t nextSlash; + auto dir = root; + // If this name contains any slashes, construct a + // directory hierarchy leading up to the directory + // containing the file. + while ((nextSlash = path.find('/')) != + std::string_view::npos) { + std::string pathComponent { path.substr(0, + nextSlash) }; + dir = add_subdirectory(*dir, pathComponent); + path = path.substr(nextSlash + 1); + } + // Insert the file into the directory. + add_file(*dir, std::string { path }, ntohl(file.size), + ntohs(file.selector)); + } + // Insert the root directory into the inodes map. + inodes[FUSE_ROOT_ID] = root; + } + + /** + * Implement stat functionality. + */ + ErrorOr fuse_getattr(const fuse_in_header &header, + const fuse_getattr_in &attrIn) + { + uint64_t inode = header.nodeid; + debug_message("GetAttr flags: {}, inode: {}", + attrIn.getattr_flags, inode); + bool isDirectory = is_directory(inode); + // If this is a directory, the size is zero, otherwise look up + // the size. + uint64_t size = 0; + if (!isDirectory) { + size = std::get(inodes[inode]).size; + } + fuse_attr_out out; + memset(&out, 0, sizeof(out)); + // Read-only filesystem, make the cache timeout the distant + // future. + out.attr_valid = + std::numeric_limits::max() / 2; + out.attr_valid_nsec = 0; // 0x10000; + out.dummy = 0; + // Attributes + set_attrs(out.attr, inode, size, isDirectory); + return out; + } + + /** + * Read from a file. + * + * The underlying device does not support seeking and so this will read + * the entire file and cache it if `UseCache` is true, otherwise it will + * read all data from the device up to the requested point, discard it, + * and then read the requested part. 
+ */ + template + ErrorOr, Buffer>> + fuse_read_helper(const fuse_in_header &header, + const fuse_read_in &readIn) + { + debug_message( + "read {{ fh: {}, offset: {}, size: {}, read_flags: {}, lock_owner: {}, flags: {} }}", + readIn.fh, readIn.offset, readIn.size, readIn.read_flags, + readIn.lock_owner, readIn.flags); + auto &item = inodes[header.nodeid]; + if (!std::holds_alternative(item)) { + return EINVAL; + } + auto file = std::get(item); + Buffer out; + out.resize(readIn.size); + int ret = ioctl(qemuFWCfgFD, FWCFGIO_SET_INDEX, &file.selector); + if (ret != 0) { + throw std::system_error(errno, std::generic_category(), + "Failed to switch selector in qemufwcfg device"); + } + auto readToBuffer = [&](size_t size) { + Buffer buffer; + buffer.resize(size); + size_t readData = 0; + while (readData < size) { + debug_message("read({}, {}, {})", qemuFWCfgFD, + buffer.data() + readData, + buffer.size() - readData); + ssize_t result = read(qemuFWCfgFD, + buffer.data() + readData, + buffer.size() - readData); + debug_message("Read returned {}", result); + // FIXME!!!!! + if (result <= 0) { + if (errno == EAGAIN) { + continue; + } + throw std::system_error(errno, + std::generic_category(), + "Failed to read from qemufwcfg device"); + } + readData += result; + } + return buffer; + }; + if constexpr (UseCache) { + auto &cache = fileCaches[file.selector]; + if (cache.size() < file.size) { + debug_message( + "Reading {} bytes to populate cache", + file.size); + cache = (readToBuffer(file.size)); + } + debug_message("Returning subrange from cache"); + return std::ranges::subrange(cache.begin() + + readIn.offset, + cache.begin() + readIn.offset + readIn.size); + + } else { + if (readIn.offset > 0) { + debug_message( + "Reading {} bytes to get to offset", + readIn.offset); + readToBuffer(readIn.offset); + } + debug_message("Reading {} bytes to return", + readIn.size); + return readToBuffer(readIn.size); + } + } + + /** + * Interface for fuse_read called from the superclass. 
This cannot be + * the templated version because then `dispatch` is unable to infer the + * implicit template argument. + */ + auto fuse_read(const fuse_in_header &header, const fuse_read_in &readIn) + { + return fuse_read_helper(header, readIn); + } + + /** + * Read one or more directory entries. + */ + ErrorOr> + fuse_readdir(const fuse_in_header &header, const fuse_read_in &readIn) + { + debug_message( + "readdir {{ fh: {}, offset: {}, size: {}, read_flags: {}, lock_owner: {}, flags: {} }}", + readIn.fh, readIn.offset, readIn.size, readIn.read_flags, + readIn.lock_owner, readIn.flags); + auto &item = inodes[header.nodeid]; + if (!std::holds_alternative(item)) { + return EINVAL; + } + auto &directory = *std::get(item); + VariableSizeResponse &dirents = directory.direntCache; + if (dirents.empty()) { + auto roundUp8 = [](size_t size) { + return ((size + 8 - 1) / 8) * 8; + }; + // For some reason (to be debugged) the kernel doesn't + // like these if you give them the correct inode values, + // but is happy with -1 as a 32-bit integer. + // + // Normal dirents use 0 as the indicator of the position + // of the next one, but FUSE uses -1. This, like + // everything else about FUSE, is undocumented. + auto addDirent = [&](std::string_view name, + bool isLast = false, + uint32_t inode = 0xffff'ffff) { + auto initialSize = dirents.size(); + auto next = roundUp8( + sizeof(fuse_dirent) + name.size()); + dirents << fuse_dirent { inode, + isLast ? -1 : next, + static_cast(name.size()), 0 } + << name; + dirents.pad_to_alignment(8); + auto length = dirents.size() - initialSize; + debug_message( + "Added dirent at offset {}, next: {}", + initialSize, next); + assert(length == next); + }; + addDirent("."); + addDirent(".."); + if (directory.children.size() > 0) { + for (auto i : std::ranges::subrange( + directory.children.begin(), + --directory.children.end())) { + addDirent(i.first); + } + } + // Add the last entry. 
+ addDirent((--directory.children.end())->first, true); + } + debug_message("Dirents size: {}, number of entries: {}", + dirents.size(), directory.children.size()); + if (readIn.offset >= dirents.size()) { + debug_message("Writing no dirent for >0 offset {}", + readIn.offset); + return 0; + } + size_t size = std::min(readIn.size, + dirents.size() - readIn.offset); + return std::ranges::subrange(dirents.begin() + readIn.offset, + dirents.begin() + size); + } + + /** + * Look up a path component in a directory. + */ + ErrorOr fuse_lookup(const fuse_in_header &header, + const char *path) + { + // Find the directory from the inode. + auto &containingDirectory = inodes[header.nodeid]; + if (!std::holds_alternative( + containingDirectory)) { + return EINVAL; + } + + auto &directory = *std::get( + containingDirectory); + + // Find the entry in the directory. + std::string filename { path }; + auto &item = directory.children[filename]; + debug_message("Look up: {}", path); + + fuse_entry_out out; + memset(&out, 0, sizeof(out)); + out.nodeid = inode_for_filesystem_object(item); + out.generation = 0; + // Maximum possible timeout. We are an immutable filesystem. + out.entry_valid = + std::numeric_limits::max(); + out.attr_valid = + std::numeric_limits::max(); + out.entry_valid_nsec = 0; + out.attr_valid_nsec = 0; + uint64_t size = 0; + bool isDirectory = true; + if (std::holds_alternative(item)) { + size = std::get(item).size; + debug_message("File size is {}", size); + isDirectory = false; + } else { + debug_message("{} is a directory", path); + } + set_attrs(out.attr, inode_for_filesystem_object(item), size, + isDirectory); + return out; + } + + /** + * Set the fuse attributes for a file or directory given an inode number + * and size. 
+ */ + void set_attrs(fuse_attr &attr, uint64_t inode, uint64_t size, + bool isDirectory) + { + static constexpr uint64_t BlockSize = 512; + // Inode number + attr.ino = inode; + attr.size = size; + attr.blocks = size / BlockSize; + // Read-only filesystem, everything was created at the time when + // we mounted the filesystem. + attr.atime = attr.mtime = attr.ctime = timeS; + attr.atimensec = attr.mtimensec = attr.ctimensec = 0; + // Read-only + attr.mode = isDirectory ? (defaultDirectoryMode | S_IFDIR) : + (defaultFileMode | S_IFREG); + // No links on this filesystem, give everything a link count of + // one. + attr.nlink = 1; + attr.uid = defaultUid; + attr.gid = defaultGid; + attr.rdev = 0; + attr.blksize = BlockSize; + } + + /** + * Statfs. Return the number of files in the filesystem. + */ + ErrorOr fuse_statfs(const fuse_in_header &header) + { + fuse_statfs_out out = std::get( + FuseFS::fuse_statfs(header)); + out.st.files = numberOfFiles; + // Use the default block size + out.st.frsize = 512; + out.st.blocks = totalSize / 512; + out.st.namelen = sizeof(FWCfgFile::name); + return out; + } +}; + +} + +int +main(int argc, char **argv) +{ + // Is this a direct invocation, or via mount_fusefs? + bool directInvocation = (getenv("FUSE_DEV_FD") == nullptr); + + // Default configuration options. + const char *devicePath = "/dev/qemufwcfg"; + gid_t defaultGid = getgid(); + uid_t defaultUid = getuid(); + mode_t defaultDirectoryMode = S_IRUSR | S_IRGRP | S_IROTH | S_IXGRP | + S_IXOTH; + mode_t defaultFileMode = S_IRUSR | S_IRGRP | S_IROTH; + + // Parse command-line flags. 
+	const char *argv0 = argv[0];
+	int ch;
+	// Don't report illegal options, they are ones that we should forward to
+	// mount_fusefs
+	opterr = !directInvocation;
+	while ((ch = getopt(argc, argv, "F:g:M:m:u:h")) != -1) {
+		// Unknown options are assumed to be for mount_fusefs
+		if (directInvocation && (ch == '?')) {
+			optind--;
+			break;
+		}
+		switch (ch) {
+		case 'F':
+			devicePath = optarg;
+			break;
+		case 'm':
+			defaultFileMode = std::stoi(optarg, nullptr, 8) &
+			    ACCESSPERMS;
+			break;
+		case 'M':
+			defaultDirectoryMode = std::stoi(optarg, nullptr, 8) &
+			    ACCESSPERMS;
+			break;
+		case 'g':
+			defaultGid = std::stoi(optarg);
+			break;
+		case 'u':
+			defaultUid = std::stoi(optarg);
+			break;
+		case 'h':
+		default:
+			std::cerr
+			    << "Usage: " << argv[0]
+			    << " [-F path] [-g gid] [-M dir-mode] [-m file-mode] [-u uid] [fuse-options] node"
+			    << std::endl;
+			// Only an explicit request for help is a success; an
+			// unrecognised option is a usage error.
+			return (ch == 'h') ? EXIT_SUCCESS : EXIT_FAILURE;
+		}
+	}
+	argc -= optind;
+	argv += optind;
+
+	// If we are not invoked by mount_fusefs, exec mount_fusefs.
+	if (directInvocation) {
+		const char *mount_fusefs = "/sbin/mount_fusefs";
+		// These options are handed back to this program when
+		// mount_fusefs starts it as the daemon, so they must round-trip
+		// through the parser above: -M and -m are parsed as octal and
+		// are therefore written in octal, and each flag must carry its
+		// own value.
+		std::string daemonArgs = std::format(
+		    "-F {} -g {} -M {:o} -m {:o} -u {}", devicePath, defaultGid,
+		    defaultDirectoryMode, defaultFileMode, defaultUid);
+		std::vector args;
+		args.push_back(mount_fusefs);
+		args.push_back("auto");
+		args.push_back("-O");
+		args.push_back(daemonArgs.c_str());
+		args.push_back("-D");
+		args.push_back(argv0);
+		for (int i = 0; i < argc; i++) {
+			args.push_back(argv[i]);
+		}
+		args.push_back(nullptr);
+		execv(mount_fusefs, const_cast(args.data()));
+		// execv only returns on failure; say why before giving up.
+		std::cerr << "Failed to exec " << mount_fusefs << ": "
+			  << std::generic_category().message(errno)
+			  << std::endl;
+		return EXIT_FAILURE;
+	}
+
+	QemuFWCfgFuseFS fs(devicePath, defaultGid, defaultUid,
+	    defaultDirectoryMode, defaultFileMode);
+#ifndef WITHOUT_CAPSICUM
+	// Close standard in and out, restrict the rights on standard error to
+	// output (no ioctls, no read, and so on).
+ close(STDIN_FILENO); + close(STDOUT_FILENO); + cap_rights_t setrights; + cap_rights_init(&setrights, CAP_WRITE); + cap_rights_limit(STDERR_FILENO, &setrights); + cap_enter(); +#endif + try { + fs.run(); + } catch (std::exception &e) { + std::cerr << "QEMU Firmware Filesystem failed: " << e.what() + << std::endl; + } +} diff --git a/sbin/mount_qemufwcfg/tinyfuse.hh b/sbin/mount_qemufwcfg/tinyfuse.hh new file mode 100644 --- /dev/null +++ b/sbin/mount_qemufwcfg/tinyfuse.hh @@ -0,0 +1,704 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2024 David Chisnall + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#pragma once +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * Concept to differentiate between values that are provided as variable-sized + * objects. + */ +template +concept IsVariableSizedFuseStructure = + requires(U a) { + { + a.data() + } -> std::convertible_to; + { + a.size() + } -> std::convertible_to; + }; + +namespace { + +/** + * CRTP class for a fuse filesystems. + * + * This allows tiny implementations of FUSE filesystems. + * + * It is intended for simplicity and not performance. If you care about + * performance, libfuse provides a rich set of interfaces for writing + * multithreaded FUSE filesystems. + * + * This interface currently supports read-only filesystems. It may be extended + * in the future for read-write ones. + * + * The `T` template parameter is the class that inherits from this. The + * `Debug` parameter controls whether (very!) verbose logging is enabled. + * + * Users subclass this class and implement methods for handling each FUSE + * request. NOTE: The return types of the overridden methods do *NOT* have to + * match the ones in this class. It is perfectly fine to return non-owning + * wrappers, for example, in places where an owning object is returned in the + * default case. + */ +template class FuseFS { + protected: + /** + * Helper type for buffers of bytes. + */ + using Buffer = std::vector; + + /** + * Error wrapper. Most FUSE commands return either a value appended to + * the response, or an error code in the response header. + */ + template using ErrorOr = std::variant; + + /** + * Log a debug message to standard output if compiled in debug mode, + * otherwise do nothing. 
+	 */
+	template
+	void debug_message(std::string_view formatString, Args &&...args)
+	{
+		if constexpr (Debug) {
+			std::cerr << std::vformat(formatString,
+			    std::make_format_args(args...))
+				  << std::endl;
+		}
+	}
+
+    private:
+	/**
+	 * The file descriptor for /dev/fuse
+	 */
+	int fd = -1;
+
+	/**
+	 * The maximum size of a response.
+	 */
+	static constexpr size_t MaxReadSize = 128 * 1024;
+
+	/**
+	 * The maximum size of a write (the largest size of a message body from
+	 * the kernel to the daemon).
+	 */
+	static constexpr size_t MaxWriteSize = 4 * 1024;
+
+	/**
+	 * Read a single message from the FUSE device and return it as a header
+	 * and a buffer containing the variable-sized portion.
+	 *
+	 * Retries on EAGAIN and EINTR. Throws `std::system_error` on any
+	 * other read failure, and on a read that returns fewer bytes than a
+	 * complete header.
+	 */
+	std::pair> read_message()
+	{
+		fuse_in_header header;
+		Buffer message;
+		message.resize(MaxWriteSize);
+		// The fuse protocol is horrible. Each message must be read as
+		// a single atomic transaction, but you don't know the size
+		// until you've read the header and so we must reserve the
+		// maximum space even though most messages are a few bytes.
+		// If you read too little, the device returns ENODEV and then no
+		// future reads succeed.
+		while (true) {
+			iovec iov[] = { { &header, sizeof(header) },
+				{ message.data(), message.size() } };
+			debug_message("readv iov[0] {{ {}, {} }}",
+			    iov[0].iov_base, iov[0].iov_len);
+			debug_message("readv iov[1] {{ {}, {} }}",
+			    iov[1].iov_base, iov[1].iov_len);
+
+			ssize_t ret = readv(fd, iov, 2);
+			if (ret >= static_cast<ssize_t>(sizeof(header))) {
+				message.resize(ret - sizeof(header));
+				return { header, std::move(message) };
+			}
+			if (ret < 0) {
+				// errno is only meaningful for -1 returns.
+				if (errno == EAGAIN || errno == EINTR) {
+					continue;
+				}
+				throw std::system_error(errno,
+				    std::generic_category(),
+				    "Failed to read from FUSE device");
+			}
+			// Fewer bytes than a header: subtracting sizeof(header)
+			// would wrap around, and errno is stale here, so fail
+			// explicitly instead.
+			throw std::system_error(EIO, std::generic_category(),
+			    "Short read from FUSE device");
+		}
+	}
+
+	/**
+	 * Write an error response to the FUSE daemon. This takes the request's
+	 * unique ID and the error code as arguments.
+	 */
+	void write_response_error(uint64_t unique, int error)
+	{
+		fuse_out_header out;
+		out.len = sizeof(out);
+		// Ensure that the error value is negative.
FUSE uses the Linux + // syscall convention where errno is always negative. + out.error = -std::abs(error); + out.unique = unique; + debug_message("Writing error {}", error); + ssize_t ret = write(fd, &out, sizeof(out)); + if (ret < 0) { + throw std::system_error(errno, std::generic_category(), + "Failed to write to FUSE device"); + } + } + + /** + * Write a non-error response. The body can be: + * + * - A `NoResult` tag type, in which case only the header is sent. + * - A (reference to an) object that conforms to the + * `IsVariableSizedFuseStructure` concept, in which case the `data()` + * and `size()` members are used to extract the payload. + * - Any other object type, in which case the object is sent as-is. + */ + template void write_response(uint64_t unique, U &body) + { + // Type of the body, excluding qualifiers. + using Body = std::remove_cvref_t; + // Is the body type something other than the no-result tag type? + constexpr bool HasBody = !std::is_same_v; + + // Construct the header + fuse_out_header out; + out.error = 0; + out.unique = unique; + // The size is initially just the header size. This field is + // updated if a payload is attached. + out.len = sizeof(out); + + // Prepare the iovec. + iovec iov[] = { { &out, sizeof(out) }, + { &body, sizeof(body) } }; + + // If this is a variable-sized structure, query it for the data + // to send. + if constexpr (IsVariableSizedFuseStructure) { + iov[1] = { body.data(), body.size() }; + out.len += body.size(); + } else if constexpr (HasBody) { + out.len += sizeof(body); + } + + // Helper to print in the normal hex dump format. This makes it + // easy to compare the messages sent to the kernel against other + // implementations (via ktrace). 
+ auto print_hex = [](void *ptr, size_t length) { + if constexpr (Debug) { + uint8_t *p = (uint8_t *)ptr; + for (size_t i = 0; i < length; i++) { + std::cerr << std::vformat("{:02x}", + std::make_format_args(p[i])); + if (i % 2 == 1) { + std::cerr << ' '; + } + } + std::cerr << std::endl; + } + }; + debug_message("Writing response {}, size: {}", unique, out.len); + debug_message("iov[0] {{ {}, {} }}", iov[0].iov_base, + iov[0].iov_len); + print_hex(iov[0].iov_base, iov[0].iov_len); + if (HasBody) { + debug_message("iov[1] {{ {}, {} }}", iov[1].iov_base, + iov[1].iov_len); + print_hex(iov[1].iov_base, iov[1].iov_len); + } + + // We need to use writev because the FUSE device requires + // messages to be written as a single transaction. + ssize_t ret = writev(fd, iov, HasBody ? 2 : 1); + // FUSE messages are written as a single atomic transaction, + // they will never partially fail, the entire write will fail if + // the FUSE device does not accept the message. DTrace is your + // friend if this happens. + if (ret != out.len) { + throw std::system_error(errno, std::generic_category(), + "Failed to write to FUSE device"); + } + } + + /** + * Helper to extract the argument of a callback. This version is never + * instantiated, only the specialisation is. + */ + template struct CallbackArgumentType { }; + + /** + * Specialisation for the method pointers that call the function. + */ + template + struct CallbackArgumentType ( + Base::*)(const fuse_in_header &, ArgumentType)> { + /// The type of the argument. + using Argument = std::remove_cvref_t; + /// The result type. + using Result = ErrorOr; + /// The result type removing the `ErrorOr` wrapper. + using SuccessResultType = ResultType; + }; + + /** + * Specialisation for members that don't take a message-specific + * argument type (used for messages where the header is the only + * message). + */ + template + struct CallbackArgumentType (Base::*)( + const fuse_in_header &)> { + /// The type of the argument. 
+ using Argument = void; + /// The result type. + using Result = ErrorOr; + /// The result type removing the `ErrorOr` wrapper. + using SuccessResultType = ResultType; + }; + + /** + * Dispatch a message to the handler given by `function`. This must be + * a member pointer for `T`. The body will be coerced to the type that + * this function expects and then the response sent back to the kernel. + * The return type is expected to be an `ErrorOr` wrapper around the + * real return result. If this contains the error code, that will be + * reported as an error, otherwise the payload will be returned. + * + * If the return type is the `NoResponse` tag type, no response is sent + * (not even the header). + */ + void dispatch(auto function, const fuse_in_header header, + const Buffer body) + { + // Extract the argument and result types from the pointer + using Argument = + typename CallbackArgumentType::Argument; + using Result = + typename CallbackArgumentType::Result; + + // Call the callback and get the result + Result result; + if constexpr (std::is_same_v) { + // If this doesn't take an extra argument, use the + // one-argument version. + result = ((static_cast(this))->*function)(header); + } else if constexpr (std::is_pointer_v) { + // If the argument is a pointer, pass it directly. + result = ((static_cast(this))->*function)(header, + reinterpret_cast(body.data())); + } else { + // If the argument type is not a pointer, convert the + // pointer to a reference of the correct type and pass + // that. + const Argument &argument = + *reinterpret_cast(body.data()); + result = ((static_cast(this))->*function)(header, + argument); + } + // Write the response back to the kernel. 
+ auto unique = header.unique; + std::visit( + [unique, this](auto &&arg) { + if constexpr (std::is_same_v>) { + write_response_error(unique, arg); + } else if (!std::is_same_v>) { + write_response(unique, arg); + } + }, + result); + } + + public: + /** + * Tag type for results where the FUSE response header is returned to + * the kernel with no additional data. + */ + struct NoResult { }; + + /** + * Tag type for when no response should be given for a FUSE message. + */ + struct NoResponse { }; + + /** + * Helper for building variable-sized responses. + */ + struct VariableSizeResponse : Buffer { + /** + * Add an arbitrary object to this buffer. + */ + template + VariableSizeResponse &operator<<(const U &object) + requires(std::is_trivially_copyable_v) + { + const uint8_t *start = + reinterpret_cast(&object); + const uint8_t *end = start + sizeof(U); + Buffer::insert(Buffer::end(), start, end); + return *this; + } + + /** + * Add a string view to this buffer. + */ + VariableSizeResponse &operator<<(std::string_view string) + { + Buffer::insert(Buffer::end(), string.begin(), + string.end()); + return *this; + } + + /** + * Insert padding to ensure alignment (at the end, `size()` will + * return a multiple of `align`). + */ + void pad_to_alignment(size_t align) + { + while (size() % align != 0) { + push_back(0); + } + } + }; + + /** + * Constructor. Takes ownership of the FUSE device descriptor that + * `mount_fusefs` passes in the `FUSE_DEV_FD` environment variable. + */ + FuseFS() + { + const char *fdName = getenv("FUSE_DEV_FD"); + if ((fdName == nullptr) || (*fdName == '\0')) { + throw std::invalid_argument("FUSE_DEV_FD not set"); + } + fd = std::stoi(fdName); + } + + /** + * Destructor. + */ + ~FuseFS() + { + if (fd != -1) { + close(fd); + } + } + + /** + * Default handler for FUSE_INIT messages. Sets some sensible defaults + * for the connection. You can override this and either call this and + * modify the result, or replace it entirely. 
+ */ + ErrorOr fuse_init(const fuse_in_header &, + const fuse_init_in &initIn) + { + // If this is a very old FUSE version, give up. + if ((initIn.major < 7) || + ((initIn.major == 7) && (initIn.minor < 13))) { + return ENOSYS; + } + fuse_init_out reply; + // Make sure any new fields are zero. + memset(&reply, 0, sizeof(reply)); + reply.major = FUSE_KERNEL_VERSION; + reply.minor = FUSE_KERNEL_MINOR_VERSION; + reply.max_readahead = MaxReadSize; + reply.congestion_threshold = 100; + reply.max_background = 100; + reply.max_write = MaxWriteSize; + reply.time_gran = 1; + // If the subclass doesn't override fuse_open and fuse_opendir, + // ask the kernel not to call them. + if constexpr (&T::fuse_open == &FuseFS::fuse_open) { + reply.flags |= FUSE_NO_OPEN_SUPPORT; + } + if constexpr (&T::fuse_opendir == + &FuseFS::fuse_opendir) { + reply.flags |= FUSE_NO_OPENDIR_SUPPORT; + } + debug_message("Initialised FUSE connection!"); + return reply; + } + + /** + * Handler for FUSE_GETATTR messages. Should be overridden by + * subclasses. + */ + ErrorOr fuse_getattr(const fuse_in_header &, + const fuse_getattr_in &) + { + return ENOSYS; + } + + /** + * Handler for FUSE_OPEN messages. Should be overridden by subclasses. + */ + ErrorOr fuse_open(const fuse_in_header &, + const fuse_open_in &) + { + return ENOSYS; + } + + /** + * Handler for FUSE_OPENDIR messages. Should be overridden by + * subclasses. + */ + ErrorOr fuse_opendir(const fuse_in_header &, + const fuse_open_in &) + { + return ENOSYS; + } + + /** + * Handler for FUSE_ACCESS messages. This can return an error if access + * is not allowed, otherwise the VFS layer will perform normal access + * checks. + */ + ErrorOr fuse_access(const fuse_in_header &, + const fuse_access_in &) + { + // Allow anything that VFS is happy with. + return ENOSYS; + } + + /** + * Handler for FUSE_RELEASE messages. Should be overridden by + * subclasses that store any state per file descriptor. 
+ */ + ErrorOr fuse_release(const fuse_in_header &, + const fuse_release_in &) + { + return NoResult {}; + } + + /** + * Handler for FUSE_RELEASEDIR messages. Should be overridden by + * subclasses that store any state per directory descriptor. + */ + ErrorOr fuse_releasedir(const fuse_in_header &, + const fuse_release_in &) + { + return NoResult {}; + } + + /** + * Handler for FUSE_FORGET messages. Can be overridden by subclasses. + */ + ErrorOr fuse_forget(const fuse_in_header &, + const fuse_forget_in &) + { + return NoResponse {}; + } + + /** + * Handler for FUSE_READDIR messages. Should be overridden by + * subclasses. + * + * The result is a sequence of `fuse_dirent` structures, followed by the + * name of the entry, padded to an 8-byte boundary. + */ + ErrorOr fuse_readdir(const fuse_in_header &, + const fuse_read_in &) + { + return ENOSYS; + } + + /** + * Handler for FUSE_READ messages. Should be overridden by subclasses. + */ + ErrorOr fuse_read(const fuse_in_header &, + const fuse_read_in &) + { + return ENOSYS; + } + + /** + * Handler for FUSE_LOOKUP messages. Should be overridden by + * subclasses. + */ + ErrorOr fuse_lookup(const fuse_in_header &, + const char *) + { + return ENOSYS; + } + + /** + * Handler for FUSE_FLUSH messages. Can be overridden by subclasses. + * + * The default implementation instructs the kernel to not send any + * future flush requests. + */ + ErrorOr fuse_flush(const fuse_in_header &, + const fuse_flush_in &) + { + return ENOSYS; + } + + /** + * Handler for FUSE_SETATTR messages. This is needed even for read-only + * filesystems unless they are mounted with noatime. The default + * implementation returns ENOSYS and does not produce any attributes, + * so subclasses that need to accept attribute updates (for example, + * atime changes on a filesystem mounted without noatime) must override + * this handler. 
+ */ + ErrorOr fuse_setattr(const fuse_in_header &, + const fuse_setattr_in &) + { + return ENOSYS; + } + + /** + * Default statfs. Sets some defaults. Can be overridden and this + * version can be called to provide some defaults that a filesystem then + * modifies. + */ + ErrorOr fuse_statfs(const fuse_in_header &) + { + fuse_statfs_out out; + memset(&out, 0, sizeof(out)); + out.st.blocks = 1; + out.st.bfree = 0; + out.st.bavail = 0; + out.st.files = 0; + out.st.ffree = 0; + // Default block size + out.st.bsize = 512; + out.st.namelen = PATH_MAX; + out.st.frsize = 0; + return out; + } + + /** + * Default block map handler. Returns ENOSYS; no block mapping is done. + */ + ErrorOr fuse_bmap(const fuse_in_header &, + const fuse_bmap_in &) + { + return ENOSYS; + } + + /** + * Enter a run loop, waiting for kernel messages and posting responses. + * + * When a FUSE message is received from the kernel, this calls the + * corresponding handler in the subclass (or this class if none is + * provided in the subclass). 
+ */ + void run() + { + debug_message("Starting FUSE FS"); + bool destroy = false; + while (!destroy) { + auto [header, body] = read_message(); + + debug_message( + "Message: {{ opcode: {}, length: {}, unique: {}, nodeid: {}, uid: {}, gid: {}, pid: {} }}", + header.opcode, header.len, header.unique, + header.nodeid, header.uid, header.gid, header.pid); + switch (header.opcode) { + default: + debug_message( + "Unhandled message with opcode {}", + header.opcode); + write_response_error(header.unique, ENOSYS); + break; + case FUSE_INIT: + dispatch(&T::fuse_init, header, body); + break; + case FUSE_GETATTR: + dispatch(&T::fuse_getattr, header, body); + break; + case FUSE_OPEN: + dispatch(&T::fuse_open, header, body); + break; + case FUSE_OPENDIR: + dispatch(&T::fuse_opendir, header, body); + break; + case FUSE_ACCESS: + dispatch(&T::fuse_access, header, body); + break; + case FUSE_READDIR: + dispatch(&T::fuse_readdir, header, body); + break; + case FUSE_READ: + dispatch(&T::fuse_read, header, body); + break; + case FUSE_FORGET: + dispatch(&T::fuse_forget, header, body); + break; + case FUSE_RELEASEDIR: + dispatch(&T::fuse_releasedir, header, body); + break; + case FUSE_RELEASE: + dispatch(&T::fuse_release, header, body); + break; + case FUSE_LOOKUP: + dispatch(&T::fuse_lookup, header, body); + break; + case FUSE_FLUSH: + dispatch(&T::fuse_flush, header, body); + break; + case FUSE_SETATTR: + dispatch(&T::fuse_setattr, header, body); + break; + case FUSE_STATFS: + dispatch(&T::fuse_statfs, header, body); + break; + case FUSE_BMAP: + dispatch(&T::fuse_bmap, header, body); + break; + case FUSE_DESTROY: + // When we receive a destroy message, this + // filesystem has been unmounted. This doesn't + // need a response, just close the device and + // exit the run loop. 
+ close(fd); + fd = -1; + destroy = true; + break; + } + } + } +}; + +} diff --git a/sys/dev/ic/qemufwcfgio.h b/sys/dev/ic/qemufwcfgio.h new file mode 100644 --- /dev/null +++ b/sys/dev/ic/qemufwcfgio.h @@ -0,0 +1,42 @@ +/* $NetBSD: qemufwcfgio.h,v 1.1 2017/11/25 16:31:03 jmcneill Exp $ */ + +/*- + * Copyright (c) 2017 Jared McNeill + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _QEMUFWCFGIO_H +#define _QEMUFWCFGIO_H + +#include + +/* Fixed selector keys */ +#define FW_CFG_SIGNATURE 0x0000 /* Signature */ +#define FW_CFG_ID 0x0001 /* Revision / feature bitmap */ +#define FW_CFG_FILE_DIR 0x0019 /* File directory */ +#define FW_CFG_FILE_FIRST 0x0020 /* First file in directory */ + +#define FWCFGIO_SET_INDEX _IOW('q', 0, uint16_t) + +#endif /* !_QEMUFWCFGIO_H */ diff --git a/sys/dev/qemufwcfg/qemufwcfg.5 b/sys/dev/qemufwcfg/qemufwcfg.5 new file mode 100644 --- /dev/null +++ b/sys/dev/qemufwcfg/qemufwcfg.5 @@ -0,0 +1,77 @@ +.\" +.\" SPDX-License-Identifier: BSD-2-Clause +.\" +.\" Copyright (c) 2023 David Chisnall +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. 
+.Dd August 17, 2023 +.Dt QEMUFWCFG 5 +.Os +.Sh NAME +.Nm qemufwcfg +.Nd "QEMU Firmware Config client interface" +.Sh SYNOPSIS +.Pp +To load as a loadable kernel module: +.Pp +.Dl "kldload qemufwcfg" +.Sh DESCRIPTION +The +.Nm +driver implements the client driver for QEMU's firmware configuration interface. +.Pp +This interface allows QEMU, or a compatible emulator or hypervisor such as +.Xr bhyve 8 , +to provide configuration files via a simple interface. +The core interface is a simple (read-only) key-value store that uses 16-bit integers as keys and byte streams as values. +.Pp +This driver is intended to be used in conjunction with a single userspace client, such as the +.Xr mount_qemufwcfg 8 +FUSE filesystem. +.Pp +The driver supports a single command via the +.Xr ioctl 2 +system call: +.Dv FWCFGIO_SET_INDEX . +This takes a 16-bit integer as the argument and selects the specified entry in the host. +Subsequent +.Xr read 2 +calls will read the byte stream associated with that selector. +.Sh SEE ALSO +.Xr mount_qemufwcfg 8 +.Rs +.%T QEMU Firmware Configuration (fw_cfg) Device specification +.%U https://www.qemu.org/docs/master/specs/fw_cfg.html +.Re +.Sh HISTORY +The +.Nm qemufwcfg +driver first appeared in NetBSD 9.0. +The +.Fx +implementation, which used the NetBSD version as a reference, first appeared in +.Fx 14.0 . +.Sh AUTHORS +The +.Nm qemufwcfg +driver was originally written by +.An David Chisnall . diff --git a/sys/dev/qemufwcfg/qemufwcfg.c b/sys/dev/qemufwcfg/qemufwcfg.c new file mode 100644 --- /dev/null +++ b/sys/dev/qemufwcfg/qemufwcfg.c @@ -0,0 +1,389 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 David Chisnall + * + * This file was created using the NetBSD implementation as reference and so + * may be a derived work of the NetBSD implementation: + * Copyright (c) 2017 Jared McNeill + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/** + * This provides a NetBSD-compatible qemufwcfg device. + * + * The only intended consumer of this is a FUSE filesystem that exports the + * firmware configuration to userspace. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +// clang-format off +// Clang-format puts these in the wrong order. 
+#include +#include +#include +// clang-format on + +// Forward declarations +static int qemufwcfg_probe(device_t); +static int qemufwcfg_attach(device_t); +static int qemufwcfg_detach(device_t); +static int qemufwcfg_open(struct cdev *dev, int oflags, int devtype, + struct thread *td); +static int qemufwcfg_close(struct cdev *dev, int flags, int fmt, + struct thread *td); +static int qemufwcfg_ioctl(struct cdev *dev, u_long cmd, caddr_t data, + int fflag, struct thread *td); +static int qemufwcfg_read(struct cdev *dev, struct uio *uio, int ioflag); + +/** + * Device attachment methods. + */ +static device_method_t qemufwcfg_methods[] = { + /* Methods from the device interface */ + DEVMETHOD(device_probe, qemufwcfg_probe), + DEVMETHOD(device_attach, qemufwcfg_attach), + DEVMETHOD(device_detach, qemufwcfg_detach), + /* Terminate method list */ + DEVMETHOD_END +}; + +/** + * Character device methods. + */ +static struct cdevsw qemufwcfg_cdevsw = { + .d_version = D_VERSION, + .d_name = "qemufwcfg", + .d_open = qemufwcfg_open, + .d_close = qemufwcfg_close, + .d_read = qemufwcfg_read, + .d_ioctl = qemufwcfg_ioctl, +}; + +/** + * State for this device. + */ +struct qemufwcfg_softc { + /// I/O type, MMIO or I/O port + int io_type; + /// Resource id, filled in by `bus_alloc_resource_any`. + int resource_id; + /// The resource for this device, from ACPI. May be I/O ports on x86 + /// or MMIO. + struct resource *res; + /// Cached copy of the bus space tag from `res`. + bus_space_tag_t tag; + /// Cached copy of the bus space handle from `res`. + bus_space_handle_t handle; + /// Character device node. + struct cdev *cdev; + /// Mutex protecting this structure. This is used to protect `is_open`. + struct mtx mutex; + /// Flag indicating that this is open, protects against concurrent + /// access. + bool is_open; +}; + +/** + * Driver configuration. 
+ */ +static driver_t qemufwcfg_driver = { "qemufwcfg", qemufwcfg_methods, + sizeof(struct qemufwcfg_softc) }; + +DRIVER_MODULE(qemufwcfg, acpi, qemufwcfg_driver, NULL, NULL); + +/* + * The selector reserved for checking that this is the correct + * interface. + */ +const int SignatureSelector = 0; + +/** + * Helper. Writes the specified selector value to the device. + */ +static void +write_selector(struct qemufwcfg_softc *sc, uint16_t index) +{ + // The offset from the base I/O port when using I/O ports + const int SelectorPortOffset = 0x0; + // The offset from the base MMIO address, when using MMIO. + const int SelectorMMIOOffset = 0x8; + int offset; + + if (sc->io_type == SYS_RES_IOPORT) { + // I/O port mode uses little endian for the selector + index = htole16(index); + offset = SelectorPortOffset; + } else { + // MMIO mode uses big endian for the selector + index = htobe16(index); + offset = SelectorMMIOOffset; + } + // Write the selector value. + bus_space_write_2(sc->tag, sc->handle, offset, index); +} + +/** + * Helper. Returns the offset of the data register for the active I/O type. + */ +static int +offset_for_data(struct qemufwcfg_softc *sc) +{ + // The offset from the base I/O port when using I/O ports + const int DataPortOffset = 0x1; + // The offset from the base MMIO address, when using MMIO. + const int DataMMIOOffset = 0x0; + return ( + (sc->io_type == SYS_RES_IOPORT) ? DataPortOffset : DataMMIOOffset); +} + +/** + * Probe hook. Checks that the ACPI node exists and matches HID QEMU0002. + */ +static int +qemufwcfg_probe(device_t dev) +{ + ACPI_HANDLE h; + + if ((h = acpi_get_handle(dev)) == NULL) + return (ENXIO); + + if (!acpi_MatchHid(h, "QEMU0002")) + return (ENXIO); + + return (0); +} + +/** + * Attach to the device. This performs a read on the signature to ensure that + * this really is the right kind of device. + */ +static int +qemufwcfg_attach(device_t dev) +{ + struct qemufwcfg_softc *sc = device_get_softc(dev); + + // Try to configure the memory space. 
The device can use I/O ports on + // x86, memory elsewhere. + if (bus_get_resource(dev, SYS_RES_IOPORT, 0, NULL, NULL) == 0) { + sc->io_type = SYS_RES_IOPORT; + } else if (bus_get_resource(dev, SYS_RES_MEMORY, 0, NULL, NULL) == 0) { + sc->io_type = SYS_RES_MEMORY; + } else { + device_printf(dev, "Unknown resource type\n"); + return (ENXIO); + } + + sc->res = bus_alloc_resource_any(dev, sc->io_type, &sc->resource_id, + RF_ACTIVE); + if (sc->res == NULL) { + device_printf(dev, "Failed to allocate bus resource\n"); + return (ENXIO); + } + + // Cache the tag and handle so we don't have to keep looking them up. + sc->tag = rman_get_bustag(sc->res); + sc->handle = rman_get_bushandle(sc->res); + + write_selector(sc, SignatureSelector); + + // Read 4 bytes from signature. + int offset = offset_for_data(sc); + char buf[4]; + bus_space_read_multi_1(sc->tag, sc->handle, offset, buf, sizeof(buf)); + + // Check that the signature value is correct. + static const char expected[] = "QEMU"; + _Static_assert(sizeof(expected) >= sizeof(buf), + "Expected value too small!"); + if (strncmp(buf, expected, sizeof(buf)) != 0) { + bus_release_resource(dev, sc->io_type, sc->resource_id, + sc->res); + sc->res = NULL; + device_printf(dev, + "Failed to attach, got <%c%c%c%c>, expected <QEMU>\n", + buf[0], buf[1], buf[2], buf[3]); + return (ENXIO); + } + + mtx_init(&sc->mutex, "qemufwcfg lock", NULL, MTX_DEF); + + // Create the device node. + struct make_dev_args args; + make_dev_args_init(&args); + args.mda_mode = 0400; + args.mda_devsw = &qemufwcfg_cdevsw; + args.mda_si_drv1 = sc; + args.mda_flags = MAKEDEV_WAITOK | MAKEDEV_CHECKNAME; + + int ret = make_dev_s(&args, &sc->cdev, "qemufwcfg"); + KASSERT(ret == 0, ("make_dev_s should not fail here")); + + return (ret); +} + +/** + * Detach from the device. Cleans up resources. 
+ */ +static int +qemufwcfg_detach(device_t dev) +{ + struct qemufwcfg_softc *sc = device_get_softc(dev); + destroy_dev(sc->cdev); + bus_release_resource(dev, sc->io_type, sc->resource_id, sc->res); + mtx_destroy(&sc->mutex); + return (0); +} + +/** + * Open. This device doesn't allow concurrent access so this fails with + * EBUSY if the device is already open. + */ +static int +qemufwcfg_open(struct cdev *dev, int oflags __unused, int devtype __unused, + struct thread *td __unused) +{ + struct qemufwcfg_softc *sc = dev->si_drv1; + int error = 0; + + mtx_lock(&sc->mutex); + if (sc->is_open) { + error = EBUSY; + } else { + sc->is_open = true; + } + mtx_unlock(&sc->mutex); + + return (error); +} + +/** + * Close the device. This just marks the device as not open to allow another + * userspace process to open it, it doesn't do any cleanup. + */ +static int +qemufwcfg_close(struct cdev *dev, int flags __unused, int fmt __unused, + struct thread *td __unused) +{ + struct qemufwcfg_softc *sc = dev->si_drv1; + int error = 0; + + mtx_lock(&sc->mutex); + if (!sc->is_open) { + error = EINVAL; + } else { + sc->is_open = false; + } + mtx_unlock(&sc->mutex); + + return (error); +} + +/** + * Ioctl handler. A single ioctl is supported, to set the selector. + */ +static int +qemufwcfg_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag __unused, + struct thread *td __unused) +{ + struct qemufwcfg_softc *sc = dev->si_drv1; + + switch (cmd) { + default: + return (ENOTTY); + case FWCFGIO_SET_INDEX: { + uint16_t index = *(uint16_t *)data; + write_selector(sc, index); + + return (0); + } + } +} + +/** + * Read. This reads the specified number of bytes from the currently + * configured selector. Seek is not supported (here or by the QEMU device), + * the only way of reading backwards is to reset to the beginning of a 'file' + * and read forwards. + * + * DMA is not currently used. 
For small files, the cost of pinning a buffer + * and passing a physical address out to the host would likely offset any + * speedup. We can read 8 bytes at a time and most files that we read are a + * handful of MMIO reads at this size. + */ +static int +qemufwcfg_read(struct cdev *dev, struct uio *uio, int ioflag __unused) +{ + struct qemufwcfg_softc *sc = dev->si_drv1; + + if (sc == NULL) { + return (ENXIO); + } + + int error = 0; + int offset = offset_for_data(sc); + +#if !(defined(__i386__) || defined(__amd64__)) + // If we're in MMIO mode, try reading 8 bytes at a time. This reduces + // the number of VM exits that we need by a factor of 8, which is + // probably premature optimisation given how rare reads on this device + // are, but was easy to do. + if (sc->io_type == SYS_RES_MEMORY) { + uint64_t buf[8]; + while ((uio->uio_resid > sizeof(buf[0])) && (error == 0)) { + size_t count = qmin(sizeof(buf), uio->uio_resid) / + sizeof(buf[0]); + bus_space_read_multi_8(sc->tag, sc->handle, offset, buf, + count); + error = uiomove(buf, count * sizeof(buf[0]), uio); + } + } +#endif + + while ((uio->uio_resid > 0) && (error == 0)) { + // Try copying 64 bytes at a time. If we're on a platform that + // supports MMIO then we should be copying at most 7 bytes here + // because we'll have read the rest via 8-byte reads. If we're + // using x86 IO Ports then we have to read one byte at at time. 
+ uint8_t buf[64]; + size_t count = qmin(sizeof(buf), uio->uio_resid); + bus_space_read_multi_1(sc->tag, sc->handle, offset, buf, count); + error = uiomove(buf, count, uio); + } + + return (error); +} diff --git a/sys/modules/Makefile b/sys/modules/Makefile --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -331,6 +331,7 @@ ${_qatfw} \ ${_qat_c2xxx} \ ${_qat_c2xxxfw} \ + qemufwcfg \ ${_qlxge} \ ${_qlxgb} \ ${_qlxgbe} \ diff --git a/sys/modules/qemufwcfg/Makefile b/sys/modules/qemufwcfg/Makefile new file mode 100644 --- /dev/null +++ b/sys/modules/qemufwcfg/Makefile @@ -0,0 +1,8 @@ +.PATH: ${SRCTOP}/sys/dev/qemufwcfg + +KMOD= qemufwcfg + +SRCS= qemufwcfg.c device_if.h bus_if.h acpi_if.h opt_acpi.h + +.include +