diff --git a/sys/compat/linuxkpi/common/include/linux/module.h b/sys/compat/linuxkpi/common/include/linux/module.h
index 2a4fdc5a11a9..25d775dd8df9 100644
--- a/sys/compat/linuxkpi/common/include/linux/module.h
+++ b/sys/compat/linuxkpi/common/include/linux/module.h
@@ -1,107 +1,108 @@
 /*-
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
  * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 #ifndef	_LINUX_MODULE_H_
 #define	_LINUX_MODULE_H_

 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include
 #include

 #define MODULE_AUTHOR(name)
 #define MODULE_DESCRIPTION(name)
 #define MODULE_LICENSE(name)
 #define MODULE_INFO(tag, info)
 #define MODULE_FIRMWARE(firmware)
 #define MODULE_SUPPORTED_DEVICE(name)

 #define THIS_MODULE	((struct module *)0)

 #define	__MODULE_STRING(x)	__stringify(x)

 /* OFED pre-module initialization */
 #define	SI_SUB_OFED_PREINIT	(SI_SUB_ROOT_CONF - 2)
 /* OFED default module initialization */
 #define	SI_SUB_OFED_MODINIT	(SI_SUB_ROOT_CONF - 1)

 #include

 static inline void
 _module_run(void *arg)
 {
 	void (*fn)(void);
 #ifdef OFED_DEBUG_INIT
 	char name[1024];
 	caddr_t pc;
 	long offset;

 	pc = (caddr_t)arg;
 	if (linker_search_symbol_name(pc, name, sizeof(name), &offset) != 0)
 		printf("Running ??? (%p)\n", pc);
 	else
 		printf("Running %s (%p)\n", name, pc);
 #endif
 	fn = arg;
 	fn();
 }

 #define	module_init(fn)							\
 	SYSINIT(fn, SI_SUB_OFED_MODINIT, SI_ORDER_FIRST, _module_run, (fn))

 #define	module_exit(fn)							\
 	SYSUNINIT(fn, SI_SUB_OFED_MODINIT, SI_ORDER_SECOND, _module_run, (fn))

 /*
  * The following two macros are a workaround for not having a module
  * load and unload order resolver:
  */
 #define	module_init_order(fn, order)					\
 	SYSINIT(fn, SI_SUB_OFED_MODINIT, (order), _module_run, (fn))

 #define	module_exit_order(fn, order)					\
 	SYSUNINIT(fn, SI_SUB_OFED_MODINIT, (order), _module_run, (fn))

 #define	module_get(module)
 #define	module_put(module)
 #define	try_module_get(module)	1

 #define	postcore_initcall(fn)	module_init(fn)

 #endif	/* _LINUX_MODULE_H_ */
diff --git a/sys/contrib/rdma/krping/krping_dev.c b/sys/contrib/rdma/krping/krping_dev.c
index 01927419fe33..eea3c772ea4f 100644
--- a/sys/contrib/rdma/krping/krping_dev.c
+++ b/sys/contrib/rdma/krping/krping_dev.c
@@ -1,231 +1,231 @@
 /*
  * This code lifted from:
  *	Simple `echo' pseudo-device KLD
  *	Murray Stokely
  *	Converted to 5.X by Søren (Xride) Straarup
  */

 /*
  * /bin/echo "server,port=9999,addr=192.168.69.142,validate" > /dev/krping
  * /bin/echo "client,port=9999,addr=192.168.69.142,validate" > /dev/krping
  */

 #include
 __FBSDID("$FreeBSD$");

 #include
+#include	/* defines used in kernel.h and module.h */
 #include
 #include	/* uprintf */
 #include
-#include	/* defines used in kernel.h */
 #include	/* types used in module initialization */
 #include	/* cdevsw struct */
 #include	/* uio struct */
 #include
 #include
 #include
 #include

 #include "krping.h"

 #define BUFFERSIZE 512

 SYSCTL_NODE(_dev, OID_AUTO, krping, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "kernel rping module");

 int krping_debug = 0;
 SYSCTL_INT(_dev_krping, OID_AUTO, debug, CTLFLAG_RW, &krping_debug, 0 , "");

 /* Function prototypes */
 static d_open_t      krping_open;
 static d_close_t     krping_close;
 static d_read_t      krping_read;
 static d_write_t     krping_write;
 static d_purge_t     krping_purge;

 /* Character device entry points */
 static struct cdevsw krping_cdevsw = {
 	.d_version = D_VERSION,
 	.d_open = krping_open,
 	.d_close = krping_close,
 	.d_read = krping_read,
 	.d_write = krping_write,
 	.d_purge = krping_purge,
 	.d_name = "krping",
 };

 typedef struct s_krping {
 	char msg[BUFFERSIZE];
 	int len;
 } krping_t;

 struct stats_list_entry {
 	STAILQ_ENTRY(stats_list_entry) link;
 	struct krping_stats *stats;
 };
 STAILQ_HEAD(stats_list, stats_list_entry);

 /* vars */
 static struct cdev *krping_dev;

 static int
 krping_loader(struct module *m, int what, void *arg)
 {
 	int err = 0;

 	switch (what) {
 	case MOD_LOAD:                /* kldload */
 		krping_dev = make_dev(&krping_cdevsw, 0, UID_ROOT, GID_WHEEL,
 		    0600, "krping");
 		printf("Krping device loaded.\n");
 		break;
 	case MOD_UNLOAD:
 		destroy_dev(krping_dev);
 		printf("Krping device unloaded.\n");
 		break;
 	default:
 		err = EOPNOTSUPP;
 		break;
 	}
 	return (err);
 }

 static int
 krping_open(struct cdev *dev, int oflags, int devtype, struct thread *p)
 {
 	return (0);
 }

 static int
 krping_close(struct cdev *dev, int fflag, int devtype, struct thread *p)
 {
 	return 0;
 }

 static void
 krping_copy_stats(struct krping_stats *stats, void *arg)
 {
 	struct stats_list_entry *s;
 	struct stats_list *list = arg;

 	s = malloc(sizeof(*s), M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (s == NULL)
 		return;
 	if (stats != NULL) {
 		s->stats = malloc(sizeof(*stats), M_DEVBUF, M_NOWAIT | M_ZERO);
 		if (s->stats == NULL) {
 			free(s, M_DEVBUF);
 			return;
 		}
 		*s->stats = *stats;
 	}
 	STAILQ_INSERT_TAIL(list, s, link);
 }

 static int
 krping_read(struct cdev *dev, struct uio *uio, int ioflag)
 {
 	int num = 1;
 	struct stats_list list;
 	struct stats_list_entry *e;

 	STAILQ_INIT(&list);
 	krping_walk_cb_list(krping_copy_stats, &list);

 	if (STAILQ_EMPTY(&list))
 		return (0);

 	uprintf("krping: %4s %10s %10s %10s %10s %10s %10s %10s %10s %10s\n",
 	    "num", "device", "snd bytes", "snd msgs", "rcv bytes", "rcv msgs",
 	    "wr bytes", "wr msgs", "rd bytes", "rd msgs");
 	while (!STAILQ_EMPTY(&list)) {
 		e = STAILQ_FIRST(&list);
 		STAILQ_REMOVE_HEAD(&list, link);
 		if (e->stats == NULL)
 			uprintf("krping: %d listen\n", num);
 		else {
 			struct krping_stats *stats = e->stats;

 			uprintf("krping: %4d %10s %10llu %10llu %10llu %10llu "
 			    "%10llu %10llu %10llu %10llu\n", num, stats->name,
 			    stats->send_bytes, stats->send_msgs,
 			    stats->recv_bytes, stats->recv_msgs,
 			    stats->write_bytes, stats->write_msgs,
 			    stats->read_bytes, stats->read_msgs);
 			free(stats, M_DEVBUF);
 		}
 		num++;
 		free(e, M_DEVBUF);
 	}
 	return (0);
 }

 static int
 krping_write(struct cdev *dev, struct uio *uio, int ioflag)
 {
 	int err = 0;
 	int amt;
 	int remain = BUFFERSIZE;
 	char *cp;
 	krping_t *krpingmsg;

 	krpingmsg = malloc(sizeof *krpingmsg, M_DEVBUF, M_WAITOK|M_ZERO);
 	if (!krpingmsg) {
 		uprintf("Could not malloc mem!\n");
 		return ENOMEM;
 	}

 	cp = krpingmsg->msg;
 	while (uio->uio_resid) {
 		amt = MIN(uio->uio_resid, remain);
 		if (amt == 0)
 			break;

 		/* Copy the string in from user memory to kernel memory */
 		err = uiomove(cp, amt, uio);
 		if (err) {
 			uprintf("Write failed: bad address!\n");
 			goto done;
 		}
 		cp += amt;
 		remain -= amt;
 	}

 	if (uio->uio_resid != 0) {
 		uprintf("Message too big. max size is %d!\n", BUFFERSIZE);
 		err = EMSGSIZE;
 		goto done;
 	}

 	/* null terminate and remove the \n */
 	cp--;
 	*cp = 0;
 	krpingmsg->len = (unsigned long)(cp - krpingmsg->msg);
 	uprintf("krping: write string = |%s|\n", krpingmsg->msg);

 	err = krping_doit(krpingmsg->msg);
 done:
 	free(krpingmsg, M_DEVBUF);
 	return(err);
 }

 static void
 krping_purge(struct cdev *dev __unused)
 {
 	krping_cancel_all();
 }

 int
 krping_sigpending(void)
 {
 	return (SIGPENDING(curthread));
 }

 DEV_MODULE(krping, krping_loader, NULL);
 MODULE_DEPEND(krping, ibcore, 1, 1, 1);
diff --git a/sys/dev/videomode/videomode.c b/sys/dev/videomode/videomode.c
index a1c7f0a82290..aa576d21b623 100644
--- a/sys/dev/videomode/videomode.c
+++ b/sys/dev/videomode/videomode.c
@@ -1,130 +1,131 @@
 /*	$FreeBSD$	*/

 /*
  * THIS FILE AUTOMATICALLY GENERATED.  DO NOT EDIT.
  *
  * generated from:
  *	NetBSD: modelines,v 1.9 2011/03/30 18:45:04 jdc Exp
  */

 #include
 __FBSDID("$FreeBSD$");

 #include
+#include
 #include
 #include

 MODULE_VERSION(videomode, 1);

 /*
  * These macros help the modelines below fit on one line.
*/ #define HP VID_PHSYNC #define HN VID_NHSYNC #define VP VID_PVSYNC #define VN VID_NVSYNC #define I VID_INTERLACE #define DS VID_DBLSCAN #define M(nm,hr,vr,clk,hs,he,ht,vs,ve,vt,f) \ { clk, hr, hs, he, ht, vr, vs, ve, vt, f, nm } const struct videomode videomode_list[] = { M("640x350x85",640,350,31500,672,736,832,382,385,445,HP|VN), M("640x400x85",640,400,31500,672,736,832,401,404,445,HN|VP), M("720x400x70",720,400,28320,738,846,900,412,414,449,HN|VP), M("720x400x85",720,400,35500,756,828,936,401,404,446,HN|VP), M("720x400x87",720,400,35500,738,846,900,421,423,449,HN|VN), M("640x480x60",640,480,25175,656,752,800,490,492,525,HN|VN), M("640x480x72",640,480,31500,664,704,832,489,492,520,HN|VN), M("640x480x75",640,480,31500,656,720,840,481,484,500,HN|VN), M("640x480x85",640,480,36000,696,752,832,481,484,509,HN|VN), M("800x600x56",800,600,36000,824,896,1024,601,603,625,HP|VP), M("800x600x60",800,600,40000,840,968,1056,601,605,628,HP|VP), M("800x600x72",800,600,50000,856,976,1040,637,643,666,HP|VP), M("800x600x75",800,600,49500,816,896,1056,601,604,625,HP|VP), M("800x600x85",800,600,56250,832,896,1048,601,604,631,HP|VP), M("1024x768x87i",1024,768,44900,1032,1208,1264,768,776,817,HP|VP|I), M("1024x768x60",1024,768,65000,1048,1184,1344,771,777,806,HN|VN), M("1024x768x70",1024,768,75000,1048,1184,1328,771,777,806,HN|VN), M("1024x768x75",1024,768,78750,1040,1136,1312,769,772,800,HP|VP), M("1024x768x85",1024,768,94500,1072,1168,1376,769,772,808,HP|VP), M("1024x768x89",1024,768,100000,1108,1280,1408,768,780,796,HP|VP), M("1152x864x75",1152,864,108000,1216,1344,1600,865,868,900,HP|VP), M("1280x768x75",1280,768,105640,1312,1712,1744,782,792,807,HN|VP), M("1280x960x60",1280,960,108000,1376,1488,1800,961,964,1000,HP|VP), M("1280x960x85",1280,960,148500,1344,1504,1728,961,964,1011,HP|VP), M("1280x1024x60",1280,1024,108000,1328,1440,1688,1025,1028,1066,HP|VP), M("1280x1024x70",1280,1024,126000,1328,1440,1688,1025,1028,1066,HP|VP), M("1280x1024x75",1280,1024,135000,1296,1440,1688,1025,1028,1066,HP|VP), M("1280x1024x85",1280,1024,157500,1344,1504,1728,1025,1028,1072,HP|VP), M("1600x1200x60",1600,1200,162000,1664,1856,2160,1201,1204,1250,HP|VP), M("1600x1200x65",1600,1200,175500,1664,1856,2160,1201,1204,1250,HP|VP), M("1600x1200x70",1600,1200,189000,1664,1856,2160,1201,1204,1250,HP|VP), M("1600x1200x75",1600,1200,202500,1664,1856,2160,1201,1204,1250,HP|VP), M("1600x1200x85",1600,1200,229500,1664,1856,2160,1201,1204,1250,HP|VP), M("1680x1050x60",1680,1050,147140,1784,1968,2256,1051,1054,1087,HP|VP), M("1792x1344x60",1792,1344,204800,1920,2120,2448,1345,1348,1394,HN|VP), M("1792x1344x75",1792,1344,261000,1888,2104,2456,1345,1348,1417,HN|VP), M("1856x1392x60",1856,1392,218300,1952,2176,2528,1393,1396,1439,HN|VP), M("1856x1392x75",1856,1392,288000,1984,2208,2560,1393,1396,1500,HN|VP), M("1920x1440x60",1920,1440,234000,2048,2256,2600,1441,1444,1500,HN|VP), M("1920x1440x75",1920,1440,297000,2064,2288,2640,1441,1444,1500,HN|VP), M("832x624x74",832,624,57284,864,928,1152,625,628,667,HN|VN), M("1152x768x54",1152,768,64995,1178,1314,1472,771,777,806,HP|VP), M("1400x1050x60",1400,1050,122000,1488,1640,1880,1052,1064,1082,HP|VP), M("1400x1050x74",1400,1050,155800,1464,1784,1912,1052,1064,1090,HP|VP), M("1152x900x66",1152,900,94500,1192,1320,1528,902,906,937,HN|VN), M("1152x900x76",1152,900,105560,1168,1280,1472,902,906,943,HN|VN), /* Derived Double Scan Modes */ M("320x175x85",320,175,15750,336,368,416,191,192,222,HP|VN|DS), M("320x200x85",320,200,15750,336,368,416,200,202,222,HN|VP|DS), 
M("360x200x70",360,200,14160,369,423,450,206,207,224,HN|VP|DS), M("360x200x85",360,200,17750,378,414,468,200,202,223,HN|VP|DS), M("360x200x87",360,200,17750,369,423,450,210,211,224,HN|VN|DS), M("320x240x60",320,240,12587,328,376,400,245,246,262,HN|VN|DS), M("320x240x72",320,240,15750,332,352,416,244,246,260,HN|VN|DS), M("320x240x75",320,240,15750,328,360,420,240,242,250,HN|VN|DS), M("320x240x85",320,240,18000,348,376,416,240,242,254,HN|VN|DS), M("400x300x56",400,300,18000,412,448,512,300,301,312,HP|VP|DS), M("400x300x60",400,300,20000,420,484,528,300,302,314,HP|VP|DS), M("400x300x72",400,300,25000,428,488,520,318,321,333,HP|VP|DS), M("400x300x75",400,300,24750,408,448,528,300,302,312,HP|VP|DS), M("400x300x85",400,300,28125,416,448,524,300,302,315,HP|VP|DS), M("512x384x87i",512,384,22450,516,604,632,384,388,408,HP|VP|DS|I), M("512x384x60",512,384,32500,524,592,672,385,388,403,HN|VN|DS), M("512x384x70",512,384,37500,524,592,664,385,388,403,HN|VN|DS), M("512x384x75",512,384,39375,520,568,656,384,386,400,HP|VP|DS), M("512x384x85",512,384,47250,536,584,688,384,386,404,HP|VP|DS), M("512x384x89",512,384,50000,554,640,704,384,390,398,HP|VP|DS), M("576x432x75",576,432,54000,608,672,800,432,434,450,HP|VP|DS), M("640x384x75",640,384,52820,656,856,872,391,396,403,HN|VP|DS), M("640x480x60",640,480,54000,688,744,900,480,482,500,HP|VP|DS), M("640x480x85",640,480,74250,672,752,864,480,482,505,HP|VP|DS), M("640x512x60",640,512,54000,664,720,844,512,514,533,HP|VP|DS), M("640x512x70",640,512,63000,664,720,844,512,514,533,HP|VP|DS), M("640x512x75",640,512,67500,648,720,844,512,514,533,HP|VP|DS), M("640x512x85",640,512,78750,672,752,864,512,514,536,HP|VP|DS), M("800x600x60",800,600,81000,832,928,1080,600,602,625,HP|VP|DS), M("800x600x65",800,600,87750,832,928,1080,600,602,625,HP|VP|DS), M("800x600x70",800,600,94500,832,928,1080,600,602,625,HP|VP|DS), M("800x600x75",800,600,101250,832,928,1080,600,602,625,HP|VP|DS), M("800x600x85",800,600,114750,832,928,1080,600,602,625,HP|VP|DS), M("840x525x60",840,525,73570,892,984,1128,525,527,543,HP|VP|DS), M("896x672x60",896,672,102400,960,1060,1224,672,674,697,HN|VP|DS), M("896x672x75",896,672,130500,944,1052,1228,672,674,708,HN|VP|DS), M("928x696x60",928,696,109150,976,1088,1264,696,698,719,HN|VP|DS), M("928x696x75",928,696,144000,992,1104,1280,696,698,750,HN|VP|DS), M("960x720x60",960,720,117000,1024,1128,1300,720,722,750,HN|VP|DS), M("960x720x75",960,720,148500,1032,1144,1320,720,722,750,HN|VP|DS), M("416x312x74",416,312,28642,432,464,576,312,314,333,HN|VN|DS), M("576x384x54",576,384,32497,589,657,736,385,388,403,HP|VP|DS), M("700x525x60",700,525,61000,744,820,940,526,532,541,HP|VP|DS), M("700x525x74",700,525,77900,732,892,956,526,532,545,HP|VP|DS), M("576x450x66",576,450,47250,596,660,764,451,453,468,HN|VN|DS), M("576x450x76",576,450,52780,584,640,736,451,453,471,HN|VN|DS), }; const int videomode_count = 46; diff --git a/sys/fs/fuse/fuse_device.c b/sys/fs/fuse/fuse_device.c index f8807d6d1c26..157c3802ec7e 100644 --- a/sys/fs/fuse/fuse_device.c +++ b/sys/fs/fuse/fuse_device.c @@ -1,601 +1,602 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Google Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. 
* * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Copyright (c) 2019 The FreeBSD Foundation * * Portions of this software were developed by BFF Storage Systems, LLC under * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_internal.h" #include "fuse_ipc.h" #include #include SDT_PROVIDER_DECLARE(fusefs); /* * Fuse trace probe: * arg0: verbosity. 
Higher numbers give more verbose messages * arg1: Textual message */ SDT_PROBE_DEFINE2(fusefs, , device, trace, "int", "char*"); static struct cdev *fuse_dev; static d_kqfilter_t fuse_device_filter; static d_open_t fuse_device_open; static d_poll_t fuse_device_poll; static d_read_t fuse_device_read; static d_write_t fuse_device_write; static struct cdevsw fuse_device_cdevsw = { .d_kqfilter = fuse_device_filter, .d_open = fuse_device_open, .d_name = "fuse", .d_poll = fuse_device_poll, .d_read = fuse_device_read, .d_write = fuse_device_write, .d_version = D_VERSION, }; static int fuse_device_filt_read(struct knote *kn, long hint); static int fuse_device_filt_write(struct knote *kn, long hint); static void fuse_device_filt_detach(struct knote *kn); struct filterops fuse_device_rfiltops = { .f_isfd = 1, .f_detach = fuse_device_filt_detach, .f_event = fuse_device_filt_read, }; struct filterops fuse_device_wfiltops = { .f_isfd = 1, .f_event = fuse_device_filt_write, }; /**************************** * * >>> Fuse device op defs * ****************************/ static void fdata_dtor(void *arg) { struct fuse_data *fdata; struct fuse_ticket *tick; fdata = arg; if (fdata == NULL) return; fdata_set_dead(fdata); FUSE_LOCK(); fuse_lck_mtx_lock(fdata->aw_mtx); /* wakup poll()ers */ selwakeuppri(&fdata->ks_rsel, PZERO + 1); /* Don't let syscall handlers wait in vain */ while ((tick = fuse_aw_pop(fdata))) { fuse_lck_mtx_lock(tick->tk_aw_mtx); fticket_set_answered(tick); tick->tk_aw_errno = ENOTCONN; wakeup(tick); fuse_lck_mtx_unlock(tick->tk_aw_mtx); FUSE_ASSERT_AW_DONE(tick); fuse_ticket_drop(tick); } fuse_lck_mtx_unlock(fdata->aw_mtx); /* Cleanup unsent operations */ fuse_lck_mtx_lock(fdata->ms_mtx); while ((tick = fuse_ms_pop(fdata))) { fuse_ticket_drop(tick); } fuse_lck_mtx_unlock(fdata->ms_mtx); FUSE_UNLOCK(); fdata_trydestroy(fdata); } static int fuse_device_filter(struct cdev *dev, struct knote *kn) { struct fuse_data *data; int error; error = devfs_get_cdevpriv((void **)&data); if (error == 0 && kn->kn_filter == EVFILT_READ) { kn->kn_fop = &fuse_device_rfiltops; kn->kn_hook = data; knlist_add(&data->ks_rsel.si_note, kn, 0); error = 0; } else if (error == 0 && kn->kn_filter == EVFILT_WRITE) { kn->kn_fop = &fuse_device_wfiltops; error = 0; } else if (error == 0) { error = EINVAL; kn->kn_data = error; } return (error); } static void fuse_device_filt_detach(struct knote *kn) { struct fuse_data *data; data = (struct fuse_data*)kn->kn_hook; MPASS(data != NULL); knlist_remove(&data->ks_rsel.si_note, kn, 0); kn->kn_hook = NULL; } static int fuse_device_filt_read(struct knote *kn, long hint) { struct fuse_data *data; int ready; data = (struct fuse_data*)kn->kn_hook; MPASS(data != NULL); mtx_assert(&data->ms_mtx, MA_OWNED); if (fdata_get_dead(data)) { kn->kn_flags |= EV_EOF; kn->kn_fflags = ENODEV; kn->kn_data = 1; ready = 1; } else if (STAILQ_FIRST(&data->ms_head)) { MPASS(data->ms_count >= 1); kn->kn_data = data->ms_count; ready = 1; } else { ready = 0; } return (ready); } static int fuse_device_filt_write(struct knote *kn, long hint) { kn->kn_data = 0; /* The device is always ready to write, so we return 1*/ return (1); } /* * Resources are set up on a per-open basis */ static int fuse_device_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { struct fuse_data *fdata; int error; SDT_PROBE2(fusefs, , device, trace, 1, "device open"); fdata = fdata_alloc(dev, td->td_ucred); error = devfs_set_cdevpriv(fdata, fdata_dtor); if (error != 0) fdata_trydestroy(fdata); else SDT_PROBE2(fusefs, , 
device, trace, 1, "device open success"); return (error); } int fuse_device_poll(struct cdev *dev, int events, struct thread *td) { struct fuse_data *data; int error, revents = 0; error = devfs_get_cdevpriv((void **)&data); if (error != 0) return (events & (POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM)); if (events & (POLLIN | POLLRDNORM)) { fuse_lck_mtx_lock(data->ms_mtx); if (fdata_get_dead(data) || STAILQ_FIRST(&data->ms_head)) revents |= events & (POLLIN | POLLRDNORM); else selrecord(td, &data->ks_rsel); fuse_lck_mtx_unlock(data->ms_mtx); } if (events & (POLLOUT | POLLWRNORM)) { revents |= events & (POLLOUT | POLLWRNORM); } return (revents); } /* * fuse_device_read hangs on the queue of VFS messages. * When it's notified that there is a new one, it picks that and * passes up to the daemon */ int fuse_device_read(struct cdev *dev, struct uio *uio, int ioflag) { int err; struct fuse_data *data; struct fuse_ticket *tick; void *buf; int buflen; SDT_PROBE2(fusefs, , device, trace, 1, "fuse device read"); err = devfs_get_cdevpriv((void **)&data); if (err != 0) return (err); fuse_lck_mtx_lock(data->ms_mtx); again: if (fdata_get_dead(data)) { SDT_PROBE2(fusefs, , device, trace, 2, "we know early on that reader should be kicked so we " "don't wait for news"); fuse_lck_mtx_unlock(data->ms_mtx); return (ENODEV); } if (!(tick = fuse_ms_pop(data))) { /* check if we may block */ if (ioflag & O_NONBLOCK) { /* get outa here soon */ fuse_lck_mtx_unlock(data->ms_mtx); return (EAGAIN); } else { err = msleep(data, &data->ms_mtx, PCATCH, "fu_msg", 0); if (err != 0) { fuse_lck_mtx_unlock(data->ms_mtx); return (fdata_get_dead(data) ? ENODEV : err); } tick = fuse_ms_pop(data); } } if (!tick) { /* * We can get here if fuse daemon suddenly terminates, * eg, by being hit by a SIGKILL * -- and some other cases, too, tho not totally clear, when * (cv_signal/wakeup_one signals the whole process ?) */ SDT_PROBE2(fusefs, , device, trace, 1, "no message on thread"); goto again; } fuse_lck_mtx_unlock(data->ms_mtx); if (fdata_get_dead(data)) { /* * somebody somewhere -- eg., umount routine -- * wants this liaison finished off */ SDT_PROBE2(fusefs, , device, trace, 2, "reader is to be sacked"); if (tick) { SDT_PROBE2(fusefs, , device, trace, 2, "weird -- " "\"kick\" is set tho there is message"); FUSE_ASSERT_MS_DONE(tick); fuse_ticket_drop(tick); } return (ENODEV); /* This should make the daemon get off * of us */ } SDT_PROBE2(fusefs, , device, trace, 1, "fuse device read message successfully"); buf = tick->tk_ms_fiov.base; buflen = tick->tk_ms_fiov.len; /* * Why not ban mercilessly stupid daemons who can't keep up * with us? (There is no much use of a partial read here...) */ /* * XXX note that in such cases Linux FUSE throws EIO at the * syscall invoker and stands back to the message queue. The * rationale should be made clear (and possibly adopt that * behaviour). Keeping the current scheme at least makes * fallacy as loud as possible... 
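Stepping back from this comment to the shape of the whole function: fuse_device_read() is a blocking message pump. The daemon's read(2) sleeps in msleep(9) until the filesystem enqueues a ticket (or returns EAGAIN under O_NONBLOCK), then copies exactly one whole request out with uiomove(9); a buffer too small for the message kills the session via fdata_set_dead(), as the check below enforces. A condensed, hypothetical sketch of that pump follows, reusing the driver's own names (fuse_ms_pop(), ms_mtx, tk_ms_fiov) but omitting the dead-session and requeue subtleties; it is illustrative, not the driver's literal control flow.

/* Condensed, hypothetical sketch of the /dev/fuse read pump. */
static int
fuse_read_pump_sketch(struct fuse_data *data, struct uio *uio)
{
	struct fuse_ticket *tick;
	int err;

	fuse_lck_mtx_lock(data->ms_mtx);
	while ((tick = fuse_ms_pop(data)) == NULL) {
		/* Sleep until a request is queued for the daemon. */
		err = msleep(data, &data->ms_mtx, PCATCH, "fu_msg", 0);
		if (err != 0) {
			fuse_lck_mtx_unlock(data->ms_mtx);
			return (err);
		}
	}
	fuse_lck_mtx_unlock(data->ms_mtx);

	/* No partial reads: the buffer must fit the whole message. */
	if (uio->uio_resid < tick->tk_ms_fiov.len)
		return (ENODEV);
	return (uiomove(tick->tk_ms_fiov.base, tick->tk_ms_fiov.len, uio));
}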
*/ if (uio->uio_resid < buflen) { fdata_set_dead(data); SDT_PROBE2(fusefs, , device, trace, 2, "daemon is stupid, kick it off..."); err = ENODEV; } else { err = uiomove(buf, buflen, uio); } FUSE_ASSERT_MS_DONE(tick); fuse_ticket_drop(tick); return (err); } static inline int fuse_ohead_audit(struct fuse_out_header *ohead, struct uio *uio) { if (uio->uio_resid + sizeof(struct fuse_out_header) != ohead->len) { SDT_PROBE2(fusefs, , device, trace, 1, "Format error: body size " "differs from size claimed by header"); return (EINVAL); } if (uio->uio_resid && ohead->unique != 0 && ohead->error) { SDT_PROBE2(fusefs, , device, trace, 1, "Format error: non zero error but message had a body"); return (EINVAL); } return (0); } SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_notify, "struct fuse_out_header*"); SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_missing_ticket, "uint64_t"); SDT_PROBE_DEFINE1(fusefs, , device, fuse_device_write_found, "struct fuse_ticket*"); /* * fuse_device_write first reads the header sent by the daemon. * If that's OK, looks up ticket/callback node by the unique id seen in header. * If the callback node contains a handler function, the uio is passed over * that. */ static int fuse_device_write(struct cdev *dev, struct uio *uio, int ioflag) { struct fuse_out_header ohead; int err = 0; struct fuse_data *data; struct mount *mp; struct fuse_ticket *tick, *itick, *x_tick; int found = 0; err = devfs_get_cdevpriv((void **)&data); if (err != 0) return (err); mp = data->mp; if (uio->uio_resid < sizeof(struct fuse_out_header)) { SDT_PROBE2(fusefs, , device, trace, 1, "fuse_device_write got less than a header!"); fdata_set_dead(data); return (EINVAL); } if ((err = uiomove(&ohead, sizeof(struct fuse_out_header), uio)) != 0) return (err); if (data->linux_errnos != 0 && ohead.error != 0) { err = -ohead.error; if (err < 0 || err >= nitems(linux_to_bsd_errtbl)) return (EINVAL); /* '-', because it will get flipped again below */ ohead.error = -linux_to_bsd_errtbl[err]; } /* * We check header information (which is redundant) and compare it * with what we see. If we see some inconsistency we discard the * whole answer and proceed on as if it had never existed. In * particular, no pretender will be woken up, regardless the * "unique" value in the header. */ if ((err = fuse_ohead_audit(&ohead, uio))) { fdata_set_dead(data); return (err); } /* Pass stuff over to callback if there is one installed */ /* Looking for ticket with the unique id of header */ fuse_lck_mtx_lock(data->aw_mtx); TAILQ_FOREACH_SAFE(tick, &data->aw_head, tk_aw_link, x_tick) { if (tick->tk_unique == ohead.unique) { SDT_PROBE1(fusefs, , device, fuse_device_write_found, tick); found = 1; fuse_aw_remove(tick); break; } } if (found && tick->irq_unique > 0) { /* * Discard the FUSE_INTERRUPT ticket that tried to interrupt * this operation */ TAILQ_FOREACH_SAFE(itick, &data->aw_head, tk_aw_link, x_tick) { if (itick->tk_unique == tick->irq_unique) { fuse_aw_remove(itick); fuse_ticket_drop(itick); break; } } tick->irq_unique = 0; } fuse_lck_mtx_unlock(data->aw_mtx); if (found) { if (tick->tk_aw_handler) { /* * We found a callback with proper handler. In this * case the out header will be 0wnd by the callback, * so the fun of freeing that is left for her. * (Then, by all chance, she'll just get that's done * via ticket_drop(), so no manual mucking * around...) 
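Summing up the matching logic above: every request the kernel sends carries a unique id, and the daemon's reply echoes it back, so fuse_device_write() correlates a reply with its pending ticket by scanning data->aw_head for a matching tk_unique. A self-contained sketch of that correlation idiom, with hypothetical struct and function names:

#include <sys/queue.h>
#include <stdint.h>
#include <stddef.h>

struct pending {
	TAILQ_ENTRY(pending) link;
	uint64_t unique;
};
TAILQ_HEAD(pending_head, pending);

static struct pending *
take_pending(struct pending_head *head, uint64_t unique)
{
	struct pending *p, *tmp;

	/* Safe traversal: we may unlink the element we are visiting. */
	TAILQ_FOREACH_SAFE(p, head, link, tmp) {
		if (p->unique == unique) {
			TAILQ_REMOVE(head, p, link);
			return (p);
		}
	}
	return (NULL);	/* reply for an unknown or expired request */
}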
*/ SDT_PROBE2(fusefs, , device, trace, 1, "pass ticket to a callback"); /* Sanitize the linuxism of negative errnos */ ohead.error *= -1; memcpy(&tick->tk_aw_ohead, &ohead, sizeof(ohead)); err = tick->tk_aw_handler(tick, uio); } else { /* pretender doesn't wanna do anything with answer */ SDT_PROBE2(fusefs, , device, trace, 1, "stuff devalidated, so we drop it"); } /* * As aw_mtx was not held during the callback execution the * ticket may have been inserted again. However, this is safe * because fuse_ticket_drop() will deal with refcount anyway. */ fuse_ticket_drop(tick); } else if (ohead.unique == 0){ /* unique == 0 means asynchronous notification */ SDT_PROBE1(fusefs, , device, fuse_device_write_notify, &ohead); switch (ohead.error) { case FUSE_NOTIFY_INVAL_ENTRY: err = fuse_internal_invalidate_entry(mp, uio); break; case FUSE_NOTIFY_INVAL_INODE: err = fuse_internal_invalidate_inode(mp, uio); break; case FUSE_NOTIFY_RETRIEVE: case FUSE_NOTIFY_STORE: /* * Unimplemented. I don't know of any file systems * that use them, and the protocol isn't sound anyway, * since the notification messages don't include the * inode's generation number. Without that, it's * possible to manipulate the cache of the wrong vnode. * Finally, it's not defined what this message should * do for a file with dirty cache. */ case FUSE_NOTIFY_POLL: /* Unimplemented. See comments in fuse_vnops */ default: /* Not implemented */ err = ENOSYS; } } else { /* no callback at all! */ SDT_PROBE1(fusefs, , device, fuse_device_write_missing_ticket, ohead.unique); if (ohead.error == -EAGAIN) { /* * This was probably a response to a FUSE_INTERRUPT * operation whose original operation is already * complete. We can't store FUSE_INTERRUPT tickets * indefinitely because their responses are optional. * So we delete them when the original operation * completes. And sadly the fuse_header_out doesn't * identify the opcode, so we have to guess. */ err = 0; } else { err = EINVAL; } } return (err); } int fuse_device_init(void) { fuse_dev = make_dev(&fuse_device_cdevsw, 0, UID_ROOT, GID_OPERATOR, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH, "fuse"); if (fuse_dev == NULL) return (ENOMEM); return (0); } void fuse_device_destroy(void) { MPASS(fuse_dev != NULL); destroy_dev(fuse_dev); } diff --git a/sys/fs/fuse/fuse_io.c b/sys/fs/fuse/fuse_io.c index 78398a990d7d..f85d17517ee0 100644 --- a/sys/fs/fuse/fuse_io.c +++ b/sys/fs/fuse/fuse_io.c @@ -1,1132 +1,1133 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Google Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Copyright (c) 2019 The FreeBSD Foundation * * Portions of this software were developed by BFF Storage Systems, LLC under * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_file.h" #include "fuse_node.h" #include "fuse_internal.h" #include "fuse_ipc.h" #include "fuse_io.h" /* * Set in a struct buf to indicate that the write came from the buffer cache * and the originating cred and pid are no longer known. */ #define B_FUSEFS_WRITE_CACHE B_FS_FLAG1 SDT_PROVIDER_DECLARE(fusefs); /* * Fuse trace probe: * arg0: verbosity. 
Higher numbers give more verbose messages * arg1: Textual message */ SDT_PROBE_DEFINE2(fusefs, , io, trace, "int", "char*"); static int fuse_inval_buf_range(struct vnode *vp, off_t filesize, off_t start, off_t end); static int fuse_read_directbackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh); static int fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred, struct fuse_filehandle *fufh, pid_t pid); static int fuse_write_directbackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh, off_t filesize, int ioflag, bool pages); static int fuse_write_biobackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh, int ioflag, pid_t pid); /* Invalidate a range of cached data, whether dirty of not */ static int fuse_inval_buf_range(struct vnode *vp, off_t filesize, off_t start, off_t end) { struct buf *bp; daddr_t left_lbn, end_lbn, right_lbn; off_t new_filesize; int iosize, left_on, right_on, right_blksize; iosize = fuse_iosize(vp); left_lbn = start / iosize; end_lbn = howmany(end, iosize); left_on = start & (iosize - 1); if (left_on != 0) { bp = getblk(vp, left_lbn, iosize, PCATCH, 0, 0); if ((bp->b_flags & B_CACHE) != 0 && bp->b_dirtyend >= left_on) { /* * Flush the dirty buffer, because we don't have a * byte-granular way to record which parts of the * buffer are valid. */ bwrite(bp); if (bp->b_error) return (bp->b_error); } else { brelse(bp); } } right_on = end & (iosize - 1); if (right_on != 0) { right_lbn = end / iosize; new_filesize = MAX(filesize, end); right_blksize = MIN(iosize, new_filesize - iosize * right_lbn); bp = getblk(vp, right_lbn, right_blksize, PCATCH, 0, 0); if ((bp->b_flags & B_CACHE) != 0 && bp->b_dirtyoff < right_on) { /* * Flush the dirty buffer, because we don't have a * byte-granular way to record which parts of the * buffer are valid. */ bwrite(bp); if (bp->b_error) return (bp->b_error); } else { brelse(bp); } } v_inval_buf_range(vp, left_lbn, end_lbn, iosize); return (0); } SDT_PROBE_DEFINE5(fusefs, , io, io_dispatch, "struct vnode*", "struct uio*", "int", "struct ucred*", "struct fuse_filehandle*"); SDT_PROBE_DEFINE4(fusefs, , io, io_dispatch_filehandles_closed, "struct vnode*", "struct uio*", "int", "struct ucred*"); int fuse_io_dispatch(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred, pid_t pid) { struct fuse_filehandle *fufh; int err, directio; int fflag; bool closefufh = false; MPASS(vp->v_type == VREG || vp->v_type == VDIR); fflag = (uio->uio_rw == UIO_READ) ? FREAD : FWRITE; err = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid); if (err == EBADF && vnode_mount(vp)->mnt_flag & MNT_EXPORTED) { /* * nfsd will do I/O without first doing VOP_OPEN. We * must implicitly open the file here */ err = fuse_filehandle_open(vp, fflag, &fufh, curthread, cred); closefufh = true; } else if (err) { SDT_PROBE4(fusefs, , io, io_dispatch_filehandles_closed, vp, uio, ioflag, cred); printf("FUSE: io dispatch: filehandles are closed\n"); return err; } if (err) goto out; SDT_PROBE5(fusefs, , io, io_dispatch, vp, uio, ioflag, cred, fufh); /* * Ideally, when the daemon asks for direct io at open time, the * standard file flag should be set according to this, so that would * just change the default mode, which later on could be changed via * fcntl(2). * But this doesn't work, the O_DIRECT flag gets cleared at some point * (don't know where). 
So to make any use of the Fuse direct_io option, * we hardwire it into the file's private data (similarly to Linux, * btw.). */ directio = (ioflag & IO_DIRECT) || !fsess_opt_datacache(vnode_mount(vp)); switch (uio->uio_rw) { case UIO_READ: fuse_vnode_update(vp, FN_ATIMECHANGE); if (directio) { SDT_PROBE2(fusefs, , io, trace, 1, "direct read of vnode"); err = fuse_read_directbackend(vp, uio, cred, fufh); } else { SDT_PROBE2(fusefs, , io, trace, 1, "buffered read of vnode"); err = fuse_read_biobackend(vp, uio, ioflag, cred, fufh, pid); } break; case UIO_WRITE: fuse_vnode_update(vp, FN_MTIMECHANGE | FN_CTIMECHANGE); if (directio) { off_t start, end, filesize; bool pages = (ioflag & IO_VMIO) != 0; SDT_PROBE2(fusefs, , io, trace, 1, "direct write of vnode"); err = fuse_vnode_size(vp, &filesize, cred, curthread); if (err) goto out; start = uio->uio_offset; end = start + uio->uio_resid; if (!pages) { err = fuse_inval_buf_range(vp, filesize, start, end); if (err) return (err); } err = fuse_write_directbackend(vp, uio, cred, fufh, filesize, ioflag, pages); } else { SDT_PROBE2(fusefs, , io, trace, 1, "buffered write of vnode"); if (!fsess_opt_writeback(vnode_mount(vp))) ioflag |= IO_SYNC; err = fuse_write_biobackend(vp, uio, cred, fufh, ioflag, pid); } fuse_internal_clear_suid_on_write(vp, cred, uio->uio_td); break; default: panic("uninterpreted mode passed to fuse_io_dispatch"); } out: if (closefufh) fuse_filehandle_close(vp, fufh, curthread, cred); return (err); } SDT_PROBE_DEFINE4(fusefs, , io, read_bio_backend_start, "int", "int", "int", "int"); SDT_PROBE_DEFINE2(fusefs, , io, read_bio_backend_feed, "int", "struct buf*"); SDT_PROBE_DEFINE4(fusefs, , io, read_bio_backend_end, "int", "ssize_t", "int", "struct buf*"); static int fuse_read_biobackend(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *cred, struct fuse_filehandle *fufh, pid_t pid) { struct buf *bp; struct mount *mp; struct fuse_data *data; daddr_t lbn, nextlbn; int bcount, nextsize; int err, n = 0, on = 0, seqcount; off_t filesize; const int biosize = fuse_iosize(vp); mp = vnode_mount(vp); data = fuse_get_mpdata(mp); if (uio->uio_offset < 0) return (EINVAL); seqcount = ioflag >> IO_SEQSHIFT; err = fuse_vnode_size(vp, &filesize, cred, curthread); if (err) return err; for (err = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) { if (fuse_isdeadfs(vp)) { err = ENXIO; break; } if (filesize - uio->uio_offset <= 0) break; lbn = uio->uio_offset / biosize; on = uio->uio_offset & (biosize - 1); if ((off_t)lbn * biosize >= filesize) { bcount = 0; } else if ((off_t)(lbn + 1) * biosize > filesize) { bcount = filesize - (off_t)lbn *biosize; } else { bcount = biosize; } nextlbn = lbn + 1; nextsize = MIN(biosize, filesize - nextlbn * biosize); SDT_PROBE4(fusefs, , io, read_bio_backend_start, biosize, (int)lbn, on, bcount); if (bcount < biosize) { /* If near EOF, don't do readahead */ err = bread(vp, lbn, bcount, NOCRED, &bp); } else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) { /* Try clustered read */ long totread = uio->uio_resid + on; seqcount = MIN(seqcount, data->max_readahead_blocks + 1); err = cluster_read(vp, filesize, lbn, bcount, NOCRED, totread, seqcount, 0, &bp); } else if (seqcount > 1 && data->max_readahead_blocks >= 1) { /* Try non-clustered readahead */ err = breadn(vp, lbn, bcount, &nextlbn, &nextsize, 1, NOCRED, &bp); } else { /* Just read what was requested */ err = bread(vp, lbn, bcount, NOCRED, &bp); } if (err) { brelse(bp); bp = NULL; break; } /* * on is the offset into the current bp. 
Figure out how many * bytes we can copy out of the bp. Note that bcount is * NOT DEV_BSIZE aligned. * * Then figure out how many bytes we can copy into the uio. */ n = 0; if (on < bcount - bp->b_resid) n = MIN((unsigned)(bcount - bp->b_resid - on), uio->uio_resid); if (n > 0) { SDT_PROBE2(fusefs, , io, read_bio_backend_feed, n, bp); err = uiomove(bp->b_data + on, n, uio); } vfs_bio_brelse(bp, ioflag); SDT_PROBE4(fusefs, , io, read_bio_backend_end, err, uio->uio_resid, n, bp); if (bp->b_resid > 0) { /* Short read indicates EOF */ break; } } return (err); } SDT_PROBE_DEFINE1(fusefs, , io, read_directbackend_start, "struct fuse_read_in*"); SDT_PROBE_DEFINE3(fusefs, , io, read_directbackend_complete, "struct fuse_dispatcher*", "struct fuse_read_in*", "struct uio*"); static int fuse_read_directbackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh) { struct fuse_data *data; struct fuse_dispatcher fdi; struct fuse_read_in *fri; int err = 0; data = fuse_get_mpdata(vp->v_mount); if (uio->uio_resid == 0) return (0); fdisp_init(&fdi, 0); /* * XXX In "normal" case we use an intermediate kernel buffer for * transmitting data from daemon's context to ours. Eventually, we should * get rid of this. Anyway, if the target uio lives in sysspace (we are * called from pageops), and the input data doesn't need kernel-side * processing (we are not called from readdir) we can already invoke * an optimized, "peer-to-peer" I/O routine. */ while (uio->uio_resid > 0) { fdi.iosize = sizeof(*fri); fdisp_make_vp(&fdi, FUSE_READ, vp, uio->uio_td, cred); fri = fdi.indata; fri->fh = fufh->fh_id; fri->offset = uio->uio_offset; fri->size = MIN(uio->uio_resid, fuse_get_mpdata(vp->v_mount)->max_read); if (fuse_libabi_geq(data, 7, 9)) { /* See comment regarding FUSE_WRITE_LOCKOWNER */ fri->read_flags = 0; fri->flags = fufh_type_2_fflags(fufh->fufh_type); } SDT_PROBE1(fusefs, , io, read_directbackend_start, fri); if ((err = fdisp_wait_answ(&fdi))) goto out; SDT_PROBE3(fusefs, , io, read_directbackend_complete, &fdi, fri, uio); if ((err = uiomove(fdi.answ, MIN(fri->size, fdi.iosize), uio))) break; if (fdi.iosize < fri->size) { /* * Short read. Should only happen at EOF or with * direct io. */ break; } } out: fdisp_destroy(&fdi); return (err); } static int fuse_write_directbackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh, off_t filesize, int ioflag, bool pages) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_data *data; struct fuse_write_in *fwi; struct fuse_write_out *fwo; struct fuse_dispatcher fdi; size_t chunksize; void *fwi_data; off_t as_written_offset; int diff; int err = 0; bool direct_io = fufh->fuse_open_flags & FOPEN_DIRECT_IO; bool wrote_anything = false; uint32_t write_flags; data = fuse_get_mpdata(vp->v_mount); /* * Don't set FUSE_WRITE_LOCKOWNER in write_flags. It can't be set * accurately when using POSIX AIO, libfuse doesn't use it, and I'm not * aware of any file systems that do. It was an attempt to add * Linux-style mandatory locking to the FUSE protocol, but mandatory * locking is deprecated even on Linux. See Linux commit * f33321141b273d60cbb3a8f56a5489baad82ba5e . */ /* * Set FUSE_WRITE_CACHE whenever we don't know the uid, gid, and/or pid * that originated a write. For example when writing from the * writeback cache. I don't know of a single file system that cares, * but the protocol says we're supposed to do this. 
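Past the flag selection discussed in this comment, the body of fuse_write_directbackend() below is essentially a chunking loop: the uio is carved into requests of at most data->max_write bytes, and a short reply from the daemon causes the unwritten tail to be shifted down and resent (the retry: label). A userspace analogue of that strategy, assuming an ordinary file descriptor rather than the FUSE transport:

#include <stddef.h>
#include <unistd.h>

/* Hypothetical analogue of the max_write chunking in the loop below. */
ssize_t
chunked_write(int fd, const char *buf, size_t len, size_t max_write)
{
	size_t done = 0;

	while (done < len) {
		size_t chunk = len - done;
		if (chunk > max_write)
			chunk = max_write;
		ssize_t n = write(fd, buf + done, chunk);
		if (n < 0)
			return (-1);	/* caller inspects errno */
		/*
		 * A short write just advances the cursor; the remainder
		 * is resent on the next iteration, like the retry: path.
		 */
		done += (size_t)n;
	}
	return ((ssize_t)done);
}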
*/ write_flags = !pages && ( (ioflag & IO_DIRECT) || !fsess_opt_datacache(vnode_mount(vp)) || !fsess_opt_writeback(vnode_mount(vp))) ? 0 : FUSE_WRITE_CACHE; if (uio->uio_resid == 0) return (0); if (ioflag & IO_APPEND) uio_setoffset(uio, filesize); if (vn_rlimit_fsize(vp, uio, uio->uio_td)) return (EFBIG); fdisp_init(&fdi, 0); while (uio->uio_resid > 0) { size_t sizeof_fwi; if (fuse_libabi_geq(data, 7, 9)) { sizeof_fwi = sizeof(*fwi); } else { sizeof_fwi = FUSE_COMPAT_WRITE_IN_SIZE; } chunksize = MIN(uio->uio_resid, data->max_write); fdi.iosize = sizeof_fwi + chunksize; fdisp_make_vp(&fdi, FUSE_WRITE, vp, uio->uio_td, cred); fwi = fdi.indata; fwi->fh = fufh->fh_id; fwi->offset = uio->uio_offset; fwi->size = chunksize; fwi->write_flags = write_flags; if (fuse_libabi_geq(data, 7, 9)) { fwi->flags = fufh_type_2_fflags(fufh->fufh_type); } fwi_data = (char *)fdi.indata + sizeof_fwi; if ((err = uiomove(fwi_data, chunksize, uio))) break; retry: err = fdisp_wait_answ(&fdi); if (err == ERESTART || err == EINTR || err == EWOULDBLOCK) { /* * Rewind the uio so dofilewrite will know it's * incomplete */ uio->uio_resid += fwi->size; uio->uio_offset -= fwi->size; /* * Change ERESTART into EINTR because we can't rewind * uio->uio_iov. Basically, once uiomove(9) has been * called, it's impossible to restart a syscall. */ if (err == ERESTART) err = EINTR; break; } else if (err) { break; } else { wrote_anything = true; } fwo = ((struct fuse_write_out *)fdi.answ); /* Adjust the uio in the case of short writes */ diff = fwi->size - fwo->size; as_written_offset = uio->uio_offset - diff; if (as_written_offset - diff > filesize) fuse_vnode_setsize(vp, as_written_offset, false); if (as_written_offset - diff >= filesize) fvdat->flag &= ~FN_SIZECHANGE; if (diff < 0) { fuse_warn(data, FSESS_WARN_WROTE_LONG, "wrote more data than we provided it."); err = EINVAL; break; } else if (diff > 0) { /* Short write */ if (!direct_io) { fuse_warn(data, FSESS_WARN_SHORT_WRITE, "short writes are only allowed with " "direct_io."); } if (ioflag & IO_DIRECT) { /* Return early */ uio->uio_resid += diff; uio->uio_offset -= diff; break; } else { /* Resend the unwritten portion of data */ fdi.iosize = sizeof_fwi + diff; /* Refresh fdi without clearing data buffer */ fdisp_refresh_vp(&fdi, FUSE_WRITE, vp, uio->uio_td, cred); fwi = fdi.indata; MPASS2(fwi == fdi.indata, "FUSE dispatcher " "reallocated despite no increase in " "size?"); void *src = (char*)fwi_data + fwo->size; memmove(fwi_data, src, diff); fwi->fh = fufh->fh_id; fwi->offset = as_written_offset; fwi->size = diff; fwi->write_flags = write_flags; goto retry; } } } fdisp_destroy(&fdi); if (wrote_anything) fuse_vnode_undirty_cached_timestamps(vp, false); return (err); } SDT_PROBE_DEFINE6(fusefs, , io, write_biobackend_start, "int64_t", "int", "int", "struct uio*", "int", "bool"); SDT_PROBE_DEFINE2(fusefs, , io, write_biobackend_append_race, "long", "int"); SDT_PROBE_DEFINE2(fusefs, , io, write_biobackend_issue, "int", "struct buf*"); static int fuse_write_biobackend(struct vnode *vp, struct uio *uio, struct ucred *cred, struct fuse_filehandle *fufh, int ioflag, pid_t pid) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct buf *bp; daddr_t lbn; off_t filesize; int bcount; int n, on, seqcount, err = 0; bool last_page; const int biosize = fuse_iosize(vp); seqcount = ioflag >> IO_SEQSHIFT; KASSERT(uio->uio_rw == UIO_WRITE, ("fuse_write_biobackend mode")); if (vp->v_type != VREG) return (EIO); if (uio->uio_offset < 0) return (EINVAL); if (uio->uio_resid == 0) return (0); err = 
fuse_vnode_size(vp, &filesize, cred, curthread); if (err) return err; if (ioflag & IO_APPEND) uio_setoffset(uio, filesize); if (vn_rlimit_fsize(vp, uio, uio->uio_td)) return (EFBIG); do { bool direct_append, extending; if (fuse_isdeadfs(vp)) { err = ENXIO; break; } lbn = uio->uio_offset / biosize; on = uio->uio_offset & (biosize - 1); n = MIN((unsigned)(biosize - on), uio->uio_resid); again: /* Get or create a buffer for the write */ direct_append = uio->uio_offset == filesize && n; if (uio->uio_offset + n < filesize) { extending = false; if ((off_t)(lbn + 1) * biosize < filesize) { /* Not the file's last block */ bcount = biosize; } else { /* The file's last block */ bcount = filesize - (off_t)lbn * biosize; } } else { extending = true; bcount = on + n; } if (howmany(((off_t)lbn * biosize + on + n - 1), PAGE_SIZE) >= howmany(filesize, PAGE_SIZE)) last_page = true; else last_page = false; if (direct_append) { /* * Take care to preserve the buffer's B_CACHE state so * as not to cause an unnecessary read. */ bp = getblk(vp, lbn, on, PCATCH, 0, 0); if (bp != NULL) { uint32_t save = bp->b_flags & B_CACHE; allocbuf(bp, bcount); bp->b_flags |= save; } } else { bp = getblk(vp, lbn, bcount, PCATCH, 0, 0); } if (!bp) { err = EINTR; break; } if (extending) { /* * Extend file _after_ locking buffer so we won't race * with other readers */ err = fuse_vnode_setsize(vp, uio->uio_offset + n, false); filesize = uio->uio_offset + n; fvdat->flag |= FN_SIZECHANGE; if (err) { brelse(bp); break; } } SDT_PROBE6(fusefs, , io, write_biobackend_start, lbn, on, n, uio, bcount, direct_append); /* * Issue a READ if B_CACHE is not set. In special-append * mode, B_CACHE is based on the buffer prior to the write * op and is typically set, avoiding the read. If a read * is required in special append mode, the server will * probably send us a short-read since we extended the file * on our end, resulting in b_resid == 0 and, thusly, * B_CACHE getting set. * * We can also avoid issuing the read if the write covers * the entire buffer. We have to make sure the buffer state * is reasonable in this case since we will not be initiating * I/O. See the comments in kern/vfs_bio.c's getblk() for * more information. * * B_CACHE may also be set due to the buffer being cached * normally. */ if (on == 0 && n == bcount) { bp->b_flags |= B_CACHE; bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; } if ((bp->b_flags & B_CACHE) == 0) { bp->b_iocmd = BIO_READ; vfs_busy_pages(bp, 0); fuse_io_strategy(vp, bp); if ((err = bp->b_error)) { brelse(bp); break; } if (bp->b_resid > 0) { /* * Short read indicates EOF. Update file size * from the server and try again. */ SDT_PROBE2(fusefs, , io, trace, 1, "Short read during a RMW"); brelse(bp); err = fuse_vnode_size(vp, &filesize, cred, curthread); if (err) break; else goto again; } } if (bp->b_wcred == NOCRED) bp->b_wcred = crhold(cred); /* * If dirtyend exceeds file size, chop it down. This should * not normally occur but there is an append race where it * might occur XXX, so we log it. * * If the chopping creates a reverse-indexed or degenerate * situation with dirtyoff/end, we 0 both of them. */ if (bp->b_dirtyend > bcount) { SDT_PROBE2(fusefs, , io, write_biobackend_append_race, (long)bp->b_blkno * biosize, bp->b_dirtyend - bcount); bp->b_dirtyend = bcount; } if (bp->b_dirtyoff >= bp->b_dirtyend) bp->b_dirtyoff = bp->b_dirtyend = 0; /* * If the new write will leave a contiguous dirty * area, just update the b_dirtyoff and b_dirtyend, * otherwise force a write rpc of the old dirty area. 
* * While it is possible to merge discontiguous writes due to * our having a B_CACHE buffer ( and thus valid read data * for the hole), we don't because it could lead to * significant cache coherency problems with multiple clients, * especially if locking is implemented later on. * * as an optimization we could theoretically maintain * a linked list of discontinuous areas, but we would still * have to commit them separately so there isn't much * advantage to it except perhaps a bit of asynchronization. */ if (bp->b_dirtyend > 0 && (on > bp->b_dirtyend || (on + n) < bp->b_dirtyoff)) { /* * Yes, we mean it. Write out everything to "storage" * immediately, without hesitation. (Apart from other * reasons: the only way to know if a write is valid * if its actually written out.) */ SDT_PROBE2(fusefs, , io, write_biobackend_issue, 0, bp); bwrite(bp); if (bp->b_error == EINTR) { err = EINTR; break; } goto again; } err = uiomove((char *)bp->b_data + on, n, uio); if (err) { bp->b_ioflags |= BIO_ERROR; bp->b_error = err; brelse(bp); break; /* TODO: vfs_bio_clrbuf like ffs_write does? */ } /* * Only update dirtyoff/dirtyend if not a degenerate * condition. */ if (n) { if (bp->b_dirtyend > 0) { bp->b_dirtyoff = MIN(on, bp->b_dirtyoff); bp->b_dirtyend = MAX((on + n), bp->b_dirtyend); } else { bp->b_dirtyoff = on; bp->b_dirtyend = on + n; } vfs_bio_set_valid(bp, on, n); } vfs_bio_set_flags(bp, ioflag); bp->b_flags |= B_FUSEFS_WRITE_CACHE; if (ioflag & IO_SYNC) { SDT_PROBE2(fusefs, , io, write_biobackend_issue, 2, bp); if (!(ioflag & IO_VMIO)) bp->b_flags &= ~B_FUSEFS_WRITE_CACHE; err = bwrite(bp); } else if (vm_page_count_severe() || buf_dirty_count_severe() || (ioflag & IO_ASYNC)) { bp->b_flags |= B_CLUSTEROK; SDT_PROBE2(fusefs, , io, write_biobackend_issue, 3, bp); bawrite(bp); } else if (on == 0 && n == bcount) { if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) { bp->b_flags |= B_CLUSTEROK; SDT_PROBE2(fusefs, , io, write_biobackend_issue, 4, bp); cluster_write(vp, bp, filesize, seqcount, 0); } else { SDT_PROBE2(fusefs, , io, write_biobackend_issue, 5, bp); bawrite(bp); } } else if (ioflag & IO_DIRECT) { bp->b_flags |= B_CLUSTEROK; SDT_PROBE2(fusefs, , io, write_biobackend_issue, 6, bp); bawrite(bp); } else { bp->b_flags &= ~B_CLUSTEROK; SDT_PROBE2(fusefs, , io, write_biobackend_issue, 7, bp); bdwrite(bp); } if (err) break; } while (uio->uio_resid > 0 && n > 0); return (err); } int fuse_io_strategy(struct vnode *vp, struct buf *bp) { struct fuse_vnode_data *fvdat = VTOFUD(vp); struct fuse_filehandle *fufh; struct ucred *cred; struct uio *uiop; struct uio uio; struct iovec io; off_t filesize; int error = 0; int fflag; /* We don't know the true pid when we're dealing with the cache */ pid_t pid = 0; const int biosize = fuse_iosize(vp); MPASS(vp->v_type == VREG || vp->v_type == VDIR); MPASS(bp->b_iocmd == BIO_READ || bp->b_iocmd == BIO_WRITE); fflag = bp->b_iocmd == BIO_READ ? FREAD : FWRITE; cred = bp->b_iocmd == BIO_READ ? bp->b_rcred : bp->b_wcred; error = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid); if (bp->b_iocmd == BIO_READ && error == EBADF) { /* * This may be a read-modify-write operation on a cached file * opened O_WRONLY. The FUSE protocol allows this. 
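Looking back at the buffered-write loop in fuse_write_biobackend() just above: a struct buf records only one contiguous dirty byte range, [b_dirtyoff, b_dirtyend), so a write that would leave a hole relative to the existing range forces the old range out first, exactly as that comment explains. A small standalone sketch of the merge-or-flush decision, with a hypothetical struct mirroring the on/n arithmetic:

#include <stdbool.h>

struct dirty_range {
	int off;	/* b_dirtyoff analogue */
	int end;	/* b_dirtyend analogue; 0 when the buffer is clean */
};

/* Returns true if [on, on + n) merged; false means caller must flush. */
static bool
dirty_merge(struct dirty_range *d, int on, int n)
{
	if (d->end > 0 && (on > d->end || on + n < d->off))
		return (false);		/* discontiguous: flush old range */
	if (d->end == 0) {
		d->off = on;
		d->end = on + n;
	} else {
		if (on < d->off)
			d->off = on;
		if (on + n > d->end)
			d->end = on + n;
	}
	return (true);
}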
*/ error = fuse_filehandle_get(vp, FWRITE, &fufh, cred, pid); } if (error) { printf("FUSE: strategy: filehandles are closed\n"); bp->b_ioflags |= BIO_ERROR; bp->b_error = error; bufdone(bp); return (error); } uiop = &uio; uiop->uio_iov = &io; uiop->uio_iovcnt = 1; uiop->uio_segflg = UIO_SYSSPACE; uiop->uio_td = curthread; /* * clear BIO_ERROR and B_INVAL state prior to initiating the I/O. We * do this here so we do not have to do it in all the code that * calls us. */ bp->b_flags &= ~B_INVAL; bp->b_ioflags &= ~BIO_ERROR; KASSERT(!(bp->b_flags & B_DONE), ("fuse_io_strategy: bp %p already marked done", bp)); if (bp->b_iocmd == BIO_READ) { ssize_t left; io.iov_len = uiop->uio_resid = bp->b_bcount; io.iov_base = bp->b_data; uiop->uio_rw = UIO_READ; uiop->uio_offset = ((off_t)bp->b_lblkno) * biosize; error = fuse_read_directbackend(vp, uiop, cred, fufh); /* * Store the amount we failed to read in the buffer's private * field, so callers can truncate the file if necessary' */ if (!error && uiop->uio_resid) { int nread = bp->b_bcount - uiop->uio_resid; left = uiop->uio_resid; bzero((char *)bp->b_data + nread, left); if ((fvdat->flag & FN_SIZECHANGE) == 0) { /* * A short read with no error, when not using * direct io, and when no writes are cached, * indicates EOF caused by a server-side * truncation. Clear the attr cache so we'll * pick up the new file size and timestamps. * * We must still bzero the remaining buffer so * uninitialized data doesn't get exposed by a * future truncate that extends the file. * * To prevent lock order problems, we must * truncate the file upstack, not here. */ SDT_PROBE2(fusefs, , io, trace, 1, "Short read of a clean file"); fuse_vnode_clear_attr_cache(vp); } else { /* * If dirty writes _are_ cached beyond EOF, * that indicates a newly created hole that the * server doesn't know about. Those don't pose * any problem. * XXX: we don't currently track whether dirty * writes are cached beyond EOF, before EOF, or * both. */ SDT_PROBE2(fusefs, , io, trace, 1, "Short read of a dirty file"); uiop->uio_resid = 0; } } if (error) { bp->b_ioflags |= BIO_ERROR; bp->b_error = error; } } else { /* * Setup for actual write */ /* * If the file's size is cached, use that value, even if the * cache is expired. At this point we're already committed to * writing something. If the FUSE server has changed the * file's size behind our back, it's too late for us to do * anything about it. In particular, we can't invalidate any * part of the file's buffers because VOP_STRATEGY is called * with them already locked. */ filesize = fvdat->cached_attrs.va_size; /* filesize must've been cached by fuse_vnop_open. 
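The write branch just below converts the buffer's dirty range into a file-relative transfer: the offset is b_lblkno * biosize + b_dirtyoff and the length is b_dirtyend - b_dirtyoff. Plugging in hypothetical example numbers makes the mapping concrete:

#include <stdio.h>

int
main(void)
{
	const long long biosize = 65536;	/* e.g. fuse_iosize() result */
	const long long lblkno = 3;		/* logical block number */
	const int dirtyoff = 512, dirtyend = 4096;

	long long file_off = lblkno * biosize + dirtyoff;
	int len = dirtyend - dirtyoff;

	printf("write %d bytes at file offset %lld\n", len, file_off);
	/* prints: write 3584 bytes at file offset 197120 */
	return (0);
}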
*/ KASSERT(filesize != VNOVAL, ("filesize should've been cached")); if ((off_t)bp->b_lblkno * biosize + bp->b_dirtyend > filesize) bp->b_dirtyend = filesize - (off_t)bp->b_lblkno * biosize; if (bp->b_dirtyend > bp->b_dirtyoff) { io.iov_len = uiop->uio_resid = bp->b_dirtyend - bp->b_dirtyoff; uiop->uio_offset = (off_t)bp->b_lblkno * biosize + bp->b_dirtyoff; io.iov_base = (char *)bp->b_data + bp->b_dirtyoff; uiop->uio_rw = UIO_WRITE; bool pages = bp->b_flags & B_FUSEFS_WRITE_CACHE; error = fuse_write_directbackend(vp, uiop, cred, fufh, filesize, 0, pages); if (error == EINTR || error == ETIMEDOUT) { bp->b_flags &= ~(B_INVAL | B_NOCACHE); if ((bp->b_flags & B_PAGING) == 0) { bdirty(bp); bp->b_flags &= ~B_DONE; } if ((error == EINTR || error == ETIMEDOUT) && (bp->b_flags & B_ASYNC) == 0) bp->b_flags |= B_EINTR; } else { if (error) { bp->b_ioflags |= BIO_ERROR; bp->b_flags |= B_INVAL; bp->b_error = error; } bp->b_dirtyoff = bp->b_dirtyend = 0; } } else { bp->b_resid = 0; bufdone(bp); return (0); } } bp->b_resid = uiop->uio_resid; bufdone(bp); return (error); } int fuse_io_flushbuf(struct vnode *vp, int waitfor, struct thread *td) { return (vn_fsync_buf(vp, waitfor)); } /* * Flush and invalidate all dirty buffers. If another process is already * doing the flush, just wait for completion. */ int fuse_io_invalbuf(struct vnode *vp, struct thread *td) { struct fuse_vnode_data *fvdat = VTOFUD(vp); int error = 0; if (VN_IS_DOOMED(vp)) return 0; ASSERT_VOP_ELOCKED(vp, "fuse_io_invalbuf"); while (fvdat->flag & FN_FLUSHINPROG) { struct proc *p = td->td_proc; if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) return EIO; fvdat->flag |= FN_FLUSHWANT; tsleep(&fvdat->flag, PRIBIO + 2, "fusevinv", 2 * hz); error = 0; if (p != NULL) { PROC_LOCK(p); if (SIGNOTEMPTY(p->p_siglist) || SIGNOTEMPTY(td->td_siglist)) error = EINTR; PROC_UNLOCK(p); } if (error == EINTR) return EINTR; } fvdat->flag |= FN_FLUSHINPROG; if (vp->v_bufobj.bo_object != NULL) { VM_OBJECT_WLOCK(vp->v_bufobj.bo_object); vm_object_page_clean(vp->v_bufobj.bo_object, 0, 0, OBJPC_SYNC); VM_OBJECT_WUNLOCK(vp->v_bufobj.bo_object); } error = vinvalbuf(vp, V_SAVE, PCATCH, 0); while (error) { if (error == ERESTART || error == EINTR) { fvdat->flag &= ~FN_FLUSHINPROG; if (fvdat->flag & FN_FLUSHWANT) { fvdat->flag &= ~FN_FLUSHWANT; wakeup(&fvdat->flag); } return EINTR; } error = vinvalbuf(vp, V_SAVE, PCATCH, 0); } fvdat->flag &= ~FN_FLUSHINPROG; if (fvdat->flag & FN_FLUSHWANT) { fvdat->flag &= ~FN_FLUSHWANT; wakeup(&fvdat->flag); } return (error); } diff --git a/sys/fs/fuse/fuse_main.c b/sys/fs/fuse/fuse_main.c index ac15ad960725..824458db72cb 100644 --- a/sys/fs/fuse/fuse_main.c +++ b/sys/fs/fuse/fuse_main.c @@ -1,181 +1,182 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2007-2009 Google Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * * Neither the name of Google Inc. nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (C) 2005 Csaba Henk. * All rights reserved. * * Copyright (c) 2019 The FreeBSD Foundation * * Portions of this software were developed by BFF Storage Systems, LLC under * sponsorship from the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "fuse.h" #include "fuse_file.h" #include "fuse_ipc.h" #include "fuse_internal.h" #include "fuse_node.h" static void fuse_bringdown(eventhandler_tag eh_tag); static int fuse_loader(struct module *m, int what, void *arg); struct mtx fuse_mtx; extern struct vfsops fuse_vfsops; extern struct cdevsw fuse_cdevsw; extern struct vop_vector fuse_fifonops; extern uma_zone_t fuse_pbuf_zone; static struct vfsconf fuse_vfsconf = { .vfc_version = VFS_VERSION, .vfc_name = "fusefs", .vfc_vfsops = &fuse_vfsops, .vfc_typenum = -1, .vfc_flags = VFCF_JAIL | VFCF_SYNTHETIC }; SYSCTL_NODE(_vfs, OID_AUTO, fusefs, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "FUSE tunables"); SYSCTL_NODE(_vfs_fusefs, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "FUSE statistics"); SYSCTL_INT(_vfs_fusefs, OID_AUTO, kernelabi_major, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, FUSE_KERNEL_VERSION, "FUSE kernel abi major version"); SYSCTL_INT(_vfs_fusefs, OID_AUTO, kernelabi_minor, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, FUSE_KERNEL_MINOR_VERSION, "FUSE kernel abi minor version"); SDT_PROVIDER_DEFINE(fusefs); /****************************** * * >>> Module management stuff * ******************************/ static void fuse_bringdown(eventhandler_tag eh_tag) { fuse_node_destroy(); fuse_internal_destroy(); fuse_file_destroy(); fuse_ipc_destroy(); fuse_device_destroy(); mtx_destroy(&fuse_mtx); } static int fuse_loader(struct module *m, int what, void *arg) { static eventhandler_tag eh_tag = NULL; int err = 0; switch (what) { case MOD_LOAD: /* kldload */ mtx_init(&fuse_mtx, "fuse_mtx", NULL, MTX_DEF); err = fuse_device_init(); if (err) { mtx_destroy(&fuse_mtx); return (err); } fuse_ipc_init(); fuse_file_init(); fuse_internal_init(); fuse_node_init(); fuse_pbuf_zone = pbuf_zsecond_create("fusepbuf", nswbuf / 2); /* vfs_modevent ignores its first arg */ if ((err = vfs_modevent(NULL, what, &fuse_vfsconf))) fuse_bringdown(eh_tag); break; case MOD_UNLOAD: if ((err = vfs_modevent(NULL, what, &fuse_vfsconf))) return (err); fuse_bringdown(eh_tag); uma_zdestroy(fuse_pbuf_zone); break; default: return (EINVAL); } return (err); } /* Registering the module */ static moduledata_t fuse_moddata = { "fusefs", fuse_loader, &fuse_vfsconf }; DECLARE_MODULE(fusefs, fuse_moddata, SI_SUB_VFS, SI_ORDER_MIDDLE); MODULE_VERSION(fusefs, 1); diff --git a/sys/kern/kern_module.c b/sys/kern/kern_module.c index 97dca7185319..1798a834dcec 100644 --- a/sys/kern/kern_module.c +++ b/sys/kern/kern_module.c @@ -1,519 +1,573 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1997 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_MODULE, "module", "module data structures"); struct module { TAILQ_ENTRY(module) link; /* chain together all modules */ TAILQ_ENTRY(module) flink; /* all modules in a file */ struct linker_file *file; /* file which contains this module */ int refs; /* reference count */ int id; /* unique id number */ char *name; /* module name */ modeventhand_t handler; /* event handler */ void *arg; /* argument for handler */ modspecific_t data; /* module specific data */ }; #define MOD_EVENT(mod, type) (mod)->handler((mod), (type), (mod)->arg) static TAILQ_HEAD(modulelist, module) modules; struct sx modules_sx; static int nextid = 1; static void module_shutdown(void *, int); static int modevent_nop(module_t mod, int what, void *arg) { switch(what) { case MOD_LOAD: return (0); case MOD_UNLOAD: return (EBUSY); default: return (EOPNOTSUPP); } } static void module_init(void *arg) { sx_init(&modules_sx, "module subsystem sx lock"); TAILQ_INIT(&modules); EVENTHANDLER_REGISTER(shutdown_final, module_shutdown, NULL, SHUTDOWN_PRI_DEFAULT); } SYSINIT(module, SI_SUB_KLD, SI_ORDER_FIRST, module_init, NULL); static void module_shutdown(void *arg1, int arg2) { module_t mod; if (arg2 & RB_NOSYNC) return; mtx_lock(&Giant); MOD_SLOCK; TAILQ_FOREACH_REVERSE(mod, &modules, modulelist, link) MOD_EVENT(mod, MOD_SHUTDOWN); MOD_SUNLOCK; mtx_unlock(&Giant); } void module_register_init(const void *arg) { const moduledata_t *data = (const moduledata_t *)arg; int error; module_t mod; mtx_lock(&Giant); MOD_SLOCK; mod = module_lookupbyname(data->name); if (mod == NULL) panic("module_register_init: module named %s not found\n", data->name); MOD_SUNLOCK; error = MOD_EVENT(mod, MOD_LOAD); if (error) { MOD_EVENT(mod, MOD_UNLOAD); MOD_XLOCK; module_release(mod); MOD_XUNLOCK; printf("module_register_init: MOD_LOAD (%s, %p, %p) error" " %d\n", data->name, (void *)data->evhand, data->priv, error); } else { MOD_XLOCK; if (mod->file) { /* * Once a module is successfully loaded, move * it to the head of the module list for this * linker file. This resorts the list so that * when the kernel linker iterates over the * modules to unload them, it will unload them * in the reverse order they were loaded. 
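 *
 * Illustrative example (hypothetical modules A and B, loaded in
 * that order): once B's MOD_LOAD succeeds, the per-file list is
 * reordered from (A, B) to (B, A), so a front-to-back unload walk
 * visits B first, i.e. reverse load order.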
*/ TAILQ_REMOVE(&mod->file->modules, mod, flink); TAILQ_INSERT_HEAD(&mod->file->modules, mod, flink); } MOD_XUNLOCK; } mtx_unlock(&Giant); } int module_register(const moduledata_t *data, linker_file_t container) { size_t namelen; module_t newmod; MOD_XLOCK; newmod = module_lookupbyname(data->name); if (newmod != NULL) { MOD_XUNLOCK; printf("%s: cannot register %s from %s; already loaded from %s\n", __func__, data->name, container->filename, newmod->file->filename); return (EEXIST); } namelen = strlen(data->name) + 1; newmod = malloc(sizeof(struct module) + namelen, M_MODULE, M_WAITOK); newmod->refs = 1; newmod->id = nextid++; newmod->name = (char *)(newmod + 1); strcpy(newmod->name, data->name); newmod->handler = data->evhand ? data->evhand : modevent_nop; newmod->arg = data->priv; bzero(&newmod->data, sizeof(newmod->data)); TAILQ_INSERT_TAIL(&modules, newmod, link); if (container) TAILQ_INSERT_TAIL(&container->modules, newmod, flink); newmod->file = container; MOD_XUNLOCK; return (0); } void module_reference(module_t mod) { MOD_XLOCK_ASSERT; MOD_DPF(REFS, ("module_reference: before, refs=%d\n", mod->refs)); mod->refs++; } void module_release(module_t mod) { MOD_XLOCK_ASSERT; if (mod->refs <= 0) panic("module_release: bad reference count"); MOD_DPF(REFS, ("module_release: before, refs=%d\n", mod->refs)); mod->refs--; if (mod->refs == 0) { TAILQ_REMOVE(&modules, mod, link); if (mod->file) TAILQ_REMOVE(&mod->file->modules, mod, flink); free(mod, M_MODULE); } } module_t module_lookupbyname(const char *name) { module_t mod; int err; MOD_LOCK_ASSERT; TAILQ_FOREACH(mod, &modules, link) { err = strcmp(mod->name, name); if (err == 0) return (mod); } return (NULL); } module_t module_lookupbyid(int modid) { module_t mod; MOD_LOCK_ASSERT; TAILQ_FOREACH(mod, &modules, link) if (mod->id == modid) return(mod); return (NULL); } int module_quiesce(module_t mod) { int error; mtx_lock(&Giant); error = MOD_EVENT(mod, MOD_QUIESCE); mtx_unlock(&Giant); if (error == EOPNOTSUPP || error == EINVAL) error = 0; return (error); } int module_unload(module_t mod) { int error; mtx_lock(&Giant); error = MOD_EVENT(mod, MOD_UNLOAD); mtx_unlock(&Giant); return (error); } int module_getid(module_t mod) { MOD_LOCK_ASSERT; return (mod->id); } module_t module_getfnext(module_t mod) { MOD_LOCK_ASSERT; return (TAILQ_NEXT(mod, flink)); } const char * module_getname(module_t mod) { MOD_LOCK_ASSERT; return (mod->name); } void module_setspecific(module_t mod, modspecific_t *datap) { MOD_XLOCK_ASSERT; mod->data = *datap; } linker_file_t module_file(module_t mod) { return (mod->file); } /* * Syscalls. 
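 *
 * Userland walks the module list with these, e.g. (illustrative
 * sketch):
 *
 *	for (int id = modnext(0); id > 0; id = modnext(id)) {
 *		struct module_stat ms = { .version = sizeof(ms) };
 *
 *		if (modstat(id, &ms) == 0)
 *			printf("%s\n", ms.name);
 *	}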
*/ int sys_modnext(struct thread *td, struct modnext_args *uap) { module_t mod; int error = 0; td->td_retval[0] = -1; MOD_SLOCK; if (uap->modid == 0) { mod = TAILQ_FIRST(&modules); if (mod) td->td_retval[0] = mod->id; else error = ENOENT; goto done2; } mod = module_lookupbyid(uap->modid); if (mod == NULL) { error = ENOENT; goto done2; } if (TAILQ_NEXT(mod, link)) td->td_retval[0] = TAILQ_NEXT(mod, link)->id; else td->td_retval[0] = 0; done2: MOD_SUNLOCK; return (error); } int sys_modfnext(struct thread *td, struct modfnext_args *uap) { module_t mod; int error; td->td_retval[0] = -1; MOD_SLOCK; mod = module_lookupbyid(uap->modid); if (mod == NULL) { error = ENOENT; } else { error = 0; if (TAILQ_NEXT(mod, flink)) td->td_retval[0] = TAILQ_NEXT(mod, flink)->id; else td->td_retval[0] = 0; } MOD_SUNLOCK; return (error); } struct module_stat_v1 { int version; /* set to sizeof(struct module_stat_v1) */ - char name[MAXMODNAME]; + char name[MAXMODNAMEV1V2]; int refs; int id; }; +struct module_stat_v2 { + int version; /* set to sizeof(struct module_stat_v2) */ + char name[MAXMODNAMEV1V2]; + int refs; + int id; + modspecific_t data; +}; + int sys_modstat(struct thread *td, struct modstat_args *uap) { module_t mod; modspecific_t data; int error = 0; int id, namelen, refs, version; struct module_stat *stat; + struct module_stat_v2 *stat_v2; char *name; + bool is_v1v2; MOD_SLOCK; mod = module_lookupbyid(uap->modid); if (mod == NULL) { MOD_SUNLOCK; return (ENOENT); } id = mod->id; refs = mod->refs; name = mod->name; data = mod->data; MOD_SUNLOCK; stat = uap->stat; /* * Check the version of the user's structure. */ if ((error = copyin(&stat->version, &version, sizeof(version))) != 0) return (error); - if (version != sizeof(struct module_stat_v1) - && version != sizeof(struct module_stat)) + is_v1v2 = (version == sizeof(struct module_stat_v1) || + version == sizeof(struct module_stat_v2)); + if (!is_v1v2 && version != sizeof(struct module_stat)) return (EINVAL); namelen = strlen(mod->name) + 1; - if (namelen > MAXMODNAME) - namelen = MAXMODNAME; + if (is_v1v2 && namelen > MAXMODNAMEV1V2) + namelen = MAXMODNAMEV1V2; + else if (namelen > MAXMODNAMEV3) + namelen = MAXMODNAMEV3; if ((error = copyout(name, &stat->name[0], namelen)) != 0) return (error); - if ((error = copyout(&refs, &stat->refs, sizeof(int))) != 0) - return (error); - if ((error = copyout(&id, &stat->id, sizeof(int))) != 0) - return (error); + /* Extending MAXMODNAME gives an offset change for v3. */ + if (is_v1v2) { + stat_v2 = (struct module_stat_v2 *)stat; + if ((error = copyout(&refs, &stat_v2->refs, sizeof(int))) != 0) + return (error); + if ((error = copyout(&id, &stat_v2->id, sizeof(int))) != 0) + return (error); + } else { + if ((error = copyout(&refs, &stat->refs, sizeof(int))) != 0) + return (error); + if ((error = copyout(&id, &stat->id, sizeof(int))) != 0) + return (error); + } /* * >v1 stat includes module data.
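 *
 * Dispatch is purely by structure size: a binary built against the
 * old ABI stores sizeof(struct module_stat_v2) in the version field
 * and gets the name truncated to MAXMODNAMEV1V2 plus refs/id/data at
 * the v2 offsets, while a binary passing sizeof(struct module_stat)
 * receives the full MAXPATHLEN-sized name at the v3 offsets.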
*/ - if (version == sizeof(struct module_stat)) + if (version == sizeof(struct module_stat_v2)) { + if ((error = copyout(&data, &stat_v2->data, + sizeof(data))) != 0) + return (error); + } else if (version == sizeof(struct module_stat)) { if ((error = copyout(&data, &stat->data, sizeof(data))) != 0) return (error); + } td->td_retval[0] = 0; return (error); } int sys_modfind(struct thread *td, struct modfind_args *uap) { int error = 0; - char name[MAXMODNAME]; + char name[MAXMODNAMEV3]; module_t mod; if ((error = copyinstr(uap->name, name, sizeof name, 0)) != 0) return (error); MOD_SLOCK; mod = module_lookupbyname(name); if (mod == NULL) error = ENOENT; else td->td_retval[0] = module_getid(mod); MOD_SUNLOCK; return (error); } MODULE_VERSION(kernel, __FreeBSD_version); #ifdef COMPAT_FREEBSD32 #include #include #include #include #include typedef union modspecific32 { int intval; uint32_t uintval; int longval; uint32_t ulongval; } modspecific32_t; +struct module_stat32_v2 { + int version; + char name[MAXMODNAMEV1V2]; + int refs; + int id; + modspecific32_t data; +}; + struct module_stat32 { int version; char name[MAXMODNAME]; int refs; int id; modspecific32_t data; }; int freebsd32_modstat(struct thread *td, struct freebsd32_modstat_args *uap) { module_t mod; modspecific32_t data32; int error = 0; int id, namelen, refs, version; struct module_stat32 *stat32; + struct module_stat32_v2 *stat32_v2; char *name; + bool is_v1v2; MOD_SLOCK; mod = module_lookupbyid(uap->modid); if (mod == NULL) { MOD_SUNLOCK; return (ENOENT); } id = mod->id; refs = mod->refs; name = mod->name; CP(mod->data, data32, intval); CP(mod->data, data32, uintval); CP(mod->data, data32, longval); CP(mod->data, data32, ulongval); MOD_SUNLOCK; stat32 = uap->stat; if ((error = copyin(&stat32->version, &version, sizeof(version))) != 0) return (error); - if (version != sizeof(struct module_stat_v1) - && version != sizeof(struct module_stat32)) + is_v1v2 = (version == sizeof(struct module_stat_v1) || + version == sizeof(struct module_stat32_v2)); + if (!is_v1v2 && version != sizeof(struct module_stat32)) return (EINVAL); namelen = strlen(mod->name) + 1; - if (namelen > MAXMODNAME) - namelen = MAXMODNAME; + if (is_v1v2 && namelen > MAXMODNAMEV1V2) + namelen = MAXMODNAMEV1V2; + else if (namelen > MAXMODNAMEV3) + namelen = MAXMODNAMEV3; if ((error = copyout(name, &stat32->name[0], namelen)) != 0) return (error); - if ((error = copyout(&refs, &stat32->refs, sizeof(int))) != 0) - return (error); - if ((error = copyout(&id, &stat32->id, sizeof(int))) != 0) - return (error); + /* Extending MAXMODNAME gives an offset change for v3. */ + if (is_v1v2) { + stat32_v2 = (struct module_stat32_v2 *)stat32; + if ((error = copyout(&refs, &stat32_v2->refs, sizeof(int))) != 0) + return (error); + if ((error = copyout(&id, &stat32_v2->id, sizeof(int))) != 0) + return (error); + } else { + if ((error = copyout(&refs, &stat32->refs, sizeof(int))) != 0) + return (error); + if ((error = copyout(&id, &stat32->id, sizeof(int))) != 0) + return (error); + } /* * >v1 stat includes module data. 
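 *
 * Note that CP() narrows the kernel's modspecific_t to the 32-bit
 * fields of modspecific32_t; e.g. a longval of 0x100000001 on an
 * LP64 kernel reads back as 0x1 in a 32-bit process.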
*/ - if (version == sizeof(struct module_stat32)) + if (version == sizeof(struct module_stat32_v2)) { + if ((error = copyout(&data32, &stat32_v2->data, + sizeof(data32))) != 0) + return (error); + } else if (version == sizeof(struct module_stat32)) { if ((error = copyout(&data32, &stat32->data, sizeof(data32))) != 0) return (error); + } td->td_retval[0] = 0; return (error); } #endif diff --git a/sys/sys/module.h b/sys/sys/module.h index 89377df401a8..efefaf4cb513 100644 --- a/sys/sys/module.h +++ b/sys/sys/module.h @@ -1,279 +1,281 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 1997 Doug Rabson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _SYS_MODULE_H_ #define _SYS_MODULE_H_ /* * Module metadata types */ #define MDT_DEPEND 1 /* argument is a module name */ #define MDT_MODULE 2 /* module declaration */ #define MDT_VERSION 3 /* module version(s) */ #define MDT_PNP_INFO 4 /* Plug and play hints record */ #define MDT_STRUCT_VERSION 1 /* version of metadata structure */ #define MDT_SETNAME "modmetadata_set" typedef enum modeventtype { MOD_LOAD, MOD_UNLOAD, MOD_SHUTDOWN, MOD_QUIESCE } modeventtype_t; typedef struct module *module_t; typedef int (*modeventhand_t)(module_t, int /* modeventtype_t */, void *); /* * Struct for registering modules statically via SYSINIT. */ typedef struct moduledata { const char *name; /* module name */ modeventhand_t evhand; /* event handler */ void *priv; /* extra data */ } moduledata_t; /* * A module can use this to report module specific data to the user via * kldstat(2). 
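 *
 * A producer might fill it in from kernel context (illustrative
 * sketch; "foo" is a hypothetical module):
 *
 *	module_t mod;
 *	modspecific_t ms = { .intval = 42 };
 *
 *	MOD_XLOCK;
 *	if ((mod = module_lookupbyname("foo")) != NULL)
 *		module_setspecific(mod, &ms);
 *	MOD_XUNLOCK;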
*/ typedef union modspecific { int intval; u_int uintval; long longval; u_long ulongval; } modspecific_t; /* * Module dependency declaration */ struct mod_depend { int md_ver_minimum; int md_ver_preferred; int md_ver_maximum; }; /* * Module version declaration */ struct mod_version { int mv_version; }; struct mod_metadata { int md_version; /* structure version MDTV_* */ int md_type; /* type of entry MDT_* */ const void *md_data; /* specific data */ const char *md_cval; /* common string label */ }; struct mod_pnp_match_info { const char *descr; /* Description of the table */ const char *bus; /* Name of the bus for this table */ const void *table; /* Pointer to pnp table */ int entry_len; /* Length of each entry in the table (may be */ /* longer than descr describes). */ int num_entry; /* Number of entries in the table */ }; #ifdef _KERNEL #include #define MODULE_METADATA_CONCAT(uniquifier) _mod_metadata##uniquifier #define MODULE_METADATA(uniquifier, type, data, cval) \ static struct mod_metadata MODULE_METADATA_CONCAT(uniquifier) = { \ MDT_STRUCT_VERSION, \ type, \ data, \ cval \ }; \ DATA_SET(modmetadata_set, MODULE_METADATA_CONCAT(uniquifier)) #define MODULE_DEPEND(module, mdepend, vmin, vpref, vmax) \ static struct mod_depend _##module##_depend_on_##mdepend \ __section(".data") = { \ vmin, \ vpref, \ vmax \ }; \ MODULE_METADATA(_md_##module##_on_##mdepend, MDT_DEPEND, \ &_##module##_depend_on_##mdepend, #mdepend) /* * Every kernel has a 'kernel' module with the version set to * __FreeBSD_version. We embed a MODULE_DEPEND() inside every module * that depends on the 'kernel' module. It uses the current value of * __FreeBSD_version as the minimum and preferred versions. For the * maximum version it rounds the version up to the end of its branch * (i.e. M99999 for M.x). This allows a module built on M.x to work * on M.y systems where y >= x, but fail on M.z systems where z < x. */ #define MODULE_KERNEL_MAXVER (roundup(__FreeBSD_version, 100000) - 1) #define DECLARE_MODULE_WITH_MAXVER(name, data, sub, order, maxver) \ MODULE_DEPEND(name, kernel, __FreeBSD_version, \ __FreeBSD_version, maxver); \ MODULE_METADATA(_md_##name, MDT_MODULE, &data, __XSTRING(name));\ SYSINIT(name##module, sub, order, module_register_init, &data); \ struct __hack #ifdef KLD_TIED #define DECLARE_MODULE(name, data, sub, order) \ DECLARE_MODULE_WITH_MAXVER(name, data, sub, order, __FreeBSD_version) #else #define DECLARE_MODULE(name, data, sub, order) \ DECLARE_MODULE_WITH_MAXVER(name, data, sub, order, MODULE_KERNEL_MAXVER) #endif /* * The module declared with DECLARE_MODULE_TIED can only be loaded * into the kernel with exactly the same __FreeBSD_version. * * Use it for modules that use kernel interfaces that are not stable * even on STABLE/X branches. */ #define DECLARE_MODULE_TIED(name, data, sub, order) \ DECLARE_MODULE_WITH_MAXVER(name, data, sub, order, __FreeBSD_version) #define MODULE_VERSION_CONCAT(module, version) _##module##_version #define MODULE_VERSION(module, version) \ static struct mod_version MODULE_VERSION_CONCAT(module, version)\ __section(".data") = { \ version \ }; \ MODULE_METADATA(MODULE_VERSION_CONCAT(module, version), MDT_VERSION,\ &MODULE_VERSION_CONCAT(module, version), __XSTRING(module)) /** * Generic macros to create pnp info hints that modules may export * to allow external tools to parse their internal device tables * to make an informed guess about what driver(s) to load. 
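 *
 * Typical use (illustrative sketch; "foo" and foo_devs are
 * hypothetical):
 *
 *	MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, foo,
 *	    foo_devs, nitems(foo_devs));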
*/ #define MODULE_PNP_INFO(d, b, unique, t, n) \ static const struct mod_pnp_match_info _module_pnp_##b##_##unique = { \ .descr = d, \ .bus = #b, \ .table = t, \ .entry_len = sizeof((t)[0]), \ .num_entry = n \ }; \ MODULE_METADATA(_md_##b##_pnpinfo_##unique, MDT_PNP_INFO, \ &_module_pnp_##b##_##unique, #b); /** * descr is a string that describes each entry in the table. The general * form is the grammar (TYPE:pnp_name[/pnp_name];)* * where TYPE is one of the following: * U8 uint8_t element * V8 like U8 and 0xff means match any * G16 uint16_t element, any value >= matches * L16 uint16_t element, any value <= matches * M16 uint16_t element, mask of which of the following fields to use. * U16 uint16_t element * V16 like U16 and 0xffff means match any * U32 uint32_t element * V32 like U32 and 0xffffffff means match any * W32 Two 16-bit values with first pnp_name in LSW and second in MSW. * Z pointer to a string to match exactly * D pointer to a string to human readable description for device * P A pointer that should be ignored * E EISA PNP Identifier (in binary, but bus publishes string) * T Key for whole table. pnp_name=value. must be last, if present. * * The pnp_name "#" is reserved for other fields that should be ignored. * Otherwise pnp_name must match the name from the parent device's pnpinfo * output. The second pnp_name is used for the W32 type. */ extern struct sx modules_sx; #define MOD_XLOCK sx_xlock(&modules_sx) #define MOD_SLOCK sx_slock(&modules_sx) #define MOD_XUNLOCK sx_xunlock(&modules_sx) #define MOD_SUNLOCK sx_sunlock(&modules_sx) #define MOD_LOCK_ASSERT sx_assert(&modules_sx, SX_LOCKED) #define MOD_XLOCK_ASSERT sx_assert(&modules_sx, SX_XLOCKED) struct linker_file; void module_register_init(const void *); int module_register(const struct moduledata *, struct linker_file *); module_t module_lookupbyname(const char *); module_t module_lookupbyid(int); int module_quiesce(module_t); void module_reference(module_t); void module_release(module_t); int module_unload(module_t); int module_getid(module_t); module_t module_getfnext(module_t); const char * module_getname(module_t); void module_setspecific(module_t, modspecific_t *); struct linker_file *module_file(module_t); #ifdef MOD_DEBUG extern int mod_debug; #define MOD_DEBUG_REFS 1 #define MOD_DPF(cat, args) do { \ if (mod_debug & MOD_DEBUG_##cat) \ printf args; \ } while (0) #else /* !MOD_DEBUG */ #define MOD_DPF(cat, args) #endif #endif /* _KERNEL */ -#define MAXMODNAME 32 +#define MAXMODNAMEV1V2 32 +#define MAXMODNAMEV3 MAXPATHLEN +#define MAXMODNAME MAXMODNAMEV3 struct module_stat { int version; /* set to sizeof(struct module_stat) */ char name[MAXMODNAME]; int refs; int id; modspecific_t data; }; #ifndef _KERNEL #include __BEGIN_DECLS int modnext(int _modid); int modfnext(int _modid); int modstat(int _modid, struct module_stat *_stat); int modfind(const char *_name); __END_DECLS #endif #endif /* !_SYS_MODULE_H_ */ diff --git a/sys/x86/cpufreq/hwpstate_intel.c b/sys/x86/cpufreq/hwpstate_intel.c index f6e63fdeb854..e3d17aa0bd1c 100644 --- a/sys/x86/cpufreq/hwpstate_intel.c +++ b/sys/x86/cpufreq/hwpstate_intel.c @@ -1,639 +1,640 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2018 Intel Corporation * * Redistribution and use in source and binary forms, with or without * modification, are permitted providing that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "acpi_if.h" #include "cpufreq_if.h" extern uint64_t tsc_freq; static int intel_hwpstate_probe(device_t dev); static int intel_hwpstate_attach(device_t dev); static int intel_hwpstate_detach(device_t dev); static int intel_hwpstate_suspend(device_t dev); static int intel_hwpstate_resume(device_t dev); static int intel_hwpstate_get(device_t dev, struct cf_setting *cf); static int intel_hwpstate_type(device_t dev, int *type); static device_method_t intel_hwpstate_methods[] = { /* Device interface */ DEVMETHOD(device_identify, intel_hwpstate_identify), DEVMETHOD(device_probe, intel_hwpstate_probe), DEVMETHOD(device_attach, intel_hwpstate_attach), DEVMETHOD(device_detach, intel_hwpstate_detach), DEVMETHOD(device_suspend, intel_hwpstate_suspend), DEVMETHOD(device_resume, intel_hwpstate_resume), /* cpufreq interface */ DEVMETHOD(cpufreq_drv_get, intel_hwpstate_get), DEVMETHOD(cpufreq_drv_type, intel_hwpstate_type), DEVMETHOD_END }; struct hwp_softc { device_t dev; bool hwp_notifications; bool hwp_activity_window; bool hwp_pref_ctrl; bool hwp_pkg_ctrl; bool hwp_pkg_ctrl_en; bool hwp_perf_bias; bool hwp_perf_bias_cached; uint64_t req; /* Cached copy of HWP_REQUEST */ uint64_t hwp_energy_perf_bias; /* Cache PERF_BIAS */ uint8_t high; uint8_t guaranteed; uint8_t efficient; uint8_t low; }; static devclass_t hwpstate_intel_devclass; static driver_t hwpstate_intel_driver = { "hwpstate_intel", intel_hwpstate_methods, sizeof(struct hwp_softc), }; DRIVER_MODULE(hwpstate_intel, cpu, hwpstate_intel_driver, hwpstate_intel_devclass, NULL, NULL); MODULE_VERSION(hwpstate_intel, 1); static bool hwpstate_pkg_ctrl_enable = true; SYSCTL_BOOL(_machdep, OID_AUTO, hwpstate_pkg_ctrl, CTLFLAG_RDTUN, &hwpstate_pkg_ctrl_enable, 0, "Set 1 (default) to enable package-level control, 0 to disable"); static int intel_hwp_dump_sysctl_handler(SYSCTL_HANDLER_ARGS) { device_t dev; struct pcpu *pc; struct sbuf *sb; struct hwp_softc *sc; uint64_t data, data2; int ret; sc = (struct hwp_softc *)arg1; dev = sc->dev; pc = cpu_get_pcpu(dev); if (pc == NULL) return (ENXIO); sb = sbuf_new(NULL, NULL, 1024, SBUF_FIXEDLEN | SBUF_INCLUDENUL); sbuf_putc(sb, '\n'); thread_lock(curthread); sched_bind(curthread, pc->pc_cpuid); thread_unlock(curthread); rdmsr_safe(MSR_IA32_PM_ENABLE, &data); sbuf_printf(sb, "CPU%d: HWP %sabled\n", pc->pc_cpuid, ((data & 1) ?
"En" : "Dis")); if (data == 0) { ret = 0; goto out; } rdmsr_safe(MSR_IA32_HWP_CAPABILITIES, &data); sbuf_printf(sb, "\tHighest Performance: %03ju\n", data & 0xff); sbuf_printf(sb, "\tGuaranteed Performance: %03ju\n", (data >> 8) & 0xff); sbuf_printf(sb, "\tEfficient Performance: %03ju\n", (data >> 16) & 0xff); sbuf_printf(sb, "\tLowest Performance: %03ju\n", (data >> 24) & 0xff); rdmsr_safe(MSR_IA32_HWP_REQUEST, &data); data2 = 0; if (sc->hwp_pkg_ctrl && (data & IA32_HWP_REQUEST_PACKAGE_CONTROL)) rdmsr_safe(MSR_IA32_HWP_REQUEST_PKG, &data2); sbuf_putc(sb, '\n'); #define pkg_print(x, name, offset) do { \ if (!sc->hwp_pkg_ctrl || (data & x) != 0) \ sbuf_printf(sb, "\t%s: %03u\n", name, \ (unsigned)(data >> offset) & 0xff); \ else \ sbuf_printf(sb, "\t%s: %03u\n", name, \ (unsigned)(data2 >> offset) & 0xff); \ } while (0) pkg_print(IA32_HWP_REQUEST_EPP_VALID, "Requested Efficiency Performance Preference", 24); pkg_print(IA32_HWP_REQUEST_DESIRED_VALID, "Requested Desired Performance", 16); pkg_print(IA32_HWP_REQUEST_MAXIMUM_VALID, "Requested Maximum Performance", 8); pkg_print(IA32_HWP_REQUEST_MINIMUM_VALID, "Requested Minimum Performance", 0); #undef pkg_print sbuf_putc(sb, '\n'); out: thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); ret = sbuf_finish(sb); if (ret == 0) ret = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb)); sbuf_delete(sb); return (ret); } static inline int percent_to_raw(int x) { MPASS(x <= 100 && x >= 0); return (0xff * x / 100); } /* * Given x * 10 in [0, 1000], round to the integer nearest x. * * This allows round-tripping nice human readable numbers through this * interface. Otherwise, user-provided percentages such as 25, 50, 75 get * rounded down to 24, 49, and 74, which is a bit ugly. */ static inline int round10(int xtimes10) { return ((xtimes10 + 5) / 10); } static inline int raw_to_percent(int x) { MPASS(x <= 0xff && x >= 0); return (round10(x * 1000 / 0xff)); } /* Range of MSR_IA32_ENERGY_PERF_BIAS is more limited: 0-0xf. */ static inline int percent_to_raw_perf_bias(int x) { /* * Round up so that raw values present as nice round human numbers and * also round-trip to the same raw value. */ MPASS(x <= 100 && x >= 0); return (((0xf * x) + 50) / 100); } static inline int raw_to_percent_perf_bias(int x) { /* Rounding to nice human numbers despite a step interval of 6.67%. */ MPASS(x <= 0xf && x >= 0); return (((x * 20) / 0xf) * 5); } static int sysctl_epp_select(SYSCTL_HANDLER_ARGS) { struct hwp_softc *sc; device_t dev; struct pcpu *pc; uint64_t epb; uint32_t val; int ret; dev = oidp->oid_arg1; sc = device_get_softc(dev); if (!sc->hwp_pref_ctrl && !sc->hwp_perf_bias) return (ENODEV); pc = cpu_get_pcpu(dev); if (pc == NULL) return (ENXIO); thread_lock(curthread); sched_bind(curthread, pc->pc_cpuid); thread_unlock(curthread); if (sc->hwp_pref_ctrl) { val = (sc->req & IA32_HWP_REQUEST_ENERGY_PERFORMANCE_PREFERENCE) >> 24; val = raw_to_percent(val); } else { /* * If cpuid indicates EPP is not supported, the HWP controller * uses MSR_IA32_ENERGY_PERF_BIAS instead (Intel SDM §14.4.4). * This register is per-core (but not HT). 
*/ if (!sc->hwp_perf_bias_cached) { ret = rdmsr_safe(MSR_IA32_ENERGY_PERF_BIAS, &epb); if (ret) goto out; sc->hwp_energy_perf_bias = epb; sc->hwp_perf_bias_cached = true; } val = sc->hwp_energy_perf_bias & IA32_ENERGY_PERF_BIAS_POLICY_HINT_MASK; val = raw_to_percent_perf_bias(val); } MPASS(val >= 0 && val <= 100); ret = sysctl_handle_int(oidp, &val, 0, req); if (ret || req->newptr == NULL) goto out; if (val > 100) { ret = EINVAL; goto out; } if (sc->hwp_pref_ctrl) { val = percent_to_raw(val); sc->req = ((sc->req & ~IA32_HWP_REQUEST_ENERGY_PERFORMANCE_PREFERENCE) | (val << 24u)); if (sc->hwp_pkg_ctrl_en) ret = wrmsr_safe(MSR_IA32_HWP_REQUEST_PKG, sc->req); else ret = wrmsr_safe(MSR_IA32_HWP_REQUEST, sc->req); } else { val = percent_to_raw_perf_bias(val); MPASS((val & ~IA32_ENERGY_PERF_BIAS_POLICY_HINT_MASK) == 0); sc->hwp_energy_perf_bias = ((sc->hwp_energy_perf_bias & ~IA32_ENERGY_PERF_BIAS_POLICY_HINT_MASK) | val); ret = wrmsr_safe(MSR_IA32_ENERGY_PERF_BIAS, sc->hwp_energy_perf_bias); } out: thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); return (ret); } void intel_hwpstate_identify(driver_t *driver, device_t parent) { if (device_find_child(parent, "hwpstate_intel", -1) != NULL) return; if (cpu_vendor_id != CPU_VENDOR_INTEL) return; if (resource_disabled("hwpstate_intel", 0)) return; /* * Intel SDM 14.4.1 (HWP Programming Interfaces): * Availability of HWP baseline resource and capability, * CPUID.06H:EAX[bit 7]: If this bit is set, HWP provides several new * architectural MSRs: IA32_PM_ENABLE, IA32_HWP_CAPABILITIES, * IA32_HWP_REQUEST, IA32_HWP_STATUS. */ if ((cpu_power_eax & CPUTPM1_HWP) == 0) return; if (BUS_ADD_CHILD(parent, 10, "hwpstate_intel", -1) == NULL) return; if (bootverbose) device_printf(parent, "hwpstate registered\n"); } static int intel_hwpstate_probe(device_t dev) { device_set_desc(dev, "Intel Speed Shift"); return (BUS_PROBE_NOWILDCARD); } static int set_autonomous_hwp(struct hwp_softc *sc) { struct pcpu *pc; device_t dev; uint64_t caps; int ret; dev = sc->dev; pc = cpu_get_pcpu(dev); if (pc == NULL) return (ENXIO); thread_lock(curthread); sched_bind(curthread, pc->pc_cpuid); thread_unlock(curthread); /* XXX: Many MSRs aren't readable until feature is enabled */ ret = wrmsr_safe(MSR_IA32_PM_ENABLE, 1); if (ret) { /* * This is actually a package-level MSR, and only the first * write is not ignored. So it is harmless to enable it across * all devices, and this allows us not to care especially in * which order cores (and packages) are probed. This error * condition should not happen given we gate on the HWP CPUID * feature flag, if the Intel SDM is correct. */ device_printf(dev, "Failed to enable HWP for cpu%d (%d)\n", pc->pc_cpuid, ret); goto out; } ret = rdmsr_safe(MSR_IA32_HWP_REQUEST, &sc->req); if (ret) { device_printf(dev, "Failed to read HWP request MSR for cpu%d (%d)\n", pc->pc_cpuid, ret); goto out; } ret = rdmsr_safe(MSR_IA32_HWP_CAPABILITIES, &caps); if (ret) { device_printf(dev, "Failed to read HWP capabilities MSR for cpu%d (%d)\n", pc->pc_cpuid, ret); goto out; } /* * High and low are static; "guaranteed" is dynamic; and efficient is * also dynamic. 
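 *
 * Example decode (hypothetical value): caps == 0x0c1a2830 yields
 * highest 0x30 (bits 7:0), guaranteed 0x28 (15:8), most efficient
 * 0x1a (23:16) and lowest 0x0c (31:24).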
*/ sc->high = IA32_HWP_CAPABILITIES_HIGHEST_PERFORMANCE(caps); sc->guaranteed = IA32_HWP_CAPABILITIES_GUARANTEED_PERFORMANCE(caps); sc->efficient = IA32_HWP_CAPABILITIES_EFFICIENT_PERFORMANCE(caps); sc->low = IA32_HWP_CAPABILITIES_LOWEST_PERFORMANCE(caps); /* hardware autonomous selection determines the performance target */ sc->req &= ~IA32_HWP_DESIRED_PERFORMANCE; /* enable HW dynamic selection of window size */ sc->req &= ~IA32_HWP_ACTIVITY_WINDOW; /* IA32_HWP_REQUEST.Minimum_Performance = IA32_HWP_CAPABILITIES.Lowest_Performance */ sc->req &= ~IA32_HWP_MINIMUM_PERFORMANCE; sc->req |= sc->low; /* IA32_HWP_REQUEST.Maximum_Performance = IA32_HWP_CAPABILITIES.Highest_Performance. */ sc->req &= ~IA32_HWP_REQUEST_MAXIMUM_PERFORMANCE; sc->req |= sc->high << 8; /* If supported, request package-level control for this CPU. */ if (sc->hwp_pkg_ctrl_en) ret = wrmsr_safe(MSR_IA32_HWP_REQUEST, sc->req | IA32_HWP_REQUEST_PACKAGE_CONTROL); else ret = wrmsr_safe(MSR_IA32_HWP_REQUEST, sc->req); if (ret) { device_printf(dev, "Failed to setup%s autonomous HWP for cpu%d\n", sc->hwp_pkg_ctrl_en ? " PKG" : "", pc->pc_cpuid); goto out; } /* If supported, write the PKG-wide control MSR. */ if (sc->hwp_pkg_ctrl_en) { /* * "The structure of the IA32_HWP_REQUEST_PKG MSR * (package-level) is identical to the IA32_HWP_REQUEST MSR * with the exception of the Package Control field, which does * not exist." (Intel SDM §14.4.4) */ ret = wrmsr_safe(MSR_IA32_HWP_REQUEST_PKG, sc->req); if (ret) { device_printf(dev, "Failed to set autonomous HWP for package\n"); } } out: thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); return (ret); } static int intel_hwpstate_attach(device_t dev) { struct hwp_softc *sc; int ret; sc = device_get_softc(dev); sc->dev = dev; /* eax */ if (cpu_power_eax & CPUTPM1_HWP_NOTIFICATION) sc->hwp_notifications = true; if (cpu_power_eax & CPUTPM1_HWP_ACTIVITY_WINDOW) sc->hwp_activity_window = true; if (cpu_power_eax & CPUTPM1_HWP_PERF_PREF) sc->hwp_pref_ctrl = true; if (cpu_power_eax & CPUTPM1_HWP_PKG) sc->hwp_pkg_ctrl = true; /* Allow administrators to disable pkg-level control. 
*/ sc->hwp_pkg_ctrl_en = (sc->hwp_pkg_ctrl && hwpstate_pkg_ctrl_enable); /* ecx */ if (cpu_power_ecx & CPUID_PERF_BIAS) sc->hwp_perf_bias = true; ret = set_autonomous_hwp(sc); if (ret) return (ret); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_STATIC_CHILDREN(_debug), OID_AUTO, device_get_nameunit(dev), CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, sc, 0, intel_hwp_dump_sysctl_handler, "A", ""); SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "epp", CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, dev, 0, sysctl_epp_select, "I", "Efficiency/Performance Preference " "(range from 0, most performant, through 100, most efficient)"); return (cpufreq_register(dev)); } static int intel_hwpstate_detach(device_t dev) { return (cpufreq_unregister(dev)); } static int intel_hwpstate_get(device_t dev, struct cf_setting *set) { struct pcpu *pc; uint64_t rate; int ret; if (set == NULL) return (EINVAL); pc = cpu_get_pcpu(dev); if (pc == NULL) return (ENXIO); memset(set, CPUFREQ_VAL_UNKNOWN, sizeof(*set)); set->dev = dev; ret = cpu_est_clockrate(pc->pc_cpuid, &rate); if (ret == 0) set->freq = rate / 1000000; set->volts = CPUFREQ_VAL_UNKNOWN; set->power = CPUFREQ_VAL_UNKNOWN; set->lat = CPUFREQ_VAL_UNKNOWN; return (0); } static int intel_hwpstate_type(device_t dev, int *type) { if (type == NULL) return (EINVAL); *type = CPUFREQ_TYPE_ABSOLUTE | CPUFREQ_FLAG_INFO_ONLY | CPUFREQ_FLAG_UNCACHED; return (0); } static int intel_hwpstate_suspend(device_t dev) { return (0); } /* * Redo a subset of set_autonomous_hwp on resume; untested. Without this, * testers observed that on resume MSR_IA32_HWP_REQUEST was bogus. */ static int intel_hwpstate_resume(device_t dev) { struct hwp_softc *sc; struct pcpu *pc; int ret; sc = device_get_softc(dev); pc = cpu_get_pcpu(dev); if (pc == NULL) return (ENXIO); thread_lock(curthread); sched_bind(curthread, pc->pc_cpuid); thread_unlock(curthread); ret = wrmsr_safe(MSR_IA32_PM_ENABLE, 1); if (ret) { device_printf(dev, "Failed to enable HWP for cpu%d after suspend (%d)\n", pc->pc_cpuid, ret); goto out; } if (sc->hwp_pkg_ctrl_en) ret = wrmsr_safe(MSR_IA32_HWP_REQUEST, sc->req | IA32_HWP_REQUEST_PACKAGE_CONTROL); else ret = wrmsr_safe(MSR_IA32_HWP_REQUEST, sc->req); if (ret) { device_printf(dev, "Failed to set%s autonomous HWP for cpu%d after suspend\n", sc->hwp_pkg_ctrl_en ? " PKG" : "", pc->pc_cpuid); goto out; } if (sc->hwp_pkg_ctrl_en) { ret = wrmsr_safe(MSR_IA32_HWP_REQUEST_PKG, sc->req); if (ret) { device_printf(dev, "Failed to set autonomous HWP for package after " "suspend\n"); goto out; } } if (!sc->hwp_pref_ctrl && sc->hwp_perf_bias_cached) { ret = wrmsr_safe(MSR_IA32_ENERGY_PERF_BIAS, sc->hwp_energy_perf_bias); if (ret) { device_printf(dev, "Failed to set energy perf bias for cpu%d after " "suspend\n", pc->pc_cpuid); } } out: thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); return (ret); }