sys/kern/vfs_mount.c
[89 lines elided]
 SYSCTL_BOOL(_vfs, OID_AUTO, default_autoro, CTLFLAG_RW, &default_autoro, 0,
     "Retry failed r/w mount as r/o if no explicit ro/rw option is specified");
 
 static bool recursive_forced_unmount = false;
 SYSCTL_BOOL(_vfs, OID_AUTO, recursive_forced_unmount, CTLFLAG_RW,
     &recursive_forced_unmount, 0, "Recursively unmount stacked upper mounts"
     " when a file system is forcibly unmounted");
+
+static SYSCTL_NODE(_vfs, OID_AUTO, deferred_unmount,
+    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "deferred unmount controls");
+static unsigned int deferred_unmount_retry_limit = 10;
+SYSCTL_UINT(_vfs_deferred_unmount, OID_AUTO, retry_limit, CTLFLAG_RW,
+    &deferred_unmount_retry_limit, 0,
+    "Maximum number of retries for deferred unmount failure");
+static int deferred_unmount_retry_delay_hz;
kib: Did you consider adding a node vfs.deferred_unmount and putting retries and delay_hz under it? It might be interesting to put the total number of failed retries for the whole system lifetime there, as well.

jah: I didn't consider that, but I like the idea.
+SYSCTL_INT(_vfs_deferred_unmount, OID_AUTO, retry_delay_hz, CTLFLAG_RW,
+    &deferred_unmount_retry_delay_hz, 0,
+    "Delay (in ticks) when retrying a failed deferred unmount");
mckusick: The tick rate can vary between machines and is generally not known. I suggest that you express this in some time unit; seconds would be appropriate.

kib: The 'hz' expression guarantees a 1 second timeout for taskqueue_enqueue_timeout().

mckusick: This is a user-settable variable. If I want to change the default from one second to two seconds, I need to know the hz value to do so. The variable should be in seconds and multiplied by hz where it is used in deferred_unmount_enqueue (where timeout_ticks should be called just timeout, or perhaps timeout_seconds).

jah: IMO making the variable an integer number of seconds would be too coarse. I could express the timeout in milliseconds and convert to ticks, or instead use taskqueue_enqueue_timeout_sbt(), but to be honest both of those seem like overkill. I think that if a user reaches the point of wanting to tweak this variable, it's probably very easy for them to figure out that they should check kern.hz. I can further ease that discovery by mentioning kern.hz in the sysctl description.
+static int deferred_unmount_total_retries = 0;
+SYSCTL_INT(_vfs_deferred_unmount, OID_AUTO, total_retries, CTLFLAG_RD,
+    &deferred_unmount_total_retries, 0,
+    "Total number of retried deferred unmounts");
 MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure");
 MALLOC_DEFINE(M_STATFS, "statfs", "statfs structure");
 
 static uma_zone_t mount_zone;
 
 /* List of mounted filesystems. */
 struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);
 
 /* For any iteration/modification of mountlist */
 struct mtx_padalign __exclusive_cache_line mountlist_mtx;
 MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF);
 
 EVENTHANDLER_LIST_DEFINE(vfs_mounted);
 EVENTHANDLER_LIST_DEFINE(vfs_unmounted);
 
 static void vfs_deferred_unmount(void *arg, int pending);
-static struct task deferred_unmount_task =
-    TASK_INITIALIZER(0, vfs_deferred_unmount, NULL);;
+static struct timeout_task deferred_unmount_task;
 static struct mtx deferred_unmount_lock;
 MTX_SYSINIT(deferred_unmount, &deferred_unmount_lock, "deferred_unmount",
     MTX_DEF);
 static STAILQ_HEAD(, mount) deferred_unmount_list =
     STAILQ_HEAD_INITIALIZER(deferred_unmount_list);
 TASKQUEUE_DEFINE_THREAD(deferred_unmount);
 
 static void mount_devctl_event(const char *type, struct mount *mp, bool donew);
[38 lines elided; hunk context: mount_fini(void *mem, int size)]
 	lockdestroy(&mp->mnt_explock);
 	mtx_destroy(&mp->mnt_listmtx);
 	mtx_destroy(&mp->mnt_mtx);
 }
 
 static void
 vfs_mount_init(void *dummy __unused)
 {
+	TIMEOUT_TASK_INIT(taskqueue_deferred_unmount, &deferred_unmount_task,
+	    0, vfs_deferred_unmount, NULL);
+	deferred_unmount_retry_delay_hz = hz;
 	mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL,
 	    NULL, mount_init, mount_fini, UMA_ALIGN_CACHE, UMA_ZONE_NOFREE);
 }
 SYSINIT(vfs_mount, SI_SUB_VFS, SI_ORDER_ANY, vfs_mount_init, NULL);
 
 /*
  * ---------------------------------------------------------------------
  * Functions for building and sanitizing the mount options
[505 lines elided; hunk context: #ifdef MAC]
 	mac_mount_init(mp);
 	mac_mount_create(cred, mp);
 #endif
 	arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0);
 	mp->mnt_upper_pending = 0;
 	TAILQ_INIT(&mp->mnt_uppers);
 	TAILQ_INIT(&mp->mnt_notify);
 	mp->mnt_taskqueue_flags = 0;
+	mp->mnt_unmount_retries = 0;
 	return (mp);
 }
 
 /*
  * Destroy the mount struct previously allocated by vfs_mount_alloc().
  */
 void
 vfs_mount_destroy(struct mount *mp)
[1,182 lines elided; hunk context: case MNT_COUNT_WRITEOPCOUNT:]
 			sum += mpcpu->mntp_writeopcount;
 			break;
 		}
 	}
 	return (sum);
 }
 
 static bool
-deferred_unmount_enqueue(struct mount *mp, uint64_t flags, bool requeue)
+deferred_unmount_enqueue(struct mount *mp, uint64_t flags, bool requeue,
+    int timeout_ticks)
 {
 	bool enqueued;
 
 	enqueued = false;
 	mtx_lock(&deferred_unmount_lock);
 	if ((mp->mnt_taskqueue_flags & MNT_DEFERRED) == 0 || requeue) {
 		mp->mnt_taskqueue_flags = flags | MNT_DEFERRED;
 		STAILQ_INSERT_TAIL(&deferred_unmount_list, mp,
 		    mnt_taskqueue_link);
 		enqueued = true;
 	}
 	mtx_unlock(&deferred_unmount_lock);
 
 	if (enqueued) {
-		taskqueue_enqueue(taskqueue_deferred_unmount,
-		    &deferred_unmount_task);
+		taskqueue_enqueue_timeout(taskqueue_deferred_unmount,
+		    &deferred_unmount_task, timeout_ticks);
 	}
 
 	return (enqueued);
 }
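A note on the new timeout_ticks argument: per taskqueue(9), taskqueue_enqueue_timeout() schedules the enqueue after the absolute value of ticks has passed, and a negative value additionally means an already-scheduled enqueue is not rescheduled. This is a sketch of the two call patterns used in this change (0 for an immediate first attempt, a negative delay for retries):

	/* First attempt: enqueue immediately. */
	taskqueue_enqueue_timeout(taskqueue_deferred_unmount,
	    &deferred_unmount_task, 0);

	/*
	 * Retry: wait retry_delay_hz ticks, but do not push back a
	 * timeout that is already pending, so repeated failures cannot
	 * postpone the retry indefinitely.
	 */
	taskqueue_enqueue_timeout(taskqueue_deferred_unmount,
	    &deferred_unmount_task, -deferred_unmount_retry_delay_hz);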
 /*
  * Taskqueue handler for processing async/recursive unmounts
  */
 static void
 vfs_deferred_unmount(void *argi __unused, int pending __unused)
 {
 	STAILQ_HEAD(, mount) local_unmounts;
 	uint64_t flags;
 	struct mount *mp, *tmp;
+	int error;
+	unsigned int retries;
 	bool unmounted;
 
 	STAILQ_INIT(&local_unmounts);
 	mtx_lock(&deferred_unmount_lock);
 	STAILQ_CONCAT(&local_unmounts, &deferred_unmount_list);
 	mtx_unlock(&deferred_unmount_lock);
 
 	STAILQ_FOREACH_SAFE(mp, &local_unmounts, mnt_taskqueue_link, tmp) {
 		flags = mp->mnt_taskqueue_flags;
 		KASSERT((flags & MNT_DEFERRED) != 0,
 		    ("taskqueue unmount without MNT_DEFERRED"));
-		if (dounmount(mp, flags, curthread) != 0) {
+		error = dounmount(mp, flags, curthread);
+		if (error != 0) {
 			MNT_ILOCK(mp);
 			unmounted = ((mp->mnt_kern_flag & MNTK_REFEXPIRE) != 0);
 			MNT_IUNLOCK(mp);
-			if (!unmounted)
-				deferred_unmount_enqueue(mp, flags, true);
-			else
+			retries = (mp->mnt_unmount_retries)++;
+			deferred_unmount_total_retries++;
+			if (!unmounted && retries < deferred_unmount_retry_limit) {
+				deferred_unmount_enqueue(mp, flags, true,
+				    -deferred_unmount_retry_delay_hz);
+			} else {
+				if (retries >= deferred_unmount_retry_limit) {
+					printf("giving up on deferred unmount "
+					    "of %s after %d retries, error %d\n",
+					    mp->mnt_stat.f_mntonname, retries, error);
+				}
mckusick: I think that this statement should be done inside the MNT_ILOCK(mp).

jah: The deferred_unmount thread is the only thread that will update these fields, so they won't need locking or atomics. Perhaps a comment to that effect would be better instead? (In the dounmount() code below, I do check the retry count while holding a mount interlock, but only because that makes the code that continues the loop slightly cleaner. It also wouldn't be the "right" interlock for synchronization purposes, since the lower mount's interlock is held but the upper mount's retry count is being checked.)

mckusick: I concur with your argument, though rather than adding an explanation of why it does not need to be under the lock, it might be simpler to just move it up one line so that it is under the lock (i.e., there is no extra cost, since you already take/free the lock).

jah: I don't think that would buy anything, though. Immediately below this line is a similar non-atomic update of a global variable, and it wouldn't make any sense to move that under a per-mount lock. An explanation would still be useful for that line. (See the comment sketch after this function.)
 				vfs_rel(mp);
+			}
 		}
 	}
 }
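Following the locking thread above, a sketch of the comment (and an optional assertion) that jah proposes for the retry accounting; the exact wording is hypothetical:

	/*
	 * mnt_unmount_retries and deferred_unmount_total_retries are
	 * only ever written from the single deferred_unmount taskqueue
	 * thread, so plain non-atomic increments are safe here without
	 * holding MNT_ILOCK().
	 */
	KASSERT(taskqueue_member(taskqueue_deferred_unmount, curthread) != 0,
	    ("deferred unmount retry accounting outside taskqueue context"));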
 /*
  * Do the actual filesystem unmount.
  */
 int
 dounmount(struct mount *mp, uint64_t flags, struct thread *td)
 {
 	struct mount_upper_node *upper;
 	struct vnode *coveredvp, *rootvp;
 	int error;
 	uint64_t async_flag;
 	int mnt_gen_r;
+	unsigned int retries;
 
 	KASSERT((flags & MNT_DEFERRED) == 0 ||
 	    (flags & (MNT_RECURSE | MNT_FORCE)) == (MNT_RECURSE | MNT_FORCE),
 	    ("MNT_DEFERRED requires MNT_RECURSE | MNT_FORCE"));
 
 	/*
 	 * If the caller has explicitly requested the unmount to be handled by
 	 * the taskqueue and we're not already in taskqueue context, queue
 	 * up the unmount request and exit.  This is done prior to any
 	 * credential checks; MNT_DEFERRED should be used only for kernel-
 	 * initiated unmounts and will therefore be processed with the
 	 * (kernel) credentials of the taskqueue thread.  Still, callers
 	 * should be sure this is the behavior they want.
 	 */
 	if ((flags & MNT_DEFERRED) != 0 &&
 	    taskqueue_member(taskqueue_deferred_unmount, curthread) == 0) {
-		if (!deferred_unmount_enqueue(mp, flags, false))
+		if (!deferred_unmount_enqueue(mp, flags, false, 0))
 			vfs_rel(mp);
 		return (EINPROGRESS);
 	}
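For context, a hypothetical kernel-initiated caller of this deferred path might look like the following; per the KASSERT above, MNT_DEFERRED must be combined with MNT_RECURSE and MNT_FORCE, and the reference obtained with vfs_ref() is consumed by dounmount() on every path:

	/* Request an asynchronous, forced, recursive unmount. */
	vfs_ref(mp);
	error = dounmount(mp, MNT_FORCE | MNT_RECURSE | MNT_DEFERRED,
	    curthread);
	/* EINPROGRESS: the request was queued, not yet performed. */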
 	/*
 	 * Only privileged root, or (if MNT_USER is set) the user that did the
 	 * original mount is permitted to unmount this filesystem.
 	 * This check should be made prior to queueing up any recursive
[24 lines elided; hunk context: if ((flags & MNT_RECURSE) != 0) {]
 		 * added, and note that an operation on the uppers list is in
 		 * progress.  This will ensure that unregistration from the
 		 * uppers list, and therefore any pending unmount of the upper
 		 * FS, can't complete until after we finish walking the list.
 		 */
 		mp->mnt_kern_flag |= MNTK_RECURSE;
 		mp->mnt_upper_pending++;
 		TAILQ_FOREACH(upper, &mp->mnt_uppers, mnt_upper_link) {
+			retries = upper->mp->mnt_unmount_retries;
+			if (retries > deferred_unmount_retry_limit) {
+				error = EBUSY;
+				continue;
+			}
 			MNT_IUNLOCK(mp);
 			vfs_ref(upper->mp);
-			if (!deferred_unmount_enqueue(upper->mp, flags, false))
+			if (!deferred_unmount_enqueue(upper->mp, flags,
+			    false, 0))
 				vfs_rel(upper->mp);
 			MNT_ILOCK(mp);
 		}
 		mp->mnt_upper_pending--;
 		if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 &&
 		    mp->mnt_upper_pending == 0) {
 			mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER;
 			wakeup(&mp->mnt_uppers);
 		}
 
 		/*
 		 * If we're not on the taskqueue, wait until the uppers list
 		 * is drained before proceeding with unmount.  Otherwise, if
 		 * we are on the taskqueue and there are still pending uppers,
 		 * just re-enqueue on the end of the taskqueue.
 		 */
-		if ((flags & MNT_DEFERRED) == 0) {
+		if (error != 0) {
+			MNT_IUNLOCK(mp);
+			return (error);
+		} else if ((flags & MNT_DEFERRED) == 0) {
 			while (!TAILQ_EMPTY(&mp->mnt_uppers)) {
 				mp->mnt_kern_flag |= MNTK_TASKQUEUE_WAITER;
-				msleep(&mp->mnt_taskqueue_link, MNT_MTX(mp), 0,
-				    "umntqw", 0);
+				error = msleep(&mp->mnt_taskqueue_link,
+				    MNT_MTX(mp), PCATCH, "umntqw", 0);
kib: This is arguably a separate change.

jah: I probably should have used PCATCH from the beginning, but it seems even more necessary to avoid an unkillable thread now that we can abandon a recursive unmount attempt, so I decided to do it as part of this change.

kib: I mean that this should be a separate commit.

jah: Yes, I know. I was explaining why I didn't make it a separate commit to begin with. I've split…
+				if (error != 0) {
+					MNT_IUNLOCK(mp);
+					return (error);
+				}
 			}
 		} else if (!TAILQ_EMPTY(&mp->mnt_uppers)) {
 			MNT_IUNLOCK(mp);
-			deferred_unmount_enqueue(mp, flags, true);
+			deferred_unmount_enqueue(mp, flags, true, 0);
 			return (0);
 		}
 		MNT_IUNLOCK(mp);
 		KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers not empty"));
 	}
 
 	/* Allow the taskqueue to safely re-enqueue on failure */
 	if ((flags & MNT_DEFERRED) != 0)
[844 lines elided]
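On the PCATCH discussion above: with PCATCH set, msleep() returns nonzero (EINTR or ERESTART) when the sleep is interrupted by a signal, instead of leaving the thread sleeping uninterruptibly in "umntqw"; that is why the wait loop now checks the return value and unwinds. A minimal sketch of the pattern, with a hypothetical channel and mutex rather than the ones in this changeset:

	error = msleep(chan, &mtx, PCATCH, "wmesg", 0);
	if (error != 0) {
		/* EINTR/ERESTART: a signal arrived; release and bail out. */
		mtx_unlock(&mtx);
		return (error);
	}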