sys/kern/vfs_mount.c
[89 lines elided]
 SYSCTL_BOOL(_vfs, OID_AUTO, default_autoro, CTLFLAG_RW, &default_autoro, 0,
     "Retry failed r/w mount as r/o if no explicit ro/rw option is specified");
 
 static bool recursive_forced_unmount = false;
 SYSCTL_BOOL(_vfs, OID_AUTO, recursive_forced_unmount, CTLFLAG_RW,
     &recursive_forced_unmount, 0, "Recursively unmount stacked upper mounts"
     " when a file system is forcibly unmounted");
+
+static SYSCTL_NODE(_vfs, OID_AUTO, deferred_unmount,
+    CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "deferred unmount controls");
+static unsigned int deferred_unmount_retry_limit = 10;
+SYSCTL_UINT(_vfs_deferred_unmount, OID_AUTO, retry_limit, CTLFLAG_RW,
+    &deferred_unmount_retry_limit, 0,
+    "Maximum number of retries for deferred unmount failure");
+static int deferred_unmount_retry_delay_hz;
kib: Did you consider adding a node vfs.deferred_unmount and putting retries and delay_hz under it? It might be interesting to put the total number of failed retries for the whole system lifetime there, as well.

jah: I didn't consider that, but I like the idea.
+SYSCTL_INT(_vfs_deferred_unmount, OID_AUTO, retry_delay_hz, CTLFLAG_RW,
+    &deferred_unmount_retry_delay_hz, 0,
+    "Delay (in ticks) when retrying a failed deferred unmount");
mckusick: The tick rate can vary between machines and is generally not known. I suggest that you express this in some time unit; seconds would be appropriate.

kib: The 'hz' expression guarantees a 1 second timeout for taskqueue_enqueue_timeout().

mckusick: This is a user-settable variable. If I want to change the default from one second to two seconds, I need to know the hz value to do so. The variable should be in seconds and multiplied by hz where it is used in deferred_unmount_enqueue (where timeout_ticks should be called just timeout, or perhaps timeout_seconds).

jah: IMO making the variable an integer number of seconds would be too coarse. I could express the timeout in milliseconds and convert to ticks, or instead use taskqueue_enqueue_timeout_sbt(), but to be honest both of those seem like overkill. I think that if a user reaches the point of wanting to tweak this variable, it's probably very easy for them to figure out that they should check kern.hz. I can further ease that discovery by mentioning kern.hz in the sysctl description.
+static int deferred_unmount_total_retries = 0;
+SYSCTL_INT(_vfs_deferred_unmount, OID_AUTO, total_retries, CTLFLAG_RD,
+    &deferred_unmount_total_retries, 0,
+    "Total number of retried deferred unmounts");
 MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure");
 MALLOC_DEFINE(M_STATFS, "statfs", "statfs structure");
 
 static uma_zone_t mount_zone;
 
 /* List of mounted filesystems. */
 struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);
 
 /* For any iteration/modification of mountlist */
 struct mtx_padalign __exclusive_cache_line mountlist_mtx;
 MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF);
 
 EVENTHANDLER_LIST_DEFINE(vfs_mounted);
 EVENTHANDLER_LIST_DEFINE(vfs_unmounted);
 
 static void vfs_deferred_unmount(void *arg, int pending);
-static struct task deferred_unmount_task =
-    TASK_INITIALIZER(0, vfs_deferred_unmount, NULL);;
+static struct timeout_task deferred_unmount_task;
 static struct mtx deferred_unmount_lock;
 MTX_SYSINIT(deferred_unmount, &deferred_unmount_lock, "deferred_unmount",
     MTX_DEF);
 static STAILQ_HEAD(, mount) deferred_unmount_list =
     STAILQ_HEAD_INITIALIZER(deferred_unmount_list);
 TASKQUEUE_DEFINE_THREAD(deferred_unmount);
 
 static void mount_devctl_event(const char *type, struct mount *mp, bool donew);
[38 lines elided; hunk context: mount_fini(void *mem, int size)]
 	lockdestroy(&mp->mnt_explock);
 	mtx_destroy(&mp->mnt_listmtx);
 	mtx_destroy(&mp->mnt_mtx);
 }
 
 static void
 vfs_mount_init(void *dummy __unused)
 {
+	TIMEOUT_TASK_INIT(taskqueue_deferred_unmount, &deferred_unmount_task,
+	    0, vfs_deferred_unmount, NULL);
+	deferred_unmount_retry_delay_hz = hz;
 	mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL,
 	    NULL, mount_init, mount_fini, UMA_ALIGN_CACHE, UMA_ZONE_NOFREE);
 }
 SYSINIT(vfs_mount, SI_SUB_VFS, SI_ORDER_ANY, vfs_mount_init, NULL);
 
 /*
  * ---------------------------------------------------------------------
  * Functions for building and sanitizing the mount options
[505 lines elided; hunk context: #ifdef MAC]
 	mac_mount_init(mp);
 	mac_mount_create(cred, mp);
 #endif
 	arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0);
 	mp->mnt_upper_pending = 0;
 	TAILQ_INIT(&mp->mnt_uppers);
 	TAILQ_INIT(&mp->mnt_notify);
 	mp->mnt_taskqueue_flags = 0;
+	mp->mnt_unmount_retries = 0;
 	return (mp);
 }
 
 /*
  * Destroy the mount struct previously allocated by vfs_mount_alloc().
  */
 void
 vfs_mount_destroy(struct mount *mp)
[1,182 lines elided; hunk context: case MNT_COUNT_WRITEOPCOUNT:]
 			sum += mpcpu->mntp_writeopcount;
 			break;
 		}
 	}
 	return (sum);
 }
 
 static bool
-deferred_unmount_enqueue(struct mount *mp, uint64_t flags, bool requeue)
+deferred_unmount_enqueue(struct mount *mp, uint64_t flags, bool requeue,
+    int timeout_ticks)
 {
 	bool enqueued;
 
 	enqueued = false;
 	mtx_lock(&deferred_unmount_lock);
 	if ((mp->mnt_taskqueue_flags & MNT_DEFERRED) == 0 || requeue) {
 		mp->mnt_taskqueue_flags = flags | MNT_DEFERRED;
 		STAILQ_INSERT_TAIL(&deferred_unmount_list, mp,
 		    mnt_taskqueue_link);
 		enqueued = true;
 	}
 	mtx_unlock(&deferred_unmount_lock);
 
 	if (enqueued) {
-		taskqueue_enqueue(taskqueue_deferred_unmount,
-		    &deferred_unmount_task);
+		taskqueue_enqueue_timeout(taskqueue_deferred_unmount,
+		    &deferred_unmount_task, timeout_ticks);
 	}
 
 	return (enqueued);
 }
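A note on the new timeout_ticks argument: per taskqueue(9), taskqueue_enqueue_timeout() schedules the enqueue after the absolute value of ticks has passed, and a negative value additionally means an already-scheduled enqueue is not rescheduled. This is a sketch of the two call patterns used in this change (0 for an immediate first attempt, a negative delay for retries):

	/* First attempt: enqueue immediately. */
	taskqueue_enqueue_timeout(taskqueue_deferred_unmount,
	    &deferred_unmount_task, 0);

	/*
	 * Retry: wait retry_delay_hz ticks, but do not push back a
	 * timeout that is already pending, so repeated failures cannot
	 * postpone the retry indefinitely.
	 */
	taskqueue_enqueue_timeout(taskqueue_deferred_unmount,
	    &deferred_unmount_task, -deferred_unmount_retry_delay_hz);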
 /*
  * Taskqueue handler for processing async/recursive unmounts
  */
 static void
 vfs_deferred_unmount(void *argi __unused, int pending __unused)
 {
 	STAILQ_HEAD(, mount) local_unmounts;
 	uint64_t flags;
 	struct mount *mp, *tmp;
+	int error;
+	unsigned int retries;
 	bool unmounted;
 
 	STAILQ_INIT(&local_unmounts);
 	mtx_lock(&deferred_unmount_lock);
 	STAILQ_CONCAT(&local_unmounts, &deferred_unmount_list);
 	mtx_unlock(&deferred_unmount_lock);
 
 	STAILQ_FOREACH_SAFE(mp, &local_unmounts, mnt_taskqueue_link, tmp) {
 		flags = mp->mnt_taskqueue_flags;
 		KASSERT((flags & MNT_DEFERRED) != 0,
 		    ("taskqueue unmount without MNT_DEFERRED"));
-		if (dounmount(mp, flags, curthread) != 0) {
+		error = dounmount(mp, flags, curthread);
+		if (error != 0) {
 			MNT_ILOCK(mp);
 			unmounted = ((mp->mnt_kern_flag & MNTK_REFEXPIRE) != 0);
 			MNT_IUNLOCK(mp);
-			if (!unmounted)
-				deferred_unmount_enqueue(mp, flags, true);
-			else
+			retries = (mp->mnt_unmount_retries)++;
+			deferred_unmount_total_retries++;
+			if (!unmounted && retries < deferred_unmount_retry_limit) {
+				deferred_unmount_enqueue(mp, flags, true,
+				    -deferred_unmount_retry_delay_hz);
+			} else {
+				if (retries >= deferred_unmount_retry_limit) {
+					printf("giving up on deferred unmount "
+					    "of %s after %d retries, error %d\n",
+					    mp->mnt_stat.f_mntonname, retries, error);
+				}
mckusick: I think that this statement should be done inside the MNT_ILOCK(mp).

jah: The deferred_unmount thread is the only thread that will update these fields, so they won't need locking or atomics. Perhaps a comment to that effect would be better instead? (In the dounmount() code below, I do check the retry count while holding a mount interlock, but only because that makes the code that continues the loop slightly cleaner. It also wouldn't be the "right" interlock for synchronization purposes, since the lower mount's interlock is held but the upper mount's retry count is being checked.)

mckusick: I concur with your argument, though rather than adding an explanation of why it does not need to be under the lock, it might be simpler to just move it up one line so that it is under the lock (i.e., there is no extra cost, since you already take/free the lock).

jah: I don't think that would buy anything, though. Immediately below this line is a similar non-atomic update of a global variable, and it wouldn't make any sense to move that under a per-mount lock. An explanation would still be useful for that line. (See the comment sketch after this function.)
 				vfs_rel(mp);
+			}
 		}
 	}
 }
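Following the locking thread above, a sketch of the comment (and an optional assertion) that jah proposes for the retry accounting; the exact wording is hypothetical:

	/*
	 * mnt_unmount_retries and deferred_unmount_total_retries are
	 * only ever written from the single deferred_unmount taskqueue
	 * thread, so plain non-atomic increments are safe here without
	 * holding MNT_ILOCK().
	 */
	KASSERT(taskqueue_member(taskqueue_deferred_unmount, curthread) != 0,
	    ("deferred unmount retry accounting outside taskqueue context"));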
 /*
  * Do the actual filesystem unmount.
  */
 int
 dounmount(struct mount *mp, uint64_t flags, struct thread *td)
 {
 	struct mount_upper_node *upper;
 	struct vnode *coveredvp, *rootvp;
 	int error;
 	uint64_t async_flag;
 	int mnt_gen_r;
+	unsigned int retries;
 
 	KASSERT((flags & MNT_DEFERRED) == 0 ||
 	    (flags & (MNT_RECURSE | MNT_FORCE)) == (MNT_RECURSE | MNT_FORCE),
 	    ("MNT_DEFERRED requires MNT_RECURSE | MNT_FORCE"));
 
 	/*
 	 * If the caller has explicitly requested the unmount to be handled by
 	 * the taskqueue and we're not already in taskqueue context, queue
 	 * up the unmount request and exit.  This is done prior to any
 	 * credential checks; MNT_DEFERRED should be used only for kernel-
 	 * initiated unmounts and will therefore be processed with the
 	 * (kernel) credentials of the taskqueue thread.  Still, callers
 	 * should be sure this is the behavior they want.
 	 */
 	if ((flags & MNT_DEFERRED) != 0 &&
 	    taskqueue_member(taskqueue_deferred_unmount, curthread) == 0) {
-		if (!deferred_unmount_enqueue(mp, flags, false))
+		if (!deferred_unmount_enqueue(mp, flags, false, 0))
 			vfs_rel(mp);
 		return (EINPROGRESS);
 	}
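For context, a hypothetical kernel-initiated caller of this deferred path might look like the following; per the KASSERT above, MNT_DEFERRED must be combined with MNT_RECURSE and MNT_FORCE, and the reference obtained with vfs_ref() is consumed by dounmount() on every path:

	/* Request an asynchronous, forced, recursive unmount. */
	vfs_ref(mp);
	error = dounmount(mp, MNT_FORCE | MNT_RECURSE | MNT_DEFERRED,
	    curthread);
	/* EINPROGRESS: the request was queued, not yet performed. */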
 	/*
 	 * Only privileged root, or (if MNT_USER is set) the user that did the
 	 * original mount is permitted to unmount this filesystem.
 	 * This check should be made prior to queueing up any recursive
[24 lines elided; hunk context: if ((flags & MNT_RECURSE) != 0) {]
 		 * added, and note that an operation on the uppers list is in
 		 * progress.  This will ensure that unregistration from the
 		 * uppers list, and therefore any pending unmount of the upper
 		 * FS, can't complete until after we finish walking the list.
 		 */
 		mp->mnt_kern_flag |= MNTK_RECURSE;
 		mp->mnt_upper_pending++;
 		TAILQ_FOREACH(upper, &mp->mnt_uppers, mnt_upper_link) {
+			retries = upper->mp->mnt_unmount_retries;
+			if (retries > deferred_unmount_retry_limit) {
+				error = EBUSY;
+				continue;
+			}
 			MNT_IUNLOCK(mp);
 			vfs_ref(upper->mp);
-			if (!deferred_unmount_enqueue(upper->mp, flags, false))
+			if (!deferred_unmount_enqueue(upper->mp, flags,
+			    false, 0))
 				vfs_rel(upper->mp);
 			MNT_ILOCK(mp);
 		}
 		mp->mnt_upper_pending--;
 		if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 &&
 		    mp->mnt_upper_pending == 0) {
 			mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER;
 			wakeup(&mp->mnt_uppers);
 		}
 
 		/*
 		 * If we're not on the taskqueue, wait until the uppers list
 		 * is drained before proceeding with unmount.  Otherwise, if
 		 * we are on the taskqueue and there are still pending uppers,
 		 * just re-enqueue on the end of the taskqueue.
 		 */
-		if ((flags & MNT_DEFERRED) == 0) {
+		if (error != 0) {
+			MNT_IUNLOCK(mp);
+			return (error);
+		} else if ((flags & MNT_DEFERRED) == 0) {
 			while (!TAILQ_EMPTY(&mp->mnt_uppers)) {
 				mp->mnt_kern_flag |= MNTK_TASKQUEUE_WAITER;
-				msleep(&mp->mnt_taskqueue_link, MNT_MTX(mp), 0,
-				    "umntqw", 0);
+				error = msleep(&mp->mnt_taskqueue_link,
+				    MNT_MTX(mp), PCATCH, "umntqw", 0);
kib: This is arguably a separate change.

jah: I probably should have used PCATCH from the beginning, but it seems even more necessary to avoid an unkillable thread now that we can abandon a recursive unmount attempt, so I decided to do it as part of this change.

kib: I mean that this should be a separate commit.

jah: Yes, I know. I was explaining why I didn't make it a separate commit to begin with. I've split…
+				if (error != 0) {
+					MNT_IUNLOCK(mp);
+					return (error);
+				}
 			}
 		} else if (!TAILQ_EMPTY(&mp->mnt_uppers)) {
 			MNT_IUNLOCK(mp);
-			deferred_unmount_enqueue(mp, flags, true);
+			deferred_unmount_enqueue(mp, flags, true, 0);
 			return (0);
 		}
 		MNT_IUNLOCK(mp);
 		KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers not empty"));
 	}
 
 	/* Allow the taskqueue to safely re-enqueue on failure */
 	if ((flags & MNT_DEFERRED) != 0)
[844 lines elided]
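On the PCATCH discussion above: with PCATCH set, msleep() returns nonzero (EINTR or ERESTART) when the sleep is interrupted by a signal, instead of leaving the thread sleeping uninterruptibly in "umntqw"; that is why the wait loop now checks the return value and unwinds. A minimal sketch of the pattern, with a hypothetical channel and mutex rather than the ones in this changeset:

	error = msleep(chan, &mtx, PCATCH, "wmesg", 0);
	if (error != 0) {
		/* EINTR/ERESTART: a signal arrived; release and bail out. */
		mtx_unlock(&mtx);
		return (error);
	}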