Changeset View
Standalone View
head/sys/kern/vfs_subr.c
Show First 20 Lines • Show All 228 Lines • ▼ Show 20 Lines | |||||
* freevnodes | * freevnodes | ||||
*/ | */ | ||||
static struct mtx __exclusive_cache_line vnode_list_mtx; | static struct mtx __exclusive_cache_line vnode_list_mtx; | ||||
/* Publicly exported FS */ | /* Publicly exported FS */ | ||||
struct nfs_public nfs_pub; | struct nfs_public nfs_pub; | ||||
static uma_zone_t buf_trie_zone; | static uma_zone_t buf_trie_zone; | ||||
static smr_t buf_trie_smr; | |||||
/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ | /* Zone for allocation of new vnodes - used exclusively by getnewvnode() */ | ||||
static uma_zone_t vnode_zone; | static uma_zone_t vnode_zone; | ||||
static uma_zone_t vnodepoll_zone; | static uma_zone_t vnodepoll_zone; | ||||
__read_frequently smr_t vfs_smr; | __read_frequently smr_t vfs_smr; | ||||
/* | /* | ||||
▲ Show 20 Lines • Show All 241 Lines • ▼ Show 20 Lines | |||||
static int vnsz2log; | static int vnsz2log; | ||||
/* | /* | ||||
* Support for the bufobj clean & dirty pctrie. | * Support for the bufobj clean & dirty pctrie. | ||||
*/ | */ | ||||
static void * | static void * | ||||
buf_trie_alloc(struct pctrie *ptree) | buf_trie_alloc(struct pctrie *ptree) | ||||
{ | { | ||||
return (uma_zalloc_smr(buf_trie_zone, M_NOWAIT)); | |||||
return uma_zalloc(buf_trie_zone, M_NOWAIT); | |||||
} | } | ||||
static void | static void | ||||
buf_trie_free(struct pctrie *ptree, void *node) | buf_trie_free(struct pctrie *ptree, void *node) | ||||
{ | { | ||||
uma_zfree_smr(buf_trie_zone, node); | |||||
uma_zfree(buf_trie_zone, node); | |||||
} | } | ||||
/*
 * Generate the BUF_PCTRIE_* routines (LOOKUP, INSERT, REMOVE, ...) keyed on
 * b_lblkno, including the SMR-protected BUF_PCTRIE_LOOKUP_UNLOCKED variant
 * used for lockless buffer lookup.
 */
PCTRIE_DEFINE_SMR(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free,
    buf_trie_smr);
/* | /* | ||||
* Initialize the vnode management data structures. | * Initialize the vnode management data structures. | ||||
* | * | ||||
* Reevaluate the following cap on the number of vnodes after the physical | * Reevaluate the following cap on the number of vnodes after the physical | ||||
* memory size exceeds 512GB. In the limit, as the physical memory size | * memory size exceeds 512GB. In the limit, as the physical memory size | ||||
* grows, the ratio of the memory size in KB to vnodes approaches 64:1. | * grows, the ratio of the memory size in KB to vnodes approaches 64:1. | ||||
*/ | */ | ||||
▲ Show 20 Lines • Show All 157 Lines • ▼ Show 20 Lines | vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), | ||||
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); | NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); | ||||
/* | /* | ||||
* Preallocate enough nodes to support one-per buf so that | * Preallocate enough nodes to support one-per buf so that | ||||
* we can not fail an insert. reassignbuf() callers can not | * we can not fail an insert. reassignbuf() callers can not | ||||
* tolerate the insertion failure. | * tolerate the insertion failure. | ||||
*/ | */ | ||||
buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(), | buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(), | ||||
NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, | NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR, | ||||
UMA_ZONE_NOFREE); | UMA_ZONE_NOFREE | UMA_ZONE_SMR); | ||||
buf_trie_smr = uma_zone_get_smr(buf_trie_zone); | |||||
uma_prealloc(buf_trie_zone, nbuf); | uma_prealloc(buf_trie_zone, nbuf); | ||||
vnodes_created = counter_u64_alloc(M_WAITOK); | vnodes_created = counter_u64_alloc(M_WAITOK); | ||||
recycles_count = counter_u64_alloc(M_WAITOK); | recycles_count = counter_u64_alloc(M_WAITOK); | ||||
recycles_free_count = counter_u64_alloc(M_WAITOK); | recycles_free_count = counter_u64_alloc(M_WAITOK); | ||||
deferred_inact = counter_u64_alloc(M_WAITOK); | deferred_inact = counter_u64_alloc(M_WAITOK); | ||||
/* | /* | ||||
▲ Show 20 Lines • Show All 1,638 Lines • ▼ Show 20 Lines | |||||
/*
 * Return the buffer with logical block number lblkno associated with bo,
 * or NULL if none exists.  Checks the clean trie first, then the dirty
 * trie.  The bufobj lock must be held, which keeps the two lookups
 * consistent with respect to queue migration.
 */
struct buf *
gbincore(struct bufobj *bo, daddr_t lblkno)
{
	struct buf *bp;

	ASSERT_BO_LOCKED(bo);
	bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
	if (bp != NULL)
		return (bp);
	return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno));
}
/* | |||||
* Look up a buf using the buffer tries, without the bufobj lock. This relies | |||||
* on SMR for safe lookup, and bufs being in a no-free zone to provide type | |||||
* stability of the result. Like other lockless lookups, the found buf may | |||||
* already be invalid by the time this function returns. | |||||
*/ | |||||
struct buf * | |||||
gbincore_unlocked(struct bufobj *bo, daddr_t lblkno) | |||||
{ | |||||
struct buf *bp; | |||||
ASSERT_BO_UNLOCKED(bo); | |||||
bp = BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_clean.bv_root, lblkno); | |||||
if (bp != NULL) | |||||
return (bp); | |||||
return (BUF_PCTRIE_LOOKUP_UNLOCKED(&bo->bo_dirty.bv_root, lblkno)); | |||||
bdrewery: Suppose a buf moved from dirty to clean here, after checking the clean queue, before checking… | |||||
cemAuthorUnsubmitted Done Inline ActionsSMR only helps you on a per-queue basis. I think maybe ideally there would be a single pctrie here and some other mechanism for efficiently tracking dirty bufs, but I don't know how realistic that is. Probably the right fix for getblk GB_NOCREAT is to re-check with the lock if unlocked lookup returned NULL. Something like: bp = gbincore_unlocked(bo, blkno); if (bp == NULL) { if (flags & GB_NOCREAT) goto loop; else goto newbuf_unlocked; } cem: SMR only helps you on a per-queue basis. I think maybe ideally there would be a single pctrie… | |||||
cemAuthorUnsubmitted Done Inline Actions(In the !GB_NOCREAT case, we might see the same race from gbincore_unlocked, but it is harmless: we'd create a useless buf at the same lbn, re-lock to attempt to insert it, and discover we'd lost the race. That race can happen even without SMR-gbincore.) cem: (In the !GB_NOCREAT case, we might see the same race from gbincore_unlocked, but it is harmless… | |||||
bdreweryUnsubmitted Not Done Inline ActionsSuraj came up with the same patch idea. I'm wondering if it could be more efficient to just try gbincore_unlocked a second time rather than wait for the bo lock, in the GB_NOCREAT case. bdrewery: Suraj came up with the same patch idea. I'm wondering if it could be more efficient to just try… | |||||
cemAuthorUnsubmitted Done Inline ActionsAh, cool. I hope Suraj is doing well. Spinning locklessly again (GB_NOCREAT case) might be worth doing, but you'll still need the same locked fallback if you continue to get NULL (for correctness). If you should spin, and for how long, probably depends on how many GB_NOCREAT getblk requests will already exist in the bufq and be transitioning queues, vs true-positive misses in your workload. (And relative costs of a few lockless rechecks vs taking the bufobj lock.) cem: Ah, cool. I hope Suraj is doing well. Spinning locklessly again (GB_NOCREAT case) might be… | |||||
bdreweryUnsubmitted Not Done Inline Actionsbdrewery: https://reviews.freebsd.org/D28375 | |||||
} | } | ||||
/* | /* | ||||
* Associate a buffer with a vnode. | * Associate a buffer with a vnode. | ||||
*/ | */ | ||||
void | void | ||||
bgetvp(struct vnode *vp, struct buf *bp) | bgetvp(struct vnode *vp, struct buf *bp) | ||||
{ | { | ||||
▲ Show 20 Lines • Show All 4,219 Lines • Show Last 20 Lines |
Suppose a buf moves from the dirty trie to the clean trie in this window — after the clean trie has been checked, but before the dirty trie is checked. The lookup then finds no buf even though one exists. Is SMR intended to prevent that, or is it only consistent within a single trie?
We believe this is why getblk() with GB_NOCREAT returns the wrong result.