Index: sys/geom/geom_io.c
===================================================================
--- sys/geom/geom_io.c
+++ sys/geom/geom_io.c
@@ -688,7 +688,7 @@
 		bp->bio_driver2 = NULL;
 		bp->bio_pflags = 0;
 		g_io_request(bp, cp);
-		pace++;
+		atomic_add_int(&pace, 1);
 		return;
 	}
@@ -777,10 +777,31 @@
 		}
 		CTR0(KTR_GEOM, "g_down has work to do");
 		g_bioq_unlock(&g_bio_run_down);
-		if (pace > 0) {
+		if (atomic_readandclear_int(&pace) > 0) {
+			/*
+			 * There has been at least one memory allocation
+			 * failure since the last I/O completed. Pause 1ms to
+			 * give the system a chance to free up memory. We only
+			 * do this once because a large number of allocations
+			 * can fail in the direct dispatch case and there's no
+			 * relationship between the number of these failures and
+			 * the length of the outage. If there's still an outage,
+			 * we'll pause again and again until it's
+			 * resolved. Older versions paused longer and once per
+			 * allocation failure. This was OK for a single threaded
+			 * g_down, but with direct dispatch would lead to a max
+			 * of 10 IOPS for minutes at a time when transient memory
+			 * issues prevented allocation for a batch of requests
+			 * from the upper layers.
+			 *
+			 * XXX This pacing is really lame. It needs to be solved
+			 * by other methods. This is OK, but in the worst case
+			 * scenario all memory is tied up waiting for I/O to
+			 * complete which can never happen since we can't
+			 * allocate bios for that I/O.
+			 */
-			CTR1(KTR_GEOM, "g_down pacing self (pace %d)", pace);
-			pause("g_down", hz/10);
-			pace--;
+			CTR0(KTR_GEOM, "g_down pacing self");
+			pause("g_down", min(hz/1000, 1));
 		}
 		CTR2(KTR_GEOM, "g_down processing bp %p provider %s", bp,
 		    bp->bio_to->name);
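
For reference, below is a minimal userland sketch of the read-and-clear pacing pattern the patch adopts: allocation failures bump an atomic counter, and the consumer loop atomically swaps it to zero and pauses roughly 1ms once per batch of failures rather than once per failure. It uses C11 atomics (atomic_fetch_add/atomic_exchange) standing in for the kernel's atomic_add_int()/atomic_readandclear_int(), and nanosleep() standing in for pause(9); the note_alloc_failure() and maybe_pace() helpers are illustrative names only, not GEOM functions.

#define _POSIX_C_SOURCE 200809L
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

/* Nonzero when at least one allocation failure occurred since the last check. */
static atomic_int pace;

/* Producer side: called where an allocation (e.g. a bio clone) fails. */
static void
note_alloc_failure(void)
{

	atomic_fetch_add(&pace, 1);
}

/* Consumer side: run once per loop iteration of the down-thread analogue. */
static void
maybe_pace(void)
{
	struct timespec ts = { .tv_sec = 0, .tv_nsec = 1000000 };	/* ~1ms */

	/* Read and clear in one step so concurrent failures are not lost. */
	if (atomic_exchange(&pace, 0) > 0) {
		/* Pause once, regardless of how many failures were recorded. */
		nanosleep(&ts, NULL);
	}
}

int
main(void)
{

	note_alloc_failure();
	note_alloc_failure();
	maybe_pace();		/* sleeps ~1ms once, counter is now clear */
	maybe_pace();		/* no failures since last check: no sleep */
	printf("done\n");
	return (0);
}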