amd64: store per-cpu allocation pointers with __pcpu pre-subtracted
This eliminates a runtime subtraction from counter_u64_add: the stored
value can be used directly as the %gs-relative offset.
before:
mov 0x4f00ed(%rip),%rax # 0xffffffff80c01788 <numfullpathfail4>
sub 0x808ff6(%rip),%rax # 0xffffffff80f1a698 <__pcpu>
addq $0x1,%gs:(%rax)
after:
mov 0x4f02fd(%rip),%rax # 0xffffffff80c01788 <numfullpathfail4>
addq $0x1,%gs:(%rax)
Reviewed by: jeff
Differential Revision: https://reviews.freebsd.org/D23570