Changeset View
Changeset View
Standalone View
Standalone View
sys/i386/i386/in_cksum_machdep.c
Show First 20 Lines • Show All 50 Lines • ▼ Show 20 Lines | |||||
* This routine is very heavily used in the network | * This routine is very heavily used in the network | ||||
* code and should be modified for each CPU to be as fast as possible. | * code and should be modified for each CPU to be as fast as possible. | ||||
* | * | ||||
* This implementation is 386 version. | * This implementation is 386 version. | ||||
*/ | */ | ||||
#undef ADDCARRY | #undef ADDCARRY | ||||
#define ADDCARRY(x) if ((x) > 0xffff) (x) -= 0xffff | #define ADDCARRY(x) if ((x) > 0xffff) (x) -= 0xffff | ||||
/* | |||||
* icc needs to be special cased here, as the asm code below results | |||||
* in broken code if compiled with icc. | |||||
*/ | |||||
#if !defined(__GNUCLIKE_ASM) | |||||
/* non gcc parts stolen from sys/alpha/alpha/in_cksum.c */
/*
 * Fold the 64-bit accumulator `sum' by adding its four 16-bit halfwords
 * through the q_util overlay.  The result fits in 32 bits (at most
 * 4 * 0xffff).  Requires `union q_util q_util' and `u_int64_t sum'
 * in the enclosing scope.
 */
#define REDUCE32 \
{ \
	q_util.q = sum; \
	sum = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
}
/*
 * Fold `sum' all the way down to a 16-bit one's-complement sum:
 * 64 -> 32 bits via q_util, 32 -> 16 bits via l_util, then a final
 * end-around carry (ADDCARRY).  Additionally requires
 * `union l_util l_util' in the enclosing scope.
 */
#define REDUCE16 \
{ \
	q_util.q = sum; \
	l_util.l = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \
	sum = l_util.s[0] + l_util.s[1]; \
	ADDCARRY(sum); \
}
#endif | |||||
#define REDUCE {sum = (sum & 0xffff) + (sum >> 16); ADDCARRY(sum);} | #define REDUCE {sum = (sum & 0xffff) + (sum >> 16); ADDCARRY(sum);} | ||||
#if !defined(__GNUCLIKE_ASM) | |||||
/*
 * Masks that zero out the bytes of a 32-bit word that do not belong to
 * the buffer.  Indexed as in_masks[(offset << 2) + nbytes]: the row is
 * the byte offset of the buffer within the word, the column is the
 * number of valid bytes (clamped to what fits after the offset, hence
 * the repeated entries in rows 2 and 3).
 */
static const u_int32_t in_masks[] = {
	/*0 bytes*/ /*1 byte*/	/*2 bytes*/ /*3 bytes*/
	0x00000000, 0x000000FF, 0x0000FFFF, 0x00FFFFFF,	/* offset 0 */
	0x00000000, 0x0000FF00, 0x00FFFF00, 0xFFFFFF00,	/* offset 1 */
	0x00000000, 0x00FF0000, 0xFFFF0000, 0xFFFF0000,	/* offset 2 */
	0x00000000, 0xFF000000, 0xFF000000, 0xFF000000,	/* offset 3 */
};

/* Overlay to split a 32-bit value into two 16-bit halfwords. */
union l_util {
	u_int16_t s[2];
	u_int32_t l;
};

/* Overlay to split a 64-bit accumulator into 16- or 32-bit pieces. */
union q_util {
	u_int16_t s[4];
	u_int32_t l[2];
	u_int64_t q;
};
/*
 * Sum `len' bytes starting at `lw' as 32-bit words into a 64-bit
 * accumulator and fold it with REDUCE32.  The returned value is a
 * partial one's-complement sum; callers finish with REDUCE16/ADDCARRY.
 * Leading/trailing bytes that do not fill a whole word are handled by
 * masking via in_masks[].
 *
 * NOTE(review): when `lw' is misaligned, the partial sum is computed as
 * if the buffer started on the preceding word boundary, i.e. byte-swap
 * parity follows the address — in_cksum_skip() compensates with its
 * `(clen ^ addr) & 1' shift below.
 */
static u_int64_t
in_cksumdata(const u_int32_t *lw, int len)
{
	u_int64_t sum = 0;
	u_int64_t prefilled;
	int offset;
	union q_util q_util;

	/* Fast path: word-aligned 20-byte buffer (option-less IPv4 header). */
	if ((3 & (long) lw) == 0 && len == 20) {
		sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4];
		REDUCE32;
		return sum;
	}

	/*
	 * Misaligned start: back the pointer up to the previous word
	 * boundary and mask off the leading bytes that are not part of
	 * the buffer.  The column index is clamped to 3 since at most
	 * 3 buffer bytes can live in the first word.
	 */
	if ((offset = 3 & (long) lw) != 0) {
		const u_int32_t *masks = in_masks + (offset << 2);
		lw = (u_int32_t *) (((long) lw) - offset);
		sum = *lw++ & masks[len >= 3 ? 3 : len];
		len -= 4 - offset;
		if (len <= 0) {
			REDUCE32;
			return sum;
		}
	}
#if 0
	/*
	 * Force to cache line boundary.
	 */
	offset = 32 - (0x1f & (long) lw);
	if (offset < 32 && len > offset) {
		len -= offset;
		if (4 & offset) {
			sum += (u_int64_t) lw[0];
			lw += 1;
		}
		if (8 & offset) {
			sum += (u_int64_t) lw[0] + lw[1];
			lw += 2;
		}
		if (16 & offset) {
			sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
			lw += 4;
		}
	}
#endif
	/*
	 * access prefilling to start load of next cache line.
	 * then add current cache line
	 * save result of prefilling for loop iteration.
	 */
	prefilled = lw[0];
	while ((len -= 32) >= 4) {
		u_int64_t prefilling = lw[8];
		sum += prefilled + lw[1] + lw[2] + lw[3]
			+ lw[4] + lw[5] + lw[6] + lw[7];
		lw += 8;
		prefilled = prefilling;
	}
	/* Final 32-byte chunk (lw[0] was already fetched into prefilled). */
	if (len >= 0) {
		sum += prefilled + lw[1] + lw[2] + lw[3]
			+ lw[4] + lw[5] + lw[6] + lw[7];
		lw += 8;
	} else {
		len += 32;
	}
	/* Remaining 16-byte chunks. */
	while ((len -= 16) >= 0) {
		sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3];
		lw += 4;
	}
	len += 16;
	/* Remaining whole words. */
	while ((len -= 4) >= 0) {
		sum += (u_int64_t) *lw++;
	}
	len += 4;
	/* Trailing 1-3 bytes, masked from the next word. */
	if (len > 0)
		sum += (u_int64_t) (in_masks[len] & *lw);
	REDUCE32;
	return sum;
}
/*
 * Add two 16-bit one's-complement values, performing the end-around
 * carry, and return the 16-bit result.
 */
u_short
in_addword(u_short a, u_short b)
{
	u_int64_t total;

	total = (u_int64_t)a + b;
	/* End-around carry (open-coded ADDCARRY). */
	if (total > 0xffff)
		total -= 0xffff;
	return (total);
}
/*
 * Sum the three 32-bit pseudo-header words and fold the result to a
 * 16-bit one's-complement sum.  The fold is the REDUCE16 operation
 * written out with shifts: adding the four 16-bit halfwords of the
 * 64-bit total is order-independent, so this is byte-order neutral.
 */
u_short
in_pseudo(u_int32_t a, u_int32_t b, u_int32_t c)
{
	u_int64_t total;
	u_int32_t folded;

	total = (u_int64_t)a + b + c;
	/* 64 -> 32: add the four 16-bit halfwords. */
	folded = (u_int32_t)(total & 0xffff) +
	    (u_int32_t)((total >> 16) & 0xffff) +
	    (u_int32_t)((total >> 32) & 0xffff) +
	    (u_int32_t)(total >> 48);
	/* 32 -> 16 plus end-around carry. */
	total = (folded & 0xffff) + (folded >> 16);
	if (total > 0xffff)
		total -= 0xffff;
	return (total);
}
/*
 * Compute the 16-bit one's-complement checksum of `len' bytes of the
 * mbuf chain `m', starting `skip' bytes into the chain, and return its
 * complement (ready to be stored in a header checksum field).
 */
u_short
in_cksum_skip(struct mbuf *m, int len, int skip)
{
	u_int64_t sum = 0;
	int mlen = 0;
	int clen = 0;	/* bytes checksummed so far; tracks byte parity */
	caddr_t addr;
	union q_util q_util;
	union l_util l_util;

	len -= skip;
	/* Advance past the first `skip' bytes of the chain. */
	for (; skip && m; m = m->m_next) {
		if (m->m_len > skip) {
			mlen = m->m_len - skip;
			addr = mtod(m, caddr_t) + skip;
			goto skip_start;
		} else {
			skip -= m->m_len;
		}
	}

	for (; m && len; m = m->m_next) {
		if (m->m_len == 0)
			continue;
		mlen = m->m_len;
		addr = mtod(m, caddr_t);
skip_start:
		if (len < mlen)
			mlen = len;
		/*
		 * in_cksumdata()'s partial sum follows the parity of the
		 * data's address.  When that parity differs from the
		 * parity of the running byte count, byte-swap the partial
		 * sum by shifting left 8; REDUCE16 below folds the carry
		 * back in (one's-complement sums commute with byte swaps).
		 */
		if ((clen ^ (long) addr) & 1)
			sum += in_cksumdata((const u_int32_t *)addr, mlen) << 8;
		else
			sum += in_cksumdata((const u_int32_t *)addr, mlen);

		clen += mlen;
		len -= mlen;
	}
	REDUCE16;
	return (~sum & 0xffff);
}
u_int in_cksum_hdr(const struct ip *ip) | |||||
{ | |||||
u_int64_t sum = in_cksumdata((const u_int32_t *)ip, sizeof(struct ip)); | |||||
union q_util q_util; | |||||
union l_util l_util; | |||||
REDUCE16; | |||||
return (~sum & 0xffff); | |||||
} | |||||
#else | |||||
/* | |||||
* These asm statements require __volatile because they pass information | * These asm statements require __volatile because they pass information | ||||
* via the condition codes. GCC does not currently provide a way to specify | * via the condition codes. GCC does not currently provide a way to specify | ||||
* the condition codes as an input or output operand. | * the condition codes as an input or output operand. | ||||
* | * | ||||
* The LOAD macro below is effectively a prefetch into cache. GCC will | * The LOAD macro below is effectively a prefetch into cache. GCC will | ||||
* load the value into a register but will not use it. Since modern CPUs | * load the value into a register but will not use it. Since modern CPUs | ||||
* reorder operations, this will generally take place in parallel with | * reorder operations, this will generally take place in parallel with | ||||
* other calculations. | * other calculations. | ||||
▲ Show 20 Lines • Show All 224 Lines • ▼ Show 20 Lines | if (mlen == -1) { | ||||
/* The last mbuf has odd # of bytes. Follow the | /* The last mbuf has odd # of bytes. Follow the | ||||
standard (the odd byte is shifted left by 8 bits) */ | standard (the odd byte is shifted left by 8 bits) */ | ||||
su.c[1] = 0; | su.c[1] = 0; | ||||
sum += su.s; | sum += su.s; | ||||
} | } | ||||
REDUCE; | REDUCE; | ||||
return (~sum & 0xffff); | return (~sum & 0xffff); | ||||
} | } | ||||
#endif |