Changeset View
Changeset View
Standalone View
Standalone View
contrib/mg/cdbw.c
- This file was added.
/* $NetBSD: cdbw.c,v 1.6 2017/11/11 18:05:31 alnsn Exp $ */ | |||||
/*- | |||||
* Copyright (c) 2009, 2010, 2015 The NetBSD Foundation, Inc. | |||||
* All rights reserved. | |||||
* | |||||
* This code is derived from software contributed to The NetBSD Foundation | |||||
* by Joerg Sonnenberger and Alexander Nasonov. | |||||
* | |||||
* Redistribution and use in source and binary forms, with or without | |||||
* modification, are permitted provided that the following conditions | |||||
* are met: | |||||
* | |||||
* 1. Redistributions of source code must retain the above copyright | |||||
* notice, this list of conditions and the following disclaimer. | |||||
* 2. Redistributions in binary form must reproduce the above copyright | |||||
* notice, this list of conditions and the following disclaimer in | |||||
* the documentation and/or other materials provided with the | |||||
* distribution. | |||||
* | |||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |||||
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS | |||||
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE | |||||
* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, | |||||
* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, | |||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED | |||||
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT | |||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |||||
* SUCH DAMAGE. | |||||
*/ | |||||
#include <sys/cdefs.h> | |||||
#include <sys/queue.h> | |||||
#include <stdlib.h> | |||||
#include <string.h> | |||||
#include <unistd.h> | |||||
#include "cdbw.h" | |||||
void le32enc(void *, uint32_t); | |||||
void mi_vector_hash(const void * __restrict, size_t, uint32_t, | |||||
uint32_t[3]); | |||||
struct key_hash { | |||||
SLIST_ENTRY(key_hash) link; | |||||
uint32_t hashes[3]; | |||||
uint32_t idx; | |||||
void *key; | |||||
size_t keylen; | |||||
}; | |||||
SLIST_HEAD(key_hash_head, key_hash); | |||||
struct cdbw { | |||||
size_t data_counter; | |||||
size_t data_allocated; | |||||
size_t data_size; | |||||
size_t *data_len; | |||||
void **data_ptr; | |||||
size_t hash_size; | |||||
struct key_hash_head *hash; | |||||
size_t key_counter; | |||||
}; | |||||
/* Max. data counter that allows the index size to be 32bit. */ | |||||
static const uint32_t max_data_counter = 0xccccccccU; | |||||
struct cdbw * | |||||
cdbw_open(void) | |||||
{ | |||||
struct cdbw *cdbw; | |||||
size_t i; | |||||
cdbw = calloc(sizeof(*cdbw), 1); | |||||
if (cdbw == NULL) | |||||
return NULL; | |||||
cdbw->hash_size = 1024; | |||||
cdbw->hash = calloc(cdbw->hash_size, sizeof(*cdbw->hash)); | |||||
if (cdbw->hash == NULL) { | |||||
free(cdbw); | |||||
return NULL; | |||||
} | |||||
for (i = 0; i < cdbw->hash_size; ++i) | |||||
SLIST_INIT(cdbw->hash + i); | |||||
return cdbw; | |||||
} | |||||
int | |||||
cdbw_put(struct cdbw *cdbw, const void *key, size_t keylen, | |||||
const void *data, size_t datalen) | |||||
{ | |||||
uint32_t idx; | |||||
int rv; | |||||
rv = cdbw_put_data(cdbw, data, datalen, &idx); | |||||
if (rv) | |||||
return rv; | |||||
rv = cdbw_put_key(cdbw, key, keylen, idx); | |||||
if (rv) { | |||||
--cdbw->data_counter; | |||||
free(cdbw->data_ptr[cdbw->data_counter]); | |||||
cdbw->data_size -= datalen; | |||||
return rv; | |||||
} | |||||
return 0; | |||||
} | |||||
int | |||||
cdbw_put_data(struct cdbw *cdbw, const void *data, size_t datalen, | |||||
uint32_t *idx) | |||||
{ | |||||
if (cdbw->data_counter == max_data_counter) | |||||
return -1; | |||||
if (cdbw->data_size + datalen < cdbw->data_size || | |||||
cdbw->data_size + datalen > 0xffffffffU) | |||||
return -1; /* Overflow */ | |||||
if (cdbw->data_allocated == cdbw->data_counter) { | |||||
void **new_data_ptr; | |||||
size_t *new_data_len; | |||||
size_t new_allocated; | |||||
if (cdbw->data_allocated == 0) | |||||
new_allocated = 256; | |||||
else | |||||
new_allocated = cdbw->data_allocated * 2; | |||||
new_data_ptr = realloc(cdbw->data_ptr, | |||||
sizeof(*cdbw->data_ptr) * new_allocated); | |||||
if (new_data_ptr == NULL) | |||||
return -1; | |||||
cdbw->data_ptr = new_data_ptr; | |||||
new_data_len = realloc(cdbw->data_len, | |||||
sizeof(*cdbw->data_len) * new_allocated); | |||||
if (new_data_len == NULL) | |||||
return -1; | |||||
cdbw->data_len = new_data_len; | |||||
cdbw->data_allocated = new_allocated; | |||||
} | |||||
cdbw->data_ptr[cdbw->data_counter] = malloc(datalen); | |||||
if (cdbw->data_ptr[cdbw->data_counter] == NULL) | |||||
return -1; | |||||
memcpy(cdbw->data_ptr[cdbw->data_counter], data, datalen); | |||||
cdbw->data_len[cdbw->data_counter] = datalen; | |||||
cdbw->data_size += datalen; | |||||
*idx = cdbw->data_counter++; | |||||
return 0; | |||||
} | |||||
int | |||||
cdbw_put_key(struct cdbw *cdbw, const void *key, size_t keylen, uint32_t idx) | |||||
{ | |||||
uint32_t hashes[3]; | |||||
struct key_hash_head *head, *head2, *new_head; | |||||
struct key_hash *key_hash; | |||||
size_t new_hash_size, i; | |||||
if (idx >= cdbw->data_counter || | |||||
cdbw->key_counter == max_data_counter) | |||||
return -1; | |||||
mi_vector_hash(key, keylen, 0, hashes); | |||||
head = cdbw->hash + (hashes[0] & (cdbw->hash_size - 1)); | |||||
SLIST_FOREACH(key_hash, head, link) { | |||||
if (key_hash->keylen != keylen) | |||||
continue; | |||||
if (key_hash->hashes[0] != hashes[0]) | |||||
continue; | |||||
if (key_hash->hashes[1] != hashes[1]) | |||||
continue; | |||||
if (key_hash->hashes[2] != hashes[2]) | |||||
continue; | |||||
if (memcmp(key, key_hash->key, keylen)) | |||||
continue; | |||||
return -1; | |||||
} | |||||
key_hash = malloc(sizeof(*key_hash)); | |||||
if (key_hash == NULL) | |||||
return -1; | |||||
key_hash->key = malloc(keylen); | |||||
if (key_hash->key == NULL) { | |||||
free(key_hash); | |||||
return -1; | |||||
} | |||||
memcpy(key_hash->key, key, keylen); | |||||
key_hash->hashes[0] = hashes[0]; | |||||
key_hash->hashes[1] = hashes[1]; | |||||
key_hash->hashes[2] = hashes[2]; | |||||
key_hash->keylen = keylen; | |||||
key_hash->idx = idx; | |||||
SLIST_INSERT_HEAD(head, key_hash, link); | |||||
++cdbw->key_counter; | |||||
if (cdbw->key_counter <= cdbw->hash_size) | |||||
return 0; | |||||
/* Try to resize the hash table, but ignore errors. */ | |||||
new_hash_size = cdbw->hash_size * 2; | |||||
new_head = calloc(sizeof(*new_head), new_hash_size); | |||||
if (new_head == NULL) | |||||
return 0; | |||||
head = &cdbw->hash[hashes[0] & (cdbw->hash_size - 1)]; | |||||
for (i = 0; i < new_hash_size; ++i) | |||||
SLIST_INIT(new_head + i); | |||||
for (i = 0; i < cdbw->hash_size; ++i) { | |||||
head = cdbw->hash + i; | |||||
while ((key_hash = SLIST_FIRST(head)) != NULL) { | |||||
SLIST_REMOVE_HEAD(head, link); | |||||
head2 = new_head + | |||||
(key_hash->hashes[0] & (new_hash_size - 1)); | |||||
SLIST_INSERT_HEAD(head2, key_hash, link); | |||||
} | |||||
} | |||||
free(cdbw->hash); | |||||
cdbw->hash_size = new_hash_size; | |||||
cdbw->hash = new_head; | |||||
return 0; | |||||
} | |||||
void | |||||
cdbw_close(struct cdbw *cdbw) | |||||
{ | |||||
struct key_hash_head *head; | |||||
struct key_hash *key_hash; | |||||
size_t i; | |||||
for (i = 0; i < cdbw->hash_size; ++i) { | |||||
head = cdbw->hash + i; | |||||
while ((key_hash = SLIST_FIRST(head)) != NULL) { | |||||
SLIST_REMOVE_HEAD(head, link); | |||||
free(key_hash->key); | |||||
free(key_hash); | |||||
} | |||||
} | |||||
for (i = 0; i < cdbw->data_counter; ++i) | |||||
free(cdbw->data_ptr[i]); | |||||
free(cdbw->data_ptr); | |||||
free(cdbw->data_len); | |||||
free(cdbw->hash); | |||||
free(cdbw); | |||||
} | |||||
uint32_t | |||||
cdbw_stable_seeder(void) | |||||
{ | |||||
return 0; | |||||
} | |||||
/* | |||||
* The algorithm below is based on paper | |||||
* Cache-Oblivious Peeling of Random Hypergraphs by Djamal Belazzougui, | |||||
* Paolo Boldi, Giuseppe Ottaviano, Rossano Venturini, and Sebastiano | |||||
* Vigna. | |||||
* http://zola.di.unipi.it/rossano/wp-content/papercite-data/pdf/dcc14.pdf | |||||
*/ | |||||
/* | |||||
* Data type for a valid oriented edge (v0, v1, v2), v1 < v2. | |||||
* The first vertex v0 is implicit and is determined by an index | |||||
* of the corresponding element in the state->oedges array. | |||||
* If the degree of v0 is greater than 1, other members don't | |||||
* make sense because they're a result of XORing multiple values. | |||||
*/ | |||||
struct oedge { | |||||
uint32_t degree; /* Degree of v0. */ | |||||
uint32_t verts[2]; /* v1 and v2 */ | |||||
uint32_t edge; | |||||
}; | |||||
struct edge { | |||||
uint32_t idx; | |||||
uint32_t left, middle, right; | |||||
}; | |||||
struct state { | |||||
uint32_t data_entries; | |||||
uint32_t entries; | |||||
uint32_t keys; | |||||
uint32_t seed; | |||||
uint32_t *g; | |||||
char *visited; | |||||
struct oedge *oedges; | |||||
struct edge *edges; | |||||
uint32_t output_index; | |||||
uint32_t *output_order; | |||||
}; | |||||
/* | |||||
* Add (delta == 1) or remove (delta == -1) the edge e from vertex v0. | |||||
*/ | |||||
static void | |||||
add_remove_edge(struct oedge *o, int delta, uint32_t e, | |||||
uint32_t v0, uint32_t v1, uint32_t v2) | |||||
{ | |||||
o[v0].verts[v1 < v2 ? 0 : 1] ^= v1; | |||||
o[v0].verts[v1 < v2 ? 1 : 0] ^= v2; | |||||
o[v0].degree += delta; | |||||
o[v0].edge ^= e; | |||||
} | |||||
static void | |||||
add_edge(struct oedge *o, uint32_t e, | |||||
uint32_t v0, uint32_t v1, uint32_t v2) | |||||
{ | |||||
add_remove_edge(o, 1, e, v0, v1, v2); | |||||
} | |||||
static void | |||||
remove_vertex(struct state *state, uint32_t v0) | |||||
{ | |||||
uint32_t e, v1, v2; | |||||
struct oedge *o = state->oedges; | |||||
if (o[v0].degree == 1) { | |||||
e = o[v0].edge; | |||||
v1 = o[v0].verts[0]; | |||||
v2 = o[v0].verts[1]; | |||||
o[v0].degree = 0; | |||||
add_remove_edge(o, -1, e, v1, v0, v2); | |||||
add_remove_edge(o, -1, e, v2, v0, v1); | |||||
state->output_order[--state->output_index] = e; | |||||
} | |||||
} | |||||
static int | |||||
build_graph(struct cdbw *cdbw, struct state *state) | |||||
{ | |||||
struct key_hash_head *head; | |||||
struct key_hash *key_hash; | |||||
struct edge *e; | |||||
uint32_t hashes[3]; | |||||
size_t i; | |||||
memset(state->oedges, 0, sizeof(struct oedge) * state->entries); | |||||
e = state->edges; | |||||
for (i = 0; i < cdbw->hash_size; ++i) { | |||||
head = &cdbw->hash[i]; | |||||
SLIST_FOREACH(key_hash, head, link) { | |||||
e->idx = key_hash->idx; | |||||
mi_vector_hash(key_hash->key, key_hash->keylen, | |||||
state->seed, hashes); | |||||
e->left = hashes[0] % state->entries; | |||||
e->middle = hashes[1] % state->entries; | |||||
e->right = hashes[2] % state->entries; | |||||
if (e->left == e->middle) | |||||
return -1; | |||||
add_edge(state->oedges, e - state->edges, | |||||
e->right, e->left, e->middle); | |||||
if (e->left == e->right) | |||||
return -1; | |||||
add_edge(state->oedges, e - state->edges, | |||||
e->middle, e->left, e->right); | |||||
if (e->middle == e->right) | |||||
return -1; | |||||
add_edge(state->oedges, e - state->edges, | |||||
e->left, e->middle, e->right); | |||||
++e; | |||||
} | |||||
} | |||||
state->output_index = state->keys; | |||||
for (i = 0; i < state->entries; ++i) | |||||
remove_vertex(state, i); | |||||
i = state->keys; | |||||
while (i > 0 && i > state->output_index) { | |||||
--i; | |||||
e = state->edges + state->output_order[i]; | |||||
remove_vertex(state, e->left); | |||||
remove_vertex(state, e->middle); | |||||
remove_vertex(state, e->right); | |||||
} | |||||
return state->output_index == 0 ? 0 : -1; | |||||
} | |||||
static void | |||||
assign_nodes(struct state *state) | |||||
{ | |||||
struct edge *e; | |||||
size_t i; | |||||
for (i = 0; i < state->keys; ++i) { | |||||
e = state->edges + state->output_order[i]; | |||||
if (!state->visited[e->left]) { | |||||
state->g[e->left] = | |||||
(2 * state->data_entries + e->idx | |||||
- state->g[e->middle] - state->g[e->right]) | |||||
% state->data_entries; | |||||
} else if (!state->visited[e->middle]) { | |||||
state->g[e->middle] = | |||||
(2 * state->data_entries + e->idx | |||||
- state->g[e->left] - state->g[e->right]) | |||||
% state->data_entries; | |||||
} else { | |||||
state->g[e->right] = | |||||
(2 * state->data_entries + e->idx | |||||
- state->g[e->left] - state->g[e->middle]) | |||||
% state->data_entries; | |||||
} | |||||
state->visited[e->left] = 1; | |||||
state->visited[e->middle] = 1; | |||||
state->visited[e->right] = 1; | |||||
} | |||||
} | |||||
static size_t | |||||
compute_size(uint32_t size) | |||||
{ | |||||
if (size < 0x100) | |||||
return 1; | |||||
else if (size < 0x10000) | |||||
return 2; | |||||
else | |||||
return 4; | |||||
} | |||||
#define COND_FLUSH_BUFFER(n) do { \ | |||||
if (__predict_false(cur_pos + (n) >= sizeof(buf))) { \ | |||||
ret = write(fd, buf, cur_pos); \ | |||||
if (ret == -1 || (size_t)ret != cur_pos) \ | |||||
return -1; \ | |||||
cur_pos = 0; \ | |||||
} \ | |||||
} while (/* CONSTCOND */ 0) | |||||
static int | |||||
print_hash(struct cdbw *cdbw, struct state *state, int fd, const char *descr) | |||||
{ | |||||
uint32_t data_size; | |||||
uint8_t buf[90000]; | |||||
size_t i, size, size2, cur_pos; | |||||
ssize_t ret; | |||||
memcpy(buf, "NBCDB\n\0", 7); | |||||
buf[7] = 1; | |||||
strncpy((char *)buf + 8, descr, 16); | |||||
le32enc(buf + 24, cdbw->data_size); | |||||
le32enc(buf + 28, cdbw->data_counter); | |||||
le32enc(buf + 32, state->entries); | |||||
le32enc(buf + 36, state->seed); | |||||
cur_pos = 40; | |||||
size = compute_size(state->entries); | |||||
for (i = 0; i < state->entries; ++i) { | |||||
COND_FLUSH_BUFFER(4); | |||||
le32enc(buf + cur_pos, state->g[i]); | |||||
cur_pos += size; | |||||
} | |||||
size2 = compute_size(cdbw->data_size); | |||||
size = size * state->entries % size2; | |||||
if (size != 0) { | |||||
size = size2 - size; | |||||
COND_FLUSH_BUFFER(4); | |||||
le32enc(buf + cur_pos, 0); | |||||
cur_pos += size; | |||||
} | |||||
for (data_size = 0, i = 0; i < cdbw->data_counter; ++i) { | |||||
COND_FLUSH_BUFFER(4); | |||||
le32enc(buf + cur_pos, data_size); | |||||
cur_pos += size2; | |||||
data_size += cdbw->data_len[i]; | |||||
} | |||||
COND_FLUSH_BUFFER(4); | |||||
le32enc(buf + cur_pos, data_size); | |||||
cur_pos += size2; | |||||
for (i = 0; i < cdbw->data_counter; ++i) { | |||||
COND_FLUSH_BUFFER(cdbw->data_len[i]); | |||||
if (cdbw->data_len[i] < sizeof(buf)) { | |||||
memcpy(buf + cur_pos, cdbw->data_ptr[i], | |||||
cdbw->data_len[i]); | |||||
cur_pos += cdbw->data_len[i]; | |||||
} else { | |||||
ret = write(fd, cdbw->data_ptr[i], cdbw->data_len[i]); | |||||
if (ret == -1 || (size_t)ret != cdbw->data_len[i]) | |||||
return -1; | |||||
} | |||||
} | |||||
if (cur_pos != 0) { | |||||
ret = write(fd, buf, cur_pos); | |||||
if (ret == -1 || (size_t)ret != cur_pos) | |||||
return -1; | |||||
} | |||||
return 0; | |||||
} | |||||
int | |||||
cdbw_output(struct cdbw *cdbw, int fd, const char descr[16], | |||||
uint32_t (*seedgen)(void)) | |||||
{ | |||||
struct state state; | |||||
int rv; | |||||
if (cdbw->data_counter == 0 || cdbw->key_counter == 0) { | |||||
state.entries = 0; | |||||
state.seed = 0; | |||||
print_hash(cdbw, &state, fd, descr); | |||||
return 0; | |||||
} | |||||
#if HAVE_NBTOOL_CONFIG_H | |||||
if (seedgen == NULL) | |||||
seedgen = cdbw_stable_seeder; | |||||
#else | |||||
if (seedgen == NULL) | |||||
seedgen = arc4random; | |||||
#endif | |||||
rv = 0; | |||||
state.keys = cdbw->key_counter; | |||||
state.data_entries = cdbw->data_counter; | |||||
state.entries = state.keys + (state.keys + 3) / 4; | |||||
if (state.entries < 10) | |||||
state.entries = 10; | |||||
#define NALLOC(var, n) var = calloc(sizeof(*var), n) | |||||
NALLOC(state.g, state.entries); | |||||
NALLOC(state.visited, state.entries); | |||||
NALLOC(state.oedges, state.entries); | |||||
NALLOC(state.edges, state.keys); | |||||
NALLOC(state.output_order, state.keys); | |||||
#undef NALLOC | |||||
if (state.g == NULL || state.visited == NULL || state.oedges == NULL || | |||||
state.edges == NULL || state.output_order == NULL) { | |||||
rv = -1; | |||||
goto release; | |||||
} | |||||
state.seed = 0; | |||||
do { | |||||
if (seedgen == cdbw_stable_seeder) | |||||
++state.seed; | |||||
else | |||||
state.seed = (*seedgen)(); | |||||
} while (build_graph(cdbw, &state)); | |||||
assign_nodes(&state); | |||||
rv = print_hash(cdbw, &state, fd, descr); | |||||
release: | |||||
free(state.g); | |||||
free(state.visited); | |||||
free(state.oedges); | |||||
free(state.edges); | |||||
free(state.output_order); | |||||
return rv; | |||||
} |